Revision 139

tmp/org.txm.cooccurrence.core/.project (revision 139)
1
<?xml version="1.0" encoding="UTF-8"?>
2
<projectDescription>
3
	<name>org.txm.cooccurrence.core</name>
4
	<comment></comment>
5
	<projects>
6
	</projects>
7
	<buildSpec>
8
		<buildCommand>
9
			<name>org.eclipse.jdt.core.javabuilder</name>
10
			<arguments>
11
			</arguments>
12
		</buildCommand>
13
		<buildCommand>
14
			<name>org.eclipse.pde.ManifestBuilder</name>
15
			<arguments>
16
			</arguments>
17
		</buildCommand>
18
		<buildCommand>
19
			<name>org.eclipse.pde.SchemaBuilder</name>
20
			<arguments>
21
			</arguments>
22
		</buildCommand>
23
	</buildSpec>
24
	<natures>
25
		<nature>org.eclipse.pde.PluginNature</nature>
26
		<nature>org.eclipse.jdt.core.javanature</nature>
27
	</natures>
28
</projectDescription>
0 29

  
tmp/org.txm.cooccurrence.core/src/org/txm/functions/cooccurrences/Cooccurrence.java (revision 139)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.functions.cooccurrences;
29

  
30
import java.io.BufferedWriter;
31
import java.io.File;
32
import java.io.FileOutputStream;
33
import java.io.IOException;
34
import java.io.OutputStreamWriter;
35
import java.io.Writer;
36
import java.util.ArrayList;
37
import java.util.Collections;
38
import java.util.HashMap;
39
import java.util.HashSet;
40
import java.util.List;
41
import java.util.Map;
42
import java.util.Set;
43

  
44
import org.rosuda.REngine.REXPMismatchException;
45
import org.txm.HasResults;
46
import org.txm.Messages;
47
import org.txm.functions.Function;
48
import org.txm.functions.TXMResult;
49
import org.txm.functions.concordances.Concordance;
50
import org.txm.functions.concordances.Line;
51
import org.txm.functions.cooccurrences.comparators.CLineComparator;
52
import org.txm.functions.index.Index;
53
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
54
import org.txm.searchengine.cqp.clientExceptions.UnexpectedAnswerException;
55
import org.txm.searchengine.cqp.corpus.Corpus;
56
import org.txm.searchengine.cqp.corpus.CorpusManager;
57
import org.txm.searchengine.cqp.corpus.Property;
58
import org.txm.searchengine.cqp.corpus.QueryResult;
59
import org.txm.searchengine.cqp.corpus.StructuralUnit;
60
import org.txm.searchengine.cqp.corpus.query.Match;
61
import org.txm.searchengine.cqp.corpus.query.Query;
62
import org.txm.searchengine.cqp.serverException.CqiServerError;
63
import org.txm.specificities.core.functions.Specificites;
64
import org.txm.specificities.core.functions.SpecificitesResult;
65
import org.txm.stat.StatException;
66
import org.txm.stat.data.LexicalTable;
67
import org.txm.stat.engine.r.RWorkspace;
68
import org.txm.stat.engine.r.RWorkspaceException;
69
import org.txm.stat.engine.r.data.LexicalTableImpl;
70
import org.txm.utils.logger.Log;
71

  
72
// TODO: Auto-generated Javadoc
73
/**
74
 * Compute a cooccurrence from a concordance.
75
 *
76
 * @author mdecorde
77
 * 
78
 */
79
public class Cooccurrence extends Function implements TXMResult  {
80

  
81
	/** The voc. */
82
	Index voc;
83

  
84
	/** The conc. */
85
	Concordance conc;
86
	/**
87
	 * The reference corpus to use = the R symbol that point to a matrix WordxFreqs
88
	 */
89
	String referenceCorpus;
90

  
91
	/** The conclines. */
92
	List<org.txm.functions.concordances.Line> conclines;
93

  
94
	/** The corpus. */
95
	Corpus corpus;
96

  
97
	/** The query. */
98
	Query query;
99
	String query_occ = "[]"; //$NON-NLS-1$
100

  
101
	/** The properties. */
102
	List<Property> properties;
103

  
104
	/** The limit. */
105
	StructuralUnit limit;
106

  
107
	/** The maxleft. */
108
	int maxleft;
109

  
110
	/** The maxright. */
111
	int maxright;
112

  
113
	/** The minleft. */
114
	int minleft = 1;
115

  
116
	/** The minright. */
117
	int minright = 1;
118

  
119
	/** The lines. */
120
	List<CLine> lines = new ArrayList<CLine>();
121

  
122
	/** The occproperties. */
123
	HashMap<String, List<String>> occproperties;
124

  
125
	/** The count. */
126
	HashMap<String, Integer> count;
127

  
128
	/** The dist. */
129
	HashMap<String, Float> dist;
130

  
131
	/** The freq. */
132
	HashMap<String, Integer> freq;
133

  
134
	/** The scores. */
135
	HashMap<String, Double> scores;
136

  
137
	/** The FA. */
138
	int FA = -1;
139

  
140
	/** The P. */
141
	int P = -1;
142

  
143
	/** The number of keyword. */
144
	int numberOfKeyword = 0;
145

  
146
	/** The seuil_freq. */
147
	int seuil_freq = -1;
148

  
149
	/** The seuil_count. */
150
	int seuil_count = -1;
151

  
152
	/** The seuil_score. */
153
	double seuil_score = -1;
154

  
155
	/** The contextquery. */
156
	private Query contextquery;
157

  
158
	/** The minf. */
159
	private int minf;
160

  
161
	/** The minscore. */
162
	private double minscore;
163

  
164
	/** The mincof. */
165
	private int mincof;
166

  
167
	/** The anticontextquery. */
168
	private Query anticontextquery;
169

  
170
	/** The include xpivot. */
171
	private boolean includeXpivot;
172

  
173
	//System.out.println("Matches: focus: "+m1.size()+" full: "+m2.size()+" anti: "+m3.size());
174
	//System.out.println("T matches : "+(System.currentTimeMillis()- time)); //$NON-NLS-1$
175
	/** The distances. */
176
	HashMap<String, Double> distances = new HashMap<String, Double>();
177
	// contains the sum of distances
178
	/** The distancescounts. */
179
	HashMap<String, Integer> distancescounts = new HashMap<String, Integer>();
180
	// 'distancescounts' (above) contains, per signature, how many distances were summed
181
	/** The counts. */
182
	HashMap<String, Integer> counts = new HashMap<String, Integer>();
183

  
184
	/** The indexfreqs. */
185
	HashMap<String, Integer> indexfreqs = new HashMap<String, Integer>();
186
	// contains the number of encounter
187
	/** The counted. */
188
	HashMap<Integer, Integer> counted = new HashMap<Integer, Integer>();
189

  
190
	/** The m1. */
191
	private List<Match> m1;
192
	// 'counted' (declared above) maps each position already counted to its number of encounters
193
	/** The m2. */
194
	private List<Match> m2;
195

  
196
	/** The m3. */
197
	private List<Match> m3;
198

  
199
	/** The allsignaturesstr. */
200
	private HashMap<Integer, String> allsignaturesstr;
201

  
202
	/** The lt. */
203
	private LexicalTable lt;
204

  
205
	/** The keys to string. */
206
	private HashMap<String, String> keysToString;
207

  
208
	/** The symbol. */
209
	private String symbol;
210

  
211
	/** The writer. */
212
	private BufferedWriter writer;
213

  
214
	private boolean buildLexicalTableWithCooccurrents;
215

  
216
	/**
	 * Builds a cooccurrence restricted to an explicit set of concordance lines.
	 *
	 * The owning concordance is read from the first line of the list; when the
	 * list is empty nothing is initialised.
	 *
	 * @param conclines the concordance lines to compute the cooccurrence on
	 */
	public Cooccurrence(List<Line> conclines) {
		if (conclines.isEmpty()) {
			return; // no line, hence no concordance to read the settings from
		}
		Line firstLine = conclines.get(0);
		initConcInfos(firstLine.getConcordance(), conclines);
	}
226

  
227
	/**
	 * Builds a cooccurrence computed on every line of the given concordance.
	 *
	 * @param conc the concordance whose lines are all used
	 */
	public Cooccurrence(Concordance conc) {
		this.initConcInfos(conc);
	}
235
	//
236
	//	/**
237
	//	 * build the conc and compute on all lines.
238
	//	 *
239
	//	 * @param corpus the corpus
240
	//	 * @param query the query
241
	//	 * @param properties the properties
242
	//	 * @param limit the limit
243
	//	 * @param leftlimit the leftlimit
244
	//	 * @param rightlimit the rightlimit
245
	//	 */
246
	//	public Cooccurrence(Corpus corpus, Query query, List<Property> properties,
247
	//			StructuralUnit limit, int leftlimit, int rightlimit) {
248
	//		try {
249
	//			Concordance conc = new Concordance(corpus, query,
250
	//					properties.get(0), properties,
251
	//					new ReferencePattern(), new ReferencePattern(),
252
	//					leftlimit, rightlimit);
253
	//			// corpus.storeResult(conc);
254
	//			initConcInfos(conc);
255
	//		} catch (CqiClientException e) {
256
	//			// TODO Auto-generated catch block
257
	//			org.txm.utils.logger.Log.printStackTrace(e);
258
	//		}
259
	//
260
	//	}
261

  
262
	/**
	 * Instantiates a new cooccurrence from its raw parameters.
	 *
	 * This constructor only stores the parameters; nothing is queried or
	 * computed until {@link #process()} is called.
	 *
	 * @param corpus the corpus to query
	 * @param query the pivot (focus) query
	 * @param properties the word properties used to build the cooccurrent signatures
	 * @param limit the structural unit bounding the contexts, or null to count contexts in words
	 * @param maxleft the size of the left context
	 * @param minleft the start of the left context; a value &lt;= 0 disables the left context
	 * @param minright the start of the right context; a value &lt;= 0 disables the right context
	 * @param maxright the size of the right context
	 * @param minf the minimum index frequency a cooccurrent needs to be kept
	 * @param mincof the minimum cooccurrence count a cooccurrent needs to be kept
	 * @param minscore the minimum absolute score a cooccurrent needs to be kept
	 * @param includeXpivot selects how the anticontext (the positions excluded from the count) is built
	 * @param buildLexicalTableWithCooccurrents if true the lexical table only holds the cooccurrents,
	 *            otherwise it holds every line of the index
	 * @throws Exception declared for compatibility with existing callers; nothing here throws
	 */
	public Cooccurrence(Corpus corpus, Query query, List<Property> properties,
			StructuralUnit limit, int maxleft, int minleft, int minright,
			int maxright, int minf, int mincof, double minscore, boolean includeXpivot, boolean buildLexicalTableWithCooccurrents)
					throws Exception {
		// what to search and where
		this.corpus = corpus;
		this.query = query;
		this.properties = properties;

		// context window
		this.limit = limit;
		this.minleft = minleft;
		this.maxleft = maxleft;
		this.minright = minright;
		this.maxright = maxright;

		// result filters
		this.minf = minf;
		this.minscore = minscore;
		this.mincof = mincof;

		// computation options
		this.includeXpivot = includeXpivot;
		this.buildLexicalTableWithCooccurrents = buildLexicalTableWithCooccurrents;
	}
305

  
306
	public boolean getIncludeXPivot() {
307
		return includeXpivot;
308
	}
309

  
310
	/**
311
	 * Gets the number of keyword.
312
	 *
313
	 * @return the number of keyword
314
	 */
315
	public int getNumberOfKeyword()
316
	{
317
		return numberOfKeyword;
318
	}
319

  
320
	public void setReferenceCorpus(String symbol) {
321
		referenceCorpus = symbol;
322
	}
323

  
324
	public void setCoocQuery(String q) {
325
		query_occ = q;
326
	}
327

  
328
	/**
329
	 * Process.
330
	 *
331
	 * @throws CqiClientException the cqi client exception
332
	 * @throws IOException Signals that an I/O exception has occurred.
333
	 * @throws CqiServerError the cqi server error
334
	 * @throws StatException the stat exception
335
	 */
336
	public void process() throws CqiClientException, IOException, CqiServerError, StatException
337
	{
338
		stepQueryLimits();
339
		stepGetMatches();
340
		stepBuildSignatures();
341
		stepCount();
342
		stepBuildLexicalTable();
343
		stepGetScores();
344
		clearMemory();
345
		getLines();
346
	}
347

  
348
	public void clearMemory() {
349
		distances = null;
350
		distancescounts = null;
351
		counts = null;
352
		indexfreqs = null;
353
		counted = null;
354
		m1 = null;
355
		m2 = null;
356
		m3 = null;
357
		allsignaturesstr = null;
358
		lt = null;
359
		keysToString = null;
360
		occproperties = null;
361
		count = null;
362
		dist = null;
363
		freq = null;
364
		scores = null;
365
	}
366

  
367
	/**
368
	 * Step query limits.
369
	 *
370
	 * @return true, if successful
371
	 */
372
	public boolean stepQueryLimits()
373
	{
374
		if (limit != null) // structural context
375
		{
376
			String tempquery =""; //$NON-NLS-1$
377
			String lname = limit.getName();
378
			if (minleft > 0 ) { // test if there is a left context
379
				tempquery += "(<"	+ lname + ">[]* </" + lname + ">){" + (maxleft) + "," + (maxleft) + "}"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
380
			}
381
			// (<p>[]*</p>){0, 50} "je" (<p>[]*</p>){0, 50}
382
			tempquery +=" <" + lname + ">[]* " + query.getQueryString() + " []* </" + lname + "> "; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
383
			if (minright > 0) { // test if there is a right context
384
				tempquery+="(<" + lname + ">[]* </" + lname + ">){" + (maxright) + "," + (maxright) + "}";  //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
385
			}
386
			this.contextquery = new Query(tempquery);
387

  
388
			if (includeXpivot) {
389
				String anticontextquerystring = ""; //$NON-NLS-1$
390
				if (minleft > 1)// minleft = 2..N
391
					anticontextquerystring += "(<" + lname + ">[]* </" + lname + ">){" + (minleft - 1) + "," + (minleft - 1) + "} <" + lname + ">[]* "; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
392
				anticontextquerystring += query.getQueryString();
393
				if (minright > 1) // minright = 2..N
394
					anticontextquerystring += " []* </" + lname + "> (<" + lname + ">[]* </" + lname + ">){" + (minright - 1) + "," + (minright - 1) + "}"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
395
				this.anticontextquery = new Query(anticontextquerystring);
396
			} else {
397
				String anticontextquerystring = ""; //$NON-NLS-1$
398
				if (minleft > 0)// minleft = 2..N
399
					anticontextquerystring += "(<" + lname + ">[]* </" + lname + ">){" + (minleft - 1) + "," + (minleft - 1) + "} <" + lname + ">[]* "; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
400
				anticontextquerystring += query.getQueryString();
401
				if (minright > 0) // minright = 2..N
402
					anticontextquerystring += " []* </" + lname + "> (<" + lname + ">[]* </" + lname + ">){" + (minright - 1) + "," + (minright - 1) + "}"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
403
				this.anticontextquery = new Query(anticontextquerystring);
404
			}
405
		} else // word context
406
		{
407
			String tempquery =""; //$NON-NLS-1$
408
			if (minleft > 0 ) { // test if there is a left context
409
				tempquery += "[]{"	+ maxleft + "," + maxleft + "} "; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
410
			}
411
			tempquery+= query.getQueryString();
412

  
413
			if (minright > 0) { // test if there is a right context
414
				tempquery+=" []{" + maxright + "," + maxright + "} "; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
415
			}
416
			this.contextquery = new Query(tempquery); 
417
			String anticontextquerystring = ""; //$NON-NLS-1$
418
			if (minleft > 1)
419
				anticontextquerystring += "[]{" + (minleft - 1) + ", " + (minleft - 1) + "}"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
420
			anticontextquerystring += query.getQueryString();
421
			if (minright > 1)
422
				anticontextquerystring += "[]{" + (minright -1) + "," + (minright -1)+ "} "; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
423
			this.anticontextquery = new Query(anticontextquerystring);
424
		}
425
		return true;
426
	}
427

  
428
	/**
429
	 * Step get matches.
430
	 *
431
	 * @return true, if successful
432
	 * @throws CqiClientException 
433
	 */
434
	public boolean stepGetMatches() throws CqiClientException 
435
	{
436
		QueryResult r1 = corpus.query(query, "CoocFocusQuery", false); // keywords positions //$NON-NLS-1$
437
		QueryResult r2 = corpus.query(contextquery, "CoocContextFocusQuery", false); // max context //$NON-NLS-1$
438
		QueryResult r3 = corpus.query(anticontextquery, "CoocAntiContextFocusQuery", false); // no context //$NON-NLS-1$
439

  
440
		m1 = r1.getMatches();
441
		if (Thread.interrupted()) return false; // stop if interrupted by user
442
		numberOfKeyword = m1.size();
443
		m2 = r2.getMatches();
444
		if (Thread.interrupted()) return false; // stop if interrupted by user
445
		m3 = r3.getMatches();
446
		if (Thread.interrupted()) return false; // stop if interrupted by user
447

  
448
		//		System.out.println(query+" M1 size: "+m1.size());
449
		//		System.out.println(contextquery+" M2 size: "+m2.size());
450
		//		System.out.println(anticontextquery+" M3 size: "+m3.size());
451
		r1.drop();
452
		r2.drop();
453
		r3.drop();
454
		return true;
455
	}
456

  
457
	/**
458
	 * Step build signatures.
459
	 *
460
	 * @return true, if successful
461
	 * @throws UnexpectedAnswerException the unexpected answer exception
462
	 * @throws IOException Signals that an I/O exception has occurred.
463
	 * @throws CqiServerError the cqi server error
464
	 */
465
	public boolean stepBuildSignatures() throws UnexpectedAnswerException, IOException, CqiServerError
466
	{
467
		allsignaturesstr = new HashMap<Integer, String>();
468
		Set<Integer> allpositions = new HashSet<Integer>(); // no duplicates
469
		for (Match n : m2) {
470
			for (int i = n.getStart(); i <= n.getEnd(); i++)
471
				allpositions.add(i);
472
		}
473
		//System.out.println("Position set: "+allpositions.size());
474

  
475
		int[] allpositionsarray = new int[allpositions.size()];
476
		int pcount = 0;
477
		for (int p : allpositions) {
478
			allpositionsarray[pcount++] = p;
479
		}
480

  
481
		HashMap<Property, int[]> propsId = new HashMap<Property, int[]>();
482
		// HashMap<Property, String[]> propsValues = new HashMap<Property, String[]>();
483
		for (Property property : properties) {
484
			int[] indices = CorpusManager.getCorpusManager().getCqiClient()
485
					.cpos2Id(property.getQualifiedName(), allpositionsarray);
486
			// String[] values =
487
			// CorpusManager.getCorpusManager().getCqiClient().cpos2Str(property.getQualifiedName(),allpositionsarray);
488
			propsId.put(property, indices);
489
			// propsValues.put(property, values);
490
			//System.out.println("all "+property+" indices: "+propsId.get(property).length);
491
		}
492
		if (Thread.interrupted()) return false; // stop if interrupted by user
493

  
494
		//System.out.println("T values + ids: "+(System.currentTimeMillis()- time)); //$NON-NLS-1$
495

  
496
		pcount = 0;
497
		for (int position : allpositionsarray) {
498
			//String sign = ""; //$NON-NLS-1$
499
			String signstr = ""; //$NON-NLS-1$
500
			for (Property property : properties) {
501
				signstr += "[" + propsId.get(property)[pcount] + "]"; //$NON-NLS-1$ //$NON-NLS-2$
502
				//signstr+="["+propsValues.get(property)[pcount]+"]"; //$NON-NLS-1$ //$NON-NLS-2$
503
			}
504
			// allsignatures.put(position, sign);
505
			allsignaturesstr.put(position, signstr);
506
			pcount++;
507

  
508
		}
509
		return true;
510
	}
511

  
512
	/**
513
	 * Step count.
514
	 *
515
	 * @return true, if successful
516
	 */
517
	public boolean stepCount()
518
	{
519
		ArrayList<Integer> keepedPosition = new ArrayList<Integer>();
520

  
521
		int startsearchM2 = 0; // optimisation: m2 is ordered
522
		int startsearchM3 = 0; // optimisation: m3 is ordered
523
		//time = System.currentTimeMillis();
524
		for (Match m : m1) { // for each match = for each focus
525
			//System.out.println("for match m: "+m);
526
			Match n = null; // the match which contains the context
527
			Match o = null; // the match which does not contain the context
528
			boolean matchFound = false;
529
			for (int i = startsearchM2; i < m2.size(); i++) { // find n
530
				n = m2.get(i);
531
				if (n.getStart() <= m.getStart() && m.getEnd() <= n.getEnd()) {
532
					startsearchM2 = i;	
533
					matchFound = true;
534
					break;
535
				}
536
			}
537
			if (Thread.interrupted()) return false; // stop if interrupted by user
538
			//System.out.println("found n: "+n);
539

  
540
			for (int i = startsearchM3; i < m3.size(); i++)// find next match m3 contained by m2
541
			{
542
				o = m3.get(i);
543
				if (o.getStart() <= m.getStart() && m.getEnd() <= o.getEnd()) {
544
					startsearchM3 = i;
545
					matchFound = matchFound && true;
546
					break;
547
				}
548
			}
549
			if (Thread.interrupted()) return false; // stop if interrupted by user
550
			//System.out.println("found o: "+o);
551

  
552
			if (!matchFound) {
553
				//System.out.println(Messages.Cooccurrence_1 + n + " ; " + o); //$NON-NLS-1$ 
554
				continue;
555
			}
556

  
557
			int start = n.getStart();
558
			int size = n.getEnd() - start +1;
559
			//if (size > 0)
560
			//	size++;
561
			int ignore = 0;
562
			int chevauche = 0;
563
			// System.out.println("Process focus:"+m+" with maxcontext:"+n+" and anticontext:"+o);
564
			// System.out.println("NbOccs "+(size));
565
			int[] positions = new int[size];
566
			int noOcc = 0;
567

  
568
			//System.out.println("positions");
569
			// System.out.println("start: "+(start)+" end:"+n.getEnd());
570
			for (int position = start; position <= n.getEnd(); position++) 
571
				// creates the list of positions, anticontext not yet removed
572
			{
573
				positions[noOcc++] = position;
574
				//System.out.print(" "+position);
575
			}
576
			if (Thread.interrupted()) return false; // stop if interrupted by user
577
			//System.out.println();
578

  
579
			noOcc = 0;
580
			for (int position : positions) {
581
				// String signature = allsignatures.get(position);
582
				String signaturestr = allsignaturesstr.get(position);
583
				if (o.getStart() <= position && position <= o.getEnd()) { 
584
					// ignore positions in the anticontext positions
585
					ignore++;
586
					continue;
587
				}
588

  
589
				if (!distances.containsKey(signaturestr)) {
590
					distances.put(signaturestr, 0.0);
591
					distancescounts.put(signaturestr, 0);
592
				}
593

  
594
				if (counted.containsKey(position)) {
595
					// ignore positions already counted
596
					chevauche++;
597
					counted.put(position, counted.get(position) + 1);
598
					// continue;
599
				} else {
600
					counted.put(position, 1);
601
					if (!counts.containsKey(signaturestr))
602
						counts.put(signaturestr, 0);
603
					counts.put(signaturestr, (counts.get(signaturestr)) + 1);
604
					keepedPosition.add(position);
605

  
606
					double dist;
607
					if (position < m.getStart())
608
						dist = m.getStart() - position;
609
					else
610
						dist = position - m.getStart();
611

  
612
					distances.put(signaturestr, (distances.get(signaturestr))
613
							+ dist);
614
					distancescounts.put(signaturestr, (distancescounts
615
							.get(signaturestr)) + 1);
616

  
617
				}
618

  
619
				noOcc++;
620
				if (Thread.interrupted()) return false; // stop if interrupted by user
621
			}
622

  
623
			// System.out.println("nb Occ ignored: "+ignore);
624
			// System.out.println("nb Occ chevauche: "+chevauche);
625
		}
626
		//System.out.println("T counts : "+(System.currentTimeMillis()- time)); //$NON-NLS-1$
627
		
628
		allsignaturesstr = null; // no more need
629
		counted = null;
630
		return true;
631
	}
632

  
633
	/**
634
	 * Step build lexical table.
635
	 *
636
	 * @return true, if successful
637
	 * @throws RWorkspaceException the r workspace exception
638
	 */
639
	public boolean stepBuildLexicalTable() throws RWorkspaceException
640
	{
641
		String[] colnames = {
642
				corpus.getName() + "-" + query.getQueryString(), query.getQueryString() }; //$NON-NLS-1$
643
		keysToString = new HashMap<String, String>();
644

  
645
		//time = System.currentTimeMillis();
646
		Index voc = null;
647
		for (Object rez : corpus.getResults()) {//TODO: fix usages of index for cooc
648
			if (rez instanceof Index) {
649
				Index rezvoc = (Index) rez;
650
				if (rezvoc.getProperties().equals(properties)) {
651
					if (rezvoc.getQuery().equals(new Query(""+query_occ+""))) { //$NON-NLS-1$ //$NON-NLS-2$
652
						if (!rezvoc.isAltered()) {
653
							voc = rezvoc;
654
							break;
655
						}
656
					}
657
				}
658
			}
659
		}
660
		if (Thread.interrupted()) return false; // stop if interrupted by user
661

  
662
		if (voc == null) {
663
			if (properties.size() == 1 && "[]".equals(query_occ)) { //$NON-NLS-1$
664
				//System.out.println("build lexicon of props: "+properties);
665
				voc = new Index(corpus, properties.get(0));
666
			} else {
667
				//System.out.println("build index of props: "+properties);
668
				try {
669
					voc = new Index(corpus, new Query(query_occ), properties);
670
				} catch (Exception e){
671
					Log.severe("Error while computing Index for the cooccurrence: "+e.getLocalizedMessage());
672
					return false;
673
				}
674
			}
675
			corpus.storeResult(voc);
676
		}
677
		if (Thread.interrupted()) return false; // stop if interrupted by user
678

  
679
		// ALTER THE INDEX IF A REFERENCE CORPUS IS SET
680
		if(referenceCorpus != null && referenceCorpus.length() > 0) {
681
			//voc.toTxt(new File("/home/mdecorde/TEMP/before.tsv"), "UTF-8", "\t", "");
682
			try {
683
				voc.setIsAltered(true);
684
				String[] ref_forms = RWorkspace.getRWorkspaceInstance().eval("rownames("+referenceCorpus+")").asStrings(); //$NON-NLS-1$ //$NON-NLS-2$
685
				int[] ref_freqs = RWorkspace.getRWorkspaceInstance().eval(referenceCorpus+"[,1]").asIntegers(); //$NON-NLS-1$
686
				if (ref_forms.length != ref_freqs.length) {
687
					System.out.println(Messages.Cooccurrence_22);
688
					return false;
689
				}
690
				HashMap<String, Integer> ref_counts = new HashMap<String, Integer>();
691
				for(int i = 0 ; i < ref_forms.length ; i++) ref_counts.put(ref_forms[i], ref_freqs[i]);
692

  
693
				for (org.txm.functions.index.Line l : voc.getAllLines()) {
694
					String key = l.toString();
695
					if (ref_counts.containsKey(key)) {
696
						int[] f = {ref_counts.get(key)};
697
						l.setCounts(f, 0);
698
					}
699
				}
700
				//voc.toTxt(new File("/home/mdecorde/TEMP/after.tsv"), "UTF-8", "\t", "");
701
			} catch (REXPMismatchException e) {
702
				// TODO Auto-generated catch block
703
				org.txm.utils.logger.Log.printStackTrace(e);
704
				return false;
705
			}
706
		}
707
		if (Thread.interrupted()) return false; // stop if interrupted by user
708

  
709
		List<org.txm.functions.index.Line> vocLines = voc.getAllLines();
710
		int[][] freqs;
711
		String[] rownames;
712
		if (buildLexicalTableWithCooccurrents) {
713
			freqs = new int[counts.keySet().size()][2];
714
			rownames = new String[counts.keySet().size()];
715
		} else { // all words
716
			freqs = new int[vocLines.size()][2];
717
			rownames = new String[vocLines.size()];
718
		}
719

  
720
		int i = 0;
721
		//System.out.println("T voc : "+(System.currentTimeMillis()- time)); //$NON-NLS-1$
722
		// System.out.println("nb lines voc "+voclines.size());
723
		//System.out.println("counts keys: "+counts.keySet());
724
		for (org.txm.functions.index.Line l : vocLines) {
725
			//System.out.println("L sign '"+l.getSignature()+"'");
726
			if (counts.keySet().contains(l.getSignature())) {
727
				keysToString.put(l.toString(), l.getSignature());
728
				rownames[i] = l.toString();
729
				//System.out.println("set rowname: "+l.toString());
730
				//System.out.println("("+l.getSignature()+", "+l.toString()+") : "+l.getFrequency()+" - "+counts.get(l.getSignature()));
731
				int count = counts.get(l.getSignature());
732
				int tot = l.getFrequency();
733
				indexfreqs.put(l.toString(), tot);
734

  
735
				freqs[i][0] = tot - count;
736
				freqs[i][1] = count;
737
				i++;
738
			} else if (!buildLexicalTableWithCooccurrents) {
739
				keysToString.put(l.toString(), l.getSignature());
740
				rownames[i] = l.toString();
741
				//System.out.println("set rowname: "+l.toString());
742
				//System.out.println("("+l.getSignature()+", "+l.toString()+") : "+l.getFrequency()+" - "+counts.get(l.getSignature()));
743

  
744
				int tot = l.getFrequency();
745
				indexfreqs.put(l.toString(), tot);
746

  
747
				freqs[i][0] = tot;
748
				freqs[i][1] = 0;
749
				i++;
750
			}
751
		}
752
		if (Thread.interrupted()) return false; // stop if interrupted by user
753

  
754
		//time = System.currentTimeMillis();
755
		if (freqs.length == 0) {
756
			System.out.println(Messages.Cooccurrence_23);
757
		}
758
		//		try {
759
		//			PrintWriter writer = IOUtils.getWriter("/home/mdecorde/test_cooc.txt");
760
		//			//writer.println("Build LT: ");
761
		//			//writer.println("freqs: "+Arrays.toString(freqs));
762
		//			//writer.println("Rows: "+);
763
		//			//String rows = Arrays.toString();
764
		//			int nrow = rownames.length;
765
		//			int ncol = colnames.length;
766
		//			for (int ii = 0 ; ii < nrow ; ii++) {
767
		//				writer.write(rownames[ii]);
768
		//				for (int j = 0 ; j < ncol ; j++) {
769
		//					writer.write("\t"+freqs[ii][j]);
770
		//				}
771
		//				writer.write("\n");
772
		//			}
773
		//			writer.close();
774
		//			//writer.println("Cols: "+Arrays.toString(colnames));
775
		//		} catch(Exception e) {e.printStackTrace();}
776

  
777
		lt = LexicalTableImpl.createLexicalTable(freqs, properties.get(0), rownames, colnames, 1);
778
		lt.setCorpus(corpus);
779

  
780
		//		if(referenceCorpus != null && referenceCorpus.length() > 0) {
781
		//			//lt.removeCol(0, false);
782
		//			lt.setReference(referenceCorpus);
783
		//			lt.exchangeColumns(1,2);
784
		//		}
785
		return true;
786
	}
787

  
788
	/**
789
	 * Step get scores.
790
	 *
791
	 * @return true, if successful
792
	 * @throws CqiClientException the cqi client exception
793
	 * @throws StatException the stat exception
794
	 */
795
	public boolean stepGetScores() throws CqiClientException, StatException
796
	{
797
		SpecificitesResult specif = Specificites.specificites(lt, 1000);
798
		//System.out.println("Specif N part: "+specif.getNbrPart()); //$NON-NLS-1$
799
		//System.out.println("Specif N lines number: "+specif.getSpecificitesIndex().length); //$NON-NLS-1$
800
		//System.out.println("T specif e: "+(System.currentTimeMillis()- time)); //$NON-NLS-1$
801
		//specif.toTxt(new File("~/Bureau/coocresults/specif Cooc")); //$NON-NLS-1$
802
		String[] specifrownames = specif.getTypeNames();
803
		double[][] scores = specif.getSpecificitesIndex();
804
		//System.out.println("Nb specif result: "+specif.getSpecificitesIndex().length);
805

  
806
		int iimax = Math.min(specifrownames.length, scores.length);
807
		for (int ii = 0; ii < iimax; ii++) { // counts.keySet())
808
			String signaturestr = keysToString.get(specifrownames[ii]);
809

  
810
			ArrayList<String> props = new ArrayList<String>();
811
			if (properties.size() > 1) {
812
				String[] splited = specifrownames[ii].split("_", properties.size()); //$NON-NLS-1$
813

  
814
				for (int p = 0; p < properties.size(); p++) {
815
					props.add(splited[p]);
816
				}
817
			} else {
818
				props.add(specifrownames[ii]);
819
			}
820
			//			if(specifrownames[ii].equals("(") || specifrownames[ii].equals(")"))
821
			//			{
822
			//				System.out.println("rowname: "+specifrownames[ii]);
823
			//				System.out.println("props: "+props);
824
			//				System.out.println("counts: "+counts.get(signaturestr));
825
			//				System.out.println("speciffreq: "+indexfreqs.get(specifrownames[ii]));
826
			//				System.out.println("specif score: "+scores[ii][1]);
827
			//				System.out.println("distance: "+distances.get(signaturestr));
828
			//				System.out.println("distance count: "+distancescounts.get(signaturestr));
829
			//			}
830
			if (counts.containsKey(signaturestr)) {
831
				CLine cline = new CLine(this, specifrownames[ii], props, counts
832
						.get(signaturestr), indexfreqs.get(specifrownames[ii]), scores[ii][1],
833
						((float) (distances.get(signaturestr) / distancescounts
834
								.get(signaturestr))) - 1.0f, -1);
835
				//System.out.println(cline);
836
				if (cline.freq >= this.minf	&& cline.nbocc >= this.mincof && Math.abs(cline.score) >= this.minscore)
837
				{
838
					if (cline.score >= Integer.MAX_VALUE - 5)
839
						cline.score = Float.MAX_EXPONENT;
840
					lines.add(cline);
841
				}
842
			}
843
			//System.out.println(signaturestr+"\t"+voclines.get(ii).getFrequency()+"\t"+counts.get(signaturestr)+"\t"+(distances.get(signaturestr)/distancescounts.get(signaturestr))+"\t"+scores[ii][1]); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
844
		}
845
		return true;
846
	}
847

  
848
	/**
849
	 * Sort.
850
	 *
851
	 * @param comparator the comparator
852
	 * @throws CqiClientException the cqi client exception
853
	 */
854
	public void sort(CLineComparator comparator) throws CqiClientException {
855
		comparator.initialize(corpus);
856
		if (lines.size() > 0) {
857
			Collections.sort(lines, comparator);
858
		}
859
	}
860

  
861
	/**
862
	 * Inits the conc infos.
863
	 *
864
	 * @param conc the conc
865
	 * @return true, if successful
866
	 */
867
	private boolean initConcInfos(Concordance conc) {
868
		int from = 0;
869
		int to = conc.getNLines() - 1;
870
		return initConcInfos(conc, from, to);
871
	}
872

  
873
	/**
874
	 * Inits the conc infos.
875
	 *
876
	 * @param conc the conc
877
	 * @param from the from
878
	 * @param to the to
879
	 * @return true, if successful
880
	 */
881
	private boolean initConcInfos(Concordance conc, int from, int to) {
882
		try {
883
			this.conc = conc;
884
			conclines = conc.getLines(from, to);
885
			return initConcInfos(conc, conclines);
886
		} catch (Exception e) {
887
			org.txm.utils.logger.Log.printStackTrace(e);
888
		}
889
		return false;
890
	}
891

  
892
	/**
893
	 * Inits the conc infos.
894
	 *
895
	 * @param conc the conc
896
	 * @param conclines the conclines
897
	 * @return true, if successful
898
	 */
899
	private boolean initConcInfos(Concordance conc, List<Line> conclines) {
900
		try {
901
			this.conc = conc;
902
			corpus = conc.getCorpus();
903
			this.P = corpus.getSize()
904
					/ (conc.getLeftContextSize() + conc.getRightContextSize());
905
			this.FA = conc.getNLines();
906
			corpus = conc.getCorpus();
907
			query = conc.getQuery();
908
			// System.out.println("CONC PROPS :"+conc.getViewProperties());
909
			properties = conc.getViewProperties();
910
			limit = null;
911
			maxleft = conc.getLeftContextSize();
912
			maxright = conc.getRightContextSize();
913
			this.conclines = conclines;
914
			return true;
915
		} catch (CqiClientException e) {
916
			org.txm.utils.logger.Log.printStackTrace(e);
917
		}
918
		return false;
919
	}
920

  
921
	/**
922
	 * Sets the thresfold.
923
	 *
924
	 * @param freq the freq
925
	 * @param count the count
926
	 * @param score the score
927
	 */
928
	public void setThresfold(int freq, int count, double score) {
929
		seuil_freq = freq;
930
		seuil_count = count;
931
		seuil_score = score;
932
	}
933

  
934
	/**
935
	 * Compute.
936
	 *
937
	 * @return true, if successful
938
	 */
939
	public boolean compute() {
940
		if (conclines == null || conc == null || P == -1 || FA == -1
941
				|| properties == null || corpus == null) {
942
			System.err.println(Messages.Cooccurrence_5 + conc);
943
			return false;
944
		}
945
		// System.out.println("seuils: freq="+seuil_freq+", count="+seuil_count+", score="+seuil_score);
946
		String queryfreq = "["; //$NON-NLS-1$
947
		for (Property p : properties) {
948
			queryfreq += " " + p.getName() + "=\".*\" |"; //$NON-NLS-1$ //$NON-NLS-2$
949
		}
950
		queryfreq = queryfreq.substring(0, queryfreq.length() - 1) + "]"; //$NON-NLS-1$
951
		try {
952
			voc = new Index(corpus, new Query(queryfreq), properties);
953
		} catch (Exception e){
954
			Log.severe("Error while computing Index for the cooccurrence: "+e.getLocalizedMessage());
955
			return false;
956
		}
957
		// voc.getCorpus().storeResult(voc);
958

  
959
		count = new HashMap<String, Integer>(voc.getV());
960
		dist = new HashMap<String, Float>(voc.getV());
961
		freq = new HashMap<String, Integer>(voc.getV());
962
		scores = new HashMap<String, Double>(voc.getV());
963
		occproperties = new HashMap<String, List<String>>(voc.getV());
964

  
965
		// System.out.println("Lignes");
966
		// System.out.println("leftC\tKeyword\trightC");
967
		for (Line concline : conclines) {
968
			// System.out.println(concline.leftContextToString()+"\t"+concline.keywordToString()+"\t"+concline.rightContextToString());
969
			Map<Property, List<String>> propsvalue = concline
970
					.getLeftCtxViewProperties();
971
			countOcc(propsvalue, false);
972
			propsvalue = concline.getRightCtxViewProperties();
973
			countOcc(propsvalue, true);
974
		}
975

  
976
		for (String cooc : count.keySet()) {
977
			if (count.get(cooc) >= seuil_count && freq.get(cooc) >= seuil_freq) {
978
				// System.out.println("compute dist et score de : "+cooc);
979
				// calcul dist moyenne avant de rectif ier count
980
				dist.put(cooc, dist.get(cooc) / count.get(cooc));
981

  
982
				if (count.get(cooc) > freq.get(cooc))
983
					count.put(cooc, freq.get(cooc));
984

  
985
				// calcul score du cooc
986
				double score = ProbaBinom(FA, freq.get(cooc), P,
987
						count.get(cooc));
988
				if (score >= seuil_score) {
989
					long mode = calcmode(FA, freq.get(cooc), P, count.get(cooc));
990
					CLine line = new CLine(this, cooc, occproperties.get(cooc),
991
							count.get(cooc), freq.get(cooc), score, dist
992
							.get(cooc), mode);
993
					lines.add(line);
994
				}
995
			}
996
		}
997

  
998
		// clear memory
999
		count = null;
1000
		dist = null;
1001
		freq = null;
1002
		scores = null;
1003
		occproperties = null;
1004

  
1005
		return true;
1006
	}
1007

  
1008
	/**
1009
	 * Prints the.
1010
	 */
1011
	public void print() {
1012
		// System.out.println("Lines "+lines.size());
1013
		System.out.println("FA : " + FA); //$NON-NLS-1$
1014
		System.out.println("P : " + P); //$NON-NLS-1$
1015
		System.out.println(Messages.Cooccurrence_6);
1016
		for (CLine line : lines)
1017
			System.out.println(line.resume("\t", "")); //$NON-NLS-1$ //$NON-NLS-2$
1018
	}
1019

  
1020
	/**
1021
	 * Count occ.
1022
	 *
1023
	 * @param propsvalue the propsvalue
1024
	 * @param rightcontext the rightcontext
1025
	 */
1026
	public void countOcc(Map<Property, List<String>> propsvalue,
1027
			boolean rightcontext) {
1028
		// System.out.println("countOcc (R="+rightcontext+") "+propsvalue);
1029
		Property key = properties.get(0);
1030
		List<String> iterationlist = propsvalue.get(key);
1031
		for (int i = 0; i < iterationlist.size(); i++)// pr chq mot
1032
		{
1033
			// build occ
1034
			String occ = ""; //$NON-NLS-1$
1035
			for (Property p : properties)
1036
				occ += propsvalue.get(p).get(i) + "_"; //$NON-NLS-1$
1037
			occ = occ.substring(0, occ.length() - 1);
1038

  
1039
			if (occproperties.get(occ) == null) {
1040
				ArrayList<String> values = new ArrayList<String>();
1041
				for (Property p : properties)
1042
					values.add(propsvalue.get(p).get(i));
1043
				occproperties.put(occ, values);
1044
			}
1045

  
1046
			// System.out.println("occ '"+occ+"'");
1047
			// update nbocc
1048
			if (count.get(occ) == null)
1049
				count.put(occ, 0);
1050
			count.put(occ, count.get(occ) + 1);
1051

  
1052
			// update dist
1053
			if (dist.get(occ) == null)
1054
				dist.put(occ, 0.0f);
1055
			if (rightcontext) {
1056
				dist.put(occ, dist.get(occ) + i + 1);
1057
			} else {
1058
				dist.put(occ, dist.get(occ) + iterationlist.size() - i);
1059
			}
1060

  
1061
			// update freq
1062
			if (freq.get(occ) == null) {
1063
				// System.out.println("compute freq of "+occ);
1064
				int occfreq = -1; // calcul avec l'index
1065
				for (org.txm.functions.index.Line l : voc
1066
						.getLines(0, voc.getV())) {
1067
					if (l.toString().equals(occ)) {
1068
						// System.out.println("FOUND "+occ);
1069
						occfreq = l.getFrequency();
1070
						break;
1071
					}
1072
				}
1073
				freq.put(occ, occfreq);
1074
			}
1075
		}
1076
	}
1077

  
1078
	/**
1079
	 * The Class CLine.
1080
	 */
1081
	public class CLine {
1082

  
1083
		/** The cooc. */
1084
		Cooccurrence cooc;
1085

  
1086
		/** The occ. */
1087
		public String occ;
1088

  
1089
		/** The freq. */
1090
		public int freq;
1091

  
1092
		/** The nbocc. */
1093
		public int nbocc;
1094

  
1095
		/** The score. */
1096
		public double score;
1097

  
1098
		/** The distmoyenne. */
1099
		public float distmoyenne;
1100

  
1101
		/** The mode. */
1102
		public long mode;
1103

  
1104
		/** The props. */
1105
		public List<String> props;
1106

  
1107
		/** The id. */
1108
		public int id;
1109

  
1110
		/* (non-Javadoc)
1111
		 * @see java.lang.Object#toString()
1112
		 */
1113
		@Override
1114
		public String toString()
1115
		{
1116
			return occ+Messages.Cooccurrence_4+freq+Messages.Cooccurrence_7+nbocc+Messages.Cooccurrence_8+score+Messages.Cooccurrence_9+distmoyenne+Messages.Cooccurrence_10+props;
1117
		}
1118

  
1119
		/**
1120
		 * Instantiates a new c line.
1121
		 *
1122
		 * @param cooc the cooc
1123
		 * @param occ the occ
1124
		 * @param props the props
1125
		 * @param nbocc the nbocc
1126
		 * @param freq the freq
1127
		 * @param score the score
1128
		 * @param distmoyenne the distmoyenne
1129
		 * @param mode the mode
1130
		 */
1131
		public CLine(Cooccurrence cooc, String occ, List<String> props,
1132
				int nbocc, int freq, double score, Float distmoyenne, long mode) {
1133
			this.occ = occ;
1134
			this.props = props;
1135
			this.freq = freq;
1136
			this.nbocc = nbocc;
1137
			this.score = score;
1138
			this.distmoyenne = distmoyenne;
1139
			this.mode = mode;
1140
			this.cooc = cooc;
1141
		}
1142

  
1143
		/**
1144
		 * Sets the count and dist.
1145
		 *
1146
		 * @param count the count
1147
		 * @param dist the dist
1148
		 */
1149
		public void setCountAndDist(int count, int dist) {
1150
			this.nbocc = count;
1151
			this.distmoyenne = dist;
1152
		}
1153

  
1154
		/**
1155
		 * Adds the txt sep.
1156
		 *
1157
		 * @param str the str
1158
		 * @param sep the sep
1159
		 * @return the string
1160
		 */
1161
		private String addTxtSep(String str, String sep)
1162
		{
1163
			return sep+str.replace(sep, sep+sep)+sep;
1164
		}
1165

  
1166
		/**
1167
		 * Resume.
1168
		 *
1169
		 * @param colseparator the colseparator
1170
		 * @param txtseparator the txtseparator
1171
		 * @return the string
1172
		 */
1173
		public String resume(String colseparator, String txtseparator) {
1174
			return addTxtSep(""+occ, txtseparator) //$NON-NLS-1$
1175
					+ colseparator + freq + colseparator + nbocc + colseparator + score + colseparator + distmoyenne; 
1176
		}
1177

  
1178
		/**
1179
		 * Sets the freq.
1180
		 *
1181
		 * @param freq the new freq
1182
		 */
1183
		public void setFreq(int freq) {
1184
			this.freq = freq;
1185
		}
1186

  
1187
		/**
1188
		 * Sets the score.
1189
		 */
1190
		public void setScore()// FB == freq, R = nbocc
1191
		{
1192
			this.score = (this.nbocc + this.freq + this.distmoyenne);
1193
		}
1194

  
1195
		/**
1196
		 * Gets the cooc.
1197
		 *
1198
		 * @return the cooc
1199
		 */
1200
		public Cooccurrence getCooc() {
1201
			return cooc;
1202
		}
1203
	}
1204

  
1205
	/**
1206
	 * Calcmode.
1207
	 *
1208
	 * @param inf the inf
1209
	 * @param ing the ing
1210
	 * @param s the s
1211
	 * @param cf the cf
1212
	 * @return the long
1213
	 */
1214
	long calcmode(long inf, long ing, long s, long cf) {
1215
		return (long) Math.floor((double) ((inf + 1) * (ing + 1))
1216
				/ (inf + ing + s + 2));
1217
	}
1218

  
1219
	// Maths
1220
	/**
1221
	 * Proba binom.
1222
	 *
1223
	 * @param inf the inf
1224
	 * @param ing the ing
1225
	 * @param s the s
1226
	 * @param cf the cf
1227
	 * @return the double
1228
	 */
1229
	public static double ProbaBinom(long inf, long ing, long s, long cf) {
1230
		long mode = (long) Math.floor((double) ((inf + 1) * (ing + 1))
1231
				/ (inf + ing + s + 2));
1232
		long f, g, k;
1233
		double P = 0.0, p, lp, dnm;
1234
		int lpflag = 1;
1235

  
1236
		if (cf <= mode)
1237
			return (1.0);
1238

  
1239
		if (inf < ing) {
1240
			f = inf;
1241
			g = ing;
1242
		} else {
1243
			f = ing;
1244
			g = inf;
1245
		}
1246
		;
1247

  
1248
		dnm = rbicoln((int) (f + g + s), (int) g);
1249
		for (k = cf; k <= f; k++) {
1250
			p = Math.exp(rbicoln((int) f, (int) k)
1251
					+ rbicoln((int) (s + g), (int) (g - k)) - dnm);
1252
			// if (lpflag > 0) { // rentre dedans de toute facon
1253
			lp = p;
1254
			lpflag = 0;
1255
			// };
1256
			if ((lp / p) < maxprec)
1257
				break;
1258
			lp = p;
1259
			P += p;
1260
		}
1261
		;
1262

  
1263
		return (P);
1264

  
1265
		/*
1266
		 * for (k = 0; k < cf; k++) { p = exp(rbicoln(f,
1267
		 * k)+rbicoln(s+g,g-k)-rbicoln(f+g+s, g)); P += p; }; return(fabs(1-P));
1268
		 */
1269

  
1270
	}
1271

  
1272
	/** The arbicoln. */
1273
	static double[][] arbicoln = new double[101][101];
1274

  
1275
	/** The cof. */
1276
	static double[] cof = { 76.18009172947146, -86.50532032941677,
1277
		24.01409824083091, -1.231739572450155, 0.1208650973866179e-2,
1278
		-0.5395239384953e-5 };
1279

  
1280
	/** The a. */
1281
	static double[] a = new double[101];
1282

  
1283
	/** The maxprec. */
1284
	static double maxprec = 0.0;
1285

  
1286
	static {// void init_rbicoln()
1287
		int i, j;
1288

  
1289
		for (i = 0; i < 101; i++)
1290
			for (j = 0; j < 101; j++)
1291
				arbicoln[i][j] = -1;
1292
	}
1293

  
1294
	/**
1295
	 * Gammln.
1296
	 *
1297
	 * @param xx the xx
1298
	 * @return the double
1299
	 */
1300
	static double gammln(double xx) {
1301
		double x, y, tmp, ser;
1302

  
1303
		int j;
1304

  
1305
		y = x = xx;
1306
		tmp = x + 5.5;
1307
		tmp -= (x + 0.5) * Math.log(tmp);
1308
		ser = 1.000000000190015;
1309
		for (j = 0; j <= 5; j++)
1310
			ser += cof[j] / ++y;
1311
		return -tmp + Math.log(2.5066282746310005 * ser / x);
1312
	}
1313

  
1314
	/**
1315
	 * Factln.
1316
	 *
1317
	 * @param n the n
1318
	 * @return the double
1319
	 */
1320
	static double factln(int n) {
1321
		if (n < 0)
1322
			System.err.println(Messages.Cooccurrence_102);
1323
		if (n <= 1)
1324
			return 0.0;
1325
		if (n <= 100)
1326
			return (a[n] > 0) ? a[n] : (a[n] = gammln(n + 1.0));
1327
			else
1328
				return gammln(n + 1.0);
1329
	}
1330

  
1331
	/**
1332
	 * Rbicoln.
1333
	 *
1334
	 * @param n the n
1335
	 * @param k the k
1336
	 * @return the double
1337
	 */
1338
	static double rbicoln(int n, int k) {
1339
		if (n < 0)
1340
			System.err.println(Messages.Cooccurrence_103);
1341
		if (k < 0)
1342
			System.err.println(Messages.Cooccurrence_103);
1343
		if (n <= 100 && k <= 100)
1344
			return (arbicoln[n][k] >= 0) ? arbicoln[n][k]
1345
					: (arbicoln[n][k] = factln(n) - factln(k) - factln(n - k));
1346
			else
1347
				return factln(n) - factln(k) - factln(n - k);
1348
	}
1349

  
1350
	/**
1351
	 * Gets the lines.
1352
	 *
1353
	 * @return the lines
1354
	 */
1355
	public List<CLine> getLines() {
1356
		return lines;
1357
	}
1358

  
1359
	int numberOfCooccurrents = -1;
1360

  
1361
	/**
1362
	 * Gets the lines.
1363
	 *
1364
	 * @return the lines
1365
	 */
1366
	public int getNumberOfCooccurrents() {
1367
		if (numberOfCooccurrents == -1) {
1368
			numberOfCooccurrents = 0;
1369
			for (CLine line : lines)
1370
				numberOfCooccurrents += line.nbocc;
1371
		}
1372
		return numberOfCooccurrents;
1373
	}
1374

  
1375
	/**
1376
	 * Gets the lines.
1377
	 *
1378
	 * @return the lines
1379
	 */
1380
	public int getNumberOfDifferentCooccurrents() {
1381
		if (lines != null) return lines.size();
1382
		return 0;
1383
	}
1384
	
1385
	/**
1386
	 * Gets the corpus.
1387
	 *
1388
	 * @return the corpus
1389
	 */
1390
	public Corpus getCorpus() {
1391
		return corpus;
1392
	}
1393

  
1394
	/**
1395
	 * Gets the query.
1396
	 *
1397
	 * @return the query
1398
	 */
1399
	public Query getQuery() {
1400
		return query;
1401
	}
1402

  
1403
	/**
1404
	 * To txt.
1405
	 *
1406
	 * @param outfile the outfile
1407
	 * @param encoding the encoding
1408
	 * @return true, if successful
1409
	 */
1410
	public boolean toTxt(File outfile, String encoding) {
1411
		return toTxt(outfile, encoding, "\t", ""); //$NON-NLS-1$ //$NON-NLS-2$
1412
	}
1413

  
1414
	/**
1415
	 * To txt.
1416
	 *
1417
	 * @param outfile the outfile
1418
	 * @param encoding the encoding
1419
	 * @param colseparator the colseparator
1420
	 * @param txtseparator the txtseparator
1421
	 * @return true, if successful
1422
	 */
1423
	public boolean toTxt(File outfile, String encoding, String colseparator, String txtseparator) {
1424
		try {
1425
			// NK: writer declared as class attribute to perform a clean if the operation is interrupted
1426
			this.writer = new BufferedWriter(new OutputStreamWriter(
1427
					new FileOutputStream(outfile), encoding)); 
1428
			// if ("UTF-8".equals(encoding)) writer.write('\ufeff'); // UTF-8 BOM
1429
			toTxt(writer, colseparator, txtseparator);
1430
		} catch (Exception e) {
1431
			Log.severe(Messages.Cooccurrence_107 + e);
1432
			return false;
1433
		}
1434
		return true;
1435
	}
1436

  
1437
	/**
1438
	 * To txt.
1439
	 *
1440
	 * @param writer the writer
1441
	 * @param colseparator the colseparator
1442
	 * @param txtseparator the txtseparator
1443
	 * @return true, if successful
1444
	 */
1445
	public boolean toTxt(Writer writer, String colseparator, String txtseparator) {
1446
		try {
1447
			//Occ	Freq	CoFreq	Score	MeanDist
1448
			writer.write( txtseparator+Messages.Cooccurrence_11+txtseparator+colseparator+
1449
					txtseparator+Messages.Cooccurrence_12+txtseparator+colseparator+
1450
					txtseparator+Messages.Cooccurrence_13+txtseparator+colseparator+
1451
					txtseparator+Messages.Cooccurrence_14+txtseparator+colseparator+
1452
					txtseparator+Messages.Cooccurrence_15+txtseparator+"\n");//colseparator+ //$NON-NLS-1$
1453
			//txtseparator+Messages.Cooccurrence_16+txtseparator+"\n"); //$NON-NLS-1$
1454
			for (CLine line : lines) 
1455
				writer.write(line.resume(colseparator, txtseparator) + "\n"); //$NON-NLS-1$
1456
			writer.close();
1457
		} catch (IOException e) {
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff