Révision 2948
tmp/org.txm.cooccurrence.core/src/org/txm/cooccurrence/core/functions/Cooccurrence.java (revision 2948) | ||
---|---|---|
62 | 62 |
import org.txm.searchengine.cqp.corpus.QueryResult; |
63 | 63 |
import org.txm.searchengine.cqp.corpus.StructuralUnit; |
64 | 64 |
import org.txm.searchengine.cqp.corpus.StructuralUnitProperty; |
65 |
import org.txm.searchengine.cqp.corpus.VirtualProperty; |
|
65 | 66 |
import org.txm.searchengine.cqp.corpus.WordProperty; |
66 | 67 |
import org.txm.searchengine.cqp.corpus.query.Match; |
67 | 68 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery; |
... | ... | |
81 | 82 |
* |
82 | 83 |
*/ |
83 | 84 |
public class Cooccurrence extends TXMResult { |
84 |
|
|
85 |
|
|
85 | 86 |
/** The nocooc. */ |
86 | 87 |
protected static int nocooc = 1; |
87 |
|
|
88 |
|
|
88 | 89 |
/** The prefix r. */ |
89 | 90 |
protected static String prefixR = "Cooccurrences"; //$NON-NLS-1$ |
90 |
|
|
91 |
|
|
91 | 92 |
/** The allsignaturesstr. */ |
92 | 93 |
private HashMap<Integer, String> allsignaturesstr; |
93 |
|
|
94 |
|
|
94 | 95 |
/** The anticontextquery. */ |
95 | 96 |
private CQLQuery anticontextquery; |
96 |
|
|
97 |
|
|
97 | 98 |
private boolean buildLexicalTableWithCooccurrents; |
98 |
|
|
99 |
|
|
99 | 100 |
/** The conclines. */ |
100 | 101 |
List<org.txm.concordance.core.functions.Line> conclines; |
101 |
|
|
102 |
|
|
102 | 103 |
/** The conc. */ |
103 | 104 |
Concordance concordance; |
104 |
|
|
105 |
|
|
105 | 106 |
/** The contextquery. */ |
106 | 107 |
private CQLQuery contextQuery; |
107 |
|
|
108 |
|
|
108 | 109 |
/** The count. */ |
109 | 110 |
HashMap<String, Integer> count; |
110 |
|
|
111 |
|
|
111 | 112 |
// contains the sum of distances |
112 | 113 |
/** The counts. */ |
113 | 114 |
HashMap<String, Integer> counts = new HashMap<>(); |
114 |
|
|
115 |
|
|
115 | 116 |
/** The dist. */ |
116 | 117 |
HashMap<String, Float> dist; |
117 |
|
|
118 |
|
|
118 | 119 |
// System.out.println("Matches: focus: "+m1.size()+" full: "+m2.size()+" anti: "+m3.size()); |
119 | 120 |
// System.out.println("T matches : "+(System.currentTimeMillis()- time)); //$NON-NLS-1$ |
120 | 121 |
/** The distances. */ |
121 | 122 |
HashMap<String, Double> distances = new HashMap<>(); |
122 |
|
|
123 |
|
|
123 | 124 |
/** The FA. */ |
124 | 125 |
int FA = -1; |
125 |
|
|
126 |
|
|
126 | 127 |
/** The freq. */ |
127 | 128 |
HashMap<String, Integer> freq; |
128 |
|
|
129 |
|
|
129 | 130 |
/** The index. */ |
130 | 131 |
Index index; |
131 |
|
|
132 |
|
|
132 | 133 |
/** The indexfreqs. */ |
133 | 134 |
HashMap<String, Integer> indexfreqs = new HashMap<>(); |
134 |
|
|
135 |
|
|
135 | 136 |
/** The keys to string. */ |
136 | 137 |
private HashMap<String, String> keysToString; |
137 |
|
|
138 |
|
|
138 | 139 |
/** The lines. */ |
139 | 140 |
List<CLine> lines = new ArrayList<>(); |
140 |
|
|
141 |
|
|
141 | 142 |
/** The lt. */ |
142 | 143 |
private LexicalTableImpl lt; |
143 |
|
|
144 |
|
|
144 | 145 |
/** The m1. */ |
145 | 146 |
private List<Match> m1; |
146 |
|
|
147 |
|
|
147 | 148 |
/** The m2. */ |
148 | 149 |
private List<Match> m2; |
149 |
|
|
150 |
|
|
150 | 151 |
/** The m3. */ |
151 | 152 |
private List<Match> m3; |
152 |
|
|
153 |
|
|
153 | 154 |
int numberOfCooccurrents = -1; |
154 |
|
|
155 |
|
|
155 | 156 |
/** The number of keyword. */ |
156 | 157 |
int numberOfKeyword = 0; |
157 |
|
|
158 |
|
|
158 | 159 |
/** The occproperties. */ |
159 | 160 |
HashMap<String, List<String>> occproperties; |
160 |
|
|
161 |
|
|
161 | 162 |
/** The P. */ |
162 | 163 |
int P = -1; |
163 |
|
|
164 |
|
|
164 | 165 |
/** The reference corpus to use = the R symbol that point to a matrix WordxFreqs. */ |
165 | 166 |
String referenceCorpus; |
166 |
|
|
167 |
|
|
167 | 168 |
/** The scores. */ |
168 | 169 |
HashMap<String, Double> scores; |
169 |
|
|
170 |
|
|
170 | 171 |
/** The symbol. */ |
171 | 172 |
private String symbol; |
172 |
|
|
173 |
|
|
173 | 174 |
/** The writer. */ |
174 | 175 |
private BufferedWriter writer; |
175 |
|
|
176 |
|
|
176 | 177 |
@Parameter(key = CooccurrencePreferences.QUERY_FILTER) |
177 | 178 |
protected String pCooccurentQueryFilter = "[]"; //$NON-NLS-1$ |
178 |
|
|
179 |
|
|
179 | 180 |
/** The mincof. */ |
180 | 181 |
@Parameter(key = CooccurrencePreferences.MIN_COUNT) |
181 | 182 |
protected Integer pFCoocFilter; |
182 |
|
|
183 |
|
|
183 | 184 |
/** The minf. */ |
184 | 185 |
@Parameter(key = TXMPreferences.F_MIN) |
185 | 186 |
protected Integer pFminFilter; |
186 |
|
|
187 |
|
|
187 | 188 |
/** The include xpivot. */ |
188 | 189 |
@Parameter(key = CooccurrencePreferences.INCLUDE_X_PIVOT) |
189 | 190 |
protected Boolean pIncludeXpivot; |
190 |
|
|
191 |
|
|
191 | 192 |
/** The maxleft. */ |
192 | 193 |
@Parameter(key = CooccurrencePreferences.MAX_LEFT) |
193 | 194 |
protected Integer pMaxLeftContextSize; |
194 |
|
|
195 |
|
|
195 | 196 |
/** The maxright. */ |
196 | 197 |
@Parameter(key = CooccurrencePreferences.MAX_RIGHT) |
197 | 198 |
protected Integer pMaxRightContextSize; |
198 |
|
|
199 |
|
|
199 | 200 |
/** The minleft. */ |
200 | 201 |
@Parameter(key = CooccurrencePreferences.MIN_LEFT) |
201 | 202 |
protected Integer pMinLeftContextSize; |
202 |
|
|
203 |
|
|
203 | 204 |
/** The minright. */ |
204 | 205 |
@Parameter(key = CooccurrencePreferences.MIN_RIGHT) |
205 | 206 |
protected Integer pMinRightContextSize; |
206 |
|
|
207 |
|
|
207 | 208 |
/** The cooccurrents properties to display. */ |
208 | 209 |
@Parameter(key = CooccurrencePreferences.UNIT_PROPERTIES) |
209 | 210 |
protected List<WordProperty> pProperties; |
210 |
|
|
211 |
|
|
211 | 212 |
/** The keyword query. */ |
212 | 213 |
@Parameter(key = CooccurrencePreferences.QUERY) |
213 | 214 |
protected CQLQuery pQuery; |
214 |
|
|
215 |
|
|
215 | 216 |
/** The minscore. */ |
216 | 217 |
@Parameter(key = CooccurrencePreferences.MIN_SCORE) |
217 | 218 |
protected Float pScoreMinFilter; |
218 |
|
|
219 |
|
|
219 | 220 |
/** |
220 | 221 |
* The structural unit context limit. |
221 | 222 |
* If null then the unit property is used. |
222 | 223 |
*/ |
223 | 224 |
@Parameter(key = CooccurrencePreferences.STRUCTURAL_UNIT_LIMIT, electric = false) |
224 | 225 |
protected StructuralUnit pStructuralUnitLimit; |
225 |
|
|
226 |
|
|
226 |
|
|
227 |
|
|
227 | 228 |
/** |
228 | 229 |
* Creates a not computed cooccurrence from the specified corpus. |
229 | 230 |
* |
... | ... | |
232 | 233 |
public Cooccurrence(CQPCorpus parent) { |
233 | 234 |
// no state is initialized here: everything is delegated to the TXMResult constructor
super(parent); |
234 | 235 |
} |
235 |
|
|
236 |
|
|
236 | 237 |
/** |
237 | 238 |
* Creates a not computed cooccurrence from a parameters node. |
238 | 239 |
* |
... | ... | |
241 | 242 |
public Cooccurrence(String parametersNodePath) { |
242 | 243 |
// no state is initialized here: the node path is handed to the TXMResult constructor
super(parametersNodePath); |
243 | 244 |
} |
244 |
|
|
245 |
|
|
246 |
|
|
245 |
|
|
246 |
|
|
247 |
|
|
247 | 248 |
@Override |
248 | 249 |
protected boolean _compute(TXMProgressMonitor monitor) throws CqiClientException, IOException, CqiServerError, StatException { |
249 | 250 |
// System.out.println("cooc: "+corpus+" "+query+" "+properties+" "+limit+" "+maxLeft+" "+minLeft+" "+minRight+" "+maxRight+" "+minFreq+" "+minCof+" "+minScore+" "+includeXpivot); |
250 |
|
|
251 |
|
|
251 | 252 |
monitor.setTask(CooccurrenceCoreMessages.info_buildingQueries); |
252 |
|
|
253 |
|
|
253 | 254 |
// clear data |
254 | 255 |
try { |
255 | 256 |
this.numberOfCooccurrents = -1; |
... | ... | |
266 | 267 |
} |
267 | 268 |
catch (Exception e) { |
268 | 269 |
} |
269 |
|
|
270 |
|
|
270 | 271 |
if (!this.stepQueryLimits()) { |
271 | 272 |
return false; |
272 | 273 |
} |
273 |
|
|
274 |
|
|
274 | 275 |
monitor.setTask(CooccurrenceCoreMessages.info_retreivingMatches); |
275 | 276 |
if (!this.stepGetMatches()) { |
276 | 277 |
return false; |
277 | 278 |
} |
278 | 279 |
monitor.worked(20); |
279 |
|
|
280 |
|
|
280 | 281 |
monitor.setTask(CooccurrenceCoreMessages.info_buildingLineSignatures); |
281 | 282 |
if (!this.stepBuildSignatures()) { |
282 | 283 |
return false; |
283 | 284 |
} |
284 | 285 |
monitor.worked(20); |
285 |
|
|
286 |
|
|
286 | 287 |
monitor.setTask(CooccurrenceCoreMessages.info_counting); |
287 | 288 |
if (!this.stepCount()) { |
288 | 289 |
return false; |
289 | 290 |
} |
290 | 291 |
monitor.worked(20); |
291 |
|
|
292 |
|
|
292 | 293 |
monitor.setTask(CooccurrenceCoreMessages.info_buildingLexicalTable); |
293 | 294 |
if (!this.stepBuildLexicalTable(monitor)) { |
294 | 295 |
return false; |
295 | 296 |
} |
296 | 297 |
monitor.worked(10); |
297 |
|
|
298 |
|
|
298 | 299 |
monitor.setTask(CooccurrenceCoreMessages.info_computingSpecificitiesScores); |
299 | 300 |
if (!this.stepGetScores()) { |
300 | 301 |
return false; |
301 | 302 |
} |
302 |
|
|
303 |
|
|
303 | 304 |
this.clearMemory(); |
304 | 305 |
monitor.done(); |
305 |
|
|
306 |
|
|
306 | 307 |
return true; |
307 | 308 |
} |
308 |
|
|
309 |
|
|
309 | 310 |
@Override |
310 | 311 |
public boolean loadParameters() throws CqiClientException { |
311 | 312 |
pProperties = (List<WordProperty>) Property.stringToProperties(getCorpus(), this.getStringParameterValue(TXMPreferences.UNIT_PROPERTIES)); |
... | ... | |
313 | 314 |
pStructuralUnitLimit = this.getCorpus().getStructuralUnit(this.getStringParameterValue(CooccurrencePreferences.STRUCTURAL_UNIT_LIMIT)); |
314 | 315 |
return true; |
315 | 316 |
} |
316 |
|
|
317 |
|
|
317 | 318 |
@Override |
318 | 319 |
public boolean saveParameters() { |
319 | 320 |
this.saveParameter(TXMPreferences.UNIT_PROPERTIES, Property.propertiesToString(this.pProperties)); |
320 |
|
|
321 |
|
|
321 | 322 |
if (pQuery != null) { |
322 | 323 |
this.saveParameter(TXMPreferences.QUERY, pQuery.getQueryString()); |
323 | 324 |
} |
324 |
|
|
325 |
|
|
325 | 326 |
if (pStructuralUnitLimit != null) { |
326 | 327 |
this.saveParameter(CooccurrencePreferences.STRUCTURAL_UNIT_LIMIT, this.pStructuralUnitLimit.getName()); |
327 | 328 |
} |
328 |
|
|
329 |
|
|
329 | 330 |
return true; |
330 | 331 |
} |
331 |
|
|
332 |
|
|
332 |
|
|
333 |
|
|
333 | 334 |
/** |
334 | 335 |
* As r matrix. |
335 | 336 |
* |
... | ... | |
338 | 339 |
*/ |
339 | 340 |
public String asRMatrix() throws RWorkspaceException { |
340 | 341 |
symbol = prefixR + nocooc; |
341 |
|
|
342 |
|
|
342 | 343 |
String[] occ = new String[this.lines.size()]; |
343 | 344 |
int[] freq = new int[this.lines.size()]; |
344 | 345 |
int[] cofreq = new int[this.lines.size()]; |
345 | 346 |
double[] score = new double[this.lines.size()]; |
346 | 347 |
double[] dist = new double[this.lines.size()]; |
347 |
|
|
348 |
|
|
348 | 349 |
int i = 0; |
349 | 350 |
for (CLine line : this.lines) { |
350 | 351 |
occ[i] = line.occ; |
... | ... | |
354 | 355 |
dist[i] = line.distmoyenne; |
355 | 356 |
i++; |
356 | 357 |
} |
357 |
|
|
358 |
|
|
358 | 359 |
RWorkspace rw = RWorkspace.getRWorkspaceInstance(); |
359 | 360 |
rw.addVectorToWorkspace("coococc", occ); //$NON-NLS-1$ |
360 | 361 |
rw.addVectorToWorkspace("coocfreq", freq); //$NON-NLS-1$ |
361 | 362 |
rw.addVectorToWorkspace("cooccofreq", cofreq); //$NON-NLS-1$ |
362 | 363 |
rw.addVectorToWorkspace("coocscore", score); //$NON-NLS-1$ |
363 | 364 |
rw.addVectorToWorkspace("coocmeandist", dist); //$NON-NLS-1$ |
364 |
|
|
365 |
|
|
365 | 366 |
rw.eval(symbol + "<- matrix(data = c(coocfreq, cooccofreq, coocscore, coocmeandist), nrow = " + this.lines.size() + ", ncol = 4)"); //$NON-NLS-1$ //$NON-NLS-2$ |
366 | 367 |
rw.eval("rownames(" + symbol + " ) <- coococc"); //$NON-NLS-1$ //$NON-NLS-2$ |
367 | 368 |
rw.eval("colnames(" + symbol + " ) <- c('freq', 'cofreq', 'score', 'dist')"); //$NON-NLS-1$ //$NON-NLS-2$ |
368 | 369 |
rw.eval(symbol + "<- list(data=" + symbol //$NON-NLS-1$ |
369 |
// + ", leftcontext="+this.leftContextSize |
|
370 |
// + ", rightcontext="+this.rightContextSize |
|
371 |
// + ", query=\""+this.query.getQueryString()+"\"" |
|
370 |
// + ", leftcontext="+this.leftContextSize
|
|
371 |
// + ", rightcontext="+this.rightContextSize
|
|
372 |
// + ", query=\""+this.query.getQueryString()+"\""
|
|
372 | 373 |
+ ")"); //$NON-NLS-1$ |
373 |
|
|
374 |
|
|
374 | 375 |
nocooc++; |
375 | 376 |
return symbol; |
376 | 377 |
} |
377 |
|
|
378 |
|
|
378 | 379 |
@Override |
379 | 380 |
public boolean canCompute() { |
380 |
|
|
381 |
|
|
381 | 382 |
if (pQuery == null || pQuery.isEmpty()) { |
382 | 383 |
Log.fine("No query set."); |
383 | 384 |
return false; |
384 | 385 |
} |
385 |
|
|
386 |
|
|
386 | 387 |
if (pProperties == null) { |
387 | 388 |
Log.fine("No properties set."); |
388 | 389 |
return false; |
389 | 390 |
} |
390 |
|
|
391 |
|
|
391 | 392 |
if (getCorpus() == null) { |
392 | 393 |
Log.fine("No corpus set."); |
393 | 394 |
return false; |
394 | 395 |
} |
395 |
|
|
396 |
|
|
396 | 397 |
if (pProperties.size() == 0) { |
397 | 398 |
Log.fine("No properties filled."); |
398 | 399 |
return false; |
399 | 400 |
} |
400 |
|
|
401 |
|
|
401 | 402 |
return true; |
402 | 403 |
} |
403 |
|
|
404 |
|
|
404 | 405 |
@Override |
405 | 406 |
public void clean() { |
406 | 407 |
try { |
... | ... | |
413 | 414 |
org.txm.utils.logger.Log.printStackTrace(e); |
414 | 415 |
} |
415 | 416 |
} |
416 |
|
|
417 |
|
|
417 | 418 |
// FIXME: useless? |
418 | 419 |
public void clearMemory() { |
419 | 420 |
if (distances != null) distances.clear(); |
... | ... | |
429 | 430 |
if (dist != null) dist.clear(); |
430 | 431 |
if (freq != null) freq.clear(); |
431 | 432 |
if (scores != null) scores.clear(); |
432 |
|
|
433 |
|
|
433 | 434 |
lt = null; |
434 | 435 |
} |
435 |
|
|
436 |
|
|
436 | 437 |
// /** |
437 | 438 |
// * Count occ. |
438 | 439 |
// * |
... | ... | |
490 | 491 |
// } |
491 | 492 |
// } |
492 | 493 |
// } |
493 |
|
|
494 |
|
|
494 | 495 |
/** |
495 | 496 |
* Gets the corpus. |
496 | 497 |
* |
... | ... | |
499 | 500 |
public CQPCorpus getCorpus() { |
500 | 501 |
return (CQPCorpus) this.getParent(); |
501 | 502 |
} |
502 |
|
|
503 |
|
|
503 | 504 |
@Override |
504 | 505 |
public String getDetails() { |
505 | 506 |
Object[] params = new Object[] { this.getParent(), this.pQuery, this.pProperties, this.pStructuralUnitLimit, (this.pMinLeftContextSize - 1), (this.pMaxLeftContextSize - 1), |
... | ... | |
507 | 508 |
(this.pMaxRightContextSize - 1), this.pFminFilter, this.pFCoocFilter, this.pScoreMinFilter }; |
508 | 509 |
return NLS.bind(CooccurrenceCoreMessages.info_details, params); |
509 | 510 |
} |
510 |
|
|
511 |
|
|
511 | 512 |
@Override |
512 | 513 |
public String getName() { |
513 | 514 |
try { |
... | ... | |
517 | 518 |
return this.getSimpleName(); |
518 | 519 |
} |
519 | 520 |
} |
520 |
|
|
521 |
|
|
521 |
|
|
522 |
|
|
522 | 523 |
@Override |
523 | 524 |
public String getSimpleName() { |
524 | 525 |
if (this.pQuery != null && !this.pQuery.isEmpty()) { |
525 | 526 |
StringBuffer output = new StringBuffer(); |
526 | 527 |
output.append(this.pQuery.asString()); |
527 | 528 |
output.append(WordProperty.asString(this.pProperties)); |
528 |
|
|
529 |
|
|
529 | 530 |
if (this.pMaxLeftContextSize > 0 && this.pMaxRightContextSize > 0) { |
530 | 531 |
output.append(" " + (this.pMaxLeftContextSize - 1) + " " + (this.pMaxRightContextSize - 1)); //$NON-NLS-1$ //$NON-NLS-2$ |
531 | 532 |
} |
532 | 533 |
output.append(TXMCoreMessages.formatMinFilter(this.pFminFilter)); |
533 | 534 |
output.append(TXMCoreMessages.formatMinFilter(this.pFCoocFilter)); |
534 | 535 |
output.append(TXMCoreMessages.formatMinFilter(this.pScoreMinFilter)); |
535 |
|
|
536 |
|
|
536 | 537 |
// TODO: SJ: improve the hiding or display of value according to the default preferences values |
537 | 538 |
// output.append(TXMCoreMessages.formatMinFilter(this.pFminFilter, CooccurrencePreferences.getInstance().getInt(CooccurrencePreferences.F_MIN) + 1)); |
538 | 539 |
// output.append(TXMCoreMessages.formatMinFilter(this.pFCoocFilter, CooccurrencePreferences.getInstance().getInt(CooccurrencePreferences.MIN_COUNT) + 1)); |
539 | 540 |
// output.append(TXMCoreMessages.formatMinFilter(this.pScoreMinFilter, CooccurrencePreferences.getInstance().getDouble(CooccurrencePreferences.MIN_SCORE) + 1)); |
540 |
|
|
541 |
|
|
541 |
|
|
542 |
|
|
542 | 543 |
return output.toString(); |
543 | 544 |
} |
544 | 545 |
else { |
545 | 546 |
return this.getEmptyName(); |
546 | 547 |
} |
547 | 548 |
} |
548 |
|
|
549 |
|
|
549 |
|
|
550 |
|
|
550 | 551 |
@Override |
551 | 552 |
public String getComputingStartMessage() { |
552 |
return TXMCoreMessages.bind(CooccurrenceCoreMessages.cooccurrentsOfP0PropertieP1InTheP2Corpus, this.pQuery.asString(), WordProperty.asString(this.pProperties),
|
|
553 |
return TXMCoreMessages.bind(CooccurrenceCoreMessages.cooccurrentsOfP0PropertieP1InTheP2Corpus, (this.pQuery != null?this.pQuery.asString():"<no query>"), WordProperty.asString(this.pProperties),
|
|
553 | 554 |
(this.pMaxLeftContextSize - 1), (this.pMaxRightContextSize - 1), this.pFminFilter, this.pFCoocFilter, this.pScoreMinFilter, this.getCorpus().getName()); |
554 | 555 |
} |
555 |
|
|
556 |
|
|
556 |
|
|
557 |
|
|
557 | 558 |
@Override |
558 | 559 |
public String getComputingDoneMessage() { |
559 | 560 |
if (this.lines.isEmpty()) { |
... | ... | |
563 | 564 |
return TXMCoreMessages.bind(CooccurrenceCoreMessages.P0CooccurentsForP1Occurrences, TXMCoreMessages.formatNumber(this.lines.size()), TXMCoreMessages.formatNumber(this.numberOfKeyword)); |
564 | 565 |
} |
565 | 566 |
} |
566 |
|
|
567 |
|
|
568 |
|
|
567 |
|
|
568 |
|
|
569 |
|
|
569 | 570 |
/** |
570 | 571 |
* Gets the fA. |
571 | 572 |
* |
... | ... | |
574 | 575 |
public int getFA() { |
575 | 576 |
return this.FA; |
576 | 577 |
} |
577 |
|
|
578 |
|
|
578 | 579 |
public boolean getIncludeXPivot() { |
579 | 580 |
return pIncludeXpivot; |
580 | 581 |
} |
581 |
|
|
582 |
|
|
582 | 583 |
/** |
583 | 584 |
* Gets the lines. |
584 | 585 |
* |
... | ... | |
587 | 588 |
public List<CLine> getLines() { |
588 | 589 |
return lines; |
589 | 590 |
} |
590 |
|
|
591 |
|
|
591 | 592 |
/** |
592 | 593 |
* Gets the max left. |
593 | 594 |
* |
... | ... | |
596 | 597 |
public int getMaxLeft() { |
597 | 598 |
return pMaxLeftContextSize; |
598 | 599 |
} |
599 |
|
|
600 |
|
|
600 | 601 |
/** |
601 | 602 |
* Gets the max right. |
602 | 603 |
* |
... | ... | |
605 | 606 |
public int getMaxRight() { |
606 | 607 |
return pMaxRightContextSize; |
607 | 608 |
} |
608 |
|
|
609 |
|
|
609 | 610 |
/** |
610 | 611 |
* Gets the min left. |
611 | 612 |
* |
... | ... | |
614 | 615 |
public int getMinLeft() { |
615 | 616 |
return pMinLeftContextSize; |
616 | 617 |
} |
617 |
|
|
618 |
|
|
618 | 619 |
/** |
619 | 620 |
* Gets the min right. |
620 | 621 |
* |
... | ... | |
623 | 624 |
public int getMinRight() { |
624 | 625 |
return pMinRightContextSize; |
625 | 626 |
} |
626 |
|
|
627 |
|
|
627 | 628 |
/** |
628 | 629 |
* Gets the lines. |
629 | 630 |
* |
... | ... | |
638 | 639 |
} |
639 | 640 |
return numberOfCooccurrents; |
640 | 641 |
} |
641 |
|
|
642 |
|
|
642 | 643 |
/** |
643 | 644 |
* Gets the lines. |
644 | 645 |
* |
... | ... | |
650 | 651 |
} |
651 | 652 |
return 0; |
652 | 653 |
} |
653 |
|
|
654 |
|
|
654 | 655 |
/** |
655 | 656 |
* Gets the number of keyword. |
656 | 657 |
* |
... | ... | |
659 | 660 |
public int getNumberOfKeyword() { |
660 | 661 |
return numberOfKeyword; |
661 | 662 |
} |
662 |
|
|
663 |
|
|
663 | 664 |
/** |
664 | 665 |
* Gets the p. |
665 | 666 |
* |
... | ... | |
668 | 669 |
public int getP() { |
669 | 670 |
return P; |
670 | 671 |
} |
671 |
|
|
672 |
|
|
672 | 673 |
/** |
673 | 674 |
* Gets the properties. |
674 | 675 |
* |
... | ... | |
677 | 678 |
public List<WordProperty> getProperties() { |
678 | 679 |
return pProperties; |
679 | 680 |
} |
680 |
|
|
681 |
|
|
681 | 682 |
/** |
682 | 683 |
* Gets the query. |
683 | 684 |
* |
... | ... | |
686 | 687 |
public CQLQuery getQuery() { |
687 | 688 |
return pQuery; |
688 | 689 |
} |
689 |
|
|
690 |
|
|
690 |
|
|
691 |
|
|
691 | 692 |
/** |
692 | 693 |
* Gets the structural unit limit. |
693 | 694 |
* |
... | ... | |
696 | 697 |
public StructuralUnit getStructuralUnitLimit() { |
697 | 698 |
return this.pStructuralUnitLimit; |
698 | 699 |
} |
699 |
|
|
700 |
|
|
700 | 701 |
/** |
701 | 702 |
* Gets the symbol. |
702 | 703 |
* |
... | ... | |
705 | 706 |
public String getSymbol() { |
706 | 707 |
return this.symbol; |
707 | 708 |
} |
708 |
|
|
709 |
|
|
709 | 710 |
/** |
710 | 711 |
* Inits the conc infos. |
711 | 712 |
* |
... | ... | |
713 | 714 |
* @return true, if successful |
714 | 715 |
*/ |
715 | 716 |
private boolean initConcInfos(Concordance conc) { |
716 |
|
|
717 |
|
|
717 | 718 |
int from = 0; |
718 | 719 |
int to = conc.getNLines() - 1; |
719 | 720 |
return initConcInfos(conc, from, to); |
720 | 721 |
} |
721 |
|
|
722 |
|
|
722 | 723 |
/** |
723 | 724 |
* Inits the conc infos. |
724 | 725 |
* |
... | ... | |
728 | 729 |
* @return true, if successful |
729 | 730 |
*/ |
730 | 731 |
private boolean initConcInfos(Concordance conc, int from, int to) { |
731 |
|
|
732 |
|
|
732 | 733 |
try { |
733 | 734 |
this.concordance = conc; |
734 | 735 |
conclines = conc.getLines(from, to); |
... | ... | |
739 | 740 |
} |
740 | 741 |
return false; |
741 | 742 |
} |
742 |
|
|
743 |
|
|
743 | 744 |
/** |
744 | 745 |
* Inits the conc infos. |
745 | 746 |
* |
... | ... | |
748 | 749 |
* @return true, if successful |
749 | 750 |
*/ |
750 | 751 |
private boolean initConcInfos(Concordance conc, List<Line> conclines) { |
751 |
|
|
752 |
|
|
752 | 753 |
try { |
753 | 754 |
CQPCorpus corpus = this.getCorpus(); |
754 | 755 |
this.concordance = conc; |
... | ... | |
770 | 771 |
} |
771 | 772 |
return false; |
772 | 773 |
} |
773 |
|
|
774 |
|
|
774 | 775 |
/** |
775 | 776 |
* Prints the lines to standard output. |
776 | 777 |
*/ |
... | ... | |
782 | 783 |
for (CLine line : lines) |
783 | 784 |
System.out.println(line.resume("\t", "")); //$NON-NLS-1$ //$NON-NLS-2$ |
784 | 785 |
} |
785 |
|
|
786 |
|
|
786 | 787 |
public void setCoocQuery(String q) { |
787 | 788 |
pCooccurentQueryFilter = q; |
788 | 789 |
} |
789 |
|
|
790 |
|
|
790 | 791 |
/** |
791 | 792 |
* Sets the max left. |
792 | 793 |
* |
... | ... | |
795 | 796 |
public void setMaxLeft(int maxleft) { |
796 | 797 |
this.pMaxLeftContextSize = maxleft; |
797 | 798 |
} |
798 |
|
|
799 |
|
|
799 | 800 |
/** |
800 | 801 |
* Sets the max right. |
801 | 802 |
* |
... | ... | |
804 | 805 |
public void setMaxRight(int maxright) { |
805 | 806 |
this.pMaxRightContextSize = maxright; |
806 | 807 |
} |
807 |
|
|
808 |
|
|
808 | 809 |
/** |
809 | 810 |
* Sets the min left. |
810 | 811 |
* |
... | ... | |
813 | 814 |
public void setMinLeft(int minleft) { |
814 | 815 |
this.pMinLeftContextSize = minleft; |
815 | 816 |
} |
816 |
|
|
817 |
|
|
817 | 818 |
/** |
818 | 819 |
* Sets the min right. |
819 | 820 |
* |
... | ... | |
822 | 823 |
public void setMinRight(int minright) { |
823 | 824 |
this.pMinRightContextSize = minright; |
824 | 825 |
} |
825 |
|
|
826 |
|
|
826 | 827 |
public void setParameters(CQLQuery query, List<WordProperty> properties, StructuralUnit limit, int maxLeft, int minLeft, int minRight, |
827 | 828 |
int maxRight, int minFreq, float minScore, int minCof, boolean includeXpivot, boolean buildLexicalTableWithCooccurrents) { |
828 |
|
|
829 |
|
|
829 | 830 |
this.pQuery = query; |
830 | 831 |
this.pProperties = properties; |
831 | 832 |
this.pStructuralUnitLimit = limit; |
... | ... | |
839 | 840 |
this.pIncludeXpivot = includeXpivot; |
840 | 841 |
this.buildLexicalTableWithCooccurrents = buildLexicalTableWithCooccurrents; |
841 | 842 |
} |
842 |
|
|
843 |
|
|
843 | 844 |
public void setIncludeXpivot(boolean b) { |
844 | 845 |
pIncludeXpivot = b; |
845 | 846 |
} |
846 |
|
|
847 |
|
|
847 | 848 |
@Override |
848 | 849 |
public boolean setParameters(TXMParameters parameters) { |
849 | 850 |
try { |
850 | 851 |
CQPCorpus corpus = this.getCorpus(); |
851 | 852 |
boolean includeXpivot = parameters.getBoolean(CooccurrencePreferences.INCLUDE_X_PIVOT); |
852 |
|
|
853 |
|
|
853 | 854 |
String queryString = ""; //$NON-NLS-1$ |
854 | 855 |
if (parameters.get(TXMPreferences.QUERY) != null) { |
855 | 856 |
queryString = parameters.get(TXMPreferences.QUERY).toString(); |
856 | 857 |
} |
857 | 858 |
CQLQuery query = new CQLQuery(queryString); |
858 |
|
|
859 |
|
|
859 | 860 |
StructuralUnit limit = (StructuralUnit) parameters.get(CooccurrencePreferences.STRUCTURAL_UNIT_LIMIT); |
860 |
|
|
861 |
|
|
861 | 862 |
Object propsParam = parameters.get(CooccurrencePreferences.UNIT_PROPERTIES); |
862 | 863 |
List<WordProperty> properties = null; |
863 | 864 |
if (propsParam instanceof List<?>) { |
... | ... | |
866 | 867 |
else if (propsParam instanceof String) { |
867 | 868 |
properties = (List<WordProperty>) Property.stringToProperties(corpus, propsParam.toString()); |
868 | 869 |
} |
869 |
|
|
870 |
|
|
870 | 871 |
int maxLeft = parameters.getInteger(CooccurrencePreferences.MAX_LEFT); |
871 | 872 |
int minLeft = parameters.getInteger(CooccurrencePreferences.MIN_LEFT); |
872 | 873 |
int maxRight = parameters.getInteger(CooccurrencePreferences.MAX_RIGHT); |
... | ... | |
875 | 876 |
int minCof = parameters.getInteger(CooccurrencePreferences.MIN_COUNT); |
876 | 877 |
int minFreq = parameters.getInteger(TXMPreferences.F_MIN); |
877 | 878 |
boolean buildLexicalTableWithCooccurrents = parameters.getBoolean(CooccurrencePreferences.PARTIAL_LEXICAL_TABLE); |
878 |
|
|
879 |
|
|
879 | 880 |
this.setParameters(query, properties, limit, maxLeft, minLeft, minRight, maxRight, minFreq, minScore, minCof, includeXpivot, buildLexicalTableWithCooccurrents); |
880 |
|
|
881 |
|
|
881 | 882 |
} |
882 | 883 |
catch (Exception e) { |
883 | 884 |
System.out.println("Error while setting cooccurrence parameters: " + e.getLocalizedMessage()); |
... | ... | |
886 | 887 |
} |
887 | 888 |
return true; |
888 | 889 |
} |
889 |
|
|
890 |
|
|
890 | 891 |
public void setReferenceCorpus(String symbol) { |
891 | 892 |
referenceCorpus = symbol; |
892 | 893 |
} |
893 |
|
|
894 |
|
|
894 | 895 |
/** |
895 | 896 |
* Sets the structural unit limit. |
896 | 897 |
* |
... | ... | |
899 | 900 |
public void setStructuralUnitLimt(StructuralUnit su) { |
900 | 901 |
pStructuralUnitLimit = su; |
901 | 902 |
} |
902 |
|
|
903 |
|
|
903 | 904 |
/** |
904 | 905 |
* Sets the thresholds. |
905 | 906 |
* |
... | ... | |
908 | 909 |
* @param score the score |
909 | 910 |
*/ |
910 | 911 |
public void setThresfold(int freq, int count, float score) { |
911 |
|
|
912 |
|
|
912 | 913 |
pFminFilter = freq; |
913 | 914 |
pFCoocFilter = count; |
914 | 915 |
pScoreMinFilter = score; |
915 | 916 |
} |
916 |
|
|
917 |
|
|
917 | 918 |
/** |
918 | 919 |
* Sort. |
919 | 920 |
* |
... | ... | |
927 | 928 |
Collections.sort(lines, comparator); |
928 | 929 |
} |
929 | 930 |
} |
930 |
|
|
931 |
|
|
931 | 932 |
/** |
932 | 933 |
* Step build lexical table. |
933 | 934 |
* |
... | ... | |
935 | 936 |
* @throws RWorkspaceException the r workspace exception |
936 | 937 |
*/ |
937 | 938 |
public boolean stepBuildLexicalTable(TXMProgressMonitor monitor) throws RWorkspaceException { |
938 |
|
|
939 |
|
|
939 | 940 |
CQPCorpus corpus = this.getCorpus(); |
940 | 941 |
String[] colnames = { corpus.getName() + "-" + pQuery.getQueryString(), pQuery.getQueryString() }; //$NON-NLS-1$ |
941 | 942 |
keysToString = new HashMap<>(); |
942 |
|
|
943 |
|
|
943 | 944 |
// time = System.currentTimeMillis(); |
944 | 945 |
for (TXMResult rez : corpus.getChildren(Index.class)) { // TODO: fix usages of index for cooc |
945 | 946 |
Index rezvoc = (Index) rez; |
946 |
|
|
947 |
|
|
947 | 948 |
if (rezvoc.getProperties().equals(pProperties)) { |
948 | 949 |
if (rezvoc.getQuery().equals(new CQLQuery(pCooccurentQueryFilter))) { |
949 | 950 |
if (rezvoc.getFilterFmax() == null) { |
... | ... | |
953 | 954 |
} |
954 | 955 |
} |
955 | 956 |
} |
956 |
|
|
957 |
|
|
957 | 958 |
// TODO: MD: implement a sibling dependency system |
958 | 959 |
if (index == null) { |
959 | 960 |
try { |
... | ... | |
970 | 971 |
return false; |
971 | 972 |
} |
972 | 973 |
} |
973 |
|
|
974 |
|
|
974 | 975 |
// ALTER THE INDEX IF A REFERENCE CORPUS IS SET -> this changes the base frequencies |
975 | 976 |
if (referenceCorpus != null && referenceCorpus.length() > 0) { |
976 | 977 |
// voc.toTxt(new File("/home/mdecorde/TEMP/before.tsv"), "UTF-8", "\t", ""); |
... | ... | |
982 | 983 |
return false; |
983 | 984 |
} |
984 | 985 |
} |
985 |
|
|
986 |
|
|
986 | 987 |
List<org.txm.index.core.functions.Line> vocLines = index.getAllLines(); |
987 | 988 |
int[][] freqs; |
988 | 989 |
String[] rownames; |
... | ... | |
994 | 995 |
freqs = new int[vocLines.size()][2]; |
995 | 996 |
rownames = new String[vocLines.size()]; |
996 | 997 |
} |
997 |
|
|
998 |
|
|
998 | 999 |
int i = 0; |
999 | 1000 |
// System.out.println("T voc : "+(System.currentTimeMillis()- time)); //$NON-NLS-1$ |
1000 | 1001 |
// System.out.println("nb lines voc "+voclines.size()); |
... | ... | |
1009 | 1010 |
int count = counts.get(l.getSignature()); |
1010 | 1011 |
int tot = l.getFrequency(); |
1011 | 1012 |
indexfreqs.put(l.toString(), tot); |
1012 |
|
|
1013 |
|
|
1013 | 1014 |
freqs[i][0] = tot - count; |
1014 | 1015 |
freqs[i][1] = count; |
1015 | 1016 |
i++; |
... | ... | |
1019 | 1020 |
rownames[i] = l.toString(); |
1020 | 1021 |
// System.out.println("set rowname: "+l.toString()); |
1021 | 1022 |
// System.out.println("("+l.getSignature()+", "+l.toString()+") : "+l.getFrequency()+" - "+counts.get(l.getSignature())); |
1022 |
|
|
1023 |
|
|
1023 | 1024 |
int tot = l.getFrequency(); |
1024 | 1025 |
indexfreqs.put(l.toString(), tot); |
1025 |
|
|
1026 |
|
|
1026 | 1027 |
freqs[i][0] = tot; |
1027 | 1028 |
freqs[i][1] = 0; |
1028 | 1029 |
i++; |
... | ... | |
1030 | 1031 |
} |
1031 | 1032 |
index.delete(); // no more needed |
1032 | 1033 |
index = null; |
1033 |
|
|
1034 |
|
|
1034 | 1035 |
// time = System.currentTimeMillis(); |
1035 | 1036 |
if (freqs.length == 0) { |
1036 | 1037 |
System.out.println(CooccurrenceCoreMessages.errorColonNoCooccurrents); |
... | ... | |
1053 | 1054 |
// writer.close(); |
1054 | 1055 |
// //writer.println("Cols: "+Arrays.toString(colnames)); |
1055 | 1056 |
// } catch(Exception e) {e.printStackTrace();} |
1056 |
|
|
1057 |
|
|
1057 | 1058 |
lt = new LexicalTableImpl(freqs, rownames, colnames); |
1058 |
|
|
1059 |
|
|
1059 |
|
|
1060 |
|
|
1060 | 1061 |
// if(referenceCorpus != null && referenceCorpus.length() > 0) { |
1061 | 1062 |
// //lt.removeCol(0, false); |
1062 | 1063 |
// lt.setReference(referenceCorpus); |
... | ... | |
1064 | 1065 |
// } |
1065 | 1066 |
return true; |
1066 | 1067 |
} |
1067 |
|
|
1068 |
|
|
1068 | 1069 |
/** |
1069 | 1070 |
* Step build signatures. |
1070 | 1071 |
* |
... | ... | |
1082 | 1083 |
} |
1083 | 1084 |
} |
1084 | 1085 |
// System.out.println("Position set: "+allpositions.size()); |
1085 |
|
|
1086 |
|
|
1086 | 1087 |
int[] allpositionsarray = new int[allpositions.size()]; |
1087 | 1088 |
int pcount = 0; |
1088 | 1089 |
for (int p : allpositions) { |
1089 | 1090 |
allpositionsarray[pcount++] = p; |
1090 | 1091 |
} |
1091 |
|
|
1092 |
|
|
1092 | 1093 |
HashMap<Property, int[]> propsId = new HashMap<>(); |
1093 | 1094 |
// HashMap<Property, String[]> propsValues = new HashMap<Property, String[]>(); |
1094 |
for (Property property : pProperties) { |
|
1095 |
int[] indices = CorpusManager.getCorpusManager().getCqiClient() |
|
1096 |
.cpos2Id(property.getQualifiedName(), allpositionsarray); |
|
1095 |
for (WordProperty property : pProperties) { |
|
1096 |
int[] indices = property.cpos2Id(allpositionsarray); |
|
1097 | 1097 |
// String[] values = |
1098 | 1098 |
// CorpusManager.getCorpusManager().getCqiClient().cpos2Str(property.getQualifiedName(),allpositionsarray); |
1099 | 1099 |
propsId.put(property, indices); |
1100 | 1100 |
// propsValues.put(property, values); |
1101 | 1101 |
// System.out.println("all "+property+" indices: "+propsId.get(property).length); |
1102 | 1102 |
} |
1103 |
|
|
1103 |
|
|
1104 | 1104 |
// System.out.println("T values + ids: "+(System.currentTimeMillis()- time)); //$NON-NLS-1$ |
1105 |
|
|
1105 |
|
|
1106 | 1106 |
pcount = 0; |
1107 | 1107 |
for (int position : allpositionsarray) { |
1108 | 1108 |
// String sign = ""; //$NON-NLS-1$ |
... | ... | |
1114 | 1114 |
// allsignatures.put(position, sign); |
1115 | 1115 |
allsignaturesstr.put(position, signstr); |
1116 | 1116 |
pcount++; |
1117 |
|
|
1117 |
|
|
1118 | 1118 |
} |
1119 | 1119 |
return true; |
1120 | 1120 |
} |
1121 |
|
|
1121 |
|
|
1122 | 1122 |
/** |
1123 | 1123 |
* Step count. |
1124 | 1124 |
* |
... | ... | |
1126 | 1126 |
*/ |
1127 | 1127 |
public boolean stepCount() { |
1128 | 1128 |
// ArrayList<Integer> keepedPosition = new ArrayList<>(); |
1129 |
|
|
1129 |
|
|
1130 | 1130 |
int startsearchM2 = 0; // optimisation: m2 is ordered |
1131 | 1131 |
int startsearchM3 = 0; // optimisation: m3 is ordered |
1132 | 1132 |
// time = System.currentTimeMillis(); |
1133 |
|
|
1133 |
|
|
1134 | 1134 |
HashMap<Integer, Integer> positionsDistances = new HashMap<>(); |
1135 |
|
|
1135 |
|
|
1136 | 1136 |
for (Match m : m1) { // for each match = for each focus |
1137 |
|
|
1137 |
|
|
1138 | 1138 |
if (m.getTarget() >= 0) { // if target is set focus on target position |
1139 | 1139 |
m.setStart(m.getTarget()); |
1140 | 1140 |
m.setEnd(m.getTarget()); |
... | ... | |
1153 | 1153 |
} |
1154 | 1154 |
if (Thread.interrupted()) return false; // stop if interrupted by user |
1155 | 1155 |
// System.out.println("found n: "+n); |
1156 |
|
|
1156 |
|
|
1157 | 1157 |
for (int i = startsearchM3; i < m3.size(); i++) { // find next match m3 contained by m2 |
1158 |
|
|
1158 |
|
|
1159 | 1159 |
o = m3.get(i); |
1160 | 1160 |
if (o.getStart() <= m.getStart() && m.getEnd() <= o.getEnd()) { |
1161 | 1161 |
startsearchM3 = i; |
... | ... | |
1164 | 1164 |
} |
1165 | 1165 |
} |
1166 | 1166 |
// System.out.println("found o: "+o); |
1167 |
|
|
1167 |
|
|
1168 | 1168 |
if (!matchFound) { |
1169 | 1169 |
continue; |
1170 | 1170 |
} |
1171 |
|
|
1171 |
|
|
1172 | 1172 |
int start = n.getStart(); |
1173 | 1173 |
int size = n.getEnd() - start + 1; |
1174 | 1174 |
// if (size > 0) |
... | ... | |
1177 | 1177 |
// System.out.println("NbOccs "+(size)); |
1178 | 1178 |
int[] positions = new int[size]; |
1179 | 1179 |
int noOcc = 0; |
1180 |
|
|
1180 |
|
|
1181 | 1181 |
// System.out.println("positions"); |
1182 | 1182 |
// System.out.println("start: "+(start)+" end:"+n.getEnd()); |
1183 | 1183 |
for (int position = start; position <= n.getEnd(); position++) { |
1184 |
|
|
1184 |
|
|
1185 | 1185 |
if (o.getStart() <= position && position <= o.getEnd()) { |
1186 | 1186 |
// ignore positions in the anticontext positions |
1187 | 1187 |
continue; |
1188 | 1188 |
} |
1189 |
|
|
1189 |
|
|
1190 | 1190 |
int dist; |
1191 | 1191 |
if (position < m.getStart()) { |
1192 | 1192 |
dist = m.getStart() - position - 1; |
... | ... | |
1202 | 1202 |
positionsDistances.put(position, dist); |
1203 | 1203 |
} |
1204 | 1204 |
} |
1205 |
|
|
1205 |
|
|
1206 | 1206 |
// System.out.println("nb Occ ignored: "+ignore); |
1207 | 1207 |
// System.out.println("nb Occ chevauche: "+chevauche); |
1208 | 1208 |
} |
1209 |
|
|
1209 |
|
|
1210 | 1210 |
// store and count distances for each position signature |
1211 | 1211 |
int noOcc = 0; |
1212 | 1212 |
for (int position : positionsDistances.keySet()) { // cooccurrent words positions |
1213 | 1213 |
// String signature = allsignatures.get(position); |
1214 | 1214 |
String signaturestr = allsignaturesstr.get(position); |
1215 |
|
|
1215 |
|
|
1216 | 1216 |
int dist = positionsDistances.get(position); |
1217 | 1217 |
if (distances.containsKey(signaturestr)) { |
1218 | 1218 |
distances.put(signaturestr, (distances.get(signaturestr)) + dist); |
... | ... | |
1220 | 1220 |
else { |
1221 | 1221 |
distances.put(signaturestr, 0.0); |
1222 | 1222 |
} |
1223 |
|
|
1223 |
|
|
1224 | 1224 |
if (counts.containsKey(signaturestr)) { |
1225 | 1225 |
counts.put(signaturestr, (counts.get(signaturestr)) + 1); |
1226 | 1226 |
} |
1227 | 1227 |
else { |
1228 | 1228 |
counts.put(signaturestr, 1); |
1229 | 1229 |
} |
1230 |
|
|
1230 |
|
|
1231 | 1231 |
// if ("[1599]".equals(signaturestr)) { |
1232 | 1232 |
// System.out.println("p=" + position + " d=" + dist + " total(d)=" + distances.get(signaturestr) + " c=" + counts.get(signaturestr)); |
1233 | 1233 |
// } |
1234 | 1234 |
// } |
1235 |
|
|
1235 |
|
|
1236 | 1236 |
noOcc++; |
1237 | 1237 |
} |
1238 | 1238 |
// System.out.println("T counts : "+(System.currentTimeMillis()- time)); //$NON-NLS-1$ |
1239 |
|
|
1239 |
|
|
1240 | 1240 |
allsignaturesstr = null; // no more need |
1241 | 1241 |
// counted = null; |
1242 | 1242 |
return true; |
1243 | 1243 |
} |
1244 |
|
|
1244 |
|
|
1245 | 1245 |
/** |
1246 | 1246 |
* Step get matches. |
1247 | 1247 |
* |
... | ... | |
1268 | 1268 |
else { |
1269 | 1269 |
r3 = corpus.query(anticontextquery, "CoocAntiContextFocusQuery", false); // no context //$NON-NLS-1$ |
1270 | 1270 |
} |
1271 |
|
|
1272 |
|
|
1271 |
|
|
1272 |
|
|
1273 | 1273 |
m1 = r1.getMatches(); |
1274 | 1274 |
numberOfKeyword = m1.size(); |
1275 | 1275 |
m2 = r2.getMatches(); |
1276 | 1276 |
m3 = r3.getMatches(); |
1277 |
|
|
1277 |
|
|
1278 | 1278 |
Log.finest("R1 size=" + m1.size()); |
1279 | 1279 |
Log.finest("R2 size=" + m2.size()); |
1280 | 1280 |
Log.finest("R3 size=" + m3.size()); |
1281 |
|
|
1281 |
|
|
1282 | 1282 |
// System.out.println(query+" M1 size: "+m1.size()); |
1283 | 1283 |
// System.out.println(contextquery+" M2 size: "+m2.size()); |
1284 | 1284 |
// System.out.println(anticontextquery+" M3 size: "+m3.size()); |
1285 | 1285 |
r1.drop(); |
1286 |
|
|
1286 |
|
|
1287 | 1287 |
if (r2 != r1) { |
1288 | 1288 |
r2.drop(); |
1289 | 1289 |
} |
... | ... | |
1292 | 1292 |
} |
1293 | 1293 |
return true; |
1294 | 1294 |
} |
1295 |
|
|
1295 |
|
|
1296 | 1296 |
/** |
1297 | 1297 |
* Step get scores. |
1298 | 1298 |
* |
... | ... | |
1301 | 1301 |
* @throws StatException the stat exception |
1302 | 1302 |
*/ |
1303 | 1303 |
public boolean stepGetScores() throws CqiClientException, StatException { |
1304 |
|
|
1304 |
|
|
1305 | 1305 |
SpecificitiesR specif = new SpecificitiesR(lt); |
1306 | 1306 |
// System.out.println("Specif N part: "+specif.getNbrPart()); //$NON-NLS-1$ |
1307 | 1307 |
// System.out.println("Specif N lines number: "+specif.getSpecificitesIndex().length); //$NON-NLS-1$ |
... | ... | |
1310 | 1310 |
String[] specifrownames = specif.getRowNames().asStringsArray(); |
1311 | 1311 |
double[][] scores = specif.getScores(); |
1312 | 1312 |
// System.out.println("Nb specif result: "+specif.getSpecificitesIndex().length); |
1313 |
|
|
1313 |
|
|
1314 | 1314 |
int iimax = Math.min(specifrownames.length, scores.length); |
1315 | 1315 |
for (int ii = 0; ii < iimax; ii++) { // counts.keySet()) |
1316 | 1316 |
String signaturestr = keysToString.get(specifrownames[ii]); |
1317 |
|
|
1317 |
|
|
1318 | 1318 |
ArrayList<String> props = new ArrayList<>(); |
1319 | 1319 |
if (pProperties.size() > 1) { |
1320 | 1320 |
String[] splited = specifrownames[ii].split("_", pProperties.size()); //$NON-NLS-1$ |
1321 |
|
|
1321 |
|
|
1322 | 1322 |
for (int p = 0; p < pProperties.size(); p++) { |
1323 | 1323 |
props.add(splited[p]); |
1324 | 1324 |
} |
... | ... | |
1342 | 1342 |
indexfreqs.get(specifrownames[ii]), scores[ii][1], // freq |
1343 | 1343 |
((float) (distances.get(signaturestr) / counts.get(signaturestr))), // mean distance |
1344 | 1344 |
-1); |
1345 |
|
|
1345 |
|
|
1346 | 1346 |
// select the line |
1347 | 1347 |
if (cline.freq >= this.pFminFilter && cline.nbocc >= this.pFCoocFilter && cline.score >= 0 && cline.score >= this.pScoreMinFilter) { |
1348 | 1348 |
if (cline.score >= Integer.MAX_VALUE - 5) { |
... | ... | |
1356 | 1356 |
} |
1357 | 1357 |
return true; |
1358 | 1358 |
} |
1359 |
|
|
1359 |
|
|
1360 | 1360 |
/** |
1361 | 1361 |
* Step query limits: builds the context, anticontext and keyword queries using the context sizes and the xincludepivot and structuralLimit parameters. |
1362 | 1362 |
* |
... | ... | |
1364 | 1364 |
*/ |
1365 | 1365 |
public boolean stepQueryLimits() { |
1366 | 1366 |
// structural context |
1367 |
|
|
1367 |
|
|
1368 | 1368 |
String fixedQuery = CQLQuery.fixQuery(pQuery.getQueryString()); |
1369 |
|
|
1369 |
|
|
1370 | 1370 |
if (pStructuralUnitLimit != null) { |
1371 | 1371 |
String tempquery = ""; //$NON-NLS-1$ |
1372 | 1372 |
String lname = pStructuralUnitLimit.getName(); |
1373 |
|
|
1373 |
|
|
1374 | 1374 |
// test if there is a left context |
1375 | 1375 |
if (pMinLeftContextSize > 0) { |
1376 | 1376 |
tempquery += "(<" + lname + ">[]* </" + lname + ">){" + (pMaxLeftContextSize) + "," + (pMaxLeftContextSize) + "}"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ |
1377 | 1377 |
} |
1378 | 1378 |
// (<p>[]*</p>){0, 50} "je" (<p>[]*</p>){0, 50} |
1379 | 1379 |
tempquery += " <" + lname + ">[]* " + fixedQuery + " []* </" + lname + "> "; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ |
1380 |
|
|
1380 |
|
|
1381 | 1381 |
// test if there is a right context |
1382 | 1382 |
if (pMinRightContextSize > 0) { |
1383 | 1383 |
tempquery += "(<" + lname + ">[]* </" + lname + ">){" + (pMaxRightContextSize) + "," + (pMaxRightContextSize) + "}"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ |
1384 | 1384 |
} |
1385 | 1385 |
this.contextQuery = new CQLQuery(tempquery); |
1386 |
|
|
1386 |
|
|
1387 | 1387 |
if (pIncludeXpivot) { |
1388 | 1388 |
String anticontextquerystring = ""; //$NON-NLS-1$ |
1389 | 1389 |
// minleft = 2..N |
... | ... | |
1412 | 1412 |
} |
1413 | 1413 |
} |
1414 | 1414 |
else { // word context |
1415 |
|
|
1415 |
|
|
1416 | 1416 |
String tempquery = ""; //$NON-NLS-1$ |
1417 | 1417 |
if (pMinLeftContextSize > 0) { // test if there is a left context |
1418 | 1418 |
tempquery += "[]{" + pMaxLeftContextSize + "," + pMaxLeftContextSize + "} "; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ |
1419 | 1419 |
} |
1420 | 1420 |
tempquery += fixedQuery; |
1421 |
|
|
1421 |
|
|
1422 | 1422 |
if (pMinRightContextSize > 0) { // test if there is a right context |
1423 | 1423 |
tempquery += " []{" + pMaxRightContextSize + "," + pMaxRightContextSize + "} "; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ |
1424 | 1424 |
} |
... | ... | |
1435 | 1435 |
} |
1436 | 1436 |
return true; |
1437 | 1437 |
} |
1438 |
|
|
1438 |
|
|
1439 | 1439 |
/** |
1440 | 1440 |
* To txt. |
1441 | 1441 |
* |
... | ... | |
1447 | 1447 |
public boolean toTxt(File outfile, String encoding) throws Exception { |
1448 | 1448 |
// Convenience overload: forwards to the four-argument toTxt with a tab ("\t") and an empty string as the
// last two arguments (presumably the column separator and the text delimiter - confirm against that overload).
return toTxt(outfile, encoding, "\t", ""); //$NON-NLS-1$ //$NON-NLS-2$ |
1449 | 1449 |
} |
1450 |
|
|
1450 |
|
|
1451 | 1451 |
/** |
1452 | 1452 |
* To txt. |
1453 | 1453 |
* |
... | ... | |
1472 | 1472 |
} |
1473 | 1473 |
return true; |
1474 | 1474 |
} |
1475 |
|
|
1475 |
|
|
1476 | 1476 |
/** |
1477 | 1477 |
* To txt. |
1478 | 1478 |
* |
... | ... | |
1497 | 1497 |
Log.severe(TXMCoreMessages.bind(TXMCoreMessages.error_error, e)); |
1498 | 1498 |
return false; |
1499 | 1499 |
} |
1500 |
|
|
1500 |
|
|
1501 | 1501 |
return true; |
1502 | 1502 |
} |
1503 |
|
|
1504 |
|
|
1503 |
|
|
1504 |
|
|
1505 | 1505 |
/** |
1506 | 1506 |
* Sets the query. |
1507 | 1507 |
* |
... | ... | |
1510 | 1510 |
public void setQuery(CQLQuery query) { |
1511 | 1511 |
// Plain assignment to the pQuery field: the argument is stored as-is, with no copy or validation in this method.
this.pQuery = query; |
1512 | 1512 |
} |
1513 |
|
|
1513 |
|
|
1514 | 1514 |
/** |
1515 | 1515 |
* Creates a CQL query string from the specified lines. |
1516 | 1516 |
* |
... | ... | |
1518 | 1518 |
* @return the query |
1519 | 1519 |
*/ |
1520 | 1520 |
public String createQuery(List<CLine> lines) { |
1521 |
|
|
1521 |
|
|
1522 | 1522 |
if (this.getQuery().isEmpty()) { |
1523 | 1523 |
return ""; //$NON-NLS-1$ |
1524 | 1524 |
} |
1525 |
|
|
1526 |
|
|
1525 |
|
|
1526 |
|
|
1527 | 1527 |
int nbProps = this.getProperties().size(); |
1528 | 1528 |
List<WordProperty> props = this.getProperties(); |
1529 |
|
|
1530 |
String query = "["; //$NON-NLS-1$ |
|
1529 |
|
|
1530 |
String query = "@["; //$NON-NLS-1$
|
|
1531 | 1531 |
for (int p = 0; p < nbProps; p++) { |
1532 |
query += props.get(p) + "=\""; //$NON-NLS-1$ |
|
1533 |
for (int l = 0; l < lines.size(); l++) { |
|
1534 |
CLine line = lines.get(l); |
|
1535 |
String s = line.props.get(p); |
|
1536 |
s = CQLQuery.addBackSlash(s); |
|
1537 |
query += s + "|"; //$NON-NLS-1$ |
|
1532 |
|
|
1533 |
if (props.get(p) instanceof WordProperty) { |
|
1534 |
ArrayList<String> values = new ArrayList<String>(); |
|
1535 |
for (int l = 0; l < lines.size(); l++) { |
|
1536 |
CLine line = lines.get(l); |
|
1537 |
String s = line.props.get(p); |
|
1538 |
values.add(s); |
|
1539 |
} |
|
1540 |
String test = ((WordProperty)props.get(p)).getCQLTest(values); |
|
1541 |
if (test != null) { |
|
1542 |
query += test; |
|
1543 |
} |
|
1538 | 1544 |
} |
1539 |
query = query.substring(0, query.length() - 1); |
|
1540 |
query += "\" & "; //$NON-NLS-1$ |
|
1545 |
if (p < nbProps-1) { |
|
1546 |
query += " & "; //$NON-NLS-1$ |
|
1547 |
} |
|
1541 | 1548 |
} |
1542 |
query = query.substring(0, query.length() - 3); |
|
1543 | 1549 |
query += "] "; //$NON-NLS-1$ |
1544 |
|
|
1550 |
|
|
1545 | 1551 |
int maxempan = Math.max(this.getMaxLeft(), this.getMaxRight()); |
1546 | 1552 |
if (this.getIncludeXPivot() && maxempan == 0) maxempan = 1; |
1547 |
|
|
1553 |
|
|
1548 | 1554 |
String maxempanstr = "within " + maxempan + " "; //$NON-NLS-1$ //$NON-NLS-2$ |
1549 | 1555 |
if (this.getStructuralUnitLimit() != null) maxempanstr += this.getStructuralUnitLimit().getName(); |
1550 |
|
|
1551 |
|
|
1556 |
|
|
1557 |
|
|
1552 | 1558 |
String pquery = CQLQuery.fixQuery(this.getQuery().getQueryString()); |
1553 | 1559 |
if (this.getMaxLeft() == 0) { |
1554 | 1560 |
query = "" + pquery + " []* " + query + " " + maxempanstr; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ |
... | ... | |
1559 | 1565 |
else { |
1560 | 1566 |
query = "(" + pquery + " []* " + query + ") | (" + query + " []* " + pquery + ") " + maxempanstr; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ |
1561 | 1567 |
} |
1562 |
|
|
1568 |
|
|
1563 | 1569 |
return query; |
1564 | 1570 |
} |
1565 |
|
|
1566 |
|
|
1571 |
|
|
1572 |
|
|
1567 | 1573 |
/** |
1568 | 1574 |
* The Class CLine. |
1569 | 1575 |
*/ |
1570 | 1576 |
public class CLine { |
1571 |
|
|
1577 |
|
|
1572 | 1578 |
/** The cooc. */ |
1573 | 1579 |
Cooccurrence cooc; |
1574 |
|
|
1580 |
|
|
1575 | 1581 |
/** The distmoyenne. */ |
1576 | 1582 |
public float distmoyenne; |
1577 |
|
|
1583 |
|
|
1578 | 1584 |
/** The freq. */ |
1579 | 1585 |
public int freq; |
1580 |
|
|
1586 |
|
|
1581 | 1587 |
/** The id. */ |
1582 | 1588 |
public int id; |
1583 |
|
|
1589 |
|
|
1584 | 1590 |
/** The mode. */ |
1585 | 1591 |
public long mode; |
1586 |
|
|
1592 |
|
|
1587 | 1593 |
/** The nbocc. */ |
1588 | 1594 |
public int nbocc; |
1589 |
|
|
1595 |
|
|
1590 | 1596 |
/** The occ. */ |
1591 | 1597 |
public String occ; |
1592 |
|
|
1598 |
|
|
1593 | 1599 |
/** The props. */ |
1594 | 1600 |
public List<String> props; |
1595 |
|
|
1601 |
|
|
1596 | 1602 |
/** The score. */ |
1597 | 1603 |
public double score; |
1598 |
|
|
1604 |
|
|
1599 | 1605 |
/** |
1600 | 1606 |
* Instantiates a new c line. |
1601 | 1607 |
* |
... | ... | |
1619 | 1625 |
this.mode = mode; |
1620 | 1626 |
this.cooc = cooc; |
1621 | 1627 |
} |
1622 |
|
|
1628 |
|
|
1623 | 1629 |
/** |
1624 | 1630 |
* Adds the txt sep. |
1625 | 1631 |
* |
... | ... | |
1630 | 1636 |
private String addTxtSep(String str, String sep) { |
1631 | 1637 |
// Wraps str between two copies of sep and doubles every occurrence of sep found inside str,
// so an embedded delimiter is not mistaken for the closing one (same convention as CSV quote doubling).
return sep + str.replace(sep, sep + sep) + sep; |
1632 | 1638 |
} |
1633 |
|
|
1639 |
|
|
1634 | 1640 |
/** |
1635 | 1641 |
* Gets the cooc. |
1636 | 1642 |
* |
... | ... | |
1639 | 1645 |
public Cooccurrence getCooc() { |
1640 | 1646 |
// Returns the cooc field unchanged; that field is assigned from the constructor's cooc argument.
return cooc; |
1641 | 1647 |
} |
1642 |
|
|
1648 |
|
|
1643 | 1649 |
/** |
1644 | 1650 |
* Resume. |
1645 | 1651 |
* |
... | ... | |
1651 | 1657 |
return addTxtSep("" + occ, txtseparator) //$NON-NLS-1$ |
1652 | 1658 |
+ colseparator + freq + colseparator + nbocc + colseparator + score + colseparator + distmoyenne; |
1653 | 1659 |
} |
1654 |
|
|
1660 |
|
|
1655 | 1661 |
/** |
1656 | 1662 |
* Sets the count and dist. |
1657 | 1663 |
* |
... | ... | |
1662 | 1668 |
this.nbocc = count; |
1663 | 1669 |
this.distmoyenne = dist; |
1664 | 1670 |
} |
1665 |
|
|
1671 |
|
|
1666 | 1672 |
/** |
1667 | 1673 |
* Sets the freq. |
1668 | 1674 |
* |
... | ... | |
1671 | 1677 |
public void setFreq(int freq) { |
1672 | 1678 |
// Overwrites the public freq field only; the score field is not recomputed here.
this.freq = freq; |
1673 | 1679 |
} |
1674 |
|
|
1680 |
|
|
1675 | 1681 |
/** |
1676 | 1682 |
* Sets the score. |
1677 | 1683 |
*/ |
1678 | 1684 |
public void setScore() {// FB == freq, R = nbocc |
1679 | 1685 |
// Recomputes score as the plain sum of nbocc (int), freq (int) and distmoyenne (float);
// the addition is carried out in float and the result is then widened to the double score field.
this.score = (this.nbocc + this.freq + this.distmoyenne); |
1680 | 1686 |
} |
1681 |
|
|
1687 |
|
|
1682 | 1688 |
@Override |
1683 | 1689 |
public String toString() { |
1684 | 1690 |
return occ + CooccurrenceCoreMessages.fColon + freq + CooccurrenceCoreMessages.occColon + nbocc + CooccurrenceCoreMessages.scoreColon + score + CooccurrenceCoreMessages.meanDistColon |
1685 | 1691 |
+ distmoyenne + CooccurrenceCoreMessages.propertiesColon + props; |
Formats disponibles : Unified diff