68 |
68 |
import org.txm.utils.logger.Log;
|
69 |
69 |
|
70 |
70 |
/**
|
71 |
|
* Computes an index of a corpus or a partition.
|
|
71 |
* Computes an index of a partition.
|
72 |
72 |
*
|
73 |
73 |
* @author mdecorde
|
|
74 |
* @author sjacquot
|
74 |
75 |
*/
|
75 |
|
public class PartitionIndex extends TXMResult {
|
|
76 |
public class PartitionIndex extends Index {
|
76 |
77 |
|
77 |
|
/** The counts. */
|
78 |
|
protected LinkedHashMap<String, ArrayList<Integer>> counts = new LinkedHashMap<String, ArrayList<Integer>>();
|
79 |
78 |
|
80 |
|
/** The currentpartid. */
|
81 |
|
protected int currentpartid = 0;
|
82 |
|
|
83 |
|
/** The current Fmax value. */
|
84 |
|
protected int Fmax = 0;
|
85 |
|
|
86 |
|
/** The current Fmin value. */
|
87 |
|
protected int Fmin = 9999999;
|
88 |
|
|
89 |
79 |
/** The lexicon used to compute the index (if query == null || query == [] && properties.size() == 1). */
|
90 |
80 |
@Deprecated
|
91 |
81 |
protected CQPLexicon lexicon;
|
... | ... | |
93 |
83 |
@Deprecated
|
94 |
84 |
protected boolean lexiconMode = false;
|
95 |
85 |
|
96 |
|
/** The current lines. */
|
97 |
|
protected List<Line> lines = new ArrayList<Line>();
|
98 |
|
|
99 |
|
/** The total number of matches found by the query, accumulated over the scanned corpora. */
|
100 |
|
protected int nTotalTokens = 0;
|
101 |
|
|
102 |
|
/** The partnames. The corpus name if built with a Corpus or the parts names if built with a Partition */
|
103 |
|
protected List<String> partnames = new ArrayList<String>();
|
104 |
|
|
105 |
|
/** The writer. */
|
106 |
|
// FIXME: should be in an exporter extension
|
107 |
|
@Deprecated
|
108 |
|
private BufferedWriter writer;
|
109 |
|
|
110 |
86 |
/**
|
111 |
|
* Maximum frequency filter value.
|
|
87 |
* Partition part names.
|
112 |
88 |
*/
|
113 |
|
@Parameter(key = TXMPreferences.F_MAX)
|
114 |
|
protected Integer pFmaxFilter;
|
|
89 |
protected List<String> partNames = new ArrayList<>();
|
115 |
90 |
|
116 |
|
/**
|
117 |
|
* Minimum frequency filter value.
|
118 |
|
*/
|
119 |
|
@Parameter(key = TXMPreferences.F_MIN)
|
120 |
|
protected Integer pFminFilter;
|
|
91 |
|
121 |
92 |
|
122 |
93 |
/**
|
123 |
|
* Number of lines to display per page.
|
124 |
|
*/
|
125 |
|
@Parameter(key = TXMPreferences.N_LINES_PER_PAGE)
|
126 |
|
protected Integer pNLinesPerPage;
|
127 |
|
|
128 |
|
/**
|
129 |
|
* The word properties to display.
|
130 |
|
*/
|
131 |
|
@Parameter(key = TXMPreferences.UNIT_PROPERTIES)
|
132 |
|
protected List<WordProperty> pProperties;
|
133 |
|
|
134 |
|
/**
|
135 |
|
* The string used to separate property values.
|
136 |
|
*/
|
137 |
|
@Parameter(key = IndexPreferences.PROPERTIES_SEPARATOR)
|
138 |
|
protected String pPropertiesSeparator;
|
139 |
|
|
140 |
|
/**
|
141 |
|
* The CQP query.
|
142 |
|
*/
|
143 |
|
@Parameter(key = TXMPreferences.QUERY)
|
144 |
|
protected IQuery pQuery;
|
145 |
|
|
146 |
|
/**
|
147 |
|
* The line index of the current index page.
|
148 |
|
*/
|
149 |
|
@Parameter(key = IndexPreferences.N_TOP_INDEX)
|
150 |
|
private Integer pTopIndex;
|
151 |
|
|
152 |
|
/**
|
153 |
|
* The vmax filter value parameter.
|
154 |
|
*/
|
155 |
|
@Parameter(key = TXMPreferences.V_MAX)
|
156 |
|
protected Integer pVmaxFilter;
|
157 |
|
|
158 |
|
/**
|
159 |
94 |
*
|
160 |
95 |
* @param parent
|
161 |
96 |
*/
|
... | ... | |
179 |
114 |
super(parametersNodePath, parent);
|
180 |
115 |
|
181 |
116 |
Partition partition = getParent();
|
182 |
|
partnames.clear();
|
|
117 |
partNames.clear();
|
183 |
118 |
for (Part part : partition.getParts()) {
|
184 |
|
partnames.add(part.getName());
|
|
119 |
partNames.add(part.getName());
|
185 |
120 |
}
|
186 |
121 |
}
|
187 |
122 |
|
|
123 |
@Override
|
188 |
124 |
public Partition getParent() {
|
189 |
125 |
return (Partition) parent;
|
190 |
126 |
}
|
191 |
127 |
|
|
128 |
@Override
|
192 |
129 |
public String getComputingDoneMessage() {
|
193 |
130 |
if (this.lines.isEmpty()) {
|
194 |
131 |
return TXMCoreMessages.common_noResults;
|
... | ... | |
202 |
139 |
protected boolean _compute() throws Exception {
|
203 |
140 |
lines.clear();
|
204 |
141 |
counts.clear();
|
205 |
|
partnames.clear();
|
|
142 |
partNames.clear();
|
206 |
143 |
currentpartid = 0;
|
207 |
144 |
nTotalTokens = 0;
|
208 |
145 |
|
209 |
146 |
Partition partition = getParent();
|
210 |
|
partnames.clear();
|
|
147 |
partNames.clear();
|
211 |
148 |
for (Part part : partition.getParts()) {
|
212 |
149 |
scanCorpus(part);
|
213 |
150 |
currentpartid++;
|
214 |
|
partnames.add(part.getName());
|
|
151 |
partNames.add(part.getName());
|
215 |
152 |
}
|
216 |
153 |
|
217 |
154 |
setLineCounts();
|
... | ... | |
237 |
174 |
|
238 |
175 |
|
239 |
176 |
/**
|
240 |
|
* Creates a CQL query string from the specified Index lines.
|
241 |
|
*
|
242 |
|
* @param lines
|
243 |
|
* @return the query
|
244 |
|
*/
|
245 |
|
public static String createQuery(List<Line> lines) {
|
246 |
|
String query = ""; //$NON-NLS-1$
|
247 |
|
if (lines.size() == 0) {
|
248 |
|
return query;
|
249 |
|
}
|
250 |
|
|
251 |
|
Line line = lines.get(0);
|
252 |
|
int nbToken = line.getUnitsProperties().get(0).size();
|
253 |
|
int nbProps = line.getProperties().size();
|
254 |
|
int nbLines = lines.size();
|
255 |
|
List<Property> props = line.getProperties();
|
256 |
|
for (int t = 0; t < nbToken; t++) {
|
257 |
|
query += "["; //$NON-NLS-1$
|
258 |
|
for (int p = 0; p < nbProps; p++) {
|
259 |
|
if (props.get(p) instanceof StructuralUnitProperty) {
|
260 |
|
query += "_." + ((StructuralUnitProperty) props.get(p)).getFullName() + "=\""; //$NON-NLS-1$ //$NON-NLS-2$
|
261 |
|
}
|
262 |
|
else {
|
263 |
|
query += props.get(p) + "=\""; //$NON-NLS-1$
|
264 |
|
}
|
265 |
|
for (int l = 0; l < nbLines; l++) {
|
266 |
|
line = lines.get(l);
|
267 |
|
List<List<String>> values = line.getUnitsProperties();
|
268 |
|
String s = values.get(p).get(t);
|
269 |
|
s = CQLQuery.addBackSlash(s);
|
270 |
|
query += s + "|"; //$NON-NLS-1$
|
271 |
|
}
|
272 |
|
query = query.substring(0, query.length() - 1);
|
273 |
|
query += "\" & "; //$NON-NLS-1$
|
274 |
|
}
|
275 |
|
query = query.substring(0, query.length() - 3);
|
276 |
|
query += "] "; //$NON-NLS-1$
|
277 |
|
}
|
278 |
|
query = query.substring(0, query.length() - 1);
|
279 |
|
return query;
|
280 |
|
}
|
281 |
|
|
282 |
|
|
283 |
|
/**
|
284 |
|
* Creates a Query list from the specified Index lines.
|
285 |
|
*
|
286 |
|
* @param lines
|
287 |
|
* @return
|
288 |
|
*/
|
289 |
|
public static List<CQLQuery> createQueries(List<Line> lines) {
|
290 |
|
List<CQLQuery> queries = new ArrayList<CQLQuery>();
|
291 |
|
for (Line line : lines) {
|
292 |
|
String query = ""; //$NON-NLS-1$
|
293 |
|
int nbToken = line.getUnitsProperties().get(0).size();
|
294 |
|
int nbProps = line.getProperties().size();
|
295 |
|
List<List<String>> values = line.getUnitsProperties();
|
296 |
|
List<Property> props = line.getProperties();
|
297 |
|
for (int t = 0; t < nbToken; t++) {
|
298 |
|
query += "["; //$NON-NLS-1$
|
299 |
|
for (int p = 0; p < nbProps; p++) {
|
300 |
|
query += props.get(p).getName() + "=\""; //$NON-NLS-1$
|
301 |
|
String s = values.get(p).get(t);
|
302 |
|
s = CQLQuery.addBackSlash(s);
|
303 |
|
query += s;
|
304 |
|
query += "\" & "; //$NON-NLS-1$
|
305 |
|
}
|
306 |
|
query = query.substring(0, query.length() - 3);
|
307 |
|
query += "] "; //$NON-NLS-1$
|
308 |
|
}
|
309 |
|
queries.add(new CQLQuery(query));
|
310 |
|
}
|
311 |
|
return queries;
|
312 |
|
|
313 |
|
}
|
314 |
|
|
315 |
|
|
316 |
|
|
317 |
|
/**
|
318 |
177 |
* Alters the frequencies of the first column of the index using a table stored in the R workspace.
|
319 |
178 |
*
|
320 |
179 |
* @param referenceCorpus the R table variable name
|
... | ... | |
323 |
182 |
* @throws RWorkspaceException
|
324 |
183 |
* @throws REXPMismatchException
|
325 |
184 |
*/
|
|
185 |
@Override
|
326 |
186 |
public boolean alterFrequencies(String referenceCorpus) throws RWorkspaceException, REXPMismatchException {
|
327 |
187 |
String[] ref_forms = RWorkspace.getRWorkspaceInstance().eval("rownames(" + referenceCorpus + ")").asStrings(); //$NON-NLS-1$ //$NON-NLS-2$
|
328 |
188 |
int[] ref_freqs = RWorkspace.getRWorkspaceInstance().eval(referenceCorpus + "[,1]").asIntegers(); //$NON-NLS-1$
|
... | ... | |
330 |
190 |
System.out.println("Cannot alter index frequencies with the '" + referenceCorpus + "' empty table.");
|
331 |
191 |
return false;
|
332 |
192 |
}
|
333 |
|
HashMap<String, Integer> ref_counts = new HashMap<String, Integer>();
|
|
193 |
HashMap<String, Integer> ref_counts = new HashMap<>();
|
334 |
194 |
for (int i = 0; i < ref_forms.length; i++) {
|
335 |
195 |
ref_counts.put(ref_forms[i], ref_freqs[i]);
|
336 |
196 |
}
|
... | ... | |
367 |
227 |
return true;
|
368 |
228 |
}
|
369 |
229 |
|
370 |
|
@Override
|
371 |
|
public boolean saveParameters() {
|
372 |
|
|
373 |
|
this.saveParameter(TXMPreferences.UNIT_PROPERTIES, WordProperty.propertiesToString(pProperties));
|
374 |
|
|
375 |
|
if (pQuery != null) {
|
376 |
|
this.saveParameter(TXMPreferences.QUERY, pQuery.getQueryString());
|
377 |
|
}
|
378 |
|
|
379 |
|
|
380 |
|
return true;
|
381 |
|
}
|
382 |
230 |
|
383 |
|
@Override
|
384 |
|
public boolean loadParameters() {
|
385 |
|
this.pProperties = (List<WordProperty>) Property.stringToProperties(getCorpus(), this.getStringParameterValue(TXMPreferences.UNIT_PROPERTIES));
|
386 |
|
this.pQuery = new CQLQuery(this.getStringParameterValue(TXMPreferences.QUERY));
|
387 |
|
return true;
|
388 |
|
}
|
389 |
|
|
390 |
|
@Override
|
391 |
|
public void clean() {
|
392 |
|
try {
|
393 |
|
if (this.writer != null) {
|
394 |
|
this.writer.flush();
|
395 |
|
this.writer.close();
|
396 |
|
}
|
397 |
|
}
|
398 |
|
catch (IOException e) {
|
399 |
|
org.txm.utils.logger.Log.printStackTrace(e);
|
400 |
|
}
|
401 |
|
}
|
402 |
|
|
403 |
231 |
/**
|
404 |
|
* keep the vmax lines more frequents.
|
405 |
|
*
|
406 |
|
*/
|
407 |
|
public void cut() {
|
408 |
|
if (pVmaxFilter == null) {
|
409 |
|
return;
|
410 |
|
}
|
411 |
|
|
412 |
|
this.acquireSemaphore();
|
413 |
|
|
414 |
|
Log.fine("Cutting Tmax=" + pVmaxFilter);
|
415 |
|
// assume the lines are sorted
|
416 |
|
// int before = lines.size();
|
417 |
|
this.lines = this.lines.subList(0, Math.min(lines.size(), pVmaxFilter));
|
418 |
|
this.updateFminFmax();
|
419 |
|
|
420 |
|
this.releaseSemaphore();
|
421 |
|
}
|
422 |
|
|
423 |
|
/**
|
424 |
232 |
* Equals.
|
425 |
233 |
*
|
426 |
234 |
* @param index the index
|
... | ... | |
439 |
247 |
}
|
440 |
248 |
|
441 |
249 |
|
442 |
|
/**
|
443 |
|
* Removes lines with frequency not in [Fmin,Fmax] range.
|
444 |
|
*
|
445 |
|
**/
|
446 |
|
public void filterLines() {
|
447 |
|
|
448 |
|
if (!(pFminFilter > 0 && pFmaxFilter > 0 && pFminFilter <= pFmaxFilter)) {
|
449 |
|
return;
|
450 |
|
}
|
451 |
|
|
452 |
|
Log.fine("Filtering Fmin = " + pFminFilter + " and Fmax = " + pFmaxFilter); //$NON-NLS-1$ //$NON-NLS-2$
|
453 |
|
|
454 |
|
for (int i = 0; i < lines.size(); i++) { // for each line
|
455 |
|
|
456 |
|
Line line = lines.get(i);
|
457 |
|
int f = line.getFrequency();
|
458 |
|
if (f < pFminFilter) { // if its frequency is not in the interval, remove it
|
459 |
|
|
460 |
|
nTotalTokens -= line.getFrequency();
|
461 |
|
lines.remove(i);
|
462 |
|
i--;
|
463 |
|
continue; // no need to go further the line is removed
|
464 |
|
}
|
465 |
|
if (f > pFmaxFilter) { // if its frequency is not in the interval, remove it
|
466 |
|
|
467 |
|
nTotalTokens -= line.getFrequency();
|
468 |
|
lines.remove(i);
|
469 |
|
i--;
|
470 |
|
}
|
471 |
|
}
|
472 |
|
|
473 |
|
this.updateFminFmax();
|
474 |
|
}
|
|
250 |
|
475 |
251 |
|
476 |
252 |
/**
|
477 |
253 |
* Gets the corpus.
|
478 |
254 |
*
|
479 |
255 |
* @return the corpus
|
480 |
256 |
*/
|
|
257 |
@Override
|
481 |
258 |
public CQPCorpus getCorpus() {
|
482 |
259 |
if (this.parent instanceof CQPCorpus) {
|
483 |
260 |
return (CQPCorpus) this.parent;
|
... | ... | |
517 |
294 |
}
|
518 |
295 |
}
|
519 |
296 |
|
520 |
|
/**
|
521 |
|
* Gets the filter fmax.
|
522 |
|
*
|
523 |
|
* @return the filter fmax
|
524 |
|
*/
|
525 |
|
public Integer getFilterFmax() {
|
526 |
|
return pFmaxFilter;
|
527 |
|
}
|
|
297 |
|
528 |
298 |
|
529 |
299 |
/**
|
530 |
|
* Gets the filter fmin.
|
531 |
|
*
|
532 |
|
* @return the filter fmin
|
533 |
|
*/
|
534 |
|
public Integer getFilterFmin() {
|
535 |
|
return pFminFilter;
|
536 |
|
}
|
537 |
|
|
538 |
|
/**
|
539 |
|
* Gets the filter vmax.
|
540 |
|
*
|
541 |
|
* @return the filter vmax
|
542 |
|
*/
|
543 |
|
public Integer getFilterVmax() {
|
544 |
|
return pVmaxFilter;
|
545 |
|
}
|
546 |
|
|
547 |
|
/**
|
548 |
|
* Gets the fmax.
|
549 |
|
*
|
550 |
|
* @return the fmax
|
551 |
|
*/
|
552 |
|
public int getFmax() {
|
553 |
|
return Fmax;
|
554 |
|
}
|
555 |
|
|
556 |
|
/**
|
557 |
|
* Gets the fmin.
|
558 |
|
*
|
559 |
|
* @return the fmin
|
560 |
|
*/
|
561 |
|
public int getFmin() {
|
562 |
|
return Fmin;
|
563 |
|
}
|
564 |
|
|
565 |
|
/**
|
566 |
300 |
* If the index has been build with corpus + property, the method returns the lexicon used.
|
567 |
301 |
*
|
568 |
302 |
* @return the lexicon
|
... | ... | |
575 |
309 |
return this.lexiconMode;
|
576 |
310 |
}
|
577 |
311 |
|
578 |
|
/**
|
579 |
|
* return the lines from le "start"th one to the "end"th one.
|
580 |
|
*
|
581 |
|
* @param start the start
|
582 |
|
* @param end the end
|
583 |
|
* @return the lines
|
584 |
|
*/
|
585 |
|
public List<Line> getLines(int start, int end) {
|
586 |
|
// long time = System.currentTimeMillis();
|
587 |
|
List<Line> selectedLines = new ArrayList<Line>();
|
588 |
|
if (lines.size() > 0) {
|
589 |
|
start = Math.max(0, start);
|
590 |
|
end = Math.min(end, lines.size());
|
591 |
|
selectedLines = lines.subList(start, end);
|
592 |
|
|
593 |
|
int p = 0;
|
594 |
|
// for each property get the string values of the tokens
|
595 |
|
for (Property property : pProperties) {
|
596 |
|
|
597 |
|
int len = 0;
|
598 |
|
for (Line l : selectedLines) {
|
599 |
|
len += l.UnitsIds.get(p).length;
|
600 |
|
}
|
601 |
|
|
602 |
|
int[] indices = new int[len]; // build the array of indices
|
603 |
|
len = 0;
|
604 |
|
for (Line l : selectedLines) {
|
605 |
|
int[] ids = l.UnitsIds.get(p);
|
606 |
|
System.arraycopy(ids, 0, indices, len, ids.length);
|
607 |
|
len += ids.length;
|
608 |
|
}
|
609 |
|
String[] strs = null;
|
610 |
|
try {
|
611 |
|
if (property instanceof StructuralUnitProperty) {
|
612 |
|
strs = CorpusManager.getCorpusManager().getCqiClient().struc2Str(property.getQualifiedName(), indices);
|
613 |
|
}
|
614 |
|
else {
|
615 |
|
strs = CorpusManager.getCorpusManager().getCqiClient().id2Str(property.getQualifiedName(), indices);
|
616 |
|
}
|
617 |
|
}
|
618 |
|
catch (Exception e) {
|
619 |
|
org.txm.utils.logger.Log.printStackTrace(e);
|
620 |
|
return null;
|
621 |
|
}
|
622 |
|
len = 0;
|
623 |
|
for (Line l : selectedLines) {
|
624 |
|
int[] ids = l.UnitsIds.get(p);
|
625 |
|
String[] lstr = new String[ids.length];
|
626 |
|
System.arraycopy(strs, len, lstr, 0, ids.length);
|
627 |
|
if (l.UnitsProperty.size() == pProperties.size()) continue; // the line is already initialized
|
628 |
|
l.put(property, Arrays.asList(lstr));
|
629 |
|
len += ids.length;
|
630 |
|
}
|
631 |
|
p++;
|
632 |
|
}
|
633 |
|
}
|
634 |
|
// System.out.println("Time get lines "+(System.currentTimeMillis()-time));
|
635 |
|
return selectedLines;
|
636 |
|
}
|
|
312 |
|
637 |
313 |
|
638 |
|
|
639 |
314 |
/**
|
640 |
|
* return all the lines of the index.
|
641 |
|
*
|
642 |
|
* @return the all lines
|
643 |
|
*/
|
644 |
|
public List<Line> getAllLines() {
|
645 |
|
return getLines(0, lines.size());
|
646 |
|
}
|
647 |
|
|
648 |
|
/**
|
649 |
315 |
* update the lines counts.
|
650 |
316 |
*/
|
|
317 |
@Override
|
651 |
318 |
protected void setLineCounts() {
|
652 |
319 |
for (Line line : lines) {// for each Line set its count
|
653 |
|
int[] c = new int[partnames.size()];
|
654 |
|
for (int i = 0; i < partnames.size(); i++) {
|
|
320 |
int[] c = new int[partNames.size()];
|
|
321 |
for (int i = 0; i < partNames.size(); i++) {
|
655 |
322 |
if (counts.get(line.getSignature()).size() <= i) {
|
656 |
323 |
c[i] = 0;
|
657 |
324 |
}
|
... | ... | |
673 |
340 |
return lexicon.getName();
|
674 |
341 |
}
|
675 |
342 |
else {
|
676 |
|
return IndexCoreMessages.RESULT_TYPE + ": " + this.parent.getSimpleName() + ": " + this.getSimpleName();
|
|
343 |
return IndexCoreMessages.PARTITION_RESULT_TYPE + ": " + this.parent.getSimpleName() + ": " + this.getSimpleName();
|
677 |
344 |
}
|
678 |
345 |
}
|
679 |
346 |
catch (Exception e) {
|
... | ... | |
686 |
353 |
*
|
687 |
354 |
* @return the number of lines per page
|
688 |
355 |
*/
|
|
356 |
@Override
|
689 |
357 |
public Integer getNLinesPerPage() {
|
690 |
358 |
return pNLinesPerPage;
|
691 |
359 |
}
|
... | ... | |
711 |
379 |
* @return the part names
|
712 |
380 |
*/
|
713 |
381 |
public List<String> getPartnames() {
|
714 |
|
return partnames;
|
|
382 |
return partNames;
|
715 |
383 |
}
|
716 |
384 |
|
717 |
|
/**
|
718 |
|
* Gets the properties.
|
719 |
|
*
|
720 |
|
* @return the properties
|
721 |
|
*/
|
722 |
|
public List<WordProperty> getProperties() {
|
723 |
|
return this.pProperties;
|
724 |
|
}
|
|
385 |
|
725 |
386 |
|
726 |
|
|
727 |
|
/**
|
728 |
|
*
|
729 |
|
* @return
|
730 |
|
*/
|
731 |
|
public String getPropertySeparator() {
|
732 |
|
return pPropertiesSeparator;
|
733 |
|
}
|
734 |
|
|
735 |
|
/**
|
736 |
|
* Gets the query.
|
737 |
|
*
|
738 |
|
* @return the query used
|
739 |
|
*/
|
740 |
|
public IQuery getQuery() {
|
741 |
|
return pQuery;
|
742 |
|
}
|
743 |
|
|
744 |
387 |
@Override
|
745 |
388 |
public String getSimpleName() {
|
746 |
389 |
if (lexicon != null) {
|
... | ... | |
766 |
409 |
}
|
767 |
410 |
}
|
768 |
411 |
|
769 |
|
/**
|
770 |
|
* Gets the number of tokens found.
|
771 |
|
*
|
772 |
|
* @return the number of tokens returned by the selection
|
773 |
|
*/
|
774 |
|
public int getT() {
|
775 |
|
return nTotalTokens;
|
776 |
|
}
|
|
412 |
|
777 |
413 |
|
778 |
414 |
/**
|
779 |
|
*
|
780 |
|
* @return
|
781 |
|
*/
|
782 |
|
public int getTopIndex() {
|
783 |
|
return pTopIndex;
|
784 |
|
}
|
785 |
|
|
786 |
|
/**
|
787 |
|
* Gets the v.
|
788 |
|
*
|
789 |
|
* @return the number of entries in the index
|
790 |
|
*/
|
791 |
|
public int getV() {
|
792 |
|
return lines.size();
|
793 |
|
}
|
794 |
|
|
795 |
|
/**
|
796 |
415 |
* Tell if the index has been computed with a partition or not.
|
797 |
416 |
*
|
798 |
417 |
* @return true, if is computed with partition
|
... | ... | |
802 |
421 |
}
|
803 |
422 |
|
804 |
423 |
/**
|
805 |
|
* count tokens.
|
806 |
|
*
|
807 |
|
* @param corpus the corpus to scan
|
808 |
|
* @return true, if successful
|
809 |
|
* @throws CqiClientException
|
810 |
|
* @throws CqiServerError
|
811 |
|
* @throws IOException
|
812 |
|
*/
|
813 |
|
protected boolean scanCorpus(CQPCorpus corpus) throws Exception {
|
814 |
|
// get the cqp result of the query
|
815 |
|
|
816 |
|
// long time = System.currentTimeMillis();
|
817 |
|
Selection result = null;
|
818 |
|
if (pQuery instanceof CQLQuery) {
|
819 |
|
result = corpus.query((CQLQuery) pQuery, "index", true); //$NON-NLS-1$
|
820 |
|
}
|
821 |
|
else {
|
822 |
|
result = pQuery.getSearchEngine().query(corpus, pQuery, "index", true); //$NON-NLS-1$
|
823 |
|
}
|
824 |
|
boolean isTargetUsed = result.isTargetUsed();
|
825 |
|
int nbresults = result.getNMatch();
|
826 |
|
this.nTotalTokens += nbresults; // get number of tokens
|
827 |
|
|
828 |
|
// System.out.println("nLines : "+nLines);
|
829 |
|
List<? extends Match> matches = null;
|
830 |
|
if (nbresults > 0) {
|
831 |
|
matches = result.getMatches(0, nbresults - 1); // get the indexes sequences of result's tokens
|
832 |
|
}
|
833 |
|
else {
|
834 |
|
matches = new ArrayList<Match>();
|
835 |
|
}
|
836 |
|
// count matches
|
837 |
|
// time = System.currentTimeMillis();
|
838 |
|
List<Integer> allpositions = new ArrayList<Integer>();
|
839 |
|
for (int j = 0; j < nbresults; j++) {
|
840 |
|
Match match = matches.get(j);
|
841 |
|
// beginingOfKeywordsPositions.add(match.getStart()); // get the
|
842 |
|
// first index
|
843 |
|
// lengthOfKeywords.add(match.size());// get the last index
|
844 |
|
if (isTargetUsed) {
|
845 |
|
allpositions.add(match.getTarget());
|
846 |
|
}
|
847 |
|
else {
|
848 |
|
for (int i = match.getStart(); i <= match.getEnd(); i++) {
|
849 |
|
allpositions.add(i);
|
850 |
|
}
|
851 |
|
}
|
852 |
|
}
|
853 |
|
// System.out.println("get string data");
|
854 |
|
// time = System.currentTimeMillis();
|
855 |
|
// for (Property property : props) {// for each property get the
|
856 |
|
// string values of the tokens
|
857 |
|
// keywordsViewPropValues.put(property,
|
858 |
|
// cache.get(property).getData(beginingOfKeywordsPositions,
|
859 |
|
// lengthOfKeywords));
|
860 |
|
// }
|
861 |
|
// System.out.println("took "+(System.currentTimeMillis()-time));
|
862 |
|
|
863 |
|
// System.out.println("get count data");
|
864 |
|
|
865 |
|
int[] allpositionsarray = new int[allpositions.size()];
|
866 |
|
int pcount = 0;
|
867 |
|
for (int p : allpositions) {
|
868 |
|
allpositionsarray[pcount++] = p;
|
869 |
|
}
|
870 |
|
|
871 |
|
// time = System.currentTimeMillis();
|
872 |
|
HashMap<Property, int[]> propsId = new HashMap<Property, int[]>();
|
873 |
|
for (Property property : pProperties) {
|
874 |
|
try {
|
875 |
|
if (property instanceof StructuralUnitProperty) {
|
876 |
|
int[] structs = CorpusManager.getCorpusManager().getCqiClient().cpos2Struc(property.getQualifiedName(), allpositionsarray);
|
877 |
|
propsId.put(property, structs);
|
878 |
|
}
|
879 |
|
else {
|
880 |
|
int[] indices = CorpusManager.getCorpusManager().getCqiClient().cpos2Id(property.getQualifiedName(), allpositionsarray);
|
881 |
|
propsId.put(property, indices);
|
882 |
|
}
|
883 |
|
}
|
884 |
|
catch (Exception e) {
|
885 |
|
org.txm.utils.logger.Log.printStackTrace(e);
|
886 |
|
result.drop();
|
887 |
|
return false;
|
888 |
|
}
|
889 |
|
}
|
890 |
|
// System.out.println("Time recup indices "+(System.currentTimeMillis()-time));
|
891 |
|
int currentIndex = 0;
|
892 |
|
// time = System.currentTimeMillis();
|
893 |
|
for (int i = 0; i < nbresults; i++) {
|
894 |
|
Line line = new Line();
|
895 |
|
Match match = matches.get(i);
|
896 |
|
int size = match.size();
|
897 |
|
if (isTargetUsed) {
|
898 |
|
size = 1;
|
899 |
|
}
|
900 |
|
for (int p = 0; p < pProperties.size(); p++) {
|
901 |
|
Property property = pProperties.get(p);
|
902 |
|
int[] allprosids = propsId.get(property);
|
903 |
|
int[] ids = new int[size];
|
904 |
|
System.arraycopy(allprosids, currentIndex, ids, 0, size);
|
905 |
|
line.putIds(property, ids);
|
906 |
|
}
|
907 |
|
currentIndex += size;
|
908 |
|
|
909 |
|
String signature = line.getSignature();
|
910 |
|
|
911 |
|
// if the counts contains the signature, increment its corresponding value
|
912 |
|
if (counts.containsKey(signature)) {
|
913 |
|
while (counts.get(signature).size() <= currentpartid) {
|
914 |
|
counts.get(signature).add(0);
|
915 |
|
}
|
916 |
|
int c = counts.get(signature).get(currentpartid) + 1;
|
917 |
|
counts.get(signature).set(currentpartid, c);
|
918 |
|
}
|
919 |
|
// else initialize count of the signature to 1
|
920 |
|
else {
|
921 |
|
// System.out.println("add new sign "+signature+" of line "+line.toString());
|
922 |
|
ArrayList<Integer> tmp = new ArrayList<Integer>();
|
923 |
|
for (int j = 0; j < currentpartid + 1; j++) {
|
924 |
|
tmp.add(0);
|
925 |
|
}
|
926 |
|
counts.put(signature, tmp);
|
927 |
|
counts.get(signature).set(currentpartid, 1);
|
928 |
|
|
929 |
|
lines.add(line);
|
930 |
|
}
|
931 |
|
}
|
932 |
|
result.drop();
|
933 |
|
// System.out.println("Time count lines "+(System.currentTimeMillis()-time));
|
934 |
|
// System.out.println("took "+(System.currentTimeMillis()-time));
|
935 |
|
return true;
|
936 |
|
|
937 |
|
}
|
|
424 |
|
938 |
425 |
|
939 |
426 |
/**
|
940 |
427 |
*
|
941 |
|
* @param nLinesPerPage
|
942 |
|
*/
|
943 |
|
public void setNLinesPerPage(int nLinesPerPage) {
|
944 |
|
this.pNLinesPerPage = Math.max(nLinesPerPage, 1);
|
945 |
|
}
|
946 |
|
|
947 |
|
/**
|
948 |
|
*
|
949 |
428 |
* @param props
|
950 |
429 |
*/
|
|
430 |
@Override
|
951 |
431 |
public void setParameters(List<WordProperty> props) {
|
952 |
432 |
this.pQuery = new CQLQuery("[]"); //$NON-NLS-1$
|
953 |
433 |
this.pProperties = props;
|
954 |
434 |
this.lexicon = null;
|
955 |
435 |
}
|
956 |
436 |
|
957 |
|
/**
|
958 |
|
* Sets the query.
|
959 |
|
*
|
960 |
|
* @param query
|
961 |
|
*/
|
962 |
|
public void setQuery(CQLQuery query) {
|
963 |
|
this.pQuery = query;
|
964 |
|
}
|
965 |
437 |
|
966 |
|
/**
|
967 |
|
* Sets the properties.
|
968 |
|
*
|
969 |
|
* @param properties
|
970 |
|
*/
|
971 |
|
public void setProperties(List<WordProperty> properties) {
|
972 |
|
this.pProperties = properties;
|
973 |
|
}
|
974 |
438 |
|
975 |
|
/**
|
976 |
|
* Sets the property.
|
977 |
|
* Clears all existing properties.
|
978 |
|
*
|
979 |
|
* @param property
|
980 |
|
*/
|
981 |
|
public void setProperty(WordProperty property) {
|
982 |
|
List<WordProperty> properties = new ArrayList<WordProperty>();
|
983 |
|
properties.add(property);
|
984 |
|
this.setProperties(properties);
|
985 |
|
}
|
|
439 |
|
986 |
440 |
|
987 |
|
|
988 |
|
|
989 |
|
public void setParameters(CQLQuery query, List<WordProperty> props, Integer filterFmin, Integer filterFmax, Integer filterVmax, Integer nLinesPerPage) {
|
990 |
|
this.pQuery = query;
|
991 |
|
this.pProperties = props;
|
992 |
|
if (filterFmax != null) this.pFmaxFilter = filterFmax;
|
993 |
|
if (filterFmin != null) this.pFminFilter = filterFmin;
|
994 |
|
if (filterVmax != null) this.pVmaxFilter = filterVmax;
|
995 |
|
if (nLinesPerPage != null) this.pNLinesPerPage = nLinesPerPage;
|
996 |
|
}
|
997 |
|
|
998 |
|
@Override
|
999 |
|
public boolean setParameters(TXMParameters parameters) {
|
1000 |
|
try {
|
1001 |
|
List<WordProperty> props = (List<WordProperty>) parameters.get("props"); //$NON-NLS-1$
|
1002 |
|
CQLQuery query = (CQLQuery) parameters.get("query"); //$NON-NLS-1$
|
1003 |
|
Integer filterFmin = (Integer) parameters.get("filterFmin"); //$NON-NLS-1$
|
1004 |
|
Integer filterFmax = (Integer) parameters.get("filterFmax"); //$NON-NLS-1$
|
1005 |
|
Integer filterVmax = (Integer) parameters.get("filterVmax"); //$NON-NLS-1$
|
1006 |
|
Integer nLinesPerPage = (Integer) parameters.get("nLinesPerPage"); //$NON-NLS-1$
|
1007 |
|
|
1008 |
|
this.setParameters(query, props, filterFmin, filterFmax, filterVmax, nLinesPerPage);
|
1009 |
|
}
|
1010 |
|
catch (Exception e) {
|
1011 |
|
Log.severe("Error while setting parameters of Index: " + e.getLocalizedMessage()); //$NON-NLS-1$
|
1012 |
|
Log.printStackTrace(e);
|
1013 |
|
return false;
|
1014 |
|
}
|
1015 |
|
return true;
|
1016 |
|
}
|
1017 |
|
|
1018 |
|
|
1019 |
|
public void setTopLine(int i) {
|
1020 |
|
pTopIndex = Math.max(i, 0);
|
1021 |
|
}
|
1022 |
|
|
1023 |
|
public void setVMax(int maxFilter) {
|
1024 |
|
this.pVmaxFilter = maxFilter;
|
1025 |
|
}
|
1026 |
|
|
|
441 |
|
|
442 |
|
1027 |
443 |
/**
|
1028 |
|
* Sort lines.
|
1029 |
|
*
|
1030 |
|
* @param mode the mode
|
1031 |
|
* @param reverse the reverse
|
1032 |
|
*/
|
1033 |
|
public void sortLines(LineComparator.SortMode mode, boolean reverse) {
|
1034 |
|
LineComparator lc = new LineComparator(mode, reverse);
|
1035 |
|
lc.initialize(this.getCorpus());
|
1036 |
|
Collections.sort(lines, lc);
|
1037 |
|
this.pTopIndex = 0; // return to the first page
|
1038 |
|
}
|
1039 |
|
|
1040 |
|
/**
|
1041 |
|
* dump the index in the console.
|
1042 |
|
*
|
1043 |
|
* @throws CqiClientException the cqi client exception
|
1044 |
|
* @throws IOException Signals that an I/O exception has occurred.
|
1045 |
|
*/
|
1046 |
|
public void toConsole() throws CqiClientException, IOException {
|
1047 |
|
System.out.println(NLS.bind(IndexCoreMessages.consoleColonP0, (lines.size())));
|
1048 |
|
toConsole(0, lines.size());
|
1049 |
|
}
|
1050 |
|
|
1051 |
|
/**
|
1052 |
444 |
* dump a part of the index.
|
1053 |
445 |
*
|
1054 |
446 |
* @param from the from
|
... | ... | |
1056 |
448 |
* @throws CqiClientException the cqi client exception
|
1057 |
449 |
* @throws IOException Signals that an I/O exception has occurred.
|
1058 |
450 |
*/
|
|
451 |
@Override
|
1059 |
452 |
public void toConsole(int from, int to) throws CqiClientException, IOException {
|
1060 |
453 |
|
1061 |
454 |
String header = ""; //$NON-NLS-1$
|
... | ... | |
1064 |
457 |
}
|
1065 |
458 |
header = header.substring(0, header.length() - 1);
|
1066 |
459 |
header += "\tF"; //$NON-NLS-1$
|
1067 |
|
if (partnames.size() > 1) {
|
1068 |
|
for (int j = 0; j < partnames.size(); j++) {
|
1069 |
|
header += "\t" + partnames.get(j); //$NON-NLS-1$
|
|
460 |
if (partNames.size() > 1) {
|
|
461 |
for (int j = 0; j < partNames.size(); j++) {
|
|
462 |
header += "\t" + partNames.get(j); //$NON-NLS-1$
|
1070 |
463 |
}
|
1071 |
464 |
}
|
1072 |
465 |
|
... | ... | |
1074 |
467 |
for (int i = from; i < to; i++) {
|
1075 |
468 |
Line ligne = lines.get(i);
|
1076 |
469 |
System.out.print(ligne + "\t" + ligne.getFrequency()); //$NON-NLS-1$
|
1077 |
|
if (partnames.size() > 1) {
|
1078 |
|
for (int j = 0; j < partnames.size(); j++) {
|
|
470 |
if (partNames.size() > 1) {
|
|
471 |
for (int j = 0; j < partNames.size(); j++) {
|
1079 |
472 |
System.out.print("\t" + ligne.getFrequency(j)); //$NON-NLS-1$
|
1080 |
473 |
}
|
1081 |
474 |
}
|
... | ... | |
1097 |
490 |
* @throws CqiClientException the cqi client exception
|
1098 |
491 |
* @throws IOException Signals that an I/O exception has occurred.
|
1099 |
492 |
*/
|
|
493 |
@Override
|
1100 |
494 |
// FIXME: should be in an exporter extension
|
1101 |
495 |
@Deprecated
|
1102 |
496 |
public void toTxt(File outfile, int from, int to, String encoding, String colseparator, String txtseparator)
|
... | ... | |
1110 |
504 |
}
|
1111 |
505 |
header = txtseparator + header.substring(0, header.length() - 1) + txtseparator;
|
1112 |
506 |
header += colseparator + txtseparator + "F" + txtseparator; //$NON-NLS-1$
|
1113 |
|
if (partnames.size() > 1) {
|
1114 |
|
for (int j = 0; j < partnames.size(); j++) {
|
1115 |
|
header += colseparator + txtseparator + partnames.get(j).replace(txtseparator, txtseparator + txtseparator) + txtseparator;
|
|
507 |
if (partNames.size() > 1) {
|
|
508 |
for (int j = 0; j < partNames.size(); j++) {
|
|
509 |
header += colseparator + txtseparator + partNames.get(j).replace(txtseparator, txtseparator + txtseparator) + txtseparator;
|
1116 |
510 |
}
|
1117 |
511 |
}
|
1118 |
512 |
header += "\n"; //$NON-NLS-1$
|
... | ... | |
1122 |
516 |
for (int i = from; i < to; i++) {
|
1123 |
517 |
Line ligne = lines.get(i);
|
1124 |
518 |
writer.write(txtseparator + ligne.toString().replace(txtseparator, txtseparator + txtseparator) + txtseparator + colseparator + ligne.getFrequency());
|
1125 |
|
if (partnames.size() > 1) {
|
1126 |
|
for (int j = 0; j < partnames.size(); j++) {
|
|
519 |
if (partNames.size() > 1) {
|
|
520 |
for (int j = 0; j < partNames.size(); j++) {
|
1127 |
521 |
writer.write(colseparator + ligne.getFrequency(j));
|
1128 |
522 |
}
|
1129 |
523 |
}
|
... | ... | |
1133 |
527 |
writer.close();
|
1134 |
528 |
}
|
1135 |
529 |
|
1136 |
|
/**
|
1137 |
|
* Write all the lines on a writer.
|
1138 |
|
*
|
1139 |
|
* @param outfile the outfile
|
1140 |
|
* @param encoding the encoding
|
1141 |
|
* @param colseparator the colseparator
|
1142 |
|
* @param txtseparator the txtseparator
|
1143 |
|
* @return true, if successful
|
1144 |
|
*/
|
1145 |
|
// FIXME: should be in an exporter extension
|
1146 |
|
@Deprecated
|
1147 |
|
public boolean toTxt(File outfile, String encoding, String colseparator, String txtseparator) {
|
1148 |
|
try {
|
1149 |
|
toTxt(outfile, 0, lines.size(), encoding, colseparator, txtseparator);
|
1150 |
|
}
|
1151 |
|
catch (Exception e) {
|
1152 |
|
Log.severe(TXMCoreMessages.bind(IndexCoreMessages.error_failedToExportLexiconColonP0, Log.toString(e)));
|
1153 |
|
return false;
|
1154 |
|
}
|
1155 |
|
return true;
|
1156 |
|
}
|
|
530 |
|
1157 |
531 |
|
1158 |
|
/**
|
1159 |
|
* checks all lines to update Fmin and Fmax.
|
1160 |
|
*/
|
1161 |
|
private void updateFminFmax() {
|
1162 |
|
Fmin = Integer.MAX_VALUE;
|
1163 |
|
Fmax = Integer.MIN_VALUE;
|
1164 |
|
for (int i = 0; i < lines.size(); i++) {
|
1165 |
|
Line line = lines.get(i);
|
1166 |
|
int f = line.getFrequency();
|
1167 |
|
if (f < Fmin) {
|
1168 |
|
Fmin = f;
|
1169 |
|
}
|
1170 |
|
if (f > Fmax) {
|
1171 |
|
Fmax = f;
|
1172 |
|
}
|
1173 |
|
}
|
1174 |
|
}
|
1175 |
|
|
1176 |
532 |
@Override
|
1177 |
533 |
public String getResultType() {
|
1178 |
534 |
return PartitionIndex.class.getSimpleName();
|