|
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
|
4 |
// Sophia Antipolis, University of Paris 3.
|
|
5 |
//
|
|
6 |
// The TXM platform is free software: you can redistribute it
|
|
7 |
// and/or modify it under the terms of the GNU General Public
|
|
8 |
// License as published by the Free Software Foundation,
|
|
9 |
// either version 2 of the License, or (at your option) any
|
|
10 |
// later version.
|
|
11 |
//
|
|
12 |
// The TXM platform is distributed in the hope that it will be
|
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
|
15 |
// PURPOSE. See the GNU General Public License for more
|
|
16 |
// details.
|
|
17 |
//
|
|
18 |
// You should have received a copy of the GNU General
|
|
19 |
// Public License along with the TXM platform. If not, see
|
|
20 |
// http://www.gnu.org/licenses.
|
|
21 |
//
|
|
22 |
//
|
|
23 |
//
|
|
24 |
// $LastChangedDate: 2016-10-21 11:09:43 +0200 (Fri, 21 Oct 2016) $
|
|
25 |
// $LastChangedRevision: 3323 $
|
|
26 |
// $LastChangedBy: mdecorde $
|
|
27 |
//
|
|
28 |
package org.txm.functions.index;
|
|
29 |
|
|
30 |
import java.io.BufferedWriter;
|
|
31 |
import java.io.File;
|
|
32 |
import java.io.FileOutputStream;
|
|
33 |
import java.io.IOException;
|
|
34 |
import java.io.OutputStreamWriter;
|
|
35 |
import java.util.ArrayList;
|
|
36 |
import java.util.Arrays;
|
|
37 |
import java.util.Collections;
|
|
38 |
import java.util.HashMap;
|
|
39 |
import java.util.HashSet;
|
|
40 |
import java.util.LinkedHashMap;
|
|
41 |
import java.util.List;
|
|
42 |
|
|
43 |
import org.apache.commons.lang.StringUtils;
|
|
44 |
import org.txm.HasResults;
|
|
45 |
import org.txm.Messages;
|
|
46 |
import org.txm.Toolbox;
|
|
47 |
import org.txm.functions.Function;
|
|
48 |
import org.txm.functions.TXMResult;
|
|
49 |
import org.txm.searchengine.cqp.ICqiClient;
|
|
50 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
|
|
51 |
import org.txm.searchengine.cqp.corpus.Corpus;
|
|
52 |
import org.txm.searchengine.cqp.corpus.CorpusManager;
|
|
53 |
import org.txm.searchengine.cqp.corpus.Lexicon;
|
|
54 |
import org.txm.searchengine.cqp.corpus.Part;
|
|
55 |
import org.txm.searchengine.cqp.corpus.Partition;
|
|
56 |
import org.txm.searchengine.cqp.corpus.Property;
|
|
57 |
import org.txm.searchengine.cqp.corpus.QueryResult;
|
|
58 |
import org.txm.searchengine.cqp.corpus.StructuralUnitProperty;
|
|
59 |
import org.txm.searchengine.cqp.corpus.query.Match;
|
|
60 |
import org.txm.searchengine.cqp.corpus.query.Query;
|
|
61 |
import org.txm.searchengine.cqp.serverException.CqiServerError;
|
|
62 |
import org.txm.stat.engine.r.RWorkspace;
|
|
63 |
import org.txm.stat.engine.r.RWorkspaceException;
|
|
64 |
import org.txm.utils.logger.Log;
|
|
65 |
|
|
66 |
// TODO: Auto-generated Javadoc
|
|
67 |
/**
|
|
68 |
* compute an index of a corpus or a partition
|
|
69 |
*
|
|
70 |
* may contain a LexicalTable built from its values
|
|
71 |
* @author mdecorde.
|
|
72 |
*/
|
|
73 |
public class Index extends Function implements TXMResult, HasResults {
|
|
74 |
|
|
75 |
/** The partition. */
|
|
76 |
protected Partition partition;
|
|
77 |
|
|
78 |
/** The corpus. */
|
|
79 |
protected Corpus corpus;
|
|
80 |
|
|
81 |
/** The query. */
|
|
82 |
protected Query query;
|
|
83 |
|
|
84 |
/** The props. */
|
|
85 |
protected List<Property> props;
|
|
86 |
|
|
87 |
/** The partnames. */
|
|
88 |
protected List<String> partnames = new ArrayList<String>();
|
|
89 |
|
|
90 |
/** Index of the part currently being scanned; selects the slot updated in {@code counts}. */
|
|
91 |
protected int currentpartid = 0;
|
|
92 |
|
|
93 |
/** True once the lines have been sorted, filtered or cut after the computation. */
|
|
94 |
boolean altered = false;
|
|
95 |
|
|
96 |
/** Total number of matches (or lexicon tokens) counted; returned by {@code getT()}. */
|
|
97 |
protected int nLines = 0;
|
|
98 |
|
|
99 |
/** The highest line frequency currently present in the index. */
|
|
100 |
protected int Fmax = 0;
|
|
101 |
|
|
102 |
/** The lowest line frequency currently present in the index. */
|
|
103 |
protected int Fmin = 9999999;
|
|
104 |
|
|
105 |
/** Maximum number of lines requested by the last call to {@code cut(int)}. */
|
|
106 |
protected int FilterVmax = 999999999;
|
|
107 |
|
|
108 |
/** Lower frequency bound given to the last call to {@code filterLines(int, int)}. */
|
|
109 |
protected int FilterFmin = 1;
|
|
110 |
|
|
111 |
/** Upper frequency bound given to the last call to {@code filterLines(int, int)}. */
|
|
112 |
protected int FilterFmax = 999999999;
|
|
113 |
|
|
114 |
/** The prop separator. */
|
|
115 |
protected String propSeparator = "_"; // the separator of properties values //$NON-NLS-1$
|
|
116 |
|
|
117 |
/** Occurrence counts per line signature; the position in each list is the part index. */
|
|
118 |
protected LinkedHashMap<String, ArrayList<Integer>> counts = new LinkedHashMap<String, ArrayList<Integer>>();
|
|
119 |
|
|
120 |
/** The lines. */
|
|
121 |
protected List<Line> lines = new ArrayList<Line>();
|
|
122 |
|
|
123 |
/** proxy to get the string values of cqp tokens. */
|
|
124 |
//private Map<Property, CqpDataProxy> cache;
|
|
125 |
|
|
126 |
/** The symbol. */
|
|
127 |
private String symbol;
|
|
128 |
|
|
129 |
/**
 * Builds the index of a whole corpus from the lexicon of one property.
 *
 * <p>The work is delegated to {@code init}, which logs any exception instead
 * of propagating it, so this constructor itself never fails on lexicon errors.
 *
 * @param corpus the corpus to index
 * @param property the property whose lexicon provides the lines
 */
public Index(Corpus corpus, Property property) {
    this.init(corpus, property);
    // a freshly computed index has not been sorted, filtered or cut yet
    this.altered = false;
}
|
|
139 |
|
|
140 |
/** The lexicon. */
|
|
141 |
protected Lexicon lexicon;
|
|
142 |
|
|
143 |
/** Writer of the export in progress; kept as a field so an interrupted export can be cleaned up. */
|
|
144 |
private BufferedWriter writer;
|
|
145 |
|
|
146 |
/**
|
|
147 |
* Inits the Index using a lexicon.
|
|
148 |
*
|
|
149 |
* @param corpus the corpus
|
|
150 |
* @param property the property
|
|
151 |
*/
|
|
152 |
private void init(Corpus corpus, Property property) {
|
|
153 |
try {
|
|
154 |
partnames.add(corpus.getName());
|
|
155 |
this.query = new Query("[]"); //$NON-NLS-1$
|
|
156 |
this.corpus = corpus;
|
|
157 |
this.props = new ArrayList<Property>();
|
|
158 |
this.props.add(property);
|
|
159 |
|
|
160 |
lexicon = corpus.getLexicon(property);
|
|
161 |
this.nLines = lexicon.nbrOfToken();
|
|
162 |
String[] forms = lexicon.getForms();
|
|
163 |
int[] freqs = lexicon.getFreq();
|
|
164 |
int[] ids = lexicon.getIds();
|
|
165 |
this.lines = new ArrayList<Line>();
|
|
166 |
int nbOfToken = lexicon.nbrOfToken();
|
|
167 |
if (nbOfToken > 0) // no lines
|
|
168 |
for (int i = 0; i < forms.length && i < freqs.length; i++) {
|
|
169 |
Line l = new Line();
|
|
170 |
List<String> values = new ArrayList<String>(1);
|
|
171 |
values.add(forms[i]);
|
|
172 |
l.put(property, values);
|
|
173 |
int[] c = new int[1];
|
|
174 |
c[0] = freqs[i];
|
|
175 |
l.setCounts(c, freqs[i] / lexicon.nbrOfToken());
|
|
176 |
l.putIds(property, new int[] { ids[i] });
|
|
177 |
|
|
178 |
lines.add(l);
|
|
179 |
|
|
180 |
if (Fmin > freqs[i])
|
|
181 |
Fmin = freqs[i];
|
|
182 |
if (Fmax < freqs[i])
|
|
183 |
Fmax = freqs[i];
|
|
184 |
|
|
185 |
// getAllLines();
|
|
186 |
}
|
|
187 |
} catch (Exception e) {
|
|
188 |
// TODO Auto-generated catch block
|
|
189 |
org.txm.utils.logger.Log.printStackTrace(e);
|
|
190 |
}
|
|
191 |
}
|
|
192 |
|
|
193 |
/**
 * Computes an index given a corpus, a query and the analysis properties.
 *
 * <p>A {@code null} or empty query is replaced by {@code []} (every token)
 * before the corpus is scanned. When the scan fails, the index is left with
 * an empty list of lines.
 *
 * @param corpus the corpus the index is computed on
 * @param query the query selecting the tokens to count
 * @param props the properties whose values make up each line
 * @throws CqiClientException if the CQP client fails
 * @throws CqiServerError if the CQP server reports an error
 * @throws IOException if the communication with CQP fails
 */
public Index(Corpus corpus, Query query, List<Property> props) throws CqiClientException, IOException, CqiServerError {
    partnames.add(corpus.getName());
    this.corpus = corpus; // the corpus which we compute on

    // The default must be chosen BEFORE storing the query in the field, and the
    // emptiness test must compare contents: `== ""` only compares references.
    if (query == null || query.getQueryString() == null || query.getQueryString().length() == 0) {
        query = new Query("[]"); //$NON-NLS-1$
    }
    this.query = query; // the query of the selection
    this.props = new ArrayList<Property>(props);

    if (!scanCorpus(corpus)) {
        lines = new ArrayList<Line>();
        return;
    }

    getAllLines(); // resolves the string values of every line
    setLineCounts(); // copies the counts gathered by scanCorpus onto the lines
    altered = false;
}
|
|
233 |
|
|
234 |
/**
 * Computes an index with one frequency column per part of a partition.
 *
 * <p>A {@code null} or empty query is replaced by {@code []} (every token).
 * Each part is scanned in turn; the result of an individual scan is not
 * checked, so a part whose scan fails simply contributes no counts.
 *
 * @param partition the partition whose parts are scanned
 * @param query the query selecting the tokens to count
 * @param props the properties whose values make up each line
 * @throws CqiClientException if the CQP client fails
 * @throws CqiServerError if the CQP server reports an error
 * @throws IOException if the communication with CQP fails
 */
public Index(Partition partition, Query query, List<Property> props) throws CqiClientException, IOException, CqiServerError {
    this.partition = partition;
    this.corpus = partition.getCorpus(); // the corpus which we compute on

    // The default must be chosen BEFORE storing the query in the field, and the
    // emptiness test must compare contents: `== ""` only compares references.
    if (query == null || query.getQueryString() == null || query.getQueryString().length() == 0) {
        query = new Query("[]"); //$NON-NLS-1$
    }
    this.query = query; // the query of the selection
    this.props = new ArrayList<Property>(props);

    for (Part part : partition.getParts()) {
        scanCorpus(part); // counts go into slot `currentpartid`
        currentpartid++;
        partnames.add(part.getName());
    }

    setLineCounts();
    getAllLines();
    altered = false;
}
|
|
264 |
|
|
265 |
/**
|
|
266 |
* tell if the index has been computed with a partition or not.
|
|
267 |
*
|
|
268 |
* @return true, if is computed with partition
|
|
269 |
*/
|
|
270 |
public boolean isComputedWithPartition() {
|
|
271 |
return partition != null;
|
|
272 |
}
|
|
273 |
|
|
274 |
/**
|
|
275 |
* update the lines counts.
|
|
276 |
*/
|
|
277 |
protected void setLineCounts() {
|
|
278 |
for (Line line : lines)// for each Line set its count
|
|
279 |
{
|
|
280 |
int[] c = new int[partnames.size()];
|
|
281 |
for (int i = 0; i < partnames.size(); i++)
|
|
282 |
if (counts.get(line.getSignature()).size() <= i)
|
|
283 |
c[i] = 0;
|
|
284 |
else
|
|
285 |
c[i] = counts.get(line.getSignature()).get(i);
|
|
286 |
line.setCounts(c, -1);
|
|
287 |
}
|
|
288 |
}
|
|
289 |
|
|
290 |
/**
|
|
291 |
* count tokens.
|
|
292 |
*
|
|
293 |
* @param corp the corp
|
|
294 |
* @return true, if successful
|
|
295 |
* @throws CqiClientException
|
|
296 |
* @throws CqiServerError
|
|
297 |
* @throws IOException
|
|
298 |
*/
|
|
299 |
protected boolean scanCorpus(Corpus corp) throws CqiClientException, IOException, CqiServerError {
|
|
300 |
// get the cqp result of the query
|
|
301 |
|
|
302 |
long time = System.currentTimeMillis();
|
|
303 |
QueryResult result = corp.query(query, "index", true); //$NON-NLS-1$
|
|
304 |
boolean isTargetUsed = Toolbox.getCqiClient().subCorpusHasField(result.getQualifiedCqpId(), ICqiClient.CQI_CONST_FIELD_TARGET);
|
|
305 |
int nbresults = result.getNMatch();
|
|
306 |
this.nLines += nbresults; // get number of tokens
|
|
307 |
|
|
308 |
// System.out.println("nLines : "+nLines);
|
|
309 |
List<Match> matches = null;
|
|
310 |
if (nbresults > 0)
|
|
311 |
matches = result.getMatches(0, nbresults - 1); // get the
|
|
312 |
// indexes
|
|
313 |
// sequences of
|
|
314 |
// result's
|
|
315 |
// tokens
|
|
316 |
else
|
|
317 |
matches = new ArrayList<Match>();
|
|
318 |
// List<Integer> beginingOfKeywordsPositions = new
|
|
319 |
// ArrayList<Integer>();
|
|
320 |
// List<Integer> lengthOfKeywords = new ArrayList<Integer>();
|
|
321 |
// Map<Property, List<List<String>>> keywordsViewPropValues = new
|
|
322 |
// HashMap<Property, List<List<String>>>();
|
|
323 |
|
|
324 |
// count matches
|
|
325 |
// time = System.currentTimeMillis();
|
|
326 |
List<Integer> allpositions = new ArrayList<Integer>();
|
|
327 |
for (int j = 0; j < nbresults; j++) {
|
|
328 |
Match match = matches.get(j);
|
|
329 |
// beginingOfKeywordsPositions.add(match.getStart()); // get the
|
|
330 |
// first index
|
|
331 |
// lengthOfKeywords.add(match.size());// get the last index
|
|
332 |
if (isTargetUsed) {
|
|
333 |
allpositions.add(match.getTarget());
|
|
334 |
} else {
|
|
335 |
for (int i = match.getStart(); i <= match.getEnd(); i++)
|
|
336 |
allpositions.add(i);
|
|
337 |
}
|
|
338 |
}
|
|
339 |
// System.out.println("get string data");
|
|
340 |
// time = System.currentTimeMillis();
|
|
341 |
// for (Property property : props) {// for each property get the
|
|
342 |
// string values of the tokens
|
|
343 |
// keywordsViewPropValues.put(property,
|
|
344 |
// cache.get(property).getData(beginingOfKeywordsPositions,
|
|
345 |
// lengthOfKeywords));
|
|
346 |
// }
|
|
347 |
// System.out.println("took "+(System.currentTimeMillis()-time));
|
|
348 |
|
|
349 |
// System.out.println("get count data");
|
|
350 |
|
|
351 |
int[] allpositionsarray = new int[allpositions.size()];
|
|
352 |
int pcount = 0;
|
|
353 |
for (int p : allpositions)
|
|
354 |
allpositionsarray[pcount++] = p;
|
|
355 |
|
|
356 |
// time = System.currentTimeMillis();
|
|
357 |
HashMap<Property, int[]> propsId = new HashMap<Property, int[]>();
|
|
358 |
for (Property property : props) {
|
|
359 |
try {
|
|
360 |
if (property instanceof StructuralUnitProperty) {
|
|
361 |
int[] structs = CorpusManager.getCorpusManager()
|
|
362 |
.getCqiClient().cpos2Struc(
|
|
363 |
property.getQualifiedName(),
|
|
364 |
allpositionsarray);
|
|
365 |
propsId.put(property, structs);
|
|
366 |
} else {
|
|
367 |
int[] indices = CorpusManager.getCorpusManager()
|
|
368 |
.getCqiClient().cpos2Id(
|
|
369 |
property.getQualifiedName(),
|
|
370 |
allpositionsarray);
|
|
371 |
propsId.put(property, indices);
|
|
372 |
}
|
|
373 |
} catch (Exception e) {
|
|
374 |
org.txm.utils.logger.Log.printStackTrace(e);
|
|
375 |
result.drop();
|
|
376 |
return false;
|
|
377 |
}
|
|
378 |
}
|
|
379 |
// System.out.println("Time recup indices "+(System.currentTimeMillis()-time));
|
|
380 |
int currentIndex = 0;
|
|
381 |
// time = System.currentTimeMillis();
|
|
382 |
for (int i = 0; i < nbresults; i++) {
|
|
383 |
Line line = new Line();
|
|
384 |
Match match = matches.get(i);
|
|
385 |
int size = match.size();
|
|
386 |
if (isTargetUsed) size = 1;
|
|
387 |
for (int p = 0; p < props.size(); p++) {
|
|
388 |
Property property = props.get(p);
|
|
389 |
int[] allprosids = propsId.get(property);
|
|
390 |
int[] ids = new int[size];
|
|
391 |
System.arraycopy(allprosids, currentIndex, ids, 0, size);
|
|
392 |
line.putIds(property, ids);
|
|
393 |
}
|
|
394 |
currentIndex += size;
|
|
395 |
|
|
396 |
String signature = line.getSignature();
|
|
397 |
|
|
398 |
if (counts.containsKey(signature)) // if the counts contains the
|
|
399 |
// signature, increment its
|
|
400 |
// corresponding value
|
|
401 |
{
|
|
402 |
while (counts.get(signature).size() <= currentpartid)
|
|
403 |
counts.get(signature).add(0);
|
|
404 |
int c = counts.get(signature).get(currentpartid) + 1;
|
|
405 |
counts.get(signature).set(currentpartid, c);
|
|
406 |
} else // else initialize count of the signature to 1
|
|
407 |
{
|
|
408 |
// System.out.println("add new sign "+signature+" of line "+line.toString());
|
|
409 |
ArrayList<Integer> tmp = new ArrayList<Integer>();
|
|
410 |
for (int j = 0; j < currentpartid + 1; j++)
|
|
411 |
tmp.add(0);
|
|
412 |
counts.put(signature, tmp);
|
|
413 |
counts.get(signature).set(currentpartid, 1);
|
|
414 |
|
|
415 |
lines.add(line);
|
|
416 |
}
|
|
417 |
}
|
|
418 |
result.drop();
|
|
419 |
// System.out.println("Time count lines "+(System.currentTimeMillis()-time));
|
|
420 |
// System.out.println("took "+(System.currentTimeMillis()-time));
|
|
421 |
return true;
|
|
422 |
|
|
423 |
}
|
|
424 |
|
|
425 |
/**
|
|
426 |
* return the lines from le "start"th one to the "end"th one.
|
|
427 |
*
|
|
428 |
* @param start the start
|
|
429 |
* @param end the end
|
|
430 |
* @return the lines
|
|
431 |
*/
|
|
432 |
public List<Line> getLines(int start, int end) {
|
|
433 |
long time = System.currentTimeMillis();
|
|
434 |
List<Line> selectedLines = new ArrayList<Line>();
|
|
435 |
if (lines.size() > 0) {
|
|
436 |
start = Math.max(0, start);
|
|
437 |
end = Math.min(end, lines.size());
|
|
438 |
selectedLines = lines.subList(start, end);
|
|
439 |
|
|
440 |
int p = 0;
|
|
441 |
for (Property property : props) {// for each property get the string values
|
|
442 |
// values of the tokens
|
|
443 |
|
|
444 |
int len = 0;
|
|
445 |
for (Line l : selectedLines) {
|
|
446 |
len += l.UnitsIds.get(p).length;
|
|
447 |
}
|
|
448 |
|
|
449 |
int[] indices = new int[len]; // build the array of indices
|
|
450 |
len = 0;
|
|
451 |
for (Line l : selectedLines) {
|
|
452 |
int[] ids = l.UnitsIds.get(p);
|
|
453 |
System.arraycopy(ids, 0, indices, len, ids.length);
|
|
454 |
len += ids.length;
|
|
455 |
}
|
|
456 |
String[] strs = null;
|
|
457 |
try {
|
|
458 |
if (property instanceof StructuralUnitProperty) {
|
|
459 |
strs = CorpusManager.getCorpusManager().getCqiClient()
|
|
460 |
.struc2Str(property.getQualifiedName(), indices);
|
|
461 |
} else {
|
|
462 |
strs = CorpusManager.getCorpusManager().getCqiClient()
|
|
463 |
.id2Str(property.getQualifiedName(), indices);
|
|
464 |
}
|
|
465 |
} catch (Exception e) {
|
|
466 |
org.txm.utils.logger.Log.printStackTrace(e);
|
|
467 |
return null;
|
|
468 |
}
|
|
469 |
len = 0;
|
|
470 |
for (Line l : selectedLines) {
|
|
471 |
int[] ids = l.UnitsIds.get(p);
|
|
472 |
String[] lstr = new String[ids.length];
|
|
473 |
System.arraycopy(strs, len, lstr, 0, ids.length);
|
|
474 |
if (l.UnitsProperty.size() == props.size()) continue; // the line is already initialized
|
|
475 |
l.put(property, Arrays.asList(lstr));
|
|
476 |
len += ids.length;
|
|
477 |
}
|
|
478 |
p++;
|
|
479 |
}
|
|
480 |
}
|
|
481 |
// System.out.println("Time get lines "+(System.currentTimeMillis()-time));
|
|
482 |
return selectedLines;
|
|
483 |
}
|
|
484 |
|
|
485 |
/**
|
|
486 |
* return all the lines of the index.
|
|
487 |
*
|
|
488 |
* @return the all lines
|
|
489 |
*/
|
|
490 |
public List<Line> getAllLines() {
|
|
491 |
return getLines(0, lines.size());
|
|
492 |
}
|
|
493 |
|
|
494 |
/**
|
|
495 |
* return the cqp expression build with the lines between "from" and "to"
|
|
496 |
* TODO: finish implementation.
|
|
497 |
*
|
|
498 |
* @param from the from
|
|
499 |
* @param to the to
|
|
500 |
* @return the query
|
|
501 |
*/
|
|
502 |
public Query buildQuery(int from, int to) {
|
|
503 |
return new Query(""); //$NON-NLS-1$
|
|
504 |
}
|
|
505 |
|
|
506 |
/**
|
|
507 |
* Gets the query.
|
|
508 |
*
|
|
509 |
* @return the query used
|
|
510 |
*/
|
|
511 |
public Query getQuery() {
|
|
512 |
return query;
|
|
513 |
}
|
|
514 |
|
|
515 |
/**
|
|
516 |
* Gets the v.
|
|
517 |
*
|
|
518 |
* @return the number of entrie in the index
|
|
519 |
*/
|
|
520 |
public int getV() {
|
|
521 |
return lines.size();
|
|
522 |
}
|
|
523 |
|
|
524 |
/**
|
|
525 |
* Gets the t.
|
|
526 |
*
|
|
527 |
* @return the number of tokens returned by the selection
|
|
528 |
*/
|
|
529 |
public int getT() {
|
|
530 |
return nLines;
|
|
531 |
}
|
|
532 |
|
|
533 |
/**
|
|
534 |
* Sort lines.
|
|
535 |
*
|
|
536 |
* @param mode the mode
|
|
537 |
* @param reverse the reverse
|
|
538 |
*/
|
|
539 |
public void sortLines(LineComparator.SortMode mode, boolean reverse) {
|
|
540 |
this.altered = true;
|
|
541 |
LineComparator lc = new LineComparator(mode, reverse);
|
|
542 |
lc.initialize(corpus);
|
|
543 |
Collections.sort(lines, lc);
|
|
544 |
}
|
|
545 |
|
|
546 |
/**
|
|
547 |
* remove line which frenquency is not in the inverval [Fmin,Fmax].
|
|
548 |
*
|
|
549 |
* @param Fmin the fmin
|
|
550 |
* @param Fmax the fmax
|
|
551 |
*/
|
|
552 |
public void filterLines(int Fmin, int Fmax) {
|
|
553 |
int before = lines.size();
|
|
554 |
for (int i = 0; i < lines.size(); i++)// for each line
|
|
555 |
{
|
|
556 |
Line line = lines.get(i);
|
|
557 |
int f = line.getFrequency();
|
|
558 |
if (f < Fmin || f > Fmax) // if its frequency is not in the
|
|
559 |
// interval, remove it
|
|
560 |
{
|
|
561 |
nLines -= line.getFrequency();
|
|
562 |
lines.remove(i);
|
|
563 |
i--;
|
|
564 |
}
|
|
565 |
}
|
|
566 |
this.FilterFmin = Fmin;
|
|
567 |
this.FilterFmax = Fmax;
|
|
568 |
this.updateFminFmax();
|
|
569 |
|
|
570 |
altered = true;
|
|
571 |
}
|
|
572 |
|
|
573 |
/**
|
|
574 |
* keep the vmax lines more frequents.
|
|
575 |
*
|
|
576 |
* @param vmax the vmax
|
|
577 |
*/
|
|
578 |
public void cut(int vmax) {
|
|
579 |
this.acquireSemaphore();
|
|
580 |
// assume the lines are sorted
|
|
581 |
int before = lines.size();
|
|
582 |
this.lines = this.lines.subList(0, Math.min(lines.size(), vmax));
|
|
583 |
this.FilterVmax = vmax;
|
|
584 |
updateFminFmax();
|
|
585 |
|
|
586 |
altered = true;
|
|
587 |
this.releaseSemaphore();
|
|
588 |
}
|
|
589 |
|
|
590 |
/**
|
|
591 |
* checks all lines to update Fmin and Fmax.
|
|
592 |
*/
|
|
593 |
private void updateFminFmax() {
|
|
594 |
Fmin = 9999;
|
|
595 |
Fmax = 1;
|
|
596 |
for (int i = 0; i < lines.size(); i++) {
|
|
597 |
Line line = lines.get(i);
|
|
598 |
int f = line.getFrequency();
|
|
599 |
if (f < Fmin)
|
|
600 |
Fmin = f;
|
|
601 |
if (f > Fmax)
|
|
602 |
Fmax = f;
|
|
603 |
}
|
|
604 |
}
|
|
605 |
|
|
606 |
/**
|
|
607 |
* Write all the lines on a writer.
|
|
608 |
*
|
|
609 |
* @param outfile the outfile
|
|
610 |
* @param encoding the encoding
|
|
611 |
* @param colseparator the colseparator
|
|
612 |
* @param txtseparator the txtseparator
|
|
613 |
* @return true, if successful
|
|
614 |
*/
|
|
615 |
public boolean toTxt(File outfile, String encoding, String colseparator, String txtseparator) {
|
|
616 |
try {
|
|
617 |
toTxt(outfile, 0, lines.size(), encoding, colseparator, txtseparator);
|
|
618 |
} catch (Exception e) {
|
|
619 |
System.err.println(Messages.Index_7 + Log.toString(e));
|
|
620 |
return false;
|
|
621 |
}
|
|
622 |
return true;
|
|
623 |
}
|
|
624 |
|
|
625 |
/**
|
|
626 |
* Write the lines between from and to on a writer.
|
|
627 |
*
|
|
628 |
* @param outfile the outfile
|
|
629 |
* @param from The first line to be written
|
|
630 |
* @param to The last line to be writen
|
|
631 |
* @param encoding the encoding
|
|
632 |
* @param colseparator the colseparator
|
|
633 |
* @param txtseparator the txtseparator
|
|
634 |
* @throws CqiClientException the cqi client exception
|
|
635 |
* @throws IOException Signals that an I/O exception has occurred.
|
|
636 |
*/
|
|
637 |
public void toTxt(File outfile, int from, int to, String encoding, String colseparator, String txtseparator)
|
|
638 |
throws CqiClientException, IOException {
|
|
639 |
// NK: writer declared as class attribute to perform a clean if the operation is interrupted
|
|
640 |
this.writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile),
|
|
641 |
encoding));
|
|
642 |
// if ("UTF-8".equals(encoding)) writer.write('\ufeff'); // UTF-8 BOM
|
|
643 |
String header = ""; //$NON-NLS-1$
|
|
644 |
for (Property p : props)
|
|
645 |
header += (p + propSeparator);
|
|
646 |
header = txtseparator+ header.substring(0, header.length() - 1) +txtseparator;
|
|
647 |
header += colseparator+ txtseparator+ "F" + txtseparator; //$NON-NLS-1$
|
|
648 |
if (partnames.size() > 1)
|
|
649 |
for (int j = 0; j < partnames.size(); j++)
|
|
650 |
header += colseparator + txtseparator+ partnames.get(j).replace(txtseparator, txtseparator+txtseparator)+txtseparator;
|
|
651 |
header += "\n"; //$NON-NLS-1$
|
|
652 |
writer.write(header);
|
|
653 |
|
|
654 |
// for(Line ligne: lines)
|
|
655 |
for (int i = from; i < to; i++) {
|
|
656 |
Line ligne = lines.get(i);
|
|
657 |
writer.write(txtseparator+ ligne.toString().replace(txtseparator, txtseparator+txtseparator)+ txtseparator + colseparator + ligne.getFrequency());
|
|
658 |
if (partnames.size() > 1)
|
|
659 |
for (int j = 0; j < partnames.size(); j++)
|
|
660 |
writer.write(colseparator + ligne.getFrequency(j));
|
|
661 |
writer.write("\n"); //$NON-NLS-1$
|
|
662 |
}
|
|
663 |
writer.flush();
|
|
664 |
writer.close();
|
|
665 |
}
|
|
666 |
|
|
667 |
|
|
668 |
/**
|
|
669 |
* Write the lines between from and to on a writer.
|
|
670 |
*
|
|
671 |
* @param outfile the outfile
|
|
672 |
* @param from The first line to be written
|
|
673 |
* @param to The last line to be writen
|
|
674 |
* @param encoding the encoding
|
|
675 |
* @param colseparator the colseparator
|
|
676 |
* @param txtseparator the txtseparator
|
|
677 |
* @throws CqiClientException the cqi client exception
|
|
678 |
* @throws IOException Signals that an I/O exception has occurred.
|
|
679 |
*/
|
|
680 |
public void toTSVDictionnary(File outfile, String colseparator, String encoding) throws CqiClientException, IOException {
|
|
681 |
|
|
682 |
//String colseparator = "\t";
|
|
683 |
|
|
684 |
this.writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), encoding));
|
|
685 |
|
|
686 |
String header = ""; //$NON-NLS-1$
|
|
687 |
for (Property p : props) {
|
|
688 |
header += (p + colseparator);
|
|
689 |
}
|
|
690 |
|
|
691 |
//header = header.substring(0, header.length() - 1);
|
|
692 |
header += "F"; //$NON-NLS-1$
|
|
693 |
if (partnames.size() > 1) {
|
|
694 |
for (int j = 0; j < partnames.size(); j++) {
|
|
695 |
header += colseparator + partnames.get(j);
|
|
696 |
}
|
|
697 |
}
|
|
698 |
header += "\n"; //$NON-NLS-1$
|
|
699 |
writer.write(header);
|
|
700 |
|
|
701 |
for (Line ligne : lines) {
|
|
702 |
|
|
703 |
for (int i = 0 ; i < props.size() ; i++) {
|
|
704 |
writer.write(StringUtils.join(ligne.getUnitsProperties().get(i), " ") + colseparator);
|
|
705 |
}
|
|
706 |
|
|
707 |
writer.write(Integer.toString(ligne.getFrequency()));
|
|
708 |
|
|
709 |
if (partnames.size() > 1) {
|
|
710 |
for (int j = 0; j < partnames.size(); j++) {
|
|
711 |
writer.write(colseparator + ligne.getFrequency(j));
|
|
712 |
}
|
|
713 |
}
|
|
714 |
writer.write("\n"); //$NON-NLS-1$
|
|
715 |
}
|
|
716 |
writer.flush();
|
|
717 |
writer.close();
|
|
718 |
}
|
|
719 |
|
|
720 |
/**
|
|
721 |
* dump the index in the console.
|
|
722 |
*
|
|
723 |
* @throws CqiClientException the cqi client exception
|
|
724 |
* @throws IOException Signals that an I/O exception has occurred.
|
|
725 |
*/
|
|
726 |
public void toConsole() throws CqiClientException, IOException {
|
|
727 |
System.out.println(Messages.Index_1 + (lines.size()));
|
|
728 |
toConsole(0, lines.size());
|
|
729 |
}
|
|
730 |
|
|
731 |
/**
|
|
732 |
* dump a part of the index.
|
|
733 |
*
|
|
734 |
* @param from the from
|
|
735 |
* @param to the to
|
|
736 |
* @throws CqiClientException the cqi client exception
|
|
737 |
* @throws IOException Signals that an I/O exception has occurred.
|
|
738 |
*/
|
|
739 |
public void toConsole(int from, int to) throws CqiClientException,
|
|
740 |
IOException {
|
|
741 |
|
|
742 |
String header = ""; //$NON-NLS-1$
|
|
743 |
for (Property p : props)
|
|
744 |
header += (p + propSeparator);
|
|
745 |
header = header.substring(0, header.length() - 1);
|
|
746 |
header += "\tF"; //$NON-NLS-1$
|
|
747 |
if (partnames.size() > 1)
|
|
748 |
for (int j = 0; j < partnames.size(); j++)
|
|
749 |
header += "\t" + partnames.get(j); //$NON-NLS-1$
|
|
750 |
|
|
751 |
System.out.println(header);
|
|
752 |
for (int i = from; i < to; i++) {
|
|
753 |
Line ligne = lines.get(i);
|
|
754 |
System.out.print(ligne + "\t" + ligne.getFrequency()); //$NON-NLS-1$
|
|
755 |
if (partnames.size() > 1)
|
|
756 |
for (int j = 0; j < partnames.size(); j++)
|
|
757 |
System.out.print("\t" + ligne.getFrequency(j)); //$NON-NLS-1$
|
|
758 |
System.out.print("\n"); //$NON-NLS-1$
|
|
759 |
}
|
|
760 |
System.out.flush();
|
|
761 |
}
|
|
762 |
|
|
763 |
/**
|
|
764 |
* Gets the name.
|
|
765 |
*
|
|
766 |
* @return the name of this index
|
|
767 |
*/
|
|
768 |
public String getName() {
|
|
769 |
String name = query.getQueryString() + ":"; //$NON-NLS-1$
|
|
770 |
for (Property s : props)
|
|
771 |
name += s + propSeparator;
|
|
772 |
if (props.size() > 0)
|
|
773 |
name = name.substring(0, name.length() - 1);
|
|
774 |
return name;
|
|
775 |
}
|
|
776 |
|
|
777 |
/**
|
|
778 |
* Gets the corpus.
|
|
779 |
*
|
|
780 |
* @return the corpus
|
|
781 |
*/
|
|
782 |
public Corpus getCorpus() {
|
|
783 |
return this.corpus;
|
|
784 |
}
|
|
785 |
|
|
786 |
/**
|
|
787 |
* Gets the properties.
|
|
788 |
*
|
|
789 |
* @return the properties
|
|
790 |
*/
|
|
791 |
public List<Property> getProperties() {
|
|
792 |
return this.props;
|
|
793 |
}
|
|
794 |
|
|
795 |
/**
|
|
796 |
* Gets the fmax.
|
|
797 |
*
|
|
798 |
* @return the fmax
|
|
799 |
*/
|
|
800 |
public int getFmax() {
|
|
801 |
return Fmax;
|
|
802 |
}
|
|
803 |
|
|
804 |
/**
|
|
805 |
* Gets the fmin.
|
|
806 |
*
|
|
807 |
* @return the fmin
|
|
808 |
*/
|
|
809 |
public int getFmin() {
|
|
810 |
return Fmin;
|
|
811 |
}
|
|
812 |
|
|
813 |
/**
|
|
814 |
* Gets the filter fmin.
|
|
815 |
*
|
|
816 |
* @return the filter fmin
|
|
817 |
*/
|
|
818 |
public int getFilterFmin() {
|
|
819 |
return FilterFmin;
|
|
820 |
}
|
|
821 |
|
|
822 |
/**
|
|
823 |
* Gets the filter fmax.
|
|
824 |
*
|
|
825 |
* @return the filter fmax
|
|
826 |
*/
|
|
827 |
public int getFilterFmax() {
|
|
828 |
return FilterFmax;
|
|
829 |
}
|
|
830 |
|
|
831 |
/**
|
|
832 |
* Gets the filter vmax.
|
|
833 |
*
|
|
834 |
* @return the filter vmax
|
|
835 |
*/
|
|
836 |
public int getFilterVmax() {
|
|
837 |
return FilterVmax;
|
|
838 |
}
|
|
839 |
|
|
840 |
/**
|
|
841 |
* Equals.
|
|
842 |
*
|
|
843 |
* @param voc the voc
|
|
844 |
* @return true, if successful
|
|
845 |
*/
|
|
846 |
public boolean equals(Index voc) {
|
|
847 |
return this.query.equals(voc.getQuery())
|
|
848 |
&& this.props.equals(voc.getProperties())
|
|
849 |
&& this.Fmin == voc.getFmin() && this.Fmax == voc.getFmax();
|
|
850 |
}
|
|
851 |
|
|
852 |
/**
|
|
853 |
* Gets the partnames.
|
|
854 |
*
|
|
855 |
* @return the partnames
|
|
856 |
*/
|
|
857 |
public List<String> getPartnames() {
|
|
858 |
return partnames;
|
|
859 |
}
|
|
860 |
|
|
861 |
/**
|
|
862 |
* Gets the partition.
|
|
863 |
*
|
|
864 |
* @return the partition
|
|
865 |
*/
|
|
866 |
public Partition getPartition() {
|
|
867 |
return partition;
|
|
868 |
}
|
|
869 |
|
|
870 |
/**
|
|
871 |
* Checks if is altered.
|
|
872 |
*
|
|
873 |
* @return true, if is altered
|
|
874 |
*/
|
|
875 |
public boolean isAltered() {
|
|
876 |
// TODO Auto-generated method stub
|
|
877 |
return altered;
|
|
878 |
}
|
|
879 |
|
|
880 |
/** Counter appended to {@code prefixR} to name the R symbol created by each {@code asRMatrix()} call. */
|
|
881 |
protected static int novoc = 1;
|
|
882 |
|
|
883 |
/** The prefix r. */
|
|
884 |
protected static String prefixR = "Index"; //$NON-NLS-1$
|
|
885 |
|
|
886 |
/**
|
|
887 |
* As r matrix.
|
|
888 |
*
|
|
889 |
* @return the string
|
|
890 |
* @throws RWorkspaceException the r workspace exception
|
|
891 |
*/
|
|
892 |
public String asRMatrix() throws RWorkspaceException {
|
|
893 |
symbol = prefixR+novoc;
|
|
894 |
|
|
895 |
ArrayList<String> colnames = new ArrayList<String>();
|
|
896 |
|
|
897 |
colnames.add("F"); //$NON-NLS-1$
|
|
898 |
if (partnames.size() > 1)
|
|
899 |
for (int j = 0; j < partnames.size(); j++)
|
|
900 |
colnames.add(partnames.get(j));
|
|
901 |
|
|
902 |
//System.out.println("cols: "+colnames);
|
|
903 |
|
|
904 |
String[] keywords = new String[this.lines.size()];
|
|
905 |
|
|
906 |
int[] freq = new int[this.lines.size()];
|
|
907 |
ArrayList<int[]> partfreqs = new ArrayList<int[]>(partnames.size());
|
|
908 |
if (partnames.size() > 1)
|
|
909 |
for (int j = 0; j < partnames.size(); j++)
|
|
910 |
partfreqs.add(new int[this.lines.size()]);
|
|
911 |
|
|
912 |
for (int i = 0 ; i < lines.size() ; i++)
|
|
913 |
{
|
|
914 |
Line ligne = lines.get(i);
|
|
915 |
freq[i] = ligne.getFrequency();
|
|
916 |
keywords[i] = ligne.toString();
|
|
917 |
if (partnames.size() > 1)
|
|
918 |
for (int j = 0; j < partnames.size(); j++)
|
|
919 |
{
|
|
920 |
partfreqs.get(j)[i] = ligne.getFrequency(j);
|
|
921 |
}
|
|
922 |
|
|
923 |
}
|
|
924 |
|
|
925 |
RWorkspace rw = RWorkspace.getRWorkspaceInstance();
|
|
926 |
if (partnames.size() > 1)
|
|
927 |
for (int j = 0; j < partnames.size(); j++)
|
|
928 |
rw.addVectorToWorkspace("vocpartfreqs"+j, partfreqs.get(j)); //$NON-NLS-1$
|
|
929 |
rw.addVectorToWorkspace("vocfreq", freq); //$NON-NLS-1$
|
|
930 |
rw.addVectorToWorkspace("vockeywords", keywords); //$NON-NLS-1$
|
|
931 |
rw.addVectorToWorkspace("voccolnames", colnames.toArray(new String[colnames.size()])); //$NON-NLS-1$
|
|
932 |
|
|
933 |
int ncol = 1;
|
|
934 |
if (partnames.size() > 1)
|
|
935 |
ncol += partnames.size();
|
|
936 |
|
|
937 |
int nrow = lines.size();
|
|
938 |
String partscmd = ""; //$NON-NLS-1$
|
|
939 |
if (partnames.size() > 1)
|
|
940 |
for (int j = 0; j < partnames.size(); j++)
|
|
941 |
partscmd +=", vocpartfreqs"+j; //$NON-NLS-1$
|
|
942 |
|
|
943 |
rw.eval(symbol+ "<- matrix(data = c(vocfreq"+partscmd+"), nrow = "+nrow+", ncol = "+ncol+")"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
|
|
944 |
rw.eval("colnames("+symbol+" ) <- voccolnames"); //$NON-NLS-1$ //$NON-NLS-2$
|
|
945 |
rw.eval("rownames("+symbol+" ) <- vockeywords"); //$NON-NLS-1$ //$NON-NLS-2$
|
|
946 |
rw.eval(symbol+ "<- list(data="+symbol+")"); //$NON-NLS-1$ //$NON-NLS-2$
|
|
947 |
|
|
948 |
novoc++;
|
|
949 |
return symbol;
|
|
950 |
}
|
|
951 |
|
|
952 |
/**
|
|
953 |
* Gets the symbol.
|
|
954 |
*
|
|
955 |
* @return the symbol
|
|
956 |
*/
|
|
957 |
public String getSymbol()
|
|
958 |
{
|
|
959 |
return symbol;
|
|
960 |
}
|
|
961 |
|
|
962 |
/**
|
|
963 |
* If the index has been build with corpus + property, the method returns the lexicon used.
|
|
964 |
*
|
|
965 |
* @return the lexicon
|
|
966 |
*/
|
|
967 |
public Object getLexicon() {
|
|
968 |
return lexicon;
|
|
969 |
}
|
|
970 |
|
|
971 |
/** The properties to process. */
|
|
972 |
HashSet<String> propertiesToProcess;
|
|
973 |
|
|
974 |
// /**
|
|
975 |
// * Render.
|
|
976 |
// *
|
|
977 |
// * @param line the line
|
|
978 |
// * @return the string
|
|
979 |
// */
|
|
980 |
// public String render(Line line)
|
|
981 |
// {
|
|
982 |
// if(propertiesToProcess == null)
|
|
983 |
// propertiesToProcess = TxmRenderer.getPropertyToRender(this, "jsesh"); //$NON-NLS-1$
|
|
984 |
// //System.out.println("props to process: "+propertiesToProcess);
|
|
985 |
//
|
|
986 |
// ArrayList<Boolean> torender = new ArrayList<Boolean>();
|
|
987 |
// for(Property prop : line.properties)
|
|
988 |
// if(propertiesToProcess.contains(prop.getName()))
|
|
989 |
// torender.add(true);
|
|
990 |
// else
|
|
991 |
// torender.add(false);
|
|
992 |
// //System.out.println("to render: "+torender);
|
|
993 |
//
|
|
994 |
// String rez = ""; //$NON-NLS-1$
|
|
995 |
// int len = 0; // get number of token per line
|
|
996 |
// for (int i = 0; i < line.properties.size(); ) {
|
|
997 |
// len = line.UnitsProperty.get(i).size();
|
|
998 |
// break;
|
|
999 |
// }
|
|
1000 |
// // TODO: bug concat props
|
|
1001 |
// for (int i = 0; i < len; i++)// for each token
|
|
1002 |
// {
|
|
1003 |
// for (int j = 0; j < line.properties.size(); j++) {
|
|
1004 |
// if(torender.get(j))
|
|
1005 |
// {
|
|
1006 |
// //System.out.println("render mdc : "+line.UnitsProperty.get(j).get(i));
|
|
1007 |
// rez += TxmRenderer.jseshrenderer.render(line.UnitsProperty.get(j).get(i));
|
|
1008 |
// }
|
|
1009 |
// else
|
|
1010 |
// {
|
|
1011 |
// //System.out.println("tostring : "+line.UnitsProperty.get(j).get(i));
|
|
1012 |
// rez += line.UnitsProperty.get(j).get(i);
|
|
1013 |
// }
|
|
1014 |
//
|
|
1015 |
// if (j < line.properties.size() - 1) {
|
|
1016 |
// rez += rez+"_"; //$NON-NLS-1$
|
|
1017 |
// }
|
|
1018 |
// }
|
|
1019 |
// if (i < len - 1)
|
|
1020 |
// rez += " "; //$NON-NLS-1$
|
|
1021 |
// }
|
|
1022 |
// return rez;
|
|
1023 |
//
|
|
1024 |
// }
|
|
1025 |
|
|
1026 |
public void setIsAltered(boolean b) {
|
|
1027 |
this.altered = b;
|
|
1028 |
}
|
|
1029 |
|
|
1030 |
public HasResults getParent() {
|
|
1031 |
if (partition != null) return partition;
|
|
1032 |
return corpus;
|
|
1033 |
}
|
|
1034 |
|
|
1035 |
@Override
|
|
1036 |
public void clean() {
|
|
1037 |
try {
|
|
1038 |
this.writer.flush();
|
|
1039 |
this.writer.close();
|
|
1040 |
} catch (IOException e) {
|
|
1041 |
// TODO Auto-generated catch block
|
|
1042 |
org.txm.utils.logger.Log.printStackTrace(e);
|
|
1043 |
}
|
|
1044 |
}
|
|
1045 |
|
|
1046 |
@Override
|
|
1047 |
public boolean delete() {
|
|
1048 |
return getParent().removeResult(this);
|
|
1049 |
}
|
|
1050 |
|
|
1051 |
ArrayList<Object> results = new ArrayList<Object>();
|
|
1052 |
@Override
|
|
1053 |
public List<Object> getResults() {
|
|
1054 |
return results;
|
|
1055 |
}
|
|
1056 |
|
|
1057 |
@Override
|
|
1058 |
public boolean removeResult(Object result) {
|
|
1059 |
return results.remove(result);
|
|
1060 |
}
|
|
1061 |
|
|
1062 |
@Override
|
|
1063 |
public List<HasResults> getSubHasResults() {
|
|
1064 |
return new ArrayList<HasResults>();
|
|
1065 |
}
|
|
1066 |
|
|
1067 |
@Override
|
|
1068 |
public void storeResult(Object result) {
|
|
1069 |
results.add(result);
|
|
1070 |
}
|
|
1071 |
}
|
0 |
1072 |
|