Révision 2924
tmp/org.txm.searchengine.cqp.core/src/org/txm/searchengine/cqp/corpus/MainCorpus.java (revision 2924) | ||
---|---|---|
129 | 129 |
@Override |
130 | 130 |
protected boolean __compute(TXMProgressMonitor monitor) throws Exception { |
131 | 131 |
|
132 |
|
|
133 |
|
|
134 | 132 |
if (!CQPSearchEngine.isInitialized()) { |
135 | 133 |
Log.warning("** TXM can't load MainCorpus when CQP search engine is not ready."); |
136 | 134 |
return false; |
... | ... | |
167 | 165 |
PatchCwbRegistry.patch(this.registryFile, this.dataDirectory); |
168 | 166 |
} |
169 | 167 |
catch (IOException e) { |
170 |
Log.severe(TXMCoreMessages.bind("Error while updating the {0} registry file .", this.registryFile));
|
|
168 |
Log.severe(TXMCoreMessages.bind("Error while updating the {0} registry file.", this.registryFile)); |
|
171 | 169 |
Log.printStackTrace(e); |
172 | 170 |
return false; |
173 | 171 |
} |
174 | 172 |
|
173 |
Log.fine(NLS.bind("Call CQI: load_a_system_corpus with {0} and {1}", this.registryFile.getParent(), this.pID)); |
|
175 | 174 |
CQPSearchEngine.getCqiClient().load_a_system_corpus(this.registryFile.getParent(), this.pID); |
176 | 175 |
|
176 |
List tmp = Arrays.asList(CQPSearchEngine.getCqiClient().listCorpora()); |
|
177 |
if (tmp.contains(this.pID)) { |
|
178 |
Log.fine("Corpus registered: " + pID); |
|
179 |
Log.fine(NLS.bind("Call CQI: corpusProperties with {0}.", this.pID)); |
|
180 |
try { |
|
181 |
String[] props = CQPSearchEngine.getCqiClient().corpusProperties(this.pID); |
|
182 |
Log.fine(NLS.bind("Corpus {0} loaded with properties: {1}.", pID, Arrays.asList(props))); |
|
183 |
} |
|
184 |
catch (Exception e) { |
|
185 |
Log.warning(TXMCoreMessages.bind("Error while loading the {0} corpus: ", pID, e.getMessage())); |
|
186 |
return false; |
|
187 |
} |
|
188 |
} |
|
189 |
else { |
|
190 |
Log.severe(TXMCoreMessages.bind("Error while loading the {0} corpus. Not found in {1}", this.pID, tmp)); |
|
191 |
return false; |
|
192 |
} |
|
193 |
|
|
177 | 194 |
corpora.put(this.pID, this); // register the corpus |
178 | 195 |
} |
179 | 196 |
catch (Exception e) { |
... | ... | |
263 | 280 |
public void clean() { |
264 | 281 |
super.clean(); |
265 | 282 |
|
266 |
if (CorpusManager.getCorpusManager().getCorpora().get(this.pID) == this) { // un register the MainCorpus
|
|
283 |
if (CorpusManager.getCorpusManager().getCorpora().get(this.pID) == this) { // unregister the MainCorpus |
|
267 | 284 |
CorpusManager.getCorpusManager().getCorpora().remove(this.pID); |
268 | 285 |
} |
269 | 286 |
|
... | ... | |
274 | 291 |
} |
275 | 292 |
catch (Exception e) { |
276 | 293 |
Log.fine(e.getLocalizedMessage()); |
277 |
//Log.printStackTrace(e); |
|
294 |
// Log.printStackTrace(e);
|
|
278 | 295 |
} |
279 | 296 |
|
280 | 297 |
if (dataDirectory != null) { |
... | ... | |
381 | 398 |
*/ |
382 | 399 |
@Override |
383 | 400 |
public List<WordProperty> getProperties() throws CqiClientException { |
384 |
if (this.lexicalUnitsProperties != null) |
|
401 |
if (this.lexicalUnitsProperties != null) {
|
|
385 | 402 |
return this.lexicalUnitsProperties; |
403 |
} |
|
386 | 404 |
|
387 | 405 |
String[] propertiesName; |
388 |
CorpusManager cm = null; |
|
389 |
AbstractCqiClient cc = null; |
|
390 | 406 |
try { |
391 |
cm = CorpusManager.getCorpusManager(); |
|
392 |
cc = cm.getCqiClient(); |
|
393 |
|
|
394 |
// if (cc != null) |
|
395 |
// System.out.println("cqiclient OK"); |
|
396 |
|
|
407 |
propertiesName = CQPSearchEngine.getCqiClient().corpusPositionalAttributes(this.pID); |
|
397 | 408 |
} |
398 | 409 |
catch (Exception e) { |
399 | 410 |
throw new CqiClientException(e); |
400 | 411 |
} |
401 |
try { |
|
402 |
// System.out.println(this.pID); // temp : toLowerCAse |
|
403 |
propertiesName = cc.corpusPositionalAttributes(this.pID); |
|
412 |
|
|
413 |
List<WordProperty> properties = new ArrayList<>(propertiesName.length); |
|
414 |
for (int i = 0; i < propertiesName.length; i++) { |
|
415 |
properties.add(new WordProperty(propertiesName[i], this)); |
|
404 | 416 |
} |
405 |
catch (Exception e) { |
|
406 |
throw new CqiClientException(e); |
|
407 |
} |
|
408 |
List<WordProperty> properties = new ArrayList<>( |
|
409 |
propertiesName.length); |
|
410 |
for (int i = 0; i < propertiesName.length; i++) |
|
411 |
properties.add(new WordProperty(propertiesName[i], this)); |
|
412 | 417 |
this.lexicalUnitsProperties = properties; |
413 | 418 |
return properties; |
414 | 419 |
} |
tmp/org.txm.searchengine.cqp.core/src/org/txm/searchengine/cqp/AbstractCqiClient.java (revision 2924) | ||
---|---|---|
56 | 56 |
* |
57 | 57 |
* @author mdecorde |
58 | 58 |
*/ |
59 |
public abstract class AbstractCqiClient implements ICqiClient{ |
|
60 |
|
|
59 |
public abstract class AbstractCqiClient implements ICqiClient {
|
|
60 |
|
|
61 | 61 |
static Pattern pattern = Pattern.compile("\\p{Upper}(\\p{Upper}|\\p{Digit}|[_-])*"); //$NON-NLS-1$ |
62 |
|
|
62 | 63 |
static Pattern pattern2 = Pattern.compile("\\p{Upper}(\\p{Lower}|\\p{Digit}|[_-])*"); //$NON-NLS-1$ |
63 |
|
|
64 |
|
|
64 | 65 |
String lastError; |
65 |
|
|
66 | 66 |
|
67 |
|
|
67 | 68 |
/** |
68 | 69 |
* Check whether <code>id</code> is a valid CQi for a corpus. |
69 | 70 |
* |
... | ... | |
71 | 72 |
* @return true if <code>id</code> is in uppercase characters |
72 | 73 |
*/ |
73 | 74 |
public static synchronized boolean checkCorpusId(String id) { |
74 |
//System.out.println("Pattern: "+pattern+" test with "+id); |
|
75 |
// System.out.println("Pattern: "+pattern+" test with "+id);
|
|
75 | 76 |
return pattern.matcher(id).matches(); |
76 | 77 |
} |
77 |
|
|
78 |
|
|
78 | 79 |
/** |
79 | 80 |
* Check whether <code>id</code> is a valid CQi for a subcorpus. |
80 | 81 |
* |
81 | 82 |
* @param id the id |
82 | 83 |
* @return true if <code>id</code> is an uppercase character followed by |
83 |
* lowercase characters |
|
84 |
* lowercase characters
|
|
84 | 85 |
*/ |
85 | 86 |
public static synchronized boolean checkSubcorpusId(String id) { |
86 |
//System.out.println("Pattern: "+pattern+" test with "+id); |
|
87 |
// System.out.println("Pattern: "+pattern+" test with "+id);
|
|
87 | 88 |
return pattern2.matcher(id).matches(); |
88 | 89 |
} |
89 | 90 |
|
90 |
public synchronized String getLastError(){
|
|
91 |
if(lastError != null && lastError.length() > 0) { |
|
91 |
public synchronized String getLastError() {
|
|
92 |
if (lastError != null && lastError.length() > 0) {
|
|
92 | 93 |
return lastError; |
93 | 94 |
} |
94 | 95 |
return CQPSearchEngineCoreMessages.noError; |
... | ... | |
101 | 102 |
|
102 | 103 |
if (!isWordProperty) { |
103 | 104 |
StructuralUnitProperty sprop = (StructuralUnitProperty) prop; |
104 |
QueryResult qresult = prop.getCorpus().query(new CQLQuery("<"+sprop.getFullName()+">[] expand to "+sprop.getName()), "TMP", false); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
|
|
105 |
QueryResult qresult = prop.getCorpus().query(new CQLQuery("<" + sprop.getFullName() + ">[] expand to " + sprop.getName()), "TMP", false); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
|
|
105 | 106 |
int[] strucs = this.cpos2Struc(qname, qresult.getStarts()); |
106 | 107 |
return Arrays.asList(cqiClient.struc2Str(qname, strucs)); |
107 |
} else { |
|
108 |
int[] positions = {0,1,2,3}; |
|
109 |
return Arrays.asList(cqiClient.cpos2Str(qname,positions)); |
|
110 | 108 |
} |
109 |
else { |
|
110 |
int[] positions = { 0, 1, 2, 3 }; |
|
111 |
return Arrays.asList(cqiClient.cpos2Str(qname, positions)); |
|
112 |
} |
|
111 | 113 |
} |
112 | 114 |
|
113 | 115 |
public List<String> getSingleData(Property prop, int[] positions) throws UnexpectedAnswerException, IOException, CqiServerError { |
... | ... | |
118 | 120 |
int[] strucs = cqiClient.cpos2Struc(qname, positions); |
119 | 121 |
return Arrays.asList(cqiClient.struc2Str(qname, strucs)); |
120 | 122 |
|
121 |
} else { |
|
122 |
return Arrays.asList(cqiClient.cpos2Str(qname,positions)); |
|
123 | 123 |
} |
124 |
else { |
|
125 |
return Arrays.asList(cqiClient.cpos2Str(qname, positions)); |
|
126 |
} |
|
124 | 127 |
} |
125 | 128 |
|
126 | 129 |
public List<List<String>> getData(Property prop, List<Integer> positions, |
127 | 130 |
List<Integer> nWords) throws CqiClientException, IOException, CqiServerError { |
128 | 131 |
|
129 |
//System.out.println("START prop: "+prop); |
|
130 |
//System.out.println("positions : "+positions); |
|
131 |
//System.out.println("nwords : "+nWords);
|
|
132 |
// System.out.println("START prop: "+prop);
|
|
133 |
// System.out.println("positions : "+positions);
|
|
134 |
// System.out.println("nwords : "+nWords);
|
|
132 | 135 |
|
133 |
List<List<String>> result = new ArrayList<List<String>>();
|
|
136 |
List<List<String>> result = new ArrayList<>(); |
|
134 | 137 |
|
135 | 138 |
// get all the positions needed, possible overlap |
136 |
HashSet<Integer> allPositionsNeeded = new HashSet<Integer>();
|
|
139 |
HashSet<Integer> allPositionsNeeded = new HashSet<>(); |
|
137 | 140 |
for (int i = 0; i < positions.size(); i++) { |
138 | 141 |
int n = nWords.get(i); |
139 | 142 |
for (int j = 0; j < n; j++) { |
... | ... | |
144 | 147 |
// conversion from List<Integer> to int[] |
145 | 148 |
int[] cpos = new int[allPositionsNeeded.size()]; |
146 | 149 |
int c = 0; |
147 |
for (int i : allPositionsNeeded) cpos[c++] = i; |
|
150 |
for (int i : allPositionsNeeded) |
|
151 |
cpos[c++] = i; |
|
148 | 152 |
|
149 | 153 |
// get values for positions |
150 | 154 |
String[] values; |
151 | 155 |
boolean isWordProperty = !(prop instanceof StructuralUnitProperty); |
152 | 156 |
if (!isWordProperty) { |
153 |
String qname = ((StructuralUnitProperty)prop).getQualifiedName(); |
|
157 |
String qname = ((StructuralUnitProperty) prop).getQualifiedName();
|
|
154 | 158 |
int[] structs = this.cpos2Struc(qname, cpos); |
155 | 159 |
values = this.struc2Str(qname, structs); |
156 |
} else { |
|
160 |
} |
|
161 |
else { |
|
157 | 162 |
String qname = prop.getQualifiedName(); |
158 | 163 |
values = this.cpos2Str(qname, cpos); |
159 | 164 |
} |
160 | 165 |
|
161 | 166 |
// sort results by position |
162 |
TreeMap<Integer, String> map = new TreeMap<Integer, String>();
|
|
163 |
for (int i = 0 ; i < values.length ; i++) {
|
|
167 |
TreeMap<Integer, String> map = new TreeMap<>(); |
|
168 |
for (int i = 0; i < values.length; i++) {
|
|
164 | 169 |
map.put(cpos[i], values[i]); |
165 | 170 |
} |
166 | 171 |
|
167 | 172 |
// fill results |
168 | 173 |
int start, end; |
169 | 174 |
SortedMap<Integer, String> smap; |
170 |
for (int i = 0 ; i < positions.size() ; i++) {
|
|
175 |
for (int i = 0; i < positions.size(); i++) {
|
|
171 | 176 |
start = positions.get(i); |
172 | 177 |
end = positions.get(i) + nWords.get(i); |
173 | 178 |
if (start > end) { |
174 |
Log.warning("Error: trying to get "+prop+" values from "+start+" to "+end);
|
|
179 |
Log.warning("Error: trying to get " + prop + " values from " + start + " to " + end);
|
|
175 | 180 |
result.add(new ArrayList<String>()); |
176 |
} else { |
|
181 |
} |
|
182 |
else { |
|
177 | 183 |
smap = map.subMap(start, end); |
178 |
result.add(new ArrayList<String>(smap.values()));
|
|
184 |
result.add(new ArrayList<>(smap.values())); |
|
179 | 185 |
} |
180 | 186 |
} |
181 |
|
|
187 |
|
|
182 | 188 |
return result; |
183 | 189 |
} |
184 |
|
|
190 |
|
|
185 | 191 |
/** |
186 | 192 |
* Return lists of strings. |
187 | 193 |
* |
... | ... | |
189 | 195 |
* @throws CqiClientException the cqi client exception |
190 | 196 |
*/ |
191 | 197 |
public List<String> getData(StructuralUnitProperty property, CQPCorpus corpus) throws CqiClientException { |
192 |
QueryResult tmp = corpus.query(new CQLQuery("<" + property.getFullName() + ">[]"), UUID.randomUUID().toString(), false); //$NON-NLS-1$ //$NON-NLS-2$
|
|
193 |
List<Match> matches = tmp.getMatches();
|
|
194 |
tmp.drop();
|
|
195 |
ArrayList<String> ret = new ArrayList<String>(new HashSet<String>(Match
|
|
196 |
.getValuesForProperty(property, matches)));
|
|
197 |
return ret;
|
|
198 |
QueryResult tmp = corpus.query(new CQLQuery("<" + property.getFullName() + ">[]"), UUID.randomUUID().toString(), false); //$NON-NLS-1$ //$NON-NLS-2$ |
|
199 |
List<Match> matches = tmp.getMatches(); |
|
200 |
tmp.drop(); |
|
201 |
ArrayList<String> ret = new ArrayList<>(new HashSet<>(Match
|
|
202 |
.getValuesForProperty(property, matches))); |
|
203 |
return ret; |
|
198 | 204 |
} |
199 | 205 |
|
200 | 206 |
/** |
... | ... | |
206 | 212 |
*/ |
207 | 213 |
public List<String> getData(StructuralUnitProperty prop, int number) throws CqiClientException { |
208 | 214 |
|
209 |
QueryResult tmp = prop.getCorpus()
|
|
210 |
.query(
|
|
211 |
new CQLQuery("<" + prop.getFullName() + ">[] expand to "+prop.getName()), UUID.randomUUID().toString(), false); //$NON-NLS-1$ //$NON-NLS-2$
|
|
212 |
if (number > tmp.getNMatch()) number = tmp.getNMatch();
|
|
213 |
List<Match> matches = tmp.getMatches(0, number);
|
|
214 |
tmp.drop();
|
|
215 |
return new ArrayList<String>(new HashSet<String>(Match
|
|
216 |
.getValuesForProperty(prop, matches)));
|
|
217 |
// System.out.println("Data "+property.getQualifiedName()+": "+data);
|
|
215 |
QueryResult tmp = prop.getCorpus() |
|
216 |
.query( |
|
217 |
new CQLQuery("<" + prop.getFullName() + ">[] expand to " + prop.getName()), UUID.randomUUID().toString(), false); //$NON-NLS-1$ //$NON-NLS-2$
|
|
218 |
if (number > tmp.getNMatch()) number = tmp.getNMatch(); |
|
219 |
List<Match> matches = tmp.getMatches(0, number); |
|
220 |
tmp.drop(); |
|
221 |
return new ArrayList<>(new HashSet<>(Match
|
|
222 |
.getValuesForProperty(prop, matches))); |
|
223 |
// System.out.println("Data "+property.getQualifiedName()+": "+data); |
|
218 | 224 |
|
219 | 225 |
} |
220 | 226 |
|
... | ... | |
227 | 233 |
* @return true, if successful |
228 | 234 |
*/ |
229 | 235 |
@Override |
230 |
public abstract boolean connect(String username, String password) throws UnexpectedAnswerException, IOException, CqiServerError ;
|
|
231 |
|
|
236 |
public abstract boolean connect(String username, String password) throws UnexpectedAnswerException, IOException, CqiServerError; |
|
237 |
|
|
232 | 238 |
// None |
233 | 239 |
/** |
234 | 240 |
* Disconnect. |
... | ... | |
237 | 243 |
*/ |
238 | 244 |
@Override |
239 | 245 |
public abstract boolean disconnect() throws UnexpectedAnswerException, CqiServerError, IOException; |
240 |
|
|
246 |
|
|
241 | 247 |
// CQI_CTRL_LAST_GENERAL_ERROR |
242 | 248 |
/** |
243 | 249 |
* return the last CQP error. |
... | ... | |
245 | 251 |
* @return the last error |
246 | 252 |
*/ |
247 | 253 |
@Override |
248 |
public abstract String getLastCqiError() throws UnexpectedAnswerException, IOException, CqiServerError;
|
|
249 |
|
|
254 |
public abstract String getLastCqiError() throws UnexpectedAnswerException, IOException, CqiServerError;
|
|
255 |
|
|
250 | 256 |
// CQI_CTRL_LAST_CQP_ERROR |
251 | 257 |
/** |
252 | 258 |
* return the last CQP error. |
... | ... | |
254 | 260 |
* @return the last error |
255 | 261 |
*/ |
256 | 262 |
@Override |
257 |
public abstract String getLastCQPError() throws UnexpectedAnswerException, IOException, CqiServerError;
|
|
258 |
|
|
263 |
public abstract String getLastCQPError() throws UnexpectedAnswerException, IOException, CqiServerError;
|
|
264 |
|
|
259 | 265 |
// None |
260 | 266 |
/** |
261 | 267 |
* Lists the corpora available on the server. |
... | ... | |
264 | 270 |
*/ |
265 | 271 |
@Override |
266 | 272 |
public abstract String[] listCorpora() throws UnexpectedAnswerException, IOException, CqiServerError; |
267 |
|
|
273 |
|
|
268 | 274 |
// None |
269 | 275 |
/** |
270 | 276 |
* Gives the corpus charset. |
... | ... | |
274 | 280 |
*/ |
275 | 281 |
@Override |
276 | 282 |
public abstract String corpusCharset(String corpus) throws UnexpectedAnswerException, IOException, CqiServerError; |
277 |
|
|
283 |
|
|
278 | 284 |
// None (not really implemented anyway) |
279 | 285 |
/** |
280 | 286 |
* Gives the corpus properties. |
... | ... | |
284 | 290 |
*/ |
285 | 291 |
@Override |
286 | 292 |
public abstract String[] corpusProperties(String corpus) throws UnexpectedAnswerException, IOException, CqiServerError; |
287 |
|
|
293 |
|
|
288 | 294 |
// CQI_CQP_ERROR_NO_SUCH_CORPUS |
289 | 295 |
/** |
290 | 296 |
* Gives the corpus positional attributes. |
... | ... | |
294 | 300 |
*/ |
295 | 301 |
@Override |
296 | 302 |
public abstract String[] corpusPositionalAttributes(String corpusID) throws UnexpectedAnswerException, IOException, CqiServerError; |
297 |
|
|
303 |
|
|
298 | 304 |
// CQI_CQP_ERROR_NO_SUCH_CORPUS |
299 | 305 |
/** |
300 | 306 |
* Gives the corpus structural attributes. |
... | ... | |
307 | 313 |
*/ |
308 | 314 |
@Override |
309 | 315 |
public abstract String[] corpusStructuralAttributes(String corpus) throws UnexpectedAnswerException, IOException, CqiServerError; |
310 |
|
|
316 |
|
|
311 | 317 |
// CQI_CQP_ERROR_NO_SUCH_CORPUS, CQI_CL_ERROR_NO_SUCH_ATTRIBUTE, |
312 | 318 |
// CQI_CL_ERROR_WRONG_ATTRIBUTE_TYPE |
313 | 319 |
/** |
... | ... | |
321 | 327 |
*/ |
322 | 328 |
@Override |
323 | 329 |
public abstract boolean corpusStructuralAttributeHasValues(String attribute) throws UnexpectedAnswerException, IOException, |
324 |
CqiServerError; |
|
325 |
|
|
330 |
CqiServerError;
|
|
331 |
|
|
326 | 332 |
// CQI_CQP_ERROR_NO_SUCH_CORPUS |
327 | 333 |
/** |
328 | 334 |
* Gives the corpus alignment attributes. |
... | ... | |
335 | 341 |
*/ |
336 | 342 |
@Override |
337 | 343 |
public abstract String[] corpusAlignementAttributes(String corpus) throws UnexpectedAnswerException, IOException, CqiServerError; |
338 |
|
|
344 |
|
|
339 | 345 |
// CQI_CQP_ERROR_NO_SUCH_CORPUS |
340 | 346 |
/** |
341 | 347 |
* Gives the corpus full name. |
... | ... | |
348 | 354 |
*/ |
349 | 355 |
@Override |
350 | 356 |
public abstract String corpusFullName(String corpus) throws UnexpectedAnswerException, IOException, CqiServerError; |
351 |
|
|
357 |
|
|
352 | 358 |
/** |
353 | 359 |
* Gives the corpus info listed in the .INFO file. |
354 | 360 |
* |
... | ... | |
360 | 366 |
*/ |
361 | 367 |
@Override |
362 | 368 |
public abstract String[] corpusInfo(String corpus) throws UnexpectedAnswerException, IOException, CqiServerError; |
363 |
|
|
369 |
|
|
364 | 370 |
/** |
365 | 371 |
* Drop a corpus. |
366 | 372 |
* |
... | ... | |
369 | 375 |
*/ |
370 | 376 |
@Override |
371 | 377 |
public abstract void dropCorpus(String corpus) throws Exception; |
372 |
|
|
378 |
|
|
373 | 379 |
// CQI_CQP_ERROR_NO_SUCH_CORPUS, CQI_CL_ERROR_NO_SUCH_ATTRIBUTE, |
374 | 380 |
// CQI_CL_ERROR_WRONG_ATTRIBUTE_TYPE, CQI_CL_ERROR_CORPUS_ACCESS |
375 | 381 |
/** |
... | ... | |
383 | 389 |
*/ |
384 | 390 |
@Override |
385 | 391 |
public abstract int attributeSize(String attribute) throws IOException, UnexpectedAnswerException, CqiServerError; |
386 |
|
|
392 |
|
|
387 | 393 |
// CQI_CQP_ERROR_NO_SUCH_CORPUS, CQI_CL_ERROR_NO_SUCH_ATTRIBUTE, |
388 | 394 |
// CQI_CL_ERROR_WRONG_ATTRIBUTE_TYPE, CQI_CL_ERROR_CORPUS_ACCESS |
389 | 395 |
/** |
... | ... | |
398 | 404 |
*/ |
399 | 405 |
@Override |
400 | 406 |
public abstract int lexiconSize(String attribute) throws IOException, UnexpectedAnswerException, CqiServerError; |
401 |
|
|
407 |
|
|
402 | 408 |
/** |
403 | 409 |
* Drop attribute. |
404 | 410 |
* |
... | ... | |
409 | 415 |
*/ |
410 | 416 |
@Override |
411 | 417 |
public abstract void dropAttribute(String attribute) throws IOException, UnexpectedAnswerException, CqiServerError; |
412 |
|
|
418 |
|
|
413 | 419 |
/** |
414 | 420 |
* Converts an array of attribute values to their ID. |
415 | 421 |
* |
... | ... | |
422 | 428 |
*/ |
423 | 429 |
@Override |
424 | 430 |
public abstract int[] str2Id(String attribute, String[] strings) throws IOException, UnexpectedAnswerException, CqiServerError; |
425 |
|
|
431 |
|
|
426 | 432 |
/** |
427 | 433 |
* Converts an array of attribute ID to their values. |
428 | 434 |
* |
... | ... | |
435 | 441 |
*/ |
436 | 442 |
@Override |
437 | 443 |
public abstract String[] id2Str(String attribute, int[] ids) throws UnexpectedAnswerException, IOException, CqiServerError; |
438 |
|
|
444 |
|
|
439 | 445 |
/** |
440 | 446 |
* Converts an array of attribute IDs to their frequency. |
441 | 447 |
* |
... | ... | |
448 | 454 |
*/ |
449 | 455 |
@Override |
450 | 456 |
public abstract int[] id2Freq(String attribute, int[] ids) throws UnexpectedAnswerException, IOException, CqiServerError; |
451 |
|
|
457 |
|
|
452 | 458 |
/** |
453 | 459 |
* Converts an array of position to their ID given an attribute. |
454 | 460 |
* |
... | ... | |
461 | 467 |
*/ |
462 | 468 |
@Override |
463 | 469 |
public abstract int[] cpos2Id(String attribute, int[] cpos) throws UnexpectedAnswerException, IOException, CqiServerError; |
464 |
|
|
470 |
|
|
465 | 471 |
/** |
466 | 472 |
* Converts an array of position to their value given an attribute. |
467 | 473 |
* |
... | ... | |
474 | 480 |
*/ |
475 | 481 |
@Override |
476 | 482 |
public abstract String[] cpos2Str(String attribute, int[] cpos) throws UnexpectedAnswerException, IOException, CqiServerError; |
477 |
|
|
483 |
|
|
478 | 484 |
/** |
479 | 485 |
* Computes for each position of an array the Id of the enclosing structural |
480 | 486 |
* attribute. |
... | ... | |
488 | 494 |
*/ |
489 | 495 |
@Override |
490 | 496 |
public abstract int[] cpos2Struc(String attribute, int[] cpos) throws UnexpectedAnswerException, IOException, CqiServerError; |
491 |
|
|
497 |
|
|
492 | 498 |
/** |
493 | 499 |
* Computes for each position of an array the position of the left boundary |
494 | 500 |
* of the enclosing structural attribute. |
... | ... | |
502 | 508 |
*/ |
503 | 509 |
@Override |
504 | 510 |
public abstract int[] cpos2LBound(String attribute, int[] cpos) throws UnexpectedAnswerException, IOException, CqiServerError; |
505 |
|
|
511 |
|
|
506 | 512 |
/** |
507 | 513 |
* Computes for each position of an array the position of the right boundary |
508 | 514 |
* of the enclosing structural attribute. |
... | ... | |
516 | 522 |
*/ |
517 | 523 |
@Override |
518 | 524 |
public abstract int[] cpos2RBound(String attribute, int[] cpos) throws UnexpectedAnswerException, IOException, CqiServerError; |
519 |
|
|
525 |
|
|
520 | 526 |
/** |
521 | 527 |
* Computes for each position of an array the Id of the enclosing alignment |
522 | 528 |
* attribute. |
... | ... | |
530 | 536 |
*/ |
531 | 537 |
@Override |
532 | 538 |
public abstract int[] cpos2Alg(String attribute, int[] cpos) throws UnexpectedAnswerException, IOException, CqiServerError; |
533 |
|
|
539 |
|
|
534 | 540 |
/** |
535 | 541 |
* Retrieves annotated string values of structure regions in <strucs>; "" if |
536 | 542 |
* out of range. |
... | ... | |
544 | 550 |
*/ |
545 | 551 |
@Override |
546 | 552 |
public abstract String[] struc2Str(String attribute, int[] strucs) throws UnexpectedAnswerException, IOException, CqiServerError; |
547 |
|
|
553 |
|
|
548 | 554 |
/** |
549 | 555 |
* Retrieves all corpus positions where the given token occurs. |
550 | 556 |
* |
... | ... | |
557 | 563 |
*/ |
558 | 564 |
@Override |
559 | 565 |
public abstract int[] id2Cpos(String attribute, int id) throws UnexpectedAnswerException, IOException, CqiServerError; |
560 |
|
|
566 |
|
|
561 | 567 |
/** |
562 | 568 |
* Retrieves all corpus positions where one of the tokens in <id_list> |
563 | 569 |
* occurs; the returned list is sorted as a whole, not per token id. |
... | ... | |
571 | 577 |
*/ |
572 | 578 |
@Override |
573 | 579 |
public abstract int[] idList2Cpos(String attribute, int[] ids) throws UnexpectedAnswerException, IOException, CqiServerError; |
574 |
|
|
580 |
|
|
575 | 581 |
/** |
576 | 582 |
* Retrieves the lexicon IDs of all tokens that match <regex>; the returned |
577 | 583 |
* list may be empty (size 0). |
... | ... | |
585 | 591 |
*/ |
586 | 592 |
@Override |
587 | 593 |
public abstract int[] regex2Id(String attribute, String regex) throws UnexpectedAnswerException, IOException, CqiServerError; |
588 |
|
|
594 |
|
|
589 | 595 |
/** |
590 | 596 |
* Retrieves the start and end corpus positions of structure region <struc>. |
591 | 597 |
* |
... | ... | |
598 | 604 |
*/ |
599 | 605 |
@Override |
600 | 606 |
public abstract int[] struc2Cpos(String attribute, int struc) throws UnexpectedAnswerException, IOException, CqiServerError; |
601 |
|
|
607 |
|
|
602 | 608 |
/** |
603 | 609 |
* Retrieves start and end corpus positions of an alignment region in the |
604 | 610 |
* source and target corpora<struc>. |
... | ... | |
606 | 612 |
* @param attribute the attribute |
607 | 613 |
* @param struc the struc |
608 | 614 |
* @return an array of size 4 containing (src_start, src_end, target_start, |
609 |
* target_end) |
|
615 |
* target_end)
|
|
610 | 616 |
* @throws UnexpectedAnswerException Signals that the data read on the socket is unexpected |
611 | 617 |
* @throws IOException Signals that an I/O exception has occurred. |
612 | 618 |
* @throws CqiServerError the cqi server error |
613 | 619 |
*/ |
614 | 620 |
@Override |
615 | 621 |
public abstract int[] alg2Cpos(String attribute, int struc) throws UnexpectedAnswerException, IOException, CqiServerError; |
616 |
|
|
622 |
|
|
617 | 623 |
/** |
618 | 624 |
* Runs a CQL query. |
619 | 625 |
* |
... | ... | |
626 | 632 |
*/ |
627 | 633 |
@Override |
628 | 634 |
public abstract void cqpQuery(String motherCorpus, String subcorpus, String query) throws IOException, UnexpectedAnswerException, CqiServerError; |
629 |
|
|
635 |
|
|
630 | 636 |
/** |
631 | 637 |
* Runs a CQP query line. |
632 | 638 |
* |
... | ... | |
639 | 645 |
*/ |
640 | 646 |
@Override |
641 | 647 |
public abstract void query(String query) throws IOException, UnexpectedAnswerException, CqiServerError; |
642 |
|
|
648 |
|
|
643 | 649 |
/** |
644 |
* Runs a CQP query.
|
|
650 |
* Load a CQP corpus (system) from a registry file
|
|
645 | 651 |
* |
646 | 652 |
* @param regfilepath the path of the directory containing the corpus registry file |
647 |
* @param subcorpus the subcorpus |
|
648 | 653 |
* @param entry the identifier of the corpus (registry entry) to load |
649 |
* @return
|
|
654 |
* @return |
|
650 | 655 |
* @throws IOException Signals that an I/O exception has occurred. |
651 | 656 |
* @throws UnexpectedAnswerException Signals that the data read on the socket is unexpected |
652 | 657 |
* @throws CqiServerError the cqi server error |
653 | 658 |
*/ |
654 | 659 |
@Override |
655 | 660 |
public abstract boolean load_a_system_corpus(String regfilepath, String entry) throws IOException, UnexpectedAnswerException, CqiServerError; |
656 |
|
|
661 |
|
|
657 | 662 |
/** |
658 | 663 |
* Lists all the subcorpora of a corpus. |
659 | 664 |
* |
... | ... | |
665 | 670 |
*/ |
666 | 671 |
@Override |
667 | 672 |
public abstract String[] listSubcorpora(String corpus) throws UnexpectedAnswerException, IOException, CqiServerError; |
668 |
|
|
673 |
|
|
669 | 674 |
/** |
670 | 675 |
* Gives the size of a subcorpus. |
671 | 676 |
* |
... | ... | |
683 | 688 |
*/ |
684 | 689 |
@Override |
685 | 690 |
public abstract int subCorpusSize(String subcorpus) throws IOException, |
686 |
UnexpectedAnswerException, CqiServerError; |
|
687 |
|
|
691 |
UnexpectedAnswerException, CqiServerError;
|
|
692 |
|
|
688 | 693 |
/** |
689 | 694 |
* Checks whether a subcorpus has a field. |
690 | 695 |
* |
... | ... | |
704 | 709 |
*/ |
705 | 710 |
@Override |
706 | 711 |
public abstract boolean subCorpusHasField(String subcorpus, byte field) throws IOException, UnexpectedAnswerException, CqiServerError; |
707 |
|
|
712 |
|
|
708 | 713 |
/** |
709 | 714 |
* Dumps the values of <field> for match ranges <first> .. <last> in |
710 | 715 |
* <subcorpus>. <field> is one of the CQI_CONST_FIELD_* constants. |
... | ... | |
729 | 734 |
*/ |
730 | 735 |
@Override |
731 | 736 |
public abstract int[] dumpSubCorpus(String subcorpus, byte field, int first, int last) throws IOException, UnexpectedAnswerException, |
732 |
CqiServerError; |
|
733 |
|
|
737 |
CqiServerError;
|
|
738 |
|
|
734 | 739 |
/** |
735 | 740 |
* Drops a subcorpus. |
736 | 741 |
* |
... | ... | |
746 | 751 |
*/ |
747 | 752 |
@Override |
748 | 753 |
public abstract void dropSubCorpus(String subcorpus) throws IOException, |
749 |
UnexpectedAnswerException, CqiServerError; |
|
750 |
|
|
754 |
UnexpectedAnswerException, CqiServerError;
|
|
755 |
|
|
751 | 756 |
/** |
752 | 757 |
* Returns <n> (id, frequency) pairs flattened into a list of size 2*<n> NB: |
753 | 758 |
* pairs are sorted by frequency desc. |
... | ... | |
775 | 780 |
public abstract int[][] fdist1(String subcorpus, int cutoff, |
776 | 781 |
byte field, String attribute) throws IOException, |
777 | 782 |
UnexpectedAnswerException, CqiServerError; |
778 |
|
|
783 |
|
|
779 | 784 |
/** |
780 | 785 |
* Returns <n> (id1, id2, frequency) pairs flattened into a list of size |
781 | 786 |
* 3*<n> NB: triples are sorted by frequency desc. . |
... | ... | |
805 | 810 |
@Override |
806 | 811 |
public abstract int[][] fdist2(String subcorpus, int cutoff, |
807 | 812 |
byte field1, String attribute1, byte field2, String attribute2) throws IOException, UnexpectedAnswerException, CqiServerError; |
808 |
|
|
813 |
|
|
809 | 814 |
@Override |
810 |
public abstract boolean reconnect() ; |
|
811 |
} |
|
815 |
public abstract boolean reconnect(); |
|
816 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/export/ExportTextContentMacro.groovy (revision 2924) | ||
---|---|---|
1 |
// Copyright © 2020 ENS de Lyon, CNRS, University of Franche-Comté |
|
2 |
// @author mdecorde |
|
3 |
|
|
1 | 4 |
// STANDARD DECLARATIONS |
2 | 5 |
package org.txm.macro.export |
3 | 6 |
|
... | ... | |
11 | 14 |
import groovy.transform.Field |
12 | 15 |
import org.txm.rcp.swt.widget.parameters.* |
13 | 16 |
|
14 |
println "corpora selection: "+corpusViewSelection |
|
15 |
|
|
16 | 17 |
if (!(corpusViewSelection instanceof CQPCorpus)) { |
17 | 18 |
println "Please select a corpus" |
18 |
return;
|
|
19 |
return |
|
19 | 20 |
} |
20 | 21 |
|
21 | 22 |
// PARAMETERS |
22 | 23 |
|
23 |
@Field @Option(name="exportDirectory", usage="Result directory ", widget="Folder", required=true, def="export")
|
|
24 |
@Field @Option(name="exportDirectory", usage="results directory", widget="Folder", required=true, def="export")
|
|
24 | 25 |
File exportDirectory |
25 | 26 |
|
26 |
@Field @Option(name="lineSeparatorStructureName", usage="line separator structure", widget="String", required=false, def="p")
|
|
27 |
@Field @Option(name="lineSeparatorStructureName", usage="name of the structure to use to produce the output lines", widget="String", required=false, def="p")
|
|
27 | 28 |
def lineSeparatorStructureName |
28 | 29 |
|
29 | 30 |
if (!ParametersDialog.open(this)) return |
... | ... | |
32 | 33 |
|
33 | 34 |
if (!exportDirectory.exists()) exportDirectory.mkdirs() |
34 | 35 |
|
35 |
CQPCorpus corpus = corpusViewSelection
|
|
36 |
CQI = CQPSearchEngine.getCqiClient();
|
|
36 |
def corpus = corpusViewSelection
|
|
37 |
def CQI = CQPSearchEngine.getCqiClient()
|
|
37 | 38 |
|
38 | 39 |
def lineSeparatorStructure = corpus.getStructuralUnit(lineSeparatorStructureName) |
39 | 40 |
|
40 | 41 |
if (lineSeparatorStructure == null) { |
41 |
println "The $lineSeparatorStructureName structure does not exists in the $corpus corpus"
|
|
42 |
return;
|
|
42 |
println "** The $lineSeparatorStructureName structure does not exist in the $corpus corpus"
|
|
43 |
return |
|
43 | 44 |
} |
44 | 45 |
|
45 | 46 |
def breaks_pos = Arrays.asList(corpus.query(new CQLQuery("[]</"+lineSeparatorStructureName+">"),"test", false).getEnds()) |
46 |
println breaks_pos |
|
47 |
println "End of structure positions = "+breaks_pos
|
|
47 | 48 |
|
48 |
println "Exporting $corpus text content to $exportDirectory" |
|
49 |
println "Exporting $corpus text content to $exportDirectory..."
|
|
49 | 50 |
|
50 | 51 |
def wordProperty = corpus.getWordProperty() |
51 | 52 |
def textidProperty = corpus.getStructuralUnit("text").getProperty("id") |
52 |
def textStartBoundaries = corpus.getTextStartLimits();
|
|
53 |
def textEndBoundaries = corpus.getTextEndLimits();
|
|
53 |
def textStartBoundaries = corpus.getTextStartLimits() |
|
54 |
def textEndBoundaries = corpus.getTextEndLimits() |
|
54 | 55 |
int[] struct_pos = CQI.cpos2Struc(textidProperty.getQualifiedName(), textStartBoundaries) |
55 | 56 |
String[] textids = CQI.struc2Str(textidProperty.getQualifiedName(), struct_pos) |
56 |
println ""+textStartBoundaries.size()+" texts" |
|
57 |
if (textStartBoundaries.size() == 1) { |
|
58 |
println "1 text" |
|
59 |
} else { |
|
60 |
println ""+textStartBoundaries.size()+" texts" |
|
61 |
} |
|
57 | 62 |
|
58 | 63 |
for (int i = 0 ; i < textStartBoundaries.size() ; i++) { |
59 |
int start = textStartBoundaries[i];
|
|
64 |
int start = textStartBoundaries[i] |
|
60 | 65 |
int end = textEndBoundaries[i] |
61 | 66 |
|
62 | 67 |
File txtFile = new File(exportDirectory, textids[i]+".txt") |
63 |
print ".."
|
|
68 |
print "." |
|
64 | 69 |
def writer = txtFile.newWriter("UTF-8") |
65 | 70 |
int[] positions = new int[end - start + 1] |
66 |
int c = 0;
|
|
71 |
int c = 0 |
|
67 | 72 |
for (int p : start..end) { |
68 | 73 |
positions[c++] = p |
69 | 74 |
} |
70 | 75 |
int[] idx = CQI.cpos2Id(wordProperty.getQualifiedName(), positions) |
71 | 76 |
def words = CQI.id2Str(wordProperty.getQualifiedName(), idx) |
77 |
def tmp = [] |
|
72 | 78 |
for (int j = 0 ; j < positions.length ; j++) { |
73 | 79 |
int p = positions[j] |
74 |
if (breaks_pos.contains(p)) words[j] = words[j] +"\n" |
|
80 |
tmp << words[j] |
|
81 |
if (breaks_pos.contains(p)) { |
|
82 |
writer.println LangFormater.format(tmp.join(" "), corpus.getLang()) |
|
83 |
tmp = [] |
|
84 |
} |
|
75 | 85 |
} |
76 |
writer.println LangFormater.format(StringUtils.join(words, " "), |
|
77 |
corpus.getLang()); |
|
78 |
writer.close(); |
|
86 |
if (tmp.size() > 0) { |
|
87 |
writer.println LangFormater.format(tmp.join(" "), corpus.getLang()) |
|
88 |
} |
|
89 |
writer.close() |
|
79 | 90 |
} |
80 | 91 |
|
81 | 92 |
println "\nDone, result saved in "+exportDirectory.getAbsolutePath() |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 2924) | ||
---|---|---|
12 | 12 |
import org.txm.importer.xtz.* |
13 | 13 |
|
14 | 14 |
public class XTZDefaultPagerStep { |
15 |
|
|
15 |
|
|
16 | 16 |
List<String> NoSpaceBefore; |
17 |
|
|
17 |
|
|
18 | 18 |
/** The No space after. */ |
19 | 19 |
List<String> NoSpaceAfter; |
20 |
|
|
20 |
|
|
21 | 21 |
/** The wordcount. */ |
22 | 22 |
int wordcount = 0; |
23 |
|
|
23 |
|
|
24 | 24 |
/** The pagecount. */ |
25 | 25 |
int pagecount = 0; |
26 |
|
|
26 |
|
|
27 | 27 |
/** The wordmax. */ |
28 | 28 |
int wordmax = 0; |
29 |
|
|
29 |
|
|
30 | 30 |
/** The basename. */ |
31 | 31 |
String basename = ""; |
32 | 32 |
String txtname = ""; |
33 | 33 |
File outdir; |
34 |
|
|
34 |
|
|
35 | 35 |
/** The wordid. */ |
36 | 36 |
String wordid; |
37 |
|
|
37 |
|
|
38 | 38 |
/** The first word. */ |
39 | 39 |
boolean firstWord = true; |
40 |
|
|
40 |
|
|
41 | 41 |
/** The wordvalue. */ |
42 | 42 |
String wordvalue = ""; |
43 |
|
|
43 |
|
|
44 | 44 |
/** The interpvalue. */ |
45 | 45 |
String interpvalue = ""; |
46 |
|
|
46 |
|
|
47 | 47 |
/** The lastword. */ |
48 | 48 |
String lastword = " "; |
49 |
|
|
49 |
|
|
50 | 50 |
/** The wordtype. */ |
51 | 51 |
String wordtype; |
52 |
|
|
52 |
|
|
53 | 53 |
/** The flagform. */ |
54 | 54 |
boolean flagform = false; |
55 |
|
|
55 |
|
|
56 | 56 |
/** The flaginterp. */ |
57 | 57 |
boolean flaginterp = false; |
58 |
|
|
58 |
|
|
59 | 59 |
/** The url. */ |
60 | 60 |
private def url; |
61 |
|
|
61 |
|
|
62 | 62 |
/** The input data. */ |
63 | 63 |
private def inputData; |
64 |
|
|
64 |
|
|
65 | 65 |
/** The factory. */ |
66 | 66 |
private def factory; |
67 |
|
|
67 |
|
|
68 | 68 |
/** The parser. */ |
69 | 69 |
private XMLStreamReader parser; |
70 |
|
|
70 |
|
|
71 | 71 |
/** The writer. */ |
72 | 72 |
OutputStreamWriter writer; |
73 |
|
|
73 |
|
|
74 | 74 |
/** The pagedWriter. */ |
75 | 75 |
StaxStackWriter pagedWriter = null; |
76 |
|
|
76 |
|
|
77 | 77 |
/** The infile. */ |
78 | 78 |
File infile; |
79 |
|
|
79 |
|
|
80 | 80 |
/** The outfile. */ |
81 | 81 |
File outfile; |
82 |
|
|
82 |
|
|
83 | 83 |
/** The pages. */ |
84 | 84 |
//TODO enhance this to store the page name/id as well |
85 | 85 |
ArrayList<File> pages = new ArrayList<File>(); |
86 |
|
|
86 |
|
|
87 | 87 |
/** The idxstart. */ |
88 | 88 |
ArrayList<String> idxstart = new ArrayList<String>(); |
89 | 89 |
String paginationElement; |
... | ... | |
92 | 92 |
def noteElements = new HashSet<String>(); |
93 | 93 |
def outOfTextElements = new HashSet<String>(); |
94 | 94 |
XTZPager pager; |
95 |
|
|
95 |
|
|
96 | 96 |
/** |
97 | 97 |
* Instantiates a new pager. |
98 | 98 |
* |
... | ... | |
118 | 118 |
this.infile = infile; |
119 | 119 |
this.wordTag= pager.wordTag; |
120 | 120 |
outdir.mkdirs() |
121 |
|
|
121 |
|
|
122 | 122 |
inputData = new BufferedInputStream(url.openStream()); |
123 | 123 |
factory = XMLInputFactory.newInstance(); |
124 | 124 |
parser = factory.createXMLStreamReader(inputData); |
125 |
|
|
125 |
|
|
126 | 126 |
String notesListString = pager.getImportModule().getProject().getTextualPlan("Note") |
127 | 127 |
if (notesListString != null) for (def s : notesListString.split(",")) noteElements << s; |
128 |
|
|
128 |
|
|
129 | 129 |
String elems = pager.getImportModule().getProject().getTextualPlan("OutSideTextTagsAndKeepContent") |
130 | 130 |
if (elems != null) for (def s : elems.split(",")) outOfTextElements << s; |
131 |
|
|
131 |
|
|
132 | 132 |
//process(); |
133 | 133 |
} |
134 |
|
|
134 |
|
|
135 | 135 |
public String getAttributeValue(def parser, String ns, String name) { |
136 | 136 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
137 | 137 |
if (name == parser.getAttributeLocalName(i)) { |
... | ... | |
140 | 140 |
} |
141 | 141 |
return ""; |
142 | 142 |
} |
143 |
|
|
143 |
|
|
144 | 144 |
private def closeMultiWriter() { |
145 | 145 |
if (pagedWriter != null) { |
146 | 146 |
def tags = pagedWriter.getTagStack().clone(); |
147 |
// println "STACK="+pagedWriter.getTagStack() |
|
148 |
// def stack = Thread.currentThread().getStackTrace(); |
|
149 |
// int m = Math.min(15, stack.size()-1) |
|
150 |
// for (def s : stack[1..m]) println s |
|
151 |
// println "FILE ="+outfile |
|
147 |
// println "STACK="+pagedWriter.getTagStack()
|
|
148 |
// def stack = Thread.currentThread().getStackTrace();
|
|
149 |
// int m = Math.min(15, stack.size()-1)
|
|
150 |
// for (def s : stack[1..m]) println s
|
|
151 |
// println "FILE ="+outfile
|
|
152 | 152 |
if (firstWord) { // there was no words |
153 | 153 |
pagedWriter.writeCharacters(""); |
154 | 154 |
this.idxstart.add("${wordTag}_0") |
... | ... | |
174 | 174 |
} |
175 | 175 |
notes.clear() |
176 | 176 |
} |
177 |
|
|
177 |
|
|
178 | 178 |
pagedWriter.close(); |
179 | 179 |
|
180 |
// println "STACK TO REWRITE: $tags" |
|
180 |
// println "STACK TO REWRITE: $tags"
|
|
181 | 181 |
for (int i = 0 ; i < tags.size() ; i++) { |
182 | 182 |
String tag = tags.remove(0) |
183 | 183 |
i-- |
184 |
// println " tag=$tag" |
|
184 |
// println " tag=$tag"
|
|
185 | 185 |
if (tag == "div") { |
186 | 186 |
break; // remove elements until first "div" tag |
187 | 187 |
} |
188 | 188 |
} |
189 |
// println "STACK TO REWRITE2: $tags" |
|
189 |
// println "STACK TO REWRITE2: $tags"
|
|
190 | 190 |
|
191 | 191 |
return tags; |
192 | 192 |
} else { |
193 | 193 |
return []; |
194 | 194 |
} |
195 | 195 |
} |
196 |
|
|
196 |
|
|
197 | 197 |
/** |
198 | 198 |
* Creates the next output. |
199 | 199 |
* |
... | ... | |
207 | 207 |
outfile = new File(outdir, txtname+"_"+(++pagecount)+".html") |
208 | 208 |
pages.add(outfile) |
209 | 209 |
firstWord = true; // waiting for next word |
210 |
|
|
210 |
|
|
211 | 211 |
pagedWriter = new StaxStackWriter(outfile, "UTF-8") |
212 |
|
|
212 |
|
|
213 | 213 |
//pagedWriter.writeStartDocument() |
214 | 214 |
pagedWriter.writeDTD("<!DOCTYPE html>") |
215 | 215 |
pagedWriter.writeCharacters("\n") |
... | ... | |
227 | 227 |
pagedWriter.writeCharacters("\n") |
228 | 228 |
pagedWriter.writeStartElement("body") //<body> |
229 | 229 |
pagedWriter.writeStartElement("div", ["class": "txmeditionpage"]) //<div> |
230 |
// println "OPENING: $tags" |
|
230 |
// println "OPENING: $tags"
|
|
231 | 231 |
pagedWriter.writeStartElements(tags) |
232 | 232 |
return true; |
233 | 233 |
} catch (Exception e) { |
... | ... | |
236 | 236 |
return false; |
237 | 237 |
} |
238 | 238 |
} |
239 |
|
|
239 |
|
|
240 | 240 |
/** |
241 | 241 |
* Creates the output. |
242 | 242 |
* |
... | ... | |
251 | 251 |
return false; |
252 | 252 |
} |
253 | 253 |
} |
254 |
|
|
254 |
|
|
255 | 255 |
/** |
256 | 256 |
* Gets the page files. |
257 | 257 |
* |
... | ... | |
260 | 260 |
public ArrayList<File> getPageFiles() { |
261 | 261 |
return pages; |
262 | 262 |
} |
263 |
|
|
263 |
|
|
264 | 264 |
/** |
265 | 265 |
* Gets the idx. |
266 | 266 |
* |
... | ... | |
269 | 269 |
public ArrayList<String> getIdx() { |
270 | 270 |
return idxstart; |
271 | 271 |
} |
272 |
|
|
272 |
|
|
273 | 273 |
/** |
274 | 274 |
* Go to text. |
275 | 275 |
*/ |
... | ... | |
280 | 280 |
return; |
281 | 281 |
} |
282 | 282 |
} |
283 |
|
|
283 |
|
|
284 | 284 |
def notes = [] |
285 | 285 |
def currentOutOfTextElements = [] // stack of element with out of text to edit opened element |
286 | 286 |
def writeOutOfTextToEditText = false |
... | ... | |
288 | 288 |
* Process. |
289 | 289 |
*/ |
290 | 290 |
public boolean process() { |
291 |
|
|
291 |
|
|
292 | 292 |
try { |
293 | 293 |
boolean flagNote = false; |
294 | 294 |
String noteContent = ""; |
295 | 295 |
String rend = "" |
296 | 296 |
goToText(); |
297 |
|
|
297 |
|
|
298 | 298 |
String localname = ""; |
299 | 299 |
if (!createNextOutput()) { |
300 | 300 |
return false; |
301 | 301 |
} |
302 |
|
|
302 |
|
|
303 | 303 |
for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) { |
304 | 304 |
rend = ""; |
305 | 305 |
switch (event) { |
... | ... | |
311 | 311 |
} else if (currentOutOfTextElements.size() > 0) { |
312 | 312 |
currentOutOfTextElements << localname |
313 | 313 |
} |
314 |
|
|
314 |
|
|
315 | 315 |
if (localname == paginationElement) { |
316 | 316 |
createNextOutput() |
317 | 317 |
wordcount=0; |
... | ... | |
320 | 320 |
pagedWriter.writeElement("p", ["class":"txmeditionpb", "align":"center"], getAttributeValue(parser, null,"n")) |
321 | 321 |
} |
322 | 322 |
} |
323 |
|
|
323 |
|
|
324 | 324 |
rend = getAttributeValue(parser, null, "rend") |
325 | 325 |
if (rend == null) rend = ""; |
326 |
|
|
326 |
|
|
327 | 327 |
switch (localname) { |
328 | 328 |
case "text": |
329 | 329 |
LinkedHashMap attributes = new LinkedHashMap(); |
330 | 330 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
331 | 331 |
attributes[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i).toString() |
332 | 332 |
} |
333 |
|
|
333 |
|
|
334 | 334 |
pagedWriter.write("\n") |
335 | 335 |
pagedWriter.writeStartElement("p") |
336 | 336 |
pagedWriter.writeAttribute("class", rend) |
337 | 337 |
if (attributes.containsKey("id")) { |
338 | 338 |
pagedWriter.writeElement("h3", attributes["id"]) |
339 | 339 |
} |
340 |
|
|
340 |
|
|
341 | 341 |
pagedWriter.writeStartElement("table") |
342 | 342 |
for (String k : attributes.keySet()) { |
343 | 343 |
if (k == "id") continue; |
344 | 344 |
if (k == "rend") continue; |
345 |
|
|
345 |
|
|
346 | 346 |
pagedWriter.writeStartElement("tr") |
347 | 347 |
pagedWriter.writeElement("td", k) |
348 | 348 |
pagedWriter.writeElement("td", attributes[k]) |
... | ... | |
414 | 414 |
break; |
415 | 415 |
case "sp": |
416 | 416 |
pagedWriter.writeStartElement("p", ["class":"turn"]) |
417 |
pagedWriter.writeStartElement("span") |
|
418 |
pagedWriter.writeAttribute("class", "spk") |
|
419 |
pagedWriter.writeCharacters(parser.getAttributeValue(null,"speaker")+": ") |
|
420 |
pagedWriter.writeEndElement() // span@class=spk |
|
417 |
|
|
418 |
if (parser.getAttributeValue(null,"speaker") != null) { |
|
419 |
pagedWriter.writeStartElement("span") |
|
420 |
pagedWriter.writeAttribute("class", "spk") |
|
421 |
pagedWriter.writeCharacters(parser.getAttributeValue(null,"speaker")+": ") |
|
422 |
pagedWriter.writeEndElement() // span@class=spk |
|
423 |
} |
|
424 |
|
|
421 | 425 |
break; |
422 | 426 |
case "u": |
423 |
//pagedWriter.writeStartElement("p", ["class":"u"]) |
|
424 |
pagedWriter.writeStartElement("span") |
|
425 |
pagedWriter.writeAttribute("class", "sync") |
|
426 |
pagedWriter.writeCharacters(parser.getAttributeValue(null,"time")) |
|
427 |
//pagedWriter.writeEndElement(); // span@class=spk |
|
427 |
//pagedWriter.writeStartElement("p", ["class":"u"]) |
|
428 |
if (parser.getAttributeValue(null,"time") != null) { |
|
429 |
pagedWriter.writeStartElement("span") |
|
430 |
pagedWriter.writeAttribute("class", "sync") |
|
431 |
pagedWriter.writeCharacters(parser.getAttributeValue(null,"time")) |
|
432 |
} |
|
433 |
//pagedWriter.writeEndElement(); // span@class=spk |
|
428 | 434 |
break; |
429 | 435 |
case "div": |
430 | 436 |
case "div1": |
... | ... | |
440 | 446 |
break; |
441 | 447 |
case wordTag: |
442 | 448 |
wordid = getAttributeValue(parser, null,"id"); |
443 |
|
|
449 |
|
|
444 | 450 |
wordcount++; |
445 | 451 |
if (wordcount >= wordmax) { |
446 | 452 |
createNextOutput(); |
447 | 453 |
} |
448 |
|
|
454 |
|
|
449 | 455 |
if (firstWord) { |
450 | 456 |
firstWord = false; |
451 | 457 |
this.idxstart.add(wordid); |
452 | 458 |
} |
453 |
|
|
459 |
|
|
454 | 460 |
break; |
455 | 461 |
case "ana": |
456 | 462 |
flaginterp=true; |
... | ... | |
476 | 482 |
localname = parser.getLocalName(); |
477 | 483 |
if (currentOutOfTextElements.size() > 0) currentOutOfTextElements.pop() |
478 | 484 |
writeOutOfTextToEditText = currentOutOfTextElements.size() > 0 |
479 |
|
|
485 |
|
|
480 | 486 |
switch (localname) { |
481 | 487 |
case "text": |
482 | 488 |
break; |
... | ... | |
541 | 547 |
if (l > 0) { |
542 | 548 |
endOfLastWord = lastword.subSequence(l-1, l) |
543 | 549 |
} |
544 |
|
|
550 |
|
|
545 | 551 |
if (interpvalue != null) { |
546 | 552 |
interpvalue = interpvalue |
547 | 553 |
} |
... | ... | |
554 | 560 |
pagedWriter.writeCharacters("\n") |
555 | 561 |
pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]) |
556 | 562 |
} |
557 |
|
|
563 |
|
|
558 | 564 |
pagedWriter.writeCharacters(wordvalue) |
559 | 565 |
pagedWriter.writeEndElement() |
560 |
//pagedWriter.writeComment("\n")
|
|
566 |
//pagedWriter.writeComment("\n") |
|
561 | 567 |
lastword=wordvalue; |
562 | 568 |
break; |
563 | 569 |
default: |
tmp/org.txm.core/src/java/org/txm/metadatas/Metadatas.java (revision 2924) | ||
---|---|---|
87 | 87 |
File xmlfile; |
88 | 88 |
|
89 | 89 |
/** The metadatas. */ |
90 |
ArrayList<Metadata> metadatas = new ArrayList<Metadata>();
|
|
90 |
ArrayList<Metadata> metadatas = new ArrayList<>(); |
|
91 | 91 |
|
92 | 92 |
/** The headers list. */ |
93 |
ArrayList<String> headersList = new ArrayList<String>();
|
|
93 |
ArrayList<String> headersList = new ArrayList<>(); |
|
94 | 94 |
|
95 | 95 |
/** The isinialize. */ |
96 | 96 |
boolean isInitialize = false; |
... | ... | |
353 | 353 |
|
354 | 354 |
public HashMap<String, String> getTextMetadata(File f) { |
355 | 355 |
|
356 |
HashMap<String, String> data = new HashMap<String, String>();
|
|
356 |
HashMap<String, String> data = new HashMap<>(); |
|
357 | 357 |
String txtname = f.getName(); |
358 | 358 |
int idx = txtname.lastIndexOf("."); |
359 | 359 |
if (idx > 0) txtname = txtname.substring(0, idx); |
... | ... | |
383 | 383 |
*/ |
384 | 384 |
public static boolean convertCsvToXml(File csvfile, File xmlFile, String encoding, String separator, String txtseparator, int nbheaderline) throws Exception { |
385 | 385 |
|
386 |
if (separator == null || separator.length() == 0) { |
|
387 |
separator = "\t"; |
|
388 |
} |
|
389 |
if (encoding == null || encoding.length() == 0) { |
|
390 |
encoding = "UTF-8"; |
|
391 |
} |
|
392 |
xmlFile.createNewFile(); |
|
393 |
|
|
394 |
if (!csvfile.exists()) { |
|
395 |
System.out.println("Error: CSV file does not exists"); |
|
396 |
return false; |
|
397 |
} |
|
398 |
|
|
399 |
XMLOutputFactory factory = XMLOutputFactory.newInstance(); |
|
400 |
FileOutputStream output = new FileOutputStream(xmlFile); |
|
401 |
XMLStreamWriter writer = factory.createXMLStreamWriter(output, "UTF-8");// create a new file |
|
402 |
|
|
403 |
CsvReader reader = new CsvReader(csvfile.getAbsolutePath(), separator.charAt(0), Charset.forName(encoding)); |
|
404 |
if (txtseparator != null && txtseparator.length() > 0) |
|
405 |
reader.setTextQualifier(txtseparator.charAt(0)); |
|
406 |
|
|
407 |
reader.readHeaders(); |
|
408 |
|
|
409 |
String[] headers = reader.getHeaders(); |
|
410 |
|
|
411 |
if (headers.length == 0) { |
|
412 |
System.out.println("Error: No header in the metadata file " + csvfile + " with separators: column='" + separator + "' and text='" + txtseparator + "'"); |
|
413 |
writer.close(); |
|
414 |
output.close(); |
|
415 |
return false; |
|
416 |
} |
|
417 |
|
|
418 |
if (!headers[0].equals("id")) { |
|
419 |
System.out.println("Error: The first column name in the header line of the metadata file '$csvfile' must be 'id' and found '" + headers[0] |
|
420 |
+ "' column separator='\"+separator+\"' and text separator='\"+txtseparator+\"'"); |
|
421 |
writer.close(); |
|
422 |
output.close(); |
|
423 |
if (!separator.equals("\t")) { |
|
424 |
System.out.println("\tTrying with separators: column='\t' and text=''..."); |
|
425 |
return convertCsvToXml(csvfile, xmlFile, encoding, "\t", "", nbheaderline); |
|
386 |
try { |
|
387 |
if (separator == null || separator.length() == 0) { |
|
388 |
separator = "\t"; |
|
426 | 389 |
} |
427 |
} |
|
428 |
|
|
429 |
// check for double columns |
|
430 |
HashSet<String> testhash = new HashSet<String>(); |
|
431 |
HashSet<String> doubles = new HashSet<String>(); |
|
432 |
for (String str : headers) { |
|
433 |
if (testhash.contains(str)) |
|
434 |
doubles.add(str); |
|
435 |
testhash.add(str); |
|
436 |
} |
|
437 |
if (doubles.size() > 0) { |
|
438 |
System.out.println("Error: the metadata file '$csvfile' contains duplicated column names: " + doubles); |
|
439 |
return false; |
|
440 |
} |
|
441 |
|
|
442 |
String[] longnames = new String[headers.length]; |
|
443 |
String[] types = new String[headers.length]; |
|
444 |
if (nbheaderline > 1) {// get longnames |
|
445 |
reader.readRecord(); |
|
446 |
for (int i = 0; i < headers.length; i++) { |
|
447 |
longnames[i] = reader.get(headers[i]); |
|
390 |
if (encoding == null || encoding.length() == 0) { |
|
391 |
encoding = "UTF-8"; |
|
448 | 392 |
} |
449 |
} |
|
450 |
else { |
|
451 |
for (int i = 0; i < headers.length; i++) { |
|
452 |
longnames[i] = headers[i]; |
|
393 |
xmlFile.createNewFile(); |
|
394 |
|
|
395 |
if (!csvfile.exists()) { |
|
396 |
System.out.println("Error: CSV file does not exists"); |
|
397 |
return false; |
|
453 | 398 |
} |
454 |
} |
|
455 |
|
|
456 |
if (nbheaderline > 2) {// got types |
|
457 |
reader.readRecord(); |
|
458 |
for (int i = 0; i < headers.length; i++) { |
|
459 |
types[i] = reader.get(headers[i]); |
|
399 |
|
|
400 |
XMLOutputFactory factory = XMLOutputFactory.newInstance(); |
|
401 |
FileOutputStream output = new FileOutputStream(xmlFile); |
|
402 |
XMLStreamWriter writer = factory.createXMLStreamWriter(output, "UTF-8");// create a new file |
|
403 |
|
|
404 |
CsvReader reader = new CsvReader(csvfile.getAbsolutePath(), separator.charAt(0), Charset.forName(encoding)); |
|
405 |
if (txtseparator != null && txtseparator.length() > 0) |
|
406 |
reader.setTextQualifier(txtseparator.charAt(0)); |
|
407 |
|
|
408 |
reader.readHeaders(); |
|
409 |
|
|
410 |
String[] headers = reader.getHeaders(); |
|
411 |
|
|
412 |
if (headers.length == 0) { |
|
413 |
System.out.println("Error: No header in the metadata file " + csvfile + " with separators: column='" + separator + "' and text='" + txtseparator + "'"); |
|
414 |
writer.close(); |
|
415 |
output.close(); |
|
416 |
return false; |
|
460 | 417 |
} |
461 |
} |
|
462 |
else { |
|
463 |
for (int i = 0; i < headers.length; i++) { |
|
464 |
types[i] = "String"; |
|
418 |
|
|
419 |
if (!headers[0].equals("id")) { |
|
420 |
System.out.println("Error: The first column name in the header line of the metadata file '$csvfile' must be 'id' and found '" + headers[0] |
|
421 |
+ "' column separator='\"+separator+\"' and text separator='\"+txtseparator+\"'"); |
|
422 |
writer.close(); |
|
423 |
output.close(); |
|
424 |
if (!separator.equals("\t")) { |
|
425 |
System.out.println("\tTrying with separators: column='\t' and text=''..."); |
|
426 |
return convertCsvToXml(csvfile, xmlFile, encoding, "\t", "", nbheaderline); |
|
427 |
} |
|
465 | 428 |
} |
466 |
} |
|
467 |
|
|
468 |
writer.writeStartDocument("UTF-8", "1.0"); |
|
469 |
writer.writeStartElement("enrichissement"); |
|
470 |
writer.writeStartElement("metadatas"); |
|
471 |
writer.writeCharacters("\n"); |
|
472 |
// println "headers : "+Arrays.toString(headers) |
|
473 |
for (int i = 1; i < headers.length; i++) { |
|
474 |
if (headers[i].length() == 0) { |
|
475 |
headers[i] = "noname"; |
|
476 |
System.out.println("Warning: the " + (i + 1) + "th column name is empty"); |
|
429 |
|
|
430 |
// check for double columns |
|
431 |
HashSet<String> testhash = new HashSet<>(); |
|
432 |
HashSet<String> doubles = new HashSet<>(); |
|
433 |
for (String str : headers) { |
|
434 |
if (testhash.contains(str)) |
|
435 |
doubles.add(str); |
|
436 |
testhash.add(str); |
|
477 | 437 |
} |
478 |
// if(!headers[i].equals("id"))// the first |
|
479 |
// { |
|
480 |
writer.writeStartElement("metadata"); |
|
481 |
writer.writeAttribute("id", AsciiUtils.buildId(headers[i])); |
|
482 |
writer.writeAttribute("shortname", headers[i]); |
|
483 |
writer.writeAttribute("longname", longnames[i]); |
|
484 |
writer.writeAttribute("type", types[i]); |
|
485 |
writer.writeAttribute("colwidth", "100"); |
|
486 |
writer.writeAttribute("selection", "true"); |
|
487 |
writer.writeAttribute("partition", "true"); |
|
488 |
writer.writeAttribute("display", "true"); |
|
438 |
if (doubles.size() > 0) { |
|
439 |
System.out.println("Error: the metadata file '$csvfile' contains duplicated column names: " + doubles); |
|
440 |
return false; |
|
441 |
} |
|
489 | 442 |
|
490 |
writer.writeEndElement(); |
|
491 |
writer.writeCharacters("\n"); |
|
492 |
// } |
|
493 |
} |
|
494 |
writer.writeEndElement();// close metadatas |
|
495 |
writer.writeCharacters("\n"); |
|
496 |
|
|
497 |
writer.writeStartElement("texts"); |
|
498 |
writer.writeCharacters("\n"); |
|
499 |
while (reader.readRecord()) { |
|
500 |
writer.writeStartElement("text"); |
|
501 |
for (int i = 0; i < headers.length; i++) |
|
502 |
if (headers[i].equals("id")) { |
|
503 |
writer.writeAttribute("id", reader.get(headers[i])); |
|
443 |
String[] longnames = new String[headers.length]; |
|
444 |
String[] types = new String[headers.length]; |
|
445 |
if (nbheaderline > 1) {// get longnames |
|
446 |
reader.readRecord(); |
|
447 |
for (int i = 0; i < headers.length; i++) { |
|
448 |
longnames[i] = reader.get(headers[i]); |
|
504 | 449 |
} |
505 |
else if (headers[i].equals("xpath")) { |
|
506 |
writer.writeAttribute("xpath", reader.get(headers[i])); |
|
450 |
} |
|
451 |
else { |
|
452 |
for (int i = 0; i < headers.length; i++) { |
|
453 |
longnames[i] = headers[i]; |
|
507 | 454 |
} |
455 |
} |
|
508 | 456 |
|
509 |
for (int i = 0; i < headers.length; i++) |
|
510 |
if (!headers[i].equals("id") && !headers[i].equals("xpath")) { |
|
511 |
writer.writeEmptyElement("entry"); |
|
512 |
writer.writeAttribute("id", AsciiUtils.buildId(headers[i])); |
|
513 |
String value = reader.get(headers[i]); |
|
514 |
if (value.length() == 0) |
|
515 |
writer.writeAttribute("value", "N/A"); |
|
516 |
else |
|
517 |
writer.writeAttribute("value", value); |
|
518 |
|
|
457 |
if (nbheaderline > 2) {// got types |
|
458 |
reader.readRecord(); |
|
459 |
for (int i = 0; i < headers.length; i++) { |
|
460 |
types[i] = reader.get(headers[i]); |
|
519 | 461 |
} |
520 |
writer.writeEndElement(); |
|
462 |
} |
|
463 |
else { |
|
464 |
for (int i = 0; i < headers.length; i++) { |
|
465 |
types[i] = "String"; |
|
466 |
} |
|
467 |
} |
|
468 |
|
|
469 |
writer.writeStartDocument("UTF-8", "1.0"); |
|
470 |
writer.writeStartElement("enrichissement"); |
|
471 |
writer.writeStartElement("metadatas"); |
|
521 | 472 |
writer.writeCharacters("\n"); |
473 |
// println "headers : "+Arrays.toString(headers) |
|
474 |
for (int i = 1; i < headers.length; i++) { |
|
475 |
if (headers[i].length() == 0) { |
|
476 |
headers[i] = "noname"; |
|
477 |
System.out.println("Warning: the " + (i + 1) + "th column name is empty"); |
|
478 |
} |
|
479 |
// if(!headers[i].equals("id"))// the first |
|
480 |
// { |
|
481 |
writer.writeStartElement("metadata"); |
Formats disponibles : Unified diff