70 |
70 |
@Option(name = "connluResultDirectory", usage = "connluResultDirectory", widget = "Folder", required = true, def = "connlu-result-directory")
|
71 |
71 |
File connluResultDirectory;
|
72 |
72 |
|
73 |
|
@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud")
|
|
73 |
@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
|
74 |
74 |
String propertiesPrefix;
|
75 |
75 |
|
76 |
76 |
@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "options")
|
... | ... | |
89 |
89 |
@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the Connlu corpus", widget = "Boolean", required = true, def = "true")
|
90 |
90 |
Boolean insertParagraphs = false;
|
91 |
91 |
|
92 |
|
// @Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
|
|
92 |
@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
|
93 |
93 |
Boolean detectGap = false;
|
94 |
94 |
|
95 |
95 |
@Option(name = "formCorrPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
|
... | ... | |
104 |
104 |
@Option(name = "xposCorrPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
|
105 |
105 |
String xposCorrPropertyName;
|
106 |
106 |
|
|
107 |
@Option(name = "punctStrong", usage = "optional strong punct tag to fix sentence limits", widget = "String", required = true, def = "PONfrt")
|
|
108 |
String punctStrong;
|
|
109 |
|
107 |
110 |
@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options")
|
108 |
111 |
Boolean separator2 = false;
|
109 |
112 |
|
110 |
|
// @Option(name = "punctStrong", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[?!\\.]")
|
111 |
|
// String punctStrong;
|
112 |
|
|
113 |
113 |
@Option(name = "openingPunct", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[\\-–«‘“\\(]")
|
114 |
114 |
String openingPunct;
|
115 |
115 |
|
... | ... | |
145 |
145 |
|
146 |
146 |
try {
|
147 |
147 |
return exportAnnotationsAsCorpus(mainCorpus, connluResultDirectory, propertiesPrefix, openingPunct,
|
148 |
|
formCorrPropertyName, lemmaCorrPropertyName, uposCorrPropertyName, xposCorrPropertyName,
|
|
148 |
formCorrPropertyName, lemmaCorrPropertyName, uposCorrPropertyName, xposCorrPropertyName, punctStrong,
|
149 |
149 |
detectGap, insertParagraphs, insertNoSpaceAfter);
|
150 |
150 |
}
|
151 |
151 |
catch (Exception e) {
|
... | ... | |
172 |
172 |
* @throws InvalidCqpIdException
|
173 |
173 |
*/
|
174 |
174 |
public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, String formCorrPropertyName, String lemmaCorrPropertyName,
|
175 |
|
String uposCorrPropertyName, String xposCorrPropertyName, boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) throws UnexpectedAnswerException,
|
|
175 |
String uposCorrPropertyName, String xposCorrPropertyName, String punctStrongRegex, boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) throws UnexpectedAnswerException,
|
176 |
176 |
IOException,
|
177 |
177 |
CqiServerError,
|
178 |
178 |
CqiClientException, InvalidCqpIdException {
|
... | ... | |
308 |
308 |
// fixing sentences
|
309 |
309 |
for (int s = 0; s < sentences.size(); s++) {
|
310 |
310 |
|
|
311 |
// fix only ud sentences limits
|
311 |
312 |
ArrayList<Integer> sentence = sentences.get(s);
|
|
313 |
|
|
314 |
if (sentidStartPositions.get(sentence.get(0)) == null) {
|
|
315 |
continue; // this is not a UD sentence
|
|
316 |
}
|
|
317 |
|
312 |
318 |
int max = -1;
|
313 |
319 |
int imax = 0;
|
314 |
320 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
... | ... | |
335 |
341 |
}
|
336 |
342 |
}
|
337 |
343 |
|
338 |
|
// fixing sentences
|
339 |
|
for (int s = 0; s < sentences.size(); s++) {
|
340 |
|
|
341 |
|
ArrayList<Integer> sentence = sentences.get(s);
|
342 |
|
if (s > 0 && formValues[sentence.get(0)].matches(openingPunct)) {
|
343 |
|
System.out.println("FIXING: first position " + formValues[sentence.get(0)] + "in " + s);
|
344 |
|
int p = sentence.remove(0);
|
345 |
|
sentences.get(s - 1).add(p);
|
346 |
|
}
|
347 |
|
|
348 |
|
if (s + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) {
|
349 |
|
System.out.println("FIXING: last position " + formValues[sentence.get(sentence.size() - 1)] + "in " + s);
|
350 |
|
int p = sentence.remove(sentence.size() - 1);
|
351 |
|
sentences.get(s + 1).add(0, p);
|
352 |
|
}
|
353 |
|
|
354 |
|
// int c = 0;
|
355 |
|
// ArrayList<Integer> sentence = sentences.get(s);
|
356 |
|
// for (int ip = 0 ; ip < sentence.size() ; ip++) {
|
357 |
|
//
|
358 |
|
// int p = sentence.get(ip);
|
359 |
|
//
|
360 |
|
// if (idValues[p].equals("__UNDEF__")) {
|
361 |
|
// c++;
|
362 |
|
// }
|
363 |
|
// }
|
364 |
|
// if (c == 0) { // al is fine
|
365 |
|
//
|
366 |
|
// } else if (c )
|
367 |
|
}
|
368 |
|
|
369 |
344 |
if (tmpSentence.size() > 0) { // add last sentence
|
370 |
345 |
sentences.add(new ArrayList<>(tmpSentence));
|
371 |
346 |
}
|
... | ... | |
441 |
416 |
}
|
442 |
417 |
}
|
443 |
418 |
}
|
|
419 |
|
|
420 |
|
444 |
421 |
}
|
445 |
422 |
|
|
423 |
|
|
424 |
// fixing sentence punct limits
|
|
425 |
|
|
426 |
|
|
427 |
// while (sentence.size() > 0 && iSentence > 0 && xpos[0].matches(punctStrongRegex)) {
|
|
428 |
// System.out.println("FIXING: first punctStrong position " + xposValues[sentence.get(0)] + " in " + iSentence);
|
|
429 |
// int p2 = sentence.remove(0);
|
|
430 |
// sentences.get(iSentence - 1).add(p2);
|
|
431 |
// }
|
|
432 |
|
|
433 |
while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) {
|
|
434 |
System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
|
|
435 |
int p2 = sentence.remove(0);
|
|
436 |
sentences.get(iSentence - 1).add(p2);
|
|
437 |
}
|
|
438 |
//
|
|
439 |
while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) {
|
|
440 |
System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence);
|
|
441 |
int p2 = sentence.remove(sentence.size() - 1);
|
|
442 |
sentences.get(iSentence + 1).add(0, p2);
|
|
443 |
}
|
|
444 |
|
|
445 |
if (sentence.size() == 0) { // sentence was depleted after fixing it
|
|
446 |
sentences.remove(iSentence);
|
|
447 |
iSentence--;
|
|
448 |
continue;
|
|
449 |
}
|
|
450 |
|
446 |
451 |
// fixing head and set missing head to 0 and root
|
447 |
452 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
448 |
453 |
int p = sentence.get(ip);
|
... | ... | |
514 |
519 |
iParagraph++;
|
515 |
520 |
}
|
516 |
521 |
|
517 |
|
for (int p : sentence) {
|
518 |
|
|
519 |
|
if (gap != null && gap[p] != null) writer.println("# gap");
|
|
522 |
for (int ip = 0 ; ip < sentence.size() ; ip++) {
|
|
523 |
int p = sentence.get(ip);
|
|
524 |
|
520 |
525 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
521 |
526 |
writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p]
|
522 |
527 |
+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p]
|
523 |
528 |
+ "\t" + depsValues[p] + "\t" + miscValues[p]);
|
|
529 |
|
|
530 |
if (gap != null && gap[ip].equals("next")) writer.println("# gap");
|
524 |
531 |
}
|
525 |
532 |
writer.println("");
|
526 |
533 |
numberOfSentencesWritten++;
|