73 |
73 |
@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
|
74 |
74 |
String propertiesPrefix;
|
75 |
75 |
|
76 |
|
@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comments options")
|
|
76 |
@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comment properties")
|
77 |
77 |
Boolean separator = false;
|
78 |
78 |
|
79 |
79 |
@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the Connlu corpus", widget = "Boolean", required = true, def = "true")
|
... | ... | |
91 |
91 |
@Option(name = "insertTokenWithoutUdAnnotations", usage = "if checked words without ud annotations are exported as well", widget = "Boolean", required = false, def = "false")
|
92 |
92 |
Boolean insertTokenWithoutUdAnnotations;
|
93 |
93 |
|
|
94 |
// "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
|
95 |
@Option(name = "separator_properties", usage = "Options", widget = "Separator", required = true, def = "fixing UD properties")
|
|
96 |
Boolean separator_properties = false;
|
|
97 |
|
94 |
98 |
@Option(name = "defaultFormPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
|
95 |
99 |
String defaultFormPropertyName;
|
96 |
100 |
|
... | ... | |
103 |
107 |
@Option(name = "defaultXposPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
|
104 |
108 |
String defaultXposPropertyName;
|
105 |
109 |
|
|
110 |
@Option(name = "defaultFeatsPropertyName", usage = "optional CQP property to fix the missing 'feats' ud property", widget = "String", required = false, def = "")
|
|
111 |
String defaultFeatsPropertyName;
|
|
112 |
|
|
113 |
@Option(name = "defaultHeadPropertyName", usage = "optional CQP property to fix the missing 'head' ud property", widget = "String", required = false, def = "")
|
|
114 |
String defaultHeadPropertyName;
|
|
115 |
|
|
116 |
@Option(name = "defaultDeprelPropertyName", usage = "optional CQP property to fix the missing 'deprel' ud property", widget = "String", required = false, def = "")
|
|
117 |
String defaultDeprelPropertyName;
|
|
118 |
|
|
119 |
@Option(name = "defaultDepsPropertyName", usage = "optional CQP property to fix the missing 'deps' ud property", widget = "String", required = false, def = "")
|
|
120 |
String defaultDepsPropertyName;
|
|
121 |
|
|
122 |
@Option(name = "defaultMiscPropertyName", usage = "optional CQP property to fix the missing 'misc' ud property", widget = "String", required = false, def = "")
|
|
123 |
String defaultMiscPropertyName;
|
|
124 |
|
106 |
125 |
@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options")
|
107 |
126 |
Boolean separator2 = false;
|
108 |
127 |
|
... | ... | |
145 |
164 |
try {
|
146 |
165 |
return exportAnnotationsAsCorpus(mainCorpus, connluResultDirectory, propertiesPrefix, openingPunct, insertTokenWithoutUdAnnotations,
|
147 |
166 |
defaultFormPropertyName, defaultLemmaPropertyName, defaultUposPropertyName, defaultXposPropertyName,
|
|
167 |
defaultFeatsPropertyName, defaultHeadPropertyName, defaultDeprelPropertyName, defaultDepsPropertyName,
|
|
168 |
defaultMiscPropertyName,
|
148 |
169 |
detectGap, insertParagraphs, insertNoSpaceAfter);
|
149 |
170 |
}
|
150 |
171 |
catch (Exception e) {
|
... | ... | |
178 |
199 |
* @throws InvalidCqpIdException
|
179 |
200 |
*/
|
180 |
201 |
public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, boolean insertTokenWithoutUdAnnotations,
|
181 |
|
String defaultFormPropertyName, String defaultLemmaPropertyName,
|
182 |
|
String defaultUposPropertyName, String defaultXposPropertyName, boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter)
|
|
202 |
String defaultFormPropertyName, String defaultLemmaPropertyName, String defaultUposPropertyName, String defaultXposPropertyName,
|
|
203 |
String defaultFeatsPropertyName, String defaultHeadPropertyName, String defaultDeprelPropertyName, String defaultDepsPropertyName,
|
|
204 |
String defaultMiscPropertyName,
|
|
205 |
boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter)
|
183 |
206 |
throws UnexpectedAnswerException,
|
184 |
207 |
IOException,
|
185 |
208 |
CqiServerError,
|
... | ... | |
384 |
407 |
if (defaultLemmaPropertyName != null && defaultLemmaPropertyName.length() > 0) {
|
385 |
408 |
lemmas = getDefaultValues(mainCorpus, defaultLemmaPropertyName, sentencePositions);
|
386 |
409 |
}
|
387 |
|
String[] upos = null;
|
|
410 |
String[] uposs = null;
|
388 |
411 |
if (defaultUposPropertyName != null && defaultUposPropertyName.length() > 0) {
|
389 |
|
upos = getDefaultValues(mainCorpus, defaultUposPropertyName, sentencePositions);
|
|
412 |
uposs = getDefaultValues(mainCorpus, defaultUposPropertyName, sentencePositions);
|
390 |
413 |
}
|
391 |
|
String[] xpos = null;
|
|
414 |
String[] xposs = null;
|
392 |
415 |
if (defaultXposPropertyName != null && defaultXposPropertyName.length() > 0) {
|
393 |
|
xpos = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions);
|
|
416 |
xposs = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions);
|
394 |
417 |
}
|
395 |
418 |
|
|
419 |
String[] feats = null;
|
|
420 |
if (defaultFeatsPropertyName != null && defaultFeatsPropertyName.length() > 0) {
|
|
421 |
feats = getDefaultValues(mainCorpus, defaultFeatsPropertyName, sentencePositions);
|
|
422 |
}
|
|
423 |
String[] heads = null;
|
|
424 |
if (defaultHeadPropertyName != null && defaultHeadPropertyName.length() > 0) {
|
|
425 |
heads = getDefaultValues(mainCorpus, defaultHeadPropertyName, sentencePositions);
|
|
426 |
}
|
|
427 |
String[] deprels = null;
|
|
428 |
if (defaultDeprelPropertyName != null && defaultDeprelPropertyName.length() > 0) {
|
|
429 |
deprels = getDefaultValues(mainCorpus, defaultDeprelPropertyName, sentencePositions);
|
|
430 |
}
|
|
431 |
String[] depss = null;
|
|
432 |
if (defaultDepsPropertyName != null && defaultDepsPropertyName.length() > 0) {
|
|
433 |
depss = getDefaultValues(mainCorpus, defaultDepsPropertyName, sentencePositions);
|
|
434 |
}
|
|
435 |
String[] miscs = null;
|
|
436 |
if (defaultMiscPropertyName != null && defaultMiscPropertyName.length() > 0) {
|
|
437 |
miscs = getDefaultValues(mainCorpus, defaultMiscPropertyName, sentencePositions);
|
|
438 |
}
|
|
439 |
|
396 |
440 |
// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions);
|
397 |
441 |
// String[] head = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(headCorrPropertyName).getQualifiedName(), sentencePositions);
|
398 |
442 |
// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
|
... | ... | |
415 |
459 |
if (lemmas != null && lemmaValues[p].equals("_")) {
|
416 |
460 |
lemmaValues[p] = lemmas[ip];
|
417 |
461 |
}
|
418 |
|
if (upos != null && uposValues[p].equals("_")) {
|
419 |
|
uposValues[p] = upos[ip];
|
|
462 |
if (uposs != null && uposValues[p].equals("_")) {
|
|
463 |
uposValues[p] = uposs[ip];
|
420 |
464 |
}
|
421 |
|
if (xpos != null && xposValues[p].equals("_")) {
|
422 |
|
xposValues[p] = xpos[ip];
|
|
465 |
if (xposs != null && xposValues[p].equals("_")) {
|
|
466 |
xposValues[p] = xposs[ip];
|
423 |
467 |
}
|
|
468 |
if (feats != null && featsValues[p].equals("_")) {
|
|
469 |
featsValues[p] = feats[ip];
|
|
470 |
}
|
|
471 |
if (heads != null && headValues[p] < 0) {
|
|
472 |
headValues[p] = Integer.parseInt(heads[ip]);
|
|
473 |
}
|
|
474 |
if (deprels != null && deprelValues[p].equals("_")) {
|
|
475 |
deprelValues[p] = deprels[ip];
|
|
476 |
}
|
|
477 |
if (depss != null && depsValues[p].equals("_")) {
|
|
478 |
depsValues[p] = depss[ip];
|
|
479 |
}
|
|
480 |
if (miscs != null && miscValues[p].equals("_")) {
|
|
481 |
miscValues[p] = miscs[ip];
|
|
482 |
}
|
424 |
483 |
}
|
425 |
484 |
|
426 |
485 |
if (insertNoSpaceAfter) {
|
... | ... | |
441 |
500 |
|
442 |
501 |
// fixing sentence punct limits
|
443 |
502 |
while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) {
|
444 |
|
System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
|
|
503 |
// System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
|
445 |
504 |
int p2 = sentence.remove(0);
|
446 |
505 |
sentences.get(iSentence - 1).add(p2);
|
447 |
506 |
}
|
448 |
507 |
//
|
449 |
508 |
while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) {
|
450 |
|
System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence);
|
|
509 |
// System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence);
|
451 |
510 |
int p2 = sentence.remove(sentence.size() - 1);
|
452 |
511 |
sentences.get(iSentence + 1).add(0, p2);
|
453 |
512 |
}
|
... | ... | |
539 |
598 |
writer.println("# sent_id = " + textIds[iText] + "-" + (iSentence + 1) + ".new");
|
540 |
599 |
}
|
541 |
600 |
|
542 |
|
|
543 |
601 |
if (paragraphsStartPositions.contains(sentence.get(0))) { // paragraphsStartPositions is empty if the injectParagraph option is not set
|
544 |
602 |
writer.println("# newpar id = " + iParagraph);
|
545 |
603 |
iParagraph++;
|
... | ... | |
572 |
630 |
}
|
573 |
631 |
|
574 |
632 |
private static String[] getDefaultValues(MainCorpus mainCorpus, String property, int[] positions) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
|
575 |
|
String[] upos = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions);
|
576 |
|
for (int iupos = 0; iupos < upos.length; iupos++) { // recode the || CQP multiple values to ud multiple values
|
577 |
|
if (upos[iupos].length() > 2 && upos[iupos].startsWith("|") && upos[iupos].endsWith("|")) {
|
578 |
|
upos[iupos] = upos[iupos].substring(1, upos[iupos].length() - 1);
|
|
633 |
String[] values = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions);
|
|
634 |
for (int iupos = 0; iupos < values.length; iupos++) { // recode the || CQP multiple values to ud multiple values
|
|
635 |
if (values[iupos].length() > 2 && values[iupos].startsWith("|") && values[iupos].endsWith("|")) {
|
|
636 |
values[iupos] = values[iupos].substring(1, values[iupos].length() - 1);
|
579 |
637 |
}
|
580 |
638 |
}
|
581 |
639 |
|
582 |
|
return upos;
|
|
640 |
return values;
|
583 |
641 |
}
|
584 |
642 |
|
585 |
643 |
private static HashMap<Integer, String> getNonUNDEFPositionsAndValues(MainCorpus mainCorpus, String property) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
|