Révision 4019
| TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 4019) | ||
|---|---|---|
| 32 | 32 |
|
| 33 | 33 |
public final String merge(String orig, def sss) {
|
| 34 | 34 |
|
| 35 |
int n = 0; |
|
| 36 |
for (String s : sss) {
|
|
| 37 |
if (s != "" && s != "_") {
|
|
| 35 |
if (orig.equals("") || orig.equals("_")) {
|
|
| 38 | 36 |
|
| 39 |
def ssset = new HashSet(sss); |
|
| 40 |
if (ssset.size() == 1) return ssset.join(".")
|
|
| 41 |
|
|
| 42 |
return sss.join(".")
|
|
| 43 |
} |
|
| 37 |
} else {
|
|
| 38 |
sss.add(0, orig) |
|
| 44 | 39 |
} |
| 45 | 40 |
|
| 46 |
return orig; |
|
| 41 |
def ssset = new LinkedHashSet(sss) |
|
| 42 |
|
|
| 43 |
return ssset.join(".")
|
|
| 47 | 44 |
} |
| 48 | 45 |
|
| 49 | 46 |
@Override |
| ... | ... | |
| 66 | 63 |
def files = conlluSrcDirectory.listFiles() |
| 67 | 64 |
files.sort() |
| 68 | 65 |
|
| 69 |
// Keep or not contractions |
|
| 70 |
String contractionsManagement = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT)); |
|
| 71 |
|
|
| 72 |
println "Contractions managment ($contractionsManagement) & add XmlId if necessary & remove empty nodes" |
|
| 66 |
println "Add XmlId if necessary & remove empty nodes" |
|
| 73 | 67 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
| 74 | 68 |
for (File conlluFile : files) {
|
| 75 | 69 |
cpb_texts.tick() |
| ... | ... | |
| 80 | 74 |
ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8"); |
| 81 | 75 |
for (int i = 0 ; i < lines.size() ; i++) {
|
| 82 | 76 |
String line = lines[i] |
| 83 |
|
|
| 77 |
|
|
| 84 | 78 |
if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
|
| 85 |
|
|
| 79 |
|
|
| 86 | 80 |
def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
| 87 | 81 |
if (split[0].contains(".")) {
|
| 88 | 82 |
//println "REMOVE EMPTY NODE: $split : "+ |
| ... | ... | |
| 107 | 101 |
} |
| 108 | 102 |
} |
| 109 | 103 |
|
| 110 |
if (contractionsManagement == UDPreferences.ALL) {
|
|
| 111 |
// ok on fait rien |
|
| 112 |
} else if (contractionsManagement == UDPreferences.SYNTAX) {
|
|
| 113 |
if (split[0].contains("-")) {
|
|
| 104 |
lines[i] = split.join("\t") // rebuild the line
|
|
| 105 |
} |
|
| 106 |
IOUtils.write(conlluFile, lines.join("\n") + "\n") // CoNLLU needs the last line
|
|
| 107 |
} |
|
| 108 |
} |
|
| 109 |
cpb_texts.done() |
|
| 114 | 110 |
|
| 115 |
// stores the syntatic word id and the ortographic word properties |
|
| 116 |
temp_multiwords = [:] |
|
| 117 |
int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-")));
|
|
| 118 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
|
|
| 119 |
for (int ii = n1 ; ii <= n2 ; ii++) {
|
|
| 120 |
temp_multiwords[""+ii] = split; |
|
| 121 |
} |
|
| 122 | 111 |
|
| 123 |
//println "REMOVE - $split" |
|
| 124 |
lines.remove(i) |
|
| 125 |
i-- |
|
| 126 |
continue; /// next ! |
|
| 127 |
} else if (temp_multiwords.containsKey(split[0])) { // it's a syntactic word of an orthographic word
|
|
| 128 |
def split_ortho = temp_multiwords.remove(split[0]) |
|
| 112 |
// Keep or not contractions |
|
| 113 |
File conlluSrcForTXMDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu-fortxm") |
|
| 114 |
|
|
| 115 |
String contractionsManagement = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT)); |
|
| 129 | 116 |
|
| 130 |
if (split[9].length() > 0) split[9] += "|" |
|
| 131 |
split[9] += "multiword="+split_ortho[1] // the orthographic form |
|
| 132 |
} |
|
| 133 |
} else if (contractionsManagement == UDPreferences.SURFACE) {
|
|
| 134 |
if (split[0].contains("-")) {
|
|
| 135 |
int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-")));
|
|
| 136 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
|
|
| 137 |
int n = n2 - n1 |
|
| 117 |
if (contractionsManagement == UDPreferences.ALL) {
|
|
| 118 |
conlluSrcForTXMDirectory = conlluSrcDirectory; // use the same directory as TIGER since no word modifications have been done |
|
| 119 |
} else {
|
|
| 120 |
|
|
| 121 |
conlluSrcForTXMDirectory.deleteDir() |
|
| 122 |
conlluSrcForTXMDirectory.mkdirs() |
|
| 123 |
|
|
| 124 |
println "Contractions managment mode is '$contractionsManagement'" |
|
| 125 |
cpb_texts = new ConsoleProgressBar(files.size()) |
|
| 126 |
for (File conlluFile : files) {
|
|
| 127 |
cpb_texts.tick() |
|
| 138 | 128 |
|
| 139 |
//split[0] = ""+n1
|
|
| 129 |
if (conlluFile.getName().endsWith(".conllu")) {
|
|
| 140 | 130 |
|
| 141 |
// before merging and deleting words, check if they are the right ones |
|
| 142 |
if (lines[i+1].startsWith(""+n1+"\t") && lines[i+n+1].startsWith(""+n2+"\t")) {
|
|
| 143 |
def splits = [] |
|
| 144 |
for (int j = 0 ; j <= n ;j++) {
|
|
| 145 |
def tmp = lines[i+j+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
| 146 |
splits << tmp |
|
| 147 |
} |
|
| 131 |
File conlluFile2 = new File(conlluSrcForTXMDirectory, conlluFile.getName()) |
|
| 148 | 132 |
|
| 149 |
for (int j = 2 ; j < 8 ; j++) {
|
|
| 150 |
split[j] = merge(split[j], splits.collect(){it[j]})
|
|
| 133 |
String textid = FileUtils.stripExtension(conlluFile) |
|
| 134 |
int wcounter = 1; |
|
| 135 |
|
|
| 136 |
ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8"); |
|
| 137 |
|
|
| 138 |
def temp_multiwords = [:] |
|
| 139 |
|
|
| 140 |
for (int i = 0 ; i < lines.size() ; i++) {
|
|
| 141 |
String line = lines[i] |
|
| 142 |
if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
|
|
| 143 |
|
|
| 144 |
def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
| 145 |
|
|
| 146 |
if (contractionsManagement == UDPreferences.SYNTAX) {
|
|
| 147 |
if (split[0].contains("-")) {
|
|
| 148 |
|
|
| 149 |
// stores the syntatic word id and the ortographic word properties |
|
| 150 |
temp_multiwords = [:] |
|
| 151 |
int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-")));
|
|
| 152 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
|
|
| 153 |
for (int ii = n1 ; ii <= n2 ; ii++) {
|
|
| 154 |
temp_multiwords[""+ii] = split; |
|
| 151 | 155 |
} |
| 152 | 156 |
|
| 153 |
//println "REMOVE non- $split" |
|
| 154 |
for (int j = 0 ; j <= n ;j++) {
|
|
| 155 |
lines.remove(i+1) |
|
| 157 |
//println "REMOVE - $split" |
|
| 158 |
lines.remove(i) |
|
| 159 |
i-- |
|
| 160 |
continue; /// next ! |
|
| 161 |
} else if (temp_multiwords.containsKey(split[0])) { // it's a syntactic word of an orthographic word
|
|
| 162 |
def split_ortho = temp_multiwords.remove(split[0]) |
|
| 163 |
|
|
| 164 |
if (split[9].length() > 0) split[9] += "|" |
|
| 165 |
split[9] += "multiword="+split_ortho[1] // the orthographic form |
|
| 166 |
} |
|
| 167 |
} else if (contractionsManagement == UDPreferences.SURFACE) {
|
|
| 168 |
if (split[0].contains("-")) {
|
|
| 169 |
int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-")));
|
|
| 170 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
|
|
| 171 |
int n = n2 - n1 |
|
| 172 |
|
|
| 173 |
//split[0] = ""+n1 |
|
| 174 |
|
|
| 175 |
// before merging and deleting words, check if they are the right ones |
|
| 176 |
if (lines[i+1].startsWith(""+n1+"\t") && lines[i+n+1].startsWith(""+n2+"\t")) {
|
|
| 177 |
def splits = [] |
|
| 178 |
for (int j = 0 ; j <= n ;j++) {
|
|
| 179 |
def tmp = lines[i+j+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
| 180 |
splits << tmp |
|
| 181 |
} |
|
| 182 |
|
|
| 183 |
for (int j = 1 ; j < 8 ; j++) {
|
|
| 184 |
split[j] = merge(split[j], splits.collect(){it[j]})
|
|
| 185 |
} |
|
| 186 |
|
|
| 187 |
//println "REMOVE non- $split" |
|
| 188 |
for (int j = 0 ; j <= n ;j++) {
|
|
| 189 |
lines.remove(i+1) |
|
| 190 |
} |
|
| 156 | 191 |
} |
| 192 |
//println "splits=$splits" |
|
| 157 | 193 |
} |
| 158 |
//println "splits=$splits" |
|
| 159 | 194 |
} |
| 195 |
|
|
| 196 |
lines[i] = split.join("\t") // rebuild the line
|
|
| 160 | 197 |
} |
| 161 |
|
|
| 162 |
lines[i] = split.join("\t") // rebuild the line
|
|
| 198 |
IOUtils.write(conlluFile2, lines.join("\n") + "\n") // CoNLLU needs the last line
|
|
| 163 | 199 |
} |
| 164 |
IOUtils.write(conlluFile, lines.join("\n") + "\n") // CoNLLU needs the last line
|
|
| 165 | 200 |
} |
| 201 |
cpb_texts.done() |
|
| 166 | 202 |
} |
| 167 |
cpb_texts.done() |
|
| 168 |
|
|
| 169 |
|
|
| 170 |
// // Fix missing XmlId in conllu files |
|
| 171 |
// println "Setting word XmlID if necessary" |
|
| 172 |
// ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
|
| 173 |
// for (File conlluFile : files) {
|
|
| 174 |
// cpb_texts.tick() |
|
| 175 |
// if (conlluFile.getName().endsWith(".conllu")) {
|
|
| 176 |
// String textid = FileUtils.stripExtension(conlluFile) |
|
| 177 |
// int wcounter = 1; |
|
| 178 |
// ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8"); |
|
| 179 |
// for (int i = 0 ; i < lines.size() ; i++) {
|
|
| 180 |
// String line = lines[i] |
|
| 181 |
// if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
|
|
| 182 |
// |
|
| 183 |
// def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
| 184 |
// if (split[-1] != null && !split[-1].contains("XmlId=")) {
|
|
| 185 |
// split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++) |
|
| 186 |
// lines[i] = split.join("\t")
|
|
| 187 |
// } |
|
| 188 |
// |
|
| 189 |
// } |
|
| 190 |
// IOUtils.write(conlluFile, lines.join("\n") + "\n")
|
|
| 191 |
// } |
|
| 192 |
// } |
|
| 193 |
// cpb_texts.done() |
|
| 194 |
|
|
| 195 | 203 |
File metadataFile = Metadatas.findMetadataFile(module.sourceDirectory) |
| 196 | 204 |
File srcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu2tei") |
| 197 | 205 |
srcDirectory.deleteDir() |
| ... | ... | |
| 203 | 211 |
} |
| 204 | 212 |
|
| 205 | 213 |
println "Convert CoNLL-U to XML-TEI..." |
| 206 |
convertCoNLLU2TEI(conlluSrcDirectory, srcDirectory, project) |
|
| 214 |
convertCoNLLU2TEI(conlluSrcForTXMDirectory, srcDirectory, project)
|
|
| 207 | 215 |
|
| 208 | 216 |
inputDirectory = srcDirectory // switch files source directory |
| 209 | 217 |
|
| ... | ... | |
| 322 | 330 |
} |
| 323 | 331 |
|
| 324 | 332 |
} else {
|
| 325 |
|
|
| 326 | 333 |
LinkedHashMap<String, String> wProperties = new LinkedHashMap<String, String>() |
| 327 | 334 |
|
| 328 | 335 |
def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length)
|
| ... | ... | |
| 430 | 437 |
|
| 431 | 438 |
for (def word : words) {
|
| 432 | 439 |
|
| 433 |
//println "UD-ID="+word["id"] |
|
| 434 |
if (word["id"].contains("-")) {
|
|
| 435 |
writer.writeStartElement("seg")
|
|
| 436 |
writer.writeCharacters("******")
|
|
| 437 |
writer.writeEndElement() // span |
|
| 438 |
} |
|
| 439 |
|
|
| 440 | 440 |
String id = null |
| 441 | 441 |
wordCounter++ |
| 442 | 442 |
writer.writeStartElement ("w")
|
| ... | ... | |
| 458 | 458 |
writer.writeAttribute("id", "w_"+text_id+"_"+wordCounter)
|
| 459 | 459 |
} |
| 460 | 460 |
|
| 461 |
writer.writeCharacters(word["form"]) |
|
| 461 |
int idx = word["form"].indexOf(".", 1);
|
|
| 462 |
if (word["id"].contains("-") && idx > 0) {
|
|
| 463 |
writer.writeCharacters(word["form"].substring(0, idx)) |
|
| 464 |
} else {
|
|
| 465 |
writer.writeCharacters(word["form"]) |
|
| 466 |
} |
|
| 462 | 467 |
writer.writeEndElement() // w |
| 463 | 468 |
writer.writeCharacters(" ")
|
| 464 | 469 |
} |
| TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/conll2tiger-ud.pl (revision 4019) | ||
|---|---|---|
| 242 | 242 |
|
| 243 | 243 |
my $commentlines = 0; #added by AL |
| 244 | 244 |
|
| 245 |
# my $contractions = 0; #added by AL
|
|
| 245 |
my $contractions = 0; #added by AL |
|
| 246 | 246 |
# my $text_id = "unknown_text"; |
| 247 | 247 |
my $text_id = $infilename; |
| 248 | 248 |
my $sent_id = "0"; |
| ... | ... | |
| 268 | 268 |
$commentlines++; |
| 269 | 269 |
next; |
| 270 | 270 |
} |
| 271 |
# # Added by AL for contractions
|
|
| 272 |
# elsif ( $words[$w] =~ /^\d+-\d+/ ) {
|
|
| 273 |
# |
|
| 274 |
# # print LOG "Contraction line loop 1: $words[$w]\n";
|
|
| 275 |
# $commentlines++;
|
|
| 276 |
# |
|
| 277 |
# # $contractions++;
|
|
| 278 |
# next;
|
|
| 279 |
# }
|
|
| 271 |
# Added by AL for contractions |
|
| 272 |
elsif ( $words[$w] =~ /^\d+-\d+/ ) {
|
|
| 273 |
|
|
| 274 |
# print LOG "Contraction line loop 1: $words[$w]\n"; |
|
| 275 |
$commentlines++; |
|
| 276 |
|
|
| 277 |
# $contractions++; |
|
| 278 |
next; |
|
| 279 |
} |
|
| 280 | 280 |
else {
|
| 281 | 281 |
if ( defined($opt_c) ) {
|
| 282 | 282 |
$words[$w] =~ s/coord(\d+)-//g; |
| ... | ... | |
| 437 | 437 |
next; |
| 438 | 438 |
} |
| 439 | 439 |
|
| 440 |
# #Added AL for contractions |
|
| 441 |
# if ( $words[$i] =~ /^\d+-\d+/ ) {
|
|
| 442 |
# |
|
| 443 |
# # print LOG "Contraction loop 2 : $words[$i]\n"; |
|
| 444 |
# next; |
|
| 445 |
# } |
|
| 440 |
#Added AL for contractions |
|
| 441 |
if ( $words[$i] =~ /^\d+-\d+/ ) {
|
|
| 446 | 442 |
|
| 443 |
# print LOG "Contraction loop 2 : $words[$i]\n"; |
|
| 444 |
next; |
|
| 445 |
} |
|
| 446 |
|
|
| 447 | 447 |
else {
|
| 448 | 448 |
|
| 449 | 449 |
@cols = split( /\t/, $words[$i] ); |
| ... | ... | |
| 1103 | 1103 |
$print_nt_features = $nt_features; |
| 1104 | 1104 |
} |
| 1105 | 1105 |
} |
| 1106 |
|
|
| 1107 |
my $cat = $_[0]; |
|
| 1108 |
if ( $cat eq '' ) {
|
|
| 1109 |
$cat = '__UNDEF__'; |
|
| 1110 |
} |
|
| 1106 | 1111 |
printf XML |
| 1107 | 1112 |
" <nt id=\"n%d_%d%s\" cat=\"%s\" coord=\"--\" dom=\"%s\" type=\"%s\" vform=\"%s\" vlemma=\"%s\"%s note=\"%s\" snr=\"%d\">\n", |
| 1108 |
$., $w, $dupl, $_[0], $dom, $type, $vform, $vlemma, $print_nt_features,
|
|
| 1113 |
$., $w, $dupl, $cat, $dom, $type, $vform, $vlemma, $print_nt_features,
|
|
| 1109 | 1114 |
notes("$._$w"), $.;
|
| 1110 | 1115 |
printf XML " <edge idref=\"s%d_%d%s\" label=\"L\"/>\n", $., $w, $dupl; |
| 1111 | 1116 |
|
| TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/FixDriverFile.java (revision 4019) | ||
|---|---|---|
| 3 | 3 |
import java.io.File; |
| 4 | 4 |
import java.io.IOException; |
| 5 | 5 |
import java.util.Arrays; |
| 6 |
import java.util.HashMap; |
|
| 7 | 6 |
import java.util.HashSet; |
| 7 |
import java.util.LinkedHashMap; |
|
| 8 |
import java.util.LinkedHashSet; |
|
| 8 | 9 |
import java.util.List; |
| 9 | 10 |
|
| 10 | 11 |
import javax.xml.parsers.ParserConfigurationException; |
| ... | ... | |
| 24 | 25 |
|
| 25 | 26 |
public static boolean fixFeatureValues(File driverFile, List<File> xmlFiles) throws ParserConfigurationException, SAXException, IOException {
|
| 26 | 27 |
|
| 27 |
HashMap<String, HashSet<String>> declaredFeatures = new HashMap<String, HashSet<String>>();
|
|
| 28 |
HashMap<String, HashSet<String>> missingFeatures = new HashMap<String, HashSet<String>>();
|
|
| 29 |
HashMap<String, Element> featuresElements = new HashMap<String, Element>();
|
|
| 28 |
LinkedHashMap<String, LinkedHashSet<String>> declaredFeatures = new LinkedHashMap<String, LinkedHashSet<String>>();
|
|
| 29 |
LinkedHashMap<String, LinkedHashSet<String>> missingFeatures = new LinkedHashMap<String, LinkedHashSet<String>>();
|
|
| 30 |
LinkedHashMap<String, Element> featuresElements = new LinkedHashMap<String, Element>();
|
|
| 30 | 31 |
|
| 31 | 32 |
Document doc = DomUtils.load(driverFile); |
| 32 | 33 |
NodeList featuresList = doc.getElementsByTagName("feature");
|
| ... | ... | |
| 34 | 35 |
Element f = (Element) featuresList.item(i); |
| 35 | 36 |
|
| 36 | 37 |
featuresElements.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), f);
|
| 37 |
HashSet<String> values = new HashSet<String>();
|
|
| 38 |
LinkedHashSet<String> values = new LinkedHashSet<String>();
|
|
| 38 | 39 |
declaredFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), values);
|
| 39 |
missingFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), new HashSet<String>());
|
|
| 40 |
missingFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), new LinkedHashSet<String>());
|
|
| 40 | 41 |
|
| 41 | 42 |
NodeList featureValuesList = f.getElementsByTagName("value");
|
| 42 | 43 |
for (int j = 0 ; j < featureValuesList.getLength() ; j++) {
|
| ... | ... | |
| 95 | 96 |
//System.out.println("\t"+missingFeatureNamedomain);
|
| 96 | 97 |
|
| 97 | 98 |
Element f = featuresElements.get(missingFeatureNamedomain); |
| 99 |
//missingFeatures.get(missingFeatureNamedomain).add("");
|
|
| 98 | 100 |
|
| 99 | 101 |
for (String v : missingFeatures.get(missingFeatureNamedomain)) {
|
| 100 | 102 |
//System.out.println("\t\t"+v);
|
| TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/function/BratPrintTree.java (revision 4019) | ||
|---|---|---|
| 30 | 30 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
|
| 31 | 31 |
int n = n2 - n1; |
| 32 | 32 |
|
| 33 |
ArrayList<String[]> newlines = new ArrayList<>(); |
|
| 34 |
for (int j = 0 ; j <= n ; j++) {
|
|
| 35 |
newlines.add(new String[split.length]); |
|
| 36 |
for (int p = 0 ; p < split.length ; p++) {
|
|
| 37 |
newlines.get(j)[p] = "_"; |
|
| 38 |
} |
|
| 39 |
} |
|
| 33 |
//System.out.println("Word "+Arrays.toString(split));
|
|
| 34 |
//System.out.println("lines to insert: "+n);
|
|
| 40 | 35 |
if ( !(splittedLines.get(i+1)[0].equals(""+n1)) || !(splittedLines.get(i+n+1)[0].equals(""+n2)) ) {
|
| 41 |
System.out.println("FIXING "+conll.get(i));
|
|
| 42 |
for (int p = 2 ; p < split.length - 1 ; p++) {
|
|
| 43 |
String[] splittedValues = split[p].split(".");
|
|
| 36 |
|
|
| 37 |
ArrayList<String[]> newlines = new ArrayList<>(); |
|
| 38 |
for (int j = 0 ; j <= n ; j++) {
|
|
| 39 |
newlines.add(new String[split.length]); |
|
| 44 | 40 |
|
| 41 |
newlines.get(j)[0] = ""+(n1+j); |
|
| 45 | 42 |
|
| 46 |
for (int j = 0 ; j <= n ; j++) {
|
|
| 47 |
if (p >= splittedValues.length) {
|
|
| 48 |
|
|
| 49 |
} else {
|
|
| 50 |
newlines.get(j)[p] = splittedValues[p]; |
|
| 43 |
for (int p = 1 ; p < split.length ; p++) {
|
|
| 44 |
newlines.get(j)[p] = "_"; |
|
| 45 |
} |
|
| 46 |
} |
|
| 47 |
|
|
| 48 |
//System.out.println("FIXING "+split);
|
|
| 49 |
for (int p = 1 ; p < split.length - 1 ; p++) {
|
|
| 50 |
String v = split[p]; |
|
| 51 |
String[] splittedValues = v.split("\\.");
|
|
| 52 |
if (splittedValues.length == newlines.size()) {
|
|
| 53 |
for (int j = 0 ; j <= n ; j++) {
|
|
| 54 |
newlines.get(j)[p] = splittedValues[j]; |
|
| 51 | 55 |
} |
| 56 |
} else if ((splittedValues.length - 1) == newlines.size()) {
|
|
| 57 |
for (int j = 0 ; j <= n ; j++) {
|
|
| 58 |
newlines.get(j)[p] = splittedValues[j+1]; |
|
| 59 |
} |
|
| 60 |
} else {
|
|
| 61 |
for (int j = 0 ; j <= n ; j++) {
|
|
| 62 |
newlines.get(j)[p] = split[p]; |
|
| 63 |
} |
|
| 52 | 64 |
} |
| 53 | 65 |
} |
| 54 | 66 |
|
| 67 |
for (int j = 0 ; j <= n ; j++) {
|
|
| 68 |
splittedLines.add(i+j+1, newlines.get(j)); |
|
| 69 |
|
|
| 70 |
} |
|
| 71 |
i = i + newlines.size(); |
|
| 72 |
|
|
| 55 | 73 |
} else {
|
| 56 |
System.out.println("NOT FIXING "+conll.get(i));
|
|
| 74 |
//System.out.println("NOT FIXING "+conll.get(i));
|
|
| 57 | 75 |
} |
| 76 |
|
|
| 77 |
|
|
| 58 | 78 |
} |
| 59 |
|
|
| 79 |
} |
|
| 80 |
|
|
| 81 |
for (int i = 0 ; i < splittedLines.size() ; i++) {
|
|
| 82 |
String split[] = splittedLines.get(i); |
|
| 60 | 83 |
conll2.add(StringUtils.join(split, "\t")); |
| 61 |
|
|
| 62 | 84 |
} |
| 85 |
|
|
| 63 | 86 |
for (String l : conll2) System.out.println(l); |
| 64 | 87 |
|
| 65 | 88 |
String bundle_id = "org.txm.conllu.core"; |
Formats disponibles : Unified diff