25 |
25 |
*
|
26 |
26 |
*/
|
27 |
27 |
class CoNLLUImporter extends XTZImporter {
|
28 |
|
|
|
28 |
|
29 |
29 |
/**
 * Creates a CoNLL-U importer bound to the given import module.
 *
 * @param module the import module, handed unchanged to the XTZImporter constructor
 */
public CoNLLUImporter(ImportModule module) {
    super(module)
}
|
32 |
|
|
|
32 |
|
33 |
33 |
/**
 * Merges the values carried by several tokens into a single property value.
 *
 * If no candidate is meaningful (all are "" or "_"), the original value is kept.
 * Otherwise, when all candidates are equal, that single value is returned; when they differ,
 * all candidates (including any "" or "_" ones) are joined with ".".
 *
 * @param orig the value returned when no candidate is meaningful
 * @param sss the candidate values, one per merged token
 * @return the merged value, or orig
 */
public final String merge(String orig, def sss) {

    // look for at least one candidate that is neither empty nor the "_" placeholder
    boolean meaningful = false
    for (String candidate : sss) {
        if (candidate != "" && candidate != "_") {
            meaningful = true
            break
        }
    }

    if (!meaningful) {
        return orig
    }

    // identical candidates collapse to one value, different ones are concatenated in order
    def distinct = new HashSet(sss)
    return (distinct.size() == 1 ? distinct : sss).join(".")
}
|
48 |
|
|
|
48 |
|
49 |
49 |
/**
 * Prepares the CoNLL-U sources, converts them to XML-TEI and runs the parent import on the result.
 *
 * Steps, in order:
 * 1. When the IMPORT_USE_NEW_DOC_ID preference is "true", the input files are first split by
 *    splitCoNLLUFiles() into a freshly recreated "conllu" directory; otherwise the input
 *    directory itself is used.
 * 2. Every ".conllu" file of that directory is rewritten IN PLACE: empty nodes (ids containing ".")
 *    are removed, an XmlId is added to the last column when missing, and contraction lines
 *    ("n1-n2" ids) are processed according to the CONTRACTIONS_MANAGEMENT preference.
 * 3. The files are converted by convertCoNLLU2TEI() into a freshly recreated "conllu2tei"
 *    directory, which replaces inputDirectory before super.process() is called.
 *
 * The method returns early, without calling super.process(), when the split step fails or when
 * the CoNLL-U directory cannot be listed.
 */
@Override
public void process() {

    File conlluSrcDirectory = inputDirectory

    boolean usenewdocid = "true".equals(UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_USE_NEW_DOC_ID)); // THE conllu -> Tiger XSL MUST HAVE THE SAME BEHAVIOR BEFORE //

    if (usenewdocid) {
        conlluSrcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu")
        conlluSrcDirectory.deleteDir()
        conlluSrcDirectory.mkdirs()

        if (!splitCoNLLUFiles(inputDirectory, conlluSrcDirectory, project)) {
            return
        }
    }

    def files = conlluSrcDirectory.listFiles()
    if (files == null) { // listFiles() returns null when the path is not a listable directory
        println "Aborting. No CONLL file found in $conlluSrcDirectory."
        return
    }
    files.sort()

    // Keep or not contractions
    String contractionsManagement = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));

    println "Contractions managment ($contractionsManagement) & add XmlId if necessary & remove empty nodes"
    ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
    for (File conlluFile : files) {
        cpb_texts.tick()
        if (conlluFile.getName().endsWith(".conllu")) {
            String textid = FileUtils.stripExtension(conlluFile)
            int wcounter = 1;

            ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8");

            // First pass: drop the empty nodes (id such as "8.1") so that the index arithmetic
            // of the second pass only sees regular tokens and contraction lines
            for (int i = 0 ; i < lines.size() ; i++) {
                String line = lines[i]

                if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;

                def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
                if (split[0].contains(".")) {
                    lines.remove(i)
                    i-- // stay on the same index, it now holds the next line
                    continue; // next !
                }
            }

            // syntactic word id -> columns of the contraction line covering it (SYNTAX mode)
            def temp_multiwords = [:]

            // Second pass: XmlId + contractions
            for (int i = 0 ; i < lines.size() ; i++) {
                String line = lines[i]
                if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;

                def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);

                // add an XmlId to the last column when there is none yet
                if (split[-1] != null && !split[-1].contains("XmlId=")) {
                    if (split[-1] == "_") {
                        split[-1] = "XmlId=w_"+textid+"_"+(wcounter++)
                    } else {
                        split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++)
                    }
                }

                if (contractionsManagement == UDPreferences.ALL) {
                    // nothing to do: contraction lines and their words are all kept
                } else if (contractionsManagement == UDPreferences.SYNTAX) {
                    if (split[0].contains("-")) {

                        // stores the syntactic word ids and the orthographic word properties
                        temp_multiwords = [:]
                        int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-")));
                        int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
                        for (int ii = n1 ; ii <= n2 ; ii++) {
                            temp_multiwords[""+ii] = split;
                        }

                        // the contraction line itself is removed, only its words are kept
                        lines.remove(i)
                        i--
                        continue; // next !
                    } else if (temp_multiwords.containsKey(split[0])) { // it's a syntactic word of an orthographic word
                        def split_ortho = temp_multiwords.remove(split[0])

                        if (split[9].length() > 0) split[9] += "|"
                        split[9] += "multiword="+split_ortho[1] // the orthographic form
                    }
                } else if (contractionsManagement == UDPreferences.SURFACE) {
                    if (split[0].contains("-")) {
                        int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-")));
                        int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
                        int n = n2 - n1
                        int lastIndex = i + n + 1 // line expected to hold the word n2

                        // before merging and deleting words, check that the covered lines exist
                        // (a contraction may be the last line of the file) and are the right ones
                        boolean wordsAvailable = n >= 0 && lastIndex < lines.size()
                        if (wordsAvailable && lines[i+1].startsWith(""+n1+"\t") && lines[lastIndex].startsWith(""+n2+"\t")) {
                            def splits = []
                            for (int j = 0 ; j <= n ; j++) {
                                def tmp = lines[i+j+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
                                splits << tmp
                            }

                            // columns 2 to 7 of the contraction line receive the merged word values
                            for (int j = 2 ; j < 8 ; j++) {
                                split[j] = merge(split[j], splits.collect(){it[j]})
                            }

                            // remove the n+1 word lines following the contraction line
                            for (int j = 0 ; j <= n ; j++) {
                                lines.remove(i+1)
                            }
                        }
                    }
                }

                lines[i] = split.join("\t") // rebuild the line
            }
            IOUtils.write(conlluFile, lines.join("\n") + "\n") // CoNLLU needs the last line
        }
    }
    cpb_texts.done()

    File metadataFile = Metadatas.findMetadataFile(module.sourceDirectory)
    File srcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu2tei")
    srcDirectory.deleteDir()
    srcDirectory.mkdirs()

    // keep the metadata file next to the converted sources
    if (metadataFile != null && metadataFile.exists()) {
        File metadataFile2 = new File(srcDirectory, metadataFile.getName())
        FileCopy.copy(metadataFile, metadataFile2)
    }

    println "Convert CoNLL-U to XML-TEI..."
    convertCoNLLU2TEI(conlluSrcDirectory, srcDirectory, project)

    inputDirectory = srcDirectory // switch files source directory

    super.process()
}
|
210 |
|
|
|
212 |
|
211 |
213 |
public static def splitCoNLLUFiles(File inputDirectory, File srcDirectory, def project) {
|
212 |
214 |
def files = inputDirectory.listFiles(new FilenameFilter() {
|
213 |
|
boolean accept(File dir, String name) {
|
214 |
|
return name.toLowerCase().endsWith(".conllu")
|
215 |
|
}
|
216 |
|
});
|
217 |
|
|
|
215 |
boolean accept(File dir, String name) {
|
|
216 |
return name.toLowerCase().endsWith(".conllu")
|
|
217 |
}
|
|
218 |
});
|
|
219 |
|
218 |
220 |
if (files == null) {
|
219 |
221 |
println "Aborting. No CONLL file found in $inputDirectory."
|
220 |
222 |
return false
|
221 |
223 |
}
|
222 |
224 |
files.sort()
|
223 |
225 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
|
224 |
|
|
|
226 |
|
225 |
227 |
println "Splitting CoNLL-U files..."
|
226 |
228 |
for (File master : files) {
|
227 |
|
|
|
229 |
|
228 |
230 |
cpb_texts.tick()
|
229 |
|
|
|
231 |
|
230 |
232 |
if (!master.getName().endsWith(".conllu")) {
|
231 |
233 |
continue
|
232 |
234 |
}
|
233 |
|
|
|
235 |
|
234 |
236 |
String orig_text_id = FileUtils.stripExtension(master)
|
235 |
237 |
String current_text_id = FileUtils.stripExtension(master)
|
236 |
238 |
File conlluFile = new File(srcDirectory, current_text_id+".conllu")
|
237 |
239 |
def writer = conlluFile.newWriter("UTF-8", true)
|
238 |
|
|
|
240 |
|
239 |
241 |
master.eachLine("UTF-8") { line ->
|
240 |
242 |
if (line.startsWith("# newdoc id = ")) {
|
241 |
|
|
|
243 |
|
242 |
244 |
String text_id = line.substring("# newdoc id = ".length())
|
243 |
245 |
if (!text_id.equals(current_text_id)) {
|
244 |
246 |
writer.close()
|
... | ... | |
247 |
249 |
writer = conlluFile.newWriter("UTF-8", true)
|
248 |
250 |
}
|
249 |
251 |
}
|
250 |
|
|
|
252 |
|
251 |
253 |
writer.println(line)
|
252 |
254 |
}
|
253 |
255 |
writer.close()
|
... | ... | |
255 |
257 |
cpb_texts.done()
|
256 |
258 |
return true
|
257 |
259 |
}
|
258 |
|
|
|
260 |
|
259 |
261 |
public static def convertCoNLLU2TEI(File inputDirectory, File srcDirectory, def project) {
|
260 |
|
|
|
262 |
|
261 |
263 |
def files = inputDirectory.listFiles()
|
262 |
|
|
|
264 |
|
263 |
265 |
if (files == null) {
|
264 |
266 |
println "Aborting. No CONLL file found in $inputDirectory."
|
265 |
267 |
return false
|
266 |
268 |
}
|
267 |
269 |
files.sort()
|
268 |
270 |
def properties = Arrays.asList(ImportCoNLLUAnnotations.UD_PROPERTY_NAMES)
|
269 |
|
|
|
271 |
|
270 |
272 |
String prefix = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX));
|
271 |
|
|
|
273 |
|
272 |
274 |
UDPreferences.getInstance().setProjectPreferenceValue(project, UDPreferences.UDPREFIX, prefix); // copy the current preference into the corpus preference
|
273 |
|
|
|
275 |
|
274 |
276 |
def headPropertiesToProject = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_HEAD_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT)).split(",") as Set
|
275 |
|
|
|
277 |
|
276 |
278 |
def depsPropertiesToProject = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_DEPS_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT)).split(",") as Set
|
277 |
|
|
|
279 |
|
278 |
280 |
def formatSentences = "true" == UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_PRINT_NEWLINES_AFTER_SENTENCES, ""+UDPreferences.getInstance().getString(UDPreferences.IMPORT_PRINT_NEWLINES_AFTER_SENTENCES))
|
279 |
|
|
|
281 |
|
280 |
282 |
String contractionsManagement = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
|
281 |
|
|
|
283 |
|
282 |
284 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
|
283 |
|
|
|
285 |
|
284 |
286 |
println "Parsing CoNLL-U files..."
|
285 |
287 |
for (File master : files) {
|
286 |
|
|
|
288 |
|
287 |
289 |
cpb_texts.tick()
|
288 |
|
|
|
290 |
|
289 |
291 |
if (!master.getName().endsWith(".conllu")) {
|
290 |
292 |
continue
|
291 |
293 |
}
|
292 |
|
|
|
294 |
|
293 |
295 |
def content = [] // list of sentence
|
294 |
|
|
|
296 |
|
295 |
297 |
String text_id = FileUtils.stripExtension(master)
|
296 |
298 |
String sent_id = ""
|
297 |
299 |
String par_id = "1"
|
298 |
300 |
def comments = [] // /text/par/sent
|
299 |
301 |
def words = []
|
300 |
|
|
|
302 |
|
301 |
303 |
master.eachLine("UTF-8") { line ->
|
302 |
|
|
|
304 |
|
303 |
305 |
if (line.startsWith("# newdoc id = ")) {
|
304 |
306 |
// already set or ignored
|
305 |
307 |
} else if (line.startsWith("# sent_id = ")) {
|
... | ... | |
312 |
314 |
if (words.size() > 0) {
|
313 |
315 |
def sentence = [par_id, sent_id, words, comments]
|
314 |
316 |
content.add(sentence)
|
315 |
|
|
|
317 |
|
316 |
318 |
sent_id = ""
|
317 |
319 |
par_id = "1"
|
318 |
320 |
comments = []
|
319 |
321 |
words = []
|
320 |
322 |
}
|
321 |
|
|
|
323 |
|
322 |
324 |
} else {
|
323 |
|
|
|
325 |
|
324 |
326 |
LinkedHashMap<String, String> wProperties = new LinkedHashMap<String, String>()
|
325 |
|
|
|
327 |
|
326 |
328 |
def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length)
|
327 |
329 |
if (split.size() == properties.size()) {
|
328 |
330 |
String id = split[0]
|
329 |
331 |
for (int i = 0 ; i < split.size() ; i++) {
|
330 |
332 |
wProperties[properties[i]] = split[i]
|
331 |
333 |
}
|
332 |
|
|
333 |
|
// if (wProperties.get("id").equals("1") || wProperties.get("id").startsWith("1-")) { // it's a new sentence, store the current if any and starts a new sentence
|
334 |
|
// if (words.size() > 0) {
|
335 |
|
// def sentence = [par_id, sent_id, words, comments]
|
336 |
|
// content.add(sentence)
|
337 |
|
//
|
338 |
|
// sent_id = ""
|
339 |
|
// par_id = "1"
|
340 |
|
// comments = []
|
341 |
|
// words = []
|
342 |
|
// }
|
343 |
|
// }
|
|
334 |
|
|
335 |
// if (wProperties.get("id").equals("1") || wProperties.get("id").startsWith("1-")) { // it's a new sentence, store the current if any and starts a new sentence
|
|
336 |
// if (words.size() > 0) {
|
|
337 |
// def sentence = [par_id, sent_id, words, comments]
|
|
338 |
// content.add(sentence)
|
|
339 |
//
|
|
340 |
// sent_id = ""
|
|
341 |
// par_id = "1"
|
|
342 |
// comments = []
|
|
343 |
// words = []
|
|
344 |
// }
|
|
345 |
// }
|
344 |
346 |
if (wProperties[properties[0]].contains(".")) { // id
|
345 |
347 |
// empty node
|
346 |
348 |
} else {
|
... | ... | |
351 |
353 |
}
|
352 |
354 |
}
|
353 |
355 |
}
|
354 |
|
|
|
356 |
|
355 |
357 |
if (words.size() > 0) { // last sentence ?
|
356 |
358 |
def sentence = [par_id, sent_id, words, comments]
|
357 |
359 |
content.add(sentence)
|
358 |
360 |
}
|
359 |
|
|
|
361 |
|
360 |
362 |
if (content.size() == 0) {
|
361 |
363 |
continue;
|
362 |
364 |
}
|
363 |
|
|
|
365 |
|
364 |
366 |
//println "${content.size()} sentences found."
|
365 |
|
|
|
367 |
|
366 |
368 |
File xmlFile = new File(srcDirectory, text_id+".xml")
|
367 |
369 |
// println "xmlFile=$xmlFile"
|
368 |
370 |
BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(xmlFile))
|
369 |
371 |
XMLOutputFactory factory = XMLOutputFactory.newInstance()
|
370 |
372 |
XMLStreamWriter writer = factory.createXMLStreamWriter(output, "UTF-8")
|
371 |
|
|
|
373 |
|
372 |
374 |
writer.writeStartDocument("UTF-8","1.0")
|
373 |
375 |
writer.writeStartElement ("TEI")
|
374 |
376 |
writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0")
|
... | ... | |
378 |
380 |
writer.writeEndElement()
|
379 |
381 |
writer.writeCharacters("\n")
|
380 |
382 |
writer.writeStartElement ("text")
|
381 |
|
|
|
383 |
|
382 |
384 |
writer.writeCharacters("\n")
|
383 |
|
|
|
385 |
|
384 |
386 |
String current_par_id = null
|
385 |
387 |
int wordCounter = 0
|
386 |
388 |
for (def sentence : content) { // for all paragraph of the current text
|
387 |
|
|
|
389 |
|
388 |
390 |
par_id = sentence[0]
|
389 |
391 |
sent_id = sentence[1]
|
390 |
392 |
words = sentence[2]
|
391 |
393 |
comments = sentence[3]
|
392 |
|
|
|
394 |
|
393 |
395 |
if (current_par_id == null || par_id != current_par_id) {
|
394 |
396 |
if (current_par_id != null) {
|
395 |
397 |
writer.writeEndElement() // p
|
... | ... | |
397 |
399 |
writer.writeStartElement ("p")
|
398 |
400 |
writer.writeAttribute("id", par_id)
|
399 |
401 |
writer.writeCharacters("\n")
|
400 |
|
|
|
402 |
|
401 |
403 |
current_par_id = par_id
|
402 |
404 |
}
|
403 |
|
|
|
405 |
|
404 |
406 |
writer.writeStartElement ("s")
|
405 |
407 |
writer.writeAttribute("id", sent_id)
|
406 |
408 |
writer.writeCharacters("\n")
|
407 |
|
|
|
409 |
|
408 |
410 |
for (def comment : comments) {
|
409 |
411 |
writer.writeComment(comment.replace("--", "−−"))
|
410 |
412 |
writer.writeCharacters("\n")
|
411 |
413 |
}
|
412 |
|
|
|
414 |
|
413 |
415 |
if (headPropertiesToProject.size() > 0 || depsPropertiesToProject.size() > 0) {
|
414 |
416 |
LinkedHashMap sentencehash = new LinkedHashMap()
|
415 |
417 |
//println "WORDS="+words
|
... | ... | |
419 |
421 |
//println "SENTENCE="+sentencehash
|
420 |
422 |
ImportCoNLLUAnnotations.buildPropertiesProjections(sentencehash, headPropertiesToProject, depsPropertiesToProject)
|
421 |
423 |
}
|
422 |
|
|
|
424 |
|
423 |
425 |
if (formatSentences) {
|
424 |
|
writer.writeStartElement("p")
|
425 |
|
writer.writeAttribute("type", "sentence")
|
426 |
|
writer.writeAttribute("style", "--before-content:'$sent_id';")
|
|
426 |
writer.writeStartElement("p")
|
|
427 |
writer.writeAttribute("type", "sentence")
|
|
428 |
writer.writeAttribute("style", "--before-content:'$sent_id';")
|
427 |
429 |
}
|
428 |
|
|
|
430 |
|
429 |
431 |
for (def word : words) {
|
430 |
|
|
431 |
|
println "UD-ID="+word["id"]
|
|
432 |
|
|
433 |
//println "UD-ID="+word["id"]
|
432 |
434 |
if (word["id"].contains("-")) {
|
433 |
435 |
writer.writeStartElement("seg")
|
434 |
436 |
writer.writeCharacters("******")
|
435 |
437 |
writer.writeEndElement() // span
|
436 |
438 |
}
|
437 |
|
|
|
439 |
|
438 |
440 |
String id = null
|
439 |
441 |
wordCounter++
|
440 |
442 |
writer.writeStartElement ("w")
|
... | ... | |
449 |
451 |
//println "WORD="+word
|
450 |
452 |
writer.writeAttribute(prefix+p, word[p])
|
451 |
453 |
}
|
452 |
|
|
|
454 |
|
453 |
455 |
if (id != null) {
|
454 |
|
writer.writeAttribute("id", id)
|
|
456 |
writer.writeAttribute("id", id)
|
455 |
457 |
} else {
|
456 |
458 |
writer.writeAttribute("id", "w_"+text_id+"_"+wordCounter)
|
457 |
459 |
}
|
458 |
|
|
|
460 |
|
459 |
461 |
writer.writeCharacters(word["form"])
|
460 |
462 |
writer.writeEndElement() // w
|
461 |
463 |
writer.writeCharacters(" ")
|
462 |
464 |
}
|
463 |
|
|
|
465 |
|
464 |
466 |
if (formatSentences) writer.writeEndElement()
|
465 |
|
|
|
467 |
|
466 |
468 |
writer.writeCharacters("\n")
|
467 |
469 |
writer.writeEndElement() // s
|
468 |
470 |
}
|
469 |
|
|
|
471 |
|
470 |
472 |
if (current_par_id != null) {
|
471 |
473 |
writer.writeEndElement() // p
|
472 |
474 |
writer.writeCharacters("\n")
|
473 |
475 |
}
|
474 |
|
|
|
476 |
|
475 |
477 |
writer.writeEndElement() // text
|
476 |
478 |
writer.writeCharacters("\n")
|
477 |
479 |
writer.writeEndElement() // TEI
|
478 |
480 |
writer.close()
|
479 |
481 |
}
|
480 |
|
|
|
482 |
|
481 |
483 |
cpb_texts.done()
|
482 |
|
|
|
484 |
|
483 |
485 |
return true
|
484 |
486 |
}
|
485 |
|
|
|
487 |
|
486 |
488 |
}
|