Révision 3346
TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/conll2tigerud2.groovy (revision 3346) | ||
---|---|---|
1 |
|
|
2 |
def CMD="conll2tiger.pl"; // command name shown in the help text and the final summary |
|
3 |
def VERSION = "1.5"; // version string shown in the help text |
|
4 |
def MODIFIED = "8/12/2015"; // adapted for Perseus CoNLL generated with conll.pl -l. CHECK: does SRCMF still work? |
|
5 |
|
|
6 |
// zero-based column indexes of the input fields (help text: 0 = column 1); -D, -H and -M override cold, colh and colm. NOTE(review): the original remark "default are the predicted values in CoNLL 2009 format" may be stale, since the changelog says the defaults were updated for CoNLL-U SRCMF |
|
7 |
def coll = 2; // lemma |
|
8 |
def colm = 3; // morph (pos) |
|
9 |
def colf = 5; // features |
|
10 |
def colh = 6; // head |
|
11 |
def cold = 7; // deprel |
|
12 |
def outdir = "conllexport"; // output directory for all generated files (option -o) |
|
13 |
def split = 1000; // split output after nr sentences (option -s) |
|
14 |
|
|
15 |
// tree structure |
|
16 |
def dominates = [:]; |
|
17 |
def deprel = [:]; // deprel{nr} = deprel |
|
18 |
def daughters = []; // daughter nodes, stored in %dominates |
|
19 |
def duplicates = [:]; |
|
20 |
def relators = [:]; |
|
21 |
def notes = [:] |
|
22 |
def aux = [:]; // store nodes of duplicates, relators |
|
23 |
def type = "--"; // node attribute |
|
24 |
def vform = "--" |
|
25 |
def vlemma = "--"; // node attributes for verbs store form and lemma |
|
26 |
def label = "D"; // default edge label |
|
27 |
def nt_features_header = ''; // option -x |
|
28 |
def nt_features = ''; // option -x |
|
29 |
def nt_empty_features = ''; // option -x |
|
30 |
def scodes = []; // option -x |
|
31 |
def add_to_sentcode = ''; |
|
32 |
def rootname = 'root'; // default |
|
33 |
def featcol = 13; |
|
34 |
|
|
35 |
//##################################################################### |
|
36 |
// conll2tiger.pl: converts CoNLL-U from the Universal Dependencies |
|
37 |
// treebanks to TigerXML |
|
38 |
// Achim Stein <achim.stein@ling.uni-stuttgart.de> |
|
39 |
// License : GNU GPL v. 3 (see the LICENSE file) |
|
40 |
//##################################################################### |
|
41 |
// TO DO: |
|
42 |
// - coordination |
|
43 |
// - handling of overly long sentences that were split (conll.pl -r 100) |
|
44 |
// - for this, $wnr is used as the word ID instead of $w (the for-loop counter) |
|
45 |
// - but unbound nodes still occur when the governor has been removed (i.e. ended up in the other part) |
|
46 |
//##################################################################### |
|
47 |
// Modifications by Alexei Lavrentiev <alexei.lavrentev@ens-lyon.fr> |
|
48 |
// for Profiterole project (2019-2021) |
|
49 |
|
|
50 |
// 2019-09-25 |
|
51 |
// - updated default column numbers for CoNLL-U SRCMF format |
|
52 |
// - added processing for comment lines |
|
53 |
// - added @textid to terminal nodes |
|
54 |
// - deleted ppos, pmor and plemma (predicted tags and lemmas) |
|
55 |
// - replaced specific SRCMF with standard UD tags |
|
56 |
|
|
57 |
// Update 2020-05-13 |
|
58 |
// - added @editionId for synchronization with BFM word ID |
|
59 |
|
|
60 |
// Update 2021-03-22 |
|
61 |
// - using $infilename for @textid |
|
62 |
// - added support for .conllu extension |
|
63 |
|
|
64 |
// Update 2021-03-29 |
|
65 |
// - added editionId to declarations in main.xml |
|
66 |
|
|
67 |
// Update 2021-07-16 |
|
68 |
// - added "punct" to cat values |
|
69 |
|
|
70 |
// Update 2021-07-20 |
|
71 |
// - added cat value list compiled from |
|
72 |
// https://universaldependencies.org/ext-dep-index.html and the previous |
|
73 |
// version. All relation types and subtypes from the UD 2.8 corpora |
|
74 |
// should be there. |
|
75 |
// - contractions indexed |
|
76 |
//##################################################################### |
|
77 |
|
|
78 |
def HELP = """ |
|
79 |
================================================================== |
|
80 |
$CMD $VERSION: Help |
|
81 |
================================================================== |
|
82 |
FUNKTION: converts CoNLL parser output to TigerXML (for mate tools) |
|
83 |
creates master file, splits input files, corrects unbound nodes |
|
84 |
SYNTAX: $CMD [Options] <CoNLL file> |
|
85 |
OPTIONEN: |
|
86 |
-c ignore coordination (delete coordx- prefix in deprel) |
|
87 |
-C str corpus specials: nca |
|
88 |
-h show help |
|
89 |
-o create all files in this output directory (default: $outdir) |
|
90 |
set COLUMNS for required info (0 = column 1, 1 = column 2, etc.) |
|
91 |
-D nr colum for deprel default=$cold |
|
92 |
-H nr colum for head default=$colh |
|
93 |
-M nr colum for morphology (POS) default=$colm |
|
94 |
-F nr colum for morph. features default=$colf |
|
95 |
-R str Root category (default: $rootname) |
|
96 |
-s nr split output files after each nr sentence (default = $split) |
|
97 |
-x str,... include these attributes if present in the -X column of the first word |
|
98 |
(the first code is also copied into the sentence id) |
|
99 |
-X nr the column where attributes are stored (default: $featcol) |
|
100 |
EXAMPLE: |
|
101 |
- For mate parser output: no further options required |
|
102 |
$CMD parsed.conll |
|
103 |
- For Le Monde 2005: include attributes |
|
104 |
gunzip -c parsed.conll.gz | conll2tiger.pl -x date,artnr,rubr |
|
105 |
- For NCA: |
|
106 |
conll2tiger.pl -C nca -x id,deaf,titreDees,editionDees,manuscritDees,regionDees,coefficientRegionDees,dateMoyenneDees,codeRegional,coefficientRegional,vers,ponctuation,mots,passage,commentairePhilologique,qualite,sourceQualite,commentaireForme,auteur,dateComposition,dateManuscrit,lieuComposition,lieuManuscrit,sourceDateComposition,sourceDateManuscrit,sourceLieuComposition,sourceLieuManuscrit,genre,traditionTextuelle,analyses,lignes,editionNCA tagged-oldfrench-lrec2014-dep.conll |
|
107 |
"""; |
|
108 |
|
|
109 |
//########################################################################## |
|
110 |
// DO NOT MODIFY FOLLOWING CODE ! |
|
111 |
//########################################################################## |
|
112 |
|
|
113 |
|
|
114 |
//########################################################################## |
|
115 |
// parse the command line |
|
116 |
//########################################################################## |
|
117 |
|
|
118 |
getopts('c:C:hD:H:M:o:R:s:x:X:'); |
|
119 |
|
|
120 |
if (defined(opt_h)) { |
|
121 |
println "** " + "$HELP"; |
|
122 |
return 0; |
|
123 |
} |
|
124 |
|
|
125 |
if (defined(opt_o)) { |
|
126 |
outdir = opt_o |
|
127 |
} |
|
128 |
if (defined(opt_C)) { |
|
129 |
corpus = opt_C; |
|
130 |
} |
|
131 |
if (defined(opt_D)) { |
|
132 |
cold = opt_D |
|
133 |
} |
|
134 |
if (defined(opt_H)) { |
|
135 |
colh = opt_H |
|
136 |
} |
|
137 |
if (defined(opt_M)) { |
|
138 |
colm = opt_M |
|
139 |
} |
|
140 |
|
|
141 |
if (defined(opt_R)) { |
|
142 |
rootname = opt_R; |
|
143 |
} |
|
144 |
|
|
145 |
if (defined(opt_s)) { |
|
146 |
split = opt_s |
|
147 |
} |
|
148 |
|
|
149 |
if (defined(opt_X)) { |
|
150 |
featcol = opt_X; |
|
151 |
} |
|
152 |
|
|
153 |
if (defined(opt_x)) { |
|
154 |
scodes = opt_x.split(","); // attribute names requested with -x, comma-separated |
|
155 |
for (def i = 0; i < scodes.size(); i++) { // strict '<': index scodes.size() is one past the last element |
|
156 |
nt_features_header = nt_features_header + sprintf("<feature name=\"%s\" domain=\"NT\"></feature>\n", scodes[i]); |
|
157 |
} |
|
158 |
nt_features_header = nt_features_header.replaceFirst(/\bid\b/, 'ncaid'); // avoid reserved Tiger attribute "id" (first occurrence only, like s/// without /g) |
|
159 |
} |
|
160 |
|
|
161 |
|
|
162 |
def colnames = ["url", "ID", "FORM", "LEMMA", "PLEMMA", "POS", "PPOS", "FEAT", "PFEAT", "HEAD", "PHEAD", "DEPREL", "PDEPREL"]; |
|
163 |
// def pos = [:]%lemma = %form = hDeprel = %head = %governs = %cat2abbrev = %abbrev2cat = %coordnr_cat = %coordnr_head = %w_coordnr = %w_head = %w_deprel = %first_coordnr = %coordnr_first = %verb_head = [:]; |
|
164 |
def coordelements = []; |
|
165 |
|
|
166 |
def id = "", form = "", lemma = "", plemma = "", pos = "", ppos = "", feat = "", pfeat = "", head = "", phead = "", pdeprel = "", edition_id = ""; // per-token scratch fields, all initialised to ""; 'deprel' is not redeclared here because it is already declared earlier in this script as a map (redeclaring it would clash and overwrite the map) |
|
167 |
|
|
168 |
def timestamp = new Date().toString(); // current date/time; replaces the Perl backtick call to the external date command |
|
169 |
timestamp = timestamp.trim(); // counterpart of the former chomp(): strips surrounding whitespace |
|
170 |
|
|
171 |
def infile = ARGV ? ARGV[0] : ''; // first command-line argument; '' when none was given |
|
172 |
infile = infile.replaceFirst(/(?i)\.conllu?/, ''); // strip the first ".conll"/".conllu" (case-insensitive), as the Perl s///i did |
|
173 |
if (infile == '') { |
|
174 |
infile = 'subcorpus'; // fallback base name for the output files |
|
175 |
} |
|
176 |
def counter = 1; |
|
177 |
suffix = sprintf("%05d", counter); |
|
178 |
infilename = basename(infile); |
|
179 |
|
|
180 |
new File(outdir).mkdirs(); // create the configured output directory if it does not exist yet (the shell one-liner tested and created the literal name "outdir" instead of the variable's value) |
|
181 |
open(XML, ">$outdir/$infilename-$suffix.xml") or die "\nopen file error of $outdir/$infilename-$suffix.xml\n"; |
|
182 |
open(LOG, ">$outdir/conversion.log") or die "\nopen file error of conversion.log\n"; |
|
183 |
open(MASTER, ">$outdir/main.xml") or die "\nopen file error of main.xml\n"; |
|
184 |
write_xml_header(); |
|
185 |
write_master_header(); |
|
186 |
|
|
187 |
// flush output for log and master file |
|
188 |
select(LOG); $| = 1; |
|
189 |
select(MASTER); $| = 1; |
|
190 |
|
|
191 |
commandline = $0 + " " + (join " ", @ARGV); |
|
192 |
LOG << "$commandline\n\n"; |
|
193 |
|
|
194 |
MASTER << "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n"; |
|
195 |
|
|
196 |
$/ = ""; // treat empty line as RS |
|
197 |
while (<>) { |
|
198 |
if ($. % split == 0) { |
|
199 |
XML << "</subcorpus>\n"; |
|
200 |
close(XML); |
|
201 |
suffix = sprintf("%05d", ++counter); |
|
202 |
open(XML, ">$outdir/$infilename-$suffix.xml") or die "\nopen file error\n"; |
|
203 |
write_xml_header(); |
|
204 |
MASTER << "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n"; |
|
205 |
} |
|
206 |
|
|
207 |
// ---------------------------------------- |
|
208 |
// set root (or fake root if ROOT is missing) |
|
209 |
// ---------------------------------------- |
|
210 |
rootnode = fakeroot = 0; // m = Treat string as multiple lines, so that ^ matches beginning of line |
|
211 |
thisrootname = rootname; |
|
212 |
(rootnode) = (/^(\d+?)\s.*?\b$rootname\b/m); // real root marked by parser |
|
213 |
if (rootnode == 0) { |
|
214 |
// (rootnode) = (/^(\d+?)\t.*?\t-1\t0\t/m); // no marked ROOT, but top node (head = 0) TOO SPECIFIC |
|
215 |
// (rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m); // no marked ROOT, but top node (head = 0) in col9 (original by Achim Stein) |
|
216 |
(rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m); // no marked ROOT, but top node (head = 0) in col7 (updated by AL) |
|
217 |
LOG << " Warning sentence $.: not marked root ($rootname), using top node $rootnode\n"; |
|
218 |
fakeroot = 1; |
|
219 |
thisrootname = 'nSnt'; |
|
220 |
} |
|
221 |
if (rootnode == 0) { |
|
222 |
rootnode = 1; // set fake root if nothing goes |
|
223 |
LOG << " Error sentence $.: setting fake root to first word:\n$_\n"; |
|
224 |
fakeroot = 2; |
|
225 |
thisrootname = 'Err'; |
|
226 |
} |
|
227 |
|
|
228 |
def cols = []; |
|
229 |
@words = split (/\n/); |
|
230 |
@terminals = []; |
|
231 |
|
|
232 |
%dominates = [:]; // empty at beginning of sentence |
|
233 |
hDeprel = [:]; // empty at beginning of sentence |
|
234 |
%aux = [:]; // empty at beginning of sentence |
|
235 |
daughters = []; |
|
236 |
|
|
237 |
def commentlines = 0; // added by AL: number of comment and contraction lines skipped so far in this sentence |
|
238 |
// def contractions = 0; #added by AL |
|
239 |
// def text_id = "unknown_text"; |
|
240 |
def text_id = infilename; |
|
241 |
def sent_id = "0"; |
|
242 |
|
|
243 |
// ---------------------------------------- |
|
244 |
// loop through words #1: write tokens (terminal nodes) to XML file |
|
245 |
// store tree relevant information for loop #2 |
|
246 |
// ---------------------------------------- |
|
247 |
for (def w = 0; w < words.size(); w++) { // strict '<': words[words.size()] is one past the last line of the sentence |
|
248 |
// Added by AL for comment lines |
|
249 |
if (words[w] =~ /^#/) { // comment line: pick up document / sentence IDs, then skip it |
|
250 |
if (words[w] =~ /^# newdoc/) { |
|
251 |
text_id = words[w]; |
|
252 |
text_id = text_id.replaceFirst(/# newdoc id = /, ''); // keep only the ID value |
|
253 |
} |
|
254 |
else if (words[w] =~ /^# sent_id/) { |
|
255 |
sent_id = words[w]; |
|
256 |
sent_id = sent_id.replaceFirst(/# sent_id = /, ''); // keep only the ID value |
|
257 |
} |
|
258 |
// LOG << "Comment line loop 1: words[w]\n"; |
|
259 |
commentlines++; |
|
260 |
continue; // Groovy equivalent of Perl's next |
|
261 |
} |
|
262 |
// Added by AL for contractions |
|
263 |
else if (words[w] =~ /^\d+-\d+/) { // multiword-token range line such as "3-4": counted like a comment and skipped |
|
264 |
// LOG << "Contraction line loop 1: words[w]\n"; |
|
265 |
commentlines++; |
|
266 |
// contractions++; |
|
267 |
continue; // Groovy equivalent of Perl's next |
|
268 |
} |
|
269 |
else { |
|
270 |
if (defined (opt_c)) { |
|
271 |
words[w] =~ s/coord(\d+)-//g; |
|
272 |
} |
|
273 |
@cols = split (/\t/, words[w]); |
|
274 |
wnr = cols[0]; |
|
275 |
word = cols[1]; |
|
276 |
lemma = cols[2]; |
|
277 |
plemma = cols[2]; // predicted |
|
278 |
pos = cols[3]; |
|
279 |
ppos = cols[4]; // predicted |
|
280 |
mor = cols[5]; |
|
281 |
pmor = cols[5]; // predicted |
|
282 |
cat = cols[cold]; |
|
283 |
edition_id = cols[9] ?: ''; // tenth column; fall back to '' when the value is null |
|
284 |
edition_id = edition_id.replaceAll('^.*XmlId=([^|]+).*$', '$1'); // keep only the XmlId value; the string is left unchanged when no XmlId is present |
|
285 |
|
|
286 |
if (cat =~ /[<>]/) { |
|
287 |
LOG << "Warning in line $.: illegal node name: \"$cat\" -> \"Err2\"\n"; |
|
288 |
cat = 'Err2'; |
|
289 |
} |
|
290 |
|
|
291 |
// NCA: enclose lemmas in underscores (easier for regex construction) |
|
292 |
if (corpus =~ /nca/i) { |
|
293 |
lemma = "_" + "$lemma" + "_" |
|
294 |
} |
|
295 |
|
|
296 |
clean_data(); |
|
297 |
|
|
298 |
// get attribute-value pairs from col #13 of first word (option -x) |
|
299 |
if (opt_x == "all") { |
|
300 |
cols[featcol] = "all=" + cols[featcol]; |
|
301 |
} |
|
302 |
if (w == 0 && cols[featcol] =~ /=/) { |
|
303 |
// println "** " + "========== getting att-value for word w: cols[featcol] scodes=@scodes\n"; |
|
304 |
nt_features = nt_empty_features = ''; |
|
305 |
// while(cols[featcol] =~ m/ (.*?)="([^"]*)"/gs) { // quoted values |
|
306 |
while(cols[featcol] =~ m/ ?([^=]*?)="?([^, ]+)\b"?\b/gs) { // maybe unquoted values (e.g. Le Monde 2005) |
|
307 |
att = $1; |
|
308 |
val = $2; |
|
309 |
// pick the attributes that match those of the command line option -x |
|
310 |
for (def t = 0; t < scodes.size(); t++) { // strict '<': index scodes.size() is one past the last requested attribute |
|
311 |
if (att == scodes[t]) { |
|
312 |
val = val.replaceAll('&', '&amp;'); // escape "&" in values (appears in URLs) so the XML attribute stays well-formed |
|
313 |
if (t == 0) { add_to_sentcode = "_$att$val"; } // the first -x code is also copied into the sentence id |
|
314 |
nt_features = nt_features + " $att=\"$val\""; |
|
315 |
// println "** " + "$./$w/$featcol: $cols[$featcol] --- nt_features: $nt_features\n"; |
|
316 |
} |
|
317 |
if (att == scodes[t]) { nt_empty_features = nt_empty_features + " $att=\"--\"";} |
|
318 |
} |
|
319 |
} |
|
320 |
// replace the reserved feature 'id' (Tiger) |
|
321 |
add_to_sentcode =~ s/\bid=/ncaid=/; |
|
322 |
nt_features =~ s/\bid=/ncaid=/; |
|
323 |
nt_empty_features =~ s/\bid=/ncaid=/; |
|
324 |
} // if col 13 contains attributes |
|
325 |
else { |
|
326 |
if (defined(opt_x) && (w == 0)) { |
|
327 |
println "** " + "Warning: sentence=$. option -x is defined, but no attribute=value declarations were found!\n"; |
|
328 |
} |
|
329 |
} |
|
330 |
|
|
331 |
|
|
332 |
// store output for terminal node in array, output later. For double categories make a duplicate node. |
|
333 |
tempid = sprintf("%d_%d", $., wnr); |
|
334 |
// push(@terminals, sprintf(" <t id=\"s%d_%d\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" ppos=\"%s\" pmor=\"%s\" plemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., wnr, word, pos, mor, lemma, ppos, pmor, plemma, text_id, edition_id)); |
|
335 |
push(@terminals, sprintf(" <t id=\"s%d_%d\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., wnr, word, pos, mor, lemma, text_id, edition_id)); |
|
336 |
if (cat =~ /_/) { |
|
337 |
// push(@terminals, sprintf(" <t id=\"s%d_%d_dupl\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" ppos=\"%s\" pmor=\"%s\" plemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., wnr, "*", "_", "_", "_", "_", "_", "_", text_id, edition_id)); |
|
338 |
push(@terminals, sprintf(" <t id=\"s%d_%d_dupl\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., wnr, "*", "_", "_", "_", text_id, edition_id)); |
|
339 |
duplicates{tempid} = 1; // store, check later to attach the duplicates to the mother |
|
340 |
} |
|
341 |
|
|
342 |
// associate Aux with main verb, to create an attribute in the verb node in loop #2 (TODO: more than one Aux) |
|
343 |
if (cat =~ /Aux/) { |
|
344 |
aux{cols[colh]} = "$word" + "_" + "$plemma"; // aux{head} = word_lemma (of Aux) |
|
345 |
} |
|
346 |
|
|
347 |
// ---------------------------------------- |
|
348 |
// store information needed for tree |
|
349 |
// ---------------------------------------- |
|
350 |
// if fake rootnode == 1: nSnt as root node |
|
351 |
if ((fakeroot == 1) && (w-commentlines+1 == rootnode)) { |
|
352 |
cat = 'nSnt'; |
|
353 |
notes{tempid} = 'Warning no marked ROOT node in CoNLL'; // TODO: geht nicht |
|
354 |
} |
|
355 |
// if fake rootnode == 2: flatten structure: attach all words to the first word |
|
356 |
if ((fakeroot == 2) && (w-commentlines+1 != rootnode)) { |
|
357 |
cols[colh] = 1; |
|
358 |
notes{tempid} = 'Error neither ROOT node nor top node in CoNLL'; |
|
359 |
} |
|
360 |
// correct unbound words in parser output (phead = 0, but not marked as ROOT) |
|
361 |
if ((cols[colh] == "0") && (w-commentlines+1 != rootnode)) { // AL: added: -commentlines |
|
362 |
printf LOG " Warning sentence $. ($tempid): unbound node %d (attached to root %d)\n", (w-commentlines+1), rootnode; |
|
363 |
cols[colh] = rootnode; |
|
364 |
cat = 'Err'; // let Err instead of deprel appear in dom attribute |
|
365 |
notes{tempid} = 'Warning unbound node in CoNLL'; |
|
366 |
} |
|
367 |
// store for R edge labels |
|
368 |
if (cols[cold] =~ /RelN?C/) { |
|
369 |
relators{tempid} = 1; |
|
370 |
} |
|
371 |
// store deprel for dom attribute |
|
372 |
deprel[tempid] = cat; // cols[cold]; |
|
373 |
// if real root, add this node to daughter array, store array in hash dominates{head}{@daughters} |
|
374 |
if ((fakeroot < 2) && (w-commentlines+1 != rootnode)) { // every word except the root is registered under its head |
|
375 |
daughters = dominates[cols[colh]] ?: []; // daughters already recorded for this head ('ols' was a typo for 'cols'); start a new list if there are none |
|
376 |
daughters << wnr; // append this word's ID |
|
377 |
dominates[cols[colh]] = daughters; // store the flat list itself, not a list wrapped in another list |
|
378 |
} |
|
379 |
} // for each word loop #1 |
|
380 |
} // AL condition end |
|
381 |
|
|
382 |
// print graph code (needs root attribute) and terminal nodes |
|
383 |
if (rootnode == 0) { |
|
384 |
noroot++; |
|
385 |
LOG << "Error sentence $. ($tempid): root node not found:\n$_\n"; next; |
|
386 |
} else { |
|
387 |
printf XML "<s id=\"s%s%s\" textid=\"$text_id\" sentid=\"$sent_id\">\n", $., add_to_sentcode; |
|
388 |
XML << " <graph root=\"n$._$rootnode\">\n"; |
|
389 |
XML << " <terminals>\n"; |
|
390 |
for (def t = 0; t < terminals.size(); t++) { // strict '<': avoids writing a non-existent entry after the last terminal |
|
391 |
XML << terminals[t]; |
|
392 |
} |
|
393 |
XML << " </terminals>\n"; |
|
394 |
} |
|
395 |
|
|
396 |
// ---------------------------------------- |
|
397 |
// loop through words #2 to build Tiger tree (non terminal nodes) |
|
398 |
// ---------------------------------------- |
|
399 |
XML << " <nonterminals>\n"; |
|
400 |
for (def i = 0; i < words.size(); i++) { // strict '<': words[words.size()] is one past the last line of the sentence |
|
401 |
//Added AL for comment lines |
|
402 |
if (words[i] =~ /^#/) { |
|
403 |
// LOG << "Comment line loop 2 : $words[$i]\n"; |
|
404 |
next; |
|
405 |
} |
|
406 |
//Added AL for contractions |
|
407 |
if (words[i] =~ /^\d+-\d+/) { |
|
408 |
// LOG << "Contraction loop 2 : $words[$i]\n"; |
|
409 |
next; |
|
410 |
} |
|
411 |
|
|
412 |
else { |
|
413 |
|
|
414 |
@cols = split (/\t/, words[i]); |
|
415 |
w = cols[0]; // word ID (first column) |
|
416 |
// TODO: redundant variable assignment (same as loop #1)?? |
|
417 |
word = cols[1]; |
|
418 |
lemma = cols[2]; |
|
419 |
plemma = cols[3]; // predicted; NOTE(review): loop #1 reads plemma from cols[2] — confirm |
|
420 |
pos = cols[4]; // NOTE(review): loop #1 reads pos from cols[3] — confirm which column is intended |
|
421 |
ppos = cols[5]; // predicted; NOTE(review): loop #1 reads ppos from cols[4] — confirm |
|
422 |
mor = cols[6]; // NOTE(review): loop #1 reads mor from cols[5]; 6 is also the default head column (colh) — confirm |
|
423 |
pmor = cols[7]; // predicted; NOTE(review): 7 is also the default deprel column (cold) — confirm |
|
424 |
cat = cols[cold]; // dependency relation, used as the node category |
|
425 |
|
|
426 |
if (cat =~ /[<>]/) { |
|
427 |
LOG << "Warning in line $.: illegal node name: \"$cat\" -> \"Err2\"\n"; // write to the LOG handle, consistent with the same warning in loop #1 |
|
428 |
cat = 'Err2'; |
|
429 |
} |
|
430 |
|
|
431 |
// OF parser has not learned punctuation: set cat for punctuation to PON |
|
432 |
if ((corpus =~ /nca/i) && (pos == 'PON')) { |
|
433 |
cols[cold] = cat = 'Pon'; |
|
434 |
} |
|
435 |
|
|
436 |
clean_data(); |
|
437 |
|
|
438 |
// retrieve daughters, make dom attribute (string of dominated nodes) |
|
439 |
daughters = @{dominates["$w"]}; |
|
440 |
dom = ''; |
|
441 |
for (def d = 0; d <= daughters.size(); d++) { |
|
442 |
dom = dom + "_" + deprel["$._$daughters{$d}"]; |
|
443 |
} |
|
444 |
if (dom =~ /_/) { |
|
445 |
dom =~ s/^_//; |
|
446 |
} else { |
|
447 |
dom = '--'; |
|
448 |
} |
|
449 |
|
|
450 |
// if verbal, set node attributes for verb form and lemma |
|
451 |
type = "nV"; |
|
452 |
vform = vlemma = "--"; |
|
453 |
if (pos =~ /VER/) { // AL: ppos -> pos; verbal token: derive type, vform and vlemma |
|
454 |
if (mor =~ /infi/) { type = "VInf"; } // AL: pmor -> mor |
|
455 |
else if (pmor =~ /pper|ppre/) { type = "VPar"; } // NOTE(review): still tests pmor although the line above was switched to mor — confirm |
|
456 |
else { type = "VFin"; } |
|
457 |
// if Aux is present, create attribute for main verb |
|
458 |
def auxMatch = ((aux[w] ?: '') =~ /(.*?)_(.*)/); if (auxMatch.find()) { // entries are stored as word_lemma of the Aux |
|
459 |
vform = auxMatch.group(1); |
|
460 |
vlemma = auxMatch.group(2); |
|
461 |
} |
|
462 |
// else create attr for simple verb |
|
463 |
else { |
|
464 |
vform = word; |
|
465 |
vlemma = lemma; // AL: plemma -> lemma (always void in SRCMF) |
|
466 |
} |
|
467 |
// NCA: enclose lemmas in underscores (easier for regex construction) |
|
468 |
if (corpus =~ /(?i)nca/) { // (?i) replaces the Perl /i flag |
|
469 |
vlemma = "_" + "$vlemma" + "_" |
|
470 |
} |
|
471 |
} |
|
472 |
|
|
473 |
// call output function (twice for duplicate categories) |
|
474 |
if (cat =~ /(.*?)_(.*)/) { |
|
475 |
write_nonterminals("$2", ""); // RelNC is always node (see clean categories), function is duplicate, e.g. SjPer_RelNC |
|
476 |
write_nonterminals("$1", "_dupl"); // other category is duplicate |
|
477 |
} else { |
|
478 |
write_nonterminals(cat); |
|
479 |
} |
|
480 |
} // for words |
|
481 |
} // AL end condition |
|
482 |
|
|
483 |
XML << " </nonterminals>\n"; |
|
484 |
XML << " </graph>\n"; |
|
485 |
XML << "</s>\n"; |
|
486 |
if ($. % 100 == 0) { println "** " + "\b\b\b\b\b\b\b\b"; printf STDERR "%08d", $.;} |
|
487 |
} // main |
|
488 |
XML << "</subcorpus>\n"; |
|
489 |
println "** " + "\n$CMD: $. sentences converted. Results in $outdir. Log in $outdir/conversion.log.\n"; |
|
490 |
println "** " + " Hint 1: on OS X convert master file to MacRoman, e.g iconv -f latin1 -t macroman\n"; |
|
491 |
println "** " + " Hint 2: use tiger.pl -c <Tiger XML file> to detect unbound nodes.\n"; |
|
492 |
println "** " + " Hint 3: build reliable feature declarations using tiger.sh\n"; |
|
493 |
println "** " + " tiger.sh -a \"lemma word pos ppos\" (for terminals)\n"; |
|
494 |
println "** " + " tiger.sh -A \"lemma word pos ppos\" (for non-terminals)\n"; |
|
495 |
if (noroot > 0) {println "** " + "$noroot sentences ignored: root not found (see log file)\n";} |
|
496 |
write_master_footer(); |
|
497 |
close(MASTER); |
|
498 |
close(XML); |
|
499 |
close(LOG); |
|
500 |
|
|
501 |
exit; |
|
502 |
|
|
503 |
|
|
504 |
|
|
505 |
// ---------------------------------------- |
|
506 |
// sub |
|
507 |
// ---------------------------------------- |
|
508 |
|
|
509 |
def write_xml_header() { // writes the XML declaration and the opening <subcorpus> tag (named after $infilename-$suffix) to XML; the empty parameter list is required for a Groovy method declaration |
|
510 |
XML << """<?xml version=\"1.0\" encoding=\"UTF-8\"?> |
|
511 |
<subcorpus name=\"$infilename-$suffix\"> |
|
512 |
"""; |
|
513 |
} |
|
514 |
|
|
515 |
def write_master_header { |
|
516 |
printf MASTER """<?xml version="1.0" encoding="UTF-8"?> |
|
517 |
"""; |
|
518 |
|
|
519 |
printf MASTER """<corpus id=\"$corpus\"> |
|
520 |
<head> |
|
521 |
<meta><name>$corpus</name> |
|
522 |
<author>ILR Stuttgart</author> |
|
523 |
<date></date> |
|
524 |
<description>Parsed with mate tools using a SRCMF-based grammar model (http://srcmf.org). </description> |
|
525 |
<format>SRCMF</format> |
|
526 |
<history>TigerXML converted by conll2tiger.pl</history> |
|
527 |
</meta> |
|
528 |
"""; |
|
529 |
|
|
530 |
// printf MASTER '<annotation> |
|
531 |
//<feature name="word" domain="T" ></feature> |
|
532 |
//<feature name="pos" domain="T" ></feature> |
|
533 |
//<feature name="mor" domain="T" ></feature> |
|
534 |
//<feature name="lemma" domain="T" ></feature> |
|
535 |
//<feature name="ppos" domain="T" ></feature> |
|
536 |
//<feature name="pmor" domain="T" ></feature> |
|
537 |
//<feature name="plemma" domain="T" ></feature> |
|
538 |
//<feature name="cat" domain="NT" > |
|
539 |
// <value name="Apst">apostrophe</value> |
|
540 |
// <value name="AtObj">attribut d objet</value> |
|
541 |
// <value name="AtRfc">attribut réfléchi</value> |
|
542 |
// <value name="AtSj">attribut de sujet</value> |
|
543 |
// <value name="AttributReflechi">attribut réfléchi</value> |
|
544 |
// <value name="Aux">auxilié</value> |
|
545 |
// <value name="AuxA">auxilié actif</value> |
|
546 |
// <value name="AuxP">auxilié passif</value> |
|
547 |
// <value name="Circ">circonstant</value> |
|
548 |
// <value name="Circ_RelNC">circonstant pronom relatif</value> |
|
549 |
// <value name="Cmpl">complément</value> |
|
550 |
// <value name="Cmpl_RelNC">complément pronom relatif</value> |
|
551 |
// <value name="Coo">coordination</value> |
|
552 |
// <value name="Det">déterminant</value> |
|
553 |
// <value name="Err">unbound node in CoNLL input</value> |
|
554 |
// <value name="Err2">illegal node name was replaced</value> |
|
555 |
// <value name="GpCoo">coordonné</value> |
|
556 |
// <value name="Ignorer">Ignorer</value> |
|
557 |
// <value name="Insrt">incidente</value> |
|
558 |
// <value name="Intj">interjection</value> |
|
559 |
// <value name="Lac">lacune</value> |
|
560 |
// <value name="ModA">modifieur attaché</value> |
|
561 |
// <value name="ModD">modifieur détaché</value> |
|
562 |
// <value name="Ng">négation</value> |
|
563 |
// <value name="NgPrt">forclusif</value> |
|
564 |
// <value name="Obj">objet</value> |
|
565 |
// <value name="Obj_RelNC">direct object pronom relatif</value> |
|
566 |
// <value name="Pon">ponctuation</value> |
|
567 |
// <value name="PON">ponctuation</value> |
|
568 |
// <value name="Regim">régime</value> |
|
569 |
// <value name="RelC">relateur coordonnant</value> |
|
570 |
// <value name="RelNC">relateur non coordonnant</value> |
|
571 |
// <value name="Rfc">réfléchi</value> |
|
572 |
// <value name="Rfx">réfléxif renforcé</value> |
|
573 |
// <value name="SjImp">sujet impersonnel</value> |
|
574 |
// <value name="SjPer">sujet personnel</value> |
|
575 |
// <value name="SjPer_RelNC">sujet personnel pronom relatif</value> |
|
576 |
// <value name="Snt">phrase</value> |
|
577 |
// <value name="ROOT">phrase</value> |
|
578 |
// <value name="StructureMaximale">structure maximale</value> |
|
579 |
// <value name="VFin">verbe fini</value> |
|
580 |
// <value name="VInf">verbe infinitif</value> |
|
581 |
// <value name="nMax">structure non-maximale</value> |
|
582 |
// <value name="nSnt">non-phrase</value> |
|
583 |
//</feature> |
|
584 |
//<feature name="coord" domain="NT" ></feature> |
|
585 |
//<feature name="dom" domain="NT" ></feature> |
|
586 |
//<feature name="type" domain="NT" > |
|
587 |
// <value name="nV">élément non-verbal</value> |
|
588 |
// <value name="VFin">verbe fini</value> |
|
589 |
// <value name="VInf">verbe infinitif</value> |
|
590 |
// <value name="VPar">verbe participial</value> |
|
591 |
// <value name="--">nil</value> |
|
592 |
//</feature> |
|
593 |
//<feature name="vform" domain="NT"></feature> |
|
594 |
//<feature name="vlemma" domain="NT"></feature> |
|
595 |
//<feature name="note" domain="NT"></feature> |
|
596 |
//<feature name="snr" domain="NT"></feature> |
|
597 |
//'; |
|
598 |
|
|
599 |
printf MASTER """<annotation> |
|
600 |
<feature name="word" domain="T" ></feature> |
|
601 |
<feature name="pos" domain="T" ></feature> |
|
602 |
<feature name="mor" domain="T" ></feature> |
|
603 |
<feature name="lemma" domain="T" ></feature> |
|
604 |
<feature name="textid" domain="T" ></feature> |
|
605 |
<feature name="editionId" domain="T" ></feature> |
|
606 |
<feature name="cat" domain="NT" > |
|
607 |
<value name="__UNDEF__">UNDEFINED !!!</value> |
|
608 |
<value name="acl:adv">acl:adv</value> <!-- Ukrainian --> |
|
609 |
<value name="acl:attr">acl:attr</value> <!-- Chukchi --> |
|
610 |
<value name="acl:cleft">acl:cleft</value> <!-- Norwegian, Swedish --> |
|
611 |
<value name="acl:fixed">acl:fixed</value> <!-- Beja --> |
|
612 |
<value name="acl:inf">acl:inf</value> <!-- Portuguese --> |
|
613 |
<value name="acl:relat">acl:relat</value> <!-- Chukchi --> |
|
614 |
<value name="acl:relcl">relative clause modifier</value> <!-- Akkadian, Albanian, Apurina, Arabic, Armenian, Assyrian, Beja, Belarusian, Breton, Bulgarian, Chinese, Czech, Danish, Dutch, English, Erzya, Estonian, Faroese, Finnish, French, German, Greek, Hebrew, Hindi, Hindi English, Icelandic, Indonesian, Irish, Italian, Karelian, Kazakh, Komi Permyak, Komi Zyrian, Korean, Latin, Lithuanian, Livvi, Manx, Marathi, Moksha, Naija, North Sami, Norwegian, Old East Slavic, Old French, Persian, Polish, Portuguese, Russian, Sanskrit, Scottish Gaelic, Slovak, Spanish, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Telugu, Thai, Ukrainian, Urdu, Welsh, Western Armenian, Wolof --> |
|
615 |
<value name="acl">clausal modifier of noun (adnominal clause)</value> |
|
616 |
<value name="advcl:abs">advcl:abs</value> <!-- Latin --> |
|
617 |
<value name="advcl:cau">advcl:cau</value> <!-- Moksha --> |
|
618 |
<value name="advcl:cleft">advcl:cleft</value> <!-- French, Naija --> |
|
619 |
<value name="advcl:cmpr">advcl:cmpr</value> <!-- Latin, Polish --> |
|
620 |
<value name="advcl:cond">advcl:cond</value> <!-- Tamil, Telugu, Uyghur --> |
|
621 |
<value name="advcl:coverb">advcl:coverb</value> <!-- Cantonese --> |
|
622 |
<value name="advcl:eval">advcl:eval</value> <!-- Komi Zyrian --> |
|
623 |
<value name="advcl:lcl">advcl:lcl</value> <!-- Komi Permyak --> |
|
624 |
<value name="advcl:lto">advcl:lto</value> <!-- Komi Zyrian --> |
|
625 |
<value name="advcl:mcl">advcl:mcl</value> <!-- Komi Permyak --> |
|
626 |
<value name="advcl:pred">advcl:pred</value> <!-- Latin --> |
|
627 |
<value name="advcl:relcl">advcl:relcl</value> <!-- Polish, Western Armenian --> |
|
628 |
<value name="advcl:sp">advcl:sp</value> <!-- Ukrainian --> |
|
629 |
<value name="advcl:svc">advcl:svc</value> <!-- Ukrainian --> |
|
630 |
<value name="advcl:tcl">advcl:tcl</value> <!-- Apurina, Erzya, Komi Permyak, Komi Zyrian, Moksha, Romanian, Skolt Sami --> |
|
631 |
<value name="advcl">adverbial clause modifier</value> |
|
632 |
<value name="advmod:arg">advmod:arg</value> <!-- Polish --> |
|
633 |
<value name="advmod:cau">advmod:cau</value> <!-- Erzya, Komi Zyrian, Moksha --> |
|
634 |
<value name="advmod:comp">advmod:comp</value> <!-- Erzya --> |
|
635 |
<value name="advmod:deg">advmod:deg</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami --> |
|
636 |
<value name="advmod:det">advmod:det</value> <!-- Ukrainian --> |
|
637 |
<value name="advmod:df">advmod:df</value> <!-- Cantonese, Chinese --> |
|
638 |
<value name="advmod:emph">emphasizing word, intensifier</value> <!-- Akkadian, Arabic, Armenian, Catalan, Chukchi, Croatian, Czech, Indonesian, Komi Zyrian, Latin, Lithuanian, Polish, Sanskrit, Slovak, South Levantine Arabic, Tamil, Turkish, Turkish German, Upper Sorbian, Uyghur, Western Armenian --> |
|
639 |
<value name="advmod:eval">advmod:eval</value> <!-- Erzya, Komi Zyrian, Moksha, Skolt Sami --> |
|
640 |
<value name="advmod:fixed">advmod:fixed</value> <!-- Beja --> |
|
641 |
<value name="advmod:foc">advmod:foc</value> <!-- Erzya, Komi Zyrian, Moksha, Skolt Sami --> |
|
642 |
<value name="advmod:freq">advmod:freq</value> <!-- Komi Zyrian, Moksha --> |
|
643 |
<value name="advmod:lfrom">advmod:lfrom</value> <!-- Erzya, Komi Zyrian, Moksha --> |
|
644 |
<value name="advmod:lmod">locative adverbial modifier</value> <!-- Apurina, Danish, Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami --> |
|
645 |
<value name="advmod:lmp">advmod:lmp</value> <!-- Erzya, Komi Zyrian --> |
|
646 |
<value name="advmod:locy">advmod:locy</value> <!-- Hungarian --> |
|
647 |
<value name="advmod:lto">advmod:lto</value> <!-- Erzya, Komi Zyrian, Moksha --> |
|
648 |
<value name="advmod:mmod">advmod:mmod</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami --> |
|
649 |
<value name="advmod:mode">advmod:mode</value> <!-- Hungarian --> |
|
650 |
<value name="advmod:neg">advmod:neg</value> <!-- Apurina, Buryat, Kiche, Kurmanji, Latin, Maltese, Polish, Skolt Sami --> |
|
651 |
<value name="advmod:obl">adverbial modifier + oblique nominal</value> <!-- Old French --> |
|
652 |
<value name="advmod:que">advmod:que</value> <!-- Hungarian --> |
|
653 |
<value name="advmod:tfrom">advmod:tfrom</value> <!-- Hungarian --> |
|
654 |
<value name="advmod:tlocy">advmod:tlocy</value> <!-- Hungarian --> |
|
655 |
<value name="advmod:tmod">advmod:tmod</value> <!-- Apurina, Erzya, Komi Permyak, Komi Zyrian, Moksha, Romanian, Skolt Sami --> |
|
656 |
<value name="advmod:to">advmod:to</value> <!-- Hungarian --> |
|
657 |
<value name="advmod:tto">advmod:tto</value> <!-- Hungarian --> |
|
658 |
<value name="advmod">adverbial modifier</value> |
|
659 |
<value name="amod:att">amod:att</value> <!-- Hungarian --> |
|
660 |
<value name="amod:attlvc">amod:attlvc</value> <!-- Hungarian --> |
|
661 |
<value name="amod:flat">amod:flat</value> <!-- Polish --> |
|
662 |
<value name="amod">adjectival modifier</value> |
|
663 |
<value name="appos:trans">appos:trans</value> <!-- Turkish German --> |
|
664 |
<value name="appos">appositional modifier</value> |
|
665 |
<value name="aux:aff">aux:aff</value> <!-- Beja --> |
|
666 |
<value name="aux:aspect">aux:aspect</value> <!-- Komi Zyrian --> |
|
667 |
<value name="aux:caus">aux:caus</value> <!-- Armenian, French, Western Armenian --> |
|
668 |
<value name="aux:clitic">aux:clitic</value> <!-- Polish --> |
|
669 |
<value name="aux:cnd">aux:cnd</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Polish --> |
|
670 |
<value name="aux:ex">aux:ex</value> <!-- Armenian, Western Armenian --> |
|
671 |
<value name="aux:imp">aux:imp</value> <!-- Erzya, Polish --> |
|
672 |
<value name="aux:nec">aux:nec</value> <!-- Komi Zyrian, Moksha, Skolt Sami --> |
|
673 |
<value name="aux:neg">aux:neg</value> <!-- Chukchi, Erzya, Komi Permyak, Komi Zyrian, Maltese, Moksha, North Sami, Skolt Sami, Tamil --> |
|
674 |
<value name="aux:opt">aux:opt</value> <!-- Erzya, Moksha --> |
|
675 |
<value name="aux:part">aux:part</value> <!-- Maltese --> |
|
676 |
<value name="aux:pass">passive auxilary</value> <!-- Afrikaans, Ancient Greek, Arabic, Assyrian, Belarusian, Bhojpuri, Breton, Bulgarian, Buryat, Chinese, Czech, Dutch, English, Faroese, Finnish, French, Frisian Dutch, Galician, German, Hindi, Italian, Kangri, Karelian, Latin, Latvian, Lithuanian, Maltese, Marathi, Norwegian, Old Church Slavonic, Old East Slavic, Old French, Persian, Polish, Portuguese, Romanian, Russian, Slovak, Spanish, Swedish, Swiss German, Tamil, Thai, Turkish German, Upper Sorbian, Vietnamese --> |
|
677 |
<value name="aux:pot">aux:pot</value> <!-- Komi Zyrian --> |
|
678 |
<value name="aux:q">aux:q</value> <!-- Erzya, Turkish, Turkish German --> |
|
679 |
<value name="aux:tense">aux:tense</value> <!-- French, Komi Zyrian, Skolt Sami --> |
|
680 |
<value name="aux">auxiliary</value> |
|
681 |
<value name="case:acc">case:acc</value> <!-- Hebrew --> |
|
682 |
<value name="case:adv">case:adv</value> <!-- Indonesian --> |
|
683 |
<value name="case:aff">case:aff</value> <!-- Beja --> |
|
684 |
<value name="case:det">preposition with determiner</value> <!-- Maltese, Old French --> |
|
685 |
<value name="case:gen">case:gen</value> <!-- Hebrew --> |
|
686 |
<value name="case:loc">case:loc</value> <!-- Armenian, Cantonese, Chinese, Western Armenian --> |
|
687 |
<value name="case:pred">case:pred</value> <!-- Welsh --> |
|
688 |
<value name="case:voc">case:voc</value> <!-- Irish, Scottish Gaelic --> |
|
689 |
<value name="case">case marking</value> |
|
690 |
<value name="cc:nc">cc:nc</value> <!-- Old French --> |
|
691 |
<value name="cc:nc">Coordinated conjunct : non coordonant</value> |
|
692 |
<value name="cc:preconj">cc:preconj</value> <!-- Arabic, English, Erzya, Estonian, Faroese, Finnish, German, Indonesian, Komi Permyak, Komi Zyrian, Moksha, North Sami, Persian, Polish, Portuguese, Romanian, Slovenian, Spanish, Thai, Turkish --> |
|
693 |
<value name="cc:preconj">preconjunct</value> |
|
694 |
<value name="cc">Coordinating conjunction</value> |
|
695 |
<value name="cc">coordinating conjunction</value> |
|
696 |
<value name="ccomp:cleft">ccomp:cleft</value> <!-- Polish --> |
|
697 |
<value name="ccomp:obj">ccomp:obj</value> <!-- Hungarian, Polish --> |
|
698 |
<value name="ccomp:obl">ccomp:obl</value> <!-- Hungarian --> |
|
699 |
<value name="ccomp:pmod">ccomp:pmod</value> <!-- Romanian --> |
|
700 |
<value name="ccomp:pred">ccomp:pred</value> <!-- Hungarian --> |
|
701 |
<value name="ccomp">clausal complement</value> |
|
702 |
<value name="clf">classifier</value> |
|
703 |
<value name="compound:a">compound:a</value> <!-- Indonesian --> |
|
704 |
<value name="compound:affix">compound:affix</value> <!-- Hebrew --> |
|
705 |
<value name="compound:dir">compound:dir</value> <!-- Cantonese, Chinese --> |
|
706 |
<value name="compound:ext">compound:ext</value> <!-- Cantonese, Chinese --> |
|
707 |
<value name="compound:lvc">compound:lvc</value> <!-- Armenian, Hindi, Kazakh, Khunsari, Korean, Kurmanji, Marathi, Nayini, Persian, Soi, Tamil, Telugu, Turkish, Turkish German, Uyghur, Western Armenian --> |
|
708 |
<value name="compound:lvc">light verb construction</value> |
|
709 |
<value name="compound:nn">compound:nn</value> <!-- Finnish, Livvi, North Sami --> |
|
710 |
<value name="compound:preverb">compound:preverb</value> <!-- Hungarian --> |
|
711 |
<value name="compound:prt">compound:prt</value> <!-- Afrikaans, Arabic, Danish, Dutch, English, Erzya, Estonian, Faroese, Finnish, Frisian Dutch, German, Icelandic, Irish, Karelian, Komi Permyak, Naija, Norwegian, Persian, Spanish, Swedish, Swedish Sign Language, Swiss German, Tamil, Thai, Turkish German, Wolof, Yoruba --> |
|
712 |
<value name="compound:prt">phrasal verb particle</value> |
|
713 |
<value name="compound:quant">compound:quant</value> <!-- Cantonese --> |
|
714 |
<value name="compound:redup">reduplicated compounds</value> <!-- Armenian, Bambara, Classical Chinese, Erzya, Hindi, Kurmanji, Marathi, Naija, Tagalog, Tamil, Telugu, Turkish, Turkish German, Uyghur, Welsh, Western Armenian --> |
|
715 |
<value name="compound:smixut">compound:smixut</value> <!-- Hebrew --> |
|
716 |
<value name="compound:svc">serial verb compounds</value> <!-- Amharic, Armenian, Marathi, Mbya Guarani, Naija, Swedish Sign Language, Telugu, Ukrainian, Western Armenian, Wolof, Yoruba --> |
|
717 |
<value name="compound:vo">compound:vo</value> <!-- Cantonese, Chinese --> |
|
718 |
<value name="compound:vv">compound:vv</value> <!-- Cantonese, Chinese --> |
|
719 |
<value name="compound">compound</value> |
|
720 |
<value name="conj:expl">conj:expl</value> <!-- Latin --> |
|
721 |
<value name="conj:extend">conj:extend</value> <!-- Slovenian --> |
|
722 |
<value name="conj:svc">conj:svc</value> <!-- Ukrainian --> |
|
723 |
<value name="conj">conjunct</value> |
|
724 |
<value name="cop:expl">cop:expl</value> <!-- Maltese --> |
|
725 |
<value name="cop:locat">cop:locat</value> <!-- Polish --> |
|
726 |
<value name="cop:own">cop:own</value> <!-- Finnish, Karelian, Livvi, Marathi --> |
|
727 |
<value name="cop">copula</value> |
|
728 |
<value name="csubj:cleft">csubj:cleft</value> <!-- Irish, Latin, Manx, Scottish Gaelic --> |
|
729 |
<value name="csubj:cop">csubj:cop</value> <!-- Erzya, Estonian, Finnish, Irish, Komi Zyrian, Livvi, Manx, Moksha, Scottish Gaelic, Turkish --> |
|
730 |
<value name="csubj:pass">clausal passive subject</value> <!-- Albanian, Amharic, Ancient Greek, Arabic, Armenian, Belarusian, Bulgarian, Catalan, Chinese, Classical Chinese, Czech, English, French, German, Gothic, Greek, Indonesian, Italian, Korean, Latin, Latvian, Lithuanian, Norwegian, Old Church Slavonic, Old East Slavic, Polish, Portuguese, Romanian, Russian, Sanskrit, Slovak, Spanish, Swedish, Western Armenian --> |
|
731 |
<value name="csubj">clausal subject</value> |
|
732 |
<value name="dep:aff">dep:aff</value> <!-- Beja --> |
|
733 |
<value name="dep:agr">dep:agr</value> <!-- Kiche --> |
|
734 |
<value name="dep:alt">dep:alt</value> <!-- Upper Sorbian --> |
|
735 |
<value name="dep:ana">dep:ana</value> <!-- Yupik --> |
|
736 |
<value name="dep:aux">dep:aux</value> <!-- Yupik --> |
|
737 |
<value name="dep:comp">dep:comp</value> <!-- Beja, French --> |
|
738 |
<value name="dep:conj">dep:conj</value> <!-- Beja --> |
|
739 |
<value name="dep:cop">dep:cop</value> <!-- Yupik --> |
|
740 |
<value name="dep:emo">dep:emo</value> <!-- Yupik --> |
|
741 |
<value name="dep:infl">dep:infl</value> <!-- Yupik --> |
|
742 |
<value name="dep:mark">dep:mark</value> <!-- Yupik --> |
|
743 |
<value name="dep:mod">dep:mod</value> <!-- Mbya Guarani --> |
|
744 |
<value name="dep:pos">dep:pos</value> <!-- Yupik --> |
|
745 |
<value name="dep:redup">dep:redup</value> <!-- Beja --> |
|
746 |
<value name="dep:ss">dep:ss</value> <!-- Kiche --> |
|
747 |
<value name="dep">unspecified dependency</value> |
|
748 |
<value name="det:adj">det:adj</value> <!-- Albanian --> |
|
749 |
<value name="det:noun">det:noun</value> <!-- Albanian --> |
|
750 |
<value name="det:numgov">pronominal quantifier governing the case of the noun</value> <!-- Czech, Polish, Serbian, Slovak, Ukrainian, Upper Sorbian --> |
|
751 |
<value name="det:nummod">pronominal quantifier agreeing in case with the noun</value> <!-- Czech, Polish, Ukrainian --> |
|
752 |
<value name="det:poss">possessive determiner</value> <!-- Akkadian, Armenian, German, Italian, Korean, Polish, Portuguese, Western Armenian --> |
|
753 |
<value name="det:predet">det:predet</value> <!-- English, Italian, Persian --> |
|
754 |
<value name="det:pron">det:pron</value> <!-- Albanian --> |
|
755 |
<value name="det:rel">det:rel</value> <!-- Bambara --> |
|
756 |
<value name="det">determiner</value> |
|
757 |
<value name="discourse:emo">discourse:emo</value> <!-- Irish, Italian, Polish --> |
|
758 |
<value name="discourse:filler">discourse:filler</value> <!-- Norwegian, Slovenian --> |
|
759 |
<value name="discourse:intj">discourse:intj</value> <!-- Polish --> |
|
760 |
<value name="discourse:sp">discourse:sp</value> <!-- Cantonese, Chinese, Classical Chinese --> |
|
761 |
<value name="discourse">discourse element</value> |
|
762 |
<value name="dislocated:cleft">dislocated:cleft</value> <!-- Mbya Guarani --> |
|
763 |
<value name="dislocated:csubj">dislocated:csubj</value> <!-- Latin --> |
|
764 |
<value name="dislocated:nsubj">dislocated:nsubj</value> <!-- Latin --> |
|
765 |
<value name="dislocated:obj">dislocated:obj</value> <!-- Latin --> |
|
766 |
<value name="dislocated:subj">dislocated:subj</value> <!-- Beja --> |
|
767 |
<value name="dislocated">dislocated elements</value> |
|
768 |
<value name="expl:comp">expl:comp</value> <!-- French --> |
|
769 |
<value name="expl:impers">impersonal expletive</value> <!-- Italian, Polish, Romanian, Spanish --> |
|
770 |
<value name="expl:pass">reflexive pronoun used in reflexive passive</value> <!-- Catalan, Czech, French, Italian, Latin, Portuguese, Romanian, Slovak, Spanish, Upper Sorbian --> |
|
771 |
<value name="expl:poss">expl:poss</value> <!-- Romanian --> |
|
772 |
<value name="expl:pv">reflexive clitic with an inherently reflexive verb</value> <!-- Czech, Dutch, German, Old East Slavic, Polish, Portuguese, Romanian, Slovak, Spanish, Turkish German, Upper Sorbian --> |
|
773 |
<value name="expl:subj">expl:subj</value> <!-- French, Naija --> |
|
774 |
<value name="expl">expletive</value> |
|
775 |
<value name="fixed">fixed multiword expression</value> |
|
776 |
<value name="flat:abs">flat:abs</value> <!-- Ukrainian --> |
|
777 |
<value name="flat:dist">flat:dist</value> <!-- Western Armenian --> |
|
778 |
<value name="flat:foreign">foreign words</value> <!-- Arabic, Belarusian, Buryat, Chinese, Chukchi, Croatian, Czech, English, Estonian, Faroese, Finnish, French, Galician, Icelandic, Indonesian, Irish, Italian, Komi Zyrian, Latin, Latvian, Lithuanian, Manx, Naija, Norwegian, Persian, Polish, Portuguese, Russian, Scottish Gaelic, Slovak, Slovenian, South Levantine Arabic, Ukrainian, Upper Sorbian --> |
|
779 |
<value name="flat:name">names</value> <!-- Ancient Greek, Belarusian, Breton, Chinese, Chukchi, Erzya, Faroese, Finnish, French, Frisian Dutch, Galician, German, Gothic, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Irish, Italian, Karelian, Kazakh, Komi Permyak, Komi Zyrian, Korean, Latin, Latvian, Livvi, Maltese, Moksha, Norwegian, Old Church Slavonic, Old East Slavic, Persian, Portuguese, Russian, Scottish Gaelic, Skolt Sami, Slovenian, Spanish, Swedish, Thai, Ukrainian, Welsh, Western Armenian --> |
|
780 |
<value name="flat:num">flat:num</value> <!-- Komi Zyrian, Persian --> |
|
781 |
<value name="flat:range">flat:range</value> <!-- Ukrainian, Western Armenian --> |
|
782 |
<value name="flat:repeat">flat:repeat</value> <!-- Ukrainian --> |
|
783 |
<value name="flat:sibl">flat:sibl</value> <!-- Ukrainian --> |
|
784 |
<value name="flat:title">flat:title</value> <!-- Ukrainian --> |
|
785 |
<value name="flat:vv">flat:vv</value> <!-- Classical Chinese --> |
|
786 |
<value name="flat">name multiword expression</value> |
|
787 |
<value name="goeswith">goes with</value> |
|
788 |
<value name="iobj:agent">iobj:agent</value> <!-- Armenian, French, Western Armenian --> |
|
789 |
<value name="iobj:appl">iobj:appl</value> <!-- Wolof --> |
|
790 |
<value name="iobj:patient">iobj:patient</value> <!-- Tagalog --> |
|
791 |
<value name="iobj">indirect object</value> |
|
792 |
<value name="list">list</value> |
|
793 |
<value name="mark:adv">mark:adv</value> <!-- Cantonese, Chinese --> |
|
794 |
<value name="mark:advmod">adverbial modifier confusable with a subordination marker</value> <!-- Old French --> |
|
795 |
<value name="mark:aff">mark:aff</value> <!-- Beja --> |
|
796 |
<value name="mark:obj">marker + object</value> <!--Old French, no doc --> |
|
797 |
<value name="mark:obl">marker + oblique nominal</value> <!--Old French, no doc --> |
|
798 |
<value name="mark:prt">mark:prt</value> <!-- Chinese, Irish, Scottish Gaelic --> |
|
799 |
<value name="mark:q">mark:q</value> <!-- Hebrew --> |
|
800 |
<value name="mark:rel">mark:rel</value> <!-- Cantonese, Chinese --> |
|
801 |
<value name="mark">marker</value> |
|
802 |
<value name="nmod:agent">nmod:agent</value> <!-- Welsh --> |
|
803 |
<value name="nmod:appos">nmod:appos</value> <!-- French, Komi Zyrian, Moksha --> |
|
804 |
<value name="nmod:arg">nmod:arg</value> <!-- Polish, Yupik --> |
|
805 |
<value name="nmod:att">nmod:att</value> <!-- Hungarian --> |
|
806 |
<value name="nmod:attlvc">nmod:attlvc</value> <!-- Hungarian --> |
|
807 |
<value name="nmod:attr">nmod:attr</value> <!-- Chukchi --> |
|
808 |
<value name="nmod:bahuv">nmod:bahuv</value> <!-- Moksha --> |
|
809 |
<value name="nmod:cau">nmod:cau</value> <!-- Uyghur --> |
|
810 |
<value name="nmod:comp">nmod:comp</value> <!-- Erzya, Komi Zyrian, Moksha, Turkish, Uyghur --> |
|
811 |
<value name="nmod:flat">nmod:flat</value> <!-- Polish --> |
|
812 |
<value name="nmod:gen">nmod:gen</value> <!-- Breton --> |
|
813 |
<value name="nmod:gobj">nmod:gobj</value> <!-- Erzya, Finnish --> |
|
814 |
<value name="nmod:gsubj">nmod:gsubj</value> <!-- Erzya, Finnish, Karelian --> |
|
815 |
<value name="nmod:lfrom">nmod:lfrom</value> <!-- Komi Zyrian --> |
|
816 |
<value name="nmod:lmod">nmod:lmod</value> <!-- Erzya, Indonesian, Komi Permyak, Komi Zyrian, Moksha --> |
|
817 |
<value name="nmod:npmod">nmod:npmod</value> <!-- Armenian, English, Western Armenian --> |
|
818 |
<value name="nmod:obj">nmod:obj</value> <!-- Komi Zyrian --> |
|
819 |
<value name="nmod:obl">nmod:obl</value> <!-- Hungarian --> |
|
820 |
<value name="nmod:part">nmod:part</value> <!-- Turkish, Uyghur --> |
|
821 |
<value name="nmod:poss">possessive nominal modifier</value> <!-- Akkadian, Albanian, Apurina, Arabic, Armenian, Assyrian, Bambara, Beja, Breton, Chukchi, Danish, Dutch, English, Erzya, Faroese, Finnish, Frisian Dutch, German, Hebrew, Hindi, Icelandic, Indonesian, Irish, Karelian, Kazakh, Khunsari, Komi Permyak, Komi Zyrian, Korean, Kurmanji, Latin, Livvi, Maltese, Manx, Marathi, Moksha, Naija, Nayini, North Sami, Persian, Polish, Sanskrit, Scottish Gaelic, Skolt Sami, Soi, South Levantine Arabic, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Telugu, Thai, Turkish, Uyghur, Warlpiri, Welsh, Western Armenian, Wolof --> |
|
822 |
<value name="nmod:pred">nmod:pred</value> <!-- Polish --> |
|
823 |
<value name="nmod:prp">nmod:prp</value> <!-- Komi Zyrian --> |
|
824 |
<value name="nmod:redup">nmod:redup</value> <!-- Welsh --> |
|
825 |
<value name="nmod:relat">nmod:relat</value> <!-- Chukchi --> |
|
826 |
<value name="nmod:subj">nmod:subj</value> <!-- Komi Zyrian --> |
|
827 |
<value name="nmod:tmod">temporal modifier</value> <!-- Chinese, English, Indonesian, Moksha, Romanian, Telugu, Uyghur --> |
|
828 |
<value name="nmod">nominal modifier</value> |
|
829 |
<value name="nsubj:advmod">fused subject pronoun and adverb</value> <!-- Old French --> |
|
830 |
<value name="nsubj:aff">nsubj:aff</value> <!-- Beja --> |
|
831 |
<value name="nsubj:bfoc">nsubj:bfoc</value> <!-- Tagalog --> |
|
832 |
<value name="nsubj:caus">nsubj:caus</value> <!-- Armenian, French, Western Armenian --> |
|
833 |
<value name="nsubj:cleft">nsubj:cleft</value> <!-- Latin --> |
|
834 |
<value name="nsubj:cop">nsubj:cop</value> <!-- Apurina, Breton, Erzya, Estonian, Finnish, Hebrew, Karelian, Komi Permyak, Komi Zyrian, Livvi, Moksha, Sanskrit, Skolt Sami, Turkish --> |
|
835 |
<value name="nsubj:ifoc">nsubj:ifoc</value> <!-- Tagalog --> |
|
836 |
<value name="nsubj:lfoc">nsubj:lfoc</value> <!-- Tagalog --> |
|
837 |
<value name="nsubj:lvc">nsubj:lvc</value> <!-- Hungarian --> |
|
838 |
<value name="nsubj:nc">nsubj:nc</value> <!-- Persian, Tamil, Telugu --> |
|
839 |
<value name="nsubj:obj">fused subject and object pronoun</value> <!-- Old French --> |
|
840 |
<value name="nsubj:pass">passive nominal subject</value> <!-- Afrikaans, Amharic, Ancient Greek, Arabic, Armenian, Assyrian, Belarusian, Bulgarian, Buryat, Cantonese, Catalan, Chinese, Classical Chinese, Czech, Dutch, English, Faroese, French, Frisian Dutch, Galician, German, Gothic, Greek, Hindi, Indonesian, Italian, Korean, Latin, Latvian, Lithuanian, Maltese, Marathi, Moksha, Norwegian, Old Church Slavonic, Old East Slavic, Persian, Polish, Portuguese, Romanian, Russian, Sanskrit, Skolt Sami, Slovak, Spanish, Swedish, Swiss German, Tagalog, Tamil, Thai, Turkish German, Upper Sorbian, Western Armenian --> |
|
841 |
<value name="nsubj:periph">nsubj:periph</value> <!-- Cantonese --> |
|
842 |
<value name="nsubj">Nominal subject</value> |
|
843 |
<value name="nummod:det">nummod:det</value> <!-- Beja --> |
|
844 |
<value name="nummod:entity">numeric modifier governed by a noun</value> <!-- Russian --> |
|
845 |
<value name="nummod:flat">nummod:flat</value> <!-- Polish --> |
|
846 |
<value name="nummod:gov">numeric modifier governing the case of the noun</value> <!-- Belarusian, Czech, Lithuanian, Old East Slavic, Polish, Russian, Sanskrit, Serbian, Ukrainian, Upper Sorbian --> |
|
847 |
<value name="nummod">numeric modifier</value> |
|
848 |
<value name="obj:advmod">fused adverb and object pronoun</value> <!-- Old French --> |
|
849 |
<value name="obj:advneg">fused negation and object pronoun</value> <!-- no doc for advneg --> |
|
850 |
<value name="obj:agent">obj:agent</value> <!-- Apurina, French, Tagalog --> |
|
851 |
<value name="obj:appl">obj:appl</value> <!-- Wolof --> |
|
852 |
<value name="obj:caus">obj:caus</value> <!-- Wolof --> |
|
853 |
<value name="obj:lvc">obj:lvc</value> <!-- French, Hungarian, Naija --> |
|
854 |
<value name="obj:obl">fused oblique and object pronoun</value> <!-- Old French --> |
|
855 |
<value name="obj:periph">obj:periph</value> <!-- Cantonese, Chinese --> |
|
856 |
<value name="obj">object</value> |
|
857 |
<value name="obl:advmod">adverbial modifier confusable with an oblique dependent</value> <!-- Old French --> |
|
858 |
<value name="obl:agent">agent modifier</value> <!-- Ancient Greek, Armenian, Belarusian, Breton, Cantonese, Chinese, Czech, Dutch, Erzya, French, German, Gothic, Greek, Hindi, Indonesian, Italian, Komi Zyrian, Latin, Lithuanian, Maltese, Moksha, Naija, Old Church Slavonic, Old East Slavic, Polish, Portuguese, Romanian, Russian, Sanskrit, Skolt Sami, Swedish, Tamil, Turkish, Welsh, Western Armenian --> |
|
859 |
<value name="obl:appl">obl:appl</value> <!-- Wolof --> |
|
860 |
<value name="obl:arg">oblique argument</value> <!-- Arabic, Beja, Czech, French, German, Greek, Icelandic, Latin, Lithuanian, Maltese, Naija, Persian, Polish, Sanskrit, Slovak, South Levantine Arabic, Tamil --> |
|
861 |
<value name="obl:cau">obl:cau</value> <!-- Erzya, Komi Zyrian, Moksha, Telugu --> |
|
862 |
<value name="obl:cmp">obl:cmp</value> <!-- Telugu --> |
|
863 |
<value name="obl:cmpr">obl:cmpr</value> <!-- Latin, Polish, Tamil --> |
|
864 |
<value name="obl:comp">obl:comp</value> <!-- Moksha --> |
|
865 |
<value name="obl:dat">obl:dat</value> <!-- Kurmanji --> |
|
866 |
<value name="obl:freq">obl:freq</value> <!-- Moksha --> |
|
867 |
<value name="obl:inst">obl:inst</value> <!-- Erzya, Moksha, Tamil --> |
|
868 |
<value name="obl:lfrom">obl:lfrom</value> <!-- Erzya, Komi Zyrian, Moksha --> |
|
869 |
<value name="obl:lmod">locative modifier</value> <!-- Apurina, Classical Chinese, Danish, Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami, Tamil --> |
|
870 |
<value name="obl:lmp">obl:lmp</value> <!-- Erzya, Komi Zyrian, Moksha --> |
|
871 |
<value name="obl:lto">obl:lto</value> <!-- Erzya, Komi Zyrian, Moksha --> |
|
872 |
<value name="obl:lvc">obl:lvc</value> <!-- Hungarian --> |
|
873 |
<value name="obl:mcl">obl:mcl</value> <!-- Komi Zyrian --> |
|
874 |
<value name="obl:mod"> oblique modifier</value> <!-- Beja, French, Naija, Yupik --> |
|
875 |
<value name="obl:npmod">obl:npmod</value> <!-- Coptic, English --> |
|
876 |
<value name="obl:orphan">obl:orphan</value> <!-- Polish --> |
|
877 |
<value name="obl:own">obl:own</value> <!-- Kazakh --> |
|
878 |
<value name="obl:patient">obl:patient</value> <!-- Cantonese, Chinese --> |
|
879 |
<value name="obl:pmod">obl:pmod</value> <!-- Romanian, Tamil --> |
|
880 |
<value name="obl:poss">obl:poss</value> <!-- Thai --> |
|
881 |
<value name="obl:prep">obl:prep</value> <!-- Irish --> |
|
882 |
<value name="obl:sentcon">obl:sentcon</value> <!-- Mbya Guarani --> |
|
883 |
<value name="obl:smod">obl:smod</value> <!-- Scottish Gaelic --> |
|
884 |
<value name="obl:tmod">obl:tmod</value> <!-- Apurina, Arabic, Cantonese, Chinese, Classical Chinese, Danish, English, Erzya, Frisian Dutch, German, Hindi, Indonesian, Irish, Italian, Komi Permyak, Komi Zyrian, Korean, Manx, Moksha, Portuguese, Scottish Gaelic, Skolt Sami, Spanish, Tamil, Telugu, Thai, Turkish, Uyghur, Warlpiri --> |
|
885 |
<value name="obl:tmod">temporal modifier</value> |
|
886 |
<value name="obl">oblique nominal</value> |
|
887 |
<value name="orphan:missing">textual gap in the source</value> <!-- Latin --> |
|
888 |
<value name="orphan">remnant in ellipsis</value> |
|
889 |
<value name="parataxis:appos">parataxis:appos</value> <!-- Italian --> |
|
890 |
<value name="parataxis:conj">parataxis:conj</value> <!-- Naija --> |
|
891 |
<value name="parataxis:coord">parataxis:coord</value> <!-- Beja --> |
|
892 |
<value name="parataxis:deletion">parataxis:deletion</value> <!-- Norwegian --> |
|
893 |
<value name="parataxis:discourse">parataxis:discourse</value> <!-- Italian, Naija, Slovenian, Turkish German, Ukrainian --> |
|
894 |
<value name="parataxis:dislocated">parataxis:dislocated</value> <!-- Naija --> |
|
895 |
<value name="parataxis:hashtag">parataxis:hashtag</value> <!-- Irish, Italian --> |
|
896 |
<value name="parataxis:insert">parataxis:insert</value> <!-- French, Italian, Polish --> |
|
897 |
<value name="parataxis:mod">parataxis:mod</value> <!-- Beja --> |
|
898 |
<value name="parataxis:newsent">parataxis:newsent</value> <!-- Ukrainian --> |
|
899 |
<value name="parataxis:nsubj">parataxis:nsubj</value> <!-- Italian --> |
|
900 |
<value name="parataxis:obj">parataxis:obj</value> <!-- Bambara, Italian, Polish --> |
|
901 |
<value name="parataxis:parenth">parataxis:parenth</value> <!-- French, Naija --> |
|
902 |
<value name="parataxis:rel">parataxis:rel</value> <!-- Ukrainian --> |
|
903 |
<value name="parataxis:rep">parataxis:rep</value> <!-- Chukchi, Latin, Mbya Guarani --> |
|
904 |
<value name="parataxis:restart">parataxis:restart</value> <!-- Slovenian --> |
|
905 |
<value name="parataxis:rt">parataxis:rt</value> <!-- Irish --> |
|
906 |
<value name="parataxis:sentence">parataxis:sentence</value> <!-- Irish --> |
|
907 |
<value name="parataxis:trans">parataxis:trans</value> <!-- Turkish German --> |
|
908 |
<value name="parataxis:url">parataxis:url</value> <!-- Irish --> |
|
909 |
<value name="parataxis">parataxis</value> |
|
910 |
<value name="punct">punctuation</value> |
|
911 |
<value name="remnant">Remnant ?</value> <!-- no doc, replace with orphan? --> |
|
912 |
<value name="reparandum">overridden disfluency</value> |
|
913 |
<value name="root">root</value> |
|
914 |
<value name="vocative:cl">vocative:cl</value> <!-- Ukrainian --> |
|
915 |
<value name="vocative:mention">vocative:mention</value> <!-- Irish, Italian --> |
|
916 |
<value name="vocative">vocative</value> |
|
917 |
<value name="xcomp:cleft">xcomp:cleft</value> <!-- Polish --> |
|
918 |
<value name="xcomp:ds">xcomp:ds</value> <!-- Erzya, Finnish, Karelian, Komi Permyak, Livvi --> |
|
919 |
<value name="xcomp:obj">xcomp:obj</value> <!-- North Sami, Polish --> |
|
920 |
<value name="xcomp:pred">xcomp:pred</value> <!-- Irish, Latin, Manx, North Sami, Polish, Scottish Gaelic --> |
|
921 |
<value name="xcomp:sp">xcomp:sp</value> <!-- Ukrainian --> |
|
922 |
<value name="xcomp:subj">xcomp:subj</value> <!-- Polish --> |
|
923 |
<value name="xcomp">open clausal complement</value> |
|
924 |
</feature> |
|
925 |
<feature name="coord" domain="NT" ></feature> |
|
926 |
<feature name="dom" domain="NT" ></feature> |
|
927 |
<feature name="type" domain="NT" > |
|
928 |
<value name="nV">élément non-verbal</value> |
|
929 |
<value name="VFin">finite verb</value> |
|
930 |
<value name="VInf">infinitive</value> |
|
931 |
<value name="VPar">participle</value> |
|
932 |
<value name="--">nil</value> |
|
933 |
</feature> |
|
934 |
<feature name="vform" domain="NT"></feature> |
|
935 |
<feature name="vlemma" domain="NT"></feature> |
|
936 |
<feature name="note" domain="NT"></feature> |
|
937 |
<feature name="snr" domain="NT"></feature> |
|
938 |
"""; |
|
939 |
|
|
940 |
|
|
941 |
|
|
942 |
printf MASTER "$nt_features_header"; |
|
943 |
|
|
944 |
printf MASTER """ |
|
945 |
<edgelabel> |
|
946 |
<value name="D">dependency</value> |
|
947 |
<value name="L">lexical</value> |
|
948 |
<value name="R">relator</value> |
|
949 |
<value name="*">not bound</value> |
|
950 |
</edgelabel> |
|
951 |
<secedgelabel> |
|
952 |
<value name="cluster">between elements of GpCoo</value> |
|
953 |
<value name="coord">between members of Coo</value> |
|
954 |
<value name="dupl">between duplicated nodes</value> |
|
955 |
</secedgelabel> |
|
956 |
</annotation> |
|
957 |
</head> |
|
958 |
<body> |
|
959 |
"""; |
|
960 |
} |
|
961 |
|
|
962 |
// <value name="M">main</value> |
|
963 |
// <value name="P">part</value> |
|
964 |
|
|
965 |
|
|
966 |
// Closes the master TigerXML document by appending the closing </body> and
// </corpus> tags to MASTER.
//
// The declaration now carries an (empty) parameter list; a bare
// `def name { ... }` is Perl-style and does not parse as a Groovy method.
// NOTE(review): MASTER is not defined in this method; it is assumed to be a
// script-visible writer/stream that supports `<<` - confirm where it is opened.
def write_master_footer() {
    MASTER << "</body>\n</corpus>\n"
}
|
971 |
|
|
972 |
|
|
973 |
// NOTE(review): additional NT feature declarations parked in TEMP; nothing in this part of the file reads TEMP - confirm whether it is still needed.
TEMP=""" |

974 |
<feature name="nodom" domain="NT" ></feature> |

975 |
<feature name="headpos" domain="NT" ></feature> |

976 |
<feature name="annotationFile" domain="NT" ></feature> |

977 |
<feature name="annotationUri" domain="NT" ></feature> |

978 |
"""; |
|
979 |
|
|
980 |
|
|
981 |
// Fills the abbrev2cat lookup table: dependency relation label -> readable
// description.
//
// The Perl hash subscripts (abbrev2cat{"key"} = value) were replaced by a
// Groovy map literal; in Groovy the brace form parses as a method call with a
// closure argument, not as a map write.
// NOTE(review): abbrev2cat is not created here; it is assumed to be an existing,
// script-visible map (binding variable or @Field) - confirm where it is set up.
def define_cat_hashes() {
    // Earlier category set, kept commented out for reference:
    // abbrev2cat{"Apst"} = "Apostrophe";
    // abbrev2cat{"AtObj"} = "AttributObjet";
    // abbrev2cat{"AtRfc"} = "AttributReflechi";
    // abbrev2cat{"AtSj"} = "AttributSujet";
    // abbrev2cat{"AuxA"} = "Auxilie-Actif";
    // abbrev2cat{"AuxP"} = "Auxilie-Passif";
    // abbrev2cat{"Circ"} = "Circonstant";
    // abbrev2cat{"Cmpl"} = "Complement";
    // abbrev2cat{"GpCoo"} = "Coordonne";
    // abbrev2cat{"Coo"} = "Coordination";
    // abbrev2cat{"Det"} = "Determinant";
    // abbrev2cat{"NgPrt"} = "Forclusif";
    // abbrev2cat{"Insrt"} = "Incidente";
    // abbrev2cat{"Intj"} = "Interjection";
    // abbrev2cat{"ModA"} = "ModifieurAttache";
    // abbrev2cat{"ModD"} = "ModifieurDetache";
    // abbrev2cat{"Ng"} = "Negation";
    // abbrev2cat{"VInf"} = "NoeudVerbal-Infinitif";
    // abbrev2cat{"VPrt"} = "NoeudVerbal-Participe"; #?
    // abbrev2cat{"VFin"} = "NoeudVerbal-Personnel";
    // abbrev2cat{"nSnt"} = "NonPhrase";
    // abbrev2cat{"Obj"} = "Objet";
    // abbrev2cat{"Snt"} = "Phrase";
    // abbrev2cat{"Pon"} = "Ponctuation";
    // abbrev2cat{"Rfc"} = "Reflechi";
    // abbrev2cat{"Rfx"} = "ReflexifRenforce";
    // abbrev2cat{"RelC"} = "Relateur-Coordonnant";
    // abbrev2cat{"RelNC"} = "Relateur-NonCoordonnant";
    // abbrev2cat{"nMax"} = "StructureNonMaximale";
    // abbrev2cat{"SjImp"} = "SujetImpersonnel";
    // abbrev2cat{"SjPer"} = "SujetPersonnel";
    // abbrev2cat{"Lac"} = "Lacune";
    // abbrev2cat{"Aux"} = "Auxilie";
    // abbrev2cat{"Regim"} = "Regime";

    abbrev2cat.putAll([
        "acl"       : "Clausal modifier of noun",
        "advcl"     : "Adverbial clause modifier",
        "advmod"    : "Adverbial modifier",
        "amod"      : "Adjectival modifier",
        "appos"     : "Appositional modifier",
        "aux"       : "Auxiliary",
        "cc-nc"     : "Coordinated conjunct : non coordonant",
        "cc"        : "Coordinating conjunction",
        "ccomp"     : "Clausal complement",
        "conj"      : "Conjunct",
        "cop"       : "Copula",
        "csubj"     : "Clausal subject",
        "det"       : "Determiner",
        "dislocated": "Dislocated elements",
        "expl"      : "Expletive",
        "iobj"      : "Indirect object",
        "mark"      : "Marker",
        "nmod"      : "Nominal modifier",
        "nsubj"     : "Nominal subject",
        "nummod"    : "Numeric modifier",
        "obj"       : "Object",
        // NOTE(review): the leading space is reproduced from the original value - probably a typo, confirm.
        "obl"       : " Oblique nominal",
        "orphan"    : "Remnant in ellipsis",
        "remnant"   : "Remnant ?",
        "vocative"  : "Vocative",
        "xcomp"     : "Open clausal complement"
    ])
}
|
1045 |
|
|
1046 |
// Prints every entry of `words` to standard output, one entry per line.
//
// Changes against the half-converted original:
//  - the method now has a parameter list, so it parses as Groovy;
//  - the loop no longer runs to index words.size() (one past the last entry);
//  - each entry is interpolated on its own; "$words[$q]" inside a GString
//    expands the whole list followed by a literal "[<q>]".
// NOTE(review): `words` is not defined in this method; it is assumed to be a
// script-visible list - confirm where it is filled.
def print_sentence() {
    for (entry in words) {
        print "${entry}\n"
    }
}
|
1051 |
|
|
1052 |
// Writes one <nt> element to XML for the current word `w`: the opening tag with
// its attributes, the lexical edge (label "L") to the terminal, an optional
// "dupl" secondary edge, the edges to the entries of `daughters`, and the
// closing tag.
//
// NOTE(review): this body is still largely Perl and does not parse as Groovy:
//   - the declaration has no parameter list, while the body reads its arguments
//     through Perl's @_ ($_[0] is written as the cat attribute and matched
//     against thisrootname; $_[1] is matched against /dupl/);
//   - `$.` (Perl's input line counter) supplies the first number of every id and
//     the snr attribute; no Groovy replacement is visible in this chunk;
//   - `printf XML ...`, `!~` and brace subscripts such as duplicates{...} are
//     Perl syntax.
// dom, type, vform, vlemma, w, daughters and duplicates are read here but not
// set here.
def write_nonterminals { |

1053 |
def print_nt_features; |

1054 |
// suffix appended to the ids when the second argument marks a duplicate node
if ($_[1] =~ /dupl/) { |

1055 |
dupl = '_dupl'; |

1056 |
} else { |

1057 |
dupl = ''; |

1058 |
} |

1059 |
// extra attributes (option -x): the empty variant is used unless the category matches the root name
if (nt_features != '') { |

1060 |
print_nt_features = nt_empty_features; |

1061 |
if ($_[0] =~ /$thisrootname/) { |

1062 |
print_nt_features = nt_features; |

1063 |
} |

1064 |
} |

1065 |
printf XML " <nt id=\"n%d_%d%s\" cat=\"%s\" coord=\"--\" dom=\"%s\" type=\"%s\" vform=\"%s\" vlemma=\"%s\"%s note=\"%s\" snr=\"%d\">\n", $., w, dupl, $_[0], dom, type, vform, vlemma, print_nt_features, notes("$._$w"), $.; |

1066 |
printf XML " <edge idref=\"s%d_%d%s\" label=\"L\"/>\n", $., w, dupl; |

1067 |
// link duplicate with primary original node |

1068 |
if ($_[1] =~ /dupl/) { |

1069 |
printf XML " <secedge idref=\"s%d_%d\" label=\"dupl\"/>\n", $., w; |

1070 |
} |

1071 |
// if node is not a duplicate: attach all the daughters |

1072 |
if ($_[1] !~ /dupl/) { |

1073 |
// NOTE(review): `<=` together with size() also visits the index one past the last daughter - confirm against the Perl original.
for (def d = 0; d <= daughters.size(); d++) { |

1074 |
daughter = daughters[d]; |

1075 |
if ("$._$w" != "$._$daughter") { // avoid cycles |

1076 |
printf XML " <edge idref=\"n%d_%d%s\" label=\"%s\"/>\n", $., daughter, dupl, edge_label("$._$daughter"); |

1077 |
} |

1078 |
// check if a duplicate of this node must be attached |

1079 |
if (duplicates{"$._$daughter"} == 1) { |

1080 |
// NOTE(review): "$daughter_dupl" interpolates a variable named daughter_dupl, not daughter followed by "_dupl" - confirm which key edge_label should receive.
printf XML " <edge idref=\"n%d_%d_dupl\" label=\"%s\"/>\n", $., daughter, edge_label("$._$daughter_dupl"); |

1081 |
} |

1082 |
} |

1083 |
} |

1084 |
XML << " </nt>\n"; |

1085 |
} |
|
1086 |
|
|
1087 |
// checks whether the node is stored in the hash of relators |
|
1088 |
// Returns the edge label for a node key: 'R' when the key is flagged with 1 in
// `relators`, otherwise the default 'D'.
//
// node - key of the node; callers in this file build it as "<sentence>_<word>".
//
// The Perl argument access ($_[0]) and hash subscript (relators{...}) were
// replaced by a named parameter and a map lookup. The key is converted to a
// String first because a GString key does not match an equal String key in a
// map, and the flag is compared as text so that 1 and "1" both count.
// NOTE(review): `relators` is not defined in this method; it is assumed to be a
// script-visible map keyed by String - confirm how it is declared and filled.
def edge_label(node) {
    def flag = relators[node as String]
    return (flag as String) == '1' ? 'R' : 'D'
}
|
1094 |
|
|
1095 |
// retrieves notes for this node |
|
1096 |
// Returns the note stored for a node key, or '--' when the key has no note or
// the note is empty.
//
// node - key of the node; callers in this file build it as "<sentence>_<word>".
//
// The Perl argument access ($_[0]) and hash subscript (notes{...}) were replaced
// by a named parameter and a map lookup. A missing entry is treated like an
// empty one, so the text "null" is never returned. The key is converted to a
// String first because a GString key does not match an equal String key.
// NOTE(review): this method shares its name with the `notes` map it reads; the
// map is assumed to be script-visible (binding variable or @Field) - confirm,
// and consider renaming one of the two.
def notes(node) {
    def stored = notes[node as String]
    if (stored != null && stored.toString() != '') {
        return stored.toString()
    }
    return '--'
}
|
1102 |
|
|
1103 |
// conversions, bug fixes |
|
1104 |
// Normalises the fields of the current token in place before they are written
// out; some conversions are required, others are for convenience.
//
// Reads and reassigns: word, pos, mor, ppos, pmor, lemma, plemma, cat.
// Reads: thisrootname. Returns nothing.
//
// The Perl substitutions (x =~ s/a/b/g) were replaced by String calls whose
// result is assigned back. A field that is null is left as null.
// NOTE(review): all of these names are assumed to be script-visible variables
// (binding or @Field) that have been set before the call - confirm.
def clean_data() {
    // literal replace-all that tolerates null
    def swap = { value, String from, String to ->
        value == null ? null : value.toString().replace(from, to)
    }
    // shared clean-up for lemma and plemma
    def scrubLemma = { value ->
        if (value == null) {
            return null
        }
        String text = value.toString()
        text = text.replace('|', '_')
        text = text.replaceAll(/[<>]/, '')
        text = text.replace('"', "'")
        return text.replace('&', '(and)')
    }

    if (word != null) {
        String text = word.toString()
        text = text.replace('"', "'")
        text = text.replace('&', '(and)')
        // doubled angle brackets become guillemets before single ones are dropped
        text = text.replace('<<', '«')
        text = text.replace('>>', '»')
        word = text.replaceAll(/[<>]/, '')
    }

    pos = swap(pos, ':', '_')
    mor = swap(mor, '|', '_')
    ppos = swap(ppos, ':', '_')
    pmor = swap(pmor, '|', '_')
    lemma = scrubLemma(lemma)
    plemma = scrubLemma(plemma)

    // clean categories
    if (cat != null) {
        String label = cat.toString()
        // top node, for compatibility with SRCMF; only the first ROOT is renamed,
        // and the root name is inserted literally
        String rootText = (thisrootname ?: '').toString()
        label = label.replaceFirst('ROOT', java.util.regex.Matcher.quoteReplacement(rootText))
        // cat =~ s/Ponctuation/Pon/;
        // correct some bugs in parse
        // cat =~ s/Sujet/SjPer/;
        // cat =~ s/Modifieur/ModA/;
        // cat =~ s/Parenthese/Insrt/;
        label = label.replace('-', '_')
        // RelNC always 2nd node, for consistency in duplicates
        cat = label.replaceFirst(/RelNC_(.*)/, '$1_RelNC')
    }
    return
}
|
1135 |
|
|
1136 |
|
Formats disponibles : Unified diff