Révision 3346

TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/conll2tigerud2.groovy (revision 3346)
1

  
2
def CMD="conll2tiger.pl";
3
def VERSION = "1.5";
4
def MODIFIED = "8/12/2015";   // adapted for Perseus CoNLL produced with conll.pl -l.  CHECK: does SRCMF still work?
5

  
6
// columns (default are the predicted values in CoNLL 2009 format)
7
def coll = 2;  // lemma
8
def colm = 3;  // morph (pos)
9
def colf = 5;  // features
10
def colh = 6;  // head
11
def cold = 7; // deprel
12
def outdir = "conllexport"; // output directory for all generated files (option -o)
13
def split = 1000; // split output after nr sentences
14

  
15
// tree structure
16
def dominates  = [:];
17
def deprel  = [:];  // deprel{nr} = deprel
18
def daughters = []; // daughter nodes, stored in %dominates
19
def duplicates = [:];
20
def relators = [:];
21
def notes = [:]
22
def aux = [:]; // store nodes of duplicates, relators
23
def type = "--"; // node attribute
24
def vform = "--"
25
def vlemma = "--"; // node attributes for verbs store form and lemma
26
def label = "D"; // default edge label
27
def nt_features_header = ''; // option -x
28
def nt_features = ''; // option -x
29
def nt_empty_features = ''; // option -x
30
def scodes = []; // option -x
31
def add_to_sentcode = '';
32
def rootname = 'root';  // default
33
def featcol = 13;
34

  
35
//#####################################################################
36
//  conll2tiger.pl: converts CoNLL-U from the Universal Dependencies
37
//  treebanks to TigerXML
38
//    Achim Stein <achim.stein@ling.uni-stuttgart.de>
39
// License : GNU GPL v. 3 (see the LICENSE file)
40
//#####################################################################
41
// TO DO:
42
// - coordination
43
// - handling of overly long sentences that were split (conll.pl -r 100)
44
//   - for this, $wnr is used as the word ID instead of $w (the for-loop counter)
45
//   - but there are still unbound nodes when the governor has been removed (it is in the other part)
46
//#####################################################################
47
// Modifications by Alexei Lavrentiev <alexei.lavrentev@ens-lyon.fr>
48
// for Profiterole project (2019-2021)
49

  
50
// 2019-09-25
51
// - updated default column numbers for CoNLL-U SRCMF format
52
// - added processing for comment lines
53
// - added @textid to terminal nodes
54
// - deleted ppos, pmor et plemma (predicted tags and lemmas)
55
// - replaced specific SRCMF with standard UD tags
56

  
57
// Update 2020-05-13
58
// - added @editionId for synchronization with BFM word ID
59

  
60
// Update 2021-03-22
61
// - using $infilename for @textid
62
// - added support for .conllu extension
63

  
64
// Update 2021-03-29
65
// - added editionId to declarations in main.xml
66

  
67
// Update 2021-07-16
68
// - added "punct" to cat values
69

  
70
// Update 2021-07-20
71
// - added cat value list compiled from
72
//   https://universaldependencies.org/ext-dep-index.html and the previous
73
//   version. All relation types and subtypes from the UD 2.8 corpora
74
//   should be there.
75
// - contractions indexed
76
//#####################################################################
77

  
78
def HELP = """
79
==================================================================
80
$CMD $VERSION: Help
81
==================================================================
82
FUNKTION: converts CoNLL parser output to TigerXML (for mate tools) 
83
	   creates master file, splits input files, corrects unbound nodes
84
SYNTAX:   $CMD [Options] <CoNLL file>
85
OPTIONEN: 
86
	-c    ignore coordination (delete coordx- prefix in deprel) 
87
	-C str   corpus specials: nca 
88
	-h    show help 
89
	-o    create all files in this output directory (default: $outdir)
90
set COLUMNS for required info (0 = column 1, 1 = column 2, etc.) 
91
	-D nr    colum for deprel default=$cold 
92
	-H nr    colum for head default=$colh 
93
	-M nr    colum for morphology (POS) default=$colm 
94
	-F nr    colum for morph. features default=$colf 
95
	-R str   Root category (default: $rootname) 
96
	-s nr    split output files after each nr sentence (default = $split) 
97
	-x str,...  include these attributes if present in the -X column of the first word 
98
	   (the first code is also copied into the sentence id) 
99
	-X nr    the column where attributes are stored (default: $featcol)
100
EXAMPLE: 
101
	 - For mate parser output: no further options required 
102
	$CMD parsed.conll 
103
	 - For Le Monde 2005: include attributes 
104
	gunzip -c parsed.conll.gz | conll2tiger.pl -x date,artnr,rubr 
105
	 - For NCA: 
106
	conll2tiger.pl -C nca -x id,deaf,titreDees,editionDees,manuscritDees,regionDees,coefficientRegionDees,dateMoyenneDees,codeRegional,coefficientRegional,vers,ponctuation,mots,passage,commentairePhilologique,qualite,sourceQualite,commentaireForme,auteur,dateComposition,dateManuscrit,lieuComposition,lieuManuscrit,sourceDateComposition,sourceDateManuscrit,sourceLieuComposition,sourceLieuManuscrit,genre,traditionTextuelle,analyses,lignes,editionNCA tagged-oldfrench-lrec2014-dep.conll
107
""";
108

  
109
//##########################################################################
110
//     DO NOT MODIFY FOLLOWING CODE !
111
//##########################################################################
112

  
113

  
114
//##########################################################################
115
// parse the command line
116
//##########################################################################
117

  
118
getopts('c:C:hD:H:M:o:R:s:x:X:');
119

  
120
if (defined(opt_h)) {
121
	println "** " + "$HELP";
122
	return 0;
123
}
124

  
125
if (defined(opt_o)) {
126
	outdir = opt_o
127
}
128
if (defined(opt_C)) {
129
	corpus = opt_C;
130
}
131
if (defined(opt_D)) {
132
	cold = opt_D
133
}
134
if (defined(opt_H)) {
135
	colh = opt_H
136
}
137
if (defined(opt_M)) {
138
	colm = opt_M
139
}
140

  
141
if (defined(opt_R)) {
142
	rootname = opt_R;
143
}
144

  
145
if (defined(opt_s)) {
146
	split = opt_s
147
}
148

  
149
if (defined(opt_X)) {
150
	featcol = opt_X;
151
}
152

  
153
// Option -x: comma-separated list of attribute names. Each name gets a
// non-terminal <feature> declaration appended to nt_features_header.
if (defined(opt_x)) {
	scodes = opt_x.split(",");
	// Iterate over the elements themselves: the former index loop used
	// `i <= scodes.size()` and therefore read one element past the end.
	for (code in scodes) {
		nt_features_header = nt_features_header + "<feature name=\"${code}\" domain=\"NT\"></feature>\n";
	}
	// Avoid the reserved Tiger attribute "id". Like the Perl s/// without /g,
	// only the first occurrence is renamed; the result has to be assigned back.
	nt_features_header = nt_features_header.replaceFirst(/\bid\b/, "ncaid");
}
160

  
161

  
162
def colnames = ["url", "ID", "FORM", "LEMMA", "PLEMMA", "POS", "PPOS", "FEAT", "PFEAT", "HEAD", "PHEAD", "DEPREL", "PDEPREL"];
163
// def pos = [:]%lemma = %form = hDeprel = %head = %governs = %cat2abbrev = %abbrev2cat = %coordnr_cat = %coordnr_head = %w_coordnr = %w_head = %w_deprel = %first_coordnr = %coordnr_first = %verb_head = [:];
164
def coordelements = [];
165

  
166
// Per-token fields, all initialised to the empty string. Groovy has no chained
// `def a = def b = ...` form, so the variables are declared in one comma list.
// `deprel` is deliberately not declared again here: the name already refers to
// the map created in the tree-structure section above, and a second `def` in
// the same scope would both fail to compile and replace that map by a string.
def id = "", form = "", lemma = "", plemma = "", pos = "", ppos = "", feat = "", pfeat = "", head = "", phead = "", pdeprel = "", edition_id = "";
167

  
168
// Output of the external `date` command. Groovy has no backtick operator, so
// the command is started with String.execute(); trim() removes the trailing
// newline that chomp() used to strip.
def timestamp = "date".execute().text.trim();
170

  
171
def infile = ARGV[0];
172
infile =~ s/\.conllu?//i;
173
if (infile == '') { 
174
	 infile = 'subcorpus';
175
}
176
def counter = 1;
177
suffix = sprintf("%05d", counter);
178
infilename = basename(infile);
179

  
180
// Make sure the configured output directory exists before the output files
// are opened. The former shell snippet had lost its variable markers and
// tested/created a directory literally named "outdir".
// mkdirs() is a no-op (returning false) when the directory already exists.
foo = new File(outdir as String).mkdirs();
181
open(XML, ">$outdir/$infilename-$suffix.xml")  or die "\nopen file error of $outdir/$infilename-$suffix.xml\n";
182
open(LOG, ">$outdir/conversion.log")  or die "\nopen file error of conversion.log\n";
183
open(MASTER, ">$outdir/main.xml")  or die "\nopen file error of main.xml\n";
184
write_xml_header();
185
write_master_header();
186

  
187
// flush output for log and master file
188
select(LOG); $| = 1; 
189
select(MASTER); $| = 1;
190

  
191
commandline = $0 + " " + (join " ", @ARGV);
192
LOG << "$commandline\n\n";
193

  
194
MASTER << "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
195

  
196
$/ = ""; // treat empty line as RS
197
while (<>) { 
198
	 if ($. % split == 0) { 
199
	XML << "</subcorpus>\n"; 
200
	close(XML); 
201
	suffix = sprintf("%05d", ++counter); 
202
	open(XML, ">$outdir/$infilename-$suffix.xml")  or die "\nopen file error\n"; 
203
	write_xml_header(); 
204
	MASTER << "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n"; 
205
	 }
206
 
207
	 // ---------------------------------------- 
208
	 // set root (or fake root if ROOT is missing) 
209
	 // ---------------------------------------- 
210
	 rootnode = fakeroot = 0; // m = Treat string as multiple lines, so that ^ matches beginning of line 
211
	 thisrootname = rootname; 
212
	 (rootnode) = (/^(\d+?)\s.*?\b$rootname\b/m);  // real root marked by parser 
213
	 if (rootnode == 0) {
214
// (rootnode) = (/^(\d+?)\t.*?\t-1\t0\t/m);  // no marked ROOT, but top node (head = 0)   TOO SPECIFIC
215
// (rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);  // no marked ROOT, but top node (head = 0) in col9 (original by Achim Stein) 
216
	(rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);  // no marked ROOT, but top node (head = 0) in col7 (updated by AL) 
217
	LOG << " Warning sentence $.: not marked root ($rootname), using top node $rootnode\n"; 
218
	fakeroot = 1; 
219
	thisrootname = 'nSnt'; 
220
	 } 
221
	 if (rootnode == 0) { 
222
	rootnode = 1;   // set fake root if nothing goes 
223
	LOG << " Error sentence $.: setting fake root to first word:\n$_\n"; 
224
	fakeroot = 2; 
225
	thisrootname = 'Err'; 
226
	 }
227
 
228
	 def cols = []; 
229
	 @words = split (/\n/); 
230
	 @terminals = [];
231
 
232
	 %dominates = [:]; // empty at beginning of sentence 
233
	 hDeprel = [:]; // empty at beginning of sentence 
234
	 %aux = [:]; // empty at beginning of sentence 
235
	 daughters = [];
236
 
237
	 def commentlines = 0; #added by AL
238
//  def contractions = 0; #added by AL
239
//  def text_id = "unknown_text";
240
def text_id = infilename; 
241
	 def sent_id = "0";
242
 
243
	 // ---------------------------------------- 
244
	 // loop through words #1: write tokens (terminal nodes) to XML file 
245
	 // store tree relevant information for loop #2 
246
	 // ---------------------------------------- 
247
	 for (def w = 0; w <= words.size(); w++) {
248
// Added by AL for comment lines 
249
	if (words[w] =~ /^#/) {
250
	if (words[w] =~ /^# newdoc/) {
251
		text_id = words[w];
252
		text_id =~ s/# newdoc id = //;
253
	}
254
	elsif (words[w] =~ /^# sent_id/) {
255
		sent_id = words[w];
256
		sent_id =~ s/# sent_id = //;
257
	}
258
//	LOG << "Comment line loop 1: words[w]\n";
259
	commentlines++;
260
	next; 
261
	}
262
// Added by AL for contractions 
263
	elsif (words[w] =~ /^\d+-\d+/) {
264
//	LOG << "Contraction line loop 1: words[w]\n";
265
	commentlines++;
266
//	contractions++;
267
	next; 
268
	} 
269
	else { 
270
	if (defined (opt_c)) { 
271
	  words[w] =~ s/coord(\d+)-//g; 
272
	} 
273
	@cols = split (/\t/, words[w]); 
274
	wnr = cols[0]; 
275
	word = cols[1]; 
276
	lemma = cols[2]; 
277
	plemma = cols[2]; // predicted 
278
	pos = cols[3]; 
279
	ppos = cols[4]; // predicted 
280
	mor = cols[5]; 
281
	pmor = cols[5]; // predicted 
282
	cat = cols[cold]; 
283
	edition_id = cols[9]; 
284
	edition_id =~ s/^.*XmlId=([^|]+).*$/\1/g;
285
 
286
	if (cat =~ /[<>]/) { 
287
	  LOG << "Warning in line $.: illegal node name: \"$cat\" -> \"Err2\"\n"; 
288
	  cat = 'Err2'; 
289
	}
290
 
291
	// NCA: enclose lemmas in underscores (easier for regex construction) 
292
	if (corpus =~ /nca/i) { 
293
	  lemma = "_" + "$lemma" + "_" 
294
	}
295
 
296
	clean_data();
297
 
298
	// get attribute-value pairs from col #13 of first word (option -x) 
299
	if (opt_x == "all") {
300
	  cols[featcol] = "all=" + cols[featcol]; 
301
	} 
302
	if (w == 0 && cols[featcol] =~ /=/) {
303
//   println "** " + "========== getting att-value for word w: cols[featcol] scodes=@scodes\n"; 
304
		nt_features = nt_empty_features = '';
305
//   while(cols[featcol] =~ m/ (.*?)="([^"]*)"/gs) {   // quoted values 
306
	  while(cols[featcol] =~ m/ ?([^=]*?)="?([^, ]+)\b"?\b/gs) {  // maybe unquoted values (e.g. Le Monde 2005)
307
	att = $1;
308
	val = $2;
309
	// pick the attributes that match those of the command line option -x
310
	// Keep the attribute only if it was requested on the command line (option -x).
	// The bound is `<`: the former `<=` read one element past the end of scodes.
	for (def t = 0; t < scodes.size(); t++) {
	  if (att == scodes[t]) {
	    val = val.replace("&", "&amp;");  // escape "&" in values (appears in URLs)
	    if (t == 0) { add_to_sentcode = "_$att$val"; }  // the first code is also copied into the sentence id
	    nt_features = nt_features + " $att=\"$val\"";
	    // same attribute with the placeholder value "--" (was a second, identical `if`)
	    nt_empty_features = nt_empty_features + " $att=\"--\"";
	  }
	}
319
	  } 
320
	  // replace the reserved feature 'id' (Tiger) 
321
	  add_to_sentcode =~ s/\bid=/ncaid=/; 
322
	  nt_features =~ s/\bid=/ncaid=/; 
323
	  nt_empty_features =~ s/\bid=/ncaid=/; 
324
	} // if col 13 contains attributes 
325
	else { 
326
	  if (defined(opt_x) && (w == 0)) {
327
	println "** " + "Warning: sentence=$.  option -x is defined, but no attribute=value declarations were found!\n"; 
328
	  } 
329
	}
330

  
331
 
332
	// store output for terminal node in array, output later. For double categories make a duplicate node. 
333
	tempid = sprintf("%d_%d", $., wnr);
334
// push(@terminals, sprintf("   <t id=\"s%d_%d\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" ppos=\"%s\" pmor=\"%s\" plemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., wnr, word, pos, mor, lemma, ppos, pmor, plemma, text_id, edition_id)); 
335
	push(@terminals, sprintf("   <t id=\"s%d_%d\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., wnr, word, pos, mor, lemma, text_id, edition_id)); 
336
	if (cat =~ /_/) {
337
//   push(@terminals, sprintf("   <t id=\"s%d_%d_dupl\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" ppos=\"%s\" pmor=\"%s\" plemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., wnr, "*", "_", "_", "_", "_", "_", "_", text_id, edition_id)); 
338
	  push(@terminals, sprintf("   <t id=\"s%d_%d_dupl\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., wnr, "*", "_", "_", "_", text_id, edition_id)); 
339
	  duplicates{tempid} = 1; // store, check later to attach the duplicates to the mother 
340
	}
341
 
342
	// associate Aux with main verb, to create an attribute in the verb node in loop #2 (TODO: more than one Aux) 
343
	if (cat =~ /Aux/) { 
344
	  aux{cols[colh]} = "$word" + "_" + "$plemma"; // aux{head} = word_lemma (of Aux) 
345
	}
346
 
347
	// ---------------------------------------- 
348
	// store information needed for tree 
349
	// ---------------------------------------- 
350
	// if fake rootnode == 1: nSnt as root node 
351
	if ((fakeroot == 1) && (w-commentlines+1 == rootnode)) { 
352
	  cat = 'nSnt'; 
353
	  notes{tempid} = 'Warning no marked ROOT node in CoNLL';  // TODO: geht nicht 
354
	} 
355
	// if fake rootnode == 2: flatten structure: attach all words to the first word 
356
	if ((fakeroot == 2) && (w-commentlines+1 != rootnode)) { 
357
	  cols[colh] = 1; 
358
	  notes{tempid} = 'Error neither ROOT node nor top node in CoNLL';  
359
	} 
360
	// correct unbound words in parser output (phead = 0, but not marked as ROOT) 
361
	if ((cols[colh] == "0") && (w-commentlines+1 != rootnode)) { // AL: added: -commentlines 
362
	  printf LOG " Warning sentence $. ($tempid): unbound node %d (attached to root %d)\n", (w-commentlines+1), rootnode; 
363
	  cols[colh] = rootnode; 
364
	  cat = 'Err';  // let Err instead of deprel appear in dom attribute 
365
	  notes{tempid} = 'Warning unbound node in CoNLL';  
366
	} 
367
	// store for R edge labels 
368
	if (cols[cold] =~ /RelN?C/) { 
369
	  relators{tempid} = 1;  
370
	} 
371
	// store deprel for dom attribute 
372
	deprel[tempid] = cat; // cols[cold]; 
373
	// if real root, add this node to daughter array, store array in hash dominates{head}{@daughters} 
374
	// Register this word as a daughter of its head. Skipped for the root word
	// itself and when the sentence is flattened under a fake root (fakeroot == 2).
	if ((fakeroot < 2) && (w-commentlines+1 != rootnode)) {
	  // cols[colh] holds the head as read from the file, but is overwritten with
	  // numeric values above for re-attached nodes; normalise to a plain String so
	  // that one head always maps to one key.
	  // NOTE(review): lookups must use a plain String key as well - a GString key
	  // ("$w") has a different hashCode and will not find the entry.
	  def headKey = String.valueOf(cols[colh]);
	  // Build a fresh list (existing daughters + this word) instead of wrapping
	  // the old list in another list, which produced a nested [[...]] value.
	  daughters = (dominates[headKey] ?: []) + [wnr];
	  dominates[headKey] = daughters;
	}
379
	 } // for each word loop #1
380
} // AL condition end
381
 
382
	 // print graph code (needs root attribute) and terminal nodes 
383
	 if (rootnode == 0) { 
384
	noroot++; 
385
	LOG << "Error sentence $. ($tempid): root node not found:\n$_\n"; next; 
386
	 } else { 
387
	printf XML "<s id=\"s%s%s\" textid=\"$text_id\" sentid=\"$sent_id\">\n", $., add_to_sentcode; 
388
	XML << "  <graph root=\"n$._$rootnode\">\n"; 
389
	XML << " <terminals>\n"; 
390
	// Write the buffered terminal (<t>) elements in the order they were stored.
	// Iterating over the list itself replaces the former `t <= terminals.size()`
	// bound, which accessed one index past the last terminal.
	for (terminal in terminals) {
	  XML << terminal;
	}
393
	XML << " </terminals>\n"; 
394
	 } 
395
	  
396
	 // ---------------------------------------- 
397
	 // loop through words #2 to build Tiger tree (non terminal nodes) 
398
	 // ---------------------------------------- 
399
	 XML << " <nonterminals>\n"; 
400
	 for (def i = 0; i <= words.size(); i++) {
401
//Added AL for comment lines 
402
	if (words[i] =~ /^#/) {
403
//    LOG << "Comment line loop 2 : $words[$i]\n"; 
404
	   next; 
405
	}
406
//Added AL for contractions 
407
	if (words[i] =~ /^\d+-\d+/) {
408
//    LOG << "Contraction loop 2 : $words[$i]\n"; 
409
	   next; 
410
	} 
411
	   
412
	else {
413
 
414
	@cols = split (/\t/, words[i]); 
415
	w = cols[0]; 
416
	// TODO: redundante Variablenzuweisung (= loop #1)?? 
417
	word = cols[1]; 
418
	lemma = cols[2]; 
419
	plemma = cols[3]; // predicted 
420
	pos = cols[4]; 
421
	ppos = cols[5]; // predicted 
422
	mor = cols[6]; 
423
	pmor = cols[7]; // predicted 
424
	cat = cols[cold];
425
 
426
	// A category containing "<" or ">" is reported as an illegal node name and
	// replaced by the placeholder Err2. The message goes to LOG with `<<`, as in
	// the identical check of loop #1 (was: `print Log ...`, a wrong handle name).
	// NOTE(review): `$.` is Perl's input record counter and still has to be ported.
	if (cat =~ /[<>]/) {
	  LOG << "Warning in line $.: illegal node name: \"$cat\" -> \"Err2\"\n";
	  cat = 'Err2';
	}
430
 
431
	//  OF parser has not learned punctuation: set cat for punctuation to PON 
432
	if ((corpus =~ /nca/i) && (pos == 'PON')) { 
433
	  cols[cold] = cat = 'Pon'; 
434
	}
435
 
436
	clean_data();
437
 
438
	// retrieve daughters, make dom attribute (string of dominated nodes) 
439
	daughters = @{dominates["$w"]}; 
440
	dom = ''; 
441
	for (def d = 0; d <= daughters.size(); d++) { 
442
	  dom = dom + "_" + deprel["$._$daughters{$d}"]; 
443
	} 
444
	if (dom =~ /_/) { 
445
	  dom =~ s/^_//; 
446
	} else { 
447
	  dom = '--'; 
448
	}
449
 
450
	// if verbal, set node attributes for verb form and lemma 
451
	type = "nV"; 
452
	vform = vlemma = "--"; 
453
	if (pos =~ /VER/) { // AL: ppos -> pos 
454
	  if (mor =~ /infi/) { type = "VInf"; } // AL: pmor -> mor 
455
	  elsif (pmor =~ /pper|ppre/) { type = "VPar"; } 
456
	  else { type = "VFin"; } 
457
	  // if Aux is present, create attribute for main verb  
458
	  if (aux{w} =~ /(.*?)_(.*)/) {
459
	vform = "$1";
460
	vlemma = "$2"; 
461
	  } 
462
	  // else create attr for simple verb 
463
	  else {
464
	vform = word;
465
	vlemma = lemma; // AL: plemma -> lemma (always void in SRCMF) 
466
	  } 
467
	  // NCA: enclose lemmas in underscores (easier for regex construction) 
468
	  if (corpus =~ /nca/i) {
469
	vlemma = "_" + "$vlemma" + "_" 
470
	  } 
471
	}
472
 
473
	// call output function (twice for duplicate categories) 
474
	if (cat =~ /(.*?)_(.*)/) { 
475
	  write_nonterminals("$2", "");   // RelNC is always node (see clean categories), function is duplicate, e.g. SjPer_RelNC 
476
	  write_nonterminals("$1", "_dupl");  // other category is duplicate 
477
	} else { 
478
	  write_nonterminals(cat); 
479
	} 
480
	 } // for words
481
} #AL end condition
482
 
483
	 XML << " </nonterminals>\n"; 
484
	 XML << "  </graph>\n"; 
485
	 XML << "</s>\n"; 
486
	 if ($. % 100 == 0) { println "** " + "\b\b\b\b\b\b\b\b"; printf STDERR "%08d", $.;}
487
} // main
488
XML << "</subcorpus>\n";
489
println "** " + "\n$CMD: $. sentences converted. Results in $outdir. Log in $outdir/conversion.log.\n";
490
println "** " + "   Hint 1: on OS X convert master file to MacRoman, e.g  iconv -f latin1 -t macroman\n";
491
println "** " + "   Hint 2: use tiger.pl -c <Tiger XML file> to detect unbound nodes.\n";
492
println "** " + "   Hint 3: build reliable feature declarations using tiger.sh\n";
493
println "** " + "     tiger.sh -a \"lemma word pos ppos\"  (for terminals)\n";
494
println "** " + "     tiger.sh -A \"lemma word pos ppos\"  (for non-terminals)\n";
495
if (noroot > 0) {println "** " + "$noroot sentences ignored: root not found (see log file)\n";}
496
write_master_footer();
497
close(MASTER);
498
close(XML);
499
close(LOG);
500

  
501
exit;
502

  
503

  
504

  
505
// ----------------------------------------
506
// sub
507
// ----------------------------------------
508

  
509
// Writes the XML declaration and the opening <subcorpus> element to the XML
// output handle. The subcorpus name is built from the script-level variables
// infilename and suffix (both assigned without `def`, i.e. via the script
// binding, so they are visible inside this method).
// Takes no parameters; the `()` is required for a Groovy method definition
// and matches the existing calls `write_xml_header();`.
// NOTE(review): XML is presumably the currently open subcorpus file - confirm
// how the handle is made visible to this method.
def write_xml_header() {
	 XML << """<?xml version=\"1.0\" encoding=\"UTF-8\"?> 
	 <subcorpus name=\"$infilename-$suffix\">
""";
}
514

  
515
def write_master_header { 
516
	 printf MASTER """<?xml version="1.0" encoding="UTF-8"?>
517
""";
518
 
519
	 printf MASTER """<corpus id=\"$corpus\">
520
<head> 
521
	 <meta><name>$corpus</name>  
522
	<author>ILR Stuttgart</author>  
523
	<date></date>  
524
	<description>Parsed with mate tools using a SRCMF-based grammar model (http://srcmf.org). </description>  
525
	<format>SRCMF</format> 
526
	<history>TigerXML converted by conll2tiger.pl</history> 
527
	 </meta>
528
""";
529

  
530
//  printf MASTER '<annotation>
531
//<feature name="word" domain="T" ></feature>
532
//<feature name="pos" domain="T" ></feature>
533
//<feature name="mor" domain="T" ></feature>
534
//<feature name="lemma" domain="T" ></feature>
535
//<feature name="ppos" domain="T" ></feature>
536
//<feature name="pmor" domain="T" ></feature>
537
//<feature name="plemma" domain="T" ></feature>
538
//<feature name="cat" domain="NT" >
539
//  <value name="Apst">apostrophe</value>
540
//  <value name="AtObj">attribut d objet</value>
541
//  <value name="AtRfc">attribut réfléchi</value>
542
//  <value name="AtSj">attribut de sujet</value>
543
//  <value name="AttributReflechi">attribut réfléchi</value>
544
//  <value name="Aux">auxilié</value>
545
//  <value name="AuxA">auxilié actif</value>
546
//  <value name="AuxP">auxilié passif</value>
547
//  <value name="Circ">circonstant</value>
548
//  <value name="Circ_RelNC">circonstant pronom relatif</value>
549
//  <value name="Cmpl">complément</value>
550
//  <value name="Cmpl_RelNC">complément pronom relatif</value>
551
//  <value name="Coo">coordination</value>
552
//  <value name="Det">déterminant</value>
553
//  <value name="Err">unbound node in CoNLL input</value>
554
//  <value name="Err2">illegal node name was replaced</value>
555
//  <value name="GpCoo">coordonné</value>
556
//  <value name="Ignorer">Ignorer</value>
557
//  <value name="Insrt">incidente</value>
558
//  <value name="Intj">interjection</value>
559
//  <value name="Lac">lacune</value>
560
//  <value name="ModA">modifieur attaché</value>
561
//  <value name="ModD">modifieur détaché</value>
562
//  <value name="Ng">négation</value>
563
//  <value name="NgPrt">forclusif</value>
564
//  <value name="Obj">objet</value>
565
//  <value name="Obj_RelNC">direct object pronom relatif</value>
566
//  <value name="Pon">ponctuation</value>
567
//  <value name="PON">ponctuation</value>
568
//  <value name="Regim">régime</value>
569
//  <value name="RelC">relateur coordonnant</value>
570
//  <value name="RelNC">relateur non coordonnant</value>
571
//  <value name="Rfc">réfléchi</value>
572
//  <value name="Rfx">réfléxif renforcé</value>
573
//  <value name="SjImp">sujet impersonnel</value>
574
//  <value name="SjPer">sujet personnel</value>
575
//  <value name="SjPer_RelNC">sujet personnel pronom relatif</value>
576
//  <value name="Snt">phrase</value>
577
//  <value name="ROOT">phrase</value>
578
//  <value name="StructureMaximale">structure maximale</value>
579
//  <value name="VFin">verbe fini</value>
580
//  <value name="VInf">verbe infinitif</value>
581
//  <value name="nMax">structure non-maximale</value>
582
//  <value name="nSnt">non-phrase</value>
583
//</feature>
584
//<feature name="coord" domain="NT" ></feature>
585
//<feature name="dom" domain="NT" ></feature>
586
//<feature name="type" domain="NT" >
587
//  <value name="nV">élément non-verbal</value>
588
//  <value name="VFin">verbe fini</value>
589
//  <value name="VInf">verbe infinitif</value>
590
//  <value name="VPar">verbe participial</value>
591
//  <value name="--">nil</value>
592
//</feature>
593
//<feature name="vform" domain="NT"></feature>
594
//<feature name="vlemma" domain="NT"></feature>
595
//<feature name="note" domain="NT"></feature>
596
//<feature name="snr" domain="NT"></feature>
597
//';
598
 
599
	 printf MASTER """<annotation>
600
<feature name="word" domain="T" ></feature>
601
<feature name="pos" domain="T" ></feature>
602
<feature name="mor" domain="T" ></feature>
603
<feature name="lemma" domain="T" ></feature>
604
<feature name="textid" domain="T" ></feature>
605
<feature name="editionId" domain="T" ></feature>
606
<feature name="cat" domain="NT" > 
607
	 <value name="__UNDEF__">UNDEFINED !!!</value> 
608
	 <value name="acl:adv">acl:adv</value> <!-- Ukrainian --> 
609
	 <value name="acl:attr">acl:attr</value> <!-- Chukchi --> 
610
	 <value name="acl:cleft">acl:cleft</value> <!-- Norwegian, Swedish --> 
611
	 <value name="acl:fixed">acl:fixed</value> <!-- Beja --> 
612
	 <value name="acl:inf">acl:inf</value> <!-- Portuguese --> 
613
	 <value name="acl:relat">acl:relat</value> <!-- Chukchi --> 
614
	 <value name="acl:relcl">relative clause modifier</value> <!-- Akkadian, Albanian, Apurina, Arabic, Armenian, Assyrian, Beja, Belarusian, Breton, Bulgarian, Chinese, Czech, Danish, Dutch, English, Erzya, Estonian, Faroese, Finnish, French, German, Greek, Hebrew, Hindi, Hindi English, Icelandic, Indonesian, Irish, Italian, Karelian, Kazakh, Komi Permyak, Komi Zyrian, Korean, Latin, Lithuanian, Livvi, Manx, Marathi, Moksha, Naija, North Sami, Norwegian, Old East Slavic, Old French, Persian, Polish, Portuguese, Russian, Sanskrit, Scottish Gaelic, Slovak, Spanish, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Telugu, Thai, Ukrainian, Urdu, Welsh, Western Armenian, Wolof --> 
615
	 <value name="acl">clausal modifier of noun (adnominal clause)</value> 
616
	 <value name="advcl:abs">advcl:abs</value> <!-- Latin --> 
617
	 <value name="advcl:cau">advcl:cau</value> <!-- Moksha --> 
618
	 <value name="advcl:cleft">advcl:cleft</value> <!-- French, Naija --> 
619
	 <value name="advcl:cmpr">advcl:cmpr</value> <!-- Latin, Polish --> 
620
	 <value name="advcl:cond">advcl:cond</value> <!-- Tamil, Telugu, Uyghur --> 
621
	 <value name="advcl:coverb">advcl:coverb</value> <!-- Cantonese --> 
622
	 <value name="advcl:eval">advcl:eval</value> <!-- Komi Zyrian --> 
623
	 <value name="advcl:lcl">advcl:lcl</value> <!-- Komi Permyak --> 
624
	 <value name="advcl:lto">advcl:lto</value> <!-- Komi Zyrian --> 
625
	 <value name="advcl:mcl">advcl:mcl</value> <!-- Komi Permyak --> 
626
	 <value name="advcl:pred">advcl:pred</value> <!-- Latin --> 
627
	 <value name="advcl:relcl">advcl:relcl</value> <!-- Polish, Western Armenian --> 
628
	 <value name="advcl:sp">advcl:sp</value> <!-- Ukrainian --> 
629
	 <value name="advcl:svc">advcl:svc</value> <!-- Ukrainian --> 
630
	 <value name="advcl:tcl">advcl:tcl</value> <!-- Apurina, Erzya, Komi Permyak, Komi Zyrian, Moksha, Romanian, Skolt Sami --> 
631
	 <value name="advcl">adverbial clause modifier</value> 
632
	 <value name="advmod:arg">advmod:arg</value> <!-- Polish --> 
633
	 <value name="advmod:cau">advmod:cau</value> <!-- Erzya, Komi Zyrian, Moksha --> 
634
	 <value name="advmod:comp">advmod:comp</value> <!-- Erzya --> 
635
	 <value name="advmod:deg">advmod:deg</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami --> 
636
	 <value name="advmod:det">advmod:det</value> <!-- Ukrainian --> 
637
	 <value name="advmod:df">advmod:df</value> <!-- Cantonese, Chinese --> 
638
	 <value name="advmod:emph">emphasizing word, intensifier</value> <!-- Akkadian, Arabic, Armenian, Catalan, Chukchi, Croatian, Czech, Indonesian, Komi Zyrian, Latin, Lithuanian, Polish, Sanskrit, Slovak, South Levantine Arabic, Tamil, Turkish, Turkish German, Upper Sorbian, Uyghur, Western Armenian --> 
639
	 <value name="advmod:eval">advmod:eval</value> <!-- Erzya, Komi Zyrian, Moksha, Skolt Sami --> 
640
	 <value name="advmod:fixed">advmod:fixed</value> <!-- Beja --> 
641
	 <value name="advmod:foc">advmod:foc</value> <!-- Erzya, Komi Zyrian, Moksha, Skolt Sami --> 
642
	 <value name="advmod:freq">advmod:freq</value> <!-- Komi Zyrian, Moksha --> 
643
	 <value name="advmod:lfrom">advmod:lfrom</value> <!-- Erzya, Komi Zyrian, Moksha --> 
644
	 <value name="advmod:lmod">locative adverbial modifier</value> <!-- Apurina, Danish, Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami --> 
645
	 <value name="advmod:lmp">advmod:lmp</value> <!-- Erzya, Komi Zyrian --> 
646
	 <value name="advmod:locy">advmod:locy</value> <!-- Hungarian --> 
647
	 <value name="advmod:lto">advmod:lto</value> <!-- Erzya, Komi Zyrian, Moksha --> 
648
	 <value name="advmod:mmod">advmod:mmod</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami --> 
649
	 <value name="advmod:mode">advmod:mode</value> <!-- Hungarian --> 
650
	 <value name="advmod:neg">advmod:neg</value> <!-- Apurina, Buryat, Kiche, Kurmanji, Latin, Maltese, Polish, Skolt Sami --> 
651
	 <value name="advmod:obl">adverbial modifier + oblique nominal</value> <!-- Old French --> 
652
	 <value name="advmod:que">advmod:que</value> <!-- Hungarian --> 
653
	 <value name="advmod:tfrom">advmod:tfrom</value> <!-- Hungarian --> 
654
	 <value name="advmod:tlocy">advmod:tlocy</value> <!-- Hungarian --> 
655
	 <value name="advmod:tmod">advmod:tmod</value> <!-- Apurina, Erzya, Komi Permyak, Komi Zyrian, Moksha, Romanian, Skolt Sami --> 
656
	 <value name="advmod:to">advmod:to</value> <!-- Hungarian --> 
657
	 <value name="advmod:tto">advmod:tto</value> <!-- Hungarian --> 
658
	 <value name="advmod">adverbial modifier</value> 
659
	 <value name="amod:att">amod:att</value> <!-- Hungarian --> 
660
	 <value name="amod:attlvc">amod:attlvc</value> <!-- Hungarian --> 
661
	 <value name="amod:flat">amod:flat</value> <!-- Polish --> 
662
	 <value name="amod">adjectival modifier</value> 
663
	 <value name="appos:trans">appos:trans</value> <!-- Turkish German --> 
664
	 <value name="appos">appositional modifier</value> 
665
	 <value name="aux:aff">aux:aff</value> <!-- Beja --> 
666
	 <value name="aux:aspect">aux:aspect</value> <!-- Komi Zyrian --> 
667
	 <value name="aux:caus">aux:caus</value> <!-- Armenian, French, Western Armenian --> 
668
	 <value name="aux:clitic">aux:clitic</value> <!-- Polish --> 
669
	 <value name="aux:cnd">aux:cnd</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Polish --> 
670
	 <value name="aux:ex">aux:ex</value> <!-- Armenian, Western Armenian --> 
671
	 <value name="aux:imp">aux:imp</value> <!-- Erzya, Polish --> 
672
	 <value name="aux:nec">aux:nec</value> <!-- Komi Zyrian, Moksha, Skolt Sami --> 
673
	 <value name="aux:neg">aux:neg</value> <!-- Chukchi, Erzya, Komi Permyak, Komi Zyrian, Maltese, Moksha, North Sami, Skolt Sami, Tamil --> 
674
	 <value name="aux:opt">aux:opt</value> <!-- Erzya, Moksha --> 
675
	 <value name="aux:part">aux:part</value> <!-- Maltese --> 
676
	 <value name="aux:pass">passive auxilary</value> <!-- Afrikaans, Ancient Greek, Arabic, Assyrian, Belarusian, Bhojpuri, Breton, Bulgarian, Buryat, Chinese, Czech, Dutch, English, Faroese, Finnish, French, Frisian Dutch, Galician, German, Hindi, Italian, Kangri, Karelian, Latin, Latvian, Lithuanian, Maltese, Marathi, Norwegian, Old Church Slavonic, Old East Slavic, Old French, Persian, Polish, Portuguese, Romanian, Russian, Slovak, Spanish, Swedish, Swiss German, Tamil, Thai, Turkish German, Upper Sorbian, Vietnamese --> 
677
	 <value name="aux:pot">aux:pot</value> <!-- Komi Zyrian --> 
678
	 <value name="aux:q">aux:q</value> <!-- Erzya, Turkish, Turkish German --> 
679
	 <value name="aux:tense">aux:tense</value> <!-- French, Komi Zyrian, Skolt Sami --> 
680
	 <value name="aux">auxiliary</value> 
681
	 <value name="case:acc">case:acc</value> <!-- Hebrew --> 
682
	 <value name="case:adv">case:adv</value> <!-- Indonesian --> 
683
	 <value name="case:aff">case:aff</value> <!-- Beja --> 
684
	 <value name="case:det">preposition with determiner</value> <!-- Maltese, Old French --> 
685
	 <value name="case:gen">case:gen</value> <!-- Hebrew --> 
686
	 <value name="case:loc">case:loc</value> <!-- Armenian, Cantonese, Chinese, Western Armenian --> 
687
	 <value name="case:pred">case:pred</value> <!-- Welsh --> 
688
	 <value name="case:voc">case:voc</value> <!-- Irish, Scottish Gaelic --> 
689
	 <value name="case">case marking</value> 
690
	 <value name="cc:nc">cc:nc</value> <!-- Old French --> 
691
	 <value name="cc:nc">Coordinated conjunct : non coordonant</value> 
692
	 <value name="cc:preconj">cc:preconj</value> <!-- Arabic, English, Erzya, Estonian, Faroese, Finnish, German, Indonesian, Komi Permyak, Komi Zyrian, Moksha, North Sami, Persian, Polish, Portuguese, Romanian, Slovenian, Spanish, Thai, Turkish --> 
693
	 <value name="cc:preconj">preconjunct</value> 
694
	 <value name="cc">Coordinating conjunction</value> 
695
	 <value name="cc">coordinating conjunction</value> 
696
	 <value name="ccomp:cleft">ccomp:cleft</value> <!-- Polish --> 
697
	 <value name="ccomp:obj">ccomp:obj</value> <!-- Hungarian, Polish --> 
698
	 <value name="ccomp:obl">ccomp:obl</value> <!-- Hungarian --> 
699
	 <value name="ccomp:pmod">ccomp:pmod</value> <!-- Romanian --> 
700
	 <value name="ccomp:pred">ccomp:pred</value> <!-- Hungarian --> 
701
	 <value name="ccomp">clausal complement</value> 
702
	 <value name="clf">classifier</value> 
703
	 <value name="compound:a">compound:a</value> <!-- Indonesian --> 
704
	 <value name="compound:affix">compound:affix</value> <!-- Hebrew --> 
705
	 <value name="compound:dir">compound:dir</value> <!-- Cantonese, Chinese --> 
706
	 <value name="compound:ext">compound:ext</value> <!-- Cantonese, Chinese --> 
707
	 <value name="compound:lvc">compound:lvc</value> <!-- Armenian, Hindi, Kazakh, Khunsari, Korean, Kurmanji, Marathi, Nayini, Persian, Soi, Tamil, Telugu, Turkish, Turkish German, Uyghur, Western Armenian --> 
708
	 <value name="compound:lvc">light verb construction</value> 
709
	 <value name="compound:nn">compound:nn</value> <!-- Finnish, Livvi, North Sami --> 
710
	 <value name="compound:preverb">compound:preverb</value> <!-- Hungarian --> 
711
	 <value name="compound:prt">compound:prt</value> <!-- Afrikaans, Arabic, Danish, Dutch, English, Erzya, Estonian, Faroese, Finnish, Frisian Dutch, German, Icelandic, Irish, Karelian, Komi Permyak, Naija, Norwegian, Persian, Spanish, Swedish, Swedish Sign Language, Swiss German, Tamil, Thai, Turkish German, Wolof, Yoruba --> 
712
	 <value name="compound:prt">phrasal verb particle</value> 
713
	 <value name="compound:quant">compound:quant</value> <!-- Cantonese --> 
714
	 <value name="compound:redup">reduplicated compounds</value> <!-- Armenian, Bambara, Classical Chinese, Erzya, Hindi, Kurmanji, Marathi, Naija, Tagalog, Tamil, Telugu, Turkish, Turkish German, Uyghur, Welsh, Western Armenian --> 
715
	 <value name="compound:smixut">compound:smixut</value> <!-- Hebrew --> 
716
	 <value name="compound:svc">serial verb compounds</value> <!-- Amharic, Armenian, Marathi, Mbya Guarani, Naija, Swedish Sign Language, Telugu, Ukrainian, Western Armenian, Wolof, Yoruba --> 
717
	 <value name="compound:vo">compound:vo</value> <!-- Cantonese, Chinese --> 
718
	 <value name="compound:vv">compound:vv</value> <!-- Cantonese, Chinese --> 
719
	 <value name="compound">compound</value> 
720
	 <value name="conj:expl">conj:expl</value> <!-- Latin --> 
721
	 <value name="conj:extend">conj:extend</value> <!-- Slovenian --> 
722
	 <value name="conj:svc">conj:svc</value> <!-- Ukrainian --> 
723
	 <value name="conj">conjunct</value> 
724
	 <value name="cop:expl">cop:expl</value> <!-- Maltese --> 
725
	 <value name="cop:locat">cop:locat</value> <!-- Polish --> 
726
	 <value name="cop:own">cop:own</value> <!-- Finnish, Karelian, Livvi, Marathi --> 
727
	 <value name="cop">copula</value> 
728
	 <value name="csubj:cleft">csubj:cleft</value> <!-- Irish, Latin, Manx, Scottish Gaelic --> 
729
	 <value name="csubj:cop">csubj:cop</value> <!-- Erzya, Estonian, Finnish, Irish, Komi Zyrian, Livvi, Manx, Moksha, Scottish Gaelic, Turkish --> 
730
	 <value name="csubj:pass">clausal passive subject</value> <!-- Albanian, Amharic, Ancient Greek, Arabic, Armenian, Belarusian, Bulgarian, Catalan, Chinese, Classical Chinese, Czech, English, French, German, Gothic, Greek, Indonesian, Italian, Korean, Latin, Latvian, Lithuanian, Norwegian, Old Church Slavonic, Old East Slavic, Polish, Portuguese, Romanian, Russian, Sanskrit, Slovak, Spanish, Swedish, Western Armenian --> 
731
	 <value name="csubj">clausal subject</value> 
732
	 <value name="dep:aff">dep:aff</value> <!-- Beja --> 
733
	 <value name="dep:agr">dep:agr</value> <!-- Kiche --> 
734
	 <value name="dep:alt">dep:alt</value> <!-- Upper Sorbian --> 
735
	 <value name="dep:ana">dep:ana</value> <!-- Yupik --> 
736
	 <value name="dep:aux">dep:aux</value> <!-- Yupik --> 
737
	 <value name="dep:comp">dep:comp</value> <!-- Beja, French --> 
738
	 <value name="dep:conj">dep:conj</value> <!-- Beja --> 
739
	 <value name="dep:cop">dep:cop</value> <!-- Yupik --> 
740
	 <value name="dep:emo">dep:emo</value> <!-- Yupik --> 
741
	 <value name="dep:infl">dep:infl</value> <!-- Yupik --> 
742
	 <value name="dep:mark">dep:mark</value> <!-- Yupik --> 
743
	 <value name="dep:mod">dep:mod</value> <!-- Mbya Guarani --> 
744
	 <value name="dep:pos">dep:pos</value> <!-- Yupik --> 
745
	 <value name="dep:redup">dep:redup</value> <!-- Beja --> 
746
	 <value name="dep:ss">dep:ss</value> <!-- Kiche --> 
747
	 <value name="dep">unspecified dependency</value> 
748
	 <value name="det:adj">det:adj</value> <!-- Albanian --> 
749
	 <value name="det:noun">det:noun</value> <!-- Albanian --> 
750
	 <value name="det:numgov">pronominal quantifier governing the case of the noun</value> <!-- Czech, Polish, Serbian, Slovak, Ukrainian, Upper Sorbian --> 
751
	 <value name="det:nummod">pronominal quantifier agreeing in case with the noun</value> <!-- Czech, Polish, Ukrainian --> 
752
	 <value name="det:poss">possessive determiner</value> <!-- Akkadian, Armenian, German, Italian, Korean, Polish, Portuguese, Western Armenian --> 
753
	 <value name="det:predet">det:predet</value> <!-- English, Italian, Persian --> 
754
	 <value name="det:pron">det:pron</value> <!-- Albanian --> 
755
	 <value name="det:rel">det:rel</value> <!-- Bambara --> 
756
	 <value name="det">determiner</value> 
757
	 <value name="discourse:emo">discourse:emo</value> <!-- Irish, Italian, Polish --> 
758
	 <value name="discourse:filler">discourse:filler</value> <!-- Norwegian, Slovenian --> 
759
	 <value name="discourse:intj">discourse:intj</value> <!-- Polish --> 
760
	 <value name="discourse:sp">discourse:sp</value> <!-- Cantonese, Chinese, Classical Chinese --> 
761
	 <value name="discourse">discourse element</value> 
762
	 <value name="dislocated:cleft">dislocated:cleft</value> <!-- Mbya Guarani --> 
763
	 <value name="dislocated:csubj">dislocated:csubj</value> <!-- Latin --> 
764
	 <value name="dislocated:nsubj">dislocated:nsubj</value> <!-- Latin --> 
765
	 <value name="dislocated:obj">dislocated:obj</value> <!-- Latin --> 
766
	 <value name="dislocated:subj">dislocated:subj</value> <!-- Beja --> 
767
	 <value name="dislocated">dislocated elements</value> 
768
	 <value name="expl:comp">expl:comp</value> <!-- French --> 
769
	 <value name="expl:impers">impersonal expletive</value> <!-- Italian, Polish, Romanian, Spanish --> 
770
	 <value name="expl:pass">reflexive pronoun used in reflexive passive</value> <!-- Catalan, Czech, French, Italian, Latin, Portuguese, Romanian, Slovak, Spanish, Upper Sorbian --> 
771
	 <value name="expl:poss">expl:poss</value> <!-- Romanian --> 
772
	 <value name="expl:pv">reflexive clitic with an inherently reflexive verb</value> <!-- Czech, Dutch, German, Old East Slavic, Polish, Portuguese, Romanian, Slovak, Spanish, Turkish German, Upper Sorbian --> 
773
	 <value name="expl:subj">expl:subj</value> <!-- French, Naija --> 
774
	 <value name="expl">expletive</value> 
775
	 <value name="fixed">fixed multiword expression</value> 
776
	 <value name="flat:abs">flat:abs</value> <!-- Ukrainian --> 
777
	 <value name="flat:dist">flat:dist</value> <!-- Western Armenian --> 
778
	 <value name="flat:foreign">foreign words</value> <!-- Arabic, Belarusian, Buryat, Chinese, Chukchi, Croatian, Czech, English, Estonian, Faroese, Finnish, French, Galician, Icelandic, Indonesian, Irish, Italian, Komi Zyrian, Latin, Latvian, Lithuanian, Manx, Naija, Norwegian, Persian, Polish, Portuguese, Russian, Scottish Gaelic, Slovak, Slovenian, South Levantine Arabic, Ukrainian, Upper Sorbian --> 
779
	 <value name="flat:name">names</value> <!-- Ancient Greek, Belarusian, Breton, Chinese, Chukchi, Erzya, Faroese, Finnish, French, Frisian Dutch, Galician, German, Gothic, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Irish, Italian, Karelian, Kazakh, Komi Permyak, Komi Zyrian, Korean, Latin, Latvian, Livvi, Maltese, Moksha, Norwegian, Old Church Slavonic, Old East Slavic, Persian, Portuguese, Russian, Scottish Gaelic, Skolt Sami, Slovenian, Spanish, Swedish, Thai, Ukrainian, Welsh, Western Armenian --> 
780
	 <value name="flat:num">flat:num</value> <!-- Komi Zyrian, Persian --> 
781
	 <value name="flat:range">flat:range</value> <!-- Ukrainian, Western Armenian --> 
782
	 <value name="flat:repeat">flat:repeat</value> <!-- Ukrainian --> 
783
	 <value name="flat:sibl">flat:sibl</value> <!-- Ukrainian --> 
784
	 <value name="flat:title">flat:title</value> <!-- Ukrainian --> 
785
	 <value name="flat:vv">flat:vv</value> <!-- Classical Chinese --> 
786
	 <value name="flat">name multiword expression</value> 
787
	 <value name="goeswith">goes with</value> 
788
	 <value name="iobj:agent">iobj:agent</value> <!-- Armenian, French, Western Armenian --> 
789
	 <value name="iobj:appl">iobj:appl</value> <!-- Wolof --> 
790
	 <value name="iobj:patient">iobj:patient</value> <!-- Tagalog --> 
791
	 <value name="iobj">indirect object</value> 
792
	 <value name="list">list</value> 
793
	 <value name="mark:adv">mark:adv</value> <!-- Cantonese, Chinese --> 
794
	 <value name="mark:advmod">adverbial modifier confusable with a subordination marker</value> <!-- Old French --> 
795
	 <value name="mark:aff">mark:aff</value> <!-- Beja --> 
796
	 <value name="mark:obj">marker + object</value> <!--Old French, no doc --> 
797
	 <value name="mark:obl">marker + oblique nominal</value> <!--Old French, no doc --> 
798
	 <value name="mark:prt">mark:prt</value> <!-- Chinese, Irish, Scottish Gaelic --> 
799
	 <value name="mark:q">mark:q</value> <!-- Hebrew --> 
800
	 <value name="mark:rel">mark:rel</value> <!-- Cantonese, Chinese --> 
801
	 <value name="mark">marker</value> 
802
	 <value name="nmod:agent">nmod:agent</value> <!-- Welsh --> 
803
	 <value name="nmod:appos">nmod:appos</value> <!-- French, Komi Zyrian, Moksha --> 
804
	 <value name="nmod:arg">nmod:arg</value> <!-- Polish, Yupik --> 
805
	 <value name="nmod:att">nmod:att</value> <!-- Hungarian --> 
806
	 <value name="nmod:attlvc">nmod:attlvc</value> <!-- Hungarian --> 
807
	 <value name="nmod:attr">nmod:attr</value> <!-- Chukchi --> 
808
	 <value name="nmod:bahuv">nmod:bahuv</value> <!-- Moksha --> 
809
	 <value name="nmod:cau">nmod:cau</value> <!-- Uyghur --> 
810
	 <value name="nmod:comp">nmod:comp</value> <!-- Erzya, Komi Zyrian, Moksha, Turkish, Uyghur --> 
811
	 <value name="nmod:flat">nmod:flat</value> <!-- Polish --> 
812
	 <value name="nmod:gen">nmod:gen</value> <!-- Breton --> 
813
	 <value name="nmod:gobj">nmod:gobj</value> <!-- Erzya, Finnish --> 
814
	 <value name="nmod:gsubj">nmod:gsubj</value> <!-- Erzya, Finnish, Karelian --> 
815
	 <value name="nmod:lfrom">nmod:lfrom</value> <!-- Komi Zyrian --> 
816
	 <value name="nmod:lmod">nmod:lmod</value> <!-- Erzya, Indonesian, Komi Permyak, Komi Zyrian, Moksha --> 
817
	 <value name="nmod:npmod">nmod:npmod</value> <!-- Armenian, English, Western Armenian --> 
818
	 <value name="nmod:obj">nmod:obj</value> <!-- Komi Zyrian --> 
819
	 <value name="nmod:obl">nmod:obl</value> <!-- Hungarian --> 
820
	 <value name="nmod:part">nmod:part</value> <!-- Turkish, Uyghur --> 
821
	 <value name="nmod:poss">possessive nominal modifier</value> <!-- Akkadian, Albanian, Apurina, Arabic, Armenian, Assyrian, Bambara, Beja, Breton, Chukchi, Danish, Dutch, English, Erzya, Faroese, Finnish, Frisian Dutch, German, Hebrew, Hindi, Icelandic, Indonesian, Irish, Karelian, Kazakh, Khunsari, Komi Permyak, Komi Zyrian, Korean, Kurmanji, Latin, Livvi, Maltese, Manx, Marathi, Moksha, Naija, Nayini, North Sami, Persian, Polish, Sanskrit, Scottish Gaelic, Skolt Sami, Soi, South Levantine Arabic, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Telugu, Thai, Turkish, Uyghur, Warlpiri, Welsh, Western Armenian, Wolof --> 
822
	 <value name="nmod:pred">nmod:pred</value> <!-- Polish --> 
823
	 <value name="nmod:prp">nmod:prp</value> <!-- Komi Zyrian --> 
824
	 <value name="nmod:redup">nmod:redup</value> <!-- Welsh --> 
825
	 <value name="nmod:relat">nmod:relat</value> <!-- Chukchi --> 
826
	 <value name="nmod:subj">nmod:subj</value> <!-- Komi Zyrian --> 
827
	 <value name="nmod:tmod">temporal modifier</value> <!-- Chinese, English, Indonesian, Moksha, Romanian, Telugu, Uyghur --> 
828
	 <value name="nmod">nominal modifier</value> 
829
	 <value name="nsubj:advmod">fused subject pronoun and adverb</value> <!-- Old French --> 
830
	 <value name="nsubj:aff">nsubj:aff</value> <!-- Beja --> 
831
	 <value name="nsubj:bfoc">nsubj:bfoc</value> <!-- Tagalog --> 
832
	 <value name="nsubj:caus">nsubj:caus</value> <!-- Armenian, French, Western Armenian --> 
833
	 <value name="nsubj:cleft">nsubj:cleft</value> <!-- Latin --> 
834
	 <value name="nsubj:cop">nsubj:cop</value> <!-- Apurina, Breton, Erzya, Estonian, Finnish, Hebrew, Karelian, Komi Permyak, Komi Zyrian, Livvi, Moksha, Sanskrit, Skolt Sami, Turkish --> 
835
	 <value name="nsubj:ifoc">nsubj:ifoc</value> <!-- Tagalog --> 
836
	 <value name="nsubj:lfoc">nsubj:lfoc</value> <!-- Tagalog --> 
837
	 <value name="nsubj:lvc">nsubj:lvc</value> <!-- Hungarian --> 
838
	 <value name="nsubj:nc">nsubj:nc</value> <!-- Persian, Tamil, Telugu --> 
839
	 <value name="nsubj:obj">fused subject and object pronoun</value> <!-- Old French --> 
840
	 <value name="nsubj:pass">passive nominal subject</value> <!-- Afrikaans, Amharic, Ancient Greek, Arabic, Armenian, Assyrian, Belarusian, Bulgarian, Buryat, Cantonese, Catalan, Chinese, Classical Chinese, Czech, Dutch, English, Faroese, French, Frisian Dutch, Galician, German, Gothic, Greek, Hindi, Indonesian, Italian, Korean, Latin, Latvian, Lithuanian, Maltese, Marathi, Moksha, Norwegian, Old Church Slavonic, Old East Slavic, Persian, Polish, Portuguese, Romanian, Russian, Sanskrit, Skolt Sami, Slovak, Spanish, Swedish, Swiss German, Tagalog, Tamil, Thai, Turkish German, Upper Sorbian, Western Armenian --> 
841
	 <value name="nsubj:periph">nsubj:periph</value> <!-- Cantonese --> 
842
	 <value name="nsubj">Nominal subject</value> 
843
	 <value name="nummod:det">nummod:det</value> <!-- Beja --> 
844
	 <value name="nummod:entity">numeric modifier governed by a noun</value> <!-- Russian --> 
845
	 <value name="nummod:flat">nummod:flat</value> <!-- Polish --> 
846
	 <value name="nummod:gov">numeric modifier governing the case of the noun</value> <!-- Belarusian, Czech, Lithuanian, Old East Slavic, Polish, Russian, Sanskrit, Serbian, Ukrainian, Upper Sorbian --> 
847
	 <value name="nummod">numeric modifier</value> 
848
	 <value name="obj:advmod">fused adverb and object pronoun</value> <!-- Old French --> 
849
	 <value name="obj:advneg">fused negation and object pronoun</value> <!-- no doc for advneg --> 
850
	 <value name="obj:agent">obj:agent</value> <!-- Apurina, French, Tagalog --> 
851
	 <value name="obj:appl">obj:appl</value> <!-- Wolof --> 
852
	 <value name="obj:caus">obj:caus</value> <!-- Wolof --> 
853
	 <value name="obj:lvc">obj:lvc</value> <!-- French, Hungarian, Naija --> 
854
	 <value name="obj:obl">fused oblique and object pronoun</value> <!-- Old French --> 
855
	 <value name="obj:periph">obj:periph</value> <!-- Cantonese, Chinese --> 
856
	 <value name="obj">object</value> 
857
	 <value name="obl:advmod">adverbial modifier confusable with an oblique dependent</value> <!-- Old French --> 
858
	 <value name="obl:agent">agent modifier</value> <!-- Ancient Greek, Armenian, Belarusian, Breton, Cantonese, Chinese, Czech, Dutch, Erzya, French, German, Gothic, Greek, Hindi, Indonesian, Italian, Komi Zyrian, Latin, Lithuanian, Maltese, Moksha, Naija, Old Church Slavonic, Old East Slavic, Polish, Portuguese, Romanian, Russian, Sanskrit, Skolt Sami, Swedish, Tamil, Turkish, Welsh, Western Armenian --> 
859
	 <value name="obl:appl">obl:appl</value> <!-- Wolof --> 
860
	 <value name="obl:arg">oblique argument</value> <!-- Arabic, Beja, Czech, French, German, Greek, Icelandic, Latin, Lithuanian, Maltese, Naija, Persian, Polish, Sanskrit, Slovak, South Levantine Arabic, Tamil --> 
861
	 <value name="obl:cau">obl:cau</value> <!-- Erzya, Komi Zyrian, Moksha, Telugu --> 
862
	 <value name="obl:cmp">obl:cmp</value> <!-- Telugu --> 
863
	 <value name="obl:cmpr">obl:cmpr</value> <!-- Latin, Polish, Tamil --> 
864
	 <value name="obl:comp">obl:comp</value> <!-- Moksha --> 
865
	 <value name="obl:dat">obl:dat</value> <!-- Kurmanji --> 
866
	 <value name="obl:freq">obl:freq</value> <!-- Moksha --> 
867
	 <value name="obl:inst">obl:inst</value> <!-- Erzya, Moksha, Tamil --> 
868
	 <value name="obl:lfrom">obl:lfrom</value> <!-- Erzya, Komi Zyrian, Moksha --> 
869
	 <value name="obl:lmod">locative modifier</value> <!-- Apurina, Classical Chinese, Danish, Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami, Tamil --> 
870
	 <value name="obl:lmp">obl:lmp</value> <!-- Erzya, Komi Zyrian, Moksha --> 
871
	 <value name="obl:lto">obl:lto</value> <!-- Erzya, Komi Zyrian, Moksha --> 
872
	 <value name="obl:lvc">obl:lvc</value> <!-- Hungarian --> 
873
	 <value name="obl:mcl">obl:mcl</value> <!-- Komi Zyrian --> 
874
	 <value name="obl:mod"> oblique modifier</value> <!-- Beja, French, Naija, Yupik --> 
875
	 <value name="obl:npmod">obl:npmod</value> <!-- Coptic, English --> 
876
	 <value name="obl:orphan">obl:orphan</value> <!-- Polish --> 
877
	 <value name="obl:own">obl:own</value> <!-- Kazakh --> 
878
	 <value name="obl:patient">obl:patient</value> <!-- Cantonese, Chinese --> 
879
	 <value name="obl:pmod">obl:pmod</value> <!-- Romanian, Tamil --> 
880
	 <value name="obl:poss">obl:poss</value> <!-- Thai --> 
881
	 <value name="obl:prep">obl:prep</value> <!-- Irish --> 
882
	 <value name="obl:sentcon">obl:sentcon</value> <!-- Mbya Guarani --> 
883
	 <value name="obl:smod">obl:smod</value> <!-- Scottish Gaelic --> 
884
	 <value name="obl:tmod">obl:tmod</value> <!-- Apurina, Arabic, Cantonese, Chinese, Classical Chinese, Danish, English, Erzya, Frisian Dutch, German, Hindi, Indonesian, Irish, Italian, Komi Permyak, Komi Zyrian, Korean, Manx, Moksha, Portuguese, Scottish Gaelic, Skolt Sami, Spanish, Tamil, Telugu, Thai, Turkish, Uyghur, Warlpiri --> 
885
	 <value name="obl:tmod">temporal modifier</value> 
886
	 <value name="obl">oblique nominal</value> 
887
	 <value name="orphan:missing">textual gap in the source</value> <!-- Latin --> 
888
	 <value name="orphan">remnant in ellipsis</value> 
889
	 <value name="parataxis:appos">parataxis:appos</value> <!-- Italian --> 
890
	 <value name="parataxis:conj">parataxis:conj</value> <!-- Naija --> 
891
	 <value name="parataxis:coord">parataxis:coord</value> <!-- Beja --> 
892
	 <value name="parataxis:deletion">parataxis:deletion</value> <!-- Norwegian --> 
893
	 <value name="parataxis:discourse">parataxis:discourse</value> <!-- Italian, Naija, Slovenian, Turkish German, Ukrainian --> 
894
	 <value name="parataxis:dislocated">parataxis:dislocated</value> <!-- Naija --> 
895
	 <value name="parataxis:hashtag">parataxis:hashtag</value> <!-- Irish, Italian --> 
896
	 <value name="parataxis:insert">parataxis:insert</value> <!-- French, Italian, Polish --> 
897
	 <value name="parataxis:mod">parataxis:mod</value> <!-- Beja --> 
898
	 <value name="parataxis:newsent">parataxis:newsent</value> <!-- Ukrainian --> 
899
	 <value name="parataxis:nsubj">parataxis:nsubj</value> <!-- Italian --> 
900
	 <value name="parataxis:obj">parataxis:obj</value> <!-- Bambara, Italian, Polish --> 
901
	 <value name="parataxis:parenth">parataxis:parenth</value> <!-- French, Naija --> 
902
	 <value name="parataxis:rel">parataxis:rel</value> <!-- Ukrainian --> 
903
	 <value name="parataxis:rep">parataxis:rep</value> <!-- Chukchi, Latin, Mbya Guarani --> 
904
	 <value name="parataxis:restart">parataxis:restart</value> <!-- Slovenian --> 
905
	 <value name="parataxis:rt">parataxis:rt</value> <!-- Irish --> 
906
	 <value name="parataxis:sentence">parataxis:sentence</value> <!-- Irish --> 
907
	 <value name="parataxis:trans">parataxis:trans</value> <!-- Turkish German --> 
908
	 <value name="parataxis:url">parataxis:url</value> <!-- Irish --> 
909
	 <value name="parataxis">parataxis</value> 
910
	 <value name="punct">punctuation</value> 
911
	 <value name="remnant">Remnant ?</value> <!-- no doc, replace with orphan? --> 
912
	 <value name="reparandum">overridden disfluency</value> 
913
	 <value name="root">root</value> 
914
	 <value name="vocative:cl">vocative:cl</value> <!-- Ukrainian --> 
915
	 <value name="vocative:mention">vocative:mention</value> <!-- Irish, Italian --> 
916
	 <value name="vocative">vocative</value> 
917
	 <value name="xcomp:cleft">xcomp:cleft</value> <!-- Polish --> 
918
	 <value name="xcomp:ds">xcomp:ds</value> <!-- Erzya, Finnish, Karelian, Komi Permyak, Livvi --> 
919
	 <value name="xcomp:obj">xcomp:obj</value> <!-- North Sami, Polish --> 
920
	 <value name="xcomp:pred">xcomp:pred</value> <!-- Irish, Latin, Manx, North Sami, Polish, Scottish Gaelic --> 
921
	 <value name="xcomp:sp">xcomp:sp</value> <!-- Ukrainian --> 
922
	 <value name="xcomp:subj">xcomp:subj</value> <!-- Polish --> 
923
	 <value name="xcomp">open clausal complement</value>
924
</feature>
925
<feature name="coord" domain="NT" ></feature>
926
<feature name="dom" domain="NT" ></feature>
927
<feature name="type" domain="NT" > 
928
	 <value name="nV">élément non-verbal</value> 
929
	 <value name="VFin">finite verb</value> 
930
	 <value name="VInf">infinitive</value> 
931
	 <value name="VPar">participle</value> 
932
	 <value name="--">nil</value>
933
</feature>
934
<feature name="vform" domain="NT"></feature>
935
<feature name="vlemma" domain="NT"></feature>
936
<feature name="note" domain="NT"></feature>
937
<feature name="snr" domain="NT"></feature>
938
""";
939

  
940

  
941
 
942
	 printf MASTER "$nt_features_header";
943
 
944
	 printf MASTER """
945
<edgelabel> 
946
	 <value name="D">dependency</value> 
947
	 <value name="L">lexical</value> 
948
	 <value name="R">relator</value> 
949
	 <value name="*">not bound</value>
950
</edgelabel>
951
<secedgelabel> 
952
	 <value name="cluster">between elements of GpCoo</value> 
953
	 <value name="coord">between members of Coo</value> 
954
	 <value name="dupl">between duplicated nodes</value>
955
</secedgelabel>
956
</annotation>
957
</head>
958
<body>
959
""";
960
}
961

  
962
//  <value name="M">main</value>
963
//  <value name="P">part</value>
964

  
965

  
966
/**
 * Writes the closing part of the TigerXML master document.
 *
 * Appends the end tags of the <body> and <corpus> elements to MASTER.
 * MASTER is not defined in this block; it is presumably the writer of the
 * master file opened elsewhere in the script (TODO confirm).
 */
def write_master_footer() {
	// A Groovy method declaration needs an (empty) parameter list; the bare
	// "def name {" form was a leftover of the Perl "sub name {" syntax.
	MASTER << """</body>
</corpus>
""";
}
971

  
972

  
973
// Extra NT feature declarations kept in the script variable TEMP.
// NOTE(review): nothing in the visible part of the script reads TEMP —
// confirm whether this fragment is still needed.
// The string holds only the XML lines; stray line-number residue that had
// slipped between them has been removed.
TEMP = """
<feature name="nodom" domain="NT" ></feature>
<feature name="headpos" domain="NT" ></feature>
<feature name="annotationFile" domain="NT" ></feature>
<feature name="annotationUri" domain="NT" ></feature>
""";
979

  
980

  
981
/**
 * Fills abbrev2cat with the mapping from dependency relation labels
 * (e.g. "nsubj") to their human-readable descriptions.
 *
 * NOTE(review): abbrev2cat is not declared in this block; it is assumed to be
 * a Map that is visible to script methods (binding variable or @Field) --
 * confirm against the declarations at the top of the script.
 */
def define_cat_hashes() {
	// Disabled older mapping, kept for reference.
//  abbrev2cat["Apst"] = "Apostrophe";
//  abbrev2cat["AtObj"] = "AttributObjet";
//  abbrev2cat["AtRfc"] = "AttributReflechi";
//  abbrev2cat["AtSj"] = "AttributSujet";
//  abbrev2cat["AuxA"] = "Auxilie-Actif";
//  abbrev2cat["AuxP"] = "Auxilie-Passif";
//  abbrev2cat["Circ"] = "Circonstant";
//  abbrev2cat["Cmpl"] = "Complement";
//  abbrev2cat["GpCoo"] = "Coordonne";
//  abbrev2cat["Coo"] = "Coordination";
//  abbrev2cat["Det"] = "Determinant";
//  abbrev2cat["NgPrt"] = "Forclusif";
//  abbrev2cat["Insrt"] = "Incidente";
//  abbrev2cat["Intj"] = "Interjection";
//  abbrev2cat["ModA"] = "ModifieurAttache";
//  abbrev2cat["ModD"] = "ModifieurDetache";
//  abbrev2cat["Ng"] = "Negation";
//  abbrev2cat["VInf"] = "NoeudVerbal-Infinitif";
//  abbrev2cat["VPrt"] = "NoeudVerbal-Participe"; // ?
//  abbrev2cat["VFin"] = "NoeudVerbal-Personnel";
//  abbrev2cat["nSnt"] = "NonPhrase";
//  abbrev2cat["Obj"] = "Objet";
//  abbrev2cat["Snt"] = "Phrase";
//  abbrev2cat["Pon"] = "Ponctuation";
//  abbrev2cat["Rfc"] = "Reflechi";
//  abbrev2cat["Rfx"] = "ReflexifRenforce";
//  abbrev2cat["RelC"] = "Relateur-Coordonnant";
//  abbrev2cat["RelNC"] = "Relateur-NonCoordonnant";
//  abbrev2cat["nMax"] = "StructureNonMaximale";
//  abbrev2cat["SjImp"] = "SujetImpersonnel";
//  abbrev2cat["SjPer"] = "SujetPersonnel";
//  abbrev2cat["Lac"] = "Lacune";
//  abbrev2cat["Aux"] = "Auxilie";
//  abbrev2cat["Regim"] = "Regime";

	// Active mapping. Map subscript syntax replaces the Perl-style
	// abbrev2cat{"key"} form, which Groovy parses as a method call.
	abbrev2cat["acl"] = "Clausal modifier of noun";
	abbrev2cat["advcl"] = "Adverbial clause modifier";
	abbrev2cat["advmod"] = "Adverbial modifier";
	abbrev2cat["amod"] = "Adjectival modifier";
	abbrev2cat["appos"] = "Appositional modifier";
	abbrev2cat["aux"] = "Auxiliary";
	abbrev2cat["cc-nc"] = "Coordinated conjunct : non coordonant";
	abbrev2cat["cc"] = "Coordinating conjunction";
	abbrev2cat["ccomp"] = "Clausal complement";
	abbrev2cat["conj"] = "Conjunct";
	abbrev2cat["cop"] = "Copula";
	abbrev2cat["csubj"] = "Clausal subject";
	abbrev2cat["det"] = "Determiner";
	abbrev2cat["dislocated"] = "Dislocated elements";
	abbrev2cat["expl"] = "Expletive";
	abbrev2cat["iobj"] = "Indirect object";
	abbrev2cat["mark"] = "Marker";
	abbrev2cat["nmod"] = "Nominal modifier";
	abbrev2cat["nsubj"] = "Nominal subject";
	abbrev2cat["nummod"] = "Numeric modifier";
	abbrev2cat["obj"] = "Object";
	abbrev2cat["obl"] = " Oblique nominal";
	abbrev2cat["orphan"] = "Remnant in ellipsis";
	abbrev2cat["remnant"] = "Remnant ?";
	abbrev2cat["vocative"] = "Vocative";
	abbrev2cat["xcomp"] = "Open clausal complement";
}
1045

  
1046
/**
 * Prints every element of words on its own line (debugging aid).
 *
 * NOTE(review): words is not declared in this block; it is assumed to be a
 * list visible to script methods (binding variable or @Field) -- confirm.
 */
def print_sentence() {
	// Iterate strictly below size(): the previous "<=" bound read one element
	// past the end. "${words[q]}" is needed because "$words[$q]" would
	// interpolate the whole list followed by a literal "[q]".
	for (def q = 0; q < words.size(); q++) {
	  print "${words[q]}\n";
	}
}
1051

  
1052
// Writes one <nt> element to XML for the current word w: the element itself,
// an "L" edge to the terminal s<sentence>_<w>, an optional "dupl" secondary
// edge, and (for non-duplicates) one edge per entry of daughters.
//   first argument  ($_[0]): category written into cat="..."; also matched
//                            against thisrootname to choose the extra NT features
//   second argument ($_[1]): when it matches /dupl/ the node is written as a
//                            duplicate (id suffix "_dupl", no daughter edges)
// NOTE(review): this body is still largely Perl and is not valid Groovy:
//   - the declaration has no parameter list, arguments are read via $_[n]
//   - "$." (Perl input record number, used here as sentence number) has no
//     Groovy counterpart visible in this file
//   - "=~ /re/" and "!~ /re/" are used as Perl match tests
//   - "printf XML ..." uses a Perl file handle
//   - duplicates{"..."} is Perl hash access
def write_nonterminals { 
1053
	 def print_nt_features; 
1054
	 // choose the id suffix for duplicate nodes
	 if ($_[1] =~ /dupl/) { 
1055
	dupl = '_dupl'; 
1056
	 } else { 
1057
	dupl = ''; 
1058
	 } 
1059
	 // extra NT features: the empty variant by default, the full variant only
	 // when the category matches thisrootname
	 if (nt_features != '') { 
1060
	print_nt_features = nt_empty_features; 
1061
	if ($_[0] =~ /$thisrootname/) { 
1062
	  print_nt_features = nt_features; 
1063
	} 
1064
	 } 
1065
	 printf XML "   <nt id=\"n%d_%d%s\" cat=\"%s\" coord=\"--\" dom=\"%s\" type=\"%s\" vform=\"%s\" vlemma=\"%s\"%s note=\"%s\" snr=\"%d\">\n", $., w, dupl, $_[0], dom, type, vform, vlemma, print_nt_features, notes("$._$w"), $.; 
1066
	 printf XML "  <edge idref=\"s%d_%d%s\" label=\"L\"/>\n", $., w, dupl; 
1067
	 // link duplicate with primary original node 
1068
	 if ($_[1] =~ /dupl/) { 
1069
	printf XML "  <secedge idref=\"s%d_%d\" label=\"dupl\"/>\n", $., w; 
1070
	 } 
1071
	 // if node is not a duplicate: attach all the daughters 
1072
	 if ($_[1] !~ /dupl/) { 
1073
	// NOTE(review): "d <= daughters.size()" runs one index past the last
	// element -- "<" looks intended; confirm.
	for (def d = 0; d <= daughters.size(); d++) { 
1074
	  daughter = daughters[d]; 
1075
	  if ("$._$w" != "$._$daughter") { // avoid cycles
1076
	printf XML "  <edge idref=\"n%d_%d%s\" label=\"%s\"/>\n", $., daughter, dupl, edge_label("$._$daughter"); 
1077
	  } 
1078
	  // check if a duplicate of this node must be attached 
1079
	  if (duplicates{"$._$daughter"} == 1) {
1080
	// NOTE(review): "$daughter_dupl" reads as one variable name, not as
	// $daughter followed by "_dupl" -- confirm which key is intended.
	printf XML "  <edge idref=\"n%d_%d_dupl\" label=\"%s\"/>\n", $., daughter, edge_label("$._$daughter_dupl"); 
1081
	  } 
1082
	} 
1083
	 } 
1084
	 XML << "   </nt>\n";
1085
}
1086

  
1087
/**
 * Returns the edge label for a node: 'R' when the node is stored in the
 * relators map with value 1, otherwise the default 'D'.
 *
 * @param node key of the node as used in relators (callers build it as
 *             "<sentence>_<word>")
 * @return 'R' or 'D'
 *
 * NOTE(review): relators is declared with "def" at script level; such
 * variables are not visible inside script methods unless they are binding
 * variables or annotated with @Field -- confirm the declaration.
 */
def edge_label(node) {
	 // Normalise to String: a GString key never matches a String key in a Map.
	 if (relators[node?.toString()] == 1) {
	return 'R';
	 }
	 return 'D';
}
1094

  
1095
/**
 * Retrieves the note stored for a node.
 *
 * @param node key of the node as used in the notes map
 * @return the stored note as a String, or '--' when there is no entry or the
 *         entry is empty
 *
 * NOTE(review): this method shares its name with the script-level map
 * "notes". Inside the method, notes[...] resolves to that property; whether
 * the map is reachable from here (binding variable or @Field) and whether
 * callers can still invoke notes(...) next to a local variable of the same
 * name should be confirmed.
 */
def notes(node) {
	 def note = notes[node?.toString()];
	 // A missing entry is null in Groovy; treat it like the empty string so
	 // it is not rendered as "null".
	 if (note != null && note != '') {
	return note.toString();
	 }
	 return '--';
}
1102

  
1103
/**
 * Conversions and bug fixes applied in place to the fields of the current
 * token: word, pos, mor, ppos, pmor, lemma, plemma and cat.
 *
 * The Perl substitutions (x =~ s/.../.../g) are expressed with String
 * methods, in the same order. Fields that are null are left null.
 *
 * NOTE(review): all fields, and thisrootname, are read and written as
 * variables from outside this block (script binding or @Field) -- confirm
 * they are declared that way.
 */
def clean_data() {
	// lemma and plemma receive the same four substitutions
	def cleanLemma = { s ->
		s?.replace('|', '_')?.replaceAll(/[<>]/, '')?.replace('"', "'")?.replace('&', '(and)')
	}

	// conversions of values, some necessary some for convenience
	word = word?.replace('"', "'")?.replace('&', '(and)')?.replace('<<', '«')?.replace('>>', '»')?.replaceAll(/[<>]/, '');
	pos = pos?.replace(':', '_');
	mor = mor?.replace('|', '_');
	ppos = ppos?.replace(':', '_');
	pmor = pmor?.replace('|', '_');
	lemma = cleanLemma(lemma);
	plemma = cleanLemma(plemma);

	// clean categories
	// top node, for compatibility with SRCMF; only the first ROOT is replaced.
	// quoteReplacement keeps '$' or '\' in the root name literal.
	cat = cat?.replaceFirst(/ROOT/, java.util.regex.Matcher.quoteReplacement((thisrootname ?: '').toString()));
	// disabled conversions, kept for reference:
//	cat: Ponctuation -> Pon
	// correct some bugs in parse
//	cat: Sujet -> SjPer
//	cat: Modifieur -> ModA
//	cat: Parenthese -> Insrt
	cat = cat?.replace('-', '_');
	// RelNC always 2nd node, for consistency in duplicates
	cat = cat?.replaceFirst(/RelNC_(.*)/, '$1_RelNC');
	return;
}
1135

  
1136

  

Formats disponibles : Unified diff