Révision 3347

TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 3347)
177 177
							
178 178
							if (id.contains("-")) { // multi-word line
179 179
								int index = id.indexOf("-")
180
								String id1 = id.substring(0, id)
181
								String id2 = id.substring(id+1)
180
								String id1 = id.substring(0, index)
181
								String id2 = id.substring(index+1)
182 182
								def token1 = sentence[id1]
183 183
								def token2 = sentence[id2]
184 184
								
TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/conll2tiger-ud.pl.tdy (revision 3347)
1
#!/usr/bin/perl
2

  
3
use File::Basename;
4

  
5
my $CMD      = "conll2tiger.pl";
6
my $VERSION  = "1.5";
7
my $MODIFIED = "8/12/2015";        # adapted for Perseus CoNLL generated with conll.pl -l.  CHECK: does SRCMF still work?
8

  
9
# columns (default are the predicted values in CoNLL 2009 format)
10
my $coll   = 2;                    # lemma
11
my $colm   = 3;                    # morph (pos)
12
my $colf   = 5;                    # features
13
my $colh   = 6;                    # head
14
my $cold   = 7;                    # deprel
15
my $outdir = "conllexport";        # output directory (option -o)
16
my $split  = 1000;                 # split output after nr sentences
17

  
18
# tree structure
19
my %dominates          = ();
20
my %deprel             = ();                                         # deprel{nr} = deprel
21
my @daughters          = ();                                         # daughter nodes, stored in %dominates
22
my %duplicates         = my %relators = my %notes = my %aux = ();    # store nodes of duplicates, relators
23
my $type               = "--";                                       # node attribute
24
my $vform              = my $vlemma = "--";                          # node attributes for verbs store form and lemma
25
my $label              = "D";                                        # default edge label
26
my $nt_features_header = '';                                         # option -x
27
my $nt_features        = '';                                         # option -x
28
my $nt_empty_features  = '';                                         # option -x
29
my @scodes             = ();                                         # option -x
30
my $add_to_sentcode    = '';
31
my $rootname           = 'root';                                     # default
32
my $featcol            = 13;
33

  
34
######################################################################
35
#  conll2tiger.pl: converts CoNLL-U from the Universal Dependencies
36
#  treebanks to TigerXML
37
#          Achim Stein <achim.stein@ling.uni-stuttgart.de>
38
# License : GNU GPL v. 3 (see the LICENSE file)
39
######################################################################
40
# TO DO:
41
# - coordination
42
# - handling of overly long, split sentences (conll.pl -r 100)
43
#   - for this, $wnr was used as the word ID instead of $w (the for-loop counter)
44
#   - but there are still unbound nodes when the governor has been removed (it is in the other part)
45
######################################################################
46
# Modifications by Alexei Lavrentiev <alexei.lavrentev@ens-lyon.fr>
47
# for Profiterole project (2019-2021)
48

  
49
# 2019-09-25
50
# - updated default column numbers for CoNLL-U SRCMF format
51
# - added processing for comment lines
52
# - added @textid to terminal nodes
53
# - deleted ppos, pmor et plemma (predicted tags and lemmas)
54
# - replaced specific SRCMF with standard UD tags
55

  
56
# Update 2020-05-13
57
# - added @editionId for synchronization with BFM word ID
58

  
59
# Update 2021-03-22
60
# - using $infilename for @textid
61
# - added support for .conllu extension
62

  
63
# Update 2021-03-29
64
# - added editionId to declarations in main.xml
65

  
66
# Update 2021-07-16
67
# - added "punct" to cat values
68

  
69
# Update 2021-07-20
70
# - added cat value list compiled from
71
#   https://universaldependencies.org/ext-dep-index.html and the previous
72
#   version. All relation types and subtypes from the UD 2.8 corpora
73
#   should be there.
74
# - contractions indexed
75
######################################################################
76

  
77
my $HELP = "
78
==================================================================
79
$CMD $VERSION: Help
80
==================================================================
81
FUNKTION: converts CoNLL parser output to TigerXML (for mate tools)
82
          creates master file, splits input files, corrects unbound nodes
83
SYNTAX:      $CMD [Options] <CoNLL file>
84
OPTIONEN:
85
 -c          ignore coordination (delete coordx- prefix in deprel)
86
 -C str      corpus specials: nca
87
 -h          show help
88
 -o          create all files in this output directory (default: $outdir)
89
set COLUMNS for required info (0 = column 1, 1 = column 2, etc.)
90
 -D nr       colum for deprel default=$cold
91
 -H nr       colum for head default=$colh
92
 -M nr       colum for morphology (POS) default=$colm
93
 -F nr       colum for morph. features default=$colf
94
 -R str      Root category (default: $rootname)
95
 -s nr       split output files after each nr sentence (default = $split)
96
 -x str,...  include these attributes if present in the -X column of the first word
97
             (the first code is also copied into the sentence id)
98
 -X nr       the column where attributes are stored (default: $featcol)
99
EXAMPLE:
100
  - For mate parser output: no further options required
101
    $CMD parsed.conll
102
  - For Le Monde 2005: include attributes
103
    gunzip -c parsed.conll.gz | conll2tiger.pl -x date,artnr,rubr
104
  - For NCA:
105
    conll2tiger.pl -C nca -x id,deaf,titreDees,editionDees,manuscritDees,regionDees,coefficientRegionDees,dateMoyenneDees,codeRegional,coefficientRegional,vers,ponctuation,mots,passage,commentairePhilologique,qualite,sourceQualite,commentaireForme,auteur,dateComposition,dateManuscrit,lieuComposition,lieuManuscrit,sourceDateComposition,sourceDateManuscrit,sourceLieuComposition,sourceLieuManuscrit,genre,traditionTextuelle,analyses,lignes,editionNCA tagged-oldfrench-lrec2014-dep.conll
106
";
107

  
108
###########################################################################
109
#                    DO NOT MODIFY FOLLOWING CODE !
110
###########################################################################
111

  
112
###########################################################################
113
# parse the command line
114
###########################################################################
115

  
116
use Getopt::Std;
117
getopts('cC:hD:H:M:o:R:s:x:X:');    # -c is a plain flag (see $HELP); declared as 'c:' it would swallow the next argument, e.g. the input file name
118

  
119
if ( defined($opt_h) ) {
120
    print STDERR "$HELP";
121
    exit(0);
122
}
123

  
124
if ( defined($opt_o) ) {
125
    $outdir = $opt_o;
126
}
127
if ( defined($opt_C) ) {
128
    $corpus = $opt_C;
129
}
130
if ( defined($opt_D) ) {
131
    $cold = $opt_D;
132
}
133
if ( defined($opt_H) ) {
134
    $colh = $opt_H;
135
}
136
if ( defined($opt_M) ) {
137
    $colm = $opt_M;
138
}
139

  
140
if ( defined($opt_R) ) {
141
    $rootname = $opt_R;
142
}
143

  
144
if ( defined($opt_s) ) {
145
    $split = $opt_s;
146
}
147

  
148
if ( defined($opt_X) ) {
149
    $featcol = $opt_X;
150
}
151

  
152
if ( defined($opt_x) ) {
153
    @scodes = split( ",", $opt_x );
154
    for ( my $i = 0 ; $i <= $#scodes ; $i++ ) {
155
        $nt_features_header = $nt_features_header . sprintf( "<feature name=\"%s\" domain=\"NT\"></feature>\n", $scodes[$i] );
156
    }
157
    $nt_features_header =~ s/\bid\b/ncaid/;    # avoid reserved Tiger attribute "id"
158
}
159

  
160
my @colnames = ( "url", "ID", "FORM", "LEMMA", "PLEMMA", "POS", "PPOS", "FEAT", "PFEAT", "HEAD", "PHEAD", "DEPREL", "PDEPREL" );
161

  
162
# my %pos = %lemma = %form = %deprel = %head = %governs = %cat2abbrev = %abbrev2cat = %coordnr_cat = %coordnr_head = %w_coordnr = %w_head = %w_deprel = %first_coordnr = %coordnr_first = %verb_head = ();
163
my @coordelements = ();
164

  
165
my $id = my $form = my $lemma = my $plemma = my $pos = my $ppos = my $feat = my $pfeat = my $head = my $phead = my $deprel = my $pdeprel = my $edition_id = "";
166

  
167
my $timestamp = `date`;
168
chomp($timestamp);
169

  
170
my $infile = $ARGV[0];
171
$infile =~ s/\.conllu?\z//i;    # strip the .conll/.conllu extension only at the end of the name, not the first ".conll" anywhere in the path
172
if ( $infile eq '' ) {
173
    $infile = 'subcorpus';
174
}
175
my $counter = 1;
176
$suffix = sprintf( "%05d", $counter );
177
$infilename = basename($infile);
178

  
179
mkdir($outdir) unless -d $outdir;    # builtin mkdir: no shell, so names with spaces or metacharacters are safe; a failure surfaces at the open() calls that follow
180
open( XML,    ">$outdir/$infilename-$suffix.xml" ) or die "\nopen file error of $outdir/$infilename-$suffix.xml\n";
181
open( LOG,    ">$outdir/conversion.log" )          or die "\nopen file error of conversion.log\n";
182
open( MASTER, ">$outdir/main.xml" )                or die "\nopen file error of main.xml\n";
183
write_xml_header();
184
write_master_header();
185

  
186
# flush output for log and master file
187
select(LOG);
188
$| = 1;
189
select(MASTER);
190
$| = 1;
191

  
192
$commandline = $0 . " " . ( join " ", @ARGV );
193
print LOG "$commandline\n\n";
194

  
195
print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
196

  
197
$/ = "";    # treat empty line as RS
198
while (<>) {
199
    if ( $. % $split == 0 ) {
200
        print XML "</subcorpus>\n";
201
        close(XML);
202
        $suffix = sprintf( "%05d", ++$counter );
203
        open( XML, ">$outdir/$infilename-$suffix.xml" ) or die "\nopen file error\n";
204
        write_xml_header();
205
        print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
206
    }
207

  
208
    # ----------------------------------------
209
    # set root (or fake root if ROOT is missing)
210
    # ----------------------------------------
211
    $rootnode = $fakeroot = 0;
212
    $thisrootname = $rootname;
213
    ($rootnode) = (/^(\d+?)\s.*?\b$rootname\b/m);    # real root marked by parser (/m = treat string as multiple lines, so that ^ matches beginning of line)
214
    if ( $rootnode == 0 ) {
215

  
216
        #    ($rootnode) = (/^(\d+?)\t.*?\t-1\t0\t/m);  # no marked ROOT, but top node (head = 0)   TOO SPECIFIC
217
        #    ($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);  # no marked ROOT, but top node (head = 0) in col9 (original by Achim Stein)
218
        ($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);    # no marked ROOT, but top node (head = 0) in col7 (updated by AL)
219
        print LOG " Warning sentence $.: not marked root ($rootname), using top node $rootnode\n";
220
        $fakeroot     = 1;
221
        $thisrootname = 'nSnt';
222
    }
223
    if ( $rootnode == 0 ) {
224
        $rootnode = 1;                                               # set fake root if nothing goes
225
        print LOG " Error sentence $.: setting fake root to first word:\n$_\n";
226
        $fakeroot     = 2;
227
        $thisrootname = 'Err';
228
    }
229

  
230
    my @cols = ();
231
    @words     = split(/\n/);
232
    @terminals = ();
233

  
234
    %dominates = ();                                                 # empty at beginning of sentence
235
    %deprel    = ();                                                 # empty at beginning of sentence
236
    %aux       = ();                                                 # empty at beginning of sentence
237
    @daughters = ();
238

  
239
    my $commentlines = 0;                                            #added by AL
240

  
241
    #  my $contractions = 0; #added by AL
242
    #  my $text_id = "unknown_text";
243
    my $text_id = $infilename;
244
    my $sent_id = "0";
245

  
246
    # ----------------------------------------
247
    # loop through words #1: write tokens (terminal nodes) to XML file
248
    # store tree relevant information for loop #2
249
    # ----------------------------------------
250
    for ( my $w = 0 ; $w <= $#words ; $w++ ) {
251

  
252
        # Added by AL for comment lines
253
        if ( $words[$w] =~ /^#/ ) {
254
            if ( $words[$w] =~ /^# newdoc/ ) {
255
                $text_id = $words[$w];
256
                $text_id =~ s/# newdoc id = //;
257
            }
258
            elsif ( $words[$w] =~ /^# sent_id/ ) {
259
                $sent_id = $words[$w];
260
                $sent_id =~ s/# sent_id = //;
261
            }
262

  
263
            #	print LOG "Comment line loop 1: $words[$w]\n";
264
            $commentlines++;
265
            next;
266
        }
267

  
268
        # Added by AL for contractions
269
        elsif ( $words[$w] =~ /^\d+-\d+/ ) {
270

  
271
            #	print LOG "Contraction line loop 1: $words[$w]\n";
272
            $commentlines++;
273

  
274
            #	$contractions++;
275
            next;
276
        }
277
        else {
278
            if ( defined($opt_c) ) {
279
                $words[$w] =~ s/coord(\d+)-//g;
280
            }
281
            @cols       = split( /\t/, $words[$w] );
282
            $wnr        = $cols[0];
283
            $word       = $cols[1];
284
            $lemma      = $cols[2];
285
            $plemma     = $cols[2];                    # predicted
286
            $pos        = $cols[3];
287
            $ppos       = $cols[4];                    # predicted
288
            $mor        = $cols[5];
289
            $pmor       = $cols[5];                    # predicted
290
            $cat        = $cols[$cold];
291
            $edition_id = $cols[9];
292
            $edition_id =~ s/^.*XmlId=([^|]+).*$/$1/g;    # keep only the XmlId value; $1 (not \1) is the correct form on the replacement side
293

  
294
            if ( $cat =~ /[<>]/ ) {
295
                print LOG "Warning in line $.: illegal node name: \"$cat\" -> \"Err2\"\n";
296
                $cat = 'Err2';
297
            }
298

  
299
            # NCA: enclose lemmas in underscores (easier for regex construction)
300
            if ( $corpus =~ /nca/i ) {
301
                $lemma = "_" . "$lemma" . "_";
302
            }
303

  
304
            clean_data();
305

  
306
            # get attribute-value pairs from col #13 of first word (option -x)
307
            if ( defined($opt_x) && $opt_x eq "all" ) {    # string comparison: "==" is numeric and was true for an undefined or any non-numeric -x value
308
                $cols[$featcol] = "all=" . $cols[$featcol];
309
            }
310
            if ( $w == 0 && $cols[$featcol] =~ /=/ ) {
311

  
312
                #      print STDERR "========== getting att-value for word $w: $cols[$featcol] scodes=@scodes\n";
313
                $nt_features = $nt_empty_features = '';
314

  
315
                #      while($cols[$featcol] =~ m/ (.*?)="([^"]*)"/gs) {   # quoted values
316
                while ( $cols[$featcol] =~ m/ ?([^=]*?)="?([^, ]+)\b"?\b/gs ) {    # maybe unquoted values (e.g. Le Monde 2005)
317
                    $att = $1;
318
                    $val = $2;
319

  
320
                    # pick the attributes that match those of the command line option -x
321
                    for ( my $t = 0 ; $t <= $#scodes ; $t++ ) {
322
                        if ( $att eq $scodes[$t] ) {
323
                            $val =~ s/\&/\&amp;/g;                                 #  replace "&" in values (appears in URLs)
324
                            if ( $t == 0 ) { $add_to_sentcode = "_$att$val"; }
325
                            $nt_features = $nt_features . " $att=\"$val\"";
326

  
327
                            #	    print STDERR "$./$w/$featcol: $cols[$featcol] --- nt_features: $nt_features\n";
328
                        }
329
                        if ( $att eq $scodes[$t] ) { $nt_empty_features = $nt_empty_features . " $att=\"--\""; }
330
                    }
331
                }
332

  
333
                # replace the reserved feature 'id' (Tiger)
334
                $add_to_sentcode =~ s/\bid=/ncaid=/;
335
                $nt_features =~ s/\bid=/ncaid=/;
336
                $nt_empty_features =~ s/\bid=/ncaid=/;
337
            }    # if col 13 contains attributes
338
            else {
339
                if ( defined($opt_x) && ( $w == 0 ) ) {
340
                    print STDERR "Warning: sentence=$.  option -x is defined, but no attribute=value declarations were found!\n";
341
                }
342
            }
343

  
344
            # store output for terminal node in array, output later. For double categories make a duplicate node.
345
            $tempid = sprintf( "%d_%d", $., $wnr );
346

  
347
#    push(@terminals, sprintf("      <t id=\"s%d_%d\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" ppos=\"%s\" pmor=\"%s\" plemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., $wnr, $word, $pos, $mor, $lemma, $ppos, $pmor, $plemma, $text_id, $edition_id));
348
            push( @terminals,
349
                sprintf( "      <t id=\"s%d_%d\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., $wnr, $word, $pos, $mor, $lemma, $text_id, $edition_id ) );
350
            if ( $cat =~ /_/ ) {
351

  
352
#      push(@terminals, sprintf("      <t id=\"s%d_%d_dupl\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" ppos=\"%s\" pmor=\"%s\" plemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., $wnr, "*", "_", "_", "_", "_", "_", "_", $text_id, $edition_id));
353
                push( @terminals,
354
                    sprintf( "      <t id=\"s%d_%d_dupl\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., $wnr, "*", "_", "_", "_", $text_id, $edition_id ) );
355
                $duplicates{$tempid} = 1;    # store, check later to attach the duplicates to the mother
356
            }
357

  
358
            # associate Aux with main verb, to create an attribute in the verb node in loop #2 (TODO: more than one Aux)
359
            if ( $cat =~ /Aux/ ) {
360
                $aux{ $cols[$colh] } = "$word" . "_" . "$plemma";    # $aux{head} = word_lemma (of Aux)
361
            }
362

  
363
            # ----------------------------------------
364
            # store information needed for tree
365
            # ----------------------------------------
366
            # if fake rootnode == 1: nSnt as root node
367
            if ( ( $fakeroot == 1 ) && ( $w - $commentlines + 1 == $rootnode ) ) {
368
                $cat = 'nSnt';
369
                $notes{$tempid} = 'Warning no marked ROOT node in CoNLL';    # TODO: geht nicht
370
            }
371

  
372
            # if fake rootnode == 2: flatten structure: attach all words to the first word
373
            if ( ( $fakeroot == 2 ) && ( $w - $commentlines + 1 != $rootnode ) ) {
374
                $cols[$colh] = 1;
375
                $notes{$tempid} = 'Error neither ROOT node nor top node in CoNLL';
376
            }
377

  
378
            # correct unbound words in parser output (phead = 0, but not marked as ROOT)
379
            if ( ( $cols[$colh] eq "0" ) && ( $w - $commentlines + 1 != $rootnode ) ) {    #AL: added: -$commentlines
380
                printf LOG " Warning sentence $. ($tempid): unbound node %d (attached to root %d)\n", ( $w - $commentlines + 1 ), $rootnode;
381
                $cols[$colh]    = $rootnode;
382
                $cat            = 'Err';                                                   # let Err instead of deprel appear in dom attribute
383
                $notes{$tempid} = 'Warning unbound node in CoNLL';
384
            }
385

  
386
            # store for R edge labels
387
            if ( $cols[$cold] =~ /RelN?C/ ) {
388
                $relators{$tempid} = 1;
389
            }
390

  
391
            # store deprel for dom attribute
392
            $deprel{$tempid} = $cat;                                                       # $cols[$cold];
393
                                                                                           # if real root, add this node to daughter array, store array in hash dominates{head}{@daughters}
394
            if ( ( $fakeroot < 2 ) && ( $w - $commentlines + 1 != $rootnode ) ) {
395
                @daughters = @{ $dominates{ $cols[$colh] } };                              # get the array from the hash of the dominating node
396
                push( @daughters, $wnr );
397
                $dominates{ $cols[$colh] } = [@daughters];
398
            }
399
        }    # AL condition end (else branch: regular token line)
400
    }    # for each word loop #1
401

  
402
    # print graph code (needs root attribute) and terminal nodes
403
    if ( $rootnode == 0 ) {
404
        $noroot++;
405
        print LOG "Error sentence $. ($tempid): root node not found:\n$_\n";
406
        next;
407
    }
408
    else {
409
        printf XML "<s id=\"s%s%s\" textid=\"$text_id\" sentid=\"$sent_id\">\n", $., $add_to_sentcode;
410
        print XML "  <graph root=\"n$._$rootnode\">\n";
411
        print XML "    <terminals>\n";
412
        for ( my $t = 0 ; $t <= $#terminals ; $t++ ) {
413
            print XML $terminals[$t];
414
        }
415
        print XML "    </terminals>\n";
416
    }
417

  
418
    # ----------------------------------------
419
    # loop through words #2 to build Tiger tree (non terminal nodes)
420
    # ----------------------------------------
421
    print XML "    <nonterminals>\n";
422
    for ( my $i = 0 ; $i <= $#words ; $i++ ) {
423

  
424
        #Added AL for comment lines
425
        if ( $words[$i] =~ /^#/ ) {
426

  
427
            #       print LOG "Comment line loop 2 : $words[$i]\n";
428
            next;
429
        }
430

  
431
        #Added AL for contractions
432
        if ( $words[$i] =~ /^\d+-\d+/ ) {
433

  
434
            #       print LOG "Contraction loop 2 : $words[$i]\n";
435
            next;
436
        }
437

  
438
        else {
439

  
440
            @cols = split( /\t/, $words[$i] );
441
            $w = $cols[0];
442
            ### TODO: redundant variable assignment (= loop #1)??
            ### NOTE(review): the column indices below differ from loop #1, which reads
            ### $pos/$ppos/$mor from columns 3/4/5; here $pos is column 4 and $mor is column 6.
            ### Confirm which mapping is intended for CoNLL-U input.
443
            $word   = $cols[1];
444
            $lemma  = $cols[2];
445
            $plemma = $cols[3];       # predicted
446
            $pos    = $cols[4];
447
            $ppos   = $cols[5];       # predicted
448
            $mor    = $cols[6];
449
            $pmor   = $cols[7];       # predicted
450
            $cat    = $cols[$cold];
451

  
452
            if ( $cat =~ /[<>]/ ) {
453
                print LOG "Warning in line $.: illegal node name: \"$cat\" -> \"Err2\"\n";    # handle is LOG (case-sensitive); "Log" was never opened, so the warning was lost
454
                $cat = 'Err2';
455
            }
456

  
457
            #  OF parser has not learned punctuation: set cat for punctuation to PON
458
            if ( ( $corpus =~ /nca/i ) && ( $pos eq 'PON' ) ) {
459
                $cols[$cold] = $cat = 'Pon';
460
            }
461

  
462
            clean_data();
463

  
464
            # retrieve daughters, make dom attribute (string of dominated nodes)
465
            @daughters = @{ $dominates{"$w"} };
466
            $dom       = '';
467
            for ( my $d = 0 ; $d <= $#daughters ; $d++ ) {
468
                $dom = $dom . "_" . $deprel{"$._$daughters[$d]"};
469
            }
470
            if ( $dom =~ /_/ ) {
471
                $dom =~ s/^_//;
472
            }
473
            else {
474
                $dom = '--';
475
            }
476

  
477
            # if verbal, set node attributes for verb form and lemma
478
            $type = "nV";
479
            $vform = $vlemma = "--";
480
            if ( $pos =~ /VER/ ) {    # AL: $ppos -> $pos
481
                if    ( $mor =~ /infi/ )       { $type = "VInf"; }    #AL: $pmor -> $mor
482
                elsif ( $pmor =~ /pper|ppre/ ) { $type = "VPar"; }
483
                else                           { $type = "VFin"; }
484

  
485
                # if Aux is present, create attribute for main verb
486
                if ( $aux{$w} =~ /(.*?)_(.*)/ ) {
487
                    $vform  = "$1";
488
                    $vlemma = "$2";
489
                }
490

  
491
                # else create attr for simple verb
492
                else {
493
                    $vform  = $word;
494
                    $vlemma = $lemma;    # AL: $plemma -> $lemma (always void in SRCMF)
495
                }
496

  
497
                # NCA: enclose lemmas in underscores (easier for regex construction)
498
                if ( $corpus =~ /nca/i ) {
499
                    $vlemma = "_" . "$vlemma" . "_";
500
                }
501
            }
502

  
503
            # call output function (twice for duplicate categories)
504
            if ( $cat =~ /(.*?)_(.*)/ ) {
505
                write_nonterminals( "$2", "" );         # RelNC is always node (see clean categories), function is duplicate, e.g. SjPer_RelNC
506
                write_nonterminals( "$1", "_dupl" );    # other category is duplicate
507
            }
508
            else {
509
                write_nonterminals($cat);
510
            }
511
        }    #AL end condition (else branch: regular token line)
512
    }    # for words (loop #2)
513

  
514
    print XML "    </nonterminals>\n";
515
    print XML "  </graph>\n";
516
    print XML "</s>\n";
517
    if ( $. % 100 == 0 ) { print STDERR "\b\b\b\b\b\b\b\b"; printf STDERR "%08d", $.; }
518
}    # main
519
print XML "</subcorpus>\n";
520
print STDERR "\n$CMD: $. sentences converted. Results in $outdir. Log in $outdir/conversion.log.\n";
521
print STDERR "   Hint 1: on OS X convert master file to MacRoman, e.g  iconv -f latin1 -t macroman\n";
522
print STDERR "   Hint 2: use tiger.pl -c <Tiger XML file> to detect unbound nodes.\n";
523
print STDERR "   Hint 3: build reliable feature declarations using tiger.sh\n";
524
print STDERR "           tiger.sh -a \"lemma word pos ppos\"  (for terminals)\n";
525
print STDERR "           tiger.sh -A \"lemma word pos ppos\"  (for non-terminals)\n";
526
if ( $noroot > 0 ) { print STDERR "$noroot sentences ignored: root not found (see log file)\n"; }
527
write_master_footer();
528
close(MASTER);
529
close(XML);
530
close(LOG);
531

  
532
exit;
533

  
534
# ----------------------------------------
535
# sub
536
# ----------------------------------------
537

  
538
# Write the XML declaration and the opening <subcorpus> element to the XML
# filehandle. Takes no arguments; the subcorpus name is built from the
# globals $infilename and $suffix.
sub write_xml_header {
539
    print XML "<?xml version=\"1.0\" encoding=\"UTF-8\"?>
540
  <subcorpus name=\"$infilename-$suffix\">
541
";
542
}
543

  
544
sub write_master_header {
545
    printf MASTER '<?xml version="1.0" encoding="UTF-8"?>
546
';
547

  
548
    printf MASTER "<corpus id=\"$corpus\">
549
<head>
550
  <meta><name>$corpus</name> 
551
    <author>ILR Stuttgart</author> 
552
    <date></date> 
553
    <description>Parsed with mate tools using a SRCMF-based grammar model (http://srcmf.org). </description> 
554
    <format>SRCMF</format>
555
    <history>TigerXML converted by conll2tiger.pl</history>
556
  </meta>
557
";
558

  
559
    #  printf MASTER '<annotation>
560
    #<feature name="word" domain="T" ></feature>
561
    #<feature name="pos" domain="T" ></feature>
562
    #<feature name="mor" domain="T" ></feature>
563
    #<feature name="lemma" domain="T" ></feature>
564
    #<feature name="ppos" domain="T" ></feature>
565
    #<feature name="pmor" domain="T" ></feature>
566
    #<feature name="plemma" domain="T" ></feature>
567
    #<feature name="cat" domain="NT" >
568
    #  <value name="Apst">apostrophe</value>
569
    #  <value name="AtObj">attribut d objet</value>
570
    #  <value name="AtRfc">attribut réfléchi</value>
571
    #  <value name="AtSj">attribut de sujet</value>
572
    #  <value name="AttributReflechi">attribut réfléchi</value>
573
    #  <value name="Aux">auxilié</value>
574
    #  <value name="AuxA">auxilié actif</value>
575
    #  <value name="AuxP">auxilié passif</value>
576
    #  <value name="Circ">circonstant</value>
577
    #  <value name="Circ_RelNC">circonstant pronom relatif</value>
578
    #  <value name="Cmpl">complément</value>
579
    #  <value name="Cmpl_RelNC">complément pronom relatif</value>
580
    #  <value name="Coo">coordination</value>
581
    #  <value name="Det">déterminant</value>
582
    #  <value name="Err">unbound node in CoNLL input</value>
583
    #  <value name="Err2">illegal node name was replaced</value>
584
    #  <value name="GpCoo">coordonné</value>
585
    #  <value name="Ignorer">Ignorer</value>
586
    #  <value name="Insrt">incidente</value>
587
    #  <value name="Intj">interjection</value>
588
    #  <value name="Lac">lacune</value>
589
    #  <value name="ModA">modifieur attaché</value>
590
    #  <value name="ModD">modifieur détaché</value>
591
    #  <value name="Ng">négation</value>
592
    #  <value name="NgPrt">forclusif</value>
593
    #  <value name="Obj">objet</value>
594
    #  <value name="Obj_RelNC">direct object pronom relatif</value>
595
    #  <value name="Pon">ponctuation</value>
596
    #  <value name="PON">ponctuation</value>
597
    #  <value name="Regim">régime</value>
598
    #  <value name="RelC">relateur coordonnant</value>
599
    #  <value name="RelNC">relateur non coordonnant</value>
600
    #  <value name="Rfc">réfléchi</value>
601
    #  <value name="Rfx">réfléxif renforcé</value>
602
    #  <value name="SjImp">sujet impersonnel</value>
603
    #  <value name="SjPer">sujet personnel</value>
604
    #  <value name="SjPer_RelNC">sujet personnel pronom relatif</value>
605
    #  <value name="Snt">phrase</value>
606
    #  <value name="ROOT">phrase</value>
607
    #  <value name="StructureMaximale">structure maximale</value>
608
    #  <value name="VFin">verbe fini</value>
609
    #  <value name="VInf">verbe infinitif</value>
610
    #  <value name="nMax">structure non-maximale</value>
611
    #  <value name="nSnt">non-phrase</value>
612
    #</feature>
613
    #<feature name="coord" domain="NT" ></feature>
614
    #<feature name="dom" domain="NT" ></feature>
615
    #<feature name="type" domain="NT" >
616
    #  <value name="nV">élément non-verbal</value>
617
    #  <value name="VFin">verbe fini</value>
618
    #  <value name="VInf">verbe infinitif</value>
619
    #  <value name="VPar">verbe participial</value>
620
    #  <value name="--">nil</value>
621
    #</feature>
622
    #<feature name="vform" domain="NT"></feature>
623
    #<feature name="vlemma" domain="NT"></feature>
624
    #<feature name="note" domain="NT"></feature>
625
    #<feature name="snr" domain="NT"></feature>
626
    #';
627

  
628
    printf MASTER '<annotation>
629
<feature name="word" domain="T" ></feature>
630
<feature name="pos" domain="T" ></feature>
631
<feature name="mor" domain="T" ></feature>
632
<feature name="lemma" domain="T" ></feature>
633
<feature name="textid" domain="T" ></feature>
634
<feature name="editionId" domain="T" ></feature>
635
<feature name="cat" domain="NT" >
636
  <value name="__UNDEF__">UNDEFINED !!!</value>
637
  <value name="acl:adv">acl:adv</value> <!-- Ukrainian -->
638
  <value name="acl:attr">acl:attr</value> <!-- Chukchi -->
639
  <value name="acl:cleft">acl:cleft</value> <!-- Norwegian, Swedish -->
640
  <value name="acl:fixed">acl:fixed</value> <!-- Beja -->
641
  <value name="acl:inf">acl:inf</value> <!-- Portuguese -->
642
  <value name="acl:relat">acl:relat</value> <!-- Chukchi -->
643
  <value name="acl:relcl">relative clause modifier</value> <!-- Akkadian, Albanian, Apurina, Arabic, Armenian, Assyrian, Beja, Belarusian, Breton, Bulgarian, Chinese, Czech, Danish, Dutch, English, Erzya, Estonian, Faroese, Finnish, French, German, Greek, Hebrew, Hindi, Hindi English, Icelandic, Indonesian, Irish, Italian, Karelian, Kazakh, Komi Permyak, Komi Zyrian, Korean, Latin, Lithuanian, Livvi, Manx, Marathi, Moksha, Naija, North Sami, Norwegian, Old East Slavic, Old French, Persian, Polish, Portuguese, Russian, Sanskrit, Scottish Gaelic, Slovak, Spanish, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Telugu, Thai, Ukrainian, Urdu, Welsh, Western Armenian, Wolof -->
644
  <value name="acl">clausal modifier of noun (adnominal clause)</value>
645
  <value name="advcl:abs">advcl:abs</value> <!-- Latin -->
646
  <value name="advcl:cau">advcl:cau</value> <!-- Moksha -->
647
  <value name="advcl:cleft">advcl:cleft</value> <!-- French, Naija -->
648
  <value name="advcl:cmpr">advcl:cmpr</value> <!-- Latin, Polish -->
649
  <value name="advcl:cond">advcl:cond</value> <!-- Tamil, Telugu, Uyghur -->
650
  <value name="advcl:coverb">advcl:coverb</value> <!-- Cantonese -->
651
  <value name="advcl:eval">advcl:eval</value> <!-- Komi Zyrian -->
652
  <value name="advcl:lcl">advcl:lcl</value> <!-- Komi Permyak -->
653
  <value name="advcl:lto">advcl:lto</value> <!-- Komi Zyrian -->
654
  <value name="advcl:mcl">advcl:mcl</value> <!-- Komi Permyak -->
655
  <value name="advcl:pred">advcl:pred</value> <!-- Latin -->
656
  <value name="advcl:relcl">advcl:relcl</value> <!-- Polish, Western Armenian -->
657
  <value name="advcl:sp">advcl:sp</value> <!-- Ukrainian -->
658
  <value name="advcl:svc">advcl:svc</value> <!-- Ukrainian -->
659
  <value name="advcl:tcl">advcl:tcl</value> <!-- Apurina, Erzya, Komi Permyak, Komi Zyrian, Moksha, Romanian, Skolt Sami -->
660
  <value name="advcl">adverbial clause modifier</value>
661
  <value name="advmod:arg">advmod:arg</value> <!-- Polish -->
662
  <value name="advmod:cau">advmod:cau</value> <!-- Erzya, Komi Zyrian, Moksha -->
663
  <value name="advmod:comp">advmod:comp</value> <!-- Erzya -->
664
  <value name="advmod:deg">advmod:deg</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami -->
665
  <value name="advmod:det">advmod:det</value> <!-- Ukrainian -->
666
  <value name="advmod:df">advmod:df</value> <!-- Cantonese, Chinese -->
667
  <value name="advmod:emph">emphasizing word, intensifier</value> <!-- Akkadian, Arabic, Armenian, Catalan, Chukchi, Croatian, Czech, Indonesian, Komi Zyrian, Latin, Lithuanian, Polish, Sanskrit, Slovak, South Levantine Arabic, Tamil, Turkish, Turkish German, Upper Sorbian, Uyghur, Western Armenian -->
668
  <value name="advmod:eval">advmod:eval</value> <!-- Erzya, Komi Zyrian, Moksha, Skolt Sami -->
669
  <value name="advmod:fixed">advmod:fixed</value> <!-- Beja -->
670
  <value name="advmod:foc">advmod:foc</value> <!-- Erzya, Komi Zyrian, Moksha, Skolt Sami -->
671
  <value name="advmod:freq">advmod:freq</value> <!-- Komi Zyrian, Moksha -->
672
  <value name="advmod:lfrom">advmod:lfrom</value> <!-- Erzya, Komi Zyrian, Moksha -->
673
  <value name="advmod:lmod">locative adverbial modifier</value> <!-- Apurina, Danish, Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami -->
674
  <value name="advmod:lmp">advmod:lmp</value> <!-- Erzya, Komi Zyrian -->
675
  <value name="advmod:locy">advmod:locy</value> <!-- Hungarian -->
676
  <value name="advmod:lto">advmod:lto</value> <!-- Erzya, Komi Zyrian, Moksha -->
677
  <value name="advmod:mmod">advmod:mmod</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami -->
678
  <value name="advmod:mode">advmod:mode</value> <!-- Hungarian -->
679
  <value name="advmod:neg">advmod:neg</value> <!-- Apurina, Buryat, Kiche, Kurmanji, Latin, Maltese, Polish, Skolt Sami -->
680
  <value name="advmod:obl">adverbial modifier + oblique nominal</value> <!-- Old French -->
681
  <value name="advmod:que">advmod:que</value> <!-- Hungarian -->
682
  <value name="advmod:tfrom">advmod:tfrom</value> <!-- Hungarian -->
683
  <value name="advmod:tlocy">advmod:tlocy</value> <!-- Hungarian -->
684
  <value name="advmod:tmod">advmod:tmod</value> <!-- Apurina, Erzya, Komi Permyak, Komi Zyrian, Moksha, Romanian, Skolt Sami -->
685
  <value name="advmod:to">advmod:to</value> <!-- Hungarian -->
686
  <value name="advmod:tto">advmod:tto</value> <!-- Hungarian -->
687
  <value name="advmod">adverbial modifier</value>
688
  <value name="amod:att">amod:att</value> <!-- Hungarian -->
689
  <value name="amod:attlvc">amod:attlvc</value> <!-- Hungarian -->
690
  <value name="amod:flat">amod:flat</value> <!-- Polish -->
691
  <value name="amod">adjectival modifier</value>
692
  <value name="appos:trans">appos:trans</value> <!-- Turkish German -->
693
  <value name="appos">appositional modifier</value>
694
  <value name="aux:aff">aux:aff</value> <!-- Beja -->
695
  <value name="aux:aspect">aux:aspect</value> <!-- Komi Zyrian -->
696
  <value name="aux:caus">aux:caus</value> <!-- Armenian, French, Western Armenian -->
697
  <value name="aux:clitic">aux:clitic</value> <!-- Polish -->
698
  <value name="aux:cnd">aux:cnd</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Polish -->
699
  <value name="aux:ex">aux:ex</value> <!-- Armenian, Western Armenian -->
700
  <value name="aux:imp">aux:imp</value> <!-- Erzya, Polish -->
701
  <value name="aux:nec">aux:nec</value> <!-- Komi Zyrian, Moksha, Skolt Sami -->
702
  <value name="aux:neg">aux:neg</value> <!-- Chukchi, Erzya, Komi Permyak, Komi Zyrian, Maltese, Moksha, North Sami, Skolt Sami, Tamil -->
703
  <value name="aux:opt">aux:opt</value> <!-- Erzya, Moksha -->
704
  <value name="aux:part">aux:part</value> <!-- Maltese -->
705
  <value name="aux:pass">passive auxilary</value> <!-- Afrikaans, Ancient Greek, Arabic, Assyrian, Belarusian, Bhojpuri, Breton, Bulgarian, Buryat, Chinese, Czech, Dutch, English, Faroese, Finnish, French, Frisian Dutch, Galician, German, Hindi, Italian, Kangri, Karelian, Latin, Latvian, Lithuanian, Maltese, Marathi, Norwegian, Old Church Slavonic, Old East Slavic, Old French, Persian, Polish, Portuguese, Romanian, Russian, Slovak, Spanish, Swedish, Swiss German, Tamil, Thai, Turkish German, Upper Sorbian, Vietnamese -->
706
  <value name="aux:pot">aux:pot</value> <!-- Komi Zyrian -->
707
  <value name="aux:q">aux:q</value> <!-- Erzya, Turkish, Turkish German -->
708
  <value name="aux:tense">aux:tense</value> <!-- French, Komi Zyrian, Skolt Sami -->
709
  <value name="aux">auxiliary</value>
710
  <value name="case:acc">case:acc</value> <!-- Hebrew -->
711
  <value name="case:adv">case:adv</value> <!-- Indonesian -->
712
  <value name="case:aff">case:aff</value> <!-- Beja -->
713
  <value name="case:det">preposition with determiner</value> <!-- Maltese, Old French -->
714
  <value name="case:gen">case:gen</value> <!-- Hebrew -->
715
  <value name="case:loc">case:loc</value> <!-- Armenian, Cantonese, Chinese, Western Armenian -->
716
  <value name="case:pred">case:pred</value> <!-- Welsh -->
717
  <value name="case:voc">case:voc</value> <!-- Irish, Scottish Gaelic -->
718
  <value name="case">case marking</value>
719
  <value name="cc:nc">cc:nc</value> <!-- Old French -->
720
  <value name="cc:nc">Coordinated conjunct : non coordonant</value>
721
  <value name="cc:preconj">cc:preconj</value> <!-- Arabic, English, Erzya, Estonian, Faroese, Finnish, German, Indonesian, Komi Permyak, Komi Zyrian, Moksha, North Sami, Persian, Polish, Portuguese, Romanian, Slovenian, Spanish, Thai, Turkish -->
722
  <value name="cc:preconj">preconjunct</value>
723
  <value name="cc">Coordinating conjunction</value>
724
  <value name="cc">coordinating conjunction</value>
725
  <value name="ccomp:cleft">ccomp:cleft</value> <!-- Polish -->
726
  <value name="ccomp:obj">ccomp:obj</value> <!-- Hungarian, Polish -->
727
  <value name="ccomp:obl">ccomp:obl</value> <!-- Hungarian -->
728
  <value name="ccomp:pmod">ccomp:pmod</value> <!-- Romanian -->
729
  <value name="ccomp:pred">ccomp:pred</value> <!-- Hungarian -->
730
  <value name="ccomp">clausal complement</value>
731
  <value name="clf">classifier</value>
732
  <value name="compound:a">compound:a</value> <!-- Indonesian -->
733
  <value name="compound:affix">compound:affix</value> <!-- Hebrew -->
734
  <value name="compound:dir">compound:dir</value> <!-- Cantonese, Chinese -->
735
  <value name="compound:ext">compound:ext</value> <!-- Cantonese, Chinese -->
736
  <value name="compound:lvc">compound:lvc</value> <!-- Armenian, Hindi, Kazakh, Khunsari, Korean, Kurmanji, Marathi, Nayini, Persian, Soi, Tamil, Telugu, Turkish, Turkish German, Uyghur, Western Armenian -->
737
  <value name="compound:lvc">light verb construction</value>
738
  <value name="compound:nn">compound:nn</value> <!-- Finnish, Livvi, North Sami -->
739
  <value name="compound:preverb">compound:preverb</value> <!-- Hungarian -->
740
  <value name="compound:prt">compound:prt</value> <!-- Afrikaans, Arabic, Danish, Dutch, English, Erzya, Estonian, Faroese, Finnish, Frisian Dutch, German, Icelandic, Irish, Karelian, Komi Permyak, Naija, Norwegian, Persian, Spanish, Swedish, Swedish Sign Language, Swiss German, Tamil, Thai, Turkish German, Wolof, Yoruba -->
741
  <value name="compound:prt">phrasal verb particle</value>
742
  <value name="compound:quant">compound:quant</value> <!-- Cantonese -->
743
  <value name="compound:redup">reduplicated compounds</value> <!-- Armenian, Bambara, Classical Chinese, Erzya, Hindi, Kurmanji, Marathi, Naija, Tagalog, Tamil, Telugu, Turkish, Turkish German, Uyghur, Welsh, Western Armenian -->
744
  <value name="compound:smixut">compound:smixut</value> <!-- Hebrew -->
745
  <value name="compound:svc">serial verb compounds</value> <!-- Amharic, Armenian, Marathi, Mbya Guarani, Naija, Swedish Sign Language, Telugu, Ukrainian, Western Armenian, Wolof, Yoruba -->
746
  <value name="compound:vo">compound:vo</value> <!-- Cantonese, Chinese -->
747
  <value name="compound:vv">compound:vv</value> <!-- Cantonese, Chinese -->
748
  <value name="compound">compound</value>
749
  <value name="conj:expl">conj:expl</value> <!-- Latin -->
750
  <value name="conj:extend">conj:extend</value> <!-- Slovenian -->
751
  <value name="conj:svc">conj:svc</value> <!-- Ukrainian -->
752
  <value name="conj">conjunct</value>
753
  <value name="cop:expl">cop:expl</value> <!-- Maltese -->
754
  <value name="cop:locat">cop:locat</value> <!-- Polish -->
755
  <value name="cop:own">cop:own</value> <!-- Finnish, Karelian, Livvi, Marathi -->
756
  <value name="cop">copula</value>
757
  <value name="csubj:cleft">csubj:cleft</value> <!-- Irish, Latin, Manx, Scottish Gaelic -->
758
  <value name="csubj:cop">csubj:cop</value> <!-- Erzya, Estonian, Finnish, Irish, Komi Zyrian, Livvi, Manx, Moksha, Scottish Gaelic, Turkish -->
759
  <value name="csubj:pass">clausal passive subject</value> <!-- Albanian, Amharic, Ancient Greek, Arabic, Armenian, Belarusian, Bulgarian, Catalan, Chinese, Classical Chinese, Czech, English, French, German, Gothic, Greek, Indonesian, Italian, Korean, Latin, Latvian, Lithuanian, Norwegian, Old Church Slavonic, Old East Slavic, Polish, Portuguese, Romanian, Russian, Sanskrit, Slovak, Spanish, Swedish, Western Armenian -->
760
  <value name="csubj">clausal subject</value>
761
  <value name="dep:aff">dep:aff</value> <!-- Beja -->
762
  <value name="dep:agr">dep:agr</value> <!-- Kiche -->
763
  <value name="dep:alt">dep:alt</value> <!-- Upper Sorbian -->
764
  <value name="dep:ana">dep:ana</value> <!-- Yupik -->
765
  <value name="dep:aux">dep:aux</value> <!-- Yupik -->
766
  <value name="dep:comp">dep:comp</value> <!-- Beja, French -->
767
  <value name="dep:conj">dep:conj</value> <!-- Beja -->
768
  <value name="dep:cop">dep:cop</value> <!-- Yupik -->
769
  <value name="dep:emo">dep:emo</value> <!-- Yupik -->
770
  <value name="dep:infl">dep:infl</value> <!-- Yupik -->
771
  <value name="dep:mark">dep:mark</value> <!-- Yupik -->
772
  <value name="dep:mod">dep:mod</value> <!-- Mbya Guarani -->
773
  <value name="dep:pos">dep:pos</value> <!-- Yupik -->
774
  <value name="dep:redup">dep:redup</value> <!-- Beja -->
775
  <value name="dep:ss">dep:ss</value> <!-- Kiche -->
776
  <value name="dep">unspecified dependency</value>
777
  <value name="det:adj">det:adj</value> <!-- Albanian -->
778
  <value name="det:noun">det:noun</value> <!-- Albanian -->
779
  <value name="det:numgov">pronominal quantifier governing the case of the noun</value> <!-- Czech, Polish, Serbian, Slovak, Ukrainian, Upper Sorbian -->
780
  <value name="det:nummod">pronominal quantifier agreeing in case with the noun</value> <!-- Czech, Polish, Ukrainian -->
781
  <value name="det:poss">possessive determiner</value> <!-- Akkadian, Armenian, German, Italian, Korean, Polish, Portuguese, Western Armenian -->
782
  <value name="det:predet">det:predet</value> <!-- English, Italian, Persian -->
783
  <value name="det:pron">det:pron</value> <!-- Albanian -->
784
  <value name="det:rel">det:rel</value> <!-- Bambara -->
785
  <value name="det">determiner</value>
786
  <value name="discourse:emo">discourse:emo</value> <!-- Irish, Italian, Polish -->
787
  <value name="discourse:filler">discourse:filler</value> <!-- Norwegian, Slovenian -->
788
  <value name="discourse:intj">discourse:intj</value> <!-- Polish -->
789
  <value name="discourse:sp">discourse:sp</value> <!-- Cantonese, Chinese, Classical Chinese -->
790
  <value name="discourse">discourse element</value>
791
  <value name="dislocated:cleft">dislocated:cleft</value> <!-- Mbya Guarani -->
792
  <value name="dislocated:csubj">dislocated:csubj</value> <!-- Latin -->
793
  <value name="dislocated:nsubj">dislocated:nsubj</value> <!-- Latin -->
794
  <value name="dislocated:obj">dislocated:obj</value> <!-- Latin -->
795
  <value name="dislocated:subj">dislocated:subj</value> <!-- Beja -->
796
  <value name="dislocated">dislocated elements</value>
797
  <value name="expl:comp">expl:comp</value> <!-- French -->
798
  <value name="expl:impers">impersonal expletive</value> <!-- Italian, Polish, Romanian, Spanish -->
799
  <value name="expl:pass">reflexive pronoun used in reflexive passive</value> <!-- Catalan, Czech, French, Italian, Latin, Portuguese, Romanian, Slovak, Spanish, Upper Sorbian -->
800
  <value name="expl:poss">expl:poss</value> <!-- Romanian -->
801
  <value name="expl:pv">reflexive clitic with an inherently reflexive verb</value> <!-- Czech, Dutch, German, Old East Slavic, Polish, Portuguese, Romanian, Slovak, Spanish, Turkish German, Upper Sorbian -->
802
  <value name="expl:subj">expl:subj</value> <!-- French, Naija -->
803
  <value name="expl">expletive</value>
804
  <value name="fixed">fixed multiword expression</value>
805
  <value name="flat:abs">flat:abs</value> <!-- Ukrainian -->
806
  <value name="flat:dist">flat:dist</value> <!-- Western Armenian -->
807
  <value name="flat:foreign">foreign words</value> <!-- Arabic, Belarusian, Buryat, Chinese, Chukchi, Croatian, Czech, English, Estonian, Faroese, Finnish, French, Galician, Icelandic, Indonesian, Irish, Italian, Komi Zyrian, Latin, Latvian, Lithuanian, Manx, Naija, Norwegian, Persian, Polish, Portuguese, Russian, Scottish Gaelic, Slovak, Slovenian, South Levantine Arabic, Ukrainian, Upper Sorbian -->
808
  <value name="flat:name">names</value> <!-- Ancient Greek, Belarusian, Breton, Chinese, Chukchi, Erzya, Faroese, Finnish, French, Frisian Dutch, Galician, German, Gothic, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Irish, Italian, Karelian, Kazakh, Komi Permyak, Komi Zyrian, Korean, Latin, Latvian, Livvi, Maltese, Moksha, Norwegian, Old Church Slavonic, Old East Slavic, Persian, Portuguese, Russian, Scottish Gaelic, Skolt Sami, Slovenian, Spanish, Swedish, Thai, Ukrainian, Welsh, Western Armenian -->
809
  <value name="flat:num">flat:num</value> <!-- Komi Zyrian, Persian -->
810
  <value name="flat:range">flat:range</value> <!-- Ukrainian, Western Armenian -->
811
  <value name="flat:repeat">flat:repeat</value> <!-- Ukrainian -->
812
  <value name="flat:sibl">flat:sibl</value> <!-- Ukrainian -->
813
  <value name="flat:title">flat:title</value> <!-- Ukrainian -->
814
  <value name="flat:vv">flat:vv</value> <!-- Classical Chinese -->
815
  <value name="flat">name multiword expression</value>
816
  <value name="goeswith">goes with</value>
817
  <value name="iobj:agent">iobj:agent</value> <!-- Armenian, French, Western Armenian -->
818
  <value name="iobj:appl">iobj:appl</value> <!-- Wolof -->
819
  <value name="iobj:patient">iobj:patient</value> <!-- Tagalog -->
820
  <value name="iobj">indirect object</value>
821
  <value name="list">list</value>
822
  <value name="mark:adv">mark:adv</value> <!-- Cantonese, Chinese -->
823
  <value name="mark:advmod">adverbial modifier confusable with a subordination marker</value> <!-- Old French -->
824
  <value name="mark:aff">mark:aff</value> <!-- Beja -->
825
  <value name="mark:obj">marker + object</value> <!--Old French, no doc -->
826
  <value name="mark:obl">marker + oblique nominal</value> <!--Old French, no doc -->
827
  <value name="mark:prt">mark:prt</value> <!-- Chinese, Irish, Scottish Gaelic -->
828
  <value name="mark:q">mark:q</value> <!-- Hebrew -->
829
  <value name="mark:rel">mark:rel</value> <!-- Cantonese, Chinese -->
830
  <value name="mark">marker</value>
831
  <value name="nmod:agent">nmod:agent</value> <!-- Welsh -->
832
  <value name="nmod:appos">nmod:appos</value> <!-- French, Komi Zyrian, Moksha -->
833
  <value name="nmod:arg">nmod:arg</value> <!-- Polish, Yupik -->
834
  <value name="nmod:att">nmod:att</value> <!-- Hungarian -->
835
  <value name="nmod:attlvc">nmod:attlvc</value> <!-- Hungarian -->
836
  <value name="nmod:attr">nmod:attr</value> <!-- Chukchi -->
837
  <value name="nmod:bahuv">nmod:bahuv</value> <!-- Moksha -->
838
  <value name="nmod:cau">nmod:cau</value> <!-- Uyghur -->
839
  <value name="nmod:comp">nmod:comp</value> <!-- Erzya, Komi Zyrian, Moksha, Turkish, Uyghur -->
840
  <value name="nmod:flat">nmod:flat</value> <!-- Polish -->
841
  <value name="nmod:gen">nmod:gen</value> <!-- Breton -->
842
  <value name="nmod:gobj">nmod:gobj</value> <!-- Erzya, Finnish -->
843
  <value name="nmod:gsubj">nmod:gsubj</value> <!-- Erzya, Finnish, Karelian -->
844
  <value name="nmod:lfrom">nmod:lfrom</value> <!-- Komi Zyrian -->
845
  <value name="nmod:lmod">nmod:lmod</value> <!-- Erzya, Indonesian, Komi Permyak, Komi Zyrian, Moksha -->
846
  <value name="nmod:npmod">nmod:npmod</value> <!-- Armenian, English, Western Armenian -->
847
  <value name="nmod:obj">nmod:obj</value> <!-- Komi Zyrian -->
848
  <value name="nmod:obl">nmod:obl</value> <!-- Hungarian -->
849
  <value name="nmod:part">nmod:part</value> <!-- Turkish, Uyghur -->
850
  <value name="nmod:poss">possessive nominal modifier</value> <!-- Akkadian, Albanian, Apurina, Arabic, Armenian, Assyrian, Bambara, Beja, Breton, Chukchi, Danish, Dutch, English, Erzya, Faroese, Finnish, Frisian Dutch, German, Hebrew, Hindi, Icelandic, Indonesian, Irish, Karelian, Kazakh, Khunsari, Komi Permyak, Komi Zyrian, Korean, Kurmanji, Latin, Livvi, Maltese, Manx, Marathi, Moksha, Naija, Nayini, North Sami, Persian, Polish, Sanskrit, Scottish Gaelic, Skolt Sami, Soi, South Levantine Arabic, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Telugu, Thai, Turkish, Uyghur, Warlpiri, Welsh, Western Armenian, Wolof -->
851
  <value name="nmod:pred">nmod:pred</value> <!-- Polish -->
852
  <value name="nmod:prp">nmod:prp</value> <!-- Komi Zyrian -->
853
  <value name="nmod:redup">nmod:redup</value> <!-- Welsh -->
854
  <value name="nmod:relat">nmod:relat</value> <!-- Chukchi -->
855
  <value name="nmod:subj">nmod:subj</value> <!-- Komi Zyrian -->
856
  <value name="nmod:tmod">temporal modifier</value> <!-- Chinese, English, Indonesian, Moksha, Romanian, Telugu, Uyghur -->
857
  <value name="nmod">nominal modifier</value>
858
  <value name="nsubj:advmod">fused subject pronoun and adverb</value> <!-- Old French -->
859
  <value name="nsubj:aff">nsubj:aff</value> <!-- Beja -->
860
  <value name="nsubj:bfoc">nsubj:bfoc</value> <!-- Tagalog -->
861
  <value name="nsubj:caus">nsubj:caus</value> <!-- Armenian, French, Western Armenian -->
862
  <value name="nsubj:cleft">nsubj:cleft</value> <!-- Latin -->
863
  <value name="nsubj:cop">nsubj:cop</value> <!-- Apurina, Breton, Erzya, Estonian, Finnish, Hebrew, Karelian, Komi Permyak, Komi Zyrian, Livvi, Moksha, Sanskrit, Skolt Sami, Turkish -->
864
  <value name="nsubj:ifoc">nsubj:ifoc</value> <!-- Tagalog -->
865
  <value name="nsubj:lfoc">nsubj:lfoc</value> <!-- Tagalog -->
866
  <value name="nsubj:lvc">nsubj:lvc</value> <!-- Hungarian -->
867
  <value name="nsubj:nc">nsubj:nc</value> <!-- Persian, Tamil, Telugu -->
868
  <value name="nsubj:obj">fused subject and object pronoun</value> <!-- Old French -->
869
  <value name="nsubj:pass">passive nominal subject</value> <!-- Afrikaans, Amharic, Ancient Greek, Arabic, Armenian, Assyrian, Belarusian, Bulgarian, Buryat, Cantonese, Catalan, Chinese, Classical Chinese, Czech, Dutch, English, Faroese, French, Frisian Dutch, Galician, German, Gothic, Greek, Hindi, Indonesian, Italian, Korean, Latin, Latvian, Lithuanian, Maltese, Marathi, Moksha, Norwegian, Old Church Slavonic, Old East Slavic, Persian, Polish, Portuguese, Romanian, Russian, Sanskrit, Skolt Sami, Slovak, Spanish, Swedish, Swiss German, Tagalog, Tamil, Thai, Turkish German, Upper Sorbian, Western Armenian -->
870
  <value name="nsubj:periph">nsubj:periph</value> <!-- Cantonese -->
871
  <value name="nsubj">Nominal subject</value>
872
  <value name="nummod:det">nummod:det</value> <!-- Beja -->
873
  <value name="nummod:entity">numeric modifier governed by a noun</value> <!-- Russian -->
874
  <value name="nummod:flat">nummod:flat</value> <!-- Polish -->
875
  <value name="nummod:gov">numeric modifier governing the case of the noun</value> <!-- Belarusian, Czech, Lithuanian, Old East Slavic, Polish, Russian, Sanskrit, Serbian, Ukrainian, Upper Sorbian -->
876
  <value name="nummod">numeric modifier</value>
877
  <value name="obj:advmod">fused adverb and object pronoun</value> <!-- Old French -->
878
  <value name="obj:advneg">fused negation and object pronoun</value> <!-- no doc for advneg -->
879
  <value name="obj:agent">obj:agent</value> <!-- Apurina, French, Tagalog -->
880
  <value name="obj:appl">obj:appl</value> <!-- Wolof -->
881
  <value name="obj:caus">obj:caus</value> <!-- Wolof -->
882
  <value name="obj:lvc">obj:lvc</value> <!-- French, Hungarian, Naija -->
883
  <value name="obj:obl">fused oblique and object pronoun</value> <!-- Old French -->
884
  <value name="obj:periph">obj:periph</value> <!-- Cantonese, Chinese -->
885
  <value name="obj">object</value>
886
  <value name="obl:advmod">adverbial modifier confusable with an oblique dependent</value> <!-- Old French -->
887
  <value name="obl:agent">agent modifier</value> <!-- Ancient Greek, Armenian, Belarusian, Breton, Cantonese, Chinese, Czech, Dutch, Erzya, French, German, Gothic, Greek, Hindi, Indonesian, Italian, Komi Zyrian, Latin, Lithuanian, Maltese, Moksha, Naija, Old Church Slavonic, Old East Slavic, Polish, Portuguese, Romanian, Russian, Sanskrit, Skolt Sami, Swedish, Tamil, Turkish, Welsh, Western Armenian -->
888
  <value name="obl:appl">obl:appl</value> <!-- Wolof -->
889
  <value name="obl:arg">oblique argument</value> <!-- Arabic, Beja, Czech, French, German, Greek, Icelandic, Latin, Lithuanian, Maltese, Naija, Persian, Polish, Sanskrit, Slovak, South Levantine Arabic, Tamil -->
890
  <value name="obl:cau">obl:cau</value> <!-- Erzya, Komi Zyrian, Moksha, Telugu -->
891
  <value name="obl:cmp">obl:cmp</value> <!-- Telugu -->
892
  <value name="obl:cmpr">obl:cmpr</value> <!-- Latin, Polish, Tamil -->
893
  <value name="obl:comp">obl:comp</value> <!-- Moksha -->
894
  <value name="obl:dat">obl:dat</value> <!-- Kurmanji -->
895
  <value name="obl:freq">obl:freq</value> <!-- Moksha -->
896
  <value name="obl:inst">obl:inst</value> <!-- Erzya, Moksha, Tamil -->
897
  <value name="obl:lfrom">obl:lfrom</value> <!-- Erzya, Komi Zyrian, Moksha -->
898
  <value name="obl:lmod">locative modifier</value> <!-- Apurina, Classical Chinese, Danish, Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami, Tamil -->
899
  <value name="obl:lmp">obl:lmp</value> <!-- Erzya, Komi Zyrian, Moksha -->
900
  <value name="obl:lto">obl:lto</value> <!-- Erzya, Komi Zyrian, Moksha -->
901
  <value name="obl:lvc">obl:lvc</value> <!-- Hungarian -->
902
  <value name="obl:mcl">obl:mcl</value> <!-- Komi Zyrian -->
903
  <value name="obl:mod"> oblique modifier</value> <!-- Beja, French, Naija, Yupik -->
904
  <value name="obl:npmod">obl:npmod</value> <!-- Coptic, English -->
905
  <value name="obl:orphan">obl:orphan</value> <!-- Polish -->
906
  <value name="obl:own">obl:own</value> <!-- Kazakh -->
907
  <value name="obl:patient">obl:patient</value> <!-- Cantonese, Chinese -->
908
  <value name="obl:pmod">obl:pmod</value> <!-- Romanian, Tamil -->
909
  <value name="obl:poss">obl:poss</value> <!-- Thai -->
910
  <value name="obl:prep">obl:prep</value> <!-- Irish -->
911
  <value name="obl:sentcon">obl:sentcon</value> <!-- Mbya Guarani -->
912
  <value name="obl:smod">obl:smod</value> <!-- Scottish Gaelic -->
913
  <value name="obl:tmod">obl:tmod</value> <!-- Apurina, Arabic, Cantonese, Chinese, Classical Chinese, Danish, English, Erzya, Frisian Dutch, German, Hindi, Indonesian, Irish, Italian, Komi Permyak, Komi Zyrian, Korean, Manx, Moksha, Portuguese, Scottish Gaelic, Skolt Sami, Spanish, Tamil, Telugu, Thai, Turkish, Uyghur, Warlpiri -->
914
  <value name="obl:tmod">temporal modifier</value>
915
  <value name="obl">oblique nominal</value>
916
  <value name="orphan:missing">textual gap in the source</value> <!-- Latin -->
917
  <value name="orphan">remnant in ellipsis</value>
918
  <value name="parataxis:appos">parataxis:appos</value> <!-- Italian -->
919
  <value name="parataxis:conj">parataxis:conj</value> <!-- Naija -->
920
  <value name="parataxis:coord">parataxis:coord</value> <!-- Beja -->
921
  <value name="parataxis:deletion">parataxis:deletion</value> <!-- Norwegian -->
922
  <value name="parataxis:discourse">parataxis:discourse</value> <!-- Italian, Naija, Slovenian, Turkish German, Ukrainian -->
923
  <value name="parataxis:dislocated">parataxis:dislocated</value> <!-- Naija -->
924
  <value name="parataxis:hashtag">parataxis:hashtag</value> <!-- Irish, Italian -->
925
  <value name="parataxis:insert">parataxis:insert</value> <!-- French, Italian, Polish -->
926
  <value name="parataxis:mod">parataxis:mod</value> <!-- Beja -->
927
  <value name="parataxis:newsent">parataxis:newsent</value> <!-- Ukrainian -->
928
  <value name="parataxis:nsubj">parataxis:nsubj</value> <!-- Italian -->
929
  <value name="parataxis:obj">parataxis:obj</value> <!-- Bambara, Italian, Polish -->
930
  <value name="parataxis:parenth">parataxis:parenth</value> <!-- French, Naija -->
931
  <value name="parataxis:rel">parataxis:rel</value> <!-- Ukrainian -->
932
  <value name="parataxis:rep">parataxis:rep</value> <!-- Chukchi, Latin, Mbya Guarani -->
933
  <value name="parataxis:restart">parataxis:restart</value> <!-- Slovenian -->
934
  <value name="parataxis:rt">parataxis:rt</value> <!-- Irish -->
935
  <value name="parataxis:sentence">parataxis:sentence</value> <!-- Irish -->
936
  <value name="parataxis:trans">parataxis:trans</value> <!-- Turkish German -->
937
  <value name="parataxis:url">parataxis:url</value> <!-- Irish -->
938
  <value name="parataxis">parataxis</value>
939
  <value name="punct">punctuation</value>
940
  <value name="remnant">Remnant ?</value> <!-- no doc, replace with orphan? -->
941
  <value name="reparandum">overridden disfluency</value>
942
  <value name="root">root</value>
943
  <value name="vocative:cl">vocative:cl</value> <!-- Ukrainian -->
944
  <value name="vocative:mention">vocative:mention</value> <!-- Irish, Italian -->
945
  <value name="vocative">vocative</value>
946
  <value name="xcomp:cleft">xcomp:cleft</value> <!-- Polish -->
947
  <value name="xcomp:ds">xcomp:ds</value> <!-- Erzya, Finnish, Karelian, Komi Permyak, Livvi -->
948
  <value name="xcomp:obj">xcomp:obj</value> <!-- North Sami, Polish -->
949
  <value name="xcomp:pred">xcomp:pred</value> <!-- Irish, Latin, Manx, North Sami, Polish, Scottish Gaelic -->
950
  <value name="xcomp:sp">xcomp:sp</value> <!-- Ukrainian -->
951
  <value name="xcomp:subj">xcomp:subj</value> <!-- Polish -->
952
  <value name="xcomp">open clausal complement</value>
953
</feature>
954
<feature name="coord" domain="NT" ></feature>
955
<feature name="dom" domain="NT" ></feature>
956
<feature name="type" domain="NT" >
957
  <value name="nV">élément non-verbal</value>
958
  <value name="VFin">finite verb</value>
959
  <value name="VInf">infinitive</value>
960
  <value name="VPar">participle</value>
961
  <value name="--">nil</value>
962
</feature>
963
<feature name="vform" domain="NT"></feature>
964
<feature name="vlemma" domain="NT"></feature>
965
<feature name="note" domain="NT"></feature>
966
<feature name="snr" domain="NT"></feature>
967
';
968

  
969
    printf MASTER "$nt_features_header";
970

  
971
    printf MASTER '
972
<edgelabel>
973
  <value name="D">dependency</value>
974
  <value name="L">lexical</value>
975
  <value name="R">relator</value>
976
  <value name="*">not bound</value>
977
</edgelabel>
978
<secedgelabel>
979
  <value name="cluster">between elements of GpCoo</value>
980
  <value name="coord">between members of Coo</value>
981
  <value name="dupl">between duplicated nodes</value>
982
</secedgelabel>
983
</annotation>
984
</head>
985
<body>
986
';
987
}
988

  
989
#  <value name="M">main</value>
990
#  <value name="P">part</value>
991

  
992
# Writes the closing </body> and </corpus> tags to the MASTER filehandle.
sub write_master_footer {
    print MASTER <<'END_MASTER_FOOTER';
</body>
</corpus>
END_MASTER_FOOTER
}
997

  
998
# Additional NT feature declarations held in $TEMP as one newline-delimited
# string (leading and trailing newline included).
# NOTE(review): no use of $TEMP is visible in this part of the file.
$TEMP = join "\n",
  '',
  '<feature name="nodom" domain="NT" ></feature>',
  '<feature name="headpos" domain="NT" ></feature>',
  '<feature name="annotationFile" domain="NT" ></feature>',
  '<feature name="annotationUri" domain="NT" ></feature>',
  '';
1004

  
1005
# Fills the global %abbrev2cat hash, mapping dependency relation labels
# to their long English names. Existing entries under other keys are kept.
sub define_cat_hashes {

    # Former SRCMF mapping, kept disabled for reference:
    #  $abbrev2cat{"Apst"} = "Apostrophe";
    #  $abbrev2cat{"AtObj"} = "AttributObjet";
    #  $abbrev2cat{"AtRfc"} = "AttributReflechi";
    #  $abbrev2cat{"AtSj"} = "AttributSujet";
    #  $abbrev2cat{"AuxA"} = "Auxilie-Actif";
    #  $abbrev2cat{"AuxP"} = "Auxilie-Passif";
    #  $abbrev2cat{"Circ"} = "Circonstant";
    #  $abbrev2cat{"Cmpl"} = "Complement";
    #  $abbrev2cat{"GpCoo"} = "Coordonne";
    #  $abbrev2cat{"Coo"} = "Coordination";
    #  $abbrev2cat{"Det"} = "Determinant";
    #  $abbrev2cat{"NgPrt"} = "Forclusif";
    #  $abbrev2cat{"Insrt"} = "Incidente";
    #  $abbrev2cat{"Intj"} = "Interjection";
    #  $abbrev2cat{"ModA"} = "ModifieurAttache";
    #  $abbrev2cat{"ModD"} = "ModifieurDetache";
    #  $abbrev2cat{"Ng"} = "Negation";
    #  $abbrev2cat{"VInf"} = "NoeudVerbal-Infinitif";
    #  $abbrev2cat{"VPrt"} = "NoeudVerbal-Participe"; #?
    #  $abbrev2cat{"VFin"} = "NoeudVerbal-Personnel";
    #  $abbrev2cat{"nSnt"} = "NonPhrase";
    #  $abbrev2cat{"Obj"} = "Objet";
    #  $abbrev2cat{"Snt"} = "Phrase";
    #  $abbrev2cat{"Pon"} = "Ponctuation";
    #  $abbrev2cat{"Rfc"} = "Reflechi";
    #  $abbrev2cat{"Rfx"} = "ReflexifRenforce";
    #  $abbrev2cat{"RelC"} = "Relateur-Coordonnant";
    #  $abbrev2cat{"RelNC"} = "Relateur-NonCoordonnant";
    #  $abbrev2cat{"nMax"} = "StructureNonMaximale";
    #  $abbrev2cat{"SjImp"} = "SujetImpersonnel";
    #  $abbrev2cat{"SjPer"} = "SujetPersonnel";
    #  $abbrev2cat{"Lac"} = "Lacune";
    #  $abbrev2cat{"Aux"} = "Auxilie";
    #  $abbrev2cat{"Regim"} = "Regime";

    # Label => long name, listed as flat pairs and stored in this order.
    my @label_pairs = (
        "acl",        "Clausal modifier of noun",
        "advcl",      "Adverbial clause modifier",
        "advmod",     "Adverbial modifier",
        "amod",       "Adjectival modifier",
        "appos",      "Appositional modifier",
        "aux",        "Auxiliary",
        "cc-nc",      "Coordinated conjunct : non coordonant",
        "cc",         "Coordinating conjunction",
        "ccomp",      "Clausal complement",
        "conj",       "Conjunct",
        "cop",        "Copula",
        "csubj",      "Clausal subject",
        "det",        "Determiner",
        "dislocated", "Dislocated elements",
        "expl",       "Expletive",
        "iobj",       "Indirect object",
        "mark",       "Marker",
        "nmod",       "Nominal modifier",
        "nsubj",      "Nominal subject",
        "nummod",     "Numeric modifier",
        "obj",        "Object",

        # NOTE(review): the leading space below is in the original value;
        # it looks unintentional -- confirm before removing.
        "obl",        " Oblique nominal",
        "orphan",     "Remnant in ellipsis",
        "remnant",    "Remnant ?",
        "vocative",   "Vocative",
        "xcomp",      "Open clausal complement",
    );

    while ( my ( $abbrev, $long_name ) = splice( @label_pairs, 0, 2 ) ) {
        $abbrev2cat{$abbrev} = $long_name;
    }
}
1069

  
1070
# Prints every element of the global @words array, one per line, to the
# currently selected output handle.
sub print_sentence {
    foreach my $entry (@words) {
        print "$entry\n";
    }
}
1075

  
1076
# Writes one <nt> element for the current token to the XML filehandle.
#
# Arguments:
#   $_[0]  category label, written to the cat="" attribute
#   $_[1]  mode string; when it matches /dupl/ the node is written as a
#          duplicate (id suffix "_dupl", a secedge back to the original
#          terminal, and no daughter edges)
#
# Reads the globals $. (current input line number, used as sentence id),
# $w, $dom, $type, $vform, $vlemma, $nt_features, $nt_empty_features,
# $thisrootname, @daughters and %duplicates. Sets the globals $dupl and
# $daughter, as the original code did.
sub write_nonterminals {
    my ( $cat_label, $mode ) = @_;

    # Always defined, so printf never receives undef when the -x feature
    # strings are empty.
    my $print_nt_features = '';

    my $is_duplicate = ( $mode =~ /dupl/ ) ? 1 : 0;
    $dupl = $is_duplicate ? '_dupl' : '';

    if ( $nt_features ne '' ) {

        # Only the root node carries the real feature values; every other
        # node gets the empty placeholders.
        $print_nt_features = ( $cat_label =~ /$thisrootname/ ) ? $nt_features : $nt_empty_features;
    }

    printf XML "      <nt id=\"n%d_%d%s\" cat=\"%s\" coord=\"--\" dom=\"%s\" type=\"%s\" vform=\"%s\" vlemma=\"%s\"%s note=\"%s\" snr=\"%d\">\n", $., $w, $dupl, $cat_label, $dom, $type, $vform, $vlemma,
      $print_nt_features, notes("$._$w"), $.;
    printf XML "        <edge idref=\"s%d_%d%s\" label=\"L\"/>\n", $., $w, $dupl;

    if ($is_duplicate) {

        # link duplicate with primary original node
        printf XML "        <secedge idref=\"s%d_%d\" label=\"dupl\"/>\n", $., $w;
    }
    else {

        # node is not a duplicate: attach all the daughters
        for ( my $d = 0 ; $d <= $#daughters ; $d++ ) {
            $daughter = $daughters[$d];
            if ( "$._$w" ne "$._$daughter" ) {    # avoid cycles
                printf XML "        <edge idref=\"n%d_%d%s\" label=\"%s\"/>\n", $., $daughter, $dupl, edge_label("$._$daughter");
            }

            # check if a duplicate of this node must be attached
            if ( $duplicates{"$._$daughter"} == 1 ) {

                # Braces are required: "$daughter_dupl" would be parsed as
                # a different (undefined) variable and drop the daughter
                # number from the lookup key.
                printf XML "        <edge idref=\"n%d_%d_dupl\" label=\"%s\"/>\n", $., $daughter, edge_label("$._${daughter}_dupl");
            }
        }
    }
    print XML "      </nt>\n";
}
1115

  
1116
# Returns the edge label for a node: 'R' when the node key ($_[0]) is
# flagged with 1 in the global %relators hash, otherwise 'D'.
sub edge_label {
    my ($node_key) = @_;
    return $relators{$node_key} == 1 ? 'R' : 'D';
}
1123

  
1124
# Retrieves the note stored for a node key ($_[0]) in the global %notes
# hash; returns '--' when there is no note or it is the empty string.
sub notes {
    my ($node_key) = @_;
    my $text = $notes{$node_key};
    return '--' if $text eq '';
    return "$text";
}
1131

  
1132
# Normalises the global token fields in place ($word, $pos, $mor, $ppos,
# $pmor, $lemma, $plemma, $cat): some conversions are necessary, some are
# for convenience. Returns nothing.
sub clean_data {

    # word form: quotes, ampersands, guillemets, stray angle brackets
    for ($word) {
        s/"/'/g;
        s/\&/(and)/g;
        s/<</«/g;
        s/>>/»/g;
        s/[<>]//g;
    }

    # part-of-speech tags: colon becomes underscore
    for ( $pos, $ppos ) {
        s/:/_/g;
    }

    # morphology: pipe becomes underscore
    for ( $mor, $pmor ) {
        s/\|/_/g;
    }

    # lemmas: pipes, angle brackets, quotes, ampersands
    for ( $lemma, $plemma ) {
        s/\|/_/g;
        s/[<>]//g;
        s/"/'/g;
        s/\&/(and)/g;
    }

    # clean categories
    for ($cat) {
        s/ROOT/$thisrootname/;    # top node, for compatibility with SRCMF

        # Disabled parse corrections, kept for reference:
        #    $cat =~ s/Ponctuation/Pon/;
        #    $cat =~ s/Sujet/SjPer/;
        #    $cat =~ s/Modifieur/ModA/;
        #    $cat =~ s/Parenthese/Insrt/;
        s/\-/_/g;
        s/RelNC_(.*)/$1_RelNC/;    # RelNC always 2nd node, for consistency in duplicates
    }
    return;
}
1166

  
TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/conll2tiger-ud.pl (revision 3347)
2 2

  
3 3
use File::Basename;
4 4

  
5
my $CMD="conll2tiger.pl";
6
my $VERSION="1.5";
7
my $MODIFIED="8/12/2015";   # angepasst für Perseus CoNLL erzeugt mit conll.pl -l.  CHECK: funktioniert SRCMF noch?
5
my $CMD      = "conll2tiger.pl";
6
my $VERSION  = "1.5";
7
my $MODIFIED = "8/12/2015";        # angepasst für Perseus CoNLL erzeugt mit conll.pl -l.  CHECK: funktioniert SRCMF noch?
8 8

  
9 9
# columns (default are the predicted values in CoNLL 2009 format)
10
my $coll = 2;  # lemma
11
my $colm = 3;  # morph (pos)
12
my $colf = 5;  # features
13
my $colh = 6;  # head
14
my $cold = 7; # deprel
15
my $outdir = "conllexport"; # deprel
16
my $split = 1000; # split output after nr sentences
10
my $coll   = 2;                    # lemma
11
my $colm   = 3;                    # morph (pos)
12
my $colf   = 5;                    # features
13
my $colh   = 6;                    # head
14
my $cold   = 7;                    # deprel
15
my $outdir = "conllexport";        # deprel
16
my $split  = 1000;                 # split output after nr sentences
17 17

  
18 18
# tree structure
19
my %dominates  = ();
20
my %deprel  = ();  # deprel{nr} = deprel
21
my @daughters = ();    # daughter nodes, stored in %dominates
22
my %duplicates = my %relators = my %notes = my %aux = ();    # store nodes of duplicates, relators
23
my $type = "--"; # node attribute
24
my $vform = my $vlemma = "--"; # node attributes for verbs store form and lemma
25
my $label = "D"; # default edge label
26
my $nt_features_header = ''; # option -x 
27
my $nt_features = ''; # option -x 
28
my $nt_empty_features = ''; # option -x 
29
my @scodes = (); # option -x
30
my $add_to_sentcode = '';
31
my $rootname = 'root';  # default
32
my $featcol = 13;
19
my %dominates          = ();
20
my %deprel             = ();                                         # deprel{nr} = deprel
21
my @daughters          = ();                                         # daughter nodes, stored in %dominates
22
my %duplicates         = my %relators = my %notes = my %aux = ();    # store nodes of duplicates, relators
23
my $type               = "--";                                       # node attribute
24
my $vform              = my $vlemma = "--";                          # node attributes for verbs store form and lemma
25
my $label              = "D";                                        # default edge label
26
my $nt_features_header = '';                                         # option -x
27
my $nt_features        = '';                                         # option -x
28
my $nt_empty_features  = '';                                         # option -x
29
my @scodes             = ();                                         # option -x
30
my $add_to_sentcode    = '';
31
my $rootname           = 'root';                                     # default
32
my $featcol            = 13;
33 33

  
34 34
######################################################################
35 35
#  conll2tiger.pl: converts CoNLL-U from the Universal Dependencies
......
43 43
#   - dafür wurde als Wort-ID statt $w (for-Zähler) $wnr verwendet
44 44
#   - es gibt aber noch unbound nodes wenn Regens entfernt (im anderen Teil) ist
45 45
######################################################################
46
# Modifications by Alexei Lavrentiev <alexei.lavrentev@ens-lyon.fr> 
46
# Modifications by Alexei Lavrentiev <alexei.lavrentev@ens-lyon.fr>
47 47
# for Profiterole project (2019-2021)
48 48

  
49 49
# 2019-09-25
......
67 67
# - added "punct" to cat values
68 68

  
69 69
# Update 2021-07-20
70
# - added cat value list compiled from 
71
#   https://universaldependencies.org/ext-dep-index.html and the previous 
72
#   version. All relation types and subtypes from the UD 2.8 corpora 
70
# - added cat value list compiled from
71
#   https://universaldependencies.org/ext-dep-index.html and the previous
72
#   version. All relation types and subtypes from the UD 2.8 corpora
73 73
#   should be there.
74 74
# - contractions indexed
75 75
######################################################################
76 76

  
77
my $HELP="
77
my $HELP = "
78 78
==================================================================
79 79
$CMD $VERSION: Help
80 80
==================================================================
......
109 109
#                    DO NOT MODIFY FOLLOWING CODE !
110 110
###########################################################################
111 111

  
112

  
113 112
###########################################################################
114 113
# parse the command line
115 114
###########################################################################
......
117 116
use Getopt::Std;
118 117
getopts('c:C:hD:H:M:o:R:s:x:X:');
119 118

  
120
if (defined($opt_h)) {
121
  print STDERR "$HELP";
122
  exit(0);
119
if ( defined($opt_h) ) {
120
	print STDERR "$HELP";
121
	exit(0);
123 122
}
124 123

  
125
if (defined($opt_o)) {
126
  $outdir = $opt_o
124
if ( defined($opt_o) ) {
125
	$outdir = $opt_o;
127 126
}
128
if (defined($opt_C)) {
129
  $corpus = $opt_C;
127
if ( defined($opt_C) ) {
128
	$corpus = $opt_C;
130 129
}
131
if (defined($opt_D)) {
132
  $cold = $opt_D
130
if ( defined($opt_D) ) {
131
	$cold = $opt_D;
133 132
}
134
if (defined($opt_H)) {
135
  $colh = $opt_H
133
if ( defined($opt_H) ) {
134
	$colh = $opt_H;
136 135
}
137
if (defined($opt_M)) {
138
  $colm = $opt_M
136
if ( defined($opt_M) ) {
137
	$colm = $opt_M;
139 138
}
140 139

  
141
if (defined($opt_R)) {
142
  $rootname = $opt_R;
140
if ( defined($opt_R) ) {
141
	$rootname = $opt_R;
143 142
}
144 143

  
145
if (defined($opt_s)) {
146
  $split = $opt_s
144
if ( defined($opt_s) ) {
145
	$split = $opt_s;
147 146
}
148 147

  
149
if (defined($opt_X)) {
150
  $featcol = $opt_X;
148
if ( defined($opt_X) ) {
149
	$featcol = $opt_X;
151 150
}
152 151

  
153
if (defined($opt_x)) {
154
  @scodes = split(",", $opt_x);
155
  for (my $i=0; $i<=$#scodes; $i++) {
156
    $nt_features_header = $nt_features_header . sprintf("<feature name=\"%s\" domain=\"NT\"></feature>\n", $scodes[$i]);
157
  }
158
  $nt_features_header =~ s/\bid\b/ncaid/;  # avoid reserved Tiger attribute "id"
152
if ( defined($opt_x) ) {
153
	@scodes = split( ",", $opt_x );
154
	for ( my $i = 0 ; $i <= $#scodes ; $i++ ) {
155
		$nt_features_header = $nt_features_header . sprintf( "<feature name=\"%s\" domain=\"NT\"></feature>\n", $scodes[$i] );
156
	}
157
	$nt_features_header =~ s/\bid\b/ncaid/;    # avoid reserved Tiger attribute "id"
159 158
}
160 159

  
160
my @colnames = ( "url", "ID", "FORM", "LEMMA", "PLEMMA", "POS", "PPOS", "FEAT", "PFEAT", "HEAD", "PHEAD", "DEPREL", "PDEPREL" );
161 161

  
162
my @colnames = ("url", "ID", "FORM", "LEMMA", "PLEMMA", "POS", "PPOS", "FEAT", "PFEAT", "HEAD", "PHEAD", "DEPREL", "PDEPREL");
163 162
# my %pos = %lemma = %form = %deprel = %head = %governs = %cat2abbrev = %abbrev2cat = %coordnr_cat = %coordnr_head = %w_coordnr = %w_head = %w_deprel = %first_coordnr = %coordnr_first = %verb_head = ();
164 163
my @coordelements = ();
165 164

  
......
170 169

  
171 170
my $infile = $ARGV[0];
172 171
$infile =~ s/\.conllu?//i;
173
if($infile eq '') {
174
  $infile = 'subcorpus';
172
if ( $infile eq '' ) {
173
	$infile = 'subcorpus';
175 174
}
176 175
my $counter = 1;
177
$suffix = sprintf("%05d", $counter);
176
$suffix = sprintf( "%05d", $counter );
178 177
$infilename = basename($infile);
179 178

  
180 179
$foo = `if [ ! -d $outdir ];then mkdir $outdir;fi`;
181
open(XML, ">$outdir/$infilename-$suffix.xml")  or die "\nopen file error of $outdir/$infilename-$suffix.xml\n";
182
open(LOG, ">$outdir/conversion.log")  or die "\nopen file error of conversion.log\n";
183
open(MASTER, ">$outdir/main.xml")  or die "\nopen file error of main.xml\n";
180
open( XML, ">$outdir/$infilename-$suffix.xml" )
181
  or die "\nopen file error of $outdir/$infilename-$suffix.xml\n";
182
open( LOG, ">$outdir/conversion.log" )
183
  or die "\nopen file error of conversion.log\n";
184
open( MASTER, ">$outdir/main.xml" ) or die "\nopen file error of main.xml\n";
184 185
write_xml_header();
185
write_master_header ();
186
write_master_header();
186 187

  
187 188
# flush output for log and master file
188
select(LOG); $| = 1; 
189
select(MASTER); $| = 1;
189
select(LOG);
190
$| = 1;
191
select(MASTER);
192
$| = 1;
190 193

  
191
$commandline = $0 . " ". (join " ", @ARGV);
194
$commandline = $0 . " " . ( join " ", @ARGV );
192 195
print LOG "$commandline\n\n";
193 196

  
194 197
print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
195 198

  
196
$/ = ""; # treat empty line as RS
199
$/ = "";    # treat empty line as RS
197 200
while (<>) {
198
  if($. % $split == 0) {
199
    print XML "</subcorpus>\n";
200
    close(XML);
201
    $suffix = sprintf("%05d", ++$counter);
202
    open(XML, ">$outdir/$infilename-$suffix.xml")  or die "\nopen file error\n";
203
    write_xml_header();
204
    print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
205
  }
201
	if ( $. % $split == 0 ) {
202
		print XML "</subcorpus>\n";
203
		close(XML);
204
		$suffix = sprintf( "%05d", ++$counter );
205
		open( XML, ">$outdir/$infilename-$suffix.xml" )
206
		  or die "\nopen file error\n";
207
		write_xml_header();
208
		print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
209
	}
206 210

  
207
  # ----------------------------------------
208
  # set root (or fake root if ROOT is missing)
209
  # ----------------------------------------
210
  $rootnode = $fakeroot = 0; # m = Treat string as multiple lines, so that ^ matches beginning of line
211
  $thisrootname = $rootname;
212
  ($rootnode) = (/^(\d+?)\s.*?\b$rootname\b/m);  # real root marked by parser
213
  if($rootnode == 0) {
214
#    ($rootnode) = (/^(\d+?)\t.*?\t-1\t0\t/m);  # no marked ROOT, but top node (head = 0)   TOO SPECIFIC
215
#    ($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);  # no marked ROOT, but top node (head = 0) in col9 (original by Achim Stein)
216
    ($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);  # no marked ROOT, but top node (head = 0) in col7 (updated by AL)
217
    print LOG " Warning sentence $.: not marked root ($rootname), using top node $rootnode\n";
218
    $fakeroot = 1;
219
    $thisrootname = 'nSnt';
220
  }
221
  if($rootnode == 0) {
222
    $rootnode = 1;   # set fake root if nothing goes
223
    print LOG " Error sentence $.: setting fake root to first word:\n$_\n";
224
    $fakeroot = 2;
225
    $thisrootname = 'Err';
226
  }
211
	# ----------------------------------------
212
	# set root (or fake root if ROOT is missing)
213
	# ----------------------------------------
214
	$rootnode = $fakeroot = 0;    # m = Treat string as multiple lines, so that ^ matches beginning of line
215
	$thisrootname = $rootname;
216
	($rootnode) = (/^(\d+?)\s.*?\b$rootname\b/m);    # real root marked by parser
217
	if ( $rootnode == 0 ) {
227 218

  
228
  my @cols = ();
229
  @words = split (/\n/);
230
  @terminals = ();
219
		#    ($rootnode) = (/^(\d+?)\t.*?\t-1\t0\t/m);  # no marked ROOT, but top node (head = 0)   TOO SPECIFIC
220
		#    ($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);  # no marked ROOT, but top node (head = 0) in col9 (original by Achim Stein)
221
		($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);                                     # no marked ROOT, but top node (head = 0) in col7 (updated by AL)
222
		print LOG " Warning sentence $.: not marked root ($rootname), using top node $rootnode\n";
223
		$fakeroot     = 1;
224
		$thisrootname = 'nSnt';
225
	}
226
	if ( $rootnode == 0 ) {
227
		$rootnode = 1;                                                                                # set fake root if nothing goes
228
		print LOG " Error sentence $.: setting fake root to first word:\n$_\n";
229
		$fakeroot     = 2;
230
		$thisrootname = 'Err';
231
	}
231 232

  
232
  %dominates = (); # empty at beginning of sentence
233
  %deprel = (); # empty at beginning of sentence
234
  %aux = (); # empty at beginning of sentence
235
  @daughters = ();
233
	my @cols = ();
234
	@words     = split(/\n/);
235
	@terminals = ();
236 236

  
237
  my $commentlines = 0; #added by AL
238
#  my $contractions = 0; #added by AL
239
#  my $text_id = "unknown_text";
240
my $text_id = $infilename;
241
  my $sent_id = "0";
237
	%dominates = ();                                                                                  # empty at beginning of sentence
238
	%deprel    = ();                                                                                  # empty at beginning of sentence
239
	%aux       = ();                                                                                  # empty at beginning of sentence
240
	@daughters = ();
242 241

  
243
  # ----------------------------------------
244
  # loop through words #1: write tokens (terminal nodes) to XML file
245
  # store tree relevant information for loop #2
246
  # ----------------------------------------
247
  for (my $w=0; $w<=$#words; $w++) {
248
# Added by AL for comment lines
249
    if ($words[$w] =~ /^#/) {
250
	if ($words[$w] =~ /^# newdoc/) {
251
		$text_id = $words[$w];
252
		$text_id =~ s/# newdoc id = //;
253
	}
254
	elsif ($words[$w] =~ /^# sent_id/) {
255
		$sent_id = $words[$w];
256
		$sent_id =~ s/# sent_id = //;
257
	}
... Ce différentiel a été tronqué car il excède la taille maximale pouvant être affichée.

Formats disponibles : Unified diff