/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 3347

     							if (id.contains("-")) { // multi-word line
     								int index = id.indexOf("-")
     								String id1 = id.substring(0, id)
     								String id2 = id.substring(id+1)
     								String id1 = id.substring(0, index)
     								String id2 = id.substring(index+1)
     								def token1 = sentence[id1]
     								def token2 = sentence[id2]

     #!/usr/bin/perl
     use File::Basename;
     # Script identity; $CMD and $VERSION are interpolated into the help text.
     my $CMD      = "conll2tiger.pl";
     my $VERSION  = "1.5";
     my $MODIFIED = "8/12/2015";        # adapted for Perseus CoNLL produced with conll.pl -l.  CHECK: does SRCMF still work?
     # columns (default are the predicted values in CoNLL 2009 format)
     # Column numbers are 0-based indices into the tab-separated fields of a token line.
     my $coll   = 2;                    # lemma
     my $colm   = 3;                    # morph (pos)
     my $colf   = 5;                    # features
     my $colh   = 6;                    # head
     my $cold   = 7;                    # deprel
     my $outdir = "conllexport";        # output directory (option -o)
     my $split  = 1000;                 # split output after nr sentences
     # tree structure
     my %dominates          = ();                                         # dominates{head} = array ref of daughter word numbers
     my %deprel             = ();                                         # deprel{nr} = deprel
     my @daughters          = ();                                         # daughter nodes, stored in %dominates
     my %duplicates         = my %relators = my %notes = my %aux = ();    # store nodes of duplicates, relators
     my $type               = "--";                                       # node attribute
     my $vform              = my $vlemma = "--";                          # node attributes for verbs store form and lemma
     my $label              = "D";                                        # default edge label
     my $nt_features_header = '';                                         # option -x
     my $nt_features        = '';                                         # option -x
     my $nt_empty_features  = '';                                         # option -x
     my @scodes             = ();                                         # option -x
     my $add_to_sentcode    = '';                                         # suffix appended to the sentence id (built from the first -x code)
     my $rootname           = 'root';                                     # default
     my $featcol            = 13;                                         # column holding attribute=value pairs (option -X)
     ######################################################################
     #  conll2tiger.pl: converts CoNLL-U from the Universal Dependencies
     #  treebanks to TigerXML
     #          Achim Stein <achim.stein@ling.uni-stuttgart.de>
     # License : GNU GPL v. 3 (see the LICENSE file)
     ######################################################################
     # TO DO:
     # - coordination
     # - handling of overlong sentences that have been split (conll.pl -r 100)
     #   - for this, $wnr is used as the word ID instead of $w (the for-loop counter)
     #   - but there are still unbound nodes when the governor has been removed (it is in the other part)
     ######################################################################
     # Modifications by Alexei Lavrentiev <alexei.lavrentev@ens-lyon.fr>
     # for Profiterole project (2019-2021)
     # 2019-09-25
     # - updated default column numbers for CoNLL-U SRCMF format
     # - added processing for comment lines
     # - added @textid to terminal nodes
     # - deleted ppos, pmor et plemma (predicted tags and lemmas)
     # - replaced specific SRCMF with standard UD tags
     # Update 2020-05-13
     # - added @editionId for synchronization with BFM word ID
     # Update 2021-03-22
     # - using $infilename for @textid
     # - added support for .conllu extension
     # Update 2021-03-29
     # - added editionId to declarations in main.xml
     # Update 2021-07-16
     # - added "punct" to cat values
     # Update 2021-07-20
     # - added cat value list compiled from
     #   https://universaldependencies.org/ext-dep-index.html and the previous
     #   version. All relation types and subtypes from the UD 2.8 corpora
     #   should be there.
     # - contractions indexed
     ######################################################################
     # Usage text printed to STDERR when -h is given. $CMD, $VERSION and the
     # option defaults ($outdir, $cold, $colh, $colm, $colf, $rootname, $split,
     # $featcol) are interpolated here, i.e. with the values they hold before the
     # command line is parsed.
     my $HELP = "
     ==================================================================
     $CMD $VERSION: Help
     ==================================================================
     FUNKTION: converts CoNLL parser output to TigerXML (for mate tools)
               creates master file, splits input files, corrects unbound nodes
     SYNTAX:      $CMD [Options] <CoNLL file>
     OPTIONEN:
      -c          ignore coordination (delete coordx- prefix in deprel)
      -C str      corpus specials: nca
      -h          show help
      -o          create all files in this output directory (default: $outdir)
     set COLUMNS for required info (0 = column 1, 1 = column 2, etc.)
      -D nr       colum for deprel default=$cold
      -H nr       colum for head default=$colh
      -M nr       colum for morphology (POS) default=$colm
      -F nr       colum for morph. features default=$colf
      -R str      Root category (default: $rootname)
      -s nr       split output files after each nr sentence (default = $split)
      -x str,...  include these attributes if present in the -X column of the first word
                  (the first code is also copied into the sentence id)
      -X nr       the column where attributes are stored (default: $featcol)
     EXAMPLE:
       - For mate parser output: no further options required
         $CMD parsed.conll
       - For Le Monde 2005: include attributes
         gunzip -c parsed.conll.gz | conll2tiger.pl -x date,artnr,rubr
       - For NCA:
         conll2tiger.pl -C nca -x id,deaf,titreDees,editionDees,manuscritDees,regionDees,coefficientRegionDees,dateMoyenneDees,codeRegional,coefficientRegional,vers,ponctuation,mots,passage,commentairePhilologique,qualite,sourceQualite,commentaireForme,auteur,dateComposition,dateManuscrit,lieuComposition,lieuManuscrit,sourceDateComposition,sourceDateManuscrit,sourceLieuComposition,sourceLieuManuscrit,genre,traditionTextuelle,analyses,lignes,editionNCA tagged-oldfrench-lrec2014-dep.conll
     ";
     ###########################################################################
     #                    DO NOT MODIFY FOLLOWING CODE !
     ###########################################################################
     ###########################################################################
     # parse the command line
     ###########################################################################
     use Getopt::Std;
     # Option spec: a letter followed by ':' takes an argument.
     #  - -c is a plain flag, as documented in the help text. It used to be
     #    declared as "c:", which made it swallow the next command-line word
     #    (typically the input file name or the following option).
     #  - -F is documented in the help text but was never declared or handled.
     # getopts() stores each value in the package variable $opt_<letter>.
     getopts('cC:hD:F:H:M:o:R:s:x:X:');
     if ( defined($opt_h) ) {
         print STDERR "$HELP";
         exit(0);
     }
     # Simple overrides of the defaults declared at the top of the file.
     if ( defined($opt_o) ) {
         $outdir = $opt_o;
     }
     if ( defined($opt_C) ) {
         $corpus = $opt_C;
     }
     if ( defined($opt_D) ) {
         $cold = $opt_D;
     }
     if ( defined($opt_F) ) {
         $colf = $opt_F;
     }
     if ( defined($opt_H) ) {
         $colh = $opt_H;
     }
     if ( defined($opt_M) ) {
         $colm = $opt_M;
     }
     if ( defined($opt_R) ) {
         $rootname = $opt_R;
     }
     if ( defined($opt_s) ) {
         $split = $opt_s;
     }
     if ( defined($opt_X) ) {
         $featcol = $opt_X;
     }
     # -x: comma-separated attribute codes; build one NT feature declaration per code.
     if ( defined($opt_x) ) {
         @scodes = split( ",", $opt_x );
         for my $code (@scodes) {
             $nt_features_header .= sprintf( "<feature name=\"%s\" domain=\"NT\"></feature>\n", $code );
         }
         $nt_features_header =~ s/\bid\b/ncaid/;    # avoid reserved Tiger attribute "id"
     }
     my @colnames = ( "url", "ID", "FORM", "LEMMA", "PLEMMA", "POS", "PPOS", "FEAT", "PFEAT", "HEAD", "PHEAD", "DEPREL", "PDEPREL" );
     # my %pos = %lemma = %form = %deprel = %head = %governs = %cat2abbrev = %abbrev2cat = %coordnr_cat = %coordnr_head = %w_coordnr = %w_head = %w_deprel = %first_coordnr = %coordnr_first = %verb_head = ();
     my @coordelements = ();
     my $id = my $form = my $lemma = my $plemma = my $pos = my $ppos = my $feat = my $pfeat = my $head = my $phead = my $deprel = my $pdeprel = my $edition_id = "";
     my $timestamp = `date`;
     chomp($timestamp);

     # Derive the base name of the output files from the input file name.
     # $ARGV[0] is undefined when the input comes from STDIN; fall back to
     # 'subcorpus' in that case.
     my $infile = defined( $ARGV[0] ) ? $ARGV[0] : '';
     # Strip only a trailing .conll / .conllu extension. The unanchored pattern
     # also removed ".conll" from the middle of a path (e.g. a directory name).
     $infile =~ s/\.conllu?\z//i;
     if ( $infile eq '' ) {
         $infile = 'subcorpus';
     }
     my $counter = 1;
     $suffix = sprintf( "%05d", $counter );
     $infilename = basename($infile);

     # Create the output directory with the builtin instead of a shell command,
     # so that a directory name with spaces or shell metacharacters is safe.
     if ( !-d $outdir ) {
         mkdir($outdir) or die "\ncannot create output directory $outdir: $!\n";
     }

     # Three-argument open: the file name can never be mistaken for a mode.
     open( XML,    '>', "$outdir/$infilename-$suffix.xml" ) or die "\nopen file error of $outdir/$infilename-$suffix.xml\n";
     open( LOG,    '>', "$outdir/conversion.log" )          or die "\nopen file error of conversion.log\n";
     open( MASTER, '>', "$outdir/main.xml" )                or die "\nopen file error of main.xml\n";
     write_xml_header();
     write_master_header();
     # flush output for log and master file
     select(LOG);
     $| = 1;
     select(MASTER);
     $| = 1;
     $commandline = $0 . " " . ( join " ", @ARGV );
     print LOG "$commandline\n\n";
     print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
     $/ = "";    # paragraph mode: an empty line is the record separator, so one read = one sentence
     # Main loop. The input record separator is set to "" before this loop, so
     # each iteration reads one blank-line-separated sentence block into $_ and
     # $. counts sentences. For every sentence: determine the root, write the
     # terminal nodes (loop #1), then build the non-terminal nodes (loop #2).
     while (<>) {
         # Start a new output file every $split sentences. A non-positive $split
         # disables splitting instead of dying on a zero modulus.
         if ( $split > 0 && $. % $split == 0 ) {
             print XML "</subcorpus>\n";
             close(XML);
             $suffix = sprintf( "%05d", ++$counter );
             open( XML, '>', "$outdir/$infilename-$suffix.xml" ) or die "\nopen file error\n";
             write_xml_header();
             print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
         }
         # ----------------------------------------
         # set root (or fake root if ROOT is missing)
         # ----------------------------------------
         $rootnode = $fakeroot = 0;
         $thisrootname = $rootname;
         # /m = treat string as multiple lines, so that ^ matches beginning of line
         ($rootnode) = (/^(\d+?)\s.*?\b$rootname\b/m);    # real root marked by parser
         if ( $rootnode == 0 ) {
             # No marked ROOT: take the top node (head = 0). The original version by
             # Achim Stein looked for the 0 in column 9; AL updated it to column 7.
             ($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);
             print LOG " Warning sentence $.: not marked root ($rootname), using top node $rootnode\n";
             $fakeroot     = 1;
             $thisrootname = 'nSnt';
         }
         if ( $rootnode == 0 ) {
             $rootnode = 1;    # set fake root if nothing goes
             print LOG " Error sentence $.: setting fake root to first word:\n$_\n";
             $fakeroot     = 2;
             $thisrootname = 'Err';
         }
         my @cols = ();
         @words     = split(/\n/);
         @terminals = ();
         %dominates = ();    # empty at beginning of sentence
         %deprel    = ();    # empty at beginning of sentence
         %aux       = ();    # empty at beginning of sentence
         @daughters = ();
         # Number of skipped lines (comments and contraction ranges) seen so far;
         # subtracted from the line index to obtain the word position (added by AL).
         my $commentlines = 0;
         my $text_id      = $infilename;
         my $sent_id      = "0";
         # ----------------------------------------
         # loop through words #1: write tokens (terminal nodes) to XML file
         # store tree relevant information for loop #2
         # ----------------------------------------
         for ( my $w = 0 ; $w <= $#words ; $w++ ) {
             # Added by AL for comment lines
             if ( $words[$w] =~ /^#/ ) {
                 if ( $words[$w] =~ /^# newdoc/ ) {
                     $text_id = $words[$w];
                     $text_id =~ s/# newdoc id = //;
                 }
                 elsif ( $words[$w] =~ /^# sent_id/ ) {
                     $sent_id = $words[$w];
                     $sent_id =~ s/# sent_id = //;
                 }
                 $commentlines++;
                 next;
             }
             # Added by AL for contractions (ID ranges such as "3-4")
             elsif ( $words[$w] =~ /^\d+-\d+/ ) {
                 $commentlines++;
                 next;
             }
             else {
                 if ( defined($opt_c) ) {
                     $words[$w] =~ s/coord(\d+)-//g;
                 }
                 @cols       = split( /\t/, $words[$w] );
                 $wnr        = $cols[0];
                 $word       = $cols[1];
                 $lemma      = $cols[2];
                 $plemma     = $cols[2];        # predicted
                 $pos        = $cols[3];
                 $ppos       = $cols[4];        # predicted
                 $mor        = $cols[5];
                 $pmor       = $cols[5];        # predicted
                 $cat        = $cols[$cold];
                 $edition_id = $cols[9];
                 # Keep only the value of XmlId=... ($1, not the deprecated \1, in the replacement).
                 $edition_id =~ s/^.*XmlId=([^|]+).*$/$1/g;
                 if ( $cat =~ /[<>]/ ) {
                     print LOG "Warning in line $.: illegal node name: \"$cat\" -> \"Err2\"\n";
                     $cat = 'Err2';
                 }
                 # NCA: enclose lemmas in underscores (easier for regex construction)
                 if ( $corpus =~ /nca/i ) {
                     $lemma = "_" . "$lemma" . "_";
                 }
                 clean_data();
                 # get attribute-value pairs from col #13 of first word (option -x)
                 # String comparison: the former numeric "==" was true for every
                 # non-numeric (or undefined) -x value, so "all=" was always prepended
                 # and swallowed the first real attribute.
                 if ( defined($opt_x) && $opt_x eq "all" ) {
                     $cols[$featcol] = "all=" . $cols[$featcol];
                 }
                 if ( $w == 0 && $cols[$featcol] =~ /=/ ) {
                     $nt_features = $nt_empty_features = '';
                     while ( $cols[$featcol] =~ m/ ?([^=]*?)="?([^, ]+)\b"?\b/gs ) {    # maybe unquoted values (e.g. Le Monde 2005)
                         $att = $1;
                         $val = $2;
                         # pick the attributes that match those of the command line option -x
                         for ( my $t = 0 ; $t <= $#scodes ; $t++ ) {
                             if ( $att eq $scodes[$t] ) {
                                 $val =~ s/\&/\&amp;/g;    #  replace "&" in values (appears in URLs)
                                 if ( $t == 0 ) { $add_to_sentcode = "_$att$val"; }
                                 $nt_features = $nt_features . " $att=\"$val\"";
                             }
                             if ( $att eq $scodes[$t] ) { $nt_empty_features = $nt_empty_features . " $att=\"--\""; }
                         }
                     }
                     # replace the reserved feature 'id' (Tiger)
                     $add_to_sentcode =~ s/\bid=/ncaid=/;
                     $nt_features =~ s/\bid=/ncaid=/;
                     $nt_empty_features =~ s/\bid=/ncaid=/;
                 }    # if col 13 contains attributes
                 else {
                     if ( defined($opt_x) && ( $w == 0 ) ) {
                         print STDERR "Warning: sentence=$.  option -x is defined, but no attribute=value declarations were found!\n";
                     }
                 }
                 # store output for terminal node in array, output later. For double categories make a duplicate node.
                 # (ppos, pmor and plemma are no longer written as terminal attributes.)
                 $tempid = sprintf( "%d_%d", $., $wnr );
                 push( @terminals,
                     sprintf( "      <t id=\"s%d_%d\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., $wnr, $word, $pos, $mor, $lemma, $text_id, $edition_id ) );
                 if ( $cat =~ /_/ ) {
                     push( @terminals,
                         sprintf( "      <t id=\"s%d_%d_dupl\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., $wnr, "*", "_", "_", "_", $text_id, $edition_id ) );
                     $duplicates{$tempid} = 1;    # store, check later to attach the duplicates to the mother
                 }
                 # associate Aux with main verb, to create an attribute in the verb node in loop #2 (TODO: more than one Aux)
                 if ( $cat =~ /Aux/ ) {
                     $aux{ $cols[$colh] } = "$word" . "_" . "$plemma";    # $aux{head} = word_lemma (of Aux)
                 }
                 # ----------------------------------------
                 # store information needed for tree
                 # ----------------------------------------
                 # if fake rootnode == 1: nSnt as root node
                 if ( ( $fakeroot == 1 ) && ( $w - $commentlines + 1 == $rootnode ) ) {
                     $cat = 'nSnt';
                     $notes{$tempid} = 'Warning no marked ROOT node in CoNLL';    # TODO: does not work
                 }
                 # if fake rootnode == 2: flatten structure: attach all words to the first word
                 if ( ( $fakeroot == 2 ) && ( $w - $commentlines + 1 != $rootnode ) ) {
                     $cols[$colh] = 1;
                     $notes{$tempid} = 'Error neither ROOT node nor top node in CoNLL';
                 }
                 # correct unbound words in parser output (phead = 0, but not marked as ROOT)
                 if ( ( $cols[$colh] eq "0" ) && ( $w - $commentlines + 1 != $rootnode ) ) {    # AL: added -$commentlines
                     printf LOG " Warning sentence $. ($tempid): unbound node %d (attached to root %d)\n", ( $w - $commentlines + 1 ), $rootnode;
                     $cols[$colh]    = $rootnode;
                     $cat            = 'Err';    # let Err instead of deprel appear in dom attribute
                     $notes{$tempid} = 'Warning unbound node in CoNLL';
                 }
                 # store for R edge labels
                 if ( $cols[$cold] =~ /RelN?C/ ) {
                     $relators{$tempid} = 1;
                 }
                 # store deprel for dom attribute
                 $deprel{$tempid} = $cat;
                 # unless this word is the root, append it to the daughter list of its head: dominates{head} = [daughters]
                 if ( ( $fakeroot < 2 ) && ( $w - $commentlines + 1 != $rootnode ) ) {
                     @daughters = @{ $dominates{ $cols[$colh] } };    # get the array from the hash of the dominating node
                     push( @daughters, $wnr );
                     $dominates{ $cols[$colh] } = [@daughters];
                 }
             }    # end of token line (AL condition)
         }    # for each word, loop #1
         # print graph code (needs root attribute) and terminal nodes
         if ( $rootnode == 0 ) {
             $noroot++;
             print LOG "Error sentence $. ($tempid): root node not found:\n$_\n";
             next;
         }
         else {
             # Pass the ids as printf arguments rather than interpolating them into
             # the format string, so that a '%' in an id cannot corrupt the output.
             printf XML "<s id=\"s%s%s\" textid=\"%s\" sentid=\"%s\">\n", $., $add_to_sentcode, $text_id, $sent_id;
             print XML "  <graph root=\"n$._$rootnode\">\n";
             print XML "    <terminals>\n";
             for ( my $t = 0 ; $t <= $#terminals ; $t++ ) {
                 print XML $terminals[$t];
             }
             print XML "    </terminals>\n";
         }
         # ----------------------------------------
         # loop through words #2 to build Tiger tree (non terminal nodes)
         # ----------------------------------------
         print XML "    <nonterminals>\n";
         for ( my $i = 0 ; $i <= $#words ; $i++ ) {
             # Added AL for comment lines
             if ( $words[$i] =~ /^#/ ) {
                 next;
             }
             # Added AL for contractions
             if ( $words[$i] =~ /^\d+-\d+/ ) {
                 next;
             }
             else {
                 @cols = split( /\t/, $words[$i] );
                 $w = $cols[0];
                 ### TODO: redundant variable assignment (= loop #1)??
                 # NOTE(review): these column indices differ from loop #1 (pos = 3,
                 # mor = 5 there); with the 0-based defaults above, column 6 is the
                 # head column. Left unchanged - confirm which layout is intended.
                 $word   = $cols[1];
                 $lemma  = $cols[2];
                 $plemma = $cols[3];    # predicted
                 $pos    = $cols[4];
                 $ppos   = $cols[5];    # predicted
                 $mor    = $cols[6];
                 $pmor   = $cols[7];    # predicted
                 $cat    = $cols[$cold];
                 if ( $cat =~ /[<>]/ ) {
                     # Handle name fixed: "Log" was never opened, the log handle is LOG.
                     print LOG "Warning in line $.: illegal node name: \"$cat\" -> \"Err2\"\n";
                     $cat = 'Err2';
                 }
                 #  OF parser has not learned punctuation: set cat for punctuation to PON
                 if ( ( $corpus =~ /nca/i ) && ( $pos eq 'PON' ) ) {
                     $cols[$cold] = $cat = 'Pon';
                 }
                 clean_data();
                 # retrieve daughters, make dom attribute (string of dominated nodes)
                 @daughters = @{ $dominates{"$w"} };
                 $dom       = '';
                 for ( my $d = 0 ; $d <= $#daughters ; $d++ ) {
                     $dom = $dom . "_" . $deprel{"$._$daughters[$d]"};
                 }
                 if ( $dom =~ /_/ ) {
                     $dom =~ s/^_//;
                 }
                 else {
                     $dom = '--';
                 }
                 # if verbal, set node attributes for verb form and lemma
                 $type = "nV";
                 $vform = $vlemma = "--";
                 if ( $pos =~ /VER/ ) {    # AL: $ppos -> $pos
                     if    ( $mor =~ /infi/ )       { $type = "VInf"; }    # AL: $pmor -> $mor
                     elsif ( $pmor =~ /pper|ppre/ ) { $type = "VPar"; }    # NOTE(review): still reads $pmor - confirm
                     else                           { $type = "VFin"; }
                     # if Aux is present, create attribute for main verb
                     if ( $aux{$w} =~ /(.*?)_(.*)/ ) {
                         $vform  = "$1";
                         $vlemma = "$2";
                     }
                     # else create attr for simple verb
                     else {
                         $vform  = $word;
                         $vlemma = $lemma;    # AL: $plemma -> $lemma (always void in SRCMF)
                     }
                     # NCA: enclose lemmas in underscores (easier for regex construction)
                     if ( $corpus =~ /nca/i ) {
                         $vlemma = "_" . "$vlemma" . "_";
                     }
                 }
                 # call output function (twice for duplicate categories)
                 if ( $cat =~ /(.*?)_(.*)/ ) {
                     write_nonterminals( "$2", "" );         # RelNC is always node (see clean categories), function is duplicate, e.g. SjPer_RelNC
                     write_nonterminals( "$1", "_dupl" );    # other category is duplicate
                 }
                 else {
                     write_nonterminals($cat);
                 }
             }    # end of token line (AL condition)
         }    # for words, loop #2
         print XML "    </nonterminals>\n";
         print XML "  </graph>\n";
         print XML "</s>\n";
         # progress counter on STDERR, refreshed every 100 sentences
         if ( $. % 100 == 0 ) { print STDERR "\b\b\b\b\b\b\b\b"; printf STDERR "%08d", $.; }
     }    # main
     # Close the last subcorpus file, print the summary and hints, then finish
     # the master file and release all output handles.
     print XML "</subcorpus>\n";
     print STDERR join( '',
         "\n$CMD: $. sentences converted. Results in $outdir. Log in $outdir/conversion.log.\n",
         "   Hint 1: on OS X convert master file to MacRoman, e.g  iconv -f latin1 -t macroman\n",
         "   Hint 2: use tiger.pl -c <Tiger XML file> to detect unbound nodes.\n",
         "   Hint 3: build reliable feature declarations using tiger.sh\n",
         "           tiger.sh -a \"lemma word pos ppos\"  (for terminals)\n",
         "           tiger.sh -A \"lemma word pos ppos\"  (for non-terminals)\n" );
     print STDERR "$noroot sentences ignored: root not found (see log file)\n" if $noroot > 0;
     write_master_footer();
     close(MASTER);
     close(XML);
     close(LOG);
     exit;
     # ----------------------------------------
     # sub
     # ----------------------------------------
     # Write the XML declaration and the opening <subcorpus> element for the
     # current output file to the XML handle. The element name is built from
     # the globals $infilename and $suffix.
     sub write_xml_header {
         my $subcorpus_name = "$infilename-$suffix";
         my $declaration    = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
         my $opening_tag    = "       <subcorpus name=\"$subcorpus_name\">";
         # The trailing run of blanks reproduces the original literal exactly.
         print XML $declaration . "\n" . $opening_tag . "\n" . "     ";
     }
     sub write_master_header {
         printf MASTER '<?xml version="1.0" encoding="UTF-8"?>
     ';
         printf MASTER "<corpus id=\"$corpus\">
     <head>
       <meta><name>$corpus</name>
         <author>ILR Stuttgart</author>
         <date></date>
         <description>Parsed with mate tools using a SRCMF-based grammar model (http://srcmf.org). </description>
         <format>SRCMF</format>
         <history>TigerXML converted by conll2tiger.pl</history>
       </meta>
     ";
         #  printf MASTER '<annotation>
         #<feature name="word" domain="T" ></feature>
         #<feature name="pos" domain="T" ></feature>
         #<feature name="mor" domain="T" ></feature>
         #<feature name="lemma" domain="T" ></feature>
         #<feature name="ppos" domain="T" ></feature>
         #<feature name="pmor" domain="T" ></feature>
         #<feature name="plemma" domain="T" ></feature>
         #<feature name="cat" domain="NT" >
         #  <value name="Apst">apostrophe</value>
         #  <value name="AtObj">attribut d objet</value>
         #  <value name="AtRfc">attribut réfléchi</value>
         #  <value name="AtSj">attribut de sujet</value>
         #  <value name="AttributReflechi">attribut réfléchi</value>
         #  <value name="Aux">auxilié</value>
         #  <value name="AuxA">auxilié actif</value>
         #  <value name="AuxP">auxilié passif</value>
         #  <value name="Circ">circonstant</value>
         #  <value name="Circ_RelNC">circonstant pronom relatif</value>
         #  <value name="Cmpl">complément</value>
         #  <value name="Cmpl_RelNC">complément pronom relatif</value>
         #  <value name="Coo">coordination</value>
         #  <value name="Det">déterminant</value>
         #  <value name="Err">unbound node in CoNLL input</value>
         #  <value name="Err2">illegal node name was replaced</value>
         #  <value name="GpCoo">coordonné</value>
         #  <value name="Ignorer">Ignorer</value>
         #  <value name="Insrt">incidente</value>
         #  <value name="Intj">interjection</value>
         #  <value name="Lac">lacune</value>
         #  <value name="ModA">modifieur attaché</value>
         #  <value name="ModD">modifieur détaché</value>
         #  <value name="Ng">négation</value>
         #  <value name="NgPrt">forclusif</value>
         #  <value name="Obj">objet</value>
         #  <value name="Obj_RelNC">direct object pronom relatif</value>
         #  <value name="Pon">ponctuation</value>
         #  <value name="PON">ponctuation</value>
         #  <value name="Regim">régime</value>
         #  <value name="RelC">relateur coordonnant</value>
         #  <value name="RelNC">relateur non coordonnant</value>
         #  <value name="Rfc">réfléchi</value>
         #  <value name="Rfx">réfléxif renforcé</value>
         #  <value name="SjImp">sujet impersonnel</value>
         #  <value name="SjPer">sujet personnel</value>
         #  <value name="SjPer_RelNC">sujet personnel pronom relatif</value>
         #  <value name="Snt">phrase</value>
         #  <value name="ROOT">phrase</value>
         #  <value name="StructureMaximale">structure maximale</value>
         #  <value name="VFin">verbe fini</value>
         #  <value name="VInf">verbe infinitif</value>
         #  <value name="nMax">structure non-maximale</value>
         #  <value name="nSnt">non-phrase</value>
         #</feature>
         #<feature name="coord" domain="NT" ></feature>
         #<feature name="dom" domain="NT" ></feature>
         #<feature name="type" domain="NT" >
         #  <value name="nV">élément non-verbal</value>
         #  <value name="VFin">verbe fini</value>
         #  <value name="VInf">verbe infinitif</value>
         #  <value name="VPar">verbe participial</value>
         #  <value name="--">nil</value>
         #</feature>
         #<feature name="vform" domain="NT"></feature>
         #<feature name="vlemma" domain="NT"></feature>
         #<feature name="note" domain="NT"></feature>
         #<feature name="snr" domain="NT"></feature>
         #';
         printf MASTER '<annotation>
     <feature name="word" domain="T" ></feature>
     <feature name="pos" domain="T" ></feature>
     <feature name="mor" domain="T" ></feature>
     <feature name="lemma" domain="T" ></feature>
     <feature name="textid" domain="T" ></feature>
     <feature name="editionId" domain="T" ></feature>
     <feature name="cat" domain="NT" >
       <value name="__UNDEF__">UNDEFINED !!!</value>
       <value name="acl:adv">acl:adv</value> <!-- Ukrainian -->
       <value name="acl:attr">acl:attr</value> <!-- Chukchi -->
       <value name="acl:cleft">acl:cleft</value> <!-- Norwegian, Swedish -->
       <value name="acl:fixed">acl:fixed</value> <!-- Beja -->
       <value name="acl:inf">acl:inf</value> <!-- Portuguese -->
       <value name="acl:relat">acl:relat</value> <!-- Chukchi -->
       <value name="acl:relcl">relative clause modifier</value> <!-- Akkadian, Albanian, Apurina, Arabic, Armenian, Assyrian, Beja, Belarusian, Breton, Bulgarian, Chinese, Czech, Danish, Dutch, English, Erzya, Estonian, Faroese, Finnish, French, German, Greek, Hebrew, Hindi, Hindi English, Icelandic, Indonesian, Irish, Italian, Karelian, Kazakh, Komi Permyak, Komi Zyrian, Korean, Latin, Lithuanian, Livvi, Manx, Marathi, Moksha, Naija, North Sami, Norwegian, Old East Slavic, Old French, Persian, Polish, Portuguese, Russian, Sanskrit, Scottish Gaelic, Slovak, Spanish, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Telugu, Thai, Ukrainian, Urdu, Welsh, Western Armenian, Wolof -->
       <value name="acl">clausal modifier of noun (adnominal clause)</value>
       <value name="advcl:abs">advcl:abs</value> <!-- Latin -->
       <value name="advcl:cau">advcl:cau</value> <!-- Moksha -->
       <value name="advcl:cleft">advcl:cleft</value> <!-- French, Naija -->
       <value name="advcl:cmpr">advcl:cmpr</value> <!-- Latin, Polish -->
       <value name="advcl:cond">advcl:cond</value> <!-- Tamil, Telugu, Uyghur -->
       <value name="advcl:coverb">advcl:coverb</value> <!-- Cantonese -->
       <value name="advcl:eval">advcl:eval</value> <!-- Komi Zyrian -->
       <value name="advcl:lcl">advcl:lcl</value> <!-- Komi Permyak -->
       <value name="advcl:lto">advcl:lto</value> <!-- Komi Zyrian -->
       <value name="advcl:mcl">advcl:mcl</value> <!-- Komi Permyak -->
       <value name="advcl:pred">advcl:pred</value> <!-- Latin -->
       <value name="advcl:relcl">advcl:relcl</value> <!-- Polish, Western Armenian -->
       <value name="advcl:sp">advcl:sp</value> <!-- Ukrainian -->
       <value name="advcl:svc">advcl:svc</value> <!-- Ukrainian -->
       <value name="advcl:tcl">advcl:tcl</value> <!-- Apurina, Erzya, Komi Permyak, Komi Zyrian, Moksha, Romanian, Skolt Sami -->
       <value name="advcl">adverbial clause modifier</value>
       <value name="advmod:arg">advmod:arg</value> <!-- Polish -->
       <value name="advmod:cau">advmod:cau</value> <!-- Erzya, Komi Zyrian, Moksha -->
       <value name="advmod:comp">advmod:comp</value> <!-- Erzya -->
       <value name="advmod:deg">advmod:deg</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami -->
       <value name="advmod:det">advmod:det</value> <!-- Ukrainian -->
       <value name="advmod:df">advmod:df</value> <!-- Cantonese, Chinese -->
       <value name="advmod:emph">emphasizing word, intensifier</value> <!-- Akkadian, Arabic, Armenian, Catalan, Chukchi, Croatian, Czech, Indonesian, Komi Zyrian, Latin, Lithuanian, Polish, Sanskrit, Slovak, South Levantine Arabic, Tamil, Turkish, Turkish German, Upper Sorbian, Uyghur, Western Armenian -->
       <value name="advmod:eval">advmod:eval</value> <!-- Erzya, Komi Zyrian, Moksha, Skolt Sami -->
       <value name="advmod:fixed">advmod:fixed</value> <!-- Beja -->
       <value name="advmod:foc">advmod:foc</value> <!-- Erzya, Komi Zyrian, Moksha, Skolt Sami -->
       <value name="advmod:freq">advmod:freq</value> <!-- Komi Zyrian, Moksha -->
       <value name="advmod:lfrom">advmod:lfrom</value> <!-- Erzya, Komi Zyrian, Moksha -->
       <value name="advmod:lmod">locative adverbial modifier</value> <!-- Apurina, Danish, Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami -->
       <value name="advmod:lmp">advmod:lmp</value> <!-- Erzya, Komi Zyrian -->
       <value name="advmod:locy">advmod:locy</value> <!-- Hungarian -->
       <value name="advmod:lto">advmod:lto</value> <!-- Erzya, Komi Zyrian, Moksha -->
       <value name="advmod:mmod">advmod:mmod</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami -->
       <value name="advmod:mode">advmod:mode</value> <!-- Hungarian -->
       <value name="advmod:neg">advmod:neg</value> <!-- Apurina, Buryat, Kiche, Kurmanji, Latin, Maltese, Polish, Skolt Sami -->
       <value name="advmod:obl">adverbial modifier + oblique nominal</value> <!-- Old French -->
       <value name="advmod:que">advmod:que</value> <!-- Hungarian -->
       <value name="advmod:tfrom">advmod:tfrom</value> <!-- Hungarian -->
       <value name="advmod:tlocy">advmod:tlocy</value> <!-- Hungarian -->
       <value name="advmod:tmod">advmod:tmod</value> <!-- Apurina, Erzya, Komi Permyak, Komi Zyrian, Moksha, Romanian, Skolt Sami -->
       <value name="advmod:to">advmod:to</value> <!-- Hungarian -->
       <value name="advmod:tto">advmod:tto</value> <!-- Hungarian -->
       <value name="advmod">adverbial modifier</value>
       <value name="amod:att">amod:att</value> <!-- Hungarian -->
       <value name="amod:attlvc">amod:attlvc</value> <!-- Hungarian -->
       <value name="amod:flat">amod:flat</value> <!-- Polish -->
       <value name="amod">adjectival modifier</value>
       <value name="appos:trans">appos:trans</value> <!-- Turkish German -->
       <value name="appos">appositional modifier</value>
       <value name="aux:aff">aux:aff</value> <!-- Beja -->
       <value name="aux:aspect">aux:aspect</value> <!-- Komi Zyrian -->
       <value name="aux:caus">aux:caus</value> <!-- Armenian, French, Western Armenian -->
       <value name="aux:clitic">aux:clitic</value> <!-- Polish -->
       <value name="aux:cnd">aux:cnd</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Polish -->
       <value name="aux:ex">aux:ex</value> <!-- Armenian, Western Armenian -->
       <value name="aux:imp">aux:imp</value> <!-- Erzya, Polish -->
       <value name="aux:nec">aux:nec</value> <!-- Komi Zyrian, Moksha, Skolt Sami -->
       <value name="aux:neg">aux:neg</value> <!-- Chukchi, Erzya, Komi Permyak, Komi Zyrian, Maltese, Moksha, North Sami, Skolt Sami, Tamil -->
       <value name="aux:opt">aux:opt</value> <!-- Erzya, Moksha -->
       <value name="aux:part">aux:part</value> <!-- Maltese -->
       <value name="aux:pass">passive auxilary</value> <!-- Afrikaans, Ancient Greek, Arabic, Assyrian, Belarusian, Bhojpuri, Breton, Bulgarian, Buryat, Chinese, Czech, Dutch, English, Faroese, Finnish, French, Frisian Dutch, Galician, German, Hindi, Italian, Kangri, Karelian, Latin, Latvian, Lithuanian, Maltese, Marathi, Norwegian, Old Church Slavonic, Old East Slavic, Old French, Persian, Polish, Portuguese, Romanian, Russian, Slovak, Spanish, Swedish, Swiss German, Tamil, Thai, Turkish German, Upper Sorbian, Vietnamese -->
       <value name="aux:pot">aux:pot</value> <!-- Komi Zyrian -->
       <value name="aux:q">aux:q</value> <!-- Erzya, Turkish, Turkish German -->
       <value name="aux:tense">aux:tense</value> <!-- French, Komi Zyrian, Skolt Sami -->
       <value name="aux">auxiliary</value>
       <value name="case:acc">case:acc</value> <!-- Hebrew -->
       <value name="case:adv">case:adv</value> <!-- Indonesian -->
       <value name="case:aff">case:aff</value> <!-- Beja -->
       <value name="case:det">preposition with determiner</value> <!-- Maltese, Old French -->
       <value name="case:gen">case:gen</value> <!-- Hebrew -->
       <value name="case:loc">case:loc</value> <!-- Armenian, Cantonese, Chinese, Western Armenian -->
       <value name="case:pred">case:pred</value> <!-- Welsh -->
       <value name="case:voc">case:voc</value> <!-- Irish, Scottish Gaelic -->
       <value name="case">case marking</value>
       <value name="cc:nc">cc:nc</value> <!-- Old French -->
       <value name="cc:nc">Coordinated conjunct : non coordonant</value>
       <value name="cc:preconj">cc:preconj</value> <!-- Arabic, English, Erzya, Estonian, Faroese, Finnish, German, Indonesian, Komi Permyak, Komi Zyrian, Moksha, North Sami, Persian, Polish, Portuguese, Romanian, Slovenian, Spanish, Thai, Turkish -->
       <value name="cc:preconj">preconjunct</value>
       <value name="cc">Coordinating conjunction</value>
       <value name="cc">coordinating conjunction</value>
       <value name="ccomp:cleft">ccomp:cleft</value> <!-- Polish -->
       <value name="ccomp:obj">ccomp:obj</value> <!-- Hungarian, Polish -->
       <value name="ccomp:obl">ccomp:obl</value> <!-- Hungarian -->
       <value name="ccomp:pmod">ccomp:pmod</value> <!-- Romanian -->
       <value name="ccomp:pred">ccomp:pred</value> <!-- Hungarian -->
       <value name="ccomp">clausal complement</value>
       <value name="clf">classifier</value>
       <value name="compound:a">compound:a</value> <!-- Indonesian -->
       <value name="compound:affix">compound:affix</value> <!-- Hebrew -->
       <value name="compound:dir">compound:dir</value> <!-- Cantonese, Chinese -->
       <value name="compound:ext">compound:ext</value> <!-- Cantonese, Chinese -->
       <value name="compound:lvc">compound:lvc</value> <!-- Armenian, Hindi, Kazakh, Khunsari, Korean, Kurmanji, Marathi, Nayini, Persian, Soi, Tamil, Telugu, Turkish, Turkish German, Uyghur, Western Armenian -->
       <value name="compound:lvc">light verb construction</value>
       <value name="compound:nn">compound:nn</value> <!-- Finnish, Livvi, North Sami -->
       <value name="compound:preverb">compound:preverb</value> <!-- Hungarian -->
       <value name="compound:prt">compound:prt</value> <!-- Afrikaans, Arabic, Danish, Dutch, English, Erzya, Estonian, Faroese, Finnish, Frisian Dutch, German, Icelandic, Irish, Karelian, Komi Permyak, Naija, Norwegian, Persian, Spanish, Swedish, Swedish Sign Language, Swiss German, Tamil, Thai, Turkish German, Wolof, Yoruba -->
       <value name="compound:prt">phrasal verb particle</value>
       <value name="compound:quant">compound:quant</value> <!-- Cantonese -->
       <value name="compound:redup">reduplicated compounds</value> <!-- Armenian, Bambara, Classical Chinese, Erzya, Hindi, Kurmanji, Marathi, Naija, Tagalog, Tamil, Telugu, Turkish, Turkish German, Uyghur, Welsh, Western Armenian -->
       <value name="compound:smixut">compound:smixut</value> <!-- Hebrew -->
       <value name="compound:svc">serial verb compounds</value> <!-- Amharic, Armenian, Marathi, Mbya Guarani, Naija, Swedish Sign Language, Telugu, Ukrainian, Western Armenian, Wolof, Yoruba -->
       <value name="compound:vo">compound:vo</value> <!-- Cantonese, Chinese -->
       <value name="compound:vv">compound:vv</value> <!-- Cantonese, Chinese -->
       <value name="compound">compound</value>
       <value name="conj:expl">conj:expl</value> <!-- Latin -->
       <value name="conj:extend">conj:extend</value> <!-- Slovenian -->
       <value name="conj:svc">conj:svc</value> <!-- Ukrainian -->
       <value name="conj">conjunct</value>
       <value name="cop:expl">cop:expl</value> <!-- Maltese -->
       <value name="cop:locat">cop:locat</value> <!-- Polish -->
       <value name="cop:own">cop:own</value> <!-- Finnish, Karelian, Livvi, Marathi -->
       <value name="cop">copula</value>
       <value name="csubj:cleft">csubj:cleft</value> <!-- Irish, Latin, Manx, Scottish Gaelic -->
       <value name="csubj:cop">csubj:cop</value> <!-- Erzya, Estonian, Finnish, Irish, Komi Zyrian, Livvi, Manx, Moksha, Scottish Gaelic, Turkish -->
       <value name="csubj:pass">clausal passive subject</value> <!-- Albanian, Amharic, Ancient Greek, Arabic, Armenian, Belarusian, Bulgarian, Catalan, Chinese, Classical Chinese, Czech, English, French, German, Gothic, Greek, Indonesian, Italian, Korean, Latin, Latvian, Lithuanian, Norwegian, Old Church Slavonic, Old East Slavic, Polish, Portuguese, Romanian, Russian, Sanskrit, Slovak, Spanish, Swedish, Western Armenian -->
       <value name="csubj">clausal subject</value>
       <value name="dep:aff">dep:aff</value> <!-- Beja -->
       <value name="dep:agr">dep:agr</value> <!-- Kiche -->
       <value name="dep:alt">dep:alt</value> <!-- Upper Sorbian -->
       <value name="dep:ana">dep:ana</value> <!-- Yupik -->
       <value name="dep:aux">dep:aux</value> <!-- Yupik -->
       <value name="dep:comp">dep:comp</value> <!-- Beja, French -->
       <value name="dep:conj">dep:conj</value> <!-- Beja -->
       <value name="dep:cop">dep:cop</value> <!-- Yupik -->
       <value name="dep:emo">dep:emo</value> <!-- Yupik -->
       <value name="dep:infl">dep:infl</value> <!-- Yupik -->
       <value name="dep:mark">dep:mark</value> <!-- Yupik -->
       <value name="dep:mod">dep:mod</value> <!-- Mbya Guarani -->
       <value name="dep:pos">dep:pos</value> <!-- Yupik -->
       <value name="dep:redup">dep:redup</value> <!-- Beja -->
       <value name="dep:ss">dep:ss</value> <!-- Kiche -->
       <value name="dep">unspecified dependency</value>
       <value name="det:adj">det:adj</value> <!-- Albanian -->
       <value name="det:noun">det:noun</value> <!-- Albanian -->
       <value name="det:numgov">pronominal quantifier governing the case of the noun</value> <!-- Czech, Polish, Serbian, Slovak, Ukrainian, Upper Sorbian -->
       <value name="det:nummod">pronominal quantifier agreeing in case with the noun</value> <!-- Czech, Polish, Ukrainian -->
       <value name="det:poss">possessive determiner</value> <!-- Akkadian, Armenian, German, Italian, Korean, Polish, Portuguese, Western Armenian -->
       <value name="det:predet">det:predet</value> <!-- English, Italian, Persian -->
       <value name="det:pron">det:pron</value> <!-- Albanian -->
       <value name="det:rel">det:rel</value> <!-- Bambara -->
       <value name="det">determiner</value>
       <value name="discourse:emo">discourse:emo</value> <!-- Irish, Italian, Polish -->
       <value name="discourse:filler">discourse:filler</value> <!-- Norwegian, Slovenian -->
       <value name="discourse:intj">discourse:intj</value> <!-- Polish -->
       <value name="discourse:sp">discourse:sp</value> <!-- Cantonese, Chinese, Classical Chinese -->
       <value name="discourse">discourse element</value>
       <value name="dislocated:cleft">dislocated:cleft</value> <!-- Mbya Guarani -->
       <value name="dislocated:csubj">dislocated:csubj</value> <!-- Latin -->
       <value name="dislocated:nsubj">dislocated:nsubj</value> <!-- Latin -->
       <value name="dislocated:obj">dislocated:obj</value> <!-- Latin -->
       <value name="dislocated:subj">dislocated:subj</value> <!-- Beja -->
       <value name="dislocated">dislocated elements</value>
       <value name="expl:comp">expl:comp</value> <!-- French -->
       <value name="expl:impers">impersonal expletive</value> <!-- Italian, Polish, Romanian, Spanish -->
       <value name="expl:pass">reflexive pronoun used in reflexive passive</value> <!-- Catalan, Czech, French, Italian, Latin, Portuguese, Romanian, Slovak, Spanish, Upper Sorbian -->
       <value name="expl:poss">expl:poss</value> <!-- Romanian -->
       <value name="expl:pv">reflexive clitic with an inherently reflexive verb</value> <!-- Czech, Dutch, German, Old East Slavic, Polish, Portuguese, Romanian, Slovak, Spanish, Turkish German, Upper Sorbian -->
       <value name="expl:subj">expl:subj</value> <!-- French, Naija -->
       <value name="expl">expletive</value>
       <value name="fixed">fixed multiword expression</value>
       <value name="flat:abs">flat:abs</value> <!-- Ukrainian -->
       <value name="flat:dist">flat:dist</value> <!-- Western Armenian -->
       <value name="flat:foreign">foreign words</value> <!-- Arabic, Belarusian, Buryat, Chinese, Chukchi, Croatian, Czech, English, Estonian, Faroese, Finnish, French, Galician, Icelandic, Indonesian, Irish, Italian, Komi Zyrian, Latin, Latvian, Lithuanian, Manx, Naija, Norwegian, Persian, Polish, Portuguese, Russian, Scottish Gaelic, Slovak, Slovenian, South Levantine Arabic, Ukrainian, Upper Sorbian -->
       <value name="flat:name">names</value> <!-- Ancient Greek, Belarusian, Breton, Chinese, Chukchi, Erzya, Faroese, Finnish, French, Frisian Dutch, Galician, German, Gothic, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Irish, Italian, Karelian, Kazakh, Komi Permyak, Komi Zyrian, Korean, Latin, Latvian, Livvi, Maltese, Moksha, Norwegian, Old Church Slavonic, Old East Slavic, Persian, Portuguese, Russian, Scottish Gaelic, Skolt Sami, Slovenian, Spanish, Swedish, Thai, Ukrainian, Welsh, Western Armenian -->
       <value name="flat:num">flat:num</value> <!-- Komi Zyrian, Persian -->
       <value name="flat:range">flat:range</value> <!-- Ukrainian, Western Armenian -->
       <value name="flat:repeat">flat:repeat</value> <!-- Ukrainian -->
       <value name="flat:sibl">flat:sibl</value> <!-- Ukrainian -->
       <value name="flat:title">flat:title</value> <!-- Ukrainian -->
       <value name="flat:vv">flat:vv</value> <!-- Classical Chinese -->
       <value name="flat">name multiword expression</value>
       <value name="goeswith">goes with</value>
       <value name="iobj:agent">iobj:agent</value> <!-- Armenian, French, Western Armenian -->
       <value name="iobj:appl">iobj:appl</value> <!-- Wolof -->
       <value name="iobj:patient">iobj:patient</value> <!-- Tagalog -->
       <value name="iobj">indirect object</value>
       <value name="list">list</value>
       <value name="mark:adv">mark:adv</value> <!-- Cantonese, Chinese -->
       <value name="mark:advmod">adverbial modifier confusable with a subordination marker</value> <!-- Old French -->
       <value name="mark:aff">mark:aff</value> <!-- Beja -->
       <value name="mark:obj">marker + object</value> <!--Old French, no doc -->
       <value name="mark:obl">marker + oblique nominal</value> <!--Old French, no doc -->
       <value name="mark:prt">mark:prt</value> <!-- Chinese, Irish, Scottish Gaelic -->
       <value name="mark:q">mark:q</value> <!-- Hebrew -->
       <value name="mark:rel">mark:rel</value> <!-- Cantonese, Chinese -->
       <value name="mark">marker</value>
       <value name="nmod:agent">nmod:agent</value> <!-- Welsh -->
       <value name="nmod:appos">nmod:appos</value> <!-- French, Komi Zyrian, Moksha -->
       <value name="nmod:arg">nmod:arg</value> <!-- Polish, Yupik -->
       <value name="nmod:att">nmod:att</value> <!-- Hungarian -->
       <value name="nmod:attlvc">nmod:attlvc</value> <!-- Hungarian -->
       <value name="nmod:attr">nmod:attr</value> <!-- Chukchi -->
       <value name="nmod:bahuv">nmod:bahuv</value> <!-- Moksha -->
       <value name="nmod:cau">nmod:cau</value> <!-- Uyghur -->
       <value name="nmod:comp">nmod:comp</value> <!-- Erzya, Komi Zyrian, Moksha, Turkish, Uyghur -->
       <value name="nmod:flat">nmod:flat</value> <!-- Polish -->
       <value name="nmod:gen">nmod:gen</value> <!-- Breton -->
       <value name="nmod:gobj">nmod:gobj</value> <!-- Erzya, Finnish -->
       <value name="nmod:gsubj">nmod:gsubj</value> <!-- Erzya, Finnish, Karelian -->
       <value name="nmod:lfrom">nmod:lfrom</value> <!-- Komi Zyrian -->
       <value name="nmod:lmod">nmod:lmod</value> <!-- Erzya, Indonesian, Komi Permyak, Komi Zyrian, Moksha -->
       <value name="nmod:npmod">nmod:npmod</value> <!-- Armenian, English, Western Armenian -->
       <value name="nmod:obj">nmod:obj</value> <!-- Komi Zyrian -->
       <value name="nmod:obl">nmod:obl</value> <!-- Hungarian -->
       <value name="nmod:part">nmod:part</value> <!-- Turkish, Uyghur -->
       <value name="nmod:poss">possessive nominal modifier</value> <!-- Akkadian, Albanian, Apurina, Arabic, Armenian, Assyrian, Bambara, Beja, Breton, Chukchi, Danish, Dutch, English, Erzya, Faroese, Finnish, Frisian Dutch, German, Hebrew, Hindi, Icelandic, Indonesian, Irish, Karelian, Kazakh, Khunsari, Komi Permyak, Komi Zyrian, Korean, Kurmanji, Latin, Livvi, Maltese, Manx, Marathi, Moksha, Naija, Nayini, North Sami, Persian, Polish, Sanskrit, Scottish Gaelic, Skolt Sami, Soi, South Levantine Arabic, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Telugu, Thai, Turkish, Uyghur, Warlpiri, Welsh, Western Armenian, Wolof -->
       <value name="nmod:pred">nmod:pred</value> <!-- Polish -->
       <value name="nmod:prp">nmod:prp</value> <!-- Komi Zyrian -->
       <value name="nmod:redup">nmod:redup</value> <!-- Welsh -->
       <value name="nmod:relat">nmod:relat</value> <!-- Chukchi -->
       <value name="nmod:subj">nmod:subj</value> <!-- Komi Zyrian -->
       <value name="nmod:tmod">temporal modifier</value> <!-- Chinese, English, Indonesian, Moksha, Romanian, Telugu, Uyghur -->
       <value name="nmod">nominal modifier</value>
       <value name="nsubj:advmod">fused subject pronoun and adverb</value> <!-- Old French -->
       <value name="nsubj:aff">nsubj:aff</value> <!-- Beja -->
       <value name="nsubj:bfoc">nsubj:bfoc</value> <!-- Tagalog -->
       <value name="nsubj:caus">nsubj:caus</value> <!-- Armenian, French, Western Armenian -->
       <value name="nsubj:cleft">nsubj:cleft</value> <!-- Latin -->
       <value name="nsubj:cop">nsubj:cop</value> <!-- Apurina, Breton, Erzya, Estonian, Finnish, Hebrew, Karelian, Komi Permyak, Komi Zyrian, Livvi, Moksha, Sanskrit, Skolt Sami, Turkish -->
       <value name="nsubj:ifoc">nsubj:ifoc</value> <!-- Tagalog -->
       <value name="nsubj:lfoc">nsubj:lfoc</value> <!-- Tagalog -->
       <value name="nsubj:lvc">nsubj:lvc</value> <!-- Hungarian -->
       <value name="nsubj:nc">nsubj:nc</value> <!-- Persian, Tamil, Telugu -->
       <value name="nsubj:obj">fused subject and object pronoun</value> <!-- Old French -->
       <value name="nsubj:pass">passive nominal subject</value> <!-- Afrikaans, Amharic, Ancient Greek, Arabic, Armenian, Assyrian, Belarusian, Bulgarian, Buryat, Cantonese, Catalan, Chinese, Classical Chinese, Czech, Dutch, English, Faroese, French, Frisian Dutch, Galician, German, Gothic, Greek, Hindi, Indonesian, Italian, Korean, Latin, Latvian, Lithuanian, Maltese, Marathi, Moksha, Norwegian, Old Church Slavonic, Old East Slavic, Persian, Polish, Portuguese, Romanian, Russian, Sanskrit, Skolt Sami, Slovak, Spanish, Swedish, Swiss German, Tagalog, Tamil, Thai, Turkish German, Upper Sorbian, Western Armenian -->
       <value name="nsubj:periph">nsubj:periph</value> <!-- Cantonese -->
       <value name="nsubj">Nominal subject</value>
       <value name="nummod:det">nummod:det</value> <!-- Beja -->
       <value name="nummod:entity">numeric modifier governed by a noun</value> <!-- Russian -->
       <value name="nummod:flat">nummod:flat</value> <!-- Polish -->
       <value name="nummod:gov">numeric modifier governing the case of the noun</value> <!-- Belarusian, Czech, Lithuanian, Old East Slavic, Polish, Russian, Sanskrit, Serbian, Ukrainian, Upper Sorbian -->
       <value name="nummod">numeric modifier</value>
       <value name="obj:advmod">fused adverb and object pronoun</value> <!-- Old French -->
       <value name="obj:advneg">fused negation and object pronoun</value> <!-- no doc for advneg -->
       <value name="obj:agent">obj:agent</value> <!-- Apurina, French, Tagalog -->
       <value name="obj:appl">obj:appl</value> <!-- Wolof -->
       <value name="obj:caus">obj:caus</value> <!-- Wolof -->
       <value name="obj:lvc">obj:lvc</value> <!-- French, Hungarian, Naija -->
       <value name="obj:obl">fused oblique and object pronoun</value> <!-- Old French -->
       <value name="obj:periph">obj:periph</value> <!-- Cantonese, Chinese -->
       <value name="obj">object</value>
       <value name="obl:advmod">adverbial modifier confusable with an oblique dependent</value> <!-- Old French -->
       <value name="obl:agent">agent modifier</value> <!-- Ancient Greek, Armenian, Belarusian, Breton, Cantonese, Chinese, Czech, Dutch, Erzya, French, German, Gothic, Greek, Hindi, Indonesian, Italian, Komi Zyrian, Latin, Lithuanian, Maltese, Moksha, Naija, Old Church Slavonic, Old East Slavic, Polish, Portuguese, Romanian, Russian, Sanskrit, Skolt Sami, Swedish, Tamil, Turkish, Welsh, Western Armenian -->
       <value name="obl:appl">obl:appl</value> <!-- Wolof -->
       <value name="obl:arg">oblique argument</value> <!-- Arabic, Beja, Czech, French, German, Greek, Icelandic, Latin, Lithuanian, Maltese, Naija, Persian, Polish, Sanskrit, Slovak, South Levantine Arabic, Tamil -->
       <value name="obl:cau">obl:cau</value> <!-- Erzya, Komi Zyrian, Moksha, Telugu -->
       <value name="obl:cmp">obl:cmp</value> <!-- Telugu -->
       <value name="obl:cmpr">obl:cmpr</value> <!-- Latin, Polish, Tamil -->
       <value name="obl:comp">obl:comp</value> <!-- Moksha -->
       <value name="obl:dat">obl:dat</value> <!-- Kurmanji -->
       <value name="obl:freq">obl:freq</value> <!-- Moksha -->
       <value name="obl:inst">obl:inst</value> <!-- Erzya, Moksha, Tamil -->
       <value name="obl:lfrom">obl:lfrom</value> <!-- Erzya, Komi Zyrian, Moksha -->
       <value name="obl:lmod">locative modifier</value> <!-- Apurina, Classical Chinese, Danish, Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami, Tamil -->
       <value name="obl:lmp">obl:lmp</value> <!-- Erzya, Komi Zyrian, Moksha -->
       <value name="obl:lto">obl:lto</value> <!-- Erzya, Komi Zyrian, Moksha -->
       <value name="obl:lvc">obl:lvc</value> <!-- Hungarian -->
       <value name="obl:mcl">obl:mcl</value> <!-- Komi Zyrian -->
       <value name="obl:mod"> oblique modifier</value> <!-- Beja, French, Naija, Yupik -->
       <value name="obl:npmod">obl:npmod</value> <!-- Coptic, English -->
       <value name="obl:orphan">obl:orphan</value> <!-- Polish -->
       <value name="obl:own">obl:own</value> <!-- Kazakh -->
       <value name="obl:patient">obl:patient</value> <!-- Cantonese, Chinese -->
       <value name="obl:pmod">obl:pmod</value> <!-- Romanian, Tamil -->
       <value name="obl:poss">obl:poss</value> <!-- Thai -->
       <value name="obl:prep">obl:prep</value> <!-- Irish -->
       <value name="obl:sentcon">obl:sentcon</value> <!-- Mbya Guarani -->
       <value name="obl:smod">obl:smod</value> <!-- Scottish Gaelic -->
       <value name="obl:tmod">obl:tmod</value> <!-- Apurina, Arabic, Cantonese, Chinese, Classical Chinese, Danish, English, Erzya, Frisian Dutch, German, Hindi, Indonesian, Irish, Italian, Komi Permyak, Komi Zyrian, Korean, Manx, Moksha, Portuguese, Scottish Gaelic, Skolt Sami, Spanish, Tamil, Telugu, Thai, Turkish, Uyghur, Warlpiri -->
       <value name="obl:tmod">temporal modifier</value>
       <value name="obl">oblique nominal</value>
       <value name="orphan:missing">textual gap in the source</value> <!-- Latin -->
       <value name="orphan">remnant in ellipsis</value>
       <value name="parataxis:appos">parataxis:appos</value> <!-- Italian -->
       <value name="parataxis:conj">parataxis:conj</value> <!-- Naija -->
       <value name="parataxis:coord">parataxis:coord</value> <!-- Beja -->
       <value name="parataxis:deletion">parataxis:deletion</value> <!-- Norwegian -->
       <value name="parataxis:discourse">parataxis:discourse</value> <!-- Italian, Naija, Slovenian, Turkish German, Ukrainian -->
       <value name="parataxis:dislocated">parataxis:dislocated</value> <!-- Naija -->
       <value name="parataxis:hashtag">parataxis:hashtag</value> <!-- Irish, Italian -->
       <value name="parataxis:insert">parataxis:insert</value> <!-- French, Italian, Polish -->
       <value name="parataxis:mod">parataxis:mod</value> <!-- Beja -->
       <value name="parataxis:newsent">parataxis:newsent</value> <!-- Ukrainian -->
       <value name="parataxis:nsubj">parataxis:nsubj</value> <!-- Italian -->
       <value name="parataxis:obj">parataxis:obj</value> <!-- Bambara, Italian, Polish -->
       <value name="parataxis:parenth">parataxis:parenth</value> <!-- French, Naija -->
       <value name="parataxis:rel">parataxis:rel</value> <!-- Ukrainian -->
       <value name="parataxis:rep">parataxis:rep</value> <!-- Chukchi, Latin, Mbya Guarani -->
       <value name="parataxis:restart">parataxis:restart</value> <!-- Slovenian -->
       <value name="parataxis:rt">parataxis:rt</value> <!-- Irish -->
       <value name="parataxis:sentence">parataxis:sentence</value> <!-- Irish -->
       <value name="parataxis:trans">parataxis:trans</value> <!-- Turkish German -->
       <value name="parataxis:url">parataxis:url</value> <!-- Irish -->
       <value name="parataxis">parataxis</value>
       <value name="punct">punctuation</value>
       <value name="remnant">Remnant ?</value> <!-- no doc, replace with orphan? -->
       <value name="reparandum">overridden disfluency</value>
       <value name="root">root</value>
       <value name="vocative:cl">vocative:cl</value> <!-- Ukrainian -->
       <value name="vocative:mention">vocative:mention</value> <!-- Irish, Italian -->
       <value name="vocative">vocative</value>
       <value name="xcomp:cleft">xcomp:cleft</value> <!-- Polish -->
       <value name="xcomp:ds">xcomp:ds</value> <!-- Erzya, Finnish, Karelian, Komi Permyak, Livvi -->
       <value name="xcomp:obj">xcomp:obj</value> <!-- North Sami, Polish -->
       <value name="xcomp:pred">xcomp:pred</value> <!-- Irish, Latin, Manx, North Sami, Polish, Scottish Gaelic -->
       <value name="xcomp:sp">xcomp:sp</value> <!-- Ukrainian -->
       <value name="xcomp:subj">xcomp:subj</value> <!-- Polish -->
       <value name="xcomp">open clausal complement</value>
     </feature>
     <feature name="coord" domain="NT" ></feature>
     <feature name="dom" domain="NT" ></feature>
     <feature name="type" domain="NT" >
       <value name="nV">élément non-verbal</value>
       <value name="VFin">finite verb</value>
       <value name="VInf">infinitive</value>
       <value name="VPar">participle</value>
       <value name="--">nil</value>
     </feature>
     <feature name="vform" domain="NT"></feature>
     <feature name="vlemma" domain="NT"></feature>
     <feature name="note" domain="NT"></feature>
     <feature name="snr" domain="NT"></feature>
     ';
         printf MASTER "$nt_features_header";
         printf MASTER '
     <edgelabel>
       <value name="D">dependency</value>
       <value name="L">lexical</value>
       <value name="R">relator</value>
       <value name="*">not bound</value>
     </edgelabel>
     <secedgelabel>
       <value name="cluster">between elements of GpCoo</value>
       <value name="coord">between members of Coo</value>
       <value name="dupl">between duplicated nodes</value>
     </secedgelabel>
     </annotation>
     </head>
     <body>
     ';
+    }
     #  <value name="M">main</value>
     #  <value name="P">part</value>
     # Write the closing </body> and </corpus> tags to the MASTER filehandle.
     # Takes no arguments; MASTER must already be open for writing.
     sub write_master_footer {
         my $closing_tags = '</body>
     </corpus>
     ';
         print MASTER $closing_tags;
     }
     # Store four NT-domain <feature> declarations (nodom, headpos,
     # annotationFile, annotationUri) as one string in the global $TEMP.
     # NOTE(review): no reader of $TEMP is visible in this part of the file; it
     # looks like a holding place for declarations that are not written to the
     # master header -- confirm before removing or relying on it.
     $TEMP = '
     <feature name="nodom" domain="NT" ></feature>
     <feature name="headpos" domain="NT" ></feature>
     <feature name="annotationFile" domain="NT" ></feature>
     <feature name="annotationUri" domain="NT" ></feature>
     ';
     # Fill the global %abbrev2cat lookup table, which maps a relation
     # abbreviation (e.g. "nsubj") to its human-readable category name
     # (e.g. "Nominal subject").
     #
     # Takes no arguments and returns nothing. Entries already present in
     # %abbrev2cat under other keys are kept; the keys listed here are
     # (re)assigned on every call.
     sub define_cat_hashes {

         # Earlier mapping with French category names, kept for reference
         # (presumably the SRCMF label set -- confirm before re-enabling):
         #  $abbrev2cat{"Apst"} = "Apostrophe";
         #  $abbrev2cat{"AtObj"} = "AttributObjet";
         #  $abbrev2cat{"AtRfc"} = "AttributReflechi";
         #  $abbrev2cat{"AtSj"} = "AttributSujet";
         #  $abbrev2cat{"AuxA"} = "Auxilie-Actif";
         #  $abbrev2cat{"AuxP"} = "Auxilie-Passif";
         #  $abbrev2cat{"Circ"} = "Circonstant";
         #  $abbrev2cat{"Cmpl"} = "Complement";
         #  $abbrev2cat{"GpCoo"} = "Coordonne";
         #  $abbrev2cat{"Coo"} = "Coordination";
         #  $abbrev2cat{"Det"} = "Determinant";
         #  $abbrev2cat{"NgPrt"} = "Forclusif";
         #  $abbrev2cat{"Insrt"} = "Incidente";
         #  $abbrev2cat{"Intj"} = "Interjection";
         #  $abbrev2cat{"ModA"} = "ModifieurAttache";
         #  $abbrev2cat{"ModD"} = "ModifieurDetache";
         #  $abbrev2cat{"Ng"} = "Negation";
         #  $abbrev2cat{"VInf"} = "NoeudVerbal-Infinitif";
         #  $abbrev2cat{"VPrt"} = "NoeudVerbal-Participe"; #?
         #  $abbrev2cat{"VFin"} = "NoeudVerbal-Personnel";
         #  $abbrev2cat{"nSnt"} = "NonPhrase";
         #  $abbrev2cat{"Obj"} = "Objet";
         #  $abbrev2cat{"Snt"} = "Phrase";
         #  $abbrev2cat{"Pon"} = "Ponctuation";
         #  $abbrev2cat{"Rfc"} = "Reflechi";
         #  $abbrev2cat{"Rfx"} = "ReflexifRenforce";
         #  $abbrev2cat{"RelC"} = "Relateur-Coordonnant";
         #  $abbrev2cat{"RelNC"} = "Relateur-NonCoordonnant";
         #  $abbrev2cat{"nMax"} = "StructureNonMaximale";
         #  $abbrev2cat{"SjImp"} = "SujetImpersonnel";
         #  $abbrev2cat{"SjPer"} = "SujetPersonnel";
         #  $abbrev2cat{"Lac"} = "Lacune";
         #  $abbrev2cat{"Aux"} = "Auxilie";
         #  $abbrev2cat{"Regim"} = "Regime";

         # Current mapping. The "obl" label used to start with a stray blank
         # (" Oblique nominal"), unlike every other entry; it is now written
         # without it.
         # NOTE(review): the key "cc-nc" uses a dash; whether lookups ever use
         # a colon form ("cc:nc") cannot be told from here -- verify.
         my %category_of = (
             "acl"        => "Clausal modifier of noun",
             "advcl"      => "Adverbial clause modifier",
             "advmod"     => "Adverbial modifier",
             "amod"       => "Adjectival modifier",
             "appos"      => "Appositional modifier",
             "aux"        => "Auxiliary",
             "cc-nc"      => "Coordinated conjunct : non coordonant",
             "cc"         => "Coordinating conjunction",
             "ccomp"      => "Clausal complement",
             "conj"       => "Conjunct",
             "cop"        => "Copula",
             "csubj"      => "Clausal subject",
             "det"        => "Determiner",
             "dislocated" => "Dislocated elements",
             "expl"       => "Expletive",
             "iobj"       => "Indirect object",
             "mark"       => "Marker",
             "nmod"       => "Nominal modifier",
             "nsubj"      => "Nominal subject",
             "nummod"     => "Numeric modifier",
             "obj"        => "Object",
             "obl"        => "Oblique nominal",
             "orphan"     => "Remnant in ellipsis",
             "remnant"    => "Remnant ?",
             "vocative"   => "Vocative",
             "xcomp"      => "Open clausal complement",
         );

         # Hash-slice assignment: keys() and values() of an unmodified hash
         # come back in matching order, so each key gets its own label.
         @abbrev2cat{ keys %category_of } = values %category_of;
         return;
     }
     # Debug helper: prints every element of the global @words (the lines of
     # the sentence currently being processed), one per output line, to the
     # currently selected output handle.
     sub print_sentence {
         print "$_\n" for @words;
         return;
     }
     # Writes one <nt> (non-terminal) element for the current word to the XML
     # handle, followed by its edges.
     #
     #   $_[0]  category; written to the cat attribute
     #   $_[1]  mode; when it matches /dupl/ the node is written as a
     #          duplicate: its id gets the suffix "_dupl", it receives a
     #          secondary edge to the primary node and no daughters are attached
     #
     # Reads the globals $w, $dom, $type, $vform, $vlemma, $thisrootname,
     # $nt_features, $nt_empty_features, @daughters and %duplicates, and the
     # current record number $. (used as sentence number in all ids).
     # Sets the globals $dupl and $daughter.
     sub write_nonterminals {
         my ( $cat_name, $mode ) = @_;
         my $is_duplicate = ( $mode =~ /dupl/ ) ? 1 : 0;
         $dupl = $is_duplicate ? '_dupl' : '';

         # Extra NT attributes (option -x): the filled-in set for the root
         # category, the empty set for every other node, nothing when unused.
         my $print_nt_features = '';
         if ( $nt_features ne '' ) {
             $print_nt_features = ( $cat_name =~ /$thisrootname/ ) ? $nt_features : $nt_empty_features;
         }

         printf XML "      <nt id=\"n%d_%d%s\" cat=\"%s\" coord=\"--\" dom=\"%s\" type=\"%s\" vform=\"%s\" vlemma=\"%s\"%s note=\"%s\" snr=\"%d\">\n", $., $w, $dupl, $cat_name, $dom, $type, $vform, $vlemma,
           $print_nt_features, notes("$._$w"), $.;
         printf XML "        <edge idref=\"s%d_%d%s\" label=\"L\"/>\n", $., $w, $dupl;

         if ($is_duplicate) {
             # link duplicate with primary original node
             printf XML "        <secedge idref=\"s%d_%d\" label=\"dupl\"/>\n", $., $w;
         }
         else {
             # node is not a duplicate: attach all the daughters
             foreach my $node (@daughters) {
                 $daughter = $node;
                 if ( "$._$w" ne "$._$daughter" ) {    # avoid cycles
                     printf XML "        <edge idref=\"n%d_%d%s\" label=\"%s\"/>\n", $., $daughter, $dupl, edge_label("$._$daughter");
                 }

                 # check if a duplicate of this node must be attached
                 if ( $duplicates{"$._$daughter"} == 1 ) {
                     # The key used to be written "$._$daughter_dupl", which
                     # Perl reads as the (never set) variable $daughter_dupl;
                     # the braces make it the daughter number plus "_dupl".
                     # NOTE(review): confirm that %relators stores duplicate
                     # nodes under "<sentence>_<word>_dupl" keys.
                     printf XML "        <edge idref=\"n%d_%d_dupl\" label=\"%s\"/>\n", $., $daughter, edge_label("$._${daughter}_dupl");
                 }
             }
         }
         print XML "      </nt>\n";
         return;
     }
     # Returns the edge label for a node: 'R' when the node key is stored in
     # the hash of relators (%relators) with the value 1, otherwise the
     # default label 'D'.
     #   $_[0]  node key used to index %relators
     sub edge_label {
         my ($node_key) = @_;

         # exists-guard: unknown nodes are not compared numerically as undef
         if ( exists $relators{$node_key} && $relators{$node_key} == 1 ) {
             return 'R';
         }
         return 'D';
     }
     # Retrieves the note stored for a node in %notes.
     #   $_[0]  node key used to index %notes
     # Returns the stored note, or '--' when there is none or it is empty.
     sub notes {
         my ($node_key) = @_;
         my $note = $notes{$node_key};

         # defined-guard: a missing entry is not string-compared as undef
         if ( defined $note && $note ne '' ) {
             return "$note";
         }
         return '--';
     }
     # Conversions and bug fixes applied in place to the globals of the
     # current token: $word, $lemma, $plemma, $pos, $ppos, $mor, $pmor and
     # $cat. Some conversions are necessary, some are for convenience.
     # Takes no arguments and returns nothing.
     sub clean_data {
         # word form: replace or drop characters that are special in XML
         for ($word) {
             s/"/'/g;
             s/\&/(and)/g;
             s/<</«/g;
             s/>>/»/g;
             s/[<>]//g;
         }

         # part-of-speech tags: ':' becomes '_'
         for ( $pos, $ppos ) {
             s/:/_/g;
         }

         # morphological features: '|' becomes '_'
         for ( $mor, $pmor ) {
             s/\|/_/g;
         }

         # both lemma variables get the same treatment
         for ( $lemma, $plemma ) {
             s/\|/_/g;
             s/[<>]//g;
             s/"/'/g;
             s/\&/(and)/g;
         }

         # clean categories
         $cat =~ s/ROOT/$thisrootname/;    # top node, for compatibility with SRCMF
         #    $cat =~ s/Ponctuation/Pon/;
         # correct some bugs in parse
         #    $cat =~ s/Sujet/SjPer/;
         #    $cat =~ s/Modifieur/ModA/;
         #    $cat =~ s/Parenthese/Insrt/;
         $cat =~ s/\-/_/g;
         $cat =~ s/RelNC_(.*)/${1}_RelNC/;    # RelNC always 2nd node, for consistency in duplicates
         return;
     }

     use File::Basename;
     my $CMD="conll2tiger.pl";
     my $VERSION="1.5";
     my $MODIFIED="8/12/2015";   # angepasst für Perseus CoNLL erzeugt mit conll.pl -l.  CHECK: funktioniert SRCMF noch?
     my $CMD      = "conll2tiger.pl";
     my $VERSION  = "1.5";
     my $MODIFIED = "8/12/2015";        # angepasst für Perseus CoNLL erzeugt mit conll.pl -l.  CHECK: funktioniert SRCMF noch?
     # columns (default are the predicted values in CoNLL 2009 format)
     my $coll = 2;  # lemma
     my $colm = 3;  # morph (pos)
     my $colf = 5;  # features
     my $colh = 6;  # head
     my $cold = 7; # deprel
     my $outdir = "conllexport"; # deprel
     my $split = 1000; # split output after nr sentences
     my $coll   = 2;                    # lemma
     my $colm   = 3;                    # morph (pos)
     my $colf   = 5;                    # features
     my $colh   = 6;                    # head
     my $cold   = 7;                    # deprel
     my $outdir = "conllexport";        # deprel
     my $split  = 1000;                 # split output after nr sentences
     # tree structure
     my %dominates  = ();
     my %deprel  = ();  # deprel{nr} = deprel
     my @daughters = ();    # daughter nodes, stored in %dominates
     my %duplicates = my %relators = my %notes = my %aux = ();    # store nodes of duplicates, relators
     my $type = "--"; # node attribute
     my $vform = my $vlemma = "--"; # node attributes for verbs store form and lemma
     my $label = "D"; # default edge label
     my $nt_features_header = ''; # option -x
     my $nt_features = ''; # option -x
     my $nt_empty_features = ''; # option -x
     my @scodes = (); # option -x
     my $add_to_sentcode = '';
     my $rootname = 'root';  # default
     my $featcol = 13;
     my %dominates          = ();
     my %deprel             = ();                                         # deprel{nr} = deprel
     my @daughters          = ();                                         # daughter nodes, stored in %dominates
     my %duplicates         = my %relators = my %notes = my %aux = ();    # store nodes of duplicates, relators
     my $type               = "--";                                       # node attribute
     my $vform              = my $vlemma = "--";                          # node attributes for verbs store form and lemma
     my $label              = "D";                                        # default edge label
     my $nt_features_header = '';                                         # option -x
     my $nt_features        = '';                                         # option -x
     my $nt_empty_features  = '';                                         # option -x
     my @scodes             = ();                                         # option -x
     my $add_to_sentcode    = '';
     my $rootname           = 'root';                                     # default
     my $featcol            = 13;
     ######################################################################
     #  conll2tiger.pl: converts CoNLL-U from the Universal Dependencies
-...
     #   - dafür wurde als Wort-ID statt $w (for-Zähler) $wnr verwendet
     #   - es gibt aber noch unbound nodes wenn Regens entfernt (im anderen Teil) ist
     ######################################################################
     # Modifications by Alexei Lavrentiev <alexei.lavrentev@ens-lyon.fr>
     # Modifications by Alexei Lavrentiev <alexei.lavrentev@ens-lyon.fr>
     # for Profiterole project (2019-2021)
     # 2019-09-25
-...
     # - added "punct" to cat values
     # Update 2021-07-20
     # - added cat value list compiled from
     #   https://universaldependencies.org/ext-dep-index.html and the previous
     #   version. All relation types and subtypes from the UD 2.8 corpora
     # - added cat value list compiled from
     #   https://universaldependencies.org/ext-dep-index.html and the previous
     #   version. All relation types and subtypes from the UD 2.8 corpora
     #   should be there.
     # - contractions indexed
     ######################################################################
     my $HELP="
     my $HELP = "
     ==================================================================
     $CMD $VERSION: Help
     ==================================================================
-...
     #                    DO NOT MODIFY FOLLOWING CODE !
     ###########################################################################
     ###########################################################################
     # parse the command line
     ###########################################################################
-...
     use Getopt::Std;
     getopts('c:C:hD:H:M:o:R:s:x:X:');
     if (defined($opt_h)) {
       print STDERR "$HELP";
       exit(0);
     if ( defined($opt_h) ) {
     	print STDERR "$HELP";
     	exit(0);
+    }
     if (defined($opt_o)) {
       $outdir = $opt_o
     if ( defined($opt_o) ) {
     	$outdir = $opt_o;
+    }
     if (defined($opt_C)) {
       $corpus = $opt_C;
     if ( defined($opt_C) ) {
     	$corpus = $opt_C;
+    }
     if (defined($opt_D)) {
       $cold = $opt_D
     if ( defined($opt_D) ) {
     	$cold = $opt_D;
+    }
     if (defined($opt_H)) {
       $colh = $opt_H
     if ( defined($opt_H) ) {
     	$colh = $opt_H;
+    }
     if (defined($opt_M)) {
       $colm = $opt_M
     if ( defined($opt_M) ) {
     	$colm = $opt_M;
+    }
     if (defined($opt_R)) {
       $rootname = $opt_R;
     if ( defined($opt_R) ) {
     	$rootname = $opt_R;
+    }
     if (defined($opt_s)) {
       $split = $opt_s
     if ( defined($opt_s) ) {
     	$split = $opt_s;
+    }
     if (defined($opt_X)) {
       $featcol = $opt_X;
     if ( defined($opt_X) ) {
     	$featcol = $opt_X;
+    }
     if (defined($opt_x)) {
       @scodes = split(",", $opt_x);
       for (my $i=0; $i<=$#scodes; $i++) {
         $nt_features_header = $nt_features_header . sprintf("<feature name=\"%s\" domain=\"NT\"></feature>\n", $scodes[$i]);
+      }
       $nt_features_header =~ s/\bid\b/ncaid/;  # avoid reserved Tiger attribute "id"
     if ( defined($opt_x) ) {
     	@scodes = split( ",", $opt_x );
     	for ( my $i = 0 ; $i <= $#scodes ; $i++ ) {
     		$nt_features_header = $nt_features_header . sprintf( "<feature name=\"%s\" domain=\"NT\"></feature>\n", $scodes[$i] );
+    	}
     	$nt_features_header =~ s/\bid\b/ncaid/;    # avoid reserved Tiger attribute "id"
+    }
     my @colnames = ( "url", "ID", "FORM", "LEMMA", "PLEMMA", "POS", "PPOS", "FEAT", "PFEAT", "HEAD", "PHEAD", "DEPREL", "PDEPREL" );
     my @colnames = ("url", "ID", "FORM", "LEMMA", "PLEMMA", "POS", "PPOS", "FEAT", "PFEAT", "HEAD", "PHEAD", "DEPREL", "PDEPREL");
     # my %pos = %lemma = %form = %deprel = %head = %governs = %cat2abbrev = %abbrev2cat = %coordnr_cat = %coordnr_head = %w_coordnr = %w_head = %w_deprel = %first_coordnr = %coordnr_first = %verb_head = ();
     my @coordelements = ();
-...
     my $infile = $ARGV[0];
     $infile =~ s/\.conllu?//i;
     if($infile eq '') {
       $infile = 'subcorpus';
     if ( $infile eq '' ) {
     	$infile = 'subcorpus';
+    }
     my $counter = 1;
     $suffix = sprintf("%05d", $counter);
     $suffix = sprintf( "%05d", $counter );
     $infilename = basename($infile);
     $foo = `if [ ! -d $outdir ];then mkdir $outdir;fi`;
     open(XML, ">$outdir/$infilename-$suffix.xml")  or die "\nopen file error of $outdir/$infilename-$suffix.xml\n";
     open(LOG, ">$outdir/conversion.log")  or die "\nopen file error of conversion.log\n";
     open(MASTER, ">$outdir/main.xml")  or die "\nopen file error of main.xml\n";
     open( XML, ">$outdir/$infilename-$suffix.xml" )
       or die "\nopen file error of $outdir/$infilename-$suffix.xml\n";
     open( LOG, ">$outdir/conversion.log" )
       or die "\nopen file error of conversion.log\n";
     open( MASTER, ">$outdir/main.xml" ) or die "\nopen file error of main.xml\n";
     write_xml_header();
     write_master_header ();
     write_master_header();
     # flush output for log and master file
     select(LOG); $| = 1;
     select(MASTER); $| = 1;
     select(LOG);
     $| = 1;
     select(MASTER);
     $| = 1;
     $commandline = $0 . " ". (join " ", @ARGV);
     $commandline = $0 . " " . ( join " ", @ARGV );
     print LOG "$commandline\n\n";
     print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
     $/ = ""; # treat empty line as RS
     $/ = "";    # treat empty line as RS
     while (<>) {
       if($. % $split == 0) {
         print XML "</subcorpus>\n";
         close(XML);
         $suffix = sprintf("%05d", ++$counter);
         open(XML, ">$outdir/$infilename-$suffix.xml")  or die "\nopen file error\n";
         write_xml_header();
         print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
+      }
     	if ( $. % $split == 0 ) {
     		print XML "</subcorpus>\n";
     		close(XML);
     		$suffix = sprintf( "%05d", ++$counter );
     		open( XML, ">$outdir/$infilename-$suffix.xml" )
     		  or die "\nopen file error\n";
     		write_xml_header();
     		print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
+    	}
       # ----------------------------------------
       # set root (or fake root if ROOT is missing)
       # ----------------------------------------
       $rootnode = $fakeroot = 0; # m = Treat string as multiple lines, so that ^ matches beginning of line
       $thisrootname = $rootname;
       ($rootnode) = (/^(\d+?)\s.*?\b$rootname\b/m);  # real root marked by parser
       if($rootnode == 0) {
     #    ($rootnode) = (/^(\d+?)\t.*?\t-1\t0\t/m);  # no marked ROOT, but top node (head = 0)   TOO SPECIFIC
     #    ($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);  # no marked ROOT, but top node (head = 0) in col9 (original by Achim Stein)
         ($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);  # no marked ROOT, but top node (head = 0) in col7 (updated by AL)
         print LOG " Warning sentence $.: not marked root ($rootname), using top node $rootnode\n";
         $fakeroot = 1;
         $thisrootname = 'nSnt';
+      }
       if($rootnode == 0) {
         $rootnode = 1;   # set fake root if nothing goes
         print LOG " Error sentence $.: setting fake root to first word:\n$_\n";
         $fakeroot = 2;
         $thisrootname = 'Err';
+      }
     	# ----------------------------------------
     	# set root (or fake root if ROOT is missing)
     	# ----------------------------------------
     	$rootnode = $fakeroot = 0;    # m = Treat string as multiple lines, so that ^ matches beginning of line
     	$thisrootname = $rootname;
     	($rootnode) = (/^(\d+?)\s.*?\b$rootname\b/m);    # real root marked by parser
     	if ( $rootnode == 0 ) {
       my @cols = ();
       @words = split (/\n/);
       @terminals = ();
     		#    ($rootnode) = (/^(\d+?)\t.*?\t-1\t0\t/m);  # no marked ROOT, but top node (head = 0)   TOO SPECIFIC
     		#    ($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);  # no marked ROOT, but top node (head = 0) in col9 (original by Achim Stein)
     		($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);                                     # no marked ROOT, but top node (head = 0) in col7 (updated by AL)
     		print LOG " Warning sentence $.: not marked root ($rootname), using top node $rootnode\n";
     		$fakeroot     = 1;
     		$thisrootname = 'nSnt';
+    	}
     	if ( $rootnode == 0 ) {
     		$rootnode = 1;                                                                                # set fake root if nothing goes
     		print LOG " Error sentence $.: setting fake root to first word:\n$_\n";
     		$fakeroot     = 2;
     		$thisrootname = 'Err';
+    	}
       %dominates = (); # empty at beginning of sentence
       %deprel = (); # empty at beginning of sentence
       %aux = (); # empty at beginning of sentence
       @daughters = ();
     	my @cols = ();
     	@words     = split(/\n/);
     	@terminals = ();
       my $commentlines = 0; #added by AL
     #  my $contractions = 0; #added by AL
     #  my $text_id = "unknown_text";
     my $text_id = $infilename;
       my $sent_id = "0";
     	%dominates = ();                                                                                  # empty at beginning of sentence
     	%deprel    = ();                                                                                  # empty at beginning of sentence
     	%aux       = ();                                                                                  # empty at beginning of sentence
     	@daughters = ();
       # ----------------------------------------
       # loop through words #1: write tokens (terminal nodes) to XML file
       # store tree relevant information for loop #2
       # ----------------------------------------
       for (my $w=0; $w<=$#words; $w++) {
     # Added by AL for comment lines
         if ($words[$w] =~ /^#/) {
     	if ($words[$w] =~ /^# newdoc/) {
     		$text_id = $words[$w];
     		$text_id =~ s/# newdoc id = //;
+    	}
     	elsif ($words[$w] =~ /^# sent_id/) {
     		$sent_id = $words[$w];
     		$sent_id =~ s/# sent_id = //;
+    	}

... Ce différentiel a été tronqué car il excède la taille maximale pouvant être affichée.

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 3347