/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 3346

     // Script identification and default settings. Column indices are 0-based
     // (the help text states "0 = column 1").
     def CMD="conll2tiger.pl";
     def VERSION = "1.5";
     def MODIFIED = "8/12/2015";   // adapted for Perseus CoNLL generated with conll.pl -l.  CHECK: does SRCMF still work?
     // columns (default are the predicted values in CoNLL 2009 format)
     def coll = 2;  // lemma
     def colm = 3;  // morph (pos)
     def colf = 5;  // features
     def colh = 6;  // head
     def cold = 7; // deprel
     def outdir = "conllexport"; // output directory (option -o)
     def split = 1000; // split output after nr sentences
     // tree structure
     def dominates  = [:];
     def deprel  = [:];  // deprel{nr} = deprel
     def daughters = []; // daughter nodes, stored in %dominates
     def duplicates = [:]; // ids of words whose category contains "_" (a duplicate terminal is emitted for them)
     def relators = [:]; // ids of words whose deprel matches RelN?C
     def notes = [:] // warning/error notes keyed by "<sentence>_<word>" id
     def aux = [:]; // aux{head} = "word_lemma" of the Aux token attached to that head
     def type = "--"; // node attribute
     def vform = "--"
     def vlemma = "--"; // node attributes for verbs store form and lemma
     def label = "D"; // default edge label
     def nt_features_header = ''; // option -x
     def nt_features = ''; // option -x
     def nt_empty_features = ''; // option -x
     def scodes = []; // option -x
     def add_to_sentcode = '';
     def rootname = 'root';  // default
     def featcol = 13;
     //#####################################################################
     //  conll2tiger.pl: converts CoNLL-U from the Universal Dependencies
     //  treebanks to TigerXML
     //    Achim Stein <achim.stein@ling.uni-stuttgart.de>
     // License : GNU GPL v. 3 (see the LICENSE file)
     //#####################################################################
     // TO DO:
     // - coordination
     // - handling of overly long sentences that were split (conll.pl -r 100)
     //   - for this, $wnr is used as the word ID instead of $w (the for-loop counter)
     //   - but there are still unbound nodes when the governor has been removed (it is in the other part)
     //#####################################################################
     // Modifications by Alexei Lavrentiev <alexei.lavrentev@ens-lyon.fr>
     // for Profiterole project (2019-2021)
     // 2019-09-25
     // - updated default column numbers for CoNLL-U SRCMF format
     // - added processing for comment lines
     // - added @textid to terminal nodes
     // - deleted ppos, pmor and plemma (predicted tags and lemmas)
     // - replaced specific SRCMF with standard UD tags
     // Update 2020-05-13
     // - added @editionId for synchronization with BFM word ID
     // Update 2021-03-22
     // - using $infilename for @textid
     // - added support for .conllu extension
     // Update 2021-03-29
     // - added editionId to declarations in main.xml
     // Update 2021-07-16
     // - added "punct" to cat values
     // Update 2021-07-20
     // - added cat value list compiled from
     //   https://universaldependencies.org/ext-dep-index.html and the previous
     //   version. All relation types and subtypes from the UD 2.8 corpora
     //   should be there.
     // - contractions indexed
     //#####################################################################
     // Usage text printed when option -h is given. $CMD, $VERSION and the
     // default values ($outdir, $cold, $colh, ...) are interpolated into it.
     // NOTE(review): -F is documented here but is not handled by the option
     // parsing code, and -c is listed as a plain flag although the getopts
     // spec declares it as 'c:' -- confirm which is intended.
     def HELP = """
     ==================================================================
     $CMD $VERSION: Help
     ==================================================================
     FUNKTION: converts CoNLL parser output to TigerXML (for mate tools)
     	   creates master file, splits input files, corrects unbound nodes
     SYNTAX:   $CMD [Options] <CoNLL file>
     OPTIONEN:
     	-c    ignore coordination (delete coordx- prefix in deprel)
     	-C str   corpus specials: nca
     	-h    show help
     	-o    create all files in this output directory (default: $outdir)
     set COLUMNS for required info (0 = column 1, 1 = column 2, etc.)
     	-D nr    colum for deprel default=$cold
     	-H nr    colum for head default=$colh
     	-M nr    colum for morphology (POS) default=$colm
     	-F nr    colum for morph. features default=$colf
     	-R str   Root category (default: $rootname)
     	-s nr    split output files after each nr sentence (default = $split)
     	-x str,...  include these attributes if present in the -X column of the first word
     	   (the first code is also copied into the sentence id)
     	-X nr    the column where attributes are stored (default: $featcol)
     EXAMPLE:
     	 - For mate parser output: no further options required
     	$CMD parsed.conll
     	 - For Le Monde 2005: include attributes
     	gunzip -c parsed.conll.gz | conll2tiger.pl -x date,artnr,rubr
     	 - For NCA:
     	conll2tiger.pl -C nca -x id,deaf,titreDees,editionDees,manuscritDees,regionDees,coefficientRegionDees,dateMoyenneDees,codeRegional,coefficientRegional,vers,ponctuation,mots,passage,commentairePhilologique,qualite,sourceQualite,commentaireForme,auteur,dateComposition,dateManuscrit,lieuComposition,lieuManuscrit,sourceDateComposition,sourceDateManuscrit,sourceLieuComposition,sourceLieuManuscrit,genre,traditionTextuelle,analyses,lignes,editionNCA tagged-oldfrench-lrec2014-dep.conll
     """;
     //##########################################################################
     //     DO NOT MODIFY FOLLOWING CODE !
     //##########################################################################
     //##########################################################################
     // parse the command line
     //##########################################################################
     // Read the command-line options and override the defaults declared above.
     // getopts() and defined() are helpers provided outside this section; each
     // recognised option is expected to show up as a variable named opt_<letter>.
     // NOTE(review): the help text documents -F (feature column) but the spec
     // below has no 'F:' entry and colf is never overridden; -c is declared as
     // taking an argument ('c:') while the help lists it as a plain flag.
     getopts('c:C:hD:H:M:o:R:s:x:X:');
     if (defined(opt_h)) {
     	println "** " + "$HELP";
     	return 0;
     }
     if (defined(opt_o)) {
     	outdir = opt_o
     }
     if (defined(opt_C)) {
     	corpus = opt_C;
     }
     // Column numbers and the split size are used as list indices / modulus
     // later on, so they are coerced to integers here instead of being kept
     // as the raw option strings.
     if (defined(opt_D)) {
     	cold = opt_D as int
     }
     if (defined(opt_H)) {
     	colh = opt_H as int
     }
     if (defined(opt_M)) {
     	colm = opt_M as int
     }
     if (defined(opt_R)) {
     	rootname = opt_R;
     }
     if (defined(opt_s)) {
     	split = opt_s as int
     }
     if (defined(opt_X)) {
     	featcol = opt_X as int;
     }
     if (defined(opt_x)) {
     	scodes = opt_x.split(",");
     	// one NT feature declaration per requested attribute; the bound is
     	// exclusive so that no declaration is emitted for a non-existent entry
     	for (def i = 0; i < scodes.size(); i++) {
     		nt_features_header = nt_features_header + sprintf("<feature name=\"%s\" domain=\"NT\"></feature>\n", scodes[i]);
     	}
     	// avoid reserved Tiger attribute "id" (first occurrence only, as in the Perl s/// without /g)
     	nt_features_header = nt_features_header.replaceFirst(/\bid\b/, 'ncaid');
     }
     // Prepare the run: derive the base name of the input file, make sure the
     // output directory exists, open the first XML part, the log and the master
     // file, and write their headers.
     // NOTE(review): this section still contains unconverted Perl constructs
     // (`date` backticks, chomp, ARGV, s///, open ... or die, select, $|, $0, $/)
     // that are left untouched here; they need a dedicated port.
     def colnames = ["url", "ID", "FORM", "LEMMA", "PLEMMA", "POS", "PPOS", "FEAT", "PFEAT", "HEAD", "PHEAD", "DEPREL", "PDEPREL"];
     // def pos = [:]%lemma = %form = hDeprel = %head = %governs = %cat2abbrev = %abbrev2cat = %coordnr_cat = %coordnr_head = %w_coordnr = %w_head = %w_deprel = %first_coordnr = %coordnr_first = %verb_head = [:];
     def coordelements = [];
     // NOTE(review): chained "def a = def b = ..." is a conversion artefact and
     // "deprel" is also declared as a map earlier in the script -- confirm the
     // intended declarations before porting.
     def id = def form = def lemma = def plemma = def pos = def ppos = def feat = def pfeat = def head = def phead = def deprel = def pdeprel = def edition_id = "";
     def timestamp = `date`;
     chomp(timestamp);
     def infile = ARGV[0];
     infile =~ s/\.conllu?//i;   // strip a .conll / .conllu extension
     if (infile == '') {
     	 infile = 'subcorpus';
     }
     def counter = 1;
     suffix = sprintf("%05d", counter);
     infilename = basename(infile);
     // Create the output directory if it is missing. The converted shell snippet
     // had lost its variable sigils and tested/created a literal directory
     // called "outdir"; use the configured directory instead.
     new File(outdir.toString()).mkdirs();
     open(XML, ">$outdir/$infilename-$suffix.xml")  or die "\nopen file error of $outdir/$infilename-$suffix.xml\n";
     open(LOG, ">$outdir/conversion.log")  or die "\nopen file error of conversion.log\n";
     open(MASTER, ">$outdir/main.xml")  or die "\nopen file error of main.xml\n";
     write_xml_header();
     write_master_header();
     // flush output for log and master file
     select(LOG); $| = 1;
     select(MASTER); $| = 1;
     commandline = $0 + " " + (join " ", @ARGV);
     LOG << "$commandline\n\n";
     MASTER << "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
     $/ = ""; // treat empty line as RS
     // ----------------------------------------
     // Main loop: one iteration per sentence (records are separated by empty
     // lines). For each sentence the root is determined, the terminal nodes are
     // written (loop #1) and then the non-terminal nodes (loop #2).
     // Changes made in this block: index loops now stop before size() (the
     // converted "<= x.size()" bounds ran one element too far), the misspelt
     // dominates lookup and the "print Log" statement were corrected, and
     // leftover diff markers / '#' comments were removed.
     // NOTE(review): many Perl constructs remain unconverted here (while (<>),
     // $., $_, @/%-sigils, =~ s///, elsif, next, push, printf FILEHANDLE).
     // ----------------------------------------
     while (<>) {
     	 // start a new output part every <split> sentences
     	 if ($. % split == 0) {
     	XML << "</subcorpus>\n";
     	close(XML);
     	suffix = sprintf("%05d", ++counter);
     	open(XML, ">$outdir/$infilename-$suffix.xml")  or die "\nopen file error\n";
     	write_xml_header();
     	MASTER << "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
     	 }
     	 // ----------------------------------------
     	 // set root (or fake root if ROOT is missing)
     	 // ----------------------------------------
     	 rootnode = fakeroot = 0; // m = Treat string as multiple lines, so that ^ matches beginning of line
     	 thisrootname = rootname;
     	 (rootnode) = (/^(\d+?)\s.*?\b$rootname\b/m);  // real root marked by parser
     	 if (rootnode == 0) {
     // (rootnode) = (/^(\d+?)\t.*?\t-1\t0\t/m);  // no marked ROOT, but top node (head = 0)   TOO SPECIFIC
     // (rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);  // no marked ROOT, but top node (head = 0) in col9 (original by Achim Stein)
     	(rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);  // no marked ROOT, but top node (head = 0) in col7 (updated by AL)
     	LOG << " Warning sentence $.: not marked root ($rootname), using top node $rootnode\n";
     	fakeroot = 1;
     	thisrootname = 'nSnt';
     	 }
     	 if (rootnode == 0) {
     	rootnode = 1;   // set fake root if nothing goes
     	LOG << " Error sentence $.: setting fake root to first word:\n$_\n";
     	fakeroot = 2;
     	thisrootname = 'Err';
     	 }
     	 def cols = [];
     	 @words = split (/\n/);
     	 @terminals = [];
     	 %dominates = [:]; // empty at beginning of sentence
     	 hDeprel = [:]; // empty at beginning of sentence
     	 %aux = [:]; // empty at beginning of sentence
     	 daughters = [];
     	 def commentlines = 0; // added by AL: number of skipped comment/contraction lines so far
     //  def contractions = 0; #added by AL
     //  def text_id = "unknown_text";
     def text_id = infilename;
     	 def sent_id = "0";
     	 // ----------------------------------------
     	 // loop through words #1: write tokens (terminal nodes) to XML file
     	 // store tree relevant information for loop #2
     	 // ----------------------------------------
     	 for (def w = 0; w < words.size(); w++) {
     // Added by AL for comment lines
     	if (words[w] =~ /^#/) {
     	if (words[w] =~ /^# newdoc/) {
     		text_id = words[w];
     		text_id =~ s/# newdoc id = //;
     	}
     	elsif (words[w] =~ /^# sent_id/) {
     		sent_id = words[w];
     		sent_id =~ s/# sent_id = //;
     	}
     //	LOG << "Comment line loop 1: words[w]\n";
     	commentlines++;
     	next;
     	}
     // Added by AL for contractions
     	elsif (words[w] =~ /^\d+-\d+/) {
     //	LOG << "Contraction line loop 1: words[w]\n";
     	commentlines++;
     //	contractions++;
     	next;
     	}
     	else {
     	if (defined (opt_c)) {
     	  words[w] =~ s/coord(\d+)-//g;
     	}
     	@cols = split (/\t/, words[w]);
     	wnr = cols[0];
     	word = cols[1];
     	lemma = cols[2];
     	plemma = cols[2]; // predicted
     	pos = cols[3];
     	ppos = cols[4]; // predicted
     	mor = cols[5];
     	pmor = cols[5]; // predicted
     	cat = cols[cold];
     	edition_id = cols[9];
     	edition_id =~ s/^.*XmlId=([^|]+).*$/\1/g;
     	if (cat =~ /[<>]/) {
     	  LOG << "Warning in line $.: illegal node name: \"$cat\" -> \"Err2\"\n";
     	  cat = 'Err2';
     	}
     	// NCA: enclose lemmas in underscores (easier for regex construction)
     	if (corpus =~ /nca/i) {
     	  lemma = "_" + "$lemma" + "_"
     	}
     	clean_data();
     	// get attribute-value pairs from col #13 of first word (option -x)
     	if (opt_x == "all") {
     	  cols[featcol] = "all=" + cols[featcol];
     	}
     	if (w == 0 && cols[featcol] =~ /=/) {
     //   println "** " + "========== getting att-value for word w: cols[featcol] scodes=@scodes\n";
     		nt_features = nt_empty_features = '';
     //   while(cols[featcol] =~ m/ (.*?)="([^"]*)"/gs) {   // quoted values
     	  while(cols[featcol] =~ m/ ?([^=]*?)="?([^, ]+)\b"?\b/gs) {  // maybe unquoted values (e.g. Le Monde 2005)
     	att = $1;
     	val = $2;
     	// pick the attributes that match those of the command line option -x
     	for (def t = 0; t < scodes.size(); t++) {
     	  if (att == scodes[t]) {
     	 val =~ s/\&/\&amp;/g;  //  replace "&" in values (appears in URLs)
     	 if (t == 0) { add_to_sentcode = "_$att$val"; }
     	 nt_features = nt_features + " $att=\"$val\"";
     //	 println "** " + "$./$w/$featcol: $cols[$featcol] --- nt_features: $nt_features\n";
     	  }
     	  if (att == scodes[t]) { nt_empty_features = nt_empty_features + " $att=\"--\"";}
     	}
     	  }
     	  // replace the reserved feature 'id' (Tiger)
     	  add_to_sentcode =~ s/\bid=/ncaid=/;
     	  nt_features =~ s/\bid=/ncaid=/;
     	  nt_empty_features =~ s/\bid=/ncaid=/;
     	} // if col 13 contains attributes
     	else {
     	  if (defined(opt_x) && (w == 0)) {
     	println "** " + "Warning: sentence=$.  option -x is defined, but no attribute=value declarations were found!\n";
     	  }
     	}
     	// store output for terminal node in array, output later. For double categories make a duplicate node.
     	tempid = sprintf("%d_%d", $., wnr);
     // push(@terminals, sprintf("   <t id=\"s%d_%d\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" ppos=\"%s\" pmor=\"%s\" plemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., wnr, word, pos, mor, lemma, ppos, pmor, plemma, text_id, edition_id));
     	push(@terminals, sprintf("   <t id=\"s%d_%d\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., wnr, word, pos, mor, lemma, text_id, edition_id));
     	if (cat =~ /_/) {
     //   push(@terminals, sprintf("   <t id=\"s%d_%d_dupl\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" ppos=\"%s\" pmor=\"%s\" plemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., wnr, "*", "_", "_", "_", "_", "_", "_", text_id, edition_id));
     	  push(@terminals, sprintf("   <t id=\"s%d_%d_dupl\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., wnr, "*", "_", "_", "_", text_id, edition_id));
     	  duplicates{tempid} = 1; // store, check later to attach the duplicates to the mother
     	}
     	// associate Aux with main verb, to create an attribute in the verb node in loop #2 (TODO: more than one Aux)
     	if (cat =~ /Aux/) {
     	  aux{cols[colh]} = "$word" + "_" + "$plemma"; // aux{head} = word_lemma (of Aux)
     	}
     	// ----------------------------------------
     	// store information needed for tree
     	// ----------------------------------------
     	// if fake rootnode == 1: nSnt as root node
     	if ((fakeroot == 1) && (w-commentlines+1 == rootnode)) {
     	  cat = 'nSnt';
     	  notes{tempid} = 'Warning no marked ROOT node in CoNLL';  // TODO: does not work
     	}
     	// if fake rootnode == 2: flatten structure: attach all words to the first word
     	if ((fakeroot == 2) && (w-commentlines+1 != rootnode)) {
     	  cols[colh] = 1;
     	  notes{tempid} = 'Error neither ROOT node nor top node in CoNLL';
     	}
     	// correct unbound words in parser output (phead = 0, but not marked as ROOT)
     	if ((cols[colh] == "0") && (w-commentlines+1 != rootnode)) { // AL: added: -commentlines
     	  printf LOG " Warning sentence $. ($tempid): unbound node %d (attached to root %d)\n", (w-commentlines+1), rootnode;
     	  cols[colh] = rootnode;
     	  cat = 'Err';  // let Err instead of deprel appear in dom attribute
     	  notes{tempid} = 'Warning unbound node in CoNLL';
     	}
     	// store for R edge labels
     	if (cols[cold] =~ /RelN?C/) {
     	  relators{tempid} = 1;
     	}
     	// store deprel for dom attribute
     	deprel[tempid] = cat; // cols[cold];
     	// if real root, add this node to daughter array, store array in hash dominates{head}{@daughters}
     	if ((fakeroot < 2) && (w-commentlines+1 != rootnode)) {
     	  daughters = @{dominates[cols[colh]]};  // get the array from the hash of the dominating node
     	  push(daughters, wnr);
     	  dominates[cols[colh]] = [daughters];
     	}
     	 } // AL condition end (else branch: regular token line)
     } // for each word loop #1
     	 // print graph code (needs root attribute) and terminal nodes
     	 if (rootnode == 0) {
     	noroot++;
     	LOG << "Error sentence $. ($tempid): root node not found:\n$_\n"; next;
     	 } else {
     	printf XML "<s id=\"s%s%s\" textid=\"$text_id\" sentid=\"$sent_id\">\n", $., add_to_sentcode;
     	XML << "  <graph root=\"n$._$rootnode\">\n";
     	XML << " <terminals>\n";
     	for (def t = 0; t < terminals.size(); t++) {
     	  XML << terminals[t];
     	}
     	XML << " </terminals>\n";
     	 }
     	 // ----------------------------------------
     	 // loop through words #2 to build Tiger tree (non terminal nodes)
     	 // ----------------------------------------
     	 XML << " <nonterminals>\n";
     	 for (def i = 0; i < words.size(); i++) {
     //Added AL for comment lines
     	if (words[i] =~ /^#/) {
     //    LOG << "Comment line loop 2 : $words[$i]\n";
     	   next;
     	}
     //Added AL for contractions
     	if (words[i] =~ /^\d+-\d+/) {
     //    LOG << "Contraction loop 2 : $words[$i]\n";
     	   next;
     	}
     	else {
     	@cols = split (/\t/, words[i]);
     	w = cols[0];
     	// TODO: redundant variable assignment (= loop #1)??
     	// NOTE(review): the column indices below differ from loop #1
     	// (pos = cols[3], mor = cols[5] there) -- confirm which layout is intended.
     	word = cols[1];
     	lemma = cols[2];
     	plemma = cols[3]; // predicted
     	pos = cols[4];
     	ppos = cols[5]; // predicted
     	mor = cols[6];
     	pmor = cols[7]; // predicted
     	cat = cols[cold];
     	if (cat =~ /[<>]/) {
     	  LOG << "Warning in line $.: illegal node name: \"$cat\" -> \"Err2\"\n";
     	  cat = 'Err2';
     	}
     	//  OF parser has not learned punctuation: set cat for punctuation to PON
     	if ((corpus =~ /nca/i) && (pos == 'PON')) {
     	  cols[cold] = cat = 'Pon';
     	}
     	clean_data();
     	// retrieve daughters, make dom attribute (string of dominated nodes)
     	daughters = @{dominates["$w"]};
     	dom = '';
     	for (def d = 0; d < daughters.size(); d++) {
     	  dom = dom + "_" + deprel["$._$daughters{$d}"];
     	}
     	if (dom =~ /_/) {
     	  dom =~ s/^_//;
     	} else {
     	  dom = '--';
     	}
     	// if verbal, set node attributes for verb form and lemma
     	type = "nV";
     	vform = vlemma = "--";
     	if (pos =~ /VER/) { // AL: ppos -> pos
     	  if (mor =~ /infi/) { type = "VInf"; } // AL: pmor -> mor
     	  // NOTE(review): this branch still tests pmor although the line above was switched to mor -- confirm
     	  elsif (pmor =~ /pper|ppre/) { type = "VPar"; }
     	  else { type = "VFin"; }
     	  // if Aux is present, create attribute for main verb
     	  if (aux{w} =~ /(.*?)_(.*)/) {
     	vform = "$1";
     	vlemma = "$2";
     	  }
     	  // else create attr for simple verb
     	  else {
     	vform = word;
     	vlemma = lemma; // AL: plemma -> lemma (always void in SRCMF)
     	  }
     	  // NCA: enclose lemmas in underscores (easier for regex construction)
     	  if (corpus =~ /nca/i) {
     	vlemma = "_" + "$vlemma" + "_"
     	  }
     	}
     	// call output function (twice for duplicate categories)
     	if (cat =~ /(.*?)_(.*)/) {
     	  write_nonterminals("$2", "");   // RelNC is always node (see clean categories), function is duplicate, e.g. SjPer_RelNC
     	  write_nonterminals("$1", "_dupl");  // other category is duplicate
     	} else {
     	  write_nonterminals(cat);
     	}
     	 } // AL end condition (else branch: regular token line)
     } // for words (loop #2)
     	 XML << " </nonterminals>\n";
     	 XML << "  </graph>\n";
     	 XML << "</s>\n";
     	 // progress indicator every 100 sentences
     	 if ($. % 100 == 0) { println "** " + "\b\b\b\b\b\b\b\b"; printf STDERR "%08d", $.;}
     } // main
     // End of run: close the last <subcorpus> element, print a summary and
     // usage hints, write the master file footer and close all output handles.
     XML << "</subcorpus>\n";
     println "** " + "\n$CMD: $. sentences converted. Results in $outdir. Log in $outdir/conversion.log.\n";
     println "** " + "   Hint 1: on OS X convert master file to MacRoman, e.g  iconv -f latin1 -t macroman\n";
     println "** " + "   Hint 2: use tiger.pl -c <Tiger XML file> to detect unbound nodes.\n";
     println "** " + "   Hint 3: build reliable feature declarations using tiger.sh\n";
     println "** " + "     tiger.sh -a \"lemma word pos ppos\"  (for terminals)\n";
     println "** " + "     tiger.sh -A \"lemma word pos ppos\"  (for non-terminals)\n";
     // noroot is only incremented in the main loop when no root node was found
     // NOTE(review): no initialisation of noroot is visible in this section -- confirm it is set before this comparison
     if (noroot > 0) {println "** " + "$noroot sentences ignored: root not found (see log file)\n";}
     write_master_footer();
     close(MASTER);
     close(XML);
     close(LOG);
     exit;
     // ----------------------------------------
     // sub
     // ----------------------------------------
     // Writes the XML declaration and the opening <subcorpus> element, named
     // after the current input base name and part suffix, to the XML output.
     // The parameter list "()" was missing from the declaration; callers invoke
     // it as write_xml_header().
     def write_xml_header() {
     	 XML << """<?xml version=\"1.0\" encoding=\"UTF-8\"?>
     	 <subcorpus name=\"$infilename-$suffix\">
     """;
     }
     def write_master_header {
     	 printf MASTER """<?xml version="1.0" encoding="UTF-8"?>
     """;
     	 printf MASTER """<corpus id=\"$corpus\">
     <head>
     	 <meta><name>$corpus</name>
     	<author>ILR Stuttgart</author>
     	<date></date>
     	<description>Parsed with mate tools using a SRCMF-based grammar model (http://srcmf.org). </description>
     	<format>SRCMF</format>
     	<history>TigerXML converted by conll2tiger.pl</history>
     	 </meta>
     """;
     //  printf MASTER '<annotation>
     //<feature name="word" domain="T" ></feature>
     //<feature name="pos" domain="T" ></feature>
     //<feature name="mor" domain="T" ></feature>
     //<feature name="lemma" domain="T" ></feature>
     //<feature name="ppos" domain="T" ></feature>
     //<feature name="pmor" domain="T" ></feature>
     //<feature name="plemma" domain="T" ></feature>
     //<feature name="cat" domain="NT" >
     //  <value name="Apst">apostrophe</value>
     //  <value name="AtObj">attribut d objet</value>
     //  <value name="AtRfc">attribut réfléchi</value>
     //  <value name="AtSj">attribut de sujet</value>
     //  <value name="AttributReflechi">attribut réfléchi</value>
     //  <value name="Aux">auxilié</value>
     //  <value name="AuxA">auxilié actif</value>
     //  <value name="AuxP">auxilié passif</value>
     //  <value name="Circ">circonstant</value>
     //  <value name="Circ_RelNC">circonstant pronom relatif</value>
     //  <value name="Cmpl">complément</value>
     //  <value name="Cmpl_RelNC">complément pronom relatif</value>
     //  <value name="Coo">coordination</value>
     //  <value name="Det">déterminant</value>
     //  <value name="Err">unbound node in CoNLL input</value>
     //  <value name="Err2">illegal node name was replaced</value>
     //  <value name="GpCoo">coordonné</value>
     //  <value name="Ignorer">Ignorer</value>
     //  <value name="Insrt">incidente</value>
     //  <value name="Intj">interjection</value>
     //  <value name="Lac">lacune</value>
     //  <value name="ModA">modifieur attaché</value>
     //  <value name="ModD">modifieur détaché</value>
     //  <value name="Ng">négation</value>
     //  <value name="NgPrt">forclusif</value>
     //  <value name="Obj">objet</value>
     //  <value name="Obj_RelNC">direct object pronom relatif</value>
     //  <value name="Pon">ponctuation</value>
     //  <value name="PON">ponctuation</value>
     //  <value name="Regim">régime</value>
     //  <value name="RelC">relateur coordonnant</value>
     //  <value name="RelNC">relateur non coordonnant</value>
     //  <value name="Rfc">réfléchi</value>
     //  <value name="Rfx">réfléxif renforcé</value>
     //  <value name="SjImp">sujet impersonnel</value>
     //  <value name="SjPer">sujet personnel</value>
     //  <value name="SjPer_RelNC">sujet personnel pronom relatif</value>
     //  <value name="Snt">phrase</value>
     //  <value name="ROOT">phrase</value>
     //  <value name="StructureMaximale">structure maximale</value>
     //  <value name="VFin">verbe fini</value>
     //  <value name="VInf">verbe infinitif</value>
     //  <value name="nMax">structure non-maximale</value>
     //  <value name="nSnt">non-phrase</value>
     //</feature>
     //<feature name="coord" domain="NT" ></feature>
     //<feature name="dom" domain="NT" ></feature>
     //<feature name="type" domain="NT" >
     //  <value name="nV">élément non-verbal</value>
     //  <value name="VFin">verbe fini</value>
     //  <value name="VInf">verbe infinitif</value>
     //  <value name="VPar">verbe participial</value>
     //  <value name="--">nil</value>
     //</feature>
     //<feature name="vform" domain="NT"></feature>
     //<feature name="vlemma" domain="NT"></feature>
     //<feature name="note" domain="NT"></feature>
     //<feature name="snr" domain="NT"></feature>
     //';
     	 printf MASTER """<annotation>
     <feature name="word" domain="T" ></feature>
     <feature name="pos" domain="T" ></feature>
     <feature name="mor" domain="T" ></feature>
     <feature name="lemma" domain="T" ></feature>
     <feature name="textid" domain="T" ></feature>
     <feature name="editionId" domain="T" ></feature>
     <feature name="cat" domain="NT" >
     	 <value name="__UNDEF__">UNDEFINED !!!</value>
     	 <value name="acl:adv">acl:adv</value> <!-- Ukrainian -->
     	 <value name="acl:attr">acl:attr</value> <!-- Chukchi -->
     	 <value name="acl:cleft">acl:cleft</value> <!-- Norwegian, Swedish -->
     	 <value name="acl:fixed">acl:fixed</value> <!-- Beja -->
     	 <value name="acl:inf">acl:inf</value> <!-- Portuguese -->
     	 <value name="acl:relat">acl:relat</value> <!-- Chukchi -->
     	 <value name="acl:relcl">relative clause modifier</value> <!-- Akkadian, Albanian, Apurina, Arabic, Armenian, Assyrian, Beja, Belarusian, Breton, Bulgarian, Chinese, Czech, Danish, Dutch, English, Erzya, Estonian, Faroese, Finnish, French, German, Greek, Hebrew, Hindi, Hindi English, Icelandic, Indonesian, Irish, Italian, Karelian, Kazakh, Komi Permyak, Komi Zyrian, Korean, Latin, Lithuanian, Livvi, Manx, Marathi, Moksha, Naija, North Sami, Norwegian, Old East Slavic, Old French, Persian, Polish, Portuguese, Russian, Sanskrit, Scottish Gaelic, Slovak, Spanish, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Telugu, Thai, Ukrainian, Urdu, Welsh, Western Armenian, Wolof -->
     	 <value name="acl">clausal modifier of noun (adnominal clause)</value>
     	 <value name="advcl:abs">advcl:abs</value> <!-- Latin -->
     	 <value name="advcl:cau">advcl:cau</value> <!-- Moksha -->
     	 <value name="advcl:cleft">advcl:cleft</value> <!-- French, Naija -->
     	 <value name="advcl:cmpr">advcl:cmpr</value> <!-- Latin, Polish -->
     	 <value name="advcl:cond">advcl:cond</value> <!-- Tamil, Telugu, Uyghur -->
     	 <value name="advcl:coverb">advcl:coverb</value> <!-- Cantonese -->
     	 <value name="advcl:eval">advcl:eval</value> <!-- Komi Zyrian -->
     	 <value name="advcl:lcl">advcl:lcl</value> <!-- Komi Permyak -->
     	 <value name="advcl:lto">advcl:lto</value> <!-- Komi Zyrian -->
     	 <value name="advcl:mcl">advcl:mcl</value> <!-- Komi Permyak -->
     	 <value name="advcl:pred">advcl:pred</value> <!-- Latin -->
     	 <value name="advcl:relcl">advcl:relcl</value> <!-- Polish, Western Armenian -->
     	 <value name="advcl:sp">advcl:sp</value> <!-- Ukrainian -->
     	 <value name="advcl:svc">advcl:svc</value> <!-- Ukrainian -->
     	 <value name="advcl:tcl">advcl:tcl</value> <!-- Apurina, Erzya, Komi Permyak, Komi Zyrian, Moksha, Romanian, Skolt Sami -->
     	 <value name="advcl">adverbial clause modifier</value>
     	 <value name="advmod:arg">advmod:arg</value> <!-- Polish -->
     	 <value name="advmod:cau">advmod:cau</value> <!-- Erzya, Komi Zyrian, Moksha -->
     	 <value name="advmod:comp">advmod:comp</value> <!-- Erzya -->
     	 <value name="advmod:deg">advmod:deg</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami -->
     	 <value name="advmod:det">advmod:det</value> <!-- Ukrainian -->
     	 <value name="advmod:df">advmod:df</value> <!-- Cantonese, Chinese -->
     	 <value name="advmod:emph">emphasizing word, intensifier</value> <!-- Akkadian, Arabic, Armenian, Catalan, Chukchi, Croatian, Czech, Indonesian, Komi Zyrian, Latin, Lithuanian, Polish, Sanskrit, Slovak, South Levantine Arabic, Tamil, Turkish, Turkish German, Upper Sorbian, Uyghur, Western Armenian -->
     	 <value name="advmod:eval">advmod:eval</value> <!-- Erzya, Komi Zyrian, Moksha, Skolt Sami -->
     	 <value name="advmod:fixed">advmod:fixed</value> <!-- Beja -->
     	 <value name="advmod:foc">advmod:foc</value> <!-- Erzya, Komi Zyrian, Moksha, Skolt Sami -->
     	 <value name="advmod:freq">advmod:freq</value> <!-- Komi Zyrian, Moksha -->
     	 <value name="advmod:lfrom">advmod:lfrom</value> <!-- Erzya, Komi Zyrian, Moksha -->
     	 <value name="advmod:lmod">locative adverbial modifier</value> <!-- Apurina, Danish, Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami -->
     	 <value name="advmod:lmp">advmod:lmp</value> <!-- Erzya, Komi Zyrian -->
     	 <value name="advmod:locy">advmod:locy</value> <!-- Hungarian -->
     	 <value name="advmod:lto">advmod:lto</value> <!-- Erzya, Komi Zyrian, Moksha -->
     	 <value name="advmod:mmod">advmod:mmod</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami -->
     	 <value name="advmod:mode">advmod:mode</value> <!-- Hungarian -->
     	 <value name="advmod:neg">advmod:neg</value> <!-- Apurina, Buryat, Kiche, Kurmanji, Latin, Maltese, Polish, Skolt Sami -->
     	 <value name="advmod:obl">adverbial modifier + oblique nominal</value> <!-- Old French -->
     	 <value name="advmod:que">advmod:que</value> <!-- Hungarian -->
     	 <value name="advmod:tfrom">advmod:tfrom</value> <!-- Hungarian -->
     	 <value name="advmod:tlocy">advmod:tlocy</value> <!-- Hungarian -->
     	 <value name="advmod:tmod">advmod:tmod</value> <!-- Apurina, Erzya, Komi Permyak, Komi Zyrian, Moksha, Romanian, Skolt Sami -->
     	 <value name="advmod:to">advmod:to</value> <!-- Hungarian -->
     	 <value name="advmod:tto">advmod:tto</value> <!-- Hungarian -->
     	 <value name="advmod">adverbial modifier</value>
     	 <value name="amod:att">amod:att</value> <!-- Hungarian -->
     	 <value name="amod:attlvc">amod:attlvc</value> <!-- Hungarian -->
     	 <value name="amod:flat">amod:flat</value> <!-- Polish -->
     	 <value name="amod">adjectival modifier</value>
     	 <value name="appos:trans">appos:trans</value> <!-- Turkish German -->
     	 <value name="appos">appositional modifier</value>
     	 <value name="aux:aff">aux:aff</value> <!-- Beja -->
     	 <value name="aux:aspect">aux:aspect</value> <!-- Komi Zyrian -->
     	 <value name="aux:caus">aux:caus</value> <!-- Armenian, French, Western Armenian -->
     	 <value name="aux:clitic">aux:clitic</value> <!-- Polish -->
     	 <value name="aux:cnd">aux:cnd</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Polish -->
     	 <value name="aux:ex">aux:ex</value> <!-- Armenian, Western Armenian -->
     	 <value name="aux:imp">aux:imp</value> <!-- Erzya, Polish -->
     	 <value name="aux:nec">aux:nec</value> <!-- Komi Zyrian, Moksha, Skolt Sami -->
     	 <value name="aux:neg">aux:neg</value> <!-- Chukchi, Erzya, Komi Permyak, Komi Zyrian, Maltese, Moksha, North Sami, Skolt Sami, Tamil -->
     	 <value name="aux:opt">aux:opt</value> <!-- Erzya, Moksha -->
     	 <value name="aux:part">aux:part</value> <!-- Maltese -->
     	 <value name="aux:pass">passive auxilary</value> <!-- Afrikaans, Ancient Greek, Arabic, Assyrian, Belarusian, Bhojpuri, Breton, Bulgarian, Buryat, Chinese, Czech, Dutch, English, Faroese, Finnish, French, Frisian Dutch, Galician, German, Hindi, Italian, Kangri, Karelian, Latin, Latvian, Lithuanian, Maltese, Marathi, Norwegian, Old Church Slavonic, Old East Slavic, Old French, Persian, Polish, Portuguese, Romanian, Russian, Slovak, Spanish, Swedish, Swiss German, Tamil, Thai, Turkish German, Upper Sorbian, Vietnamese -->
     	 <value name="aux:pot">aux:pot</value> <!-- Komi Zyrian -->
     	 <value name="aux:q">aux:q</value> <!-- Erzya, Turkish, Turkish German -->
     	 <value name="aux:tense">aux:tense</value> <!-- French, Komi Zyrian, Skolt Sami -->
     	 <value name="aux">auxiliary</value>
     	 <value name="case:acc">case:acc</value> <!-- Hebrew -->
     	 <value name="case:adv">case:adv</value> <!-- Indonesian -->
     	 <value name="case:aff">case:aff</value> <!-- Beja -->
     	 <value name="case:det">preposition with determiner</value> <!-- Maltese, Old French -->
     	 <value name="case:gen">case:gen</value> <!-- Hebrew -->
     	 <value name="case:loc">case:loc</value> <!-- Armenian, Cantonese, Chinese, Western Armenian -->
     	 <value name="case:pred">case:pred</value> <!-- Welsh -->
     	 <value name="case:voc">case:voc</value> <!-- Irish, Scottish Gaelic -->
     	 <value name="case">case marking</value>
     	 <value name="cc:nc">cc:nc</value> <!-- Old French -->
     	 <value name="cc:nc">Coordinated conjunct : non coordonant</value>
     	 <value name="cc:preconj">cc:preconj</value> <!-- Arabic, English, Erzya, Estonian, Faroese, Finnish, German, Indonesian, Komi Permyak, Komi Zyrian, Moksha, North Sami, Persian, Polish, Portuguese, Romanian, Slovenian, Spanish, Thai, Turkish -->
     	 <value name="cc:preconj">preconjunct</value>
     	 <value name="cc">Coordinating conjunction</value>
     	 <value name="cc">coordinating conjunction</value>
     	 <value name="ccomp:cleft">ccomp:cleft</value> <!-- Polish -->
     	 <value name="ccomp:obj">ccomp:obj</value> <!-- Hungarian, Polish -->
     	 <value name="ccomp:obl">ccomp:obl</value> <!-- Hungarian -->
     	 <value name="ccomp:pmod">ccomp:pmod</value> <!-- Romanian -->
     	 <value name="ccomp:pred">ccomp:pred</value> <!-- Hungarian -->
     	 <value name="ccomp">clausal complement</value>
     	 <value name="clf">classifier</value>
     	 <value name="compound:a">compound:a</value> <!-- Indonesian -->
     	 <value name="compound:affix">compound:affix</value> <!-- Hebrew -->
     	 <value name="compound:dir">compound:dir</value> <!-- Cantonese, Chinese -->
     	 <value name="compound:ext">compound:ext</value> <!-- Cantonese, Chinese -->
     	 <value name="compound:lvc">compound:lvc</value> <!-- Armenian, Hindi, Kazakh, Khunsari, Korean, Kurmanji, Marathi, Nayini, Persian, Soi, Tamil, Telugu, Turkish, Turkish German, Uyghur, Western Armenian -->
     	 <value name="compound:lvc">light verb construction</value>
     	 <value name="compound:nn">compound:nn</value> <!-- Finnish, Livvi, North Sami -->
     	 <value name="compound:preverb">compound:preverb</value> <!-- Hungarian -->
     	 <value name="compound:prt">compound:prt</value> <!-- Afrikaans, Arabic, Danish, Dutch, English, Erzya, Estonian, Faroese, Finnish, Frisian Dutch, German, Icelandic, Irish, Karelian, Komi Permyak, Naija, Norwegian, Persian, Spanish, Swedish, Swedish Sign Language, Swiss German, Tamil, Thai, Turkish German, Wolof, Yoruba -->
     	 <value name="compound:prt">phrasal verb particle</value>
     	 <value name="compound:quant">compound:quant</value> <!-- Cantonese -->
     	 <value name="compound:redup">reduplicated compounds</value> <!-- Armenian, Bambara, Classical Chinese, Erzya, Hindi, Kurmanji, Marathi, Naija, Tagalog, Tamil, Telugu, Turkish, Turkish German, Uyghur, Welsh, Western Armenian -->
     	 <value name="compound:smixut">compound:smixut</value> <!-- Hebrew -->
     	 <value name="compound:svc">serial verb compounds</value> <!-- Amharic, Armenian, Marathi, Mbya Guarani, Naija, Swedish Sign Language, Telugu, Ukrainian, Western Armenian, Wolof, Yoruba -->
     	 <value name="compound:vo">compound:vo</value> <!-- Cantonese, Chinese -->
     	 <value name="compound:vv">compound:vv</value> <!-- Cantonese, Chinese -->
     	 <value name="compound">compound</value>
     	 <value name="conj:expl">conj:expl</value> <!-- Latin -->
     	 <value name="conj:extend">conj:extend</value> <!-- Slovenian -->
     	 <value name="conj:svc">conj:svc</value> <!-- Ukrainian -->
     	 <value name="conj">conjunct</value>
     	 <value name="cop:expl">cop:expl</value> <!-- Maltese -->
     	 <value name="cop:locat">cop:locat</value> <!-- Polish -->
     	 <value name="cop:own">cop:own</value> <!-- Finnish, Karelian, Livvi, Marathi -->
     	 <value name="cop">copula</value>
     	 <value name="csubj:cleft">csubj:cleft</value> <!-- Irish, Latin, Manx, Scottish Gaelic -->
     	 <value name="csubj:cop">csubj:cop</value> <!-- Erzya, Estonian, Finnish, Irish, Komi Zyrian, Livvi, Manx, Moksha, Scottish Gaelic, Turkish -->
     	 <value name="csubj:pass">clausal passive subject</value> <!-- Albanian, Amharic, Ancient Greek, Arabic, Armenian, Belarusian, Bulgarian, Catalan, Chinese, Classical Chinese, Czech, English, French, German, Gothic, Greek, Indonesian, Italian, Korean, Latin, Latvian, Lithuanian, Norwegian, Old Church Slavonic, Old East Slavic, Polish, Portuguese, Romanian, Russian, Sanskrit, Slovak, Spanish, Swedish, Western Armenian -->
     	 <value name="csubj">clausal subject</value>
     	 <value name="dep:aff">dep:aff</value> <!-- Beja -->
     	 <value name="dep:agr">dep:agr</value> <!-- Kiche -->
     	 <value name="dep:alt">dep:alt</value> <!-- Upper Sorbian -->
     	 <value name="dep:ana">dep:ana</value> <!-- Yupik -->
     	 <value name="dep:aux">dep:aux</value> <!-- Yupik -->
     	 <value name="dep:comp">dep:comp</value> <!-- Beja, French -->
     	 <value name="dep:conj">dep:conj</value> <!-- Beja -->
     	 <value name="dep:cop">dep:cop</value> <!-- Yupik -->
     	 <value name="dep:emo">dep:emo</value> <!-- Yupik -->
     	 <value name="dep:infl">dep:infl</value> <!-- Yupik -->
     	 <value name="dep:mark">dep:mark</value> <!-- Yupik -->
     	 <value name="dep:mod">dep:mod</value> <!-- Mbya Guarani -->
     	 <value name="dep:pos">dep:pos</value> <!-- Yupik -->
     	 <value name="dep:redup">dep:redup</value> <!-- Beja -->
     	 <value name="dep:ss">dep:ss</value> <!-- Kiche -->
     	 <value name="dep">unspecified dependency</value>
     	 <value name="det:adj">det:adj</value> <!-- Albanian -->
     	 <value name="det:noun">det:noun</value> <!-- Albanian -->
     	 <value name="det:numgov">pronominal quantifier governing the case of the noun</value> <!-- Czech, Polish, Serbian, Slovak, Ukrainian, Upper Sorbian -->
     	 <value name="det:nummod">pronominal quantifier agreeing in case with the noun</value> <!-- Czech, Polish, Ukrainian -->
     	 <value name="det:poss">possessive determiner</value> <!-- Akkadian, Armenian, German, Italian, Korean, Polish, Portuguese, Western Armenian -->
     	 <value name="det:predet">det:predet</value> <!-- English, Italian, Persian -->
     	 <value name="det:pron">det:pron</value> <!-- Albanian -->
     	 <value name="det:rel">det:rel</value> <!-- Bambara -->
     	 <value name="det">determiner</value>
     	 <value name="discourse:emo">discourse:emo</value> <!-- Irish, Italian, Polish -->
     	 <value name="discourse:filler">discourse:filler</value> <!-- Norwegian, Slovenian -->
     	 <value name="discourse:intj">discourse:intj</value> <!-- Polish -->
     	 <value name="discourse:sp">discourse:sp</value> <!-- Cantonese, Chinese, Classical Chinese -->
     	 <value name="discourse">discourse element</value>
     	 <value name="dislocated:cleft">dislocated:cleft</value> <!-- Mbya Guarani -->
     	 <value name="dislocated:csubj">dislocated:csubj</value> <!-- Latin -->
     	 <value name="dislocated:nsubj">dislocated:nsubj</value> <!-- Latin -->
     	 <value name="dislocated:obj">dislocated:obj</value> <!-- Latin -->
     	 <value name="dislocated:subj">dislocated:subj</value> <!-- Beja -->
     	 <value name="dislocated">dislocated elements</value>
     	 <value name="expl:comp">expl:comp</value> <!-- French -->
     	 <value name="expl:impers">impersonal expletive</value> <!-- Italian, Polish, Romanian, Spanish -->
     	 <value name="expl:pass">reflexive pronoun used in reflexive passive</value> <!-- Catalan, Czech, French, Italian, Latin, Portuguese, Romanian, Slovak, Spanish, Upper Sorbian -->
     	 <value name="expl:poss">expl:poss</value> <!-- Romanian -->
     	 <value name="expl:pv">reflexive clitic with an inherently reflexive verb</value> <!-- Czech, Dutch, German, Old East Slavic, Polish, Portuguese, Romanian, Slovak, Spanish, Turkish German, Upper Sorbian -->
     	 <value name="expl:subj">expl:subj</value> <!-- French, Naija -->
     	 <value name="expl">expletive</value>
     	 <value name="fixed">fixed multiword expression</value>
     	 <value name="flat:abs">flat:abs</value> <!-- Ukrainian -->
     	 <value name="flat:dist">flat:dist</value> <!-- Western Armenian -->
     	 <value name="flat:foreign">foreign words</value> <!-- Arabic, Belarusian, Buryat, Chinese, Chukchi, Croatian, Czech, English, Estonian, Faroese, Finnish, French, Galician, Icelandic, Indonesian, Irish, Italian, Komi Zyrian, Latin, Latvian, Lithuanian, Manx, Naija, Norwegian, Persian, Polish, Portuguese, Russian, Scottish Gaelic, Slovak, Slovenian, South Levantine Arabic, Ukrainian, Upper Sorbian -->
     	 <value name="flat:name">names</value> <!-- Ancient Greek, Belarusian, Breton, Chinese, Chukchi, Erzya, Faroese, Finnish, French, Frisian Dutch, Galician, German, Gothic, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Irish, Italian, Karelian, Kazakh, Komi Permyak, Komi Zyrian, Korean, Latin, Latvian, Livvi, Maltese, Moksha, Norwegian, Old Church Slavonic, Old East Slavic, Persian, Portuguese, Russian, Scottish Gaelic, Skolt Sami, Slovenian, Spanish, Swedish, Thai, Ukrainian, Welsh, Western Armenian -->
     	 <value name="flat:num">flat:num</value> <!-- Komi Zyrian, Persian -->
     	 <value name="flat:range">flat:range</value> <!-- Ukrainian, Western Armenian -->
     	 <value name="flat:repeat">flat:repeat</value> <!-- Ukrainian -->
     	 <value name="flat:sibl">flat:sibl</value> <!-- Ukrainian -->
     	 <value name="flat:title">flat:title</value> <!-- Ukrainian -->
     	 <value name="flat:vv">flat:vv</value> <!-- Classical Chinese -->
     	 <value name="flat">name multiword expression</value>
     	 <value name="goeswith">goes with</value>
     	 <value name="iobj:agent">iobj:agent</value> <!-- Armenian, French, Western Armenian -->
     	 <value name="iobj:appl">iobj:appl</value> <!-- Wolof -->
     	 <value name="iobj:patient">iobj:patient</value> <!-- Tagalog -->
     	 <value name="iobj">indirect object</value>
     	 <value name="list">list</value>
     	 <value name="mark:adv">mark:adv</value> <!-- Cantonese, Chinese -->
     	 <value name="mark:advmod">adverbial modifier confusable with a subordination marker</value> <!-- Old French -->
     	 <value name="mark:aff">mark:aff</value> <!-- Beja -->
     	 <value name="mark:obj">marker + object</value> <!--Old French, no doc -->
     	 <value name="mark:obl">marker + oblique nominal</value> <!--Old French, no doc -->
     	 <value name="mark:prt">mark:prt</value> <!-- Chinese, Irish, Scottish Gaelic -->
     	 <value name="mark:q">mark:q</value> <!-- Hebrew -->
     	 <value name="mark:rel">mark:rel</value> <!-- Cantonese, Chinese -->
     	 <value name="mark">marker</value>
     	 <value name="nmod:agent">nmod:agent</value> <!-- Welsh -->
     	 <value name="nmod:appos">nmod:appos</value> <!-- French, Komi Zyrian, Moksha -->
     	 <value name="nmod:arg">nmod:arg</value> <!-- Polish, Yupik -->
     	 <value name="nmod:att">nmod:att</value> <!-- Hungarian -->
     	 <value name="nmod:attlvc">nmod:attlvc</value> <!-- Hungarian -->
     	 <value name="nmod:attr">nmod:attr</value> <!-- Chukchi -->
     	 <value name="nmod:bahuv">nmod:bahuv</value> <!-- Moksha -->
     	 <value name="nmod:cau">nmod:cau</value> <!-- Uyghur -->
     	 <value name="nmod:comp">nmod:comp</value> <!-- Erzya, Komi Zyrian, Moksha, Turkish, Uyghur -->
     	 <value name="nmod:flat">nmod:flat</value> <!-- Polish -->
     	 <value name="nmod:gen">nmod:gen</value> <!-- Breton -->
     	 <value name="nmod:gobj">nmod:gobj</value> <!-- Erzya, Finnish -->
     	 <value name="nmod:gsubj">nmod:gsubj</value> <!-- Erzya, Finnish, Karelian -->
     	 <value name="nmod:lfrom">nmod:lfrom</value> <!-- Komi Zyrian -->
     	 <value name="nmod:lmod">nmod:lmod</value> <!-- Erzya, Indonesian, Komi Permyak, Komi Zyrian, Moksha -->
     	 <value name="nmod:npmod">nmod:npmod</value> <!-- Armenian, English, Western Armenian -->
     	 <value name="nmod:obj">nmod:obj</value> <!-- Komi Zyrian -->
     	 <value name="nmod:obl">nmod:obl</value> <!-- Hungarian -->
     	 <value name="nmod:part">nmod:part</value> <!-- Turkish, Uyghur -->
     	 <value name="nmod:poss">possessive nominal modifier</value> <!-- Akkadian, Albanian, Apurina, Arabic, Armenian, Assyrian, Bambara, Beja, Breton, Chukchi, Danish, Dutch, English, Erzya, Faroese, Finnish, Frisian Dutch, German, Hebrew, Hindi, Icelandic, Indonesian, Irish, Karelian, Kazakh, Khunsari, Komi Permyak, Komi Zyrian, Korean, Kurmanji, Latin, Livvi, Maltese, Manx, Marathi, Moksha, Naija, Nayini, North Sami, Persian, Polish, Sanskrit, Scottish Gaelic, Skolt Sami, Soi, South Levantine Arabic, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Telugu, Thai, Turkish, Uyghur, Warlpiri, Welsh, Western Armenian, Wolof -->
     	 <value name="nmod:pred">nmod:pred</value> <!-- Polish -->
     	 <value name="nmod:prp">nmod:prp</value> <!-- Komi Zyrian -->
     	 <value name="nmod:redup">nmod:redup</value> <!-- Welsh -->
     	 <value name="nmod:relat">nmod:relat</value> <!-- Chukchi -->
     	 <value name="nmod:subj">nmod:subj</value> <!-- Komi Zyrian -->
     	 <value name="nmod:tmod">temporal modifier</value> <!-- Chinese, English, Indonesian, Moksha, Romanian, Telugu, Uyghur -->
     	 <value name="nmod">nominal modifier</value>
     	 <value name="nsubj:advmod">fused subject pronoun and adverb</value> <!-- Old French -->
     	 <value name="nsubj:aff">nsubj:aff</value> <!-- Beja -->
     	 <value name="nsubj:bfoc">nsubj:bfoc</value> <!-- Tagalog -->
     	 <value name="nsubj:caus">nsubj:caus</value> <!-- Armenian, French, Western Armenian -->
     	 <value name="nsubj:cleft">nsubj:cleft</value> <!-- Latin -->
     	 <value name="nsubj:cop">nsubj:cop</value> <!-- Apurina, Breton, Erzya, Estonian, Finnish, Hebrew, Karelian, Komi Permyak, Komi Zyrian, Livvi, Moksha, Sanskrit, Skolt Sami, Turkish -->
     	 <value name="nsubj:ifoc">nsubj:ifoc</value> <!-- Tagalog -->
     	 <value name="nsubj:lfoc">nsubj:lfoc</value> <!-- Tagalog -->
     	 <value name="nsubj:lvc">nsubj:lvc</value> <!-- Hungarian -->
     	 <value name="nsubj:nc">nsubj:nc</value> <!-- Persian, Tamil, Telugu -->
     	 <value name="nsubj:obj">fused subject and object pronoun</value> <!-- Old French -->
     	 <value name="nsubj:pass">passive nominal subject</value> <!-- Afrikaans, Amharic, Ancient Greek, Arabic, Armenian, Assyrian, Belarusian, Bulgarian, Buryat, Cantonese, Catalan, Chinese, Classical Chinese, Czech, Dutch, English, Faroese, French, Frisian Dutch, Galician, German, Gothic, Greek, Hindi, Indonesian, Italian, Korean, Latin, Latvian, Lithuanian, Maltese, Marathi, Moksha, Norwegian, Old Church Slavonic, Old East Slavic, Persian, Polish, Portuguese, Romanian, Russian, Sanskrit, Skolt Sami, Slovak, Spanish, Swedish, Swiss German, Tagalog, Tamil, Thai, Turkish German, Upper Sorbian, Western Armenian -->
     	 <value name="nsubj:periph">nsubj:periph</value> <!-- Cantonese -->
     	 <value name="nsubj">Nominal subject</value>
     	 <value name="nummod:det">nummod:det</value> <!-- Beja -->
     	 <value name="nummod:entity">numeric modifier governed by a noun</value> <!-- Russian -->
     	 <value name="nummod:flat">nummod:flat</value> <!-- Polish -->
     	 <value name="nummod:gov">numeric modifier governing the case of the noun</value> <!-- Belarusian, Czech, Lithuanian, Old East Slavic, Polish, Russian, Sanskrit, Serbian, Ukrainian, Upper Sorbian -->
     	 <value name="nummod">numeric modifier</value>
     	 <value name="obj:advmod">fused adverb and object pronoun</value> <!-- Old French -->
     	 <value name="obj:advneg">fused negation and object pronoun</value> <!-- no doc for advneg -->
     	 <value name="obj:agent">obj:agent</value> <!-- Apurina, French, Tagalog -->
     	 <value name="obj:appl">obj:appl</value> <!-- Wolof -->
     	 <value name="obj:caus">obj:caus</value> <!-- Wolof -->
     	 <value name="obj:lvc">obj:lvc</value> <!-- French, Hungarian, Naija -->
     	 <value name="obj:obl">fused oblique and object pronoun</value> <!-- Old French -->
     	 <value name="obj:periph">obj:periph</value> <!-- Cantonese, Chinese -->
     	 <value name="obj">object</value>
     	 <value name="obl:advmod">adverbial modifier confusable with an oblique dependent</value> <!-- Old French -->
     	 <value name="obl:agent">agent modifier</value> <!-- Ancient Greek, Armenian, Belarusian, Breton, Cantonese, Chinese, Czech, Dutch, Erzya, French, German, Gothic, Greek, Hindi, Indonesian, Italian, Komi Zyrian, Latin, Lithuanian, Maltese, Moksha, Naija, Old Church Slavonic, Old East Slavic, Polish, Portuguese, Romanian, Russian, Sanskrit, Skolt Sami, Swedish, Tamil, Turkish, Welsh, Western Armenian -->
     	 <value name="obl:appl">obl:appl</value> <!-- Wolof -->
     	 <value name="obl:arg">oblique argument</value> <!-- Arabic, Beja, Czech, French, German, Greek, Icelandic, Latin, Lithuanian, Maltese, Naija, Persian, Polish, Sanskrit, Slovak, South Levantine Arabic, Tamil -->
     	 <value name="obl:cau">obl:cau</value> <!-- Erzya, Komi Zyrian, Moksha, Telugu -->
     	 <value name="obl:cmp">obl:cmp</value> <!-- Telugu -->
     	 <value name="obl:cmpr">obl:cmpr</value> <!-- Latin, Polish, Tamil -->
     	 <value name="obl:comp">obl:comp</value> <!-- Moksha -->
     	 <value name="obl:dat">obl:dat</value> <!-- Kurmanji -->
     	 <value name="obl:freq">obl:freq</value> <!-- Moksha -->
     	 <value name="obl:inst">obl:inst</value> <!-- Erzya, Moksha, Tamil -->
     	 <value name="obl:lfrom">obl:lfrom</value> <!-- Erzya, Komi Zyrian, Moksha -->
     	 <value name="obl:lmod">locative modifier</value> <!-- Apurina, Classical Chinese, Danish, Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami, Tamil -->
     	 <value name="obl:lmp">obl:lmp</value> <!-- Erzya, Komi Zyrian, Moksha -->
     	 <value name="obl:lto">obl:lto</value> <!-- Erzya, Komi Zyrian, Moksha -->
     	 <value name="obl:lvc">obl:lvc</value> <!-- Hungarian -->
     	 <value name="obl:mcl">obl:mcl</value> <!-- Komi Zyrian -->
     	 <value name="obl:mod"> oblique modifier</value> <!-- Beja, French, Naija, Yupik -->
     	 <value name="obl:npmod">obl:npmod</value> <!-- Coptic, English -->
     	 <value name="obl:orphan">obl:orphan</value> <!-- Polish -->
     	 <value name="obl:own">obl:own</value> <!-- Kazakh -->
     	 <value name="obl:patient">obl:patient</value> <!-- Cantonese, Chinese -->
     	 <value name="obl:pmod">obl:pmod</value> <!-- Romanian, Tamil -->
     	 <value name="obl:poss">obl:poss</value> <!-- Thai -->
     	 <value name="obl:prep">obl:prep</value> <!-- Irish -->
     	 <value name="obl:sentcon">obl:sentcon</value> <!-- Mbya Guarani -->
     	 <value name="obl:smod">obl:smod</value> <!-- Scottish Gaelic -->
     	 <value name="obl:tmod">obl:tmod</value> <!-- Apurina, Arabic, Cantonese, Chinese, Classical Chinese, Danish, English, Erzya, Frisian Dutch, German, Hindi, Indonesian, Irish, Italian, Komi Permyak, Komi Zyrian, Korean, Manx, Moksha, Portuguese, Scottish Gaelic, Skolt Sami, Spanish, Tamil, Telugu, Thai, Turkish, Uyghur, Warlpiri -->
     	 <value name="obl:tmod">temporal modifier</value>
     	 <value name="obl">oblique nominal</value>
     	 <value name="orphan:missing">textual gap in the source</value> <!-- Latin -->
     	 <value name="orphan">remnant in ellipsis</value>
     	 <value name="parataxis:appos">parataxis:appos</value> <!-- Italian -->
     	 <value name="parataxis:conj">parataxis:conj</value> <!-- Naija -->
     	 <value name="parataxis:coord">parataxis:coord</value> <!-- Beja -->
     	 <value name="parataxis:deletion">parataxis:deletion</value> <!-- Norwegian -->
     	 <value name="parataxis:discourse">parataxis:discourse</value> <!-- Italian, Naija, Slovenian, Turkish German, Ukrainian -->
     	 <value name="parataxis:dislocated">parataxis:dislocated</value> <!-- Naija -->
     	 <value name="parataxis:hashtag">parataxis:hashtag</value> <!-- Irish, Italian -->
     	 <value name="parataxis:insert">parataxis:insert</value> <!-- French, Italian, Polish -->
     	 <value name="parataxis:mod">parataxis:mod</value> <!-- Beja -->
     	 <value name="parataxis:newsent">parataxis:newsent</value> <!-- Ukrainian -->
     	 <value name="parataxis:nsubj">parataxis:nsubj</value> <!-- Italian -->
     	 <value name="parataxis:obj">parataxis:obj</value> <!-- Bambara, Italian, Polish -->
     	 <value name="parataxis:parenth">parataxis:parenth</value> <!-- French, Naija -->
     	 <value name="parataxis:rel">parataxis:rel</value> <!-- Ukrainian -->
     	 <value name="parataxis:rep">parataxis:rep</value> <!-- Chukchi, Latin, Mbya Guarani -->
     	 <value name="parataxis:restart">parataxis:restart</value> <!-- Slovenian -->
     	 <value name="parataxis:rt">parataxis:rt</value> <!-- Irish -->
     	 <value name="parataxis:sentence">parataxis:sentence</value> <!-- Irish -->
     	 <value name="parataxis:trans">parataxis:trans</value> <!-- Turkish German -->
     	 <value name="parataxis:url">parataxis:url</value> <!-- Irish -->
     	 <value name="parataxis">parataxis</value>
     	 <value name="punct">punctuation</value>
     	 <value name="remnant">Remnant ?</value> <!-- no doc, replace with orphan? -->
     	 <value name="reparandum">overridden disfluency</value>
     	 <value name="root">root</value>
     	 <value name="vocative:cl">vocative:cl</value> <!-- Ukrainian -->
     	 <value name="vocative:mention">vocative:mention</value> <!-- Irish, Italian -->
     	 <value name="vocative">vocative</value>
     	 <value name="xcomp:cleft">xcomp:cleft</value> <!-- Polish -->
     	 <value name="xcomp:ds">xcomp:ds</value> <!-- Erzya, Finnish, Karelian, Komi Permyak, Livvi -->
     	 <value name="xcomp:obj">xcomp:obj</value> <!-- North Sami, Polish -->
     	 <value name="xcomp:pred">xcomp:pred</value> <!-- Irish, Latin, Manx, North Sami, Polish, Scottish Gaelic -->
     	 <value name="xcomp:sp">xcomp:sp</value> <!-- Ukrainian -->
     	 <value name="xcomp:subj">xcomp:subj</value> <!-- Polish -->
     	 <value name="xcomp">open clausal complement</value>
     </feature>
     <feature name="coord" domain="NT" ></feature>
     <feature name="dom" domain="NT" ></feature>
     <feature name="type" domain="NT" >
     	 <value name="nV">élément non-verbal</value>
     	 <value name="VFin">finite verb</value>
     	 <value name="VInf">infinitive</value>
     	 <value name="VPar">participle</value>
     	 <value name="--">nil</value>
     </feature>
     <feature name="vform" domain="NT"></feature>
     <feature name="vlemma" domain="NT"></feature>
     <feature name="note" domain="NT"></feature>
     <feature name="snr" domain="NT"></feature>
     """;
     	 printf MASTER "$nt_features_header";
     	 printf MASTER """
     <edgelabel>
     	 <value name="D">dependency</value>
     	 <value name="L">lexical</value>
     	 <value name="R">relator</value>
     	 <value name="*">not bound</value>
     </edgelabel>
     <secedgelabel>
     	 <value name="cluster">between elements of GpCoo</value>
     	 <value name="coord">between members of Coo</value>
     	 <value name="dupl">between duplicated nodes</value>
     </secedgelabel>
     </annotation>
     </head>
     <body>
     """;
+    }
     //  <value name="M">main</value>
     //  <value name="P">part</value>
     // Appends the closing </body> and </corpus> tags to MASTER, terminating the
     // document whose <body> element is opened at the end of the master header
     // text written earlier in this file.
     // MASTER is not declared in this part of the file -- presumably the output
     // handle of the master corpus file; confirm against the code that opens it.
     def write_master_footer {
     MASTER << """</body>
     </corpus>
     """;
+    }
     // TEMP holds a raw XML fragment with four additional NT-domain feature
     // declarations: nodom, headpos, annotationFile and annotationUri.
     // NOTE(review): no read of TEMP is visible in this part of the file --
     // presumably these declarations are parked here until they are merged into
     // the master header; confirm, or remove the assignment.
     TEMP="""
     <feature name="nodom" domain="NT" ></feature>
     <feature name="headpos" domain="NT" ></feature>
     <feature name="annotationFile" domain="NT" ></feature>
     <feature name="annotationUri" domain="NT" ></feature>
     """;
     // Populates abbrev2cat, a lookup from dependency-relation label (for example
     // acl, nsubj, xcomp) to a human-readable English description.
     // The commented-out entries at the top of the body map abbreviations to French
     // category names (presumably the older SRCMF scheme -- confirm); they are
     // disabled and have no effect.
     // NOTE(review): the assignments still use Perl hash syntax
     // (abbrev2cat{key} = value) and abbrev2cat itself is not declared in the
     // visible part of the file.
     // NOTE(review): the description stored for obl starts with a space, and the
     // key cc-nc is spelled with a hyphen whereas the XML value list in this file
     // names that relation cc:nc -- confirm both are intended.
     def define_cat_hashes {
     //  abbrev2cat{"Apst"} = "Apostrophe";
     //  abbrev2cat{"AtObj"} = "AttributObjet";
     //  abbrev2cat{"AtRfc"} = "AttributReflechi";
     //  abbrev2cat{"AtSj"} = "AttributSujet";
     //  abbrev2cat{"AuxA"} = "Auxilie-Actif";
     //  abbrev2cat{"AuxP"} = "Auxilie-Passif";
     //  abbrev2cat{"Circ"} = "Circonstant";
     //  abbrev2cat{"Cmpl"} = "Complement";
     //  abbrev2cat{"GpCoo"} = "Coordonne";
     //  abbrev2cat{"Coo"} = "Coordination";
     //  abbrev2cat{"Det"} = "Determinant";
     //  abbrev2cat{"NgPrt"} = "Forclusif";
     //  abbrev2cat{"Insrt"} = "Incidente";
     //  abbrev2cat{"Intj"} = "Interjection";
     //  abbrev2cat{"ModA"} = "ModifieurAttache";
     //  abbrev2cat{"ModD"} = "ModifieurDetache";
     //  abbrev2cat{"Ng"} = "Negation";
     //  abbrev2cat{"VInf"} = "NoeudVerbal-Infinitif";
     //  abbrev2cat{"VPrt"} = "NoeudVerbal-Participe"; #?
     //  abbrev2cat{"VFin"} = "NoeudVerbal-Personnel";
     //  abbrev2cat{"nSnt"} = "NonPhrase";
     //  abbrev2cat{"Obj"} = "Objet";
     //  abbrev2cat{"Snt"} = "Phrase";
     //  abbrev2cat{"Pon"} = "Ponctuation";
     //  abbrev2cat{"Rfc"} = "Reflechi";
     //  abbrev2cat{"Rfx"} = "ReflexifRenforce";
     //  abbrev2cat{"RelC"} = "Relateur-Coordonnant";
     //  abbrev2cat{"RelNC"} = "Relateur-NonCoordonnant";
     //  abbrev2cat{"nMax"} = "StructureNonMaximale";
     //  abbrev2cat{"SjImp"} = "SujetImpersonnel";
     //  abbrev2cat{"SjPer"} = "SujetPersonnel";
     //  abbrev2cat{"Lac"} = "Lacune";
     //  abbrev2cat{"Aux"} = "Auxilie";
     //  abbrev2cat{"Regim"} = "Regime";
     abbrev2cat{"acl"} = "Clausal modifier of noun";
     abbrev2cat{"advcl"} = "Adverbial clause modifier";
     abbrev2cat{"advmod"} = "Adverbial modifier";
     abbrev2cat{"amod"} = "Adjectival modifier";
     abbrev2cat{"appos"} = "Appositional modifier";
     abbrev2cat{"aux"} = "Auxiliary";
     abbrev2cat{"cc-nc"} = "Coordinated conjunct : non coordonant";
     abbrev2cat{"cc"} = "Coordinating conjunction";
     abbrev2cat{"ccomp"} = "Clausal complement";
     abbrev2cat{"conj"} = "Conjunct";
     abbrev2cat{"cop"} = "Copula";
     abbrev2cat{"csubj"} = "Clausal subject";
     abbrev2cat{"det"} = "Determiner";
     abbrev2cat{"dislocated"} = "Dislocated elements";
     abbrev2cat{"expl"} = "Expletive";
     abbrev2cat{"iobj"} = "Indirect object";
     abbrev2cat{"mark"} = "Marker";
     abbrev2cat{"nmod"} = "Nominal modifier";
     abbrev2cat{"nsubj"} = "Nominal subject";
     abbrev2cat{"nummod"} = "Numeric modifier";
     abbrev2cat{"obj"} = "Object";
     abbrev2cat{"obl"} = " Oblique nominal";
     abbrev2cat{"orphan"} = "Remnant in ellipsis";
     abbrev2cat{"remnant"} = "Remnant ?";
     abbrev2cat{"vocative"} = "Vocative";
     abbrev2cat{"xcomp"} = "Open clausal complement";
+    }
     // Prints every element of the `words` list, one element per line.
     // `words` is not declared in this block; it is expected to be filled by the
     // caller before this function runs.
     // The loop stops at the last valid index: iterating up to and including
     // words.size() would read one element past the end of the list. The element
     // is interpolated with "${words[q]}" because a Perl-style "$words[$q]" inside
     // a Groovy string expands the whole list and the counter separately instead
     // of indexing the list.
     def print_sentence() {
     	for (def q = 0; q < words.size(); q++) {
     	  print "${words[q]}\n";
+    	}
+    }
     // Writes one <nt> (non-terminal) element for the current word `w` to XML.
     //   $_[0]  category label: emitted as the cat attribute and matched against
     //          $thisrootname to choose which extra feature string is emitted
     //   $_[1]  when it matches /dupl/, the node is written as a duplicate
     //          (ids get the suffix _dupl and no daughters are attached)
     // `$.` (Perl's current input line number) serves as the sentence number in
     // every id and in the snr attribute.
     // Output, in order: the <nt> start tag; an edge labelled L to terminal
     // s<snr>_<w><suffix>; for duplicates a secedge labelled dupl pointing at the
     // original terminal; for non-duplicates one edge per entry of `daughters`
     // (label from edge_label) plus an edge to the daughter's duplicate node when
     // `duplicates` flags it; finally the </nt> end tag.
     // NOTE(review): the body still relies on Perl constructs ($_[n], =~, !~, $.)
     // and on w, dom, type, vform, vlemma, daughters, nt_features and
     // nt_empty_features being set outside this block.
     def write_nonterminals {
     	 def print_nt_features;
     	 if ($_[1] =~ /dupl/) {
     	dupl = '_dupl';
     	 } else {
     	dupl = '';
+    	 }
     	 // option -x: only the root node carries the real feature values, every
     	 // other node gets the empty placeholders
     	 if (nt_features != '') {
     	print_nt_features = nt_empty_features;
     	if ($_[0] =~ /$thisrootname/) {
     	  print_nt_features = nt_features;
+    	}
+    	 }
     	 printf XML "   <nt id=\"n%d_%d%s\" cat=\"%s\" coord=\"--\" dom=\"%s\" type=\"%s\" vform=\"%s\" vlemma=\"%s\"%s note=\"%s\" snr=\"%d\">\n", $., w, dupl, $_[0], dom, type, vform, vlemma, print_nt_features, notes("$._$w"), $.;
     	 printf XML "  <edge idref=\"s%d_%d%s\" label=\"L\"/>\n", $., w, dupl;
     	 // link duplicate with primary original node
     	 if ($_[1] =~ /dupl/) {
     	printf XML "  <secedge idref=\"s%d_%d\" label=\"dupl\"/>\n", $., w;
+    	 }
     	 // if node is not a duplicate: attach all the daughters
     	 if ($_[1] !~ /dupl/) {
     	// strictly less than size(): index daughters.size() lies past the last
     	// daughter, so an inclusive bound would process a non-existent entry
     	for (def d = 0; d < daughters.size(); d++) {
     	  daughter = daughters[d];
     	  if ("$._$w" != "$._$daughter") { // avoid cycles
     	printf XML "  <edge idref=\"n%d_%d%s\" label=\"%s\"/>\n", $., daughter, dupl, edge_label("$._$daughter");
+    	  }
     	  // check if a duplicate of this node must be attached
     	  // NOTE(review): inside the string below, $daughter_dupl is read as ONE
     	  // variable named daughter_dupl, not as $daughter followed by _dupl --
     	  // confirm which key edge_label is meant to receive
     	  if (duplicates{"$._$daughter"} == 1) {
     	printf XML "  <edge idref=\"n%d_%d_dupl\" label=\"%s\"/>\n", $., daughter, edge_label("$._$daughter_dupl");
+    	  }
+    	}
+    	 }
     	 XML << "   </nt>\n";
+    }
     // Returns the edge label for the node whose key is passed as first argument
     // ($_[0]; the calls in write_nonterminals pass strings of the form
     // <sentence number>_<word number>):
     //   R  when the `relators` map holds 1 for that key
     //   D  otherwise
     // In the <edgelabel> declarations of this file R stands for relator and
     // D for dependency.
     def edge_label {
     	 if (relators{$_[0]} == 1) {
     	return 'R';
+    	 }
     	 return 'D';
+    }
     // Retrieves the note stored for the node whose key is passed as first
     // argument ($_[0]). Returns the stored text when the `notes` map holds a
     // non-empty entry for that key, otherwise the placeholder --.
     // write_nonterminals writes the result into the note attribute of <nt>.
     // NOTE(review): this function has the same name as the `notes` map it reads;
     // confirm that both still resolve as intended once the port away from Perl
     // (where subs and hashes live in separate namespaces) is complete.
     def notes {
     	 if (notes{$_[0]} != '') {
     	return "$notes{$_[0]}";
+    	 }
     	 return '--';
+    }
     // Normalises the fields of the current token in place; takes no arguments and
     // returns nothing. The variables word, pos, mor, ppos, pmor, lemma, plemma and
     // cat are not declared in this block and are expected to be set by the caller.
     // Substitutions performed (still written as Perl s/// operations):
     //   word            double quote -> single quote, ampersand -> (and),
     //                   doubled angle brackets -> guillemets, then any remaining
     //                   single angle bracket is removed
     //   pos, ppos       colon -> underscore
     //   mor, pmor       vertical bar -> underscore
     //   lemma, plemma   vertical bar -> underscore, angle brackets removed,
     //                   double quote -> single quote, ampersand -> (and)
     //   cat             first ROOT -> value of thisrootname, hyphen -> underscore,
     //                   RelNC_<rest> -> <rest>_RelNC
     // The order matters for word: the doubled angle brackets must be converted
     // before the single ones are stripped.
     // NOTE(review): presumably the quote, ampersand and bracket replacements exist
     // to keep the generated XML attribute values well-formed -- confirm.
     def clean_data {
     	// conversions of values, some necessary some for convenience
     	word =~ s/"/'/g;
     	word =~ s/\&/(and)/g;
     	word =~ s/<</«/g;
     	word =~ s/>>/»/g;
     	word =~ s/[<>]//g;
     	pos =~ s/:/_/g;
     	mor =~ s/\|/_/g;
     	ppos =~ s/:/_/g;
     	pmor =~ s/\|/_/g;
     	lemma =~ s/\|/_/g;
     	lemma =~ s/[<>]//g;
     	lemma =~ s/"/'/g;
     	lemma =~ s/\&/(and)/g;
     	plemma =~ s/\|/_/g;
     	plemma =~ s/[<>]//g;
     	plemma =~ s/"/'/g;
     	plemma =~ s/\&/(and)/g;
     	// clean categories
     	cat =~ s/ROOT/$thisrootname/;  // top node, for compatibility with SRCMF
     // cat =~ s/Ponctuation/Pon/;
     	// correct some bugs in parse
     // cat =~ s/Sujet/SjPer/;
     // cat =~ s/Modifieur/ModA/;
     // cat =~ s/Parenthese/Insrt/;
     	cat =~ s/\-/_/g;
     	cat =~ s/RelNC_(.*)/$1_RelNC/;  // RelNC always 2nd node, for consistency in duplicates
     	 return;
+    }

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 3346