Révision 3567

TXM/trunk/org.txm.conllu.core/.project (revision 3567)
1
<?xml version="1.0" encoding="UTF-8"?>
2
<projectDescription>
3
	<name>org.txm.conllu.core</name>
4
	<comment></comment>
5
	<projects>
6
	</projects>
7
	<buildSpec>
8
		<buildCommand>
9
			<name>org.eclipse.jdt.core.javabuilder</name>
10
			<arguments>
11
			</arguments>
12
		</buildCommand>
13
		<buildCommand>
14
			<name>org.eclipse.pde.ManifestBuilder</name>
15
			<arguments>
16
			</arguments>
17
		</buildCommand>
18
		<buildCommand>
19
			<name>org.eclipse.pde.SchemaBuilder</name>
20
			<arguments>
21
			</arguments>
22
		</buildCommand>
23
	</buildSpec>
24
	<natures>
25
		<nature>org.eclipse.jdt.groovy.core.groovyNature</nature>
26
		<nature>org.eclipse.pde.PluginNature</nature>
27
		<nature>org.eclipse.jdt.core.javanature</nature>
28
	</natures>
29
</projectDescription>
0 30

  
TXM/trunk/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/tigerXml-commentOutLongSentences.xsl (revision 3567)
1
<!-- The Identity Transformation -->
2
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:xs="http://www.w3.org/2001/XMLSchema">
3
  
4
  <!-- This stylesheet comments out the sentences containing more than 100 terminal nodes in Tiger XML files.
5
    To change the threshold, use the maxLength parameter.
6
  
7
  Written by A. Lavrentiev, CNRS, UMR IHRIM 2021-07-16
8
  Licence: GNU GPL v.3
9
  -->
10
  
11
  <!-- Whenever you match any node or any attribute -->
12
  <xsl:template match="node()|@*">
13
    <!-- Copy the current node -->
14
    <xsl:copy>
15
      <!-- Including any attributes it has and any child nodes -->
16
      <xsl:apply-templates select="@*|node()"/>
17
    </xsl:copy>
18
  </xsl:template>
19
  
20
  <xsl:param name="maxLength" as="xs:integer">100</xsl:param>
21
  
22
  <xsl:template match="s">
23
    <xsl:choose>
24
      <xsl:when test="count(graph/terminals/t) gt $maxLength">
25
        <xsl:comment>
26
          Sentence too long (<xsl:value-of select="count(graph/terminals/t)"/> tokens) :
27
          <xsl:for-each select="graph/terminals/t"><xsl:value-of select="@word"/><xsl:text> </xsl:text></xsl:for-each>
28
        </xsl:comment>
29
      </xsl:when>
30
      <xsl:otherwise>
31
        <xsl:copy-of select="."/>
32
      </xsl:otherwise>
33
    </xsl:choose>
34
  </xsl:template>
35
  
36
</xsl:stylesheet>
0 37

  
TXM/trunk/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 3567)
1
package org.txm.scripts.importer.conllu
2

  
3
import org.txm.Toolbox
4
import org.txm.importer.xtz.ImportModule;
5
import org.txm.metadatas.Metadatas
6
import org.txm.utils.io.FileCopy
7
import org.txm.utils.io.IOUtils
8
import org.txm.importer.xtz.*
9
import org.txm.scripts.importer.xtz.*
10
import org.txm.conllu.core.function.ImportCoNLLUAnnotations
11
import org.txm.conllu.core.preferences.UDPreferences
12
import org.txm.importer.ApplyXsl2;
13
import javax.xml.stream.*
14
import org.txm.utils.AsciiUtils
15
import org.txm.utils.ConsoleProgressBar
16
import org.txm.utils.FileUtils
17
import org.txm.conllu.core.preferences.UDPreferences
18
/**
19
 * Converts the CoNLL-U source files to XML-TEI (optionally splitting them first on their "# newdoc id" comments),
20
 * then delegates the rest of the import to the XTZ importer on the converted files.
21
 * 
22
 * 
23
 * @author mdecorde
24
 *
25
 */
26
class CoNLLUImporter extends XTZImporter {
27
	
28
	/**
	 * Creates the importer for the given import module.
	 *
	 * @param module the import module, forwarded unchanged to the parent constructor
	 */
	public CoNLLUImporter(ImportModule module) {
		super(module)
	}
31
	
32
	/**
	 * Prepares the CoNLL-U sources, then runs the parent import on the converted files.
	 *
	 * Steps:
	 * 1. when the IMPORT_USE_NEW_DOC_ID preference is "true", the input files are split
	 *    into the "conllu" directory (one file per "# newdoc id");
	 * 2. the CoNLL-U files are converted to XML-TEI in the "conllu2tei" directory;
	 * 3. inputDirectory is switched to that directory and super.process() is called.
	 *
	 * The method returns without calling super.process() when the split or the
	 * conversion step reports a failure.
	 */
	@Override
	public void process() {
		
		File connluSrcDirectory = inputDirectory
		
		// The preference is read as a String: parse it explicitly, because assigning a
		// String to a boolean uses Groovy truth and any non-empty value (even "false") is true.
		// THE conllu -> Tiger XSL MUST HAVE THE SAME BEHAVIOR BEFORE //
		boolean usenewdocid = Boolean.parseBoolean(UDPreferences.getInstance().getString(UDPreferences.IMPORT_USE_NEW_DOC_ID))
		
		if (usenewdocid) {
			connluSrcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu")
			connluSrcDirectory.deleteDir()
			connluSrcDirectory.mkdirs()
			
			// splitCoNLLUFiles prints its own "Splitting CoNLL-U files..." progress message
			if (!splitCoNLLUFiles(inputDirectory, connluSrcDirectory, project)) {
				return
			}
		}
		
		File srcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu2tei")
		srcDirectory.deleteDir()
		srcDirectory.mkdirs()
		
		println "Convert CoNLL-U to XML-TEI..."
		if (!convertCoNLLU2TEI(connluSrcDirectory, srcDirectory, project)) {
			return // the conversion already printed why it aborted
		}
		
		inputDirectory = srcDirectory // switch source directory
		
		super.process()
	}
60
	
61
	/**
	 * Splits the ".conllu" files of inputDirectory on their "# newdoc id = " comments.
	 *
	 * Each line is appended to the file named after the current document id in
	 * srcDirectory. Until the first "# newdoc id = " line is met, the current
	 * document id is the name of the input file without its extension.
	 * Files that do not end with ".conllu" are ignored.
	 *
	 * @param inputDirectory the directory containing the CoNLL-U files to split
	 * @param srcDirectory the directory receiving the split files (files are opened in append mode)
	 * @param project not used by this method
	 * @return false if inputDirectory cannot be listed, true otherwise
	 */
	public static def splitCoNLLUFiles(File inputDirectory, File srcDirectory, def project) {
		def files = inputDirectory.listFiles()
		
		if (files == null) {
			println "Aborting. No CONLL file found in $inputDirectory."
			return false
		}
		
		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
		
		println "Splitting CoNLL-U files..."
		for (File master : files) {
			
			cpb_texts.tick()
			
			if (!master.getName().endsWith(".conllu")) {
				continue
			}
			
			String current_text_id = FileUtils.stripExtension(master)
			def writer = new File(srcDirectory, current_text_id+".conllu").newWriter("UTF-8", true)
			
			try {
				master.eachLine("UTF-8") { line ->
					if (line.startsWith("# newdoc id = ")) {
						
						String text_id = line.substring("# newdoc id = ".length())
						if (!text_id.equals(current_text_id)) { // a new document starts: switch the output file
							writer.close()
							current_text_id = text_id
							writer = new File(srcDirectory, current_text_id+".conllu").newWriter("UTF-8", true)
						}
					}
					
					writer.println(line)
				}
			} finally {
				// always close the last opened writer, otherwise its buffered content
				// may never be flushed to disk
				writer.close()
			}
		}
		cpb_texts.done()
		return true
	}
103
	
104
	/**
	 * Converts the ".conllu" files of inputDirectory to XML-TEI files written in srcDirectory.
	 *
	 * For each file, the sentences are read (a sentence ends on an empty line, or when a
	 * token with id "1" is met) and one TEI file named after the input file is written.
	 * It contains "p" elements (paragraph id), "s" elements (sentence id, with the
	 * CoNLL-U comment lines as XML comments) and "w" elements whose attributes are the
	 * CoNLL-U columns prefixed with the UDPREFIX preference. Files without any sentence
	 * produce no output.
	 *
	 * When the KEEP_CONTRACTIONS preference is not "true", the tokens covered by a
	 * multi-word token line (id "N-M") are merged into that line.
	 *
	 * @param inputDirectory the directory containing the CoNLL-U files
	 * @param srcDirectory the directory receiving the XML-TEI files
	 * @param project the project whose UDPREFIX preference is read and updated
	 * @return false if inputDirectory cannot be listed, true otherwise
	 */
	public static def convertCoNLLU2TEI(File inputDirectory, File srcDirectory, def project) {
		
		def files = inputDirectory.listFiles()
		
		if (files == null) {
			println "Aborting. No CONLL file found in $inputDirectory."
			return false
		}
		
		def properties = Arrays.asList(ImportCoNLLUAnnotations.UD_PROPERTY_NAMES)
		
		String prefix = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX))
		
		UDPreferences.getInstance().setProjectPreferenceValue(project, UDPreferences.UDPREFIX, prefix) // copy the current preference into the corpus preference
		
		// The preference is read as a String: parse it explicitly, because assigning a
		// String to a boolean uses Groovy truth and any non-empty value (even "false") is true.
		boolean keepContractions = Boolean.parseBoolean(UDPreferences.getInstance().getString(UDPreferences.KEEP_CONTRACTIONS))
		
		def headPropertiesToProject = UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT).split(",") as Set
		
		def depsPropertiesToProject = UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT).split(",") as Set
		
		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
		
		println "Parsing CoNLL-U files..."
		for (File master : files) {
			
			cpb_texts.tick()
			
			if (!master.getName().endsWith(".conllu")) {
				continue
			}
			
			def content = [] // list of sentences, each one being [par_id, sent_id, words, comments]
			
			String text_id = FileUtils.stripExtension(master)
			String sent_id = ""
			String par_id = "1"
			def comments = []
			def words = []
			
			// stores the sentence being read (if it has words) and resets the reading state
			def storeSentence = {
				if (words.size() > 0) {
					content.add([par_id, sent_id, words, comments])
					
					sent_id = ""
					par_id = "1"
					comments = []
					words = []
				}
			}
			
			master.eachLine("UTF-8") { line ->
				
				if (line.startsWith("# newdoc id = ")) {
					// already set or ignored
				} else if (line.startsWith("# sent_id = ")) {
					sent_id = line.substring("# sent_id = ".length())
				} else if (line.startsWith("# newpar id = ")) {
					par_id = line.substring("# newpar id = ".length())
				} else if (line.startsWith("#")) {
					comments << line
				} else if (line.trim().isEmpty()) {
					storeSentence()
				} else {
					def split = line.split("\t")
					if (split.size() == properties.size()) { // lines without the expected number of columns are ignored
						HashMap<String, String> wProperties = new HashMap<String, String>()
						for (int i = 0 ; i < split.size() ; i++) {
							wProperties[properties[i]] = split[i]
						}
						
						if (wProperties.get("id").equals("1")) { // it's a new sentence, store the current if any and starts a new sentence
							storeSentence()
						}
						
						words << wProperties
					}
				}
			}
			
			// the last sentence is still pending when the file does not end with an empty line
			storeSentence()
			
			if (content.size() == 0) {
				continue
			}
			
			File xmlFile = new File(srcDirectory, text_id+".xml")
			BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(xmlFile))
			try {
				XMLOutputFactory factory = XMLOutputFactory.newInstance()
				XMLStreamWriter writer = factory.createXMLStreamWriter(output, "UTF-8")
				
				writer.writeStartDocument("UTF-8","1.0")
				writer.writeStartElement ("TEI")
				writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0")
				writer.writeNamespace("txm", "http://textometrie.org/1.0")
				writer.writeCharacters("\n")
				writer.writeStartElement ("teiHeader")
				writer.writeEndElement()
				writer.writeCharacters("\n")
				writer.writeStartElement ("text")
				
				writer.writeCharacters("\n")
				
				String current_par_id = null
				
				for (def sentence : content) { // for all sentences of the current text
					
					par_id = sentence[0]
					sent_id = sentence[1]
					words = sentence[2]
					comments = sentence[3]
					
					if (current_par_id == null || par_id != current_par_id) { // the paragraph changes
						if (current_par_id != null) {
							writer.writeEndElement() // p
						}
						writer.writeStartElement ("p")
						writer.writeAttribute("id", par_id)
						writer.writeCharacters("\n")
						
						current_par_id = par_id
					}
					
					writer.writeStartElement ("s")
					writer.writeAttribute("id", sent_id)
					writer.writeCharacters("\n")
					
					for (def comment : comments) {
						writer.writeComment(comment.replace("--", "―")) // "--" is not allowed in XML comments
						writer.writeCharacters("\n")
					}
					
					if (!keepContractions) { // merge properties in the "-" word and remove the parts
						mergeContractions(words, properties, text_id, par_id, sent_id)
					}
					
					if (headPropertiesToProject.size() > 0 || depsPropertiesToProject.size() > 0) {
						LinkedHashMap sentencehash = new LinkedHashMap()
						for (def word : words) {
							sentencehash[word["id"]] = word
						}
						ImportCoNLLUAnnotations.buildPropertiesProjections(sentencehash, headPropertiesToProject, depsPropertiesToProject)
					}
					
					for (def word : words) {
						
						writer.writeStartElement ("w")
						for (String p : word.keySet()) {
							if (p == "feats") word[p] = "|"+word[p]+"|"
							writer.writeAttribute(prefix+p, word[p])
						}
						
						writer.writeCharacters(word["form"])
						writer.writeEndElement() // w
						writer.writeCharacters(" ")
					}
					writer.writeCharacters("\n")
					writer.writeEndElement() // s
				}
				
				if (current_par_id != null) {
					writer.writeEndElement() // p
					writer.writeCharacters("\n")
				}
				
				writer.writeEndElement() // text
				writer.writeCharacters("\n")
				writer.writeEndElement() // TEI
				writer.close()
			} finally {
				output.close() // XMLStreamWriter.close() does not close the underlying stream
			}
		}
		
		cpb_texts.done()
		
		return true
	}
	
	/**
	 * Merges, in place, the tokens covered by each multi-word token line (id "N-M") of a sentence.
	 *
	 * The multi-word line keeps its "id" and "form"; each other property receives the
	 * values of the covered tokens joined with "+". The covered tokens are then removed
	 * from the words list. A multi-word line whose first or last covered token is missing
	 * is reported and left untouched.
	 *
	 * @param words the tokens of the sentence (maps of CoNLL-U property name to value), modified in place
	 * @param properties the CoNLL-U property names
	 * @param text_id text identifier, only used in the error message
	 * @param par_id paragraph identifier, only used in the error message
	 * @param sent_id sentence identifier, only used in the error message
	 */
	private static void mergeContractions(List words, List properties, String text_id, String par_id, String sent_id) {
		
		// index the tokens by their CoNLL-U id to find the parts of a contraction
		def tokensById = [:]
		for (def word : words) {
			tokensById[word["id"]] = word
		}
		
		for (int i = 0 ; i < words.size() ; i++) {
			def word = words[i]
			String id = word["id"]
			
			if (id == null || !id.matches("\\d+-\\d+")) { // not a multi-word line
				continue
			}
			
			int index = id.indexOf("-")
			String id1 = id.substring(0, index)
			String id2 = id.substring(index+1)
			def token1 = tokensById[id1]
			def token2 = tokensById[id2]
			
			if (token1 == null || token2 == null) {
				println "Error: text $text_id paragraph $par_id sent $sent_id word $id has wrong token ids $id1 and $id2 -> $token1 and $token2"
				continue
			}
			
			// a multi-word token may cover more than two tokens (e.g. "3-5")
			def parts = []
			for (int n = Integer.parseInt(id1) ; n <= Integer.parseInt(id2) ; n++) {
				def part = tokensById[Integer.toString(n)]
				if (part != null) {
					parts << part
				}
			}
			
			for (String p : properties) {
				if (p == "id" || p == "form") continue // keep the id and the form of the multi-word line
				word[p] = parts.collect { it[p] }.join("+")
			}
			
			// remove the merged parts; they follow the multi-word line so the index i stays valid
			words.removeAll { token -> parts.any { it.is(token) } }
		}
	}
321
	
322
}
TXM/trunk/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/conll2tiger-ud.pl.tdy (revision 3567)
1
#!/usr/bin/perl
2

  
3
use File::Basename;
4

  
5
# Script identity and default settings; the column numbers are 0-based (0 = column 1).
my $CMD      = "conll2tiger.pl";
6
my $VERSION  = "1.5";
7
my $MODIFIED = "8/12/2015";        # adapted for Perseus CoNLL generated with conll.pl -l.  CHECK: does SRCMF still work?
8

  
9
# columns (defaults were updated for the CoNLL-U SRCMF format, see the 2019-09-25 change note below)
10
my $coll   = 2;                    # lemma
11
my $colm   = 3;                    # morph (pos)
12
my $colf   = 5;                    # features
13
my $colh   = 6;                    # head
14
my $cold   = 7;                    # deprel
15
my $outdir = "conllexport";        # output directory (option -o)
16
my $split  = 1000;                 # split output after nr sentences
17

  
18
# tree structure
19
my %dominates          = ();
20
my %deprel             = ();                                         # deprel{nr} = deprel
21
my @daughters          = ();                                         # daughter nodes, stored in %dominates
22
my %duplicates         = my %relators = my %notes = my %aux = ();    # store nodes of duplicates, relators
23
my $type               = "--";                                       # node attribute
24
my $vform              = my $vlemma = "--";                          # node attributes for verbs store form and lemma
25
my $label              = "D";                                        # default edge label
26
my $nt_features_header = '';                                         # option -x
27
my $nt_features        = '';                                         # option -x
28
my $nt_empty_features  = '';                                         # option -x
29
my @scodes             = ();                                         # option -x
30
my $add_to_sentcode    = '';
31
my $rootname           = 'root';                                     # default
32
my $featcol            = 13;
33

  
34
######################################################################
35
#  conll2tiger.pl: converts CoNLL-U from the Universal Dependencies
36
#  treebanks to TigerXML
37
#          Achim Stein <achim.stein@ling.uni-stuttgart.de>
38
# License : GNU GPL v. 3 (see the LICENSE file)
39
######################################################################
40
# TO DO:
41
# - coordination
42
# - handling of overly long sentences that were split (conll.pl -r 100)
43
#   - for this, $wnr is used as the word ID instead of $w (the for-loop counter)
44
#   - but there are still unbound nodes when the governor has been removed (it is in the other part)
45
######################################################################
46
# Modifications by Alexei Lavrentiev <alexei.lavrentev@ens-lyon.fr>
47
# for Profiterole project (2019-2021)
48

  
49
# 2019-09-25
50
# - updated default column numbers for CONNL-U SRCMF format
51
# - added processing for comment lines
52
# - added @textid to terminal nodes
53
# - deleted ppos, pmor et plemma (predicted tags and lemmas)
54
# - replaced specific SRCMF with standard UD tags
55

  
56
# Update 2020-05-13
57
# - added @editionId for synchronization with BFM word ID
58

  
59
# Update 2021-03-22
60
# - using $infilename for @textid
61
# - added support for .conllu extension
62

  
63
# Update 2021-03-29
64
# - added editionId to declarations in main.xml
65

  
66
# Update 2021-07-16
67
# - added "punct" to cat values
68

  
69
# Update 2021-07-20
70
# - added cat value list compiled from
71
#   https://universaldependencies.org/ext-dep-index.html and the previous
72
#   version. All relation types and subtypes from the UD 2.8 corpora
73
#   should be there.
74
# - contractions indexed
75
######################################################################
76

  
77
my $HELP = "
78
==================================================================
79
$CMD $VERSION: Help
80
==================================================================
81
FUNKTION: converts CoNLL parser output to TigerXML (for mate tools)
82
          creates master file, splits input files, corrects unbound nodes
83
SYNTAX:      $CMD [Options] <CoNLL file>
84
OPTIONEN:
85
 -c          ignore coordination (delete coordx- prefix in deprel)
86
 -C str      corpus specials: nca
87
 -h          show help
88
 -o          create all files in this output directory (default: $outdir)
89
set COLUMNS for required info (0 = column 1, 1 = column 2, etc.)
90
 -D nr       colum for deprel default=$cold
91
 -H nr       colum for head default=$colh
92
 -M nr       colum for morphology (POS) default=$colm
93
 -F nr       colum for morph. features default=$colf
94
 -R str      Root category (default: $rootname)
95
 -s nr       split output files after each nr sentence (default = $split)
96
 -x str,...  include these attributes if present in the -X column of the first word
97
             (the first code is also copied into the sentence id)
98
 -X nr       the column where attributes are stored (default: $featcol)
99
EXAMPLE:
100
  - For mate parser output: no further options required
101
    $CMD parsed.conll
102
  - For Le Monde 2005: include attributes
103
    gunzip -c parsed.conll.gz | conll2tiger.pl -x date,artnr,rubr
104
  - For NCA:
105
    conll2tiger.pl -C nca -x id,deaf,titreDees,editionDees,manuscritDees,regionDees,coefficientRegionDees,dateMoyenneDees,codeRegional,coefficientRegional,vers,ponctuation,mots,passage,commentairePhilologique,qualite,sourceQualite,commentaireForme,auteur,dateComposition,dateManuscrit,lieuComposition,lieuManuscrit,sourceDateComposition,sourceDateManuscrit,sourceLieuComposition,sourceLieuManuscrit,genre,traditionTextuelle,analyses,lignes,editionNCA tagged-oldfrench-lrec2014-dep.conll
106
";
107

  
108
###########################################################################
109
#                    DO NOT MODIFY FOLLOWING CODE !
110
###########################################################################
111

  
112
###########################################################################
113
# parse the command line
114
###########################################################################
115

  
116
# Parse the command line options into the $opt_* globals and override the
# defaults declared at the top of the script.
#  - "-c" is a plain switch (see $HELP): declaring it as "c:" made it swallow
#    the next argument, i.e. the input file name.
#  - "-F" is documented in $HELP but was never declared nor applied to $colf.
use Getopt::Std;
getopts('cC:hD:F:H:M:o:R:s:x:X:');

if ( defined($opt_h) ) {
    print STDERR "$HELP";
    exit(0);
}

if ( defined($opt_o) ) {
    $outdir = $opt_o;
}
if ( defined($opt_C) ) {
    $corpus = $opt_C;
}
if ( defined($opt_D) ) {
    $cold = $opt_D;
}
if ( defined($opt_F) ) {
    $colf = $opt_F;
}
if ( defined($opt_H) ) {
    $colh = $opt_H;
}
if ( defined($opt_M) ) {
    $colm = $opt_M;
}

if ( defined($opt_R) ) {
    $rootname = $opt_R;
}

if ( defined($opt_s) ) {
    $split = $opt_s;
}

if ( defined($opt_X) ) {
    $featcol = $opt_X;
}

# -x: declare one NT feature per requested attribute code
if ( defined($opt_x) ) {
    @scodes = split( ",", $opt_x );
    for ( my $i = 0 ; $i <= $#scodes ; $i++ ) {
        $nt_features_header = $nt_features_header . sprintf( "<feature name=\"%s\" domain=\"NT\"></feature>\n", $scodes[$i] );
    }
    $nt_features_header =~ s/\bid\b/ncaid/;    # avoid reserved Tiger attribute "id"
}
159

  
160
my @colnames = ( "url", "ID", "FORM", "LEMMA", "PLEMMA", "POS", "PPOS", "FEAT", "PFEAT", "HEAD", "PHEAD", "DEPREL", "PDEPREL" );
161

  
162
# my %pos = %lemma = %form = %deprel = %head = %governs = %cat2abbrev = %abbrev2cat = %coordnr_cat = %coordnr_head = %w_coordnr = %w_head = %w_deprel = %first_coordnr = %coordnr_first = %verb_head = ();
163
my @coordelements = ();
164

  
165
my $id = my $form = my $lemma = my $plemma = my $pos = my $ppos = my $feat = my $pfeat = my $head = my $phead = my $deprel = my $pdeprel = my $edition_id = "";
166

  
167
my $timestamp = `date`;
168
chomp($timestamp);
169

  
170
# Derive the output base name from the input file name, create the output
# directory and open the first XML part, the log file and the master file.
my $infile = $ARGV[0];
$infile =~ s/\.conllu?$//i;    # strip only a trailing .conll/.conllu extension
if ( $infile eq '' ) {
    $infile = 'subcorpus';     # no file argument: the input is read from STDIN
}
my $counter = 1;
$suffix = sprintf( "%05d", $counter );
$infilename = basename($infile);

# create the output directory without going through the shell
unless ( -d $outdir ) {
    mkdir($outdir) or die "\ncannot create directory $outdir: $!\n";
}
open( XML,    '>', "$outdir/$infilename-$suffix.xml" ) or die "\nopen file error of $outdir/$infilename-$suffix.xml\n";
open( LOG,    '>', "$outdir/conversion.log" )          or die "\nopen file error of conversion.log\n";
open( MASTER, '>', "$outdir/main.xml" )                or die "\nopen file error of main.xml\n";
write_xml_header();
write_master_header();

# flush output for log and master file
# (MASTER stays the selected default output handle afterwards)
select(LOG);
$| = 1;
select(MASTER);
$| = 1;

$commandline = $0 . " " . ( join " ", @ARGV );
print LOG "$commandline\n\n";

print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";

$/ = "";    # treat empty line as RS
198
while (<>) {
199
    if ( $. % $split == 0 ) {
200
        print XML "</subcorpus>\n";
201
        close(XML);
202
        $suffix = sprintf( "%05d", ++$counter );
203
        open( XML, ">$outdir/$infilename-$suffix.xml" ) or die "\nopen file error\n";
204
        write_xml_header();
205
        print MASTER "<subcorpus name='$infilename-$suffix' external='file:$infilename-$suffix.xml'/>\n";
206
    }
207

  
208
    # ----------------------------------------
209
    # set root (or fake root if ROOT is missing)
210
    # ----------------------------------------
211
    $rootnode = $fakeroot = 0;    # m = Treat string as multiple lines, so that ^ matches beginning of line
212
    $thisrootname = $rootname;
213
    ($rootnode) = (/^(\d+?)\s.*?\b$rootname\b/m);    # real root marked by parser
214
    if ( $rootnode == 0 ) {
215

  
216
        #    ($rootnode) = (/^(\d+?)\t.*?\t-1\t0\t/m);  # no marked ROOT, but top node (head = 0)   TOO SPECIFIC
217
        #    ($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);  # no marked ROOT, but top node (head = 0) in col9 (original by Achim Stein)
218
        ($rootnode) = (/^(\d+?)\t.*?\t.*?\t.*?\t.*?\t.*?\t0\t/m);    # no marked ROOT, but top node (head = 0) in col7 (updated by AL)
219
        print LOG " Warning sentence $.: not marked root ($rootname), using top node $rootnode\n";
220
        $fakeroot     = 1;
221
        $thisrootname = 'nSnt';
222
    }
223
    if ( $rootnode == 0 ) {
224
        $rootnode = 1;                                               # set fake root if nothing goes
225
        print LOG " Error sentence $.: setting fake root to first word:\n$_\n";
226
        $fakeroot     = 2;
227
        $thisrootname = 'Err';
228
    }
229

  
230
    my @cols = ();
231
    @words     = split(/\n/);
232
    @terminals = ();
233

  
234
    %dominates = ();                                                 # empty at beginning of sentence
235
    %deprel    = ();                                                 # empty at beginning of sentence
236
    %aux       = ();                                                 # empty at beginning of sentence
237
    @daughters = ();
238

  
239
    my $commentlines = 0;                                            #added by AL
240

  
241
    #  my $contractions = 0; #added by AL
242
    #  my $text_id = "unknown_text";
243
    my $text_id = $infilename;
244
    my $sent_id = "0";
245

  
246
    # ----------------------------------------
247
    # loop through words #1: write tokens (terminal nodes) to XML file
248
    # store tree relevant information for loop #2
249
    # ----------------------------------------
250
    for ( my $w = 0 ; $w <= $#words ; $w++ ) {
251

  
252
        # Added by AL for comment lines
253
        if ( $words[$w] =~ /^#/ ) {
254
            if ( $words[$w] =~ /^# newdoc/ ) {
255
                $text_id = $words[$w];
256
                $text_id =~ s/# newdoc id = //;
257
            }
258
            elsif ( $words[$w] =~ /^# sent_id/ ) {
259
                $sent_id = $words[$w];
260
                $sent_id =~ s/# sent_id = //;
261
            }
262

  
263
            #	print LOG "Comment line loop 1: $words[$w]\n";
264
            $commentlines++;
265
            next;
266
        }
267

  
268
        # Added by AL for contractions
269
        elsif ( $words[$w] =~ /^\d+-\d+/ ) {
270

  
271
            #	print LOG "Contraction line loop 1: $words[$w]\n";
272
            $commentlines++;
273

  
274
            #	$contractions++;
275
            next;
276
        }
277
        else {
278
            if ( defined($opt_c) ) {
279
                $words[$w] =~ s/coord(\d+)-//g;
280
            }
281
            @cols       = split( /\t/, $words[$w] );
282
            $wnr        = $cols[0];
283
            $word       = $cols[1];
284
            $lemma      = $cols[2];
285
            $plemma     = $cols[2];                    # predicted
286
            $pos        = $cols[3];
287
            $ppos       = $cols[4];                    # predicted
288
            $mor        = $cols[5];
289
            $pmor       = $cols[5];                    # predicted
290
            $cat        = $cols[$cold];
291
            $edition_id = $cols[9];
292
            $edition_id =~ s/^.*XmlId=([^|]+).*$/\1/g;
293

  
294
            if ( $cat =~ /[<>]/ ) {
295
                print LOG "Warning in line $.: illegal node name: \"$cat\" -> \"Err2\"\n";
296
                $cat = 'Err2';
297
            }
298

  
299
            # NCA: enclose lemmas in underscores (easier for regex construction)
300
            if ( $corpus =~ /nca/i ) {
301
                $lemma = "_" . "$lemma" . "_";
302
            }
303

  
304
            clean_data();
305

  
306
            # get attribute-value pairs from col #13 of first word (option -x)
307
            if ( $opt_x == "all" ) {
308
                $cols[$featcol] = "all=" . $cols[$featcol];
309
            }
310
            if ( $w == 0 && $cols[$featcol] =~ /=/ ) {
311

  
312
                #      print STDERR "========== getting att-value for word $w: $cols[$featcol] scodes=@scodes\n";
313
                $nt_features = $nt_empty_features = '';
314

  
315
                #      while($cols[$featcol] =~ m/ (.*?)="([^"]*)"/gs) {   # quoted values
316
                while ( $cols[$featcol] =~ m/ ?([^=]*?)="?([^, ]+)\b"?\b/gs ) {    # maybe unquoted values (e.g. Le Monde 2005)
317
                    $att = $1;
318
                    $val = $2;
319

  
320
                    # pick the attributes that match those of the command line option -x
321
                    for ( my $t = 0 ; $t <= $#scodes ; $t++ ) {
322
                        if ( $att eq $scodes[$t] ) {
323
                            $val =~ s/\&/\&amp;/g;                                 #  replace "&" in values (appears in URLs)
324
                            if ( $t == 0 ) { $add_to_sentcode = "_$att$val"; }
325
                            $nt_features = $nt_features . " $att=\"$val\"";
326

  
327
                            #	    print STDERR "$./$w/$featcol: $cols[$featcol] --- nt_features: $nt_features\n";
328
                        }
329
                        if ( $att eq $scodes[$t] ) { $nt_empty_features = $nt_empty_features . " $att=\"--\""; }
330
                    }
331
                }
332

  
333
                # replace the reserved feature 'id' (Tiger)
334
                $add_to_sentcode =~ s/\bid=/ncaid=/;
335
                $nt_features =~ s/\bid=/ncaid=/;
336
                $nt_empty_features =~ s/\bid=/ncaid=/;
337
            }    # if col 13 contains attributes
338
            else {
339
                if ( defined($opt_x) && ( $w == 0 ) ) {
340
                    print STDERR "Warning: sentence=$.  option -x is defined, but no attribute=value declarations were found!\n";
341
                }
342
            }
343

  
344
            # store output for terminal node in array, output later. For double categories make a duplicate node.
345
            $tempid = sprintf( "%d_%d", $., $wnr );
346

  
347
#    push(@terminals, sprintf("      <t id=\"s%d_%d\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" ppos=\"%s\" pmor=\"%s\" plemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., $wnr, $word, $pos, $mor, $lemma, $ppos, $pmor, $plemma, $text_id, $edition_id));
348
            push( @terminals,
349
                sprintf( "      <t id=\"s%d_%d\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., $wnr, $word, $pos, $mor, $lemma, $text_id, $edition_id ) );
350
            if ( $cat =~ /_/ ) {
351

  
352
#      push(@terminals, sprintf("      <t id=\"s%d_%d_dupl\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" ppos=\"%s\" pmor=\"%s\" plemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., $wnr, "*", "_", "_", "_", "_", "_", "_", $text_id, $edition_id));
353
                push( @terminals,
354
                    sprintf( "      <t id=\"s%d_%d_dupl\" word=\"%s\" pos=\"%s\" mor=\"%s\" lemma=\"%s\" textid=\"%s\" editionId=\"%s\"/>\n", $., $wnr, "*", "_", "_", "_", $text_id, $edition_id ) );
355
                $duplicates{$tempid} = 1;    # store, check later to attach the duplicates to the mother
356
            }
357

  
358
            # associate Aux with main verb, to create an attribute in the verb node in loop #2 (TODO: more than one Aux)
359
            if ( $cat =~ /Aux/ ) {
360
                $aux{ $cols[$colh] } = "$word" . "_" . "$plemma";    # $aux{head} = word_lemma (of Aux)
361
            }
362

  
363
            # ----------------------------------------
364
            # store information needed for tree
365
            # ----------------------------------------
366
            # if fake rootnode == 1: nSnt as root node
367
            if ( ( $fakeroot == 1 ) && ( $w - $commentlines + 1 == $rootnode ) ) {
368
                $cat = 'nSnt';
369
                $notes{$tempid} = 'Warning no marked ROOT node in CoNLL';    # TODO: geht nicht
370
            }
371

  
372
            # if fake rootnode == 2: flatten structure: attach all words to the first word
373
            if ( ( $fakeroot == 2 ) && ( $w - $commentlines + 1 != $rootnode ) ) {
374
                $cols[$colh] = 1;
375
                $notes{$tempid} = 'Error neither ROOT node nor top node in CoNLL';
376
            }
377

  
378
            # correct unbound words in parser output (phead = 0, but not marked as ROOT)
379
            if ( ( $cols[$colh] eq "0" ) && ( $w - $commentlines + 1 != $rootnode ) ) {    #AL: added: -$commentlines
380
                printf LOG " Warning sentence $. ($tempid): unbound node %d (attached to root %d)\n", ( $w - $commentlines + 1 ), $rootnode;
381
                $cols[$colh]    = $rootnode;
382
                $cat            = 'Err';                                                   # let Err instead of deprel appear in dom attribute
383
                $notes{$tempid} = 'Warning unbound node in CoNLL';
384
            }
385

  
386
            # store for R edge labels
387
            if ( $cols[$cold] =~ /RelN?C/ ) {
388
                $relators{$tempid} = 1;
389
            }
390

  
391
            # store deprel for dom attribute
392
            $deprel{$tempid} = $cat;                                                       # $cols[$cold];
393
                                                                                           # if real root, add this node to daughter array, store array in hash dominates{head}{@daughters}
394
            if ( ( $fakeroot < 2 ) && ( $w - $commentlines + 1 != $rootnode ) ) {
395
                @daughters = @{ $dominates{ $cols[$colh] } };                              # get the array from the hash of the dominating node
396
                push( @daughters, $wnr );
397
                $dominates{ $cols[$colh] } = [@daughters];
398
            }
399
        }    # for each word loop #1
400
    }    # AL condition end
401

  
402
    # print graph code (needs root attribute) and terminal nodes
403
    if ( $rootnode == 0 ) {
404
        $noroot++;
405
        print LOG "Error sentence $. ($tempid): root node not found:\n$_\n";
406
        next;
407
    }
408
    else {
409
        printf XML "<s id=\"s%s%s\" textid=\"%s\" sentid=\"%s\">\n", $., $add_to_sentcode, $text_id, $sent_id;    # ids are passed as %s arguments so that a literal '%' inside them is not parsed as a printf directive
410
        print XML "  <graph root=\"n$._$rootnode\">\n";
411
        print XML "    <terminals>\n";
412
        for ( my $t = 0 ; $t <= $#terminals ; $t++ ) {
413
            print XML $terminals[$t];
414
        }
415
        print XML "    </terminals>\n";
416
    }
417

  
418
    # ----------------------------------------
419
    # loop through words #2 to build Tiger tree (non terminal nodes)
420
    # ----------------------------------------
421
    print XML "    <nonterminals>\n";
422
    for ( my $i = 0 ; $i <= $#words ; $i++ ) {
423

  
424
        #Added AL for comment lines
425
        if ( $words[$i] =~ /^#/ ) {
426

  
427
            #       print LOG "Comment line loop 2 : $words[$i]\n";
428
            next;
429
        }
430

  
431
        #Added AL for contractions
432
        if ( $words[$i] =~ /^\d+-\d+/ ) {
433

  
434
            #       print LOG "Contraction loop 2 : $words[$i]\n";
435
            next;
436
        }
437

  
438
        else {
439

  
440
            @cols = split( /\t/, $words[$i] );
441
            $w = $cols[0];
442
            ### TODO: redundant variable assignment (same as in loop #1)??
443
            $word   = $cols[1];
444
            $lemma  = $cols[2];
445
            $plemma = $cols[3];       # predicted
446
            $pos    = $cols[4];
447
            $ppos   = $cols[5];       # predicted
448
            $mor    = $cols[6];
449
            $pmor   = $cols[7];       # predicted
450
            $cat    = $cols[$cold];
451

  
452
            if ( $cat =~ /[<>]/ ) {
453
                print LOG "Warning in line $.: illegal node name: \"$cat\" -> \"Err2\"\n";    # filehandle is LOG (was misspelled 'Log', so the warning never reached the log file)
454
                $cat = 'Err2';
455
            }
456

  
457
            #  OF parser has not learned punctuation: set cat for punctuation (pos 'PON') to 'Pon'
458
            if ( ( $corpus =~ /nca/i ) && ( $pos eq 'PON' ) ) {
459
                $cols[$cold] = $cat = 'Pon';
460
            }
461

  
462
            clean_data();
463

  
464
            # retrieve daughters, make dom attribute (string of dominated nodes)
465
            @daughters = @{ $dominates{"$w"} };
466
            $dom       = '';
467
            for ( my $d = 0 ; $d <= $#daughters ; $d++ ) {
468
                $dom = $dom . "_" . $deprel{"$._$daughters[$d]"};
469
            }
470
            if ( $dom =~ /_/ ) {
471
                $dom =~ s/^_//;
472
            }
473
            else {
474
                $dom = '--';
475
            }
476

  
477
            # if verbal, set node attributes for verb form and lemma
478
            $type = "nV";
479
            $vform = $vlemma = "--";
480
            if ( $pos =~ /VER/ ) {    # AL: $ppos -> $pos
481
                if    ( $mor =~ /infi/ )       { $type = "VInf"; }    #AL: $pmor -> $mor
482
                elsif ( $pmor =~ /pper|ppre/ ) { $type = "VPar"; }
483
                else                           { $type = "VFin"; }
484

  
485
                # if Aux is present, create attribute for main verb
486
                if ( $aux{$w} =~ /(.*?)_(.*)/ ) {
487
                    $vform  = "$1";
488
                    $vlemma = "$2";
489
                }
490

  
491
                # else create attr for simple verb
492
                else {
493
                    $vform  = $word;
494
                    $vlemma = $lemma;    # AL: $plemma -> $lemma (always void in SRCMF)
495
                }
496

  
497
                # NCA: enclose lemmas in underscores (easier for regex construction)
498
                if ( $corpus =~ /nca/i ) {
499
                    $vlemma = "_" . "$vlemma" . "_";
500
                }
501
            }
502

  
503
            # call output function (twice for duplicate categories)
504
            if ( $cat =~ /(.*?)_(.*)/ ) {
505
                write_nonterminals( "$2", "" );         # RelNC is always node (see clean categories), function is duplicate, e.g. SjPer_RelNC
506
                write_nonterminals( "$1", "_dupl" );    # other category is duplicate
507
            }
508
            else {
509
                write_nonterminals($cat);
510
            }
511
        }    # for words
512
    }    #AL end condition
513

  
514
    print XML "    </nonterminals>\n";
515
    print XML "  </graph>\n";
516
    print XML "</s>\n";
517
    if ( $. % 100 == 0 ) { print STDERR "\b\b\b\b\b\b\b\b"; printf STDERR "%08d", $.; }
518
}    # main
519
print XML "</subcorpus>\n";    # close the <subcorpus> element opened by write_xml_header
520
print STDERR "\n$CMD: $. sentences converted. Results in $outdir. Log in $outdir/conversion.log.\n";    # final summary; $. is reported as the sentence count
521
print STDERR "   Hint 1: on OS X convert master file to MacRoman, e.g  iconv -f latin1 -t macroman\n";
522
print STDERR "   Hint 2: use tiger.pl -c <Tiger XML file> to detect unbound nodes.\n";
523
print STDERR "   Hint 3: build reliable feature declarations using tiger.sh\n";
524
print STDERR "           tiger.sh -a \"lemma word pos ppos\"  (for terminals)\n";
525
print STDERR "           tiger.sh -A \"lemma word pos ppos\"  (for non-terminals)\n";
526
if ( $noroot > 0 ) { print STDERR "$noroot sentences ignored: root not found (see log file)\n"; }    # $noroot is incremented in the main loop for each sentence skipped because $rootnode == 0
527
write_master_footer();    # NOTE(review): defined outside this view; presumably closes the master document started by write_master_header - confirm
528
close(MASTER);
529
close(XML);
530
close(LOG);
531

  
532
exit;
533

  
534
# ----------------------------------------
535
# sub
536
# ----------------------------------------
537

  
538
sub write_xml_header {    # writes the XML declaration and the opening <subcorpus> tag (name = "$infilename-$suffix") to the XML filehandle; takes no arguments, reads the globals $infilename and $suffix
539
    print XML "<?xml version=\"1.0\" encoding=\"UTF-8\"?>
540
  <subcorpus name=\"$infilename-$suffix\">
541
";
542
}
543

  
544
sub write_master_header {
545
    printf MASTER '<?xml version="1.0" encoding="UTF-8"?>
546
';
547

  
548
    printf MASTER "<corpus id=\"$corpus\">
549
<head>
550
  <meta><name>$corpus</name> 
551
    <author>ILR Stuttgart</author> 
552
    <date></date> 
553
    <description>Parsed with mate tools using a SRCMF-based grammar model (http://srcmf.org). </description> 
554
    <format>SRCMF</format>
555
    <history>TigerXML converted by conll2tiger.pl</history>
556
  </meta>
557
";
558

  
559
    #  printf MASTER '<annotation>
560
    #<feature name="word" domain="T" ></feature>
561
    #<feature name="pos" domain="T" ></feature>
562
    #<feature name="mor" domain="T" ></feature>
563
    #<feature name="lemma" domain="T" ></feature>
564
    #<feature name="ppos" domain="T" ></feature>
565
    #<feature name="pmor" domain="T" ></feature>
566
    #<feature name="plemma" domain="T" ></feature>
567
    #<feature name="cat" domain="NT" >
568
    #  <value name="Apst">apostrophe</value>
569
    #  <value name="AtObj">attribut d objet</value>
570
    #  <value name="AtRfc">attribut réfléchi</value>
571
    #  <value name="AtSj">attribut de sujet</value>
572
    #  <value name="AttributReflechi">attribut réfléchi</value>
573
    #  <value name="Aux">auxilié</value>
574
    #  <value name="AuxA">auxilié actif</value>
575
    #  <value name="AuxP">auxilié passif</value>
576
    #  <value name="Circ">circonstant</value>
577
    #  <value name="Circ_RelNC">circonstant pronom relatif</value>
578
    #  <value name="Cmpl">complément</value>
579
    #  <value name="Cmpl_RelNC">complément pronom relatif</value>
580
    #  <value name="Coo">coordination</value>
581
    #  <value name="Det">déterminant</value>
582
    #  <value name="Err">unbound node in CoNLL input</value>
583
    #  <value name="Err2">illegal node name was replaced</value>
584
    #  <value name="GpCoo">coordonné</value>
585
    #  <value name="Ignorer">Ignorer</value>
586
    #  <value name="Insrt">incidente</value>
587
    #  <value name="Intj">interjection</value>
588
    #  <value name="Lac">lacune</value>
589
    #  <value name="ModA">modifieur attaché</value>
590
    #  <value name="ModD">modifieur détaché</value>
591
    #  <value name="Ng">négation</value>
592
    #  <value name="NgPrt">forclusif</value>
593
    #  <value name="Obj">objet</value>
594
    #  <value name="Obj_RelNC">direct object pronom relatif</value>
595
    #  <value name="Pon">ponctuation</value>
596
    #  <value name="PON">ponctuation</value>
597
    #  <value name="Regim">régime</value>
598
    #  <value name="RelC">relateur coordonnant</value>
599
    #  <value name="RelNC">relateur non coordonnant</value>
600
    #  <value name="Rfc">réfléchi</value>
601
    #  <value name="Rfx">réfléxif renforcé</value>
602
    #  <value name="SjImp">sujet impersonnel</value>
603
    #  <value name="SjPer">sujet personnel</value>
604
    #  <value name="SjPer_RelNC">sujet personnel pronom relatif</value>
605
    #  <value name="Snt">phrase</value>
606
    #  <value name="ROOT">phrase</value>
607
    #  <value name="StructureMaximale">structure maximale</value>
608
    #  <value name="VFin">verbe fini</value>
609
    #  <value name="VInf">verbe infinitif</value>
610
    #  <value name="nMax">structure non-maximale</value>
611
    #  <value name="nSnt">non-phrase</value>
612
    #</feature>
613
    #<feature name="coord" domain="NT" ></feature>
614
    #<feature name="dom" domain="NT" ></feature>
615
    #<feature name="type" domain="NT" >
616
    #  <value name="nV">élément non-verbal</value>
617
    #  <value name="VFin">verbe fini</value>
618
    #  <value name="VInf">verbe infinitif</value>
619
    #  <value name="VPar">verbe participial</value>
620
    #  <value name="--">nil</value>
621
    #</feature>
622
    #<feature name="vform" domain="NT"></feature>
623
    #<feature name="vlemma" domain="NT"></feature>
624
    #<feature name="note" domain="NT"></feature>
625
    #<feature name="snr" domain="NT"></feature>
626
    #';
627

  
628
    printf MASTER '<annotation>
629
<feature name="word" domain="T" ></feature>
630
<feature name="pos" domain="T" ></feature>
631
<feature name="mor" domain="T" ></feature>
632
<feature name="lemma" domain="T" ></feature>
633
<feature name="textid" domain="T" ></feature>
634
<feature name="editionId" domain="T" ></feature>
635
<feature name="cat" domain="NT" >
636
  <value name="__UNDEF__">UNDEFINED !!!</value>
637
  <value name="acl:adv">acl:adv</value> <!-- Ukrainian -->
638
  <value name="acl:attr">acl:attr</value> <!-- Chukchi -->
639
  <value name="acl:cleft">acl:cleft</value> <!-- Norwegian, Swedish -->
640
  <value name="acl:fixed">acl:fixed</value> <!-- Beja -->
641
  <value name="acl:inf">acl:inf</value> <!-- Portuguese -->
642
  <value name="acl:relat">acl:relat</value> <!-- Chukchi -->
643
  <value name="acl:relcl">relative clause modifier</value> <!-- Akkadian, Albanian, Apurina, Arabic, Armenian, Assyrian, Beja, Belarusian, Breton, Bulgarian, Chinese, Czech, Danish, Dutch, English, Erzya, Estonian, Faroese, Finnish, French, German, Greek, Hebrew, Hindi, Hindi English, Icelandic, Indonesian, Irish, Italian, Karelian, Kazakh, Komi Permyak, Komi Zyrian, Korean, Latin, Lithuanian, Livvi, Manx, Marathi, Moksha, Naija, North Sami, Norwegian, Old East Slavic, Old French, Persian, Polish, Portuguese, Russian, Sanskrit, Scottish Gaelic, Slovak, Spanish, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Telugu, Thai, Ukrainian, Urdu, Welsh, Western Armenian, Wolof -->
644
  <value name="acl">clausal modifier of noun (adnominal clause)</value>
645
  <value name="advcl:abs">advcl:abs</value> <!-- Latin -->
646
  <value name="advcl:cau">advcl:cau</value> <!-- Moksha -->
647
  <value name="advcl:cleft">advcl:cleft</value> <!-- French, Naija -->
648
  <value name="advcl:cmpr">advcl:cmpr</value> <!-- Latin, Polish -->
649
  <value name="advcl:cond">advcl:cond</value> <!-- Tamil, Telugu, Uyghur -->
650
  <value name="advcl:coverb">advcl:coverb</value> <!-- Cantonese -->
651
  <value name="advcl:eval">advcl:eval</value> <!-- Komi Zyrian -->
652
  <value name="advcl:lcl">advcl:lcl</value> <!-- Komi Permyak -->
653
  <value name="advcl:lto">advcl:lto</value> <!-- Komi Zyrian -->
654
  <value name="advcl:mcl">advcl:mcl</value> <!-- Komi Permyak -->
655
  <value name="advcl:pred">advcl:pred</value> <!-- Latin -->
656
  <value name="advcl:relcl">advcl:relcl</value> <!-- Polish, Western Armenian -->
657
  <value name="advcl:sp">advcl:sp</value> <!-- Ukrainian -->
658
  <value name="advcl:svc">advcl:svc</value> <!-- Ukrainian -->
659
  <value name="advcl:tcl">advcl:tcl</value> <!-- Apurina, Erzya, Komi Permyak, Komi Zyrian, Moksha, Romanian, Skolt Sami -->
660
  <value name="advcl">adverbial clause modifier</value>
661
  <value name="advmod:arg">advmod:arg</value> <!-- Polish -->
662
  <value name="advmod:cau">advmod:cau</value> <!-- Erzya, Komi Zyrian, Moksha -->
663
  <value name="advmod:comp">advmod:comp</value> <!-- Erzya -->
664
  <value name="advmod:deg">advmod:deg</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami -->
665
  <value name="advmod:det">advmod:det</value> <!-- Ukrainian -->
666
  <value name="advmod:df">advmod:df</value> <!-- Cantonese, Chinese -->
667
  <value name="advmod:emph">emphasizing word, intensifier</value> <!-- Akkadian, Arabic, Armenian, Catalan, Chukchi, Croatian, Czech, Indonesian, Komi Zyrian, Latin, Lithuanian, Polish, Sanskrit, Slovak, South Levantine Arabic, Tamil, Turkish, Turkish German, Upper Sorbian, Uyghur, Western Armenian -->
668
  <value name="advmod:eval">advmod:eval</value> <!-- Erzya, Komi Zyrian, Moksha, Skolt Sami -->
669
  <value name="advmod:fixed">advmod:fixed</value> <!-- Beja -->
670
  <value name="advmod:foc">advmod:foc</value> <!-- Erzya, Komi Zyrian, Moksha, Skolt Sami -->
671
  <value name="advmod:freq">advmod:freq</value> <!-- Komi Zyrian, Moksha -->
672
  <value name="advmod:lfrom">advmod:lfrom</value> <!-- Erzya, Komi Zyrian, Moksha -->
673
  <value name="advmod:lmod">locative adverbial modifier</value> <!-- Apurina, Danish, Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami -->
674
  <value name="advmod:lmp">advmod:lmp</value> <!-- Erzya, Komi Zyrian -->
675
  <value name="advmod:locy">advmod:locy</value> <!-- Hungarian -->
676
  <value name="advmod:lto">advmod:lto</value> <!-- Erzya, Komi Zyrian, Moksha -->
677
  <value name="advmod:mmod">advmod:mmod</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami -->
678
  <value name="advmod:mode">advmod:mode</value> <!-- Hungarian -->
679
  <value name="advmod:neg">advmod:neg</value> <!-- Apurina, Buryat, Kiche, Kurmanji, Latin, Maltese, Polish, Skolt Sami -->
680
  <value name="advmod:obl">adverbial modifier + oblique nominal</value> <!-- Old French -->
681
  <value name="advmod:que">advmod:que</value> <!-- Hungarian -->
682
  <value name="advmod:tfrom">advmod:tfrom</value> <!-- Hungarian -->
683
  <value name="advmod:tlocy">advmod:tlocy</value> <!-- Hungarian -->
684
  <value name="advmod:tmod">advmod:tmod</value> <!-- Apurina, Erzya, Komi Permyak, Komi Zyrian, Moksha, Romanian, Skolt Sami -->
685
  <value name="advmod:to">advmod:to</value> <!-- Hungarian -->
686
  <value name="advmod:tto">advmod:tto</value> <!-- Hungarian -->
687
  <value name="advmod">adverbial modifier</value>
688
  <value name="amod:att">amod:att</value> <!-- Hungarian -->
689
  <value name="amod:attlvc">amod:attlvc</value> <!-- Hungarian -->
690
  <value name="amod:flat">amod:flat</value> <!-- Polish -->
691
  <value name="amod">adjectival modifier</value>
692
  <value name="appos:trans">appos:trans</value> <!-- Turkish German -->
693
  <value name="appos">appositional modifier</value>
694
  <value name="aux:aff">aux:aff</value> <!-- Beja -->
695
  <value name="aux:aspect">aux:aspect</value> <!-- Komi Zyrian -->
696
  <value name="aux:caus">aux:caus</value> <!-- Armenian, French, Western Armenian -->
697
  <value name="aux:clitic">aux:clitic</value> <!-- Polish -->
698
  <value name="aux:cnd">aux:cnd</value> <!-- Erzya, Komi Permyak, Komi Zyrian, Polish -->
699
  <value name="aux:ex">aux:ex</value> <!-- Armenian, Western Armenian -->
700
  <value name="aux:imp">aux:imp</value> <!-- Erzya, Polish -->
701
  <value name="aux:nec">aux:nec</value> <!-- Komi Zyrian, Moksha, Skolt Sami -->
702
  <value name="aux:neg">aux:neg</value> <!-- Chukchi, Erzya, Komi Permyak, Komi Zyrian, Maltese, Moksha, North Sami, Skolt Sami, Tamil -->
703
  <value name="aux:opt">aux:opt</value> <!-- Erzya, Moksha -->
704
  <value name="aux:part">aux:part</value> <!-- Maltese -->
705
  <value name="aux:pass">passive auxilary</value> <!-- Afrikaans, Ancient Greek, Arabic, Assyrian, Belarusian, Bhojpuri, Breton, Bulgarian, Buryat, Chinese, Czech, Dutch, English, Faroese, Finnish, French, Frisian Dutch, Galician, German, Hindi, Italian, Kangri, Karelian, Latin, Latvian, Lithuanian, Maltese, Marathi, Norwegian, Old Church Slavonic, Old East Slavic, Old French, Persian, Polish, Portuguese, Romanian, Russian, Slovak, Spanish, Swedish, Swiss German, Tamil, Thai, Turkish German, Upper Sorbian, Vietnamese -->
706
  <value name="aux:pot">aux:pot</value> <!-- Komi Zyrian -->
707
  <value name="aux:q">aux:q</value> <!-- Erzya, Turkish, Turkish German -->
708
  <value name="aux:tense">aux:tense</value> <!-- French, Komi Zyrian, Skolt Sami -->
709
  <value name="aux">auxiliary</value>
710
  <value name="case:acc">case:acc</value> <!-- Hebrew -->
711
  <value name="case:adv">case:adv</value> <!-- Indonesian -->
712
  <value name="case:aff">case:aff</value> <!-- Beja -->
713
  <value name="case:det">preposition with determiner</value> <!-- Maltese, Old French -->
714
  <value name="case:gen">case:gen</value> <!-- Hebrew -->
715
  <value name="case:loc">case:loc</value> <!-- Armenian, Cantonese, Chinese, Western Armenian -->
716
  <value name="case:pred">case:pred</value> <!-- Welsh -->
717
  <value name="case:voc">case:voc</value> <!-- Irish, Scottish Gaelic -->
718
  <value name="case">case marking</value>
719
  <value name="cc:nc">cc:nc</value> <!-- Old French -->
720
  <value name="cc:nc">Coordinated conjunct : non coordonant</value>
721
  <value name="cc:preconj">cc:preconj</value> <!-- Arabic, English, Erzya, Estonian, Faroese, Finnish, German, Indonesian, Komi Permyak, Komi Zyrian, Moksha, North Sami, Persian, Polish, Portuguese, Romanian, Slovenian, Spanish, Thai, Turkish -->
722
  <value name="cc:preconj">preconjunct</value>
723
  <value name="cc">Coordinating conjunction</value>
724
  <value name="cc">coordinating conjunction</value>
725
  <value name="ccomp:cleft">ccomp:cleft</value> <!-- Polish -->
726
  <value name="ccomp:obj">ccomp:obj</value> <!-- Hungarian, Polish -->
727
  <value name="ccomp:obl">ccomp:obl</value> <!-- Hungarian -->
728
  <value name="ccomp:pmod">ccomp:pmod</value> <!-- Romanian -->
729
  <value name="ccomp:pred">ccomp:pred</value> <!-- Hungarian -->
730
  <value name="ccomp">clausal complement</value>
731
  <value name="clf">classifier</value>
732
  <value name="compound:a">compound:a</value> <!-- Indonesian -->
733
  <value name="compound:affix">compound:affix</value> <!-- Hebrew -->
734
  <value name="compound:dir">compound:dir</value> <!-- Cantonese, Chinese -->
735
  <value name="compound:ext">compound:ext</value> <!-- Cantonese, Chinese -->
736
  <value name="compound:lvc">compound:lvc</value> <!-- Armenian, Hindi, Kazakh, Khunsari, Korean, Kurmanji, Marathi, Nayini, Persian, Soi, Tamil, Telugu, Turkish, Turkish German, Uyghur, Western Armenian -->
737
  <value name="compound:lvc">light verb construction</value>
738
  <value name="compound:nn">compound:nn</value> <!-- Finnish, Livvi, North Sami -->
739
  <value name="compound:preverb">compound:preverb</value> <!-- Hungarian -->
740
  <value name="compound:prt">compound:prt</value> <!-- Afrikaans, Arabic, Danish, Dutch, English, Erzya, Estonian, Faroese, Finnish, Frisian Dutch, German, Icelandic, Irish, Karelian, Komi Permyak, Naija, Norwegian, Persian, Spanish, Swedish, Swedish Sign Language, Swiss German, Tamil, Thai, Turkish German, Wolof, Yoruba -->
741
  <value name="compound:prt">phrasal verb particle</value>
742
  <value name="compound:quant">compound:quant</value> <!-- Cantonese -->
743
  <value name="compound:redup">reduplicated compounds</value> <!-- Armenian, Bambara, Classical Chinese, Erzya, Hindi, Kurmanji, Marathi, Naija, Tagalog, Tamil, Telugu, Turkish, Turkish German, Uyghur, Welsh, Western Armenian -->
744
  <value name="compound:smixut">compound:smixut</value> <!-- Hebrew -->
745
  <value name="compound:svc">serial verb compounds</value> <!-- Amharic, Armenian, Marathi, Mbya Guarani, Naija, Swedish Sign Language, Telugu, Ukrainian, Western Armenian, Wolof, Yoruba -->
746
  <value name="compound:vo">compound:vo</value> <!-- Cantonese, Chinese -->
747
  <value name="compound:vv">compound:vv</value> <!-- Cantonese, Chinese -->
748
  <value name="compound">compound</value>
749
  <value name="conj:expl">conj:expl</value> <!-- Latin -->
750
  <value name="conj:extend">conj:extend</value> <!-- Slovenian -->
751
  <value name="conj:svc">conj:svc</value> <!-- Ukrainian -->
752
  <value name="conj">conjunct</value>
753
  <value name="cop:expl">cop:expl</value> <!-- Maltese -->
754
  <value name="cop:locat">cop:locat</value> <!-- Polish -->
755
  <value name="cop:own">cop:own</value> <!-- Finnish, Karelian, Livvi, Marathi -->
756
  <value name="cop">copula</value>
757
  <value name="csubj:cleft">csubj:cleft</value> <!-- Irish, Latin, Manx, Scottish Gaelic -->
758
  <value name="csubj:cop">csubj:cop</value> <!-- Erzya, Estonian, Finnish, Irish, Komi Zyrian, Livvi, Manx, Moksha, Scottish Gaelic, Turkish -->
759
  <value name="csubj:pass">clausal passive subject</value> <!-- Albanian, Amharic, Ancient Greek, Arabic, Armenian, Belarusian, Bulgarian, Catalan, Chinese, Classical Chinese, Czech, English, French, German, Gothic, Greek, Indonesian, Italian, Korean, Latin, Latvian, Lithuanian, Norwegian, Old Church Slavonic, Old East Slavic, Polish, Portuguese, Romanian, Russian, Sanskrit, Slovak, Spanish, Swedish, Western Armenian -->
760
  <value name="csubj">clausal subject</value>
761
  <value name="dep:aff">dep:aff</value> <!-- Beja -->
762
  <value name="dep:agr">dep:agr</value> <!-- Kiche -->
763
  <value name="dep:alt">dep:alt</value> <!-- Upper Sorbian -->
764
  <value name="dep:ana">dep:ana</value> <!-- Yupik -->
765
  <value name="dep:aux">dep:aux</value> <!-- Yupik -->
766
  <value name="dep:comp">dep:comp</value> <!-- Beja, French -->
767
  <value name="dep:conj">dep:conj</value> <!-- Beja -->
768
  <value name="dep:cop">dep:cop</value> <!-- Yupik -->
769
  <value name="dep:emo">dep:emo</value> <!-- Yupik -->
770
  <value name="dep:infl">dep:infl</value> <!-- Yupik -->
771
  <value name="dep:mark">dep:mark</value> <!-- Yupik -->
772
  <value name="dep:mod">dep:mod</value> <!-- Mbya Guarani -->
773
  <value name="dep:pos">dep:pos</value> <!-- Yupik -->
774
  <value name="dep:redup">dep:redup</value> <!-- Beja -->
775
  <value name="dep:ss">dep:ss</value> <!-- Kiche -->
776
  <value name="dep">unspecified dependency</value>
777
  <value name="det:adj">det:adj</value> <!-- Albanian -->
778
  <value name="det:noun">det:noun</value> <!-- Albanian -->
779
  <value name="det:numgov">pronominal quantifier governing the case of the noun</value> <!-- Czech, Polish, Serbian, Slovak, Ukrainian, Upper Sorbian -->
780
  <value name="det:nummod">pronominal quantifier agreeing in case with the noun</value> <!-- Czech, Polish, Ukrainian -->
781
  <value name="det:poss">possessive determiner</value> <!-- Akkadian, Armenian, German, Italian, Korean, Polish, Portuguese, Western Armenian -->
782
  <value name="det:predet">det:predet</value> <!-- English, Italian, Persian -->
783
  <value name="det:pron">det:pron</value> <!-- Albanian -->
784
  <value name="det:rel">det:rel</value> <!-- Bambara -->
785
  <value name="det">determiner</value>
786
  <value name="discourse:emo">discourse:emo</value> <!-- Irish, Italian, Polish -->
787
  <value name="discourse:filler">discourse:filler</value> <!-- Norwegian, Slovenian -->
788
  <value name="discourse:intj">discourse:intj</value> <!-- Polish -->
789
  <value name="discourse:sp">discourse:sp</value> <!-- Cantonese, Chinese, Classical Chinese -->
790
  <value name="discourse">discourse element</value>
791
  <value name="dislocated:cleft">dislocated:cleft</value> <!-- Mbya Guarani -->
792
  <value name="dislocated:csubj">dislocated:csubj</value> <!-- Latin -->
793
  <value name="dislocated:nsubj">dislocated:nsubj</value> <!-- Latin -->
794
  <value name="dislocated:obj">dislocated:obj</value> <!-- Latin -->
795
  <value name="dislocated:subj">dislocated:subj</value> <!-- Beja -->
796
  <value name="dislocated">dislocated elements</value>
797
  <value name="expl:comp">expl:comp</value> <!-- French -->
798
  <value name="expl:impers">impersonal expletive</value> <!-- Italian, Polish, Romanian, Spanish -->
799
  <value name="expl:pass">reflexive pronoun used in reflexive passive</value> <!-- Catalan, Czech, French, Italian, Latin, Portuguese, Romanian, Slovak, Spanish, Upper Sorbian -->
800
  <value name="expl:poss">expl:poss</value> <!-- Romanian -->
801
  <value name="expl:pv">reflexive clitic with an inherently reflexive verb</value> <!-- Czech, Dutch, German, Old East Slavic, Polish, Portuguese, Romanian, Slovak, Spanish, Turkish German, Upper Sorbian -->
802
  <value name="expl:subj">expl:subj</value> <!-- French, Naija -->
803
  <value name="expl">expletive</value>
804
  <value name="fixed">fixed multiword expression</value>
805
  <value name="flat:abs">flat:abs</value> <!-- Ukrainian -->
806
  <value name="flat:dist">flat:dist</value> <!-- Western Armenian -->
807
  <value name="flat:foreign">foreign words</value> <!-- Arabic, Belarusian, Buryat, Chinese, Chukchi, Croatian, Czech, English, Estonian, Faroese, Finnish, French, Galician, Icelandic, Indonesian, Irish, Italian, Komi Zyrian, Latin, Latvian, Lithuanian, Manx, Naija, Norwegian, Persian, Polish, Portuguese, Russian, Scottish Gaelic, Slovak, Slovenian, South Levantine Arabic, Ukrainian, Upper Sorbian -->
808
  <value name="flat:name">names</value> <!-- Ancient Greek, Belarusian, Breton, Chinese, Chukchi, Erzya, Faroese, Finnish, French, Frisian Dutch, Galician, German, Gothic, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Irish, Italian, Karelian, Kazakh, Komi Permyak, Komi Zyrian, Korean, Latin, Latvian, Livvi, Maltese, Moksha, Norwegian, Old Church Slavonic, Old East Slavic, Persian, Portuguese, Russian, Scottish Gaelic, Skolt Sami, Slovenian, Spanish, Swedish, Thai, Ukrainian, Welsh, Western Armenian -->
809
  <value name="flat:num">flat:num</value> <!-- Komi Zyrian, Persian -->
810
  <value name="flat:range">flat:range</value> <!-- Ukrainian, Western Armenian -->
811
  <value name="flat:repeat">flat:repeat</value> <!-- Ukrainian -->
812
  <value name="flat:sibl">flat:sibl</value> <!-- Ukrainian -->
813
  <value name="flat:title">flat:title</value> <!-- Ukrainian -->
814
  <value name="flat:vv">flat:vv</value> <!-- Classical Chinese -->
815
  <value name="flat">name multiword expression</value>
816
  <value name="goeswith">goes with</value>
817
  <value name="iobj:agent">iobj:agent</value> <!-- Armenian, French, Western Armenian -->
818
  <value name="iobj:appl">iobj:appl</value> <!-- Wolof -->
819
  <value name="iobj:patient">iobj:patient</value> <!-- Tagalog -->
820
  <value name="iobj">indirect object</value>
821
  <value name="list">list</value>
822
  <value name="mark:adv">mark:adv</value> <!-- Cantonese, Chinese -->
823
  <value name="mark:advmod">adverbial modifier confusable with a subordination marker</value> <!-- Old French -->
824
  <value name="mark:aff">mark:aff</value> <!-- Beja -->
825
  <value name="mark:obj">marker + object</value> <!--Old French, no doc -->
826
  <value name="mark:obl">marker + oblique nominal</value> <!--Old French, no doc -->
827
  <value name="mark:prt">mark:prt</value> <!-- Chinese, Irish, Scottish Gaelic -->
828
  <value name="mark:q">mark:q</value> <!-- Hebrew -->
829
  <value name="mark:rel">mark:rel</value> <!-- Cantonese, Chinese -->
830
  <value name="mark">marker</value>
831
  <value name="nmod:agent">nmod:agent</value> <!-- Welsh -->
832
  <value name="nmod:appos">nmod:appos</value> <!-- French, Komi Zyrian, Moksha -->
833
  <value name="nmod:arg">nmod:arg</value> <!-- Polish, Yupik -->
834
  <value name="nmod:att">nmod:att</value> <!-- Hungarian -->
835
  <value name="nmod:attlvc">nmod:attlvc</value> <!-- Hungarian -->
836
  <value name="nmod:attr">nmod:attr</value> <!-- Chukchi -->
837
  <value name="nmod:bahuv">nmod:bahuv</value> <!-- Moksha -->
838
  <value name="nmod:cau">nmod:cau</value> <!-- Uyghur -->
839
  <value name="nmod:comp">nmod:comp</value> <!-- Erzya, Komi Zyrian, Moksha, Turkish, Uyghur -->
840
  <value name="nmod:flat">nmod:flat</value> <!-- Polish -->
841
  <value name="nmod:gen">nmod:gen</value> <!-- Breton -->
842
  <value name="nmod:gobj">nmod:gobj</value> <!-- Erzya, Finnish -->
843
  <value name="nmod:gsubj">nmod:gsubj</value> <!-- Erzya, Finnish, Karelian -->
844
  <value name="nmod:lfrom">nmod:lfrom</value> <!-- Komi Zyrian -->
845
  <value name="nmod:lmod">nmod:lmod</value> <!-- Erzya, Indonesian, Komi Permyak, Komi Zyrian, Moksha -->
846
  <value name="nmod:npmod">nmod:npmod</value> <!-- Armenian, English, Western Armenian -->
847
  <value name="nmod:obj">nmod:obj</value> <!-- Komi Zyrian -->
848
  <value name="nmod:obl">nmod:obl</value> <!-- Hungarian -->
849
  <value name="nmod:part">nmod:part</value> <!-- Turkish, Uyghur -->
850
  <value name="nmod:poss">possessive nominal modifier</value> <!-- Akkadian, Albanian, Apurina, Arabic, Armenian, Assyrian, Bambara, Beja, Breton, Chukchi, Danish, Dutch, English, Erzya, Faroese, Finnish, Frisian Dutch, German, Hebrew, Hindi, Icelandic, Indonesian, Irish, Karelian, Kazakh, Khunsari, Komi Permyak, Komi Zyrian, Korean, Kurmanji, Latin, Livvi, Maltese, Manx, Marathi, Moksha, Naija, Nayini, North Sami, Persian, Polish, Sanskrit, Scottish Gaelic, Skolt Sami, Soi, South Levantine Arabic, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Telugu, Thai, Turkish, Uyghur, Warlpiri, Welsh, Western Armenian, Wolof -->
851
  <value name="nmod:pred">nmod:pred</value> <!-- Polish -->
852
  <value name="nmod:prp">nmod:prp</value> <!-- Komi Zyrian -->
853
  <value name="nmod:redup">nmod:redup</value> <!-- Welsh -->
854
  <value name="nmod:relat">nmod:relat</value> <!-- Chukchi -->
855
  <value name="nmod:subj">nmod:subj</value> <!-- Komi Zyrian -->
856
  <value name="nmod:tmod">temporal modifier</value> <!-- Chinese, English, Indonesian, Moksha, Romanian, Telugu, Uyghur -->
857
  <value name="nmod">nominal modifier</value>
858
  <value name="nsubj:advmod">fused subject pronoun and adverb</value> <!-- Old French -->
859
  <value name="nsubj:aff">nsubj:aff</value> <!-- Beja -->
860
  <value name="nsubj:bfoc">nsubj:bfoc</value> <!-- Tagalog -->
861
  <value name="nsubj:caus">nsubj:caus</value> <!-- Armenian, French, Western Armenian -->
862
  <value name="nsubj:cleft">nsubj:cleft</value> <!-- Latin -->
863
  <value name="nsubj:cop">nsubj:cop</value> <!-- Apurina, Breton, Erzya, Estonian, Finnish, Hebrew, Karelian, Komi Permyak, Komi Zyrian, Livvi, Moksha, Sanskrit, Skolt Sami, Turkish -->
864
  <value name="nsubj:ifoc">nsubj:ifoc</value> <!-- Tagalog -->
865
  <value name="nsubj:lfoc">nsubj:lfoc</value> <!-- Tagalog -->
866
  <value name="nsubj:lvc">nsubj:lvc</value> <!-- Hungarian -->
867
  <value name="nsubj:nc">nsubj:nc</value> <!-- Persian, Tamil, Telugu -->
868
  <value name="nsubj:obj">fused subject and object pronoun</value> <!-- Old French -->
869
  <value name="nsubj:pass">passive nominal subject</value> <!-- Afrikaans, Amharic, Ancient Greek, Arabic, Armenian, Assyrian, Belarusian, Bulgarian, Buryat, Cantonese, Catalan, Chinese, Classical Chinese, Czech, Dutch, English, Faroese, French, Frisian Dutch, Galician, German, Gothic, Greek, Hindi, Indonesian, Italian, Korean, Latin, Latvian, Lithuanian, Maltese, Marathi, Moksha, Norwegian, Old Church Slavonic, Old East Slavic, Persian, Polish, Portuguese, Romanian, Russian, Sanskrit, Skolt Sami, Slovak, Spanish, Swedish, Swiss German, Tagalog, Tamil, Thai, Turkish German, Upper Sorbian, Western Armenian -->
870
  <value name="nsubj:periph">nsubj:periph</value> <!-- Cantonese -->
871
  <value name="nsubj">Nominal subject</value>
872
  <value name="nummod:det">nummod:det</value> <!-- Beja -->
873
  <value name="nummod:entity">numeric modifier governed by a noun</value> <!-- Russian -->
874
  <value name="nummod:flat">nummod:flat</value> <!-- Polish -->
875
  <value name="nummod:gov">numeric modifier governing the case of the noun</value> <!-- Belarusian, Czech, Lithuanian, Old East Slavic, Polish, Russian, Sanskrit, Serbian, Ukrainian, Upper Sorbian -->
876
  <value name="nummod">numeric modifier</value>
877
  <value name="obj:advmod">fused adverb and object pronoun</value> <!-- Old French -->
878
  <value name="obj:advneg">fused negation and object pronoun</value> <!-- no doc for advneg -->
879
  <value name="obj:agent">obj:agent</value> <!-- Apurina, French, Tagalog -->
880
  <value name="obj:appl">obj:appl</value> <!-- Wolof -->
881
  <value name="obj:caus">obj:caus</value> <!-- Wolof -->
882
  <value name="obj:lvc">obj:lvc</value> <!-- French, Hungarian, Naija -->
883
  <value name="obj:obl">fused oblique and object pronoun</value> <!-- Old French -->
884
  <value name="obj:periph">obj:periph</value> <!-- Cantonese, Chinese -->
885
  <value name="obj">object</value>
886
  <value name="obl:advmod">adverbial modifier confusable with an oblique dependent</value> <!-- Old French -->
887
  <value name="obl:agent">agent modifier</value> <!-- Ancient Greek, Armenian, Belarusian, Breton, Cantonese, Chinese, Czech, Dutch, Erzya, French, German, Gothic, Greek, Hindi, Indonesian, Italian, Komi Zyrian, Latin, Lithuanian, Maltese, Moksha, Naija, Old Church Slavonic, Old East Slavic, Polish, Portuguese, Romanian, Russian, Sanskrit, Skolt Sami, Swedish, Tamil, Turkish, Welsh, Western Armenian -->
888
  <value name="obl:appl">obl:appl</value> <!-- Wolof -->
889
  <value name="obl:arg">oblique argument</value> <!-- Arabic, Beja, Czech, French, German, Greek, Icelandic, Latin, Lithuanian, Maltese, Naija, Persian, Polish, Sanskrit, Slovak, South Levantine Arabic, Tamil -->
890
  <value name="obl:cau">obl:cau</value> <!-- Erzya, Komi Zyrian, Moksha, Telugu -->
891
  <value name="obl:cmp">obl:cmp</value> <!-- Telugu -->
892
  <value name="obl:cmpr">obl:cmpr</value> <!-- Latin, Polish, Tamil -->
893
  <value name="obl:comp">obl:comp</value> <!-- Moksha -->
894
  <value name="obl:dat">obl:dat</value> <!-- Kurmanji -->
895
  <value name="obl:freq">obl:freq</value> <!-- Moksha -->
896
  <value name="obl:inst">obl:inst</value> <!-- Erzya, Moksha, Tamil -->
897
  <value name="obl:lfrom">obl:lfrom</value> <!-- Erzya, Komi Zyrian, Moksha -->
898
  <value name="obl:lmod">locative modifier</value> <!-- Apurina, Classical Chinese, Danish, Erzya, Komi Permyak, Komi Zyrian, Moksha, Skolt Sami, Tamil -->
899
  <value name="obl:lmp">obl:lmp</value> <!-- Erzya, Komi Zyrian, Moksha -->
900
  <value name="obl:lto">obl:lto</value> <!-- Erzya, Komi Zyrian, Moksha -->
901
  <value name="obl:lvc">obl:lvc</value> <!-- Hungarian -->
902
  <value name="obl:mcl">obl:mcl</value> <!-- Komi Zyrian -->
903
  <value name="obl:mod"> oblique modifier</value> <!-- Beja, French, Naija, Yupik -->
904
  <value name="obl:npmod">obl:npmod</value> <!-- Coptic, English -->
905
  <value name="obl:orphan">obl:orphan</value> <!-- Polish -->
906
  <value name="obl:own">obl:own</value> <!-- Kazakh -->
907
  <value name="obl:patient">obl:patient</value> <!-- Cantonese, Chinese -->
908
  <value name="obl:pmod">obl:pmod</value> <!-- Romanian, Tamil -->
909
  <value name="obl:poss">obl:poss</value> <!-- Thai -->
910
  <value name="obl:prep">obl:prep</value> <!-- Irish -->
911
  <value name="obl:sentcon">obl:sentcon</value> <!-- Mbya Guarani -->
912
  <value name="obl:smod">obl:smod</value> <!-- Scottish Gaelic -->
913
  <value name="obl:tmod">obl:tmod</value> <!-- Apurina, Arabic, Cantonese, Chinese, Classical Chinese, Danish, English, Erzya, Frisian Dutch, German, Hindi, Indonesian, Irish, Italian, Komi Permyak, Komi Zyrian, Korean, Manx, Moksha, Portuguese, Scottish Gaelic, Skolt Sami, Spanish, Tamil, Telugu, Thai, Turkish, Uyghur, Warlpiri -->
914
  <value name="obl:tmod">temporal modifier</value>
915
  <value name="obl">oblique nominal</value>
916
  <value name="orphan:missing">textual gap in the source</value> <!-- Latin -->
917
  <value name="orphan">remnant in ellipsis</value>
918
  <value name="parataxis:appos">parataxis:appos</value> <!-- Italian -->
919
  <value name="parataxis:conj">parataxis:conj</value> <!-- Naija -->
920
  <value name="parataxis:coord">parataxis:coord</value> <!-- Beja -->
921
  <value name="parataxis:deletion">parataxis:deletion</value> <!-- Norwegian -->
922
  <value name="parataxis:discourse">parataxis:discourse</value> <!-- Italian, Naija, Slovenian, Turkish German, Ukrainian -->
923
  <value name="parataxis:dislocated">parataxis:dislocated</value> <!-- Naija -->
924
  <value name="parataxis:hashtag">parataxis:hashtag</value> <!-- Irish, Italian -->
925
  <value name="parataxis:insert">parataxis:insert</value> <!-- French, Italian, Polish -->
926
  <value name="parataxis:mod">parataxis:mod</value> <!-- Beja -->
927
  <value name="parataxis:newsent">parataxis:newsent</value> <!-- Ukrainian -->
928
  <value name="parataxis:nsubj">parataxis:nsubj</value> <!-- Italian -->
929
  <value name="parataxis:obj">parataxis:obj</value> <!-- Bambara, Italian, Polish -->
930
  <value name="parataxis:parenth">parataxis:parenth</value> <!-- French, Naija -->
931
  <value name="parataxis:rel">parataxis:rel</value> <!-- Ukrainian -->
932
  <value name="parataxis:rep">parataxis:rep</value> <!-- Chukchi, Latin, Mbya Guarani -->
933
  <value name="parataxis:restart">parataxis:restart</value> <!-- Slovenian -->
934
  <value name="parataxis:rt">parataxis:rt</value> <!-- Irish -->
935
  <value name="parataxis:sentence">parataxis:sentence</value> <!-- Irish -->
936
  <value name="parataxis:trans">parataxis:trans</value> <!-- Turkish German -->
937
  <value name="parataxis:url">parataxis:url</value> <!-- Irish -->
938
  <value name="parataxis">parataxis</value>
939
  <value name="punct">punctuation</value>
940
  <value name="remnant">Remnant ?</value> <!-- no doc, replace with orphan? -->
941
  <value name="reparandum">overridden disfluency</value>
942
  <value name="root">root</value>
943
  <value name="vocative:cl">vocative:cl</value> <!-- Ukrainian -->
944
  <value name="vocative:mention">vocative:mention</value> <!-- Irish, Italian -->
945
  <value name="vocative">vocative</value>
946
  <value name="xcomp:cleft">xcomp:cleft</value> <!-- Polish -->
947
  <value name="xcomp:ds">xcomp:ds</value> <!-- Erzya, Finnish, Karelian, Komi Permyak, Livvi -->
948
  <value name="xcomp:obj">xcomp:obj</value> <!-- North Sami, Polish -->
949
  <value name="xcomp:pred">xcomp:pred</value> <!-- Irish, Latin, Manx, North Sami, Polish, Scottish Gaelic -->
950
  <value name="xcomp:sp">xcomp:sp</value> <!-- Ukrainian -->
951
  <value name="xcomp:subj">xcomp:subj</value> <!-- Polish -->
952
  <value name="xcomp">open clausal complement</value>
953
</feature>
954
<feature name="coord" domain="NT" ></feature>
955
<feature name="dom" domain="NT" ></feature>
956
<feature name="type" domain="NT" >
957
  <value name="nV">élément non-verbal</value>
958
  <value name="VFin">finite verb</value>
959
  <value name="VInf">infinitive</value>
960
  <value name="VPar">participle</value>
961
  <value name="--">nil</value>
962
</feature>
963
<feature name="vform" domain="NT"></feature>
964
<feature name="vlemma" domain="NT"></feature>
965
<feature name="note" domain="NT"></feature>
966
<feature name="snr" domain="NT"></feature>
967
';
968

  
969
    printf MASTER "$nt_features_header";
970

  
971
    printf MASTER '
972
<edgelabel>
973
  <value name="D">dependency</value>
974
  <value name="L">lexical</value>
975
  <value name="R">relator</value>
976
  <value name="*">not bound</value>
977
</edgelabel>
978
<secedgelabel>
979
  <value name="cluster">between elements of GpCoo</value>
980
  <value name="coord">between members of Coo</value>
981
  <value name="dupl">between duplicated nodes</value>
982
</secedgelabel>
983
</annotation>
984
</head>
985
<body>
986
';
987
}
988

  
989
#  <value name="M">main</value>
990
#  <value name="P">part</value>
991

  
992
# Write the closing tags of the master file to the MASTER filehandle:
# the end of <body> followed by the end of <corpus>.
# Takes no arguments; MASTER must already be open for writing.
sub write_master_footer {
    print MASTER <<'END_MASTER_FOOTER';
</body>
</corpus>
END_MASTER_FOOTER
}
997

  
998
# Additional NT feature declarations kept in $TEMP.
# NOTE(review): $TEMP is not referenced in the part of the file reviewed
# here -- confirm whether it is still used anywhere.
# The value starts with a newline, then one <feature> declaration per line.
$TEMP = "\n" . <<'END_TEMP_FEATURES';
<feature name="nodom" domain="NT" ></feature>
<feature name="headpos" domain="NT" ></feature>
<feature name="annotationFile" domain="NT" ></feature>
<feature name="annotationUri" domain="NT" ></feature>
END_TEMP_FEATURES
1004

  
1005
# Fill the global %abbrev2cat hash, which maps a dependency relation
# label (e.g. "nsubj") to a human-readable English description.
#
# Takes no arguments. Existing entries of %abbrev2cat under other keys are
# left alone; the keys listed below are (re)assigned on every call.
sub define_cat_hashes {

    # Earlier mapping (French category names), kept commented out for
    # reference only:
    #  $abbrev2cat{"Apst"} = "Apostrophe";
    #  $abbrev2cat{"AtObj"} = "AttributObjet";
    #  $abbrev2cat{"AtRfc"} = "AttributReflechi";
    #  $abbrev2cat{"AtSj"} = "AttributSujet";
    #  $abbrev2cat{"AuxA"} = "Auxilie-Actif";
    #  $abbrev2cat{"AuxP"} = "Auxilie-Passif";
    #  $abbrev2cat{"Circ"} = "Circonstant";
    #  $abbrev2cat{"Cmpl"} = "Complement";
    #  $abbrev2cat{"GpCoo"} = "Coordonne";
    #  $abbrev2cat{"Coo"} = "Coordination";
    #  $abbrev2cat{"Det"} = "Determinant";
    #  $abbrev2cat{"NgPrt"} = "Forclusif";
    #  $abbrev2cat{"Insrt"} = "Incidente";
    #  $abbrev2cat{"Intj"} = "Interjection";
    #  $abbrev2cat{"ModA"} = "ModifieurAttache";
    #  $abbrev2cat{"ModD"} = "ModifieurDetache";
    #  $abbrev2cat{"Ng"} = "Negation";
    #  $abbrev2cat{"VInf"} = "NoeudVerbal-Infinitif";
    #  $abbrev2cat{"VPrt"} = "NoeudVerbal-Participe"; #?
    #  $abbrev2cat{"VFin"} = "NoeudVerbal-Personnel";
    #  $abbrev2cat{"nSnt"} = "NonPhrase";
    #  $abbrev2cat{"Obj"} = "Objet";
    #  $abbrev2cat{"Snt"} = "Phrase";
    #  $abbrev2cat{"Pon"} = "Ponctuation";
    #  $abbrev2cat{"Rfc"} = "Reflechi";
    #  $abbrev2cat{"Rfx"} = "ReflexifRenforce";
    #  $abbrev2cat{"RelC"} = "Relateur-Coordonnant";
    #  $abbrev2cat{"RelNC"} = "Relateur-NonCoordonnant";
    #  $abbrev2cat{"nMax"} = "StructureNonMaximale";
    #  $abbrev2cat{"SjImp"} = "SujetImpersonnel";
    #  $abbrev2cat{"SjPer"} = "SujetPersonnel";
    #  $abbrev2cat{"Lac"} = "Lacune";
    #  $abbrev2cat{"Aux"} = "Auxilie";
    #  $abbrev2cat{"Regim"} = "Regime";

    # Current mapping: base dependency relation labels.
    $abbrev2cat{"acl"}        = "Clausal modifier of noun";
    $abbrev2cat{"advcl"}      = "Adverbial clause modifier";
    $abbrev2cat{"advmod"}     = "Adverbial modifier";
    $abbrev2cat{"amod"}       = "Adjectival modifier";
    $abbrev2cat{"appos"}      = "Appositional modifier";
    $abbrev2cat{"aux"}        = "Auxiliary";
    $abbrev2cat{"cc-nc"}      = "Coordinated conjunct : non coordonant";
    $abbrev2cat{"cc"}         = "Coordinating conjunction";
    $abbrev2cat{"ccomp"}      = "Clausal complement";
    $abbrev2cat{"conj"}       = "Conjunct";
    $abbrev2cat{"cop"}        = "Copula";
    $abbrev2cat{"csubj"}      = "Clausal subject";
    $abbrev2cat{"det"}        = "Determiner";
    $abbrev2cat{"dislocated"} = "Dislocated elements";
    $abbrev2cat{"expl"}       = "Expletive";
    $abbrev2cat{"iobj"}       = "Indirect object";
    $abbrev2cat{"mark"}       = "Marker";
    $abbrev2cat{"nmod"}       = "Nominal modifier";
    $abbrev2cat{"nsubj"}      = "Nominal subject";
    $abbrev2cat{"nummod"}     = "Numeric modifier";
    $abbrev2cat{"obj"}        = "Object";

    # The description used to start with a stray space (" Oblique nominal"),
    # unlike every other entry; it is now stored without it.
    $abbrev2cat{"obl"}        = "Oblique nominal";
    $abbrev2cat{"orphan"}     = "Remnant in ellipsis";

    # "remnant" has no settled description yet, hence the question mark.
    $abbrev2cat{"remnant"}    = "Remnant ?";
    $abbrev2cat{"vocative"}   = "Vocative";
    $abbrev2cat{"xcomp"}      = "Open clausal complement";
}
1069

  
1070
# Print every element of the global @words array to the currently selected
# output handle, one element per line, in array order.
# Takes no arguments.
sub print_sentence {
    foreach my $word (@words) {
        print "$word\n";
    }
}
1075

  
1076
sub write_nonterminals {
1077
    my $print_nt_features;
1078
    if ( $_[1] =~ /dupl/ ) {
1079
        $dupl = '_dupl';
1080
    }
1081
    else {
1082
        $dupl = '';
... Ce différentiel a été tronqué car il excède la taille maximale pouvant être affichée.

Formats disponibles : Unified diff