Révision 1670
tmp/TXMReleasePlugins.site/site.xml (revision 1670) | ||
---|---|---|
9 | 9 |
<feature url="features/org.txm.wordcloud.feature_1.0.0.1660.jar" id="org.txm.wordcloud.feature" version="1.0.0.1660"> |
10 | 10 |
<category name="Commands"/> |
11 | 11 |
</feature> |
12 |
<feature url="features/org.txm.treetagger.binaries.feature_1.0.0.1660.jar" id="org.txm.treetagger.binaries.feature" version="1.0.0.1660" os="linux,macosx,win32" ws="cocoa,gtk,win32">
|
|
12 |
<feature url="features/org.txm.treetagger.binaries.feature_1.0.0.1669.jar" id="org.txm.treetagger.binaries.feature" version="1.0.0.1669" os="" ws="">
|
|
13 | 13 |
<category name="Annotation"/> |
14 | 14 |
</feature> |
15 | 15 |
<feature url="features/org.txm.treetagger.models.feature_1.0.0.1660.jar" id="org.txm.treetagger.models.feature" version="1.0.0.1660"> |
tmp/org.txm.treetagger.core.linux/META-INF/MANIFEST.MF (revision 1670) | ||
---|---|---|
5 | 5 |
Bundle-Version: 1.0.0.qualifier |
6 | 6 |
Fragment-Host: org.txm.treetagger.core;bundle-version="1.0.0" |
7 | 7 |
Bundle-RequiredExecutionEnvironment: JavaSE-1.7 |
8 |
Eclipse-PlatformFilter: (osgi.os=linux) |
tmp/org.txm.annotation.rcp/build.properties (revision 1670) | ||
---|---|---|
1 | 1 |
#Fri Jul 06 10:25:03 CEST 2018 |
2 | 2 |
output..=bin/ |
3 |
bin.includes=META-INF/,.,plugin.xml,icons/,OSGI-INF/l10n/bundle.properties |
|
3 |
bin.includes = META-INF/,\ |
|
4 |
.,\ |
|
5 |
plugin.xml,\ |
|
6 |
icons/,\ |
|
7 |
OSGI-INF/l10n/bundle.properties,\ |
|
8 |
OSGI-INF/l10n/bundle_fr.properties,\ |
|
9 |
OSGI-INF/l10n/bundle_ru.properties |
|
4 | 10 |
source..=src/ |
5 | 11 |
qualifier=svn |
tmp/org.txm.annotation.kr.rcp/build.properties (revision 1670) | ||
---|---|---|
3 | 3 |
bin.includes = META-INF/,\ |
4 | 4 |
.,\ |
5 | 5 |
plugin.xml,\ |
6 |
OSGI-INF/l10n/bundle.properties,\
|
|
7 |
icons/
|
|
6 |
icons/,\
|
|
7 |
OSGI-INF/
|
|
8 | 8 |
source..=src/ |
9 | 9 |
qualifier=svn |
tmp/org.txm.annotation.kr.core/build.properties (revision 1670) | ||
---|---|---|
1 | 1 |
#Fri Jul 06 10:25:03 CEST 2018 |
2 | 2 |
output..=bin/ |
3 |
bin.includes=META-INF/,.,plugin.xml |
|
3 |
bin.includes = META-INF/,\ |
|
4 |
.,\ |
|
5 |
plugin.xml,\ |
|
6 |
OSGI-INF/ |
|
4 | 7 |
source..=src/ |
5 | 8 |
qualifier=svn |
tmp/org.txm.treetagger.binaries.feature/feature.xml (revision 1670) | ||
---|---|---|
3 | 3 |
id="org.txm.treetagger.binaries.feature" |
4 | 4 |
label="TreeTagger software" |
5 | 5 |
version="1.0.0.qualifier" |
6 |
provider-name="Textometrie.org" |
|
7 |
os="linux,macosx,win32" |
|
8 |
ws="cocoa,gtk,win32"> |
|
6 |
provider-name="Textometrie.org"> |
|
9 | 7 |
|
10 | 8 |
<description url="http://www.example.com/description"> |
11 | 9 |
Install TreeTagger software / Installation du logiciel TreeTagger |
... | ... | |
71 | 69 |
|
72 | 70 |
<plugin |
73 | 71 |
id="org.txm.treetagger.core.linux" |
74 |
os="linux" |
|
75 |
ws="gtk" |
|
76 | 72 |
download-size="0" |
77 | 73 |
install-size="0" |
78 | 74 |
version="0.0.0" |
... | ... | |
80 | 76 |
|
81 | 77 |
<plugin |
82 | 78 |
id="org.txm.treetagger.core.macosx" |
83 |
os="macosx" |
|
84 |
ws="cocoa" |
|
85 | 79 |
download-size="0" |
86 | 80 |
install-size="0" |
87 | 81 |
version="0.0.0" |
... | ... | |
89 | 83 |
|
90 | 84 |
<plugin |
91 | 85 |
id="org.txm.treetagger.core.win32" |
92 |
os="win32" |
|
93 |
ws="win32" |
|
94 | 86 |
download-size="0" |
95 | 87 |
install-size="0" |
96 | 88 |
version="0.0.0" |
tmp/org.txm.treetagger.core/src/org/txm/treetagger/core/preferences/TreeTaggerPreferences.java (revision 1670) | ||
---|---|---|
148 | 148 |
String osname = System.getProperty("os.name").toLowerCase(); |
149 | 149 |
if (osname.contains("windows")) { |
150 | 150 |
osname = "win32"; |
151 |
} else if (osname.contains("macosx")) {
|
|
151 |
} else if (osname.contains("mac os x")) {
|
|
152 | 152 |
osname = "macosx"; |
153 | 153 |
} else { |
154 | 154 |
osname = "linux"; |
tmp/org.txm.treetagger.core.macosx/META-INF/MANIFEST.MF (revision 1670) | ||
---|---|---|
6 | 6 |
Bundle-Version: 1.0.0.qualifier |
7 | 7 |
Fragment-Host: org.txm.treetagger.core |
8 | 8 |
Bundle-RequiredExecutionEnvironment: JavaSE-1.7 |
9 |
Eclipse-PlatformFilter: (osgi.os=macosx) |
tmp/org.txm.treetagger.core.win32/META-INF/MANIFEST.MF (revision 1670) | ||
---|---|---|
5 | 5 |
Bundle-Version: 1.0.0.qualifier |
6 | 6 |
Fragment-Host: org.txm.treetagger.core;bundle-version="1.0.0" |
7 | 7 |
Bundle-RequiredExecutionEnvironment: JavaSE-1.7 |
8 |
Eclipse-PlatformFilter: (osgi.os=win32) |
tmp/org.txm.treetagger.core.win32/res/win/README.txt (revision 1670) | ||
---|---|---|
1 |
|
|
2 |
/****************************************************************************/ |
|
3 |
/* How to use the TreeTagger */ |
|
4 |
/* */ |
|
5 |
/* Author: Helmut Schmid, CIS, Ludwig-Maximilians-Universität, Germany */ |
|
6 |
/****************************************************************************/ |
|
7 |
|
|
8 |
|
|
9 |
The TreeTagger consists of two programs: the training program creates |
|
10 |
a parameter file from a fullform lexicon and a handtagged corpus. The |
|
11 |
tagger program reads the parameter file and annotates the text with |
|
12 |
part of speech and lemma information. Both programs print information |
|
13 |
about their usage when they are called without arguments. |
|
14 |
|
|
15 |
|
|
16 |
Tagging |
|
17 |
------- |
|
18 |
|
|
19 |
Tagging is done with the *tree-tagger* program. |
|
20 |
|
|
21 |
The first argument is the name of a parameter file which was generated |
|
22 |
with the train-tree-tagger program. Parameter files generated on |
|
23 |
different platforms or with older versions of train-tree-tagger will |
|
24 |
not work. |
|
25 |
|
|
26 |
The second argument is the input file. It must be in one-word-per-line |
|
27 |
format, i.e. each line contains one token (word, punctuation character |
|
28 |
or parenthesis) and should not exceed 1000 characters. Tokens may contain |
|
29 |
blanks. It is possible to override the lexical information contained |
|
30 |
in the parameter file of the tagger by specifying a list of possible |
|
31 |
tags after the token. This list has to be preceded by a tab character |
|
32 |
and the elements are separated by tab characters. Pretagging could be |
|
33 |
used e.g. to ensure that certain text-specific expressions are tagged |
|
34 |
correctly. Clitics (like "'s", "'re", and "'d" in English or "-la" and |
|
35 |
"-t-elle" in French) have to be separated if they were separated in |
|
36 |
the training data. (The French and English parameter files available |
|
37 |
by ftp expect separation of clitics). |
|
38 |
|
|
39 |
Sample input file: |
|
40 |
He |
|
41 |
moved |
|
42 |
to |
|
43 |
New York City NP |
|
44 |
. |
|
45 |
|
|
46 |
|
|
47 |
The third argument is the name of the output file. The output is also |
|
48 |
in one-word-per-line format. Depending on the specified options, it |
|
49 |
will contain columns with tokens, tags and lemmas. If the third |
|
50 |
argument is missing, the output will be printed to standard output. If |
|
51 |
the second argument is missing, too, input is read from standard |
|
52 |
input. |
|
53 |
|
|
54 |
Options: |
|
55 |
|
|
56 |
-token: Prints the token as well. |
|
57 |
-lemma: Prints the lemma as well. |
|
58 |
-sgml: Don't tag SGML annotations, i.e. lines starting with '<' and ending |
|
59 |
with '>'. |
|
60 |
-threshold <p>: Print all tags with a probability higher than <p> times the |
|
61 |
probability of the best tag. |
|
62 |
-prob: Print tag probabilities (requires option -threshold) |
|
63 |
-no-unknown: Print the token rather than <unknown> for unknown lemmas |
|
64 |
-quiet: Don't print status messages |
|
65 |
-pt-with-lemma: If this option is specified, then each pretagging tag |
|
66 |
(see above) has to be followed by a whitespace and a lemma. |
|
67 |
-pt-with-prob: If this option is specified, then each pretagging tag |
|
68 |
(see above) has to be followed by whitespace and a tag probability |
|
69 |
value. If -pt-with-prob and -pt-with-lemma have been specified, |
|
70 |
then each pretagging tag is followed by a probability and a lemma |
|
71 |
in that order. |
|
72 |
-files f: Read the names of input and output files pairwise from the |
|
73 |
file f. The format of f is the lexicon file format described below. |
|
74 |
-lex f: Read auxiliary lexicon entries from the file f. |
|
75 |
-eos-tag <tag>: The SGML tag <tag> signals the end of a sentence. |
|
76 |
This option implies the option -sgml |
|
77 |
|
|
78 |
Some more exotic options: |
|
79 |
-proto: Print lexical information for each word |
|
80 |
The lexicon type is signalled by one of the characters |
|
81 |
f: The word was found in the full form lexicon. |
|
82 |
c: The word in lowercase was found in the lexicon |
|
83 |
h: The word contains a hyphen and the word following the hyphen was found |
|
84 |
in the full form lexicon; e.g. instead of "table-wine" only "wine" has |
|
85 |
been found. |
|
86 |
s: The word has been looked up in the suffix lexicon |
|
87 |
p: Tags have been assigned by pretagging. |
|
88 |
-gramotron: Same as -proto but with a different format |
|
89 |
-proto-with-prob: Same as -proto but with lexical tag probabilities |
|
90 |
-print-prob-tree: Print the transition probability tree and exit |
|
91 |
-eps <epsilon>: Value which is used to replace zero lexical frequencies. |
|
92 |
Zero frequencies occur when a word/tag pair is contained in the lexicon |
|
93 |
but not in the training corpus. The default is 0.1. |
|
94 |
-base: Use only lexical probabilities for tagging. This option is only |
|
95 |
useful to obtain a baseline result to which the actual tagger output is |
|
96 |
compared. |
|
97 |
|
|
98 |
|
|
99 |
|
|
100 |
Training |
|
101 |
-------- |
|
102 |
|
|
103 |
Training is done with the *train-tree-tagger* program. If the program is |
|
104 |
called without arguments, the following output is printed: |
|
105 |
|
|
106 |
USAGE: train-tree-tagger <lexicon> <open class file> <infile> <outfile> |
|
107 |
{-cl <context length>} {-dtg <min. decision tree gain>} |
|
108 |
{-ecw <eq. class weight>} {-atg <affix tree gain>} {-st <sent. tag>} |
|
109 |
|
|
110 |
Description of the command line arguments: |
|
111 |
* <lexicon>: name of a file which contains the fullform lexicon. Each line |
|
112 |
of the lexicon corresponds to one word form and contains the word form |
|
113 |
itself followed by a Tab character and a sequence of tag-lemma pairs. |
|
114 |
The tags and lemmata are separated by whitespace. |
|
115 |
|
|
116 |
Example: |
|
117 |
aback RB aback |
|
118 |
abacuses NNS abacus |
|
119 |
abandon VB abandon VBP abandon |
|
120 |
abandoned JJ abandoned VBD abandon VBN abandon |
|
121 |
abandoning VBG abandon |
|
122 |
|
|
123 |
Important: Ordinal and cardinal numbers which consist of digits |
|
124 |
should not be included in the lexicon. Otherwise, the tagger will |
|
125 |
not be able to learn how to tag numbers which are not listed in the |
|
126 |
lexicon. Numbers with unusual tags should be added to the lexicon, |
|
127 |
however. |
|
128 |
|
|
129 |
Remark: The tagger doesn't need the lemmata for tagging. If |
|
130 |
you do not have the lemma information or if you do not plan to |
|
131 |
annotate corpora with lemmas, you can replace the lemma with a dummy |
|
132 |
value, e.g. "-". |
|
133 |
|
|
134 |
* <open class file>: name of a file which contains a list of open class tags |
|
135 |
i.e. possible tags of unknown word forms. This information is needed to |
|
136 |
estimate likely tags of unknown words. This file would typically contain |
|
137 |
adverb, adjective, noun, proper name and perhaps verb tags, but not |
|
138 |
prepositions, determiners, pronouns or numbers. |
|
139 |
* <input file>: name of a file which contains tagged training data. The data |
|
140 |
must be in one-word-per-line format. This means that each line contains |
|
141 |
one token and one tag in that order separated by a tabulator. |
|
142 |
Punctuation marks are considered as tokens and must have been tagged as well. |
|
143 |
|
|
144 |
Example: |
|
145 |
Pierre NP |
|
146 |
Vinken NP |
|
147 |
, , |
|
148 |
61 CD |
|
149 |
years NNS |
|
150 |
|
|
151 |
* <output file>: name of the file in which the resulting tagger parameters |
|
152 |
are stored. |
|
153 |
|
|
154 |
|
|
155 |
The following parameters are optional: |
|
156 |
|
|
157 |
* -cl <context length>: number of preceding words forming the tagging |
|
158 |
context. The default is 2 which corresponds to a trigram context. For |
|
159 |
small training corpora and/or large tagsets, it could be useful to reduce |
|
160 |
this parameter to 1. |
|
161 |
* -dtg <min. decision tree gain>: Threshold - If the information gain at a |
|
162 |
leaf node of the decision tree is below this threshold, the node is deleted. |
|
163 |
The default value is 0.7. |
|
164 |
* -ecw <eq. class weight>: weight of the equivalence class based probability |
|
165 |
estimates. The default is 0.15. |
|
166 |
* -atg <affix tree gain>: Threshold - If the information gain at a leaf of an |
|
167 |
affix tree is below this threshold, it is deleted. The default is 1.2. |
|
168 |
* -st <sent. tag>: the end-of-sentence part-of-speech tag, i.e. the tag which |
|
169 |
is assigned to sentence punctuation like ".", "!", "?". |
|
170 |
Default is "SENT". It is important to set this option properly, if your |
|
171 |
tag for sentence punctuation is not "SENT". |
|
172 |
|
|
173 |
The accuracy of the TreeTagger usually improves a bit, if different |
|
174 |
settings of the above parameters are tested and the best combination |
|
175 |
is chosen. |
tmp/org.txm.treetagger.core.win32/res/win/INSTALL.txt (revision 1670) | ||
---|---|---|
1 |
|
|
2 |
/****************************************************************************/ |
|
3 |
/* How to install the Windows version of the TreeTagger */ |
|
4 |
/* */ |
|
5 |
/* Author: Helmut Schmid, CIS, Ludwig-Maximilians-Universität, Germany */ |
|
6 |
/****************************************************************************/ |
|
7 |
|
|
8 |
This is the Windows distribution of the TreeTagger. |
|
9 |
|
|
10 |
It contains the following files: |
|
11 |
|
|
12 |
- tree-tagger.exe: the tagger program |
|
13 |
|
|
14 |
- train-tree-tagger.exe: the training program |
|
15 |
|
|
16 |
- utf8-tokenize.perl: A Perl script which transforms the tagger input |
|
17 |
into one-word-per-line format |
|
18 |
|
|
19 |
- *-abbreviations: abbreviation lists required by the tokenizer |
|
20 |
|
|
21 |
- tag-*.bat: batch files for different languages which call |
|
22 |
the tokeniser and the tagger |
|
23 |
|
|
24 |
- chunk-*.bat: batch files for POS tagging and chunking |
|
25 |
|
|
26 |
|
|
27 |
Installation |
|
28 |
------------ |
|
29 |
|
|
30 |
1. Install a Perl interpreter (if you have not already installed one). |
|
31 |
You can download a Perl interpreter for Windows for free at |
|
32 |
http://www.activestate.com/activeperl/ |
|
33 |
|
|
34 |
2. Move the TreeTagger directory to the root directory of drive C:. |
|
35 |
|
|
36 |
3. Download the PC parameter files for the languages you need, decompress |
|
37 |
them (e.g. using Winzip or 7zip) and move them to the subdirectory lib. |
|
38 |
Rename the parameter files to <language>-utf8.par |
|
39 |
Example: Rename french-par-linux-3.2-utf8.bin to french-utf8.par |
|
40 |
|
|
41 |
Non-UTF8 parameter files are not supported anymore. |
|
42 |
|
|
43 |
4. Add the path C:\TreeTagger\bin to the PATH environment variable. |
|
44 |
|
|
45 |
5. Open a shell and type the command |
|
46 |
set PATH=C:\TreeTagger\bin;%PATH% |
|
47 |
|
|
48 |
6. Change to the directory C:\TreeTagger |
|
49 |
|
|
50 |
7. Now you can test the tagger, e.g. by analyzing this file with the command |
|
51 |
tag-english INSTALL.txt |
|
52 |
|
|
53 |
If you install the TreeTagger in a different directory, you have to |
|
54 |
modify the first path in the batch files tag-*.bat. |
|
55 |
|
|
56 |
|
|
57 |
Michaela Atterer told me that she had difficulties installing the |
|
58 |
TreeTagger on a Windows XP system. She recommends the following |
|
59 |
work-around. |
|
60 |
|
|
61 |
4. Windows XP: |
|
62 |
-Right click on "My Computer" |
|
63 |
-Select the "Advanced" tab |
|
64 |
-Click on "Environment Variables" |
|
65 |
-click on New: enter PATH and C:\TreeTagger\bin\;%PATH% |
|
66 |
|
|
67 |
If the files have been unpacked into a single directory, you should |
|
68 |
restore the following directory structure: |
|
69 |
|
|
70 |
TreeTagger: |
|
71 |
INSTALL.txt README.txt bin cmd lib |
|
72 |
|
|
73 |
TreeTagger/bin: |
|
74 |
tag-english.bat tag-german.bat tag-spanish.bat tree-tagger.exe |
|
75 |
tag-french.bat tag-italian.bat train-tree-tagger.exe |
|
76 |
|
|
77 |
TreeTagger/cmd: |
|
78 |
mwl-lookup.perl tokenize.pl |
|
79 |
|
|
80 |
TreeTagger/lib: |
|
81 |
english-abbreviations german-abbreviations spanish-abbreviations |
|
82 |
french-abbreviations italian-abbreviations spanish-mwls |
|
83 |
|
|
84 |
|
|
85 |
Note that the TreeTagger comes without a graphical interface. You have |
|
86 |
to run it by entering a command in a command line window. If you prefer |
|
87 |
a graphical interface, try the one provided by Ciarán Ó Duibhín at |
|
88 |
http://www.smo.uhi.ac.uk/~oduibhin/oideasra/interfaces/winttinterface.htm |
|
89 |
|
tmp/org.txm.treetagger.core.win32/res/win32/README.txt (revision 1670) | ||
---|---|---|
1 |
|
|
2 |
/****************************************************************************/ |
|
3 |
/* How to use the TreeTagger */ |
|
4 |
/* */ |
|
5 |
/* Author: Helmut Schmid, CIS, Ludwig-Maximilians-Universität, Germany */ |
|
6 |
/****************************************************************************/ |
|
7 |
|
|
8 |
|
|
9 |
The TreeTagger consists of two programs: the training program creates |
|
10 |
a parameter file from a fullform lexicon and a handtagged corpus. The |
|
11 |
tagger program reads the parameter file and annotates the text with |
|
12 |
part of speech and lemma information. Both programs print information |
|
13 |
about their usage when they are called without arguments. |
|
14 |
|
|
15 |
|
|
16 |
Tagging |
|
17 |
------- |
|
18 |
|
|
19 |
Tagging is done with the *tree-tagger* program. |
|
20 |
|
|
21 |
The first argument is the name of a parameter file which was generated |
|
22 |
with the train-tree-tagger program. Parameter files generated on |
|
23 |
different platforms or with older versions of train-tree-tagger will |
|
24 |
not work. |
|
25 |
|
|
26 |
The second argument is the input file. It must be in one-word-per-line |
|
27 |
format, i.e. each line contains one token (word, punctuation character |
|
28 |
or parenthesis) and should not exceed 1000 characters. Tokens may contain |
|
29 |
blanks. It is possible to override the lexical information contained |
|
30 |
in the parameter file of the tagger by specifying a list of possible |
|
31 |
tags after the token. This list has to be preceded by a tab character |
|
32 |
and the elements are separated by tab characters. Pretagging could be |
|
33 |
used e.g. to ensure that certain text-specific expressions are tagged |
|
34 |
correctly. Clitics (like "'s", "'re", and "'d" in English or "-la" and |
|
35 |
"-t-elle" in French) have to be separated if they were separated in |
|
36 |
the training data. (The French and English parameter files available |
|
37 |
by ftp expect separation of clitics). |
|
38 |
|
|
39 |
Sample input file: |
|
40 |
He |
|
41 |
moved |
|
42 |
to |
|
43 |
New York City NP |
|
44 |
. |
|
45 |
|
|
46 |
|
|
47 |
The third argument is the name of the output file. The output is also |
|
48 |
in one-word-per-line format. Depending on the specified options, it |
|
49 |
will contain columns with tokens, tags and lemmas. If the third |
|
50 |
argument is missing, the output will be printed to standard output. If |
|
51 |
the second argument is missing, too, input is read from standard |
|
52 |
input. |
|
53 |
|
|
54 |
Options: |
|
55 |
|
|
56 |
-token: Prints the token as well. |
|
57 |
-lemma: Prints the lemma as well. |
|
58 |
-sgml: Don't tag SGML annotations, i.e. lines starting with '<' and ending |
|
59 |
with '>'. |
|
60 |
-threshold <p>: Print all tags with a probability higher than <p> times the |
|
61 |
probability of the best tag. |
|
62 |
-prob: Print tag probabilities (requires option -threshold) |
|
63 |
-no-unknown: Print the token rather than <unknown> for unknown lemmas |
|
64 |
-quiet: Don't print status messages |
|
65 |
-pt-with-lemma: If this option is specified, then each pretagging tag |
|
66 |
(see above) has to be followed by a whitespace and a lemma. |
|
67 |
-pt-with-prob: If this option is specified, then each pretagging tag |
|
68 |
(see above) has to be followed by whitespace and a tag probability |
|
69 |
value. If -pt-with-prob and -pt-with-lemma have been specified, |
|
70 |
then each pretagging tag is followed by a probability and a lemma |
|
71 |
in that order. |
|
72 |
-files f: Read the names of input and output files pairwise from the |
|
73 |
file f. The format of f is the lexicon file format described below. |
|
74 |
-lex f: Read auxiliary lexicon entries from the file f. |
|
75 |
-eos-tag <tag>: The SGML tag <tag> signals the end of a sentence. |
|
76 |
This option implies the option -sgml |
|
77 |
|
|
78 |
Some more exotic options: |
|
79 |
-proto: Print lexical information for each word |
|
80 |
The lexicon type is signalled by one of the characters |
|
81 |
f: The word was found in the full form lexicon. |
|
82 |
c: The word in lowercase was found in the lexicon |
|
83 |
h: The word contains a hyphen and the word following the hyphen was found |
|
84 |
in the full form lexicon; e.g. instead of "table-wine" only "wine" has |
|
85 |
been found. |
|
86 |
s: The word has been looked up in the suffix lexicon |
|
87 |
p: Tags have been assigned by pretagging. |
|
88 |
-gramotron: Same as -proto but with a different format |
|
89 |
-proto-with-prob: Same as -proto but with lexical tag probabilities |
|
90 |
-print-prob-tree: Print the transition probability tree and exit |
|
91 |
-eps <epsilon>: Value which is used to replace zero lexical frequencies. |
|
92 |
Zero frequencies occur when a word/tag pair is contained in the lexicon |
|
93 |
but not in the training corpus. The default is 0.1. |
|
94 |
-base: Use only lexical probabilities for tagging. This option is only |
|
95 |
useful to obtain a baseline result to which the actual tagger output is |
|
96 |
compared. |
|
97 |
|
|
98 |
|
|
99 |
|
|
100 |
Training |
|
101 |
-------- |
|
102 |
|
|
103 |
Training is done with the *train-tree-tagger* program. If the program is |
|
104 |
called without arguments, the following output is printed: |
|
105 |
|
|
106 |
USAGE: train-tree-tagger <lexicon> <open class file> <infile> <outfile> |
|
107 |
{-cl <context length>} {-dtg <min. decision tree gain>} |
|
108 |
{-ecw <eq. class weight>} {-atg <affix tree gain>} {-st <sent. tag>} |
|
109 |
|
|
110 |
Description of the command line arguments: |
|
111 |
* <lexicon>: name of a file which contains the fullform lexicon. Each line |
|
112 |
of the lexicon corresponds to one word form and contains the word form |
|
113 |
itself followed by a Tab character and a sequence of tag-lemma pairs. |
|
114 |
The tags and lemmata are separated by whitespace. |
|
115 |
|
|
116 |
Example: |
|
117 |
aback RB aback |
|
118 |
abacuses NNS abacus |
|
119 |
abandon VB abandon VBP abandon |
|
120 |
abandoned JJ abandoned VBD abandon VBN abandon |
|
121 |
abandoning VBG abandon |
|
122 |
|
|
123 |
Important: Ordinal and cardinal numbers which consist of digits |
|
124 |
should not be included in the lexicon. Otherwise, the tagger will |
|
125 |
not be able to learn how to tag numbers which are not listed in the |
|
126 |
lexicon. Numbers with unusual tags should be added to the lexicon, |
|
127 |
however. |
|
128 |
|
|
129 |
Remark: The tagger doesn't need the lemmata for tagging. If |
|
130 |
you do not have the lemma information or if you do not plan to |
|
131 |
annotate corpora with lemmas, you can replace the lemma with a dummy |
|
132 |
value, e.g. "-". |
|
133 |
|
|
134 |
* <open class file>: name of a file which contains a list of open class tags |
|
135 |
i.e. possible tags of unknown word forms. This information is needed to |
|
136 |
estimate likely tags of unknown words. This file would typically contain |
|
137 |
adverb, adjective, noun, proper name and perhaps verb tags, but not |
|
138 |
prepositions, determiners, pronouns or numbers. |
|
139 |
* <input file>: name of a file which contains tagged training data. The data |
|
140 |
must be in one-word-per-line format. This means that each line contains |
|
141 |
one token and one tag in that order separated by a tabulator. |
|
142 |
Punctuation marks are considered as tokens and must have been tagged as well. |
|
143 |
|
|
144 |
Example: |
|
145 |
Pierre NP |
|
146 |
Vinken NP |
|
147 |
, , |
|
148 |
61 CD |
|
149 |
years NNS |
|
150 |
|
|
151 |
* <output file>: name of the file in which the resulting tagger parameters |
|
152 |
are stored. |
|
153 |
|
|
154 |
|
|
155 |
The following parameters are optional: |
|
156 |
|
|
157 |
* -cl <context length>: number of preceding words forming the tagging |
|
158 |
context. The default is 2 which corresponds to a trigram context. For |
|
159 |
small training corpora and/or large tagsets, it could be useful to reduce |
|
160 |
this parameter to 1. |
|
161 |
* -dtg <min. decision tree gain>: Threshold - If the information gain at a |
|
162 |
leaf node of the decision tree is below this threshold, the node is deleted. |
|
163 |
The default value is 0.7. |
|
164 |
* -ecw <eq. class weight>: weight of the equivalence class based probability |
|
165 |
estimates. The default is 0.15. |
|
166 |
* -atg <affix tree gain>: Threshold - If the information gain at a leaf of an |
|
167 |
affix tree is below this threshold, it is deleted. The default is 1.2. |
|
168 |
* -st <sent. tag>: the end-of-sentence part-of-speech tag, i.e. the tag which |
|
169 |
is assigned to sentence punctuation like ".", "!", "?". |
|
170 |
Default is "SENT". It is important to set this option properly, if your |
|
171 |
tag for sentence punctuation is not "SENT". |
|
172 |
|
|
173 |
The accuracy of the TreeTagger usually improves a bit, if different |
|
174 |
settings of the above parameters are tested and the best combination |
|
175 |
is chosen. |
|
0 | 176 |
tmp/org.txm.treetagger.core.win32/res/win32/INSTALL.txt (revision 1670) | ||
---|---|---|
1 |
|
|
2 |
/****************************************************************************/ |
|
3 |
/* How to install the Windows version of the TreeTagger */ |
|
4 |
/* */ |
|
5 |
/* Author: Helmut Schmid, CIS, Ludwig-Maximilians-Universität, Germany */ |
|
6 |
/****************************************************************************/ |
|
7 |
|
|
8 |
This is the Windows distribution of the TreeTagger. |
|
9 |
|
|
10 |
It contains the following files: |
|
11 |
|
|
12 |
- tree-tagger.exe: the tagger program |
|
13 |
|
|
14 |
- train-tree-tagger.exe: the training program |
|
15 |
|
|
16 |
- utf8-tokenize.perl: A Perl script which transforms the tagger input |
|
17 |
into one-word-per-line format |
|
18 |
|
|
19 |
- *-abbreviations: abbreviation lists required by the tokenizer |
|
20 |
|
|
21 |
- tag-*.bat: batch files for different languages which call |
|
22 |
the tokeniser and the tagger |
|
23 |
|
|
24 |
- chunk-*.bat: batch files for POS tagging and chunking |
|
25 |
|
|
26 |
|
|
27 |
Installation |
|
28 |
------------ |
|
29 |
|
|
30 |
1. Install a Perl interpreter (if you have not already installed one). |
|
31 |
You can download a Perl interpreter for Windows for free at |
|
32 |
http://www.activestate.com/activeperl/ |
|
33 |
|
|
34 |
2. Move the TreeTagger directory to the root directory of drive C:. |
|
35 |
|
|
36 |
3. Download the PC parameter files for the languages you need, decompress |
|
37 |
them (e.g. using Winzip or 7zip) and move them to the subdirectory lib. |
|
38 |
Rename the parameter files to <language>-utf8.par |
|
39 |
Example: Rename french-par-linux-3.2-utf8.bin to french-utf8.par |
|
40 |
|
|
41 |
Non-UTF8 parameter files are not supported anymore. |
|
42 |
|
|
43 |
4. Add the path C:\TreeTagger\bin to the PATH environment variable. |
|
44 |
|
|
45 |
5. Open a shell and type the command |
|
46 |
set PATH=C:\TreeTagger\bin;%PATH% |
|
47 |
|
|
48 |
6. Change to the directory C:\TreeTagger |
|
49 |
|
|
50 |
7. Now you can test the tagger, e.g. by analyzing this file with the command |
|
51 |
tag-english INSTALL.txt |
|
52 |
|
|
53 |
If you install the TreeTagger in a different directory, you have to |
|
54 |
modify the first path in the batch files tag-*.bat. |
|
55 |
|
|
56 |
|
|
57 |
Michaela Atterer told me that she had difficulties installing the |
|
58 |
TreeTagger on a Windows XP system. She recommends the following |
|
59 |
work-around. |
|
60 |
|
|
61 |
4. Windows XP: |
|
62 |
-Right click on "My Computer" |
|
63 |
-Select the "Advanced" tab |
|
64 |
-Click on "Environment Variables" |
|
65 |
-click on New: enter PATH and C:\TreeTagger\bin\;%PATH% |
|
66 |
|
|
67 |
If the files have been unpacked into a single directory, you should |
|
68 |
restore the following directory structure: |
|
69 |
|
|
70 |
TreeTagger: |
|
71 |
INSTALL.txt README.txt bin cmd lib |
|
72 |
|
|
73 |
TreeTagger/bin: |
|
74 |
tag-english.bat tag-german.bat tag-spanish.bat tree-tagger.exe |
|
75 |
tag-french.bat tag-italian.bat train-tree-tagger.exe |
|
76 |
|
|
77 |
TreeTagger/cmd: |
|
78 |
mwl-lookup.perl tokenize.pl |
|
79 |
|
|
80 |
TreeTagger/lib: |
|
81 |
english-abbreviations german-abbreviations spanish-abbreviations |
|
82 |
french-abbreviations italian-abbreviations spanish-mwls |
|
83 |
|
|
84 |
|
|
85 |
Note that the TreeTagger comes without a graphical interface. You have |
|
86 |
to run it by entering a command in a command line window. If you prefer |
|
87 |
a graphical interface, try the one provided by Ciarán Ó Duibhín at |
|
88 |
http://www.smo.uhi.ac.uk/~oduibhin/oideasra/interfaces/winttinterface.htm |
|
89 |
|
|
0 | 90 |
Formats disponibles : Unified diff