Révision 1670
| tmp/TXMReleasePlugins.site/site.xml (revision 1670) | ||
|---|---|---|
| 9 | 9 |
<feature url="features/org.txm.wordcloud.feature_1.0.0.1660.jar" id="org.txm.wordcloud.feature" version="1.0.0.1660"> |
| 10 | 10 |
<category name="Commands"/> |
| 11 | 11 |
</feature> |
| 12 |
<feature url="features/org.txm.treetagger.binaries.feature_1.0.0.1660.jar" id="org.txm.treetagger.binaries.feature" version="1.0.0.1660" os="linux,macosx,win32" ws="cocoa,gtk,win32">
|
|
| 12 |
<feature url="features/org.txm.treetagger.binaries.feature_1.0.0.1669.jar" id="org.txm.treetagger.binaries.feature" version="1.0.0.1669" os="" ws="">
|
|
| 13 | 13 |
<category name="Annotation"/> |
| 14 | 14 |
</feature> |
| 15 | 15 |
<feature url="features/org.txm.treetagger.models.feature_1.0.0.1660.jar" id="org.txm.treetagger.models.feature" version="1.0.0.1660"> |
| tmp/org.txm.treetagger.core.linux/META-INF/MANIFEST.MF (revision 1670) | ||
|---|---|---|
| 5 | 5 |
Bundle-Version: 1.0.0.qualifier |
| 6 | 6 |
Fragment-Host: org.txm.treetagger.core;bundle-version="1.0.0" |
| 7 | 7 |
Bundle-RequiredExecutionEnvironment: JavaSE-1.7 |
| 8 |
Eclipse-PlatformFilter: (osgi.os=linux) |
|
| tmp/org.txm.annotation.rcp/build.properties (revision 1670) | ||
|---|---|---|
| 1 | 1 |
#Fri Jul 06 10:25:03 CEST 2018 |
| 2 | 2 |
output..=bin/ |
| 3 |
bin.includes=META-INF/,.,plugin.xml,icons/,OSGI-INF/l10n/bundle.properties |
|
| 3 |
bin.includes = META-INF/,\ |
|
| 4 |
.,\ |
|
| 5 |
plugin.xml,\ |
|
| 6 |
icons/,\ |
|
| 7 |
OSGI-INF/l10n/bundle.properties,\ |
|
| 8 |
OSGI-INF/l10n/bundle_fr.properties,\ |
|
| 9 |
OSGI-INF/l10n/bundle_ru.properties |
|
| 4 | 10 |
source..=src/ |
| 5 | 11 |
qualifier=svn |
| tmp/org.txm.annotation.kr.rcp/build.properties (revision 1670) | ||
|---|---|---|
| 3 | 3 |
bin.includes = META-INF/,\ |
| 4 | 4 |
.,\ |
| 5 | 5 |
plugin.xml,\ |
| 6 |
OSGI-INF/l10n/bundle.properties,\
|
|
| 7 |
icons/
|
|
| 6 |
icons/,\
|
|
| 7 |
OSGI-INF/
|
|
| 8 | 8 |
source..=src/ |
| 9 | 9 |
qualifier=svn |
| tmp/org.txm.annotation.kr.core/build.properties (revision 1670) | ||
|---|---|---|
| 1 | 1 |
#Fri Jul 06 10:25:03 CEST 2018 |
| 2 | 2 |
output..=bin/ |
| 3 |
bin.includes=META-INF/,.,plugin.xml |
|
| 3 |
bin.includes = META-INF/,\ |
|
| 4 |
.,\ |
|
| 5 |
plugin.xml,\ |
|
| 6 |
OSGI-INF/ |
|
| 4 | 7 |
source..=src/ |
| 5 | 8 |
qualifier=svn |
| tmp/org.txm.treetagger.binaries.feature/feature.xml (revision 1670) | ||
|---|---|---|
| 3 | 3 |
id="org.txm.treetagger.binaries.feature" |
| 4 | 4 |
label="TreeTagger software" |
| 5 | 5 |
version="1.0.0.qualifier" |
| 6 |
provider-name="Textometrie.org" |
|
| 7 |
os="linux,macosx,win32" |
|
| 8 |
ws="cocoa,gtk,win32"> |
|
| 6 |
provider-name="Textometrie.org"> |
|
| 9 | 7 |
|
| 10 | 8 |
<description url="http://www.example.com/description"> |
| 11 | 9 |
Install TreeTagger software / Installation du logiciel TreeTagger |
| ... | ... | |
| 71 | 69 |
|
| 72 | 70 |
<plugin |
| 73 | 71 |
id="org.txm.treetagger.core.linux" |
| 74 |
os="linux" |
|
| 75 |
ws="gtk" |
|
| 76 | 72 |
download-size="0" |
| 77 | 73 |
install-size="0" |
| 78 | 74 |
version="0.0.0" |
| ... | ... | |
| 80 | 76 |
|
| 81 | 77 |
<plugin |
| 82 | 78 |
id="org.txm.treetagger.core.macosx" |
| 83 |
os="macosx" |
|
| 84 |
ws="cocoa" |
|
| 85 | 79 |
download-size="0" |
| 86 | 80 |
install-size="0" |
| 87 | 81 |
version="0.0.0" |
| ... | ... | |
| 89 | 83 |
|
| 90 | 84 |
<plugin |
| 91 | 85 |
id="org.txm.treetagger.core.win32" |
| 92 |
os="win32" |
|
| 93 |
ws="win32" |
|
| 94 | 86 |
download-size="0" |
| 95 | 87 |
install-size="0" |
| 96 | 88 |
version="0.0.0" |
| tmp/org.txm.treetagger.core/src/org/txm/treetagger/core/preferences/TreeTaggerPreferences.java (revision 1670) | ||
|---|---|---|
| 148 | 148 |
String osname = System.getProperty("os.name").toLowerCase();
|
| 149 | 149 |
if (osname.contains("windows")) {
|
| 150 | 150 |
osname = "win32"; |
| 151 |
} else if (osname.contains("macosx")) {
|
|
| 151 |
} else if (osname.contains("mac os x")) {
|
|
| 152 | 152 |
osname = "macosx"; |
| 153 | 153 |
} else {
|
| 154 | 154 |
osname = "linux"; |
| tmp/org.txm.treetagger.core.macosx/META-INF/MANIFEST.MF (revision 1670) | ||
|---|---|---|
| 6 | 6 |
Bundle-Version: 1.0.0.qualifier |
| 7 | 7 |
Fragment-Host: org.txm.treetagger.core |
| 8 | 8 |
Bundle-RequiredExecutionEnvironment: JavaSE-1.7 |
| 9 |
Eclipse-PlatformFilter: (osgi.os=macosx) |
|
| tmp/org.txm.treetagger.core.win32/META-INF/MANIFEST.MF (revision 1670) | ||
|---|---|---|
| 5 | 5 |
Bundle-Version: 1.0.0.qualifier |
| 6 | 6 |
Fragment-Host: org.txm.treetagger.core;bundle-version="1.0.0" |
| 7 | 7 |
Bundle-RequiredExecutionEnvironment: JavaSE-1.7 |
| 8 |
Eclipse-PlatformFilter: (osgi.os=win32) |
|
| tmp/org.txm.treetagger.core.win32/res/win/README.txt (revision 1670) | ||
|---|---|---|
| 1 |
|
|
| 2 |
/****************************************************************************/ |
|
| 3 |
/* How to use the TreeTagger */ |
|
| 4 |
/* */ |
|
| 5 |
/* Author: Helmut Schmid, CIS, Ludwig-Maximilians-Universität, Germany */ |
|
| 6 |
/****************************************************************************/ |
|
| 7 |
|
|
| 8 |
|
|
| 9 |
The TreeTagger consists of two programs: the training program creates |
|
| 10 |
a parameter file from a fullform lexicon and a handtagged corpus. The |
|
| 11 |
tagger program reads the parameter file and annotates the text with |
|
| 12 |
part of speech and lemma information. Both programs print information |
|
| 13 |
about their usage when they are called without arguments. |
|
| 14 |
|
|
| 15 |
|
|
| 16 |
Tagging |
|
| 17 |
------- |
|
| 18 |
|
|
| 19 |
Tagging is done with the *tree-tagger* program. |
|
| 20 |
|
|
| 21 |
The first argument is the name of a parameter file which was generated |
|
| 22 |
with the train-tree-tagger program. Parameter files generated on |
|
| 23 |
different platforms or with older versions of train-tree-tagger will |
|
| 24 |
not work. |
|
| 25 |
|
|
| 26 |
The second argument is the input file. It must be in one-word-per-line |
|
| 27 |
format, i.e. each line contains one token (word, punctuation character |
|
| 28 |
or parenthesis) and should not exceed 1000 characters. Tokens may contain |
|
| 29 |
blanks. It is possible to override the lexical information contained |
|
| 30 |
in the parameter file of the tagger by specifying a list of possible |
|
| 31 |
tags after the token. This list has to be preceded by a tab character |
|
| 32 |
and the elements are separated by tab characters. Pretagging could be |
|
| 33 |
used e.g. to ensure that certain text-specific expressions are tagged |
|
| 34 |
correctly. Clitics (like "'s", "'re", and "'d" in English or "-la" and |
|
| 35 |
"-t-elle" in French) have to be separated if they were separated in |
|
| 36 |
the training data. (The French and English parameter files available |
|
| 37 |
by ftp expect separation of clitics). |
|
| 38 |
|
|
| 39 |
Sample input file: |
|
| 40 |
He |
|
| 41 |
moved |
|
| 42 |
to |
|
| 43 |
New York City NP |
|
| 44 |
. |
|
| 45 |
|
|
| 46 |
|
|
| 47 |
The third argument is the name of the output file. The output is also |
|
| 48 |
in one-word-per-line format. Depending on the specified options, it |
|
| 49 |
will contain columns with tokens, tags and lemmas. If the third |
|
| 50 |
argument is missing, the output will be printed to standard output. If |
|
| 51 |
the second argument is missing, too, input is read from standard |
|
| 52 |
input. |
|
| 53 |
|
|
| 54 |
Options: |
|
| 55 |
|
|
| 56 |
-token: Prints the token as well. |
|
| 57 |
-lemma: Prints the lemma as well. |
|
| 58 |
-sgml: Don't tag SGML annotations, i.e. lines starting with '<' and ending |
|
| 59 |
with '>'. |
|
| 60 |
-threshold <p>: Print all tags with a probability higher than <p> times the |
|
| 61 |
probability of the best tag. |
|
| 62 |
-prob: Print tag probabilities (requires option -threshold) |
|
| 63 |
-no-unknown: Print the token rather than <unknown> for unknown lemmas |
|
| 64 |
-quiet: Don't print status messages |
|
| 65 |
-pt-with-lemma: If this option is specified, then each pretagging tag |
|
| 66 |
(see above) has to be followed by a whitespace and a lemma. |
|
| 67 |
-pt-with-prob: If this option is specified, then each pretagging tag |
|
| 68 |
(see above) has to be followed by whitespace and a tag probability |
|
| 69 |
value. If -pt-with-prob and -pt-with-lemma have been specified, |
|
| 70 |
then each pretagging tag is followed by a probability and a lemma |
|
| 71 |
in that order. |
|
| 72 |
-files f: Read the names of input and output files pairwise from the |
|
| 73 |
file f. The format of f is the lexicon file format described below. |
|
| 74 |
-lex f: Read auxiliary lexicon entries from the file f. |
|
| 75 |
-eos-tag <tag>: The SGML tag <tag> signals the end of a sentence. |
|
| 76 |
This option implies the option -sgml |
|
| 77 |
|
|
| 78 |
Some more exotic options: |
|
| 79 |
-proto: Print lexical information for each word |
|
| 80 |
The lexicon type is signalled by one of the characters |
|
| 81 |
f: The word was found in the full form lexicon. |
|
| 82 |
c: The word in lowercase was found in the lexicon |
|
| 83 |
h: The word contains a hyphen and the word following the hyphen was found |
|
| 84 |
in the full form lexicon; e.g. instead of "table-wine" only "wine" has |
|
| 85 |
been found. |
|
| 86 |
s: The word has been looked up in the suffix lexicon |
|
| 87 |
p: Tags have been assigned by pretagging. |
|
| 88 |
-gramotron: Same as -proto but with a different format |
|
| 89 |
-proto-with-prob: Same as -proto but with lexical tag probabilities |
|
| 90 |
-print-prob-tree: Print the transition probability tree and exit |
|
| 91 |
-eps <epsilon>: Value which is used to replace zero lexical frequencies. |
|
| 92 |
Zero frequencies occur when a word/tag pair is contained in the lexicon |
|
| 93 |
but not in the training corpus. The default is 0.1. |
|
| 94 |
-base: Use only lexical probabilities for tagging. This option is only |
|
| 95 |
useful to obtain a baseline result to which the actual tagger output is |
|
| 96 |
compared. |
|
| 97 |
|
|
| 98 |
|
|
| 99 |
|
|
| 100 |
Training |
|
| 101 |
-------- |
|
| 102 |
|
|
| 103 |
Training is done with the *train-tree-tagger* program. If the program is |
|
| 104 |
called without arguments, the following output is printed: |
|
| 105 |
|
|
| 106 |
USAGE: train-tree-tagger <lexicon> <open class file> <infile> <outfile> |
|
| 107 |
{-cl <context length>} {-dtg <min. decision tree gain>}
|
|
| 108 |
{-ecw <eq. class weight>} {-atg <affix tree gain>} {-st <sent. tag>}
|
|
| 109 |
|
|
| 110 |
Description of the command line arguments: |
|
| 111 |
* <lexicon>: name of a file which contains the fullform lexicon. Each line |
|
| 112 |
of the lexicon corresponds to one word form and contains the word form |
|
| 113 |
itself followed by a Tab character and a sequence of tag-lemma pairs. |
|
| 114 |
The tags and lemmata are separated by whitespace. |
|
| 115 |
|
|
| 116 |
Example: |
|
| 117 |
aback RB aback |
|
| 118 |
abacuses NNS abacus |
|
| 119 |
abandon VB abandon VBP abandon |
|
| 120 |
abandoned JJ abandoned VBD abandon VBN abandon |
|
| 121 |
abandoning VBG abandon |
|
| 122 |
|
|
| 123 |
Important: Ordinal and cardinal numbers which consist of digits |
|
| 124 |
should not be included in the lexicon. Otherwise, the tagger will |
|
| 125 |
not be able to learn how to tag numbers which are not listed in the |
|
| 126 |
lexicon. Numbers with unusual tags should be added to the lexicon, |
|
| 127 |
however. |
|
| 128 |
|
|
| 129 |
Remark: The tagger doesn't need the lemmata for tagging. If |
|
| 130 |
you do not have the lemma information or if you do not plan to |
|
| 131 |
annotate corpora with lemmas, you can replace the lemma with a dummy |
|
| 132 |
value, e.g. "-". |
|
| 133 |
|
|
| 134 |
* <open class file>: name of a file which contains a list of open class tags |
|
| 135 |
i.e. possible tags of unknown word forms. This information is needed to |
|
| 136 |
estimate likely tags of unknown words. This file would typically contain |
|
| 137 |
adverb, adjective, noun, proper name and perhaps verb tags, but not |
|
| 138 |
prepositions, determiners, pronouns or numbers. |
|
| 139 |
* <input file>: name of a file which contains tagged training data. The data |
|
| 140 |
must be in one-word-per-line format. This means that each line contains |
|
| 141 |
one token and one tag in that order separated by a tabulator. |
|
| 142 |
Punctuation marks are considered as tokens and must have been tagged as well. |
|
| 143 |
|
|
| 144 |
Example: |
|
| 145 |
Pierre NP |
|
| 146 |
Vinken NP |
|
| 147 |
, , |
|
| 148 |
61 CD |
|
| 149 |
years NNS |
|
| 150 |
|
|
| 151 |
* <output file>: name of the file in which the resulting tagger parameters |
|
| 152 |
are stored. |
|
| 153 |
|
|
| 154 |
|
|
| 155 |
The following parameters are optional: |
|
| 156 |
|
|
| 157 |
* -cl <context length>: number of preceding words forming the tagging |
|
| 158 |
context. The default is 2 which corresponds to a trigram context. For |
|
| 159 |
small training corpora and/or large tagsets, it could be useful to reduce |
|
| 160 |
this parameter to 1. |
|
| 161 |
* -dtg <min. decision tree gain>: Threshold - If the information gain at a |
|
| 162 |
leaf node of the decision tree is below this threshold, the node is deleted. |
|
| 163 |
The default value is 0.7. |
|
| 164 |
* -ecw <eq. class weight>: weight of the equivalence class based probability |
|
| 165 |
estimates. The default is 0.15. |
|
| 166 |
* -atg <affix tree gain>: Threshold - If the information gain at a leaf of an |
|
| 167 |
affix tree is below this threshold, it is deleted. The default is 1.2. |
|
| 168 |
* -st <sent. tag>: the end-of-sentence part-of-speech tag, i.e. the tag which |
|
| 169 |
is assigned to sentence punctuation like ".", "!", "?". |
|
| 170 |
Default is "SENT". It is important to set this option properly, if your |
|
| 171 |
tag for sentence punctuation is not "SENT". |
|
| 172 |
|
|
| 173 |
The accuracy of the TreeTagger usually improves a bit, if different |
|
| 174 |
settings of the above parameters are tested and the best combination |
|
| 175 |
is chosen. |
|
| tmp/org.txm.treetagger.core.win32/res/win/INSTALL.txt (revision 1670) | ||
|---|---|---|
| 1 |
|
|
| 2 |
/****************************************************************************/ |
|
| 3 |
/* How to install the Windows version of the TreeTagger */ |
|
| 4 |
/* */ |
|
| 5 |
/* Author: Helmut Schmid, CIS, Ludwig-Maximilians-Universität, Germany */ |
|
| 6 |
/****************************************************************************/ |
|
| 7 |
|
|
| 8 |
This is the Windows distribution of the TreeTagger. |
|
| 9 |
|
|
| 10 |
It contains the following files: |
|
| 11 |
|
|
| 12 |
- tree-tagger.exe: the tagger program |
|
| 13 |
|
|
| 14 |
- train-tree-tagger.exe: the training program |
|
| 15 |
|
|
| 16 |
- utf8-tokenize.perl: A Perl script which transforms the tagger input |
|
| 17 |
into one-word-per-line format |
|
| 18 |
|
|
| 19 |
- *-abbreviations: abbreviation lists required by the tokenizer |
|
| 20 |
|
|
| 21 |
- tag-*.bat: batch files for different languages which call |
|
| 22 |
the tokeniser and the tagger |
|
| 23 |
|
|
| 24 |
- chunk-*.bat: batch files for POS tagging and chunking |
|
| 25 |
|
|
| 26 |
|
|
| 27 |
Installation |
|
| 28 |
------------ |
|
| 29 |
|
|
| 30 |
1. Install a Perl interpreter (if you have not already installed one). |
|
| 31 |
You can download a Perl interpreter for Windows for free at |
|
| 32 |
http://www.activestate.com/activeperl/ |
|
| 33 |
|
|
| 34 |
2. Move the TreeTagger directory to the root directory of drive C:. |
|
| 35 |
|
|
| 36 |
3. Download the PC parameter files for the languages you need, decompress |
|
| 37 |
them (e.g. using Winzip or 7zip) and move them to the subdirectory lib. |
|
| 38 |
Rename the parameter files to <language>-utf8.par |
|
| 39 |
Example: Rename french-par-linux-3.2-utf8.bin to french-utf8.par |
|
| 40 |
|
|
| 41 |
Non-UTF8 parameter files are not supported anymore. |
|
| 42 |
|
|
| 43 |
4. Add the path C:\TreeTagger\bin to the PATH environment variable. |
|
| 44 |
|
|
| 45 |
5. Open a shell and type the command |
|
| 46 |
set PATH=C:\TreeTagger\bin;%PATH% |
|
| 47 |
|
|
| 48 |
6. Change to the directory C:\TreeTagger |
|
| 49 |
|
|
| 50 |
7. Now you can test the tagger, e.g. by analyzing this file with the command |
|
| 51 |
tag-english INSTALL.txt |
|
| 52 |
|
|
| 53 |
If you install the TreeTagger in a different directory, you have to |
|
| 54 |
modify the first path in the batch files tag-*.bat. |
|
| 55 |
|
|
| 56 |
|
|
| 57 |
Michaela Atterer told me that she had difficulties installing the |
|
| 58 |
TreeTagger on a Windows XP system. She recommends the following |
|
| 59 |
work-around. |
|
| 60 |
|
|
| 61 |
4. Windows XP: |
|
| 62 |
-Right click on "My Computer" |
|
| 63 |
-Select the "Advanced" tab |
|
| 64 |
-Click on "Environment Variables" |
|
| 65 |
-click on New: enter PATH and C:\TreeTagger\bin\;%PATH% |
|
| 66 |
|
|
| 67 |
If the files have been unpacked into a single directory, you should |
|
| 68 |
restore the following directory structure: |
|
| 69 |
|
|
| 70 |
TreeTagger: |
|
| 71 |
INSTALL.txt README.txt bin cmd lib |
|
| 72 |
|
|
| 73 |
TreeTagger/bin: |
|
| 74 |
tag-english.bat tag-german.bat tag-spanish.bat tree-tagger.exe |
|
| 75 |
tag-french.bat tag-italian.bat train-tree-tagger.exe |
|
| 76 |
|
|
| 77 |
TreeTagger/cmd: |
|
| 78 |
mwl-lookup.perl tokenize.pl |
|
| 79 |
|
|
| 80 |
TreeTagger/lib: |
|
| 81 |
english-abbreviations german-abbreviations spanish-abbreviations |
|
| 82 |
french-abbreviations italian-abbreviations spanish-mwls |
|
| 83 |
|
|
| 84 |
|
|
| 85 |
Note that the TreeTagger comes without a graphical interface. You have |
|
| 86 |
to run it by entering a command in a command line window. If you prefer |
|
| 87 |
a graphical interface, try the one provided by Ciarán Ó Duibhín at |
|
| 88 |
http://www.smo.uhi.ac.uk/~oduibhin/oideasra/interfaces/winttinterface.htm |
|
| 89 |
|
|
| tmp/org.txm.treetagger.core.win32/res/win32/README.txt (revision 1670) | ||
|---|---|---|
| 1 |
|
|
| 2 |
/****************************************************************************/ |
|
| 3 |
/* How to use the TreeTagger */ |
|
| 4 |
/* */ |
|
| 5 |
/* Author: Helmut Schmid, CIS, Ludwig-Maximilians-Universität, Germany */ |
|
| 6 |
/****************************************************************************/ |
|
| 7 |
|
|
| 8 |
|
|
| 9 |
The TreeTagger consists of two programs: the training program creates |
|
| 10 |
a parameter file from a fullform lexicon and a handtagged corpus. The |
|
| 11 |
tagger program reads the parameter file and annotates the text with |
|
| 12 |
part of speech and lemma information. Both programs print information |
|
| 13 |
about their usage when they are called without arguments. |
|
| 14 |
|
|
| 15 |
|
|
| 16 |
Tagging |
|
| 17 |
------- |
|
| 18 |
|
|
| 19 |
Tagging is done with the *tree-tagger* program. |
|
| 20 |
|
|
| 21 |
The first argument is the name of a parameter file which was generated |
|
| 22 |
with the train-tree-tagger program. Parameter files generated on |
|
| 23 |
different platforms or with older versions of train-tree-tagger will |
|
| 24 |
not work. |
|
| 25 |
|
|
| 26 |
The second argument is the input file. It must be in one-word-per-line |
|
| 27 |
format, i.e. each line contains one token (word, punctuation character |
|
| 28 |
or parenthesis) and should not exceed 1000 characters. Tokens may contain |
|
| 29 |
blanks. It is possible to override the lexical information contained |
|
| 30 |
in the parameter file of the tagger by specifying a list of possible |
|
| 31 |
tags after the token. This list has to be preceded by a tab character |
|
| 32 |
and the elements are separated by tab characters. Pretagging could be |
|
| 33 |
used e.g. to ensure that certain text-specific expressions are tagged |
|
| 34 |
correctly. Clitics (like "'s", "'re", and "'d" in English or "-la" and |
|
| 35 |
"-t-elle" in French) have to be separated if they were separated in |
|
| 36 |
the training data. (The French and English parameter files available |
|
| 37 |
by ftp expect separation of clitics). |
|
| 38 |
|
|
| 39 |
Sample input file: |
|
| 40 |
He |
|
| 41 |
moved |
|
| 42 |
to |
|
| 43 |
New York City NP |
|
| 44 |
. |
|
| 45 |
|
|
| 46 |
|
|
| 47 |
The third argument is the name of the output file. The output is also |
|
| 48 |
in one-word-per-line format. Depending on the specified options, it |
|
| 49 |
will contain columns with tokens, tags and lemmas. If the third |
|
| 50 |
argument is missing, the output will be printed to standard output. If |
|
| 51 |
the second argument is missing, too, input is read from standard |
|
| 52 |
input. |
|
| 53 |
|
|
| 54 |
Options: |
|
| 55 |
|
|
| 56 |
-token: Prints the token as well. |
|
| 57 |
-lemma: Prints the lemma as well. |
|
| 58 |
-sgml: Don't tag SGML annotations, i.e. lines starting with '<' and ending |
|
| 59 |
with '>'. |
|
| 60 |
-threshold <p>: Print all tags with a probability higher than <p> times the |
|
| 61 |
probability of the best tag. |
|
| 62 |
-prob: Print tag probabilities (requires option -threshold) |
|
| 63 |
-no-unknown: Print the token rather than <unknown> for unknown lemmas |
|
| 64 |
-quiet: Don't print status messages |
|
| 65 |
-pt-with-lemma: If this option is specified, then each pretagging tag |
|
| 66 |
(see above) has to be followed by a whitespace and a lemma. |
|
| 67 |
-pt-with-prob: If this option is specified, then each pretagging tag |
|
| 68 |
(see above) has to be followed by whitespace and a tag probability |
|
| 69 |
value. If -pt-with-prob and -pt-with-lemma have been specified, |
|
| 70 |
then each pretagging tag is followed by a probability and a lemma |
|
| 71 |
in that order. |
|
| 72 |
-files f: Read the names of input and output files pairwise from the |
|
| 73 |
file f. The format of f is the lexicon file format described below. |
|
| 74 |
-lex f: Read auxiliary lexicon entries from the file f. |
|
| 75 |
-eos-tag <tag>: The SGML tag <tag> signals the end of a sentence. |
|
| 76 |
This option implies the option -sgml |
|
| 77 |
|
|
| 78 |
Some more exotic options: |
|
| 79 |
-proto: Print lexical information for each word |
|
| 80 |
The lexicon type is signalled by one of the characters |
|
| 81 |
f: The word was found in the full form lexicon. |
|
| 82 |
c: The word in lowercase was found in the lexicon |
|
| 83 |
h: The word contains a hyphen and the word following the hyphen was found |
|
| 84 |
in the full form lexicon; e.g. instead of "table-wine" only "wine" has |
|
| 85 |
been found. |
|
| 86 |
s: The word has been looked up in the suffix lexicon |
|
| 87 |
p: Tags have been assigned by pretagging. |
|
| 88 |
-gramotron: Same as -proto but with a different format |
|
| 89 |
-proto-with-prob: Same as -proto but with lexical tag probabilities |
|
| 90 |
-print-prob-tree: Print the transition probability tree and exit |
|
| 91 |
-eps <epsilon>: Value which is used to replace zero lexical frequencies. |
|
| 92 |
Zero frequencies occur when a word/tag pair is contained in the lexicon |
|
| 93 |
but not in the training corpus. The default is 0.1. |
|
| 94 |
-base: Use only lexical probabilities for tagging. This option is only |
|
| 95 |
useful to obtain a baseline result to which the actual tagger output is |
|
| 96 |
compared. |
|
| 97 |
|
|
| 98 |
|
|
| 99 |
|
|
| 100 |
Training |
|
| 101 |
-------- |
|
| 102 |
|
|
| 103 |
Training is done with the *train-tree-tagger* program. If the program is |
|
| 104 |
called without arguments, the following output is printed: |
|
| 105 |
|
|
| 106 |
USAGE: train-tree-tagger <lexicon> <open class file> <infile> <outfile> |
|
| 107 |
{-cl <context length>} {-dtg <min. decision tree gain>}
|
|
| 108 |
{-ecw <eq. class weight>} {-atg <affix tree gain>} {-st <sent. tag>}
|
|
| 109 |
|
|
| 110 |
Description of the command line arguments: |
|
| 111 |
* <lexicon>: name of a file which contains the fullform lexicon. Each line |
|
| 112 |
of the lexicon corresponds to one word form and contains the word form |
|
| 113 |
itself followed by a Tab character and a sequence of tag-lemma pairs. |
|
| 114 |
The tags and lemmata are separated by whitespace. |
|
| 115 |
|
|
| 116 |
Example: |
|
| 117 |
aback RB aback |
|
| 118 |
abacuses NNS abacus |
|
| 119 |
abandon VB abandon VBP abandon |
|
| 120 |
abandoned JJ abandoned VBD abandon VBN abandon |
|
| 121 |
abandoning VBG abandon |
|
| 122 |
|
|
| 123 |
Important: Ordinal and cardinal numbers which consist of digits |
|
| 124 |
should not be included in the lexicon. Otherwise, the tagger will |
|
| 125 |
not be able to learn how to tag numbers which are not listed in the |
|
| 126 |
lexicon. Numbers with unusual tags should be added to the lexicon, |
|
| 127 |
however. |
|
| 128 |
|
|
| 129 |
Remark: The tagger doesn't need the lemmata for tagging. If |
|
| 130 |
you do not have the lemma information or if you do not plan to |
|
| 131 |
annotate corpora with lemmas, you can replace the lemma with a dummy |
|
| 132 |
value, e.g. "-". |
|
| 133 |
|
|
| 134 |
* <open class file>: name of a file which contains a list of open class tags |
|
| 135 |
i.e. possible tags of unknown word forms. This information is needed to |
|
| 136 |
estimate likely tags of unknown words. This file would typically contain |
|
| 137 |
adverb, adjective, noun, proper name and perhaps verb tags, but not |
|
| 138 |
prepositions, determiners, pronouns or numbers. |
|
| 139 |
* <input file>: name of a file which contains tagged training data. The data |
|
| 140 |
must be in one-word-per-line format. This means that each line contains |
|
| 141 |
one token and one tag in that order separated by a tabulator. |
|
| 142 |
Punctuation marks are considered as tokens and must have been tagged as well. |
|
| 143 |
|
|
| 144 |
Example: |
|
| 145 |
Pierre NP |
|
| 146 |
Vinken NP |
|
| 147 |
, , |
|
| 148 |
61 CD |
|
| 149 |
years NNS |
|
| 150 |
|
|
| 151 |
* <output file>: name of the file in which the resulting tagger parameters |
|
| 152 |
are stored. |
|
| 153 |
|
|
| 154 |
|
|
| 155 |
The following parameters are optional: |
|
| 156 |
|
|
| 157 |
* -cl <context length>: number of preceding words forming the tagging |
|
| 158 |
context. The default is 2 which corresponds to a trigram context. For |
|
| 159 |
small training corpora and/or large tagsets, it could be useful to reduce |
|
| 160 |
this parameter to 1. |
|
| 161 |
* -dtg <min. decision tree gain>: Threshold - If the information gain at a |
|
| 162 |
leaf node of the decision tree is below this threshold, the node is deleted. |
|
| 163 |
The default value is 0.7. |
|
| 164 |
* -ecw <eq. class weight>: weight of the equivalence class based probability |
|
| 165 |
estimates. The default is 0.15. |
|
| 166 |
* -atg <affix tree gain>: Threshold - If the information gain at a leaf of an |
|
| 167 |
affix tree is below this threshold, it is deleted. The default is 1.2. |
|
| 168 |
* -st <sent. tag>: the end-of-sentence part-of-speech tag, i.e. the tag which |
|
| 169 |
is assigned to sentence punctuation like ".", "!", "?". |
|
| 170 |
Default is "SENT". It is important to set this option properly, if your |
|
| 171 |
tag for sentence punctuation is not "SENT". |
|
| 172 |
|
|
| 173 |
The accuracy of the TreeTagger usually improves a bit, if different |
|
| 174 |
settings of the above parameters are tested and the best combination |
|
| 175 |
is chosen. |
|
| 0 | 176 | |
| tmp/org.txm.treetagger.core.win32/res/win32/INSTALL.txt (revision 1670) | ||
|---|---|---|
| 1 |
|
|
| 2 |
/****************************************************************************/ |
|
| 3 |
/* How to install the Windows version of the TreeTagger */ |
|
| 4 |
/* */ |
|
| 5 |
/* Author: Helmut Schmid, CIS, Ludwig-Maximilians-Universität, Germany */ |
|
| 6 |
/****************************************************************************/ |
|
| 7 |
|
|
| 8 |
This is the Windows distribution of the TreeTagger. |
|
| 9 |
|
|
| 10 |
It contains the following files: |
|
| 11 |
|
|
| 12 |
- tree-tagger.exe: the tagger program |
|
| 13 |
|
|
| 14 |
- train-tree-tagger.exe: the training program |
|
| 15 |
|
|
| 16 |
- utf8-tokenize.perl: A Perl script which transforms the tagger input |
|
| 17 |
into one-word-per-line format |
|
| 18 |
|
|
| 19 |
- *-abbreviations: abbreviation lists required by the tokenizer |
|
| 20 |
|
|
| 21 |
- tag-*.bat: batch files for different languages which call |
|
| 22 |
the tokeniser and the tagger |
|
| 23 |
|
|
| 24 |
- chunk-*.bat: batch files for POS tagging and chunking |
|
| 25 |
|
|
| 26 |
|
|
| 27 |
Installation |
|
| 28 |
------------ |
|
| 29 |
|
|
| 30 |
1. Install a Perl interpreter (if you have not already installed one). |
|
| 31 |
You can download a Perl interpreter for Windows for free at |
|
| 32 |
http://www.activestate.com/activeperl/ |
|
| 33 |
|
|
| 34 |
2. Move the TreeTagger directory to the root directory of drive C:. |
|
| 35 |
|
|
| 36 |
3. Download the PC parameter files for the languages you need, decompress |
|
| 37 |
them (e.g. using Winzip or 7zip) and move them to the subdirectory lib. |
|
| 38 |
Rename the parameter files to <language>-utf8.par |
|
| 39 |
Example: Rename french-par-linux-3.2-utf8.bin to french-utf8.par |
|
| 40 |
|
|
| 41 |
Non-UTF8 parameter files are not supported anymore. |
|
| 42 |
|
|
| 43 |
4. Add the path C:\TreeTagger\bin to the PATH environment variable. |
|
| 44 |
|
|
| 45 |
5. Open a shell and type the command |
|
| 46 |
set PATH=C:\TreeTagger\bin;%PATH% |
|
| 47 |
|
|
| 48 |
6. Change to the directory C:\TreeTagger |
|
| 49 |
|
|
| 50 |
7. Now you can test the tagger, e.g. by analyzing this file with the command |
|
| 51 |
tag-english INSTALL.txt |
|
| 52 |
|
|
| 53 |
If you install the TreeTagger in a different directory, you have to |
|
| 54 |
modify the first path in the batch files tag-*.bat. |
|
| 55 |
|
|
| 56 |
|
|
| 57 |
Michaela Atterer told me that she had difficulties installing the |
|
| 58 |
TreeTagger on a Windows XP system. She recommends the following |
|
| 59 |
work-around. |
|
| 60 |
|
|
| 61 |
4. Windows XP: |
|
| 62 |
-Right click on "My Computer" |
|
| 63 |
-Select the "Advanced" tab |
|
| 64 |
-Click on "Environment Variables" |
|
| 65 |
-click on New: enter PATH and C:\TreeTagger\bin\;%PATH% |
|
| 66 |
|
|
| 67 |
If the files have been unpacked into a single directory, you should |
|
| 68 |
restore the following directory structure: |
|
| 69 |
|
|
| 70 |
TreeTagger: |
|
| 71 |
INSTALL.txt README.txt bin cmd lib |
|
| 72 |
|
|
| 73 |
TreeTagger/bin: |
|
| 74 |
tag-english.bat tag-german.bat tag-spanish.bat tree-tagger.exe |
|
| 75 |
tag-french.bat tag-italian.bat train-tree-tagger.exe |
|
| 76 |
|
|
| 77 |
TreeTagger/cmd: |
|
| 78 |
mwl-lookup.perl tokenize.pl |
|
| 79 |
|
|
| 80 |
TreeTagger/lib: |
|
| 81 |
english-abbreviations german-abbreviations spanish-abbreviations |
|
| 82 |
french-abbreviations italian-abbreviations spanish-mwls |
|
| 83 |
|
|
| 84 |
|
|
| 85 |
Note that the TreeTagger comes without a graphical interface. You have |
|
| 86 |
to run it by entering a command in a command line window. If you prefer |
|
| 87 |
a graphical interface, try the one provided by Ciarán Ó Duibhín at |
|
| 88 |
http://www.smo.uhi.ac.uk/~oduibhin/oideasra/interfaces/winttinterface.htm |
|
| 89 |
|
|
| 0 | 90 | |
Formats disponibles : Unified diff