Révision 1683
tmp/org.txm.treetagger.core.win32/res/win32/cmd/filter-chunker-output-french.perl (revision 1683) | ||
---|---|---|
1 |
#!/usr/bin/perl |
|
2 |
|
|
3 |
################################################################### |
|
4 |
### ### |
|
5 |
### File: filter-chunker-output.perl ### |
|
6 |
### Author: Michel Genereux ### |
|
7 |
### (indicated modifications by Dennis Spohr (DS)) ### |
|
8 |
### Purpose: Filter chunker output and create XML-like markup ### |
|
9 |
### Created: Mon Feb 19 2007 ### |
|
10 |
### ### |
|
11 |
################################################################### |
|
12 |
|
|
13 |
use Getopt::Std;
getopts('t');    # -t selects the XML output variants in the *_tag/doc_* helpers

# Unbuffered STDOUT: output is flushed after every print.
$| = 1;

print doc_start();

### DS start: end-of-sentence marker
$eos  = '[.?!;]';   # character class of tokens that can end a sentence
$push = 1;          # true while opening input markup is still collected
###

$n = 0;             # slot of the token currently being filled

# Read the chunker output line by line. Token lines look like
#   TOKEN-TAG <tab> X/CHUNK [<tab> LEMMA]
# every other line is input markup.
while (<>) {
    s/.-SBAR$/O/;
    # NOTE(review): this substitutes the digit 0, and $inside_pp is never
    # assigned anywhere in this script, so the guard is always true.
    s/I-PC$/0/ if (/I-PC$/ && !$inside_pp);

    ### DS start: process lemma column
    # The chunk field is matched with [^\t]* rather than a greedy (.*):
    # the greedy form swallowed "<tab>lemma" into the chunk label and left
    # $lemma[$n] permanently undefined.
    if (($token[$n],$tag[$n],$tag,$chunk[$n],$x,$lemma[$n]) = $_ =~ /^(.*)-(.*)\t(.*)\/([^\t]*)(\t(.*))?$/) {
        ### DS end

        ### DS start: chunking error; some SENTs have e.g. I-NP although
        ###           they mark the end of a sentence; lead to omission
        ###           of closing tags
        $push = 0;
        $chunk[$n] = 0 if ($tag eq 'SENT' && $chunk[$n] =~ /^(I|B)-/);
        ### DS end

        # Split "FLAG-CHUNK" (e.g. "B-NP") into its two parts.
        if ($chunk[$n] =~ /^(.*)-(.*)$/) {
            $flag[$n]  = $1;
            $chunk[$n] = $2;
        } else {
            undef $flag[$n];
            undef $chunk[$n];
        }

        ### DS start: performance boost: set $n to 0 after printing
        ###           sentence; otherwise $n and arrays get too big
        ###           and cause slowdown
        # NOTE(review): "== 0" is a numeric test; an undefined or
        # non-numeric chunk label also compares equal to 0.
        if ($token[$n] =~ /^$eos\s*$/ && $chunk[$n] == 0 && $tag[$n] eq 'SENT') {
            print_sentence(0);
            $n = 0;
            $start_markup = "";
        } else {
            $n++;
        }
        ### DS end

    ### DS start: keep markup already present in input data and insert
    ###           chunker markup correctly; if an element starting before
    ###           the sentence is closed before the sentence is closed
    ###           (e.g. headlines without sentence end markers), then
    ###           the sentence should also be closed, e.g. avoid cases like
    ###           <HEADLINE><s>Les résultats de jeudi</HEADLINE></s>
    } elsif (/^<([^\/]*?)(( |~).*)?>/ && $push) {
        # Opening tag seen before any token of the sentence: remember it.
        push(@tag_stack,$1);
        $start_markup .= "$&\n";
    } elsif (/^<\/(.*?)>/ && $1 eq $tag_stack[$#tag_stack]) {
        # Closing tag of the innermost remembered element: flush the sentence.
        $end_markup = "$&\n";
        print_sentence(1);
        $n = 0;
        $push = 1;
        $start_markup = "";
        $end_markup = "";
        pop(@tag_stack);
    ### DS end

    } else {
        # Any other markup is attached in front of the next token.
        $markup[$n] .= $_;
    }
}

print_sentence(1);
print doc_end();
|
88 |
|
|
89 |
|
|
90 |
# Print the buffered sentence (slots 0..$n of @token/@tag/@lemma/@flag/
# @chunk/@markup) with chunk tags inserted, then clear the buffers.
#
#   $forced - true when the call was triggered by closing input markup or
#             end of input; $end_markup is then printed after the sentence.
#
# Reads the globals $n, $eos, $start_markup and $end_markup. Dies when an
# E flag is found that closes neither the open chunk nor a PC.
sub print_sentence {

    ### DS start: indicate whether print_sentence is forced by
    ###           closing input markup
    my $forced = shift;
    ### DS end

    my($i,$chunk);

    # Pass 1: turn the B/I/E flags into begin tags (@cbtags) and end tags
    # (@cetags); $chunk holds the label of the currently open chunk.
    for( $i=0; $i<=$n; $i++ ) {
        # An I flag that does not continue the open chunk starts a new one.
        if ($flag[$i] eq 'I' && $chunk ne $chunk[$i]) {
            $flag[$i] = 'B';
        }
        if ($flag[$i] eq 'B') {
            if (defined $chunk) {
                $cetags[$i-1] .= end_tag($chunk);
            }
            if ($chunk[$i] eq 'PC') {
                # Find where this PC ends: skip to the next B flag, then
                # past the run of I flags that follows it.
                my $k;
                for( $k=$i+1; $k<=$n; $k++ ) {
                    last if ($flag[$k] eq 'B');
                }
                for( $k++; $k<=$n; $k++ ) {
                    last if ($flag[$k] ne 'I');
                }
                if ($k <= $n && $flag[$k] eq 'E' && $chunk[$k] eq 'PC') {
                    $markup[$k+1] .= end_tag('PC');
                    undef $flag[$k];
                    undef $chunk[$k];

                ### DS start: $k may be greater than $n; add closing PC tag
                ###           to $markup[$n]; otherwise closing tags are
                ###           omitted
                } elsif ($k > $n && ($forced || $token[$n] =~ /^$eos\s*$/)) {
                    $markup[$n] .= end_tag('PC');
                ### DS end

                } else {
                    $markup[$k] .= end_tag('PC');
                }
                undef $chunk;
            }
            else {
                $chunk = $chunk[$i];
            }
            $cbtags[$i] .= start_tag($chunk[$i]);
        }
        elsif ($flag[$i] eq 'E') {
            if ($chunk[$i] eq $chunk) {
                $cetags[$i] .= end_tag($chunk);
                undef $chunk;
            }
            elsif ($chunk[$i] eq 'PC') {
                $cetags[$i-1] .= end_tag($chunk) if defined $chunk;
                $cetags[$i] .= end_tag("PC");
                # Open the PC in front of the nearest preceding B flag.
                my $k;
                for( $k=$i; $k>=0; $k-- ) {
                    if ($flag[$k] eq 'B') {
                        $cbtags[$k] = start_tag("PC").$cbtags[$k];
                        last;
                    }
                }
                undef $chunk;
            }
            else {
                # Previously a bare "die" with no diagnostic at all.
                die "print_sentence: unexpected E-$chunk[$i] flag at token $i ('$token[$i]')";
            }
        }
        elsif ($flag[$i] ne 'I' && defined $chunk) {
            # Token outside any chunk: close the chunk that was open.
            $cetags[$i-1] .= end_tag($chunk);
            undef $chunk;
        }
    }

    $printed = 0;# start_tag("s");

    ### DS start: print opening tags of input markup before sentence
    print $start_markup;
    print start_tag("s") if $n > 0;
    ### DS end

    # Pass 2: emit markup, begin tags, token line and end tags per slot.
    for( $i=0; $i<=$n; $i++ ) {
        print $markup[$i];
        print $cbtags[$i];

        ### DS start: slightly renamed sub and added lemma parameter
        print token_and_tag_and_lemma($token[$i],$tag[$i],$lemma[$i]) if defined $token[$i];
        ### DS end

        print $cetags[$i];
    }

    ### DS start: print closing "s" tag and closing input markup if
    ###           print_sentence had been forced
    print end_tag("s") if $n>0;
    print $end_markup if $forced;
    ### DS end

    undef @token;
    undef @tag;
    undef @chunk;
    undef @cbtags;
    undef @cetags;
    undef @flag;
    undef @markup;
}
|
199 |
|
|
200 |
# Return the document prologue: the XML declaration plus the opening
# <corpus> tag when -t was given, otherwise the empty string.
sub doc_start {
    if (defined $opt_t) {
        return qq{<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>\n<corpus>\n};
    }
    return '';
}
|
204 |
|
|
205 |
# Return the closing </corpus> tag when -t was given, otherwise the
# empty string.
sub doc_end {
    if (defined $opt_t) {
        return "</corpus>\n";
    }
    return '';
}
|
209 |
|
|
210 |
# Build the opening tag for category $_[0]: "<CAT>" in plain mode,
# a <phrase cat="CAT"> element when -t was given.
sub start_tag {
    my ($category) = @_;
    if (defined $opt_t) {
        return qq{ <phrase cat="$category">\n};
    }
    return "<$category>\n";
}
|
215 |
|
|
216 |
# Build the closing tag for category $_[0]: "</CAT>" in plain mode,
# a fixed </phrase> when -t was given.
sub end_tag {
    my ($category) = @_;
    if (defined $opt_t) {
        return " </phrase>\n";
    }
    return "</$category>\n";
}
|
221 |
|
|
222 |
### DS start: also process and output lemma parameter |
|
223 |
# Format one output line for a token: tab-separated "token tag lemma"
# in plain mode, a <token .../> element when -t was given.
sub token_and_tag_and_lemma {
    my ($word, $pos, $base) = @_;
    if (defined $opt_t) {
        return qq{ <token word="$word" lemma="$base" pos="$pos"/>\n};
    }
    return join("\t", $word, $pos, $base) . "\n";
}
|
228 |
### DS end |
|
229 |
|
tmp/org.txm.treetagger.core.win32/res/win32/cmd/mwl-lookup.perl (revision 1683) | ||
---|---|---|
1 |
#!/usr/local/bin/perl |
|
2 |
|
|
3 |
use Getopt::Std;

# -d and -f take a value, -h is a plain flag. The former getopt('dhf:')
# treated every listed letter as value-taking, so "-h" consumed the next
# argument instead of requesting help.
getopts('d:hf:');

# This perl script recognizes multi word units in the input stream
# and puts them on one line. Input must have one-word-per-line format.
# The multi word units are listed in the parameter file with POS tags.
# Each line contains one multi word unit where the individual words
# are separated by blanks followed by a tab character and the blank-
# separated list of POS tags.
# Author: Helmut Schmid, IMS, Uni Stuttgart

if (!defined($opt_f) || defined($opt_h)) {
    $0 =~ s/.*\///;
    # print, not printf: the script name must not be used as a format.
    print "\nUsage: $0 [-d del] -f mwl-file ...files...\n";
    print "\nOptions:\n";
    print "-d del : Use del as delimiter rather than a blank\n\n";
    die;
}

# Three-argument open with a lexical handle; report the system error too.
open(my $mwl_fh, '<', $opt_f)
    or die "\nCan't open mwl file: $opt_f: $!\n";

# Delimiter written between the words of a recognized unit.
$del = defined($opt_d) ? $opt_d : " ";

# Build a trie over the words of every unit: %arc maps (state, word) to
# the next state, %final maps an accepting state to the unit's tag list.
$N=1;
while (<$mwl_fh>) {
    chomp();
    next if /^$/;
    @G = split("\t");
    @F = split(/\s+/,$G[0]);
    $state = 0;
    for($i=0; $i<=$#F; $i++) {
        if (!exists($arc{$state,$F[$i]})) {
            $arc{$state,$F[$i]} = $N++;
        }
        $state = $arc{$state,$F[$i]};
    }
    $final{$state} = $G[1];
}
close($mwl_fh);
|
47 |
|
|
48 |
|
|
49 |
# Buffer state: @token holds $last pending words, the first $match of
# them have been followed through the trie, and $last_match is the index
# of the last buffered word at which an accepting state was reached.
$last = $match = $last_match = 0;
$state = 0;

for (;;) {
    if ($match == $last) {
        # All buffered words are consumed: read one more line. The test
        # uses defined() so that a final line "0" does not end the input.
        if (!defined($token[$last] = <>)) {
            if ($last_match > 0) {
                print $token[0];
                for ($i=1; $i<=$last_match; $i++) {
                    print $del,$token[$i];
                }
                # Emit the tag list as the in-stream branch below does;
                # it used to be dropped for a unit pending at end of input.
                print "\t$last_tag\n";
            } else {
                $i=0;
            }
            # Flush whatever is buffered after the unit, one word per line.
            for (; $i<$last; $i++) {
                print $token[$i],"\n";
            }
            last;
        }
        chomp($token[$last++]);
    }
    # Follow the arc for the word as written, lower-cased, or capitalized.
    if (($s = $arc{$state, $token[$match]}) ||
        ($s = $arc{$state, lc($token[$match])}) ||
        ($s = $arc{$state, ucfirst(lc($token[$match]))})) {
        if (exists($final{$s})) {
            $last_match = $match;
            $last_tag = $final{$s};
        }
        $state = $s;
        $match++;
    } else {
        # No arc: output the longest unit found so far (or just the first
        # word), shift the remaining words to the front and restart.
        if ($last_match > 0) {
            print $token[0];
            for($i=1; $i<=$last_match; $i++) {
                print $del,$token[$i];
            }
            print "\t$last_tag\n";
        } else {
            print $token[0],"\n";
        }
        for($i=0,$k=$last_match+1; $k<$last; ) {
            $token[$i++] = $token[$k++];
        }
        $last = $last - $last_match - 1;
        $last_match = $match = 0;
        $state = 0;
    }
}
tmp/org.txm.treetagger.core.win32/res/win32/cmd/filter-chunker-output-german.perl (revision 1683) | ||
---|---|---|
1 |
#!/usr/bin/perl |
|
2 |
|
|
3 |
use Getopt::Std; |
|
4 |
getopts('t'); |
|
5 |
|
|
6 |
print doc_start();

# Collect token lines ("TOKEN-TAG <tab> X/CHUNK") into the sentence
# buffers; every other line is kept as markup in front of the next token.
$n = 0;
while (<>) {
    s/.-SBAR$/O/;

    unless (/^(.*)-(.*)\t(.*)\/(.*)$/) {
        $markup[$n] .= $_;
        next;
    }

    ($token[$n], $tag[$n], $chunk[$n]) = ($1, $2, $4);

    # Split "FLAG-CHUNK"; anything else means the token is outside a chunk.
    ($flag[$n], $chunk[$n]) =
        $chunk[$n] =~ /^(.*)-(.*)$/ ? ($1, $2) : (undef, undef);

    # A full stop flushes the sentence (print_sentence resets $n).
    print_sentence() if $token[$n] eq '.';
    $n++;
}

print_sentence();
print doc_end();
|
35 |
|
|
36 |
|
|
37 |
# Print the buffered sentence (slots 0..$n) with chunk tags inserted,
# then clear the buffers and reset $n to 0.
sub print_sentence {
    my($i,$chunk);

    # Pass 1: derive begin tags (@cbtags) and end tags (@cetags) from the
    # B/I/E flags; $chunk is the label of the chunk currently open.
    for( $i=0; $i<=$n; $i++ ) {

        # An I flag that does not continue the open chunk starts a new one.
        $flag[$i] = 'B' if $flag[$i] eq 'I' && $chunk ne $chunk[$i];

        if ($flag[$i] eq 'B') {
            $cetags[$i-1] = end_tag($chunk) if defined $chunk;
            $chunk = $chunk[$i];
            $cbtags[$i] .= start_tag($chunk[$i]);
        }

        # German chunker uses E-flags for PCs
        elsif ($flag[$i] eq 'E') {
            if ($chunk[$i] eq $chunk) {
                $cetags[$i] = end_tag($chunk);
                undef $chunk;
            }
            elsif ($chunk[$i] eq "PC" && $chunk eq "NC") {
                # Relabel the open NC as PC, walking back to its first token.
                for( $k=$i-1; $k>=0; $k-- ) {
                    $chunk[$k] = "PC" if $chunk[$k] eq "NC";
                    last if $flag[$k] ne "I";
                }
                $cbtags[$k] = start_tag($chunk[$i]);
                $cetags[$i] = end_tag($chunk[$i]);
                undef $chunk;
                undef $inPC;
            }
        }

        elsif ($flag[$i] ne 'I' && defined $chunk) {
            # Token outside any chunk: close the chunk that was open.
            $cetags[$i-1] = end_tag($chunk);
            undef $chunk;
        }
    }

    # Pass 2: emit markup, begin tag, token line and end tag per slot.
    foreach my $pos (0 .. $n) {
        print $markup[$pos], $cbtags[$pos];
        print token_and_tag($token[$pos],$tag[$pos]) if defined $token[$pos];
        print $cetags[$pos];
    }

    undef @$_ for (\@token, \@tag, \@chunk, \@cbtags, \@cetags, \@flag, \@markup);
    $n = 0;
}
|
98 |
|
|
99 |
# Document prologue: XML declaration and <corpus> with -t, else nothing.
sub doc_start {
    return defined $opt_t
        ? qq{<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>\n<corpus>\n}
        : '';
}
|
103 |
|
|
104 |
# Document epilogue: </corpus> with -t, else nothing.
sub doc_end {
    return defined $opt_t ? "</corpus>\n" : '';
}
|
108 |
|
|
109 |
# Opening tag for a chunk category: "<CAT>" in plain mode, a
# <phrase cat="CAT"> element with -t.
sub start_tag {
    my ($category) = @_;
    return defined $opt_t
        ? qq{ <phrase cat="$category">\n}
        : "<$category>\n";
}
|
114 |
|
|
115 |
# Closing tag for a chunk category: "</CAT>" in plain mode, a fixed
# </phrase> with -t.
sub end_tag {
    my ($category) = @_;
    return defined $opt_t ? " </phrase>\n" : "</$category>\n";
}
|
120 |
|
|
121 |
# Output line for one token: "token<tab>tag" in plain mode, a
# <token .../> element with -t.
sub token_and_tag {
    my ($word, $pos) = @_;
    return defined $opt_t
        ? qq{ <token word="$word" pos="$pos"/>\n}
        : join("\t", $word, $pos) . "\n";
}
tmp/org.txm.treetagger.core.win32/res/win32/cmd/utf8-tokenize.perl (revision 1683) | ||
---|---|---|
1 |
#!/usr/bin/perl |
|
2 |
|
|
3 |
######################################################################## |
|
4 |
# # |
|
5 |
# tokenization script for tagger preprocessing # |
|
6 |
# Author: Helmut Schmid, IMS, University of Stuttgart # |
|
7 |
# Serge Sharoff, University of Leeds # |
|
8 |
# Description: # |
|
9 |
# - splits input text into tokens (one token per line) # |
|
10 |
# - cuts off punctuation, parentheses etc. # |
|
11 |
# - disambiguates periods # |
|
12 |
# - preserves SGML markup # |
|
13 |
# # |
|
14 |
######################################################################## |
|
15 |
|
|
16 |
use Getopt::Std; |
|
17 |
use utf8; |
|
18 |
use Encode; |
|
19 |
|
|
20 |
getopts('hgfeiza:'); |
|
21 |
|
|
22 |
# Modify the following lines in order to adapt the tokenizer to other |
|
23 |
# types of text and/or languages |
|
24 |
|
|
25 |
# characters which have to be cut off at the beginning of a word |
|
26 |
my $PChar='[¿¡{(\\`"‚„†‡‹‘’“”•–—›'."'"; |
|
27 |
|
|
28 |
# characters which have to be cut off at the end of a word |
|
29 |
my $FChar=']}\'\`\"),;:\!\?\%‚„…†‡‰‹‘’“”•–—›'; |
|
30 |
|
|
31 |
# character sequences which have to be cut off at the beginning of a word |
|
32 |
my $PClitic=''; |
|
33 |
|
|
34 |
# character sequences which have to be cut off at the end of a word |
|
35 |
my $FClitic; |
|
36 |
|
|
37 |
# Language-specific clitic patterns. $PClitic is split off the front of
# a word, $FClitic off its end; both are used as regex alternations.
if (defined($opt_e)) {
    # English
    $FClitic = '\'(s|re|ve|d|m|em|ll)|n\'t';
}
if (defined($opt_i)) {
    # Italian
    $PClitic = '[dD][ae]ll\'|[nN]ell\'|[Aa]ll\'|[lLDd]\'|[Ss]ull\'|[Qq]uest\'|[Uu]n\'|[Ss]enz\'|[Tt]utt\'';
}
if (defined($opt_f)) {
    # French
    $PClitic = '[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\'';
    # "-mêmes?" and "-là" restore accented letters that had been lost,
    # leaving the non-words "-mmes?" and "-l" in the list.
    $FClitic = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-là';
}
if (defined($opt_z)) {
    # Galician
    $FClitic = '-la|-las|-lo|-los|-nos';
}
|
54 |
|
|
55 |
|
|
56 |
### NO MODIFICATIONS REQUIRED BEYOND THIS LINE ######################### |
|
57 |
|
|
58 |
# -h: print the usage text and stop. The text now also lists -z and -g,
# which getopts accepts and the script acts on.
if (defined($opt_h)) {
    die "
Usage: utf8-tokenize.perl [ options ] ...files...

Options:
-e : English text
-f : French text
-i : Italian text
-z : Galician text
-g : keep a number followed by a period (e.g. 3.) as a single token
-a <file>: <file> contains a list of words which are either abbreviations or
words which should not be further split.
";
}
|
70 |
|
|
71 |
# Read the list of abbreviations and words |
|
72 |
# Load the -a word list into %Token: one entry per line, surrounding
# whitespace stripped, lines starting with '#' ignored.
if (defined($opt_a)) {
    # Three-argument open with a lexical handle, so the file name can
    # never be interpreted as an open mode or a pipe.
    open(my $abbrev_fh, '<', $opt_a) or die "Can't read: $opt_a: $!\n";
    while (<$abbrev_fh>) {
        $_ = decode('utf8',$_);
        s/^[ \t\r\n]+//;
        s/[ \t\r\n]+$//;
        # /^\s*$/ rather than /^\s$/: a stripped blank line is empty and
        # was previously stored as the token "".
        next if (/^\#/ || /^\s*$/);   # ignore comments and blank lines
        $Token{$_} = 1;
    }
    close $abbrev_fh;
}
|
83 |
|
|
84 |
#SS: main loop; |
|
85 |
# Main loop (SS): tokenize the UTF-8 input line by line and print one
# token or SGML tag per line, encoded back to UTF-8.
my $first_line = 1;
while (<>) {
    $_ = decode('utf8',$_);

    # delete optional byte order markers (BOM) on the very first line
    if ($first_line) {
        s/^\x{FEFF}//;
        undef $first_line;
    }

    # replace newlines and tab characters with blanks
    tr/\n\t/ /;

    # protect blanks within SGML tags
    1 while s/(<[^<> ]*) ([^<>]*>)/$1\377$2/g;

    #Separ: ÿþ

    # replace whitespace with a special character
    tr/ /\376/;

    # restore the blanks inside SGML tags; \377 now separates segments
    tr/\377\376/ \377/;

    # isolate SGML tags as segments of their own
    s/(<[^<>]*>)/\377$1\377/g;
    s/^\377//;
    s/\377$//;
    s/\377\377\377*/\377/g;

    @S = split("\377");
    foreach my $segment (@S) {
        $_ = $segment;

        if (/^<.*>$/) {
            # SGML tag: passed through unchanged
            print encode('utf8',"$_\n");
            next;
        }

        # add a blank at the beginning and the end of each segment
        $_ = ' '.$_.' ';
        # insert missing blanks after punctuation
        s/(\.\.\.)/ ... /g;
        s/([;\!\?])([^ ])/$1 $2/g;
        s/([.,:])([^ 0-9.])/$1 $2/g;

        @F = split;
        foreach my $word (@F) {
            my $suffix="";   # tokens cut off the end, printed after the word
            $_ = $word;

            # separate punctuation and parentheses from words
            do {
                $finished = 1;
                # cut off preceding punctuation
                if (s/^([$PChar])(.)/$2/) {
                    print encode('utf8',"$1\n");
                    $finished = 0;
                }
                # cut off trailing punctuation
                if (s/(.)([$FChar])$/$1/) {
                    $suffix = "$2\n$suffix";
                    $finished = 0;
                }
                # cut off trailing periods if punctuation precedes
                if (s/([$FChar])\.$//) {
                    $suffix = ".\n$suffix";
                    if ($_ eq "") {
                        $_ = $1;
                    } else {
                        $suffix = "$1\n$suffix";
                    }
                    $finished = 0;
                }
            } until ($finished);

            # explicitly listed tokens, and abbreviations of the form
            # A. or U.S.A., are printed without further splitting
            if (defined($Token{$_}) || /^([A-Za-z-]\.)+$/) {
                print encode('utf8',"$_\n$suffix");
                next;
            }

            # disambiguate periods
            if (/^(..*)\.$/ && $_ ne "..." && !($opt_g && /^[0-9]+\.$/)) {
                $_ = $1;
                $suffix = ".\n$suffix";
                if (defined($Token{$_})) {
                    print encode('utf8',"$_\n$suffix");
                    next;
                }
            }

            # cut off leading "--" and leading clitics
            while (s/^(--)(.)/$2/) {
                print encode('utf8',"$1\n");
            }
            if ($PClitic ne '') {
                while (s/^($PClitic)(.)/$2/) {
                    print encode('utf8',"$1\n");
                }
            }

            # cut off trailing "--" and trailing clitics
            while (s/(.)(--)$/$1/) {
                $suffix = "$2\n$suffix";
            }
            if ($FClitic ne '') {
                while (s/(.)($FClitic)$/$1/) {
                    $suffix = "$2\n$suffix";
                }
            }

            print encode('utf8',"$_\n$suffix");
        }
    }
}
|
0 | 206 |
tmp/org.txm.treetagger.core.win32/res/win32/cmd/filter-chunker-output.perl (revision 1683) | ||
---|---|---|
1 |
#!/usr/bin/perl |
|
2 |
|
|
3 |
use Getopt::Std; |
|
4 |
getopts('t'); |
|
5 |
|
|
6 |
print doc_start();

# Fill the sentence buffers from token lines ("TOKEN-TAG <tab> X/CHUNK");
# any other line is markup that is kept in front of the next token.
$n = 0;
while (<>) {
    s/.-SBAR$/O/;

    if (!/^(.*)-(.*)\t(.*)\/(.*)$/) {
        $markup[$n] .= $_;
        next;
    }

    ($token[$n], $tag[$n], $chunk[$n]) = ($1, $2, $4);

    # "FLAG-CHUNK" is split into its parts; other values clear both.
    my ($bie, $label) = $chunk[$n] =~ /^(.*)-(.*)$/ ? ($1, $2) : (undef, undef);
    $flag[$n]  = $bie;
    $chunk[$n] = $label;

    # A full stop flushes the sentence (print_sentence resets $n).
    print_sentence() if $token[$n] eq '.';
    $n++;
}

print_sentence();
print doc_end();
|
35 |
|
|
36 |
|
|
37 |
# Print the buffered sentence (slots 0..$n) with chunk tags inserted,
# then clear the buffers and reset $n to 0. An NC directly after a PC
# (or an NP after a PP) is nested inside it; $inPC remembers the outer
# label until both are closed.
sub print_sentence {
    my($i,$chunk);

    for( $i=0; $i<=$n; $i++ ) {

        # An I flag that does not continue the open chunk starts a new one.
        $flag[$i] = 'B' if $flag[$i] eq 'I' && $chunk ne $chunk[$i];

        # A full stop never belongs to a chunk.
        if ($flag[$i] ne '' && $token[$i] eq '.') {
            delete $flag[$i];
            $chunk[$i] = '0';
        }

        if ($flag[$i] eq 'B') {
            if (defined $chunk) {
                my $nested = ($chunk eq 'PC' && $chunk[$i] eq 'NC')
                          || ($chunk eq 'PP' && $chunk[$i] eq 'NP');
                if ($nested) {
                    # Keep the outer chunk open around the new one.
                    $inPC = $chunk;
                }
                else {
                    $cetags[$i-1] = end_tag($chunk);
                    if (defined $inPC) {
                        $cetags[$i-1] .= end_tag($inPC);
                        undef $inPC;
                    }
                }
            }
            $chunk = $chunk[$i];
            $cbtags[$i] .= start_tag($chunk[$i]);
        }

        elsif ($flag[$i] ne 'I' && defined $chunk) {
            # Token outside any chunk: close the open chunk and, if
            # present, the outer chunk it was nested in.
            $cetags[$i-1] = end_tag($chunk);
            undef $chunk;
            if (defined $inPC) {
                $cetags[$i-1] .= end_tag($inPC);
                undef $inPC;
            }
        }
    }

    # Emit markup, begin tag, token line and end tag for every slot.
    foreach my $pos (0 .. $n) {
        print $markup[$pos], $cbtags[$pos];
        print token_and_tag($token[$pos],$tag[$pos]) if defined $token[$pos];
        print $cetags[$pos];
    }

    undef @$_ for (\@token, \@tag, \@chunk, \@cbtags, \@cetags, \@flag, \@markup);
    $n = 0;
}
|
96 |
|
|
97 |
# Prologue printed before the first sentence: empty in plain mode, the
# XML declaration plus <corpus> when -t was given.
sub doc_start {
    defined $opt_t or return '';
    return qq{<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>\n<corpus>\n};
}
|
101 |
|
|
102 |
# Epilogue printed after the last sentence: empty in plain mode,
# </corpus> when -t was given.
sub doc_end {
    defined $opt_t or return '';
    return "</corpus>\n";
}
|
106 |
|
|
107 |
# Opening tag for chunk category $_[0]; the -t form is a <phrase>
# element carrying the category as its "cat" attribute.
sub start_tag {
    my ($category) = @_;
    defined $opt_t or return "<$category>\n";
    return qq{ <phrase cat="$category">\n};
}
|
112 |
|
|
113 |
# Closing tag for chunk category $_[0]; the -t form is always </phrase>.
sub end_tag {
    my ($category) = @_;
    defined $opt_t or return "</$category>\n";
    return " </phrase>\n";
}
|
118 |
|
|
119 |
# Output line for one token: "token<tab>tag" in plain mode, a
# <token word=... pos=.../> element when -t was given.
sub token_and_tag {
    my ($word, $pos) = @_;
    defined $opt_t or return "$word\t$pos\n";
    return qq{ <token word="$word" pos="$pos"/>\n};
}
tmp/org.txm.treetagger.core.win32/res/win32/cmd/mwl-lookup-greek.perl (revision 1683) | ||
---|---|---|
1 |
#!/usr/bin/perl |
|
2 |
|
|
3 |
$month{"??????????"} = 1; |
|
4 |
$month{"???????????"} = 1; |
|
5 |
$month{"???????"} = 1; |
|
6 |
$month{"????????"} = 1; |
|
7 |
$month{"?????"} = 1; |
|
8 |
$month{"?????"} = 1; |
|
9 |
$month{"?????"} = 1; |
|
10 |
$month{"???????"} = 1; |
|
11 |
$month{"???????"} = 1; |
|
12 |
$month{"?????????"} = 1; |
|
13 |
$month{"???????????"} = 1; |
|
14 |
$month{"?????????"} = 1; |
|
15 |
$month{"?????????"} = 1; |
|
16 |
$month{"??????????"} = 1; |
|
17 |
|
|
18 |
$month{"?????????"} = 1; |
|
19 |
$month{"??????????"} = 1; |
|
20 |
$month{"??????"} = 1; |
|
21 |
$month{"???????"} = 1; |
|
22 |
$month{"????"} = 1; |
|
23 |
$month{"??????"} = 1; |
|
24 |
$month{"??????"} = 1; |
|
25 |
$month{"????????"} = 1; |
|
26 |
$month{"????????"} = 1; |
|
27 |
$month{"??????????"} = 1; |
|
28 |
$month{"????????"} = 1; |
|
29 |
$month{"????????"} = 1; |
|
30 |
$month{"?????????"} = 1; |
|
31 |
|
|
32 |
# Read one token per line; an empty line ends the sentence, which is
# then printed. Whatever is still buffered at end of input is printed too.
while (defined(my $line = <>)) {
    chomp $line;
    if ($line eq '') {
        print_sentence();
        next;
    }
    push @token, $line;
}
print_sentence();
|
42 |
|
|
43 |
# Print the tokens buffered in @token, one per line, joining the tokens
# of a date expression around a month name (and two fixed token pairs)
# with blanks onto a single line. Clears @token and @join afterwards.
sub print_sentence {
    my ($i, $k, $start, $end);

    # Pass 1: mark in @join every token that must be followed by a blank
    # instead of a newline.
    for( $i=0; $i<=$#token; $i++ ) {
        if (exists $month{$token[$i]}) {
            $start = $end = $i;
            # Look left only while a left neighbour exists. Without the
            # "$start > 0" guard, index -1 read the LAST token of the
            # sentence and the $join[-1] assignment below was fatal on
            # the still-empty array.
            if ($start > 0 &&
                $token[$start-1] =~ /^[1-9][0-9]?([???]??)?(-[1-9][0-9]?([???]??)?)?$/){
                $start--;
            }
            if ($start > 0 && $token[$start-1] eq '??????') {
                $start--;
            }
            # Look right: reading past the end just yields undef.
            if ($token[$end+1] eq '???') {
                $end++;
            }
            if ($token[$end+1] =~ /^(1[0-9][0-9][0-9]|20[0-9][0-9]|'[0-9][0-9])$/) {
                $end++;
            }
            for( $k=$start; $k<$end; $k++) {
                $join[$k] = 1;
            }
        }
        elsif (($token[$i] eq "??'" && $token[$i+1] eq '???') ||
               ($token[$i] eq '??' &&
                ($token[$i+1] eq '????' || $token[$i+1] eq '????')))
        {
            $join[$i] = 1;
        }

    }

    # Pass 2: output.
    for( $i=0; $i<=$#token; $i++ ) {
        if ($join[$i] == 1) {
            print "$token[$i] "
        }
        else {
            print "$token[$i]\n"
        }
    }
    undef @token;
    undef @join;
}
tmp/org.txm.treetagger.core.win32/res/win32/cmd/tokenize.pl (revision 1683) | ||
---|---|---|
1 |
#!/usr/bin/perl |
|
2 |
|
|
3 |
######################################################################## |
|
4 |
# # |
|
5 |
# tokenization script for tagger preprocessing # |
|
6 |
# Author: Helmut Schmid, IMS, University of Stuttgart # |
|
7 |
# Serge Sharoff, University of Leeds # |
|
8 |
# Description: # |
|
9 |
# - splits input text into tokens (one token per line) # |
|
10 |
# - cuts off punctuation, parentheses etc. # |
|
11 |
# - disambiguates periods # |
|
12 |
# - preserves SGML markup # |
|
13 |
# # |
|
14 |
######################################################################## |
|
15 |
|
|
16 |
use Getopt::Std; |
|
17 |
|
|
18 |
getopts('hfeia:u'); |
|
19 |
|
|
20 |
use utf8; |
|
21 |
if (defined $opt_u) { |
|
22 |
use open ':utf8'; |
|
23 |
binmode(STDIN,":utf8"); |
|
24 |
binmode(STDOUT,":utf8"); |
|
25 |
} |
|
26 |
|
|
27 |
# Modify the following lines in order to adapt the tokenizer to other |
|
28 |
# types of text and/or languages |
|
29 |
|
|
30 |
# characters which have to be cut off at the beginning of a word |
|
31 |
my $PChar='[¿¡{(\\`"‚„†‡‹‘’“”•–—›'; |
|
32 |
|
|
33 |
# characters which have to be cut off at the end of a word |
|
34 |
my $FChar=']}\'\`\"),;:\!\?\%‚„…†‡‰‹‘’“”•–—›'; |
|
35 |
|
|
36 |
# character sequences which have to be cut off at the beginning of a word |
|
37 |
my $PClitic=''; |
|
38 |
|
|
39 |
# character sequences which have to be cut off at the end of a word |
|
40 |
my $FClitic; |
|
41 |
|
|
42 |
# Language-specific clitic patterns. $PClitic is split off the front of
# a word, $FClitic off its end; both are used as regex alternations.
if (defined($opt_e)) {
    # English
    $FClitic = '\'(s|re|ve|d|m|em|ll)|n\'t';
}
if (defined($opt_i)) {
    # Italian
    $PClitic = '[dD][ae]ll\'|[nN]ell\'|[Aa]ll\'|[lLDd]\'|[Ss]ull\'|[Qq]uest\'|[Uu]n\'|[Ss]enz\'|[Tt]utt\'';
}
if (defined($opt_f)) {
    # French
    $PClitic = '[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\'';
    # "-mêmes?" and "-là" restore accented letters that had been lost,
    # leaving the non-words "-mmes?" and "-l" in the list.
    $FClitic = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-là';
}
|
55 |
|
|
56 |
|
|
57 |
### NO MODIFICATIONS REQUIRED BEYOND THIS LINE ######################### |
|
58 |
|
|
59 |
# -h: print the usage text on STDERR and stop.
if (defined($opt_h)) {
    die <<'END_USAGE';

Usage: tokenize.perl [ options ] ...files...

Options:
-u : use UTF8 encoding
-e : English text
-f : French text
-i : Italian text
-a <file>: <file> contains a list of words which are either abbreviations or
words which should not be further split.
END_USAGE
}
|
72 |
|
|
73 |
# Read the list of abbreviations and words |
|
74 |
# Load the -a word list into %Token: one entry per line, surrounding
# whitespace stripped, lines starting with '#' ignored.
if (defined($opt_a)) {
    # Three-argument open with a lexical handle, so the file name can
    # never be interpreted as an open mode or a pipe.
    open(my $abbrev_fh, '<', $opt_a) or die "Can't read: $opt_a: $!\n";
    while (<$abbrev_fh>) {
        s/^[ \t\r\n]+//;
        s/[ \t\r\n]+$//;
        # /^\s*$/ rather than /^\s$/: a stripped blank line is empty and
        # was previously stored as the token "".
        next if (/^\#/ || /^\s*$/);   # ignore comments and blank lines
        $Token{$_} = 1;
    }
    close $abbrev_fh;
}
|
84 |
|
|
85 |
#SS: main loop; |
|
86 |
# Main loop (SS): tokenize the input line by line and print one token
# or SGML tag per line.
my $first_line = 1;
while (<>) {
    # delete optional byte order markers (BOM) on the very first line
    if ($first_line) {
        s/^\x{FEFF}//;
        undef $first_line;
    }

    # replace newlines and tab characters with blanks
    tr/\n\t/ /;

    # protect blanks within SGML tags
    1 while s/(<[^<> ]*) ([^<>]*>)/$1\377$2/g;

    #Separ: ÿþ

    # replace whitespace with a special character
    tr/ /\376/;

    # restore the blanks inside SGML tags; \377 now separates segments
    tr/\377\376/ \377/;

    # isolate SGML tags as segments of their own
    s/(<[^<>]*>)/\377$1\377/g;
    s/^\377//;
    s/\377$//;
    s/\377\377\377*/\377/g;

    @S = split("\377");
    foreach my $segment (@S) {
        $_ = $segment;

        if (/^<.*>$/) {
            # SGML tag: passed through unchanged
            print $_,"\n";
            next;
        }

        # add a blank at the beginning and the end of each segment
        $_ = ' '.$_.' ';
        # insert missing blanks after punctuation
        s/(\.\.\.)/ ... /g;
        s/([;\!\?])([^ ])/$1 $2/g;
        s/([.,:])([^ 0-9.])/$1 $2/g;

        @F = split;
        foreach my $word (@F) {
            my $suffix="";   # tokens cut off the end, printed after the word
            $_ = $word;

            # separate punctuation and parentheses from words
            do {
                $finished = 1;
                # cut off preceding punctuation
                if (s/^([$PChar])(.)/$2/) {
                    print $1,"\n";
                    $finished = 0;
                }
                # cut off trailing punctuation
                if (s/(.)([$FChar])$/$1/) {
                    $suffix = "$2\n$suffix";
                    $finished = 0;
                }
                # cut off trailing periods if punctuation precedes
                if (s/([$FChar])\.$//) {
                    $suffix = ".\n$suffix";
                    if ($_ eq "") {
                        $_ = $1;
                    } else {
                        $suffix = "$1\n$suffix";
                    }
                    $finished = 0;
                }
            } until ($finished);

            # explicitly listed tokens, and abbreviations of the form
            # A. or U.S.A., are printed without further splitting
            if (defined($Token{$_}) || /^([A-Za-z-]\.)+$/) {
                print "$_\n$suffix";
                next;
            }

            # disambiguate periods (a number followed by a period is kept)
            if (/^(..*)\.$/ && $_ ne "..." && !/^[0-9]+\.$/) {
                $_ = $1;
                $suffix = ".\n$suffix";
                if (defined($Token{$_})) {
                    print "$_\n$suffix";
                    next;
                }
            }

            # cut off clitics
            if ($PClitic ne '') {
                while (s/^($PClitic)(.)/$2/) {
                    print $1,"\n";
                }
            }
            if ($FClitic ne '') {
                while (s/(.)($FClitic)$/$1/) {
                    $suffix = "$2\n$suffix";
                }
            }

            print "$_\n$suffix";
        }
    }
}
|
0 | 198 |
tmp/org.txm.treetagger.core.win32/res/win32/lib/english-abbreviations (revision 1683) | ||
---|---|---|
1 |
Adm. |
|
2 |
Ala. |
|
3 |
Ariz. |
|
4 |
Ark. |
|
5 |
Aug. |
|
6 |
Ave. |
|
7 |
Bancorp. |
|
8 |
Bhd. |
|
9 |
Brig. |
|
10 |
Bros. |
|
11 |
CO. |
|
12 |
CORP. |
|
13 |
COS. |
|
14 |
Ca. |
|
15 |
Calif. |
|
16 |
Canada-U.S. |
|
17 |
Canadian-U.S. |
|
18 |
Capt. |
|
19 |
Cia. |
|
20 |
Cie. |
|
21 |
Co. |
|
22 |
Col. |
|
23 |
Colo. |
|
24 |
Conn. |
|
25 |
Corp. |
|
26 |
Cos. |
|
27 |
D-Mass. |
|
28 |
Dec. |
|
29 |
Del. |
|
30 |
Dept. |
|
31 |
Dr. |
|
32 |
Drs. |
|
33 |
Etc. |
|
34 |
Feb. |
|
35 |
Fla. |
|
36 |
Ft. |
|
37 |
Ga. |
|
38 |
Gen. |
|
39 |
Gov. |
|
40 |
Hon. |
|
41 |
INC. |
|
42 |
Ill. |
|
43 |
Inc. |
|
44 |
Ind. |
|
45 |
Jan. |
|
46 |
Japan-U.S. |
|
47 |
Jr. |
|
48 |
Kan. |
|
49 |
Korean-U.S. |
|
50 |
Ky. |
|
51 |
La. |
|
52 |
Lt. |
|
53 |
Ltd. |
|
54 |
Maj. |
|
55 |
Mass. |
|
56 |
Md. |
|
57 |
Messrs. |
|
58 |
Mfg. |
|
59 |
Mich. |
|
60 |
Minn. |
|
61 |
Miss. |
|
62 |
Mo. |
|
63 |
Mr. |
|
64 |
Mrs. |
|
65 |
Ms. |
|
66 |
Neb. |
|
67 |
Nev. |
|
68 |
No. |
|
69 |
Nos. |
|
70 |
Nov. |
|
71 |
Oct. |
|
72 |
Okla. |
|
73 |
Ont. |
|
74 |
Ore. |
|
75 |
Pa. |
|
76 |
Ph. |
|
77 |
Prof. |
|
78 |
Prop. |
|
79 |
Pty. |
|
80 |
Rep. |
|
81 |
Reps. |
|
82 |
Rev. |
|
83 |
S.p.A. |
|
84 |
Sen. |
|
85 |
Sens. |
|
86 |
Sept. |
|
87 |
Sgt. |
|
88 |
Sino-U.S. |
|
89 |
Sr. |
|
90 |
St. |
|
91 |
Ste. |
|
92 |
Tenn. |
|
93 |
Tex. |
|
94 |
U.S.-U.K. |
|
95 |
U.S.-U.S.S.R. |
|
96 |
Va. |
|
97 |
Vt. |
|
98 |
W.Va. |
|
99 |
Wash. |
|
100 |
Wis. |
|
101 |
Wyo. |
|
102 |
a.k.a. |
|
103 |
a.m. |
|
104 |
anti-U.S. |
|
105 |
cap. |
|
106 |
days. |
|
107 |
etc. |
|
108 |
ft. |
|
109 |
i.e. |
|
110 |
non-U.S. |
|
111 |
p.m. |
|
112 |
president-U.S. |
|
113 |
s.r.l. |
|
114 |
v. |
|
115 |
v.B. |
|
116 |
v.w. |
|
117 |
vs. |
tmp/org.txm.treetagger.core.win32/res/win32/lib/german-abbreviations (revision 1683) | ||
---|---|---|
1 |
A. |
|
2 |
A.-G. |
|
3 |
A.G. |
|
4 |
ADN-Korr. |
|
5 |
AT-Mot. |
|
6 |
Abb. |
|
7 |
Abess. |
|
8 |
Abl. |
|
9 |
Ablief.-Gew. |
|
10 |
Abm. |
|
11 |
Abs. |
|
12 |
Abt. |
|
13 |
Abtlg. |
|
14 |
Agl. |
|
15 |
Agt. |
|
16 |
Akt.-Ges. |
|
17 |
Aktbr. |
|
18 |
Alg. |
|
19 |
Alleininh. |
|
20 |
Allg. |
|
21 |
Altwageneint. |
|
22 |
Alu-Felg. |
|
23 |
Alum. |
|
24 |
Am. |
|
25 |
Amp. |
|
26 |
Anf. |
|
27 |
Anfr. |
|
28 |
Anfrag. |
|
29 |
Ang. |
|
30 |
Angb. |
|
31 |
Angeb. |
|
32 |
Angl. |
|
33 |
Anhängerkuppl. |
|
34 |
Anl. |
|
35 |
Anleih. |
|
36 |
Ann.-Exp. |
|
37 |
Ann.-Exped. |
|
38 |
Ant. |
|
39 |
Anten. |
|
40 |
Anz. |
|
41 |
Anz.-Exp. |
|
42 |
Anz.-Verm. |
|
43 |
Anzahlg. |
|
44 |
Anzhlg. |
|
45 |
Apoth. |
|
46 |
App. |
|
47 |
Appartem. |
|
48 |
April-Lief. |
|
49 |
Argent. |
|
50 |
Atl. |
|
51 |
Aufb. |
|
52 |
Aufst. |
|
53 |
Aug. |
|
54 |
Augsb. |
|
55 |
Ausg. |
|
56 |
Ausgl. |
|
57 |
Ausk. |
|
58 |
Ausl. |
|
59 |
Ausl.-Akt. |
|
60 |
Auslandsanl. |
|
61 |
Auslandsb. |
|
62 |
Ausst. |
|
63 |
Ausstattg. |
|
64 |
Austral. |
|
65 |
Ausz. |
|
66 |
Aut. |
|
67 |
Autom. |
|
68 |
Automat. |
|
69 |
Automin. |
|
70 |
B. |
|
71 |
B.P. |
|
72 |
BGBl. |
|
73 |
Bahnhofstr. |
|
74 |
Balk. |
|
75 |
Bau-Ing. |
|
76 |
Bauges. |
|
77 |
Bauj. |
|
78 |
Bay. |
|
79 |
Bayer. |
|
80 |
Bb. |
|
81 |
Bd. |
|
82 |
Bed. |
|
83 |
Beding. |
|
84 |
Ber. |
|
85 |
Beratg. |
|
86 |
Bereif. |
|
87 |
Bergb. |
|
88 |
Bergstr. |
|
89 |
Bernh. |
|
90 |
Bes. |
|
91 |
Besichtig. |
|
92 |
Bestzust. |
|
93 |
Beteil. |
|
94 |
Beteilig. |
|
95 |
Betr. |
|
96 |
Bett. |
|
97 |
Bew. |
|
98 |
Bewerb. |
|
99 |
Bewerbg. |
|
100 |
Bez. |
|
101 |
Bgl. |
|
102 |
Bhf. |
|
103 |
Bierbr. |
|
104 |
Bildzuschr. |
|
105 |
Bilf. |
|
106 |
Bj. |
|
107 |
Bk. |
|
108 |
Bkz. |
|
109 |
Bl. |
|
110 |
Bln. |
|
111 |
Boch. |
|
112 |
Bod. |
|
113 |
Bor. |
|
114 |
Bov. |
|
115 |
Br. |
|
116 |
Brem. |
|
117 |
Brh. |
|
118 |
Brok. |
|
119 |
Brsg. |
|
120 |
Bu. |
|
121 |
Bung. |
|
122 |
Burgstr. |
|
123 |
Bw. |
|
124 |
Bwsp. |
|
125 |
Bz. |
|
126 |
Bäd. |
|
127 |
C. |
|
128 |
C.G. |
|
129 |
Cabr. |
|
130 |
Can. |
|
131 |
Cap. |
|
132 |
Cav. |
|
133 |
Cbr. |
|
134 |
Cem. |
|
135 |
Centralb. |
|
136 |
Cert. |
|
137 |
Ch. |
|
138 |
Charlottenstr. |
|
139 |
Chem. |
|
140 |
Chem.-Ing. |
|
141 |
Chevr. |
|
142 |
Chr. |
|
143 |
Christophstr. |
|
144 |
Cie. |
|
145 |
Co. |
|
146 |
Colorvergl. |
|
147 |
Commerzb. |
|
148 |
Conc. |
|
149 |
Cons. |
|
150 |
Corneliusstr. |
|
151 |
Corp. |
|
152 |
Cp. |
|
153 |
Cpt. |
|
154 |
Cz. |
|
155 |
D. |
|
156 |
DG. |
|
157 |
DM. |
|
158 |
DUB-Schulth. |
|
159 |
DW. |
|
160 |
Dahlb. |
|
161 |
Dawes-Anl. |
|
162 |
Dept. |
|
163 |
Dev. |
|
164 |
Dez. |
|
165 |
Di. |
|
166 |
Dipl. |
|
167 |
Dipl.-Ing. |
|
168 |
Dipl.-Kfm. |
|
169 |
Dir. |
|
170 |
Direktionsw. |
|
171 |
Div. |
|
172 |
Do. |
|
173 |
Doll. |
|
174 |
Don. |
|
175 |
Dorfk. |
|
176 |
Dpf. |
|
177 |
Dr. |
|
178 |
Dr.-Ing. |
|
179 |
Dreij. |
|
180 |
Drog. |
|
181 |
Dt. |
|
182 |
Du. |
|
183 |
Dyckerh. |
|
184 |
Dyn. |
|
185 |
Dän. |
|
186 |
Düsseld. |
|
187 |
E. |
|
188 |
E.h. |
|
189 |
Einf. |
|
190 |
Einh. |
|
191 |
Einr. |
|
192 |
Eint. |
|
193 |
Eintr. |
|
194 |
Einw. |
|
195 |
Einwohn. |
|
196 |
Einz. |
|
197 |
Einzelzi. |
|
198 |
Eisenb. |
|
199 |
El. |
|
200 |
Elektr. |
|
201 |
Em. |
|
202 |
Endpr. |
|
203 |
Engl. |
|
204 |
Ent. |
|
205 |
Entsch. |
|
206 |
Entw. |
|
207 |
Erdgesch. |
|
208 |
Erf. |
|
209 |
Erfahr. |
|
210 |
Erstzul. |
|
211 |
Erzgeb. |
|
212 |
Esc. |
|
213 |
Eterna. |
|
214 |
Etg.-Hs. |
|
215 |
Eur. |
|
216 |
Ew. |
|
217 |
Exp. |
|
218 |
Expl. |
|
219 |
F. |
|
220 |
FS. |
|
221 |
Fa. |
|
222 |
Fabr. |
|
223 |
Fabrikat. |
|
224 |
Fachm. |
|
225 |
Fachricht. |
|
226 |
Fahrz. |
|
227 |
Fam. |
|
228 |
Fb. |
|
229 |
Fd. |
|
230 |
Fds. |
|
231 |
Feb. |
|
232 |
Febr. |
|
233 |
Febr.-Abl. |
|
234 |
Febr.-März-Abl. |
|
235 |
Feldstr. |
|
236 |
Fensterh. |
|
237 |
Ferd. |
|
238 |
Ferdinandstr. |
|
239 |
Fernschr. |
|
240 |
Ferr. |
|
241 |
Feuervers. |
|
242 |
Ffm. |
|
243 |
Fil. |
|
244 |
Fin. |
|
245 |
Finanzier. |
|
246 |
Finanzierg. |
|
247 |
Finanzierungsmöglichk. |
|
248 |
Finnl. |
|
249 |
Ford. |
|
250 |
Fortschr. |
|
251 |
Fr. |
|
252 |
Frankf. |
|
253 |
Franz. |
|
254 |
Franziskanerstr. |
|
255 |
Freiverk. |
|
256 |
Frhr. |
|
257 |
Fried. |
|
258 |
Friedr. |
|
259 |
Friedrich-Ebert-Str. |
|
260 |
Friedrichstr. |
|
261 |
Frl. |
|
262 |
Frühst. |
|
263 |
Führersch. |
|
264 |
G. |
|
265 |
G.M.B.H. |
|
266 |
G.m.b.H. |
|
267 |
Gar. |
|
268 |
Garag. |
|
269 |
Gart. |
|
270 |
Geb. |
Formats disponibles : Unified diff