Révision 1683
| tmp/org.txm.treetagger.core.win32/res/win32/cmd/filter-chunker-output-french.perl (revision 1683) | ||
|---|---|---|
| 1 |
#!/usr/bin/perl |
|
| 2 |
|
|
| 3 |
################################################################### |
|
| 4 |
### ### |
|
| 5 |
### File: filter-chunker-output.perl ### |
|
| 6 |
### Author: Michel Genereux ### |
|
| 7 |
### (indicated modifications by Dennis Spohr (DS)) ### |
|
| 8 |
### Purpose: Filter chunker output and create XML-like markup ### |
|
| 9 |
### Created: Mon Feb 19 2007 ### |
|
| 10 |
### ### |
|
| 11 |
################################################################### |
|
| 12 |
|
|
| 13 |
use Getopt::Std; |
|
| 14 |
getopts('t');
|
|
| 15 |
|
|
| 16 |
$| = 1; |
|
| 17 |
|
|
| 18 |
print doc_start(); |
|
| 19 |
|
|
| 20 |
### DS start: end-of-sentence marker |
|
| 21 |
$eos = '[.?!;]'; |
|
| 22 |
$push = 1; |
|
| 23 |
### |
|
| 24 |
|
|
| 25 |
$n = 0; |
|
| 26 |
|
|
| 27 |
while (<>) {
|
|
| 28 |
s/.-SBAR$/O/; |
|
| 29 |
s/I-PC$/0/ if (/I-PC$/ && !$inside_pp); |
|
| 30 |
|
|
| 31 |
### DS start: process lemma column |
|
| 32 |
if (($token[$n],$tag[$n],$tag,$chunk[$n],$x,$lemma[$n]) = $_ =~ /^(.*)-(.*)\t(.*)\/(.*)(\t(.*))?$/) {
|
|
| 33 |
### DS end |
|
| 34 |
|
|
| 35 |
### DS start: chunking error; some SENTs have e.g. I-NP although |
|
| 36 |
### they mark the end of a sentence; lead to omission |
|
| 37 |
### of closing tags |
|
| 38 |
$push = 0; |
|
| 39 |
$chunk[$n] = 0 if ($tag eq 'SENT' && $chunk[$n] =~ /^(I|B)-/); |
|
| 40 |
### DS end |
|
| 41 |
|
|
| 42 |
if ($chunk[$n] =~ /^(.*)-(.*)$/) {
|
|
| 43 |
$flag[$n] = $1; |
|
| 44 |
$chunk[$n] = $2; |
|
| 45 |
} else {
|
|
| 46 |
undef $flag[$n]; |
|
| 47 |
undef $chunk[$n]; |
|
| 48 |
} |
|
| 49 |
|
|
| 50 |
### DS start: performance boost: set $n to 0 after printing |
|
| 51 |
### sentence; otherwise $n and arrays get too big |
|
| 52 |
### and cause slowdown |
|
| 53 |
if ($token[$n] =~ /^$eos\s*$/ && $chunk[$n] == 0 && $tag[$n] eq 'SENT') {
|
|
| 54 |
print_sentence(0); |
|
| 55 |
$n = 0; |
|
| 56 |
$start_markup = ""; |
|
| 57 |
} else {
|
|
| 58 |
$n++; |
|
| 59 |
} |
|
| 60 |
### DS end |
|
| 61 |
|
|
| 62 |
### DS start: keep markup already present in input data and insert |
|
| 63 |
### chunker markup correctly; if an element starting before |
|
| 64 |
### the sentence is closed before the sentence is closed |
|
| 65 |
### (e.g. headlines without sentence end markers), then |
|
| 66 |
### the sentence should also be closed, e.g. avoid cases like |
|
| 67 |
### <HEADLINE><s>Les r?sultats de jeudi</HEADLINE></s> |
|
| 68 |
} elsif (/^<([^\/]*?)(( |~).*)?>/ && $push) {
|
|
| 69 |
push(@tag_stack,$1); |
|
| 70 |
$start_markup .= "$&\n"; |
|
| 71 |
} elsif (/^<\/(.*?)>/ && $1 eq $tag_stack[$#tag_stack]) {
|
|
| 72 |
$end_markup = "$&\n"; |
|
| 73 |
print_sentence(1); |
|
| 74 |
$n = 0; |
|
| 75 |
$push = 1; |
|
| 76 |
$start_markup = ""; |
|
| 77 |
$end_markup = ""; |
|
| 78 |
pop(@tag_stack); |
|
| 79 |
### DS end |
|
| 80 |
|
|
| 81 |
} else {
|
|
| 82 |
$markup[$n] .= $_; |
|
| 83 |
} |
|
| 84 |
} |
|
| 85 |
|
|
| 86 |
print_sentence(1); |
|
| 87 |
print doc_end(); |
|
| 88 |
|
|
| 89 |
|
|
| 90 |
sub print_sentence {
|
|
| 91 |
|
|
| 92 |
### DS start: indicate whether print_sentence is forced by |
|
| 93 |
### closing input markup |
|
| 94 |
my $forced = shift; |
|
| 95 |
### DS end |
|
| 96 |
|
|
| 97 |
my($i,$chunk); |
|
| 98 |
|
|
| 99 |
for( $i=0; $i<=$n; $i++ ) {
|
|
| 100 |
if ($flag[$i] eq 'I' && $chunk ne $chunk[$i]) {
|
|
| 101 |
$flag[$i] = 'B'; |
|
| 102 |
} |
|
| 103 |
if ($flag[$i] eq 'B') {
|
|
| 104 |
if (defined $chunk) {
|
|
| 105 |
$cetags[$i-1] .= end_tag($chunk); |
|
| 106 |
} |
|
| 107 |
if ($chunk[$i] eq 'PC') {
|
|
| 108 |
for( $k=$i+1; $k<=$n; $k++ ) {
|
|
| 109 |
last if ($flag[$k] eq 'B'); |
|
| 110 |
} |
|
| 111 |
for( $k++; $k<=$n; $k++ ) {
|
|
| 112 |
last if ($flag[$k] ne 'I'); |
|
| 113 |
} |
|
| 114 |
if ($k <= $n && $flag[$k] eq 'E' && $chunk[$k] eq 'PC') {
|
|
| 115 |
$markup[$k+1] .= end_tag('PC');
|
|
| 116 |
undef $flag[$k]; |
|
| 117 |
undef $chunk[$k]; |
|
| 118 |
|
|
| 119 |
### DS start: $k may be greater than $n; add closing PC tag |
|
| 120 |
### to $markup[$n]; otherwise closing tags are |
|
| 121 |
### omitted |
|
| 122 |
} elsif ($k > $n && ($forced || $token[$n] =~ /^$eos\s*$/)) {
|
|
| 123 |
$markup[$n] .= end_tag('PC');
|
|
| 124 |
### DS end |
|
| 125 |
|
|
| 126 |
} else {
|
|
| 127 |
$markup[$k] .= end_tag('PC');
|
|
| 128 |
} |
|
| 129 |
undef $chunk; |
|
| 130 |
} |
|
| 131 |
else {
|
|
| 132 |
$chunk = $chunk[$i]; |
|
| 133 |
} |
|
| 134 |
$cbtags[$i] .= start_tag($chunk[$i]); |
|
| 135 |
} |
|
| 136 |
elsif ($flag[$i] eq 'E') {
|
|
| 137 |
if ($chunk[$i] eq $chunk) {
|
|
| 138 |
$cetags[$i] .= end_tag($chunk); |
|
| 139 |
undef $chunk; |
|
| 140 |
} |
|
| 141 |
elsif ($chunk[$i] eq 'PC') {
|
|
| 142 |
$cetags[$i-1] .= end_tag($chunk) if defined $chunk; |
|
| 143 |
$cetags[$i] .= end_tag("PC");
|
|
| 144 |
my $k; |
|
| 145 |
for( $k=$i; $k>=0; $k-- ) {
|
|
| 146 |
if ($flag[$k] eq 'B') {
|
|
| 147 |
$cbtags[$k] = start_tag("PC").$cbtags[$k];
|
|
| 148 |
last; |
|
| 149 |
} |
|
| 150 |
} |
|
| 151 |
undef $chunk; |
|
| 152 |
} |
|
| 153 |
else {
|
|
| 154 |
die; |
|
| 155 |
} |
|
| 156 |
} |
|
| 157 |
elsif ($flag[$i] ne 'I' && defined $chunk) {
|
|
| 158 |
$cetags[$i-1] .= end_tag($chunk); |
|
| 159 |
undef $chunk; |
|
| 160 |
} |
|
| 161 |
} |
|
| 162 |
|
|
| 163 |
$printed = 0;# start_tag("s");
|
|
| 164 |
|
|
| 165 |
### DS start: print opening tags of input markup before sentence |
|
| 166 |
print $start_markup; |
|
| 167 |
print start_tag("s") if $n > 0;
|
|
| 168 |
### DS end |
|
| 169 |
|
|
| 170 |
for( $i=0; $i<=$n; $i++ ) {
|
|
| 171 |
print $markup[$i]; |
|
| 172 |
#unless ($printed) {
|
|
| 173 |
# print start_tag("s");
|
|
| 174 |
# $printed = 1; |
|
| 175 |
#} |
|
| 176 |
print $cbtags[$i]; |
|
| 177 |
|
|
| 178 |
### DS start: slightly renamed sub and added lemma parameter |
|
| 179 |
print token_and_tag_and_lemma($token[$i],$tag[$i],$lemma[$i]) if defined $token[$i]; |
|
| 180 |
### DS end |
|
| 181 |
|
|
| 182 |
print $cetags[$i]; |
|
| 183 |
} |
|
| 184 |
|
|
| 185 |
### DS start: print closing "s" tag and closing input markup if |
|
| 186 |
### print_sentence had been forced |
|
| 187 |
print end_tag("s") if $n>0;
|
|
| 188 |
print $end_markup if $forced; |
|
| 189 |
### DS end |
|
| 190 |
|
|
| 191 |
undef @token; |
|
| 192 |
undef @tag; |
|
| 193 |
undef @chunk; |
|
| 194 |
undef @cbtags; |
|
| 195 |
undef @cetags; |
|
| 196 |
undef @flag; |
|
| 197 |
undef @markup; |
|
| 198 |
} |
|
| 199 |
|
|
| 200 |
sub doc_start {
|
|
| 201 |
return '' unless defined $opt_t; |
|
| 202 |
return "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"yes\"?>\n<corpus>\n"; |
|
| 203 |
} |
|
| 204 |
|
|
| 205 |
sub doc_end {
|
|
| 206 |
return '' unless defined $opt_t; |
|
| 207 |
return "</corpus>\n"; |
|
| 208 |
} |
|
| 209 |
|
|
| 210 |
sub start_tag {
|
|
| 211 |
my $t=shift; |
|
| 212 |
return "<$t>\n" unless defined $opt_t; |
|
| 213 |
return " <phrase cat=\"$t\">\n"; |
|
| 214 |
} |
|
| 215 |
|
|
| 216 |
sub end_tag {
|
|
| 217 |
my $t=shift; |
|
| 218 |
return "</$t>\n" unless defined $opt_t; |
|
| 219 |
return " </phrase>\n"; |
|
| 220 |
} |
|
| 221 |
|
|
| 222 |
### DS start: also process and output lemma parameter |
|
| 223 |
sub token_and_tag_and_lemma {
|
|
| 224 |
my ($token,$tag,$lemma)=@_; |
|
| 225 |
return "$token\t$tag\t$lemma\n" unless defined $opt_t; |
|
| 226 |
return " <token word=\"$token\" lemma=\"$lemma\" pos=\"$tag\"/>\n"; |
|
| 227 |
} |
|
| 228 |
### DS end |
|
| 229 |
|
|
| tmp/org.txm.treetagger.core.win32/res/win32/cmd/mwl-lookup.perl (revision 1683) | ||
|---|---|---|
| 1 |
#!/usr/local/bin/perl |
|
| 2 |
|
|
| 3 |
use Getopt::Std; |
|
| 4 |
getopt('dhf:');
|
|
| 5 |
|
|
| 6 |
# This perl script recognizes multi word units in the input stream |
|
| 7 |
# and puts them on one line. Input must have one-word-per-line format. |
|
| 8 |
# The multi word units are listed in the parameter file with POS tags. |
|
| 9 |
# Each line contains one multi word unit where the individual words |
|
| 10 |
# are separated by blanks followed by a tab character and the blank- |
|
| 11 |
# separated list of POS tags. |
|
| 12 |
# Author: Helmut Schmid, IMS, Uni Stuttgart |
|
| 13 |
|
|
| 14 |
if (!defined($opt_f) || defined($opt_h)) {
|
|
| 15 |
$0 =~ s/.*\///; |
|
| 16 |
printf "\nUsage: $0 [-d del] -f mwl-file ...files...\n"; |
|
| 17 |
print "\nOptions:\n"; |
|
| 18 |
print "-d del : Use del as delimiter rather than a blank\n\n"; |
|
| 19 |
die |
|
| 20 |
} |
|
| 21 |
|
|
| 22 |
if (!open(FILE, $opt_f)) {
|
|
| 23 |
die "\nCan't open mwl file: ",$opt_f,"\n"; |
|
| 24 |
} |
|
| 25 |
if (defined($opt_d)) {
|
|
| 26 |
$del = $opt_d; |
|
| 27 |
} else {
|
|
| 28 |
$del = " "; |
|
| 29 |
} |
|
| 30 |
|
|
| 31 |
$N=1; |
|
| 32 |
while (<FILE>) {
|
|
| 33 |
chomp(); |
|
| 34 |
next if /^$/; |
|
| 35 |
@G = split("\t");
|
|
| 36 |
@F = split(/\s+/,$G[0]); |
|
| 37 |
$state = 0; |
|
| 38 |
for($i=0; $i<=$#F; $i++) {
|
|
| 39 |
if (!exists($arc{$state,$F[$i]})) {
|
|
| 40 |
$arc{$state,$F[$i]} = $N++;
|
|
| 41 |
} |
|
| 42 |
$state = $arc{$state,$F[$i]};
|
|
| 43 |
} |
|
| 44 |
$final{$state} = $G[1];
|
|
| 45 |
} |
|
| 46 |
close(FILE); |
|
| 47 |
|
|
| 48 |
|
|
| 49 |
$last = $match = $last_match = 0; |
|
| 50 |
$state = 0; |
|
| 51 |
|
|
| 52 |
for (;;) {
|
|
| 53 |
if ($match == $last) {
|
|
| 54 |
if (!($token[$last] = <>)) {
|
|
| 55 |
if ($last_match > 0) {
|
|
| 56 |
print $token[0]; |
|
| 57 |
for ($i=1; $i<=$last_match; $i++) {
|
|
| 58 |
print $del,$token[$i]; |
|
| 59 |
} |
|
| 60 |
print "\n"; |
|
| 61 |
} else {
|
|
| 62 |
$i=0; |
|
| 63 |
} |
|
| 64 |
for (; $i<$last; $i++) {
|
|
| 65 |
print $token[$i],"\n"; |
|
| 66 |
} |
|
| 67 |
last; |
|
| 68 |
} |
|
| 69 |
chomp($token[$last++]); |
|
| 70 |
} |
|
| 71 |
if (($s = $arc{$state, $token[$match]}) ||
|
|
| 72 |
($s = $arc{$state, lc($token[$match])}) ||
|
|
| 73 |
($s = $arc{$state, ucfirst(lc($token[$match]))})) {
|
|
| 74 |
if (exists($final{$s})) {
|
|
| 75 |
$last_match = $match; |
|
| 76 |
$last_tag = $final{$s};
|
|
| 77 |
} |
|
| 78 |
$state = $s; |
|
| 79 |
$match++; |
|
| 80 |
} else {
|
|
| 81 |
if ($last_match > 0) {
|
|
| 82 |
print $token[0]; |
|
| 83 |
for($i=1; $i<=$last_match; $i++) {
|
|
| 84 |
print $del,$token[$i]; |
|
| 85 |
} |
|
| 86 |
print "\t$last_tag\n"; |
|
| 87 |
} else {
|
|
| 88 |
print $token[0],"\n"; |
|
| 89 |
} |
|
| 90 |
for($i=0,$k=$last_match+1; $k<$last; ) {
|
|
| 91 |
$token[$i++] = $token[$k++]; |
|
| 92 |
} |
|
| 93 |
$last = $last - $last_match - 1; |
|
| 94 |
$last_match = $match = 0; |
|
| 95 |
$state = 0; |
|
| 96 |
} |
|
| 97 |
} |
|
| tmp/org.txm.treetagger.core.win32/res/win32/cmd/filter-chunker-output-german.perl (revision 1683) | ||
|---|---|---|
| 1 |
#!/usr/bin/perl |
|
| 2 |
|
|
| 3 |
use Getopt::Std; |
|
| 4 |
getopts('t');
|
|
| 5 |
|
|
| 6 |
print doc_start(); |
|
| 7 |
|
|
| 8 |
$n = 0; |
|
| 9 |
while (<>) {
|
|
| 10 |
s/.-SBAR$/O/; |
|
| 11 |
|
|
| 12 |
if (/^(.*)-(.*)\t(.*)\/(.*)$/) {
|
|
| 13 |
$token[$n] = $1; |
|
| 14 |
$tag[$n] = $2; |
|
| 15 |
$chunk[$n] = $4; |
|
| 16 |
if ($chunk[$n] =~ /^(.*)-(.*)$/) {
|
|
| 17 |
$flag[$n] = $1; |
|
| 18 |
$chunk[$n] = $2; |
|
| 19 |
} |
|
| 20 |
else {
|
|
| 21 |
undef $flag[$n]; |
|
| 22 |
undef $chunk[$n]; |
|
| 23 |
} |
|
| 24 |
print_sentence() if $token[$n] eq '.'; |
|
| 25 |
$n++; |
|
| 26 |
} |
|
| 27 |
|
|
| 28 |
else {
|
|
| 29 |
$markup[$n] .= $_; |
|
| 30 |
} |
|
| 31 |
} |
|
| 32 |
|
|
| 33 |
print_sentence(); |
|
| 34 |
print doc_end(); |
|
| 35 |
|
|
| 36 |
|
|
| 37 |
sub print_sentence {
|
|
| 38 |
my($i,$chunk); |
|
| 39 |
|
|
| 40 |
for( $i=0; $i<=$n; $i++ ) {
|
|
| 41 |
|
|
| 42 |
if ($flag[$i] eq 'I' && $chunk ne $chunk[$i]) {
|
|
| 43 |
$flag[$i] = 'B'; |
|
| 44 |
} |
|
| 45 |
|
|
| 46 |
if ($flag[$i] eq 'B') {
|
|
| 47 |
if (defined $chunk) {
|
|
| 48 |
$cetags[$i-1] = end_tag($chunk); |
|
| 49 |
} |
|
| 50 |
$chunk = $chunk[$i]; |
|
| 51 |
$cbtags[$i] .= start_tag($chunk[$i]); |
|
| 52 |
} |
|
| 53 |
|
|
| 54 |
# German chunker uses E-flags for PCs |
|
| 55 |
elsif ($flag[$i] eq 'E') {
|
|
| 56 |
if ($chunk[$i] eq $chunk) {
|
|
| 57 |
$cetags[$i] = end_tag($chunk); |
|
| 58 |
undef $chunk; |
|
| 59 |
} |
|
| 60 |
elsif ($chunk[$i] eq "PC" && $chunk eq "NC") {
|
|
| 61 |
for( $k=$i-1; $k>=0; $k-- ) {
|
|
| 62 |
if ($chunk[$k] eq "NC") {
|
|
| 63 |
$chunk[$k] = "PC"; |
|
| 64 |
} |
|
| 65 |
if ($flag[$k] ne "I") {
|
|
| 66 |
last; |
|
| 67 |
} |
|
| 68 |
} |
|
| 69 |
$cbtags[$k] = start_tag($chunk[$i]); |
|
| 70 |
$cetags[$i] = end_tag($chunk[$i]); |
|
| 71 |
undef $chunk; |
|
| 72 |
undef $inPC; |
|
| 73 |
} |
|
| 74 |
} |
|
| 75 |
|
|
| 76 |
elsif ($flag[$i] ne 'I' && defined $chunk) {
|
|
| 77 |
$cetags[$i-1] = end_tag($chunk); |
|
| 78 |
undef $chunk; |
|
| 79 |
} |
|
| 80 |
} |
|
| 81 |
|
|
| 82 |
for( $i=0; $i<=$n; $i++ ) {
|
|
| 83 |
print $markup[$i]; |
|
| 84 |
print $cbtags[$i]; |
|
| 85 |
print token_and_tag($token[$i],$tag[$i]) if defined $token[$i]; |
|
| 86 |
print $cetags[$i]; |
|
| 87 |
} |
|
| 88 |
|
|
| 89 |
undef @token; |
|
| 90 |
undef @tag; |
|
| 91 |
undef @chunk; |
|
| 92 |
undef @cbtags; |
|
| 93 |
undef @cetags; |
|
| 94 |
undef @flag; |
|
| 95 |
undef @markup; |
|
| 96 |
$n = 0; |
|
| 97 |
} |
|
| 98 |
|
|
| 99 |
sub doc_start {
|
|
| 100 |
return '' unless defined $opt_t; |
|
| 101 |
return "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"yes\"?>\n<corpus>\n"; |
|
| 102 |
} |
|
| 103 |
|
|
| 104 |
sub doc_end {
|
|
| 105 |
return '' unless defined $opt_t; |
|
| 106 |
return "</corpus>\n"; |
|
| 107 |
} |
|
| 108 |
|
|
| 109 |
sub start_tag {
|
|
| 110 |
my $t=shift; |
|
| 111 |
return "<$t>\n" unless defined $opt_t; |
|
| 112 |
return " <phrase cat=\"$t\">\n"; |
|
| 113 |
} |
|
| 114 |
|
|
| 115 |
sub end_tag {
|
|
| 116 |
my $t=shift; |
|
| 117 |
return "</$t>\n" unless defined $opt_t; |
|
| 118 |
return " </phrase>\n"; |
|
| 119 |
} |
|
| 120 |
|
|
| 121 |
sub token_and_tag {
|
|
| 122 |
my ($token,$tag)=@_; |
|
| 123 |
return "$token\t$tag\n" unless defined $opt_t; |
|
| 124 |
return " <token word=\"$token\" pos=\"$tag\"/>\n"; |
|
| 125 |
} |
|
| tmp/org.txm.treetagger.core.win32/res/win32/cmd/utf8-tokenize.perl (revision 1683) | ||
|---|---|---|
| 1 |
#!/usr/bin/perl |
|
| 2 |
|
|
| 3 |
######################################################################## |
|
| 4 |
# # |
|
| 5 |
# tokenization script for tagger preprocessing # |
|
| 6 |
# Author: Helmut Schmid, IMS, University of Stuttgart # |
|
| 7 |
# Serge Sharoff, University of Leeds # |
|
| 8 |
# Description: # |
|
| 9 |
# - splits input text into tokens (one token per line) # |
|
| 10 |
# - cuts off punctuation, parentheses etc. # |
|
| 11 |
# - disambiguates periods # |
|
| 12 |
# - preserves SGML markup # |
|
| 13 |
# # |
|
| 14 |
######################################################################## |
|
| 15 |
|
|
| 16 |
use Getopt::Std; |
|
| 17 |
use utf8; |
|
| 18 |
use Encode; |
|
| 19 |
|
|
| 20 |
getopts('hgfeiza:');
|
|
| 21 |
|
|
| 22 |
# Modify the following lines in order to adapt the tokenizer to other |
|
| 23 |
# types of text and/or languages |
|
| 24 |
|
|
| 25 |
# characters which have to be cut off at the beginning of a word |
|
| 26 |
my $PChar='[¿¡{(\\`"‚„†‡‹‘’“”•–—›'."'";
|
|
| 27 |
|
|
| 28 |
# characters which have to be cut off at the end of a word |
|
| 29 |
my $FChar=']}\'\`\"),;:\!\?\%‚„…†‡‰‹‘’“”•–—›'; |
|
| 30 |
|
|
| 31 |
# character sequences which have to be cut off at the beginning of a word |
|
| 32 |
my $PClitic=''; |
|
| 33 |
|
|
| 34 |
# character sequences which have to be cut off at the end of a word |
|
| 35 |
my $FClitic; |
|
| 36 |
|
|
| 37 |
if (defined($opt_e)) {
|
|
| 38 |
# English |
|
| 39 |
$FClitic = '\'(s|re|ve|d|m|em|ll)|n\'t'; |
|
| 40 |
} |
|
| 41 |
if (defined($opt_i)) {
|
|
| 42 |
# Italian |
|
| 43 |
$PClitic = '[dD][ae]ll\'|[nN]ell\'|[Aa]ll\'|[lLDd]\'|[Ss]ull\'|[Qq]uest\'|[Uu]n\'|[Ss]enz\'|[Tt]utt\''; |
|
| 44 |
} |
|
| 45 |
if (defined($opt_f)) {
|
|
| 46 |
# French |
|
| 47 |
$PClitic = '[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\''; |
|
| 48 |
$FClitic = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-l'; |
|
| 49 |
} |
|
| 50 |
if (defined($opt_z)) {
|
|
| 51 |
# Galician |
|
| 52 |
$FClitic = '-la|-las|-lo|-los|-nos'; |
|
| 53 |
} |
|
| 54 |
|
|
| 55 |
|
|
| 56 |
### NO MODIFICATIONS REQUIRED BEYOND THIS LINE ######################### |
|
| 57 |
|
|
| 58 |
if (defined($opt_h)) {
|
|
| 59 |
die " |
|
| 60 |
Usage: utf8-tokenize.perl [ options ] ...files... |
|
| 61 |
|
|
| 62 |
Options: |
|
| 63 |
-e : English text |
|
| 64 |
-f : French text |
|
| 65 |
-i : Italian text |
|
| 66 |
-a <file>: <file> contains a list of words which are either abbreviations or |
|
| 67 |
words which should not be further split. |
|
| 68 |
"; |
|
| 69 |
} |
|
| 70 |
|
|
| 71 |
# Read the list of abbreviations and words |
|
| 72 |
if (defined($opt_a)) {
|
|
| 73 |
die "Can't read: $opt_a: $!\n" unless (open(FILE, $opt_a)); |
|
| 74 |
while (<FILE>) {
|
|
| 75 |
$_ = decode('utf8',$_);
|
|
| 76 |
s/^[ \t\r\n]+//; |
|
| 77 |
s/[ \t\r\n]+$//; |
|
| 78 |
next if (/^\#/ || /^\s$/); # ignore comments |
|
| 79 |
$Token{$_} = 1;
|
|
| 80 |
} |
|
| 81 |
close FILE; |
|
| 82 |
} |
|
| 83 |
|
|
| 84 |
#SS: main loop; |
|
| 85 |
my $first_line = 1; |
|
| 86 |
while (<>) {
|
|
| 87 |
$_ = decode('utf8',$_);
|
|
| 88 |
# delete optional byte order markers (BOM) |
|
| 89 |
if ($first_line) {
|
|
| 90 |
undef $first_line; |
|
| 91 |
s/^\x{FEFF}//;
|
|
| 92 |
} |
|
| 93 |
|
|
| 94 |
# replace newlines and tab characters with blanks |
|
| 95 |
tr/\n\t/ /; |
|
| 96 |
|
|
| 97 |
# replace blanks within SGML tags |
|
| 98 |
while (s/(<[^<> ]*) ([^<>]*>)/$1\377$2/g) {
|
|
| 99 |
} |
|
| 100 |
; |
|
| 101 |
#Separ: ÿþ |
|
| 102 |
|
|
| 103 |
# replace whitespace with a special character |
|
| 104 |
tr/ /\376/; |
|
| 105 |
|
|
| 106 |
# restore SGML tags |
|
| 107 |
tr/\377\376/ \377/; |
|
| 108 |
|
|
| 109 |
# prepare SGML-Tags for tokenization |
|
| 110 |
s/(<[^<>]*>)/\377$1\377/g; |
|
| 111 |
s/^\377//; |
|
| 112 |
s/\377$//; |
|
| 113 |
s/\377\377\377*/\377/g; |
|
| 114 |
|
|
| 115 |
@S = split("\377");
|
|
| 116 |
for ( $i=0; $i<=$#S; $i++) {
|
|
| 117 |
$_ = $S[$i]; |
|
| 118 |
|
|
| 119 |
if (/^<.*>$/) {
|
|
| 120 |
# SGML tag |
|
| 121 |
print encode('utf8',"$_\n");
|
|
| 122 |
} else {
|
|
| 123 |
# add a blank at the beginning and the end of each segment |
|
| 124 |
$_ = ' '.$_.' '; |
|
| 125 |
# insert missing blanks after punctuation |
|
| 126 |
s/(\.\.\.)/ ... /g; |
|
| 127 |
s/([;\!\?])([^ ])/$1 $2/g; |
|
| 128 |
s/([.,:])([^ 0-9.])/$1 $2/g; |
|
| 129 |
|
|
| 130 |
@F = split; |
|
| 131 |
for ( $j=0; $j<=$#F; $j++) {
|
|
| 132 |
my $suffix=""; |
|
| 133 |
$_ = $F[$j]; |
|
| 134 |
# separate punctuation and parentheses from words |
|
| 135 |
do {
|
|
| 136 |
$finished = 1; |
|
| 137 |
# cut off preceding punctuation |
|
| 138 |
if (s/^([$PChar])(.)/$2/) {
|
|
| 139 |
print encode('utf8',"$1\n");
|
|
| 140 |
$finished = 0; |
|
| 141 |
} |
|
| 142 |
# cut off trailing punctuation |
|
| 143 |
if (s/(.)([$FChar])$/$1/) {
|
|
| 144 |
$suffix = "$2\n$suffix"; |
|
| 145 |
$finished = 0; |
|
| 146 |
} |
|
| 147 |
# cut off trailing periods if punctuation precedes |
|
| 148 |
if (s/([$FChar])\.$//) {
|
|
| 149 |
$suffix = ".\n$suffix"; |
|
| 150 |
if ($_ eq "") {
|
|
| 151 |
$_ = $1; |
|
| 152 |
} else {
|
|
| 153 |
$suffix = "$1\n$suffix"; |
|
| 154 |
} |
|
| 155 |
$finished = 0; |
|
| 156 |
} |
|
| 157 |
} while (!$finished); |
|
| 158 |
|
|
| 159 |
# handle explicitly listed tokens |
|
| 160 |
if (defined($Token{$_})) {
|
|
| 161 |
print encode('utf8',"$_\n$suffix");
|
|
| 162 |
next; |
|
| 163 |
} |
|
| 164 |
|
|
| 165 |
# abbreviations of the form A. or U.S.A. |
|
| 166 |
if (/^([A-Za-z-]\.)+$/) {
|
|
| 167 |
print encode('utf8',"$_\n$suffix");
|
|
| 168 |
next; |
|
| 169 |
} |
|
| 170 |
|
|
| 171 |
|
|
| 172 |
# disambiguate periods |
|
| 173 |
if (/^(..*)\.$/ && $_ ne "..." && !($opt_g && /^[0-9]+\.$/)) {
|
|
| 174 |
$_ = $1; |
|
| 175 |
$suffix = ".\n$suffix"; |
|
| 176 |
if (defined($Token{$_})) {
|
|
| 177 |
print encode('utf8',"$_\n$suffix");
|
|
| 178 |
next; |
|
| 179 |
} |
|
| 180 |
} |
|
| 181 |
|
|
| 182 |
# cut off clitics |
|
| 183 |
while (s/^(--)(.)/$2/) {
|
|
| 184 |
print encode('utf8',"$1\n");
|
|
| 185 |
} |
|
| 186 |
if ($PClitic ne '') {
|
|
| 187 |
while (s/^($PClitic)(.)/$2/) {
|
|
| 188 |
print encode('utf8',"$1\n");
|
|
| 189 |
} |
|
| 190 |
} |
|
| 191 |
|
|
| 192 |
while (s/(.)(--)$/$1/) {
|
|
| 193 |
$suffix = "$2\n$suffix"; |
|
| 194 |
} |
|
| 195 |
if ($FClitic ne '') {
|
|
| 196 |
while (s/(.)($FClitic)$/$1/) {
|
|
| 197 |
$suffix = "$2\n$suffix"; |
|
| 198 |
} |
|
| 199 |
} |
|
| 200 |
|
|
| 201 |
print encode('utf8',"$_\n$suffix");
|
|
| 202 |
} |
|
| 203 |
} |
|
| 204 |
} |
|
| 205 |
} |
|
| 0 | 206 | |
| tmp/org.txm.treetagger.core.win32/res/win32/cmd/filter-chunker-output.perl (revision 1683) | ||
|---|---|---|
| 1 |
#!/usr/bin/perl |
|
| 2 |
|
|
| 3 |
use Getopt::Std; |
|
| 4 |
getopts('t');
|
|
| 5 |
|
|
| 6 |
print doc_start(); |
|
| 7 |
|
|
| 8 |
$n = 0; |
|
| 9 |
while (<>) {
|
|
| 10 |
s/.-SBAR$/O/; |
|
| 11 |
|
|
| 12 |
if (/^(.*)-(.*)\t(.*)\/(.*)$/) {
|
|
| 13 |
$token[$n] = $1; |
|
| 14 |
$tag[$n] = $2; |
|
| 15 |
$chunk[$n] = $4; |
|
| 16 |
if ($chunk[$n] =~ /^(.*)-(.*)$/) {
|
|
| 17 |
$flag[$n] = $1; |
|
| 18 |
$chunk[$n] = $2; |
|
| 19 |
} |
|
| 20 |
else {
|
|
| 21 |
undef $flag[$n]; |
|
| 22 |
undef $chunk[$n]; |
|
| 23 |
} |
|
| 24 |
print_sentence() if $token[$n] eq '.'; |
|
| 25 |
$n++; |
|
| 26 |
} |
|
| 27 |
|
|
| 28 |
else {
|
|
| 29 |
$markup[$n] .= $_; |
|
| 30 |
} |
|
| 31 |
} |
|
| 32 |
|
|
| 33 |
print_sentence(); |
|
| 34 |
print doc_end(); |
|
| 35 |
|
|
| 36 |
|
|
| 37 |
sub print_sentence {
|
|
| 38 |
my($i,$chunk); |
|
| 39 |
|
|
| 40 |
for( $i=0; $i<=$n; $i++ ) {
|
|
| 41 |
|
|
| 42 |
if ($flag[$i] eq 'I' && $chunk ne $chunk[$i]) {
|
|
| 43 |
$flag[$i] = 'B'; |
|
| 44 |
} |
|
| 45 |
|
|
| 46 |
if ($flag[$i] ne '' && $token[$i] eq '.') {
|
|
| 47 |
delete $flag[$i]; |
|
| 48 |
$chunk[$i] = '0'; |
|
| 49 |
} |
|
| 50 |
|
|
| 51 |
if ($flag[$i] eq 'B') {
|
|
| 52 |
if (defined $chunk) {
|
|
| 53 |
if (($chunk eq 'PC' && $chunk[$i] eq 'NC') || |
|
| 54 |
($chunk eq 'PP' && $chunk[$i] eq 'NP')) |
|
| 55 |
{
|
|
| 56 |
$inPC = $chunk; |
|
| 57 |
} |
|
| 58 |
else {
|
|
| 59 |
$cetags[$i-1] = end_tag($chunk); |
|
| 60 |
if (defined $inPC) {
|
|
| 61 |
$cetags[$i-1] .= end_tag($inPC); |
|
| 62 |
undef $inPC; |
|
| 63 |
} |
|
| 64 |
} |
|
| 65 |
} |
|
| 66 |
$chunk = $chunk[$i]; |
|
| 67 |
$cbtags[$i] .= start_tag($chunk[$i]); |
|
| 68 |
} |
|
| 69 |
|
|
| 70 |
elsif ($flag[$i] ne 'I' && defined $chunk) {
|
|
| 71 |
$cetags[$i-1] = end_tag($chunk); |
|
| 72 |
undef $chunk; |
|
| 73 |
if (defined $inPC) {
|
|
| 74 |
$cetags[$i-1] .= end_tag($inPC); |
|
| 75 |
undef $inPC; |
|
| 76 |
} |
|
| 77 |
} |
|
| 78 |
} |
|
| 79 |
|
|
| 80 |
for( $i=0; $i<=$n; $i++ ) {
|
|
| 81 |
print $markup[$i]; |
|
| 82 |
print $cbtags[$i]; |
|
| 83 |
print token_and_tag($token[$i],$tag[$i]) if defined $token[$i]; |
|
| 84 |
print $cetags[$i]; |
|
| 85 |
} |
|
| 86 |
|
|
| 87 |
undef @token; |
|
| 88 |
undef @tag; |
|
| 89 |
undef @chunk; |
|
| 90 |
undef @cbtags; |
|
| 91 |
undef @cetags; |
|
| 92 |
undef @flag; |
|
| 93 |
undef @markup; |
|
| 94 |
$n = 0; |
|
| 95 |
} |
|
| 96 |
|
|
| 97 |
sub doc_start {
|
|
| 98 |
return '' unless defined $opt_t; |
|
| 99 |
return "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"yes\"?>\n<corpus>\n"; |
|
| 100 |
} |
|
| 101 |
|
|
| 102 |
sub doc_end {
|
|
| 103 |
return '' unless defined $opt_t; |
|
| 104 |
return "</corpus>\n"; |
|
| 105 |
} |
|
| 106 |
|
|
| 107 |
sub start_tag {
|
|
| 108 |
my $t=shift; |
|
| 109 |
return "<$t>\n" unless defined $opt_t; |
|
| 110 |
return " <phrase cat=\"$t\">\n"; |
|
| 111 |
} |
|
| 112 |
|
|
| 113 |
sub end_tag {
|
|
| 114 |
my $t=shift; |
|
| 115 |
return "</$t>\n" unless defined $opt_t; |
|
| 116 |
return " </phrase>\n"; |
|
| 117 |
} |
|
| 118 |
|
|
| 119 |
sub token_and_tag {
|
|
| 120 |
my ($token,$tag)=@_; |
|
| 121 |
return "$token\t$tag\n" unless defined $opt_t; |
|
| 122 |
return " <token word=\"$token\" pos=\"$tag\"/>\n"; |
|
| 123 |
} |
|
| tmp/org.txm.treetagger.core.win32/res/win32/cmd/mwl-lookup-greek.perl (revision 1683) | ||
|---|---|---|
| 1 |
#!/usr/bin/perl |
|
| 2 |
|
|
| 3 |
$month{"??????????"} = 1;
|
|
| 4 |
$month{"???????????"} = 1;
|
|
| 5 |
$month{"???????"} = 1;
|
|
| 6 |
$month{"????????"} = 1;
|
|
| 7 |
$month{"?????"} = 1;
|
|
| 8 |
$month{"?????"} = 1;
|
|
| 9 |
$month{"?????"} = 1;
|
|
| 10 |
$month{"???????"} = 1;
|
|
| 11 |
$month{"???????"} = 1;
|
|
| 12 |
$month{"?????????"} = 1;
|
|
| 13 |
$month{"???????????"} = 1;
|
|
| 14 |
$month{"?????????"} = 1;
|
|
| 15 |
$month{"?????????"} = 1;
|
|
| 16 |
$month{"??????????"} = 1;
|
|
| 17 |
|
|
| 18 |
$month{"?????????"} = 1;
|
|
| 19 |
$month{"??????????"} = 1;
|
|
| 20 |
$month{"??????"} = 1;
|
|
| 21 |
$month{"???????"} = 1;
|
|
| 22 |
$month{"????"} = 1;
|
|
| 23 |
$month{"??????"} = 1;
|
|
| 24 |
$month{"??????"} = 1;
|
|
| 25 |
$month{"????????"} = 1;
|
|
| 26 |
$month{"????????"} = 1;
|
|
| 27 |
$month{"??????????"} = 1;
|
|
| 28 |
$month{"????????"} = 1;
|
|
| 29 |
$month{"????????"} = 1;
|
|
| 30 |
$month{"?????????"} = 1;
|
|
| 31 |
|
|
| 32 |
while (<>) {
|
|
| 33 |
chomp; |
|
| 34 |
if ($_ eq '') {
|
|
| 35 |
print_sentence(); |
|
| 36 |
} |
|
| 37 |
else {
|
|
| 38 |
push @token, $_; |
|
| 39 |
} |
|
| 40 |
} |
|
| 41 |
print_sentence(); |
|
| 42 |
|
|
| 43 |
sub print_sentence {
|
|
| 44 |
for( $i=0; $i<=$#token; $i++ ) {
|
|
| 45 |
if (exists $month{$token[$i]}) {
|
|
| 46 |
$start = $end = $i; |
|
| 47 |
if ($token[$start-1] =~ /^[1-9][0-9]?([???]??)?(-[1-9][0-9]?([???]??)?)?$/){
|
|
| 48 |
$start--; |
|
| 49 |
} |
|
| 50 |
if ($token[$start-1] eq '??????') {
|
|
| 51 |
$start--; |
|
| 52 |
} |
|
| 53 |
if ($token[$end+1] eq '???') {
|
|
| 54 |
$end++; |
|
| 55 |
} |
|
| 56 |
if ($token[$end+1] =~ /^(1[0-9][0-9][0-9]|20[0-9][0-9]|'[0-9][0-9])$/) {
|
|
| 57 |
$end++; |
|
| 58 |
} |
|
| 59 |
for( $k=$start; $k<$end; $k++) {
|
|
| 60 |
$join[$k] = 1; |
|
| 61 |
} |
|
| 62 |
} |
|
| 63 |
elsif (($token[$i] eq "??'" && $token[$i+1] eq '???') || |
|
| 64 |
($token[$i] eq '??' && |
|
| 65 |
($token[$i+1] eq '????' || $token[$i+1] eq '????'))) |
|
| 66 |
{
|
|
| 67 |
$join[$i] = 1; |
|
| 68 |
} |
|
| 69 |
|
|
| 70 |
} |
|
| 71 |
for( $i=0; $i<=$#token; $i++ ) {
|
|
| 72 |
if ($join[$i] == 1) {
|
|
| 73 |
print "$token[$i] " |
|
| 74 |
} |
|
| 75 |
else {
|
|
| 76 |
print "$token[$i]\n" |
|
| 77 |
} |
|
| 78 |
} |
|
| 79 |
undef @token; |
|
| 80 |
undef @join; |
|
| 81 |
} |
|
| tmp/org.txm.treetagger.core.win32/res/win32/cmd/tokenize.pl (revision 1683) | ||
|---|---|---|
| 1 |
#!/usr/bin/perl |
|
| 2 |
|
|
| 3 |
######################################################################## |
|
| 4 |
# # |
|
| 5 |
# tokenization script for tagger preprocessing # |
|
| 6 |
# Author: Helmut Schmid, IMS, University of Stuttgart # |
|
| 7 |
# Serge Sharoff, University of Leeds # |
|
| 8 |
# Description: # |
|
| 9 |
# - splits input text into tokens (one token per line) # |
|
| 10 |
# - cuts off punctuation, parentheses etc. # |
|
| 11 |
# - disambiguates periods # |
|
| 12 |
# - preserves SGML markup # |
|
| 13 |
# # |
|
| 14 |
######################################################################## |
|
| 15 |
|
|
| 16 |
use Getopt::Std; |
|
| 17 |
|
|
| 18 |
getopts('hfeia:u');
|
|
| 19 |
|
|
| 20 |
use utf8; |
|
| 21 |
if (defined $opt_u) {
|
|
| 22 |
use open ':utf8'; |
|
| 23 |
binmode(STDIN,":utf8"); |
|
| 24 |
binmode(STDOUT,":utf8"); |
|
| 25 |
} |
|
| 26 |
|
|
| 27 |
# Modify the following lines in order to adapt the tokenizer to other |
|
| 28 |
# types of text and/or languages |
|
| 29 |
|
|
| 30 |
# characters which have to be cut off at the beginning of a word |
|
| 31 |
my $PChar='[¿¡{(\\`"‚„†‡‹‘’“”•–—›';
|
|
| 32 |
|
|
| 33 |
# characters which have to be cut off at the end of a word |
|
| 34 |
my $FChar=']}\'\`\"),;:\!\?\%‚„…†‡‰‹‘’“”•–—›'; |
|
| 35 |
|
|
| 36 |
# character sequences which have to be cut off at the beginning of a word |
|
| 37 |
my $PClitic=''; |
|
| 38 |
|
|
| 39 |
# character sequences which have to be cut off at the end of a word |
|
| 40 |
my $FClitic; |
|
| 41 |
|
|
| 42 |
if (defined($opt_e)) {
|
|
| 43 |
# English |
|
| 44 |
$FClitic = '\'(s|re|ve|d|m|em|ll)|n\'t'; |
|
| 45 |
} |
|
| 46 |
if (defined($opt_i)) {
|
|
| 47 |
# Italian |
|
| 48 |
$PClitic = '[dD][ae]ll\'|[nN]ell\'|[Aa]ll\'|[lLDd]\'|[Ss]ull\'|[Qq]uest\'|[Uu]n\'|[Ss]enz\'|[Tt]utt\''; |
|
| 49 |
} |
|
| 50 |
if (defined($opt_f)) {
|
|
| 51 |
# French |
|
| 52 |
$PClitic = '[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\''; |
|
| 53 |
$FClitic = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-l'; |
|
| 54 |
} |
|
| 55 |
|
|
| 56 |
|
|
| 57 |
### NO MODIFICATIONS REQUIRED BEYOND THIS LINE ######################### |
|
| 58 |
|
|
| 59 |
if (defined($opt_h)) {
|
|
| 60 |
die " |
|
| 61 |
Usage: tokenize.perl [ options ] ...files... |
|
| 62 |
|
|
| 63 |
Options: |
|
| 64 |
-u : use UTF8 encoding |
|
| 65 |
-e : English text |
|
| 66 |
-f : French text |
|
| 67 |
-i : Italian text |
|
| 68 |
-a <file>: <file> contains a list of words which are either abbreviations or |
|
| 69 |
words which should not be further split. |
|
| 70 |
"; |
|
| 71 |
} |
|
| 72 |
|
|
| 73 |
# Read the list of abbreviations and words |
|
| 74 |
if (defined($opt_a)) {
|
|
| 75 |
die "Can't read: $opt_a: $!\n" unless (open(FILE, $opt_a)); |
|
| 76 |
while (<FILE>) {
|
|
| 77 |
s/^[ \t\r\n]+//; |
|
| 78 |
s/[ \t\r\n]+$//; |
|
| 79 |
next if (/^\#/ || /^\s$/); # ignore comments |
|
| 80 |
$Token{$_} = 1;
|
|
| 81 |
} |
|
| 82 |
close FILE; |
|
| 83 |
} |
|
| 84 |
|
|
| 85 |
#SS: main loop; |
|
| 86 |
my $first_line = 1; |
|
| 87 |
while (<>) {
|
|
| 88 |
# delete optional byte order markers (BOM) |
|
| 89 |
if ($first_line) {
|
|
| 90 |
undef $first_line; |
|
| 91 |
s/^\x{FEFF}//;
|
|
| 92 |
} |
|
| 93 |
|
|
| 94 |
# replace newlines and tab characters with blanks |
|
| 95 |
tr/\n\t/ /; |
|
| 96 |
|
|
| 97 |
# replace blanks within SGML tags |
|
| 98 |
while (s/(<[^<> ]*) ([^<>]*>)/$1\377$2/g) {
|
|
| 99 |
} |
|
| 100 |
; |
|
| 101 |
#Separ: ÿþ |
|
| 102 |
|
|
| 103 |
# replace whitespace with a special character |
|
| 104 |
tr/ /\376/; |
|
| 105 |
|
|
| 106 |
# restore SGML tags |
|
| 107 |
tr/\377\376/ \377/; |
|
| 108 |
|
|
| 109 |
# prepare SGML-Tags for tokenization |
|
| 110 |
s/(<[^<>]*>)/\377$1\377/g; |
|
| 111 |
s/^\377//; |
|
| 112 |
s/\377$//; |
|
| 113 |
s/\377\377\377*/\377/g; |
|
| 114 |
|
|
| 115 |
@S = split("\377");
|
|
| 116 |
for ( $i=0; $i<=$#S; $i++) {
|
|
| 117 |
$_ = $S[$i]; |
|
| 118 |
|
|
| 119 |
if (/^<.*>$/) {
|
|
| 120 |
# SGML tag |
|
| 121 |
print $_,"\n"; |
|
| 122 |
} else {
|
|
| 123 |
# add a blank at the beginning and the end of each segment |
|
| 124 |
$_ = ' '.$_.' '; |
|
| 125 |
# insert missing blanks after punctuation |
|
| 126 |
s/(\.\.\.)/ ... /g; |
|
| 127 |
s/([;\!\?])([^ ])/$1 $2/g; |
|
| 128 |
s/([.,:])([^ 0-9.])/$1 $2/g; |
|
| 129 |
|
|
| 130 |
@F = split; |
|
| 131 |
for ( $j=0; $j<=$#F; $j++) {
|
|
| 132 |
my $suffix=""; |
|
| 133 |
$_ = $F[$j]; |
|
| 134 |
# separate punctuation and parentheses from words |
|
| 135 |
do {
|
|
| 136 |
$finished = 1; |
|
| 137 |
# cut off preceding punctuation |
|
| 138 |
if (s/^([$PChar])(.)/$2/) {
|
|
| 139 |
print $1,"\n"; |
|
| 140 |
$finished = 0; |
|
| 141 |
} |
|
| 142 |
# cut off trailing punctuation |
|
| 143 |
if (s/(.)([$FChar])$/$1/) {
|
|
| 144 |
$suffix = "$2\n$suffix"; |
|
| 145 |
$finished = 0; |
|
| 146 |
} |
|
| 147 |
# cut off trailing periods if punctuation precedes |
|
| 148 |
if (s/([$FChar])\.$//) {
|
|
| 149 |
$suffix = ".\n$suffix"; |
|
| 150 |
if ($_ eq "") {
|
|
| 151 |
$_ = $1; |
|
| 152 |
} else {
|
|
| 153 |
$suffix = "$1\n$suffix"; |
|
| 154 |
} |
|
| 155 |
$finished = 0; |
|
| 156 |
} |
|
| 157 |
} while (!$finished); |
|
| 158 |
|
|
| 159 |
# handle explicitly listed tokens |
|
| 160 |
if (defined($Token{$_})) {
|
|
| 161 |
print "$_\n$suffix"; |
|
| 162 |
next; |
|
| 163 |
} |
|
| 164 |
|
|
| 165 |
# abbreviations of the form A. or U.S.A. |
|
| 166 |
if (/^([A-Za-z-]\.)+$/) {
|
|
| 167 |
print "$_\n$suffix"; |
|
| 168 |
next; |
|
| 169 |
} |
|
| 170 |
|
|
| 171 |
# disambiguate periods |
|
| 172 |
if (/^(..*)\.$/ && $_ ne "..." && !/^[0-9]+\.$/) {
|
|
| 173 |
$_ = $1; |
|
| 174 |
$suffix = ".\n$suffix"; |
|
| 175 |
if (defined($Token{$_})) {
|
|
| 176 |
print "$_\n$suffix"; |
|
| 177 |
next; |
|
| 178 |
} |
|
| 179 |
} |
|
| 180 |
|
|
| 181 |
# cut off clitics |
|
| 182 |
if ($PClitic ne '') {
|
|
| 183 |
while (s/^($PClitic)(.)/$2/) {
|
|
| 184 |
print $1,"\n"; |
|
| 185 |
} |
|
| 186 |
} |
|
| 187 |
if ($FClitic ne '') {
|
|
| 188 |
while (s/(.)($FClitic)$/$1/) {
|
|
| 189 |
$suffix = "$2\n$suffix"; |
|
| 190 |
} |
|
| 191 |
} |
|
| 192 |
|
|
| 193 |
print "$_\n$suffix"; |
|
| 194 |
} |
|
| 195 |
} |
|
| 196 |
} |
|
| 197 |
} |
|
| 0 | 198 | |
| tmp/org.txm.treetagger.core.win32/res/win32/lib/english-abbreviations (revision 1683) | ||
|---|---|---|
| 1 |
Adm. |
|
| 2 |
Ala. |
|
| 3 |
Ariz. |
|
| 4 |
Ark. |
|
| 5 |
Aug. |
|
| 6 |
Ave. |
|
| 7 |
Bancorp. |
|
| 8 |
Bhd. |
|
| 9 |
Brig. |
|
| 10 |
Bros. |
|
| 11 |
CO. |
|
| 12 |
CORP. |
|
| 13 |
COS. |
|
| 14 |
Ca. |
|
| 15 |
Calif. |
|
| 16 |
Canada-U.S. |
|
| 17 |
Canadian-U.S. |
|
| 18 |
Capt. |
|
| 19 |
Cia. |
|
| 20 |
Cie. |
|
| 21 |
Co. |
|
| 22 |
Col. |
|
| 23 |
Colo. |
|
| 24 |
Conn. |
|
| 25 |
Corp. |
|
| 26 |
Cos. |
|
| 27 |
D-Mass. |
|
| 28 |
Dec. |
|
| 29 |
Del. |
|
| 30 |
Dept. |
|
| 31 |
Dr. |
|
| 32 |
Drs. |
|
| 33 |
Etc. |
|
| 34 |
Feb. |
|
| 35 |
Fla. |
|
| 36 |
Ft. |
|
| 37 |
Ga. |
|
| 38 |
Gen. |
|
| 39 |
Gov. |
|
| 40 |
Hon. |
|
| 41 |
INC. |
|
| 42 |
Ill. |
|
| 43 |
Inc. |
|
| 44 |
Ind. |
|
| 45 |
Jan. |
|
| 46 |
Japan-U.S. |
|
| 47 |
Jr. |
|
| 48 |
Kan. |
|
| 49 |
Korean-U.S. |
|
| 50 |
Ky. |
|
| 51 |
La. |
|
| 52 |
Lt. |
|
| 53 |
Ltd. |
|
| 54 |
Maj. |
|
| 55 |
Mass. |
|
| 56 |
Md. |
|
| 57 |
Messrs. |
|
| 58 |
Mfg. |
|
| 59 |
Mich. |
|
| 60 |
Minn. |
|
| 61 |
Miss. |
|
| 62 |
Mo. |
|
| 63 |
Mr. |
|
| 64 |
Mrs. |
|
| 65 |
Ms. |
|
| 66 |
Neb. |
|
| 67 |
Nev. |
|
| 68 |
No. |
|
| 69 |
Nos. |
|
| 70 |
Nov. |
|
| 71 |
Oct. |
|
| 72 |
Okla. |
|
| 73 |
Ont. |
|
| 74 |
Ore. |
|
| 75 |
Pa. |
|
| 76 |
Ph. |
|
| 77 |
Prof. |
|
| 78 |
Prop. |
|
| 79 |
Pty. |
|
| 80 |
Rep. |
|
| 81 |
Reps. |
|
| 82 |
Rev. |
|
| 83 |
S.p.A. |
|
| 84 |
Sen. |
|
| 85 |
Sens. |
|
| 86 |
Sept. |
|
| 87 |
Sgt. |
|
| 88 |
Sino-U.S. |
|
| 89 |
Sr. |
|
| 90 |
St. |
|
| 91 |
Ste. |
|
| 92 |
Tenn. |
|
| 93 |
Tex. |
|
| 94 |
U.S.-U.K. |
|
| 95 |
U.S.-U.S.S.R. |
|
| 96 |
Va. |
|
| 97 |
Vt. |
|
| 98 |
W.Va. |
|
| 99 |
Wash. |
|
| 100 |
Wis. |
|
| 101 |
Wyo. |
|
| 102 |
a.k.a. |
|
| 103 |
a.m. |
|
| 104 |
anti-U.S. |
|
| 105 |
cap. |
|
| 106 |
days. |
|
| 107 |
etc. |
|
| 108 |
ft. |
|
| 109 |
i.e. |
|
| 110 |
non-U.S. |
|
| 111 |
p.m. |
|
| 112 |
president-U.S. |
|
| 113 |
s.r.l. |
|
| 114 |
v. |
|
| 115 |
v.B. |
|
| 116 |
v.w. |
|
| 117 |
vs. |
|
| tmp/org.txm.treetagger.core.win32/res/win32/lib/german-abbreviations (revision 1683) | ||
|---|---|---|
| 1 |
A. |
|
| 2 |
A.-G. |
|
| 3 |
A.G. |
|
| 4 |
ADN-Korr. |
|
| 5 |
AT-Mot. |
|
| 6 |
Abb. |
|
| 7 |
Abess. |
|
| 8 |
Abl. |
|
| 9 |
Ablief.-Gew. |
|
| 10 |
Abm. |
|
| 11 |
Abs. |
|
| 12 |
Abt. |
|
| 13 |
Abtlg. |
|
| 14 |
Agl. |
|
| 15 |
Agt. |
|
| 16 |
Akt.-Ges. |
|
| 17 |
Aktbr. |
|
| 18 |
Alg. |
|
| 19 |
Alleininh. |
|
| 20 |
Allg. |
|
| 21 |
Altwageneint. |
|
| 22 |
Alu-Felg. |
|
| 23 |
Alum. |
|
| 24 |
Am. |
|
| 25 |
Amp. |
|
| 26 |
Anf. |
|
| 27 |
Anfr. |
|
| 28 |
Anfrag. |
|
| 29 |
Ang. |
|
| 30 |
Angb. |
|
| 31 |
Angeb. |
|
| 32 |
Angl. |
|
| 33 |
Anhängerkuppl. |
|
| 34 |
Anl. |
|
| 35 |
Anleih. |
|
| 36 |
Ann.-Exp. |
|
| 37 |
Ann.-Exped. |
|
| 38 |
Ant. |
|
| 39 |
Anten. |
|
| 40 |
Anz. |
|
| 41 |
Anz.-Exp. |
|
| 42 |
Anz.-Verm. |
|
| 43 |
Anzahlg. |
|
| 44 |
Anzhlg. |
|
| 45 |
Apoth. |
|
| 46 |
App. |
|
| 47 |
Appartem. |
|
| 48 |
April-Lief. |
|
| 49 |
Argent. |
|
| 50 |
Atl. |
|
| 51 |
Aufb. |
|
| 52 |
Aufst. |
|
| 53 |
Aug. |
|
| 54 |
Augsb. |
|
| 55 |
Ausg. |
|
| 56 |
Ausgl. |
|
| 57 |
Ausk. |
|
| 58 |
Ausl. |
|
| 59 |
Ausl.-Akt. |
|
| 60 |
Auslandsanl. |
|
| 61 |
Auslandsb. |
|
| 62 |
Ausst. |
|
| 63 |
Ausstattg. |
|
| 64 |
Austral. |
|
| 65 |
Ausz. |
|
| 66 |
Aut. |
|
| 67 |
Autom. |
|
| 68 |
Automat. |
|
| 69 |
Automin. |
|
| 70 |
B. |
|
| 71 |
B.P. |
|
| 72 |
BGBl. |
|
| 73 |
Bahnhofstr. |
|
| 74 |
Balk. |
|
| 75 |
Bau-Ing. |
|
| 76 |
Bauges. |
|
| 77 |
Bauj. |
|
| 78 |
Bay. |
|
| 79 |
Bayer. |
|
| 80 |
Bb. |
|
| 81 |
Bd. |
|
| 82 |
Bed. |
|
| 83 |
Beding. |
|
| 84 |
Ber. |
|
| 85 |
Beratg. |
|
| 86 |
Bereif. |
|
| 87 |
Bergb. |
|
| 88 |
Bergstr. |
|
| 89 |
Bernh. |
|
| 90 |
Bes. |
|
| 91 |
Besichtig. |
|
| 92 |
Bestzust. |
|
| 93 |
Beteil. |
|
| 94 |
Beteilig. |
|
| 95 |
Betr. |
|
| 96 |
Bett. |
|
| 97 |
Bew. |
|
| 98 |
Bewerb. |
|
| 99 |
Bewerbg. |
|
| 100 |
Bez. |
|
| 101 |
Bgl. |
|
| 102 |
Bhf. |
|
| 103 |
Bierbr. |
|
| 104 |
Bildzuschr. |
|
| 105 |
Bilf. |
|
| 106 |
Bj. |
|
| 107 |
Bk. |
|
| 108 |
Bkz. |
|
| 109 |
Bl. |
|
| 110 |
Bln. |
|
| 111 |
Boch. |
|
| 112 |
Bod. |
|
| 113 |
Bor. |
|
| 114 |
Bov. |
|
| 115 |
Br. |
|
| 116 |
Brem. |
|
| 117 |
Brh. |
|
| 118 |
Brok. |
|
| 119 |
Brsg. |
|
| 120 |
Bu. |
|
| 121 |
Bung. |
|
| 122 |
Burgstr. |
|
| 123 |
Bw. |
|
| 124 |
Bwsp. |
|
| 125 |
Bz. |
|
| 126 |
Bäd. |
|
| 127 |
C. |
|
| 128 |
C.G. |
|
| 129 |
Cabr. |
|
| 130 |
Can. |
|
| 131 |
Cap. |
|
| 132 |
Cav. |
|
| 133 |
Cbr. |
|
| 134 |
Cem. |
|
| 135 |
Centralb. |
|
| 136 |
Cert. |
|
| 137 |
Ch. |
|
| 138 |
Charlottenstr. |
|
| 139 |
Chem. |
|
| 140 |
Chem.-Ing. |
|
| 141 |
Chevr. |
|
| 142 |
Chr. |
|
| 143 |
Christophstr. |
|
| 144 |
Cie. |
|
| 145 |
Co. |
|
| 146 |
Colorvergl. |
|
| 147 |
Commerzb. |
|
| 148 |
Conc. |
|
| 149 |
Cons. |
|
| 150 |
Corneliusstr. |
|
| 151 |
Corp. |
|
| 152 |
Cp. |
|
| 153 |
Cpt. |
|
| 154 |
Cz. |
|
| 155 |
D. |
|
| 156 |
DG. |
|
| 157 |
DM. |
|
| 158 |
DUB-Schulth. |
|
| 159 |
DW. |
|
| 160 |
Dahlb. |
|
| 161 |
Dawes-Anl. |
|
| 162 |
Dept. |
|
| 163 |
Dev. |
|
| 164 |
Dez. |
|
| 165 |
Di. |
|
| 166 |
Dipl. |
|
| 167 |
Dipl.-Ing. |
|
| 168 |
Dipl.-Kfm. |
|
| 169 |
Dir. |
|
| 170 |
Direktionsw. |
|
| 171 |
Div. |
|
| 172 |
Do. |
|
| 173 |
Doll. |
|
| 174 |
Don. |
|
| 175 |
Dorfk. |
|
| 176 |
Dpf. |
|
| 177 |
Dr. |
|
| 178 |
Dr.-Ing. |
|
| 179 |
Dreij. |
|
| 180 |
Drog. |
|
| 181 |
Dt. |
|
| 182 |
Du. |
|
| 183 |
Dyckerh. |
|
| 184 |
Dyn. |
|
| 185 |
Dän. |
|
| 186 |
Düsseld. |
|
| 187 |
E. |
|
| 188 |
E.h. |
|
| 189 |
Einf. |
|
| 190 |
Einh. |
|
| 191 |
Einr. |
|
| 192 |
Eint. |
|
| 193 |
Eintr. |
|
| 194 |
Einw. |
|
| 195 |
Einwohn. |
|
| 196 |
Einz. |
|
| 197 |
Einzelzi. |
|
| 198 |
Eisenb. |
|
| 199 |
El. |
|
| 200 |
Elektr. |
|
| 201 |
Em. |
|
| 202 |
Endpr. |
|
| 203 |
Engl. |
|
| 204 |
Ent. |
|
| 205 |
Entsch. |
|
| 206 |
Entw. |
|
| 207 |
Erdgesch. |
|
| 208 |
Erf. |
|
| 209 |
Erfahr. |
|
| 210 |
Erstzul. |
|
| 211 |
Erzgeb. |
|
| 212 |
Esc. |
|
| 213 |
Eterna. |
|
| 214 |
Etg.-Hs. |
|
| 215 |
Eur. |
|
| 216 |
Ew. |
|
| 217 |
Exp. |
|
| 218 |
Expl. |
|
| 219 |
F. |
|
| 220 |
FS. |
|
| 221 |
Fa. |
|
| 222 |
Fabr. |
|
| 223 |
Fabrikat. |
|
| 224 |
Fachm. |
|
| 225 |
Fachricht. |
|
| 226 |
Fahrz. |
|
| 227 |
Fam. |
|
| 228 |
Fb. |
|
| 229 |
Fd. |
|
| 230 |
Fds. |
|
| 231 |
Feb. |
|
| 232 |
Febr. |
|
| 233 |
Febr.-Abl. |
|
| 234 |
Febr.-März-Abl. |
|
| 235 |
Feldstr. |
|
| 236 |
Fensterh. |
|
| 237 |
Ferd. |
|
| 238 |
Ferdinandstr. |
|
| 239 |
Fernschr. |
|
| 240 |
Ferr. |
|
| 241 |
Feuervers. |
|
| 242 |
Ffm. |
|
| 243 |
Fil. |
|
| 244 |
Fin. |
|
| 245 |
Finanzier. |
|
| 246 |
Finanzierg. |
|
| 247 |
Finanzierungsmöglichk. |
|
| 248 |
Finnl. |
|
| 249 |
Ford. |
|
| 250 |
Fortschr. |
|
| 251 |
Fr. |
|
| 252 |
Frankf. |
|
| 253 |
Franz. |
|
| 254 |
Franziskanerstr. |
|
| 255 |
Freiverk. |
|
| 256 |
Frhr. |
|
| 257 |
Fried. |
|
| 258 |
Friedr. |
|
| 259 |
Friedrich-Ebert-Str. |
|
| 260 |
Friedrichstr. |
|
| 261 |
Frl. |
|
| 262 |
Frühst. |
|
| 263 |
Führersch. |
|
| 264 |
G. |
|
| 265 |
G.M.B.H. |
|
| 266 |
G.m.b.H. |
|
| 267 |
Gar. |
|
| 268 |
Garag. |
|
| 269 |
Gart. |
|
| 270 |
Geb. |
|
Formats disponibles : Unified diff