Révision 1683

tmp/org.txm.treetagger.core.win32/res/win32/cmd/filter-chunker-output-french.perl (revision 1683)
1
#!/usr/bin/perl
2

  
3
###################################################################
4
###                                                             ###
5
###      File: filter-chunker-output.perl                       ###
6
###    Author: Michel Genereux                                  ###
7
###            (indicated modifications by Dennis Spohr (DS))   ###
8
###   Purpose: Filter chunker output and create XML-like markup ###
9
###   Created: Mon Feb 19 2007                                  ###
10
###                                                             ###
11
###################################################################
12

  
13
use Getopt::Std;
14
getopts('t');
15

  
16
$| = 1;
17

  
18
print doc_start();
19

  
20
### DS start: end-of-sentence marker
21
$eos = '[.?!;]';
22
$push = 1;
23
###
24

  
25
$n = 0;
26

  
27
while (<>) {
28
  s/.-SBAR$/O/;
29
  s/I-PC$/0/  if (/I-PC$/ && !$inside_pp);
30

  
31
  ### DS start: process lemma column
32
  if (($token[$n],$tag[$n],$tag,$chunk[$n],$x,$lemma[$n]) = $_ =~ /^(.*)-(.*)\t(.*)\/(.*)(\t(.*))?$/) {
33
  ### DS end
34

  
35
    ### DS start: chunking error; some SENTs have e.g. I-NP although
36
    ###           they mark the end of a sentence; lead to omission 
37
    ###           of closing tags
38
    $push = 0;
39
    $chunk[$n] = 0 if ($tag eq 'SENT' && $chunk[$n] =~ /^(I|B)-/);
40
    ### DS end
41

  
42
    if ($chunk[$n] =~ /^(.*)-(.*)$/) {
43
      $flag[$n] = $1;
44
      $chunk[$n] = $2;
45
    } else {
46
      undef $flag[$n];
47
      undef $chunk[$n];
48
    }
49

  
50
    ### DS start: performance boost: set $n to 0 after printing
51
    ###           sentence; otherwise $n and arrays get too big 
52
    ###           and cause slowdown
53
    if ($token[$n] =~ /^$eos\s*$/ && $chunk[$n] == 0 && $tag[$n] eq 'SENT') {
54
      print_sentence(0);
55
      $n = 0;
56
      $start_markup = "";
57
    } else {
58
      $n++;
59
    }
60
    ### DS end
61

  
62
  ### DS start: keep markup already present in input data and insert
63
  ###           chunker markup correctly; if an element starting before
64
  ###           the sentence is closed before the sentence is closed
65
  ###           (e.g. headlines without sentence end markers), then
66
  ###           the sentence should also be closed, e.g. avoid cases like
67
  ###           <HEADLINE><s>Les r?sultats de jeudi</HEADLINE></s>
68
  } elsif (/^<([^\/]*?)(( |~).*)?>/ && $push) {
69
      push(@tag_stack,$1);
70
      $start_markup .= "$&\n";
71
  } elsif (/^<\/(.*?)>/ && $1 eq $tag_stack[$#tag_stack]) {
72
      $end_markup = "$&\n";
73
      print_sentence(1);
74
      $n = 0;
75
      $push = 1;
76
      $start_markup = "";
77
      $end_markup = "";
78
      pop(@tag_stack);
79
  ### DS end
80

  
81
  } else {
82
    $markup[$n] .= $_;
83
  }
84
}
85

  
86
print_sentence(1);
87
print doc_end();
88

  
89

  
90
sub print_sentence {
91

  
92
  ### DS start: indicate whether print_sentence is forced by
93
  ###           closing input markup
94
  my $forced = shift;
95
  ### DS end
96

  
97
  my($i,$chunk);
98

  
99
  for( $i=0; $i<=$n; $i++ ) {
100
    if ($flag[$i] eq 'I' && $chunk ne $chunk[$i]) {
101
      $flag[$i] = 'B';
102
    }
103
    if ($flag[$i] eq 'B') {
104
      if (defined $chunk) {
105
	$cetags[$i-1] .= end_tag($chunk);
106
      }
107
      if ($chunk[$i] eq 'PC') {
108
	for( $k=$i+1; $k<=$n; $k++ ) {
109
	  last if ($flag[$k] eq 'B');
110
	}
111
	for( $k++; $k<=$n; $k++ ) {
112
	  last if ($flag[$k] ne 'I');
113
	}
114
	if ($k <= $n && $flag[$k] eq 'E' && $chunk[$k] eq 'PC') {
115
	  $markup[$k+1] .= end_tag('PC');
116
	  undef $flag[$k];
117
	  undef $chunk[$k];
118

  
119
        ### DS start: $k may be greater than $n; add closing PC tag
120
        ###           to $markup[$n]; otherwise closing tags are
121
        ###           omitted
122
	} elsif ($k > $n && ($forced || $token[$n] =~ /^$eos\s*$/)) {
123
          $markup[$n] .= end_tag('PC');
124
        ### DS end
125

  
126
        } else {
127
	  $markup[$k] .= end_tag('PC');
128
	}
129
	undef $chunk;
130
      }
131
      else {
132
	$chunk = $chunk[$i];
133
      }
134
      $cbtags[$i] .= start_tag($chunk[$i]);
135
    }
136
    elsif ($flag[$i] eq 'E') {
137
      if ($chunk[$i] eq $chunk) {
138
	$cetags[$i] .= end_tag($chunk);
139
	undef $chunk;
140
      }
141
      elsif ($chunk[$i] eq 'PC') {
142
	$cetags[$i-1] .= end_tag($chunk) if defined $chunk;
143
	$cetags[$i] .= end_tag("PC");
144
	my $k;
145
	for( $k=$i; $k>=0; $k-- ) {
146
	  if ($flag[$k] eq 'B') {
147
	    $cbtags[$k] = start_tag("PC").$cbtags[$k];
148
	    last;
149
	  }
150
	}
151
	undef $chunk;
152
      }
153
      else {
154
	die;
155
      }
156
    }
157
    elsif ($flag[$i] ne 'I' && defined $chunk) {
158
      $cetags[$i-1] .= end_tag($chunk);
159
      undef $chunk;
160
    }
161
  }
162

  
163
  $printed = 0;# start_tag("s");
164

  
165
  ### DS start: print opening tags of input markup before sentence
166
  print $start_markup;
167
  print start_tag("s") if $n > 0;
168
  ### DS end
169

  
170
  for( $i=0; $i<=$n; $i++ ) {
171
    print $markup[$i];
172
    #unless ($printed) {
173
    #  print start_tag("s");
174
    #  $printed = 1;
175
    #}
176
    print $cbtags[$i];
177

  
178
    ### DS start: slightly renamed sub and added lemma parameter
179
    print token_and_tag_and_lemma($token[$i],$tag[$i],$lemma[$i]) if defined $token[$i];
180
    ### DS end
181

  
182
    print $cetags[$i];
183
  }
184

  
185
  ### DS start: print closing "s" tag and closing input markup if 
186
  ###           print_sentence had been forced
187
  print end_tag("s") if $n>0;
188
  print $end_markup if $forced;
189
  ### DS end
190

  
191
  undef @token;
192
  undef @tag;
193
  undef @chunk;
194
  undef @cbtags;
195
  undef @cetags;
196
  undef @flag;
197
  undef @markup;
198
}
199

  
200
sub doc_start {
201
  return '' unless defined $opt_t;
202
  return "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"yes\"?>\n<corpus>\n";
203
}
204

  
205
sub doc_end {
206
  return '' unless defined $opt_t;
207
  return "</corpus>\n";
208
}
209

  
210
sub start_tag {
211
  my $t=shift;
212
  return "<$t>\n" unless defined $opt_t;
213
  return "  <phrase cat=\"$t\">\n";
214
}
215

  
216
sub end_tag {
217
  my $t=shift;
218
  return "</$t>\n" unless defined $opt_t;
219
  return "  </phrase>\n";
220
}
221

  
222
### DS start: also process and output lemma parameter
223
sub token_and_tag_and_lemma {
224
  my ($token,$tag,$lemma)=@_;
225
  return "$token\t$tag\t$lemma\n" unless defined $opt_t;
226
  return "    <token word=\"$token\" lemma=\"$lemma\" pos=\"$tag\"/>\n";
227
}
228
### DS end
229

  
tmp/org.txm.treetagger.core.win32/res/win32/cmd/mwl-lookup.perl (revision 1683)
1
#!/usr/local/bin/perl
2

  
3
use Getopt::Std;
4
getopt('dhf:');
5

  
6
# This perl script recognizes multi word units in the input stream
7
# and puts them on one line. Input must have one-word-per-line format.
8
# The multi word units are listed in the parameter file with POS tags.
9
# Each line contains one multi word unit where the individual words
10
# are separated by blanks followed by a tab character and the blank-
11
# separated list of POS tags.
12
# Author: Helmut Schmid, IMS, Uni Stuttgart
13

  
14
if (!defined($opt_f) || defined($opt_h)) {
15
  $0 =~ s/.*\///;
16
  printf "\nUsage: $0 [-d del] -f mwl-file ...files...\n";
17
  print "\nOptions:\n";
18
  print "-d del : Use del as delimiter rather than a blank\n\n";
19
  die
20
}
21

  
22
if (!open(FILE, $opt_f)) {
23
  die "\nCan't open mwl file: ",$opt_f,"\n";
24
}
25
if (defined($opt_d)) {
26
  $del = $opt_d;
27
} else {
28
  $del = " ";
29
}
30

  
31
$N=1;
32
while (<FILE>) {
33
  chomp();
34
  next if /^$/;
35
  @G = split("\t");
36
  @F = split(/\s+/,$G[0]);
37
  $state = 0;
38
  for($i=0; $i<=$#F; $i++) {
39
    if (!exists($arc{$state,$F[$i]})) {
40
      $arc{$state,$F[$i]} = $N++;
41
    }
42
    $state = $arc{$state,$F[$i]};
43
   }
44
  $final{$state} = $G[1];
45
}
46
close(FILE);
47

  
48

  
49
$last = $match = $last_match = 0;
50
$state = 0;
51

  
52
for (;;) {
53
  if ($match == $last) {
54
    if (!($token[$last] = <>)) {
55
      if ($last_match > 0) {
56
	print $token[0];
57
	for ($i=1; $i<=$last_match; $i++) {
58
	  print $del,$token[$i];
59
	}
60
	print "\n";
61
      } else {
62
	$i=0;
63
      }
64
      for (; $i<$last; $i++) {
65
	print $token[$i],"\n";
66
      }
67
      last;
68
    }
69
    chomp($token[$last++]);
70
  }
71
  if (($s = $arc{$state, $token[$match]}) ||
72
      ($s = $arc{$state, lc($token[$match])}) ||
73
      ($s = $arc{$state, ucfirst(lc($token[$match]))})) {
74
    if (exists($final{$s})) {
75
      $last_match = $match;
76
      $last_tag = $final{$s};
77
    }
78
    $state = $s;
79
    $match++;
80
  } else {
81
    if ($last_match > 0) {
82
      print $token[0];
83
      for($i=1; $i<=$last_match; $i++) {
84
	print $del,$token[$i];
85
      }
86
      print "\t$last_tag\n";
87
    } else {
88
      print $token[0],"\n";
89
    }
90
    for($i=0,$k=$last_match+1; $k<$last; ) {
91
      $token[$i++] = $token[$k++];
92
    }
93
    $last = $last - $last_match - 1;
94
    $last_match = $match = 0;
95
    $state = 0;
96
  }
97
}
tmp/org.txm.treetagger.core.win32/res/win32/cmd/filter-chunker-output-german.perl (revision 1683)
1
#!/usr/bin/perl
2

  
3
use Getopt::Std;
4
getopts('t');
5

  
6
print doc_start();
7

  
8
$n = 0;
9
while (<>) {
10
  s/.-SBAR$/O/;
11

  
12
  if (/^(.*)-(.*)\t(.*)\/(.*)$/) {
13
    $token[$n] = $1;
14
    $tag[$n] = $2;
15
    $chunk[$n] = $4;
16
    if ($chunk[$n] =~ /^(.*)-(.*)$/) {
17
      $flag[$n] = $1;
18
      $chunk[$n] = $2;
19
    }
20
    else {
21
      undef $flag[$n];
22
      undef $chunk[$n];
23
    }
24
    print_sentence()  if $token[$n] eq '.';
25
    $n++;
26
  } 
27

  
28
  else {
29
    $markup[$n] .= $_;
30
  }
31
}
32

  
33
print_sentence();
34
print doc_end();
35

  
36

  
37
sub print_sentence {
38
  my($i,$chunk);
39

  
40
  for( $i=0; $i<=$n; $i++ ) {
41

  
42
    if ($flag[$i] eq 'I' && $chunk ne $chunk[$i]) {
43
      $flag[$i] = 'B';
44
    }
45

  
46
    if ($flag[$i] eq 'B') {
47
      if (defined $chunk) {
48
	$cetags[$i-1] = end_tag($chunk);
49
      }
50
      $chunk = $chunk[$i];
51
      $cbtags[$i] .= start_tag($chunk[$i]);
52
    }
53

  
54
    # German chunker uses E-flags for PCs
55
    elsif ($flag[$i] eq 'E') {
56
      if ($chunk[$i] eq $chunk) {
57
	$cetags[$i] = end_tag($chunk);
58
	undef $chunk;
59
      }
60
      elsif ($chunk[$i] eq "PC" && $chunk eq "NC") {
61
	for( $k=$i-1; $k>=0; $k-- ) {
62
	  if ($chunk[$k] eq "NC") {
63
	    $chunk[$k] = "PC";
64
	  }
65
	  if ($flag[$k] ne "I") {
66
	    last;
67
	  }
68
	}
69
	$cbtags[$k] = start_tag($chunk[$i]);
70
	$cetags[$i] = end_tag($chunk[$i]);
71
	undef $chunk;
72
	undef $inPC;
73
      }
74
    }
75

  
76
    elsif ($flag[$i] ne 'I' && defined $chunk) {
77
      $cetags[$i-1] = end_tag($chunk);
78
      undef $chunk;
79
    }
80
  }
81

  
82
  for( $i=0; $i<=$n; $i++ ) {
83
    print $markup[$i];
84
    print $cbtags[$i];
85
    print token_and_tag($token[$i],$tag[$i]) if defined $token[$i];
86
    print $cetags[$i];
87
  }
88

  
89
  undef @token;
90
  undef @tag;
91
  undef @chunk;
92
  undef @cbtags;
93
  undef @cetags;
94
  undef @flag;
95
  undef @markup;
96
  $n = 0;
97
}
98

  
99
sub doc_start {
100
  return '' unless defined $opt_t;
101
  return "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"yes\"?>\n<corpus>\n";
102
}
103

  
104
sub doc_end {
105
  return '' unless defined $opt_t;
106
  return "</corpus>\n";
107
}
108

  
109
sub start_tag {
110
  my $t=shift;
111
  return "<$t>\n" unless defined $opt_t;
112
  return "  <phrase cat=\"$t\">\n";
113
}
114

  
115
sub end_tag {
116
  my $t=shift;
117
  return "</$t>\n" unless defined $opt_t;
118
  return "  </phrase>\n";
119
}
120

  
121
sub token_and_tag {
122
  my ($token,$tag)=@_;
123
  return "$token\t$tag\n" unless defined $opt_t;
124
  return "    <token word=\"$token\" pos=\"$tag\"/>\n";
125
}
tmp/org.txm.treetagger.core.win32/res/win32/cmd/utf8-tokenize.perl (revision 1683)
1
#!/usr/bin/perl
2

  
3
########################################################################
4
#                                                                      #
5
#  tokenization script for tagger preprocessing                        #
6
#  Author: Helmut Schmid, IMS, University of Stuttgart                 #
7
#          Serge Sharoff, University of Leeds                          #
8
#  Description:                                                        #
9
#  - splits input text into tokens (one token per line)                #
10
#  - cuts off punctuation, parentheses etc.                            #
11
#  - disambiguates periods                                             #
12
#  - preserves SGML markup                                             #
13
#                                                                      #
14
########################################################################
15

  
16
use Getopt::Std;
17
use utf8;
18
use Encode;
19

  
20
getopts('hgfeiza:');
21

  
22
# Modify the following lines in order to adapt the tokenizer to other
23
# types of text and/or languages
24

  
25
# characters which have to be cut off at the beginning of a word
26
my $PChar='[¿¡{(\\`"‚„†‡‹‘’“”•–—›'."'";
27

  
28
# characters which have to be cut off at the end of a word
29
my $FChar=']}\'\`\"),;:\!\?\%‚„…†‡‰‹‘’“”•–—›';
30

  
31
# character sequences which have to be cut off at the beginning of a word
32
my $PClitic='';
33

  
34
# character sequences which have to be cut off at the end of a word
35
my $FClitic;
36

  
37
if (defined($opt_e)) {
38
  # English
39
  $FClitic = '\'(s|re|ve|d|m|em|ll)|n\'t';
40
}
41
if (defined($opt_i)) {
42
  # Italian
43
  $PClitic = '[dD][ae]ll\'|[nN]ell\'|[Aa]ll\'|[lLDd]\'|[Ss]ull\'|[Qq]uest\'|[Uu]n\'|[Ss]enz\'|[Tt]utt\'';
44
}
45
if (defined($opt_f)) {
46
  # French
47
  $PClitic = '[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\'';
48
  $FClitic = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-l';
49
}
50
if (defined($opt_z)) {
51
  # Galician
52
  $FClitic = '-la|-las|-lo|-los|-nos';
53
}
54

  
55

  
56
### NO MODIFICATIONS REQUIRED BEYOND THIS LINE #########################
57

  
58
if (defined($opt_h)) {
59
  die "
60
Usage: utf8-tokenize.perl [ options ] ...files...
61

  
62
Options:
63
-e : English text 
64
-f : French text
65
-i : Italian text
66
-a <file>: <file> contains a list of words which are either abbreviations or
67
           words which should not be further split.
68
";
69
}
70

  
71
# Read the list of abbreviations and words
72
if (defined($opt_a)) {
73
  die "Can't read: $opt_a: $!\n"  unless (open(FILE, $opt_a));
74
  while (<FILE>) {
75
      $_ = decode('utf8',$_);
76
      s/^[ \t\r\n]+//;
77
      s/[ \t\r\n]+$//;
78
      next if (/^\#/ || /^\s$/);    # ignore comments
79
      $Token{$_} = 1;
80
  }
81
  close FILE;
82
}
83

  
84
#SS: main loop; 
85
my $first_line = 1;
86
while (<>) {
87
  $_ = decode('utf8',$_);
88
  # delete optional byte order markers (BOM)
89
  if ($first_line) {
90
      undef $first_line;
91
      s/^\x{FEFF}//;
92
  }
93

  
94
  # replace newlines and tab characters with blanks
95
  tr/\n\t/  /;
96

  
97
  # replace blanks within SGML tags
98
  while (s/(<[^<> ]*) ([^<>]*>)/$1\377$2/g) {
99
  }
100
  ;
101
  #Separ: ÿþ
102

  
103
  # replace whitespace with a special character
104
  tr/ /\376/;
105

  
106
  # restore SGML tags
107
  tr/\377\376/ \377/;
108

  
109
  # prepare SGML-Tags for tokenization
110
  s/(<[^<>]*>)/\377$1\377/g;
111
  s/^\377//;
112
  s/\377$//;
113
  s/\377\377\377*/\377/g;
114

  
115
  @S = split("\377");
116
  for ( $i=0; $i<=$#S; $i++) {
117
    $_ = $S[$i];
118
  
119
    if (/^<.*>$/) {
120
      # SGML tag
121
      print encode('utf8',"$_\n");
122
    } else {
123
      # add a blank at the beginning and the end of each segment
124
      $_ = ' '.$_.' ';
125
      # insert missing blanks after punctuation
126
      s/(\.\.\.)/ ... /g;
127
      s/([;\!\?])([^ ])/$1 $2/g;
128
      s/([.,:])([^ 0-9.])/$1 $2/g;
129
    
130
      @F = split;
131
      for ( $j=0; $j<=$#F; $j++) {
132
	my $suffix="";
133
	$_ = $F[$j];
134
	# separate punctuation and parentheses from words
135
	do {
136
	  $finished = 1;
137
	  # cut off preceding punctuation
138
	  if (s/^([$PChar])(.)/$2/) {
139
	    print encode('utf8',"$1\n");
140
	    $finished = 0;
141
	  }
142
	  # cut off trailing punctuation
143
	  if (s/(.)([$FChar])$/$1/) {
144
	    $suffix = "$2\n$suffix";
145
	    $finished = 0;
146
	  }
147
	  # cut off trailing periods if punctuation precedes
148
	  if (s/([$FChar])\.$//) { 
149
	    $suffix = ".\n$suffix";
150
	    if ($_ eq "") {
151
	      $_ = $1;
152
	    } else {
153
	      $suffix = "$1\n$suffix";
154
	    }
155
	    $finished = 0; 
156
	  }
157
	} while (!$finished);
158
                
159
	# handle explicitly listed tokens
160
	if (defined($Token{$_})) {
161
	  print encode('utf8',"$_\n$suffix");
162
	  next;
163
	}
164
                
165
	# abbreviations of the form A. or U.S.A.
166
	if (/^([A-Za-z-]\.)+$/) {
167
	  print encode('utf8',"$_\n$suffix");
168
	  next;
169
	}
170
                 
171

  
172
	# disambiguate periods
173
	if (/^(..*)\.$/ && $_ ne "..." && !($opt_g && /^[0-9]+\.$/)) {
174
	  $_ = $1;
175
	  $suffix = ".\n$suffix";
176
	  if (defined($Token{$_})) {
177
 	    print encode('utf8',"$_\n$suffix");
178
	    next;
179
	  }
180
	}
181
                 
182
	# cut off clitics
183
	while (s/^(--)(.)/$2/) {
184
 	    print encode('utf8',"$1\n");
185
	}
186
	if ($PClitic ne '') {
187
	  while (s/^($PClitic)(.)/$2/) {
188
 	    print encode('utf8',"$1\n");
189
	  }
190
	}
191

  
192
	while (s/(.)(--)$/$1/) {
193
	    $suffix = "$2\n$suffix";
194
	}
195
	if ($FClitic ne '') {
196
	  while (s/(.)($FClitic)$/$1/) {
197
	    $suffix = "$2\n$suffix";
198
	  }
199
	}
200
                 
201
	print encode('utf8',"$_\n$suffix");
202
      }
203
    }
204
  }
205
}
0 206

  
tmp/org.txm.treetagger.core.win32/res/win32/cmd/filter-chunker-output.perl (revision 1683)
1
#!/usr/bin/perl
2

  
3
use Getopt::Std;
4
getopts('t');
5

  
6
print doc_start();
7

  
8
$n = 0;
9
while (<>) {
10
  s/.-SBAR$/O/;
11

  
12
  if (/^(.*)-(.*)\t(.*)\/(.*)$/) {
13
    $token[$n] = $1;
14
    $tag[$n] = $2;
15
    $chunk[$n] = $4;
16
    if ($chunk[$n] =~ /^(.*)-(.*)$/) {
17
      $flag[$n] = $1;
18
      $chunk[$n] = $2;
19
    }
20
    else {
21
      undef $flag[$n];
22
      undef $chunk[$n];
23
    }
24
    print_sentence()  if $token[$n] eq '.';
25
    $n++;
26
  } 
27

  
28
  else {
29
    $markup[$n] .= $_;
30
  }
31
}
32

  
33
print_sentence();
34
print doc_end();
35

  
36

  
37
sub print_sentence {
38
  my($i,$chunk);
39

  
40
  for( $i=0; $i<=$n; $i++ ) {
41

  
42
    if ($flag[$i] eq 'I' && $chunk ne $chunk[$i]) {
43
      $flag[$i] = 'B';
44
    }
45

  
46
    if ($flag[$i] ne '' && $token[$i] eq '.') {
47
      delete $flag[$i];
48
      $chunk[$i] = '0';
49
    }
50

  
51
    if ($flag[$i] eq 'B') {
52
      if (defined $chunk) {
53
	if (($chunk eq 'PC' && $chunk[$i] eq 'NC') ||
54
	    ($chunk eq 'PP' && $chunk[$i] eq 'NP'))
55
	  {
56
	    $inPC = $chunk;
57
	  }
58
	else {
59
	  $cetags[$i-1] = end_tag($chunk);
60
	  if (defined $inPC) {
61
	    $cetags[$i-1] .= end_tag($inPC);
62
	    undef $inPC;
63
	  }
64
	}
65
      }
66
      $chunk = $chunk[$i];
67
      $cbtags[$i] .= start_tag($chunk[$i]);
68
    }
69

  
70
    elsif ($flag[$i] ne 'I' && defined $chunk) {
71
      $cetags[$i-1] = end_tag($chunk);
72
      undef $chunk;
73
      if (defined $inPC) {
74
	$cetags[$i-1] .= end_tag($inPC);
75
	undef $inPC;
76
      }
77
    }
78
  }
79

  
80
  for( $i=0; $i<=$n; $i++ ) {
81
    print $markup[$i];
82
    print $cbtags[$i];
83
    print token_and_tag($token[$i],$tag[$i]) if defined $token[$i];
84
    print $cetags[$i];
85
  }
86

  
87
  undef @token;
88
  undef @tag;
89
  undef @chunk;
90
  undef @cbtags;
91
  undef @cetags;
92
  undef @flag;
93
  undef @markup;
94
  $n = 0;
95
}
96

  
97
sub doc_start {
98
  return '' unless defined $opt_t;
99
  return "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"yes\"?>\n<corpus>\n";
100
}
101

  
102
sub doc_end {
103
  return '' unless defined $opt_t;
104
  return "</corpus>\n";
105
}
106

  
107
sub start_tag {
108
  my $t=shift;
109
  return "<$t>\n" unless defined $opt_t;
110
  return "  <phrase cat=\"$t\">\n";
111
}
112

  
113
sub end_tag {
114
  my $t=shift;
115
  return "</$t>\n" unless defined $opt_t;
116
  return "  </phrase>\n";
117
}
118

  
119
sub token_and_tag {
120
  my ($token,$tag)=@_;
121
  return "$token\t$tag\n" unless defined $opt_t;
122
  return "    <token word=\"$token\" pos=\"$tag\"/>\n";
123
}
tmp/org.txm.treetagger.core.win32/res/win32/cmd/mwl-lookup-greek.perl (revision 1683)
1
#!/usr/bin/perl
2

  
3
$month{"??????????"} = 1;
4
$month{"???????????"} = 1;
5
$month{"???????"} = 1;
6
$month{"????????"} = 1;
7
$month{"?????"} = 1;
8
$month{"?????"} = 1;
9
$month{"?????"} = 1;
10
$month{"???????"} = 1;
11
$month{"???????"} = 1;
12
$month{"?????????"} = 1;
13
$month{"???????????"} = 1;
14
$month{"?????????"} = 1;
15
$month{"?????????"} = 1;
16
$month{"??????????"} = 1;
17

  
18
$month{"?????????"} = 1;
19
$month{"??????????"} = 1;
20
$month{"??????"} = 1;
21
$month{"???????"} = 1;
22
$month{"????"} = 1;
23
$month{"??????"} = 1;
24
$month{"??????"} = 1;
25
$month{"????????"} = 1;
26
$month{"????????"} = 1;
27
$month{"??????????"} = 1;
28
$month{"????????"} = 1;
29
$month{"????????"} = 1;
30
$month{"?????????"} = 1;
31

  
32
while (<>) {
33
    chomp;
34
    if ($_ eq '') {
35
	print_sentence();
36
    }
37
    else {
38
	push @token, $_;
39
    }
40
}
41
print_sentence();
42

  
43
sub print_sentence {
44
    for( $i=0; $i<=$#token; $i++ ) {
45
	if (exists $month{$token[$i]}) {
46
	    $start = $end = $i;
47
	    if ($token[$start-1] =~ /^[1-9][0-9]?([???]??)?(-[1-9][0-9]?([???]??)?)?$/){
48
		$start--;
49
	    }
50
	    if ($token[$start-1] eq '??????') {
51
		$start--;
52
	    }
53
	    if ($token[$end+1] eq '???') {
54
		$end++;
55
	    }
56
	    if ($token[$end+1] =~ /^(1[0-9][0-9][0-9]|20[0-9][0-9]|'[0-9][0-9])$/) {
57
		$end++;
58
	    }
59
	    for( $k=$start; $k<$end; $k++) {
60
		$join[$k] = 1;
61
	    }
62
	}
63
	elsif (($token[$i] eq "??'" && $token[$i+1] eq '???') ||
64
	       ($token[$i] eq '??' && 
65
		($token[$i+1] eq '????' || $token[$i+1] eq '????')))
66
	{
67
	    $join[$i] = 1;
68
	}
69
	       
70
    }
71
    for( $i=0; $i<=$#token; $i++ ) {
72
	if ($join[$i] == 1) {
73
	    print "$token[$i] "
74
	}
75
	else {
76
	    print "$token[$i]\n"
77
	}
78
    }
79
    undef @token;
80
    undef @join;
81
}
tmp/org.txm.treetagger.core.win32/res/win32/cmd/tokenize.pl (revision 1683)
1
#!/usr/bin/perl
2

  
3
########################################################################
4
#                                                                      #
5
#  tokenization script for tagger preprocessing                        #
6
#  Author: Helmut Schmid, IMS, University of Stuttgart                 #
7
#          Serge Sharoff, University of Leeds                          #
8
#  Description:                                                        #
9
#  - splits input text into tokens (one token per line)                #
10
#  - cuts off punctuation, parentheses etc.                            #
11
#  - disambiguates periods                                             #
12
#  - preserves SGML markup                                             #
13
#                                                                      #
14
########################################################################
15

  
16
use Getopt::Std;
17

  
18
getopts('hfeia:u');
19

  
20
use utf8;
21
if (defined $opt_u) {
22
    use open ':utf8';
23
    binmode(STDIN,":utf8");
24
    binmode(STDOUT,":utf8");
25
}
26

  
27
# Modify the following lines in order to adapt the tokenizer to other
28
# types of text and/or languages
29

  
30
# characters which have to be cut off at the beginning of a word
31
my $PChar='[¿¡{(\\`"‚„†‡‹‘’“”•–—›';
32

  
33
# characters which have to be cut off at the end of a word
34
my $FChar=']}\'\`\"),;:\!\?\%‚„…†‡‰‹‘’“”•–—›';
35

  
36
# character sequences which have to be cut off at the beginning of a word
37
my $PClitic='';
38

  
39
# character sequences which have to be cut off at the end of a word
40
my $FClitic;
41

  
42
if (defined($opt_e)) {
43
  # English
44
  $FClitic = '\'(s|re|ve|d|m|em|ll)|n\'t';
45
}
46
if (defined($opt_i)) {
47
  # Italian
48
  $PClitic = '[dD][ae]ll\'|[nN]ell\'|[Aa]ll\'|[lLDd]\'|[Ss]ull\'|[Qq]uest\'|[Uu]n\'|[Ss]enz\'|[Tt]utt\'';
49
}
50
if (defined($opt_f)) {
51
  # French
52
  $PClitic = '[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\'';
53
  $FClitic = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-l';
54
}
55

  
56

  
57
### NO MODIFICATIONS REQUIRED BEYOND THIS LINE #########################
58

  
59
if (defined($opt_h)) {
60
  die "
61
Usage: tokenize.perl [ options ] ...files...
62

  
63
Options:
64
-u : use UTF8 encoding
65
-e : English text 
66
-f : French text
67
-i : Italian text
68
-a <file>: <file> contains a list of words which are either abbreviations or
69
           words which should not be further split.
70
";
71
}
72

  
73
# Read the list of abbreviations and words
74
if (defined($opt_a)) {
75
  die "Can't read: $opt_a: $!\n"  unless (open(FILE, $opt_a));
76
  while (<FILE>) {
77
    s/^[ \t\r\n]+//;
78
    s/[ \t\r\n]+$//;
79
    next if (/^\#/ || /^\s$/);    # ignore comments
80
    $Token{$_} = 1;
81
  }
82
  close FILE;
83
}
84

  
85
#SS: main loop; 
86
my $first_line = 1;
87
while (<>) {
88
  # delete optional byte order markers (BOM)
89
  if ($first_line) {
90
      undef $first_line;
91
      s/^\x{FEFF}//;
92
  }
93

  
94
  # replace newlines and tab characters with blanks
95
  tr/\n\t/  /;
96

  
97
  # replace blanks within SGML tags
98
  while (s/(<[^<> ]*) ([^<>]*>)/$1\377$2/g) {
99
  }
100
  ;
101
  #Separ: ÿþ
102

  
103
  # replace whitespace with a special character
104
  tr/ /\376/;
105

  
106
  # restore SGML tags
107
  tr/\377\376/ \377/;
108

  
109
  # prepare SGML-Tags for tokenization
110
  s/(<[^<>]*>)/\377$1\377/g;
111
  s/^\377//;
112
  s/\377$//;
113
  s/\377\377\377*/\377/g;
114

  
115
  @S = split("\377");
116
  for ( $i=0; $i<=$#S; $i++) {
117
    $_ = $S[$i];
118
  
119
    if (/^<.*>$/) {
120
      # SGML tag
121
      print $_,"\n";
122
    } else {
123
      # add a blank at the beginning and the end of each segment
124
      $_ = ' '.$_.' ';
125
      # insert missing blanks after punctuation
126
      s/(\.\.\.)/ ... /g;
127
      s/([;\!\?])([^ ])/$1 $2/g;
128
      s/([.,:])([^ 0-9.])/$1 $2/g;
129
    
130
      @F = split;
131
      for ( $j=0; $j<=$#F; $j++) {
132
	my $suffix="";
133
	$_ = $F[$j];
134
	# separate punctuation and parentheses from words
135
	do {
136
	  $finished = 1;
137
	  # cut off preceding punctuation
138
	  if (s/^([$PChar])(.)/$2/) {
139
	    print $1,"\n";
140
	    $finished = 0;
141
	  }
142
	  # cut off trailing punctuation
143
	  if (s/(.)([$FChar])$/$1/) {
144
	    $suffix = "$2\n$suffix";
145
	    $finished = 0;
146
	  }
147
	  # cut off trailing periods if punctuation precedes
148
	  if (s/([$FChar])\.$//) { 
149
	    $suffix = ".\n$suffix";
150
	    if ($_ eq "") {
151
	      $_ = $1;
152
	    } else {
153
	      $suffix = "$1\n$suffix";
154
	    }
155
	    $finished = 0; 
156
	  }
157
	} while (!$finished);
158
                
159
	# handle explicitly listed tokens
160
	if (defined($Token{$_})) {
161
	  print "$_\n$suffix";
162
	  next;
163
	}
164
                
165
	# abbreviations of the form A. or U.S.A.
166
	if (/^([A-Za-z-]\.)+$/) {
167
	  print "$_\n$suffix";
168
	  next;
169
	}
170
                 
171
	# disambiguate periods
172
	if (/^(..*)\.$/ && $_ ne "..." && !/^[0-9]+\.$/) {
173
	  $_ = $1;
174
	  $suffix = ".\n$suffix";
175
	  if (defined($Token{$_})) {
176
	    print "$_\n$suffix";
177
	    next;
178
	  }
179
	}
180
                 
181
	# cut off clitics
182
	if ($PClitic ne '') {
183
	  while (s/^($PClitic)(.)/$2/) {
184
	    print $1,"\n";
185
	  }
186
	}
187
	if ($FClitic ne '') {
188
	  while (s/(.)($FClitic)$/$1/) {
189
	    $suffix = "$2\n$suffix";
190
	  }
191
	}
192
                 
193
	print "$_\n$suffix";
194
      }
195
    }
196
  }
197
}
0 198

  
tmp/org.txm.treetagger.core.win32/res/win32/lib/english-abbreviations (revision 1683)
1
Adm.
2
Ala.
3
Ariz.
4
Ark.
5
Aug.
6
Ave.
7
Bancorp.
8
Bhd.
9
Brig.
10
Bros.
11
CO.
12
CORP.
13
COS.
14
Ca.
15
Calif.
16
Canada-U.S.
17
Canadian-U.S.
18
Capt.
19
Cia.
20
Cie.
21
Co.
22
Col.
23
Colo.
24
Conn.
25
Corp.
26
Cos.
27
D-Mass.
28
Dec.
29
Del.
30
Dept.
31
Dr.
32
Drs.
33
Etc.
34
Feb.
35
Fla.
36
Ft.
37
Ga.
38
Gen.
39
Gov.
40
Hon.
41
INC.
42
Ill.
43
Inc.
44
Ind.
45
Jan.
46
Japan-U.S.
47
Jr.
48
Kan.
49
Korean-U.S.
50
Ky.
51
La.
52
Lt.
53
Ltd.
54
Maj.
55
Mass.
56
Md.
57
Messrs.
58
Mfg.
59
Mich.
60
Minn.
61
Miss.
62
Mo.
63
Mr.
64
Mrs.
65
Ms.
66
Neb.
67
Nev.
68
No.
69
Nos.
70
Nov.
71
Oct.
72
Okla.
73
Ont.
74
Ore.
75
Pa.
76
Ph.
77
Prof.
78
Prop.
79
Pty.
80
Rep.
81
Reps.
82
Rev.
83
S.p.A.
84
Sen.
85
Sens.
86
Sept.
87
Sgt.
88
Sino-U.S.
89
Sr.
90
St.
91
Ste.
92
Tenn.
93
Tex.
94
U.S.-U.K.
95
U.S.-U.S.S.R.
96
Va.
97
Vt.
98
W.Va.
99
Wash.
100
Wis.
101
Wyo.
102
a.k.a.
103
a.m.
104
anti-U.S.
105
cap.
106
days.
107
etc.
108
ft.
109
i.e.
110
non-U.S.
111
p.m.
112
president-U.S.
113
s.r.l.
114
v.
115
v.B.
116
v.w.
117
vs.
tmp/org.txm.treetagger.core.win32/res/win32/lib/german-abbreviations (revision 1683)
1
A.
2
A.-G.
3
A.G.
4
ADN-Korr.
5
AT-Mot.
6
Abb.
7
Abess.
8
Abl.
9
Ablief.-Gew.
10
Abm.
11
Abs.
12
Abt.
13
Abtlg.
14
Agl.
15
Agt.
16
Akt.-Ges.
17
Aktbr.
18
Alg.
19
Alleininh.
20
Allg.
21
Altwageneint.
22
Alu-Felg.
23
Alum.
24
Am.
25
Amp.
26
Anf.
27
Anfr.
28
Anfrag.
29
Ang.
30
Angb.
31
Angeb.
32
Angl.
33
Anhängerkuppl.
34
Anl.
35
Anleih.
36
Ann.-Exp.
37
Ann.-Exped.
38
Ant.
39
Anten.
40
Anz.
41
Anz.-Exp.
42
Anz.-Verm.
43
Anzahlg.
44
Anzhlg.
45
Apoth.
46
App.
47
Appartem.
48
April-Lief.
49
Argent.
50
Atl.
51
Aufb.
52
Aufst.
53
Aug.
54
Augsb.
55
Ausg.
56
Ausgl.
57
Ausk.
58
Ausl.
59
Ausl.-Akt.
60
Auslandsanl.
61
Auslandsb.
62
Ausst.
63
Ausstattg.
64
Austral.
65
Ausz.
66
Aut.
67
Autom.
68
Automat.
69
Automin.
70
B.
71
B.P.
72
BGBl.
73
Bahnhofstr.
74
Balk.
75
Bau-Ing.
76
Bauges.
77
Bauj.
78
Bay.
79
Bayer.
80
Bb.
81
Bd.
82
Bed.
83
Beding.
84
Ber.
85
Beratg.
86
Bereif.
87
Bergb.
88
Bergstr.
89
Bernh.
90
Bes.
91
Besichtig.
92
Bestzust.
93
Beteil.
94
Beteilig.
95
Betr.
96
Bett.
97
Bew.
98
Bewerb.
99
Bewerbg.
100
Bez.
101
Bgl.
102
Bhf.
103
Bierbr.
104
Bildzuschr.
105
Bilf.
106
Bj.
107
Bk.
108
Bkz.
109
Bl.
110
Bln.
111
Boch.
112
Bod.
113
Bor.
114
Bov.
115
Br.
116
Brem.
117
Brh.
118
Brok.
119
Brsg.
120
Bu.
121
Bung.
122
Burgstr.
123
Bw.
124
Bwsp.
125
Bz.
126
Bäd.
127
C.
128
C.G.
129
Cabr.
130
Can.
131
Cap.
132
Cav.
133
Cbr.
134
Cem.
135
Centralb.
136
Cert.
137
Ch.
138
Charlottenstr.
139
Chem.
140
Chem.-Ing.
141
Chevr.
142
Chr.
143
Christophstr.
144
Cie.
145
Co.
146
Colorvergl.
147
Commerzb.
148
Conc.
149
Cons.
150
Corneliusstr.
151
Corp.
152
Cp.
153
Cpt.
154
Cz.
155
D.
156
DG.
157
DM.
158
DUB-Schulth.
159
DW.
160
Dahlb.
161
Dawes-Anl.
162
Dept.
163
Dev.
164
Dez.
165
Di.
166
Dipl.
167
Dipl.-Ing.
168
Dipl.-Kfm.
169
Dir.
170
Direktionsw.
171
Div.
172
Do.
173
Doll.
174
Don.
175
Dorfk.
176
Dpf.
177
Dr.
178
Dr.-Ing.
179
Dreij.
180
Drog.
181
Dt.
182
Du.
183
Dyckerh.
184
Dyn.
185
Dän.
186
Düsseld.
187
E.
188
E.h.
189
Einf.
190
Einh.
191
Einr.
192
Eint.
193
Eintr.
194
Einw.
195
Einwohn.
196
Einz.
197
Einzelzi.
198
Eisenb.
199
El.
200
Elektr.
201
Em.
202
Endpr.
203
Engl.
204
Ent.
205
Entsch.
206
Entw.
207
Erdgesch.
208
Erf.
209
Erfahr.
210
Erstzul.
211
Erzgeb.
212
Esc.
213
Eterna.
214
Etg.-Hs.
215
Eur.
216
Ew.
217
Exp.
218
Expl.
219
F.
220
FS.
221
Fa.
222
Fabr.
223
Fabrikat.
224
Fachm.
225
Fachricht.
226
Fahrz.
227
Fam.
228
Fb.
229
Fd.
230
Fds.
231
Feb.
232
Febr.
233
Febr.-Abl.
234
Febr.-März-Abl.
235
Feldstr.
236
Fensterh.
237
Ferd.
238
Ferdinandstr.
239
Fernschr.
240
Ferr.
241
Feuervers.
242
Ffm.
243
Fil.
244
Fin.
245
Finanzier.
246
Finanzierg.
247
Finanzierungsmöglichk.
248
Finnl.
249
Ford.
250
Fortschr.
251
Fr.
252
Frankf.
253
Franz.
254
Franziskanerstr.
255
Freiverk.
256
Frhr.
257
Fried.
258
Friedr.
259
Friedrich-Ebert-Str.
260
Friedrichstr.
261
Frl.
262
Frühst.
263
Führersch.
264
G.
265
G.M.B.H.
266
G.m.b.H.
267
Gar.
268
Garag.
269
Gart.
270
Geb.
... Ce différentiel a été tronqué car il excède la taille maximale pouvant être affichée.

Formats disponibles : Unified diff