/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

     #!/usr/bin/perl
     ###################################################################
     ###                                                             ###
     ###      File: filter-chunker-output.perl                       ###
     ###    Author: Michel Genereux                                  ###
     ###            (indicated modifications by Dennis Spohr (DS))   ###
     ###   Purpose: Filter chunker output and create XML-like markup ###
     ###   Created: Mon Feb 19 2007                                  ###
     ###                                                             ###
     ###################################################################
     use Getopt::Std;
     getopts('t');
     # Flush output after every print.
     $| = 1;
     print doc_start();
     ### DS start: end-of-sentence marker
     $eos = '[.?!;]';
     # While $push is true, opening input markup is collected in
     # $start_markup and its element name is pushed onto @tag_stack.  It is
     # cleared by the first token line and set again after a forced print.
     $push = 1;
     ###
     # Index of the slot that receives the next token and any markup that
     # precedes it.
     $n = 0;
     while (<>) {
       s/.-SBAR$/O/;
       # NOTE(review): $inside_pp is never assigned in this script, so this
       # substitution applies to every line ending in I-PC.
       s/I-PC$/0/  if (/I-PC$/ && !$inside_pp);
       ### DS start: process lemma column
       # Token lines look like "token-tag<TAB>tag/chunk", optionally followed
       # by "<TAB>lemma".  The chunk group must not cross a tab: a greedy
       # (.*) in that position swallowed the lemma column, which left
       # $lemma[$n] undefined and glued the lemma onto the chunk name.
       if (($token[$n],$tag[$n],$tag,$chunk[$n],$x,$lemma[$n]) = $_ =~ /^(.*)-(.*)\t(.*)\/([^\t]*)(\t(.*))?$/) {
       ### DS end
         ### DS start: chunking error; some SENTs have e.g. I-NP although
         ###           they mark the end of a sentence; lead to omission
         ###           of closing tags
         $push = 0;
         $chunk[$n] = 0 if ($tag eq 'SENT' && $chunk[$n] =~ /^(I|B)-/);
         ### DS end
         # Split "FLAG-NAME" into flag and chunk name; anything else means
         # the token is outside of every chunk.
         if ($chunk[$n] =~ /^(.*)-(.*)$/) {
           $flag[$n] = $1;
           $chunk[$n] = $2;
         } else {
           undef $flag[$n];
           undef $chunk[$n];
         }
         ### DS start: performance boost: set $n to 0 after printing
         ###           sentence; otherwise $n and arrays get too big
         ###           and cause slowdown
         # NOTE(review): "== 0" is a numeric comparison; $chunk[$n] is a chunk
         # name or undef at this point, and both compare equal to 0.
         if ($token[$n] =~ /^$eos\s*$/ && $chunk[$n] == 0 && $tag[$n] eq 'SENT') {
           print_sentence(0);
           $n = 0;
           $start_markup = "";
         } else {
           $n++;
         }
         ### DS end
       ### DS start: keep markup already present in input data and insert
       ###           chunker markup correctly; if an element starting before
       ###           the sentence is closed before the sentence is closed
       ###           (e.g. headlines without sentence end markers), then
       ###           the sentence should also be closed, e.g. avoid cases like
       ###           <HEADLINE><s>Les r?sultats de jeudi</HEADLINE></s>
       } elsif (/^<([^\/]*?)(( |~).*)?>/ && $push) {
         push(@tag_stack,$1);
         $start_markup .= "$&\n";
       } elsif (/^<\/(.*?)>/ && $1 eq $tag_stack[$#tag_stack]) {
         # Closing tag of the innermost remembered element: flush the
         # pending tokens and print the closing tag after them.
         $end_markup = "$&\n";
         print_sentence(1);
         $n = 0;
         $push = 1;
         $start_markup = "";
         $end_markup = "";
         pop(@tag_stack);
       ### DS end
       } else {
         # Any other line is kept verbatim in front of the next token.
         $markup[$n] .= $_;
       }
     }
     print_sentence(1);
     print doc_end();
     # Print the buffered tokens (slots 0..$n of @token, @tag, @lemma, @flag,
     # @chunk and @markup) with chunk markup, then clear the buffers.
     #
     # The first loop turns the per-token chunk flags (B, I, E) into opening
     # tags (@cbtags) and closing tags (@cetags, or @markup for PC chunks).
     # The second loop prints markup, opening tags, token and closing tags for
     # each slot, wrapped in an "s" element when $n > 0.
     #
     # Parameter: $forced - true when the call is triggered by closing input
     #            markup or end of input; $end_markup is then printed after
     #            the sentence.
     sub print_sentence {
       ### DS start: indicate whether print_sentence is forced by
       ###           closing input markup
       my $forced = shift;
       ### DS end
       # $chunk holds the name of the chunk that is currently open, if any.
       my($i,$k,$chunk);
       for( $i=0; $i<=$n; $i++ ) {
         # An I flag whose chunk differs from the open chunk starts a new one.
         if ($flag[$i] eq 'I' && $chunk ne $chunk[$i]) {
           $flag[$i] = 'B';
         }
         if ($flag[$i] eq 'B') {
           if (defined $chunk) {
             $cetags[$i-1] .= end_tag($chunk);
           }
           if ($chunk[$i] eq 'PC') {
             # Find where the PC ends: skip to the next B flag, then past
             # the I flags that follow it.
             for( $k=$i+1; $k<=$n; $k++ ) {
               last if ($flag[$k] eq 'B');
             }
             for( $k++; $k<=$n; $k++ ) {
               last if ($flag[$k] ne 'I');
             }
             if ($k <= $n && $flag[$k] eq 'E' && $chunk[$k] eq 'PC') {
               $markup[$k+1] .= end_tag('PC');
               undef $flag[$k];
               undef $chunk[$k];
             ### DS start: $k may be greater than $n; add closing PC tag
             ###           to $markup[$n]; otherwise closing tags are
             ###           omitted
             } elsif ($k > $n && ($forced || $token[$n] =~ /^$eos\s*$/)) {
               $markup[$n] .= end_tag('PC');
             ### DS end
             } else {
               $markup[$k] .= end_tag('PC');
             }
             undef $chunk;
           }
           else {
             $chunk = $chunk[$i];
           }
           $cbtags[$i] .= start_tag($chunk[$i]);
         }
         elsif ($flag[$i] eq 'E') {
           if ($chunk[$i] eq $chunk) {
             $cetags[$i] .= end_tag($chunk);
             undef $chunk;
           }
           elsif ($chunk[$i] eq 'PC') {
             $cetags[$i-1] .= end_tag($chunk) if defined $chunk;
             $cetags[$i] .= end_tag("PC");
             # Open the PC in front of the closest preceding chunk start.
             for( $k=$i; $k>=0; $k-- ) {
               if ($flag[$k] eq 'B') {
                 $cbtags[$k] = start_tag("PC").$cbtags[$k];
                 last;
               }
             }
             undef $chunk;
           }
           else {
             die;
           }
         }
         elsif ($flag[$i] ne 'I' && defined $chunk) {
           # Token outside of any chunk: close the chunk that was open.
           $cetags[$i-1] .= end_tag($chunk);
           undef $chunk;
         }
       }
       ### DS start: print opening tags of input markup before sentence
       print $start_markup;
       print start_tag("s") if $n > 0;
       ### DS end
       for( $i=0; $i<=$n; $i++ ) {
         print $markup[$i];
         print $cbtags[$i];
         ### DS start: slightly renamed sub and added lemma parameter
         print token_and_tag_and_lemma($token[$i],$tag[$i],$lemma[$i]) if defined $token[$i];
         ### DS end
         print $cetags[$i];
       }
       ### DS start: print closing "s" tag and closing input markup if
       ###           print_sentence had been forced
       print end_tag("s") if $n>0;
       print $end_markup if $forced;
       ### DS end
       undef @token;
       undef @tag;
       undef @chunk;
       undef @cbtags;
       undef @cetags;
       undef @flag;
       undef @markup;
     }
     # Return the document prologue: the XML declaration plus the opening
     # <corpus> tag when -t ($opt_t) was given, otherwise the empty string.
     sub doc_start {
       return '' unless defined $opt_t;
       return "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"yes\"?>\n<corpus>\n";
     }
     # Return the closing </corpus> tag when -t ($opt_t) was given, otherwise
     # the empty string.
     sub doc_end {
       return '' unless defined $opt_t;
       return "</corpus>\n";
     }
     # Return the opening tag for chunk category $t: "<$t>" by default, or a
     # <phrase cat="$t"> element when -t ($opt_t) was given.
     sub start_tag {
       my $t=shift;
       return "<$t>\n" unless defined $opt_t;
       return "  <phrase cat=\"$t\">\n";
     }
     # Return the closing tag for chunk category $t: "</$t>" by default, or
     # </phrase> when -t ($opt_t) was given.
     sub end_tag {
       my $t=shift;
       return "</$t>\n" unless defined $opt_t;
       return "  </phrase>\n";
     }
     ### DS start: also process and output lemma parameter
     # Format one token for output.
     # Default: "token<TAB>tag<TAB>lemma" followed by a newline.
     # With -t ($opt_t): a <token word=... lemma=... pos=.../> element.  The
     # characters & < > " are escaped in the attribute values, because a
     # token such as a double quote would otherwise yield malformed XML.
     sub token_and_tag_and_lemma {
       my ($token,$tag,$lemma)=@_;
       return "$token\t$tag\t$lemma\n" unless defined $opt_t;
       for my $value ($token, $tag, $lemma) {
         next unless defined $value;
         $value =~ s/&/&amp;/g;
         $value =~ s/</&lt;/g;
         $value =~ s/>/&gt;/g;
         $value =~ s/"/&quot;/g;
       }
       return "    <token word=\"$token\" lemma=\"$lemma\" pos=\"$tag\"/>\n";
     }
     ### DS end

     #!/usr/local/bin/perl
     use Getopt::Std;
     # -d and -f take a value, -h is a plain flag.  getopt('dhf:') treated
     # every listed letter (and ':') as an option that takes a value, so -h
     # consumed the argument following it.
     getopts('d:hf:');
     # This perl script recognizes multi word units in the input stream
     # and puts them on one line. Input must have one-word-per-line format.
     # The multi word units are listed in the parameter file with POS tags.
     # Each line contains one multi word unit where the individual words
     # are separated by blanks followed by a tab character and the blank-
     # separated list of POS tags.
     # Author: Helmut Schmid, IMS, Uni Stuttgart
     if (!defined($opt_f) || defined($opt_h)) {
       $0 =~ s/.*\///;
       # print, not printf: $0 is interpolated and is not a format string.
       print "\nUsage: $0 [-d del] -f mwl-file ...files...\n";
       print "\nOptions:\n";
       print "-d del : Use del as delimiter rather than a blank\n\n";
       die;
     }
     # Three-argument open: the file name is never interpreted as a mode.
     if (!open(FILE, '<', $opt_f)) {
       die "\nCan't open mwl file: ",$opt_f,"\n";
     }
     if (defined($opt_d)) {
       $del = $opt_d;
     } else {
       $del = " ";
     }
     # Build a trie over the words of each multi word unit.  States are
     # numbered from 1 (0 is the start state); %arc maps (state, word) to the
     # next state and %final maps an accepting state to its tag string.
     $N=1;
     while (<FILE>) {
       chomp();
       next if /^$/;
       @G = split("\t");
       @F = split(/\s+/,$G[0]);
       $state = 0;
       for($i=0; $i<=$#F; $i++) {
         if (!exists($arc{$state,$F[$i]})) {
           $arc{$state,$F[$i]} = $N++;
         }
         $state = $arc{$state,$F[$i]};
       }
       $final{$state} = $G[1];
     }
     close(FILE);
     # @token buffers the tokens read but not yet printed.
     #   $last       - number of buffered tokens
     #   $match      - index of the next token to feed into the trie
     #   $last_match - index of the last token of the longest unit found so
     #                 far (0 means none was found)
     $last = $match = $last_match = 0;
     $state = 0;
     for (;;) {
       if ($match == $last) {
         # The buffer is exhausted: read another token.  Testing with
         # defined() keeps a final line "0" without newline from being
         # mistaken for end of input.
         if (!defined($token[$last] = <>)) {
           if ($last_match > 0) {
             print $token[0];
             for ($i=1; $i<=$last_match; $i++) {
               print $del,$token[$i];
             }
             # Print the tag of the recognized unit, as is done for units
             # found before end of input.
             print "\t$last_tag\n";
           } else {
             $i=0;
           }
           # Print whatever is left in the buffer, one token per line.
           for (; $i<$last; $i++) {
             print $token[$i],"\n";
           }
           last;
         }
         chomp($token[$last++]);
       }
       # Try the token as is, lower-cased, and capitalized.
       if (($s = $arc{$state, $token[$match]}) ||
           ($s = $arc{$state, lc($token[$match])}) ||
           ($s = $arc{$state, ucfirst(lc($token[$match]))})) {
         if (exists($final{$s})) {
           $last_match = $match;
           $last_tag = $final{$s};
         }
         $state = $s;
         $match++;
       } else {
         # No continuation: print the longest unit found, or else just the
         # first buffered token.
         if ($last_match > 0) {
           print $token[0];
           for($i=1; $i<=$last_match; $i++) {
             print $del,$token[$i];
           }
           print "\t$last_tag\n";
         } else {
           print $token[0],"\n";
         }
         # Shift the unprinted tokens to the front of the buffer and restart
         # matching from the start state.
         for($i=0,$k=$last_match+1; $k<$last; ) {
           $token[$i++] = $token[$k++];
         }
         $last = $last - $last_match - 1;
         $last_match = $match = 0;
         $state = 0;
       }
     }

     #!/usr/bin/perl
     use Getopt::Std;
     getopts('t');
     print doc_start();
     # Index of the slot that receives the next token and any markup that
     # precedes it.
     $n = 0;
     while (<>) {
       s/.-SBAR$/O/;
       # Token lines look like "token-tag<TAB>tag/chunk".
       if (/^(.*)-(.*)\t(.*)\/(.*)$/) {
         $token[$n] = $1;
         $tag[$n] = $2;
         $chunk[$n] = $4;
         # Split "FLAG-NAME" into flag and chunk name; anything else means
         # the token is outside of every chunk.
         if ($chunk[$n] =~ /^(.*)-(.*)$/) {
           $flag[$n] = $1;
           $chunk[$n] = $2;
         }
         else {
           undef $flag[$n];
           undef $chunk[$n];
         }
         # A period token ends the sentence.  print_sentence resets $n to 0,
         # so the increment below leaves slot 0 empty for the next sentence.
         print_sentence()  if $token[$n] eq '.';
         $n++;
       }
       else {
         # Any other line is kept verbatim in front of the next token.
         $markup[$n] .= $_;
       }
     }
     print_sentence();
     print doc_end();
     # Print the buffered tokens (slots 0..$n) with chunk markup, then clear
     # the buffers and reset $n to 0.
     #
     # The first loop turns the per-token chunk flags (B, I, E) into opening
     # tags (@cbtags) and closing tags (@cetags); the second loop prints
     # markup, opening tag, token and closing tag for each slot.
     sub print_sentence {
       # $chunk holds the name of the chunk that is currently open, if any.
       my($i,$k,$chunk);
       for( $i=0; $i<=$n; $i++ ) {
         # An I flag whose chunk differs from the open chunk starts a new one.
         if ($flag[$i] eq 'I' && $chunk ne $chunk[$i]) {
           $flag[$i] = 'B';
         }
         if ($flag[$i] eq 'B') {
           if (defined $chunk) {
             $cetags[$i-1] = end_tag($chunk);
           }
           $chunk = $chunk[$i];
           $cbtags[$i] .= start_tag($chunk[$i]);
         }
         # German chunker uses E-flags for PCs
         elsif ($flag[$i] eq 'E') {
           if ($chunk[$i] eq $chunk) {
             $cetags[$i] = end_tag($chunk);
             undef $chunk;
           }
           elsif ($chunk[$i] eq "PC" && $chunk eq "NC") {
             # The open NC turns out to be the body of a PC: relabel the
             # preceding NC tokens, walking back until a token whose flag
             # is not I, and replace the opening tag at that position.
             for( $k=$i-1; $k>=0; $k-- ) {
               if ($chunk[$k] eq "NC") {
                 $chunk[$k] = "PC";
               }
               if ($flag[$k] ne "I") {
                 last;
               }
             }
             $cbtags[$k] = start_tag($chunk[$i]);
             $cetags[$i] = end_tag($chunk[$i]);
             undef $chunk;
             undef $inPC;
           }
         }
         elsif ($flag[$i] ne 'I' && defined $chunk) {
           # Token outside of any chunk: close the chunk that was open.
           $cetags[$i-1] = end_tag($chunk);
           undef $chunk;
         }
       }
       for( $i=0; $i<=$n; $i++ ) {
         print $markup[$i];
         print $cbtags[$i];
         print token_and_tag($token[$i],$tag[$i]) if defined $token[$i];
         print $cetags[$i];
       }
       undef @token;
       undef @tag;
       undef @chunk;
       undef @cbtags;
       undef @cetags;
       undef @flag;
       undef @markup;
       $n = 0;
     }
     # Return the document prologue: the XML declaration plus the opening
     # <corpus> tag when -t ($opt_t) was given, otherwise the empty string.
     sub doc_start {
       return '' unless defined $opt_t;
       return "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"yes\"?>\n<corpus>\n";
     }
     # Return the closing </corpus> tag when -t ($opt_t) was given, otherwise
     # the empty string.
     sub doc_end {
       return '' unless defined $opt_t;
       return "</corpus>\n";
     }
     # Return the opening tag for chunk category $t: "<$t>" by default, or a
     # <phrase cat="$t"> element when -t ($opt_t) was given.
     sub start_tag {
       my $t=shift;
       return "<$t>\n" unless defined $opt_t;
       return "  <phrase cat=\"$t\">\n";
     }
     # Return the closing tag for chunk category $t: "</$t>" by default, or
     # </phrase> when -t ($opt_t) was given.
     sub end_tag {
       my $t=shift;
       return "</$t>\n" unless defined $opt_t;
       return "  </phrase>\n";
     }
     # Format one token for output.
     # Default: "token<TAB>tag" followed by a newline.
     # With -t ($opt_t): a <token word=... pos=.../> element.  The characters
     # & < > " are escaped in the attribute values, because a token such as
     # a double quote would otherwise yield malformed XML.
     sub token_and_tag {
       my ($token,$tag)=@_;
       return "$token\t$tag\n" unless defined $opt_t;
       for my $value ($token, $tag) {
         next unless defined $value;
         $value =~ s/&/&amp;/g;
         $value =~ s/</&lt;/g;
         $value =~ s/>/&gt;/g;
         $value =~ s/"/&quot;/g;
       }
       return "    <token word=\"$token\" pos=\"$tag\"/>\n";
     }

     #!/usr/bin/perl
     ########################################################################
     #                                                                      #
     #  tokenization script for tagger preprocessing                        #
     #  Author: Helmut Schmid, IMS, University of Stuttgart                 #
     #          Serge Sharoff, University of Leeds                          #
     #  Description:                                                        #
     #  - splits input text into tokens (one token per line)                #
     #  - cuts off punctuation, parentheses etc.                            #
     #  - disambiguates periods                                             #
     #  - preserves SGML markup                                             #
     #                                                                      #
     ########################################################################
     use Getopt::Std;
     use utf8;
     use Encode;
     getopts('hgfeiza:');
     # Modify the following lines in order to adapt the tokenizer to other
     # types of text and/or languages
     # characters which have to be cut off at the beginning of a word
     my $PChar='[¿¡{(\\`"‚„†‡‹‘’“”•–—›'."'";
     # characters which have to be cut off at the end of a word
     my $FChar=']}\'\`\"),;:\!\?\%‚„…†‡‰‹‘’“”•–—›';
     # character sequences which have to be cut off at the beginning of a word
     my $PClitic='';
     # character sequences which have to be cut off at the end of a word
     my $FClitic;
     if (defined($opt_e)) {
       # English
       $FClitic = '\'(s|re|ve|d|m|em|ll)|n\'t';
     }
     if (defined($opt_i)) {
       # Italian
       $PClitic = '[dD][ae]ll\'|[nN]ell\'|[Aa]ll\'|[lLDd]\'|[Ss]ull\'|[Qq]uest\'|[Uu]n\'|[Ss]enz\'|[Tt]utt\'';
     }
     if (defined($opt_f)) {
       # French
       $PClitic = '[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\'';
       $FClitic = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-l';
     }
     if (defined($opt_z)) {
       # Galician
       $FClitic = '-la|-las|-lo|-los|-nos';
     }
     ### NO MODIFICATIONS REQUIRED BEYOND THIS LINE #########################
     if (defined($opt_h)) {
       die "
     Usage: utf8-tokenize.perl [ options ] ...files...
     Options:
     -e : English text
     -f : French text
     -i : Italian text
     -z : Galician text
     -g : do not split a trailing period off a number
     -a <file>: <file> contains a list of words which are either abbreviations or
                words which should not be further split.
     ";
     }
     # Read the list of abbreviations and words
     if (defined($opt_a)) {
       die "Can't read: $opt_a: $!\n"  unless (open(FILE, $opt_a));
       while (<FILE>) {
         $_ = decode('utf8',$_);
         s/^[ \t\r\n]+//;
         s/[ \t\r\n]+$//;
         # Ignore comments and blank lines.  After the trimming above a
         # blank line is the empty string, which /^\s$/ never matched.
         next if (/^\#/ || /^$/);
         $Token{$_} = 1;
       }
       close FILE;
     }
     #SS: main loop;
     my $first_line = 1;
     while (<>) {
       $_ = decode('utf8',$_);
       # delete optional byte order markers (BOM)
       if ($first_line) {
         undef $first_line;
         s/^\x{FEFF}//;
       }
       # replace newlines and tab characters with blanks
       tr/\n\t/  /;
       # replace blanks within SGML tags
       while (s/(<[^<> ]*) ([^<>]*>)/$1\377$2/g) {
       }
       #Separ: ÿþ
       # NOTE(review): \376 and \377 serve as in-band separators below, so
       # input that already contains these two characters is presumably
       # split at them as well - confirm whether that matters for the data.
       # replace whitespace with a special character
       tr/ /\376/;
       # restore SGML tags
       tr/\377\376/ \377/;
       # prepare SGML-Tags for tokenization
       s/(<[^<>]*>)/\377$1\377/g;
       s/^\377//;
       s/\377$//;
       s/\377\377\377*/\377/g;
       @S = split("\377");
       for ( $i=0; $i<=$#S; $i++) {
         $_ = $S[$i];
         if (/^<.*>$/) {
           # SGML tag
           print encode('utf8',"$_\n");
         } else {
           # add a blank at the beginning and the end of each segment
           $_ = ' '.$_.' ';
           # insert missing blanks after punctuation
           s/(\.\.\.)/ ... /g;
           s/([;\!\?])([^ ])/$1 $2/g;
           s/([.,:])([^ 0-9.])/$1 $2/g;
           @F = split;
           for ( $j=0; $j<=$#F; $j++) {
             # $suffix collects everything cut off at the end of the word;
             # it is printed after the word itself.
             my $suffix="";
             $_ = $F[$j];
             # separate punctuation and parentheses from words
             do {
               $finished = 1;
               # cut off preceding punctuation
               if (s/^([$PChar])(.)/$2/) {
                 print encode('utf8',"$1\n");
                 $finished = 0;
               }
               # cut off trailing punctuation
               if (s/(.)([$FChar])$/$1/) {
                 $suffix = "$2\n$suffix";
                 $finished = 0;
               }
               # cut off trailing periods if punctuation precedes
               if (s/([$FChar])\.$//) {
                 $suffix = ".\n$suffix";
                 if ($_ eq "") {
                   $_ = $1;
                 } else {
                   $suffix = "$1\n$suffix";
                 }
                 $finished = 0;
               }
             } while (!$finished);
             # handle explicitly listed tokens
             if (defined($Token{$_})) {
               print encode('utf8',"$_\n$suffix");
               next;
             }
             # abbreviations of the form A. or U.S.A.
             if (/^([A-Za-z-]\.)+$/) {
               print encode('utf8',"$_\n$suffix");
               next;
             }
             # disambiguate periods
             if (/^(..*)\.$/ && $_ ne "..." && !($opt_g && /^[0-9]+\.$/)) {
               $_ = $1;
               $suffix = ".\n$suffix";
               if (defined($Token{$_})) {
                 print encode('utf8',"$_\n$suffix");
                 next;
               }
             }
             # cut off clitics
             while (s/^(--)(.)/$2/) {
               print encode('utf8',"$1\n");
             }
             if ($PClitic ne '') {
               while (s/^($PClitic)(.)/$2/) {
                 print encode('utf8',"$1\n");
               }
             }
             while (s/(.)(--)$/$1/) {
               $suffix = "$2\n$suffix";
             }
             if ($FClitic ne '') {
               while (s/(.)($FClitic)$/$1/) {
                 $suffix = "$2\n$suffix";
               }
             }
             print encode('utf8',"$_\n$suffix");
           }
         }
       }
     }

     #!/usr/bin/perl
     use Getopt::Std;
     getopts('t');
     print doc_start();
     # Index of the slot that receives the next token and any markup that
     # precedes it.
     $n = 0;
     while (<>) {
       s/.-SBAR$/O/;
       # Token lines look like "token-tag<TAB>tag/chunk".
       if (/^(.*)-(.*)\t(.*)\/(.*)$/) {
         $token[$n] = $1;
         $tag[$n] = $2;
         $chunk[$n] = $4;
         # Split "FLAG-NAME" into flag and chunk name; anything else means
         # the token is outside of every chunk.
         if ($chunk[$n] =~ /^(.*)-(.*)$/) {
           $flag[$n] = $1;
           $chunk[$n] = $2;
         }
         else {
           undef $flag[$n];
           undef $chunk[$n];
         }
         # A period token ends the sentence.  print_sentence resets $n to 0,
         # so the increment below leaves slot 0 empty for the next sentence.
         print_sentence()  if $token[$n] eq '.';
         $n++;
       }
       else {
         # Any other line is kept verbatim in front of the next token.
         $markup[$n] .= $_;
       }
     }
     print_sentence();
     print doc_end();
     # Print the buffered tokens (slots 0..$n) with chunk markup, then clear
     # the buffers and reset $n to 0.
     #
     # An NC that directly follows a PC (or an NP that follows a PP) is
     # nested inside it: the outer chunk name is remembered in the global
     # $inPC and its closing tag is emitted after the inner chunk's.
     sub print_sentence {
       # $chunk holds the name of the chunk that is currently open, if any.
       my($i,$chunk);
       for( $i=0; $i<=$n; $i++ ) {
         # An I flag whose chunk differs from the open chunk starts a new one.
         if ($flag[$i] eq 'I' && $chunk ne $chunk[$i]) {
           $flag[$i] = 'B';
         }
         # A period token never belongs to a chunk.
         if ($flag[$i] ne '' && $token[$i] eq '.') {
           delete $flag[$i];
           $chunk[$i] = '0';
         }
         if ($flag[$i] eq 'B') {
           if (defined $chunk) {
             if (($chunk eq 'PC' && $chunk[$i] eq 'NC') ||
                 ($chunk eq 'PP' && $chunk[$i] eq 'NP'))
             {
               # Keep the outer chunk open around the new inner chunk.
               $inPC = $chunk;
             }
             else {
               $cetags[$i-1] = end_tag($chunk);
               if (defined $inPC) {
                 $cetags[$i-1] .= end_tag($inPC);
                 undef $inPC;
               }
             }
           }
           $chunk = $chunk[$i];
           $cbtags[$i] .= start_tag($chunk[$i]);
         }
         elsif ($flag[$i] ne 'I' && defined $chunk) {
           # Token outside of any chunk: close the open chunk and, if there
           # is one, the outer chunk that encloses it.
           $cetags[$i-1] = end_tag($chunk);
           undef $chunk;
           if (defined $inPC) {
             $cetags[$i-1] .= end_tag($inPC);
             undef $inPC;
           }
         }
       }
       for( $i=0; $i<=$n; $i++ ) {
         print $markup[$i];
         print $cbtags[$i];
         print token_and_tag($token[$i],$tag[$i]) if defined $token[$i];
         print $cetags[$i];
       }
       undef @token;
       undef @tag;
       undef @chunk;
       undef @cbtags;
       undef @cetags;
       undef @flag;
       undef @markup;
       $n = 0;
     }
     # Return the document prologue: the XML declaration plus the opening
     # <corpus> tag when -t ($opt_t) was given, otherwise the empty string.
     sub doc_start {
       return '' unless defined $opt_t;
       return "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"yes\"?>\n<corpus>\n";
     }
     # Return the closing </corpus> tag when -t ($opt_t) was given, otherwise
     # the empty string.
     sub doc_end {
       return '' unless defined $opt_t;
       return "</corpus>\n";
     }
     # Return the opening tag for chunk category $t: "<$t>" by default, or a
     # <phrase cat="$t"> element when -t ($opt_t) was given.
     sub start_tag {
       my $t=shift;
       return "<$t>\n" unless defined $opt_t;
       return "  <phrase cat=\"$t\">\n";
     }
     # Return the closing tag for chunk category $t: "</$t>" by default, or
     # </phrase> when -t ($opt_t) was given.
     sub end_tag {
       my $t=shift;
       return "</$t>\n" unless defined $opt_t;
       return "  </phrase>\n";
     }
     # Format one token for output.
     # Default: "token<TAB>tag" followed by a newline.
     # With -t ($opt_t): a <token word=... pos=.../> element.  The characters
     # & < > " are escaped in the attribute values, because a token such as
     # a double quote would otherwise yield malformed XML.
     sub token_and_tag {
       my ($token,$tag)=@_;
       return "$token\t$tag\n" unless defined $opt_t;
       for my $value ($token, $tag) {
         next unless defined $value;
         $value =~ s/&/&amp;/g;
         $value =~ s/</&lt;/g;
         $value =~ s/>/&gt;/g;
         $value =~ s/"/&quot;/g;
       }
       return "    <token word=\"$token\" pos=\"$tag\"/>\n";
     }

     #!/usr/bin/perl
     # Table of month-name tokens; print_sentence joins date expressions
     # around any token found here.
     # NOTE(review): every key below consists only of '?' characters, so the
     # original non-ASCII spellings appear to have been lost in a character
     # set conversion (several keys are now identical).  Restore them from
     # an intact copy of the script - TODO confirm.
     $month{"??????????"} = 1;
     $month{"???????????"} = 1;
     $month{"???????"} = 1;
     $month{"????????"} = 1;
     $month{"?????"} = 1;
     $month{"?????"} = 1;
     $month{"?????"} = 1;
     $month{"???????"} = 1;
     $month{"???????"} = 1;
     $month{"?????????"} = 1;
     $month{"???????????"} = 1;
     $month{"?????????"} = 1;
     $month{"?????????"} = 1;
     $month{"??????????"} = 1;
     $month{"?????????"} = 1;
     $month{"??????????"} = 1;
     $month{"??????"} = 1;
     $month{"???????"} = 1;
     $month{"????"} = 1;
     $month{"??????"} = 1;
     $month{"??????"} = 1;
     $month{"????????"} = 1;
     $month{"????????"} = 1;
     $month{"??????????"} = 1;
     $month{"????????"} = 1;
     $month{"????????"} = 1;
     $month{"?????????"} = 1;
     # Input is one token per line; an empty line ends a sentence.
     while (<>) {
       chomp;
       if ($_ eq '') {
         print_sentence();
       }
       else {
         push @token, $_;
       }
     }
     print_sentence();
     # Print the tokens buffered in @token, then clear the buffer.
     #
     # Tokens that belong to a date expression around a month name (see
     # %month), and a few fixed two-token sequences, are joined: a joined
     # token is followed by a blank instead of a newline.  $join[$i] == 1
     # marks token $i as joined to the token after it.
     #
     # NOTE(review): the string and regex literals made of '?' characters
     # appear to be mis-encoded non-ASCII text; they are kept unchanged.
     sub print_sentence {
       my ($i, $k, $start, $end);
       for( $i=0; $i<=$#token; $i++ ) {
         if (exists $month{$token[$i]}) {
           $start = $end = $i;
           # Extend the span to the left.  The $start > 0 guards keep
           # $token[$start-1] from wrapping around to the last token when
           # the month is the first token; without them $start could become
           # -1 and the assignment to $join[-1] below could die.
           if ($start > 0 && $token[$start-1] =~ /^[1-9][0-9]?([???]??)?(-[1-9][0-9]?([???]??)?)?$/){
             $start--;
           }
           if ($start > 0 && $token[$start-1] eq '??????') {
             $start--;
           }
           # Extend the span to the right.
           if ($token[$end+1] eq '???') {
             $end++;
           }
           if ($token[$end+1] =~ /^(1[0-9][0-9][0-9]|20[0-9][0-9]|'[0-9][0-9])$/) {
             $end++;
           }
           for( $k=$start; $k<$end; $k++) {
             $join[$k] = 1;
           }
         }
         elsif (($token[$i] eq "??'" && $token[$i+1] eq '???') ||
                ($token[$i] eq '??' &&
                 ($token[$i+1] eq '????' || $token[$i+1] eq '????')))
         {
           $join[$i] = 1;
         }
       }
       for( $i=0; $i<=$#token; $i++ ) {
         if ($join[$i] == 1) {
           print "$token[$i] "
         }
         else {
           print "$token[$i]\n"
         }
       }
       undef @token;
       undef @join;
     }

     #!/usr/bin/perl
     ########################################################################
     #                                                                      #
     #  tokenization script for tagger preprocessing                        #
     #  Author: Helmut Schmid, IMS, University of Stuttgart                 #
     #          Serge Sharoff, University of Leeds                          #
     #  Description:                                                        #
     #  - splits input text into tokens (one token per line)                #
     #  - cuts off punctuation, parentheses etc.                            #
     #  - disambiguates periods                                             #
     #  - preserves SGML markup                                             #
     #                                                                      #
     ########################################################################
     use Getopt::Std;
     getopts('hfeia:u');
     use utf8;
     if (defined $opt_u) {
       # NOTE(review): "use open" is a compile-time, lexically scoped pragma,
       # so here it is limited to this block, which opens no files.  The
       # abbreviation file and the files read through <> are opened outside
       # of it and are presumably still read as bytes - confirm whether -u
       # is meant to cover them as well.
       use open ':utf8';
       binmode(STDIN,":utf8");
       binmode(STDOUT,":utf8");
     }
     # Modify the following lines in order to adapt the tokenizer to other
     # types of text and/or languages
     # characters which have to be cut off at the beginning of a word
     my $PChar='[¿¡{(\\`"‚„†‡‹‘’“”•–—›';
     # characters which have to be cut off at the end of a word
     my $FChar=']}\'\`\"),;:\!\?\%‚„…†‡‰‹‘’“”•–—›';
     # character sequences which have to be cut off at the beginning of a word
     my $PClitic='';
     # character sequences which have to be cut off at the end of a word
     my $FClitic;
     if (defined($opt_e)) {
       # English
       $FClitic = '\'(s|re|ve|d|m|em|ll)|n\'t';
     }
     if (defined($opt_i)) {
       # Italian
       $PClitic = '[dD][ae]ll\'|[nN]ell\'|[Aa]ll\'|[lLDd]\'|[Ss]ull\'|[Qq]uest\'|[Uu]n\'|[Ss]enz\'|[Tt]utt\'';
     }
     if (defined($opt_f)) {
       # French
       $PClitic = '[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\'';
       $FClitic = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-l';
     }
     ### NO MODIFICATIONS REQUIRED BEYOND THIS LINE #########################
     if (defined($opt_h)) {
       die "
     Usage: tokenize.perl [ options ] ...files...
     Options:
     -u : use UTF8 encoding
     -e : English text
     -f : French text
     -i : Italian text
     -a <file>: <file> contains a list of words which are either abbreviations or
                words which should not be further split.
     ";
     }
     # Read the list of abbreviations and words
     if (defined($opt_a)) {
       die "Can't read: $opt_a: $!\n"  unless (open(FILE, $opt_a));
       while (<FILE>) {
         s/^[ \t\r\n]+//;
         s/[ \t\r\n]+$//;
         # Ignore comments and blank lines.  After the trimming above a
         # blank line is the empty string, which /^\s$/ never matched.
         next if (/^\#/ || /^$/);
         $Token{$_} = 1;
       }
       close FILE;
     }
     #SS: main loop;
     my $first_line = 1;
     while (<>) {
       # delete optional byte order markers (BOM)
       if ($first_line) {
         undef $first_line;
         s/^\x{FEFF}//;
       }
       # replace newlines and tab characters with blanks
       tr/\n\t/  /;
       # replace blanks within SGML tags
       while (s/(<[^<> ]*) ([^<>]*>)/$1\377$2/g) {
       }
       #Separ: ÿþ
       # replace whitespace with a special character
       tr/ /\376/;
       # restore SGML tags
       tr/\377\376/ \377/;
       # prepare SGML-Tags for tokenization
       s/(<[^<>]*>)/\377$1\377/g;
       s/^\377//;
       s/\377$//;
       s/\377\377\377*/\377/g;
       @S = split("\377");
       for ( $i=0; $i<=$#S; $i++) {
         $_ = $S[$i];
         if (/^<.*>$/) {
           # SGML tag
           print $_,"\n";
         } else {
           # add a blank at the beginning and the end of each segment
           $_ = ' '.$_.' ';
           # insert missing blanks after punctuation
           s/(\.\.\.)/ ... /g;
           s/([;\!\?])([^ ])/$1 $2/g;
           s/([.,:])([^ 0-9.])/$1 $2/g;
           @F = split;
           for ( $j=0; $j<=$#F; $j++) {
             # $suffix collects everything cut off at the end of the word;
             # it is printed after the word itself.
             my $suffix="";
             $_ = $F[$j];
             # separate punctuation and parentheses from words
             do {
               $finished = 1;
               # cut off preceding punctuation
               if (s/^([$PChar])(.)/$2/) {
                 print $1,"\n";
                 $finished = 0;
               }
               # cut off trailing punctuation
               if (s/(.)([$FChar])$/$1/) {
                 $suffix = "$2\n$suffix";
                 $finished = 0;
               }
               # cut off trailing periods if punctuation precedes
               if (s/([$FChar])\.$//) {
                 $suffix = ".\n$suffix";
                 if ($_ eq "") {
                   $_ = $1;
                 } else {
                   $suffix = "$1\n$suffix";
                 }
                 $finished = 0;
               }
             } while (!$finished);
             # handle explicitly listed tokens
             if (defined($Token{$_})) {
               print "$_\n$suffix";
               next;
             }
             # abbreviations of the form A. or U.S.A.
             if (/^([A-Za-z-]\.)+$/) {
               print "$_\n$suffix";
               next;
             }
             # disambiguate periods
             if (/^(..*)\.$/ && $_ ne "..." && !/^[0-9]+\.$/) {
               $_ = $1;
               $suffix = ".\n$suffix";
               if (defined($Token{$_})) {
                 print "$_\n$suffix";
                 next;
               }
             }
             # cut off clitics
             if ($PClitic ne '') {
               while (s/^($PClitic)(.)/$2/) {
                 print $1,"\n";
               }
             }
             if ($FClitic ne '') {
               while (s/(.)($FClitic)$/$1/) {
                 $suffix = "$2\n$suffix";
               }
             }
             print "$_\n$suffix";
           }
         }
       }
     }

     Adm.
     Ala.
     Ariz.
     Ark.
     Aug.
     Ave.
     Bancorp.
     Bhd.
     Brig.
     Bros.
     CO.
     CORP.
     COS.
     Ca.
     Calif.
     Canada-U.S.
     Canadian-U.S.
     Capt.
     Cia.
     Cie.
     Co.
     Col.
     Colo.
     Conn.
     Corp.
     Cos.
     D-Mass.
     Dec.
     Del.
     Dept.
     Dr.
     Drs.
     Etc.
     Feb.
     Fla.
     Ft.
     Ga.
     Gen.
     Gov.
     Hon.
     INC.
     Ill.
     Inc.
     Ind.
     Jan.
     Japan-U.S.
     Jr.
     Kan.
     Korean-U.S.
     Ky.
     La.
     Lt.
     Ltd.
     Maj.
     Mass.
     Md.
     Messrs.
     Mfg.
     Mich.
     Minn.
     Miss.
     Mo.
     Mr.
     Mrs.
     Ms.
     Neb.
     Nev.
     No.
     Nos.
     Nov.
     Oct.
     Okla.
     Ont.
     Ore.
     Pa.
     Ph.
     Prof.
     Prop.
     Pty.
     Rep.
     Reps.
     Rev.
     S.p.A.
     Sen.
     Sens.
     Sept.
     Sgt.
     Sino-U.S.
     Sr.
     St.
     Ste.
     Tenn.
     Tex.
     U.S.-U.K.
     U.S.-U.S.S.R.
     Va.
     Vt.
     W.Va.
     Wash.
     Wis.
     Wyo.
     a.k.a.
     a.m.
     anti-U.S.
     cap.
     days.
     etc.
     ft.
     i.e.
     non-U.S.
     p.m.
     president-U.S.
     s.r.l.
     v.
     v.B.
     v.w.
     vs.

     A.
     A.-G.
     A.G.
     ADN-Korr.
     AT-Mot.
     Abb.
     Abess.
     Abl.
     Ablief.-Gew.
     Abm.
     Abs.
     Abt.
     Abtlg.
     Agl.
     Agt.
     Akt.-Ges.
     Aktbr.
     Alg.
     Alleininh.
     Allg.
     Altwageneint.
     Alu-Felg.
     Alum.
     Am.
     Amp.
     Anf.
     Anfr.
     Anfrag.
     Ang.
     Angb.
     Angeb.
     Angl.
     Anhängerkuppl.
     Anl.
     Anleih.
     Ann.-Exp.
     Ann.-Exped.
     Ant.
     Anten.
     Anz.
     Anz.-Exp.
     Anz.-Verm.
     Anzahlg.
     Anzhlg.
     Apoth.
     App.
     Appartem.
     April-Lief.
     Argent.
     Atl.
     Aufb.
     Aufst.
     Aug.
     Augsb.
     Ausg.
     Ausgl.
     Ausk.
     Ausl.
     Ausl.-Akt.
     Auslandsanl.
     Auslandsb.
     Ausst.
     Ausstattg.
     Austral.
     Ausz.
     Aut.
     Autom.
     Automat.
     Automin.
     B.
     B.P.
     BGBl.
     Bahnhofstr.
     Balk.
     Bau-Ing.
     Bauges.
     Bauj.
     Bay.
     Bayer.
     Bb.
     Bd.
     Bed.
     Beding.
     Ber.
     Beratg.
     Bereif.
     Bergb.
     Bergstr.
     Bernh.
     Bes.
     Besichtig.
     Bestzust.
     Beteil.
     Beteilig.
     Betr.
     Bett.
     Bew.
     Bewerb.
     Bewerbg.
     Bez.
     Bgl.
     Bhf.
     Bierbr.
     Bildzuschr.
     Bilf.
     Bj.
     Bk.
     Bkz.
     Bl.
     Bln.
     Boch.
     Bod.
     Bor.
     Bov.
     Br.
     Brem.
     Brh.
     Brok.
     Brsg.
     Bu.
     Bung.
     Burgstr.
     Bw.
     Bwsp.
     Bz.
     Bäd.
     C.
     C.G.
     Cabr.
     Can.
     Cap.
     Cav.
     Cbr.
     Cem.
     Centralb.
     Cert.
     Ch.
     Charlottenstr.
     Chem.
     Chem.-Ing.
     Chevr.
     Chr.
     Christophstr.
     Cie.
     Co.
     Colorvergl.
     Commerzb.
     Conc.
     Cons.
     Corneliusstr.
     Corp.
     Cp.
     Cpt.
     Cz.
     D.
     DG.
     DM.
     DUB-Schulth.
     DW.
     Dahlb.
     Dawes-Anl.
     Dept.
     Dev.
     Dez.
     Di.
     Dipl.
     Dipl.-Ing.
     Dipl.-Kfm.
     Dir.
     Direktionsw.
     Div.
     Do.
     Doll.
     Don.
     Dorfk.
     Dpf.
     Dr.
     Dr.-Ing.
     Dreij.
     Drog.
     Dt.
     Du.
     Dyckerh.
     Dyn.
     Dän.
     Düsseld.
     E.
     E.h.
     Einf.
     Einh.
     Einr.
     Eint.
     Eintr.
     Einw.
     Einwohn.
     Einz.
     Einzelzi.
     Eisenb.
     El.
     Elektr.
     Em.
     Endpr.
     Engl.
     Ent.
     Entsch.
     Entw.
     Erdgesch.
     Erf.
     Erfahr.
     Erstzul.
     Erzgeb.
     Esc.
     Eterna.
     Etg.-Hs.
     Eur.
     Ew.
     Exp.
     Expl.
     F.
     FS.
     Fa.
     Fabr.
     Fabrikat.
     Fachm.
     Fachricht.
     Fahrz.
     Fam.
     Fb.
     Fd.
     Fds.
     Feb.
     Febr.
     Febr.-Abl.
     Febr.-März-Abl.
     Feldstr.
     Fensterh.
     Ferd.
     Ferdinandstr.
     Fernschr.
     Ferr.
     Feuervers.
     Ffm.
     Fil.
     Fin.
     Finanzier.
     Finanzierg.
     Finanzierungsmöglichk.
     Finnl.
     Ford.
     Fortschr.
     Fr.
     Frankf.
     Franz.
     Franziskanerstr.
     Freiverk.
     Frhr.
     Fried.
     Friedr.
     Friedrich-Ebert-Str.
     Friedrichstr.
     Frl.
     Frühst.
     Führersch.
     G.
     G.M.B.H.
     G.m.b.H.
     Gar.
     Garag.
     Gart.
     Geb.

Laboratoire ICAR » Plateforme TXM

Révision 1683