Statistics
| Revision:

root / tmp / org.txm.treetagger.core.macosx / res / macosx / cmd / lookup.perl @ 1120

History | View | Annotate | Download (1012 Bytes)

1 826 mdecorde
#!/usr/local/GNU/bin/perl
2 826 mdecorde
3 826 mdecorde
# Usage: lookup.perl <file>*
4 826 mdecorde
# Perl script to be used prior to tagging
5 826 mdecorde
6 826 mdecorde
# It assigns sets of possible tags to selected word forms.
7 826 mdecorde
# A file named "elex" containing these word forms and their tags
8 826 mdecorde
# has to be in the current working directory.
9 826 mdecorde
# The format of this file is:
10 826 mdecorde
# <word form><tab>[<tag><whitespace>{<tag prob.><whitespace>}]*
11 826 mdecorde
# The word form which may contain blanks is followed by a tab character
12 826 mdecorde
# and a sequence of tags separated by whitespace. The tags are optionally
13 826 mdecorde
# followed by tag probability values in the range from 0.0 to 1.0.
14 826 mdecorde
15 826 mdecorde
use strict;
use warnings;

# Name of the lexicon file; it is opened relative to the current working
# directory.
my $LEXICON = "elex";

# Act as a filter only when executed as a script, so that the helper
# subroutines below can be loaded (require/do) without reading any input.
main() unless caller;

# normalize_whitespace($text)
#
# Returns a copy of $text in which every run of blanks, tabs and newlines
# is collapsed into a single blank. If the text contains at least one
# non-whitespace character, the leading and trailing blank are removed as
# well; a whitespace-only text is left as a single blank.
sub normalize_whitespace {
    my ($text) = @_;
    $text =~ s/[ \t\n]+/ /g;
    $text =~ s/^[ \t\n]*(.*[^ \t\n])[ \t\n]*$/$1/;
    return $text;
}

# load_lexicon($path)
#
# Reads the lexicon file $path and returns a reference to a hash that maps
# each word form to its normalized tag string. Each line has the form
#   <word form><tab><tags and optional probabilities>
# Lines without a tab, or with nothing after the first tab, define no tags
# and are skipped. Dies if the file cannot be opened.
sub load_lexicon {
    my ($path) = @_;

    open(my $fh, '<', $path)
        or die "lookup.perl: cannot open lexicon '$path': $!\n";

    my %tags_of;
    while (my $entry = <$fh>) {
        # chomp (not chop): a final line without a newline must keep its
        # last character.
        chomp $entry;

        # Split on the FIRST tab only: the word form ends there, while the
        # tag field may itself contain tabs as separators.
        my ($word, $tags) = split /\t/, $entry, 2;
        next unless defined $tags && length $tags;

        $tags_of{$word} = normalize_whitespace($tags);
    }
    close($fh);

    return \%tags_of;
}

# tag_line($line, $tags_of)
#
# Normalizes the whitespace of one input line and returns the output line:
# "<word form>\t<tags>\n" if the word form has an entry in the hash
# referenced by $tags_of, otherwise "<word form>\n".
sub tag_line {
    my ($line, $tags_of) = @_;

    chomp $line;
    my $word = normalize_whitespace($line);

    return defined $tags_of->{$word}
        ? "$word\t$tags_of->{$word}\n"
        : "$word\n";
}

# main()
#
# Loads the lexicon, then copies every line of the files named on the
# command line (or of standard input) to standard output, appending the
# lexicon tags to the word forms that have an entry.
sub main {
    my $tags_of = load_lexicon($LEXICON);

    while (my $line = <>) {
        print tag_line($line, $tags_of);
    }
    return;
}