Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / tal / stanford / StanfordTaggerMacro.groovy @ 499

History | View | Annotate | Download (2.1 kB)

1
import edu.stanford.nlp.ling.*
2
import edu.stanford.nlp.process.*
3
import edu.stanford.nlp.tagger.maxent.*;
4

    
5
// STANDARD DECLARATIONS
6

    
7
import org.kohsuke.args4j.*
8
import groovy.transform.Field
9
import org.txm.rcp.swt.widget.parameters.*
10

    
11
// Macro parameters. Each @Field/@Option pair declares one script-level field
// that is handed to ParametersDialog.open(this) further down in the script.

// Directory whose files (those matching 'extension') are tagged.
@Field @Option(name="inputDir", usage="input directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-s")
File inputDir

// Directory receiving one ".xml" result file per input file.
@Field @Option(name="outputDir", usage="output directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-t")
File outputDir

// Stanford '.tagger' model file. Declared optional (required=false), so the
// script must not assume it is set.
@Field @Option(name="model", usage="'.tagger' model file", widget="File", required=false, def="taggers/arabic-train.tagger")
File model

// Regular expression matching the extension of the files to process
// (the usage string below is the user-facing French label).
@Field @Option(name="extension", usage="Regexp de l'extension des fichiers à modifier", widget="String", required=true, def='\\.txt')
String extension = "\\.txt"
20

    
21
// Open the parameters input dialog box; stop when open() returns false
// (presumably the dialog was cancelled -- confirm against ParametersDialog).
if (!ParametersDialog.open(this)) return;

// 'model' is declared with required=false, so guard against an unset value
// instead of failing with a NullPointerException on getAbsolutePath().
if (model == null) {
        println("No '.tagger' model file given")
        return;
}
def modelPath = model.getAbsolutePath()

// mkdirs() (unlike mkdir()) also creates missing parent directories.
outputDir.mkdirs()
// isDirectory() also rejects the case where a regular file has that name.
if (!outputDir.isDirectory()) {
        println("Could not create $outputDir")
        return;
}
30

    
31
// Tags every file of inputDir whose name matches the 'extension' regexp and
// writes, for each one, an XML file in outputDir with the layout
//   <text><s n="0"><w pos="TAG">form</w>...</s>...</text>
//
// The Stanford tagger classes (MaxentTagger, PTBTokenizer,
// CoreLabelTokenFactory, DocumentPreprocessor) must be on the classpath.

// The tagger and the tokenizer factory do not depend on the input file:
// build them once instead of once per file.
def tagger = new MaxentTagger(modelPath)
def ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep")

// Escapes XML special characters so that a form or a tag containing
// '&', '<', '>' or a quote cannot produce a malformed document.
def escape = { value -> groovy.xml.XmlUtil.escapeXml(String.valueOf(value)) }

inputDir.eachFileMatch(~/.*$extension/) { inputFile ->
        // Result file name = input file name without its last extension
        String name = inputFile.getName()
        int idx = name.lastIndexOf(".")
        if (idx > 0) name = name.substring(0, idx)

        new File(outputDir, name+".xml").withWriter("UTF-8") { writer ->
                writer.println('<?xml version="1.0" encoding="UTF-8"?>')
                writer.println('<text>')

                // withReader closes the reader when the closure exits.
                // NOTE(review): input is read as UTF-8 to match the declared
                // output encoding -- confirm the input files are UTF-8.
                inputFile.withReader("UTF-8") { reader ->
                        def documentPreprocessor = new DocumentPreprocessor(reader)
                        documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory)

                        int n = 0 // sentence counter, written as the 'n' attribute of <s>
                        for (def sentence : documentPreprocessor) {
                                writer.println("<s n=\"$n\">")
                                n++
                                def tSentence = tagger.tagSentence(sentence)
                                for (def taggedWord : tSentence) {
                                        def form = escape(taggedWord.word())
                                        def pos = escape(taggedWord.tag())
                                        writer.println("<w pos=\"$pos\">$form</w>")
                                }
                                writer.println('</s>')
                        }
                }

                writer.println('</text>')
        }
}