Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / nlp / stanford / StanfordTaggerMacro.groovy @ 1000

History | View | Annotate | Download (2.1 kB)

1
package org.txm.scripts.tal.stanford
2
import edu.stanford.nlp.ling.*
3
import edu.stanford.nlp.process.*
4
import edu.stanford.nlp.tagger.maxent.*;
5

    
6
// STANDARD DECLARATIONS
7

    
8
import org.kohsuke.args4j.*
9
import groovy.transform.Field
10
import org.txm.rcp.swt.widget.parameters.*
11

    
12
// Macro parameters — filled in via the TXM parameters dialog opened below.
@Field @Option(name="inputDir", usage="input directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-s")
File inputDir
@Field @Option(name="outputDir", usage="output directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-t")
File outputDir

@Field @Option(name="model", usage="'.tagger' model file", widget="File", required=false, def="taggers/arabic-train.tagger")
File model
@Field @Option(name="extension", usage="Regexp de l'extension des fichiers à modifier", widget="String", required=true, def='\\.txt')
String extension = "\\.txt"

// Open the parameters input dialog box; abort if the user cancels.
if (!ParametersDialog.open(this)) return;
def modelPath = model.getAbsolutePath()

// FIX: mkdirs() also creates missing parent directories; mkdir() silently
// fails when the parent does not exist.
outputDir.mkdirs()
if (!outputDir.exists()) {
	println("Could not create $outputDir")
	return;
}

// FIX: the tagger and tokenizer factory were commented out ("TODO: uncomment me")
// while still being referenced below, so the script failed at runtime.
// They are loop-invariant (model loading is expensive), so build them once here
// instead of once per input file.
def tagger = new MaxentTagger(modelPath);
def ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");

// Tag every file of inputDir whose name matches the extension regexp and write
// one "<text>/<s>/<w>" XML file per input file into outputDir.
inputDir.eachFileMatch(~/.*$extension/) { inputFile ->
	// Output file name = input name without its last extension, plus ".xml".
	String name = inputFile.getName()
	int idx = name.lastIndexOf(".")
	if (idx > 0) name = name.substring(0, idx)

	new File(outputDir, name + ".xml").withWriter("UTF-8") { writer ->
		writer.println('<?xml version="1.0" encoding="UTF-8"?>')
		writer.println('<text>')

		// FIX: the preprocessor was also commented out while being used below.
		def documentPreprocessor = new DocumentPreprocessor(inputFile.newReader());
		documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
		int n = 0 // sentence counter, emitted as the s/@n attribute

		for (def sentence : documentPreprocessor) {
			writer.println("<s n=\"$n\">")
			n++;
			def tSentence = tagger.tagSentence(sentence);
			//println(Sentence.listToString(tSentence, false));
			for (def taggedWord : tSentence) {
				// FIX: original read 'w.word()'/'w.tag()' but the loop
				// variable is 'taggedWord' — 'w' was never defined.
				def form = taggedWord.word();
				def pos = taggedWord.tag();
				// NOTE(review): form/pos are not XML-escaped; tokens containing
				// '&' or '<' would produce invalid XML — confirm whether the
				// downstream TXM import tolerates this before changing output.
				writer.println("<w pos=\"$pos\">$form</w>")
			}
			writer.println('</s>')
		}
		writer.println('</text>')
	}
}