Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / macroproto / StanfordTaggerMacro.groovy @ 187

History | View | Annotate | Download (2.1 kB)

1
package org.txm.macroproto;

// StanfordTaggerMacro
//
// For every file of 'inputDir' whose name matches the 'extension' regexp, splits
// the text into sentences, tags each sentence with a Stanford MaxentTagger loaded
// from 'model', and writes '<name>.xml' into 'outputDir' with the layout:
//
//   <text> <s n="0"> <w pos="TAG">form</w> ... </s> ... </text>
//
// NOTE: the whole macro body is kept inside a block comment and is therefore
// disabled. Only '//' comments may be used inside it, since block comments
// do not nest.
/*

import edu.stanford.nlp.ling.*
import edu.stanford.nlp.process.*
import edu.stanford.nlp.tagger.maxent.*;

// STANDARD DECLARATIONS

import org.kohsuke.args4j.*
import groovy.transform.Field
import org.txm.rcpapplication.swt.widget.parameters.*

// Directory containing the files to tag
@Field @Option(name="inputDir", usage="input directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-s")
File inputDir
// Directory receiving one XML file per tagged input file
@Field @Option(name="outputDir", usage="output directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-t")
File outputDir

// Stanford '.tagger' model file
@Field @Option(name="model", usage="'.tagger' model file", widget="File", required=false, def="taggers/arabic-train.tagger")
File model
// Regexp appended to '.*' to select the input files by name
@Field @Option(name="extension", usage="Regexp de l'extension des fichiers à modifier", widget="String", required=true, def='\\.txt')
String extension = "\\.txt"

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;

// 'model' is declared required=false, so it may not be set: fail with a
// message instead of a NullPointerException.
if (model == null) {
    println("No tagger model file given")
    return;
}
def modelPath = model.getAbsolutePath()

// mkdirs() also creates missing parent directories; isDirectory() rejects
// the case where a regular file already exists at that path.
outputDir.mkdirs()
if (!outputDir.isDirectory()) {
    println("Could not create $outputDir")
    return;
}

// Neither the tagger nor the tokenizer factory depends on the input file,
// so they are built once and shared by all files.
MaxentTagger tagger = new MaxentTagger(modelPath);
TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
        "untokenizable=noneKeep");

// Escapes the characters that would break XML text or a double-quoted
// attribute value. '&' must be replaced first.
def escapeXml = { Object value ->
    String.valueOf(value)
            .replace('&', '&amp;')
            .replace('<', '&lt;')
            .replace('>', '&gt;')
            .replace('"', '&quot;')
}

inputDir.eachFileMatch(~/.*$extension/) { inputFile ->
    // eachFileMatch may also pass directories whose name matches
    if (!inputFile.isFile()) return

    // Output name: input name cut at its first '.'
    String name = inputFile.getName()
    int idx = name.indexOf(".")
    if (idx > 0) name = name.substring(0, idx)

    // withReader closes the reader when the closure ends, including on error.
    // NOTE(review): expected to decode the file like newReader() did - confirm.
    inputFile.withReader { reader ->
        new File(outputDir, name+".xml").withWriter("UTF-8") { writer ->

            writer.println('<?xml version="1.0" encoding="UTF-8"?>')
            writer.println('<text>')

            DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(reader);
            documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);

            // Sentence counter, written as the 'n' attribute (starts at 0)
            int n = 0

            for (List<HasWord> sentence : documentPreprocessor) {
                writer.println("<s n=\"$n\">")
                n++;
                List<TaggedWord> tSentence = tagger.tagSentence(sentence);
                for (TaggedWord w : tSentence) {
                    def form = escapeXml(w.word());
                    def pos = escapeXml(w.tag());
                    writer.println("<w pos=\"$pos\">$form</w>")
                }
                writer.println('</s>')
            }
            writer.println('</text>')
        }
    }
}
*/