Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / tal / stanford / StanfordSegmenterMacro.groovy @ 499

History | View | Annotate | Download (4 kB)

1
// STANDARD DECLARATIONS
2

    
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import org.txm.rcp.swt.widget.parameters.*
6
import edu.stanford.nlp.international.arabic.process.*
7
import edu.stanford.nlp.international.arabic.*
8

    
9
import java.io.BufferedReader;
10
import java.io.File;
11
import java.io.FileInputStream;
12
import java.io.FileNotFoundException;
13
import java.io.IOException;
14
import java.io.InputStreamReader;
15
import java.io.OutputStreamWriter;
16
import java.io.PrintWriter;
17
import java.io.Serializable;
18
import java.io.StringReader;
19
import java.io.UnsupportedEncodingException;
20
import java.util.Collection;
21
import java.util.List;
22
import java.util.Map;
23
import java.util.Properties;
24

    
25
import edu.stanford.nlp.ie.crf.*;
26
import edu.stanford.nlp.io.*;
27
import edu.stanford.nlp.ling.*;
28
import edu.stanford.nlp.objectbank.*;
29
import edu.stanford.nlp.process.*;
30
import edu.stanford.nlp.sequences.*;
31
import edu.stanford.nlp.stats.*;
32
import edu.stanford.nlp.trees.*;
33
import edu.stanford.nlp.util.*;
34
import edu.stanford.nlp.util.concurrent.*;
35

    
36
// Macro parameters. Each @Field/@Option pair below declares a script-level
// field; presumably ParametersDialog.open(this) fills them in from the user's
// input -- confirm against the ParametersDialog implementation.

// Directory scanned for the files to segment.
@Field @Option(name="inputDir", usage="input directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe")
File inputDir

// Directory receiving one output file per processed input file.
@Field @Option(name="outputDir", usage="output directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-s")
File outputDir

// Model file handed to the segmenter as its "loadClassifier" option.
@Field @Option(name="model", usage="'.tagger' model file", widget="File", required=false, def="data/arabic-segmenter-atbtrain.ser.gz")
File model

// Regular expression for the extension of the files to process.
@Field @Option(name="extension", usage="Regexp de l'extension des fichiers à modifier", widget="String", required=true, def='\\.txt')
String extension = "\\.txt"

// Open the parameters input dialog box; stop when it reports false.
if (!ParametersDialog.open(this)) return;

// mkdirs() also creates missing parent directories, which mkdir() does not.
// isDirectory() additionally rejects an existing regular file of that name,
// which a plain exists() check would let through.
outputDir.mkdirs()
if (!outputDir.isDirectory()) {
    println("Could not create $outputDir")
    return;
}
54

    
55
// END OF PARAMETERS

// "model" is declared with required=false: stop with a message instead of
// failing on a null dereference when no model file was given.
if (model == null) {
    println("No segmenter model file given")
    return;
}

// Build and load the segmenter once, before the loop: neither the options nor
// the model file change from one input file to the next.
Properties options = new Properties();
//options.put("prefixMarker", "#")
//options.put("suffixMarker", "#")
options.put("loadClassifier", model.getAbsolutePath())
ArabicSegmenter segmenter = new ArabicSegmenter(options);
segmenter.flags.inputEncoding = "UTF-8";
segmenter.loadSegmenter(segmenter.flags.loadClassifier, options);

// Segment every file of inputDir whose name matches ".*" followed by the
// extension regexp, writing the result to a file of the same name in outputDir.
inputDir.eachFileMatch(~/.*$extension/) { inputFile ->
    println "Processing file: $inputFile"
    File outputFile = new File(outputDir, inputFile.getName())

    double charsPerSec = 0
    // withReader/withPrintWriter close both streams when their closure ends,
    // including when decode() throws, so neither file handle is leaked.
    inputFile.withReader("UTF-8") { br ->
        outputFile.withPrintWriter("UTF-8") { pw ->
            charsPerSec = decode(segmenter, br, pw, 5);
        }
    }
    System.err.printf("Done! Processed input text at %.2f input characters/second%n", charsPerSec);
}
78

    
79
/**
 * Segment input and write to output stream.
 *
 * With nThreads > 1 every input line is submitted to a MulticoreWrapper built
 * around the segmenter, and each result available from the wrapper is printed
 * on its own line. With nThreads == 1 the whole reader is handed to
 * segmenter.segment(br, pwOut), whose return value is used as the character
 * count.
 *
 * An IOException raised while reading is printed to standard error and ends
 * the reading loop; the rate is then computed from the characters read so far.
 * Neither the reader nor the writer is closed here.
 *
 * @param segmenter the segmenter; it must offer segment(reader, writer) and be
 *                  accepted by the MulticoreWrapper constructor
 * @param br reader providing the text to segment
 * @param pwOut writer receiving the segmented text
 * @param nThreads number of threads given to the MulticoreWrapper, must be > 0
 * @return input characters processed per second
 */
def decode(def segmenter, BufferedReader br,
        PrintWriter pwOut, int nThreads) {
    assert nThreads > 0;
    long nChars = 0;
    final long startTime = System.nanoTime();
    if (nThreads > 1) {
        MulticoreWrapper<String,String> wrapper = new MulticoreWrapper<String,String>(nThreads, segmenter);
        try {
            String line
            while ((line = br.readLine()) != null) {
                nChars += line.length();
                wrapper.put(line);
                // Print whatever is already available so results do not pile up.
                while (wrapper.peek()) {
                    pwOut.println(wrapper.poll());
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always join, even after a read error, so the wrapper's worker
            // threads are not left running; then print the remaining results.
            wrapper.join();
            while (wrapper.peek()) {
                pwOut.println(wrapper.poll());
            }
        }
    } else {
        nChars = segmenter.segment(br, pwOut);
    }
    long duration = System.nanoTime() - startTime;
    // duration is in nanoseconds; dividing by 1e9 converts it to seconds.
    double charsPerSec = (double) nChars / (duration / 1000000000.0);
    return charsPerSec;
}