Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / nlp / stanford / StanfordSegmenterMacro.groovy @ 1000

History | View | Annotate | Download (4 kB)

1
// STANDARD DECLARATIONS
2
package org.txm.scripts.tal.stanford
3

    
4
import org.kohsuke.args4j.*
5
import groovy.transform.Field
6
import org.txm.rcp.swt.widget.parameters.*
7
import edu.stanford.nlp.international.arabic.process.*
8
import edu.stanford.nlp.international.arabic.*
9

    
10
import java.io.BufferedReader;
11
import java.io.File;
12
import java.io.FileInputStream;
13
import java.io.FileNotFoundException;
14
import java.io.IOException;
15
import java.io.InputStreamReader;
16
import java.io.OutputStreamWriter;
17
import java.io.PrintWriter;
18
import java.io.Serializable;
19
import java.io.StringReader;
20
import java.io.UnsupportedEncodingException;
21
import java.util.Collection;
22
import java.util.List;
23
import java.util.Map;
24
import java.util.Properties;
25

    
26
import edu.stanford.nlp.ie.crf.*;
27
import edu.stanford.nlp.io.*;
28
import edu.stanford.nlp.ling.*;
29
import edu.stanford.nlp.objectbank.*;
30
import edu.stanford.nlp.process.*;
31
import edu.stanford.nlp.sequences.*;
32
import edu.stanford.nlp.stats.*;
33
import edu.stanford.nlp.trees.*;
34
import edu.stanford.nlp.util.*;
35
import edu.stanford.nlp.util.concurrent.*;
36

    
37
// Macro parameters. They are handed to ParametersDialog.open(this) below,
// which presumably fills them from the dialog (confirm against ParametersDialog).
@Field @Option(name="inputDir", usage="input directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe")
File inputDir
@Field @Option(name="outputDir", usage="output directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-s")
File outputDir

@Field @Option(name="model", usage="'.tagger' model file", widget="File", required=false, def="data/arabic-segmenter-atbtrain.ser.gz")
File model
@Field @Option(name="extension", usage="Regexp de l'extension des fichiers à modifier", widget="String", required=true, def='\\.txt')
String extension = "\\.txt"

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;

// mkdirs() rather than mkdir() so that a nested output path can be created too.
outputDir.mkdirs()
if (!outputDir.exists()) {
	println("Could not create $outputDir")
	return;
}

// 'model' is declared required=false, so it may be unset; it is dereferenced below.
if (model == null) {
	println("No model file selected")
	return;
}

// END OF PARAMETERS

// Build and load the segmenter once: the options do not depend on the input
// file, so there is no reason to reload the model for every file.
Properties options = new Properties();
//options.put("prefixMarker", "#")
//options.put("suffixMarker", "#")
options.put("loadClassifier", model.getAbsolutePath())
def segmenter = new ArabicSegmenter(options);
segmenter.flags.inputEncoding = "UTF-8";
segmenter.loadSegmenter(segmenter.flags.loadClassifier, options);

// Segment every entry of inputDir whose name matches ".*<extension>" and write
// the result to a file of the same name in outputDir.
inputDir.eachFileMatch(~/.*$extension/) { inputFile ->
	// eachFileMatch also yields directories; only regular files can be read.
	if (!inputFile.isFile()) return

	println "Processing file: $inputFile"
	File outputFile = new File(outputDir, inputFile.getName())

	// withReader/withPrintWriter close (and flush) both streams even when
	// decode() throws, so no file handle is leaked per processed file.
	double charsPerSec = inputFile.withReader("UTF-8") { br ->
		outputFile.withPrintWriter("UTF-8") { pw ->
			decode(segmenter, br, pw, 5)
		}
	}
	System.err.printf("Done! Processed input text at %.2f input characters/second%n", charsPerSec);
}
79

    
80
/**
 * Segment input and write to output stream.
 *
 * With {@code nThreads > 1} the input is read line by line; each line is
 * submitted to a MulticoreWrapper built around {@code segmenter}, and every
 * result the wrapper reports as available (peek/poll) is printed to
 * {@code pwOut}. With a single thread the reader and writer are handed to
 * {@code segmenter.segment(br, pwOut)}, whose return value is taken as the
 * number of characters processed.
 *
 * An IOException raised while reading is printed and not rethrown; the rate
 * is then computed from the characters read so far.
 *
 * @param segmenter the segmenter to run; must provide {@code segment(br, pwOut)}
 *        and be accepted by the MulticoreWrapper constructor
 * @param br reader over the text to segment (not closed here)
 * @param pwOut writer receiving the segmented text (not closed here)
 * @param nThreads number of worker threads, must be &gt; 0
 * @return input characters processed per second
 */
def decode(def segmenter, BufferedReader br,
		PrintWriter pwOut, int nThreads) {
	assert nThreads > 0;
	long nChars = 0;
	final long startTime = System.nanoTime();
	if (nThreads > 1) {
		// The wrapper must exist before the loop below uses it; it was
		// previously commented out, which made this branch fail at runtime.
		def wrapper = new MulticoreWrapper<String,String>(nThreads, segmenter);
		try {
			String line
			while ((line = br.readLine()) != null) {
				nChars += line.length();
				wrapper.put(line);
				// Emit whatever is already finished so results do not pile up.
				while (wrapper.peek()) {
					pwOut.println(wrapper.poll());
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Always wait for the workers and drain the remaining results,
			// even after a read error, so no worker thread is left behind
			// and no already-computed output is lost.
			wrapper.join();
			while (wrapper.peek()) {
				pwOut.println(wrapper.poll());
			}
		}
	} else {
		nChars = segmenter.segment(br, pwOut);
	}
	long duration = System.nanoTime() - startTime;
	// duration is in nanoseconds; convert to seconds before dividing.
	double charsPerSec = (double) nChars / (duration / 1000000000.0d);
	return charsPerSec;
}