Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macroproto / StanfordSegmenterMacro.groovy @ 499

History | View | Annotate | Download (4.3 kB)

1
package org.txm.macroproto;
2
// STANDARD DECLARATIONS
3
/*
4
import org.kohsuke.args4j.*
5
import groovy.transform.Field
6
import org.txm.rcp.swt.widget.parameters.*
7
import edu.stanford.nlp.international.arabic.process.*
8
import edu.stanford.nlp.international.arabic.*
9

10
import java.io.BufferedReader;
11
import java.io.File;
12
import java.io.FileInputStream;
13
import java.io.FileNotFoundException;
14
import java.io.IOException;
15
import java.io.InputStreamReader;
16
import java.io.OutputStreamWriter;
17
import java.io.PrintWriter;
18
import java.io.Serializable;
19
import java.io.StringReader;
20
import java.io.UnsupportedEncodingException;
21
import java.util.Collection;
22
import java.util.List;
23
import java.util.Map;
24
import java.util.Properties;
25

26
import edu.stanford.nlp.ie.crf.CRFClassifier;
27
import edu.stanford.nlp.io.IOUtils;
28
import edu.stanford.nlp.ling.CoreLabel;
29
import edu.stanford.nlp.ling.HasWord;
30
import edu.stanford.nlp.ling.Sentence;
31
import edu.stanford.nlp.ling.TaggedWord;
32
import edu.stanford.nlp.ling.CoreAnnotations;
33
import edu.stanford.nlp.objectbank.ObjectBank;
34
import edu.stanford.nlp.process.TokenizerFactory;
35
import edu.stanford.nlp.process.WordSegmenter;
36
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
37
import edu.stanford.nlp.sequences.SeqClassifierFlags;
38
import edu.stanford.nlp.stats.ClassicCounter;
39
import edu.stanford.nlp.stats.Counter;
40
import edu.stanford.nlp.trees.Tree;
41
import edu.stanford.nlp.util.Generics;
42
import edu.stanford.nlp.util.PropertiesUtils;
43
import edu.stanford.nlp.util.StringUtils;
44
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
45
import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;
46

47
// Macro parameters. Each @Option describes one entry (name, usage text, widget
// kind, default value) for the parameters dialog opened below.

// Directory scanned for the text files to segment.
@Field @Option(name="inputDir", usage="input directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe")
File inputDir
// Directory that receives one output file per input file, with the same name.
@Field @Option(name="outputDir", usage="output directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-s")
File outputDir

// Model file whose absolute path is handed to the segmenter as "loadClassifier".
@Field @Option(name="model", usage="'.tagger' model file", widget="File", required=false, def="data/arabic-segmenter-atbtrain.ser.gz")
File model
// Regular expression for the file-name suffix of the files to process.
@Field @Option(name="extension", usage="Regexp de l'extension des fichiers à modifier", widget="String", required=true, def='\\.txt')
String extension = "\\.txt"

// Open the parameters input dialog box; stop when it reports false.
if (!ParametersDialog.open(this)) return;

// mkdirs() also creates missing parent directories, which mkdir() does not.
// Its return value is ignored because it is false when the directory already
// exists; isDirectory() additionally rejects a regular file at that path.
outputDir.mkdirs()
if (!outputDir.isDirectory()) {
        println("Could not create $outputDir")
        return;
}
65

66
// END OF PARAMETERS
67
//File inputFile = new File("/home/mdecorde/xml/testarabe/test.txt")
68
// Segment every file of inputDir whose name matches the extension pattern and
// write the result to a file of the same name in outputDir.
inputDir.eachFileMatch(~/.*$extension/) { inputFile ->
    println "Processing file: $inputFile"
    File outputFile = new File(outputDir, inputFile.getName())

    // Segmenter configuration; only the model location is set.
    // Optional segment markers, currently disabled:
    //options.put("prefixMarker", "#")
    //options.put("suffixMarker", "#")
    Properties options = new Properties()
    options.put("loadClassifier", model.getAbsolutePath())

    ArabicSegmenter segmenter = new ArabicSegmenter(options)
    segmenter.flags.inputEncoding = "UTF-8"
    segmenter.loadSegmenter(segmenter.flags.loadClassifier, options)

    // Number of worker threads handed to decode().
    int nThreads = 5

    def br = inputFile.newReader("UTF-8")
    PrintWriter pw = null
    double charsPerSec
    try {
        // Auto-flushing UTF-8 writer on the output file.
        pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8"), true)
        charsPerSec = decode(segmenter, br, pw, nThreads)
    } finally {
        // Release both file handles even when decoding fails; the writer was
        // previously never closed, leaking one handle per processed file.
        IOUtils.closeIgnoringExceptions(br)
        pw?.close()
    }
    System.err.printf("Done! Processed input text at %.2f input characters/second%n", charsPerSec)
}
88

89
// Runs the segmenter over everything read from br, prints the output to pwOut
// and returns the throughput in input characters per second.
//
// segmenter : segmenter applied to the input
// br        : source of the text, read line by line in the multi-threaded case
// pwOut     : destination of the segmented text
// nThreads  : number of worker threads, must be greater than 0
def decode(ArabicSegmenter segmenter, BufferedReader br,
           PrintWriter pwOut, int nThreads) {
    assert nThreads > 0
    final long startTime = System.nanoTime()
    long nChars = 0

    if (nThreads <= 1) {
        // Single thread: the segmenter reads and writes by itself and returns
        // the value used as the character count.
        nChars = segmenter.segment(br, pwOut)
    } else {
        MulticoreWrapper<String,String> wrapper = new MulticoreWrapper<String,String>(nThreads, segmenter)
        // Prints every result the wrapper currently has available.
        def drain = { ->
            while (wrapper.peek()) {
                pwOut.println(wrapper.poll())
            }
        }
        try {
            String line = br.readLine()
            while (line != null) {
                nChars += line.length()
                wrapper.put(line)
                drain()
                line = br.readLine()
            }

            // Wait for the pending work, then print what is left.
            wrapper.join()
            drain()
        } catch (IOException e) {
            e.printStackTrace()
        }
    }

    long elapsedNanos = System.nanoTime() - startTime
    def elapsedSeconds = elapsedNanos / 1000000000.0
    double charsPerSec = (double) nChars / elapsedSeconds
    return charsPerSec
}
121
*/