Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / export / conll2009 / ToCoNLL2009.groovy @ 187

History | View | Annotate | Download (3.6 kB)

1
package org.txm.export.conll2009
2

    
3
import java.io.File;
4
import java.io.FileOutputStream;
5
import java.io.OutputStreamWriter;
6
import java.io.Writer;
7
import java.util.Arrays;
8
import java.util.List;
9

    
10
import org.txm.searchengine.cqp.CqpDataProxy;
11
import org.txm.searchengine.cqp.corpus.Corpus
12
import org.txm.searchengine.cqp.corpus.*
13
import org.txm.searchengine.cqp.corpus.query.Query
14

    
15
/**
 * Exports the sentences of a CQP corpus to a tab-separated file following the
 * CoNLL-2009 column layout.
 *
 * Only the ID, FORM, LEMMA and POS columns are filled; every other column is
 * written as the "_" placeholder.
 */
class ToCoNLL2009 {

    /** When true, the number of matched sentences and of exported positions is printed to standard output. */
    boolean debug = true

    /**
     * Writes every sentence matched by the CQL query {@code <s> []+ </s>} to
     * {@code outfile}, one token per line, with an empty line after each sentence.
     *
     * Each token line holds 15 tab-separated columns: the token ID (restarting at 1
     * for every sentence), the word form, the lemma, "_", the part of speech, and
     * ten "_" placeholders.
     *
     * @param outfile the file where the result is saved (created or overwritten)
     * @param corpus the corpus or sub-corpus to export
     * @param sentenceUnit the StructuralUnitProperty that identifies a sentence.
     *        NOTE(review): currently unused, the query hard-codes the {@code s} structure
     * @param word the word property (mandatory)
     * @param lemme the lemma property, or null to write "_" in the lemma column
     * @param pos the pos property, or null to write "_" in the pos column
     * @param encoding the encoding of the outfile; "UTF-8" is used when null or empty
     * @return true if the export was written, false if a mandatory argument is missing
     */
    public boolean process(File outfile, Corpus corpus, StructuralUnitProperty sentenceUnit, Property word, Property lemme, Property pos, String encoding)
    {
        // Fail with a message instead of a NullPointerException deep inside the export.
        if (outfile == null || corpus == null || word == null) {
            println "Error: outfile, corpus and word property are mandatory"
            return false
        }

        def matches = corpus.query(new Query("<s> []+ </s>"), "TMPEXPORTCONNL", false).getMatches()
        if (debug) println matches.size()

        // Gather the positions of every sentence once and remember how many tokens
        // each sentence contributed, so that the lookup arrays and the write loop
        // below are guaranteed to walk exactly the same sequence.
        List<Integer> sentenceSizes = []
        List<Integer> allPositions = []
        for (def match : matches) {
            int ntokens = 0
            for (int p : match.getRange()) {
                allPositions.add(p)
                ntokens++
            }
            sentenceSizes.add(ntokens)
        }
        int npositions = allPositions.size()
        if (debug) println "npositions= $npositions"

        // Exactly sized array: no padding entries that would trigger useless lookups.
        int[] positions = new int[npositions]
        for (int k = 0; k < npositions; k++) {
            positions[k] = allPositions.get(k)
        }

        List<String> words = null
        List<String> lemmes = null
        List<String> poss = null
        if (npositions > 0) { // nothing to resolve when no sentence matched
            words = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(word.getQualifiedName(), positions)
            if (lemme != null) {
                lemmes = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(lemme.getQualifiedName(), positions)
            }
            if (pos != null) {
                poss = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(pos.getQualifiedName(), positions)
            }
        }

        // Honour the requested encoding; keep the historical UTF-8 output as default.
        String charsetName = (encoding != null && encoding.length() > 0) ? encoding : "UTF-8" //$NON-NLS-1$

        FileOutputStream stream = new FileOutputStream(outfile)
        Writer writer = null
        try {
            writer = new OutputStreamWriter(stream, charsetName)

            int iposition = 0 // index in the flattened words/lemmes/poss lists
            for (int ntokens : sentenceSizes) {
                int id = 1 // CoNLL token IDs restart at 1 for each sentence
                for (int k = 0; k < ntokens; k++) {
                    String lemmestr = (lemmes != null) ? lemmes.get(iposition) : "_"
                    String posstr = (poss != null) ? poss.get(iposition) : "_"

                    writer.write("" + id + "\t" + words.get(iposition) + "\t" + lemmestr + "\t_\t" + posstr + "\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n")

                    id++ // next word
                    iposition++ // next position
                }
                writer.write("\n") // an empty line terminates the sentence
            }
            writer.flush()
        } finally {
            // Always release the file handle, even when a write or a lookup fails.
            if (writer != null) {
                writer.close()
            } else {
                stream.close()
            }
        }

        return true
    }

    /**
     * Developer entry point. The corpus and its properties are left null here
     * (the lookups are kept as comments), so as written {@link #process} only
     * reports the missing arguments and returns false.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Corpus corpus = null
        File outfile = new File("/home/mdecorde/Bureau/conn_export.tsv")
        String encoding = "UTF-8"

        String sentenceProperty = "s_id"
        String posProperty = "frpos"
        String lemmaProperty = "frlemma"

        // "s_id" -> structure name "s" and property name "id" (used by the commented lookup below)
        def split = sentenceProperty.split("_", 2)
        StructuralUnitProperty s = null;//corpus.getStructuralUnit(split[0]).getProperty(split[1])
        Property word = null;//corpus.getProperty("word")
        Property lemma = null;//corpus.getProperty(lemmaProperty)
        Property pos = null;//corpus.getProperty(posProperty)

        //                if (s == null) { println "Error sentence property: $sentenceProperty"; return}
        //                if (word == null) { println "Error no word property"; return}
        //                if (pos == null) { println "Error pos property: $posProperty"; return}
        //                if (lemma == null) { println "Error lemma property: $lemmaProperty"; return}

        ToCoNLL2009 processor = new ToCoNLL2009()
        processor.process(outfile, corpus, s, word, lemma, pos, encoding)
    }
}