Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / export / conll2009 / ToCoNLL2009.groovy @ 2769

History | View | Annotate | Download (3.6 kB)

1
package org.txm.scripts.export.conll2009
2

    
3
import java.io.File;
4
import java.io.FileOutputStream;
5
import java.io.OutputStreamWriter;
6
import java.io.Writer;
7
import java.util.Arrays;
8
import java.util.List;
9

    
10
import org.txm.searchengine.cqp.CqpDataProxy;
11
import org.txm.searchengine.cqp.corpus.*
12
import org.txm.searchengine.cqp.corpus.query.CQLQuery
13

    
14
/**
 * Exports the sentences of a CQP corpus as a CoNLL-2009 style, tab-separated file
 * (one token per line, 15 columns, unknown columns filled with "_").
 */
class ToCoNLL2009 {

        /** When true, the number of matched sentences and of exported positions is printed to stdout. */
        boolean debug = true

        /**
         * Writes every sentence matched by the CQL query {@code <s> []+ </s>} to a file,
         * one token per line.
         *
         * Each line holds: the token index within its sentence, the word form, the lemma,
         * "_", the part of speech, then ten "_" placeholder columns.
         *
         * NOTE(review): token indices start at 0 and no blank line is written between
         * sentences; the CoNLL-2009 format is usually described with 1-based IDs and
         * blank-line sentence separators. Left unchanged here - confirm what consumers expect.
         *
         * @param outfile the file where the result is saved (overwritten if it exists)
         * @param corpus the corpus or sub-corpus to export
         * @param sentenceUnit the StructuralUnitProperty that identifies a sentence;
         *        currently unused, the sentence structure is hard-coded to {@code s} in the query
         * @param word the word property
         * @param lemme the lemma property, or null to write "_" in the lemma column
         * @param pos the pos property, or null to write "_" in the pos column
         * @param encoding the character encoding of the outfile; "UTF-8" is used when null or empty
         * @return true, if successful (failures are reported through exceptions)
         */
        public boolean process(File outfile, CQPCorpus corpus, StructuralUnitProperty sentenceUnit, Property word, Property lemme, Property pos, String encoding)
        {
                def matches = corpus.query(new CQLQuery("<s> []+ </s>"), "TMPEXPORTCONNL", false).getMatches()
                if (debug) println matches.size()

                // Gather the corpus positions of all sentences in one flat list and remember
                // how many positions each sentence contributed. The row counts used when
                // writing are taken from getRange() itself, so the index into the value lists
                // can never drift from the positions that were actually queried.
                List<Integer> allPositions = new ArrayList<Integer>()
                List<Integer> sentenceLengths = new ArrayList<Integer>()
                for (def match : matches) {
                        int sizeBefore = allPositions.size()
                        for (int p : match.getRange()) {
                                allPositions.add(p)
                        }
                        sentenceLengths.add(allPositions.size() - sizeBefore)
                }
                int npositions = allPositions.size()
                if (debug) println "npositions= $npositions"

                // Exactly one slot per exported token: no padding entries are sent to the search engine.
                int[] positions = allPositions as int[]

                List<String> words = []
                List<String> lemmes = null
                List<String> poss = null
                if (npositions > 0) {
                        // One bulk lookup per property rather than one call per token.
                        def cqiClient = CorpusManager.getCorpusManager().getCqiClient()
                        words = cqiClient.cpos2Str(word.getQualifiedName(), positions)
                        if (lemme != null) {
                                lemmes = cqiClient.cpos2Str(lemme.getQualifiedName(), positions)
                        }
                        if (pos != null) {
                                poss = cqiClient.cpos2Str(pos.getQualifiedName(), positions)
                        }
                }

                // Honour the requested encoding; fall back to UTF-8 (the previously hard-coded value).
                String charset = encoding ?: "UTF-8" //$NON-NLS-1$

                // withWriter flushes and closes the file even when a write or a lookup throws.
                outfile.withWriter(charset) { writer ->
                        int iposition = 0 // index into words/lemmes/poss, runs across all sentences
                        for (int length : sentenceLengths) {
                                // id restarts at 0 for every sentence
                                for (int id = 0 ; id < length ; id++) {
                                        String lemmestr = (lemmes != null) ? lemmes.get(iposition) : "_"
                                        String posstr = (poss != null) ? poss.get(iposition) : "_"

                                        writer.write(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n")

                                        iposition++ // next position
                                }
                        }
                }

                return true
        }

        /**
         * Development entry point.
         *
         * The corpus and all properties are null placeholders (the real lookups are
         * commented out), so running this as-is fails with a NullPointerException as soon
         * as process() queries the corpus. Fill in the corpus and properties before use.
         *
         * @param args unused
         */
        public static void main(String[] args) {
                CQPCorpus corpus = null
                File outfile = new File("/home/mdecorde/Bureau/conn_export.tsv")
                String encoding = "UTF-8"

                String sentenceProperty = "s_id"
                String posProperty = "frpos"
                String lemmaProperty = "frlemma"

                // "s_id" -> ["s", "id"]: structural unit name and property name, used by the lookups below
                def split = sentenceProperty.split("_", 2)
                StructuralUnitProperty s = null;//corpus.getStructuralUnit(split[0]).getProperty(split[1])
                Property word = null;//corpus.getProperty("word")
                Property lemma = null;//corpus.getProperty(lemmaProperty)
                Property pos = null;//corpus.getProperty(posProperty)

                //                if (s == null) { println "Error sentence property: $sentenceProperty"; return}
                //                if (word == null) { println "Error no word property"; return}
                //                if (pos == null) { println "Error pos property: $posProperty"; return}
                //                if (lemma == null) { println "Error lemma property: $lemmaProperty"; return}

                ToCoNLL2009 processor = new ToCoNLL2009()
                processor.process(outfile, corpus, s, word, lemma, pos, encoding)
        }
}