root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / export / conll2009 / ToCoNLL2009.groovy @ 2769
History | View | Annotate | Download (3.6 kB)
1 |
package org.txm.scripts.export.conll2009
|
---|---|
2 |
|
3 |
import java.io.File; |
4 |
import java.io.FileOutputStream; |
5 |
import java.io.OutputStreamWriter; |
6 |
import java.io.Writer; |
7 |
import java.util.Arrays; |
8 |
import java.util.List; |
9 |
|
10 |
import org.txm.searchengine.cqp.CqpDataProxy; |
11 |
import org.txm.searchengine.cqp.corpus.* |
12 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery |
13 |
|
14 |
/**
 * Exports the sentences of a CQP corpus as tab-separated rows in a
 * CoNLL-2009-like layout (one token per line, 15 columns).
 *
 * Only the ID, FORM, LEMMA and POS columns are filled; every other column
 * is written as the "_" placeholder.
 */
class ToCoNLL2009 {

	/** When true, the number of matched sentences and of exported positions is printed to stdout. */
	boolean debug = true

	/**
	 * Writes every sentence of the corpus to a tab-separated file.
	 *
	 * Sentences are found with the CQL query {@code <s> []+ </s>}; the word,
	 * lemma and pos values of every matched position are then fetched in bulk
	 * through the CQi client and written one token per line.
	 *
	 * NOTE(review): token IDs restart at 0 for each sentence and no blank line
	 * is written between sentences, whereas the CoNLL-2009 format description
	 * uses 1-based IDs and blank-line sentence separators. This is kept as-is
	 * so existing consumers of the file are not broken — confirm what is wanted.
	 *
	 * @param outfile the file where the result is saved (created or overwritten)
	 * @param corpus the corpus or sub-corpus to export
	 * @param sentenceUnit the StructuralUnitProperty that identifies a sentence;
	 *        currently unused, the query always targets the {@code s} structure
	 * @param word the word property
	 * @param lemme the lemma property, or null to write "_" in the lemma column
	 * @param pos the pos property, or null to write "_" in the pos column
	 * @param encoding the encoding of the outfile; "UTF-8" is used when null or empty
	 * @return true, if successful
	 */
	public boolean process(File outfile, CQPCorpus corpus, StructuralUnitProperty sentenceUnit, Property word, Property lemme, Property pos, String encoding)
	{
		def matches = corpus.query(new CQLQuery("<s> []+ </s>"), "TMPEXPORTCONNL", false).getMatches()
		if (debug) println matches.size()

		// Gather the corpus positions of every match, in order. Collecting into a
		// list first gives an exactly-sized array: no unfilled trailing slots that
		// would be sent to the search engine as spurious lookups of position 0.
		List<Integer> collected = new ArrayList<Integer>()
		for (def match : matches) {
			for (int p : match.getRange())
				collected.add(p)
		}
		int[] positions = collected as int[]
		if (debug) println "npositions= ${positions.length}"

		List<String> words = null
		List<String> lemmes = null
		List<String> poss = null
		if (positions.length > 0) { // nothing to look up when no sentence matched
			def cqi = CorpusManager.getCorpusManager().getCqiClient()
			words = cqi.cpos2Str(word.getQualifiedName(), positions)
			if (lemme != null) {
				lemmes = cqi.cpos2Str(lemme.getQualifiedName(), positions)
			}
			if (pos != null) {
				poss = cqi.cpos2Str(pos.getQualifiedName(), positions)
			}
		}

		// Honour the requested encoding instead of always writing UTF-8.
		String charset = encoding ?: "UTF-8" //$NON-NLS-1$

		// withWriter flushes and closes the file even if a write fails.
		outfile.withWriter(charset) { writer ->
			int iposition = 0 // index in words/lemmes/poss, runs across all sentences
			for (def match : matches) {
				int id = 0 // first position in sentence
				// Iterate the very same range that was used to fill 'positions', so
				// 'iposition' stays aligned with the fetched values whatever the
				// start/end convention of a match is.
				for (int p : match.getRange()) {
					String lemmestr = (lemmes != null) ? lemmes.get(iposition) : "_"
					String posstr = (poss != null) ? poss.get(iposition) : "_"

					writer.write(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n")

					id++ // next word
					iposition++ // next position
				}
			}
		}

		return true
	}

	/**
	 * Development entry point.
	 *
	 * NOTE(review): the corpus and all properties are still null here (the real
	 * lookups are commented out), so the call to process() cannot succeed until
	 * a corpus is wired in.
	 */
	public static void main(String[] args) {
		CQPCorpus corpus = null
		File outfile = new File("/home/mdecorde/Bureau/conn_export.tsv")
		String encoding = "UTF-8"

		String sentenceProperty = "s_id"
		String posProperty = "frpos"
		String lemmaProperty = "frlemma"

		// "s_id" -> structure name and property name, for the commented-out lookup below
		def split = sentenceProperty.split("_", 2)
		StructuralUnitProperty s = null;//corpus.getStructuralUnit(split[0]).getProperty(split[1])
		Property word = null;//corpus.getProperty("word")
		Property lemma = null;//corpus.getProperty(lemmaProperty)
		Property pos = null;//corpus.getProperty(posProperty)

		// if (s == null) { println "Error sentence property: $sentenceProperty"; return}
		// if (word == null) { println "Error no word property"; return}
		// if (pos == null) { println "Error pos property: $posProperty"; return}
		// if (lemma == null) { println "Error lemma property: $lemmaProperty"; return}

		ToCoNLL2009 processor = new ToCoNLL2009()
		processor.process(outfile, corpus, s, word, lemma, pos, encoding)
	}
}