10 |
10 |
import org.txm.searchengine.cqp.CqpDataProxy;
|
11 |
11 |
import org.txm.searchengine.cqp.corpus.*
|
12 |
12 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery
|
|
13 |
import org.txm.utils.ConsoleProgressBar
|
13 |
14 |
|
14 |
15 |
public class ToCoNLL2009 {
|
15 |
|
|
|
16 |
|
16 |
17 |
// When true, process() prints the number of matches and the number of corpus positions to stdout.
boolean debug = true
|
17 |
18 |
/**
|
18 |
19 |
* To CoNLL2009.
|
... | ... | |
26 |
27 |
* @param encoding the encoding of the outfile
|
27 |
28 |
* @return true, if successful
|
28 |
29 |
*/
|
29 |
|
public boolean process(File outdir, CQPCorpus corpus, StructuralUnitProperty sentenceUnit, Property word, Property lemme, Property pos, String encoding) {
	// Writes one "<text id>.conllu" file per corpus text into outdir.
	// Each file holds the sentences of the text (units of the structure owning sentenceUnit),
	// one token per line with 15 tab-separated columns: id, word, lemma, "_", pos, nine "_"
	// placeholders and "XmlID=<value of the 'id' word property>".
	// lemme and pos are optional (null => "_"); the "id" word property is optional too (absent => "_").
	// encoding is the charset of the written files; null or empty falls back to UTF-8.
	// Returns true when every text has been written; I/O and search-engine errors propagate.

	String sstruct = sentenceUnit.getStructuralUnit().getName()

	def texts = corpus.getCorpusTextIdsList()
	def idProperty = corpus.getProperty("id")

	// honour the documented 'encoding' parameter instead of a hard-coded charset
	String charset = (encoding != null && encoding.length() > 0) ? encoding : "UTF-8" //$NON-NLS-1$

	// the per-text files cannot be created if the target directory is missing
	if (!outdir.exists()) outdir.mkdirs()

	ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size())

	for (def text : texts) {

		cpb.tick()

		def matches = corpus.query(new CQLQuery("[_.text_id=\"$text\"] expand to $sstruct"), "TMPEXPORTCONLL", false).getMatches()
		if (debug) println matches.size()

		// Collect the corpus positions exactly as match.getRange() enumerates them, so the
		// lookup array has no padding entries and its order is the one used when writing.
		List<Integer> collected = new ArrayList<Integer>()
		for (def match : matches) {
			for (int p : match.getRange()) {
				collected.add(p)
			}
		}
		int npositions = collected.size()
		if (debug) println "npositions= $npositions"

		int[] positions = new int[npositions]
		for (int k = 0 ; k < npositions ; k++) {
			positions[k] = collected.get(k)
		}

		// Resolves the values of a word property for all collected positions.
		// Returns null when the property is absent or when there is nothing to look up.
		def valuesOf = { property ->
			if (property == null || npositions == 0) return null
			return CorpusManager.getCorpusManager().getCqiClient().cpos2Str(property.getQualifiedName(), positions)
		}

		List<String> words = valuesOf(word)
		List<String> idsList = valuesOf(idProperty)
		List<String> lemmes = valuesOf(lemme)
		List<String> poss = valuesOf(pos)

		File outfile = new File(outdir, text+".conllu")
		Writer writer = new OutputStreamWriter(new FileOutputStream(outfile), charset)
		try {
			int iposition = 0 // index in the looked-up lists, runs across all sentences of the text
			for (def match : matches) {

				// NOTE(review): the "# newdoc" comment is emitted before every sentence and the
				// sentence separator is written before (not after) the sentence; kept as-is so the
				// output layout does not change — confirm against the consumer of these files.
				writer.write("\n")
				writer.write("# newdoc id=$text\n")
				writer.write("# sent_id=$match\n")

				int id = 0 // first position in sentence
				// Iterate the same range that filled 'positions': one looked-up value is consumed
				// per collected position, which keeps iposition aligned from one sentence to the next.
				for (int p : match.getRange()) {

					String lemmestr = (lemmes != null) ? lemmes.get(iposition) : "_"
					String posstr = (poss != null) ? poss.get(iposition) : "_"
					String xmlid = (idsList != null) ? "XmlID="+idsList.get(iposition) : "_"

					writer.write(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t"+xmlid+"\n")

					id++ // next word
					iposition++ // next position
				}
			}
			writer.flush()
		} finally {
			// always release the file handle, even when a write fails
			writer.close()
		}
	}

	cpb.done()
	return true
}
|
88 |
|
|
|
110 |
|
89 |
111 |
public static void main(String[] args) {
	// Developer entry point: runs the export with hard-coded settings.
	// The corpus and all properties are null placeholders; the commented expressions
	// show how each of them would be obtained from a loaded corpus.
	// NOTE(review): the first argument given to process() is a .tsv file path — confirm
	// that this is what process() expects as its output location.

	String sentenceProperty = "s_id"
	String posProperty = "frpos"
	String lemmaProperty = "frlemma"
	String encoding = "UTF-8"

	File target = new File("/home/mdecorde/Bureau/conn_export.tsv")
	CQPCorpus corpus = null

	// "<structure>_<property>" -> [structure, property]
	def parts = sentenceProperty.split("_", 2)

	StructuralUnitProperty sentenceUnit = null // corpus.getStructuralUnit(parts[0]).getProperty(parts[1])
	Property wordProp = null // corpus.getProperty("word")
	Property lemmaProp = null // corpus.getProperty(lemmaProperty)
	Property posProp = null // corpus.getProperty(posProperty)

	// Sanity checks to enable once the placeholders above are replaced by real lookups:
	// if (sentenceUnit == null) { println "Error sentence property: $sentenceProperty"; return}
	// if (wordProp == null) { println "Error no word property"; return}
	// if (posProp == null) { println "Error pos property: $posProperty"; return}
	// if (lemmaProp == null) { println "Error lemma property: $lemmaProperty"; return}

	new ToCoNLL2009().process(target, corpus, sentenceUnit, wordProp, lemmaProp, posProp, encoding)
}
|