root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / nlp / testTnT.groovy @ 1000
History | View | Annotate | Download (17.6 kB)
1 | 321 | mdecorde | /**
|
---|---|---|---|
2 | 321 | mdecorde | * Main.
|
3 | 321 | mdecorde | *
|
4 | 321 | mdecorde | * @param args the args
|
5 | 321 | mdecorde | */
|
6 | 321 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
7 | 321 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
8 | 321 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
9 | 321 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
10 | 321 | mdecorde | //
|
11 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it
|
12 | 321 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
13 | 321 | mdecorde | // License as published by the Free Software Foundation,
|
14 | 321 | mdecorde | // either version 2 of the License, or (at your option) any
|
15 | 321 | mdecorde | // later version.
|
16 | 321 | mdecorde | //
|
17 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
18 | 321 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
19 | 321 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
20 | 321 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
21 | 321 | mdecorde | // details.
|
22 | 321 | mdecorde | //
|
23 | 321 | mdecorde | // You should have received a copy of the GNU General
|
24 | 321 | mdecorde | // Public License along with the TXM platform. If not, see
|
25 | 321 | mdecorde | // http://www.gnu.org/licenses.
|
26 | 321 | mdecorde | //
|
27 | 321 | mdecorde | //
|
28 | 321 | mdecorde | //
|
29 | 479 | mdecorde | // $LastChangedDate: 2016-03-29 09:51:35 +0200 (mar. 29 mars 2016) $
|
30 | 321 | mdecorde | // $LastChangedRevision: 3185 $
|
31 | 321 | mdecorde | // $LastChangedBy: mdecorde $
|
32 | 321 | mdecorde | //
|
33 | 1000 | mdecorde | package org.txm.scripts.scripts;
|
34 | 321 | mdecorde | |
35 | 321 | mdecorde | /**
|
36 | 321 | mdecorde | * test tnt Wrapper :
|
37 | 321 | mdecorde | * add NLP annotations
|
38 | 321 | mdecorde | * build result matrix
|
39 | 321 | mdecorde | *
|
40 | 321 | mdecorde | */
|
41 | 321 | mdecorde | |
42 | 321 | mdecorde | import org.txm.utils.*; |
43 | 986 | mdecorde | import org.txm.scripts.importer.*; |
44 | 986 | mdecorde | import org.txm.scripts.importer.RGAQCJ.*; |
45 | 321 | mdecorde | |
46 | 321 | mdecorde | // TODO: Auto-generated Javadoc
|
47 | 321 | mdecorde | /* (non-Javadoc)
|
48 | 321 | mdecorde | * @see groovy.lang.Script#run()
|
49 | 321 | mdecorde | */
|
// Working directories of the TnT experiment.
// The JVM does not expand a leading "~" (that is a shell feature), so the
// user's home directory is resolved explicitly.
String rootDir = System.getProperty("user.home") + "/xml/rgaqcj/TnT/"
String modelsDir = rootDir + "models/"   // receives the training files (word + cat)
String textsDir = rootDir + "texts/"     // receives the files to tag (word only)
String projsDir = rootDir + "proj/"      // receives the tagger outputs
String srcDir = rootDir + "src/"
String xslDir = rootDir + "xsl/"
String anaDir = rootDir + "anainline/"
// mkdirs() also creates missing parent directories; mkdir() fails silently without them
[modelsDir, textsDir, projsDir].each { String dir -> new File(dir).mkdirs() }

// Names of the texts processed by every step of this script
def texts = ["roland","artu","qjm","commyn1","jehpar","rgaqcj"]
//def texts =["commyn1","jehpar"]
// One letter per entry of texts; the entry flagged "Z" is decoded without
// start/end bounds by the extraction loop further down
String initiales = "RAQCJZ"

// Source, annotated and stylesheet file names used by the (currently disabled) XSL import step
def srcfiles = ["roland.xml","artu.xml","qjm.xml","commyn1.xml","jehpar.xml"]
def anafiles = ["roland-ana.xml","artu-ana.xml","qjm-ana.xml","commyn1-ana.xml","jehpar-ana.xml"]
def xslfiles = ["bfm2txm-w.xsl","bfm2txm-w.xsl","bfm2txm-w.xsl","bfm2txm-w.xsl","bfm2txm-w.xsl"]
69 | 321 | mdecorde | /*
|
70 | 321 | mdecorde | for(int i=0 ; i < xslfiles.size() ; i++)
|
71 | 321 | mdecorde | {
|
72 | 321 | mdecorde | String xslfile = xslDir+xslfiles[i];
|
73 | 321 | mdecorde | String infile = srcDir+srcfiles[i];
|
74 | 321 | mdecorde | String outfile = anaDir+anafiles[i];
|
75 | 321 | mdecorde | println("create xml-txm from "+srcfiles[i]);
|
76 | 321 | mdecorde | ApplyXsl a = new ApplyXsl(xslfile);
|
77 | 321 | mdecorde | a.process(infile,outfile);
|
78 | 321 | mdecorde | }
|
79 | 321 | mdecorde | //Import to CWB
|
80 | 714 | mdecorde | BuildXmlRGAQCJ.process( anafiles, rootDir)
|
81 | 321 | mdecorde | */
|
// Build the CwbDecode wrapper from its XML definition.
// NOTE(review): the wrapper is generated under org/textometrie/scripts but is
// loaded below from org/txm/scripts -- confirm which location is the intended one.
ProcessBuilderBuilder.build(new File("src/groovy/org/txm/scripts/cwb-decode-wrapper-definition.xml"), new File("src/groovy/org/textometrie/scripts/CwbDecode.groovy"))
GroovyClassLoader gcl = new GroovyClassLoader()
gcl.addClasspath(".")

// rootDir already ends with "/", so no extra separator is needed
String registryPath = rootDir + "registry"
// The JVM does not expand "~": resolve the home directory explicitly
String cwbdecodeexecDir = System.getProperty("user.home") + "/Bureau/trunkCWB/cwb-3.0/utils/"

// Compile the wrapper source and instantiate it with the directory of the executables
Class clazz = gcl.parseClass(new File("src/groovy/org/txm/scripts/CwbDecode.groovy"))
def cwbdecodewrapper = clazz.newInstance(cwbdecodeexecDir)
def pAttributesTrain = ["word","cat"]   // attributes decoded for the training files
def pAttributesTag = ["word"]           // attributes decoded for the files to tag
def sAttributes = ["s"]
cwbdecodewrapper.setC()
cwbdecodewrapper.setP(pAttributesTag)
cwbdecodewrapper.setS(sAttributes)
cwbdecodewrapper.setr(registryPath)
cwbdecodewrapper.debug(true)
// Bounds passed to sets()/sete() when decoding text i: nbmots[i] .. nbmots[i+1]
// (presumably cumulative word counts of the texts -- TODO confirm)
int[] nbmots = [0,35306,134170,173526,198929,228149]
101 | 321 | mdecorde | |
// Create, for every text, the file to tag (texts dir) and the training file (models dir).

// Runs cwb-decode on the RGAQCJ corpus while System.out is redirected to the
// file at 'path'. The stream is always closed and System.out always restored,
// even when the wrapper throws: an open handle would make the later
// delete/rename of these files fail on Windows.
def decodeToFile = { String path ->
    PrintStream previousOut = System.out
    PrintStream redirected = new PrintStream(new FileOutputStream(path))
    System.setOut(redirected)
    try {
        if (System.getProperty("os.name").contains("Windows"))
            cwbdecodewrapper.cwbdecodeexe("RGAQCJ")
        else
            cwbdecodewrapper.cwbdecode("RGAQCJ")
    } finally {
        System.setOut(previousOut)
        redirected.close()
    }
}

for (int i = 0 ; i < texts.size() ; i++) {
    String text = texts[i]
    println("creation TTsrc "+text)

    // file to tag: word attribute only
    cwbdecodewrapper.setP(pAttributesTag)

    if (!(initiales[i]+"").equals("Z")) {
        // every text but the first starts one position after the previous bound
        cwbdecodewrapper.sets(i > 0 ? nbmots[i]+1 : nbmots[i])
        cwbdecodewrapper.sete(nbmots[i+1])
    } else {
        // the "Z" entry is decoded without bounds
        cwbdecodewrapper.unsets()
        cwbdecodewrapper.unsete()
    }
    decodeToFile(textsDir+text+".t")

    // training file: word and cat attributes, same bounds
    cwbdecodewrapper.setP(pAttributesTrain)
    decodeToFile(modelsDir+text+".t")
}
149 | 321 | mdecorde | |
// The decoded files still contain sentence tags: "<s" lines must be dropped
// and "</s" lines replaced by an empty line.
String encoding = "UTF-8"

// Rewrites 'file' in place through a temporary file. The temporary file is
// written with the same encoding the source is read with (a plain FileWriter
// would use the platform default), and both streams are closed by the GDK
// helpers even when an exception is thrown.
def stripSentenceTags = { File file ->
    File temp = new File("tempFileCVScleaner")
    temp.withWriter(encoding) { writer ->
        file.eachLine(encoding) { String line ->
            String trimmed = line.trim()
            if (trimmed.startsWith("</s"))
                writer.write("\n")
            else if (!trimmed.startsWith("<s"))
                writer.write(line+"\n")
            // opening sentence tags are skipped
        }
    }
    if (!(file.delete() && temp.renameTo(file))) println "Warning can't rename file "+temp+" to "+file
}

// patch the files to tag
for (String text : texts) {
    File f = new File(textsDir, text+".t")
    println("patch texts files "+f+": rmv <s> and replace </s>")
    stripSentenceTags(f)
}

// patch the training files
for (String text : texts) {
    File f = new File(modelsDir, text+".t")
    println("patch models files "+f+": rmv <s> and replace </s>")
    stripSentenceTags(f)
}
195 | 321 | mdecorde | |
// Load the TnT wrapper (its generation from the XML definition is disabled)
//ProcessBuilderBuilder.build(new File("src/groovy/org/txm/scripts/tnt-wrapper-definition.xml"), new File("src/groovy/org/textometrie/scripts/TnT.groovy"));
gcl = new GroovyClassLoader()
gcl.addClasspath(".")
// The JVM does not expand "~": resolve the home directory explicitly
String tntexecDir = System.getProperty("user.home") + "/Bureau/tnt/"
clazz = gcl.parseClass(new File("src/groovy/org/txm/scripts/TnT.groovy"))
def tntwrapper = clazz.newInstance(tntexecDir)

// Names of the projection files produced below, in creation order
def ttrezfiles = []
for (String text : texts) {
    // train a model on 'text'
    println "Apprentissage de "+text+"..."
    tntwrapper.tntpara(new File(modelsDir,text+".t"))
    //new File(text+".123").renameTo(new File(modelsDir,text+".123"));
    //new File(text+".lex").renameTo(new File(modelsDir,text+".lex"));
    for (String target : texts) {
        // the model trained on 'text' is applied to 'target'
        println "Projection de "+text+" sur "+target+"..."
        for (int mode=3 ; mode <=3 ; mode++) {
            println("Mode "+mode+"...")
            tntwrapper.setu(2)

            String projName = "model_"+text+"_target_"+target+"_mode"+mode+".t"
            // the tagger output is captured by redirecting System.out to the projection file
            PrintStream previousOut = System.out
            PrintStream redirected = new PrintStream(new FileOutputStream(projsDir+projName))
            ttrezfiles.add(projName)
            System.setOut(redirected)
            try {
                tntwrapper.tnt(text, new File(textsDir,target+".t"))
            } finally {
                // always restore the console and release the file handle
                System.setOut(previousOut)
                redirected.close()
            }
        }
    }
}
231 | 321 | mdecorde | |
232 | 321 | mdecorde | //def ttrezfiles = ["model_roland_targetroland_mode2.t", "model_roland_targetroland_mode3.t", "model_roland_targetartu_mode2.t", "model_roland_targetartu_mode3.t", "model_roland_targetqjm_mode2.t", "model_roland_targetqjm_mode3.t", "model_roland_targetcommyn1_mode2.t", "model_roland_targetcommyn1_mode3.t", "model_roland_targetjehpar_mode2.t", "model_roland_targetjehpar_mode3.t", "model_roland_targetrgaqcj_mode2.t", "model_roland_targetrgaqcj_mode3.t", "model_artu_targetroland_mode2.t","model_artu_targetroland_mode3.t", "model_artu_targetartu_mode2.t", "model_artu_targetartu_mode3.t", "model_artu_targetqjm_mode2.t", "model_artu_targetqjm_mode3.t", "model_artu_targetcommyn1_mode2.t", "model_artu_targetcommyn1_mode3.t", "model_artu_targetjehpar_mode2.t", "model_artu_targetjehpar_mode3.t", "model_artu_targetrgaqcj_mode2.t", "model_artu_targetrgaqcj_mode3.t", "model_qjm_targetroland_mode2.t, model_qjm_targetroland_mode3.t, model_qjm_targetartu_mode2.t, model_qjm_targetartu_mode3.t, model_qjm_targetqjm_mode2.t, model_qjm_targetqjm_mode3.t, model_qjm_targetcommyn1_mode2.t, model_qjm_targetcommyn1_mode3.t, model_qjm_targetjehpar_mode2.t, model_qjm_targetjehpar_mode3.t, model_qjm_targetrgaqcj_mode2.t, model_qjm_targetrgaqcj_mode3.t, model_commyn1_targetroland_mode2.t, model_commyn1_targetroland_mode3.t, model_commyn1_targetartu_mode2.t, model_commyn1_targetartu_mode3.t, model_commyn1_targetqjm_mode2.t, model_commyn1_targetqjm_mode3.t, model_commyn1_targetcommyn1_mode2.t, model_commyn1_targetcommyn1_mode3.t, model_commyn1_targetjehpar_mode2.t, model_commyn1_targetjehpar_mode3.t, model_commyn1_targetrgaqcj_mode2.t, model_commyn1_targetrgaqcj_mode3.t, model_jehpar_targetroland_mode2.t, model_jehpar_targetroland_mode3.t, model_jehpar_targetartu_mode2.t, model_jehpar_targetartu_mode3.t, model_jehpar_targetqjm_mode2.t, model_jehpar_targetqjm_mode3.t, model_jehpar_targetcommyn1_mode2.t, model_jehpar_targetcommyn1_mode3.t, model_jehpar_targetjehpar_mode2.t, 
model_jehpar_targetjehpar_mode3.t, model_jehpar_targetrgaqcj_mode2.t, model_jehpar_targetrgaqcj_mode3.t, model_rgaqcj_targetroland_mode2.t, model_rgaqcj_targetroland_mode3.t, model_rgaqcj_targetartu_mode2.t, model_rgaqcj_targetartu_mode3.t, model_rgaqcj_targetqjm_mode2.t, model_rgaqcj_targetqjm_mode3.t, model_rgaqcj_targetcommyn1_mode2.t, model_rgaqcj_targetcommyn1_mode3.t", "model_rgaqcj_targetjehpar_mode2.t", "model_rgaqcj_targetjehpar_mode3.t", "model_rgaqcj_targetrgaqcj_mode2.t", "model_rgaqcj_targetrgaqcj_mode3.t"];
|
println ttrezfiles
// Clean every projection file in place: lines starting with "%%" and empty
// lines are dropped, runs of tabs are collapsed into a single tab.
for (String text : ttrezfiles) {
    println "patch proj file"+text+" : remove %% lines and blank lines AND replace n\t by only one \t"
    File f = new File(projsDir,text)
    File temp = new File("tempFileCVScleaner")
    // write with the encoding used for reading (a plain FileWriter would use the
    // platform default); the GDK helpers close both streams even on failure
    temp.withWriter(encoding) { writer ->
        f.eachLine(encoding) { String line ->
            boolean dropped = line.trim().startsWith("%%") || line.length() == 0
            if (!dropped)
                writer.write(line.replaceAll("(\t)+","\t")+"\n")
        }
    }
    if (!(f.delete() && temp.renameTo(f))) println "Warning can't rename file "+temp+" to "+f
}
255 | 321 | mdecorde | |
// Remove the empty lines of every training file, in place.
for (String text : texts) {
    println "patch model file"+text+" : remove blank lines"
    File f = new File(modelsDir,text+".t")
    File temp = new File("tempFileCVScleaner")
    // write with the encoding used for reading (a plain FileWriter would use the
    // platform default); the GDK helpers close both streams even on failure
    temp.withWriter(encoding) { writer ->
        f.eachLine(encoding) { String line ->
            if (line.length() != 0)
                writer.write(line+"\n")
        }
    }
    if (!(f.delete() && temp.renameTo(f))) println "Warning can't rename file "+temp+" to "+f
}
275 | 321 | mdecorde | |
// Start from an empty HSQL store
HSQLFunctions.clearAll()

// One table per projection file; the table name is the file name without dots
ttrezfiles.each { projFile ->
    String tableName = projFile.replace(".","")
    File csv = new File(rootDir+"proj/"+projFile)
    // fresh column lists for every call, as the import receives them as arguments
    int imported = HSQLFunctions.ImportOrderedCSVTable(tableName, ["form","cat"], ["VARCHAR(30)","VARCHAR(30)"], csv, "\t", "UTF-8")
    println("create Table "+tableName+" : "+imported+" lines")
}

// One "lexbrut_<text>" table per training file
texts.each { String name ->
    String tableName = "lexbrut_"+name.replace(".","")
    File trainingFile = new File(modelsDir, name+".t")
    int imported = HSQLFunctions.ImportOrderedCSVTable(tableName, ["form","cat"], ["VARCHAR(30)","VARCHAR(30)"], trainingFile, "\t", "UTF-8")
    println("create Table "+tableName+" : "+imported+" lines")
}
295 | 321 | mdecorde | |
// Lexical richness tables. Each one starts with a "Text" header row followed
// by one empty row per text, in the order of 'texts'.
def newRichnessTable = { ->
    LinkedHashMap<String,ArrayList<String>> table = new LinkedHashMap<String,ArrayList<String>>()
    table.put("Text", ["TTrola","TTartu","TTqjm","TTcomm","TTjehpar","TTrgaqcj"])
    texts.each { String name -> table.put(name, new ArrayList<String>()) }
    return table
}

LinkedHashMap<String,ArrayList<String>> richesses = newRichnessTable()        // distinct forms
LinkedHashMap<String,ArrayList<String>> richessesCat = newRichnessTable()     // distinct categories
LinkedHashMap<String,ArrayList<String>> richessesocc = newRichnessTable()     // form occurrences
LinkedHashMap<String,ArrayList<String>> richessesoccCat = newRichnessTable()  // category occurrences
315 | 321 | mdecorde | |
// For every ordered pair (text, model), count what 'text' contains that
// 'model' does not: distinct forms, distinct categories, form occurrences and
// category occurrences. A text compared with itself gets "0" everywhere.

// Runs a count query and appends each returned value, as a string, to 'row'
def appendCount = { String sql, String label, List<String> row ->
    HSQLFunctions.getGroovySql().eachRow(sql) { result ->
        def rich = result.getAt(0)
        println(label+ rich)
        row.add( ""+rich )
    }
}

for (String text : texts) {
    for (String model : texts) {
        // plain equality: String.matches() would interpret 'model' as a regular expression
        if (text == model) {
            richesses.get(text).add( "0" )
            richessesCat.get(text).add( "0" )
            richessesocc.get(text).add( "0" )
            richessesoccCat.get(text).add( "0" )
        } else {
            String query = "SELECT count(*) FROM ((SELECT DISTINCT form FROM lexbrut_"+text+" ) MINUS (SELECT DISTINCT form FROM lexbrut_"+model+"))"
            println(query)
            String query2 = "SELECT count(*) FROM ((SELECT DISTINCT cat FROM lexbrut_"+text+" ) MINUS (SELECT DISTINCT cat FROM lexbrut_"+model+"))"
            println(query2)
            String query3 = "SELECT count(form) FROM lexbrut_"+text+" WHERE form NOT IN (SELECT form FROM lexbrut_"+model+")"
            println(query3)
            String query4 = "SELECT count(cat) FROM lexbrut_"+text+" WHERE cat NOT IN (SELECT cat FROM lexbrut_"+model+")"
            println(query4)

            appendCount(query, "rich form :", richesses.get(text))
            appendCount(query2, "rich cat :", richessesCat.get(text))
            appendCount(query3, "richocc form :", richessesocc.get(text))
            appendCount(query4, "richocc cat :", richessesoccCat.get(text))
        }
    }
}
364 | 321 | mdecorde | |
// Error matrix: a "Text" header row, then one row per text
LinkedHashMap<String,ArrayList<String>> matrix = new LinkedHashMap<String,ArrayList<String>>()

matrix.put("Text", ["T ","Vf ","Vc ","TTrola","TTartu","TTqjm","TTcomm","TTjehpar","TTrgaqcj"])
for (String text : texts) {
    matrix.put(text, new ArrayList<String>())
}

// Raw counts per text, in the order of 'texts'. The values are stored as
// strings (they are parsed back with Integer.parseInt when the error matrix is
// filled), so the lists are declared with the element type they really hold.
ArrayList<String> countOccForm = new ArrayList<String>()   // form occurrences
ArrayList<String> countOccCat = new ArrayList<String>()    // category occurrences
ArrayList<String> countLexForm = new ArrayList<String>()   // distinct forms
ArrayList<String> countLexCat = new ArrayList<String>()    // distinct categories

for (String text : texts) {
    println("count occ and voc : lexbrut_"+text)
    String query1 = "SELECT count(*) FROM (SELECT form from lexbrut_"+text+")"
    String query2 = "SELECT count(*) FROM (SELECT cat from lexbrut_"+text+")"
    String query3 = "SELECT count(*) FROM (SELECT DISTINCT form from lexbrut_"+text+")"
    String query4 = "SELECT count(*) FROM (SELECT DISTINCT cat from lexbrut_"+text+")"

    HSQLFunctions.getGroovySql().eachRow(query1) { result ->
        def rich = result.getAt(0)
        println("count occ form :"+ rich)
        countOccForm.add( ""+rich )
    }
    HSQLFunctions.getGroovySql().eachRow(query2) { result ->
        def rich = result.getAt(0)
        println("coutn occ cat :"+ rich)
        countOccCat.add( ""+rich )
    }
    HSQLFunctions.getGroovySql().eachRow(query3) { result ->
        def rich = result.getAt(0)
        println("count lex form :"+ rich)
        countLexForm.add( ""+rich )
    }
    HSQLFunctions.getGroovySql().eachRow(query4) { result ->
        def rich = result.getAt(0)
        println("count lex cat :"+ rich)
        countLexCat.add( ""+rich )
    }
    //println("col1 et 2 : "+nbOccurencesTxt.get("lexbrut_"+text)+" "+nbFormLexique.get("lex_"+text))
    // the first three columns of the row (T, Vf, Vc) are filled with "0"
    matrix.get(text).add( "0" )
    matrix.get(text).add( "0" )
    matrix.get(text).add( "0" )
}
411 | 321 | mdecorde | |
// Fill the error matrix. For every model ('text') and every target, the
// ordered rows of the reference table lexbrut_<target> are compared with the
// ordered rows of the projection table; the number of differing rows, as a
// percentage of the target's form occurrences, is appended to the target's row.
for (String text : texts) {
    texts.eachWithIndex { String target, int targetIndex ->
        // table name of the projection: its file name without the dot
        String proj = "model_"+text+"_target_"+target+"_mode"+3+"t"
        println("comp "+proj+" VS lexbrut_"+target)
        String query = "SELECT count(*) FROM ((SELECT n,form,cat FROM lexbrut_"+target+" ) MINUS (SELECT n,form,cat FROM "+proj+"))"

        HSQLFunctions.getGroovySql().eachRow(query) { result ->
            def dif = result.getAt(0)
            println("textindex : "+targetIndex)
            // counts are stored as strings, in the order of 'texts'
            int total = Integer.parseInt(countOccForm[targetIndex])
            println("dif "+dif+"/ tot "+total+" = "+((float)dif/(float)total))
            Float perf = ((float)dif/(float)total)*100f
            matrix.get(target).add( ""+perf )
        }
    }
}
449 | 321 | mdecorde | |
// Prints a title, then one line per entry of 'table': the key followed by its
// values, separated by tabs.
def printTable = { String title, Map table ->
    println(title)
    table.each { key, cells ->
        println(([key] + cells).join("\t"))
    }
}

// Formats a list of counts as a run of tab-prefixed values
def tabbed = { List counts -> counts.collect { "\t"+it }.join("") }

printTable("Matrice d'erreur : ", matrix)
printTable("richesse lexiques en Form : ", richesses)
printTable("richesse lexiques en Cat : ", richessesCat)

printTable("richesse occurance en Form : ", richessesocc)
printTable("richesse occurance en Cat : ", richessesoccCat)

// Raw counts, one line per measure and one column per text
println("RichBrutes roland artu qjm comm jehpar rgaqcj")
print("Focc"+tabbed(countOccForm))
print("\nCocc"+tabbed(countOccCat))
print("\nFvoc"+tabbed(countLexForm))
print("\nCvoc"+tabbed(countLexCat))
println()