root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / hyperbase / compiler.groovy @ 1000
History | View | Annotate | Download (6.9 kB)
1 | 321 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 321 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 321 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 321 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 321 | mdecorde | //
|
6 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 321 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 321 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 321 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 321 | mdecorde | // later version.
|
11 | 321 | mdecorde | //
|
12 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 321 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 321 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 321 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 321 | mdecorde | // details.
|
17 | 321 | mdecorde | //
|
18 | 321 | mdecorde | // You should have received a copy of the GNU General
|
19 | 321 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 321 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 321 | mdecorde | //
|
22 | 321 | mdecorde | //
|
23 | 321 | mdecorde | //
|
24 | 479 | mdecorde | // $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
|
25 | 321 | mdecorde | // $LastChangedRevision: 3219 $
|
26 | 321 | mdecorde | // $LastChangedBy: mdecorde $
|
27 | 321 | mdecorde | //
|
28 | 986 | mdecorde | package org.txm.scripts.importer.hyperbase;
|
29 | 321 | mdecorde | |
30 | 1000 | mdecorde | import org.txm.importer.cwb.CwbEncode |
31 | 1000 | mdecorde | import org.txm.importer.cwb.CwbMakeAll |
32 | 986 | mdecorde | import org.txm.scripts.importer.*; |
33 | 321 | mdecorde | import org.txm.scripts.*; |
34 | 1000 | mdecorde | import org.txm.importer.scripts.xmltxm.*; |
35 | 321 | mdecorde | import org.txm.utils.treetagger.TreeTagger; |
36 | 321 | mdecorde | |
37 | 321 | mdecorde | import javax.xml.stream.*; |
38 | 321 | mdecorde | import java.net.URL; |
39 | 321 | mdecorde | import java.io.File; |
40 | 321 | mdecorde | import java.util.HashMap; |
41 | 321 | mdecorde | import java.util.List; |
42 | 321 | mdecorde | |
43 | 321 | mdecorde | // TODO: Auto-generated Javadoc
|
/**
 * Builds a CWB corpus from the XML-TXM files of a Hyperbase import.
 *
 * {@link #run(File, File, String)} writes one ".cqp" file made of a
 * "txmcorpus" opening tag, the output of XMLTXM2CQP for every file of the
 * TXM directory, and the closing tag; it then hands that file to the
 * CwbEncode and CwbMakeAll wrappers.
 */
class compiler
{

	/** Debug flag forwarded to CwbEncode and CwbMakeAll in run(). */
	private boolean debug= false;

	/** Stream opened from {@link #url} by the 4-argument constructor. */
	private def inputData;

	/** XMLInputFactory used to create {@link #parser}. */
	private def factory;

	/** StAX reader over {@link #inputData}; stays null when it could not be created. */
	private XMLStreamReader parser;

	/** Not used by the code of this class. */
	private def dir;

	/** Writer opened by createOutput(). */
	private def output;

	/** URL given to the 4-argument constructor. */
	private def url;

	/** Not used by the code of this class. */
	private HashMap<String,String> anahash = new HashMap<String,String>() ;

	/** Text value given to the constructor. */
	String text="";

	/** Base value given to the constructor. */
	String base="";

	/** Project value given to the constructor. */
	String project="";

	/** Text attributes; never assigned by this class, null by default. */
	String[] textAttributes = null;

	/** Language written in the "lang" attribute of "txmcorpus" and given to XMLTXM2CQP. */
	private String lang ="fr";

	/** Flag stored by setAnnotationSuccess(); not read by this class. */
	boolean annotationSuccess = false;

	/**
	 * Creates a compiler with default settings and no input stream.
	 */
	public compiler(){}

	/**
	 * Creates a compiler and opens a StAX reader on the given URL.
	 *
	 * Failures to open or parse the URL are reported on the console and
	 * leave the parser unset; they are not rethrown.
	 *
	 * @param url the url to read
	 * @param text the text
	 * @param base the base
	 * @param project the project
	 */
	public compiler(URL url,String text,String base, String project)
	{
		this.text = text
		this.base = base;
		this.project = project;
		// textAttributes is not a constructor argument: it keeps its default value
		this.url = url;
		try {
			inputData = url.openStream();

			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);
		} catch (XMLStreamException ex) {
			System.out.println(ex);
			// the stream is of no use without a reader: release it
			if (inputData != null) {
				try {
					inputData.close();
				} catch (IOException ignored) {
					// nothing more can be done for a stream that is being dropped
				}
				inputData = null;
			}
		} catch (IOException ex) {
			System.err.println("IOException while parsing "+url+": "+ex);
		}
	}

	/**
	 * Sets the language of the corpus.
	 *
	 * @param lang the language code
	 * @return the value assigned
	 */
	public setLang(String lang)
	{
		this.lang = lang;
	}

	/**
	 * Sets the annotation success flag.
	 *
	 * @param value the new value
	 * @return the value assigned
	 */
	public setAnnotationSuccess(boolean value)
	{
		this.annotationSuccess = value
	}

	/**
	 * Opens {@link #output} as an UTF-8 writer on the given file.
	 *
	 * The file is opened in append mode when it already exists and is
	 * created otherwise.
	 *
	 * @param f the file to write to
	 * @return true if the writer was opened, false if an exception occurred
	 */
	private boolean createOutput(File f){
		try {
			output = new OutputStreamWriter(new FileOutputStream(f,f.exists()) , "UTF-8");
			return true;
		} catch (Exception e) {
			System.err.println(e);
			return false;
		}
	}

	/**
	 * Opens the file with createOutput(), writes the content and closes the
	 * writer, even when the write fails.
	 *
	 * @param f the file to write to
	 * @param content the text to write
	 * @return false if the file could not be opened
	 */
	private boolean writeAndClose(File f, String content) {
		if (!createOutput(f)) {
			return false;
		}
		try {
			output.write(content);
		} finally {
			output.close();
		}
		return true;
	}

	/**
	 * Returns the file name without its last extension.
	 *
	 * @param filename a file name, without directory
	 * @return the part before the last dot, or the whole name when it has no extension
	 */
	private static String stripExtension(String filename) {
		int dot = filename.lastIndexOf(".");
		return (dot > 0) ? filename.substring(0, dot) : filename;
	}

	/**
	 * Moves the parser just after the end tag of "teiHeader", or to the end
	 * of the document when there is none.
	 */
	private void GoToText()
	{
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
		{
			if (event == XMLStreamConstants.END_ELEMENT) {
				if (parser.getLocalName().equals("teiHeader")) {
					return;
				}
			}
		}
	}

	/**
	 * Builds the CQP file from the files of txmDir, then encodes and
	 * indexes it with CWB.
	 *
	 * The "cqp" and "data" sub-directories of binDir are deleted and
	 * re-created; the "registry" sub-directory is created if needed.
	 *
	 * @param binDir the binary directory of the corpus, must exist
	 * @param txmDir the directory containing the files to compile
	 * @param corpusname the name of the corpus
	 * @return true, if successful
	 */
	public boolean run(File binDir, File txmDir, String corpusname)
	{
		String rootDir = binDir.getAbsolutePath();

		if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
			println ("Error: CWB executables not well set.")
			return false;
		}
		if (!binDir.exists()) {
			println ("binary directory does not exists: "+rootDir)
			return false;
		}
		File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
		new File(binDir,"cqp").deleteDir();
		new File(binDir,"cqp").mkdir();
		new File(binDir,"data").deleteDir();
		new File(binDir,"data").mkdir();
		new File(binDir,"registry").mkdir();

		//start corpus
		if (!writeAndClose(cqpFile, "<txmcorpus lang=\""+lang+"\">\n")) {
			println ("Error: cannot write the CQP file: "+cqpFile)
			return false;
		}

		// listFiles() returns null when txmDir is not a readable directory
		File[] listed = txmDir.listFiles();
		if (listed == null) {
			println ("Error: cannot list the files of the directory: "+txmDir)
			return false;
		}
		List<File> files = new ArrayList<File>(Arrays.asList(listed));
		Collections.sort(files);

		//1- Transform into CQP file
		XMLTXM2CQP cqpbuilder = null;
		println("Compiling "+files.size()+" files")
		for (File f : files) {
			print "."
			if (!f.exists()) {
				println("file "+f+ " does not exists")
				continue;
			}
			cqpbuilder = new XMLTXM2CQP(f.toURI().toURL());
			String txtname = stripExtension(f.getName());
			cqpbuilder.setTextInfo(txtname, corpusname, "project");

			cqpbuilder.setBalisesToKeep(["text","p","s"]);
			cqpbuilder.setSendToPAttributes(["s":["n"]]);
			cqpbuilder.setLang(lang);
			if (!cqpbuilder.transformFile(cqpFile)) {
				println("Failed to compile "+f)
			}
		}
		println ""

		//end corpus
		if (!writeAndClose(cqpFile, "</txmcorpus>\n")) {
			println ("Error: cannot write the CQP file: "+cqpFile)
			return false;
		}

		// no file was processed: there is no attribute list to give to CWB
		if (cqpbuilder == null) return false;

		//2- Import into CWB
		CwbEncode cwbEn = new CwbEncode();
		cwbEn.setDebug(debug);
		CwbMakeAll cwbMa = new CwbMakeAll();
		cwbMa.setDebug(debug);

		// the attribute lists are those of the last file processed
		List<String> pAttributesList = cqpbuilder.getpAttributs();
		List<String> sAttributesList = cqpbuilder.getsAttributs();
		println "word properties : "+pAttributesList
		println "structures : "+sAttributesList
		String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
		String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

		try {
			// Locale.ROOT: the file name must not depend on the default locale
			String regPath = rootDir + "/registry/"+corpusname.toLowerCase(Locale.ROOT);
			cwbEn.run(rootDir + "/data/$corpusname", rootDir + "/cqp/"+corpusname+".cqp", regPath,pAttributes, sAttributes);
			if (!new File(regPath).exists()) {
				println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
				return false;
			}
			cwbMa.run(corpusname, rootDir + "/registry");

		} catch (Exception ex) {System.out.println(ex); return false;}

		return true;
	}

	/**
	 * Turns the debug mode on.
	 */
	public void setDebug()
	{
		this.debug = true;
	}

	/**
	 * Runs the compiler from the command line.
	 *
	 * @param args optional: binary directory, TXM directory, corpus name.
	 *        Defaults: "xml/geo" under the user home, its "txm"
	 *        sub-directory and "geo".
	 */
	public static void main(String[] args)
	{
		int argc = (args == null) ? 0 : args.length;
		// "~" is a shell notation that java.io.File does not expand
		File home = new File(System.getProperty("user.home"));
		File binDir = (argc > 0) ? new File(args[0]) : new File(home, "xml/geo");
		// NOTE(review): the "txm" sub-directory default is an assumption - confirm the layout
		File txmDir = (argc > 1) ? new File(args[1]) : new File(binDir, "txm");
		String corpusname = (argc > 2) ? args[2] : "geo";

		def c = new compiler();
		c.setDebug();
		if (!c.run(binDir, txmDir, corpusname)) {
			System.err.println("Compilation of "+corpusname+" failed");
		}
	}
}