root / tmp / org.txm.core / src / java / org / txm / scripts / importer / WExtractWithMode.groovy @ 1688
History | View | Annotate | Download (9.5 kB)
1 | 881 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 881 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 881 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 881 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 881 | mdecorde | //
|
6 | 881 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 881 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 881 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 881 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 881 | mdecorde | // later version.
|
11 | 881 | mdecorde | //
|
12 | 881 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 881 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 881 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 881 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 881 | mdecorde | // details.
|
17 | 881 | mdecorde | //
|
18 | 881 | mdecorde | // You should have received a copy of the GNU General
|
19 | 881 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 881 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 881 | mdecorde | //
|
22 | 881 | mdecorde | //
|
23 | 881 | mdecorde | //
|
24 | 881 | mdecorde | // $LastChangedDate: 2011-10-19 17:50:26 +0200 (mer., 19 oct. 2011) $
|
25 | 881 | mdecorde | // $LastChangedRevision: 2038 $
|
26 | 881 | mdecorde | // $LastChangedBy: alavrentev $
|
27 | 881 | mdecorde | //
|
28 | 1000 | mdecorde | package org.txm.scripts.importer
|
29 | 881 | mdecorde | |
30 | 881 | mdecorde | import javax.xml.parsers.DocumentBuilder; |
31 | 881 | mdecorde | import javax.xml.parsers.DocumentBuilderFactory; |
32 | 881 | mdecorde | import javax.xml.parsers.ParserConfigurationException; |
33 | 881 | mdecorde | import javax.xml.transform.OutputKeys; |
34 | 881 | mdecorde | import javax.xml.transform.Result; |
35 | 881 | mdecorde | import javax.xml.transform.Source; |
36 | 881 | mdecorde | import javax.xml.transform.Transformer; |
37 | 881 | mdecorde | import javax.xml.transform.TransformerFactory; |
38 | 881 | mdecorde | import javax.xml.transform.dom.DOMSource; |
39 | 881 | mdecorde | import javax.xml.transform.stream.StreamResult; |
40 | 881 | mdecorde | |
41 | 1370 | mdecorde | import org.txm.utils.io.IOUtils |
42 | 881 | mdecorde | import org.w3c.dom.Document; |
43 | 881 | mdecorde | import org.w3c.dom.Element; |
44 | 881 | mdecorde | import org.w3c.dom.NodeList; |
45 | 881 | mdecorde | import org.xml.sax.SAXException; |
46 | 881 | mdecorde | |
47 | 881 | mdecorde | import javax.xml.stream.*; |
48 | 881 | mdecorde | import java.io.File; |
49 | 881 | mdecorde | import java.net.URL; |
50 | 881 | mdecorde | |
51 | 881 | mdecorde | // TODO: Auto-generated Javadoc
|
/**
 * Extracts a sample of the w (word) elements of a TEI file.
 * Depending on the requested mode, the words are taken from the beginning,
 * the middle and/or the end of the file. This class is not finished.
 *
 * @author mdecorde
 */
|
58 | 881 | mdecorde | class WExtractWithMode |
59 | 881 | mdecorde | { |
60 | 881 | mdecorde | |
/**
 * Copies a TEI file to outfile, keeping only the w elements selected by the
 * extraction mode. Elements other than w are always copied.
 *
 * @param infile the TEI file to read
 * @param outfile the XML file to write
 * @param modemax the extraction mode and the number of words to keep, written
 * as mode/max (for example 3/45000). The mode is 1a (beginning), 1m (middle),
 * 1z (end), 2 (beginning and end) or 3 (beginning, middle and end)
 * @return nothing meaningful
 */
|
public process(File infile, File outfile, String modemax)
{
	println "Process "+infile.getName()+", keep $modemax words"

	// modemax is "<mode>/<max>", for example "3/45000". A malformed value is
	// reported instead of being silently turned into mode "" and max 0.
	String[] spec = (modemax ?: "").split("/")
	if (spec.length != 2 || !spec[1].trim().isInteger() || spec[1].trim().toInteger() < 0) {
		println "can't parse '$modemax': expected <mode>/<max> with max >= 0, for example 3/45000"
		return
	}
	String mode = spec[0].trim()
	int max = spec[1].trim().toInteger()

	int count = this.countW(infile)
	if (count < max) {
		println "can't extract $max words, the file "+infile.getName()+" contains only $count words"
		return
	}

	// Size of one window. intdiv() is used because "/" on two ints yields a
	// BigDecimal in Groovy.
	int part
	switch (mode) {
		case "3":
			part = max.intdiv(3)
			break
		case "2":
			part = max.intdiv(2)
			break
		case "1a":
		case "1m":
		case "1z":
			part = max
			break
		default:
			println "mode must be 1a, 1m, 1z, 2 or 3"
			return
	}

	// Windows are [first, last] pairs of 1-based word indexes, both inclusive,
	// each holding exactly 'part' words (an empty window has last < first).
	List windows = []
	if (mode in ["3", "2", "1a"]) {
		windows.add([1, part]) // beginning of the file
	}
	if (mode in ["3", "1m"]) {
		int first = (count - part).intdiv(2) + 1 // centred on the middle of the file
		windows.add([first, first + part - 1])
	}
	if (mode in ["3", "2", "1z"]) {
		windows.add([count - part + 1, count]) // end of the file, last word included
	}

	println " count : "+count
	windows.each { println " get from "+it[0]+" to "+it[1] }

	InputStream inputData = null
	XMLStreamReader parser = null
	OutputStream output = null
	XMLStreamWriter writer = null
	try {
		inputData = new FileInputStream(infile)
		parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData)
		output = new FileOutputStream(outfile)
		writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")

		writer.writeStartDocument("utf-8", "1.0")

		boolean inText = false // set at the first <text> start tag, never reset
		boolean seenW = false  // set at the first <w> start tag, never reset
		int wcount = 0         // 1-based index of the last <w> start tag read
		String localname

		// Only start tags, end tags and character data are copied; any other
		// event (comment, processing instruction, ...) is dropped.
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			switch (event) {
				case XMLStreamConstants.START_ELEMENT:
					localname = parser.getLocalName()
					if (localname == "text") {
						inText = true
					}
					if (localname == "w") {
						seenW = true
						wcount++
					}
					// non-w elements are always copied, w elements only when selected
					if (localname != "w" || isKept(inText, wcount, windows)) {
						copyStartElement(parser, writer)
					}
					break

				case XMLStreamConstants.END_ELEMENT:
					localname = parser.getLocalName()
					if (localname == "w") {
						if (isKept(inText, wcount, windows)) {
							writer.writeEndElement()
							writer.writeComment("\n")
						}
					}
					else {
						writer.writeEndElement()
						writer.writeCharacters("\n")
					}
					break

				case XMLStreamConstants.CHARACTERS:
					// Once a first w has been read, text is written only while the
					// current word index is selected, so the text that follows a
					// skipped word is skipped too.
					if (!seenW || isKept(inText, wcount, windows)) {
						writer.writeCharacters(parser.getText().trim())
					}
					break
			}
		}
		writer.writeEndDocument()
		writer.flush()
	}
	finally {
		// Closing the StAX reader/writer does not close the underlying streams,
		// so all four are closed here, whether or not the copy succeeded.
		if (writer != null) { try { writer.close() } catch (Exception ignored) { } }
		if (output != null) { try { output.close() } catch (Exception ignored) { } }
		if (parser != null) { try { parser.close() } catch (Exception ignored) { } }
		if (inputData != null) { try { inputData.close() } catch (Exception ignored) { } }
	}
	return
}

/**
 * Tells whether the word with the given index must be written.
 *
 * @param inText true once the text element has been opened
 * @param wordIndex 1-based index of the current word
 * @param windows list of [first, last] inclusive word index pairs
 * @return true before the text element, or when wordIndex is inside a window
 */
private static boolean isKept(boolean inText, int wordIndex, List windows)
{
	if (!inText) {
		return true
	}
	return windows.any { wordIndex >= it[0] && wordIndex <= it[1] }
}

/**
 * Writes the start tag of the element the parser is positioned on, followed
 * by the fixed namespace declarations of TEI and teiHeader and by a copy of
 * the element attributes.
 *
 * @param parser the reader, positioned on a START_ELEMENT event
 * @param writer the writer receiving the start tag
 */
private static void copyStartElement(XMLStreamReader parser, XMLStreamWriter writer)
{
	String localname = parser.getLocalName()
	String prefix = parser.getPrefix()
	writer.writeStartElement(prefix ? prefix+":"+localname : localname)

	if (localname == "teiHeader") {
		writer.writeAttribute("xmlns:me", "http://www.menota.org/ns/1.0")
		writer.writeAttribute("xmlns:bfm", "http://bfm.ens-lsh.fr/ns/1.0")
	}
	if (localname == "TEI") {
		writer.writeAttribute("xmlns", "http://www.tei-c.org/ns/1.0")
	}

	for (int i = 0; i < parser.getAttributeCount(); i++) {
		// the prefix of an unprefixed attribute may be null or "" depending on
		// the StAX implementation; Groovy truth covers both
		String attrPrefix = parser.getAttributePrefix(i)
		String attrName = parser.getAttributeLocalName(i)
		writer.writeAttribute(attrPrefix ? attrPrefix+":"+attrName : attrName, parser.getAttributeValue(i))
	}
}
301 | 881 | mdecorde | |
302 | 881 | mdecorde | /**
|
303 | 881 | mdecorde | * Count w.
|
304 | 881 | mdecorde | *
|
305 | 881 | mdecorde | * @param infile the infile
|
306 | 881 | mdecorde | * @return the int
|
307 | 881 | mdecorde | */
|
public int countW(File infile)
{
	// Counts every start tag whose local name is "w", whatever its prefix and
	// wherever it is in the document.
	InputStream inputData = new FileInputStream(infile)
	try {
		XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData)
		try {
			int count = 0
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				if (event == XMLStreamConstants.START_ELEMENT && parser.getLocalName() == "w") {
					count++
				}
			}
			return count
		}
		finally {
			// closing the reader does not close the underlying stream
			parser.close()
		}
	}
	finally {
		// released even when the file is not well-formed and parsing throws
		inputData.close()
	}
}
328 | 881 | mdecorde | |
329 | 881 | mdecorde | /**
|
330 | 881 | mdecorde | * The main method.
|
331 | 881 | mdecorde | *
|
332 | 881 | mdecorde | * @param args the arguments
|
333 | 881 | mdecorde | */
|
public static void main(String[] args)
{
	String userDir = System.getProperty("user.home")

	File directory = new File(userDir+"/xml/extract/")
	File outdir = new File(directory, "results")
	File maxfilemode = new File(directory, "maxfilemode")

	if (!maxfilemode.isFile()) {
		println "can't find the mode file: "+maxfilemode.getAbsolutePath()
		return
	}
	outdir.mkdir()

	/*
	 * maxfilemode format: one file per line, three tab-separated columns
	 * (file name, mode, number of words to keep). Other lines are ignored.
	 *
	 * filename1.xml	3	45000
	 * filename2.xml	1a	15000
	 * filename3.xml	1m	15000
	 * filename4.xml	1z	15000
	 * filename5.xml	2	22500
	 */
	// file name -> "mode/max", the format expected by process()
	Map<String, String> maxperfile = new HashMap<String, String>()
	maxfilemode.eachLine { String line ->
		String[] split = line.split("\t")
		if (split.length == 3) {
			maxperfile.put(split[0], split[1]+"/"+split[2])
		}
	}
	println maxperfile

	// NOTE(review): HIDDENFILE_FILTER presumably excludes hidden files; confirm in IOUtils
	def files = directory.listFiles(IOUtils.HIDDENFILE_FILTER)
	if (files == null) {
		println "can't list the files of "+directory.getAbsolutePath()
		return
	}
	for (File infile : files) {
		String modemax = maxperfile.get(infile.getName())
		if (modemax != null) {
			File outfile = new File(outdir, infile.getName())
			new WExtractWithMode().process(infile, outfile, modemax)
		}
	}
}
376 | 881 | mdecorde | } |