root / tmp / org.txm.core / src / java / org / txm / scripts / importer / WExtractWithMode.groovy @ 2473
History | View | Annotate | Download (9.5 kB)
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
---|---|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
4 |
// Sophia Antipolis, University of Paris 3.
|
5 |
//
|
6 |
// The TXM platform is free software: you can redistribute it
|
7 |
// and/or modify it under the terms of the GNU General Public
|
8 |
// License as published by the Free Software Foundation,
|
9 |
// either version 2 of the License, or (at your option) any
|
10 |
// later version.
|
11 |
//
|
12 |
// The TXM platform is distributed in the hope that it will be
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 |
// PURPOSE. See the GNU General Public License for more
|
16 |
// details.
|
17 |
//
|
18 |
// You should have received a copy of the GNU General
|
19 |
// Public License along with the TXM platform. If not, see
|
20 |
// http://www.gnu.org/licenses.
|
21 |
//
|
22 |
//
|
23 |
//
|
24 |
// $LastChangedDate: 2011-10-19 17:50:26 +0200 (mer., 19 oct. 2011) $
|
25 |
// $LastChangedRevision: 2038 $
|
26 |
// $LastChangedBy: alavrentev $
|
27 |
//
|
28 |
package org.txm.scripts.importer
|
29 |
|
30 |
import javax.xml.parsers.DocumentBuilder; |
31 |
import javax.xml.parsers.DocumentBuilderFactory; |
32 |
import javax.xml.parsers.ParserConfigurationException; |
33 |
import javax.xml.transform.OutputKeys; |
34 |
import javax.xml.transform.Result; |
35 |
import javax.xml.transform.Source; |
36 |
import javax.xml.transform.Transformer; |
37 |
import javax.xml.transform.TransformerFactory; |
38 |
import javax.xml.transform.dom.DOMSource; |
39 |
import javax.xml.transform.stream.StreamResult; |
40 |
|
41 |
import org.txm.utils.io.IOUtils |
42 |
import org.w3c.dom.Document; |
43 |
import org.w3c.dom.Element; |
44 |
import org.w3c.dom.NodeList; |
45 |
import org.xml.sax.SAXException; |
46 |
|
47 |
import javax.xml.stream.*; |
48 |
import java.io.File; |
49 |
import java.net.URL; |
50 |
|
51 |
// TODO: Auto-generated Javadoc
|
52 |
/**
 * Writes a reduced copy of a TEI file that keeps only a sample of its
 * {@code <w>} (word) elements; every other element is copied.
 *
 * The sample is described by a mode and a total number of words:
 * <ul>
 * <li>{@code 3}: three windows (beginning, middle, end) of max/3 words each</li>
 * <li>{@code 2}: two windows (beginning, end) of max/2 words each</li>
 * <li>{@code 1a}: one window of max words at the beginning</li>
 * <li>{@code 1m}: one window of max words in the middle</li>
 * <li>{@code 1z}: one window of max words at the end</li>
 * </ul>
 *
 * The original author flagged this class as "not finished".
 *
 * @author mdecorde
 */
class WExtractWithMode
{
	/** Mode keywords accepted by {@link #process(File, File, String)}. */
	private static final List<String> MODES = ["1a", "1m", "1z", "2", "3"]

	/**
	 * Copies infile to outfile, keeping only the {@code <w>} elements whose
	 * position falls inside the windows selected by modemax.
	 *
	 * Nothing is written when modemax is malformed, when the mode is unknown
	 * or when the file holds fewer than max words; a message is printed instead.
	 *
	 * @param infile the TEI file to read
	 * @param outfile the file to write the extract to
	 * @param modemax the mode and the number of words to keep, formatted as "mode/max" (e.g. "3/45000")
	 * @return nothing meaningful (always null)
	 */
	public process(File infile, File outfile, String modemax)
	{
		println "Process "+infile.getName()+", keep $modemax words"
		int count = this.countW(infile);

		// modemax is "<mode>/<max>"; reject anything else explicitly instead of
		// silently falling back to max = 0 and producing a meaningless extract
		String[] fields = (modemax == null) ? new String[0] : modemax.split("/")
		String mode = (fields.length > 0) ? fields[0].trim() : ""
		if (!MODES.contains(mode))
		{
			println "mode must be 1a, 1m, 1z, 2 or 3"
			return
		}

		int max = 0
		boolean validMax = (fields.length == 2)
		if (validMax)
		{
			try {
				max = Integer.parseInt(fields[1].trim())
				validMax = (max > 0)
			} catch (NumberFormatException e) {
				validMax = false
			}
		}
		if (!validMax)
		{
			println "modemax must be formatted as mode/max with max a positive integer, got: $modemax"
			return
		}

		if (count < max)
		{
			println "can't extract $max words, the file "+infile.getName()+" contains only $count words"
			return;
		}

		// size of one window; intdiv() keeps the arithmetic in integers
		// (the Groovy '/' operator would produce a BigDecimal)
		int part = max
		if (mode == "3")
			part = max.intdiv(3)
		else if (mode == "2")
			part = max.intdiv(2)

		// Window bounds are inclusive, 1-based word positions: wcount is
		// incremented before being tested, so the first word is number 1 and
		// the last one is number count. An unused window stays at 0..0, which
		// no word position can match.
		int from1 = 0
		int to1 = 0
		if (mode == "3" || mode == "2" || mode == "1a")
		{
			from1 = 1
			to1 = part
		}
		int from2 = 0
		int to2 = 0
		if (mode == "3" || mode == "1m")
		{
			// exactly 'part' words centered in the file
			from2 = (count - part).intdiv(2) + 1
			to2 = from2 + part - 1
		}
		int from3 = 0
		int to3 = 0
		if (mode == "3" || mode == "2" || mode == "1z")
		{
			// the last 'part' words, including the very last one
			from3 = count - part + 1
			to3 = count
		}

		println " count : "+count
		println " get from "+from1+" to "+to1
		println " get from "+from2+" to "+to2
		println " get from "+from3+" to "+to3

		// true when the word at position n belongs to one of the windows
		def selected = { int n ->
			(n >= from1 && n <= to1) || (n >= from2 && n <= to2) || (n >= from3 && n <= to3)
		}

		boolean isW = false;    // set at the first <w>; never reset, so text that follows a skipped word is skipped too
		boolean isText = false; // set at the first <text>; words are filtered only from there on
		boolean printW = true;
		int wcount = 0;
		String localname;
		String prefix;

		InputStream inputData = null
		XMLStreamReader parser = null
		FileOutputStream output = null
		XMLStreamWriter writer = null
		try
		{
			inputData = infile.toURI().toURL().openStream();
			parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
			output = new FileOutputStream(outfile)
			writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8");

			writer.writeStartDocument("utf-8", "1.0");

			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
			{
				printW = !isText || selected(wcount)

				switch (event)
				{
					case XMLStreamConstants.START_ELEMENT:
						localname = parser.getLocalName();
						prefix = parser.getPrefix();

						if (localname == "text")
							isText = true;

						if (localname == "w")
						{
							isW = true;
							wcount++;
							// the decision must be taken again for the new position
							printW = !isText || selected(wcount)
						}

						// a <w> is written only when selected; any other element is always copied
						if (localname != "w" || printW)
						{
							writer.writeStartElement(qualifiedName(prefix, localname));

							// NOTE(review): namespace declarations of the source are not
							// copied; these three are declared by hand on fixed elements.
							if (localname == "teiHeader")
							{
								writer.writeAttribute("xmlns:me", "http://www.menota.org/ns/1.0");
								writer.writeAttribute("xmlns:bfm", "http://bfm.ens-lsh.fr/ns/1.0");
							}
							if (localname == "TEI")
							{
								writer.writeAttribute("xmlns","http://www.tei-c.org/ns/1.0");
							}

							copyAttributes(parser, writer)
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						localname = parser.getLocalName()
						if (localname == "w")
						{
							if (printW)
							{
								writer.writeEndElement();
								writer.writeComment("\n");
							}
						}
						else
						{
							writer.writeEndElement();
							writer.writeCharacters("\n");
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						// before the first <w> everything is copied; afterwards text
						// follows the decision taken for the current word position
						if (!isW || printW)
							writer.writeCharacters(parser.getText().trim());
						break;
				}
			}
			writer.flush();
		}
		finally
		{
			// release everything even when parsing or writing failed
			closeQuietly(writer, "XML writer")
			closeQuietly(output, "output stream")
			closeQuietly(parser, "XML reader")
			closeQuietly(inputData, "input stream")
		}
	}

	/**
	 * Counts the {@code <w>} elements of a file, wherever they appear.
	 *
	 * @param infile the XML file to read
	 * @return the number of start tags whose local name is "w"
	 */
	public int countW(File infile)
	{
		InputStream inputData = null
		XMLStreamReader parser = null
		int count = 0;
		try
		{
			inputData = infile.toURI().toURL().openStream();
			parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);

			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
			{
				if (event == XMLStreamConstants.START_ELEMENT && parser.getLocalName() == "w")
					count++;
			}
		}
		finally
		{
			closeQuietly(parser, "XML reader")
			closeQuietly(inputData, "input stream")
		}
		return count;
	}

	/**
	 * Builds the name to write for an element or attribute: "prefix:localname"
	 * when a prefix is present, the bare local name otherwise (null or empty prefix).
	 */
	private static String qualifiedName(String prefix, String localname)
	{
		if (prefix != null && prefix.length() > 0)
			return prefix+":"+localname
		return localname
	}

	/** Copies every attribute of the reader's current start tag to the writer. */
	private static void copyAttributes(XMLStreamReader parser, XMLStreamWriter writer)
	{
		for (int i = 0 ; i < parser.getAttributeCount() ; i++)
		{
			String name = qualifiedName(parser.getAttributePrefix(i), parser.getAttributeLocalName(i))
			writer.writeAttribute(name, parser.getAttributeValue(i));
		}
	}

	/** Closes a resource if it was opened; a failure is reported but never masks an earlier error. */
	private static void closeQuietly(Object resource, String label)
	{
		if (resource == null) return
		try {
			resource.close()
		} catch (Exception e) {
			println "Warning: failed to close the "+label+": "+e
		}
	}

	/**
	 * Processes the files of ~/xml/extract/ listed in ~/xml/extract/maxfilemode
	 * and writes the extracts to ~/xml/extract/results/.
	 *
	 * maxfilemode holds one file per line, three tab-separated fields
	 * (file name, mode, number of words):
	 *
	 * filename1.xml	3	45000
	 * filename2.xml	1a	15000
	 * filename3.xml	1m	15000
	 * filename4.xml	1z	15000
	 * filename5.xml	2	22500
	 *
	 * @param args not used
	 */
	public static void main(String[] args)
	{
		String userDir = System.getProperty("user.home");

		File directory = new File(userDir+"/xml/extract/");
		File outdir = new File(userDir+"/xml/extract/","results");
		outdir.mkdirs();

		File maxfilemode = new File(userDir+"/xml/extract/maxfilemode");
		if (!maxfilemode.isFile())
		{
			println "Missing configuration file: "+maxfilemode
			return
		}

		// file name -> "mode/max"
		HashMap<String, String> maxperfile = new HashMap<String, String>();
		maxfilemode.eachLine{ String line ->
			String[] split = line.split("\t");
			if (split.length == 3)
				maxperfile.put(split[0], split[1]+"/"+split[2]);
			else if (line.trim().length() > 0)
				println "Ignored line of "+maxfilemode.getName()+" (3 tab-separated fields expected): "+line
		}
		println maxperfile;

		def files = directory.listFiles(IOUtils.HIDDENFILE_FILTER);
		if (files == null)
		{
			println "Can't list the files of "+directory
			return
		}

		for (File infile : files) {
			String modemax = maxperfile.get(infile.getName());
			if (modemax != null) {
				File outfile = new File(outdir, infile.getName());
				new WExtractWithMode().process(infile, outfile, modemax)
			}
		}
	}
}