root / tmp / org.txm.core / src / java / org / txm / scripts / importer / WExtract.groovy @ 2473
History | View | Annotate | Download (8.8 kB)
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
---|---|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
4 |
// Sophia Antipolis, University of Paris 3.
|
5 |
//
|
6 |
// The TXM platform is free software: you can redistribute it
|
7 |
// and/or modify it under the terms of the GNU General Public
|
8 |
// License as published by the Free Software Foundation,
|
9 |
// either version 2 of the License, or (at your option) any
|
10 |
// later version.
|
11 |
//
|
12 |
// The TXM platform is distributed in the hope that it will be
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 |
// PURPOSE. See the GNU General Public License for more
|
16 |
// details.
|
17 |
//
|
18 |
// You should have received a copy of the GNU General
|
19 |
// Public License along with the TXM platform. If not, see
|
20 |
// http://www.gnu.org/licenses.
|
21 |
//
|
22 |
//
|
23 |
//
|
24 |
// $LastChangedDate: 2015-12-17 12:11:39 +0100 (jeu. 17 déc. 2015) $
|
25 |
// $LastChangedRevision: 3087 $
|
26 |
// $LastChangedBy: mdecorde $
|
27 |
//
|
28 |
package org.txm.scripts.importer
|
29 |
|
30 |
import javax.xml.parsers.DocumentBuilder; |
31 |
import javax.xml.parsers.DocumentBuilderFactory; |
32 |
import javax.xml.parsers.ParserConfigurationException; |
33 |
import javax.xml.transform.OutputKeys; |
34 |
import javax.xml.transform.Result; |
35 |
import javax.xml.transform.Source; |
36 |
import javax.xml.transform.Transformer; |
37 |
import javax.xml.transform.TransformerFactory; |
38 |
import javax.xml.transform.dom.DOMSource; |
39 |
import javax.xml.transform.stream.StreamResult; |
40 |
|
41 |
import org.txm.utils.io.IOUtils |
42 |
import org.w3c.dom.Document; |
43 |
import org.w3c.dom.Element; |
44 |
import org.w3c.dom.NodeList; |
45 |
import org.xml.sax.SAXException; |
46 |
|
47 |
import javax.xml.stream.*; |
48 |
import java.io.File; |
49 |
import java.net.URL; |
50 |
|
51 |
// TODO: Auto-generated Javadoc
|
52 |
/**
 * Builds a reduced copy of a TEI file that keeps only a sample of its
 * {@code w} (word) elements, taken from three windows: the beginning,
 * the middle and the end of the document.
 *
 * Marked "not finished" by the original author.
 *
 * @author mdecorde
 */
class WExtract
{

	/**
	 * Copies {@code infile} to {@code outfile}, keeping only the {@code w}
	 * elements whose ordinal (1 for the first {@code w} of the file) falls in
	 * one of three windows derived from {@code max}.
	 *
	 * What is copied:
	 * <ul>
	 * <li>every element that is not a {@code w} is always written, with its
	 * attributes;</li>
	 * <li>a {@code w} element (start tag, attributes, end tag) is written only
	 * when its ordinal is selected, or when no {@code text} element has been
	 * opened yet;</li>
	 * <li>character data is trimmed; once a first {@code w} has been seen it is
	 * written only while the current ordinal is selected;</li>
	 * <li>other events (comments, processing instructions, ...) are dropped.</li>
	 * </ul>
	 * Nothing is written, and {@code outfile} is not created, when the file
	 * holds fewer than {@code max} words.
	 *
	 * NOTE(review): ordinals run from 1 to {@code count} but the last window
	 * ends at {@code count-1}, so the very last {@code w} is never selected.
	 * This looks like an off-by-one; it is kept as is until the intended
	 * windows are confirmed.
	 *
	 * @param infile the TEI file to read
	 * @param outfile the file to (over)write with the extract
	 * @param max the requested number of words; a third of it sizes each window
	 * @return always {@code null}
	 * @throws IOException if a file cannot be read or written
	 * @throws XMLStreamException if the input is not well-formed or writing fails
	 */
	public process(File infile, File outfile, int max)
	{
		println "Process "+infile.getName()+", keep $max words"
		int count = this.countW(infile);
		if (count < max) {
			println "can't extract $max words, the file "+infile.getName()+" contains only $count words"
			return null;
		}

		// Groovy's '/' on two ints yields a BigDecimal which is truncated when
		// assigned to an int: (count/2) - (tier/2) is therefore NOT the same as
		// integer division of each term. The expressions are kept verbatim.
		int tier = max/3;
		int from1 = 0
		int to1 = tier;
		int from2 = (count/2) - (tier/2);
		int to2 = (count/2) + (tier/2);
		int from3 = count - tier;
		int to3 = count-1;

		println " count : "+count
		println " get from "+from1+" to "+to1
		println " get from "+from2+" to "+to2
		println " get from "+from3+" to "+to3

		// true when the given word ordinal lies in one of the three windows
		Closure<Boolean> isSelected = { int n ->
			(n >= from1 && n <= to1) || (n >= from2 && n <= to2) || (n >= from3 && n <= to3)
		}

		// Every resource is released in a finally block so that a parse or
		// write error does not leak the streams.
		InputStream inputData = infile.toURI().toURL().openStream();
		try {
			XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
			try {
				FileOutputStream output = new FileOutputStream(outfile);
				try {
					XMLStreamWriter writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8");
					try {
						writer.writeStartDocument("utf-8", "1.0");
						copyEvents(parser, writer, isSelected);
						writer.flush();
					} finally {
						writer.close();
					}
				} finally {
					output.close();
				}
			} finally {
				parser.close();
			}
		} finally {
			inputData.close();
		}
		return null;
	}

	/**
	 * Streams the events of {@code parser} to {@code writer}, filtering the
	 * {@code w} elements and the character data with {@code isSelected}.
	 *
	 * @param parser the reader positioned before the first event to copy
	 * @param writer the writer, with the XML declaration already written
	 * @param isSelected tells whether a word ordinal must be kept
	 */
	private static void copyEvents(XMLStreamReader parser, XMLStreamWriter writer, Closure<Boolean> isSelected)
	{
		boolean inText = false;   // set at the first <text> start tag, never reset
		boolean wordSeen = false; // set at the first <w> start tag, never reset
		int wcount = 0;           // ordinal of the last <w> start tag read
		String localname;

		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			// before <text> everything is kept; inside, only the selected windows
			boolean printW = !inText || isSelected(wcount);

			switch (event) {
				case XMLStreamConstants.START_ELEMENT:
					localname = parser.getLocalName();

					if (localname == "text")
						inText = true;

					if (localname == "w") {
						wordSeen = true;
						wcount++;
						// the ordinal just changed: re-evaluate the selection
						printW = !inText || isSelected(wcount);
					}

					// a <w> is written only when selected, any other element always
					if (localname != "w" || printW) {
						writer.writeStartElement(qualifiedName(parser.getPrefix(), localname));

						// namespace declarations are not copied from the input;
						// these ones are forced on the header and the root
						if (localname == "teiHeader") {
							writer.writeAttribute("xmlns:me", "http://www.menota.org/ns/1.0");
							writer.writeAttribute("xmlns:bfm", "http://bfm.ens-lsh.fr/ns/1.0");
						}
						if (localname == "TEI") {
							writer.writeAttribute("xmlns","http://www.tei-c.org/ns/1.0");
						}

						copyAttributes(parser, writer);
					}
					break;

				case XMLStreamConstants.END_ELEMENT:
					localname = parser.getLocalName();
					if (localname == "w") {
						if (printW) {
							writer.writeEndElement();
							// kept words are separated by a comment holding a line break
							writer.writeComment("\n");
						}
					} else {
						writer.writeEndElement();
						writer.writeCharacters("\n");
					}
					break;

				case XMLStreamConstants.CHARACTERS:
					// until a first <w> is met, text is always copied
					if (!wordSeen || printW)
						writer.writeCharacters(parser.getText().trim());
					break;
			}
		}
	}

	/**
	 * Writes all the attributes of the current start element of
	 * {@code parser} to {@code writer}, keeping their prefixes.
	 */
	private static void copyAttributes(XMLStreamReader parser, XMLStreamWriter writer)
	{
		for (int i = 0; i < parser.getAttributeCount(); i++) {
			writer.writeAttribute(
				qualifiedName(parser.getAttributePrefix(i), parser.getAttributeLocalName(i)),
				parser.getAttributeValue(i));
		}
	}

	/**
	 * Builds {@code prefix:localName}, or just {@code localName} when the
	 * prefix is {@code null} or empty (StAX implementations may report either).
	 */
	private static String qualifiedName(String prefix, String localName)
	{
		if (prefix != null && prefix.length() > 0)
			return prefix+":"+localName;
		return localName;
	}

	/**
	 * Counts the start tags whose local name is {@code w} in an XML file.
	 *
	 * @param infile the XML file to scan
	 * @return the number of {@code w} elements
	 * @throws IOException if the file cannot be read
	 * @throws XMLStreamException if the file is not well-formed
	 */
	public int countW(File infile)
	{
		int count = 0;
		InputStream inputData = infile.toURI().toURL().openStream();
		try {
			XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
			try {
				for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
					if (event == XMLStreamConstants.START_ELEMENT && parser.getLocalName() == "w")
						count++;
				}
			} finally {
				parser.close();
			}
		} finally {
			// released even when parsing fails
			inputData.close();
		}
		return count;
	}

	/**
	 * Runs the extraction on the files of {@code ~/xml/extract/} that are
	 * listed in {@code ~/xml/extract/maxfile}; results go to
	 * {@code ~/xml/extract/results/}.
	 *
	 * maxfile format, one tab-separated entry per line:
	 * <pre>
	 * filename1.xml	45000
	 * filename2.xml	22500
	 * </pre>
	 * Lines that do not have exactly two fields are ignored; lines whose
	 * second field is not an integer are reported and skipped.
	 *
	 * @param args not used
	 */
	public static void main(String[] args)
	{
		String userDir = System.getProperty("user.home");

		File directory = new File(userDir+"/xml/extract/");
		File outdir = new File(userDir+"/xml/extract/","results");
		outdir.mkdir();

		File maxfile = new File(userDir+"/xml/extract/maxfile");
		if (!maxfile.isFile()) {
			println "Can't find the max file: "+maxfile
			return;
		}

		// file name -> number of words to keep
		Map<String, Integer> maxperfile = new HashMap<String, Integer>();
		maxfile.eachLine { String line ->
			String[] split = line.split("\t");
			if (split.length == 2) {
				try {
					maxperfile.put(split[0], Integer.parseInt(split[1]));
				} catch (NumberFormatException e) {
					println "Ignoring max file line with an invalid word count: "+line
				}
			}
		}
		println maxperfile;

		// HIDDENFILE_FILTER presumably leaves hidden files out - verify in IOUtils
		File[] files = directory.listFiles(IOUtils.HIDDENFILE_FILTER);
		if (files == null) {
			// listFiles returns null when the path is not a readable directory
			println "Can't list the files of "+directory
			return;
		}
		for (File infile : files) {
			Integer max = maxperfile.get(infile.getName());
			if (max != null) {
				File outfile = new File(outdir, infile.getName());
				new WExtract().process(infile, outfile, max)
			}
		}
	}
}