root / tmp / org.txm.core / src / java / org / txm / scripts / importer / WExtract.groovy @ 2473
History | View | Annotate | Download (8.8 kB)
1 | 881 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 881 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 881 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 881 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 881 | mdecorde | //
|
6 | 881 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 881 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 881 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 881 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 881 | mdecorde | // later version.
|
11 | 881 | mdecorde | //
|
12 | 881 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 881 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 881 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 881 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 881 | mdecorde | // details.
|
17 | 881 | mdecorde | //
|
18 | 881 | mdecorde | // You should have received a copy of the GNU General
|
19 | 881 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 881 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 881 | mdecorde | //
|
22 | 881 | mdecorde | //
|
23 | 881 | mdecorde | //
|
24 | 881 | mdecorde | // $LastChangedDate: 2015-12-17 12:11:39 +0100 (jeu. 17 déc. 2015) $
|
25 | 881 | mdecorde | // $LastChangedRevision: 3087 $
|
26 | 881 | mdecorde | // $LastChangedBy: mdecorde $
|
27 | 881 | mdecorde | //
|
28 | 1000 | mdecorde | package org.txm.scripts.importer
|
29 | 881 | mdecorde | |
30 | 881 | mdecorde | import javax.xml.parsers.DocumentBuilder; |
31 | 881 | mdecorde | import javax.xml.parsers.DocumentBuilderFactory; |
32 | 881 | mdecorde | import javax.xml.parsers.ParserConfigurationException; |
33 | 881 | mdecorde | import javax.xml.transform.OutputKeys; |
34 | 881 | mdecorde | import javax.xml.transform.Result; |
35 | 881 | mdecorde | import javax.xml.transform.Source; |
36 | 881 | mdecorde | import javax.xml.transform.Transformer; |
37 | 881 | mdecorde | import javax.xml.transform.TransformerFactory; |
38 | 881 | mdecorde | import javax.xml.transform.dom.DOMSource; |
39 | 881 | mdecorde | import javax.xml.transform.stream.StreamResult; |
40 | 881 | mdecorde | |
41 | 1370 | mdecorde | import org.txm.utils.io.IOUtils |
42 | 881 | mdecorde | import org.w3c.dom.Document; |
43 | 881 | mdecorde | import org.w3c.dom.Element; |
44 | 881 | mdecorde | import org.w3c.dom.NodeList; |
45 | 881 | mdecorde | import org.xml.sax.SAXException; |
46 | 881 | mdecorde | |
47 | 881 | mdecorde | import javax.xml.stream.*; |
48 | 881 | mdecorde | import java.io.File; |
49 | 881 | mdecorde | import java.net.URL; |
50 | 881 | mdecorde | |
51 | 881 | mdecorde | // TODO: Auto-generated Javadoc
|
/**
 * Extracts a sample of the w elements of a TEI file: three slices of words
 * (beginning, middle and end of the file) are kept, the other w elements
 * are dropped. Elements other than w are always copied.
 * Not finished.
 *
 * @author mdecorde
 */
class WExtract
{

	/**
	 * Copies infile to outfile, keeping only the w elements whose number
	 * falls in one of three slices of about max/3 words each.
	 *
	 * Nothing is written when infile contains fewer than max w elements.
	 * Only start tags, attributes, end tags and trimmed character data are
	 * copied; other StAX events (comments, processing instructions, ...)
	 * and the namespace declarations of the source are not.
	 *
	 * @param infile the TEI file to read
	 * @param outfile the file to write the sample to
	 * @param max the number of words to keep (approximately, see the notes
	 * in the method body)
	 * @return always null
	 */
	public process(File infile, File outfile, int max)
	{
		println "Process "+infile.getName()+", keep $max words"
		int count = this.countW(infile);
		if (count < max) {
			println "can't extract $max words, the file "+infile.getName()+" contains only $count words"
			return;
		}

		// Groovy's '/' on two ints yields a BigDecimal; the assignment to an
		// int truncates it, so e.g. (6/2) - (1/2) = 2.5 becomes 2.
		int tier = max/3;
		int from1 = 0
		int to1 = tier;
		int from2 = (count/2) - (tier/2);
		int to2 = (count/2) + (tier/2);
		int from3 = count - tier;
		int to3 = count - 1;
		// NOTE(review): words are numbered from 1 (the counter is incremented
		// before the range test), so the last w (number == count) is never
		// selected and the middle slice holds up to tier+1 words - confirm
		// whether this is intended before changing the bounds.
		List<List<Integer>> ranges = [[from1, to1], [from2, to2], [from3, to3]]

		println " count : "+count
		println " get from "+from1+" to "+to1
		println " get from "+from2+" to "+to2
		println " get from "+from3+" to "+to3

		// withInputStream/withOutputStream close the streams even when the
		// copy fails, so no file handle leaks on a malformed input.
		infile.withInputStream { InputStream inputData ->
			outfile.withOutputStream { OutputStream output ->
				XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
				XMLStreamWriter writer = null;
				try {
					writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8");
					writer.writeStartDocument("utf-8", "1.0");
					copySample(parser, writer, ranges);
					writer.flush();
				} finally {
					try {
						if (writer != null) writer.close();
					} finally {
						parser.close();
					}
				}
			}
		}
		return null;
	}

	/**
	 * Reads every event of parser up to the end of the document and writes
	 * the retained ones to writer.
	 *
	 * @param parser the reader positioned at the start of the document
	 * @param writer the writer receiving the sample
	 * @param ranges the [from, to] word-number pairs (inclusive) to keep
	 */
	private void copySample(XMLStreamReader parser, XMLStreamWriter writer, List<List<Integer>> ranges)
	{
		// NOTE(review): isW is set at the first w and never reset, so from
		// then on character data is written only while printW is true.
		boolean isW = false;
		boolean isText = false; // true once a <text> start tag has been read
		boolean printW = true;
		int wcount = 0;
		String localname;

		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			// Before <text> everything is kept; after it, the decision
			// follows the number of the last w start tag seen.
			printW = !isText || isSelected(wcount, ranges);

			switch (event) {
				case XMLStreamConstants.START_ELEMENT:
					localname = parser.getLocalName();

					if (localname == "text")
						isText = true;

					if (localname == "w") {
						isW = true;
						wcount++;
						// The counter just changed: decide again for this word.
						printW = !isText || isSelected(wcount, ranges);
						if (printW) {
							writeStartTag(parser, writer);
							copyAttributes(parser, writer);
						}
					} else {
						writeStartTag(parser, writer);

						// Namespace declarations are not copied from the source,
						// they are written by hand as plain attributes.
						if (localname == "teiHeader") {
							writer.writeAttribute("xmlns:me", "http://www.menota.org/ns/1.0");
							writer.writeAttribute("xmlns:bfm", "http://bfm.ens-lsh.fr/ns/1.0");
						}

						if (localname == "TEI") {
							writer.writeAttribute("xmlns","http://www.tei-c.org/ns/1.0");
						}

						copyAttributes(parser, writer);
					}
					break;

				case XMLStreamConstants.END_ELEMENT:
					if (parser.getLocalName() == "w") {
						if (printW) {
							writer.writeEndElement();
							// A comment holding a line break separates the words.
							writer.writeComment("\n");
						}
					} else {
						writer.writeEndElement();
						writer.writeCharacters("\n");
					}
					break;

				case XMLStreamConstants.CHARACTERS:
					if (!isW || printW)
						writer.writeCharacters(parser.getText().trim());
					break;
			}
		}
	}

	/**
	 * Tells whether a word number lies in one of the given ranges.
	 *
	 * @param wordNumber the number of the current w element
	 * @param ranges the [from, to] pairs, both bounds inclusive
	 * @return true if at least one range contains wordNumber
	 */
	private static boolean isSelected(int wordNumber, List<List<Integer>> ranges)
	{
		return ranges.any { List<Integer> range -> wordNumber >= range[0] && wordNumber <= range[1] }
	}

	/**
	 * Writes the start tag of the element the parser is positioned on,
	 * using its qualified name (prefix:localname) when it has a prefix.
	 *
	 * @param parser the reader positioned on a START_ELEMENT event
	 * @param writer the writer receiving the start tag
	 */
	private static void writeStartTag(XMLStreamReader parser, XMLStreamWriter writer)
	{
		String prefix = parser.getPrefix();
		String localname = parser.getLocalName();
		if (prefix != null && prefix.length() > 0)
			writer.writeStartElement(prefix+":"+localname);
		else
			writer.writeStartElement(localname);
	}

	/**
	 * Copies the attributes of the current element to the writer.
	 * A null prefix is treated like an empty one, so an unprefixed
	 * attribute is never written as "null:name".
	 *
	 * @param parser the reader positioned on a START_ELEMENT event
	 * @param writer the writer whose current start tag receives the attributes
	 */
	private static void copyAttributes(XMLStreamReader parser, XMLStreamWriter writer)
	{
		for (int i = 0; i < parser.getAttributeCount(); i++) {
			String attPrefix = parser.getAttributePrefix(i);
			String attName = parser.getAttributeLocalName(i);
			if (attPrefix != null && attPrefix.length() > 0)
				attName = attPrefix+":"+attName;
			writer.writeAttribute(attName, parser.getAttributeValue(i));
		}
	}

	/**
	 * Counts the w start tags of a file, wherever they appear.
	 *
	 * @param infile the XML file to read
	 * @return the number of elements whose local name is "w"
	 */
	public int countW(File infile)
	{
		int count = 0;
		// The stream is closed by withInputStream, the parser by the finally
		// block, even when the file is not well-formed.
		infile.withInputStream { InputStream inputData ->
			XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
			try {
				for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
					if (event == XMLStreamConstants.START_ELEMENT && parser.getLocalName() == "w")
						count++;
				}
			} finally {
				parser.close();
			}
		}
		return count;
	}

	/**
	 * Processes the files of ~/xml/extract that are listed in
	 * ~/xml/extract/maxfile and writes the samples to ~/xml/extract/results.
	 *
	 * @param args not used
	 */
	public static void main(String[] args)
	{
		String userDir = System.getProperty("user.home");

		File directory = new File(userDir+"/xml/extract/");
		File outdir = new File(userDir+"/xml/extract/","results");
		outdir.mkdir();

		File maxfile = new File(userDir+"/xml/extract/maxfile");
		if (!maxfile.isFile()) {
			println "maxfile not found: "+maxfile
			return;
		}

		/*
		 * maxfile format (file name and word count separated by a tab):
		 *
		 * filename1.xml	45000
		 * filename2.xml	22500
		 * filename3.xml	45000
		 */
		// Keys are file names: they are looked up with infile.getName() below.
		Map<String, Integer> maxperfile = new HashMap<String, Integer>();
		maxfile.eachLine { String line ->
			String[] split = line.split("\t");
			if (split.length == 2) {
				try {
					maxperfile.put(split[0], Integer.parseInt(split[1].trim()));
				} catch (NumberFormatException e) {
					// Report the line instead of dropping it silently.
					println "maxfile: ignoring line with an invalid word count: "+line
				}
			}
		}
		println maxperfile;

		def files = directory.listFiles(IOUtils.HIDDENFILE_FILTER);
		for (File infile : files) {
			Integer max = maxperfile.get(infile.getName());
			if (max != null) {
				File outfile = new File(outdir, infile.getName());
				new WExtract().process(infile, outfile, max)
			}
		}
	}
}