Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / WExtractWithMode.groovy @ 2473

History | View | Annotate | Download (9.5 kB)

1 881 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 881 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 881 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 881 mdecorde
// Sophia Antipolis, University of Paris 3.
5 881 mdecorde
//
6 881 mdecorde
// The TXM platform is free software: you can redistribute it
7 881 mdecorde
// and/or modify it under the terms of the GNU General Public
8 881 mdecorde
// License as published by the Free Software Foundation,
9 881 mdecorde
// either version 2 of the License, or (at your option) any
10 881 mdecorde
// later version.
11 881 mdecorde
//
12 881 mdecorde
// The TXM platform is distributed in the hope that it will be
13 881 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 881 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 881 mdecorde
// PURPOSE. See the GNU General Public License for more
16 881 mdecorde
// details.
17 881 mdecorde
//
18 881 mdecorde
// You should have received a copy of the GNU General
19 881 mdecorde
// Public License along with the TXM platform. If not, see
20 881 mdecorde
// http://www.gnu.org/licenses.
21 881 mdecorde
//
22 881 mdecorde
//
23 881 mdecorde
//
24 881 mdecorde
// $LastChangedDate: 2011-10-19 17:50:26 +0200 (mer., 19 oct. 2011) $
25 881 mdecorde
// $LastChangedRevision: 2038 $
26 881 mdecorde
// $LastChangedBy: alavrentev $
27 881 mdecorde
//
28 1000 mdecorde
package org.txm.scripts.importer
29 881 mdecorde
30 881 mdecorde
import javax.xml.parsers.DocumentBuilder;
31 881 mdecorde
import javax.xml.parsers.DocumentBuilderFactory;
32 881 mdecorde
import javax.xml.parsers.ParserConfigurationException;
33 881 mdecorde
import javax.xml.transform.OutputKeys;
34 881 mdecorde
import javax.xml.transform.Result;
35 881 mdecorde
import javax.xml.transform.Source;
36 881 mdecorde
import javax.xml.transform.Transformer;
37 881 mdecorde
import javax.xml.transform.TransformerFactory;
38 881 mdecorde
import javax.xml.transform.dom.DOMSource;
39 881 mdecorde
import javax.xml.transform.stream.StreamResult;
40 881 mdecorde
41 1370 mdecorde
import org.txm.utils.io.IOUtils
42 881 mdecorde
import org.w3c.dom.Document;
43 881 mdecorde
import org.w3c.dom.Element;
44 881 mdecorde
import org.w3c.dom.NodeList;
45 881 mdecorde
import org.xml.sax.SAXException;
46 881 mdecorde
47 881 mdecorde
import javax.xml.stream.*;
48 881 mdecorde
import java.io.File;
49 881 mdecorde
import java.net.URL;
50 881 mdecorde
51 881 mdecorde
// TODO: Auto-generated Javadoc
52 881 mdecorde
/**
 * Extracts a sample of the &lt;w&gt; (word) elements of a TEI file.
 *
 * The input is read twice with StAX: a first pass counts the &lt;w&gt;
 * elements ({@link #countW(File)}), a second pass copies the document to the
 * output file and drops the &lt;w&gt; elements found inside &lt;text&gt; that
 * are not part of the selected sample. Elements other than &lt;w&gt; are
 * always copied.
 *
 * @author mdecorde
 */
class WExtractWithMode
{
	/**
	 * Copies infile to outfile, keeping only a sample of its words.
	 *
	 * The sample is described by modemax, formatted as "mode/max":
	 * <ul>
	 * <li>"3": three slices of max/3 words (start, middle and end of the text)</li>
	 * <li>"2": two slices of max/2 words (start and end of the text)</li>
	 * <li>"1a": the first max words</li>
	 * <li>"1m": max words centered on the middle of the text</li>
	 * <li>"1z": the last max words</li>
	 * </ul>
	 * Words are numbered from 1 in document order; every slice holds exactly
	 * the computed number of words (integer division, the remainder is lost).
	 * Nothing is written when modemax is malformed, when the mode is unknown
	 * or when the file contains fewer than max words: a message is printed
	 * and the method returns.
	 *
	 * @param infile the TEI file to read
	 * @param outfile the file to write
	 * @param modemax the sample description, "mode/max" (e.g. "3/45000")
	 * @return always null
	 */
	public process(File infile, File outfile, String modemax)
	{
		println "Process "+infile.getName()+", keep $modemax words"

		// "mode/max" : refuse malformed values instead of silently sampling with max = 0
		String[] fields = (modemax == null) ? new String[0] : modemax.split("/")
		if (fields.length < 2)
		{
			println "modemax must be formatted as mode/max (ex: 3/45000), got: $modemax"
			return null
		}
		String mode = fields[0].trim()
		int max = 0
		try {
			max = Integer.parseInt(fields[1].trim())
		} catch (NumberFormatException e) {
			println "modemax must be formatted as mode/max (ex: 3/45000), max is not a number: $modemax"
			return null
		}

		int count = this.countW(infile);
		if (count < max)
		{
			println "can't extract $max words, the file "+infile.getName()+" contains only $count words"
			return null
		}

		// size of one slice; intdiv because '/' on ints gives a BigDecimal in Groovy
		int part = 0
		switch (mode)
		{
			case "3":
				part = max.intdiv(3)
				break
			case "2":
				part = max.intdiv(2)
				break
			case "1a":
			case "1m":
			case "1z":
				part = max
				break
			default:
				println "mode must be 1a, 1m, 1z, 2 or 3"
				return null
		}

		// inclusive bounds over the 1-based word counter; from > to means "slice not used"
		int from1 = 1
		int to1 = 0
		if (mode != "1m" && mode != "1z")
		{
			to1 = part
		}
		int from2 = 1
		int to2 = 0
		if (mode == "3" || mode == "1m")
		{
			from2 = (count - part).intdiv(2) + 1
			to2 = from2 + part - 1
		}
		int from3 = 1
		int to3 = 0
		if (mode != "1a" && mode != "1m")
		{
			from3 = count - part + 1
			to3 = count
		}

		println " count : "+count
		println "  get from "+from1+" to "+to1
		println "  get from "+from2+" to "+to2
		println "  get from "+from3+" to "+to3

		// true when the n-th word belongs to one of the slices
		Closure<Boolean> selected = { int n ->
			return (n >= from1 && n <= to1) || (n >= from2 && n <= to2) || (n >= from3 && n <= to3)
		}

		InputStream inputData = null
		XMLStreamReader parser = null
		FileOutputStream output = null
		XMLStreamWriter writer = null
		try {
			inputData = infile.toURI().toURL().openStream()
			parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData)
			output = new FileOutputStream(outfile)
			writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")

			writer.writeStartDocument("utf-8", "1.0")

			boolean isText = false	// a <text> start tag has been read
			boolean seenW = false	// a <w> start tag has been read (never reset)
			boolean printW = true	// the current word must be written
			int wcount = 0		// number of <w> start tags read so far
			String localname

			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
			{
				// outside <text> everything is kept
				printW = !isText || selected(wcount)

				switch (event)
				{
					case XMLStreamConstants.START_ELEMENT:
						localname = parser.getLocalName()

						if (localname == "text")
							isText = true

						if (localname == "w")
						{
							seenW = true
							wcount++
							// the counter changed: update the decision for this word
							printW = !isText || selected(wcount)
							if (printW)
							{
								writeStartTag(writer, parser.getPrefix(), localname)
								copyAttributes(parser, writer)
							}
						}
						else
						{
							writeStartTag(writer, parser.getPrefix(), localname)

							if (localname == "teiHeader")
							{
								writer.writeAttribute("xmlns:me", "http://www.menota.org/ns/1.0");
								writer.writeAttribute("xmlns:bfm", "http://bfm.ens-lsh.fr/ns/1.0");
							}

							if (localname == "TEI")
							{
								writer.writeAttribute("xmlns","http://www.tei-c.org/ns/1.0");
							}

							copyAttributes(parser, writer)
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						localname = parser.getLocalName()
						if (localname == "w")
						{
							if (printW)
							{
								writer.writeEndElement();
								writer.writeComment("\n");
							}
						}
						else
						{
							writer.writeEndElement();
							writer.writeCharacters("\n");
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						// once a first word has been read, text follows the decision of the last word
						if (!seenW || printW)
							writer.writeCharacters(parser.getText().trim());
						break;
				}
			}
			writer.flush();
		} finally {
			// release everything, even if the parsing failed
			closeQuietly(writer)
			closeQuietly(output)
			closeQuietly(parser)
			closeQuietly(inputData)
		}
		return null
	}

	/**
	 * Writes a start tag, using the "prefix:localname" form when the prefix is set.
	 *
	 * @param writer the writer
	 * @param prefix the element prefix, may be null or empty
	 * @param localname the element local name
	 */
	private static void writeStartTag(XMLStreamWriter writer, String prefix, String localname)
	{
		if (prefix != null && prefix.length() > 0)
			writer.writeStartElement(prefix+":"+localname);
		else
			writer.writeStartElement(localname);
	}

	/**
	 * Copies the attributes of the current start tag of the parser to the writer.
	 *
	 * @param parser the parser, positioned on a START_ELEMENT event
	 * @param writer the writer, positioned just after a start tag
	 */
	private static void copyAttributes(XMLStreamReader parser, XMLStreamWriter writer)
	{
		for (int i = 0 ; i < parser.getAttributeCount() ; i++)
		{
			String attrPrefix = parser.getAttributePrefix(i)
			String attrName = parser.getAttributeLocalName(i)
			// the prefix of an unprefixed attribute may be null or "" depending on the StAX implementation
			if (attrPrefix != null && attrPrefix.length() > 0)
				attrName = attrPrefix+":"+attrName
			writer.writeAttribute(attrName, parser.getAttributeValue(i));
		}
	}

	/**
	 * Calls close() on the resource if it is not null; a failure is printed, not thrown.
	 *
	 * @param resource a stream, a StAX reader or a StAX writer; may be null
	 */
	private static void closeQuietly(def resource)
	{
		if (resource == null) return
		try {
			resource.close()
		} catch (Exception e) {
			println "Warning: failed to close "+resource.getClass().getName()+": "+e
		}
	}

	/**
	 * Counts the &lt;w&gt; start tags of a file.
	 *
	 * @param infile the XML file to read
	 * @return the number of elements whose local name is "w"
	 */
	public int countW(File infile)
	{
		InputStream inputData = null
		XMLStreamReader parser = null
		try {
			inputData = infile.toURI().toURL().openStream()
			parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData)

			int count = 0;
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
			{
				if (event == XMLStreamConstants.START_ELEMENT && parser.getLocalName() == "w")
					count++;
			}
			return count;
		} finally {
			// release the file even if the parsing failed
			closeQuietly(parser)
			closeQuietly(inputData)
		}
	}

	/**
	 * Processes the files of the "xml/extract" directory of the user home.
	 *
	 * The "maxfilemode" file of this directory lists the files to process,
	 * one per line, as three tab separated columns: file name, mode, max.
	 * Lines which do not have exactly three columns are ignored. The results
	 * are written in the "results" sub-directory.
	 *
	 * @param args the arguments (not used)
	 */
	public static void main(String[] args)
	{
		String userDir = System.getProperty("user.home");

		File directory = new File(userDir+"/xml/extract/");
		File outdir = new File(userDir+"/xml/extract/","results");
		outdir.mkdir();
		if (!outdir.isDirectory())
		{
			println "can't create the output directory: "+outdir
			return
		}

		File maxfilemode = new File(userDir+"/xml/extract/maxfilemode");
		if (!maxfilemode.isFile())
		{
			println "can't find the parameters file: "+maxfilemode
			return
		}
		/*
		 * maxfilemode format:
		 *
		 * filename1.xml	3	45000
		 * filename2.xml	1a	15000
		 * filename3.xml	1m	15000
		 * filename4.xml	1z	15000
		 * filename5.xml	2	22500
		 */
		// file name -> "mode/max"
		HashMap<String, String> maxperfile = new HashMap<String, String>();
		maxfilemode.eachLine{ line ->
			String[] split = line.split("\t");
			if (split.length == 3)
			{
				maxperfile.put(split[0], split[1]+"/"+split[2]);
			}
		}
		println maxperfile;

		def files = directory.listFiles(IOUtils.HIDDENFILE_FILTER);
		if (files == null)
		{
			println "can't list the files of: "+directory
			return
		}
		for (File infile : files) {
			if (maxperfile.containsKey(infile.getName())) {
				File outfile = new File(outdir, infile.getName());
				String modemax = maxperfile.get(infile.getName());
				new WExtractWithMode().process(infile, outfile, modemax)
			}
		}
	}
}