// Extracted from the repository viewer:
// root / tmp / org.txm.core / src / java / org / txm / scripts / importer / WExtractWithMode.groovy @ 1688 (9.5 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
// 
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
// 
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
// 
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
// 
// 
// 
// $LastChangedDate: 2011-10-19 17:50:26 +0200 (mer., 19 oct. 2011) $
// $LastChangedRevision: 2038 $
// $LastChangedBy: alavrentev $ 
//
28
package org.txm.scripts.importer
29

    
30
import javax.xml.parsers.DocumentBuilder;
31
import javax.xml.parsers.DocumentBuilderFactory;
32
import javax.xml.parsers.ParserConfigurationException;
33
import javax.xml.transform.OutputKeys;
34
import javax.xml.transform.Result;
35
import javax.xml.transform.Source;
36
import javax.xml.transform.Transformer;
37
import javax.xml.transform.TransformerFactory;
38
import javax.xml.transform.dom.DOMSource;
39
import javax.xml.transform.stream.StreamResult;
40

    
41
import org.txm.utils.io.IOUtils
42
import org.w3c.dom.Document;
43
import org.w3c.dom.Element;
44
import org.w3c.dom.NodeList;
45
import org.xml.sax.SAXException;
46

    
47
import javax.xml.stream.*;
48
import java.io.File;
49
import java.net.URL;
50

    
51
// TODO: Auto-generated Javadoc
52
/**
 * Extracts a sample of the {@code <w>} (word) elements of a TEI file.
 *
 * The input is streamed with StAX and copied to the output; every element
 * other than {@code <w>} is always copied, while {@code <w>} elements found
 * after the {@code <text>} start tag are copied only when their rank falls
 * inside one of the windows selected by the extraction mode.
 *
 * The original author marked this class as "not finished".
 *
 * @author mdecorde
 */
class WExtractWithMode
{
    /**
     * Copies {@code infile} to {@code outfile}, keeping only a sample of its words.
     *
     * {@code modemax} has the form {@code <mode>/<max>}, where {@code max} is the total
     * number of words to keep and {@code mode} selects where they are taken from:
     * <ul>
     * <li>{@code 1a}: {@code max} words at the beginning,</li>
     * <li>{@code 1m}: {@code max} words in the middle,</li>
     * <li>{@code 1z}: {@code max} words at the end,</li>
     * <li>{@code 2}: {@code max/2} words at the beginning and {@code max/2} at the end,</li>
     * <li>{@code 3}: {@code max/3} words at the beginning, in the middle and at the end.</li>
     * </ul>
     *
     * Nothing is written (a message is printed instead) when {@code modemax} cannot be
     * read, when the mode is unknown, or when the file holds fewer than {@code max} words.
     *
     * @param infile the TEI file to read
     * @param outfile the file to write the extract to
     * @param modemax the extraction mode and size, e.g. {@code "3/45000"}
     * @return always {@code null}
     */
    public process(File infile, File outfile, String modemax)
    {
        println "Process "+infile.getName()+", keep $modemax words"
        int count = this.countW(infile)

        // A malformed specification used to be swallowed silently and degraded to
        // "keep 0 words"; report it and stop instead.
        String[] spec = (modemax == null) ? new String[0] : modemax.split("/")
        if (spec.length < 2) {
            println "can't read '$modemax': expected <mode>/<max>, for example 3/45000"
            return null
        }
        String mode = spec[0].trim()
        int max
        try {
            max = Integer.parseInt(spec[1].trim())
        } catch (NumberFormatException e) {
            println "can't read '$modemax': '"+spec[1]+"' is not a number of words"
            return null
        }
        if (max < 0) {
            println "can't extract a negative number of words: $max"
            return null
        }

        if (count < max) {
            println "can't extract $max words, the file "+infile.getName()+" contains only $count words"
            return null
        }

        List<int[]> windows = wordWindows(mode, count, max)
        if (windows == null) {
            println "mode must be 1a, 1m, 1z, 2 or 3"
            return null
        }

        println " count : "+count
        for (int[] window : windows) {
            println "  get from "+window[0]+" to "+window[1]
        }

        InputStream inputData = null
        XMLStreamReader parser = null
        FileOutputStream output = null
        XMLStreamWriter writer = null
        try {
            inputData = infile.toURI().toURL().openStream()
            parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData)
            output = new FileOutputStream(outfile)
            writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")

            writer.writeStartDocument("utf-8", "1.0")

            boolean inText = false   // set once <text> has been opened, never reset
            boolean seenW = false    // set once the first <w> has been opened, never reset
            int wcount = 0           // rank (1-based) of the last <w> opened
            String localname

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        localname = parser.getLocalName()
                        if (localname == "text") {
                            inText = true
                        }

                        if (localname == "w") {
                            seenW = true
                            wcount++
                            // words met before <text> are always copied
                            if (!inText || isKept(windows, wcount)) {
                                startElement(writer, parser.getPrefix(), localname)
                                copyAttributes(parser, writer)
                            }
                        } else {
                            startElement(writer, parser.getPrefix(), localname)

                            // namespace declarations are not copied from the input,
                            // they are re-declared on these two elements
                            if (localname == "teiHeader") {
                                writer.writeAttribute("xmlns:me", "http://www.menota.org/ns/1.0")
                                writer.writeAttribute("xmlns:bfm", "http://bfm.ens-lsh.fr/ns/1.0")
                            }
                            if (localname == "TEI") {
                                writer.writeAttribute("xmlns", "http://www.tei-c.org/ns/1.0")
                            }

                            copyAttributes(parser, writer)
                        }
                        break

                    case XMLStreamConstants.END_ELEMENT:
                        if (parser.getLocalName() == "w") {
                            if (!inText || isKept(windows, wcount)) {
                                writer.writeEndElement()
                                // separates words with a comment rather than with text
                                writer.writeComment("\n")
                            }
                        } else {
                            writer.writeEndElement()
                            writer.writeCharacters("\n")
                        }
                        break

                    case XMLStreamConstants.CHARACTERS:
                        // Before the first word all text is copied; afterwards text follows
                        // the fate of the last word opened (inside or outside the windows).
                        if (!seenW || !inText || isKept(windows, wcount)) {
                            writer.writeCharacters(parser.getText().trim())
                        }
                        break
                }
            }
            writer.flush()
        } finally {
            // release everything even when parsing or writing failed
            closeQuietly(writer)
            closeQuietly(output)
            closeQuietly(parser)
            closeQuietly(inputData)
        }
        return null
    }

    /**
     * Computes the inclusive, 1-based word-rank windows to keep for a mode.
     * Each window holds exactly {@code part} words, and the tail window ends on the
     * last word of the file.
     *
     * @param mode the extraction mode
     * @param count the number of words of the file
     * @param max the total number of words to keep
     * @return the windows as {@code [from, to]} pairs, or {@code null} if the mode is unknown
     */
    private static List<int[]> wordWindows(String mode, int count, int max)
    {
        int part
        switch (mode) {
            case "3":
                part = max.intdiv(3)
                break
            case "2":
                part = max.intdiv(2)
                break
            case "1a":
            case "1m":
            case "1z":
                part = max
                break
            default:
                return null
        }

        int[] head = [1, part] as int[]
        int middleStart = (count - part).intdiv(2) + 1
        int[] middle = [middleStart, middleStart + part - 1] as int[]
        int[] tail = [count - part + 1, count] as int[]

        switch (mode) {
            case "3":
                return [head, middle, tail]
            case "2":
                return [head, tail]
            case "1a":
                return [head]
            case "1m":
                return [middle]
            default: // "1z"
                return [tail]
        }
    }

    /** Tells whether the word of the given rank lies inside one of the windows. */
    private static boolean isKept(List<int[]> windows, int rank)
    {
        for (int[] window : windows) {
            if (rank >= window[0] && rank <= window[1]) {
                return true
            }
        }
        return false
    }

    /** Opens an element on the writer, using the qualified name when a prefix is present. */
    private static void startElement(XMLStreamWriter writer, String prefix, String localname)
    {
        if (prefix != null && prefix.length() > 0) {
            writer.writeStartElement(prefix+":"+localname)
        } else {
            writer.writeStartElement(localname)
        }
    }

    /** Copies the attributes of the parser's current start tag to the writer. */
    private static void copyAttributes(XMLStreamReader parser, XMLStreamWriter writer)
    {
        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
            String attPrefix = parser.getAttributePrefix(i)
            // treat a null prefix like an empty one so that "null:name" is never written
            if (attPrefix != null && attPrefix.length() > 0) {
                writer.writeAttribute(attPrefix+":"+parser.getAttributeLocalName(i), parser.getAttributeValue(i))
            } else {
                writer.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i))
            }
        }
    }

    /** Closes a stream, parser or writer; a failure to close is reported, not thrown. */
    private static void closeQuietly(Object resource)
    {
        if (resource == null) {
            return
        }
        try {
            resource.close()
        } catch (Exception e) {
            println "could not close "+resource+": "+e
        }
    }

    /**
     * Counts the {@code <w>} elements of a file, wherever they appear in the document.
     *
     * @param infile the XML file to read
     * @return the number of {@code <w>} start tags found
     */
    public int countW(File infile)
    {
        InputStream inputData = infile.toURI().toURL().openStream()
        XMLStreamReader parser = null
        try {
            parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData)

            int count = 0
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                if (event == XMLStreamConstants.START_ELEMENT && parser.getLocalName() == "w") {
                    count++
                }
            }
            return count
        } finally {
            closeQuietly(parser)
            closeQuietly(inputData)
        }
    }

    /**
     * Runs the extraction on the files of {@code ~/xml/extract/} that are listed in
     * {@code ~/xml/extract/maxfilemode}; results go to {@code ~/xml/extract/results/}.
     *
     * {@code maxfilemode} holds one tab-separated line per file:
     * <pre>
     * filename1.xml	3	45000
     * filename2.xml	1a	15000
     * filename3.xml	1m	15000
     * filename4.xml	1z	15000
     * filename5.xml	2	22500
     * </pre>
     * Lines that do not have exactly three columns are ignored.
     *
     * @param args the arguments (unused)
     */
    public static void main(String[] args)
    {
        String userDir = System.getProperty("user.home")

        File directory = new File(userDir+"/xml/extract/")
        File maxfilemode = new File(userDir+"/xml/extract/maxfilemode")
        if (!maxfilemode.isFile()) {
            println "can't find the mode file "+maxfilemode
            return
        }

        File outdir = new File(userDir+"/xml/extract/", "results")
        outdir.mkdirs()

        // file name -> "mode/max"
        Map<String, String> maxperfile = new HashMap<String, String>()
        maxfilemode.eachLine { String line ->
            String[] split = line.split("\t")
            if (split.length == 3) {
                maxperfile.put(split[0], split[1]+"/"+split[2])
            }
        }
        println maxperfile

        def files = directory.listFiles(IOUtils.HIDDENFILE_FILTER)
        if (files == null) {
            println "can't list the files of "+directory
            return
        }
        for (File infile : files) {
            String modemax = maxperfile.get(infile.getName())
            if (modemax != null) {
                File outfile = new File(outdir, infile.getName())
                new WExtractWithMode().process(infile, outfile, modemax)
            }
        }
    }
}