Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / WExtract.groovy @ 1688

History | View | Annotate | Download (8.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2015-12-17 12:11:39 +0100 (jeu. 17 déc. 2015) $
25
// $LastChangedRevision: 3087 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts.importer
29

    
30
import javax.xml.parsers.DocumentBuilder;
31
import javax.xml.parsers.DocumentBuilderFactory;
32
import javax.xml.parsers.ParserConfigurationException;
33
import javax.xml.transform.OutputKeys;
34
import javax.xml.transform.Result;
35
import javax.xml.transform.Source;
36
import javax.xml.transform.Transformer;
37
import javax.xml.transform.TransformerFactory;
38
import javax.xml.transform.dom.DOMSource;
39
import javax.xml.transform.stream.StreamResult;
40

    
41
import org.txm.utils.io.IOUtils
42
import org.w3c.dom.Document;
43
import org.w3c.dom.Element;
44
import org.w3c.dom.NodeList;
45
import org.xml.sax.SAXException;
46

    
47
import javax.xml.stream.*;
48
import java.io.File;
49
import java.net.URL;
50

    
51
// TODO: Auto-generated Javadoc
52
/**
53
 * Extracts a sample of w (word) elements from a TEI file.
54
 * Work in progress: this tool is not finished.
55
 *
56
 * @author mdecorde
57
 */
58
class WExtract 
59
{
60
        
61
        /**
         * Writes to <code>outfile</code> a copy of <code>infile</code> in which only a
         * sample of the <code>w</code> elements found inside <code>text</code> is kept.
         *
         * The words are first counted with {@link #countW(File)}. When the file holds
         * fewer than <code>max</code> words a message is printed and nothing is written.
         * Otherwise three windows of 0-based word indexes are kept, with
         * <code>tier = max/3</code>: <code>[0, tier]</code>, a window centred on
         * <code>count/2</code> and <code>[count-tier, count-1]</code>.
         *
         * Every element other than <code>w</code> is always copied. Character data is
         * trimmed, and once a first word has been met it is only copied while the most
         * recently opened word is a kept one. Namespace declarations are not read from
         * the source: fixed ones are written on <code>TEI</code> and <code>teiHeader</code>.
         *
         * @param infile the TEI XML file to sample
         * @param outfile the XML file to create
         * @param max the approximate number of words to keep
         * @return no meaningful value
         */
        public process(File infile, File outfile, int max)
        {
                println "Process "+infile.getName()+", keep $max words"
                int count = this.countW(infile);
                if(count < max)
                {
                        println "can't extract $max words, the file "+infile.getName()+" contains only $count words"
                        return;
                }

                // Groovy's "/" on integers is a decimal division; the result is only
                // truncated when it is assigned to an int, hence the expressions below
                // are kept exactly as they were.
                int tier = max/3;
                int from1 = 0
                int to1 = tier;
                int from2 = (count/2) - (tier/2);
                int to2 =(count/2) + (tier/2);
                int from3 = count -tier;
                int to3= count-1;

                println " count : "+count
                println "  get from "+from1+" to "+to1
                println "  get from "+from2+" to "+to2
                println "  get from "+from3+" to "+to3

                // Tells whether the word at the given 0-based index belongs to a window.
                Closure<Boolean> keepWord = { int index ->
                        (index >= from1 && index <= to1) ||
                        (index >= from2 && index <= to2) ||
                        (index >= from3 && index <= to3)
                }

                // Each resource is released in its own finally block so that a parse or
                // write error cannot leave a stream open.
                InputStream inputData = infile.toURI().toURL().openStream();
                try
                {
                        XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
                        try
                        {
                                FileOutputStream output = new FileOutputStream(outfile);
                                try
                                {
                                        XMLStreamWriter writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8");
                                        try
                                        {
                                                writer.writeStartDocument("utf-8", "1.0");
                                                copySample(parser, writer, keepWord);
                                                writer.flush();
                                        }
                                        finally
                                        {
                                                writer.close();
                                        }
                                }
                                finally
                                {
                                        output.close();
                                }
                        }
                        finally
                        {
                                parser.close();
                        }
                }
                finally
                {
                        inputData.close();
                }
        }

        /**
         * Copies the events read from <code>parser</code> to <code>writer</code>,
         * dropping the <code>w</code> elements of <code>text</code> rejected by
         * <code>keepWord</code>.
         *
         * @param parser the reader positioned at the start of the document
         * @param writer the writer receiving the sample
         * @param keepWord closure called with the 0-based index of a word, returning
         *        true when that word must be kept
         */
        private static void copySample(XMLStreamReader parser, XMLStreamWriter writer, Closure keepWord)
        {
                boolean inText = false;
                int wordsSeen = 0;
                String localname;

                // The current word is the last one opened; its 0-based index is
                // wordsSeen - 1. Words met before <text> are always kept.
                Closure currentWordKept = { !inText || keepWord.call(wordsSeen - 1) }

                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                {
                        switch (event)
                        {
                                case XMLStreamConstants.START_ELEMENT:
                                        localname = parser.getLocalName();
                                        if(localname == "text")
                                        {
                                                inText = true;
                                        }

                                        if(localname == "w")
                                        {
                                                wordsSeen++;
                                                if(currentWordKept())
                                                {
                                                        writer.writeStartElement(qualifiedName(parser.getPrefix(), localname));
                                                        copyAttributes(parser, writer);
                                                }
                                        }
                                        else
                                        {
                                                writer.writeStartElement(qualifiedName(parser.getPrefix(), localname));

                                                // namespace declarations are hard-coded, they are not
                                                // copied from the source document
                                                if(localname == "teiHeader")
                                                {
                                                        writer.writeAttribute("xmlns:me", "http://www.menota.org/ns/1.0");
                                                        writer.writeAttribute("xmlns:bfm", "http://bfm.ens-lsh.fr/ns/1.0");
                                                }
                                                if(localname == "TEI")
                                                {
                                                        writer.writeAttribute("xmlns","http://www.tei-c.org/ns/1.0");
                                                }

                                                copyAttributes(parser, writer);
                                        }
                                        break;

                                case XMLStreamConstants.END_ELEMENT:
                                        localname = parser.getLocalName();
                                        if(localname == "w")
                                        {
                                                if(currentWordKept())
                                                {
                                                        writer.writeEndElement();
                                                        // a comment holding a line break: the output gets one
                                                        // word per line without adding any text content
                                                        writer.writeComment("\n");
                                                }
                                        }
                                        else
                                        {
                                                writer.writeEndElement();
                                                writer.writeCharacters("\n");
                                        }
                                        break;

                                case XMLStreamConstants.CHARACTERS:
                                        // before the first word all the text is copied; afterwards
                                        // text is only copied while the current word is a kept one
                                        if(wordsSeen == 0 || currentWordKept())
                                        {
                                                writer.writeCharacters(parser.getText().trim());
                                        }
                                        break;
                        }
                }
        }

        /**
         * Copies all the attributes of the current start element of
         * <code>parser</code> to the element being written by <code>writer</code>.
         *
         * @param parser the reader positioned on a start element
         * @param writer the writer whose start element was just written
         */
        private static void copyAttributes(XMLStreamReader parser, XMLStreamWriter writer)
        {
                for(int i = 0 ; i < parser.getAttributeCount() ; i++)
                {
                        String name = qualifiedName(parser.getAttributePrefix(i), parser.getAttributeLocalName(i));
                        writer.writeAttribute(name, parser.getAttributeValue(i));
                }
        }

        /**
         * Builds the name written for an element or an attribute.
         *
         * @param prefix the namespace prefix, may be null or empty
         * @param localname the local name
         * @return <code>prefix:localname</code>, or <code>localname</code> alone when
         *         there is no prefix
         */
        private static String qualifiedName(String prefix, String localname)
        {
                if(prefix != null && prefix.length() > 0)
                {
                        return prefix+":"+localname;
                }
                return localname;
        }
261
        
262
        /**
         * Counts the start tags whose local name is <code>w</code> in an XML file.
         * The whole document is scanned, whatever the position of the elements.
         *
         * @param infile the XML file to read
         * @return the number of <code>w</code> elements found
         */
        public int countW(File infile)
        {
                InputStream inputData = infile.toURI().toURL().openStream();
                try
                {
                        XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
                        try
                        {
                                int count = 0;
                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                                {
                                        if(event == XMLStreamConstants.START_ELEMENT && parser.getLocalName() == "w")
                                        {
                                                count++;
                                        }
                                }
                                return count;
                        }
                        finally
                        {
                                // released even when the document is not well-formed
                                parser.close();
                        }
                }
                finally
                {
                        inputData.close();
                }
        }
288
        
289
        /**
         * Samples every file of <code>~/xml/extract</code> that is listed in
         * <code>~/xml/extract/maxfile</code>; the results are written with the same
         * file names in <code>~/xml/extract/results</code>.
         *
         * Each line of maxfile holds a file name and a number of words to keep,
         * separated by one tabulation:
         * <pre>
         * filename1.xml	45000
         * filename2.xml	22500
         * </pre>
         * Lines that do not have exactly two columns are skipped; lines whose second
         * column is not an integer are reported and skipped.
         *
         * @param args the arguments, not used
         */
        public static void main(String[] args)
        {
                String userDir = System.getProperty("user.home");

                File directory = new File(userDir+"/xml/extract/");
                File outdir = new File(userDir+"/xml/extract/","results");
                outdir.mkdir();

                File maxfile = new File(userDir+"/xml/extract/maxfile");
                if(!maxfile.isFile())
                {
                        println "can't find the maxfile "+maxfile.getAbsolutePath()
                        return;
                }

                // file name -> number of words to keep
                HashMap<String, Integer> maxperfile = new HashMap<String, Integer>();
                maxfile.eachLine{ String line ->
                        String[] split = line.split("\t");
                        if(split.length == 2)
                        {
                                try
                                {
                                        maxperfile.put(split[0], Integer.parseInt(split[1].trim()));
                                }
                                catch(NumberFormatException e)
                                {
                                        println "maxfile: line ignored, the number of words is not an integer: "+line
                                }
                        }
                }
                println maxperfile;

                def files = directory.listFiles(IOUtils.HIDDENFILE_FILTER);
                for (File infile : files) {
                        Integer max = maxperfile.get(infile.getName());
                        if (max != null) {
                                File outfile = new File(outdir, infile.getName());
                                new WExtract().process(infile, outfile, max)
                        }
                }
        }
334
}