Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / filters / Tokeniser / TokenizerXml.groovy @ 187

History | View | Annotate | Download (10.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
// 
38
// 
39
// 
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$ 
43
//
44
package filters.Tokeniser;
45

    
46
import javax.xml.parsers.DocumentBuilder;
47
import javax.xml.parsers.DocumentBuilderFactory;
48
import javax.xml.parsers.ParserConfigurationException;
49
import javax.xml.transform.OutputKeys;
50
import javax.xml.transform.Result;
51
import javax.xml.transform.Source;
52
import javax.xml.transform.Transformer;
53
import javax.xml.transform.TransformerFactory;
54
import javax.xml.transform.dom.DOMSource;
55
import javax.xml.transform.stream.StreamResult;
56

    
57
import org.txm.objects.Base;
58
import org.w3c.dom.Document;
59
import org.w3c.dom.Element;
60
import org.w3c.dom.NodeList;
61
import org.xml.sax.SAXException;
62

    
63
import java.io.File;
64
import java.io.FileInputStream;
65
import java.io.InputStreamReader;
66

    
67
import javax.xml.stream.*;
68
import java.net.URL;
69
import org.txm.tokenizer.TokenizerClasses;
70

    
71
// TODO: Auto-generated Javadoc
72
/**
73
 * The Class TokenizerXml.
74
 */
75
public class TokenizerXml {
76
        
77
        /** The word_tags. */
78
        String word_tags = TokenizerClasses.word_tags;
79
        
80
        /** The intraword_tags. */
81
        String intraword_tags = TokenizerClasses.intraword_tags;
82
        
83
        /** The punct_strong. */
84
        def punct_strong = TokenizerClasses.punct_strong;
85
        
86
        /** The punct_all. */
87
        def punct_all = TokenizerClasses.punct_all;
88
        
89
        /** The word_chars. */
90
        def word_chars = TokenizerClasses.word_chars;
91
        
92
        /** The ignorable_tags. */
93
        String ignorable_tags = "note"
94
        
95
        /** The outfile. */
96
        File outfile;
97
        
98
        /** The infile. */
99
        File infile;
100
        
101
        /** The writer. */
102
        XMLStreamWriter writer;
103
        FileOutputStream output;
104
        
105
        /** The parser. */
106
        XMLStreamReader parser
107
        
108
        /** The localname. */
109
        String localname;
110
        
111
        /** The prefix. */
112
        String prefix;
113
        
114
        /**
         * Creates a tokenizer bound to an input file and an output file.
         * The constructor only records the two files; no stream is opened here.
         *
         * @param infile the XML file to read
         * @param outfile the file that will receive the tokenized XML
         */
        public TokenizerXml(File infile, File outfile)
        {
                this.infile = infile;
                this.outfile = outfile;
        }
125
        
126
        /**
         * Caches the local name and the prefix of the element the parser is
         * positioned on. Only start-element and end-element events update the
         * cached values; any other event leaves them untouched.
         *
         * @param event the StAX event type last returned by the parser
         * @return unspecified (the method is dynamically typed)
         */
        public fillInfos(int event)
        {
                boolean elementEvent = (event == XMLStreamConstants.START_ELEMENT
                                || event == XMLStreamConstants.END_ELEMENT);
                if (!elementEvent)
                        return;

                localname = parser.getLocalName();
                prefix = parser.getPrefix();
        }
140
        
141
        /**
         * Copies the current parser event to the writer without tokenizing it.
         *
         * <ul>
         * <li>start element: writes the start tag (using the cached
         * {@code prefix} and {@code localname}), the namespace declarations
         * found on that element, then all of its attributes;</li>
         * <li>end element: closes the current element and writes a newline;</li>
         * <li>characters: writes the text as is;</li>
         * <li>any other event: does nothing.</li>
         * </ul>
         *
         * The start tag is left open, so the caller may still add attributes
         * right after this call.
         *
         * @param event the StAX event type last returned by the parser
         * @return unspecified (the method is dynamically typed)
         */
        public donothing(int event)
        {
                if (event == XMLStreamConstants.START_ELEMENT)
                {
                        // Groovy truth: a null or empty prefix means "no prefix"
                        writer.writeStartElement(prefix ? (prefix+":"+localname) : localname);

                        // Forward the xmlns declarations; without them the prefixes
                        // copied above would be unbound in the output document.
                        for (int i = 0 ; i < parser.getNamespaceCount() ; i++)
                        {
                                String nsprefix = parser.getNamespacePrefix(i);
                                String nsuri = parser.getNamespaceURI(i) ?: "";
                                if (nsprefix)
                                        writer.writeNamespace(nsprefix, nsuri);
                                else
                                        writer.writeDefaultNamespace(nsuri);
                        }

                        for (int i = 0 ; i < parser.getAttributeCount() ; i++)
                        {
                                String attrprefix = parser.getAttributePrefix(i);
                                String attrname = parser.getAttributeLocalName(i);
                                if (attrprefix)
                                        attrname = attrprefix+":"+attrname;
                                writer.writeAttribute(attrname, parser.getAttributeValue(i));
                        }
                }
                else if (event == XMLStreamConstants.END_ELEMENT)
                {
                        writer.writeEndElement();
                        writer.writeCharacters("\n");
                }
                else if (event == XMLStreamConstants.CHARACTERS)
                {
                        writer.writeCharacters(parser.getText());
                }
        }
174
        
175
        /**
         * Reads {@code infile} with a StAX parser and writes a tokenized copy
         * of it to {@code outfile} (UTF-8).
         *
         * <p>Elements and comments are copied through; DTD events and all other
         * event types are dropped. Text is copied unchanged until the first
         * {@code text} start tag; after that, text is split into tokens and each
         * token is wrapped in a {@code w} element numbered {@code w_1},
         * {@code w_2}, ... Text inside elements matching {@code word_tags} or
         * {@code ignorable_tags} is copied unchanged; elements matching
         * {@code word_tags} also get an {@code id} and a {@code type} attribute
         * when they have none.</p>
         *
         * <p>Both files are closed even when parsing or writing fails.</p>
         */
        public void process()
        {
                boolean ignorecontent = true; // text is only tokenized once <text> has been opened
                int wordcount = 0;

                output = new FileOutputStream(outfile)
                try
                {
                        def inputData = infile.toURI().toURL().openStream();
                        try
                        {
                                writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")
                                parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);

                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                                {
                                        fillInfos(event);
                                        if (event == XMLStreamConstants.START_ELEMENT)
                                        {
                                                // Every start tag is copied exactly once. Copying it in
                                                // each branch used to emit <text> twice.
                                                donothing(event);

                                                if (localname == "text")
                                                        ignorecontent = false;

                                                if (localname.matches(word_tags))
                                                {
                                                        // pre-tagged word: number it and keep its content as is
                                                        wordcount++;
                                                        if (parser.getAttributeValue(null, "id") == null)
                                                                writer.writeAttribute("id", "w_"+wordcount);
                                                        if (parser.getAttributeValue(null, "type") == null)
                                                                writer.writeAttribute("type","w");
                                                        ignorecontent = true;
                                                }
                                                else if (localname.matches(ignorable_tags))
                                                {
                                                        ignorecontent = true;
                                                }
                                        }
                                        else if (event == XMLStreamConstants.END_ELEMENT)
                                        {
                                                donothing(event);
                                                // Same patterns as on the start tag, so that whatever
                                                // switched tokenization off also switches it back on.
                                                // NOTE(review): this also re-enables tokenization when such
                                                // an element closes before <text> or inside another
                                                // ignorable element; nesting is not tracked.
                                                if (localname.matches(word_tags) || localname.matches(ignorable_tags))
                                                        ignorecontent = false;
                                        }
                                        else if (event == XMLStreamConstants.CHARACTERS)
                                        {
                                                if (ignorecontent)
                                                {
                                                        donothing(event);
                                                }
                                                else
                                                {
                                                        String text = parser.getText().trim().replace("\t"," ");
                                                        text = text.replaceAll("\\p{C}", ""); // drop control/format characters
                                                        wordcount = writeTokens(text, wordcount);
                                                }
                                        }
                                        else if (event == XMLStreamConstants.COMMENT)
                                        {
                                                writer.writeComment(parser.getText())
                                        }
                                        // DTD and every other event type are intentionally not copied
                                }

                                parser.close()
                                writer.close();
                        }
                        finally
                        {
                                inputData.close();
                        }
                }
                finally
                {
                        output.close();
                }
        }

        /**
         * Splits a text chunk on whitespace and writes every token as one or two
         * {@code w} elements, depending on the shape of the token.
         *
         * @param text the text to tokenize
         * @param wordcount the number of words written so far
         * @return the number of words written once this chunk has been processed
         */
        private int writeTokens(String text, int wordcount)
        {
                int count = wordcount;
                for (String s : tokenize(text))
                {
                        if (s.matches(/\.[^ .]+\./))
                        {
                                // something enclosed in dots, e.g. ".xii."
                                writeWord(++count, "num", s);
                        }
                        else if (s.matches(/$punct_all|['‘’]/))
                        {
                                writeWord(++count, "pon", s);
                        }
                        else if (s.matches(/[\[\(\{].*[\]\)\}]/))
                        {
                                // bracketed token: wrapped in <supplied rend="crochets">
                                count++;
                                writer.writeStartElement("w");
                                writer.writeAttribute("id","w_"+count);
                                writer.writeStartElement("supplied");
                                writer.writeAttribute("rend","crochets");
                                writer.writeCharacters(s);
                                writer.writeEndElement();
                                writer.writeEndElement();
                                writer.writeCharacters("\n");
                        }
                        else if (s.contains("'") || s.contains("’"))
                        {
                                // Elision: cut right after the apostrophe. The second test used
                                // to be contains(""), which is always true and cut every token
                                // after its first character.
                                // NOTE(review): the original character was lost; U+2019 is
                                // assumed, as in the punctuation test above.
                                int sep = s.indexOf("'");
                                if (sep < 0)
                                        sep = s.indexOf("’");

                                writeWord(++count, null, s.substring(0, sep+1));
                                String rest = s.substring(sep+1);
                                if (rest.length() > 0)
                                        writeWord(++count, null, rest);
                        }
                        else if (s.matches(/($punct_all)(.*)/))
                        {
                                // leading punctuation: first character, then the remainder
                                writeWord(++count, "pon", s.substring(0, 1));
                                writeWord(++count, null, s.substring(1));
                        }
                        else if (s.matches(/(.*)($punct_all)/))
                        {
                                // trailing punctuation: everything but the last character, then it
                                writeWord(++count, null, s.substring(0, s.length()-1));
                                writeWord(++count, "pon", s.substring(s.length()-1, s.length()));
                        }
                        else
                        {
                                writeWord(++count, null, s);
                        }
                }
                return count;
        }

        /**
         * Writes one {@code w} element followed by a newline.
         *
         * @param id the word number, written as {@code id="w_<id>"}
         * @param type the value of the {@code type} attribute, or null to omit it
         * @param form the text content of the element
         */
        private void writeWord(int id, String type, String form)
        {
                writer.writeStartElement("w");
                writer.writeAttribute("id","w_"+id);
                if (type != null)
                        writer.writeAttribute("type",type);
                writer.writeCharacters(form);
                writer.writeEndElement();
                writer.writeCharacters("\n");
        }
374
        
375
        /**
         * Cuts a string into tokens on whitespace (space, tab, newline, carriage
         * return, form feed). Consecutive delimiters yield no empty token.
         *
         * @param str the string to cut
         * @return the tokens, in order of appearance
         */
        public List<String> tokenize(String str)
        {
                // same delimiter set as the no-argument tokenize()
                List<String> tokens = str.tokenize(" \t\n\r\f");
                return tokens;
        }
385
        
386
        /**
         * Command-line entry point: tokenizes one XML file.
         *
         * <p>With at least two arguments, the first is the input file and the
         * second the output file. Otherwise the sample files
         * {@code xml/quete/qgraal_cm.xml} and {@code xml/quete/qgraal_cm-out.xml}
         * under the user's home directory are used.</p>
         *
         * @param args optional input and output file paths
         */
        public static void main(String[] args)
        {
                File infile;
                File outfile;
                if (args != null && args.length >= 2)
                {
                        infile = new File(args[0]);
                        outfile = new File(args[1]);
                }
                else
                {
                        // java.io.File does not expand "~", so resolve the home directory explicitly
                        File dir = new File(System.getProperty("user.home"), "xml/quete");
                        infile = new File(dir, "qgraal_cm.xml");
                        outfile = new File(dir, "qgraal_cm-out.xml");
                }
                TokenizerXml tokenizer = new TokenizerXml(infile, outfile)
                tokenizer.process();
        }
398
}