Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / ParseAndInject.groovy @ 479

History | View | Annotate | Download (7.6 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
// 
38
// 
39
// 
40
// $LastChangedDate: 2015-12-17 12:11:39 +0100 (jeu. 17 déc. 2015) $
41
// $LastChangedRevision: 3087 $
42
// $LastChangedBy: mdecorde $ 
43
//
44
package org.txm.scripts;
45

    
46
import java.io.BufferedReader;
47
import java.io.File;
48
import java.io.FileNotFoundException;
49
import java.io.FileOutputStream;
50
import java.io.FileReader;
51
import java.io.IOException;
52
import java.io.InputStream;
53
import java.io.OutputStreamWriter;
54
import java.io.Reader;
55
import java.io.StreamTokenizer;
56
import java.io.UnsupportedEncodingException;
57
import java.io.Writer;
58

    
59
import javax.xml.stream.XMLInputFactory;
60
import javax.xml.stream.XMLOutputFactory;
61
import javax.xml.stream.XMLStreamConstants;
62
import javax.xml.stream.XMLStreamException;
63
import javax.xml.stream.XMLStreamReader;
64
import javax.xml.stream.XMLStreamWriter;
65

    
66
// TODO: Auto-generated Javadoc
67
/**
 * Reads a list of words from a plain text file and injects them, in order,
 * as the text content of the {@code w} elements of an XML file.
 *
 * The words of the text file are separated by single spaces. Inside a word,
 * the text found between {@code ((} and {@code ))} is written inside an
 * {@code ex} element; such a span may start in one word and end in a later one.
 *
 * @author mdecorde
 */
class ParseAndInject {

	/** The StAX writer producing the result document. */
	XMLStreamWriter writer;

	/** The stream of the result file wrapped by {@link #writer}. */
	def output;

	/** The lang (not read or written by the code of this class). */
	String lang;

	/** The type (not read or written by the code of this class). */
	String type;

	/** The reader of the plain text word file. */
	BufferedReader brutreader;

	/** The last line read from {@link #brutreader}. */
	String line;

	/** The index, in {@link #split}, of the next word to return. */
	int current;

	/** The words of the current line, or null when a new line must be read. */
	String[] split;

	/** The word being injected; consumed piece by piece by {@link #processToken()}. */
	String token;

	/** True while the text being written is inside a {@code ((...))} span. */
	boolean isEx;

	/** True while the XML reader is inside a {@code w} element. */
	boolean isW;

	/**
	 * Starts the processing: copies the elements, attributes and text of
	 * {@code xmlfile} to {@code outfile}, replacing the text of each
	 * {@code w} element with the next word of {@code brutfile}.
	 *
	 * Only start tags, end tags and character data are copied; elements and
	 * attributes are written with their local name only.
	 * NOTE(review): prefixes and namespace declarations are therefore not
	 * reproduced in the result - confirm this is acceptable for the inputs.
	 *
	 * When a word differs from the text it replaces by more than the length
	 * of the {@code ((} and {@code ))} markers it contains, a warning comment
	 * is written before the word.
	 *
	 * If the word file runs out of words, an error is printed on stderr and
	 * the method returns, leaving a truncated result file.
	 *
	 * @param brutfile the plain text file containing the words
	 * @param xmlfile the XML file containing the w elements
	 * @param outfile the result file
	 * @throws Exception if a file cannot be opened, read, parsed or written
	 */
	public void run(File brutfile, File xmlfile,  File outfile) throws Exception
	{
		// start from a clean state so that an instance can be used for several files
		writer = null;
		output = null;
		brutreader = null;
		split = null;
		current = 0;
		isW = false;
		isEx = false;

		String localname;
		InputStream inputData = null;
		XMLStreamReader parser = null;

		try {
			// create XML writer
			XMLOutputFactory factory = XMLOutputFactory.newInstance();
			output = new FileOutputStream(outfile);
			writer = factory.createXMLStreamWriter(output, "UTF-8");

			// create XML reader; coalescing makes the text of a w element arrive
			// as one CHARACTERS event, so that it consumes exactly one word
			inputData = xmlfile.toURI().toURL().openStream();
			XMLInputFactory inputfactory = XMLInputFactory.newInstance();
			inputfactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
			parser = inputfactory.createXMLStreamReader(inputData);

			// create the word reader
			// NOTE(review): FileReader decodes with the platform default charset
			// while the result is written in UTF-8 - confirm the encoding of the word file
			brutreader = new BufferedReader(new FileReader(brutfile));

			writer.writeStartDocument("UTF-8","1.0");

			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
			{
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						localname = parser.getLocalName();
						if ("w".equals(localname)) {
							isW = true;
						}
						writer.writeStartElement(localname);
						for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
							writer.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						localname = parser.getLocalName();
						if ("w".equals(localname)) {
							isW = false;
						}
						writer.writeEndElement();
						break;

					case XMLStreamConstants.CHARACTERS:
						if (isW) {
							token = this.getNextToken();
							if (token == null) {
								System.err.println("error getting next word");
								return; // the finally block flushes and closes the files
							}
							String text = parser.getText();
							int dist = Distance.LD(token, text); // check integrity
							// the (( and )) markers are not part of the XML text:
							// tolerate their length in the distance
							int max = 0;
							if (token.contains("((")) {
								max += 2;
							}
							if (token.contains("))")) {
								max += 2;
							}
							if (dist > max) {
								String warning = "Warning !! Distance($token, $text) > $max";
								// "--" is not allowed inside an XML comment
								writer.writeComment(warning.replaceAll("-(?=-)", "- "));
							}

							processToken();
						} else {
							writer.writeCharacters(parser.getText());
						}
						break;
				}
			}

			writer.writeEndDocument();
			writer.flush();
		} finally {
			// release every resource, also after the early return or an exception
			if (writer != null) {
				try {
					writer.flush();
					writer.close();
				} catch (Exception e) {
					System.err.println("Error while closing the XML writer: " + e);
				}
			}
			closeQuietly(output);
			closeQuietly(parser);
			closeQuietly(inputData);
			closeQuietly(brutreader);
		}
	}

	/**
	 * Calls close() on the given resource, reporting a failure on stderr
	 * instead of throwing, so that the remaining resources still get closed.
	 *
	 * @param resource the object to close, may be null
	 */
	private static void closeQuietly(Object resource)
	{
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (Exception e) {
			System.err.println("Error while closing " + resource + ": " + e);
		}
	}

	/**
	 * Writes the current {@link #token} with {@link #writer}.
	 *
	 * The text outside {@code ((...))} is written as character data and the
	 * text inside is written in {@code ex} elements. {@link #isEx} is switched
	 * each time a marker is consumed and keeps its value between calls. When
	 * the markers are found in an unexpected order, an error is printed on
	 * stderr and the rest of the token is not written.
	 */
	public void processToken()
	{
		// each turn of the loop consumes the token up to its next marker
		while (token != null && token.length() > 0) {
			int idxopen = token.indexOf("((");
			int idxclose = token.indexOf("))");

			if (isEx) {
				if (idxopen != -1 && idxopen < idxclose) { // (( found while already inside ((
					System.err.println("Error: isEx $isEx token: $token : $idxopen < $idxclose");
					return;
				}
				if (idxclose == -1) { // the span continues in the next token
					writer.writeStartElement("ex");
					writer.writeCharacters(token);
					writer.writeEndElement();
					return;
				}

				writer.writeStartElement("ex");
				writer.writeCharacters(token.substring(0, idxclose));
				writer.writeEndElement();
				isEx = false;
				token = token.substring(idxclose + 2);
			} else {
				if (idxclose != -1 && idxclose < idxopen) { // )) found before ((
					System.err.println("Error: isEx $isEx token: $token : $idxclose < $idxopen");
					return;
				}
				if (idxopen == -1) { // no marker left
					writer.writeCharacters(token);
					return;
				}

				writer.writeCharacters(token.substring(0, idxopen));
				isEx = true;
				token = token.substring(idxopen + 2);
			}
		}
	}

	/**
	 * Gets the next word of the word file.
	 *
	 * Lines are split on single spaces; empty pieces (blank lines, repeated,
	 * leading or trailing spaces) are skipped since a w element cannot
	 * receive an empty word.
	 *
	 * @return the next non-empty word, or null when the end of the file is reached
	 * @throws IOException if the word file cannot be read
	 */
	public String getNextToken() throws IOException
	{
		while (true) {
			if (split != null) {
				while (current < split.length) {
					String candidate = split[current++];
					if (candidate.length() > 0) {
						return candidate;
					}
				}
				split = null; // line exhausted
			}

			line = brutreader.readLine();
			if (line == null) {
				return null;
			}
			split = line.split(" ");
			current = 0;
		}
	}

	/**
	 * The main method.
	 *
	 * @param args optionally the word file, the XML file and the result file,
	 * in this order; with any other number of arguments the test files of the
	 * "xml/injectEx" directory of the user home are used
	 */
	public static void main(String[] args)
	{
		File brutfile;
		File xmlfile;
		File outfile;
		if (args != null && args.length == 3) {
			brutfile = new File(args[0]);
			xmlfile = new File(args[1]);
			outfile = new File(args[2]);
		} else {
			String home = System.getProperty("user.home");
			xmlfile = new File(home, "xml/injectEx/test.xml");
			brutfile = new File(home, "xml/injectEx/brut.txt");
			outfile = new File(home, "xml/injectEx/test-out.xml");
		}

		try {
			new ParseAndInject().run(brutfile, xmlfile, outfile);
			System.out.println("done");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}
289

    
290

    
291

    
292