root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / Tokeniser / SimpleTokenizerXml.groovy @ 1688
History | View | Annotate | Download (23.9 kB)
1 | 321 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 321 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 321 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 321 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 321 | mdecorde | //
|
6 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 321 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 321 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 321 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 321 | mdecorde | // later version.
|
11 | 321 | mdecorde | //
|
12 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 321 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 321 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 321 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 321 | mdecorde | // details.
|
17 | 321 | mdecorde | //
|
18 | 321 | mdecorde | // You should have received a copy of the GNU General
|
19 | 321 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 321 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 321 | mdecorde | |
22 | 321 | mdecorde | //
|
23 | 321 | mdecorde | // This file is part of the TXM platform.
|
24 | 321 | mdecorde | //
|
25 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it and/or modify
|
26 | 321 | mdecorde | // it under the terms of the GNU General Public License as published by
|
27 | 321 | mdecorde | // the Free Software Foundation, either version 3 of the License, or
|
28 | 321 | mdecorde | // (at your option) any later version.
|
29 | 321 | mdecorde | //
|
30 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be useful,
|
31 | 321 | mdecorde | // but WITHOUT ANY WARRANTY; without even the implied warranty of
|
32 | 321 | mdecorde | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
33 | 321 | mdecorde | // GNU General Public License for more details.
|
34 | 321 | mdecorde | //
|
35 | 321 | mdecorde | // You should have received a copy of the GNU General Public License
|
36 | 321 | mdecorde | // along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
|
37 | 321 | mdecorde | //
|
38 | 321 | mdecorde | //
|
39 | 321 | mdecorde | //
|
40 | 321 | mdecorde | // $LastChangedDate:$
|
41 | 321 | mdecorde | // $LastChangedRevision:$
|
42 | 321 | mdecorde | // $LastChangedBy:$
|
43 | 321 | mdecorde | //
|
44 | 1000 | mdecorde | package org.txm.scripts.filters.Tokeniser;
|
45 | 321 | mdecorde | |
46 | 321 | mdecorde | import static groovy.transform.TypeCheckingMode.SKIP |
47 | 321 | mdecorde | import groovy.transform.CompileStatic |
48 | 321 | mdecorde | |
49 | 321 | mdecorde | import java.util.regex.Matcher |
50 | 321 | mdecorde | import java.util.regex.Pattern |
51 | 321 | mdecorde | |
52 | 321 | mdecorde | import javax.xml.stream.* |
53 | 321 | mdecorde | |
54 | 986 | mdecorde | import org.txm.scripts.importer.graal.PersonalNamespaceContext |
55 | 321 | mdecorde | import org.txm.tokenizer.TokenizerClasses |
56 | 321 | mdecorde | |
57 | 321 | mdecorde | @CompileStatic
|
58 | 321 | mdecorde | public class SimpleTokenizerXml { |
59 | 321 | mdecorde | |
/**
 * Regular expression (as text) matching the names of elements that are
 * already words; compiled into reg_word_tags by the constructor.
 */
String word_tags = TokenizerClasses.word_tags;
/** Name of the element written for every word (value taken from TokenizerClasses). */
String word_element_to_create = TokenizerClasses.word_element_to_create;
/** Compiled form of word_tags; process() matches element local names against it. */
Pattern reg_word_tags;
/** Copy of TokenizerClasses.intraword_tags. */
String intraword_tags = TokenizerClasses.intraword_tags;

/** Copy of TokenizerClasses.punct_strong. */
String punct_strong = TokenizerClasses.punct_strong;

/** Copy of TokenizerClasses.punct_all. */
String punct_all = TokenizerClasses.punct_all;

/** Copy of TokenizerClasses.word_chars. */
String word_chars = TokenizerClasses.word_chars;

// Clitic expressions; the constructor selects them from the language code
// and builds regFClitics / regPClitics from them.
String fclitics = null; // default behavior don't manage clitics
String pclitics = null; // default behavior don't manage clitics

/** Expression given to setNote(); compiled into reg_note_content. */
String note_content = null;
String outside_text_tags_keep_content = null // tag and content NOT removed but not tokenized
String outside_text_tags = null // tag and content removed
// When non-null, process() copies events unchanged until an element whose
// local name matches reg_startTag is opened.
String startTag = null
// Compiled forms of the four expressions above; a null pattern disables the
// corresponding test in process() (reg_startTag is only read when startTag is set).
Pattern reg_note_content;
Pattern reg_outside_text_tags_keep_content;
Pattern reg_outside_text_tags;
Pattern reg_startTag;

/** Matches one character of the Unicode punctuation category. */
Pattern reg_punct_other = Pattern.compile("\\p{P}")

/** Enables the debug messages printed by this class. */
public boolean DEBUG = false;

/** File the tokenized XML is written to. */
File outfile;

/** XML file that is read. */
File infile;

/** Language code given to the constructor. */
String lang;

/** Character data collected by process() and tokenized by processWord(). */
StringBuffer buffer;

/** StAX writer on top of the output stream; both are created by process(). */
XMLStreamWriter writer;
BufferedOutputStream output;

/** StAX reader over infile; created by process(). */
XMLStreamReader parser

/** Local name of the current element (set by fillInfos() and donothing()). */
String localname;

/** Namespace prefix of the current element (set by fillInfos()). */
String prefix;
/** Name of infile without its extension; part of the generated word identifiers. */
String filename;

// Token-splitting expressions built by the constructor; a null value means
// that the corresponding rule is disabled.
def regElision = null;
def reg3pts = null;
def regPunct;
def regFClitics = null ;
def regPClitics = null ;
// NOTE(review): no assignment to this field is visible in the constructor - confirm it is still needed.
String whitespaces;
// Both patterns are compiled from TokenizerClasses.whitespaces by the constructor;
// processWord() splits the text with regSplitWhiteSpaces.
Pattern regWhitespaces;
// Applied by processWord() to the buffered text before it is split.
Pattern regLN;
Pattern regCTRL;
Pattern regSplitWhiteSpaces;
129 | 321 | mdecorde | |
/**
 * Creates a tokenizer that uses no clitic rule: delegates to the
 * three-argument constructor with an empty language code, which matches
 * none of the language tests done there.
 *
 * @param infile the XML file to read
 * @param outfile the XML file to write
 */
public SimpleTokenizerXml(File infile, File outfile) {
	this(infile, outfile, "");
}
133 | 321 | mdecorde | |
/**
 * Instantiates a new simple tokenizer xml.
 *
 * Selects the clitic expressions from the language code, copies the
 * tokenizer classes from TokenizerClasses and compiles every pattern used
 * later by the tokenization methods.
 *
 * @param infile the XML file to read; its name without extension is used in the generated word ids
 * @param outfile the XML file to write
 * @param lang language code; only the prefixes "en", "fr", "gl" and "it" select clitic rules, null is accepted
 */
public SimpleTokenizerXml(File infile, File outfile, String lang) {
	this.lang = lang;
	if (lang != null) {
		if (lang.startsWith("en")) {
			fclitics = TokenizerClasses.FClitic_en
		} else if (lang.startsWith("fr")) {
			fclitics = TokenizerClasses.FClitic_fr
			pclitics = TokenizerClasses.PClitic_fr
		} else if (lang.startsWith("gl")) {
			fclitics = TokenizerClasses.FClitic_gl
		} else if (lang.startsWith("it")) {
			pclitics = TokenizerClasses.PClitic_it
		}
	}

	word_tags = TokenizerClasses.word_tags;
	reg_word_tags = Pattern.compile(word_tags);

	intraword_tags = TokenizerClasses.intraword_tags;
	punct_strong = TokenizerClasses.punct_strong;
	punct_all = TokenizerClasses.punct_all;
	word_chars = TokenizerClasses.word_chars;

	this.outfile = outfile;
	this.infile = infile;
	this.filename = infile.getName();
	// strip the extension; a leading dot (index 0) is not an extension separator
	int index = filename.lastIndexOf(".");
	if (index > 0) filename = filename.substring(0, index);

	String strRegElision = TokenizerClasses.regElision;
	if (strRegElision != null && strRegElision.length() > 0) {
		regElision = ~/^([\p{L}-]++${strRegElision}[\p{L}-]++)(.*)$/ // the test must start with the start of string ^
	}
	reg3pts = ~/^(.*?)(\.\.\.)(.*)$/

	String strRegPunct = TokenizerClasses.regPunct;
	if (strRegPunct != null && strRegPunct.length() > 0) {
		regPunct = ~/^(.*?)($strRegPunct)(.*)$/
	}

	if (fclitics != null && fclitics.length() > 0) {
		regFClitics = ~/(.+)($fclitics)$/ // the test must end with the end of string $
	}

	if (pclitics != null && pclitics.length() > 0) {
		// compiled like the other expressions (the '~' operator was missing)
		regPClitics = ~/^($pclitics)(.*)/ // the test must start with the start of string ^
	}

	String strWhitespaces = TokenizerClasses.whitespaces;
	if (strWhitespaces != null && strWhitespaces.length() > 0) {
		regWhitespaces = Pattern.compile(strWhitespaces);
		regSplitWhiteSpaces = Pattern.compile(strWhitespaces);
	}

	// java.util.regex patterns are not delimited by slashes: the former
	// "/\n/" and "/\\p{C}/" only matched text that contained literal '/'
	// characters, so neither replacement ever happened.
	regLN = Pattern.compile("\n");
	// Control characters only, and never whitespace: the pattern is applied
	// before the text is split on whitespace, so removing a tabulation would
	// glue two words together. Format and private-use characters are kept.
	regCTRL = Pattern.compile("[\\p{Cc}&&[^\\s]]");
}
192 | 321 | mdecorde | |
/**
 * Stores the local name and the namespace prefix of the element the
 * parser is positioned on into the localname and prefix fields.
 * Events other than an element start or end leave both fields untouched.
 *
 * @param event the StAX event code returned by the parser
 * @return the value of the last assignment, or null when nothing was read
 */
public fillInfos(int event) {
	boolean onElementBoundary = (event == XMLStreamConstants.START_ELEMENT) ||
			(event == XMLStreamConstants.END_ELEMENT)
	if (!onElementBoundary) {
		return null
	}
	localname = parser.getLocalName()
	prefix = parser.getPrefix()
}
205 | 321 | mdecorde | |
/**
 * Copies the current parser event to the writer without tokenizing it.
 *
 * Element starts are rewritten with their namespace declarations and
 * attributes, element ends and character data are copied; other events
 * are ignored by this method.
 *
 * Special processing for words: when wordid is not null the element is
 * renamed to word_element_to_create and receives an "id" attribute
 * ("w_" + file name + "_" + word count) unless it already has one; an
 * element named word_element_to_create also receives an "n" attribute
 * unless it already has one. Character data read while insideword is set
 * is trimmed and its line breaks and tabulations are replaced by spaces.
 *
 * @param event the StAX event code of the current parser position
 * @param wordid non-null when the element is a word, null otherwise
 * @return nothing meaningful
 */
public donothing(int event, Integer wordid) {
	if (event == XMLStreamConstants.START_ELEMENT) {

		localname = parser.getLocalName();
		// Always read the prefix of the element being copied: this method is
		// also called without a previous fillInfos() call, in which case the
		// prefix field would describe another element (or be null).
		prefix = parser.getPrefix();
		if (wordid != null) localname = word_element_to_create;

		if (prefix != null && prefix.length() > 0) {
			writer.writeStartElement(prefix+":"+localname);
		} else {
			writer.writeStartElement(localname);
		}

		// namespace declarations: an empty prefix is the default namespace
		for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
			String namespace_prefix = parser.getNamespacePrefix(i);
			if ((namespace_prefix != null) && (namespace_prefix.length() > 0)) {
				writer.writeNamespace(namespace_prefix, parser.getNamespaceURI(i));
			} else {
				writer.writeDefaultNamespace(parser.getNamespaceURI(i));
			}
		}

		// attributes are copied as is; remember if "id" and "n" are present
		boolean hasId = false;
		boolean hasN = false;
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
			String attname = parser.getAttributeLocalName(i);
			String attrprefix = parser.getAttributePrefix(i);
			if ("id".equals(attname)) hasId = true;
			if ("n".equals(attname)) hasN = true;

			if (attrprefix != null && attrprefix.length() > 0) {
				writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
			} else {
				writer.writeAttribute(attname, parser.getAttributeValue(i))
			}
		}

		if (wordid != null && !hasId && localname == word_element_to_create) {
			writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
		}

		if (!hasN && localname == word_element_to_create) {
			writer.writeAttribute("n", ""+wordcount);
		}

		writer.writeCharacters("\n");
	} else if (event == XMLStreamConstants.END_ELEMENT) {
		writer.writeEndElement();
		writer.writeCharacters("\n");
	} else if (event == XMLStreamConstants.CHARACTERS) {
		if (insideword) { // ensure there is not \t or \n in the word form value
			writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
		} else {
			writer.writeCharacters(parser.getText());
		}
	}
}
304 | 321 | mdecorde | |
305 | 321 | mdecorde | |
/**
 * Converts a UTF-16 surrogate pair to the code point it encodes.
 * The arguments are not validated.
 *
 * @param highSurrogate the leading (high) surrogate
 * @param lowSurrogate the trailing (low) surrogate
 * @return the code point as an int
 */
public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate) {
	return Character.toCodePoint(highSurrogate, lowSurrogate);
}
315 | 321 | mdecorde | |
/**
 * Tells if a UTF-16 code unit is a high (leading) surrogate, that is a
 * value between 0xD800 and 0xDBFF included.
 *
 * @param character the code unit to test
 * @return true if the code unit is a high surrogate
 */
public boolean isHighSurrogate(char character) {
	return Character.isHighSurrogate(character);
}
324 | 321 | mdecorde | |
325 | 321 | mdecorde | |
326 | 321 | mdecorde | |
/** Number of words met so far; used for the "id" and "n" attributes of the words. */
int wordcount = 0;

/**
 * When true the character data is copied instead of being tokenized.
 * process() resets it at start: true only if a start tag is awaited.
 */
boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag
/** True while the parser is inside an element matching reg_word_tags. */
boolean insideword = false;
/**
 * Reads infile and writes its tokenized version to outfile.
 *
 * Events met before the start tag (when one is set) are copied unchanged.
 * After that, elements matching reg_word_tags are counted and copied as
 * words, elements matching reg_note_content or
 * reg_outside_text_tags_keep_content are copied with their text left
 * untokenized, elements matching reg_outside_text_tags are dropped with
 * their content, and the remaining character data is buffered and handed
 * to processWord().
 *
 * Every failure, including a failure to open one of the files, is
 * reported on the error stream and turned into a false result; the
 * streams opened so far are closed in that case.
 *
 * NOTE(review): reg_startTag is read whenever startTag is not null; make
 * sure both are set together.
 *
 * @return true, if successful
 */
public boolean process() {
	if (!infile.exists()) {
		println "$infile does not exists"
		return false;
	}

	InputStream inputData = null;
	try {
		XMLOutputFactory factory = XMLOutputFactory.newInstance();
		output = new BufferedOutputStream(new FileOutputStream(outfile))
		writer = factory.createXMLStreamWriter(output, "UTF-8")
		writer.setNamespaceContext(new PersonalNamespaceContext());

		inputData = infile.toURI().toURL().openStream();
		XMLInputFactory inputfactory = XMLInputFactory.newInstance();
		parser = inputfactory.createXMLStreamReader(inputData, "UTF-8");

		writer.writeStartDocument("UTF-8","1.0");
		writer.writeCharacters("\n");

		int previousEvent = 0;
		// if no startTag is specified we process from the start
		boolean startProcess = (startTag == null);
		ignorecontent = !startProcess;

		buffer = new StringBuffer();

		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			if (!startProcess) {
				if (event == XMLStreamConstants.START_ELEMENT) {
					if (reg_startTag.matcher(parser.getLocalName()).matches()) {
						startProcess = true
						ignorecontent = false;
					}
				}
				if (!startProcess) {
					// still before the start tag: plain copy
					donothing(event, null);
					continue;
				}
			}

			// the text run is over: tokenize what has been buffered
			if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
				processWord();
				buffer.setLength(0);
			}
			fillInfos(event); // get localname and prefix
			if (event == XMLStreamConstants.START_ELEMENT) {
				localname = parser.getLocalName()
				if (reg_word_tags.matcher(localname).matches()) {
					// pretagged word: its content is not tokenized but it is counted
					wordcount++;
					donothing(event, wordcount);
					insideword = true;
				} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) {
					// write the tag, do not tokenize its content
					donothing(event, null);
					ignorecontent = true;
				} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) {
					// write the tag, do not tokenize its content
					donothing(event, null);
					ignorecontent = true;
				} else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(localname).matches()) {
					// drop the tag and its content: parse until the end of the element is passed
					goToEndOfElement(localname);
				} else {
					donothing(event, null);
				}
			} else if (event == XMLStreamConstants.END_ELEMENT) {
				localname = parser.getLocalName()
				if (reg_word_tags.matcher(localname).matches()) {
					insideword = false;
					writer.writeEndElement();
					writer.writeCharacters("\n");
				} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) {
					ignorecontent = false; // tokenization resumes after the element
					donothing(event, null);
				} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) {
					ignorecontent = false; // tokenization resumes after the element
					donothing(event, null);
				} else {
					donothing(event, null);
				}
			} else if (event == XMLStreamConstants.CHARACTERS) {
				if (ignorecontent || insideword) {
					donothing(event, null);
				} else {
					buffer.append(parser.getText());
					// flush long text runs early, but only when the buffer ends with a space
					if (buffer.length() >= 128 && buffer.charAt(buffer.length()-1) == " ") {
						processWord();
						buffer.setLength(0);
					}
				}
			} else if (event == XMLStreamConstants.COMMENT) {
				writer.writeComment(parser.getText())
			} else if (event == XMLStreamConstants.DTD) {
				// the DTD is not copied
			} else {
				if (DEBUG) println "Warning in $infile: ignore XML event at location "+parser.getLocation()
			}
			previousEvent = event;
		}

		parser.close()
		writer.close();
		output.close();
		inputData.close();
	} catch (Exception e) {
		System.err.println("Error : "+infile);
		e.printStackTrace();
		// Each close is guarded so that one failing close neither skips the
		// others nor replaces the false result by an exception.
		try { if (writer != null) writer.close(); } catch (Exception ignored) { }
		try { if (output != null) output.close(); } catch (Exception ignored) { }
		try { if (parser != null) parser.close(); } catch (Exception ignored) { }
		try { if (inputData != null) inputData.close(); } catch (Exception ignored) { }
		return false;
	}
	return true;
}
467 | 321 | mdecorde | |
/**
 * Sets the elements that process() drops together with their content.
 *
 * @param regexp regular expression tested against the local name of the elements
 */
public void setOutSideTextTags(String regexp) {
	outside_text_tags = regexp
	reg_outside_text_tags = Pattern.compile(regexp)
}
477 | 321 | mdecorde | |
/**
 * Sets the elements that process() writes to the output but whose
 * character data is copied without being tokenized.
 *
 * @param regexp regular expression tested against the local name of the elements
 */
public void setNote(String regexp) {
	note_content = regexp
	reg_note_content = Pattern.compile(regexp)
}
487 | 321 | mdecorde | |
/**
 * Sets the elements that are kept in the output, with their content,
 * but whose content is not tokenized by process().
 *
 * @param regexp regular expression tested against the local name of the elements
 */
public void setOutSideTextTagsAndKeepContent(String regexp) {
	outside_text_tags_keep_content = regexp
	reg_outside_text_tags_keep_content = Pattern.compile(regexp)
}
497 | 321 | mdecorde | |
/**
 * Moves the parser past the end of the element that has just been opened,
 * writing nothing: the element and its whole content are skipped.
 *
 * Only the nesting depth is needed to find the matching end tag, so a
 * counter is used instead of a list of element names.
 *
 * @param name the local name of the element to skip
 */
protected void goToEndOfElement(String name) {
	int depth = 0 // number of descendant elements currently opened
	for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
		if (event == XMLStreamConstants.START_ELEMENT) {
			depth++
		} else if (event == XMLStreamConstants.END_ELEMENT) {
			if (depth == 0 && name == parser.getLocalName()) {
				return // end tag of the skipped element
			}
			depth--
		}
	}
}
515 | 321 | mdecorde | |
public final static String WHITESPACE = " ";
public final static String EMPTY = "";

/**
 * Tokenizes the buffered text.
 *
 * Every match of regLN is replaced by WHITESPACE and every match of
 * regCTRL is removed; the result is then cut with regSplitWhiteSpaces
 * (when it is set) and each piece is given to iterate(). Without
 * regSplitWhiteSpaces the whole text is given to iterate() at once.
 */
protected void processWord() {
	String withoutLineBreaks = regLN.matcher(buffer.toString()).replaceAll(WHITESPACE)
	String cleaned = regCTRL.matcher(withoutLineBreaks).replaceAll(EMPTY)

	if (regSplitWhiteSpaces == null) {
		iterate(cleaned)
		return
	}

	String[] pieces = regSplitWhiteSpaces.split(cleaned)
	for (int i = 0; i < pieces.length; i++) {
		iterate(pieces[i])
	}
}
535 | 321 | mdecorde | |
/**
 * Tokenizes a piece of text: calls standardChecks() again and again on
 * what it returns, until nothing (null or an empty String) is left.
 *
 * @param s the text to tokenize
 * @return null
 */
protected iterate(String s) {
	String remaining = s
	while (remaining) {
		remaining = standardChecks(remaining)
	}
}
548 | 321 | mdecorde | |
549 | 321 | mdecorde | /**
|
550 | 321 | mdecorde | * Standard checks.
|
551 | 321 | mdecorde | *
|
552 | 321 | mdecorde | * @param s the s
|
553 | 321 | mdecorde | * @return the java.lang. object
|
554 | 321 | mdecorde | */
|
555 | 321 | mdecorde | // @CompileStatic(SKIP)
|
public String standardChecks(String s) {
	// The patterns are tried in a fixed order and only the first one found in s
	// is applied. Each branch writes exactly one word element itself; the other
	// captured parts are either handed to iterate() or returned to the caller.
	Matcher m;
	if (fclitics != null && (m = s =~ regFClitics) ) {
		// if (DEBUG) println "CLITIC found: $s ->"+ m
		// group 1 is tokenized first, then group 2 is written as one word
		iterate(m.group(1))
		writeWordElement(m.group(2))
		return "";
	} else if (pclitics != null && (m = s =~ regPClitics) ) {
		if (DEBUG) println "PCLITIC found: $s ->"+ m
		// group 1 is written as one word, then group 2 is tokenized
		writeWordElement(m.group(1))
		iterate(m.group(2))
		return "";
	} else if (regElision != null && (m = s =~ regElision) ) {
		// if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3)
		writeWordElement(m.group(1))
		iterate(m.group(2))
		return "";
	} else if (reg3pts != null && (m = s =~ reg3pts) ) {
		// if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
		iterate(m.group(1))
		// the literal "..." is written, not the matched text
		writeWordElement("...")
		return m.group(3); // left for the caller's loop
	} else if (regPunct != null && (m = s =~ regPunct) ) {
		if(DEBUG){println "PUNCT found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
		iterate(m.group(1));
		writeWordElement(m.group(2))
		return m.group(3); // left for the caller's loop
	} else {
		// if(DEBUG){println "Other found: "+s}
		// nothing matched: the whole String is one word
		writeWordElement(s)
		return "";
	}
}

/**
 * Writes one word element: increments the word counter, opens the element
 * named by word_element_to_create, writes its attributes and the given text,
 * closes the element and appends a line break.
 *
 * @param text the character content of the word element
 */
protected void writeWordElement(String text) {
	wordcount++;
	writer.writeStartElement(word_element_to_create);
	writeWordAttributes();// id
	writer.writeCharacters(text);
	writer.writeEndElement();
	writer.writeCharacters("\n");
}
648 | 321 | mdecorde | |
649 | 321 | mdecorde | /**
|
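650 | 321 | mdecorde | * Writes the "id" and "n" attributes of the word element being written; both are built from the word counter.
|
651 | 321 | mdecorde | *
|
652 | 321 | mdecorde | * @return nothing meaningful (no return type is declared and no value is returned explicitly)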
|
653 | 321 | mdecorde | */
|
protected writeWordAttributes() {
	// The counter is rendered once and reused for both attributes.
	String ordinal = "" + wordcount
	// id has the form w_<filename>_<wordcount>
	writer.writeAttribute("id", "w_" + filename + "_" + ordinal)
	writer.writeAttribute("n", ordinal)
}
658 | 321 | mdecorde | |
/**
 * Stores the start tag and compiles it into the reg_startTag pattern.
 *
 * @param tag the start tag, used as the source of the regular expression
 */
public void setStartTag(String tag)
{
	// The field is assigned before compiling, so it is set even if compile() throws.
	this.startTag = tag
	this.reg_startTag = Pattern.compile(tag)
}
664 | 321 | mdecorde | |
665 | 321 | mdecorde | /**
|
666 | 321 | mdecorde | * Cuts a String into tokens on whitespace.
|
667 | 321 | mdecorde | *
|
668 | 321 | mdecorde | * @param str the String to cut
|
669 | 321 | mdecorde | * @return the list of tokens
|
670 | 321 | mdecorde | */
|
public List<String> tokenize(String str)
{
	// Same delimiter set as the no-argument tokenize(): space, tab, newline,
	// carriage return and form feed.
	List<String> tokens = str.tokenize(" \t\n\r\f")
	return tokens
}
675 | 321 | mdecorde | |
676 | 321 | mdecorde | /**
|
677 | 321 | mdecorde | * The main method.
|
678 | 321 | mdecorde | *
|
679 | 321 | mdecorde | * @param args the arguments
|
680 | 321 | mdecorde | */
|
public static void main(String[] args)
{
	// Developer smoke test: tokenizes baye1.xml from the working copy under
	// <user.home>/xml/tokenizernum and writes baye1-t.xml next to it.
	File SVNDIR = new File(System.getProperty("user.home"), "xml/tokenizernum");

	// Runtime.exec declares no (String, String, String) overload, so the command
	// is passed as an explicit String[] instead of relying on implicit varargs.
	String[] svnUpdate = ["svn", "update", SVNDIR.getAbsolutePath()] as String[]
	Process svn = Runtime.getRuntime().exec(svnUpdate)
	// Wait for the update to finish (forwarding its output) so the input file
	// is not read while the working copy is still being updated.
	svn.waitForProcessOutput(System.out, System.err)

	File inputFile = new File(SVNDIR, "baye1.xml")
	File outputFile = new File(SVNDIR, "baye1-t.xml")
	SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
	tokenizer.DEBUG = true
	tokenizer.setNote("note")
	//tokenizer.setOutSideTextTags("teiHeader")
	tokenizer.setOutSideTextTagsAndKeepContent("teiHeader")
	println outputFile.toString() + " : "+tokenizer.process();

	// FIXME: tests UTF-8 XML ranges

	println ""
	println "Done"
}
755 | 321 | mdecorde | } |