root / tmp / org.txm.core / src / java / org / txm / scripts / importer / EncodeTEIQuotes.groovy @ 2473
History | View | Annotate | Download (12.5 kB)
1 | 1000 | mdecorde | package org.txm.scripts.importer;
|
---|---|---|---|
2 | 881 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
3 | 881 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
4 | 881 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
5 | 881 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
6 | 881 | mdecorde | //
|
7 | 881 | mdecorde | // The TXM platform is free software: you can redistribute it
|
8 | 881 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
9 | 881 | mdecorde | // License as published by the Free Software Foundation,
|
10 | 881 | mdecorde | // either version 2 of the License, or (at your option) any
|
11 | 881 | mdecorde | // later version.
|
12 | 881 | mdecorde | //
|
13 | 881 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
14 | 881 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
15 | 881 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
16 | 881 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
17 | 881 | mdecorde | // details.
|
18 | 881 | mdecorde | //
|
19 | 881 | mdecorde | // You should have received a copy of the GNU General
|
20 | 881 | mdecorde | // Public License along with the TXM platform. If not, see
|
21 | 881 | mdecorde | // http://www.gnu.org/licenses.
|
22 | 881 | mdecorde | //
|
23 | 881 | mdecorde | //
|
24 | 881 | mdecorde | //
|
25 | 881 | mdecorde | // $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun. 06 mai 2013) $
|
26 | 881 | mdecorde | // $LastChangedRevision: 2386 $
|
27 | 881 | mdecorde | // $LastChangedBy: mdecorde $
|
28 | 881 | mdecorde | //
|
29 | 881 | mdecorde | |
30 | 881 | mdecorde | import java.io.BufferedReader; |
31 | 881 | mdecorde | import java.io.File; |
32 | 881 | mdecorde | import java.io.FileInputStream; |
33 | 881 | mdecorde | import java.io.InputStreamReader; |
34 | 881 | mdecorde | |
35 | 881 | mdecorde | // TODO: Auto-generated Javadoc
|
36 | 881 | mdecorde | /**
|
37 | 881 | mdecorde | * Splits TEI sentences at quotation-mark words and wraps quoted sentences in q elements.
|
38 | 881 | mdecorde | */
|
39 | 1000 | mdecorde | class EncodeTEIQuotes |
40 | 881 | mdecorde | { |
41 | 881 | mdecorde | |
/** Untyped name slot; declared without an initial value. */
def name

/** Untyped size slot; declared without an initial value. */
def initsize

/** Text content a word must have to count as a quotation mark; defaults to a straight double quote. */
String quote = "\""

/** Value of the {@code type} attribute a word must carry to count as a quotation mark; defaults to "pon". */
String quote_type = "pon"
53 | 881 | mdecorde | |
/**
 * Returns the local name of an XML node.
 *
 * @param node the XML node; it must expose a {@code name()} method
 * @return the local part when the node name offers {@code getLocalPart()}
 *         (qualified names), otherwise the node name itself
 */
String getName(def node)
{
    def nodeName = node.name()
    // Only qualified names expose getLocalPart(); plain names are returned as they are.
    if (nodeName?.respondsTo("getLocalPart")) {
        return nodeName.getLocalPart()
    }
    return nodeName
}
66 | 881 | mdecorde | |
/**
 * Tells whether a child is an element, i.e. an object exposing a {@code name()} method.
 * Text children and {@code null} are not elements.
 *
 * @param n the child to test; may be null
 * @return true if {@code n} is non-null and responds to {@code name}
 */
boolean isElem(def n)
{
    // Capability check instead of calling name() and catching MissingMethodException.
    return n != null && !n.respondsTo("name").isEmpty()
}
78 | 881 | mdecorde | |
/**
 * Tells whether a sentence starts with a quotation-mark word.
 *
 * Children are scanned from the first one; the last child is never examined
 * (presumably so that a sentence-final quote is not taken for an opening one - TODO confirm).
 * The scan stops at the first {@code w} element whose type differs from {@code quote_type}.
 *
 * @param selem the sentence node to inspect
 * @return true if a {@code w} of type {@code quote_type} whose text equals {@code quote}
 *         is met before any {@code w} of another type
 */
boolean containsOpenQuote(Node selem)
{
    def children = selem.children()
    for (int i = 0; i < children.size() - 1; i++) {
        def child = children.get(i)
        // Text and non-word elements never decide the outcome.
        if (!isElem(child) || getName(child) != "w") {
            continue
        }
        // A word of another type comes first: the sentence does not open with a quote.
        if (child.@type != quote_type) {
            return false
        }
        if (child.text().equals(quote)) {
            return true
        }
    }
    return false
}
110 | 881 | mdecorde | |
/**
 * Tells whether a sentence ends with a quotation-mark word.
 *
 * Children are scanned backwards from the last one; the first child is never examined.
 * The scan stops at the first {@code w} element whose type differs from {@code quote_type}.
 *
 * @param selem the sentence node to inspect
 * @return true if a {@code w} of type {@code quote_type} whose text equals {@code quote}
 *         is met before any {@code w} of another type; false when the sentence has no {@code w} child
 */
boolean containsCloseQuote(Node selem)
{
    if (selem.w.size() == 0) {
        return false
    }
    def children = selem.children()
    // Walk from the end down to the second child.
    for (int i = children.size() - 1; i > 0; i--) {
        def child = children.get(i)
        // Text and non-word elements never decide the outcome.
        if (!isElem(child) || getName(child) != "w") {
            continue
        }
        // A word of another type comes last: the sentence does not close with a quote.
        if (child.@type != quote_type) {
            return false
        }
        if (child.text().equals(quote)) {
            return true
        }
    }
    return false
}
143 | 881 | mdecorde | |
/**
 * Tells whether a word other than a quotation-mark word occurs at or after a position.
 *
 * @param children the child list to scan
 * @param index the position to start scanning from (inclusive)
 * @return true if some {@code w} element whose type differs from {@code quote_type}
 *         is found from {@code index} onwards
 */
def boolean nextChildrenAreWords(def children, def index)
{
    int position = index
    while (position < children.size()) {
        def candidate = children.get(position)
        position++
        if (!isElem(candidate)) {
            continue
        }
        if (getName(candidate) != "w") {
            continue
        }
        if (candidate.@type != quote_type) {
            return true
        }
    }
    return false
}
163 | 881 | mdecorde | |
/**
 * Rewrites the container children of {@code root} so that quoted sentences end up
 * inside {@code q} elements.
 *
 * Pass 1 rebuilds every {@code s} child of each container: its children are moved into
 * a fresh {@code s} (same name and attributes), and a further {@code s} is started around
 * quotation-mark words when other words follow them.
 * Pass 2 re-creates each container and appends its sentences either directly or inside a
 * {@code q} element whose {@code xml:id} is numbered from 1 within this call. A
 * {@code note type="auto"} is added for unclosed, unopened and unmatched quotes.
 * Four report lines are printed at the end.
 *
 * @param root the node whose direct children are processed
 * @param containers the names of the elements that hold {@code s} children
 * @return nothing meaningful (the result of the last {@code println})
 */
def processContainers(Node root, containers)
{
    int countcontainer = 0
    int countinits = 0 // never incremented; only echoed in the final report
    int countnews = 0
    int countinitquote = 0
    int quotecount = 0
    boolean openq = false

    // Pass 1: cut sentences at a quotation-mark word that is neither first nor last.
    // Children are taken from the head of the list and re-appended at the tail, so the
    // original order is restored once the initial number of children has been handled.
    int initsizecontainer = root.children().size()
    for (int k = 0; k < initsizecontainer; k++) {
        def node = root.children().get(0)
        root.children().remove(0)
        if (isElem(node) && containers.contains(getName(node))) {
            countcontainer++
            Node pelem = node
            openq = false
            int initsize = pelem.children().size()
            for (int j = 0; j < initsize; j++) { // for each child of the container
                def subnode = pelem.children().get(0)
                pelem.children().remove(0)
                if (isElem(subnode) && getName(subnode) == "s") { // it is a sentence
                    def selem = subnode
                    def newS = pelem.appendNode(selem.name(), selem.attributes())
                    boolean isOpenningQuote = false
                    if (containsOpenQuote(selem)) {
                        isOpenningQuote = true
                        // A sentence-initial quote always starts afresh, even if the previous one was left open.
                        openq = false
                    }
                    int tempwponcount = 0
                    for (int i = 0; i < selem.children().size(); i++) { // move each child into the current sentence
                        def child = selem.children().get(i)
                        boolean isQuoteWord = isElem(child) && getName(child) == "w" &&
                                child.@type == quote_type && child.text().equals(quote)
                        if (!isQuoteWord) {
                            newS.children().add(child)
                            continue
                        }
                        countinitquote++
                        if (openq) {
                            // Closing quote: it stays in the current sentence.
                            newS.children().add(child)
                            if (nextChildrenAreWords(selem.children(), i)) {
                                newS = pelem.appendNode(selem.name(), selem.attributes())
                            }
                            openq = false
                        } else {
                            if (tempwponcount != 0) { // not the first quote word of this sentence
                                if (nextChildrenAreWords(selem.children(), i)) { // something follows: new sentence
                                    newS = pelem.appendNode(selem.name(), selem.attributes())
                                    openq = true
                                }
                            } else { // first quote word of this sentence
                                if (!isOpenningQuote) {
                                    if (nextChildrenAreWords(selem.children(), i)) { // something follows: new sentence
                                        newS = pelem.appendNode(selem.name(), selem.attributes())
                                    }
                                }
                                openq = true
                            }
                            // Opening quote: it goes into the (possibly new) sentence.
                            newS.children().add(child)
                        }
                        tempwponcount++
                    }
                } else {
                    pelem.children().add(subnode)
                }
            }
        }
        root.children().add(node)
    }

    // From here on, sentences only have one of these shapes:
    // s q wwww s
    // s q wwww q s
    // s wwww q s

    // Pass 2: wrap sentences in q elements.
    int secondPassSize = root.children().size()
    for (int j = 0; j < secondPassSize; j++) {
        def node = root.children().get(0)
        root.children().remove(0)
        if (isElem(node) && containers.contains(getName(node))) {
            Node pelem = node
            // appendNode puts the rebuilt container at the tail of root.
            def newp = root.appendNode(getName(pelem), pelem.attributes())
            openq = false
            def children = pelem.children()
            Node qelem = null
            // Untyped loop variable: children may hold text as well as nodes.
            for (def subnode : children) {
                if (isElem(subnode) && (getName(subnode) == "s")) {
                    countnews++
                    Node selem = subnode
                    if (containsOpenQuote(selem)) {
                        if (openq) {
                            // The previous quote was never closed: flag it before opening a new one.
                            qelem.appendNode("note", [type: "auto"]).setValue("unclosed quote")
                        }
                        openq = true
                        Map attrs = [:]
                        attrs.put("xml:id", "" + (quotecount + 1))
                        qelem = newp.appendNode("q", attrs)
                        quotecount++
                        qelem.appendNode(getName(selem), selem.attributes(), selem.value())
                        if (containsCloseQuote(selem)) { // the sentence was properly delimited
                            openq = false
                        }
                    } else if (containsCloseQuote(selem)) {
                        if (openq) {
                            // Last sentence of the running quote.
                            qelem.appendNode(getName(selem), selem.attributes(), selem.value())
                            openq = false
                        } else {
                            // Closing quote without an opening one.
                            Map attrs = [:]
                            attrs.put("xml:id", "" + (quotecount + 1))
                            qelem = newp.appendNode("q", attrs)
                            qelem.appendNode(getName(selem), selem.attributes(), selem.value())
                            quotecount++
                            qelem.appendNode("note", [type: "auto"]).setValue("unopened quote")
                        }
                    } else { // no quotation-mark word
                        if (openq) {
                            qelem.appendNode(getName(selem), selem.attributes(), selem.value())
                        } else {
                            newp.appendNode(getName(selem), selem.attributes(), selem.value())
                        }
                    }
                } else {
                    if (openq) {
                        qelem.children().add(subnode)
                    } else {
                        newp.children().add(subnode)
                    }
                }
            }
            if (openq) { // a quote is still open at the end of the container
                qelem.appendNode("note", [type: "auto"]).setValue("unmatched quote")
            }
        } else {
            root.children().add(node)
        }
    }

    // Report
    println "processed "+countcontainer+" "+containers
    println "initial number of \" "+countinitquote
    println "created "+(countnews )+" init "+ countinits+" s"
    println "create "+(quotecount)+" quotes elements"
}
396 | 881 | mdecorde | |
/**
 * Creates the encoder and immediately processes every given node.
 *
 * @param nodesToInspect the nodes whose direct children are processed
 * @param containers the names of the elements that hold {@code s} children
 * @param quote_value the text content identifying a quotation-mark word
 * @param quote_type the {@code type} attribute value of quotation-mark words
 */
public EncodeTEIQuotes(List<Node> nodesToInspect, containers, String quote_value, String quote_type)
{
    this.quote = quote_value
    this.quote_type = quote_type
    println "process "+nodesToInspect.size()+" elements with containers "+containers
    // Same node type as the parameter list and as processContainers expects.
    for (Node root : nodesToInspect) {
        println(" root : "+root.children().size())
        processContainers(root, containers)
    }
}
416 | 881 | mdecorde | |
/**
 * Runs the encoder on {@code xml/quote/gormont.xml} under the user's home directory,
 * writes the result to {@code xml/quote/gormont-q.xml}, then recounts the {@code s}
 * and {@code q} elements of the output with {@code WordCounter}.
 *
 * @param args the command-line arguments (not used)
 */
public static void main(String[] args)
{
    List<String> containers = ["p", "ab"] // elements that contain <s>

    File infile = new File(System.getProperty("user.home"), "xml/quote/gormont.xml")
    File outfile = new File(System.getProperty("user.home"), "xml/quote/gormont-q.xml")

    def doc = new XmlParser().parse(infile)
    List<Node> nodesToInspect = doc.text.body
    // nodesToInspect << doc.text.body.div // add more roots here

    new org.txm.scripts.importer.EncodeTEIQuotes(nodesToInspect, containers, "\"", "pon")

    // Copy the document into the output file; withWriter closes the stream even if printing fails.
    String encoding = "UTF-8"
    outfile.withWriter(encoding) { writer ->
        writer.write("<?xml version=\"1.0\" encoding=\""+encoding+"\"?>\n")
        def pwriter = new PrintWriter(writer, true)
        XmlNodePrinter xmlwriter = new XmlNodePrinter(pwriter)
        xmlwriter.setPreserveWhitespace(false)
        xmlwriter.print(doc)
        pwriter.flush()
    }

    // Update counts
    if (outfile.exists()) {
        // Finds the text id that was prepended to the original sentence ids (e.g. s19_12 >> 19).
        String txtid = org.txm.scripts.importer.WordCounter.findTextId(infile, "s")
        new org.txm.scripts.importer.WordCounter(outfile, "s", txtid)
        new org.txm.scripts.importer.WordCounter(outfile, "q", txtid)
    }
}
457 | 881 | mdecorde | } |