// Source: root / tmp / org.txm.core / src / groovy / org / txm / importer / EncodeTEIQuotes.groovy @ 187 (repository browser export, 12.5 kB)
// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
// 
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
// 
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
// 
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
// 
// 
// 
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun., 06 mai 2013) $
// $LastChangedRevision: 2386 $
// $LastChangedBy: mdecorde $ 
//
28
package org.txm.importer;
29

    
30
import java.io.BufferedReader;
31
import java.io.File;
32
import java.io.FileInputStream;
33
import java.io.InputStreamReader;
34

    
35
// TODO: Auto-generated Javadoc
36
/**
 * Post-processes a parsed XML tree in which container elements (for example
 * <p> or <ab>) hold <s> sentences made of <w> words: sentences are split at
 * quote tokens, then runs of quoted sentences are wrapped in <q> elements.
 *
 * A quote token is a <w> element whose "type" attribute equals
 * {@link #quote_type} and whose text equals {@link #quote}.
 *
 * All the work is done from the constructor, which modifies the given nodes
 * in place and prints a few counters on standard output.
 */
public class EncodeTEIQuotes
{
	/** Public property kept for compatibility; the processing methods do not use it. */
	def name;

	/** Number of children of the root walked by the wrapping pass of the last processContainers call. */
	def initsize;

	/** Text of the word that counts as a quote mark. */
	String quote ="\"";

	/** Value of the "type" attribute carried by quote-mark words. */
	String quote_type= "pon"

	/**
	 * Gets the name of a node.
	 *
	 * @param node the xml node
	 * @return the name of the node; when name() is not a String, the result
	 * of its getLocalPart() if that call succeeds, the raw name otherwise
	 */
	String getName(def node)
	{
		def nodeName = node.name();
		// Fast path: a plain String has no getLocalPart(), and asking for it
		// would raise and swallow an exception for every such element.
		if (nodeName instanceof String)
			return nodeName;
		try {
			return nodeName.getLocalPart()
		} catch (Exception e) {
			// might throw an exception depending on the node class
			return nodeName;
		}
	}

	/**
	 * Tells whether a child is an element, i.e. whether it answers name().
	 *
	 * @param n the child to test
	 * @return true if n.name() can be called, false if the method is missing
	 */
	boolean isElem(def n)
	{
		try{ n.name(); return true; }
		catch(MissingMethodException e){ return false;}
	}

	/** True if n is an element named "w". */
	private boolean isWord(def n)
	{
		return isElem(n) && getName(n) == "w"
	}

	/** True if n is a "w" element of type quote_type whose text equals quote. */
	private boolean isQuoteWord(def n)
	{
		return isWord(n) && n.@type == quote_type && n.text().equals(quote)
	}

	/**
	 * Contains open quote.
	 *
	 * Scans the children from the first one: the first <w> met decides the
	 * answer, unless it has the quote type but a different text, in which
	 * case the scan goes on.
	 *
	 * @param selem the node to process
	 * @return true if the first deciding <w> is a quote token
	 */
	boolean containsOpenQuote(Node selem)
	{
		def children = selem.children()
		// NOTE(review): the last child is never inspected (bound is size()-1),
		// presumably so that a sentence-final quote only counts as a closing
		// one; confirm.
		for(int i = 0 ; i < children.size()-1 ; i++)
		{
			def child = children.get(i)
			if(!isWord(child))
				continue
			if(child.@type != quote_type)
				return false; // an ordinary word comes first: no opening quote
			if(child.text().equals(quote))
				return true;
		}
		return false;
	}

	/**
	 * Contains close quote.
	 *
	 * Mirror of containsOpenQuote: scans the children from the last one down
	 * to the second one.
	 *
	 * @param selem the node to process
	 * @return true if the first deciding <w> met from the end is a quote
	 * token; false when selem.w is empty
	 */
	boolean containsCloseQuote(Node selem)
	{
		if(selem.w.size() == 0)
			return false
		def children = selem.children()
		// NOTE(review): the first child is never inspected (loop stops at
		// index 1), presumably so that a sentence-initial quote only counts as
		// an opening one; confirm.
		for(int i = children.size()-1 ; i > 0 ; i--)
		{
			def child = children.get(i)
			if(!isWord(child))
				continue
			if(child.@type != quote_type)
				return false; // an ordinary word comes last: no closing quote
			if(child.text().equals(quote))
				return true;
		}
		return false;
	}

	/**
	 * Next children are words.
	 *
	 * @param children the list of children to scan
	 * @param index the index the scan starts from (inclusive)
	 * @return true if some child at or after index is a <w> whose type
	 * differs from quote_type
	 */
	def boolean nextChildrenAreWords(def children, def index)
	{
		for(int i = index ; i < children.size() ; i++)
		{
			def nextC = children.get(i)
			if(isWord(nextC) && nextC.@type != quote_type)
				return true;
		}
		return false;
	}

	/** Appends to parent a new <q> element whose xml:id is the given number. */
	private Node appendQuote(Node parent, int id)
	{
		Map attrs = [:];
		attrs.put("xml:id",""+id)
		return parent.appendNode("q", attrs)
	}

	/** Appends to parent a copy of the sentence (same name, attributes and value). */
	private void appendSentence(Node parent, Node selem)
	{
		parent.appendNode(getName(selem), selem.attributes(), selem.value());
	}

	/**
	 * Process containers.
	 *
	 * Pass 1 rebuilds every container found directly under root, cutting
	 * each <s> at quote tokens. Pass 2 rebuilds the containers again,
	 * wrapping quoted sentences in <q> elements; a <note type="auto"> is
	 * added to a <q> when a quote is found unclosed, unopened or unmatched.
	 * Four counters are printed at the end.
	 *
	 * @param root the node whose direct children are examined; modified in place
	 * @param containers the names of the elements that contain <s> elements
	 * @return nothing meaningful (the result of the last println)
	 */
	def processContainers(Node root, containers)
	{
		int countcontainer = 0;
		int countinits = 0;
		int countnews = 0;
		int countinitquote = 0
		int quotecount = 0;
		boolean openq = false;

		// ---- Pass 1: cut sentences at quote tokens ----
		// Each child is popped from the head and pushed back at the tail, so
		// after "initial size" iterations the original order is restored.
		int initsizecontainer = root.children().size()
		for(int k = 0 ; k < initsizecontainer ; k++)
		{
			def node = root.children().get(0);
			root.children().remove(0);
			if(isElem(node) && containers.contains(getName(node)))
			{
				countcontainer++;
				Node pelem = node;
				openq = false;
				int childCount = pelem.children().size()
				for(int j = 0 ; j < childCount ; j++) // for each child of the container
				{
					def subnode = pelem.children().get(0);
					pelem.children().remove(0)
					if(isElem(subnode) && getName(subnode) == "s")
					{
						countinits++; // was declared and printed but never updated
						def selem = subnode;
						def newS = pelem.appendNode(selem.name(), selem.attributes())
						boolean isOpenningQuote = false;
						if(containsOpenQuote(selem))
						{
							// a sentence starting with a quote resets the state,
							// even if a previous quote was left open
							isOpenningQuote = true
							openq = false;
						}
						int tempwponcount = 0 // quote tokens already seen in this sentence
						for(int i = 0 ; i < selem.children().size() ; i++) // move each child to the current S
						{
							def child = selem.children().get(i)
							if(isQuoteWord(child))
							{
								countinitquote++
								if(openq)
								{
									// closing quote: it ends the current S
									newS.children().add(child)
									if(nextChildrenAreWords(selem.children(), i))
									{
										newS = pelem.appendNode(selem.name(), selem.attributes())
									}
									openq = false;
								}
								else
								{
									if(tempwponcount != 0) // not the first quote token of the sentence
									{
										if(nextChildrenAreWords(selem.children(), i)) // words follow: start a new S
										{
											newS = pelem.appendNode(selem.name(), selem.attributes())
											openq = true
										}
									}
									else // first quote token of the sentence
									{
										if(!isOpenningQuote)
										{
											if(nextChildrenAreWords(selem.children(), i)) // words follow: start a new S
											{
												newS = pelem.appendNode(selem.name(), selem.attributes())
											}
										}
										openq = true
									}
									// opening quote: it starts the (possibly new) S
									newS.children().add(child)
								}
								tempwponcount++;
							}
							else
							{
								newS.children().add(child)
							}
						}
					}
					else
						pelem.children().add(subnode)
				}
			}
			root.children().add(node)
		}

		// From here on, sentences are meant to have one of these shapes:
		// s q wwww s
		// s q wwww q s
		// s wwww q s

		// ---- Pass 2: wrap sentences in q tags ----
		initsize = root.children().size()
		for(int j = 0 ; j < initsize ; j++)
		{
			def node = root.children().get(0);
			root.children().remove(0)
			if(isElem(node) && containers.contains(getName(node)))
			{
				Node pelem = node
				// the container is replaced by a fresh one appended at the tail
				def newp = root.appendNode(getName(pelem), pelem.attributes());
				openq = false;
				def children = pelem.children()
				Node qelem;
				// untyped loop variable: children are tested with isElem below,
				// so they are not all expected to be Node instances
				for(def subnode : children)
				{
					if(isElem(subnode) && (getName(subnode) == "s"))
					{
						countnews++;
						Node selem = subnode;
						if(containsOpenQuote(selem))
						{
							if(openq)
							{
								// previous quote never closed: flag it, then open a new one
								qelem.appendNode("note",[type:"auto"]).setValue("unclosed quote")
							}
							qelem = appendQuote(newp, ++quotecount)
							// the sentence keeps its own name (one branch used the container's name)
							appendSentence(qelem, selem)
							// stays open unless the same sentence also closes the quote
							openq = !containsCloseQuote(selem)
						}
						else if(containsCloseQuote(selem))
						{
							if(openq)
							{
								// last sentence of the running quote
								appendSentence(qelem, selem)
								openq = false;
							}
							else
							{
								// closing quote without an opening one
								qelem = appendQuote(newp, ++quotecount)
								appendSentence(qelem, selem)
								qelem.appendNode("note",[type:"auto"]).setValue("unopened quote")
							}
						}
						else // no quote token at either end
						{
							appendSentence(openq ? qelem : newp, selem)
						}
					}
					else
					{
						(openq ? qelem : newp).children().add(subnode)
					}
				}
				if(openq) // quote still open at the end of the container
				{
					qelem.appendNode("note",[type:"auto"]).setValue("unmatched quote")
				}
			}
			else
				root.children().add(node)
		}

		// report counters
		println "processed "+countcontainer+" "+containers
		println "initial number of &quot; "+countinitquote;
		println "created "+(countnews )+" init "+ countinits+" s"
		println "create "+(quotecount)+" quotes elements"
	}

	/**
	 * Instantiates a new encode tei quotes and immediately processes every
	 * given node in place.
	 *
	 * @param nodesToInspect the nodes whose direct children are searched for containers
	 * @param containers the names of the elements that contain <s> elements
	 * @param quote_value the text of the quote-mark word
	 * @param quote_type the "type" attribute value of quote-mark words
	 */
	public EncodeTEIQuotes(List<Node> nodesToInspect, containers, String quote_value, String quote_type)
	{
		this.quote= quote_value;
		this.quote_type= quote_type;
		println "process "+nodesToInspect.size()+" elements with containers "+containers
		// groovy.util.Node, matching both the parameter type and processContainers
		for(Node root : nodesToInspect)
		{
			println(" root : "+root.children().size())
			processContainers(root, containers)
		}
	}

	/**
	 * The main method: encodes the quotes of xml/quote/gormont.xml (under
	 * the user home directory), writes the result to xml/quote/gormont-q.xml
	 * and runs WordCounter on the output for "s" and "q".
	 *
	 * @param args the arguments (unused)
	 */
	public static void main(String[] args)
	{
		List<String> containers = ["p","ab"]; // elements that contain <s>

		File infile = new File(System.getProperty("user.home"), "xml/quote/gormont.xml")
		File outfile = new File(System.getProperty("user.home"), "xml/quote/gormont-q.xml")

		def doc = new XmlParser().parse (infile)
		List<Node> nodesToInspect = doc.text.body

		new org.txm.importer.EncodeTEIQuotes(nodesToInspect, containers, "\"", "pon");

		// copy the doc in "outfile" File
		String encoding = "UTF-8"
		OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outfile) , encoding);
		try
		{
			writer.write("<?xml version=\"1.0\" encoding=\""+encoding+"\"?>\n")
			def pwriter = new PrintWriter(writer, true)
			XmlNodePrinter xmlwriter = new XmlNodePrinter(pwriter)
			xmlwriter.setPreserveWhitespace(false)
			xmlwriter.print(doc)
			pwriter.flush()
		}
		finally
		{
			// release the file even when printing fails
			writer.close()
		}

		// update counts
		if(outfile.exists())
		{
			// finds the text id that was prepended to the original S ids (e.g. s19_12 >> 19)
			String txtid = org.txm.importer.WordCounter.findTextId(infile, "s");
			new org.txm.importer.WordCounter(outfile, "s", txtid);
			new org.txm.importer.WordCounter(outfile, "q", txtid);
		}
	}
}