Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / src / org / txm / importer / srcmf / SubcorpusDataInject.groovy @ 458

History | View | Annotate | Download (13 kB)

1
#! /usr/bin/groovy
2
package org.txm.importer.srcmf;
3
/*
4
 * To change this template, choose Tools | Templates
5
 * and open the template in the editor.
6
 */
7

    
8
import java.util.logging.Logger
9
import java.util.logging.FileHandler
10
import java.util.logging.SimpleFormatter
11
import java.util.logging.Level
12

    
13

    
14
class SubcorpusDataInject {
15
        // ---- Instance state ----
        /** Text form applied to words; 'prose' unless overridden by the constructor. */
        def textForm = 'prose'
        /** Root node of the parsed XML-TXM document; assigned by the constructor. */
        def TEI
        /** The <w> elements found under <text>; filled in by the constructor. */
        def wNodes = []
        /** The <w> elements found under a <q> element; filled in by the constructor. */
        def wNodesInQ = []
        // ---- XML namespaces used for attribute / element lookups ----
        static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
        static nsTxm = new groovy.xml.Namespace('http://textometrie.org/1.0', 'txm')
        // ---- Logger shared by every instance ----
        static logger = Logger.getLogger('subcorpusDataInjectLog')
        // ---- end of class-level declarations ----
26

    
27
        /**
         * Parses an XML-TXM file and prepares the data used by processMaster
         * and processSubcorpus.
         *
         * In order: attaches the handler to the shared logger, parses the file
         * into TEI, reads the text form from the TEI header (which overrides
         * both the 'prose' default and newTextForm), marks verse line
         * boundaries on <w> nodes when the text form is 'vers' or 'mixte', and
         * builds the wNodes / wNodesInQ lists.
         *
         * @param xmlTxmFile  XML-TXM file to parse
         * @param fh          handler added to the static logger; it is given a
         *                    SimpleFormatter
         * @param newTextForm default text form; ignored when null or empty
         */
        SubcorpusDataInject (
        File xmlTxmFile,
        FileHandler fh,
        String newTextForm
        ) {
                // Reset global values
                if (newTextForm) {
                        textForm = newTextForm
                }
                // Set up logger
                fh.setFormatter(new SimpleFormatter())
                logger.addHandler(fh)
                logger.setLevel(Level.ALL)
                // Parse the XML-TXM file
                logger.log(Level.INFO, "Parsing XML-TXM file (may take a while).")
                TEI = new XmlParser().parse(xmlTxmFile)
                // Get text form from TEI header (if expressed)
                logger.log(Level.INFO, "Reading TEI header to detect text form.")
                def catRefs = TEI.teiHeader.profileDesc.textClass.catRef
                if (catRefs) {
                        // The target is a '#'-prefixed reference; drop the first
                        // character to obtain the category id.
                        def target = catRefs.'@target'[0]
                        def category = null
                        if (target && target.size() > 1) {
                                def catId = target[1..-1]
                                category = TEI.teiHeader.encodingDesc.'**'.category.find {
                                        it.'@id' == catId || it.attribute(nsXml.id) == catId
                                }
                        }
                        def catDescs = category?.catDesc
                        if (catDescs) {
                                textForm = catDescs[0].text()
                                logger.log(Level.INFO, "Found text form '$textForm' in TEI header.")
                        } else {
                                // A dangling or malformed reference used to abort with a
                                // NullPointerException; keep the default instead.
                                logger.log(
                                                Level.WARNING,
                                                "Cannot resolve catRef target '$target' to a category " +
                                                "description, assuming '$textForm'."
                                                )
                        }
                } else {
                        logger.log(Level.INFO, "No text form in TEI header, assuming '$textForm'.")
                }
                // Convert Lbs to Property
                if (textForm == 'vers' || textForm == 'mixte') {
                        logger.log(Level.INFO, "Converting <lb/> tags to a word property for later processing.")
                        // 'pending' is a work list walked depth-first: container
                        // elements are replaced by their element children, in place.
                        // NOTE(review): name().getLocalPart() assumes every element is
                        // namespace-qualified (QName) - confirm for non-TEI input.
                        def pending = TEI.text.body
                        def lbSeen = false
                        def previousWord = null
                        while (pending) {
                                def current = pending.remove(0)
                                def localName = current.name().getLocalPart()
                                if (localName == 'lb') {
                                        // The last non-punctuation word before a <lb/> ends a line.
                                        lbSeen = true
                                        if (previousWord) {
                                                previousWord.'@lb' = 'end'
                                        }
                                } else if (localName == 'w') {
                                        // Punctuation never carries the line-boundary mark.
                                        if (!punctTest(current)) {
                                                if (lbSeen) {
                                                        current.'@lb' = 'start'
                                                }
                                                previousWord = current
                                                lbSeen = false
                                        }
                                } else {
                                        pending.addAll(0, current.children().findAll { it.getClass() == Node })
                                }
                        }
                }
                // Build node lists
                logger.log(Level.INFO, "Building node lists.")
                wNodes = TEI.text.'**'.findAll { it.name().getLocalPart() == 'w' }
                wNodesInQ = TEI.text.'**'.q.'**'.findAll { it.name().getLocalPart() == 'w' }
        }
86

    
87
        /**
         * Command-line entry point.
         *
         * Usage: SubcorpusDataInject.groovy [options] TXM_file.xml TS_file.xml
         * Loads the XML-TXM file, then processes the second file either as a
         * master file (-m) or as a single subcorpus file.
         *
         * @param args command-line arguments
         */
        static main(def args) {
                // Parse command line args
                def outputFile = 'test-out.xml'
                def logName = 'subcorpusDataInjectLog.log'
                def cli = new CliBuilder(
                                usage:'SubcorpusDataInject.groovy [options] TXM_file.xml TS_file.xml'
                                )
                cli.h(longOpt:'help', 'Prints this message')
                cli.m(longOpt:'master', 'Specifies input file is a TS master file.')
                cli.v(longOpt:'verse', 'Sets default text form to verse. Overridden by TEI header.')
                cli.l(longOpt:'logfile', args:1, argName:'logfile.log', 'Use given file for log.')
                cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file')
                println args
                def options = cli.parse(args)
                if (!options) {
                        // parse() returns null on a malformed command line, after
                        // reporting the problem itself.
                        return
                }
                if (options.h) {
                        cli.usage()
                        return
                }
                if (options.arguments().size() != 2) {
                        println "Wrong number of arguments."
                        // usage() prints by itself; wrapping it in println printed "null".
                        cli.usage()
                        return
                }
                // Call XML TXM loader
                def txmFile = new File (options.arguments()[0])
                def fh = new FileHandler(options.logfile ?: logName)
                try {
                        def sdi = new SubcorpusDataInject(txmFile, fh, options.v ? 'vers' : '')
                        // Call process master or process subcorpus
                        def tsFile = new File(options.arguments()[1]).getCanonicalFile()
                        def target = new File (options.output ?: outputFile)
                        if (options.m) {
                                sdi.processMaster(tsFile, target)
                        } else {
                                sdi.processSubcorpus(tsFile, target)
                        }
                } finally {
                        // Flush and release the log file (and its lock file).
                        logger.removeHandler(fh)
                        fh.close()
                }
        }
132

    
133
        /**
         * Runs processSubcorpus on every subcorpus listed in a master file,
         * rewrites each subcorpus reference to the generated "_pos.xml" file and
         * saves the updated master file.
         *
         * Subcorpus files are read from the master file's folder; results are
         * written to the output file's folder.
         *
         * @param masterFile master file listing the subcorpora
         * @param outputFile destination of the rewritten master file
         */
        def processMaster (File masterFile, File outputFile) {
                logger.log(Level.INFO, "Loading corpus master file")
                // Resolve both folders as absolute File objects. The previous code
                // used '' when a file had no parent component, and
                // new File('', child) resolves against the system default
                // directory ("/" on Unix) rather than the working directory.
                def masterFolder = masterFile.getAbsoluteFile().getParentFile()
                def outputFolder = outputFile.getAbsoluteFile().getParentFile()
                def master = new XmlParser().parse(masterFile)
                // Process each subcorpus file
                master.body.subcorpus.each { subcorpus ->
                        def external = subcorpus.'@external'
                        // NOTE(review): [5..-1] presumably strips a "file:" prefix and
                        // [..-5] a ".xml" suffix - confirm against the master file format.
                        def sourceName = external[5..-1]
                        def resultName = "${external[5..-5]}_pos.xml".toString()
                        processSubcorpus(
                                        new File (masterFolder, sourceName),
                                        new File (outputFolder, resultName)
                                        )
                        subcorpus.'@external' = "${external[0..-5]}_pos.xml".toString()
                }
                logger.log(Level.INFO, "Saving new master file ${outputFile.getName()}")

                // XmlUtil.serialize declares UTF-8 in the XML prolog, so write
                // UTF-8 rather than the platform default charset.
                outputFile.withWriter('UTF-8') { writer ->
                        writer << groovy.xml.XmlUtil.serialize(master)
                }
                logger.log(Level.INFO, "Run reheader on this file!")
        }
163

    
164
        /**
         * Injects 'pos', 'form' and 'q' attributes into every terminal (<t>) of
         * a subcorpus file by aligning the terminals with the <w> nodes loaded
         * by the constructor, recalculates 'headpos' on every nonterminal and
         * saves the result.
         *
         * @param subcorpusFile subcorpus file to read
         * @param outputFile    file the enriched subcorpus is written to
         */
        def processSubcorpus (File subcorpusFile, File outputFile) {
                // Parse Subcorpus file
                logger.log(Level.INFO, "Loading subcorpus file ${subcorpusFile.getName()}")
                def TS = new XmlParser().parse(subcorpusFile)
                // Work queue: consumed from the front by the main loop.
                def tNodes = TS.s.graph.terminals.t
                // Independent list over the same nodes, kept intact for id lookups.
                def tNodesFixed = TS.s.graph.terminals.t

                // Build duplicata correspondance list: [duplicata id, id referenced
                // by the 'dupl' secondary edge of the nonterminal owning it].
                logger.log(Level.INFO, "Building duplicata correspondance list")
                def duplIdMatches = tNodes.findAll { it.'@word' == '*' }.collect { dupl ->
                        def duplId = dupl.'@id'
                        def realId = dupl.parent().parent().nonterminals.nt.edge.find {
                                it.'@idref' == duplId
                        }?.parent()?.secedge?.find {
                                it.'@label' == 'dupl'
                        }?.'@idref'
                        [duplId, realId]
                }.findAll {
                        // Duplicatas without a resolvable 'dupl' edge are reported by
                        // the main loop instead of aborting here.
                        it[1] != null
                }
                logger.log(Level.INFO, "${duplIdMatches.size()} duplicatas found.")

                // Add pos & form tags
                logger.log(Level.INFO, "Adding PoS and form tags to each t-node")
                // Get first wNode corresponding to a tNode.
                // The index is compared with >= 0 explicitly: under Groovy truth
                // index 0 is false and -1 ("not found") is true.
                def i = 0
                def wIx = -1
                while (i < tNodes.size()) {
                        // Avoid trying to find "fake" first words, it's very inefficient
                        if (tNodes[i].'@word' != '#') {
                                wIx = wNodes.findIndexOf { idmatch(tNodes[i], it) }
                        }
                        if (wIx >= 0) {
                                break
                        }
                        i++
                }
                assert wIx >= 0, "$i"
                /*
                 * MAIN ITERATOR BEGINS HERE FOR POS INJECTION
                 */
                while (tNodes) {
                        def thisTNode = tNodes.remove(0)
                        def thisTNodeId = thisTNode.'@editionId'
                        // null once every <w> node has been consumed
                        def currentW = (wIx < wNodes.size()) ? wNodes[wIx] : null
                        if (thisTNode.'@word' == '#') {
                                // Not expecting to find these in BFM file, all OK.
                                markUnmatched(thisTNode)
                        } else if (currentW != null && idmatch(thisTNode, currentW)) {
                                // Perfect match
                                injectPos(thisTNode, currentW, wNodesInQ.contains(currentW))
                                wIx++
                        } else if (thisTNode.'@word' == '*') {
                                // Duplicata, use dupl match list to find correct wNode
                                def wNode = null
                                def duplIdMatch = duplIdMatches.find { it[0] == thisTNode.'@id' }
                                if (duplIdMatch) {
                                        def matchTNode = tNodesFixed.find { it.'@id' == duplIdMatch[1] }
                                        assert matchTNode, "$duplIdMatch"
                                        // The wNodes list is very long, so first look within
                                        // +- 10 words of the current position (clamped to the
                                        // list bounds) before scanning the whole list.
                                        def lo = Math.max(0, wIx - 10)
                                        def hi = Math.min(wNodes.size() - 1, wIx + 10)
                                        if (lo <= hi) {
                                                wNode = wNodes[lo..hi].find { idmatch(matchTNode, it) }
                                        }
                                        if (wNode == null) {
                                                wNode = wNodes.find { idmatch(matchTNode, it) }
                                        }
                                }
                                if (wNode != null) {
                                        injectPos(thisTNode, wNode, wNodesInQ.contains(wNode))
                                        // Don't update wIx value!
                                } else {
                                        markUnmatched(thisTNode)
                                        logger.log(
                                                        Level.WARNING,
                                                        "<w/> node matching TS duplicata $thisTNodeId not found."
                                                        )
                                }
                        } else if (currentW != null && punctTest(currentW)) {
                                // not expecting to find these in TS file; try same tNode again.
                                tNodes.add(0, thisTNode)
                                wIx++
                        } else {
                                // Single scan of the whole list (it used to be scanned twice).
                                def found = wNodes.find { idmatch(thisTNode, it) }
                                if (found == null) {
                                        // Check the SRCMF corpus hasn't acquired an extra word
                                        logger.log(
                                                        Level.WARNING,
                                                        "TS word $thisTNodeId (${thisTNode.'@word'}) is " +
                                                        "absent from BFM file."
                                                        )
                                        markUnmatched(thisTNode)
                                } else {
                                        // It doesn't match, it's not a duplicata, it's not BFM
                                        // punctuation, but it IS found somewhere in the BFM corpus.
                                        logger.log(
                                                        Level.WARNING,
                                                        "Discontinuity in TS corpus before word $thisTNodeId " +
                                                        "(${thisTNode.'@word'})."
                                                        )
                                        injectPos(thisTNode, found, wNodesInQ.contains(found))
                                        wIx = wNodes.indexOf(found) + 1
                                }
                        }
                }
                /*
                 * End Main iteration
                 */
                logger.log(Level.INFO, "Recalculating headpos.")
                recalculateHeadpos(TS)
                logger.log(Level.INFO, "Saving new subcorpus file ${outputFile.getName()}")

                // XmlUtil.serialize declares UTF-8 in the XML prolog, so write
                // UTF-8 rather than the platform default charset.
                outputFile.withWriter('UTF-8') { writer ->
                        writer << groovy.xml.XmlUtil.serialize(TS)
                }
        }

        // Marks a terminal for which no <w/> node is available.
        private markUnmatched(tNode) {
                tNode.'@pos' = '--'
                tNode.'@form' = '--'
                tNode.'@q' = '--'
        }

        // Copies the q flag, the text form and the PoS tag of wNode onto tNode.
        private injectPos(tNode, wNode, q) {
                // Q tag
                tNode.'@q' = q ? 'y' : 'n'
                // Form tag: verse applies to 'vers' texts, and to 'mixte' texts when
                // the grandparent element of the word is an <ab>.
                def inVerse = textForm == 'vers' ||
                                (textForm == 'mixte'
                                && wNode.parent().parent().name().getLocalPart() == 'ab')
                if (!inVerse) {
                        tNode.'@form' = 'prose'
                } else if (! wNode.'@lb') {
                        tNode.'@form' = 'vers'
                } else if (wNode.'@lb' == 'start') {
                        tNode.'@form' = 'vers_debut'
                } else if (wNode.'@lb' == 'end') {
                        tNode.'@form' = 'vers_fin'
                }
                // Find PoS tag in BFM file: try txm:ana tags, then interp tags
                def anaList = wNode[nsTxm.ana]
                if (! anaList) {
                        anaList = wNode.interp
                }
                // These lookups must compare with '=='. The earlier '=' assigned the
                // type to the first annotation and therefore always matched it.
                def ofType = { wanted -> anaList.find { it.'@type' == wanted } }
                def posSyn = ofType('#pos_syn')
                def pos = (posSyn == null) ? ofType('#pos') : null
                def fropos = (posSyn == null && pos == null) ? ofType('#fropos') : null
                if (posSyn != null) {
                        tNode.'@pos' = posSyn.text()
                } else if (! anaList && wNode.'@type') {
                        // type attr of wNode
                        tNode.'@pos' = wNode.'@type'
                } else if (pos != null) {
                        tNode.'@pos' = pos.text()
                } else if (fropos != null) {
                        tNode.'@pos' = fropos.text()
                } else {
                        tNode.'@pos' = '--'
                }
        }

        // Sets 'headpos' on every nonterminal from the PoS of the terminals
        // reached through its 'L' edges.
        private recalculateHeadpos(TS) {
                TS.s.graph.nonterminals.nt.each { nt ->
                        def terminals = nt.parent().parent().terminals.t
                        def headEdges = nt.edge.findAll { it.'@label' == 'L' }
                        if (headEdges.size() == 0) {
                                nt.'@headpos' = '--'
                        } else if (headEdges.size() == 1) {
                                nt.'@headpos' = terminals.find {
                                        it.'@id' == headEdges[0].'@idref'
                                }.'@pos'
                        } else {
                                // Several heads: take the first category, in priority order,
                                // carried by any of them. collect is required here; each
                                // returns the edge list itself, so the test never matched.
                                def headPosValues = headEdges.collect { edge ->
                                        terminals.find { it.'@id' == edge.'@idref' }?.'@pos'
                                }
                                def winner = ['NOM', 'PRO', 'ADJ', 'VER', 'ADV', 'CON', 'PRE', 'DET'].find {
                                        headPosValues.contains(it)
                                }
                                if (winner) {
                                        nt.'@headpos' = winner + '?'
                                }
                        }
                        assert nt.'@headpos'
                }
        }
384
        // Tells whether a <w/> node is punctuation: its 'type' attribute, or
        // failing that the text of one of its txm:ana (else interp) children,
        // starts with 'pon' (case-insensitive).
        private punctTest(gNode) {
                def typeAttr = gNode.'@type'
                if (typeAttr) {
                        return typeAttr.toLowerCase().startsWith('pon')
                }
                def looksLikePon = { it.text().toLowerCase().startsWith('pon') }
                def anaNodes = gNode[nsTxm.ana]
                if (anaNodes) {
                        return anaNodes.find(looksLikePon) != null
                }
                def interpNodes = gNode.interp
                if (interpNodes) {
                        return interpNodes.find(looksLikePon) != null
                }
                return false
        }
396

    
397
        /**
         * Matches a TS terminal with a <w/> node on the trailing integer of
         * their ids (tNode 'editionId' against wNode 'id' or xml:id).
         *
         * The word forms are compared as well, but a difference is only logged:
         * the id comparison alone decides the result.
         *
         * @param tNode TS terminal node
         * @param wNode <w/> node of the XML-TXM document
         * @return true when both ids end with the same integer, false otherwise
         */
        private idmatch (tNode, wNode) {
                def wNodeId = wNode.'@id'
                if (! wNodeId) {
                        wNodeId = wNode.attribute(nsXml.id)
                }
                def tNodeId = tNode.'@editionId'
                // Trailing digits, optionally followed by a lowercase/underscore suffix
                def regex = /(\d+)([_a-z]+)?$/
                def m1 = (tNodeId =~ regex)
                def m2 = (wNodeId =~ regex)
                if (! (m1.find() && m2.find())) {
                        logger.log(
                                        Level.WARNING,
                                        "Idmatch routine can't find integer part of one or both of the " +
                                        "paired Ids for nodes $tNode, $wNode"
                                        )
                        return false
                }
                if (m1.group(1).toInteger() != m2.group(1).toInteger()) {
                        return false
                }

                // Prefer the txm:form child; fall back to the node's own text when
                // there is none (old-style XML-BFM), which used to raise a
                // NullPointerException in getDeepText.
                def formNode = wNode[nsTxm.form][0]
                def wNodeForm = (formNode != null) ? getDeepText(formNode) : wNode.text()
                def tNodeForm = tNode.'@word'
                if (tNodeForm != wNodeForm) {
                        logger.log(Level.WARNING,
                                        "Matching IDs ($tNodeId) but unmatched forms: TS form '$tNodeForm', BFM form '$wNodeForm'.")
                }
                return true
        }
436
        
437
        /**
         * Concatenates, in document order, every String found among the
         * children of the given node and of its descendant nodes.
         *
         * @param node node whose text content is wanted
         * @return the concatenated text, "" when there is none
         */
        def getDeepText(def node) {
                def buffer = new StringBuilder()
                for (part in node.children()) {
                        if (part instanceof String) {
                                buffer.append(part)
                        } else if (part instanceof Node) {
                                buffer.append(getDeepText(part))
                        }
                }
                return buffer.toString()
        }
448
}