Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / groovy / org / txm / importer / srcmf / SubcorpusDataInject.groovy @ 966

History | View | Annotate | Download (13 kB)

1 911 mdecorde
#! /usr/bin/groovy
2 911 mdecorde
package org.txm.importer.srcmf;
3 911 mdecorde
/*
4 911 mdecorde
 * To change this template, choose Tools | Templates
5 911 mdecorde
 * and open the template in the editor.
6 911 mdecorde
 */
7 911 mdecorde
8 911 mdecorde
import java.util.logging.Logger
9 911 mdecorde
import java.util.logging.FileHandler
10 911 mdecorde
import java.util.logging.SimpleFormatter
11 911 mdecorde
import java.util.logging.Level
12 911 mdecorde
13 911 mdecorde
14 911 mdecorde
class SubcorpusDataInject {
15 911 mdecorde
        // Globals
16 911 mdecorde
        def textForm = 'prose'
17 911 mdecorde
        def TEI = null
18 911 mdecorde
        def wNodes = []
19 911 mdecorde
        def wNodesInQ = []
20 911 mdecorde
        // Set up namespaces
21 911 mdecorde
        static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
22 911 mdecorde
        static nsTxm = new groovy.xml.Namespace('http://textometrie.org/1.0', 'txm')
23 911 mdecorde
        // Set up a logger
24 911 mdecorde
        static logger = Logger.getLogger('subcorpusDataInjectLog')
25 911 mdecorde
        // END GLOBALS
26 911 mdecorde
27 911 mdecorde
        /**
         * Parses an XML-TXM file and prepares the word-node lists used by the
         * process* methods.
         *
         * Steps: attach the handler to the class logger, parse the file, read the
         * text form from the TEI header when one is declared, convert <lb/>
         * elements into an 'lb' attribute on neighbouring words (only for the
         * 'vers' and 'mixte' text forms), then collect every <w> node and every
         * <w> node located under a <q> element.
         *
         * @param xmlTxmFile XML-TXM file to parse
         * @param fh handler added to the class logger; it is given a SimpleFormatter
         * @param newTextForm default text form; when null or empty the field's
         *        initial value is kept. A text form found in the TEI header
         *        overrides it.
         */
        SubcorpusDataInject (
        File xmlTxmFile,
        FileHandler fh,
        String newTextForm
        ) {
                // A caller-supplied text form replaces the field's default.
                if (newTextForm) {
                        textForm = newTextForm
                }
                // Set up logger
                def formatter = new SimpleFormatter()
                logger.addHandler(fh)
                logger.setLevel(Level.ALL)
                fh.setFormatter(formatter)
                // Parse the XML-TXM file
                logger.log(Level.INFO, "Parsing XML-TXM file (may take a while).")
                TEI = new XmlParser().parse(xmlTxmFile)
                // Get text form from TEI header (if expressed)
                logger.log(Level.INFO, "Reading TEI header to detect text form.")
                def catRefs = TEI.teiHeader.profileDesc.textClass.catRef
                if (catRefs) {
                        // The target's first character is dropped to obtain the category id
                        // (presumably a leading '#' -- TODO confirm).
                        def target = catRefs.'@target'[0]
                        def catId = null
                        if (target && target.size() > 1) {
                                catId = target[1..-1]
                        }
                        def category = null
                        if (catId != null) {
                                category = TEI.teiHeader.encodingDesc.'**'.category.find {
                                        it.'@id' == catId || it.attribute(nsXml.id) == catId
                                }
                        }
                        def descriptions = []
                        if (category != null) {
                                descriptions = category.catDesc
                        }
                        if (descriptions) {
                                textForm = descriptions[0].text()
                                logger.log(Level.INFO, "Found text form '$textForm' in TEI header.")
                        } else {
                                // A dangling or incomplete reference used to abort with a
                                // NullPointerException; keep the default instead.
                                logger.log(Level.WARNING, "Text form category referenced in TEI header not found, assuming '$textForm'.")
                        }
                } else {
                        logger.log(Level.INFO, "No text form in TEI header, assuming '$textForm'.")
                }
                // Convert Lbs to Property
                if (textForm == 'vers' || textForm == 'mixte') {
                        logger.log(Level.INFO, "Converting <lb/> tags to a word property for later processing.")
                        // Work list walked in document order: the head is removed and, when it
                        // is neither <lb> nor <w>, its element children are pushed back at the front.
                        def undone = TEI.text.body
                        def lastLb = false
                        def lastWord = null
                        while (undone) {
                                def aNode = undone.remove(0)
                                if (aNode.name().getLocalPart() == 'lb') {
                                        lastLb = true
                                        // The last non-punctuation word seen before the break ends the line.
                                        // This overwrites a 'start' mark set earlier on the same word.
                                        if (lastWord) {
                                                lastWord.'@lb' = 'end'
                                        }
                                } else if (aNode.name().getLocalPart() == 'w') {
                                        // Punctuation words are ignored: they neither receive a mark
                                        // nor reset the pending line break.
                                        if (!punctTest(aNode)) {
                                                if (lastLb) {
                                                        aNode.'@lb' = 'start'
                                                }
                                                lastWord = aNode
                                                lastLb = false
                                        }
                                } else {
                                        undone.addAll(0, aNode.children().findAll { it.getClass() == Node })
                                }
                        }
                }
                // Build node lists
                logger.log(Level.INFO, "Building node lists.")
                wNodes = TEI.text.'**'.findAll { it.name().getLocalPart() == 'w' }
                wNodesInQ = TEI.text.'**'.q.'**'.findAll { it.name().getLocalPart() == 'w' }
        }
86 911 mdecorde
87 911 mdecorde
        /**
         * Command-line entry point.
         *
         * Expects two positional arguments: the XML-TXM file and the TS file.
         * Options: -h/--help, -m/--master (the TS file is a master file),
         * -v/--verse (default text form 'vers'), -l/--logfile, -o/--output.
         * Defaults: log file 'subcorpusDataInjectLog.log', output 'test-out.xml'.
         *
         * @param args raw command-line arguments
         */
        static main(def args) {
                // Parse command line args
                def outputFile = 'test-out.xml'
                def logName = 'subcorpusDataInjectLog.log'
                def cli = new CliBuilder(
                                usage:'SubcorpusDataInject.groovy [options] TXM_file.xml TS_file.xml'
                                )
                cli.h(longOpt:'help', 'Prints this message')
                cli.m(longOpt:'master', 'Specifies input file is a TS master file.')
                cli.v(longOpt:'verse', 'Sets default text form to verse. Overridden by TEI header.')
                cli.l(longOpt:'logfile', args:1, argName:'logfile.log', 'Use given file for log.')
                cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file')
                println args
                def options = cli.parse(args)
                // parse() yields null for an invalid command line, after reporting the
                // problem itself; there is nothing left to do in that case.
                if (!options) {
                        return
                }
                if (options.h) {
                        cli.usage()
                        return
                }
                def positional = options.arguments()
                if (positional.size() != 2) {
                        println "Wrong number of arguments."
                        // usage() prints by itself and returns nothing, so it must not be
                        // wrapped in println (that printed a stray "null").
                        cli.usage()
                        return
                }
                // Call XML TXM loader
                def logHandler = new FileHandler(options.logfile ?: logName)
                def defaultForm = options.v ? 'vers' : ''
                def sdi = new SubcorpusDataInject(new File(positional[0]), logHandler, defaultForm)
                // Call process master or process subcorpus
                def tsFile = new File(positional[1]).getCanonicalFile()
                def target = new File(options.output ?: outputFile)
                if (options.m) {
                        sdi.processMaster(tsFile, target)
                } else {
                        sdi.processSubcorpus(tsFile, target)
                }
        }
132 911 mdecorde
133 911 mdecorde
        /**
         * Runs processSubcorpus on each subcorpus listed in a master file, then
         * saves a copy of the master file that points at the generated files.
         *
         * For every body/subcorpus element the 'external' attribute is read: the
         * first five characters are dropped to obtain the input file name
         * (presumably a 'file:' prefix -- TODO confirm), and the last four
         * characters are replaced by '_pos.xml' to name the output (presumably a
         * '.xml' extension -- TODO confirm). Inputs are resolved next to the
         * master file, outputs next to outputFile.
         *
         * @param masterFile TS master file to read
         * @param outputFile destination of the rewritten master file
         */
        def processMaster (File masterFile, File outputFile) {
                logger.log(Level.INFO, "Loading corpus master file")
                // Null when the path has no directory part, i.e. the file is relative
                // to the working directory.
                def masterFolder = masterFile.getParent()
                def outputFolder = outputFile.getParent()
                // new File('', name) resolves against the system default directory
                // ('/' on Unix), not the working directory, so a missing folder must
                // produce a plain relative File instead.
                def inFolder = { folder, name ->
                        folder ? new File(folder, name) : new File(name)
                }
                def master = new XmlParser().parse(masterFile)
                // Process each subcorpus file
                master.body.subcorpus.each { subcorpus ->
                        def external = subcorpus.'@external'
                        processSubcorpus(
                                        inFolder(masterFolder, external[5..-1]),
                                        inFolder(outputFolder, "${external[5..-5]}_pos.xml".toString())
                                        )
                        subcorpus.'@external' = "${external[0..-5]}_pos.xml"
                }
                logger.log(Level.INFO, "Saving new master file ${outputFile.getName()}")

                outputFile.withWriter { writer ->
                        writer << groovy.xml.XmlUtil.serialize(master)
                }
                logger.log(Level.INFO, "Run reheader on this file!")
        }
163 911 mdecorde
164 911 mdecorde
        /**
         * Adds 'pos', 'form' and 'q' attributes to every terminal (t) node of a TS
         * subcorpus file, recalculates the 'headpos' attribute of every
         * nonterminal, and writes the result to outputFile.
         *
         * Terminals are paired with the <w> nodes loaded by the constructor by
         * walking both lists in parallel (see idmatch). Special cases:
         *  - word '#': not looked up, all three attributes become '--';
         *  - word '*' (duplicata): resolved through the 'dupl' secondary edge of
         *    the nonterminal that references it;
         *  - punctuation <w> nodes are skipped;
         *  - a terminal with no <w> counterpart gets '--' and a warning;
         *  - a terminal found elsewhere in the <w> list is reported as a
         *    discontinuity and the walk resumes after the matched <w>.
         *
         * @param subcorpusFile TS subcorpus file to read
         * @param outputFile file receiving the serialized result
         */
        def processSubcorpus (File subcorpusFile, File outputFile) {
                // Parse Subcorpus file
                logger.log(Level.INFO, "Loading subcorpus file ${subcorpusFile.getName()}")
                def TS = new XmlParser().parse(subcorpusFile)
                // Consumed from the front by the main loop below.
                def tNodes = TS.s.graph.terminals.t
                // Second list over the same nodes, never consumed; used for id lookups.
                def tNodesFixed = TS.s.graph.terminals.t

                // Build duplicata correspondance list: pairs of [duplicata id, real id]
                logger.log(Level.INFO, "Building duplicata correspondance list")
                def duplIdMatches = tNodes.findAll {it.'@word' == '*'}.collect { dupl ->
                        def duplId = dupl.'@id'
                        def realId = dupl.parent().parent().nonterminals.nt.edge.find {
                                it.'@idref' == duplId
                        }.parent().secedge.find {
                                it.'@label' == 'dupl'
                        }.'@idref'
                        [duplId, realId]
                }
                logger.log(Level.INFO, "${duplIdMatches.size()} duplicatas found.")

                // Add pos & form tags
                logger.log(Level.INFO, "Adding PoS and form tags to each t-node")
                // Get first wNode corresponding to a tNode.
                // -1 means "not found yet"; index 0 is a legitimate position, so the
                // index must be compared explicitly rather than tested for truthiness.
                def i = 0
                def wIx = -1
                while (i < tNodes.size()) {
                        // Avoid trying to find "fake" first words, it's very inefficient
                        if (tNodes[i].'@word' != '#') {
                                wIx = wNodes.findIndexOf { idmatch(tNodes[i], it) }
                        }
                        if (wIx >= 0) {
                                break
                        }
                        i++
                }
                assert wIx >= 0, "$i"
                /*
                 * MAIN ITERATOR BEGINS HERE FOR POS INJECTION
                 */
                // Iterate over tNodes.
                while (tNodes) {
                        def thisTNode = tNodes.remove(0)
                        def thisTNodeId = thisTNode.'@editionId'
                        def wNode = null
                        def q = false
                        // PoS / q / form injecting closure. It reads wNode and q at call
                        // time, so both must be assigned before it is invoked.
                        def injectPos = {
                                // Q tag
                                thisTNode.'@q' = q ? 'y' : 'n'
                                // Form tag
                                if (textForm == 'prose') {
                                        thisTNode.'@form' = 'prose'
                                } else if (
                                textForm == 'vers' ||
                                (
                                textForm == 'mixte'
                                && wNode.parent().parent().name().getLocalPart() == 'ab'
                                )
                                ) {
                                        // No attribute is written when @lb holds any other value.
                                        if (! wNode.'@lb') {
                                                thisTNode.'@form' = 'vers'
                                        } else if (wNode.'@lb' == 'start') {
                                                thisTNode.'@form' = 'vers_debut'
                                        } else if (wNode.'@lb' == 'end')  {
                                                thisTNode.'@form' = 'vers_fin'
                                        }
                                } else {
                                        thisTNode.'@form' = 'prose'
                                }
                                // Find PoS tag in BFM file
                                // Try txm:ana tags, then interp tags
                                def txmAnaList = wNode[nsTxm.ana]
                                if (! txmAnaList) {
                                        txmAnaList = wNode.interp
                                }
                                // Lookup by comparison only: the analysis nodes must not be modified.
                                def anaOfType = { wanted -> txmAnaList.find { it.'@type' == wanted } }
                                def posSyn = anaOfType('#pos_syn')
                                def pos = anaOfType('#pos')
                                def fropos = anaOfType('#fropos')
                                if (posSyn != null) {
                                        // type="#pos_syn"
                                        thisTNode.'@pos' = posSyn.text()
                                } else if (! txmAnaList && wNode.'@type') {
                                        // type attr of wNode
                                        thisTNode.'@pos' = wNode.'@type'
                                } else if (pos != null) {
                                        // type="#pos"
                                        thisTNode.'@pos' = pos.text()
                                } else if (fropos != null) {
                                        // type="#fropos"
                                        thisTNode.'@pos' = fropos.text()
                                } else {
                                        thisTNode.'@pos' = '--'
                                }
                        }
                        // End of injecting closure.
                        // <w> node at the current position, or null once the list is exhausted.
                        def currentW = (wIx < wNodes.size()) ? wNodes[wIx] : null
                        // Check whether wNode matches tNode
                        if (thisTNode.'@word' == '#') {
                                // Not expecting to find these in BFM file, all OK.
                                thisTNode.'@pos' = '--'
                                thisTNode.'@form' = '--'
                                thisTNode.'@q' = '--'
                        } else if (currentW != null && idmatch(thisTNode, currentW)) {
                                // Perfect match
                                wNode = currentW
                                q = wNodesInQ.contains(wNode)
                                injectPos.call()
                                wIx++
                        } else if (thisTNode.'@word' == '*') {
                                // Duplicata, use dupl match list to find correct wNode
                                def duplIdMatch = duplIdMatches.find { it[0] == thisTNode.'@id' }
                                if (duplIdMatch) {
                                        def matchTNode = tNodesFixed.find { it.'@id' == duplIdMatch[1] }
                                        assert matchTNode, "$duplIdMatch"
                                        // To enhance performance, first assume that the wNode matching
                                        // the duplicata is within +- 10 words of the current position.
                                        // The wNodes list is very long, and searches over the entire
                                        // list are avoided.
                                        if (wIx + 10 < wNodes.size() && wIx - 10 > 0) {
                                                wNode = wNodes[wIx-10 .. wIx+10].find {
                                                        idmatch(matchTNode, it)
                                                }
                                        }
                                        // Use whole list if it failed.
                                        if (! wNode) {
                                                wNode = wNodes.find {
                                                        idmatch(matchTNode, it)
                                                }
                                        }
                                }
                                if (wNode != null) {
                                        q = wNodesInQ.contains(wNode)
                                        injectPos.call()
                                        // Don't update wIx value!
                                } else {
                                        // Also reached when the referenced word has no <w> counterpart.
                                        thisTNode.'@pos' = '--'
                                        thisTNode.'@form' = '--'
                                        thisTNode.'@q' = '--'
                                        logger.log(
                                                        Level.WARNING,
                                                        "<w/> node matching TS duplicata $thisTNodeId not found."
                                                        )
                                }
                        } else if (currentW != null && punctTest(currentW)) {
                                // not expecting to find these in TS file; try same tNode again.
                                tNodes.add(0, thisTNode)
                                wIx++
                        } else {
                                // Single search over the whole list, reused by both outcomes.
                                def foundIx = wNodes.findIndexOf { idmatch(thisTNode, it) }
                                if (foundIx < 0) {
                                        // Check the SRCMF corpus hasn't acquired an extra word
                                        logger.log(
                                                        Level.WARNING,
                                                        "TS word $thisTNodeId (${thisTNode.'@word'}) is " +
                                                        "absent from BFM file."
                                                        )
                                        thisTNode.'@pos' = '--'
                                        thisTNode.'@form' = '--'
                                        thisTNode.'@q' = '--'
                                } else {
                                        // It doesn't match, it's not a duplicata, it's not BFM
                                        // punctuation, but it IS found somewhere in the BFM corpus.
                                        logger.log(
                                                        Level.WARNING,
                                                        "Discontinuity in TS corpus before word $thisTNodeId " +
                                                        "(${thisTNode.'@word'})."
                                                        )
                                        wNode = wNodes[foundIx]
                                        q = wNodesInQ.contains(wNode)
                                        injectPos.call()
                                        wIx = foundIx + 1
                                }
                        }
                }
                /*
                 * End Main iteration
                 */
                /*******************
                 * Recalculate headpos
                 */
                logger.log(Level.INFO, "Recalculating headpos.")
                TS.s.graph.nonterminals.nt.each { nt ->
                        def terminals = nt.parent().parent().terminals.t
                        // 'pos' of the terminal an edge points to; null if there is none.
                        def posOf = { edge ->
                                terminals.find { it.'@id' == edge.'@idref' }?.'@pos'
                        }
                        def a = nt.edge.findAll { it.'@label' == 'L' }
                        if (a.size() == 0) {
                                nt.'@headpos' = '--'
                        } else if (a.size() == 1) {
                                nt.'@headpos' = posOf(a[0])
                        } else {
                                // Several 'L' edges: take the first tag, in this priority order,
                                // carried by one of the referenced terminals, and flag it with '?'.
                                // The tags have to be collected first; testing the edge list
                                // itself can never match a tag.
                                def candidates = a.collect(posOf)
                                def headpos = ['NOM', 'PRO', 'ADJ', 'VER', 'ADV', 'CON', 'PRE', 'DET'].find {
                                        candidates.contains(it)
                                }
                                if (headpos) {
                                        nt.'@headpos' = headpos + '?'
                                }
                        }
                        assert nt.'@headpos'
                }
                logger.log(Level.INFO, "Saving new subcorpus file ${outputFile.getName()}")

                outputFile.withWriter { writer ->
                        writer << groovy.xml.XmlUtil.serialize(TS)
                }
        }
384 911 mdecorde
        /**
         * Tests whether a Groovy node is punctuation.
         *
         * The first available source decides: the node's 'type' attribute, then
         * its txm:ana children, then its interp children. A value counts as
         * punctuation when it starts with 'pon', ignoring case.
         *
         * @param gNode node to inspect
         * @return true when the node is recognised as punctuation, false otherwise
         */
        private punctTest(gNode) {
                def startsWithPon = { label -> label.toLowerCase().startsWith('pon') }

                def typeAttr = gNode.'@type'
                if (typeAttr) {
                        return startsWithPon(typeAttr)
                }

                def anaNodes = gNode[nsTxm.ana]
                if (anaNodes) {
                        return anaNodes.any { startsWithPon(it.text()) }
                }

                def interpNodes = gNode.interp
                if (interpNodes) {
                        return interpNodes.any { startsWithPon(it.text()) }
                }

                return false
        }
396 911 mdecorde
397 911 mdecorde
        /**
         * Matches a TS node with a <w/> node.
         *
         * The TS node's 'editionId' and the <w> node's 'id' (plain attribute, or
         * xml:id when the plain one is absent) are compared on the integer found
         * at the end of each id, optionally followed by lowercase letters or
         * underscores. The word forms are compared as well, but a difference is
         * only logged: equal numbers are enough for a match.
         *
         * @param tNode TS terminal node
         * @param wNode <w/> node of the XML-TXM file
         * @return true when both ids carry the same number; false otherwise,
         *         including when a number cannot be found in one of the ids
         */
        private idmatch (tNode, wNode) {
                def wNodeId = wNode.'@id'
                if (! wNodeId) {
                        wNodeId = wNode.attribute(nsXml.id)
                }
                def tNodeId = tNode.'@editionId'
                def regex = /(\d+)([_a-z]+)?$/
                def m1 = (tNodeId =~ regex)
                def m2 = (wNodeId =~ regex)
                if (! (m1.find() && m2.find())) {
                        logger.log(
                                        Level.WARNING,
                                        "Idmatch routine can't find integer part of one or both of the " +
                                        "paired Ids for nodes $tNode, $wNode"
                                        )
                        return false
                }
                if (m1.group(1).toInteger() != m2.group(1).toInteger()) {
                        return false
                }

                // The form is read from the txm:form child. Old-style XML-BFM would
                // need wNode.text() here instead.
                // A <w> without txm:form yields an empty form rather than an error.
                def formNode = wNode[nsTxm.form][0]
                def wNodeForm = (formNode != null) ? getDeepText(formNode) : ""
                def tNodeForm = tNode.'@word'
                if (tNodeForm != wNodeForm) {
                        // Reported, but not treated as a mismatch.
                        logger.log(Level.WARNING,
                                        "Matching IDs ($tNodeId) but unmatched forms: TS form '$tNodeForm', BFM form '$wNodeForm'.")
                }
                return true
        }
436 911 mdecorde
437 911 mdecorde
        /**
         * Concatenates the text contained in a node, descending into child
         * elements, in the order the children are stored.
         *
         * @param node node to read; may be null
         * @return the collected text, or an empty string when node is null or
         *         holds no text
         */
        def getDeepText(def node) {
                // Callers may pass the first element of an empty node list, i.e. null.
                if (node == null) {
                        return ""
                }
                def text = new StringBuilder()
                node.children().each { child ->
                        if (child instanceof Node) {
                                text << getDeepText(child)
                        } else if (child instanceof String) {
                                text << child
                        }
                }
                return text.toString()
        }
448 911 mdecorde
}