Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / groovy / org / txm / scripts / importer / srcmf / PunctInjectImport.groovy @ 1000

History | View | Annotate | Download (7.3 kB)

1
// Script to restore punctuation etc. to results of TS query.
// Inputs:
// --- Tiger-XML file (parsed into a DOM document)
// --- Word table, given either as:
// --- --- a list of [xml:id, word form] entries, where
// --- --- --- index 0 gives the xml:id
// --- --- --- index 1 gives the word form
// --- --- or an XML-TXM file from which these entries are read
// Process:
// --- Injects punctuation.
// Output:
// --- The modified Tiger-XML document, written to the output file.
12

    
13
// TXM package statement
14
package org.txm.scripts.importer.srcmf
15

    
16
import javax.xml.parsers.DocumentBuilderFactory
17

    
18

    
19

    
20
/**
21
 * 
22
 * @author tmr
23
 *
24
 */
25
class PunctInjectImport
26
{
27

    
28
        static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
29
        static nsTei = new groovy.xml.Namespace('http://www.tei-c.org/ns/1.0', 'tei')
30
        def tigerXml = null
31
        def txmIdWordTableFixed = []
32
        def outputFile = null
33

    
34
        /**
         * Builds an injector from a Tiger-XML file and a ready-made word table.
         * Use this constructor when the word table does not have to be read
         * from an XML-TXM file.
         *
         * @param tigerXmlFile   Tiger-XML file; it is parsed into a DOM tree and
         *                       its document element is kept in tigerXml
         * @param txmIdWordTable list of [id, word form] entries; the list is
         *                       stored as given, not copied
         */
        PunctInjectImport(File tigerXmlFile, ArrayList txmIdWordTable) {
                this.txmIdWordTableFixed = txmIdWordTable
                // Default output location; callers may overwrite outputFile afterwards.
                this.outputFile = new File("output.xml")
                def domFactory = DocumentBuilderFactory.newInstance()
                def tigerDocument = domFactory.newDocumentBuilder().parse(tigerXmlFile)
                this.tigerXml = tigerDocument.documentElement
        }
43

    
44
        /**
         * Builds an injector whose word table is read from an XML-TXM file.
         * Every tei:w element found (depth-first) under the root's "text"
         * children contributes one [id, word form] entry.  The id is taken from
         * a plain "id" attribute or, failing that, from xml:id; the word form is
         * the text of the first txm:form child.
         *
         * A w element without an id or without a txm:form child is reported on
         * standard output and still added (with a null id or an empty form), so
         * that one malformed word does not abort the whole import.
         *
         * @param tigerXmlFile Tiger-XML file, parsed into a DOM tree
         * @param xmlTxmFile   XML-TXM file supplying the word ids and forms
         */
        PunctInjectImport(File tigerXmlFile, File xmlTxmFile) {
                // Tiger-XML parsing and the default output file are shared with the
                // other constructor; the (initially empty) word table is filled below.
                this(tigerXmlFile, new ArrayList())
                def xmlTxmGroovyDOM = new XmlParser().parse(xmlTxmFile)
                for (def w : xmlTxmGroovyDOM.text.'**'.findAll { it.name() == nsTei.'w' } ) {
                        // Plain "id" attribute first, namespaced xml:id as fallback.
                        def anId = w.'@id' ?: w.attribute(nsXml.id)
                        if (! anId) {
                                println "Error: ID attribute not located in XML TXM file"
                        }
                        // Indexing an empty result list yields null rather than throwing,
                        // so a missing txm:form has to be checked before reading its text.
                        def formNode = w."txm:form"[0]
                        if (formNode == null) {
                                println "Error: txm:form element not located in XML TXM file"
                        }
                        txmIdWordTableFixed << [anId, formNode == null ? "" : getDeepText(formNode)]
                }
                // assert txmIdWordTableFixed.size() > 0
        }
67

    
68
        /**
         * Runs the injection and writes the result.
         *
         * Phase 1 walks the word table and the t elements of the Tiger-XML tree
         * in parallel and creates a t element for every word of the table that
         * has no counterpart in the tree.  Phase 2 moves punctuation standing at
         * the start of a terminals element to the end of the preceding terminals
         * element.  Finally the tree is printed to outputFile using UTF-8.
         *
         * @throws IllegalStateException if the Tiger-XML tree contains no t element
         */
        def process () {
                injectMissingWords()
                moveLeadingPunctuation()
                outputFile.withWriter("UTF-8") { writer ->
                        writer.println(this.tigerXml)
                }
        }

        /**
         * Phase 1: adds a t element for each word of the word table that is
         * missing from the Tiger-XML tree.
         */
        private void injectMissingWords() {
                def words = this.txmIdWordTableFixed
                def allTs = toList(this.tigerXml.getElementsByTagName('t'))
                if (!allTs) {
                        // The attribute names and the id prefix are both read from the first t element.
                        throw new IllegalStateException('No <t> element found in the Tiger-XML document')
                }
                def tAttrs = getTAttrNames(allTs[0])
                def tNodeIdPrefix = allTs[0].getAttribute('id').tokenize('#').first()
                def addedWordIds = []
                // wIx and tIx are cursors into the word table and the t list.  A branch
                // that advances only tIx re-examines the same word against the next t.
                int wIx = 0
                int tIx = 0
                while (wIx < words.size()) {
                        def word = words[wIx]
                        if (tIx == allTs.size()) {
                                // End of TS file, but still words left in the BFM file.
                                addTNode(word, allTs.last(), tAttrs, tNodeIdPrefix, 'append')
                                wIx += 1
                                continue
                        }
                        def tNode = allTs[tIx]
                        def tId = getTNodeId(tNode)
                        def tWord = tNode.getAttribute('word')
                        if (tId == word[0]) {
                                if (tWord != word[1]) {
                                        println("Mismatched Ids! ($tId)")
                                }
                                tIx += 1
                                wIx += 1
                        }
                        else if (['#', '*'].contains(tWord)) {
                                // SRCMF duplicata; try comparing word against the next tNode next time
                                // around.
                                tIx += 1
                        }
                        // Check that the SRCMF corpus doesn't have a bug in it...
                        // take(tIx) is the list of t elements strictly before the current
                        // one; it is empty for tIx == 0 (a 0..tIx-1 range would instead
                        // select the whole list there and match the current node itself).
                        else if (!hasPunctuation(word[1]) &&
                                        allTs.take(tIx).any { it.getAttribute('id') == "${tNodeIdPrefix}#${tId}" }) {
                                println "Warning: word ${tWord}, id $tId appears twice in corpus!"
                                tIx += 1
                        }
                        // Check that there's not an extra word in the SRCMF corpus (rare, usu. a tokenisation change)
                        else if (!hasPunctuation(word[1]) &&
                                        allTs.drop(tIx).any { it.getAttribute('id') == "${tNodeIdPrefix}#${word[0]}" }) {
                                println "Warning: word ${tWord}, id $tId does not appear in BFM!"
                                tIx += 1
                        }
                        else if (addedWordIds.contains(tId)) {
                                println "Warning: word ${tWord}, id ${tId} out of sequence in SRCMF corpus!"
                                tIx += 1
                        }
                        else {
                                // Insert word.  In the first instance, it will have the same parent as
                                // the tNode before which it's being inserted.
                                addTNode(word, tNode, tAttrs, tNodeIdPrefix, 'before')
                                addedWordIds.add(word[0])
                                wIx += 1
                        }
                }
        }

        /**
         * Tells whether a word form contains one of , . ? ! : ; ( ) [ ] { }
         */
        private boolean hasPunctuation(form) {
                return (form =~ /[\,\.\?\!\:\;\(\)\[\]\{\}]/).find()
        }

        /**
         * Phase 2: moves punctuation into the previous sentence, dependent on
         * sequence.
         */
        private void moveLeadingPunctuation() {
                def allTerminalses = toList(this.tigerXml.getElementsByTagName('terminals'))
                for (int i = 1 ; i < allTerminalses.size() ; i++) {
                        // Leading t elements whose word has no letter or digit.  The class is
                        // A-Za-z, not A-z: the latter also matches [ \ ] ^ _ ` and would
                        // treat a leading bracket as a word.
                        def puncStack = toList(allTerminalses[i].getElementsByTagName('t')).takeWhile {
                                !(it.getAttribute('word') =~ /[A-Za-zÀ-ÿ0-9]/).find()
                        }
                        if (!puncStack) {
                                continue
                        }
                        // One character per stacked element, so that an index in puncString
                        // is also an index in puncStack.  An empty word contributes a
                        // space, which matches neither pattern below.
                        def puncString = puncStack.collect {
                                def form = it.getAttribute('word')
                                form ? form[0] : ' '
                        }.join('')
                        int moveLeft = 0
                        // First, identify LAST instance of sentence-final punctuation.
                        def matches = puncString =~ /[\.\,\;\:\!\?\)\]\}»”’]/
                        if (matches.size() > 0) {
                                moveLeft = puncString.lastIndexOf(matches[-1]) + 1
                        }
                        // Second, split pairs of straight quotes
                        matches = puncString =~ /(""|'')/ //"
                        if (matches.size() > 0) {
                                moveLeft = [moveLeft, puncString.lastIndexOf(matches[-1][0]) + 1].max()
                        }
                        // Now, move moveLeft punctuation nodes to the end of the prev. sentence
                        def previousTerminals = allTerminalses[i - 1]
                        puncStack.take(moveLeft).each { previousTerminals.appendChild(it) }
                }
        }
169

    
170
        /**
         * Creates a new t element for a word and attaches it next to an
         * existing t element.
         *
         * The new element receives every attribute name listed in tAttrs:
         * 'id' is set to "prefix#wordId", 'word' to the word form, and every
         * other attribute to '--'.
         *
         * @param word          entry of the word table: word[0] is the id, word[1] the form
         * @param tNode         existing t element used as the reference point
         * @param tAttrs        attribute names to set on the new element
         * @param tNodeIdPrefix text placed before '#' in the new id
         * @param where         'before' inserts in front of tNode; 'append' adds
         *                      the element as last child of tNode's parent
         * @throws IllegalArgumentException for any other value of where
         */
        private addTNode(word, tNode, tAttrs, tNodeIdPrefix, where) {
                def created = tNode.getOwnerDocument().createElement('t')
                tAttrs.each { attrName ->
                        def attrValue = (attrName == 'id') ? "${tNodeIdPrefix}#${word[0]}" :
                                        (attrName == 'word') ? word[1] : '--'
                        created.setAttribute(attrName, attrValue)
                }
                def parent = tNode.getParentNode()
                if (where == 'before') {
                        return parent.insertBefore(created, tNode)
                }
                if (where == 'append') {
                        return parent.appendChild(created)
                }
                throw new IllegalArgumentException('Bad before value')
        }
193
        /**
         * Lists the attribute names of a DOM element.
         *
         * @param tNode element whose attributes are inspected
         * @return list with the nodeName of each attribute, in the order in
         *         which the element's attribute map enumerates them
         */
        def getTAttrNames(tNode) {
                def attrMap = tNode.attributes
                return (0..<attrMap.getLength()).collect { position ->
                        attrMap.item(position).nodeName
                }
        }
201
        /**
         * Extracts the word id from a t element: the part of its 'id'
         * attribute that follows the last '#'.
         *
         * @param tNode element carrying the 'id' attribute
         * @return last '#'-separated token of the id
         */
        def getTNodeId(tNode) {
                def idTokens = tNode.getAttribute('id').tokenize('#')
                return idTokens.last()
        }
204
        /**
         * Copies the elements of an iterable object into a new collection by
         * filtering it with a predicate that accepts everything.  In this
         * class it is applied to DOM node lists to obtain a list snapshot.
         *
         * @param iterable object to copy
         * @return result of findAll with an always-true predicate
         */
        def toList(def iterable) {
                def acceptAll = { item -> true }
                return iterable.findAll(acceptAll)
        }
207
        /**
         * Collects the text contained in a parsed XML node.  String children
         * are concatenated in order, child nodes are processed recursively,
         * and children of any other type are ignored.
         *
         * @param node node whose children() are traversed
         * @return concatenated text, "" when there is none
         */
        def getDeepText(def node) {
                def collected = new StringBuilder()
                for (child in node.children()) {
                        if (child instanceof Node) {
                                collected.append(getDeepText(child))
                        } else if (child instanceof String) {
                                collected.append(child)
                        }
                }
                return collected.toString()
        }
218

    
219
        /**
         * Command-line entry point.
         *
         * Expects exactly two positional arguments, the Tiger-XML file and the
         * XML-TXM file.  Option -o/--output selects the output file and
         * -h/--help prints the usage message.  The process exits with status 2
         * when the command line cannot be parsed or the number of positional
         * arguments is wrong.
         *
         * @param args command-line arguments
         */
        public static void main(String[] args)
        {
                def cli = new CliBuilder(
                                usage:'PunctInjectImport.groovy TigerXml.xml XmlTxm.xml'
                                )
                cli.h(longOpt:'help', 'Prints this message.')
                cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
                def options = cli.parse(args)
                if (options == null) {
                        // parse() returns null for a bad command line, after reporting
                        // the problem and printing the usage itself.
                        System.exit(2)
                }
                if (options.h) {
                        cli.usage()
                        return
                }
                def positional = options.arguments()
                if (positional.size() != 2) {
                        println 'Incorrect number of command line arguments... exiting'
                        // usage() prints the message itself and returns nothing, so it
                        // must not be wrapped in println.
                        cli.usage()
                        System.exit(2)
                }
                def tigerXmlFile = new File(positional[0])
                def xmlTxmFile = new File(positional[1])
                def pii = new PunctInjectImport(tigerXmlFile, xmlTxmFile)
                if (options.o) {
                        pii.outputFile = new File(options.o)
                }
                pii.process()
        }
243
}