Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / groovy / org / txm / importer / srcmf / PunctInjectImport.groovy @ 966

History | View | Annotate | Download (7.3 kB)

// Script to restore punctuation etc. to the results of a TigerSearch (TS) query.
// Inputs:
// --- Tiger-XML Document node
// --- Java array:
// --- --- String[index][type] where:
// --- --- --- type == 0 gives the xml:id
// --- --- --- type == 1 gives the word form
// Process:
// --- Injects punctuation.
// Returns:
// --- Tiger-XML Document node.
12 911 mdecorde
13 911 mdecorde
// TXM package statement
14 911 mdecorde
package org.txm.importer.srcmf
15 911 mdecorde
16 911 mdecorde
import javax.xml.parsers.DocumentBuilderFactory
17 911 mdecorde
18 911 mdecorde
19 911 mdecorde
20 911 mdecorde
/**
 * Re-injects tokens (typically punctuation) that are present in a reference
 * word table but absent from a Tiger-XML document, then moves
 * sentence-initial closing punctuation back to the end of the previous
 * sentence and writes the resulting document to {@link #outputFile}.
 *
 * The reference word table is an ordered list of [id, word form] pairs. It is
 * either supplied directly or read from the tei:w elements of an XML-TXM file.
 *
 * @author tmr
 *
 */
class PunctInjectImport
{

        /** Reserved XML namespace, used to read xml:id attributes. */
        static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
        /** TEI namespace, used to recognise tei:w elements. */
        static nsTei = new groovy.xml.Namespace('http://www.tei-c.org/ns/1.0', 'tei')

        // Word forms containing one of these marks are exempt from the two
        // "is the SRCMF corpus inconsistent?" sanity checks made while aligning.
        private static final FORM_PUNCT = ~/[\,\.\?\!\:\;\(\)\[\]\{\}]/
        // A token containing at least one of these characters counts as a word
        // rather than punctuation. Deliberately 'A-Za-z' and not 'A-z': the
        // latter also covers '[', '\', ']', '^', '_' and '`'.
        private static final WORD_CHAR = ~/[A-Za-zÀ-ÿ0-9]/

        /** Root element of the parsed Tiger-XML document; modified in place by process(). */
        def tigerXml = null
        /** Ordered reference table of [id, word form] pairs; never modified by process(). */
        def txmIdWordTableFixed = []
        /** File that process() writes the resulting document to (defaults to "output.xml"). */
        def outputFile = null

        /**
         * Call this constructor when no need of XML-TXM file for word ID table.
         *
         * @param tigerXmlFile Tiger-XML file to parse
         * @param txmIdWordTable ordered list of [id, word form] pairs
         */
        PunctInjectImport(File tigerXmlFile, ArrayList txmIdWordTable) {
                def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
                tigerXml = builder.parse(tigerXmlFile).documentElement
                txmIdWordTableFixed = txmIdWordTable
                outputFile = new File("output.xml")
        }

        /**
         * Call this constructor when the XML-TXM file is needed for the word list.
         * The word table is built from every tei:w element found below the
         * document's "text" element: the id comes from the 'id' attribute (or
         * xml:id as a fallback) and the form from the first txm:form child.
         *
         * @param tigerXmlFile Tiger-XML file to parse
         * @param xmlTxmFile XML-TXM file providing the reference word list
         */
        PunctInjectImport(File tigerXmlFile, File xmlTxmFile) {
                def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
                tigerXml = builder.parse(tigerXmlFile).documentElement
                txmIdWordTableFixed = []
                def xmlTxmGroovyDOM = new XmlParser().parse(xmlTxmFile)
                for (def w : xmlTxmGroovyDOM.text.'**'.findAll { it.name() == nsTei.'w' } ) {
                        def anId = w.'@id'
                        if (! anId) {
                                anId = w.attribute(nsXml.id)
                        }
                        if (! anId) {
                                // The entry is still recorded (with a null id) after reporting.
                                println "Error: ID attribute not located in XML TXM file"
                        }
                        txmIdWordTableFixed << [anId, getDeepText(w."txm:form"[0])]
                }
                // assert txmIdWordTableFixed.size() > 0
                outputFile = new File("output.xml")
        }

        /**
         * Runs the two processing phases on the Tiger-XML document and writes
         * the result, UTF-8 encoded, to {@link #outputFile}.
         *
         * @throws IllegalStateException if the Tiger-XML document has no t nodes
         */
        def process () {
                injectMissingTokens()
                moveLeadingPunctuation()
                outputFile.withWriter("UTF-8") { writer ->
                        writer.println(this.tigerXml)
                }
        }

        /**
         * First phase: walks the reference word table and the document's t
         * nodes in parallel and inserts a new t node for every reference word
         * that has no counterpart in the document.
         */
        private void injectMissingTokens() {
                def allTs = toList(this.tigerXml.getElementsByTagName('t'))
                if (!allTs) {
                        // Every insertion needs an existing t node as its anchor.
                        throw new IllegalStateException(
                                'No <t> nodes found in the Tiger-XML document: cannot inject tokens')
                }
                def tAttrs = getTAttrNames(allTs[0])
                def tNodeIdPrefix = allTs[0].getAttribute('id').tokenize('#').first()
                def words = this.txmIdWordTableFixed
                def addedWordIds = [] as Set
                // Two cursors: wordIx into the reference table, tIx into the t nodes.
                // Each iteration advances at least one of them. A branch that only
                // advances tIx retries the same word against the next t node.
                int wordIx = 0
                int tIx = 0
                while (wordIx < words.size()) {
                        def word = words[wordIx]
                        if (tIx == allTs.size()) {
                                // End of TS file, but still words left in the BFM file.
                                addTNode(word, allTs.last(), tAttrs, tNodeIdPrefix, 'append')
                                wordIx++
                                continue
                        }
                        def tNode = allTs[tIx]
                        def tId = getTNodeId(tNode)
                        def tWord = tNode.getAttribute('word')
                        def wordHasPunct = (word[1] =~ FORM_PUNCT).find()
                        if (tId == word[0]) {
                                if (tWord != word[1]) {
                                        // Same id but a different form: report, then carry on.
                                        println("Mismatched Ids! ($tId)")
                                }
                                tIx++
                                wordIx++
                        }
                        else if (['#', '*'].contains(tWord)) {
                                // SRCMF duplicata; try comparing word against the next tNode next time
                                // around.
                                tIx++
                        }
                        // Check that the SRCMF corpus doesn't have a bug in it...
                        // subList(0, tIx) is empty when tIx == 0, whereas the range
                        // 0..tIx-1 would become 0..-1, i.e. the whole list.
                        else if (!wordHasPunct
                        && allTs.subList(0, tIx).any { it.getAttribute('id') == "$tNodeIdPrefix#$tId" }) {
                                println "Warning: word ${tNode.getAttribute('word')}, id $tId appears twice in corpus!"
                                tIx++
                        }
                        // Check that there's not an extra word in the SRCMF corpus (rare, usu. a tokenisation change)
                        else if (!wordHasPunct
                        && allTs.subList(tIx, allTs.size()).any { it.getAttribute('id') == "$tNodeIdPrefix#${word[0]}" }) {
                                println "Warning: word ${tNode.getAttribute('word')}, id $tId does not appear in BFM!"
                                tIx++
                        }
                        else if (addedWordIds.contains(tId)) {
                                println "Warning: word ${tNode.getAttribute('word')}, id ${tId} out of sequence in SRCMF corpus!"
                                tIx++
                        }
                        else {
                                // Insert word.  In the first instance, it will have the same parent as
                                // the tNode before which it's being inserted.
                                addTNode(word, tNode, tAttrs, tNodeIdPrefix, 'before')
                                addedWordIds.add(word[0])
                                wordIx++
                        }
                }
        }

        /**
         * Second phase: for every terminals element but the first, moves
         * sentence-initial punctuation into the previous terminals element,
         * dependent on sequence.
         */
        private void moveLeadingPunctuation() {
                def allTerminalses = toList(this.tigerXml.getElementsByTagName('terminals'))
                for (int i = 1 ; i < allTerminalses.size() ; i++) {
                        // Collect the leading run of tokens containing no letter or digit.
                        def puncStack = []
                        for (def t : toList(allTerminalses[i].getElementsByTagName('t'))) {
                                if ((t.getAttribute('word') =~ WORD_CHAR).find()) {
                                        break
                                }
                                puncStack.add(t)
                        }
                        if (!puncStack) {
                                continue
                        }
                        // One character per token, so that an index in puncString is also
                        // an index in puncStack. An empty form contributes a space, which
                        // matches neither pattern below.
                        def puncString = puncStack.collect {
                                def form = it.getAttribute('word')
                                form ? form[0] : ' '
                        }.join('')
                        int moveLeft = 0
                        // First, identify LAST instance of sentence-final punctuation.
                        def matches = puncString =~ /[\.\,\;\:\!\?\)\]\}»”’]/
                        if (matches.size() > 0) {
                                moveLeft = puncString.lastIndexOf(matches[-1]) + 1
                        }
                        // Second, split pairs of straight quotes
                        matches = puncString =~ /(""|'')/ //"
                        if (matches.size() > 0) {
                                moveLeft = [moveLeft, puncString.lastIndexOf(matches[-1][0]) + 1].max()
                        }
                        // Now, move moveLeft punctuation nodes to the end of the prev. sentence
                        for (int j = 0 ; j < moveLeft ; j++ ) {
                                allTerminalses[i - 1].appendChild(puncStack[j])
                        }
                }
        }

        /**
         * Creates a new t element for a reference word and attaches it next to
         * an existing t node.
         *
         * @param word [id, word form] pair for the new node
         * @param tNode existing t node used as the anchor
         * @param tAttrs attribute names to set on the new node; 'id' and 'word'
         *        come from word, every other attribute is set to '--'
         * @param tNodeIdPrefix prefix put before '#' in the new node's id
         * @param where 'before' to insert before tNode, 'append' to add the node
         *        as the last child of tNode's parent
         * @throws IllegalArgumentException if where is neither 'before' nor 'append'
         */
        private addTNode(word, tNode, tAttrs, tNodeIdPrefix, where) {
                def newTNode = tNode.getOwnerDocument().createElement('t')
                for (def anAttr : tAttrs) {
                        if (anAttr == 'id') {
                                newTNode.setAttribute('id', "${tNodeIdPrefix}#${word[0]}")
                        }
                        else if (anAttr == 'word') {
                                newTNode.setAttribute('word', word[1])
                        }
                        else {
                                newTNode.setAttribute(anAttr, '--')
                        }
                }
                if (where == 'before') {
                        tNode.getParentNode().insertBefore(newTNode, tNode)
                }
                else if (where == 'append') {
                        tNode.getParentNode().appendChild(newTNode)
                }
                else {
                        throw new IllegalArgumentException('Bad before value')
                }
        }

        /**
         * Lists the attribute names carried by a DOM node.
         *
         * @param tNode DOM node to inspect
         * @return list of attribute names, in the order the attribute map yields them
         */
        def getTAttrNames(tNode) {
                def nodeMap = tNode.attributes
                def nameList = []
                for ( def i = 0 ; i < nodeMap.getLength() ; i++ ) {
                        nameList.add( nodeMap.item(i).nodeName )
                }
                return nameList
        }

        /**
         * Returns the part of a t node's 'id' attribute after the last '#'.
         *
         * @param tNode DOM element with an 'id' attribute
         * @return last '#'-separated token of the id
         */
        def getTNodeId(tNode) {
                return tNode.getAttribute('id').tokenize('#').last()
        }

        /**
         * Copies the items of an iterable (e.g. a DOM NodeList) into a new list,
         * giving a snapshot that is unaffected by later changes to the document.
         *
         * @param iterable anything Groovy can iterate over
         * @return new list holding the same items in the same order
         */
        def toList(def iterable) {
                return iterable.findAll {true};
        }

        /**
         * Concatenates, depth first, all text found below a groovy.util.Node.
         *
         * @param node parsed XmlParser node (must not be null)
         * @return the concatenated text, or "" when there is none
         */
        def getDeepText(def node) {
                def text = new StringBuilder()
                node.children().each { child ->
                        if (child instanceof Node) {
                                text.append(getDeepText(child))
                        } else if (child instanceof String) {
                                text.append(child)
                        }
                }
                return text.toString()
        }

        /**
         * Command line entry point:
         * PunctInjectImport.groovy [-o outputfile.xml] TigerXml.xml XmlTxm.xml
         * Exits with status 2 on a bad command line.
         */
        public static void main(String[] args)
        {
                def cli = new CliBuilder(
                                usage:'PunctInjectImport.groovy TigerXml.xml XmlTxm.xml'
                                )
                cli.h(longOpt:'help', 'Prints this message.')
                cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
                def options = cli.parse(args)
                if (!options) {
                        // parse() has already reported the problem and printed the usage.
                        System.exit(2)
                        return
                }
                if (options.h) {
                        cli.usage()
                        return
                }
                if (options.arguments().size() != 2) {
                        println 'Incorrect number of command line arguments... exiting'
                        // usage() prints by itself and returns nothing.
                        cli.usage()
                        System.exit(2)
                        return
                }
                def tigerXmlFile = new File(options.arguments()[0])
                def xmlTxmFile = new File(options.arguments()[1])
                def pii = new PunctInjectImport(tigerXmlFile, xmlTxmFile)
                if (options.o) {
                        pii.outputFile = new File(options.o)
                }
                pii.process()
        }
}