root / tmp / org.txm.tigersearch.rcp / groovy / org / txm / scripts / importer / srcmf / PunctInjectImport.groovy @ 1000
History | View | Annotate | Download (7.3 kB)
1 | 911 | mdecorde | // Script to restore punctuation etc. to results of TS query.
|
---|---|---|---|
2 | 911 | mdecorde | // Inputs:
|
3 | 911 | mdecorde | // --- Tiger-XML Document node
|
4 | 911 | mdecorde | // --- Java array:
|
5 | 911 | mdecorde | // --- --- String [index][type] where:
|
6 | 911 | mdecorde | // --- --- --- type == 0 gives the xml:id
|
7 | 911 | mdecorde | // --- --- --- type == 1 gives the word form
|
8 | 911 | mdecorde | // Process:
|
9 | 911 | mdecorde | // --- Injects punctuation.
|
10 | 911 | mdecorde | // Returns:
|
11 | 911 | mdecorde | // --- Tiger-XML Document node.
|
12 | 911 | mdecorde | |
13 | 911 | mdecorde | // TXM package statement
|
14 | 1000 | mdecorde | package org.txm.scripts.importer.srcmf
|
15 | 911 | mdecorde | |
16 | 911 | mdecorde | import javax.xml.parsers.DocumentBuilderFactory |
17 | 911 | mdecorde | |
18 | 911 | mdecorde | |
19 | 911 | mdecorde | |
/**
 * Restores tokens (typically punctuation) that are present in a reference
 * word list but missing from a Tiger-XML document.
 *
 * Inputs:
 * --- a Tiger-XML file, parsed into a DOM tree whose root element is kept in
 *     {@code tigerXml};
 * --- a word table whose entries are indexable pairs: {@code [0]} is the word
 *     id, {@code [1]} is the word form. The table is either supplied directly
 *     or built from the {@code tei:w} elements of an XML-TXM file.
 *
 * {@link #process()} inserts a new {@code <t>} element for every table entry
 * that has no counterpart in the Tiger-XML, then moves sentence-initial
 * closing punctuation to the end of the previous {@code <terminals>} element,
 * and finally writes the document to {@code outputFile}.
 *
 * @author tmr
 *
 */
class PunctInjectImport
{

	static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
	static nsTei = new groovy.xml.Namespace('http://www.tei-c.org/ns/1.0', 'tei')

	/** A word form containing one of these characters is never reported as a corpus inconsistency. */
	private static final java.util.regex.Pattern PUNCT_IN_FORM = ~/[\,\.\?\!\:\;\(\)\[\]\{\}]/

	/** Root element of the parsed Tiger-XML document. */
	def tigerXml = null
	/** Reference word table: each entry is [id, form]. Never modified by process(). */
	def txmIdWordTableFixed = []
	/** Destination file of process(); defaults to "output.xml" in the working directory. */
	def outputFile = null

	/*
	 * Call this constructor when no need of XML-TXM file for word ID table
	 */
	PunctInjectImport(File tigerXmlFile, ArrayList txmIdWordTable) {
		tigerXml = parseTigerXml(tigerXmlFile)
		txmIdWordTableFixed = txmIdWordTable
		outputFile = new File("output.xml")
	}

	/*
	 * Call this constructor when the XML-TXM file is needed for the word list.
	 * Every tei:w element contributes one [id, form] entry, in document order.
	 */
	PunctInjectImport(File tigerXmlFile, File xmlTxmFile) {
		tigerXml = parseTigerXml(tigerXmlFile)
		txmIdWordTableFixed = []
		def xmlTxmGroovyDOM = new XmlParser().parse(xmlTxmFile)
		for (def w : xmlTxmGroovyDOM.text.'**'.findAll { it.name() == nsTei.'w' } ) {
			// The id may be stored as a plain "id" attribute or as a namespaced xml:id.
			def anId = w.'@id'
			if (! anId) {
				anId = w.attribute(nsXml.id)
			}
			if (! anId) {
				// Reported but not fatal: the entry is still recorded, with a null id.
				println "Error: ID attribute not located in XML TXM file"
			}
			txmIdWordTableFixed << [anId, getDeepText(w."txm:form"[0])]
		}
		// assert txmIdWordTableFixed.size() > 0
		outputFile = new File("output.xml")
	}

	/**
	 * Parses the Tiger-XML file and returns its document element.
	 * NOTE(review): the parser is used with its default configuration (DTDs and
	 * external entities are not disabled) -- acceptable for trusted local files only.
	 */
	private static parseTigerXml(File tigerXmlFile) {
		def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
		return builder.parse(tigerXmlFile).documentElement
	}

	/**
	 * Runs both phases (word injection, then punctuation shifting) on the
	 * in-memory document and writes the result to {@code outputFile} in UTF-8.
	 *
	 * @throws IllegalStateException if the Tiger-XML contains no {@code <t>} element
	 */
	def process () {
		injectMissingWords()
		shiftLeadingPunctuation()
		outputFile.withWriter("UTF-8") { writer ->
			writer.println(this.tigerXml)
		}
	}

	/**
	 * First phase: walks the word table and the original <t> nodes in parallel
	 * and creates a <t> node for every table entry without a counterpart.
	 */
	private void injectMissingWords() {
		def words = this.txmIdWordTableFixed
		// Snapshot of the original terminals: nodes inserted below are not part of it.
		def allTs = toList(this.tigerXml.getElementsByTagName('t'))
		if (!allTs) {
			throw new IllegalStateException('No <t> element found in the Tiger-XML document')
		}
		def tAttrs = getTAttrNames(allTs[0])
		def tNodeIdPrefix = allTs[0].getAttribute('id').tokenize('#').first()
		def addedWordIds = new HashSet()
		int wIx = 0	// next table entry to place
		int tIx = 0	// next original <t> node to compare against
		while (wIx < words.size()) {
			def word = words[wIx]
			if (tIx == allTs.size()) {
				// End of TS file, but still words left in the BFM file.
				addTNode(word, allTs.last(), tAttrs, tNodeIdPrefix, 'append')
				wIx++
				continue
			}
			def tNode = allTs[tIx]
			def tId = getTNodeId(tNode)
			def tForm = tNode.getAttribute('word')
			boolean plainWord = !(word[1] =~ PUNCT_IN_FORM)
			if (tId == word[0]) {
				// Same id on both sides: consume both. A differing word form is
				// only reported (the message text is historical).
				if (tForm != word[1]) {
					println("Mismatched Ids! ($tId)")
				}
				tIx++
				wIx++
			}
			else if (['#', '*'].contains(tForm)) {
				// SRCMF duplicata; try comparing word against the next tNode next time
				// around.
				tIx++
			}
			// Check that the SRCMF corpus doesn't have a bug in it...
			// subList(0, tIx) is empty when tIx == 0; a 0..tIx-1 range would
			// instead select the whole list through the negative index.
			else if (plainWord && allTs.subList(0, tIx).any {
						it.getAttribute('id') == "$tNodeIdPrefix#$tId"
					}) {
				println "Warning: word ${tForm}, id $tId appears twice in corpus!"
				tIx++
			}
			// Check that there's not an extra word in the SRCMF corpus (rare, usu. a tokenisation change)
			else if (plainWord && allTs.subList(tIx, allTs.size()).any {
						it.getAttribute('id') == "$tNodeIdPrefix#${word[0]}"
					}) {
				println "Warning: word ${tForm}, id $tId does not appear in BFM!"
				tIx++
			}
			else if (addedWordIds.contains(tId)) {
				println "Warning: word ${tForm}, id ${tId} out of sequence in SRCMF corpus!"
				tIx++
			}
			else {
				// Insert word. In the first instance, it will have the same parent as
				// the tNode before which it's being inserted.
				addTNode(word, tNode, tAttrs, tNodeIdPrefix, 'before')
				addedWordIds.add(word[0])
				wIx++
			}
		}
	}

	/**
	 * Second phase: move punctuation into previous sentence, dependent on sequence.
	 * For every <terminals> element but the first, the run of leading tokens
	 * without any letter or digit is examined and its closing part is moved to
	 * the end of the preceding <terminals> element.
	 */
	private void shiftLeadingPunctuation() {
		def allTerminalses = toList(this.tigerXml.getElementsByTagName('terminals'))
		for (int i = 1 ; i < allTerminalses.size() ; i++) {
			def ts = toList(allTerminalses[i].getElementsByTagName('t'))
			def puncStack = []
			for (def t : ts) {
				// A-Za-z rather than A-z: the latter also covers [ \ ] ^ _ and `
				if (t.getAttribute('word') =~ /[A-Za-zÀ-ÿ0-9]/) {
					break
				}
				puncStack.add(t)
			}
			// Now, treat the punctuation stack at the beginning of the sentence
			if ( !puncStack ) {
				continue
			}
			int moveLeft = 0
			// One character per node so that string offsets are node offsets;
			// an empty word form is represented by a neutral blank.
			def puncString = puncStack.collect {
				def form = it.getAttribute('word')
				form ? form[0] : ' '
			}.join('')
			// First, identify LAST instance of sentence-final punctuation.
			def matches = puncString =~ /[\.\,\;\:\!\?\)\]\}»”’]/
			if (matches.size() > 0) {
				moveLeft = puncString.lastIndexOf(matches[-1]) + 1
			}
			// Second, split pairs of straight quotes
			matches = puncString =~ /(""|'')/ //"
			if (matches.size() > 0) {
				moveLeft = [moveLeft, puncString.lastIndexOf(matches[-1][0]) + 1].max()
			}
			// Now, move moveLeft punctuation nodes to the end of the prev. sentence
			// (appendChild detaches each node from its current parent).
			def previous = allTerminalses[i - 1]
			puncStack.take(moveLeft).each { previous.appendChild(it) }
		}
	}

	/**
	 * Creates a <t> element for the given word and attaches it next to tNode.
	 *
	 * @param word          [id, form] entry of the word table
	 * @param tNode         reference <t> node
	 * @param tAttrs        attribute names to create; all but 'id' and 'word' are set to '--'
	 * @param tNodeIdPrefix prefix put before '#' in the new node id
	 * @param where         'before' (insert before tNode) or 'append' (last child of tNode's parent)
	 * @throws IllegalArgumentException for any other value of where
	 */
	private addTNode(word, tNode, tAttrs, tNodeIdPrefix, where) {
		def newTNode = tNode.getOwnerDocument().createElement('t')
		for (def anAttr : tAttrs) {
			switch (anAttr) {
				case 'id':
					newTNode.setAttribute('id', "${tNodeIdPrefix}#${word[0]}")
					break
				case 'word':
					newTNode.setAttribute('word', word[1])
					break
				default:
					newTNode.setAttribute(anAttr, '--')
			}
		}
		def parent = tNode.getParentNode()
		if (where == 'before') {
			parent.insertBefore(newTNode, tNode)
		}
		else if (where == 'append') {
			parent.appendChild(newTNode)
		}
		else {
			throw new IllegalArgumentException('Bad before value')
		}
	}

	/** Returns the attribute names of the given DOM element, in the order of its attribute map. */
	def getTAttrNames(tNode) {
		def nodeMap = tNode.attributes
		def nameList = []
		for ( def i = 0 ; i < nodeMap.getLength() ; i++ ) {
			nameList.add( nodeMap.item(i).nodeName )
		}
		return nameList
	}

	/** Returns the part of the node's 'id' attribute that follows the last '#'. */
	def getTNodeId(tNode) {
		return tNode.getAttribute('id').tokenize('#').last()
	}

	/** Copies any iterable (e.g. a DOM NodeList) into a new list. */
	def toList(def iterable) {
		return iterable.findAll {true};
	}

	/**
	 * Concatenates, depth first, all the text found under a groovy.util.Node.
	 *
	 * @param node the node to read; null (e.g. a missing txm:form child) yields ""
	 * @return the concatenated text, never null
	 */
	def getDeepText(def node) {
		if (node == null) {
			return ""
		}
		def text = new StringBuilder()
		node.children().each { child ->
			if (child instanceof Node) {
				text << getDeepText(child)
			} else if (child instanceof String) {
				text << child
			}
		}
		return text.toString()
	}

	/**
	 * Command line entry point: PunctInjectImport.groovy TigerXml.xml XmlTxm.xml
	 * Options: -h/--help prints the usage, -o/--output selects the output file.
	 * Exits with status 2 on an invalid command line.
	 */
	public static void main(String[] args)
	{
		def cli = new CliBuilder(
			usage:'PunctInjectImport.groovy TigerXml.xml XmlTxm.xml'
		)
		cli.h(longOpt:'help', 'Prints this message.')
		cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
		def options = cli.parse(args)
		if (options == null) {
			// parse() returns null on a malformed command line.
			System.exit(2)
			return
		}
		if (options.h) {
			cli.usage()
			return
		}
		def positional = options.arguments()
		if (positional.size() != 2) {
			println 'Incorrect number of command line arguments... exiting'
			// usage() prints by itself and returns nothing: do not println its result.
			cli.usage()
			System.exit(2)
			return
		}
		def pii = new PunctInjectImport(new File(positional[0]), new File(positional[1]))
		if (options.o) {
			pii.outputFile = new File(options.o)
		}
		pii.process()
	}
}