Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / src / org / txm / export / ts / PunctInject.groovy @ 713

History | View | Annotate | Download (7.3 kB)

1
// Script to restore punctuation etc. to results of TS query.
2
// Inputs:
3
// --- Tiger-XML Document node
4
// --- Java array:
5
// --- --- String [index][type] where:
6
// --- --- --- type == 0 gives the xml:id
7
// --- --- --- type == 1 gives the word form
8
// Process:
9
// --- Injects punctuation.
10
// Returns:
11
// --- Tiger-XML Document node.
12
package org.txm.export.ts;
13

    
14
import javax.xml.parsers.DocumentBuilderFactory
15
import org.txm.searchengine.cqp.CqpDataProxy;
16
import org.txm.searchengine.cqp.corpus.Corpus
17
import org.txm.searchengine.cqp.corpus.CorpusManager;
18
import org.txm.searchengine.cqp.corpus.QueryResult;
19
import org.txm.searchengine.cqp.corpus.query.Query
20

    
21
public class PunctInject
22
{
23
        // Restores tokens (typically punctuation) that are present in the word
        // table but absent from the Tiger-XML document, then moves leading
        // closing punctuation of each terminals element to the end of the
        // previous one.
        //
        // tigerXml       -- DOM node whose descendants include the t elements.
        // txmIdWordTable -- ordered list (or array) of [id, word form] pairs.
        //                   It is read by index and left untouched.
        // Returns the same tigerXml node, modified in place.
        public process (def tigerXml, def txmIdWordTable) {
                injectMissingWords(tigerXml, txmIdWordTable)
                movePunctuationToPreviousSentence(tigerXml)
                return tigerXml
        }

        // First phase: walks the word table and the t nodes in parallel and
        // creates a t node for every table entry that has no counterpart.
        private void injectMissingWords(def tigerXml, def txmIdWordTable) {
                def allTs = toList(tigerXml.getElementsByTagName('t'))
                if (!allTs) {
                        // Without any t node there is no template for attributes and no
                        // anchor for insertion.
                        println "Warning: no t nodes found in the Tiger-XML document; nothing injected."
                        return
                }
                def tAttrs = getTAttrNames(allTs[0])
                def tNodeIdPrefix = allTs[0].getAttribute('id').tokenize('#').first()
                def addedWordIds = new HashSet()
                def listedPunct = ~/[\,\.\?\!\:\;\(\)\[\]\{\}]/
                def looksLikeWord = { form -> !(form =~ listedPunct) }

                int wordCount = txmIdWordTable ? txmIdWordTable.size() : 0
                int wIx = 0     // next table entry to place
                int tIx = 0     // next t node to compare against
                while (wIx < wordCount) {
                        def word = txmIdWordTable[wIx]
                        if (tIx == allTs.size()) {
                                // End of TS file, but still words left in the BFM file.
                                addTNode(word, allTs.last(), tAttrs, tNodeIdPrefix, 'append')
                                wIx++
                                continue
                        }

                        def tNode = allTs[tIx]
                        def tId = getTNodeId(tNode)
                        def tForm = tNode.getAttribute('word')
                        def fullTId = "$tNodeIdPrefix#$tId".toString()
                        def fullWordId = "$tNodeIdPrefix#${word[0]}".toString()

                        if (tId == word[0] && tForm == word[1]) {
                                // Perfect match: consume both.
                                tIx++
                                wIx++
                        }
                        else if (tId == word[0]) {
                                // Same id but a different word form: report and consume both.
                                println("Mismatched Ids! ($tId)")
                                tIx++
                                wIx++
                        }
                        else if (['#', '*'].contains(tForm)) {
                                // SRCMF duplicata; compare the same word against the next tNode.
                                tIx++
                        }
                        // Check that the SRCMF corpus doesn't have a bug in it: the current
                        // node's id already occurred on a strictly earlier node.  subList
                        // is used because the range 0..(tIx - 1) turns into 0..-1, i.e. the
                        // whole list, when tIx is 0.
                        else if (looksLikeWord(word[1])
                        && allTs.subList(0, tIx).any { it.getAttribute('id') == fullTId }) {
                                println "Warning: word ${tForm}, id $tId appears twice in corpus!"
                                tIx++
                        }
                        // Check that there's not an extra word in the SRCMF corpus (rare,
                        // usu. a tokenisation change): the word's id shows up further on.
                        else if (looksLikeWord(word[1])
                        && allTs.subList(tIx, allTs.size()).any { it.getAttribute('id') == fullWordId }) {
                                println "Warning: word ${tForm}, id $tId does not appear in BFM!"
                                tIx++
                        }
                        else if (addedWordIds.contains(tId)) {
                                println "Warning: word ${tForm}, id ${tId} out of sequence in SRCMF corpus!"
                                tIx++
                        }
                        else {
                                // Insert word.  In the first instance, it will have the same
                                // parent as the tNode before which it's being inserted.
                                addTNode(word, tNode, tAttrs, tNodeIdPrefix, 'before')
                                addedWordIds.add(word[0])
                                wIx++
                        }
                }
        }

        // Second phase: moves punctuation into the previous sentence, dependent
        // on sequence.
        private void movePunctuationToPreviousSentence(def tigerXml) {
                def allTerminalses = toList(tigerXml.getElementsByTagName('terminals'))
                for (int i = 1 ; i < allTerminalses.size() ; i++) {
                        def ts = toList(allTerminalses[i].getElementsByTagName('t'))

                        // Length of the leading run of tokens containing no letter or
                        // digit.  The class spells out A-Z and a-z: the span A-z would
                        // also cover [ \ ] ^ _ and the backtick.
                        int puncCount = 0
                        while (puncCount < ts.size()
                        && !(ts[puncCount].getAttribute('word') =~ /[A-Za-zÀ-ÿ0-9]/)) {
                                puncCount++
                        }
                        if (puncCount == 0) {
                                continue
                        }

                        // One character per leading token, so string offsets equal token
                        // offsets.  A blank stands in for an empty word form.
                        def puncString = ts.subList(0, puncCount).collect {
                                def form = it.getAttribute('word')
                                form ? form[0] : ' '
                        }.join('')

                        int moveLeft = 0
                        // First, identify LAST instance of sentence-final punctuation.
                        def matches = puncString =~ /[\.\,\;\:\!\?\)\]\}»”’]/
                        if (matches.size() > 0) {
                                moveLeft = puncString.lastIndexOf(matches[-1]) + 1
                        }
                        // Second, split pairs of straight quotes: the first quote of the
                        // last pair still belongs to the previous sentence.
                        matches = puncString =~ /(""|'')/ //"
                        if (matches.size() > 0) {
                                moveLeft = [moveLeft, puncString.lastIndexOf(matches[-1][0]) + 1].max()
                        }

                        // Now, move moveLeft punctuation nodes to the end of the prev. sentence
                        for (int j = 0 ; j < moveLeft ; j++) {
                                allTerminalses[i - 1].appendChild(ts[j])
                        }
                }
        }
200

    
201
        // Creates a new t element in tNode's owner document and attaches it
        // next to tNode.
        //
        // word          -- pair: word[0] is the id suffix, word[1] the word form.
        // tNode         -- existing node used as anchor.
        // tAttrs        -- attribute names to set on the new element; 'id' gets
        //                  "<prefix>#<word[0]>", 'word' gets word[1], any other
        //                  name gets '--'.
        // tNodeIdPrefix -- prefix placed before '#' in the new id.
        // where         -- 'before' inserts ahead of tNode; 'append' adds the
        //                  element as last child of tNode's parent.
        // Throws IllegalArgumentException for any other value of where.
        private addTNode(word, tNode, tAttrs, tNodeIdPrefix, where) {
                def created = tNode.getOwnerDocument().createElement('t')
                for (def attrName : tAttrs) {
                        def attrValue = '--'
                        if (attrName == 'id') {
                                attrValue = "${tNodeIdPrefix}#${word[0]}"
                        }
                        else if (attrName == 'word') {
                                attrValue = word[1]
                        }
                        created.setAttribute(attrName, attrValue)
                }

                if (where == 'before') {
                        return tNode.getParentNode().insertBefore(created, tNode)
                }
                if (where == 'append') {
                        return tNode.getParentNode().appendChild(created)
                }
                throw new IllegalArgumentException('Bad before value')
        }
246

    
247
        // Returns the names of all attributes carried by tNode, in the order in
        // which its attribute map enumerates them.
        public getTAttrNames(tNode) {
                def attrs = tNode.attributes
                return (0..<attrs.getLength()).collect { ix -> attrs.item(ix).nodeName }
        }
260

    
261
        // Returns the part of tNode's 'id' attribute that follows the last '#'
        // (the whole id when it contains no '#').
        public getTNodeId(tNode) {
                def fullId = tNode.getAttribute('id')
                def segments = fullId.tokenize('#')
                return segments.last()
        }
266

    
267
        // Copies every element of the given iterable into a new collection by
        // filtering it with a predicate that accepts everything.
        public def toList(def iterable) {
                def acceptAll = { element -> true }
                return iterable.findAll(acceptAll)
        }
270

    
271
        // Builds the [id, word form] table for every position of the named
        // corpus whose "id" property value starts with "w".
        //
        // corpusname -- name handed to CorpusManager.getCorpus.
        // query      -- not used by this method.
        // Returns an ArrayList of two-element String arrays: [0] is the id,
        // [1] the word form, in corpus position order.
        public static def getWords(String corpusname, String query)
        {
                CorpusManager cm = CorpusManager.getCorpusManager();
                Corpus corpus = cm.getCorpus(corpusname);
                def word_property = corpus.getProperty("word")
                def id_property = corpus.getProperty("id")

                def wordCache = cm.getCorpusProxies(corpus).get(word_property);
                def idCache = cm.getCorpusProxies(corpus).get(id_property);

                // Ask for the size once rather than on every loop iteration.
                int size = corpus.getSize()
                def positions = new int[size];
                for (int i = 0 ; i < size ; i++) {
                        positions[i] = i;
                }
                def word_values = wordCache.getData(positions)
                def id_values = idCache.getData(positions)

                ArrayList<String[]> words = new ArrayList<String[]>(size);
                for (int p : positions)
                {
                        if (id_values[p].startsWith("w"))
                        {
                                // Fill the pair before adding it.  Indexing the result list
                                // by corpus position p is wrong as soon as one position has
                                // been skipped, because the list is then shorter than p.
                                String[] pair = new String[2]
                                pair[0] = id_values[p]
                                pair[1] = word_values[p]
                                words.add(pair)
                        }
                }

                return words;
        }
299

    
300
        // Manual test driver: parses the Tiger-XML file named by args[0], runs
        // process() on its document element with a fixed sample word table and
        // prints the class of the parsed element and the returned node.
        public static void main(String[] args)
        {
                if (!args) {
                        // Without a file name there is nothing to parse.
                        System.err.println("Usage: PunctInject <tiger-xml-file>")
                        return
                }

                def words = [
                        ["w203_1", "Dominedeu"],
                        ["w203_2", "devemps"],
                        ["w203_3", "lauder"],
                        ["w203_4", "et"],
                        ["w203_5", "a"],
                        ["w203_6", "sus"],
                        ["w203_7", "sancz"],
                        ["w203_8", "honor"],
                        ["w203_9", "porter"],
                        ["w203_10", "»"],
                        ["w203_10.2", ")"],
                        ["w203_10.3", '.'],
                        ["w203_10.5", '"'],
                        ["w203_10.7", '"'],
                        ["w203_11", "in"],
                        ["w203_12", "su'"],
                        ["w203_13", "amor"],
                        ["w203_14", "cantomps"],
                        ["w203_15", "del"],
                        ["w203_16", "·sanz"],
                        ["w203_17", "quae"],
                        ["w203_18", "por"],
                        ["w203_19", "lui"],
                        ["w203_20", "augrent"],
                        ["w203_21", "granz"],
                        ["w203_22", "aanz"],
                        ["w203_23", "."],
                ];
                File tigerXml = new File(args[0]);
                def factory = DocumentBuilderFactory.newInstance()
                // NOTE(review): XInclude processing normally also needs a
                // namespace-aware factory; confirm whether includes are expected here.
                factory.setXIncludeAware(true)
                def builder = factory.newDocumentBuilder()
                def THEDOM = builder.parse(tigerXml).documentElement

                println THEDOM.getClass()

                def NEWDOM = new PunctInject().process(THEDOM, words);

                println NEWDOM

                // Writing the result to a file is currently disabled:
                // File outfile = new File("outfile.xml")
                // println outfile
                //outfile.withWriter("iso-8859-1"){writer ->
                //writer.write(NEWDOM.toString())
                // }
        }
349
}