Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / src / org / txm / export / ts / ConcordanceBlocks.groovy @ 713

History | View | Annotate | Download (12.5 kB)

1
package org.txm.export.ts;
2

    
3
import javax.xml.parsers.DocumentBuilderFactory
4

    
5
class ConcordanceBlocks {
6
        def codec = 'UTF-8'
7

    
8
        int cx = 30;
9
        def ntTypes = [];
10
        def tTypes = [];
11
        File xmlfile, outfile;
12

    
13
        /**
         * Parses the XML result file, merges matches that share a pivot and writes
         * one tab-separated row per remaining 'match' element to outfile.
         *
         * Columns, in order: sentence id, left context outside / inside the
         * sentence, up to nBlock blocks before the pivot, the pivot, up to nBlock
         * blocks after the pivot, right context inside / outside the sentence and
         * a 'Warnings' column that this method never fills. Every block and pivot
         * column is followed by one type column per entry of the longer of
         * ntTypes / tTypes. Cells without a value are written as "--".
         *
         * @param xmlfile XML file to read (parsed with XInclude awareness switched on)
         * @param outfile file receiving the table, encoded with the 'codec' field
         * @param cx number of context words wanted on each side, counting the
         *           in-sentence context plus the out-of-sentence context
         * @param ntTypes attribute names looked up on nodes carrying the first ntTypes attribute
         * @param tTypes attribute names looked up on nodes carrying the first tTypes attribute
         * @return always true; failures surface as exceptions
         */
        public boolean process(File xmlfile, File outfile, int cx, def ntTypes, def tTypes) {
                this.cx = cx;
                this.xmlfile = xmlfile;
                this.outfile = outfile;
                this.ntTypes = ntTypes;
                this.tTypes = tTypes;

                Writer writer = outfile.newPrintWriter(codec)
                try {
                        println 'Reading XML File'
                        def factory = DocumentBuilderFactory.newInstance()
                        factory.setXIncludeAware(true)
                        def builder = factory.newDocumentBuilder()
                        def records = builder.parse(xmlfile).documentElement
                        println 'done.'

                        println 'done.  Pre-treating file to combine results with the same pivot...'
                        records = matchCombine(records)
                        println 'done.'

                        println 'Calculating max. number of blocks...'
                        def nBlock = 0
                        def allMatches = toList(records.getElementsByTagName('match'))
                        for (def match : allMatches) {
                                def blockCount = toList(match.getElementsByTagName('variable')).findAll{
                                        it.getAttribute('name').startsWith('#block')
                                }.size()
                                nBlock = [nBlock, blockCount].max()
                        }
                        println "done ( $nBlock )"

                        // Table headers
                        println 'Writing concordance...'
                        int nTypes = [ntTypes.size(), tTypes.size()].max()

                        // Header names are stored as plain Strings (not GStrings) so that
                        // they can be used directly as map keys further down.
                        def header = ['sId', 'LeftCxOutsideSnt', 'LeftCxInsideSnt']
                        for (int i = nBlock ; i > 0 ; i--) {
                                header.add("${i}BlockBeforePivot".toString())
                                for (int j = 0 ; j < nTypes ; j++) {
                                        header.add("${i}BlockBeforePivotType${j+1}".toString())
                                }
                        }
                        header.add('Pivot')
                        for (int j = 0 ; j < nTypes ; j++) {
                                header.add("PivotType${j+1}".toString())
                        }
                        for (int i = 1 ; i <= nBlock ; i++) {
                                header.add("${i}BlockAfterPivot".toString())
                                for (int j = 0 ; j < nTypes ; j++) {
                                        header.add("${i}BlockAfterPivotType${j+1}".toString())
                                }
                        }
                        header.addAll(['RightCxInsideSnt', 'RightCxOutsideSnt', 'Warnings'])

                        writer.write(header.join("\t")+"\n");

                        def allTerminals = toList(records.getElementsByTagName('t'))
                        int tenPercentile = 0

                        int i = 0;
                        for (def aMatchNode : allMatches) {
                                if ((int)(((float)++i / allMatches.size()) * 10) > tenPercentile)
                                        println ""+(++tenPercentile * 10)+ ' percent complete...'

                                def (rowDict, inSntCxLengthLeft, inSntCxLengthRight) = match2CSVrow(aMatchNode, header);

                                // A GString key and a String key with the same text hash
                                // differently, so a String lookup would miss every cell stored
                                // under an interpolated key. Re-key the row by String first.
                                def row = [:]
                                rowDict.each { key, value -> row[key.toString()] = value }

                                // Add out-of-sentence context
                                def sNode = aMatchNode.parentNode.parentNode; // sentence
                                def terminals = toList(sNode.getElementsByTagName('t'));
                                def firstTInSIx = allTerminals.indexOf(terminals[0]) // text position of the first terminal of the sentence
                                def lastTInSIx = allTerminals.indexOf(terminals[-1]) // text position of the last terminal of the sentence

                                // Left context: the words still missing to reach cx, ending just before the sentence
                                int start = Math.max(firstTInSIx - cx + inSntCxLengthLeft, 0);
                                int end = firstTInSIx;
                                row['LeftCxOutsideSnt'] = outsideContext(allTerminals, start, end)

                                // Right context: starts right after the sentence. 'end' is
                                // exclusive, hence start + (cx - in-sentence length).
                                start = lastTInSIx + 1;
                                end = Math.min(allTerminals.size(), start + cx - inSntCxLengthRight)
                                row['RightCxOutsideSnt'] = outsideContext(allTerminals, start, end)

                                // Write the row in header order
                                def cells = header.collect { h ->
                                        def val = row[h]
                                        val == null ? "--" : val.toString()
                                }
                                writer.write(cells.join("\t")+"\n")
                        }
                } finally {
                        // Close (and thereby flush) the output even when parsing or a row fails.
                        writer.close();
                }
                return true;
        }

        /**
         * Joins the 'word' attributes of allTerminals[start..<end] with spaces,
         * inserting a '/' after every terminal that is the last 't' element of its
         * parent. Returns an empty string when start >= end.
         */
        private String outsideContext(def allTerminals, int start, int end) {
                def lexs = []
                if (start < end) {
                        for (def tNode : allTerminals.subList(start, end)) {
                                lexs.add(tNode.getAttribute('word'))
                                if (toList(tNode.parentNode.getElementsByTagName('t'))[-1] == tNode)
                                        lexs.add('/')
                        }
                }
                return lexs.join(" ")
        }
155

    
156
        /**
         * Builds the in-sentence part of one concordance row for a 'match' element.
         *
         * Every '#pivot' / '#block...' variable of the match is resolved (via its
         * idref) to a node, described by its lexical form, its type attributes and
         * the sentence positions of its terminals, then the descriptions are sorted
         * by their first terminal position. Blocks before / after the pivot are
         * numbered outwards from the pivot. Words lying between two consecutive
         * elements are attached in braces; '{?}' marks a gap that cannot be
         * determined because the elements overlap.
         *
         * All keys of the returned row map are plain Strings.
         *
         * @param aMatchNode the 'match' element; its grandparent is used as sentence node
         * @param header the column names (not used by this method)
         * @return a 3-element list: the row map, the number of in-sentence words
         *         left of the first element, and the number of in-sentence words
         *         right of the last element
         * @throws IllegalStateException when the match has no '#pivot' variable
         */
        private def match2CSVrow(def aMatchNode, def header) {
                def sNode = aMatchNode.parentNode.parentNode // get the sentence node
                def tNodesInSentence = toList(sNode.getElementsByTagName('t')) // all terminal nodes of the sentence
                int nTypes = [ntTypes.size(), tTypes.size()].max()

                // Step 1: describe every pivot/block variable with a map
                def varDetails = []
                for (def aVariable : aMatchNode.getElementsByTagName('variable')) {
                        String varName = aVariable.getAttribute('name')
                        if (varName != '#pivot' && !varName.startsWith('#block'))
                                continue

                        def varParent = idKey(sNode, aVariable.getAttribute('idref'))
                        def terminals = toList(getTNodes(varParent))
                        def details = [
                                'name' : varName.substring(1), // variable name without the leading '#'
                                'parent': varParent,
                                'terminals': terminals,
                                'lexform': writeLexForm(varParent),
                        ]

                        // Add types: the first ntTypes / tTypes attribute decides which
                        // of the two attribute lists applies to this node.
                        for (int j = 0 ; j < nTypes ; j++) {
                                def a = ''
                                if (j < ntTypes.size() && varParent.getAttribute(ntTypes[0])) {
                                        a = varParent.getAttribute(ntTypes[j])
                                } else if (j < tTypes.size() && varParent.getAttribute(tTypes[0])) {
                                        a = varParent.getAttribute(tTypes[j])
                                }
                                details['type' + (j + 1)] = a
                        }

                        // positions in the sentence of the terminal nodes used
                        details['terminalsIx'] = terminals.collect{tNodesInSentence.indexOf(it)}
                        varDetails.add(details)
                }

                // Sort varDetails by the position of their first word
                varDetails = varDetails.sort{it['terminalsIx'].min()}

                int pivotBlockPosition = varDetails.findIndexOf{it['name'] == 'pivot'}
                if (pivotBlockPosition < 0) {
                        // Without this check index -1 would silently promote the last block to pivot.
                        throw new IllegalStateException("No #pivot variable found in a match of sentence '" + sNode.getAttribute('id') + "'")
                }
                def pivot = varDetails[pivotBlockPosition]

                // Write the table
                def rowDict = [:]
                rowDict['sId'] = sNode.getAttribute('id')

                // write the pivot
                rowDict['Pivot'] = pivot['lexform']
                for (int j = 0 ; j < nTypes ; j++) {
                        rowDict['PivotType' + (j + 1)] = pivot['type' + (j + 1)]
                }

                def startNextId = pivot['terminalsIx'].min()
                def endPrevId = pivot['terminalsIx'].max()

                // Write the Pre-pivot blocks, walking leftwards from the pivot
                int i = 0
                for (int j = pivotBlockPosition - 1 ; j >= 0 ; j--) {
                        i++
                        def block = varDetails.get(j)
                        String column = "${i}BlockBeforePivot".toString()

                        rowDict[column] = block['lexform']
                        for (int k = 0 ; k < nTypes ; k++) {
                                rowDict[column + 'Type' + (k + 1)] = block['type' + (k + 1)]
                        }

                        // Add any intervening words to the right edge.
                        // Not possible when no terminal of the block precedes the next element.
                        def rightEdge = block['terminalsIx'].findAll{ it < startNextId }.max()
                        if (rightEdge == null) {
                                rowDict[column] += ' {?}'
                        } else {
                                for (int ix = rightEdge + 1 ; ix < startNextId ; ix++) {
                                        rowDict[column] += ' {' + tNodesInSentence[ix].getAttribute('word') + '}'
                                }
                        }

                        startNextId = block['terminalsIx'].min()
                }

                // Write the in-sentence left context
                def leftWords = tNodesInSentence.subList(0, startNextId).collect {it.getAttribute('word')}
                rowDict['LeftCxInsideSnt'] = leftWords.join(" ")
                def inSntCxLengthLeft = startNextId

                // Write the Post-pivot blocks
                i = 0
                for (def block in varDetails.subList(pivotBlockPosition + 1, varDetails.size())) {
                        i++
                        String column = "${i}BlockAfterPivot".toString()

                        rowDict[column] = block['lexform']
                        for (int j = 0 ; j < nTypes ; j++) {
                                rowDict[column + 'Type' + (j + 1)] = block['type' + (j + 1)]
                        }

                        // Add any intervening words to the left edge.
                        // Note that this isn't always possible, in particular
                        // if the preceding element is discontinuous.
                        def leftEdgeList = block['terminalsIx'].findAll{ it > endPrevId }
                        if (leftEdgeList.size() > 0) {
                                def leftEdge = leftEdgeList.min()
                                for (int ix = leftEdge - 1 ; ix > endPrevId ; ix--) {
                                        rowDict[column] = '{' + tNodesInSentence[ix].getAttribute('word') + '} ' + rowDict[column]
                                }
                        } else {
                                rowDict[column] = '{?} ' + rowDict[column]
                        }
                        endPrevId = block['terminalsIx'].max()
                }

                // Write the in-sentence right-context
                def rightWords = tNodesInSentence.subList(endPrevId + 1, tNodesInSentence.size()).collect {it.getAttribute('word')}
                rowDict['RightCxInsideSnt'] = rightWords.join(" ")

                // Number of words actually emitted on the right (size - endPrevId - 1)
                def inSntCxLengthRight = rightWords.size()

                return [rowDict, inSntCxLengthLeft, inSntCxLengthRight]
        }
303

    
304
        /**
         * Copies the elements of anything Groovy can iterate over (for example a
         * DOM NodeList) into a new collection by keeping every element.
         */
        public def toList(def iterable) {
                def keepEverything = { element -> true }
                return iterable.findAll(keepEverything)
        }
307

    
308
        /**
         * Returns the first descendant element of sNode (in document order) whose
         * 'id' attribute equals anId, or null when there is none. sNode itself is
         * not considered.
         */
        public def idKey(def sNode, String anId) {
                for (def candidate : sNode.getElementsByTagName("*")) {
                        if (candidate.getAttribute("id") == anId) {
                                return candidate
                        }
                }
                return null
        }
314

    
315
        /**
         * Returns the list of all terminal nodes reachable from theNode.
         *
         * A node without any 'edge' descendant counts as a terminal. Otherwise the
         * idref of each of its edges is resolved, among the descendants of
         * theNode's grandparent, and the resolved nodes are processed in turn.
         *
         * @param theNode the node to expand
         * @return the terminal nodes, in the order they were reached
         * @throws IllegalStateException when an edge refers to an id that cannot be resolved
         */
        def getTNodes(theNode) {
                def terminals = []
                def unprocessed = [theNode]
                // Every idref is resolved against the same ancestor, so look it up once.
                def searchRoot = theNode.parentNode?.parentNode

                while (unprocessed.size() > 0) {
                        def aNode = unprocessed.pop()
                        def edges = toList(aNode.getElementsByTagName('edge'))
                        if (edges.size() == 0) {
                                terminals.add(aNode)
                                continue
                        }
                        for (def anEdge : edges) {
                                String targetId = anEdge.getAttribute('idref')
                                def target = idKey(searchRoot, targetId)
                                if (target == null) {
                                        // A null entry would only fail later with an anonymous NullPointerException.
                                        throw new IllegalStateException("Edge refers to unknown node id '" + targetId + "'")
                                }
                                unprocessed.add(target)
                        }
                }
                return terminals;
        }
334

    
335
        /**
         * Builds the lexical form of a node.
         *
         * The 't' elements below theNode's grandparent are scanned in document
         * order. The 'word' of every terminal belonging to theNode is emitted;
         * words of foreign terminals lying between two terminals of theNode are
         * emitted in square brackets. Foreign words before the first or after the
         * last terminal of theNode are dropped.
         *
         * @param theNode the node a variable refers to
         * @return the collected words joined with single spaces
         */
        def writeLexForm(theNode) {
                def container = theNode.parentNode.parentNode
                def candidates = container.getElementsByTagName('t')
                def ownTerminals = getTNodes(theNode) // terminals reachable from theNode

                def words = []
                def pendingGap = [] // bracketed foreign words seen since the last own terminal
                boolean started = false

                for (def candidate : candidates) {
                        if (ownTerminals.contains(candidate)) {
                                started = true
                                // the gap is only kept once another own terminal follows it
                                words.addAll(pendingGap)
                                words.add(candidate.getAttribute('word'))
                                pendingGap = []
                        } else if (started) {
                                pendingGap.add('[' + candidate.getAttribute('word') + ']')
                        }
                }
                return words.join(" ");
        }
359

    
360
        /**
         * Merges, inside every 'matches' element, the matches whose '#pivot'
         * variable has the same idref.
         *
         * All variables of a later match with an already seen pivot are moved into
         * the first match with that pivot. Afterwards duplicates in that match are
         * removed: two variables are duplicates when they share the idref and
         * either have the same name or are both '#block...' variables. Finally
         * 'match' elements left without any variable are removed.
         *
         * @param theDOM element whose 'matches' descendants are processed in place
         * @return theDOM itself
         */
        def matchCombine(theDOM) {
                for (def aMatchesNode : theDOM.getElementsByTagName('matches')) {

                        // snapshot of the pivot variables, in document order
                        def pivotNodes = toList(aMatchesNode.getElementsByTagName('variable'))
                                        .findAll{ it.getAttribute('name') == '#pivot' }
                        def checkedPivots = []

                        for (def aPivotNode : pivotNodes) {
                                def knownPivot = checkedPivots.find{
                                        it.getAttribute('idref') == aPivotNode.getAttribute('idref')
                                }

                                if (knownPivot == null) {
                                        // not the same pivot
                                        checkedPivots.add(aPivotNode)
                                        continue
                                }

                                // duplicate pivot; move all variables to the first match with this pivot
                                def movedVariables = toList(aPivotNode.getParentNode().getElementsByTagName('variable'))
                                for (def node : movedVariables) {
                                        knownPivot.getParentNode().appendChild(node)
                                }

                                // remove duplicates, keeping the first occurrence
                                def mergedVariables = toList(knownPivot.getParentNode().getElementsByTagName('variable'))
                                def checkedVariables = []
                                for (def aVariableNode : mergedVariables) {
                                        String name = aVariableNode.getAttribute('name')
                                        boolean alreadyPresent = checkedVariables.any{ kept ->
                                                String keptName = kept.getAttribute('name')
                                                kept.getAttribute('idref') == aVariableNode.getAttribute('idref') &&
                                                                (keptName == name ||
                                                                (keptName.startsWith('#block') && name.startsWith('#block')))
                                        }
                                        if (alreadyPresent) {
                                                aVariableNode.getParentNode().removeChild(aVariableNode)
                                        } else {
                                                checkedVariables.add(aVariableNode)
                                        }
                                }
                        }

                        // Tidying up: remove empty <match /> nodes
                        for (def matchNode : toList(aMatchesNode.getElementsByTagName('match'))) {
                                def remaining = toList(matchNode.getElementsByTagName('variable'))
                                if (!remaining) {
                                        aMatchesNode.removeChild(matchNode)
                                }
                        }
                }

                return theDOM
        }
455

    
456
        /**
         * Prints the command line synopsis to standard output.
         */
        public static void usage() {
                def helpLines = [
                        'concordance_blocks [OPTIONS] inputfile.xml [outputfile.csv]',
                        'OPTIONS',
                        '-h, --help           Displays this message.',
                        '-c --context [length] Sets the number of words in context.'
                ]
                for (String helpLine : helpLines) {
                        println helpLine
                }
        }
462

    
463
        /**
         * Command line entry point:
         * concordance_blocks [OPTIONS] inputfile.xml [outputfile.csv]
         *
         * Recognises -h / --help and -c / --context LENGTH as announced by
         * usage(). Prints the usage text and returns when no input file is given
         * or an option is malformed. When no output file is given, the table is
         * written next to the input file, to the input path with '.csv' appended.
         * Type attributes are fixed to 'cat' (ntTypes) and 'pos' (tTypes).
         */
        static main(args) {
                int cx = 30;
                def ntTypes = ['cat'];
                def tTypes = ['pos'];

                def positional = []
                for (int k = 0 ; k < args.length ; k++) {
                        String arg = args[k]
                        if (arg == '-h' || arg == '--help') {
                                usage()
                                return
                        }
                        if (arg == '-c' || arg == '--context') {
                                // the option needs an integer value right after it
                                if (k + 1 >= args.length || !args[k + 1].isInteger()) {
                                        usage()
                                        return
                                }
                                cx = args[++k].toInteger()
                                continue
                        }
                        positional.add(arg)
                }

                if (positional.size() == 0) {
                        // nothing to read: show the help instead of failing on a missing argument
                        usage()
                        return
                }

                File xmlfile = new File(positional[0])
                File outfile = positional.size() > 1 ? new File(positional[1]) : new File(xmlfile.getPath() + '.csv')

                ConcordanceBlocks p = new ConcordanceBlocks();
                println "START"
                p.process(xmlfile, outfile, cx, ntTypes, tTypes)
                println "END"
        }
476
}