Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / export / ts / ConcordanceBlocks.groovy @ 187

History | View | Annotate | Download (12.5 kB)

1
package org.txm.export.ts;
2

    
3
import javax.xml.parsers.DocumentBuilderFactory
4

    
5
class ConcordanceBlocks {
6
        // Character encoding handed to outfile.newPrintWriter() when the concordance is written.
        def codec = 'UTF-8'
7

    
8
        // Context window size in words; process() uses it to size the out-of-sentence context.
        int cx = 30;
9
        // Attribute names read from a matched node when it carries the first name of this list
        // (main() passes ['cat']).
        def ntTypes = [];
10
        // Attribute names read from a matched node otherwise, when it carries the first name of
        // this list (main() passes ['pos']).
        def tTypes = [];
11
        // Input XML file and output file of the last process() call.
        File xmlfile, outfile;
12

    
13
        /**
         * Reads an XML result file, merges the matches sharing a pivot and writes one
         * tab-separated row per remaining &lt;match&gt; element to the output file.
         *
         * Each row holds the sentence id, the context outside and inside the sentence,
         * the pivot, the blocks before and after the pivot (each followed by its type
         * columns) and a 'Warnings' column. Cells without a value are written as "--".
         *
         * The output file is opened only after the input has been parsed and the header
         * computed, and it is always closed, even when a row fails.
         *
         * @param xmlfile the XML file to read
         * @param outfile the file receiving the tab-separated concordance
         * @param cx total number of context words wanted on each side of a match
         * @param ntTypes attribute names to report for nodes carrying the first of them
         * @param tTypes attribute names to report for the other nodes
         * @return always true; failures are reported through exceptions
         */
        public boolean process(File xmlfile, File outfile, int cx, def ntTypes, def tTypes) {
                this.cx = cx;
                this.xmlfile = xmlfile;
                this.outfile = outfile;
                this.ntTypes = ntTypes;
                this.tTypes = tTypes;

                println 'Reading XML File'
                def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
                def records = builder.parse(xmlfile).documentElement
                println 'done.'

                println 'done.  Pre-treating file to combine results with the same pivot...'
                records = matchCombine(records)
                println 'done.'

                println 'Calculating max. number of blocks...'
                int nBlock = 0
                def allMatches = toList(records.getElementsByTagName('match'))
                for (def match : allMatches) {
                        int blocksInMatch = toList(match.getElementsByTagName('variable')).findAll {
                                it.getAttribute('name').startsWith('#block')
                        }.size()
                        nBlock = Math.max(nBlock, blocksInMatch)
                }
                println "done ( $nBlock )"

                // Table headers
                println 'Writing concordance...'
                int typeCount = Math.max(ntTypes.size(), tTypes.size())

                // Column names are stored as plain Strings: a GString does not hash like
                // the equal String, so it must never be used as a map key.
                def header = ['sId', 'LeftCxOutsideSnt', 'LeftCxInsideSnt']
                for (int b = nBlock ; b > 0 ; b--) {
                        header.add("${b}BlockBeforePivot".toString())
                        for (int t = 1 ; t <= typeCount ; t++) {
                                header.add("${b}BlockBeforePivotType${t}".toString())
                        }
                }
                header.add('Pivot')
                for (int t = 1 ; t <= typeCount ; t++) {
                        header.add("PivotType${t}".toString())
                }
                for (int b = 1 ; b <= nBlock ; b++) {
                        header.add("${b}BlockAfterPivot".toString())
                        for (int t = 1 ; t <= typeCount ; t++) {
                                header.add("${b}BlockAfterPivotType${t}".toString())
                        }
                }
                header.addAll(['RightCxInsideSnt', 'RightCxOutsideSnt', 'Warnings'])

                def allTerminals = toList(records.getElementsByTagName('t'))

                Writer writer = outfile.newPrintWriter(codec)
                try {
                        writer.write(header.join("\t") + "\n")

                        int tenPercentile = 0
                        int done = 0
                        for (def aMatchNode : allMatches) {
                                // Report each newly reached tenth of the work exactly once.
                                done++
                                int decile = (done * 10).intdiv(allMatches.size())
                                if (decile > tenPercentile) {
                                        tenPercentile = decile
                                        println "" + (tenPercentile * 10) + ' percent complete...'
                                }

                                def (rowDict, inSntCxLengthLeft, inSntCxLengthRight) = match2CSVrow(aMatchNode, header);

                                // Re-key the row with Strings so the header lookup below works
                                // even when the row was filled using GString keys.
                                def row = [:]
                                rowDict.each { key, value -> row[key.toString()] = value }

                                // Add out-of-sentence context
                                def sNode = aMatchNode.parentNode.parentNode; // sentence
                                def terminals = toList(sNode.getElementsByTagName('t'));
                                def firstTInSIx = allTerminals.indexOf(terminals[0]) // position of the first sentence terminal in the text
                                def lastTInSIx = allTerminals.indexOf(terminals[-1]) // position of the last sentence terminal in the text

                                // Left context: the words just before the sentence, topping the
                                // in-sentence left context up to cx words.
                                int start = Math.max(firstTInSIx - cx + inSntCxLengthLeft, 0);
                                int end = firstTInSIx;
                                row['LeftCxOutsideSnt'] = joinOutsideContext(allTerminals, start, end)

                                // Right context: 'end' is exclusive, hence the "+ 1" so that the
                                // window mirrors the left one.
                                start = lastTInSIx + 1;
                                end = Math.min(allTerminals.size(), lastTInSIx + 1 + cx - inSntCxLengthRight)
                                row['RightCxOutsideSnt'] = joinOutsideContext(allTerminals, start, end)

                                // Write the row in header order; missing cells become "--".
                                def cells = header.collect { column ->
                                        def value = row[column]
                                        value == null ? '--' : value.toString()
                                }
                                writer.write(cells.join("\t") + "\n")
                        }
                } finally {
                        // close() also flushes whatever has been written so far.
                        writer.close();
                }
                return true;
        }

        /**
         * Joins the 'word' attribute of allTerminals[start..<end] with spaces, adding a
         * '/' after every terminal that is the last &lt;t&gt; of its parent element.
         *
         * @return the joined words, or an empty string when start >= end
         */
        private String joinOutsideContext(def allTerminals, int start, int end) {
                def lexs = []
                if (start < end) {
                        for (def tNode : allTerminals.subList(start, end)) {
                                lexs.add(tNode.getAttribute('word'))
                                if (toList(tNode.parentNode.getElementsByTagName('t'))[-1] == tNode)
                                        lexs.add('/')
                        }
                }
                return lexs.join(" ")
        }
153

    
154
        /**
         * Builds the concordance row of one &lt;match&gt; element.
         *
         * The '#pivot' and '#block...' variables of the match are resolved to the nodes
         * they reference, ordered by the position of their first terminal in the
         * sentence, and laid out around the pivot. Words lying between two consecutive
         * blocks are attached to the outer block inside curly braces; '{?}' marks a
         * block for which no such gap can be determined.
         *
         * All keys of the returned map are plain Strings (a GString key would not be
         * found by a String lookup).
         *
         * @param aMatchNode the &lt;match&gt; element; its grand-parent is used as the sentence
         * @param header the column names (not used here)
         * @return a list [rowDict, inSntCxLengthLeft, inSntCxLengthRight]: the row map,
         *         the number of sentence words before the first block and the number of
         *         sentence words after the last block
         * @throws IllegalStateException if the match has no '#pivot' variable
         */
        private def match2CSVrow(def aMatchNode, def header) {
                def sNode = aMatchNode.parentNode.parentNode // get the sentence node
                def tNodesInSentence = toList(sNode.getElementsByTagName('t')) // all terminal nodes of the sentence
                int typeCount = Math.max(ntTypes.size(), tTypes.size())

                // Step 1: describe every pivot/block variable with a map holding its name
                // (without the hash), the referenced node, its terminals, its lexical
                // form, its types and the sentence positions of its terminals.
                def varDetails = []
                for (def aVariable : aMatchNode.getElementsByTagName('variable')) {
                        String varName = aVariable.getAttribute('name')
                        if (varName != '#pivot' && !varName.startsWith('#block'))
                                continue

                        def varParent = idKey(sNode, aVariable.getAttribute('idref'))
                        def details = [
                                'name' : varName.substring(1),
                                'parent': varParent,
                                'terminals': toList(getTNodes(varParent)),
                                'lexform': writeLexForm(varParent),
                        ]

                        // Add types: the presence of the FIRST attribute of a list decides
                        // which list applies to this node.
                        for (int j = 0 ; j < typeCount ; j++) {
                                def a = ''
                                if (j < ntTypes.size() && varParent.getAttribute(ntTypes[0])) {
                                        a = varParent.getAttribute(ntTypes[j])
                                } else if (j < tTypes.size() && varParent.getAttribute(tTypes[0])) {
                                        a = varParent.getAttribute(tTypes[j])
                                }
                                details["type${j + 1}".toString()] = a
                        }

                        // positions in the sentence of the terminal nodes used
                        details['terminalsIx'] = details['terminals'].collect { tNodesInSentence.indexOf(it) }
                        varDetails.add(details)
                }

                // Sort varDetails by the position of the first word
                varDetails = varDetails.sort { it['terminalsIx'].min() }

                int pivotBlockPosition = varDetails.findIndexOf { it['name'] == 'pivot' }
                if (pivotBlockPosition < 0) {
                        // A negative index would silently select the LAST block as pivot.
                        throw new IllegalStateException("No #pivot variable in a match of sentence '"
                                        + sNode.getAttribute('id') + "'")
                }
                def pivot = varDetails[pivotBlockPosition]

                // Write the table
                def rowDict = [:]
                rowDict['sId'] = sNode.getAttribute('id')

                // write the pivot
                rowDict['Pivot'] = pivot['lexform']
                for (int j = 1 ; j <= typeCount ; j++) {
                        rowDict["PivotType${j}".toString()] = pivot["type${j}".toString()]
                }

                def startNextId = pivot['terminalsIx'].min()
                def endPrevId = pivot['terminalsIx'].max()

                // Write the Pre-pivot blocks, walking away from the pivot
                int i = 0
                for (int j = pivotBlockPosition - 1 ; j >= 0 ; j--) {
                        i++
                        def block = varDetails.get(j)
                        String column = "${i}BlockBeforePivot".toString()
                        String text = block['lexform']

                        for (int k = 1 ; k <= typeCount ; k++) {
                                rowDict["${i}BlockBeforePivotType${k}".toString()] = block["type${k}".toString()]
                        }

                        // Add any intervening words to the right edge. This is not possible
                        // when no terminal of the block precedes the next block (e.g. the
                        // block starts on the same word); mark it like the post-pivot case.
                        def precedingIx = block['terminalsIx'].findAll { it < startNextId }
                        if (precedingIx) {
                                int rightEdge = precedingIx.max()
                                for (int ix = rightEdge + 1 ; ix < startNextId ; ix++) {
                                        text += ' {' + tNodesInSentence[ix].getAttribute('word') + '}'
                                }
                        } else {
                                text += ' {?}'
                        }
                        rowDict[column] = text

                        startNextId = block['terminalsIx'].min()
                }

                // Write the in-sentence left context
                def lexs = tNodesInSentence.subList(0, startNextId).collect { it.getAttribute('word') }
                rowDict['LeftCxInsideSnt'] = lexs.join(" ")
                def inSntCxLengthLeft = startNextId

                // Write the Post-pivot blocks
                i = 0
                for (def block in varDetails.subList(pivotBlockPosition + 1, varDetails.size())) {
                        i++
                        String column = "${i}BlockAfterPivot".toString()
                        String text = block['lexform']

                        for (int j = 1 ; j <= typeCount ; j++) {
                                rowDict["${i}BlockAfterPivotType${j}".toString()] = block["type${j}".toString()]
                        }

                        // Add any intervening words to the left edge.
                        // Note that this isn't always possible, in particular
                        // if the preceding element is discontinuous.
                        def leftEdgeList = block['terminalsIx'].findAll { it > endPrevId }
                        if (leftEdgeList.size() > 0) {
                                int leftEdge = leftEdgeList.min()
                                for (int ix = leftEdge - 1 ; ix > endPrevId ; ix--) {
                                        text = '{' + tNodesInSentence[ix].getAttribute('word') + '} ' + text
                                }
                        } else {
                                text = '{?} ' + text
                        }
                        rowDict[column] = text

                        endPrevId = block['terminalsIx'].max()
                }

                // Write the in-sentence right-context
                lexs = tNodesInSentence.subList(endPrevId + 1, tNodesInSentence.size()).collect { it.getAttribute('word') }
                rowDict['RightCxInsideSnt'] = lexs.join(" ")

                // Number of words actually placed in the right context.
                def inSntCxLengthRight = lexs.size()

                return [rowDict, inSntCxLengthLeft, inSntCxLengthRight]
        }
301

    
302
        /**
         * Copies the elements of anything Groovy can iterate over (a DOM NodeList for
         * instance) by keeping every element with an always-true filter.
         *
         * @param iterable the elements to copy
         * @return the result of findAll with a filter accepting every element
         */
        public def toList(def iterable) {
                def acceptEverything = { true }
                def copy = iterable.findAll(acceptEverything)
                return copy
        }
305

    
306
        /**
         * Looks up an element by its "id" attribute among the descendants of sNode.
         *
         * @param sNode the element whose descendants are searched (sNode itself is not)
         * @param anId the wanted value of the "id" attribute
         * @return the first descendant, in document order, whose id equals anId, or
         *         null when there is none
         */
        public def idKey(def sNode, String anId) {
                def descendants = sNode.getElementsByTagName("*")
                for (def candidate : descendants) {
                        if (candidate.getAttribute("id") == anId) {
                                return candidate
                        }
                }
                return null
        }
312

    
313
        /**
         * Returns the terminal nodes reachable from theNode.
         *
         * A node without any &lt;edge&gt; descendant is a terminal and is returned as is.
         * Otherwise each edge's "idref" is resolved among the descendants of
         * theNode's grand-parent and the referenced nodes are explored in turn.
         *
         * The id index is built once, and only when an edge has to be followed. When
         * several elements share an id, the first in document order is used. The order
         * of the returned terminals is unspecified.
         *
         * @param theNode the node to expand
         * @return the list of terminal nodes under theNode
         * @throws IllegalStateException if an edge references an id that does not exist
         */
        def getTNodes(theNode) {
                def terminals = []

                // Explicit stack: List.pop() does not remove the same end of the list in
                // every Groovy version.
                def pending = new ArrayDeque()
                pending.push(theNode)

                def nodesById = null

                while (!pending.isEmpty()) {
                        def aNode = pending.pop()
                        def edges = aNode.getElementsByTagName('edge')
                        if (edges.getLength() == 0) {
                                terminals.add(aNode)
                                continue
                        }

                        if (nodesById == null) {
                                nodesById = [:]
                                def candidates = theNode.parentNode.parentNode.getElementsByTagName('*')
                                for (int n = 0 ; n < candidates.getLength() ; n++) {
                                        def candidate = candidates.item(n)
                                        String id = candidate.getAttribute('id')
                                        if (!nodesById.containsKey(id))
                                                nodesById.put(id, candidate)
                                }
                        }

                        for (int e = 0 ; e < edges.getLength() ; e++) {
                                String targetId = edges.item(e).getAttribute('idref')
                                def target = nodesById.get(targetId)
                                if (target == null)
                                        throw new IllegalStateException("Edge references unknown id '" + targetId + "'")
                                pending.push(target)
                        }
                }
                return terminals;
        }
332

    
333
        /**
         * Builds the lexical form of a node: the words of its terminals, in sentence
         * order, separated by spaces. Words of foreign terminals lying between two
         * terminals of the node are kept inside square brackets; foreign words before
         * the first or after the last terminal of the node are left out.
         *
         * @param theNode the node to render; the &lt;t&gt; elements below its grand-parent
         *        are scanned
         * @return the space-joined words
         */
        def writeLexForm(theNode) {
                def scannedTerminals = theNode.parentNode.parentNode.getElementsByTagName('t')
                def ownTerminals = getTNodes(theNode) // terminals belonging to the node
                boolean started = false
                def words = []
                def gapWords = [] // bracketed foreign words seen since the last own word

                for (def terminal : scannedTerminals) {
                        if (ownTerminals.contains(terminal)) {
                                started = true
                                // the gap is now known to be inside the node: keep it
                                words.addAll(gapWords)
                                gapWords = []
                                words.add(terminal.getAttribute('word'))
                        } else if (started) {
                                gapWords.add('[' + terminal.getAttribute('word') + ']')
                        }
                }
                return words.join(" ");
        }
357

    
358
        /**
         * Merges, inside every &lt;matches&gt; element, the matches whose '#pivot' variable
         * references the same node.
         *
         * The variables of a later match are moved into the first match having the same
         * pivot. Variables of that match duplicating an earlier one (same idref and
         * either the same name or two '#block...' names) are then removed. Finally the
         * &lt;match&gt; elements left without any variable are removed.
         *
         * @param theDOM the element to rewrite in place
         * @return theDOM
         */
        def matchCombine(theDOM) {
                for (def matchesElement : theDOM.getElementsByTagName('matches')) {

                        def pivots = toList(matchesElement.getElementsByTagName('variable')).findAll {
                                it.getAttribute('name') == '#pivot'
                        }
                        def seenPivots = []

                        for (def pivot : pivots) {
                                def firstSeen = seenPivots.find {
                                        it.getAttribute('idref') == pivot.getAttribute('idref')
                                }

                                if (firstSeen == null) {
                                        // not the same pivot
                                        seenPivots.add(pivot)
                                        continue
                                }

                                // duplicate pivot; move all its match's variables to the kept match
                                def keptMatch = firstSeen.getParentNode()
                                def moved = toList(pivot.getParentNode().getElementsByTagName('variable'))
                                for (def variable : moved) {
                                        keptMatch.appendChild(variable)
                                }

                                // remove duplicates, keeping the first occurrence
                                def survivors = []
                                for (def variable : toList(keptMatch.getElementsByTagName('variable'))) {
                                        boolean duplicate = survivors.any { kept ->
                                                kept.getAttribute('idref') == variable.getAttribute('idref') &&
                                                (kept.getAttribute('name') == variable.getAttribute('name') ||
                                                (kept.getAttribute('name').startsWith('#block') &&
                                                variable.getAttribute('name').startsWith('#block')))
                                        }
                                        if (duplicate) {
                                                variable.getParentNode().removeChild(variable)
                                        } else {
                                                survivors.add(variable)
                                        }
                                }
                        }

                        // Tidying up: remove empty <match /> nodes
                        for (def matchElement : toList(matchesElement.getElementsByTagName('match'))) {
                                def remaining = toList(matchElement.getElementsByTagName('variable'))
                                if (remaining.isEmpty()) {
                                        matchesElement.removeChild(matchElement)
                                }
                        }
                }

                return theDOM
        }
453

    
454
        /**
         * Prints the command-line help on the standard output.
         */
        public static void usage() {
                def helpLines = [
                        'concordance_blocks [OPTIONS] inputfile.xml [outputfile.csv]',
                        'OPTIONS',
                        '-h, --help           Displays this message.',
                        '-c --context [length] Sets the number of words in context.'
                ]
                for (String helpLine : helpLines) {
                        println helpLine
                }
        }
460

    
461
        /**
         * Command-line entry point: concordance_blocks [OPTIONS] inputfile.xml [outputfile.csv]
         *
         * Recognised options are -h/--help (print the help and stop) and
         * -c/--context LENGTH (number of context words, 30 by default). When the output
         * file is omitted, it is derived from the input path by replacing a trailing
         * ".xml" with ".csv". Arguments after the output file are ignored. The help is
         * printed, and nothing is processed, when no input file is given or when the
         * context length is missing or not an integer.
         *
         * @param args the command-line arguments
         */
        static main(args) {
                int cx = 30;
                def positional = []

                for (int k = 0 ; k < args.length ; k++) {
                        String arg = args[k]
                        if (arg == '-h' || arg == '--help') {
                                usage()
                                return
                        }
                        if (arg == '-c' || arg == '--context') {
                                if (k + 1 >= args.length || !args[k + 1].isInteger()) {
                                        usage()
                                        return
                                }
                                cx = args[++k].toInteger()
                                continue
                        }
                        positional.add(arg)
                }

                if (positional.isEmpty()) {
                        usage()
                        return
                }

                String inPath = positional[0]
                String outPath = positional.size() > 1 ?
                                positional[1] :
                                inPath.replaceFirst(/(?i)\.xml$/, '') + '.csv'

                def ntTypes = ['cat'];
                def tTypes = ['pos'];
                File xmlfile = new File(inPath)
                File outfile = new File(outPath)
                ConcordanceBlocks p = new ConcordanceBlocks();
                println "START"
                p.process(xmlfile, outfile, cx, ntTypes, tTypes)
                println "END"
        }
474
}