Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / groovy / org / txm / scripts / importer / srcmf / MasterReheader.groovy @ 1000

History | View | Annotate | Download (5.4 kB)

1
#! /usr/bin/groovy
2
package org.txm.scripts.importer.srcmf;
3

    
4
/*
5
 * To change this template, choose Tools | Templates
6
 * and open the template in the editor.
7
 */
8

    
9
// Set up globals
10
// def masterFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos.xml')
11
// def outputFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos2.xml')
12
// def headerFile = new File('/home/tomr/Documents/Work/SRCMF/srcmf_ts/header_srcmf.xml')
13
// def feats = [nt:['cat', 'type', 'coord'], t:['pos', 'form', 'q']]
14
// def firstFeat = [t:'word', nt:'cat']
15

    
16
// Command-line entry point.
//
// Usage: MasterReheader.groovy [options] master_file.xml header_file.xml
// Parses the options, resolves the output file and the default "first"
// features, then delegates all of the work to script().
def cli = new CliBuilder(
    usage:'MasterReheader.groovy [options] master_file.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
cli.nt(longOpt:'ntfeats', args:1, argName:'ntfeats', 'NT features for which to provide value node.')
cli.t(longOpt:'tfeats', args:1, argName:'tfeats', 'T features for which  to provide value node.')
cli.nt1(longOpt:'ntfeat1st', args:1, argName:'1st ntfeat', 'First NT feature listed in header.')
cli.t1(longOpt:'tfeat1st', args:1, argName:'1st tfeat', 'First T feature listed in header.')
options = cli.parse(args)

// Normalises a multi-valued option into a flat list of feature names.
// When the option is absent the accessor yields `false` rather than a list,
// so anything that is not a collection becomes an empty list. Each value may
// itself be a comma-separated list (e.g. "-nt cat,type,coord").
def toFeatureList = { optionValue ->
    if (!(optionValue instanceof Collection)) {
        return []
    }
    optionValue.collectMany { raw ->
        raw.toString().split(',').collect { part -> part.trim() }.findAll { part -> part }
    }
}

if (options == null) {
    // CliBuilder.parse() returns null for an invalid command line, having
    // already reported the problem; there is nothing left to do.
    return
}
if (options.h) {
    cli.usage()
    return
}
if (options.arguments().size() == 2) {
    def masterFile = new File(options.arguments()[0])
    def headerFile = new File(options.arguments()[1])
    def masterFolder = masterFile.getCanonicalFile().getParent()
    // Default output goes next to the master file.
    def outputFile = options.o ?
        new File(options.o) :
        new File(masterFolder, 'MasterReheader_out.xml')
    // Fall back to the conventional first features when none is requested.
    def ntfirst = options.nt1 ?: 'cat'
    def tfirst = options.t1 ?: 'word'
    script(
        masterFile, headerFile, outputFile,
        ['nt':toFeatureList(options.nts), 't':toFeatureList(options.ts)],
        ['nt':ntfirst, 't':tfirst]
    )
} else {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints the help text itself; wrapping it in println would
    // additionally print "null".
    cli.usage()
}
54

    
55
/**
 * Rewrites the feature declarations in the head of a master file.
 *
 * Every subcorpus file referenced from the master body is scanned to collect
 * the attribute names and attribute values actually used on "t" and "nt"
 * nodes. The existing "feature" nodes under head/annotation are removed and
 * replaced by one "feature" node per collected attribute ("id" excluded).
 * For the features listed in feats, one "value" child is added per observed
 * value, with the text taken from the matching value in the header file, or
 * "[unknown]" when the header has no such value.
 *
 * @param masterFile  master XML file; subcorpus paths are resolved relative
 *                    to its folder
 * @param headerFile  XML file providing the descriptive text of known values
 * @param outputFile  file the modified master document is written to (UTF-8)
 * @param feats       map with keys 't' and 'nt' giving the feature names for
 *                    which value nodes are generated; an entry that is
 *                    missing or not a collection means "no feature"
 * @param firstFeat   map with keys 't' and 'nt' naming the feature to list
 *                    first; an entry that does not name a collected attribute
 *                    is replaced IN THIS MAP by the alphabetically first
 *                    attribute name
 */
def script(
    File masterFile, File headerFile, File outputFile, HashMap feats, HashMap firstFeat
) {
    // Load master and header files
    def master = new XmlParser().parse(masterFile)
    def header = new XmlParser().parse(headerFile)
    def masterFolder = masterFile.getCanonicalFile().getParent()

    // attribute name -> set of observed values, per node type
    def attrVal = [nt:[:], t:[:]]

    // Records every attribute of a t / nt node into attrVal.
    def getvals = { node, type ->
        node.attributes().each { mEntry ->
            if (! attrVal[type].containsKey(mEntry.getKey())) {
                attrVal[type][mEntry.getKey()] = new HashSet()
            }
            attrVal[type][mEntry.getKey()].add(mEntry.getValue())
        }
    }

    // Scan subcorpus files and build attribute lists.
    master.body.subcorpus.each { subcorpusRef ->
        // The first five characters of @external are dropped
        // (presumably a "file:" prefix -- confirm against the input format).
        def subcorpusFile = new File (masterFolder, subcorpusRef.'@external'[5..-1])
        def subcorpus = new XmlParser().parse(subcorpusFile)
        subcorpus.s.graph.terminals.t.each { getvals.call(it, 't') }
        subcorpus.s.graph.nonterminals.nt.each { getvals.call(it, 'nt') }
    }
    // Id isn't an attribute in the header.
    attrVal['t'].remove('id')
    attrVal['nt'].remove('id')

    // Remove old feature nodes in master file. The GPath result is a separate
    // list, so detaching nodes from their parent while iterating is safe.
    master.head.annotation.feature.each { oldFeature ->
        oldFeature.parent().remove(oldFeature)
    }
    assert (! master.head.annotation.feature)

    // Check firstFeat was relevant; otherwise fall back to the
    // alphabetically first attribute name.
    ['t', 'nt'].each { type ->
        if (! attrVal[type].containsKey(firstFeat[type])) {
            firstFeat[type] = attrVal[type].keySet().sort()[0]
        }
    }
    assert attrVal['t'].keySet().contains(firstFeat['t'])
    assert attrVal['nt'].keySet().contains(firstFeat['nt'])

    // Feature order: the chosen first feature, then all the others.
    def featList = [:]
    ['t', 'nt'].each { type ->
        featList[type] = [firstFeat[type]]
        featList[type].addAll(attrVal[type].keySet().findAll { it != firstFeat[type] })
    }

    // The header does not change while nodes are added to the master,
    // so its feature nodes are looked up once.
    def headerFeatures = header.'**'.feature

    // Add new feature and value nodes
    ['t', 'nt'].each { type ->
        // Tolerate a missing map or an entry that is not a collection
        // (e.g. `false` coming from an absent command-line option).
        def requested = feats ? feats[type] : null
        def valueFeats = (requested instanceof Collection) ? requested : []
        featList[type].each { feat ->
            def fNode = new Node(master.head.annotation[0], 'feature',
                ['domain':type.toUpperCase(), 'name':feat]
            )
            // Add value node if the node value is given in 'feats'
            if (valueFeats.contains(feat)) {
                attrVal[type][feat].each { value ->
                    assert headerFeatures
                    assert headerFeatures[0].'@name'
                    assert headerFeatures[0].'@domain'
                    assert ['NT', 'T'].contains(headerFeatures[0].'@domain')
                    def hFNode = headerFeatures.find {
                        it.'@name' == feat && (
                            it.'@domain' == type.toUpperCase() || it.'@domain' == 'FREC'
                        )
                    }
                    def hVNode = hFNode ? hFNode.value.find { it.'@name' == value } : null
                    def vText = hVNode ? hVNode.text() : '[unknown]'
                    new Node(fNode, 'value', ['name':value], vText)
                }
            }
        }
    }

    // Save to output_file. XmlUtil.serialize() declares encoding="UTF-8" in
    // the XML prolog, so the bytes must be written as UTF-8 too.
    outputFile.withWriter('UTF-8') { writer ->
        writer << groovy.xml.XmlUtil.serialize(master)
    }
}
138