Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / oriflamms / prepare / OntologiesProjection.groovy @ 479

History | View | Annotate | Download (13.5 kB)

1
package org.txm.macro.oriflamms.prepare
2

    
3
import java.util.regex.Pattern
4
import org.codehaus.groovy.transform.trait.SuperCallTraitTransformer;
5
import org.txm.importer.StaxIdentityParser;
6
import org.txm.importer.StaxParser;
7

    
8
/**
 * Identity XML transformation that adds three attributes to every word element
 * (tag {@link #wordTag}) of the processed file:
 * <ul>
 * <li>"sign": the standardized form of the character,</li>
 * <li>"allograph-expert": the allograph chosen from manually declared ontology entries,</li>
 * <li>"allograph-auto": the allograph chosen from automatically declared ("auto_") entries.</li>
 * </ul>
 * The values are resolved through the "&lt;textname&gt;-ontolinks.xml" link file found in the
 * "ontologies_links" directory of the corpus, and through the global ("ggly") and
 * local ("lgly") ontologies that the link file declares with its prefixDef elements.
 */
class OntologiesProjection extends StaxIdentityParser {

	/** Marker that starts the name of an automatically produced local glyph. */
	private static final String AUTO_MARKER = "auto_"

	/** The XML file being transformed. */
	File xmlFile

	/** Local name of the elements that receive the new attributes. */
	String wordTag
	/** File name of xmlFile without the ".xml" extension and without any "-c". */
	String textname
	/** Id of the last milestone element seen with unit="surface". */
	String milestone
	/** "type" attribute of the last linkGrp element read from the link file. */
	String group

	/** word id -> array of the ontology references linked to that word. */
	HashMap links = [:]
	/** prefix -> [match Pattern, replacement pattern String]. */
	HashMap prefixDefsPatterns = [:]
	/** prefix -> [char id -> properties map, Unicode char -> standardized char]. */
	HashMap<String, HashMap> ggly_ontologies = [:]
	/** prefix -> (glyph id -> glyph properties map). */
	HashMap<String, HashMap> lgly_ontologies = [:]

	/** Name of the link file currently loaded in the maps above. */
	String current_ontology_link_file_name

	/** Directory that contains the "*-ontolinks.xml" files. */
	File ontologies_links_directory;

	/** Prefixes whose ontology could not be built; avoids re-parsing and re-warning for every word. */
	private Set<String> unavailable_prefixes = new HashSet<String>()

	/**
	 * @param xmlFile the XML file to transform
	 * @param corpusDirectory the directory that contains the "ontologies_links" directory
	 */
	public OntologiesProjection(File xmlFile, File corpusDirectory) {
		super(xmlFile)

		this.xmlFile = xmlFile
		this.ontologies_links_directory = new File(corpusDirectory, "ontologies_links")

		textname = xmlFile.getName()
		int idx = textname.lastIndexOf(".xml")
		if (idx > 0) textname = textname.substring(0, idx)
		textname = textname.replaceAll("-c", "")

		this.wordTag = "c";
	}

	/**
	 * Parses the global ontology file declared for a prefix and stores the result
	 * in ggly_ontologies[prefix] as a two-element list: the char id -> properties map
	 * ("standard", "unicode", "value", "localname") and the Unicode -> standardized map.
	 *
	 * @param prefix a prefix registered in prefixDefsPatterns
	 * @return true if the ontology was built, false if it is not declared or its file is missing
	 */
	public def buildGGlyOntology(String prefix) {
		def prefixDef = prefixDefsPatterns.get(prefix)
		if (prefixDef == null || prefixDef[1] == null) {
			println "WARNING: no replacement pattern declared for ontology prefix: $prefix"
			return false
		}

		// keep only the file part of the replacement pattern ("path#fragment")
		String path = prefixDef[1];
		int idx = path.indexOf("#")
		if (idx > 0) path = path.substring(0, idx)

		// the path is resolved from the parent of the directory that contains xmlFile
		File ggly_ontology_file = new File(xmlFile.getParentFile(), "../"+path)
		if (!ggly_ontology_file.exists()) {
			println "WARNING: cannot find global ontology file: $ggly_ontology_file"
			return false
		}

		def global_ontologies = [:]
		def unicode_global_ontologies = [:]
		StaxParser pontologies = new StaxParser(ggly_ontology_file) {
					boolean startLocalName = false
					boolean startValue = false
					boolean startMapping = false
					String unicodeChar, standardizedChar, subtype, type;
					String id, charLocalName, charValue;
					StringBuilder c = new StringBuilder();

					void processStartElement() {
						if (localname.equals("char")) {
							// forget the previous char so that a missing property is not inherited from it
							id = null
							unicodeChar = null
							standardizedChar = null
							charLocalName = null
							charValue = null
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								if (parser.getAttributeLocalName(i).equals("id")) {
									id = parser.getAttributeValue(i)
									break;
								}
							}
							c.setLength(0);
						} else if (localname.equals("mapping")) {
							subtype = "";
							type = "";
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								if (parser.getAttributeLocalName(i).equals("subtype")) {
									subtype = parser.getAttributeValue(i)
								} else if (parser.getAttributeLocalName(i).equals("type")) {
									type = parser.getAttributeValue(i)
								}
							}
							startMapping = true;
							c.setLength(0);
						} else if (localname.equals("localName")) {
							startLocalName = true;
							c.setLength(0);
						} else if (localname.equals("value")) {
							startValue = true; // was startLocalName, which left that flag stuck to true
							c.setLength(0);
						}
					}

					void processCharacters() {
						if (startMapping || startLocalName || startValue) c.append(parser.getText());
					}

					void processEndElement() {
						if (localname.equals("char")) {
							if (id != null) {
								global_ontologies[id] = ["standard":standardizedChar, "unicode":unicodeChar, "value":charValue, "localname":charLocalName];
							}
							if (unicodeChar != null) {
								unicode_global_ontologies[unicodeChar] = standardizedChar
							}
						} else if (localname.equals("mapping")) {
							if (subtype.equals("Unicode")) {
								unicodeChar = c.toString().trim();
							} else if (type.equals("standardized")) {
								standardizedChar = c.toString().trim();
							}
							startMapping = false;
						} else if (localname.equals("localName")) {
							charLocalName = c.toString().trim()
							startLocalName = false;
						} else if (localname.equals("value")) {
							charValue = c.toString().trim()
							startValue = false;
						}
					}
				};
		pontologies.process();
		ggly_ontologies[prefix] = [global_ontologies, unicode_global_ontologies]
		return true
	}

	/**
	 * Parses the local glyph declarations and stores them in lgly_ontologies[prefix]
	 * as a glyph id -> ["change", "id", optionally "parent"] map. The text of a note
	 * element found inside a glyph is used as the id of its parent glyph.
	 *
	 * The file read is "&lt;textname&gt;-ontolinks.xml" in ontologies_links_directory; the
	 * replacement pattern of the prefix is not used to locate it.
	 *
	 * @param prefix a prefix registered in prefixDefsPatterns
	 * @return true if the ontology was built, false if it is not declared or its file is missing
	 */
	public def buildLGlyOntology(String prefix) {
		if (prefixDefsPatterns.get(prefix) == null) {
			println "WARNING: no replacement pattern declared for ontology prefix: $prefix"
			return false
		}

		File lgly_ontology_file = new File(ontologies_links_directory, textname+"-ontolinks.xml")
		if (!lgly_ontology_file.exists()) {
			println "WARNING: cannot find Local ontology file $lgly_ontology_file"
			return false
		}

		def local_ontologies = [:]
		StaxParser pontologies = new StaxParser(lgly_ontology_file) {
					boolean startNote = false
					String id, change, parent;
					StringBuilder c = new StringBuilder();
					def glyph = [:]

					void processStartElement() {
						if (localname.equals("glyph")) {
							// reset before reading the attributes: resetting inside the loop
							// erased a "change" attribute read before the "id" attribute
							id = null
							change = ""
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								if (parser.getAttributeLocalName(i).equals("id")) {
									id = parser.getAttributeValue(i)
								} else if (parser.getAttributeLocalName(i).equals("change")) {
									change = parser.getAttributeValue(i)
								}
							}
							glyph = ["change":change, "id":id] // new glyph
							parent = null
						} else if (localname.equals("note")) {
							startNote = true;
							c.setLength(0);
						}
					}

					void processCharacters() {
						if (startNote) c.append(parser.getText());
					}

					void processEndElement() {
						// the glyph opened on <glyph> is stored when that same element closes
						// (the test was on "char", an element this parser never opens)
						if (localname.equals("glyph")) {
							if (parent != null)
								glyph["parent"] = local_ontologies[parent]
							if (id != null)
								local_ontologies[id] = glyph
						} else if (localname.equals("note")) {
							parent = c.toString().trim()
							startNote = false;
						}
					}
				};
		pontologies.process();
		lgly_ontologies[prefix] = local_ontologies

		return true
	}

	/**
	 * Resets the link and ontology maps, then reads a link file: linkGrp elements set
	 * the current group, prefixDef elements (other than "txt") register a prefix and
	 * build its ontology, link elements fill the links map.
	 *
	 * @param name file name of the link file, relative to ontologies_links_directory
	 * @return null if the link file does not exist, otherwise the result of the parser's process() call
	 */
	public def loadOntologyLinkFile(String name) {
		links = [:]
		// default definitions, used when the link file does not redefine a prefix
		prefixDefsPatterns = ["ggly":[Pattern.compile("([a-z]+)"), '../../charDecl.xml#$1'],
			"lgly":[Pattern.compile("([a-z]+)"), '../ontologies/'+textname+'.xml#$1'],
			"txt":[Pattern.compile("([a-z]+)"), '../texts/'+textname+'.xml#$1']]

		lgly_ontologies = [:]
		ggly_ontologies = [:]
		unavailable_prefixes = new HashSet<String>()
		File ontology_link_file = new File(ontologies_links_directory, name)
		if (!ontology_link_file.exists()) {
			println "WARNING: no ontology link file: "+ontology_link_file
			return
		}

		StaxParser pLinks = new StaxParser(ontology_link_file) {
					void processStartElement() {
						if (localname.equals("linkGrp")) {
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								if (parser.getAttributeLocalName(i).equals("type")) {
									group = parser.getAttributeValue(i)
									break
								}
							}
						} else if (localname.equals("prefixDef")) {
							String ident, matchPattern, replacementPattern;
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								if (parser.getAttributeLocalName(i).equals("ident")) {
									ident = parser.getAttributeValue(i)
								} else if (parser.getAttributeLocalName(i).equals("matchPattern")) {
									matchPattern = parser.getAttributeValue(i)
								} else if (parser.getAttributeLocalName(i).equals("replacementPattern")) {
									replacementPattern = parser.getAttributeValue(i)
								}
							}
							if (ident == null) {
								println "WARNING: prefixDef without ident attribute in "+ontology_link_file
							} else if (!ident.equals("txt")) {
								if (matchPattern == null) {
									println "WARNING: prefixDef '"+ident+"' without matchPattern attribute in "+ontology_link_file
								} else {
									prefixDefsPatterns[ident] = [Pattern.compile(matchPattern), replacementPattern];
									OntologiesProjection.this.getOntology(ident)
								}
							}
						} else if (localname.equals("link")) {
							String target = "";

							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								if (parser.getAttributeLocalName(i).equals("target")) {
									target = parser.getAttributeValue(i)
									break
								}
							}

							// first part is the word reference, the rest are the ontology references
							def split = target.trim().split(" ", 2)
							if (split.length == 2 && split[0].length() >= 4) {
								// drop the 4 leading characters of the word reference (presumably "txt:")
								links[split[0].substring(4)] = split[1].split(" ")
							} else {
								println "WARNING: malformed link target '"+target+"' in "+ontology_link_file
							}
						}
					}
				};
		return pLinks.process();
	}

	/**
	 * Returns the ontology registered for a prefix, building it on first access.
	 * A prefix whose build failed is not retried until the next loadOntologyLinkFile call.
	 *
	 * @param prefix a prefix starting with "ggly" or "lgly"
	 * @return the ontology, or null if the prefix is of another kind or could not be built
	 */
	public def getOntology(String prefix) {
		if (prefix == null) return null

		if (prefix.startsWith("ggly")) {
			if (!ggly_ontologies.containsKey(prefix) && !unavailable_prefixes.contains(prefix)) {
				if (!buildGGlyOntology(prefix)) unavailable_prefixes.add(prefix)
			}
			return ggly_ontologies.get(prefix)
		} else if (prefix.startsWith("lgly")) {
			if (!lgly_ontologies.containsKey(prefix) && !unavailable_prefixes.contains(prefix)) {
				if (!buildLGlyOntology(prefix)) unavailable_prefixes.add(prefix)
			}
			return lgly_ontologies.get(prefix)
		}
		return null
	}

	/**
	 * Copies the start element (done by the super class), remembers the id of
	 * "surface" milestones and adds the "sign", "allograph-expert" and
	 * "allograph-auto" attributes to word elements.
	 */
	public void processStartElement() {
		super.processStartElement();
		if (localname.equals("milestone")) {
			String id = "";
			String unit= "";
			for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
				if (parser.getAttributeLocalName(i).equals("id")) {
					id = parser.getAttributeValue(i)
				} else if (parser.getAttributeLocalName(i).equals("unit")) {
					unit = parser.getAttributeValue(i)
				}
			}

			if (unit.equals("surface")) {
				milestone = id;
			}
		} else if (localname.equals(wordTag)) {
			String id = "";
			String characters = "";
			for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
				if (parser.getAttributeLocalName(i).equals("id")) {
					id = parser.getAttributeValue(i)
				} else if (parser.getAttributeLocalName(i).equals("characters")) {
					characters = parser.getAttributeValue(i)
				}
			}

			// (re)build the maps the first time a word of this text is met
			String ontology_link_file_name = textname+"-ontolinks.xml"
			if (!ontology_link_file_name.equals(current_ontology_link_file_name)) {
				current_ontology_link_file_name = ontology_link_file_name
				loadOntologyLinkFile(ontology_link_file_name);
				getOntology("ggly")
			}

			def wordLinks = links.containsKey(id) ? links[id] : new String[0]

			// AUTO ALLOGRAPH: longest "auto_" local glyph, else the characters attribute
			String allographAutomatic = pickAutomaticAllograph(wordLinks)
			if (allographAutomatic == null) allographAutomatic = characters;

			// EXPERT ALLOGRAPH: global "entity" value, else longest manual local glyph, else the automatic one
			String allographExpert = null
			for (String link : wordLinks) {
				if (!link.startsWith("ggly")) continue
				def charOnto = lookupGlobalChar(link)
				if (charOnto != null && "entity".equals(charOnto["localname"])) {
					allographExpert = charOnto["value"]
				}
			}
			if (allographExpert == null) allographExpert = pickManualLocalGlyph(wordLinks, true)
			if (allographExpert == null) allographExpert = allographAutomatic;

			// SIGN: global standardized char, else shortest manual local glyph,
			// else the standardized char of the Unicode character, else the lower-cased characters
			String sign = null
			for (String link : wordLinks) {
				if (!link.startsWith("ggly")) continue
				def charOnto = lookupGlobalChar(link)
				if (charOnto != null) sign = charOnto["standard"]
			}
			if (sign == null) sign = pickManualLocalGlyph(wordLinks, false)
			if (sign == null) {
				for (def ggly : ggly_ontologies.values()) {
					def chars = ggly[1]
					if (chars.containsKey(characters)) sign = chars[characters];
				}
			}
			if (sign == null) sign = characters.toLowerCase();

			writer.writeAttribute("sign", sign)
			writer.writeAttribute("allograph-expert", allographExpert)
			writer.writeAttribute("allograph-auto", allographAutomatic)
		}
	}

	/**
	 * Resolves a "prefix:id" global ontology reference.
	 *
	 * @return the properties map of the referenced char, or null if the reference
	 * has no ":" separator, its ontology is not available or the id is unknown
	 */
	private def lookupGlobalChar(String link) {
		int idx = link.indexOf(":")
		if (idx < 0) return null
		def onto = getOntology(link.substring(0, idx))
		if (onto == null) return null
		return onto[0][link.substring(idx + 1)]
	}

	/**
	 * @return the longest name, without its "auto_" marker, among the automatic
	 * local glyph ("lgly") references, or null if there is none
	 */
	private static String pickAutomaticAllograph(def wordLinks) {
		String best = null
		for (String link : wordLinks) {
			if (!link.startsWith("lgly")) continue
			String name = link.substring(link.indexOf(":") + 1)
			if (!name.startsWith(AUTO_MARKER)) continue
			String candidate = name.substring(AUTO_MARKER.length())
			if (best == null || best.length() < candidate.length()) best = candidate
		}
		return best
	}

	/**
	 * Selects a name among the manual (not "auto_") local glyph ("lgly") references.
	 * Names are compared by their real length: the former "+5" offset was the length of
	 * the "auto_" marker, which is never stripped from manual names.
	 *
	 * @param longest true to get the longest name, false to get the shortest one
	 * @return the selected name (the first one met wins ties), or null if there is none
	 */
	private static String pickManualLocalGlyph(def wordLinks, boolean longest) {
		String best = null
		for (String link : wordLinks) {
			if (!link.startsWith("lgly")) continue
			String name = link.substring(link.indexOf(":") + 1)
			if (name.startsWith(AUTO_MARKER)) continue
			if (best == null) {
				best = name
			} else if (longest ? name.length() > best.length() : name.length() < best.length()) {
				best = name
			}
		}
		return best
	}

	/**
	 * Manual test entry point.
	 *
	 * @param args optional: [0] corpus directory, [1] XML file to process, [2] output file;
	 * every missing argument falls back to the developer's test corpus
	 */
	public static void main(String[] args) {
		int argc = (args == null) ? 0 : args.length
		File corpusDirectory = new File(argc > 0 ? args[0] : "/home/mdecorde/TEMP/testori/qgraal_cmTest")
		File xmlFile = argc > 1 ? new File(args[1]) : new File(corpusDirectory, "txm/qgraal_cmTest-c/qgraal_cmTest-c_surf_qgraal_cmTest_lyonbm_pa77-160.xml")
		File outputFile = argc > 2 ? new File(args[2]) : new File(corpusDirectory, "txm/qgraal_cmTest-c/out.xml")

		OntologiesProjection cp = new OntologiesProjection(xmlFile, corpusDirectory);
		println cp.process(outputFile)
	}
}