Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / analec / export / ExportToGlozz1_0_0Macro.groovy @ 1166

History | View | Annotate | Download (5.6 kB)

1
package org.txm.macro.analec.export
2

    
3
// @author: Bruno Oberlé
4
// v1.0.0 2017-08-28
5

    
6
/*
7
 Cette macro exporte le corpus sélectionné et ses annotations vers deux fichiers de format Glozz:
8
 - un fichier .ac contenant le corpus brut,
9
 - un fichier .aa contenant les annotations au format XML utilisé par Glozz.
10
 Le corpus sélectionné dans TXM devrait contenir une structure Analec avec au moins un type d'unité défini (e.g. MENTION, maillon, etc.). S'il n'y a pas de structure,
11
 ce n'est pas grave: le fichier est exporté, mais aucune annotation n'est créée. Cela permet d'exporter n'importe quel corpus au format Glozz.
12
 Pour exporter un texte au format Glozz *sans* les annotations qu'il contient, simplement mettre un unit_type qui n'existe pas (e.g. "foobar" au lieu de "MENTION").
13
 La macro ne produit pour l'instant pas automatiquement de modèle Glozz (fichier .aam).  Cela n'est pas un problème pour ouvrir le résultat dans Glozz ou Analec.
14
 */
15

    
16
// STANDARD DECLARATIONS
17

    
18
import org.apache.commons.lang.*
19
import org.kohsuke.args4j.*
20
import groovy.transform.*
21
import org.txm.*
22
import org.txm.rcp.swt.widget.parameters.*
23
import org.txm.analec.*
24
import org.txm.searchengine.cqp.*
25
import org.txm.searchengine.cqp.corpus.*
26
import visuAnalec.Message.*
27
import visuAnalec.donnees.*
28
import visuAnalec.elements.*
29
import visuAnalec.vue.*
30

    
31
// CHECK CORPUS
32

    
33
// The export needs a whole corpus: stop right away for any other kind of selection.
boolean selectionIsMainCorpus = corpusViewSelection instanceof MainCorpus
if (!selectionIsMainCorpus) {
        println "Corpora selection is not a Corpus"
        return
}

// MACRO PARAMETERS (filled in through the parameters dialog)

// Type of the Analec units to export.
@Field @Option(name="unit_type",usage="", widget="String", required=true, def="MENTION")
                String unit_type

// Base name of the two output files; ".ac" and ".aa" are appended by doExport.
@Field @Option(name="filename",usage="", widget="String", required=true, def="filename without extension (.ac/.aa)")
                String filename

// Nothing is exported when the dialog does not report success.
def parametersAccepted = ParametersDialog.open(this)
if (!parametersAccepted) {
        return
}

// RUN THE EXPORT

corpus = corpusViewSelection

doExport(corpus, unit_type, filename)
53

    
54
/**
 * Exports a corpus and its Analec units of one type to two Glozz files:
 * <code>filename + ".ac"</code> (the raw text, tokens separated by single spaces) and
 * <code>filename + ".aa"</code> (the XML annotations: one "paragraph" unit per paragraph,
 * then one unit per Analec unit of type <code>unit_type</code>).
 *
 * Both files are written in UTF-8, as declared in the XML header of the .aa file.
 * All positions written to the .aa file are character offsets in the raw text.
 *
 * @param corpus the corpus to export
 * @param unit_type the type of the Analec units to export; also written as the type of each exported unit
 * @param filename the path of the output files, without extension
 */
public void doExport(MainCorpus corpus, String unit_type, String filename) {

        def size = corpus.getSize() // you may also use: corpus.getTextEndLimits() (= index of last token = size-1)
        def cqi = CQPSearchEngine.getCqiClient()
        def wordName = corpus.getWordProperty().getQualifiedName()
        // note: using "lbn" seems to work better than "pn" (some imported corpora are missing this information)
        def pn = corpus.getProperty("lbn")
        def pnName = (pn == null) ? null : pn.getQualifiedName()

        // Escapes the characters that would otherwise break the XML markup
        // (element content and double-quoted attribute values).
        // NOTE(review): a null value is still written as the text "null", as before -- confirm this is wanted.
        def escapeXml = { Object value ->
                ("" + value).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace("\"", "&quot;")
        }

        // BUILD THE RAW TEXT, THE POSITIONS AND FIND THE PARAGRAPHS

        def rawText = new StringBuilder() // the corpus for the .ac file
        def positions = [] // one [start, end] pair per token: its character span in the raw text
        def pns = [] // one [start, end] pair per paragraph: its character span in the raw text
        def lastPn = -1 // the paragraph value of the previous token
        def end = 0 // end offset of the last token appended so far
        for (int i = 0; i < size; i++) {
                def form = cqi.cpos2Str(wordName, (int[])[i])[0]
                // without the "lbn" property the whole corpus is a single paragraph
                def p = (pnName == null) ? 1 : cqi.cpos2Str(pnName, (int[])[i])[0]
                def start = rawText.length()
                rawText.append(form)
                if (lastPn != p) {
                        // a new paragraph starts here: close the previous one at the end
                        // of the previous token (end has not been updated yet)
                        if (!pns.isEmpty()) {
                                pns.last()[1] = end
                        }
                        pns << [start, 0]
                }
                lastPn = p
                end = rawText.length() // must be after setting it up in pns!
                if (i != size-1) rawText.append(" ")
                positions << [start, end]
        }
        // close the last paragraph (there is none when the corpus is empty)
        if (!pns.isEmpty()) {
                pns.last()[1] = end
        }
        println pns.size() + " paragraph(s) found."

        // CORPUS ANALEC (GET THE ANNOTATIONS)

        def analecCorpus = AnalecCorpora.getCorpus(corpus)
        def struct = analecCorpus.getStructure()
        def propertyList = struct.getUniteProperties(unit_type)

        // export to file (corpus)

        def corpusFilename = filename + ".ac"
        new File(corpusFilename).write(rawText.toString(), "UTF-8")
        println("Corpus written to `"+corpusFilename+"'.");

        // export to file (annotations)

        def annotFilename = filename + ".aa"
        def unitCount = 0
        // A single writer for the whole file: it replaces any existing file, encodes
        // in UTF-8 as announced in the XML header, and is closed when the closure ends.
        new File(annotFilename).withWriter("UTF-8") { out ->
                out.write("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n<annotations>\n")
                def counter = 0 // running number used in the id and creation-date of every unit

                // export paragraphs
                for (def span : pns) {
                        out.write("<unit id=\"me_"+counter+"\">\n")
                        out.write("<metadata><author>me</author><creation-date>"+counter+"</creation-date></metadata>\n")
                        out.write("<characterisation><type>paragraph</type><featureSet /></characterisation>\n")
                        out.write("<positioning><start><singlePosition index=\""+span[0]+"\" /></start><end><singlePosition index=\""+span[1]+"\" /></end></positioning>\n")
                        out.write("</unit>\n")
                        counter++
                }

                // export units
                def units = analecCorpus.getUnites(unit_type)
                for (Unite unit : units) {
                        unitCount++
                        out.write("<unit id=\"me_"+counter+"\">\n")
                        out.write("<metadata><author>me</author><creation-date>"+counter+"</creation-date></metadata>\n")
                        out.write("<characterisation>\n")
                        out.write("<type>"+escapeXml(unit_type)+"</type>\n")
                        out.write("<featureSet>\n")
                        for (String propertyName : propertyList) {
                                out.write("<feature name=\""+escapeXml(propertyName)+"\">"+escapeXml(unit.getProp(propertyName))+"</feature>\n")
                        }
                        out.write("</featureSet>\n")
                        out.write("</characterisation>\n")
                        // a unit goes from the first character of its first token
                        // to the last character of its last token
                        def unitStart = positions[unit.getDeb()][0]
                        def unitEnd = positions[unit.getFin()][1]
                        out.write("<positioning><start><singlePosition index=\""+unitStart+"\" /></start><end><singlePosition index=\""+unitEnd+"\" /></end></positioning>\n")
                        out.write("</unit>\n")
                        counter++
                }
                out.write("</annotations>\n")
        }

        println unitCount + " unit(s) found."

        println("Annotations written to `"+annotFilename+"'.");
}