Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / analec / export / ExportToGlozzMacro.groovy @ 1166

History | View | Annotate | Download (5.6 kB)

1
package org.txm.macro.analec.export
2

    
3
// @author: Bruno Oberlé
4
// v1.0.0 2017-08-28
5

    
6
/*
7
 Cette macro exporte le corpus sélectionné et ses annotations vers deux fichiers de format Glozz:
8
 - un fichier .ac contenant le corpus brut,
9
 - un fichier .aa contenant les annotations au format XML utilisé par Glozz.
10
 Le corpus sélectionné dans TXM devrait contenir une structure Analec avec au moins un type d'unité défini (e.g. MENTION, maillon, etc.). S'il n'y a pas de structure,
11
 ce n'est pas grave: le fichier est exporté, mais aucune annotation n'est créée. Cela permet d'exporter n'importe quel corpus au format Glozz.
12
 Pour exporter un texte au format Glozz *sans* les annotations qu'il contient, simplement mettre un unit_type qui n'existe pas (e.g. "foobar" au lieu de "MENTION").
13
 La macro ne produit pour l'instant pas automatiquement de modèle Glozz (fichier .aam).  Cela n'est pas un problème pour ouvrir le résultat dans Glozz ou Analec.
14
 */
15

    
16
// STANDARD DECLARATIONS
17

    
18
import org.apache.commons.lang.*
19
import org.kohsuke.args4j.*
20
import groovy.transform.*
21
import org.txm.*
22
import org.txm.rcp.swt.widget.parameters.*
23
import org.txm.analec.*
24
import org.txm.searchengine.cqp.*
25
import org.txm.searchengine.cqp.corpus.*
26
import visuAnalec.Message.*
27
import visuAnalec.donnees.*
28
import visuAnalec.elements.*
29
import visuAnalec.vue.*
30

    
31
// CHECK CORPUS
32

    
33
if (!(corpusViewSelection instanceof MainCorpus)) {
34
        println "Corpora selection is not a Corpus"
35
        return;
36
}
37

    
38
// BEGINNING OF PARAMETERS
39

    
40
@Field @Option(name="unit_type",usage="", widget="String", required=true, def="MENTION")
41
                String unit_type
42

    
43
@Field @Option(name="filename",usage="", widget="String", required=true, def="filename without extension (.ac/.aa)")
44
                String filename
45

    
46
if (!ParametersDialog.open(this)) return;
47

    
48
// VARIABLES
49

    
50
corpus = corpusViewSelection
51

    
52
doExport(corpus, unit_type, filename)
53

    
54
public void doExport(MainCorpus corpus, String unit_type, String filename) {
55

    
56
        size = corpus.getSize() // you may also use: corpus.getTextEndLimits() (= index of last token = size-1)
57
        CQI = CQPSearchEngine.getCqiClient()
58
        word = corpus.getWordProperty()
59
        // note: using "lbn" seems to work better than "pn" (some imported corpora are missing this information)
60
        //pn = corpus.getProperty("pn")
61
        //if (pn == null) pn = corpus.getProperty("div")
62
        pn = corpus.getProperty("lbn")
63

    
64
        // BUILD THE RAW TEXT, THE POSITIONS AND FIND THE PARAGRAPHS
65

    
66
        rawText = "" // the corpus for the .ac file
67
        positions = [] // each element is an array [start, end] indicating the position in the rawText
68
        pnCount = 0 // the par counter, used for indexing the pns array
69
        lastPn = -1 // the last paragraph number
70
        pns = [] // each element is an array [start, end] representing the start and end of the paragraph in the rawText
71
        for (def i=0; i<size; i++) {
72
                f = CQI.cpos2Str(word.getQualifiedName(), (int[])[i])[0]
73
                if (pn == null) {
74
                        p = 1
75
                } else {
76
                        p = CQI.cpos2Str(pn.getQualifiedName(), (int[])[i])[0]
77
                }
78
                start = rawText.length()
79
                rawText += f
80
                if (lastPn != p) {
81
                        pnCount++;
82
                        if (pnCount > 1) {
83
                                pns[pnCount-2][1] = end
84
                        }
85
                        pns[pnCount-1] = [start, 0]
86
                }
87
                lastPn = p
88
                end = rawText.length() // must be after setting it up in pns!
89
                if (i != size-1) rawText += " "
90
                positions[i] = [start, end]
91
        }
92
        pns[pnCount-1][1] = end
93
        println pnCount + " paragraph(s) found."
94

    
95

    
96

    
97
        // CORPUS ANALEC (GET THE ANNOTATIONS)
98

    
99
        // note that unit_type has been defined with an option of the dialog at the beginning
100
        def analecCorpus = AnalecCorpora.getCorpus(corpus);
101

    
102
        // list of properties
103

    
104
        struct = analecCorpus.getStructure();
105
        propertyList = struct.getUniteProperties(unit_type);
106

    
107
        // export to file (corpus)
108

    
109
        corpusFilename = filename + ".ac";
110
        def corpusFile = new File(corpusFilename);
111
        corpusFile.write(rawText)
112
        println("Corpus written to `"+corpusFilename+"'.");
113

    
114
        // export to file (annotations)
115

    
116
        annotFilename = filename + ".aa";
117
        def annotFile = new File(annotFilename)
118
        annotFile.write("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n<annotations>\n") // erase
119
        def counter = 0
120

    
121
        // export paragraphs
122
        for (def i=0; i<pns.size(); i++) {
123
                def start = pns[i][0]
124
                def end = pns[i][1]
125
                annotFile << "<unit id=\"me_"+counter+"\">\n";
126
                annotFile << "<metadata><author>me</author><creation-date>"+counter+"</creation-date></metadata>\n";
127
                annotFile << "<characterisation><type>paragraph</type><featureSet /></characterisation>\n";
128
                annotFile << "<positioning><start><singlePosition index=\""+start+"\" /></start><end><singlePosition index=\""+end+"\" /></end></positioning>\n";
129
                annotFile << "</unit>\n";
130
                counter++;
131
        }
132

    
133
        // export units
134
        def units = analecCorpus.getUnites(unit_type);
135
        //units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() };
136
        def unitCount = 0
137
        for (Unite unit : units) {
138
                unitCount++;
139
                annotFile << "<unit id=\"me_"+counter+"\">\n";
140
                annotFile << "<metadata><author>me</author><creation-date>"+counter+"</creation-date></metadata>\n";
141
                annotFile << "<characterisation>\n";
142
                annotFile << "<type>"+unit_type+"</type>\n";
143
                annotFile << "<featureSet>\n";
144
                for (String propertyName : propertyList) {
145
                        annotFile << "<feature name=\""+propertyName+"\">"+unit.getProp(propertyName)+"</feature>\n";
146
                }
147
                annotFile << "</featureSet>\n";
148
                annotFile << "</characterisation>\n";
149
                start = positions[unit.getDeb()][0]
150
                end = positions[unit.getFin()][1]
151
                annotFile << "<positioning><start><singlePosition index=\""+start+"\" /></start><end><singlePosition index=\""+end+"\" /></end></positioning>\n";
152
                annotFile << "</unit>\n";
153
                counter++;
154
        }
155
        annotFile << "</annotations>\n";
156

    
157
        println unitCount + " unit(s) found."
158

    
159
        println("Annotations written to `"+annotFilename+"'.");
160
}