Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / analec / export / ExportToGlozz1_1_0Macro.groovy @ 1166

History | View | Annotate | Download (6.9 kB)

1
package org.txm.macro.analec.export
2

    
3
// @author: Bruno Oberle
4
// v1.1.0 2017-10-25:
5
//  - using the "p" structural unit if available to get paragraphs
6
//  - no space before comma, parenthesis, full stop, hyphen, etc.
7
//  - no space after hyphen, parenthesis, apostrophe, etc.
8
// v1.0.0 2017-08-28
9

    
10
/*
11
 Cette macro exporte le corpus sélectionné et ses annotations vers deux fichiers de format Glozz:
12
 - un fichier .ac contenant le corpus brut,
13
 - un fichier .aa contenant les annotations au format XML utilisé par Glozz.
14
 Le corpus sélectionné dans TXM devrait contenir une structure Analec avec au moins un type d'unité défini (e.g. MENTION, maillon, etc.). S'il n'y a pas de structure,
15
 ce n'est pas grave: le fichier est exporté, mais aucune annotation n'est créée. Cela permet d'exporter n'importe quel corpus au format Glozz.
16
 Pour exporter un texte au format Glozz *sans* les annotations qu'il contient, simplement mettre un unit_type qui n'existe pas (e.g. "foobar" au lieu de "MENTION").
17
 La macro ne produit pour l'instant pas automatiquement de modèle Glozz (fichier .aam).  Cela n'est pas un problème pour ouvrir le résultat dans Glozz ou Analec.
18
 */
19

    
20
// STANDARD DECLARATIONS
21

    
22
import org.apache.commons.lang.*
23
import org.kohsuke.args4j.*
24
import groovy.transform.*
25
import org.txm.*
26
import org.txm.rcp.swt.widget.parameters.*
27
import org.txm.analec.*
28
import org.txm.searchengine.cqp.*
29
import org.txm.searchengine.cqp.corpus.*
30
import visuAnalec.Message.*
31
import visuAnalec.donnees.*
32
import visuAnalec.elements.*
33
import visuAnalec.vue.*
34

    
35

    
36
// GLOBAL VARIABLES
// (stored in the script binding on purpose: doExport() reads CQI, parUnit and pn from it)

corpus = corpusViewSelection
CQI = CQPSearchEngine.getCqiClient()

// CHECK CORPUS

if (!(corpusViewSelection instanceof MainCorpus)) {
        println "Corpora selection is not a Corpus"
        return;
}

// PARAGRAPH AS STRUCTURAL UNIT?

/* note: some texts have a structural unit called "p", which represents a paragraph.
   If this structural unit is present, we use it.  Otherwise we ask the user for a
   property name (the best choice is "pn" (paragraph number) or "lbn" (line number),
   depending on what is available in the corpus).  If the user gives no property name,
   we only define one paragraph for the whole text. */

parUnit = corpus.getStructuralUnit("p")

// BEGINNING OF PARAMETERS

@Field @Option(name="unit_type",usage="", widget="String", required=true, def="MENTION")
                String unit_type

@Field @Option(name="filename",usage="", widget="String", required=true, def="filename without extension (.ac/.aa)")
                String filename

// NOTE(review): @Field moves the declaration to the script class level, so this `if`
// presumably does not make the option conditional -- confirm against ParametersDialog.
if (!parUnit) {
        @Field @Option(name="par_prop",usage="the property used to compute paragraphs", widget="String", required=false, def="lbn")
                String par_prop
}

if (!ParametersDialog.open(this)) return;

// what paragraph unit to use?

pn = null
// par_prop is optional: Groovy truth treats both null and "" as "no property given",
// which avoids a NullPointerException when the parameter was never set.
if (!parUnit && par_prop) {
        pn = corpus.getProperty(par_prop)
        if (!pn) {
                println "Error: I can't find the property `$par_prop'."
                return
        }
}

/*********************************/

doExport(corpus, unit_type, filename)
87

    
88
/**
 * Exports the corpus text and its Analec annotations as a pair of Glozz files:
 * {@code filename + ".ac"} (raw text) and {@code filename + ".aa"} (XML annotations).
 *
 * Reads the script-level variables {@code CQI}, {@code parUnit} and {@code pn}, which
 * must have been set by the script body before this method is called.
 *
 * Both files are written as UTF-8 (the .aa header declares that encoding), and all
 * text inserted into the XML is escaped.
 *
 * @param corpus    the corpus whose tokens and annotations are exported
 * @param unit_type the Analec unit type whose units are exported
 * @param filename  output path without extension
 */
public void doExport(MainCorpus corpus, String unit_type, String filename) {

        int size = corpus.getSize() // you may also use: corpus.getTextEndLimits() (= index of last token = size-1)
        def word = corpus.getWordProperty()

        // loop-invariant attribute names, looked up once
        def wordName = word.getQualifiedName()
        def parUnitName = parUnit ? parUnit.getQualifiedName() : null
        def parPropName = (!parUnit && pn != null) ? pn.getQualifiedName() : null

        // Typographic apostrophe, written as an escape so it cannot be lost in transcoding.
        // NOTE(review): the previous code tested against an empty literal here, which made
        // endsWith() always true and suppressed every space; U+2019 is assumed to be the
        // character that was lost -- confirm.
        final String curlyApostrophe = "\u2019"
        def noSpaceBefore = [".", ",", "'", curlyApostrophe, "-", ")", "]"]
        def noSpaceAfter = ["[", "("]

        // BUILD THE RAW TEXT, THE POSITIONS AND FIND THE PARAGRAPHS

        def rawText = new StringBuilder() // the corpus for the .ac file
        def positions = [] // each element is an array [start, end] indicating the position in the rawText
        def pars = [] // each element is an array [start, end] representing the start and end of the paragraph in the rawText
        def lastPn = null // paragraph id of the previous token
        int prevEnd = 0 // end offset of the previous token
        boolean insertSpace = true
        for (int i = 0; i < size; i++) {
                int[] cpos = [i] as int[]
                String f = CQI.cpos2Str(wordName, cpos)[0]
                def p
                if (parUnit) {
                        p = CQI.cpos2Struc(parUnitName, cpos)[0]
                } else if (pn == null) {
                        p = 1
                } else {
                        p = CQI.cpos2Str(parPropName, cpos)[0]
                }

                // no space before closing punctuation, an empty token, or a token starting with a hyphen
                boolean gluedToPrevious = f.isEmpty() || noSpaceBefore.contains(f) || f.startsWith("-")
                if (i > 0 && insertSpace && !gluedToPrevious) {
                        rawText.append(" ")
                }
                // no space after an opening bracket or a token ending with a hyphen/apostrophe
                insertSpace = !(noSpaceAfter.contains(f) || f.endsWith("-")
                                || f.endsWith("'") || f.endsWith(curlyApostrophe))

                int start = rawText.length()
                rawText.append(f)
                int end = rawText.length()

                // The first token always opens a paragraph, so `pars` is never empty for a
                // non-empty corpus even if the first paragraph id equals a sentinel such as -1.
                if (i == 0 || lastPn != p) {
                        if (pars) {
                                pars.last()[1] = prevEnd // close the previous paragraph
                        }
                        pars << [start, 0]
                }
                lastPn = p
                prevEnd = end
                positions << [start, end]
        }
        if (pars) { // empty corpus: nothing to close
                pars.last()[1] = prevEnd
        }
        println pars.size() + " paragraph(s) found."

        // CORPUS ANALEC (GET THE ANNOTATIONS)

        // note that unit_type has been defined with an option of the dialog at the beginning
        def analecCorpus = AnalecCorpora.getCorpus(corpus);

        // list of properties

        def struct = analecCorpus.getStructure();
        def propertyList = struct.getUniteProperties(unit_type);

        // export to file (corpus)

        def corpusFilename = filename + ".ac";
        def corpusFile = new File(corpusFilename);
        corpusFile.write(rawText.toString(), "UTF-8")
        println("Corpus written to `"+corpusFilename+"'.");

        // export to file (annotations)

        def annotFilename = filename + ".aa";
        def annotFile = new File(annotFilename)
        def counter = 0
        def unitCount = 0
        def escapedType = escapeForXml(unit_type)

        // a single UTF-8 writer: truncates the file and avoids reopening it for every line
        annotFile.withWriter("UTF-8") { out ->
                out << "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n<annotations>\n"

                // export paragraphs
                for (def par : pars) {
                        def start = par[0]
                        def end = par[1]
                        out << "<unit id=\"me_"+counter+"\">\n";
                        out << "<metadata><author>me</author><creation-date>"+counter+"</creation-date></metadata>\n";
                        out << "<characterisation><type>paragraph</type><featureSet /></characterisation>\n";
                        out << "<positioning><start><singlePosition index=\""+start+"\" /></start><end><singlePosition index=\""+end+"\" /></end></positioning>\n";
                        out << "</unit>\n";
                        counter++;
                }

                // export units
                def units = analecCorpus.getUnites(unit_type);
                for (Unite unit : units) {
                        unitCount++;
                        out << "<unit id=\"me_"+counter+"\">\n";
                        out << "<metadata><author>me</author><creation-date>"+counter+"</creation-date></metadata>\n";
                        out << "<characterisation>\n";
                        out << "<type>"+escapedType+"</type>\n";
                        out << "<featureSet>\n";
                        for (String propertyName : propertyList) {
                                out << "<feature name=\""+escapeForXml(propertyName)+"\">"+escapeForXml(unit.getProp(propertyName))+"</feature>\n";
                        }
                        out << "</featureSet>\n";
                        out << "</characterisation>\n";
                        // unit boundaries are token indices; map them to character offsets
                        def start = positions[unit.getDeb()][0]
                        def end = positions[unit.getFin()][1]
                        out << "<positioning><start><singlePosition index=\""+start+"\" /></start><end><singlePosition index=\""+end+"\" /></end></positioning>\n";
                        out << "</unit>\n";
                        counter++;
                }
                out << "</annotations>\n";
        }

        println unitCount + " unit(s) found."

        println("Annotations written to `"+annotFilename+"'.");
}

/** Escapes the XML special characters of the string form of {@code value} (null becomes "null"). */
private String escapeForXml(Object value) {
        String s = String.valueOf(value)
        s = s.replace("&", "&amp;") // must come first
        s = s.replace("<", "&lt;")
        s = s.replace(">", "&gt;")
        s = s.replace("\"", "&quot;")
        return s
}