Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / oriflamms / analyse / AbbreviationsAndSemanticsMacro.groovy @ 479

History | View | Annotate | Download (6.5 kB)

1
// STANDARD DECLARATIONS
2
package org.txm.macro.oriflamms.analyse
3

    
4
import groovy.transform.Field
5
import org.kohsuke.args4j.*
6
import org.txm.*
7
import org.txm.rcpapplication.swt.widget.parameters.*
8
import org.txm.searchengine.cqp.corpus.*
9
import org.txm.searchengine.cqp.corpus.query.*
10
import org.txm.stat.utils.ConsoleProgressBar
11

    
12
// A corpus must be selected in the 'Corpus' view before running this macro.
if (corpusViewSelection == null) {
    println "Il faut d'abord sélectionner un corpus dans la vue 'Corpus'. Abandon."
    return
}

if (!(corpusViewSelection instanceof Corpus)) {
    println "$corpusViewSelection n'est pas un corpus. Abandon"
    return;
}

// Macro parameters, filled in by the parameters dialog opened below.
@Field @Option(name="tsvFile", usage="TSV output file", widget="CreateFile", required=false, def="file.tsv")
def tsvFile

@Field @Option(name="entities", usage="1        2        3", metaVar="name        persName        placeName        orgName        roleName", widget="StructuralUnits", required=false, def="persName")
def entities

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;

// The dialog returns the selected structure names as one comma-separated string.
entities = entities.split(",")

// END OF PARAMETERS

println "corpora selection: "+corpusViewSelection

def corpus = corpusViewSelection
// NOTE: the names assigned below WITHOUT 'def' are script-level variables on purpose:
// processText, getInfos and processLine read them (CQI, structures, the property handles...).
CQI = Toolbox.getCqiClient();
def w = corpus.getStructuralUnit("w");

// A corpus without a 'w' structure is reported as a corpus of words, otherwise of letters.
def wordCorpus = (w == null)
if (wordCorpus) {
    println "Corpus de mots"
} else {
    println "Corpus de lettres"
}

println "Dénombrement des abbréviations en '"+(wordCorpus?"mots":"lettres")+"' pour les entités $entities"

// Every token property dereferenced later must exist, including the milestone
// identifiers (pbid, cbid, lbid) whose getQualifiedName() is called in processText.
for (String prop : ["letters-all", "letters-alignable", "characters", "abbr-n", "pbid", "cbid", "lbid"]) {
    if (corpus.getProperty(prop) == null) {
        println "Le corpus '$corpus' n'a pas de propriété de mot '$prop'. Abandon."
        return
    }
}

// Every requested entity structure must exist and carry an 'n' property.
for (String structName : entities) {
    StructuralUnit su = corpus.getStructuralUnit(structName);
    if (su == null) {
        println "Corpus '$corpus' has no structure '$structName'. Abandon."
        return
    } else if (su.getProperty("n") == null) {
        println "Le corpus '$corpus' n'a pas de structure'$structName' avec une propriété 'n'. Abandon."
        return
    }
}

// The 'n' property of each entity structure, in the order the entities were given.
structures = []
for (String structName : entities) structures << corpus.getStructuralUnit(structName).getProperty("n")

// The partition built later is keyed on text@id, so both must exist.
text_su = corpus.getStructuralUnit("text")
if (text_su == null || text_su.getProperty("id") == null) {
    println "Corpus '$corpus' has no structure 'text' with an 'id' property. Abandon."
    return
}
text_id = text_su.getProperty("id")

// Property handles shared with the helper methods.
pb_id = corpus.getProperty("pbid")
cb_id = corpus.getProperty("cbid")
lb_id = corpus.getProperty("lbid")
lettersAll = corpus.getProperty("letters-all")
lettersAlignable = corpus.getProperty("letters-alignable")
characters = corpus.getProperty("characters")
abbrn = corpus.getProperty("abbr-n")
form = corpus.getProperty("word")
79

    
80
// 'writer' is a script-level variable (no 'def') because processLine writes its rows to it.
writer = tsvFile.newWriter("UTF-8")
Partition p = null
try {
    writer.println "text_id\tpb_id\tcb_id\tlb_id\tentity\tNabbr\tNcharAbbr\ttotal\t%=NcharAbbr/Ntotal"
    // One temporary part per text (partition on text@id); each part is processed separately.
    p = corpus.createPartition("tmp", text_su,text_id)
    for (Part part : p.getParts()) {
        processText(part, part.getName())
    }
} finally {
    // Always release the output file and the temporary partition, even when a text fails.
    try {
        writer.close()
    } finally {
        if (p != null) p.delete()
    }
}
println "Result saved in "+tsvFile.getAbsolutePath()
89

    
90
/**
 * Processes one text: finds its page, column and line starts, then for every line
 * reads the token data (getInfos) and writes the per-entity counts (processLine).
 *
 * Relies on the script-level variables pb_id, cb_id and lb_id (milestone identifier
 * properties) and, through getInfos/processLine, on the other script-level handles.
 *
 * @param corpus the (sub)corpus holding the text; queried for its matches and milestones
 * @param text_id the identifier written in the first column of every output row
 */
def processText(Corpus corpus, def text_id) {

    def matches = corpus.getMatches()
    if (!matches) {
        // nothing to measure: avoid dereferencing a missing last match
        println "No match found for text '$text_id'. Skipped."
        return
    }
    // The end of the last match bounds the last line of the text.
    int end = matches[-1].getEnd()

    def CQI = Toolbox.getCqiClient();

    // Positions where a page, a column and a line start (tokens whose *start property is "0").
    def r = corpus.query(new Query("[pbstart=\"0\"]"), "ABBRORI1", false)
    def pb_pos = r.starts
    r.drop()
    r = corpus.query(new Query("[cbstart=\"0\"]"), "ABBRORI2", false)
    def cb_pos = r.starts
    r.drop()
    r = corpus.query(new Query("[lbstart=\"0\"]"), "ABBRORI3", false)
    def lb_pos = r.starts
    r.drop()

    println "N pb = "+pb_pos.length
    println "N cb = "+cb_pos.length
    println "N lb = "+lb_pos.length

    // Resolve the identifier string of each milestone: position -> value id -> string.
    def pb_idx = CQI.cpos2Id(pb_id.getQualifiedName(), pb_pos)
    def cb_idx = CQI.cpos2Id(cb_id.getQualifiedName(), cb_pos)
    def lb_idx = CQI.cpos2Id(lb_id.getQualifiedName(), lb_pos)

    def pb_idx_str = CQI.id2Str(pb_id.getQualifiedName(), pb_idx)
    def cb_idx_str = CQI.id2Str(cb_id.getQualifiedName(), cb_idx)
    def lb_idx_str = CQI.id2Str(lb_id.getQualifiedName(), lb_idx)

    ConsoleProgressBar cpb = new ConsoleProgressBar(lb_pos.size())
    int p = 0 ; // index of the current page milestone
    int c = 0 ; // index of the current column milestone

    for (int l = 0 ; l < lb_pos.size() ; l++) {
        cpb.tick();

        // get the current column milestone: the last column starting at or before this line.
        // '>=' matters: a line that starts exactly where a column starts belongs to that column.
        while (c < cb_pos.length - 1 && lb_pos[l] >= cb_pos[c+1]) {
            c++
        }
        // get the current page milestone (same rule as for columns)
        while (p < pb_pos.length - 1 && lb_pos[l] >= pb_pos[p+1]) {
            p++
        }

        // compute line length: up to the next line start, or up to the end of the text
        // NOTE(review): if Match.getEnd() is an inclusive position the last line is one
        // token short here (would need end - lb_pos[l] + 1) -- confirm against the TXM API.
        def line_length;
        if (l == lb_pos.size() -1) line_length = end - lb_pos[l]
        else line_length = lb_pos[l+1] - lb_pos[l]

        def (abbrNs, allLetters, alignableLetters, characters, words, structuresPositions) = getInfos(lb_pos[l], line_length);

        // Fall back to an empty identifier when the text has no page or column milestone.
        def page_id = (p < pb_idx_str.length) ? pb_idx_str[p] : ""
        def column_id = (c < cb_idx_str.length) ? cb_idx_str[c] : ""

        processLine(text_id, page_id, column_id, lb_idx_str[l], line_length, abbrNs, allLetters, alignableLetters, characters, words, structuresPositions)
    }
}
148

    
149
/**
 * Reads the token data of a run of consecutive corpus positions.
 *
 * Uses the script-level variables CQI, abbrn, lettersAll, lettersAlignable,
 * characters, form and structures.
 *
 * @param from first corpus position of the run
 * @param length number of consecutive positions to read
 * @return a list of six elements, in this order: the 'abbr-n' values, the 'letters-all'
 *         values, the 'letters-alignable' values, the 'characters' values, the 'word'
 *         values, and a map from each structure property full name to the cpos2Struc
 *         result for the run (processLine treats a value >= 0 as "inside the structure")
 */
def getInfos(int from ,int length) {

    // from, from+1, ..., from+length-1
    int[] cposRun = new int[length]
    length.times { cposRun[it] = from + it }

    def abbrValues = CQI.cpos2Str(abbrn.getQualifiedName(), cposRun)
    def allLetterValues = CQI.cpos2Str(lettersAll.getQualifiedName(), cposRun)
    def alignableValues = CQI.cpos2Str(lettersAlignable.getQualifiedName(), cposRun)
    // 'characters' here is the script-level property handle, not a local
    def characterValues = CQI.cpos2Str(characters.getQualifiedName(), cposRun)
    def wordValues = CQI.cpos2Str(form.getQualifiedName(), cposRun)

    // one entry per entity structure property, keyed by its full name
    def memberships = structures.collectEntries { prop ->
        [(prop.getFullName()): CQI.cpos2Struc(prop.getQualifiedName(), cposRun)]
    }

    return [abbrValues, allLetterValues, alignableValues, characterValues, wordValues, memberships]
}
166

    
167
/**
 * Writes the rows of one line to the script-level 'writer': one row per entity
 * structure (script-level 'structures'), then one "#REST" row holding what is left
 * of the line totals once every entity row has been subtracted.
 *
 * Row columns: text, page, column and line identifiers, entity label, sum of the
 * 'abbr-n' values, sum of (letters-all length - characters length), sum of the
 * letters-all lengths, and the ratio of the last two sums.
 *
 * NOTE(review): the ratio is NaN when a row counts zero letters; a token covered by
 * several of the structures is subtracted from the "#REST" row once per structure.
 *
 * @param text_id text identifier
 * @param pb_id page identifier
 * @param cb_id column identifier
 * @param lb_id line identifier
 * @param length number of tokens of the line
 * @param abbrNs per-token 'abbr-n' values, as strings holding integers
 * @param allLetters per-token 'letters-all' values
 * @param alignableLetters per-token 'letters-alignable' values (not used here)
 * @param characters per-token 'characters' values
 * @param words per-token word forms (not used here)
 * @param structuresPositions map from structure property full name to per-token
 *        membership values (>= 0 means the token is in the structure)
 */
def processLine(def text_id, def pb_id, def cb_id, def lb_id, int length,
        def abbrNs, def allLetters, def alignableLetters, def characters, def words, def structuresPositions) {

    // Per-token figures, computed once and reused for every entity structure.
    def abbrPerToken = []
    def extraPerToken = []
    def lettersPerToken = []
    int restAbbr = 0, restExtra = 0, restLetters = 0
    for (int i = 0 ; i < length ; i++) {
        int tokenAbbr = Integer.parseInt(abbrNs[i])
        int tokenLetters = allLetters[i].length()
        int tokenExtra = tokenLetters - characters[i].length()

        abbrPerToken << tokenAbbr
        extraPerToken << tokenExtra
        lettersPerToken << tokenLetters

        restAbbr += tokenAbbr
        restExtra += tokenExtra
        restLetters += tokenLetters
    }

    // Emits one TSV row for the given entity label and counts.
    def writeRow = { label, int nAbbr, int nExtra, int nLetters ->
        writer.println("$text_id\t$pb_id\t$cb_id\t$lb_id\t" + label + "\t$nAbbr\t$nExtra\t$nLetters\t" + ((float) nExtra / (float) nLetters))
    }

    for (def prop : structures) {
        def membership = structuresPositions[prop.getFullName()]
        int entityAbbr = 0, entityExtra = 0, entityLetters = 0

        for (int i = 0 ; i < length ; i++) {
            if (membership[i] < 0) continue // the position is outside the structure
            entityAbbr += abbrPerToken[i]
            entityExtra += extraPerToken[i]
            entityLetters += lettersPerToken[i]
        }

        writeRow(prop.getStructuralUnit(), entityAbbr, entityExtra, entityLetters)

        // whatever an entity row reported no longer belongs to the "#REST" row
        restAbbr -= entityAbbr
        restExtra -= entityExtra
        restLetters -= entityLetters
    }

    writeRow("#REST", restAbbr, restExtra, restLetters)
}