Statistics
| Revision:

root / tmp / org.txm.oriflamms.rcp / groovy / org / txm / macro / oriflamms / analyse / AbbreviationsAndLinesMacro.groovy @ 1094

History | View | Annotate | Download (5.8 kB)

1
// STANDARD DECLARATIONS
2
package org.txm.macro.oriflamms.analyse
3

    
4
import groovy.transform.Field
5

    
6
import org.kohsuke.args4j.*
7
import org.txm.*
8
import org.txm.rcp.swt.widget.parameters.*
9
import org.txm.searchengine.cqp.corpus.*
10
import org.txm.searchengine.cqp.corpus.query.*
11
import org.txm.stat.utils.ConsoleProgressBar
12

    
13
// Guard clauses: the macro works on the element selected in the 'Corpus' view,
// which must exist and must be a Corpus. Otherwise a message is printed and
// the script stops.
if (null == corpusViewSelection) {
    println "Il faut d'abord sélectionner un corpus dans la vue 'Corpus'. Abandon."
    return
} else if (!(corpusViewSelection instanceof Corpus)) {
    println "$corpusViewSelection n'est pas un corpus. Abandon"
    return
}
22

    
23
// Macro parameters. They are declared as script fields so that the methods
// defined further down in this script can read them.
@Field @Option(name="tsvFile", usage="TSV output file", widget="CreateFile", required=false, def="file.tsv")
def tsvFile

@Field @Option(name="dist_start", usage="Distance depuis le début de ligne", widget="Integer", required=false, def="0")
def dist_start

@Field @Option(name="dist_end", usage="Distance depuis la fin de ligne", widget="Integer", required=false, def="1")
def dist_end

// Open the parameters input dialog box; stop when it does not report success.
if (!ParametersDialog.open(this)) {
    return
}
// END OF PARAMETERS

// At least one of the two distances has to be strictly positive.
if (!(dist_start > 0 || dist_end > 0)) {
    println "Au moins, une des deux distances doit être supérieure à 0"
    return
}

// A negative value that got past the check above (the other distance being
// positive) is replaced by its absolute value.
dist_start = Math.abs(dist_start)
dist_end = Math.abs(dist_end)
42

    
43
def corpus = corpusViewSelection

// The corpus is reported as a word corpus when it has no "w" structural unit,
// and as a letter corpus otherwise.
def wordCorpus = corpus.getStructuralUnit("w") == null
println(wordCorpus ? "Corpus de mots" : "Corpus de lettres")

println "Dénombrement des abbréviations de $corpus en '"+(wordCorpus?"mots":"lettres")+"' pour des distances au début de $dist_start et à la fin $dist_end de la ligne"

// Every property listed here must exist in the corpus; the first missing one
// (in list order) is reported and the script stops.
def requiredProperties = [
    "pbid", "pbstart", "pbend",
    "cbid", "cbstart", "cbend",
    "lbid", "lbstart", "lbend",
    "letters-all", "letters-alignable", "characters", "abbr-n"
]
String prop = requiredProperties.find { corpus.getProperty(it) == null }
if (prop != null) {
    println "Le corpus '$corpus' n'a pas de propriété de mot '$prop'. Abandon."
    return
}
63

    
64
// These variables are assigned without 'def' on purpose: they are read by the
// methods defined below (processText, processLine, getInfos), which cannot see
// script-local variables.
text_su = corpus.getStructuralUnit("text")
text_id = text_su.getProperty("id")
pb_id = corpus.getProperty("pbid")
cb_id = corpus.getProperty("cbid")
lb_id = corpus.getProperty("lbid")
lettersAll = corpus.getProperty("letters-all")
lettersAlignable = corpus.getProperty("letters-alignable")
characters = corpus.getProperty("characters")
abbrn = corpus.getProperty("abbr-n")
form = corpus.getProperty("word")

// One TSV row is written per line part; the corpus is split into one part per
// text (partition on the "id" property of the "text" structure) and each part
// is processed in turn.
writer = tsvFile.newWriter("UTF-8")
Partition p = null
try {
    writer.println "text_id\tpb_id\tcb_id\tlb_id\tline part\tNabbr\tNcharAbbr\ttotal\t%=NcharAbbr/Ntotal"
    p = corpus.createPartition("tmp", text_su,text_id)
    for (Part part : p.getParts()) {
        processText(part, part.getName())
    }
} finally {
    // Always release the output file and the temporary partition, even when
    // the processing of a text fails.
    try {
        writer.close()
    } finally {
        if (p != null) {
            p.delete()
        }
    }
}
println "Result saved in "+tsvFile.getAbsolutePath()
84

    
85
/**
 * Writes the abbreviation statistics of every line of one text.
 *
 * The positions of the tokens whose "pbstart", "cbstart" and "lbstart"
 * properties equal "0" are queried (presumably the first token of each page,
 * column and line - confirm against the corpus import). For each line start,
 * the properties of the tokens of the line are read with getInfos() and the
 * rows are written with processLine().
 *
 * @param corpus the (sub)corpus to process; the caller passes one part of the
 *               partition built on the text ids
 * @param text_id value written in the first column of each row; the caller
 *                passes the part name
 */
def processText(Corpus corpus, def text_id) {

    def matches = corpus.getMatches()
    if (!matches) {
        // No match at all: there is no token to read.
        return
    }
    // NOTE(review): the last line length is computed below as
    // 'end - lb_pos[l]'; if Match.getEnd() is an inclusive position, the last
    // token of the text is left out - confirm.
    int end = matches[-1].getEnd()

    def CQI = CQPSearchEngine.getCqiClient()

    def r = corpus.query(new Query("[pbstart=\"0\"]"), "ABBRORI1", false)
    def pb_pos = r.starts
    r.drop()
    r = corpus.query(new Query("[cbstart=\"0\"]"), "ABBRORI2", false)
    def cb_pos = r.starts
    r.drop()
    r = corpus.query(new Query("[lbstart=\"0\"]"), "ABBRORI3", false)
    def lb_pos = r.starts
    r.drop()

    println "N pb = "+pb_pos.length
    println "N cb = "+cb_pos.length
    println "N lb = "+lb_pos.length

    // Read the page, column and line identifiers at the queried positions.
    def pb_idx = CQI.cpos2Id(pb_id.getQualifiedName(), pb_pos)
    def cb_idx = CQI.cpos2Id(cb_id.getQualifiedName(), cb_pos)
    def lb_idx = CQI.cpos2Id(lb_id.getQualifiedName(), lb_pos)

    def pb_idx_str = CQI.id2Str(pb_id.getQualifiedName(), pb_idx)
    def cb_idx_str = CQI.id2Str(cb_id.getQualifiedName(), cb_idx)
    def lb_idx_str = CQI.id2Str(lb_id.getQualifiedName(), lb_idx)

    ConsoleProgressBar cpb = new ConsoleProgressBar(lb_pos.length)
    int p = 0 // index of the current page in pb_pos
    int c = 0 // index of the current column in cb_pos

    for (int l = 0 ; l < lb_pos.length ; l++) {
        cpb.tick()

        // Move to the last column/page that starts at or before the first
        // token of the line. The comparison must be '>=': a line whose first
        // token is also the first token of a column (or page) belongs to that
        // column (or page), not to the previous one.
        while (c < cb_pos.length - 1 && lb_pos[l] >= cb_pos[c+1]) {
            c++
        }
        while (p < pb_pos.length - 1 && lb_pos[l] >= pb_pos[p+1]) {
            p++
        }

        // A line runs up to the start of the next line, or up to 'end' for
        // the last one.
        int line_length = (l == lb_pos.length - 1) ? end - lb_pos[l] : lb_pos[l+1] - lb_pos[l]

        def (abbrNs, allLetters, alignableLetters, characters, words) = getInfos(lb_pos[l], line_length)

        // An empty identifier is written when the text has no page or no
        // column start at all, instead of failing on an empty array.
        def pageId = pb_idx_str.length > 0 ? pb_idx_str[p] : ""
        def columnId = cb_idx_str.length > 0 ? cb_idx_str[c] : ""

        processLine(text_id, pageId, columnId, lb_idx_str[l], line_length,
                abbrNs, allLetters, alignableLetters, characters, words)
    }
}
141

    
142
/**
 * Writes the TSV rows of one line, split in parts labelled 's', 'm' and 'e'.
 *
 * Counters are accumulated token by token. When the token index reaches
 * dist_start an 's' row is written and the counters are reset; when it
 * reaches the index computed from dist_end an 'm' row is written and the
 * counters are reset; what remains after the last token is written as an
 * 'e' row.
 *
 * NOTE(review): a row is written even when nothing has been counted yet
 * (e.g. the 's' row when dist_start is 0); its percentage is then a 0/0
 * division. When dist_start is not reached within the line no 's' row is
 * written. Confirm this is the intended output.
 *
 * @param text_id first column of the rows
 * @param pb_id second column of the rows
 * @param cb_id third column of the rows
 * @param lb_id fourth column of the rows
 * @param length number of tokens of the line
 * @param abbrNs per token, a string parsed as an integer and summed into the Nabbr column
 * @param allLetters per token, a string whose length is summed into the total column
 * @param alignableLetters not used by this method
 * @param characters per token, a string whose length is subtracted from the
 *                   allLetters length to build the NcharAbbr column
 * @param words not used by this method
 */
def processLine(def text_id, def pb_id, def cb_id, def lb_id, int length,
        def abbrNs, def allLetters, def alignableLetters, def characters, def words) {

    int abbrCount = 0
    int abbrChars = 0
    int totalChars = 0

    // Token index at which the 'm' row is written.
    int middleRowIndex = length - dist_end
    if (middleRowIndex < 0) {
        // Out of the loop range: no 'm' row for this line.
        middleRowIndex = length + 1
    }
    if (middleRowIndex < dist_start) {
        middleRowIndex = dist_start
    }

    // Writes one row with the current counters for the given line part.
    def writeRow = { String part ->
        writer.println "$text_id\t$pb_id\t$cb_id\t$lb_id\t$part\t$abbrCount\t$abbrChars\t$totalChars\t"+(100.0*(float)abbrChars/(float)totalChars)
    }
    def resetCounters = {
        abbrCount = 0
        abbrChars = 0
        totalChars = 0
    }

    for (int i = 0 ; i < length ; i++) {
        if (i == dist_start) {
            writeRow("s")
            resetCounters()
        }
        if (i == middleRowIndex) {
            writeRow("m")
            resetCounters()
        }

        def letters = allLetters[i]
        abbrCount += Integer.parseInt(abbrNs[i])
        abbrChars += letters.length() - characters[i].length()
        totalChars += letters.length()
    }

    writeRow("e")
}
174

    
175
/**
 * Reads five properties for 'length' consecutive corpus positions.
 *
 * @param from first corpus position to read
 * @param length number of consecutive positions to read
 * @return a list of five results, in this order: the values of the script
 *         variables abbrn, lettersAll, lettersAlignable, characters and form
 *         (word properties set by the main script) at the requested positions
 */
def getInfos(int from ,int length) {

    // Positions from, from+1, ..., from+length-1.
    int[] positions = new int[length]
    for (int offset = 0 ; offset < length ; offset++) {
        positions[offset] = from + offset
    }

    def CQI = CQPSearchEngine.getCqiClient()

    // Reads the values of one property at the positions built above.
    def valuesOf = { property ->
        CQI.cpos2Str(property.getQualifiedName(), positions)
    }

    return [
        valuesOf(abbrn),
        valuesOf(lettersAll),
        valuesOf(lettersAlignable),
        valuesOf(characters),
        valuesOf(form)
    ]
}