Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / oriflamms / analyse / AbbreviationsAndLinesMacro.groovy @ 499

History | View | Annotate | Download (5.8 kB)

1 479 mdecorde
// STANDARD DECLARATIONS
2 479 mdecorde
package org.txm.macro.oriflamms.analyse
3 479 mdecorde
4 479 mdecorde
import groovy.transform.Field
5 479 mdecorde
6 479 mdecorde
import org.kohsuke.args4j.*
7 479 mdecorde
import org.txm.*
8 499 mdecorde
import org.txm.rcp.swt.widget.parameters.*
9 479 mdecorde
import org.txm.searchengine.cqp.corpus.*
10 479 mdecorde
import org.txm.searchengine.cqp.corpus.query.*
11 479 mdecorde
import org.txm.stat.utils.ConsoleProgressBar
12 479 mdecorde
13 479 mdecorde
// The macro works on the corpus currently selected in the 'Corpus' view
if (corpusViewSelection == null) {
        println "Il faut d'abord sélectionner un corpus dans la vue 'Corpus'. Abandon."
        return
}

if (!(corpusViewSelection instanceof Corpus)) {
        println "$corpusViewSelection n'est pas un corpus. Abandon"
        return;
}

// MACRO PARAMETERS
// @Field makes the parameters visible from the methods declared below

// TSV file receiving one row per line part
@Field @Option(name="tsvFile", usage="TSV output file", widget="CreateFile", required=false, def="file.tsv")
def tsvFile

// number of positions counted from the beginning of each line
@Field @Option(name="dist_start", usage="Distance depuis le début de ligne", widget="Integer", required=false, def="0")
def dist_start

// number of positions counted from the end of each line
@Field @Option(name="dist_end", usage="Distance depuis la fin de ligne", widget="Integer", required=false, def="1")
def dist_end

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;
// END OF PARAMETERS

// at least one of the two distances must be strictly positive
if (dist_start <= 0 && dist_end <= 0) {
        println "Au moins, une des deux distances doit être supérieure à 0"
        return
}
// a negative distance is read as its absolute value
dist_start = Math.abs(dist_start)
dist_end = Math.abs(dist_end)

def corpus = corpusViewSelection
def w = corpus.getStructuralUnit("w");

// a corpus without "w" structure is reported as a corpus of words, otherwise as a corpus of letters
def wordCorpus = (w == null)
if (wordCorpus) {
        println "Corpus de mots"
} else {
        println "Corpus de lettres"
}

println "Dénombrement des abbréviations de $corpus en '"+(wordCorpus?"mots":"lettres")+"' pour des distances au début de $dist_start et à la fin $dist_end de la ligne"

// every property read by the macro must exist in the corpus
for (String prop : ["pbid", "pbstart", "pbend","cbid",
        "cbstart","cbend","lbid", "lbstart", "lbend", "letters-all",
        "letters-alignable", "characters","abbr-n" ]) {
        if (corpus.getProperty(prop) == null) {
                println "Le corpus '$corpus' n'a pas de propriété de mot '$prop'. Abandon."
                return
        }
}

// script-level variables (no 'def') so that processText(), processLine() and getInfos() can read them
text_su = corpus.getStructuralUnit("text")
text_id = text_su.getProperty("id")
pb_id = corpus.getProperty("pbid")
cb_id = corpus.getProperty("cbid")
lb_id = corpus.getProperty("lbid")
lettersAll = corpus.getProperty("letters-all")
lettersAlignable = corpus.getProperty("letters-alignable")
characters = corpus.getProperty("characters")
abbrn = corpus.getProperty("abbr-n")
form = corpus.getProperty("word")

writer = tsvFile.newWriter("UTF-8")
Partition p = null
try {
        writer.println "text_id\tpb_id\tcb_id\tlb_id\tline part\tNabbr\tNcharAbbr\ttotal\t%=NcharAbbr/Ntotal"

        // one part per text: each text is processed separately
        p = corpus.createPartition("tmp", text_su,text_id)
        for (Part part : p.getParts()) {
                processText(part, part.getName())
        }
} finally {
        // always release the output file and the temporary partition, even when a text fails
        try {
                writer.close()
        } finally {
                if (p != null) p.delete()
        }
}
println "Result saved in "+tsvFile.getAbsolutePath()
84 479 mdecorde
85 479 mdecorde
/**
 * Processes one text: finds the positions where pages, columns and lines start,
 * then, for every line, fetches its token properties with getInfos() and writes
 * its counts with processLine().
 *
 * Reads the script-level variables pb_id, cb_id and lb_id.
 *
 * @param corpus the (sub-)corpus holding the text
 * @param text_id the text identifier written in the first TSV column
 */
def processText(Corpus corpus, def text_id) {

        def matches = corpus.getMatches()
        if (!matches) return // no match at all: nothing to count for this text

        // NOTE(review): the last line is measured as (end - line start); if getEnd() is an
        // inclusive position, the last token of the text is left out -- to be confirmed.
        int end = matches[-1].getEnd()

        def CQI = Toolbox.getCqiClient();

        // start positions of the tokens whose given property equals "0";
        // the temporary query result is dropped as soon as the positions are read
        def startPositions = { String property, String queryName ->
                def result = corpus.query(new Query("[" + property + "=\"0\"]"), queryName, false)
                def starts = result.starts
                result.drop()
                return starts
        }
        def pb_pos = startPositions("pbstart", "ABBRORI1")
        def cb_pos = startPositions("cbstart", "ABBRORI2")
        def lb_pos = startPositions("lbstart", "ABBRORI3")

        println "N pb = "+pb_pos.length
        println "N cb = "+cb_pos.length
        println "N lb = "+lb_pos.length

        // identifiers (as strings) of the page, column and line starting at each position
        def pb_idx = CQI.cpos2Id(pb_id.getQualifiedName(), pb_pos)
        def cb_idx = CQI.cpos2Id(cb_id.getQualifiedName(), cb_pos)
        def lb_idx = CQI.cpos2Id(lb_id.getQualifiedName(), lb_pos)

        def pb_idx_str = CQI.id2Str(pb_id.getQualifiedName(), pb_idx)
        def cb_idx_str = CQI.id2Str(cb_id.getQualifiedName(), cb_idx)
        def lb_idx_str = CQI.id2Str(lb_id.getQualifiedName(), lb_idx)

        ConsoleProgressBar cpb = new ConsoleProgressBar(lb_pos.size())
        int p = 0 ; // index in pb_pos of the page holding the current line
        int c = 0 ; // index in cb_pos of the column holding the current line

        for (int l = 0 ; l < lb_pos.size() ; l++) {
                cpb.tick();

                // move to the column and page holding the line; '>=' because a line starting
                // exactly where a column or page starts belongs to that column or page
                while (c < cb_pos.length - 1 && lb_pos[l] >= cb_pos[c+1]) {
                        c++
                }
                while (p < pb_pos.length - 1 && lb_pos[l] >= pb_pos[p+1]) {
                        p++
                }

                // a line runs up to the start of the next line, or up to 'end' for the last one
                int line_length
                if (l == lb_pos.size() -1) line_length = end - lb_pos[l]
                else line_length = lb_pos[l+1] - lb_pos[l]

                def (abbrNs, allLetters, alignableLetters, characters, words) = getInfos(lb_pos[l], line_length);

                processLine(text_id, pb_idx_str[p], cb_idx_str[c], lb_idx_str[l], line_length,
                                abbrNs, allLetters, alignableLetters, characters, words)
        }
}
141 479 mdecorde
142 479 mdecorde
/**
 * Writes the TSV rows of one line. The counters are accumulated position by
 * position; an 's' row is written (and the counters reset) when the index
 * reaches dist_start, an 'm' row when it reaches the start of the end part,
 * and an 'e' row with the remaining counts after the last position.
 *
 * Reads the script-level variables writer, dist_start and dist_end.
 *
 * @param text_id text identifier (1st column)
 * @param pb_id page identifier (2nd column)
 * @param cb_id column identifier (3rd column)
 * @param lb_id line identifier (4th column)
 * @param length number of positions of the line
 * @param abbrNs per position, the "abbr-n" value (parsed as an integer)
 * @param allLetters per position, the "letters-all" value
 * @param alignableLetters per position, the "letters-alignable" value (not used here)
 * @param characters per position, the "characters" value
 * @param words per position, the "word" value (not used here)
 */
def processLine(def text_id, def pb_id, def cb_id, def lb_id, int length,
                def abbrNs, def allLetters, def alignableLetters, def characters, def words) {

        int abbrCount = 0
        int abbrChars = 0
        int allChars = 0

        // index at which the end part of the line begins
        int endPartStart = length - dist_end
        if (endPartStart < 0) endPartStart = length+1;
        if (endPartStart < dist_start) endPartStart = dist_start

        // writes the row of the given line part with the current counters
        def writeRow = { String part ->
                writer.println "$text_id\t$pb_id\t$cb_id\t$lb_id\t$part\t$abbrCount\t$abbrChars\t$allChars\t"+(100.0*(float)abbrChars/(float)allChars);
        }
        // starts a new line part
        def resetCounters = {
                abbrCount = 0;
                abbrChars = 0;
                allChars = 0;
        }

        for (int idx = 0 ; idx < length ; idx++) {

                if (idx == dist_start) {
                        writeRow("s")
                        resetCounters()
                }
                if (idx == endPartStart) {
                        writeRow("m")
                        resetCounters()
                }

                def letters = allLetters[idx]
                abbrCount += Integer.parseInt(abbrNs[idx]);
                abbrChars += letters.length() - characters[idx].length();
                allChars += letters.length();
        }

        writeRow("e")
}
174 479 mdecorde
175 479 mdecorde
/**
 * Reads, for 'length' consecutive corpus positions starting at 'from', the values
 * of the "abbr-n", "letters-all", "letters-alignable", "characters" and "word"
 * properties (script-level variables abbrn, lettersAll, lettersAlignable,
 * characters and form).
 *
 * @param from first corpus position
 * @param length number of consecutive positions to read
 * @return a list of five value arrays, in the order listed above
 */
def getInfos(int from ,int length) {

        // positions from, from+1, ..., from+length-1
        int[] cpos = new int[length]
        for (int offset = 0 ; offset < length ; offset++) {
                cpos[offset] = from + offset
        }

        def cqiClient = Toolbox.getCqiClient();

        def abbrValues = cqiClient.cpos2Str(abbrn.getQualifiedName(), cpos);
        def allLetterValues = cqiClient.cpos2Str(lettersAll.getQualifiedName(), cpos);
        def alignableLetterValues = cqiClient.cpos2Str(lettersAlignable.getQualifiedName(), cpos);
        // 'characters' is the script-level property, hence the distinct local name
        def characterValues = cqiClient.cpos2Str(characters.getQualifiedName(), cpos);
        def wordValues = cqiClient.cpos2Str(form.getQualifiedName(), cpos);

        return [abbrValues, allLetterValues, alignableLetterValues, characterValues, wordValues]
}