Statistics
| Revision:

root / tmp / org.txm.oriflamms.rcp / groovy / org / txm / macro / oriflamms / analyse / AllographsMacro.groovy @ 1094

History | View | Annotate | Download (5.5 kB)

1
// STANDARD DECLARATIONS
2
package org.txm.macro.oriflamms.analyse
3

    
4
import org.apache.tools.ant.types.resources.selectors.InstanceOf;
5
import org.kohsuke.args4j.*
6

    
7
import groovy.transform.Field
8
import java.util.regex.Pattern
9
import org.txm.*
10
import org.txm.rcp.swt.widget.parameters.*
11
import org.txm.searchengine.cqp.corpus.Corpus
12
import org.txm.searchengine.cqp.corpus.Part
13
import org.txm.searchengine.cqp.corpus.Partition
14
import org.txm.searchengine.cqp.corpus.StructuralUnit
15
import org.txm.searchengine.cqp.corpus.query.Query
16
import org.txm.stat.utils.ConsoleProgressBar
17

    
18
// BEGINNING OF PARAMETERS
19

    
20
// Script entry point: counts, line by line, the characters of a character-level
// corpus whose "sign" and "characters" properties match the user-supplied
// regular expressions, and writes the counts to a TSV file.

// The macro only works on a Corpus selected in the corpora view.
if (!(corpusViewSelection instanceof Corpus)) {
	println "Corpora view selection must be a Corpus"
	return;
}

Corpus corpus = corpusViewSelection;
// The corpus name must end with "C" to be accepted as a character corpus.
if (!corpus.getName().endsWith("C")) {
	println "The selected Corpus is not a character corpus."
	return;
}

@Field @Option(name="tsvFile", usage="TSV output file", widget="CreateFile", required=false, def="file.tsv")
tsvFile

// NOTE(review): in the code visible here 'query' is only echoed in the log
// message below; nothing evaluates it. Confirm whether that is intended.
@Field @Option(name="query", usage="The query", widget="Query", required=true, def='<s>[]')
query

@Field @Option(name="sign", usage="The sign", widget="String", required=false, def="amaj")
sign

@Field @Option(name="allograph", usage="The allograph", widget="String", required=false, def="amaj")
allograph

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;

println "Dénombrement des allographes '$allograph' de signe '$sign' dans le contexte '$query'"

// Check every property used further down before doing any work: the four
// value properties, the three milestone identifiers (dereferenced in
// processText) and the three milestone offsets (used in its queries).
for (String prop : ["sign", "allograph-expert", "allograph-auto", "characters",
		"pbid", "cbid", "lbid", "pbstart", "cbstart", "lbstart"]) {
	if (corpus.getProperty(prop) == null) {
		println "Le corpus '$corpus' n'a pas de propriété de mot '$prop'. Abandon."
		return
	}
}

// Both user parameters are compiled as regular expressions.
// 'sign' is tested against the "sign" property and 'allograph' against the
// "characters" property (see processLine).
signRegExp = ~/$sign/
characterRegExp = ~/$allograph/

// Script-level (binding) variables shared with the methods below.
CQI = CQPSearchEngine.getCqiClient();
text_su = corpus.getStructuralUnit("text")
text_id = text_su.getProperty("id")
pb_id = corpus.getProperty("pbid")
cb_id = corpus.getProperty("cbid")
lb_id = corpus.getProperty("lbid")
sign_property = corpus.getProperty("sign")
allograph_expert_property = corpus.getProperty("allograph-expert")
allograph_auto_property = corpus.getProperty("allograph-auto")
characters_property = corpus.getProperty("characters")

writer = tsvFile.newWriter("UTF-8")
Partition p = null
try {
	writer.println "text_id\tpb_id\tcb_id\tlb_id\tsign\tchar\tcount\t%/sign"
	// One part per text, so that each text is processed separately.
	p = corpus.createPartition("tmp", text_su,text_id)
	ConsoleProgressBar cpb = new ConsoleProgressBar(p.getNPart())
	for (Part part : p.getParts()) {
		processText(part, part.getName())
		cpb.tick();
	}
} finally {
	// Always release the output file and the temporary partition, even when
	// processing a text fails.
	try {
		writer.close()
	} finally {
		if (p != null) p.delete()
	}
}
println "Result saved in "+tsvFile.getAbsolutePath()
80

    
81
/**
 * Processes one text: finds its page, column and line milestones, then hands
 * each line to processLine together with the identifiers of the page and
 * column it belongs to.
 *
 * Relies on the script-level variables CQI, pb_id, cb_id and lb_id.
 *
 * @param corpus the (sub)corpus holding exactly the text to process
 * @param text_id the identifier written in the first TSV column
 */
def processText(Corpus corpus, def text_id) {

	def matches = corpus.getMatches()
	// An empty part has no positions to scan; the unguarded code failed here.
	if (!matches) return
	// NOTE(review): 'end' is used below as an exclusive bound for the last
	// line. If Match.getEnd() is inclusive, the last character of the text is
	// left out. Confirm against the Match API.
	int end = matches[-1].getEnd()

	// Positions of the first token after each page, column and line break.
	def r = corpus.query(new Query("[pbstart=\"0\"]"), "ABBRORI1", false)
	def pb_pos = r.starts
	r.drop()
	r = corpus.query(new Query("[cbstart=\"0\"]"), "ABBRORI2", false)
	def cb_pos = r.starts
	r.drop()
	r = corpus.query(new Query("[lbstart=\"0\"]"), "ABBRORI3", false)
	def lb_pos = r.starts
	r.drop()

	// No line milestone means there is nothing to report for this text.
	if (lb_pos.length == 0) return

	// Resolve the milestone identifiers (position -> value id -> string).
	// NOTE(review): assumes the text has at least one page and one column
	// milestone; otherwise indexing pb_idx_str/cb_idx_str below fails.
	def pb_idx = CQI.cpos2Id(pb_id.getQualifiedName(), pb_pos)
	def cb_idx = CQI.cpos2Id(cb_id.getQualifiedName(), cb_pos)
	def lb_idx = CQI.cpos2Id(lb_id.getQualifiedName(), lb_pos)

	def pb_idx_str = CQI.id2Str(pb_id.getQualifiedName(), pb_idx)
	def cb_idx_str = CQI.id2Str(cb_id.getQualifiedName(), cb_idx)
	def lb_idx_str = CQI.id2Str(lb_id.getQualifiedName(), lb_idx)

	int p = 0 ; // index of the current page milestone
	int c = 0 ; // index of the current column milestone

	for (int l = 0 ; l < lb_pos.length ; l++) {

		// Advance to the column containing this line. '>=' matters: a line
		// whose first token is also the first token of the next column
		// belongs to that column, not to the previous one.
		while (c < cb_pos.length - 1 && lb_pos[l] >= cb_pos[c+1]) {
			c++
		}
		// Same reasoning for the page milestone.
		while (p < pb_pos.length - 1 && lb_pos[l] >= pb_pos[p+1]) {
			p++
		}

		// A line runs up to the next line milestone, or up to 'end' for the
		// last line of the text.
		def line_length;
		if (l == lb_pos.length - 1) line_length = end - lb_pos[l]
		else line_length = lb_pos[l+1] - lb_pos[l]

		def (signs, allographs_expert, allographs_auto, characters) = getInfos(lb_pos[l], line_length);

		processLine(text_id, pb_idx_str[p], cb_idx_str[c], lb_idx_str[l], line_length, signs, allographs_expert, allographs_auto, characters)
	}
}
136

    
137
/**
 * Reads the values of the four character properties over a run of
 * consecutive corpus positions.
 *
 * Relies on the script-level variables CQI, sign_property,
 * allograph_expert_property, allograph_auto_property and characters_property.
 *
 * @param from the first corpus position of the run
 * @param length the number of consecutive positions to read
 * @return a list of four value arrays, in this order: signs, expert
 *         allographs, automatic allographs, characters
 */
def getInfos(int from ,int length) {

	// Build the consecutive positions from, from+1, ..., from+length-1.
	int[] cpos = new int[length]
	for (int offset = 0 ; offset < length ; offset++) {
		cpos[offset] = from + offset
	}

	// Same lookup for every property: positions -> string values.
	def valuesOf = { property -> CQI.cpos2Str(property.getQualifiedName(), cpos) }

	def signValues = valuesOf(sign_property)
	def expertValues = valuesOf(allograph_expert_property)
	def autoValues = valuesOf(allograph_auto_property)
	def characterValues = valuesOf(characters_property)

	return [signValues, expertValues, autoValues, characterValues]
}
149

    
150
/**
 * Counts, for one line, the characters whose sign and character values both
 * match the user-supplied patterns, and writes one TSV row per
 * (sign, character) pair with its count and its share among the matching
 * characters of that sign on the line.
 *
 * Relies on the script-level variables signRegExp, characterRegExp and writer.
 *
 * @param text_id the text identifier (first output column)
 * @param pb_id the page identifier of the line
 * @param cb_id the column identifier of the line
 * @param lb_id the line identifier
 * @param length the number of positions in the line
 * @param signs the "sign" values of the line, one per position
 * @param allographs_expert the expert allograph values (not used here)
 * @param allographs_auto the automatic allograph values (not used here)
 * @param characters the "characters" values of the line, one per position
 */
def processLine(def text_id, def pb_id, def cb_id, def lb_id, int length,
	def signs, def allographs_expert, def allographs_auto,def characters) {

	// sign -> (character -> number of occurrences), in order of first appearance
	def tally = [:]

	for (int pos = 0 ; pos < length ; pos++) {
		// Keep only positions where both patterns are found.
		if (!signRegExp.matcher(signs[pos]).find()) continue
		if (!characterRegExp.matcher(characters[pos]).find()) continue

		def perCharacter = tally[signs[pos]]
		if (perCharacter == null) {
			perCharacter = [:]
			tally[signs[pos]] = perCharacter
		}
		perCharacter[characters[pos]] = (perCharacter[characters[pos]] ?: 0) + 1
	}

	tally.each { matchedSign, perCharacter ->
		// Total of matching characters for this sign on this line.
		def total = perCharacter.values().sum()
		perCharacter.each { matchedCharacter, occurrences ->
			writer.println "$text_id\t$pb_id\t$cb_id\t$lb_id\t"+matchedSign+"\t"+matchedCharacter+"\t"+occurrences+"\t"+((float)occurrences/(float)total);
		}
	}
}