Statistics
| Revision:

root / tmp / org.txm.dictionary.rcp / src / org / txm / dictionary / functions / tsv / BuildBFMLex.groovy @ 470

History | View | Annotate | Download (3.8 kB)

1
package org.txm.dictionary.functions.tsv
2

    
3
import java.util.regex.Pattern;
4

    
5
import org.txm.stat.utils.ConsoleProgressBar;
6

    
7
// When true, the entries are printed after each processing stage.
def DEBUG = false

// Directory in which both input files are looked up.
File workingDirectory = new File("/home/mdecorde/TXM/results/lexique BFM")

/*
 * Lexicon to process. Sample of the expected tab-separated content:
 *
 * form        msd_cattex_bfm        F_bfm
 * ,        NA        170673
 * et        NA        156135
 * de        NA        131501
 */
File frolextsv = new File(workingDirectory, "BFM2013-dict.tsv")

// File holding the msd conversion rules.
File convCTX9NCA = new File(workingDirectory, "conv ctx9 ctx9-nca.tsv")

// Stop the script (without a result) as soon as one of the inputs is missing.
for (File requiredInput : [frolextsv, convCTX9NCA]) {
        if (!requiredInput.exists()) {
                println "Can't find $requiredInput"
                return
        }
}
27

    
28
// Column separator used to split the lines of the lexicon file.
Pattern pattern = Pattern.compile("\t");

// Built-in rules: the "NA" and "num" msd values are both replaced by "_nopos_".
String[] noposarray = new String[1]
noposarray[0] = "_nopos_"
def conversion_rules = new LinkedHashMap()
for (String tagWithoutPos : ["NA", "num"]) {
        conversion_rules.put(tagWithoutPos, noposarray)
}

// Rules loaded from the conversion file; queried later with containsKey()/get(),
// each value being iterated to produce one entry per replacement.
def conversion_rules2 = ConvRules.getConversionRulesFromFile(convCTX9NCA)
34

    
35
// Expected columns, in file order.
def cols = ["form","msd_cattex_bfm","F_bfm"]
def lines = frolextsv.readLines("UTF-8");

println "Read file. "+lines.size()+" lines..."
// Each entry is a 3-element list: [form, msd_cattex_bfm, F_bfm].
def data = []
def lineCursor = lines.iterator()
// the first line is skipped (column names in the sample file)
if (lineCursor.hasNext()) lineCursor.next()
while (lineCursor.hasNext()) {
        String[] fields = pattern.split((String) lineCursor.next(), cols.size())
        // lines with fewer columns than expected are ignored without any message
        if (fields.length < cols.size()) continue
        // a fresh growable list: later stages append a column to the entries
        def entry = [fields[0], fields[1], fields[2]]
        data << entry
        if (DEBUG) println entry
}
println "Done: "+data.size()+" entries"
50

    
51
println "Transform NA and num to _nopos_ of the 'msd_cattex_bfm' column..."
int n = data.size()
// The entries are split into two lists in a single pass rather than removed from 'data'
// inside the loop: every in-place removal shifts the tail of the list, which makes the
// stage quadratic when many entries match a rule.
def untouchedEntries = []   // entries whose msd has no rule, in their original order
def rewrittenEntries = []   // copies carrying the replacement msd, in order of discovery
for (int i = 0 ; i < n ; i++) {
        def d = data[i]
        if (DEBUG) println " $d"
        String msd_value = d[1];
        if (conversion_rules.containsKey(msd_value)) {
                // the matching entry is dropped and replaced by one copy per replacement value
                for (def v : conversion_rules.get(msd_value)) {
                        def dtmp = d.clone();
                        dtmp[1] = v
                        rewrittenEntries << dtmp
                }
        } else {
                untouchedEntries << d
        }
}
// Resulting order: untouched entries first, rewritten copies appended after them.
data.clear()
data.addAll(untouchedEntries)
data.addAll(rewrittenEntries)
println "Done: "+data.size()+" entries"
if (DEBUG) for (def d : data) println d
71

    
72
// msd values whose entries are dropped from the lexicon
def wrongMSD = new HashSet();
wrongMSD.add("PONbfl")
wrongMSD.add("PONpfbl")
wrongMSD.add("PONfbfl")
println "Removing "+wrongMSD.size()+" wrong entries..."
// drop, in place, every entry whose msd (second column) is one of the wrong values
data.removeAll { candidate -> wrongMSD.contains(candidate[1]) }
println "Done: "+data.size()+" entries"
if (DEBUG) data.each { println it }
86

    
87
println "Add new msd column..."
// msd values for which conversion_rules2 holds no rule
def missing_conv = new HashSet()
n = data.size()
// Single pass with two output lists instead of removing from 'data' inside the loop:
// in-place removal shifts the tail of the list for every converted entry, which is
// quadratic when most entries have a rule.
def entriesWithoutRule = []   // kept as is, with an empty new column
def convertedEntries = []     // one copy per converted msd value
for (int i = 0 ; i < n ; i++) {
        def d = data[i]

        String msd_value = d[1];
        if (conversion_rules2.containsKey(msd_value)) {
                // the entry is replaced by one copy per converted value, appended as a new last column
                for (def v : conversion_rules2.get(msd_value)) {
                        def dtmp = d.clone();
                        dtmp << v
                        convertedEntries << dtmp
                }
        } else {
                // no rule: remember the msd for the report and keep the entry with an empty new column
                missing_conv << d[1]
                d << ""
                entriesWithoutRule << d
        }
}
// Resulting order: entries without rule first (original order), converted copies after them.
data.clear()
data.addAll(entriesWithoutRule)
data.addAll(convertedEntries)
println "Done: "+data.size()+" entries"
if (missing_conv.size() > 0) {
        println "Missing conversion rules: "+missing_conv
}
if (DEBUG) for (def d : data) println d
114

    
115
println "Sort by form..."
// Orders the entries in place on their first column (the form), using the form's own compareTo().
def byForm = { left, right -> left[0].compareTo(right[0]) } as Comparator
Collections.sort(data, byForm)
println "Done: "+data.size()+" entries"
if (DEBUG) data.each { println it }
121

    
122
println "Remove _nopos_ not alone..."
def NOPOS = "_nopos_"
def survivors = []   // entries kept, in their current order
def discarded = []   // _nopos_ entries dropped because their form has more than one entry

// Dispatches the entries of one form: when the form has more than one entry, those tagged
// _nopos_ go to 'discarded'; everything else goes to 'survivors'.
// NOTE(review): when ALL the entries of a form are _nopos_ (and there are several), every one
// of them is discarded and the form disappears from the result — existing behavior, kept
// unchanged; confirm it is intended.
def dispatchSameForm = { List sameFormEntries ->
        boolean notAlone = sameFormEntries.size() > 1
        for (def member : sameFormEntries) {
                if (notAlone && member[1] == NOPOS) {
                        discarded << member
                } else {
                        survivors << member
                }
        }
}

// Entries sharing a form are expected to be contiguous (the list has just been sorted by form),
// so a change of form closes the current group.
def currentGroup = []
for (def d : data) {
        if (!currentGroup.isEmpty() && d[0] != currentGroup[0][0]) {
                dispatchSameForm(currentGroup)
                currentGroup = []
        }
        currentGroup << d
}
// the last group is not followed by a change of form: dispatch it explicitly
dispatchSameForm(currentGroup)

if (DEBUG) for (def d : discarded) println d
// rebuild the list in place rather than removing the entries one index at a time
data.clear()
data.addAll(survivors)
println ""+discarded.size()+" _nopos_ removed."
println "Done: "+data.size()+" entries"
if (DEBUG) for (def d : data) println d

// result of the script: entries of the form [form, msd_cattex_bfm, F_bfm, new msd]
return data