Statistics
| Revision:

root / tmp / org.txm.dictionary.rcp / src / org / txm / dictionary / functions / tsv / BuildAFRLex.groovy @ 470

History | View | Annotate | Download (1.9 kB)

1
package org.txm.dictionary.functions.tsv
2

    
3
import java.util.regex.Pattern;
4

    
5
// When true, every parsed / converted / sorted entry is printed.
DEBUG = false

// Directory holding the two input tables read by this script.
File workingDirectory = new File("/home/mdecorde/TXM/results/lexique BFM")
// Lexicon table (TSV) and msd conversion table (TSV).
File frolextsv = new File(workingDirectory, "afrlex.tsv")
File convCTX9NCA = new File(workingDirectory, "conv nca ctx9-nca.tsv")

// Stop the script (returning nothing) as soon as one input file is missing;
// the lexicon is checked first, then the conversion table.
for (File required : [frolextsv, convCTX9NCA]) {
    if (!required.exists()) {
        println "Can't find $required"
        return
    }
}
19

    
20
// Field separator of the lexicon table.
Pattern pattern = Pattern.compile("\t")

// NOTE(review): not referenced anywhere else in this script.
String[] noposarray = ["_nopos_"]

// Conversion rules read from the conversion table; looked up by msd value later on.
def conversion_rules2 = ConvRules.getConversionRulesFromFile(convCTX9NCA)

// Expected columns of the lexicon table, in order.
def cols = ["form","msd","lemma","lemma-source"]
def lines = frolextsv.readLines("UTF-8")
int expectedFields = cols.size()

println "Read file. "+lines.size()+" lines..."

// Parse every line except the first one into a [form, msd, lemma, lemma-source] list.
def data = []
for (String line : lines.drop(1)) {
    // The split limit keeps any extra tab-separated content inside the last field.
    List split = pattern.split(line, expectedFields)
    if (split.size() < expectedFields) {
        println "Error with line '$line' gets '$split'"
        continue
    }

    def d = (0..3).collect { split[it] }
    data << d
    if (DEBUG) {
        println d
    }
}
println "Done: "+data.size()+" entries"
43

    
44
// Append a fifth column (the converted msd) to every entry.
// - An entry whose msd has conversion rules is replaced by one copy per rule value,
//   each copy carrying that value as its fifth column.
// - An entry whose msd has no rule is kept, gets an empty fifth column, and its msd
//   is recorded in missing_conv for the report below.
println "Add new msd column..."
def missing_conv = new HashSet()

// Entries are collected in two buckets in a single pass instead of removing from and
// appending to 'data' while walking it by index: removal from the middle of an
// ArrayList shifts all following elements, which made the previous loop quadratic.
def unconverted = []
def converted = []
for (def d : data) {
    String msd_value = d[1]
    if (conversion_rules2.containsKey(msd_value)) {
        for (def v : conversion_rules2.get(msd_value)) {
            def dtmp = new ArrayList(d)
            dtmp << v
            converted << dtmp
        }
    } else {
        missing_conv << msd_value
        d << ""
        unconverted << d
    }
}

// Same final order as before: untouched entries first, then the expanded ones.
data.clear()
data.addAll(unconverted)
data.addAll(converted)

println "Done: "+data.size()+" entries"
if (missing_conv.size() > 0) {
    println "Missing conversion rules: "+missing_conv
}
if (DEBUG) {
    for (def d : data) {
        println d
    }
}
71

    
72
// Order the entries in place by their first column (the form), then hand them back
// as the result of the script.
println "Sort by form..."
Comparator byForm = { left, right -> left[0].compareTo(right[0]) } as Comparator
Collections.sort(data, byForm)
println "Done: "+data.size()+" entries"
if (DEBUG) {
    data.each { entry -> println entry }
}

// Uncomment to print 20 randomly picked entries:
//for (int i = 0 ; i < 20 ; i++) println data[(int)(Math.random()*data.size())]

return data