Statistics
| Revision:

root / tmp / org.txm.dictionary.rcp / src / org / txm / dictionary / functions / tsv / BuildDMFLex.groovy @ 470

History | View | Annotate | Download (2.7 kB)

1
package org.txm.dictionary.functions.tsv
2

    
3
import java.util.regex.Pattern;
4

    
5
// Undeclared on purpose: DEBUG lives in the script binding and is read by the later steps.
DEBUG = false

// Both input files are looked up in the same (hard-coded) working directory.
File workingDirectory = new File("/home/mdecorde/TXM/results/lexique BFM")
File dmflextsv = new File(workingDirectory, "dmflex.tsv")
File convCTX9DMF = new File(workingDirectory, "conv dmf ctx9-nca.tsv")

// Stop the script at the first missing input, lexicon first, then the conversion table.
for (File required : [dmflextsv, convCTX9DMF]) {
    if (!required.exists()) {
        println "Can't find $required"
        return
    }
}
19

    
20
// Fields of the lexicon file are separated by tabs.
Pattern pattern = Pattern.compile("\t");

String[] noposarray = ["_nopos_"]

// Column names of an entry, in file order.
def cols = ["form","category","dmf_lemma","dmf_lemma_old","source","source_lemma"]
def lines = dmflextsv.readLines("UTF-8");

println "Read file. "+lines.size()+" lines..."
def data = []
// The first line is skipped (presumably the header row -- confirm against the file).
for (String line : lines.drop(1)) {
    // Split into at most cols.size() fields: any extra tab stays inside the last field.
    List split = pattern.split(line, cols.size())
    if (split.size() < cols.size()) {
        println "Error with line '$line' gets '$split'"
        continue
    }

    // Copy the fields into a growable list: later steps append a column to each entry.
    def entry = new ArrayList(split.subList(0, cols.size()))
    data << entry
    if (DEBUG) println entry
}
println "Done: "+data.size()+" entries"
42

    
43
println "Add new msd column..."
// Conversion table keyed by the category value (column 1); each key yields the value(s)
// to put in the new column. ConvRules is declared elsewhere in the project.
def conversion_rules2 = ConvRules.getConversionRulesFromFile(convCTX9DMF)
cols << "msd_cattex_conv"
def missing_conv = new HashSet()

// Single pass into two buckets instead of removing from the middle of 'data' while
// index-iterating (quadratic on an ArrayList and easy to get wrong).
def unconverted = []   // entries without a rule: kept once, with an empty new column
def converted = []     // entries with a rule: one copy per converted value
for (def d : data) {
    String msd_value = d[1]
    if (conversion_rules2.containsKey(msd_value)) {
        // An entry is replaced by one copy per converted value (none if the rule is empty).
        for (def v : conversion_rules2.get(msd_value)) {
            def dtmp = new ArrayList(d)
            dtmp.add(v)
            converted.add(dtmp)
        }
    } else {
        missing_conv << msd_value
        d << ""
        unconverted.add(d)
    }
}

// Refill the same list object so every later reference to 'data' sees the result.
// Order: unconverted entries first, then the expanded ones, each in input order.
data.clear()
data.addAll(unconverted)
data.addAll(converted)

println "Done: "+data.size()+" entries"
if (missing_conv.size() > 0) {
    println "Missing conversion rules: "+missing_conv
}
if (DEBUG) for (def d : data) println d
72

    
73
println "Sort by form..."
// Order the entries in place on their first column (the form).
def byForm = { left, right -> left[0].compareTo(right[0]) } as Comparator
Collections.sort(data, byForm)
println "Done: "+data.size()+" entries"
if (DEBUG) {
    data.each { entry -> println entry }
}
79

    
80
//for (int i = 0 ; i < 200 ; i++) println data[(int)(Math.random()*data.size())]
81
// 'n' is an undeclared script variable (stored in the binding); it is still assigned here
// so anything inspecting the binding after the run sees the entry count.
n = data.size()
def hset = new HashSet()       // distinct categories (column 1)
def hsetform = new HashSet()   // distinct forms (column 0)
def hmap = [:]                 // category -> up to 10 sample forms
for (def entry : data) {
    def form = entry[0]
    def cat = entry[1]
    hset << cat
    hsetform << form

    def samples = hmap[cat]
    if (samples == null) {
        samples = new HashSet()
        hmap[cat] = samples
    }
    if (samples.size() < 10) samples << form
}

// The lexicon is read as UTF-8, so the reports are written as UTF-8 too (not the platform
// default charset). withWriter truncates any previous file and opens it once, instead of
// reopening it for every appended line.
def resultFile = new File("formdmf.txt")
resultFile.withWriter("UTF-8") { writer ->
    for (String form : hsetform) {
        writer << form+"\n"
    }
}
println "N form : "+hsetform.size()+" in "+resultFile.getAbsolutePath()

// One line per category: the category, a tab, then its sample forms.
resultFile = new File("catdmf.txt")
resultFile.withWriter("UTF-8") { writer ->
    for (String cat : hset) {
        writer << cat+"\t"+hmap[cat]+"\n"
    }
}
println "N cat : "+hset.size()+" in "+resultFile.getAbsolutePath()

return data