root / tmp / org.txm.dictionary.rcp / src / org / txm / dictionary / functions / tsv / BuildDMFLex.groovy @ 470
History | View | Annotate | Download (2.7 kB)
1 |
package org.txm.dictionary.functions.tsv
|
---|---|
2 |
|
3 |
import java.util.regex.Pattern; |
4 |
|
5 |
// When true, the later processing steps print every entry they hold.
DEBUG = false

// Both input tables are looked up in the same results directory.
File workingDirectory = new File("/home/mdecorde/TXM/results/lexique BFM")
File dmflextsv = new File(workingDirectory, "dmflex.tsv")
File convCTX9DMF = new File(workingDirectory, "conv dmf ctx9-nca.tsv")

// Stop the script (returning nothing) at the first input file that is missing.
for (File requiredFile : [dmflextsv, convCTX9DMF]) {
	if (!requiredFile.exists()) {
		println "Can't find $requiredFile"
		return
	}
}
19 |
|
20 |
// Columns of the input table are separated by tabulations.
Pattern pattern = Pattern.compile("\t");

// NOTE(review): not read anywhere in the visible part of this script.
String[] noposarray = ["_nopos_"]

// Column names of dmflex.tsv; their count is the number of fields expected per line.
def cols = ["form","category","dmf_lemma","dmf_lemma_old","source","source_lemma"]
def lines = dmflextsv.readLines("UTF-8");

println "Read file. "+lines.size()+" lines..."

// One list of field values per well-formed line.
def data = []
int expectedFields = cols.size()
// The first line is skipped; the view is empty when the file has zero or one line.
int firstDataLine = Math.min(1, lines.size())
for (String line : lines.subList(firstDataLine, lines.size())) {
	// The limit keeps trailing empty fields and caps the number of fields.
	List split = pattern.split(line, expectedFields)
	if (split.size() < expectedFields) {
		println "Error with line '$line' gets '$split'"
		continue
	}
	// Copy the fields into a fresh, growable list (a column is appended to it later).
	def d = new ArrayList(split.subList(0, expectedFields))
	data << d
	if (DEBUG) println d
}
println "Done: "+data.size()+" entries"
42 |
|
43 |
// Adds the "msd_cattex_conv" column to every entry of data.
// - An entry whose category (2nd field) has conversion rules is replaced by
//   one copy per rule value, the value being appended as the new field.
// - An entry without rule is kept, with an empty string as the new field,
//   and its category is reported at the end.
println "Add new msd column..."
def conversion_rules2 = ConvRules.getConversionRulesFromFile(convCTX9DMF)
cols << "msd_cattex_conv"
def missing_conv = new HashSet()

// Entries are routed into two buckets in a single pass instead of being
// removed from the middle of the list while iterating over it (each removal
// shifts all following elements). The final order is unchanged: entries
// without rule first, in their original order, then the converted copies.
def unconverted = []
def converted = []
for (def d : data) {
	String msd_value = d[1];
	if (conversion_rules2.containsKey(msd_value)) {
		for (def v : conversion_rules2.get(msd_value)) {
			// Each rule value gets its own copy so the rows stay independent.
			def dtmp = d.clone();
			dtmp << v
			converted << dtmp
		}
	} else {
		missing_conv << msd_value
		d << ""
		unconverted << d
	}
}
// Refill the same list object so later steps keep working on data.
data.clear()
data.addAll(unconverted)
data.addAll(converted)

println "Done: "+data.size()+" entries"
if (missing_conv.size() > 0) {
	println "Missing conversion rules: "+missing_conv
}
if (DEBUG) for (def d : data) println d
72 |
|
73 |
println "Sort by form..."
// In-place sort of the entries on their first field, compared with compareTo.
def byForm = { left, right -> left[0].compareTo(right[0]) }
data.sort(byForm)
println "Done: "+data.size()+" entries"
if (DEBUG) {
	for (def d : data) println d
}
79 |
|
80 |
//for (int i = 0 ; i < 200 ; i++) println data[(int)(Math.random()*data.size())]
|
81 |
// Collects the distinct forms and categories of data, writes them to two
// report files in the current directory, then returns data as script result.

// NOTE(review): n is assigned without declaration, so it lives in the script
// binding; the assignment is kept so the binding content stays the same.
n = data.size()
def hset = new HashSet()      // distinct categories (2nd field)
def hsetform = new HashSet()  // distinct forms (1st field)
def hmap = [:]                // category -> set of at most 10 example forms
for (def row : data) {
	// Entries with an empty form are not filtered out.
	def form = row[0]
	def cat = row[1]
	hset << cat
	hsetform << form

	if (!hmap.containsKey(cat)) hmap[cat] = new HashSet()
	if (hmap[cat].size() < 10) hmap[cat] << form
}

// Each report is (re)created from scratch and written in one go, in UTF-8
// like the input file, instead of re-opening the file for every entry with
// the platform default charset.
def resultFile = new File("formdmf.txt")
resultFile.withWriter("UTF-8") { writer ->
	for (String form : hsetform) {
		writer.write(form+"\n")
	}
}
println "N form : "+hsetform.size()+" in "+resultFile.getAbsolutePath()

resultFile = new File("catdmf.txt")
resultFile.withWriter("UTF-8") { writer ->
	for (String cat : hset) {
		// One line per category: the category, a tabulation, its example forms.
		writer.write(cat+"\t"+hmap[cat]+"\n")
	}
}
println "N cat : "+hset.size()+" in "+resultFile.getAbsolutePath()

return data
|