root / tmp / org.txm.dictionary.rcp / src / org / txm / dictionary / functions / tsv / BuildFroLex.groovy @ 470
History | View | Annotate | Download (4.1 kB)
1 |
package org.txm.dictionary.functions.tsv
|
---|---|
2 |
|
3 |
// Output location of the merged lexicon.
File workingDirectory = new File("/home/mdecorde/TXM/results/lexique BFM")
File frolextsv = new File(workingDirectory, "frolex.tsv")

/*
 * Sample of a tab-separated layout kept from the original header comment
 * (presumably the AFRLex-style input rows -- TODO confirm which file it describes):
 *
 *   form  msd_afrlex  lemma  lemma_src
 *   !     PON         !
 *   "     PON:cit     QUOTED
 */

// Build the two source dictionaries; both are stored in the script binding
// so that insertAFRData()/insertBFMData() can reach them.
println "** BUILD BFM DICT"
bfmlex = new BuildBFMLex().run()
println "** BUILD AFR DICT"
afrlex = new BuildAFRLex().run()

def DEBUG = false

// Optional dump of both dictionaries, one entry per line.
if (DEBUG) {
	println "BFMLEX: "+bfmlex.size()
	bfmlex.each { println it }
	println "AFRLEX: "+afrlex.size()
	afrlex.each { println it }
}

// Column names: the two inputs, then the merged output (frocols is the TSV header).
bfmcols = ["form","msd_bfm","F_bfm", "msd_cattex_conv"]
afrcols = ["form","msd_afrlex","lemma","lemma-source", "msd_cattex_conv"]
frocols = ["form", "F_bfm", "msd_afrlex", "msd_bfm", "msd_cattex_conv", "lemma", "lemma_src", "comment"]
26 |
|
27 |
// Merge the BFM and AFR lexica into `all`, one 8-column row per entry
// (columns in `frocols` order: form, F_bfm, msd_afrlex, msd_bfm,
// msd_cattex_conv, lemma, lemma_src, comment).
//
// This is a two-way merge that always looks at the head of each list, so it
// assumes both lists arrive sorted by form and then by converted MSD
// (bfmlex[i][3] / afrlex[i][4]) -- TODO confirm against BuildBFMLex/BuildAFRLex.
int mergeCounter = 0
all = new HashSet()
println "** MERGING BFM DICT WITH AFR DICT"
while (bfmlex.size() > 0 && afrlex.size() > 0) {

	def dafr = afrlex[0]
	def dbfm = bfmlex[0]
	if (DEBUG) println "data="+dafr+","+dbfm

	boolean sameForm = (dafr[0] == dbfm[0])

	if (sameForm && dafr[4] == dbfm[3]) { // same form and same converted msd --> MERGE
		def current = [
			dafr[0], // form (identical in both entries)
			dbfm[2], // F_bfm
			dafr[1], // msd_afrlex
			dbfm[1], // msd_bfm
			dafr[4], // msd_cattex_conv
			dafr[2], // lemma
			dafr[3], // lemma_src
			""       // comment
		]

		afrlex.remove(0)
		bfmlex.remove(0)
		mergeCounter++
		if (DEBUG) println "MERGE: "+current
		all.add(current)
		continue
	}

	// No merge: emit whichever head entry sorts first. Entries sharing a form
	// are ordered by their converted msd, all others by the form itself.
	boolean afrFirst = sameForm ? dafr[4].compareTo(dbfm[3]) < 0 : dafr[0].compareTo(dbfm[0]) < 0
	if (afrFirst) {
		insertAFRData()
	} else {
		insertBFMData()
	}
}

// One of the two lists is exhausted: flush whatever is left in the other one.
if (afrlex.size() > 0) println "add remaining "+afrlex.size()+" AFRlex entries..."
while (afrlex.size() > 0) {
	insertAFRData()
}

if (bfmlex.size() > 0) println "add remaining "+bfmlex.size()+" BFM lex entries..."
while (bfmlex.size() > 0) {
	insertBFMData()
}

if (DEBUG) println frocols
if (DEBUG) for (def d : all) println d

println "Done, "+mergeCounter+" entries merged"
86 |
|
87 |
println "sorting..."
// Copy the merged set into a list and order it by form (column 0) so that
// rows sharing a form end up next to each other.
def data = new ArrayList(all)
data.sort { left, right -> left[0].compareTo(right[0]) }
93 |
|
94 |
println "Remove _nopos_ not alone..."
// For every form that has more than one row, drop the rows whose msd_bfm
// (column 3) is the _nopos_ placeholder. Relies on `data` being sorted by form
// so that the rows of one form are contiguous.
// NOTE(review): if every row of a multi-row form is _nopos_, all of them are
// dropped and the form disappears from the output -- confirm this is intended.
def NOPOS = "_nopos_"
String currentForm = null
def groupIndexes = [] // indexes in `data` of the rows of the form being scanned
def toRemove = []     // indexes in `data` of the rows to drop, in ascending order

// Closes the current group: flags its _nopos_ rows when it has several rows.
def flagGroup = { ->
	if (currentForm != null && groupIndexes.size() > 1) {
		toRemove.addAll(groupIndexes.findAll { idx -> data[idx][3] == NOPOS })
	}
	groupIndexes.clear()
}

data.eachWithIndex { row, idx ->
	if (row[0] != currentForm) { // a new form starts here
		flagGroup()
		currentForm = row[0]
	}
	groupIndexes << idx
}
flagGroup() // the last group is not followed by a form change

if (DEBUG) toRemove.each { idx -> println data[idx] }

// Remove from the highest index down so the remaining indexes stay valid.
// The cast makes the remove-by-index overload explicit.
toRemove.reverseEach { idx -> data.remove((int) idx) }

println ""+toRemove.size()+" _nopos_ removed."
println "Done: "+data.size()+" entries"
if (DEBUG) for (def d : data) println d
133 |
|
134 |
|
135 |
// Write the merged lexicon as UTF-8 TSV: the header row (frocols) followed by
// one tab-joined line per entry.
println "Writing result in $frolextsv"
frolextsv.withWriter("UTF-8") { writer ->
	writer.println frocols.join("\t")
	data.each { row -> writer.println row.join("\t") }
}
// A space was missing before "entries." (the message read e.g. "Done: 1234entries.").
println "Done: "+data.size()+" entries."
141 |
|
142 |
/**
 * Moves the head entry of the script-level <code>afrlex</code> list into
 * <code>all</code> as an 8-column row. Only the AFR-side columns are filled
 * (form, msd_afrlex, msd_cattex_conv, lemma, lemma_src); the BFM-side columns
 * and the comment stay empty.
 *
 * @return the result of <code>all.add(...)</code>
 */
def insertAFRData() {
	def dafr = afrlex.remove(0) // take and drop the head entry

	all.add([
		dafr[0], // FORM
		"",      // F_bfm (no BFM data)
		dafr[1], // MSD NCA
		"",      // msd_bfm (no BFM data)
		dafr[4], // MSD CTX
		dafr[2], // LEMMA
		dafr[3], // LEMMA SRC
		""       // comment
	])
}
156 |
|
157 |
/**
 * Moves the head entry of the script-level <code>bfmlex</code> list into
 * <code>all</code> as an 8-column row. Only the BFM-side columns are filled
 * (form, F_bfm, msd_bfm, msd_cattex_conv); the AFR-side columns and the
 * comment stay empty.
 *
 * @return the result of <code>all.add(...)</code>
 */
def insertBFMData() {
	def dbfm = bfmlex.remove(0) // take and drop the head entry

	all.add([
		dbfm[0], // FORM
		dbfm[2], // FREQ
		"",      // msd_afrlex (no AFR data)
		dbfm[1], // CTX
		dbfm[3], // NCA CONV
		"",      // lemma (no AFR data)
		"",      // lemma_src (no AFR data)
		""       // comment
	])
}