root / tmp / org.txm.dictionary.rcp / src / org / txm / macro / frolex / MergeCLDMFLEXSMacro.groovy @ 470
History | View | Annotate | Download (6.9 kB)
1 |
package org.txm.macro.frolex
|
---|---|
2 |
|
3 |
import java.io.File |
4 |
import java.io.IOException |
5 |
import java.nio.charset.Charset |
6 |
import java.util.LinkedHashMap |
7 |
|
8 |
import org.apache.commons.io.filefilter.IOFileFilter; |
9 |
import org.apache.commons.lang.StringUtils; |
10 |
import org.eclipse.jface.bindings.keys.KeyStroke |
11 |
import org.txm.utils.CsvReader |
12 |
import org.txm.utils.io.IOUtils; |
13 |
|
14 |
import org.kohsuke.args4j.* |
15 |
import groovy.transform.Field |
16 |
import org.txm.rcpapplication.swt.widget.parameters.* |
17 |
|
18 |
// PARAMETERS

// Folder that holds the input files; presumably populated by ParametersDialog from the @Option metadata.
@Field @Option(name="workingDirectory", usage="workingDirectory containing all files needed", widget="Folder", required=true, def="dir")
File workingDirectory = null

// Stop right away when ParametersDialog.open(this) reports false.
if (!ParametersDialog.open(this)) {
    System.out.println("Aborting CLBFMLEX creation.")
    return null
}

// The two input lexicons and the merged output all live in workingDirectory.
File cldmflexsitetsv = new File(workingDirectory, "cldmf-site.tsv")
File cldmflexlgermtsv = new File(workingDirectory, "cldmf-lgerm.tsv")
File tsvFile = new File(workingDirectory, "cldmf.tsv")

// Both inputs are mandatory: report the first missing one and stop.
if (!cldmflexsitetsv.exists()) {
    println "Aborting: file is missing: $cldmflexsitetsv"
    return
}
if (!cldmflexlgermtsv.exists()) {
    println "Aborting: file is missing: $cldmflexlgermtsv"
    return
}

// START

// Merged rows, keyed by lemma. LinkedHashMap keeps insertion order, which is the order rows are written out.
LinkedHashMap<String, String[]> finalvalues = new LinkedHashMap<String, String[]>()

// Both inputs are read as tab-separated UTF-8 files with a header line.
char tsvSeparator = "\t".charAt(0)
Charset tsvCharset = Charset.forName("UTF-8")
CsvReader reader_site = new CsvReader(cldmflexsitetsv.getAbsolutePath(), tsvSeparator, tsvCharset)
CsvReader reader_lgerm = new CsvReader(cldmflexlgermtsv.getAbsolutePath(), tsvSeparator, tsvCharset)
reader_site.readHeaders()
reader_lgerm.readHeaders()

// Current merge key of each reader: the lemma alone (a lemma+"_"+category key was tried and left disabled).
// A null key means "reader exhausted". The first read is guarded like every later read in this script,
// so an input holding only its header line is treated as empty instead of producing a key from a
// record that was never read.
String key_site = reader_site.readRecord() ? reader_site.get("lemma") : null
String key_lgerm = reader_lgerm.readRecord() ? reader_lgerm.get("lemma") : null

/**
 * Special fix for truncated category labels coming from the site file.
 *
 * Known truncated spellings are mapped to their full label; any other value
 * (including null) is returned unchanged.
 *
 * @param s the category label as read from the site file
 * @return "subst.", "verbe" or "adj." for a known truncation, otherwise s itself
 */
def fixSiteCategory(String s) {
    switch (s) {
        case ["s", "su", "sub", "subs", "subst", "subst. "]:
            return "subst."
        case ["v", "ve", "ver", "verb"]:
            return "verbe"
        case ["a", "adj"]:
            return "adj."
        default:
            return s
    }
}

// Counters reported at the end of the script:
// lemmas found in both files, lemmas found only in the site file, lemmas found only in the lgerm file.
int nMerge = 0
int nSite = 0
int nLgerm = 0

// Output column order; also the header names read from the input files.
def mergeLoopColumns = ["lemma", "category", "AND", "DEAF", "DECT", "FEW", "GDF", "GDC", "HUG", "TLF", "TL", "lemma_source"]

// merge lgerm and site lemmas
// The loop always advances the reader holding the smaller key (String.compareTo order),
// so both inputs are presumably sorted by lemma in that order -- TODO confirm.
// NOTE(review): rows are stored by lemma only, so a lemma repeated in one file overwrites its earlier row.
while (key_site != null && key_lgerm != null) {

    def line = ["", "", "", "", "", "", "", "", "", "", "", ""]
    if (key_site.equals(key_lgerm)) {
        // Lemma present in both files: combine the two records.
        nMerge++
        line[0] = reader_site.get("lemma")
        line[1] = reader_lgerm.get("category") // the lgerm category is used: site categories can be truncated (see fixSiteCategory)

        // AND: warn when both files give a different non-empty value, prefer the site value.
        String siteAnd = reader_site.get("AND")
        String lgermAnd = reader_lgerm.get("AND")
        if (siteAnd != lgermAnd && lgermAnd.length() > 0 && siteAnd.length() > 0) {
            println "Different AND: $key_site\n site ="+siteAnd+"\n lgerm="+lgermAnd
        }
        line[2] = siteAnd.length() > 0 ? siteAnd : lgermAnd

        // DEAF comes from lgerm, the following references come from the site file.
        line[3] = reader_lgerm.get("DEAF")

        line[4] = reader_site.get("DECT")
        line[5] = reader_site.get("FEW")
        line[6] = reader_site.get("GDF")
        line[7] = reader_site.get("GDC")
        line[8] = reader_site.get("HUG")
        line[9] = reader_site.get("TLF")

        // TL: same policy as AND.
        String siteTl = reader_site.get("TL")
        String lgermTl = reader_lgerm.get("TL")
        if (siteTl != lgermTl && lgermTl.length() > 0 && siteTl.length() > 0) {
            println "Different TL: $key_site\n site ="+siteTl+"\n lgerm="+lgermTl
        }
        line[10] = siteTl.length() > 0 ? siteTl : lgermTl

        // lemma_source: "DMF" when the two files disagree, otherwise the shared value.
        // The previous code assigned the site value unconditionally right after the test,
        // which made the "DMF" branch dead code.
        // NOTE(review): confirm that "DMF" is the wanted label for disagreeing sources.
        String siteSource = reader_site.get("lemma_source")
        if (siteSource != reader_lgerm.get("lemma_source")) {
            line[11] = "DMF"
        } else {
            line[11] = siteSource
        }

        finalvalues[key_site] = line

        // Advance both readers.
        key_site = reader_site.readRecord() ? reader_site.get("lemma") : null
        key_lgerm = reader_lgerm.readRecord() ? reader_lgerm.get("lemma") : null
    } else if (key_site.compareTo(key_lgerm) > 0) {
        // Lemma present only in lgerm: copy the lgerm record as is.
        nLgerm++
        mergeLoopColumns.eachWithIndex { String column, int i -> line[i] = reader_lgerm.get(column) }

        finalvalues[key_lgerm] = line

        key_lgerm = reader_lgerm.readRecord() ? reader_lgerm.get("lemma") : null
    } else {
        // Lemma present only in the site file: counted but deliberately not written to the output.
        nSite++

        key_site = reader_site.readRecord() ? reader_site.get("lemma") : null
    }
}

// write last Lgerm lemmas
// Once the merge loop stops, every record still available in the lgerm reader is copied as is.
def remainingLgermColumns = ["lemma", "category", "AND", "DEAF", "DECT", "FEW", "GDF", "GDC", "HUG", "TLF", "TL", "lemma_source"]
while (key_lgerm != null) {
    nLgerm++

    // One output cell per column, in output order.
    finalvalues[key_lgerm] = remainingLgermColumns.collect { String column -> reader_lgerm.get(column) }

    // Move to the next record; a null key ends the loop.
    key_lgerm = reader_lgerm.readRecord() ? reader_lgerm.get("lemma") : null
}

// count last site lemmas
// Lemmas present only in the site file are ignored by the merge loop; the leftover site
// records are treated the same way here (counted, not written), so that a site-only lemma
// is handled identically whatever its position in the file. The previous code wrote these
// leftover records to the output.
// NOTE(review): confirm that site-only lemmas must never reach the output.
while (key_site != null) {
    nSite++

    // Move to the next record; a null key ends the loop.
    key_site = reader_site.readRecord() ? reader_site.get("lemma") : null
}

// Both inputs are fully consumed at this point: release them.
reader_site.close()
reader_lgerm.close()

// Write the merged table: one header line, then one line per stored row, all tab-separated.
def writer = IOUtils.getWriter(tsvFile)
try {
    // The header uses the same separator as the data rows so the file stays a valid TSV.
    writer.println(StringUtils.join(["lemma", "category", "AND", "DEAF", "DECT", "FEW", "GDF", "GDC", "HUG", "TLF", "TL", "lemma_source"], "\t"))
    for (def row : finalvalues.values()) {
        writer.println(StringUtils.join(row, "\t"))
    }
} finally {
    // Close the output even when writing fails.
    writer.close()
}

// Summary of what was merged.
println "nMerge=$nMerge"
println "nLgerm=$nLgerm"
println "nSite=$nSite"
// END