Statistics
| Revision:

root / tmp / org.txm.dictionary.rcp / src / org / txm / macro / frolex / MergeCLDMFLEXSMacro.groovy @ 470

History | View | Annotate | Download (6.9 kB)

1
package org.txm.macro.frolex
2

    
3
import java.io.File
4
import java.io.IOException
5
import java.nio.charset.Charset
6
import java.util.LinkedHashMap
7

    
8
import org.apache.commons.io.filefilter.IOFileFilter;
9
import org.apache.commons.lang.StringUtils;
10
import org.eclipse.jface.bindings.keys.KeyStroke
11
import org.txm.utils.CsvReader
12
import org.txm.utils.io.IOUtils;
13

    
14
import org.kohsuke.args4j.*
15
import groovy.transform.Field
16
import org.txm.rcpapplication.swt.widget.parameters.*
17

    
18
// PARAMETERS
19
// Macro parameter: the folder that holds the two input lexicons and receives the merged file.
@Field @Option(name="workingDirectory", usage="workingDirectory containing all files needed", widget="Folder", required=true, def="dir")
File workingDirectory = null

// ParametersDialog.open(this) is defined elsewhere (presumably it fills the @Option fields
// from user input - confirm); a falsy result aborts the macro.
if (!ParametersDialog.open(this)) {
        System.out.println("Aborting CLBFMLEX creation.")
        return null
}

// Two inputs and one output, all resolved against the working directory.
File cldmflexsitetsv = new File(workingDirectory, "cldmf-site.tsv")
File cldmflexlgermtsv = new File(workingDirectory, "cldmf-lgerm.tsv")
File tsvFile = new File(workingDirectory, "cldmf.tsv")

// Both inputs are mandatory. A plain for loop (not each{}) is used so that
// 'return' leaves the script and not just a closure.
for (File requiredInput : [cldmflexsitetsv, cldmflexlgermtsv]) {
        if (!requiredInput.exists()) {
                println "Aborting: file is missing: $requiredInput"
                return
        }
}
37

    
38
// START
39

    
40
// Merged rows, keyed by lemma; a LinkedHashMap keeps them in insertion order for the output.
LinkedHashMap<String, String[]> finalvalues = new LinkedHashMap<String, String[]>()

// Both inputs are read as tab-separated UTF-8 files whose first line is a header.
CsvReader reader_site = new CsvReader(cldmflexsitetsv.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
CsvReader reader_lgerm = new CsvReader(cldmflexlgermtsv.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
reader_site.readHeaders()
reader_lgerm.readHeaders()

// Position each reader on its first record. The current key is the lemma alone
// (a lemma + "_" + category key existed but was disabled), and null means
// "reader exhausted", the convention the loops below rely on.
// The result of readRecord() used to be ignored here, so an input without any
// data row was still treated as if it had a first record.
String key_site = reader_site.readRecord() ? reader_site.get("lemma") : null
String key_lgerm = reader_lgerm.readRecord() ? reader_lgerm.get("lemma") : null
50

    
51
// Special fix for the truncated category labels of the DMF site file:
// maps the known cut-off spellings back to the full label and returns
// any other value (including null) unchanged.
def fixSiteCategory(String s) {
        if (s in ["s", "su", "sub", "subs", "subst", "subst. "]) {
                return "subst."
        }
        if (s in ["v", "ve", "ver", "verb"]) {
                return "verbe"
        }
        if (s in ["a", "adj"]) {
                return "adj."
        }
        return s
}
61

    
62
// Counters printed at the end of the macro: lemmas present in both inputs,
// lemmas present only in the site file, lemmas present only in the lgerm file.
int nMerge = 0;
int nSite = 0;
int nLgerm = 0;

// Columns copied verbatim, in output order, for a lemma that only lgerm provides.
def lgermOnlyColumns = ["lemma", "category", "AND", "DEAF", "DECT", "FEW", "GDF", "GDC", "HUG", "TLF", "TL", "lemma_source"]

// Merge lgerm and site lemmas by walking both readers in parallel.
// NOTE(review): the walk only pairs up identical lemmas if both files are sorted
// by lemma in String.compareTo order - confirm the inputs are produced that way.
while (key_site != null && key_lgerm != null) {
        if (key_site.equals(key_lgerm)) {
                // Lemma present in both files: build one row from the two records.
                nMerge++
                def line = ["","","","","","","","","","","",""]
                line[0] = reader_site.get("lemma")
                // The lgerm category is used: the site one may be truncated (see fixSiteCategory).
                line[1] = reader_lgerm.get("category")

                // AND: the site value wins when present; a disagreement between two
                // non-empty values is only reported.
                String siteAnd = reader_site.get("AND")
                String lgermAnd = reader_lgerm.get("AND")
                if (siteAnd != lgermAnd && lgermAnd.length() > 0 && siteAnd.length() > 0) {
                        println "Different AND: $key_site\n site =" + siteAnd + "\n lgerm=" + lgermAnd;
                }
                line[2] = siteAnd.length() > 0 ? siteAnd : lgermAnd

                // DEAF comes from lgerm, the next six columns from the site file.
                line[3] = reader_lgerm.get("DEAF")
                line[4] = reader_site.get("DECT")
                line[5] = reader_site.get("FEW")
                line[6] = reader_site.get("GDF")
                line[7] = reader_site.get("GDC")
                line[8] = reader_site.get("HUG")
                line[9] = reader_site.get("TLF")

                // TL: same policy as AND.
                String siteTl = reader_site.get("TL")
                String lgermTl = reader_lgerm.get("TL")
                if (siteTl != lgermTl && lgermTl.length() > 0 && siteTl.length() > 0) {
                        println "Different TL: $key_site\n site =" + siteTl + "\n lgerm=" + lgermTl;
                }
                line[10] = siteTl.length() > 0 ? siteTl : lgermTl

                // lemma_source: "DMF" when the two files disagree, the site value otherwise.
                // The "DMF" assignment used to be overwritten unconditionally right
                // after the test (missing else), which made the test dead code.
                if (reader_site.get("lemma_source") != reader_lgerm.get("lemma_source")) {
                        line[11] = "DMF"
                } else {
                        line[11] = reader_site.get("lemma_source")
                }

                finalvalues[key_site] = line

                // Advance both readers; null marks an exhausted reader.
                key_site = reader_site.readRecord() ? reader_site.get("lemma") : null
                key_lgerm = reader_lgerm.readRecord() ? reader_lgerm.get("lemma") : null
        } else if (key_site.compareTo(key_lgerm) > 0) {
                // The lgerm lemma sorts first, so it has no site counterpart: copy it as is.
                nLgerm++
                finalvalues[key_lgerm] = lgermOnlyColumns.collect { reader_lgerm.get(it) }

                key_lgerm = reader_lgerm.readRecord() ? reader_lgerm.get("lemma") : null
        } else {
                // The site lemma sorts first, so it has no lgerm counterpart.
                // Entries present only in the DMF site file are ignored here: they are
                // counted but not written (the disabled code copied every site column
                // and repaired the category with fixSiteCategory).
                nSite++

                key_site = reader_site.readRecord() ? reader_site.get("lemma") : null
        }
}
157

    
158
// Write the last lgerm lemmas: the records still unread in the lgerm file
// once the parallel walk above has stopped.
def lgermTailColumns = ["lemma", "category", "AND", "DEAF", "DECT", "FEW", "GDF", "GDC", "HUG", "TLF", "TL", "lemma_source"]
try {
        while (key_lgerm != null) {
                nLgerm++
                // Every output column is copied verbatim from the lgerm record.
                finalvalues[key_lgerm] = lgermTailColumns.collect { reader_lgerm.get(it) }

                // null marks the end of the lgerm file.
                key_lgerm = reader_lgerm.readRecord() ? reader_lgerm.get("lemma") : null
        }
} finally {
        // Last use of the lgerm reader: release the underlying file, which was
        // previously never closed.
        reader_lgerm.close()
}
181

    
182
// Write the last site lemmas: the records still unread in the site file
// once the lgerm file is exhausted.
// NOTE(review): the parallel walk above ignores lemmas found only in the site
// file, whereas this loop writes them - confirm which behaviour is wanted.
def siteTailColumns = ["lemma", "category", "AND", "DEAF", "DECT", "FEW", "GDF", "GDC", "HUG", "TLF", "TL", "lemma_source"]
try {
        while (key_site != null) {
                nSite++
                def line = siteTailColumns.collect { reader_site.get(it) }
                // Site categories may be truncated: restore the full label.
                line[1] = fixSiteCategory(line[1])

                finalvalues[key_site] = line

                // null marks the end of the site file.
                key_site = reader_site.readRecord() ? reader_site.get("lemma") : null
        }
} finally {
        // Last use of the site reader: release the underlying file, which was
        // previously never closed.
        reader_site.close()
}
206

    
207
// Write the merged lexicon: one header line, then one tab-separated line per lemma
// in the order the rows were inserted into finalvalues.
def outputColumns = ["lemma", "category", "AND", "DEAF", "DECT", "FEW", "GDF", "GDC", "HUG", "TLF", "TL", "lemma_source"]
def writer = IOUtils.getWriter(tsvFile);
try {
        // The header uses the same tab separator as the data rows, so that the
        // file can be read back by column name.
        writer.println(outputColumns.join("\t"));
        for (def row : finalvalues.values()) {
                writer.println(StringUtils.join(row, "\t"));
        }
} finally {
        // Close the file even when a write fails.
        writer.close();
}

// Summary of the merge.
println "nMerge=$nMerge"
println "nLgerm=$nLgerm"
println "nSite=$nSite"
216
// END