Statistics
| Revision:

root / tmp / org.txm.dictionary.rcp / src / org / txm / macro / frolex / CreateFrolexMacro.groovy @ 470

History | View | Annotate | Download (34.2 kB)

1
package org.txm.macro.frolex
2

    
3
import java.io.File
4
import java.io.Serializable
5
import java.util.ArrayList
6
import java.util.Arrays
7
import java.util.HashSet
8
import java.util.LinkedHashMap
9
import java.util.List
10
import java.util.regex.Pattern
11

    
12
import org.kohsuke.args4j.*
13
import groovy.transform.Field
14
import org.txm.rcpapplication.swt.widget.parameters.*
15

    
16
import org.txm.Toolbox
17
import org.txm.dictionary.functions.sql.Dictionary
18
import org.txm.dictionary.functions.sql.DictionaryManager
19
import org.txm.dictionary.functions.sql.HSQLFunctions
20
import org.txm.index.core.functions.Index
21
import org.txm.searchengine.cqp.corpus.CorpusManager
22
import org.txm.searchengine.cqp.corpus.MainCorpus
23
import org.txm.searchengine.cqp.corpus.Property
24
import org.txm.searchengine.cqp.corpus.query.Query
25
import org.txm.utils.DeleteDir
26
import java.sql.*;
27

    
28
// ---- Macro parameters -------------------------------------------------
// Each @Field/@Option pair declares one script-level parameter; the
// ParametersDialog.open(this) call below is given this script and
// presumably fills these fields from user input — TODO confirm.

// Folder holding every input file; the builders also write into its "result" sub-directory.
@Field @Option(name="workingDirectory", usage="workingDirectory containing all files needed", widget="Folder", required=true, def="dir")
29
File workingDirectory = null
30
// When true, AFRLEX is rebuilt; otherwise the existing "afrlex"/"clafrlex" dictionaries are reused.
@Field @Option(name="rebuildAFR", usage="Rebuild AFRLEX from afrlex.tsv file", widget="Boolean", required=true, def="true")
31
boolean rebuildAFR
32
// When true, BFMLEX is rebuilt; otherwise the existing "bfmlex"/"clbfmlex" dictionaries are reused.
@Field @Option(name="rebuildBFM", usage="rebuild BFM", widget="Boolean", required=true, def="true")
33
boolean rebuildBFM
34
// When true, DMFLEX is rebuilt; otherwise the existing "dmflex"/"cldmflex" dictionaries are reused.
@Field @Option(name="rebuildDMF", usage="rebuild DMF", widget="Boolean", required=true, def="true")
35
boolean rebuildDMF
36
// When true, FROLEX is rebuilt; otherwise the existing "frolex" dictionary is reused.
@Field @Option(name="rebuildFrolex", usage="rebuild FRO", widget="Boolean", required=true, def="true")
37
boolean rebuildFrolex
38
// Forwarded to Dictionary.setDebug and HSQLFunctions.setDebug below.
@Field @Option(name="debug", usage="show SQL queries", widget="Boolean", required=true, def="false")
39
boolean debug
40

    
41
// Stop the macro when the parameters dialog does not report success.
if (!ParametersDialog.open(this)) {
42
        System.out.println("Aborting Frolex creation.")
43
        return null
44
}
45

    
46
// Propagate the debug switch to the dictionary layer before any work starts.
Dictionary.setDebug(debug)
47
HSQLFunctions.setDebug(debug)
48
// Run the whole pipeline with the chosen rebuild flags; the result is not used here.
process(workingDirectory, rebuildAFR, rebuildBFM, rebuildDMF, rebuildFrolex)
49

    
50
/**
 * Convenience entry point: runs the pipeline with every rebuild flag set to true.
 *
 * @param workingDirectory directory containing all the input files
 * @return the value returned by process(File, boolean, boolean, boolean, boolean)
 */
Dictionary process(File workingDirectory) {
        // The full overload takes exactly four rebuild flags (AFR, BFM, DMF, Frolex);
        // passing five made this call fail with a missing-method error.
        return process(workingDirectory, true, true, true, true)
}
53

    
54
/**
 * Runs the Frolex pipeline: AFRLEX, then BFMLEX, then DMFLEX, then FROLEX.
 *
 * The "result" sub-directory of workingDirectory is deleted and recreated first.
 * For each step, the dictionary is either rebuilt by the matching build*
 * function or fetched by name from the DictionaryManager, depending on its flag.
 *
 * @param workingDirectory directory containing all the input files
 * @param rebuildAFR rebuild "afrlex"/"clafrlex" instead of reusing them
 * @param rebuildBFM rebuild "bfmlex"/"clbfmlex" instead of reusing them
 * @param rebuildDMF rebuild "dmflex"/"cldmflex" instead of reusing them
 * @param rebuildFrolex rebuild "frolex" instead of reusing it
 * @return the frolex dictionary, or null when a corpus is missing, a step
 *         fails or an exception is raised (the stack trace is printed)
 */
Dictionary process(File workingDirectory, boolean rebuildAFR, boolean rebuildBFM, boolean rebuildDMF, boolean rebuildFrolex) {
        File resultDirectory = new File(workingDirectory, "result")
        DeleteDir.deleteDirectory(resultDirectory)
        resultDirectory.mkdirs()

        MainCorpus bfmCorpus = null

        // The corpus is only looked up when the Toolbox is initialized.
        if (Toolbox.isInitialized()) {
                try {
                        // A single lookup is enough: the former loop over getCorpora()
                        // only repeated this exact call.
                        bfmCorpus = CorpusManager.getCorpusManager().getCorpus("BFM2016LEX")
                } catch (Exception e) {
                        System.out.println("Necessary corpus are missing: "+e)
                        e.printStackTrace()
                        return null
                }

                if (bfmCorpus == null) {
                        System.out.println("BFM2016LEX corpus not found. Aborting.")
                        return null
                }
        }

        try {
                Dictionary afrlex = null
                Dictionary clafrlex = null
                if (rebuildAFR) {
                        def ret = buildAFRlex(workingDirectory)
                        // the builder returns null on failure: do not index into it
                        if (ret != null) {
                                afrlex = ret[0]
                                clafrlex = ret[1]
                        }
                } else {
                        afrlex = DictionaryManager.getInstance().getDictionary("afrlex")
                        clafrlex = DictionaryManager.getInstance().getDictionary("clafrlex")
                }
                if (afrlex == null) {
                        System.out.println("Failed to build AFRLEX")
                        return null
                }

                Dictionary bfmlex = null
                Dictionary clbfmlex = null
                if (rebuildBFM) {
                        def ret = buildBFMlex(workingDirectory, bfmCorpus)
                        if (ret != null) {
                                bfmlex = ret[0]
                                clbfmlex = ret[1]
                        }
                } else {
                        bfmlex = DictionaryManager.getInstance().getDictionary("bfmlex")
                        clbfmlex = DictionaryManager.getInstance().getDictionary("clbfmlex")
                }
                if (bfmlex == null) {
                        System.out.println("Failed to build BFMLEX")
                        return null
                }

                Dictionary dmflex = null
                Dictionary cldmflex = null
                if (rebuildDMF) {
                        def ret = buildDMFlex(workingDirectory)
                        if (ret != null) {
                                dmflex = ret[0]
                                cldmflex = ret[1]
                        }
                } else {
                        dmflex = DictionaryManager.getInstance().getDictionary("dmflex")
                        cldmflex = DictionaryManager.getInstance().getDictionary("cldmflex")
                }
                if (dmflex == null) {
                        System.out.println("Failed to build DMFLEX & CLDMDF")
                        return null
                }

                Dictionary frolex = null
                if (rebuildFrolex) {
                        frolex = buildFROlex(workingDirectory, afrlex, clafrlex, bfmlex, clbfmlex, dmflex, cldmflex)
                } else {
                        frolex = DictionaryManager.getInstance().getDictionary("frolex")
                }

                if (frolex == null) {
                        System.out.println("Failed to build FROLEX")
                        return null
                }

                // success: hand the final dictionary back instead of always returning null
                return frolex
        } catch (Exception e) {
                e.printStackTrace()
        }
        return null
}
144

    
145
/**
 * Builds the "afrlex" and "clafrlex" dictionaries.
 *
 * Steps, in order:
 *  1. run reformat-afrlex.pl with perl to turn afrlex.utf.v1.0.txt into afrlex.tsv;
 *  2. load afrlex.tsv into "afrlex" and recode msd_afrlex into msd_cattex_conv1
 *     and msd_cattex_conv2, then recode lemma_src;
 *  3. fill the "afrlexbulk" table with one line per (form, msd_afrlex) packet,
 *     keeping the lemma of the first non-empty source in the order
 *     TL > GDF > PKAS > LFA > VFM > PVR > LMG plus one column per source;
 *  4. reload "afrlex" from afrlexbulk and fill "clafrlex" with one line per
 *     (msd_afrlex, lemma) packet, joining the per-source values with "|";
 *  5. export both dictionaries to result/afrlex.tsv and result/clafrlex.tsv.
 *
 * @param workingDirectory directory containing the input files
 * @return the list [afrlex, clafrlex], or null when an input file is missing,
 *         the recoding script fails, a column is missing, a conversion
 *         returns -1 or the export fails
 * @throws Exception whatever the dictionary, JDBC or process calls raise
 */
def buildAFRlex(File workingDirectory) throws Exception {
        System.out.println("** Building AFRlex...")
        DictionaryManager.getInstance().remove("afrlex")
        DictionaryManager.getInstance().remove("clafrlex")

        File afrlexSteinFile = new File(workingDirectory, "afrlex.utf.v1.0.txt")
        if (!afrlexSteinFile.exists()) {
                System.out.println("Can't find "+afrlexSteinFile)
                return null
        }
        File perlScript = new File(workingDirectory, "reformat-afrlex.pl")
        if (!perlScript.exists()) {
                System.out.println("Can't find "+perlScript)
                return null
        }
        File convNCA_CTX9 = new File(workingDirectory, "conv nca ctx9-nca.tsv")
        if (!convNCA_CTX9.exists()) {
                System.out.println("Can't find "+convNCA_CTX9)
                return null
        }
        File convNCA_DMF = new File(workingDirectory, "conv nca ctx9-dmf.tsv")
        if (!convNCA_DMF.exists()) {
                System.out.println("Can't find "+convNCA_DMF)
                return null
        }
        File convLemmaSRC = new File(workingDirectory, "conv afr-src frolex-src.tsv")
        if (!convLemmaSRC.exists()) {
                System.out.println("Can't find "+convLemmaSRC)
                return null
        }

        File afrlexFile = new File(workingDirectory, "afrlex.tsv")
        afrlexFile.delete()
        System.out.println("Recoding original afrlex.txt...")
        String[] cmd = ["perl", perlScript.getAbsolutePath(), afrlexSteinFile.getAbsolutePath(), afrlexFile.getAbsolutePath()]
        System.out.println(Arrays.toString(cmd))
        Process recoding = Runtime.getRuntime().exec(cmd)
        // drain stdout/stderr so the child cannot block on a full pipe, then wait for it
        recoding.waitForProcessOutput(System.out, System.err)
        // any non-zero status is a failure, not only the value 1
        if (recoding.exitValue() != 0) {
                System.out.println("recoding failed. Aborting.")
                return null
        }

        if (!afrlexFile.exists()) {
                System.out.println("Can't find "+afrlexFile)
                return null
        }

        Dictionary afrlex = DictionaryManager.getInstance().getDictionary("afrlex")
        Dictionary clafrlex = DictionaryManager.getInstance().getDictionary("clafrlex")

        System.out.println("Loading data from "+afrlexFile)
        afrlex.loadFromTSVFile(afrlexFile)

        // check columns
        List<String> types = afrlex.getTypes()
        String[] colsNeeded = ["form", "msd_afrlex", "lemma", "lemma_src"]
        for (String col : colsNeeded) {
                if (!types.contains(col)) {
                        System.out.println("'"+col+"' column is missing in "+types+". Aborting.")
                        return null
                }
        }

        System.out.println("Recoding msd_afrlex with "+convNCA_CTX9+" in msd_cattex_conv1.")
        int n = afrlex.recodeEntryProperties("msd_afrlex", "msd_cattex_conv1", convNCA_CTX9, "abandon", true)
        if (n == -1) {
                System.out.println("** Conversion failed see errors logs")
                return null
        }
        System.out.println("Done, "+n+" lines recoded.")

        // this step uses convNCA_DMF: report the file that is really applied
        System.out.println("Recoding msd_afrlex with "+convNCA_DMF+" in msd_cattex_conv2.")
        n = afrlex.recodeEntryProperties("msd_afrlex", "msd_cattex_conv2", convNCA_DMF, "abandon", true)
        if (n == -1) {
                System.out.println("** Conversion failed see errors logs")
                return null
        }
        System.out.println("Done, "+n+" lines recoded.")

        System.out.println("Recoding 'lemma_src' with "+convLemmaSRC)
        n = afrlex.recodeEntryProperties("lemma_src", "lemma_src", convLemmaSRC, "supprimer", false)
        System.out.println("Done, "+n+" lines recoded.")

        // CREATE AFRLEXBULK and select best lemma using source and create the TL,GDF, PKAS, LFA, VFM, PVR and LMG columns
        println "Creating afrlexbulk: building the TL,GDF, PKAS, LFA, VFM, PVR and LMG columns"
        def functions = DictionaryManager.getInstance().getHSQLFunctions()
        if (functions.containsTable("afrlexbulk")) {
                functions.dropTable("afrlexbulk")
        }
        functions.createTable("afrlexbulk",
                ["form", "msd_afrlex", "lemma", "lemma_src", "msd_cattex_conv1", "msd_cattex_conv2", "TL", "GDF", "PKAS", "LFA", "VFM", "PVR", "LMG"])

        // lemma sources, from the most to the least preferred
        def sourceNames = ["TL", "GDF", "PKAS", "LFA", "VFM", "PVR", "LMG"]

        def bulkLines = afrlex.getOrderedLines("form", "msd_afrlex")
        PreparedStatement bulkInsert = functions.getConnection().prepareStatement(
                "INSERT INTO \"afrlexbulk\" VALUES ("+Dictionary.questionmarks[13]+")")
        try {
                def current = null      // [form, msd_afrlex] of the packet being accumulated
                def msd_cattex = null   // [msd_cattex_conv1, msd_cattex_conv2] of that packet
                def sources = null      // source name -> lemma for that packet
                int nUpdate = 0

                // writes the accumulated packet as one afrlexbulk line
                def flushPacket = {
                        // select the right lemma: TL > GDF > PKAS > LFA > VFM > PVR > LMG
                        // (when every source is empty this ends on "LMG" with an empty lemma)
                        def lemma = ""
                        def source = "TL"
                        for (String name : sourceNames) {
                                source = name
                                lemma = sources[name]
                                if (lemma.length() > 0) break
                        }
                        bulkInsert.setString(1, current[0])
                        bulkInsert.setString(2, current[1])
                        bulkInsert.setString(3, lemma)
                        bulkInsert.setString(4, source)
                        bulkInsert.setString(5, msd_cattex[0])
                        bulkInsert.setString(6, msd_cattex[1])
                        int column = 7
                        for (String name : sourceNames) {
                                bulkInsert.setString(column++, sources[name])
                        }
                        bulkInsert.executeUpdate()
                        nUpdate++
                        if (nUpdate%1000 == 0) bulkInsert.getConnection().commit()
                }

                while (bulkLines.next()) {
                        def line_key = [bulkLines.getObject("form"), bulkLines.getObject("msd_afrlex")]

                        if (current != line_key) {
                                // the signature changed: write the previous packet, then open a new one
                                if (current != null) flushPacket()
                                current = line_key
                                sources = ["TL":"", "GDF":"", "PKAS":"", "LFA":"", "VFM":"", "PVR":"", "LMG":""]
                                // taken from the first line of the packet, so the very first
                                // packet no longer gets null conversions
                                msd_cattex = [bulkLines.getObject("msd_cattex_conv1"), bulkLines.getObject("msd_cattex_conv2")]
                        }

                        //store the lemma + lemma_src
                        sources[bulkLines.getObject("lemma_src")] = bulkLines.getObject("lemma")
                }
                // write the last packet once every line has been accumulated; flushing
                // on isLast() before accumulating dropped the final line
                if (current != null) flushPacket()
                bulkInsert.getConnection().commit()
        } finally {
                bulkInsert.close()
                bulkLines.getStatement().close()
        }
        System.out.println("AFRlexBULK created.")

        // Clear&Update AFRLEX with the new columns
        afrlex.clear()
        println afrlex.getTypes()
        afrlex.insertValues("SELECT \"form\",\"msd_afrlex\",\"lemma\",\"lemma_src\",\"msd_cattex_conv1\",\"msd_cattex_conv2\" FROM \"afrlexbulk\";")
        afrlex.print(10)

        // Build CLAFRLEX & merge corresp columns if necessary
        println "Build CLAFRLEX using AFRLEXBULK"
        clafrlex.addTypes(["msd_afrlex", "lemma", "lemma_src", "msd_cattex_conv1", "msd_cattex_conv2", "TL", "GDF", "PKAS", "LFA", "VFM", "PVR", "LMG"])
        Statement statement = functions.getConnection().createStatement()
        String query = "SELECT * FROM \"afrlexbulk\" ORDER BY \""+["msd_afrlex", "lemma", "lemma_src", "msd_cattex_conv1", "msd_cattex_conv2", "TL", "GDF", "PKAS", "LFA", "VFM", "PVR", "LMG"].join("\",\"")+"\";"
        PreparedStatement lemmaInsert = null
        try {
                def lemmaLines = statement.executeQuery(query)
                lemmaInsert = functions.getConnection().prepareStatement(
                        "INSERT INTO \"clafrlex\" VALUES ("+Dictionary.questionmarks[12]+")")

                def current = null   // [msd_afrlex, lemma] of the packet being accumulated
                def sources = null   // source name -> set of distinct non-empty values
                // values of the latest line read for the packet (declared locally
                // so it no longer leaks into the script binding)
                def others = ["lemma_src":"", "msd_cattex_conv1":"", "msd_cattex_conv2": ""]

                // writes the accumulated packet as one clafrlex line
                def flushPacket = {
                        lemmaInsert.setString(1, current[0]) // msd_afrlex
                        lemmaInsert.setString(2, current[1]) // lemma
                        lemmaInsert.setString(3, others["lemma_src"]) // lemma_src
                        lemmaInsert.setString(4, others["msd_cattex_conv1"]) // msd_cattex_conv1
                        lemmaInsert.setString(5, others["msd_cattex_conv2"]) // msd_cattex_conv2
                        int column = 6
                        for (String name : sourceNames) {
                                lemmaInsert.setString(column++, sources[name].join("|"))
                        }
                        lemmaInsert.executeUpdate()
                }

                while (lemmaLines.next()) {
                        def line_key = [lemmaLines.getObject("msd_afrlex"), lemmaLines.getObject("lemma")]

                        if (current != line_key) {
                                // the signature changed: write the previous packet, then open a new one
                                if (current != null) flushPacket()
                                current = line_key
                                sources = ["TL":new HashSet(), "GDF":new HashSet(), "PKAS":new HashSet(), "LFA":new HashSet(), "VFM":new HashSet(), "PVR":new HashSet(), "LMG":new HashSet()]
                        }

                        //store the lemma + lemma_src
                        others = ["lemma_src":lemmaLines.getObject("lemma_src"), "msd_cattex_conv1":lemmaLines.getObject("msd_cattex_conv1"), "msd_cattex_conv2":lemmaLines.getObject("msd_cattex_conv2")]
                        for (String key : sources.keySet()) {
                                if (lemmaLines.getObject(key).length() > 0) sources[key] << lemmaLines.getObject(key)
                        }
                }
                // write the last packet, including its final line
                if (current != null) flushPacket()
                lemmaInsert.getConnection().commit()
        } finally {
                if (lemmaInsert != null) lemmaInsert.close()
                statement.close()
        }
        clafrlex.setPopulated(true) // set the dict as populated
        clafrlex.print(10)

        // write in files
        File tsvFile = new File(workingDirectory, "result/afrlex.tsv")
        File tsvFile2 = new File(workingDirectory, "result/clafrlex.tsv")
        if (afrlex.print(tsvFile, -1, "form", "FRENCH 3") && clafrlex.print(tsvFile2, -1, "lemma", "FRENCH 3")) {
                System.out.println("afrlex exported in "+tsvFile.getAbsolutePath())
                System.out.println("clafrlex exported in "+tsvFile2.getAbsolutePath())
                return [afrlex, clafrlex]
        } else {
                System.out.println("Fail to export afrlex.")
                return null
        }
}
363

    
364
/**
 * Builds the "cldmflex" and "dmflex" dictionaries from cldmf.tsv and dmf.tsv.
 *
 * Both dictionaries are loaded, checked for their required columns and have
 * their category recoded with "conv dmf ctx9-dmf.tsv" into msd_cattex_conv2.
 * Only dmflex is exported, to result/dmflex.tsv.
 *
 * @param workingDirectory directory containing the input files
 * @return the list [dmflex, cldmflex], or null when an input file or a column
 *         is missing, a conversion returns -1 or the export fails
 */
def buildDMFlex(File workingDirectory) {
        System.out.println("** Building CLDMFlex and DMFlex...")
        DictionaryManager.getInstance().remove("dmflex")
        DictionaryManager.getInstance().remove("cldmflex")

        // every input must be present before anything is loaded
        File dmfSource = new File(workingDirectory, "dmf.tsv")
        File cldmfSource = new File(workingDirectory, "cldmf.tsv")
        File categoryRules = new File(workingDirectory, "conv dmf ctx9-dmf.tsv")
        File lemmaSrcRules = new File(workingDirectory, "conv dmf-src frolex-src.tsv")
        for (File input : [dmfSource, cldmfSource, categoryRules, lemmaSrcRules]) {
                if (!input.exists()) {
                        System.out.println("Can't find "+input)
                        return null
                }
        }

        // --- lemma-level dictionary ---
        Dictionary cldmflex = DictionaryManager.getInstance().getDictionary("cldmflex")
        System.out.println("Loading data from "+cldmfSource)
        cldmflex.loadFromTSVFile(cldmfSource)

        List<String> lemmaColumns = cldmflex.getTypes()
        List<String> lemmaRequired = ["lemma", "category", "lemma_source",
                "DEAF", "DECT", "FEW", "GDF", "GDC", "HUG", "TL", "TLF"]
        String absent = lemmaRequired.find { !lemmaColumns.contains(it) }
        if (absent != null) {
                System.out.println("'"+absent+"' column is missing in "+lemmaColumns+". Aborting.")
                return null
        }
        cldmflex.renameType("lemma_source", "lemma_src")
        cldmflex.renameType("category", "msd_cattex_conv2")

        System.out.println("Recoding data with "+categoryRules)
        int recoded = cldmflex.recodeEntryProperties("msd_cattex_conv2", "msd_cattex_conv2", categoryRules, "abandon", true)
        if (recoded == -1) {
                System.out.println("** Conversion failed see errors logs")
                return null
        }
        System.out.println("Done, "+recoded+" lines recoded.")

        // --- form-level dictionary ---
        Dictionary dmflex = DictionaryManager.getInstance().getDictionary("dmflex")
        System.out.println("Loading data from "+dmfSource)
        dmflex.loadFromTSVFile(dmfSource)

        List<String> formColumns = dmflex.getTypes()
        List<String> formRequired = ["form", "category", "lemma", "F_dmf"]
        absent = formRequired.find { !formColumns.contains(it) }
        if (absent != null) {
                System.out.println("'"+absent+"' column is missing in "+formColumns+". Aborting.")
                return null
        }

        System.out.println("Recoding data with "+categoryRules)
        recoded = dmflex.recodeEntryProperties("category", "msd_cattex_conv2", categoryRules, "abandon", true)
        if (recoded == -1) {
                System.out.println("** Conversion failed see errors logs")
                return null
        }
        System.out.println("Done, "+recoded+" lines recoded.")

        System.out.println("DMFlex created: "+dmflex.getSize()+" entries.")
        dmflex.print(5)

        DictionaryManager.getInstance().getHSQLFunctions().getConnection().commit()

        // export, bailing out first on failure
        File exported = new File(workingDirectory, "result/dmflex.tsv")
        if (!dmflex.print(exported, -1, "form", "FRENCH 3")) {
                System.out.println("Fail to export dmflex.")
                return null
        }
        System.out.println("dmflex exported in "+exported.getAbsolutePath())
        return [dmflex, cldmflex]
}
452

    
453
def buildBFMlex(File workingDirectory, MainCorpus bfmCorpus) {
454
def functions = DictionaryManager.getInstance().getHSQLFunctions()
455
        System.out.println("** Building BFMlex...")
456
        DictionaryManager.getInstance().remove("bfmlex")
457
        DictionaryManager.getInstance().remove("clbfmlex")
458

    
459
        File convCTX9NCA = new File(workingDirectory, "conv ctx9 ctx9-nca.tsv")
460
        if (!convCTX9NCA.exists()) {
461
                System.out.println("Can't find "+convCTX9NCA)
462
                return null
463
        }
464

    
465
        File convCTX9DMF = new File(workingDirectory, "conv ctx9 ctx9-dmf.tsv")
466
        if (!convCTX9DMF.exists()) {
467
                System.out.println("Can't find "+convCTX9DMF)
468
                return null
469
        }
470

    
471
        File convPunctLemma = new File(workingDirectory, "conv punct lemma.tsv")
472
        if (!convPunctLemma.exists()) {
473
                System.out.println("Can't find "+convPunctLemma)
474
                return null
475
        }
476

    
477
        File convNumLemma = new File(workingDirectory, "conv num lemma.tsv")
478
        if (!convNumLemma.exists()) {
479
                System.out.println("Can't find "+convNumLemma)
480
                return null
481
        }
482

    
483
        File outfile = new File(workingDirectory, "bfmlex.tsv")
484

    
485
        if (Toolbox.isInitialized()) {
486
                System.out.println("Creating Index of [pos!=\"pon\"] with corpus BFM2016LEX")
487
                ArrayList<Property> props = new ArrayList<Property>()
488
                Property posProperty = bfmCorpus.getProperty("pos")
489
                if (posProperty == null) {
490
                        System.out.println("No 'pos' property in BFM2016LEX corpus. Aborting.")
491
                        return null
492
                }
493
                Property lemmaProperty = bfmCorpus.getProperty("lemma")
494
                if (lemmaProperty == null) {
495
                        System.out.println("No 'lemma' property in BFM2016LEX corpus. Aborting.")
496
                        return null
497
                }
498
                Property lemmaSrcProperty = bfmCorpus.getProperty("lemma_src")
499
                if (lemmaSrcProperty == null) {
500
                        System.out.println("No 'lemma_src' property in BFM2016LEX corpus. Aborting.")
501
                        return null
502
                }
503
                props.add(bfmCorpus.getProperty("word"))
504
                props.add(posProperty)
505
                props.add(lemmaProperty)
506
                props.add(lemmaSrcProperty)
507
                Index index = new Index(bfmCorpus, new Query("[pos!=\"pon\"]"), props)
508

    
509
                index.toTSVDictionnary(outfile, "\t", "UTF-8")
510
        }
511

    
512
        Dictionary bfmlex = DictionaryManager.getInstance().getDictionary("bfmlex")
513
        Dictionary clbfmlex = DictionaryManager.getInstance().getDictionary("clbfmlex")
514

    
515
        if (!outfile.exists()) {
516
                System.out.println("Error: BFM lex file not created: "+outfile)
517
                return null
518
        }
519

    
520
        System.out.println("Loading BFMLEX data from "+outfile)
521
        bfmlex.loadFromTSVFile(outfile)
522

    
523
        // check columns
524
        List<String> types = bfmlex.getTypes()
525
        String[] colsNeeded = ["word", "pos", "lemma", "lemma_src", "F"]
526
        for (String col : colsNeeded) {
527
                if (!types.contains(col)) {
528
                        System.out.println("'"+col+"' column is missing in "+types+". Aborting.")
529
                        return null
530
                }
531
        }
532

    
533
        System.out.println("Converting 'NA' & 'num' to '<no_pos>'")
534
        LinkedHashMap<Pattern, Serializable[]> conversion_rules = new LinkedHashMap<Pattern, Serializable[]>()
535
        String[] values = ["<no_pos>"]
536
        conversion_rules.put(Pattern.compile("NA|num|"), values)
537
        int n = bfmlex.recodeEntryProperties("pos", "pos", conversion_rules, "copier", true) // keep the other lines
538
        System.out.println("Done, "+n+" lines recoded.")
539

    
540
        System.out.println("Set punctuation lemmas")
541
        n = bfmlex.recodeEntryProperties("word", "lemma", convPunctLemma, Dictionary.COPYDEST, true)
542
        if (n == -1) {
543
                System.out.println("** Conversion failed see errors logs")
544
                return null
545
        }
546
        System.out.println("Done, "+n+" lines recoded.")
547

    
548
        System.out.println("Set num lemmas")
549
        n = bfmlex.recodeEntryProperties("word", "lemma", convNumLemma, Dictionary.COPYDEST, true) // copy previously  created lemma
550
        System.out.println("Done, "+n+" lines recoded.")
551

    
552
        System.out.println("Converting '' lemma to '<no_lemma>' lemma")
553
        LinkedHashMap<Pattern, Serializable[]> conversion_rules2 = new LinkedHashMap<Pattern, Serializable[]>()
554
        String[] values2 = ["<no_lemma>"]
555
        conversion_rules2.put(Pattern.compile(""), values2)
556
        int n2 = bfmlex.recodeEntryProperties("lemma", "lemma", conversion_rules2, "copier", true) // copy previously  created lemma
557
        System.out.println("Done, "+n2+" lines recoded.")
558

    
559
        HashSet<String> wrongMSD = new HashSet<String>()
560
        wrongMSD.add("PONbfl")
561
        wrongMSD.add("PONpfbl")
562
        wrongMSD.add("PONfbfl")
563
        n = 0
564
        System.out.println("Removing lines with errors: "+wrongMSD)
565
        for (String p : wrongMSD) {
566
                n += bfmlex.removeEntries("pos", p)
567
        }
568
        if (n > 0) System.out.println("Done, "+n+" lines removed.")
569

    
570
        System.out.println("Remove non-alone '<no_pos>' lines.")
571
        n = bfmlex.removeNonAloneLines("word", "pos", "<no_pos>")
572
        System.out.println("Number of lines removed: "+n)
573
        System.out.println("  left:"+bfmlex.getSize())
574

    
575
        System.out.println("Creating msd_cattex_conv1 using "+convCTX9NCA)
576
        n = bfmlex.recodeEntryProperties("pos", "msd_cattex_conv1", convCTX9NCA, "copier", true)
577
        System.out.println("Done, "+n+" lines recoded.")
578

    
579
        System.out.println("Creating msd_cattex_conv2 using "+convCTX9DMF)
580
        n = bfmlex.recodeEntryProperties("pos", "msd_cattex_conv2", convCTX9DMF, "copier", true)
581
        System.out.println("Done, "+n+" lines recoded.")
582

    
583
        System.out.println("Renaming columns...")
584
        bfmlex.renameType("word", "form")
585
        bfmlex.renameType("pos", "msd_cattex")
586
        bfmlex.renameType("F", "F_bfm")
587

    
588
        System.out.println("BFMlex created: "+bfmlex.getSize()+" entries.")
589
        bfmlex.print(10)
590

    
591
        DictionaryManager.getInstance().getHSQLFunctions().getConnection().commit()
592
        
593
        // Build CLBFMLEX & merge corresp columns if necessary
594
        println "Build CLBFMLEX using BFMLEX"
595
        clbfmlex.addTypes(["msd_cattex", "msd_cattex_conv1", "msd_cattex_conv2", "lemma", "lemma_src", "DMF", "DECT", "TL", "BFM", "GDF", "AND"])
596
        Statement statement = functions.getConnection().createStatement();
597
        String query = "SELECT * FROM \"bfmlex\" ORDER BY \""+["msd_cattex", "lemma"].join("\",\"")+"\";";
598
        result = statement.executeQuery(query);
599
        batchquery = "INSERT INTO \"clbfmlex\" VALUES ("+Dictionary.questionmarks[11]+")";
600
        ps = functions.getConnection().prepareStatement(batchquery);
601
        current = null
602
        sources = ["DMF":new HashSet(), "DECT":new HashSet(), "TL":new HashSet(), "BFM":new HashSet(), "GDF":new HashSet(), "AND":new HashSet()]
603
        others = ["msd_cattex_conv1":"", "msd_cattex_conv2": ""]
604
        while (result.next()) {
605
                def line_key = [result.getObject("msd_cattex"), result.getObject("lemma")]
606
                
607
                // process the packet with the current line signature is different than the previous ones
608
                if (current == null) current = line_key
609
                if (current != line_key || result.isLast()) { // process the line        
610
                        
611
                        String lemma_src = "";
612
                        if (sources["BFM"].size() > 0) lemma_src = "BFM"
613
                        else if (sources["DMF"].size() > 0) lemma_src = "DMF"
614
                        else if (sources["TL"].size() > 0) lemma_src = "TL"
615
                        else if (sources["DECT"].size() > 0) lemma_src = "DECT"
616
                        else if (sources["GDF"].size() > 0) lemma_src = "GDF"
617
                        else if (sources["AND"].size() > 0) lemma_src = "AND"
618
                        
619
                        ps.setString(1, current[0]); // msd_cattex
620
                        ps.setString(2, others["msd_cattex_conv1"]);
621
                        ps.setString(3, others["msd_cattex_conv2"]);
622
                        ps.setString(4, current[1]); // lemma
623
                        ps.setString(5, lemma_src); // lemma_src
624
                        ps.setString(6, sources["DMF"].join("|"));
625
                        ps.setString(7, sources["DECT"].join("|"));
626
                        ps.setString(8, sources["TL"].join("|"));
627
                        ps.setString(9, sources["BFM"].join("|"));
628
                        ps.setString(10, sources["GDF"].join("|"));
629
                        ps.setString(11, sources["AND"].join("|"));
630
                        
631
                        ps.executeUpdate();  
632
                        
633
                        // set the new packet signature
634
                        sources = ["DMF":new HashSet(), "DECT":new HashSet(), "TL":new HashSet(), "BFM":new HashSet(), "GDF":new HashSet(), "AND":new HashSet()]
635
                        others = ["msd_cattex_conv1":"", "msd_cattex_conv2": ""]
636
                        current = line_key
637
                }
638
                
639
                //store the lemma + lemma_src
640
                others = ["msd_cattex_conv1":result.getObject("msd_cattex_conv1"), "msd_cattex_conv2":result.getObject("msd_cattex_conv2")]
641
                String src = result.getObject("lemma_src");
642
                if (sources.containsKey(src)) {
643
                        sources[src] << result.getObject("lemma");
644
                }
645
        }
646
        ps.close()
647
        statement.close()
648
        clbfmlex.setPopulated(true) // set the dict as populated
649
        clbfmlex.print(10)
650
        
651
        File tsvFile = new File(workingDirectory, "result/bfmlex.tsv")
652
        File tsvFile2 = new File(workingDirectory, "result/clbfmlex.tsv")
653
        if (bfmlex.print(tsvFile, -1, "form", "FRENCH 3") 
654
                && clbfmlex.print(tsvFile2, -1, "lemma", "FRENCH 3")) {
655
                System.out.println("bfmlex exported in "+tsvFile.getAbsolutePath())
656
                return [bfmlex, clbfmlex]
657
        } else {
658
                System.out.println("Fail to export bfmlex.")
659
                return null
660
        }
661
}
662

    
663
/**
 * Builds the "frolex" (forms) and "clfrolex" (lemmas) dictionaries from the
 * AFRlex, BFMlex and DMFlex dictionaries and exports them as TSV files.
 *
 * Steps performed:
 * 1. check that the conversion files exist in the working directory;
 * 2. recreate "clfrolex", fill it from clbfmlex, clafrlex and cldmflex, then
 *    merge the rows sharing the same (lemma, msd_cattex_conv2) pair;
 * 3. recreate "frolex", fill it from bfmlex, afrlex and dmflex, then merge the
 *    rows sharing the same (form, msd_cattex_conv2) pair;
 * 4. remove the non-alone 'OUT' and '<no_pos>' lines, recode "lemma_src";
 * 5. commit and export result/frolex.tsv and result/clfrolex.tsv.
 *
 * Merging streams the rows sorted by key and writes one row per key into a
 * temporary "tmp" table that then replaces the dictionary table. A packet is
 * flushed when the key changes and once more when the rows are exhausted, so
 * the last row of the table is merged and written like any other.
 *
 * @param workingDirectory directory containing "conv nca ctx9-dmf.tsv",
 *        "merge frolex-src.tsv" and receiving the "result/*.tsv" exports
 * @param afrlex   source of the "msd_afrlex" form entries
 * @param clafrlex lemma dictionary inserted into clfrolex
 * @param bfmlex   source of the "F_bfm"/"msd_bfm" form entries
 * @param clbfmlex lemma dictionary inserted into clfrolex
 * @param dmflex   source of the "F_dmf"/"msd_dmf" form entries
 * @param cldmflex lemma dictionary inserted into clfrolex
 * @return the frolex dictionary, or null if a required file is missing or
 *         if one of the TSV exports fails
 */
Dictionary buildFROlex(File workingDirectory, Dictionary afrlex, Dictionary clafrlex, Dictionary bfmlex, Dictionary clbfmlex, Dictionary dmflex, Dictionary cldmflex) {
        System.out.println("** Building FROlex...")
        def functions = DictionaryManager.getInstance().getHSQLFunctions()

        // NOTE(review): this file is only checked for existence, it is not read in this function
        File convCTX9NCA = new File(workingDirectory, "conv nca ctx9-dmf.tsv")
        if (!convCTX9NCA.exists()) {
                System.out.println("Can't find "+convCTX9NCA)
                return null
        }

        // Added AL
        File convLemmaSRC2 = new File(workingDirectory, "merge frolex-src.tsv")
        if (!convLemmaSRC2.exists()) {
                System.out.println("Can't find "+convLemmaSRC2)
                return null
        }

        // start from fresh dictionaries
        DictionaryManager.getInstance().remove("frolex")
        DictionaryManager.getInstance().remove("clfrolex")

        Dictionary frolex = DictionaryManager.getInstance().getDictionary("frolex")
        Dictionary clfrolex = DictionaryManager.getInstance().getDictionary("clfrolex")

        // CLFROLEX
        System.out.println("clfrolex types: "+clfrolex.getTypes())
        clfrolex.addTypes(              Arrays.asList("msd_cattex_conv2", "lemma", "lemma_src", "AND", "TL", "DEAF", "DECT", "FEW", "GDF", "GDC", "HUG", "TLF", "BFM", "PKAS", "LFA", "VFM", "PVR", "LMG", "DMF"))
        System.out.println("clfrolex types: "+clfrolex.getTypes())
        // in the lists below "" marks a clfrolex column that has no counterpart in the inserted dictionary
        println "Insert values of clbfmlex"
        clfrolex.insertValues(clbfmlex, Arrays.asList("msd_cattex_conv2", "lemma", "lemma_src", "AND", "TL", "",     "DECT", "",    "GDF", "",    "",    "",    "BFM", "",     "",    "",    "",    "",    "DMF"))
        println "Insert values of clafrlex"
        clfrolex.insertValues(clafrlex, Arrays.asList("msd_cattex_conv2", "lemma", "lemma_src", "",    "TL", "",     "",     "",    "GDF", "",    "",    "",    "",    "PKAS", "LFA", "VFM", "PVR", "LMG", ""))
        println "Insert values of cldmflex"
        clfrolex.insertValues(cldmflex, Arrays.asList("msd_cattex_conv2", "lemma", "lemma_src", "AND", "TL", "DEAF", "DECT", "FEW", "GDF", "GDC", "HUG", "TLF", "",    "",     "",    "",    "",    "",    ""))

        System.out.println("Merging lines with lemma and msd_cattex_conv2.")

        def clcolumns = ["msd_cattex_conv2", "lemma", "lemma_src",
                "AND","TL","DEAF","DECT","FEW","GDF","GDC","HUG","TLF","BFM","PKAS","LFA","VFM","PVR","LMG","DMF"]
        // every column except the two key columns
        def othersclcolumns = ["lemma_src",
                "AND","TL","DEAF","DECT","FEW","GDF","GDC","HUG","TLF","BFM","PKAS","LFA","VFM","PVR","LMG","DMF"]

        int countCLLinesBefore = clfrolex.getSize()

        Statement clstatement = functions.getConnection().createStatement()
        def clInsert = null
        try {
                String clquery = "SELECT * FROM \"clfrolex\" ORDER BY \""+["lemma", "msd_cattex_conv2"].join("\",\"")+"\";"
                def clRows = clstatement.executeQuery(clquery)
                if (functions.containsTable("tmp")) functions.dropTable("tmp") // drop table
                functions.createTable("tmp", clcolumns)

                clInsert = functions.getConnection().prepareStatement("INSERT INTO \"tmp\" VALUES ("+Dictionary.questionmarks[19]+")")

                def clKey = null // [lemma, msd_cattex_conv2] of the packet being accumulated
                def clValues = [:]
                for (def col : othersclcolumns) clValues[col] = new HashSet()
                while (true) {
                        boolean hasRow = clRows.next()
                        def rowKey = hasRow ? [clRows.getObject("lemma"), clRows.getObject("msd_cattex_conv2")] : null

                        // write the accumulated packet when the key changes or when there is no more row
                        if (clKey != null && (!hasRow || clKey != rowKey)) {
                                //println " merging: $clKey with $clValues"
                                clInsert.setString(1, clKey[1]) // msd_cattex_conv2
                                clInsert.setString(2, clKey[0]) // lemma
                                int icol = 3
                                for (def col : othersclcolumns) {
                                        // the empty value is dropped only when another value exists
                                        if (clValues[col].size() > 1) clValues[col].removeAll("")
                                        clInsert.setString(icol, clValues[col].join("|"))
                                        icol++
                                }
                                clInsert.executeUpdate()

                                for (def col : othersclcolumns) clValues[col] = new HashSet()
                        }
                        if (!hasRow) break

                        // store the other columns values of the current row
                        clKey = rowKey
                        for (def col : othersclcolumns) clValues[col] << clRows.getObject(col)
                }
        } finally {
                if (clInsert != null) clInsert.close()
                clstatement.close()
        }
        // replace the dictionary table with the merged one
        functions.dropTable("clfrolex")
        functions.renameTable("tmp", "clfrolex")

        int countCLLinesAfter = clfrolex.getSize()
        println "Number of lines merged: "+(countCLLinesBefore-countCLLinesAfter)

        System.out.println("Done clfrolex size:"+clfrolex.getSize()+"\n")

        // FROLEX
        System.out.println("frolex types: "+frolex.getTypes())
        frolex.addTypes(            Arrays.asList("form", "F_bfm", "F_dmf", "msd_afrlex", "msd_bfm",    "msd_dmf",  "msd_cattex_conv1", "msd_cattex_conv2", "lemma", "lemma_src", "comment"))
        System.out.println("frolex types: "+frolex.getTypes())
        println "Insert values of bfmlex"
        frolex.insertValues(bfmlex, Arrays.asList("form", "F_bfm", "",      "",           "msd_cattex", "",         "msd_cattex_conv1", "msd_cattex_conv2", "lemma", "lemma_src", ""))
        System.out.println("Done frolex size:"+frolex.getSize())
        println "Insert values of afrlex"
        frolex.insertValues(afrlex, Arrays.asList("form", "",      "",      "msd_afrlex", "",           "",         "msd_cattex_conv1", "msd_cattex_conv2", "lemma", "lemma_src", ""))
        System.out.println("Done frolex size:"+frolex.getSize())
        println "Insert values of dmflex"
        frolex.insertValues(dmflex, Arrays.asList("form", "",      "F_dmf", "",           "",           "category", "",                 "msd_cattex_conv2", "lemma", "lemma_src", ""))
        System.out.println("Done frolex size:"+frolex.getSize()+"\n")

        System.out.println("Merging lines with form and msd_cattex_conv.")

        Statement statement = functions.getConnection().createStatement()
        def insert = null
        try {
                String query = "SELECT * FROM \"frolex\" ORDER BY \""+["form", "msd_cattex_conv2"].join("\",\"")+"\";"
                def rows = statement.executeQuery(query)
                if (functions.containsTable("tmp")) functions.dropTable("tmp") // drop table
                functions.createTable("tmp", ["form","F_bfm","F_dmf","msd_afrlex","msd_bfm","msd_dmf",
                                        "msd_cattex_conv1","msd_cattex_conv2","lemma","lemma_src","comment"])

                insert = functions.getConnection().prepareStatement("INSERT INTO \"tmp\" VALUES ("+Dictionary.questionmarks[11]+")")

                // empty accumulator: the F_* columns are lists (seeded with 0) that get summed, the others are sets of distinct values
                def newPacket = { -> ["F_bfm":[0], "F_dmf": [0], "msd_afrlex":new HashSet(), "msd_bfm":new HashSet(), "msd_dmf":new HashSet(), "msd_cattex_conv1":new HashSet(), "lemma":new HashSet(), "lemma_src":new HashSet(), "comment":new HashSet()] }

                def formKey = null // [form, msd_cattex_conv2] of the packet being accumulated
                def packet = newPacket()
                while (true) {
                        boolean hasRow = rows.next()
                        def rowKey = hasRow ? [rows.getObject("form"), rows.getObject("msd_cattex_conv2")] : null

                        // write the accumulated packet when the key changes or when there is no more row
                        if (formKey != null && (!hasRow || formKey != rowKey)) {
                                // the empty value is dropped only when another value exists
                                for (String k : packet.keySet()) if (packet[k].size() > 1 && !k.startsWith("F_")) packet[k].removeAll("")

                                insert.setString(1, formKey[0]) // form
                                insert.setInt(2, packet["F_bfm"].sum()) // F_bfm
                                insert.setInt(3, packet["F_dmf"].sum()) // F_dmf
                                insert.setString(4, packet["msd_afrlex"].join("|")) // msd_afrlex
                                insert.setString(5, packet["msd_bfm"].join("|")) // msd_bfm
                                insert.setString(6, packet["msd_dmf"].join("|")) // msd_dmf
                                // the "<no_pos>" and "<no_lemma>" placeholders are dropped when another value exists
                                if (packet["msd_cattex_conv1"].size() > 1) packet["msd_cattex_conv1"].removeAll("<no_pos>")
                                insert.setString(7, packet["msd_cattex_conv1"].join("|")) // msd_cattex_conv1
                                insert.setString(8, formKey[1]) // msd_cattex_conv2
                                if (packet["lemma"].size() > 1) packet["lemma"].removeAll("<no_lemma>")
                                insert.setString(9, packet["lemma"].join("|")) // lemma
                                insert.setString(10, packet["lemma_src"].join("|")) // lemma_src
                                insert.setString(11, packet["comment"].join("|")) // comment
                                insert.executeUpdate()

                                packet = newPacket()
                        }
                        if (!hasRow) break

                        // store the other columns values of the current row
                        formKey = rowKey
                        for (String k : packet.keySet()) {
                                packet[k] << rows.getObject(k)
                        }
                }
        } finally {
                if (insert != null) insert.close()
                statement.close()
        }
        // replace the dictionary table with the merged one
        functions.dropTable("frolex")
        functions.renameTable("tmp", "frolex")
        System.out.println("Done frolex size:"+frolex.getSize()+"\n")

        System.out.println("Remove non-alone msd_cattex_conv2='OUT' lines.")
        int n = frolex.removeNonAloneLines("form", "msd_cattex_conv2", "OUT")
        System.out.println("Number of lines removed: "+n+"  left:"+frolex.getSize()+"\n")

        System.out.println("Remove non-alone msd_cattex_conv2='<no_pos>' lines.")
        int n2 = frolex.removeNonAloneLines("form", "msd_cattex_conv2", "<no_pos>")
        System.out.println("Number of lines removed: "+n2+"  left:"+frolex.getSize()+"\n")

        // Added AL
        System.out.println("Recoding 'lemma_src' with "+convLemmaSRC2)
        int n3 = frolex.recodeEntryProperties("lemma_src", "lemma_src", convLemmaSRC2, "abandon", false)
        System.out.println("Done, "+n3+" lines recoded.")
        // end added AL

        DictionaryManager.getInstance().getHSQLFunctions().getConnection().commit()

        File tsvFile = new File(workingDirectory, "result/frolex.tsv")
        File tsvFile2 = new File(workingDirectory, "result/clfrolex.tsv")

        if (frolex.print(tsvFile, -1, "form", "FRENCH 3", true) && clfrolex.print(tsvFile2, -1, "lemma", "FRENCH 3", true)) {
                System.out.println("FROLEX exported in "+tsvFile.getAbsolutePath())
                System.out.println("CLFROLEX exported in "+tsvFile2.getAbsolutePath())
                return frolex
        } else {
                System.out.println("Fail to export frolex.")
                return null
        }
}
856

    
857
/* SQL QUERIES
858
compare the 'category' values of dmf_site with those of dmf (lgerm)
859
SELECT DISTINCT "category" FROM "dmfsite" WHERE "category" not in (SELECT DISTINCT "category" FROM "dmf")
860

861

862
list the AFRLEX lemmas already referenced in the DMF
863
SELECT * FROM "clfrolex" WHERE "lemma_src"='TL' AND "lemma" IN (SELECT "TL" FROM "clfrolex" WHERE "lemma_src"='DMF' AND "TL" != '')
864

865
delete the AFRLEX lemmas already referenced in the DMF
866
DELETE FROM "clfrolex" WHERE "lemma_src"='TL' AND "lemma" IN (SELECT "TL" FROM "clfrolex" WHERE "lemma_src"='DMF' AND "TL" != '')
867
*/