Statistics
| Revision:

root / tmp / org.txm.dictionary.rcp / src / org / txm / dictionary / functions / tsv / BuildFroLex.groovy @ 470

History | View | Annotate | Download (4.1 kB)

1
package org.txm.dictionary.functions.tsv
2

    
3
// Output location: the merged lexicon is written to "frolex.tsv" inside the working directory.
File workingDirectory = new File("/home/mdecorde/TXM/results/lexique BFM")
File frolextsv = new File(workingDirectory, "frolex.tsv")

/*
 * Sample of a tab-separated layout, kept from the original author's notes:
 *
 *   form    msd_afrlex    lemma     lemma_src
 *   !       PON           !
 *   "       PON:cit       QUOTED
 */

// Build both source lexicons. They are assigned without `def`, so they live in the script
// binding, where the insertAFRData()/insertBFMData() methods of this script read them.
println "** BUILD BFM DICT"
bfmlex = new BuildBFMLex().run()
println "** BUILD AFR DICT"
afrlex = new BuildAFRLex().run()

// Switch to true to dump intermediate data on standard output.
def DEBUG = false

if (DEBUG) {
    println "BFMLEX: "+bfmlex.size()
    bfmlex.each { println it }
    println "AFRLEX: "+afrlex.size()
    afrlex.each { println it }
}

// Column names. Only frocols is used by the visible script (as the TSV header); the other two
// presumably document the entry layout returned by the two builders -- TODO confirm.
bfmcols = [
    "form",
    "msd_bfm",
    "F_bfm",
    "msd_cattex_conv"
]
afrcols = [
    "form",
    "msd_afrlex",
    "lemma",
    "lemma-source",
    "msd_cattex_conv"
]
frocols = [
    "form",
    "F_bfm",
    "msd_afrlex",
    "msd_bfm",
    "msd_cattex_conv",
    "lemma",
    "lemma_src",
    "comment"
]
26

    
27
// Merge the BFM and AFR lexicons into the script-level set `all` (binding variable, also filled
// by insertAFRData()/insertBFMData()). Both lists are consumed from the front, two heads at a time.
// NOTE(review): the walk only pairs entries correctly if both lists are ordered by form and then
// by converted CATTEX tag -- that ordering is assumed here, confirm in the two builders.
int mergeCounter = 0
all = new HashSet()
println "** MERGING BFM DICT WITH AFR DICT"
while (bfmlex.size() > 0 && afrlex.size() > 0) {
    def dafr = afrlex[0]
    def dbfm = bfmlex[0]
    if (DEBUG) println "data="+dafr+","+dbfm

    boolean sameForm = (dafr[0] == dbfm[0])
    if (sameForm && dafr[4] == dbfm[3]) { // same form and same converted tag --> MERGE
        // Row follows the frocols column order.
        def current = [
            dafr[0], // form (identical on both sides)
            dbfm[2], // F_bfm
            dafr[1], // msd_afrlex
            dbfm[1], // msd_bfm
            dafr[4], // msd_cattex_conv
            dafr[2], // lemma
            dafr[3], // lemma_src
            ""       // comment
        ]

        afrlex.remove(0)
        bfmlex.remove(0)
        mergeCounter++
        if (DEBUG) println "MERGE: "+current
        all.add(current)
    } else {
        // No merge possible: emit the head that sorts first. Heads with the same form are
        // ordered by converted tag, heads with different forms by the form itself.
        def order = sameForm ? dafr[4].compareTo(dbfm[3]) : dafr[0].compareTo(dbfm[0])
        if (order < 0) {
            insertAFRData()
        } else {
            insertBFMData()
        }
    }
}

// At most one of the two lists still holds entries; flush it unchanged.
if (afrlex.size() > 0) println "add remaining "+afrlex.size()+" AFRlex entries..."
while (afrlex.size() > 0) {
    insertAFRData()
}

if (bfmlex.size() > 0) println "add remaining "+bfmlex.size()+" BFM lex entries..."
while (bfmlex.size() > 0) {
    insertBFMData()
}

if (DEBUG) println frocols
if (DEBUG) for (def d : all) println d

println "Done, "+mergeCounter+" entries merged"
86

    
87
// Copy the merged set into a list and order it in place by form (column 0).
println "sorting..."
def data = new ArrayList(all)
data.sort { left, right -> left[0].compareTo(right[0]) }
93

    
94
// For every form that has more than one row, drop the rows whose msd_bfm column (index 3)
// is "_nopos_". Relies on `data` being grouped by form, which the preceding sort provides.
println "Remove _nopos_ not alone..."
def NOPOS = "_nopos_"
String currentForm = null
def group = []          // rows of the form currently being scanned
def groupPositions = [] // positions of those rows in `data`
def toRemove = []       // positions in `data` to delete, in ascending order

// Flags the _nopos_ rows of the current group, but only when the group has several rows.
def flushGroup = {
    if (currentForm != null && group.size() > 1) {
        group.eachWithIndex { member, k ->
            if (member[3] == NOPOS) {
                toRemove << groupPositions[k]
            }
        }
    }
}

data.eachWithIndex { entry, pos ->
    if (entry[0] != currentForm) { // a new form starts: close the previous group
        flushGroup()
        currentForm = entry[0]
        group.clear()
        groupPositions.clear()
    }
    group << entry
    groupPositions << pos
}
flushGroup() // the last group is not closed by the loop

if (DEBUG) toRemove.each { println data[it] }

// Delete from the highest position down so pending positions are not shifted.
// With an Integer argument Groovy resolves List.remove to the by-index overload.
toRemove.reverseEach { data.remove(it) }

println ""+toRemove.size()+" _nopos_ removed."
println "Done: "+data.size()+" entries"
if (DEBUG) data.each { println it }
133

    
134

    
135
// Write the lexicon as a UTF-8 TSV file: one header line built from frocols, then one
// tab-joined line per entry of `data`. withWriter closes the file when the closure returns.
println "Writing result in $frolextsv"
frolextsv.withWriter("UTF-8") { writer ->
    writer.println frocols.join("\t")
    for (def d : data) writer.println d.join("\t")
}
// Fixed: the original message lacked the space before "entries." ("Done: 123entries.").
println "Done: "+data.size()+" entries."
141

    
142
/**
 * Removes the first entry of the script-level {@code afrlex} list and adds it to the
 * script-level {@code all} set as an 8-column row. Columns with no AFR value stay empty strings.
 *
 * @return the value returned by {@code all.add(...)}
 */
def insertAFRData() {
    // remove(0) deletes the head entry and hands it back
    def head = afrlex.remove(0)

    all.add([
        head[0], // FORM
        "",
        head[1], // MSD NCA
        "",
        head[4], // MSD CTX
        head[2], // LEMMA
        head[3], // LEMMA SRC
        ""
    ])
}
156

    
157
/**
 * Removes the first entry of the script-level {@code bfmlex} list and adds it to the
 * script-level {@code all} set as an 8-column row. Columns with no BFM value stay empty strings.
 *
 * @return the value returned by {@code all.add(...)}
 */
def insertBFMData() {
    // remove(0) deletes the head entry and hands it back
    def head = bfmlex.remove(0)

    all.add([
        head[0], // FORM
        head[2], // FREQ
        "",
        head[1], // CTX
        head[3], // NCA CONV
        "",
        "",
        ""
    ])
}