Statistics
| Revision:

root / tmp / org.txm.dictionary.rcp / src / org / txm / macro / frolex / LGerm2CLDMFTSVMacro.groovy @ 1688

History | View | Annotate | Download (5.8 kB)

1
package org.txm.macro.frolex
2

    
3
import java.io.File
4
import java.io.IOException
5
import java.io.InputStream
6
import java.io.PrintWriter
7
import java.net.URL
8
import java.util.Arrays
9
import java.util.LinkedHashMap
10

    
11
import javax.xml.stream.XMLInputFactory
12
import javax.xml.stream.XMLStreamConstants
13
import javax.xml.stream.XMLStreamException
14
import javax.xml.stream.XMLStreamReader
15

    
16
import org.txm.importer.ApplyXsl2
17
import org.txm.utils.io.IOUtils
18

    
19
import org.kohsuke.args4j.*
20
import groovy.transform.Field
21
import org.txm.rcp.swt.widget.parameters.*
22

    
23
// PARAMETERS
24
@Field @Option(name="workingDirectory", usage="workingDirectory containing all files needed", widget="Folder", required=true, def="dir")
File workingDirectory = null

// Show the parameters dialog; stop the macro when open() reports a false value.
boolean parametersAccepted = ParametersDialog.open(this)
if (!parametersAccepted) {
	System.out.println("Aborting CLBFMSITELEX creation.")
	return null
}

// Both paths are resolved relative to the chosen working directory:
// "lgerm-processed" is the input location, "cldmf-lgerm.tsv" the output table.
File xmlDirectory = new File(workingDirectory, "lgerm-processed")
File tsvFile = new File(workingDirectory, "cldmf-lgerm.tsv")

// Nothing can be done without the input location.
if (!xmlDirectory.exists()) {
	println "Aborting: file is missing: $xmlDirectory"
	return
}
36

    
37
// INNERCLASS
38

    
39
/**
 * One row of the generated TSV table: a lemma, its grammatical category and
 * the corresponding forms found for a fixed list of target names.
 *
 * Two entries are considered equal when they share the same lemma and the
 * same category; the corresponding forms and the lemma source are ignored.
 */
class Entry2 {
	/** Lemma form (first TSV column). */
	public String lemma
	/** Name of the target the lemma was taken from; written as the last TSV column. */
	public String lemma_source
	/** Grammatical category (second TSV column). */
	public String category
	/** Corresponding form per target name; insertion order defines the TSV column order. */
	public LinkedHashMap<String, String> corresps = new LinkedHashMap<String, String>()

	/** Creates an entry with no lemma/category and an empty form for every known target. */
	public Entry2() {
		for (String targetName : ["AND", "DEAF", "DECT", "FEW", "GDF", "GDC", "HUG", "TLF", "TL", "UNDEF"]) {
			corresps.put(targetName, "")
		}
	}

	/**
	 * Creates an entry for the given lemma and category.
	 *
	 * @param lemma2 the lemma form
	 * @param category2 the grammatical category
	 */
	public Entry2(String lemma2, String category2) {
		this()
		lemma = lemma2
		category = category2
	}

	/**
	 * Formats the entry as one TSV line: lemma, category, every corresponding
	 * form in map order, then the lemma source, terminated by a newline.
	 *
	 * @return the TSV line, or an empty string (after printing an error on
	 *         standard output) when the lemma or the category is missing
	 */
	@Override
	public String toString() {
		if (lemma == null || category == null) {
			System.out.println("ERROR with entry="+lemma+" "+category+" "+corresps)
			return ""
		}

		StringBuilder str = new StringBuilder()
		str.append(lemma+"\t"+category)
		for (String form : corresps.values()) {
			str.append("\t"+form)
		}
		str.append("\t"+lemma_source)
		str.append("\n")
		return str.toString()
	}

	/**
	 * Compares two entries on lemma and category.
	 *
	 * Takes an Object so that it really overrides Object.equals and is used by
	 * Java collections; null lemma/category values are compared safely.
	 *
	 * @param other the object to compare with
	 * @return true when other is an Entry2 with the same lemma and category
	 */
	@Override
	public boolean equals(Object other) {
		if (this.is(other)) return true
		if (!(other instanceof Entry2)) return false
		Entry2 entry = (Entry2) other
		return Objects.equals(lemma, entry.lemma) && Objects.equals(category, entry.category)
	}

	/**
	 * @return a hash built from the lemma and the category (null-safe),
	 *         consistent with equals
	 */
	@Override
	public int hashCode() {
		return Objects.hash(lemma, category)
	}
}
85

    
86

    
87
//START
88

    
89
// These three names are deliberately assigned without a type or 'def':
// processFile() reads 'factory' and fills 'lines', which only works for
// script-level (binding) variables.
factory = XMLInputFactory.newInstance()
writer = IOUtils.getWriter(tsvFile, "UTF-8")
lines = [:]

// The writer is closed in the finally block so that the early 'return false'
// exits and unexpected exceptions no longer leave the TSV file open.
try {
	File[] xmlFiles = ApplyXsl2.listFiles(xmlDirectory)
	if (xmlFiles == null) return false
	Arrays.sort(xmlFiles)

	// Collect the entries of every XML file; stop at the first file that fails.
	for (File xmlFile : xmlFiles) {
		if (!xmlFile.getName().endsWith(".xml")) continue
		if (!processFile(xmlFile)) {
			System.out.println("Error while parsing "+xmlFile)
			return false
		}
	}

	// Header line: fixed columns around one column per corresponding target.
	writer.print("lemma\tcategory")
	Entry2 tmp = new Entry2()
	for (String corresp : tmp.corresps.keySet()) {
		writer.print("\t"+corresp)
	}
	writer.print("\tlemma_source")
	writer.println("")

	// One line per entry, ordered by the sorted "lemma_category" keys.
	def keys = lines.keySet().sort()
	for (def key : keys) {
		writer.print(lines[key])
	}

	return true
} finally {
	writer.close()
}
120

    
121

    
122

    
123
/**
 * Reads one XML file with a StAX parser and stores every entry it finds in
 * the script-level 'lines' map, keyed by "lemma_category".
 *
 * A new entry starts at each 'lemmatizedForm' element. Inside it, an
 * 'orthography' element whose 'target' attribute is "DMF" gives the lemma;
 * a "LGeRM" one gives the lemma unless a DMF lemma was already set; any other
 * known target fills the matching corresponding form. The text of a
 * 'grammaticalCategory' element gives the category of the current entry.
 *
 * Uses the script-level 'factory' (XMLInputFactory) to create the parser.
 *
 * @param xmlFile the XML file to parse
 * @return true when the file was parsed completely, false when an error was
 *         reported while creating the parser or parsing
 * @throws IOException if the file cannot be opened
 */
boolean processFile(File xmlFile) throws IOException, XMLStreamException {

	InputStream inputData = xmlFile.toURI().toURL().openStream()
	XMLStreamReader parser = null

	try {
		// Created inside the try so the stream is released even if this fails.
		parser = factory.createXMLStreamReader(inputData)

		Entry2 current = null
		String target = null
		boolean inLemmatizedForm = false
		String orthography = null
		boolean inOrthography = false
		String grammaticalCategory = null
		boolean inGrammaticalCategory = false

		String localname
		for (int event = parser.next() ; event != XMLStreamConstants.END_DOCUMENT ; event = parser.next()) {
			switch (event) {
				case XMLStreamConstants.START_ELEMENT:
					localname = parser.getLocalName()
					if ("lemmatizedForm".equals(localname)) {
						// A new entry begins: store the previous one first.
						if (current != null) {
							lines[current.lemma+"_"+current.category] = current
						}
						current = new Entry2()
						inLemmatizedForm = true
					} else if ("orthography".equals(localname)) {
						inOrthography = true
						orthography = ""
						target = parser.getAttributeValue(null, "target")
					} else if ("grammaticalCategory".equals(localname)) {
						inGrammaticalCategory = true
						grammaticalCategory = ""
					}
					break
				case XMLStreamConstants.CHARACTERS:
					// Text may arrive in several chunks, hence the accumulation.
					if (inGrammaticalCategory) grammaticalCategory += parser.getText()
					else if (inOrthography) orthography += parser.getText()
					break
				case XMLStreamConstants.END_ELEMENT:
					localname = parser.getLocalName()
					if ("lemmatizedForm".equals(localname)) {
						inLemmatizedForm = false
					} else if ("orthography".equals(localname)) {
						inOrthography = false
						orthography = orthography.trim()
						if (inLemmatizedForm) { // lemma orthography
							// Constant-first comparisons: 'target' is null when the
							// attribute is missing, which must not abort the parsing.
							if ("DMF".equals(target)) {
								current.lemma = orthography
								current.lemma_source = target
							} else if ("LGeRM".equals(target) && !"DMF".equals(current.lemma_source)) {
								current.lemma = orthography
								current.lemma_source = target
							} else if (current.corresps.containsKey(target)) {
								current.corresps.put(target, orthography)
							} else {
								System.out.println("Error unknown lemma="+orthography+" with corresp="+target)
							}
						}
					} else if ("grammaticalCategory".equals(localname)) {
						inGrammaticalCategory = false
						grammaticalCategory = grammaticalCategory.trim()
						// A category met before any 'lemmatizedForm' has no entry to attach to.
						if (current != null) {
							current.category = grammaticalCategory
						}
					}
					break
			}
		}

		if (current != null) { // last entry
			lines[current.lemma+"_"+current.category] = current
		}
	} catch(Exception e) {
		System.out.println("Unexpected error while parsing file "+xmlFile+" : "+e)
		if (parser != null) {
			System.out.println("Location line: "+parser.getLocation().getLineNumber()+" character: "+parser.getLocation().getColumnNumber())
		}
		org.txm.utils.logger.Log.printStackTrace(e)
		return false
	} finally {
		// Single cleanup path. Close failures are ignored on purpose: the parsing
		// result is already decided and there is nothing left to recover.
		if (parser != null) {
			try {
				parser.close()
			} catch (XMLStreamException ignored) {
			}
		}
		try {
			inputData.close()
		} catch (IOException ignored) {
		}
	}

	return true
}
211