Statistics
| Revision:

root / tmp / org.txm.dictionary.rcp / src / org / txm / macro / frolex / LGerm2DMFTSVMacro.groovy @ 1688

History | View | Annotate | Download (5.1 kB)

1
package org.txm.macro.frolex
2

    
3
import java.io.BufferedOutputStream
4
import java.io.File
5
import java.io.IOException
6
import java.io.InputStream
7
import java.io.PrintWriter
8
import java.net.URL
9
import java.util.ArrayList
10
import java.util.Arrays
11
import java.util.List
12

    
13
import javax.xml.stream.XMLInputFactory
14
import javax.xml.stream.XMLStreamConstants
15
import javax.xml.stream.XMLStreamException
16
import javax.xml.stream.XMLStreamReader
17

    
18
import org.txm.importer.ApplyXsl2
19
import org.txm.utils.io.IOUtils
20

    
21
//PARAMETERS
22

    
23
// Directory scanned for the input XML files, and the TSV file that receives the result.
File xmlDirectory = new File(System.getProperty("user.home"), "TXM/results/lexique BFM/lgerm-processed")
File tsvFile = new File(System.getProperty("user.home"), "TXM/results/lexique BFM/lgerm-processed/dmf.tsv")

// START

// 'factory' and 'writer' are deliberately left undeclared: they become script-level
// variables, which is how processFile() reaches them.
factory = XMLInputFactory.newInstance()
writer = IOUtils.getWriter(tsvFile, "UTF-8")

try {
        // Header row; Entry.toString() emits its rows in the same column order.
        writer.println("form\tcategory\tlemma\tlemma_src\tF_dmf")

        File[] xmlFiles = ApplyXsl2.listFiles(xmlDirectory)
        if (xmlFiles == null) return false
        Arrays.sort(xmlFiles) // process the files in a stable order

        for (File xmlFile : xmlFiles) {
                if (!xmlFile.getName().endsWith(".xml")) continue
                if (!processFile(xmlFile)) {
                        println("Error while parsing "+xmlFile)
                        return false
                }
        }
} finally {
        // Close on every exit path: the early 'return false' branches used to skip the
        // close, leaving the TSV output unflushed and the handle open.
        if (writer != null) writer.close()
}

// Reported here because any statement placed after the script's final 'return' is never executed.
println "Done. DMF TSV file: $tsvFile"
return true
46
//END
47

    
48

    
49
/**
 * One lexicon entry: a lemma, the source the lemma was taken from, its grammatical
 * category, and two parallel lists holding the forms and the frequency of each form.
 */
class Entry {
        String lemma
        String lemma_source
        String category
        /** Forms of the lemma; forms.get(i) is paired with freqs.get(i). */
        List<String> forms = new ArrayList<String>()
        /** Frequencies, kept as the raw text read from the input. */
        List<String> freqs = new ArrayList<String>()

        /** Creates an empty entry; the fields are filled in afterwards. */
        public Entry() {}

        /**
         * Creates an entry with a known lemma and category.
         *
         * @param lemma2 the lemma
         * @param category2 the grammatical category
         */
        public Entry(String lemma2, String category2) {
                // Previously an auto-generated stub that silently dropped both arguments.
                this.lemma = lemma2
                this.category = category2
        }

        /**
         * Renders the entry as TSV rows, one per form, in the column order
         * form, category, lemma, lemma source, frequency.
         *
         * @return the rows, each terminated by a newline; an empty string when the entry
         *         is incomplete (missing lemma, lemma source or category, or a different
         *         number of forms and frequencies), in which case an error line is printed
         */
        String toString() {
                boolean incomplete = forms.size() != freqs.size() || lemma == null || lemma_source == null || category == null
                if (incomplete) {
                        println("ERROR with entry="+lemma+" "+category+" "+forms+ " "+freqs)
                        return ""
                }

                StringBuilder rows = new StringBuilder()
                for (int i = 0 ; i < forms.size() ; i++) {
                        rows.append(forms.get(i)).append("\t")
                                .append(category).append("\t")
                                .append(lemma).append("\t")
                                .append(lemma_source).append("\t")
                                .append(freqs.get(i)).append("\n")
                }
                return rows.toString()
        }
}
75

    
76
/**
 * Streams one XML file and writes every entry it contains to the script-level 'writer'.
 *
 * An entry starts at each <lemmatizedForm> element. An <orthography> read while inside
 * <lemmatizedForm> is a lemma candidate, kept according to its 'target' attribute ("DMF"
 * takes precedence over "LGeRM"); an <orthography> read elsewhere is a form of the current
 * entry. <grammaticalCategory> sets the category and each <frequency> adds one frequency.
 *
 * Relies on the script-level variables 'factory' (XMLInputFactory) and 'writer'.
 *
 * @param xmlFile the XML file to read
 * @return true when the file was read to the end, false when an error occurred while parsing
 * @throws IOException if the file cannot be opened or the input stream cannot be closed
 * @throws XMLStreamException if the parser cannot be closed
 */
boolean processFile(File xmlFile) throws IOException, XMLStreamException {
        URL inputurl = xmlFile.toURI().toURL()
        InputStream inputData = inputurl.openStream()
        XMLStreamReader parser = null

        try {
                // Created inside the try so that a failure here still closes inputData.
                parser = factory.createXMLStreamReader(inputData)

                Entry current = null
                String target = null
                boolean inLemmatizedForm = false
                String orthography = null
                boolean inOrthography = false
                String grammaticalCategory = null
                boolean inGrammaticalCategory = false
                String frequency = null
                boolean inFrequency = false

                String localname
                for (int event = parser.next() ; event != XMLStreamConstants.END_DOCUMENT ; event = parser.next()) {
                        switch (event) {
                                case XMLStreamConstants.START_ELEMENT:
                                        localname = parser.getLocalName()
                                        if ("lemmatizedForm".equals(localname)) {
                                                // A new entry begins: flush the one that was being built.
                                                if (current != null) {
                                                        writer.print(current)
                                                }
                                                current = new Entry()
                                                inLemmatizedForm = true
                                        } else if ("orthography".equals(localname)) {
                                                inOrthography = true
                                                orthography = ""
                                                target = parser.getAttributeValue(null, "target")
                                        } else if ("grammaticalCategory".equals(localname)) {
                                                inGrammaticalCategory = true
                                                grammaticalCategory = ""
                                        } else if ("frequency".equals(localname)) {
                                                inFrequency = true
                                                frequency = ""
                                        }
                                        break
                                case XMLStreamConstants.CHARACTERS:
                                        // Text may arrive in several chunks, hence the accumulation.
                                        if (inFrequency) frequency += parser.getText()
                                        else if (inGrammaticalCategory) grammaticalCategory += parser.getText()
                                        else if (inOrthography) orthography += parser.getText()
                                        break
                                case XMLStreamConstants.END_ELEMENT:
                                        localname = parser.getLocalName()
                                        if ("lemmatizedForm".equals(localname)) {
                                                inLemmatizedForm = false
                                        } else if ("orthography".equals(localname)) {
                                                inOrthography = false
                                                orthography = orthography.trim()
                                                if (inLemmatizedForm) { // lemma orthography
                                                        // Constant-first comparisons: 'target' is null when the
                                                        // attribute is absent, which used to raise a
                                                        // NullPointerException and abort the whole file.
                                                        if ("DMF".equals(target)) {
                                                                current.lemma = orthography
                                                                current.lemma_source = target
                                                        } else if ("LGeRM".equals(target) && !"DMF".equals(current.lemma_source)) {
                                                                // Never overwrite a lemma already taken from DMF.
                                                                current.lemma = orthography
                                                                current.lemma_source = target
                                                        }
                                                } else { // form orthography
                                                        current.forms.add(orthography)
                                                }
                                        } else if ("grammaticalCategory".equals(localname)) {
                                                inGrammaticalCategory = false
                                                grammaticalCategory = grammaticalCategory.trim()
                                                current.category = grammaticalCategory
                                        } else if ("frequency".equals(localname)) {
                                                inFrequency = false
                                                frequency = frequency.trim()
                                                current.freqs.add(frequency)
                                        }
                                        break
                        }
                }

                if (current != null) { // last entry
                        writer.print(current)
                }
        } catch(Exception e) {
                println("Unexpected error while parsing file "+xmlFile+" : "+e)
                if (parser != null) { // null when the parser itself could not be created
                        println("Location line: "+parser.getLocation().getLineNumber()+" character: "+parser.getLocation().getColumnNumber())
                }
                org.txm.utils.logger.Log.printStackTrace(e)
                return false
        } finally {
                // Single cleanup point for the success and failure paths; the nested finally
                // guarantees the stream is closed even if closing the parser throws.
                try {
                        if (parser != null) parser.close()
                } finally {
                        inputData.close()
                }
        }

        return true
}
172

    
173
// NOTE(review): every path through the top-level script statements above ends in a
// 'return' (the last one being an unconditional 'return true'), so this statement
// appears to be unreachable and its message is never printed — confirm.
println "Done. DMF TSV file: $tsvFile"