Statistics
| Revision:

root / tmp / org.txm.dictionary.rcp / src / org / txm / macro / frolex / LGerm2DMFTSVMacro.groovy @ 1688

History | View | Annotate | Download (5.1 kB)

1 470 mdecorde
package org.txm.macro.frolex
2 470 mdecorde
3 470 mdecorde
import java.io.BufferedOutputStream
4 470 mdecorde
import java.io.File
5 470 mdecorde
import java.io.IOException
6 470 mdecorde
import java.io.InputStream
7 470 mdecorde
import java.io.PrintWriter
8 470 mdecorde
import java.net.URL
9 470 mdecorde
import java.util.ArrayList
10 470 mdecorde
import java.util.Arrays
11 470 mdecorde
import java.util.List
12 470 mdecorde
13 470 mdecorde
import javax.xml.stream.XMLInputFactory
14 470 mdecorde
import javax.xml.stream.XMLStreamConstants
15 470 mdecorde
import javax.xml.stream.XMLStreamException
16 470 mdecorde
import javax.xml.stream.XMLStreamReader
17 470 mdecorde
18 470 mdecorde
import org.txm.importer.ApplyXsl2
19 470 mdecorde
import org.txm.utils.io.IOUtils
20 470 mdecorde
21 470 mdecorde
//PARAMETERS
22 470 mdecorde
23 470 mdecorde
// Directory scanned for the *.xml input files, and the TSV file that receives the result.
File xmlDirectory = new File(System.getProperty("user.home"), "TXM/results/lexique BFM/lgerm-processed")
File tsvFile = new File(System.getProperty("user.home"), "TXM/results/lexique BFM/lgerm-processed/dmf.tsv")

// START

// 'factory' and 'writer' are deliberately left undeclared: they live in the
// script binding so that processFile() can reach them.
factory = XMLInputFactory.newInstance()

writer = IOUtils.getWriter(tsvFile, "UTF-8")
try {
        // Header row; the column order matches the rows produced by Entry.toString().
        writer.println("form\tcategory\tlemma\tlemma_src\tF_dmf")

        File[] xmlFiles = ApplyXsl2.listFiles(xmlDirectory)
        if (xmlFiles == null) return false
        // Sort so the files are always processed in the same order.
        Arrays.sort(xmlFiles)

        for (File xmlFile : xmlFiles) {
                if (!xmlFile.getName().endsWith(".xml")) continue
                if (!processFile(xmlFile)) {
                        // Stop at the first file that fails to parse.
                        println("Error while parsing "+xmlFile)
                        return false
                }
        }
} finally {
        // Runs on every exit path, including the early 'return false' ones,
        // so the TSV writer is always flushed and released.
        if (writer != null) writer.close()
}

// Printed here because statements placed after the script's final 'return' never run.
println "Done. DMF TSV file: $tsvFile"
return true
46 470 mdecorde
//END
47 470 mdecorde
48 470 mdecorde
49 470 mdecorde
/**
 * One lexicon entry: a lemma, the source the lemma was taken from, a grammatical
 * category, and two parallel lists holding the attested forms and their frequencies.
 */
class Entry {
        String lemma
        String lemma_source
        String category
        // forms.get(i) is paired with freqs.get(i) when the entry is rendered.
        List<String> forms = new ArrayList<String>()
        List<String> freqs = new ArrayList<String>()

        /** Creates an empty entry; fields are filled in while parsing. */
        public Entry() {}

        /**
         * Creates an entry with its lemma and category already set.
         *
         * @param lemma2 the lemma of the entry
         * @param category2 the grammatical category of the entry
         */
        public Entry(String lemma2, String category2) {
                this.lemma = lemma2
                this.category = category2
        }

        /**
         * Renders the entry as TSV rows, one per form, each terminated by "\n":
         * form, category, lemma, lemma source, frequency.
         *
         * If the entry is incomplete (forms and freqs differ in size, or lemma,
         * lemma_source or category is null) an error line is printed to the
         * console and an empty string is returned, so nothing is written for it.
         *
         * @return the TSV rows, or "" when the entry is incomplete
         */
        @Override
        String toString() {
                boolean incomplete = forms.size() != freqs.size() ||
                                lemma == null ||
                                lemma_source == null ||
                                category == null
                if (incomplete) {
                        println("ERROR with entry="+lemma+" "+category+" "+forms+ " "+freqs)
                        return ""
                }

                StringBuilder str = new StringBuilder()
                for (int i = 0 ; i < forms.size() ; i++) {
                        str.append(forms.get(i)).append("\t")
                                        .append(category).append("\t")
                                        .append(lemma).append("\t")
                                        .append(lemma_source).append("\t")
                                        .append(freqs.get(i)).append("\n")
                }
                return str.toString()
        }
}
75 470 mdecorde
76 470 mdecorde
/**
 * Streams one XML file and prints every entry it contains to the script-level
 * 'writer', using Entry.toString() to produce the rows.
 *
 * Elements handled, by local name:
 *  - lemmatizedForm: starts a new Entry; the previous one, if any, is written out first
 *  - orthography: inside lemmatizedForm it is a lemma candidate, selected by its
 *    'target' attribute ("DMF" wins over "LGeRM"); elsewhere it is added as a form
 *  - grammaticalCategory: sets the category of the current entry
 *  - frequency: added to the frequencies of the current entry
 *
 * Relies on the script binding variables 'factory' (XMLInputFactory) and 'writer'.
 *
 * @param xmlFile the XML file to read
 * @return true if the file was parsed to the end, false if an error occurred while parsing
 * @throws IOException if the file cannot be opened
 * @throws XMLStreamException if the stream reader cannot be created
 */
boolean processFile(File xmlFile) throws IOException, XMLStreamException {
        InputStream inputData = xmlFile.toURI().toURL().openStream()
        try {
                XMLStreamReader parser = factory.createXMLStreamReader(inputData)
                try {
                        Entry current = null
                        String target = null
                        boolean inLemmatizedForm = false
                        String orthography = null
                        boolean inOrthography = false
                        String grammaticalCategory = null
                        boolean inGrammaticalCategory = false
                        String frequency = null
                        boolean inFrequency = false

                        String localname
                        for (int event = parser.next() ; event != XMLStreamConstants.END_DOCUMENT ; event = parser.next()) {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                localname = parser.getLocalName()
                                                if ("lemmatizedForm".equals(localname)) {
                                                        // A new entry begins: flush the previous one first.
                                                        if (current != null) {
                                                                writer.print(current)
                                                        }
                                                        current = new Entry()
                                                        inLemmatizedForm = true
                                                } else if ("orthography".equals(localname)) {
                                                        inOrthography = true
                                                        orthography = ""
                                                        target = parser.getAttributeValue(null, "target")
                                                } else if ("grammaticalCategory".equals(localname)) {
                                                        inGrammaticalCategory = true
                                                        grammaticalCategory = ""
                                                } else if ("frequency".equals(localname)) {
                                                        inFrequency = true
                                                        frequency = ""
                                                }
                                                break
                                        case XMLStreamConstants.CHARACTERS:
                                                // Text may arrive in several chunks, hence the accumulation.
                                                if (inFrequency) frequency += parser.getText()
                                                else if (inGrammaticalCategory) grammaticalCategory += parser.getText()
                                                else if (inOrthography) orthography += parser.getText()
                                                break
                                        case XMLStreamConstants.END_ELEMENT:
                                                localname = parser.getLocalName()
                                                if ("lemmatizedForm".equals(localname)) {
                                                        inLemmatizedForm = false
                                                } else if ("orthography".equals(localname)) {
                                                        inOrthography = false
                                                        orthography = orthography.trim()
                                                        if (inLemmatizedForm) { // lemma orthography
                                                                // Constant-first equals: 'target' is null when the attribute is absent.
                                                                if ("DMF".equals(target)) {
                                                                        current.lemma = orthography
                                                                        current.lemma_source = target
                                                                } else if ("LGeRM".equals(target) && !"DMF".equals(current.lemma_source)) {
                                                                        // LGeRM is only used as long as no DMF lemma has been seen.
                                                                        current.lemma = orthography
                                                                        current.lemma_source = target
                                                                }
                                                        } else { // form orthography
                                                                current.forms.add(orthography)
                                                        }
                                                } else if ("grammaticalCategory".equals(localname)) {
                                                        inGrammaticalCategory = false
                                                        grammaticalCategory = grammaticalCategory.trim()
                                                        current.category = grammaticalCategory
                                                } else if ("frequency".equals(localname)) {
                                                        inFrequency = false
                                                        frequency = frequency.trim()
                                                        current.freqs.add(frequency)
                                                }
                                                break
                                }
                        }

                        if (current != null) { // last entry
                                writer.print(current)
                        }
                        return true
                } catch(Exception e) {
                        println("Unexpected error while parsing file "+xmlFile+" : "+e)
                        println("Location line: "+parser.getLocation().getLineNumber()+" character: "+parser.getLocation().getColumnNumber())
                        org.txm.utils.logger.Log.printStackTrace(e)
                        return false
                } finally {
                        try {
                                parser.close()
                        } catch (XMLStreamException ignored) {
                                // The parse result is already decided; a failed close cannot change it.
                        }
                }
        } finally {
                // XMLStreamReader.close() does not close the underlying stream, so close it here.
                try {
                        inputData.close()
                } catch (IOException ignored) {
                        // Nothing useful can be done if closing a stream that was only read fails.
                }
        }
}
172 470 mdecorde
173 470 mdecorde
// NOTE(review): this statement comes after the script's unconditional `return true`,
// so it looks unreachable and the message is presumably never printed — confirm.
println "Done. DMF TSV file: $tsvFile"