root / tmp / org.txm.dictionary.rcp / src / org / txm / macro / frolex / LGerm2DMFTSVMacro.groovy @ 1688
History | View | Annotate | Download (5.1 kB)
1 | 470 | mdecorde | package org.txm.macro.frolex
|
---|---|---|---|
2 | 470 | mdecorde | |
3 | 470 | mdecorde | import java.io.BufferedOutputStream |
4 | 470 | mdecorde | import java.io.File |
5 | 470 | mdecorde | import java.io.IOException |
6 | 470 | mdecorde | import java.io.InputStream |
7 | 470 | mdecorde | import java.io.PrintWriter |
8 | 470 | mdecorde | import java.net.URL |
9 | 470 | mdecorde | import java.util.ArrayList |
10 | 470 | mdecorde | import java.util.Arrays |
11 | 470 | mdecorde | import java.util.List |
12 | 470 | mdecorde | |
13 | 470 | mdecorde | import javax.xml.stream.XMLInputFactory |
14 | 470 | mdecorde | import javax.xml.stream.XMLStreamConstants |
15 | 470 | mdecorde | import javax.xml.stream.XMLStreamException |
16 | 470 | mdecorde | import javax.xml.stream.XMLStreamReader |
17 | 470 | mdecorde | |
18 | 470 | mdecorde | import org.txm.importer.ApplyXsl2 |
19 | 470 | mdecorde | import org.txm.utils.io.IOUtils |
20 | 470 | mdecorde | |
21 | 470 | mdecorde | //PARAMETERS
|
22 | 470 | mdecorde | |
// Directory that is scanned for *.xml files, and the TSV file that receives the result.
File xmlDirectory = new File(System.getProperty("user.home"), "TXM/results/lexique BFM/lgerm-processed")
File tsvFile = new File(System.getProperty("user.home"), "TXM/results/lexique BFM/lgerm-processed/dmf.tsv")

// START

// 'factory' and 'writer' are intentionally assigned without a type or 'def':
// processFile() reads both names, and a script method can only see variables
// stored in the script binding, not locals declared in the script body.
factory = XMLInputFactory.newInstance()

// List the input files before touching the output file, so that a failed
// listing neither creates a header-only TSV nor leaves an open writer behind.
File[] xmlFiles = ApplyXsl2.listFiles(xmlDirectory)
if (xmlFiles == null) return false
Arrays.sort(xmlFiles) // process the files in a stable order

writer = IOUtils.getWriter(tsvFile, "UTF-8")
try {
	writer.println("form\tcategory\tlemma\tlemma_src\tF_dmf")

	for (File xmlFile : xmlFiles) {
		if (!xmlFile.getName().endsWith(".xml")) continue
		if (!processFile(xmlFile)) {
			println("Error while parsing "+xmlFile)
			return false
		}
	}
} finally {
	// Runs on success, on the early 'return false' above and when processFile()
	// throws, so the rows written so far are always flushed and the handle released.
	if (writer != null) writer.close()
}

// Reported here because the same statement at the very end of the file comes
// after this 'return' in the script body and is therefore never executed.
println "Done. DMF TSV file: $tsvFile"
return true
46 | 470 | mdecorde | //END
|
47 | 470 | mdecorde | |
48 | 470 | mdecorde | |
/**
 * One lexicon entry assembled while an XML file is read: a lemma, the source
 * the lemma was taken from, a grammatical category, and two parallel lists
 * holding the forms and their frequencies (forms[i] goes with freqs[i]).
 */
class Entry {
	/** Lemma orthography; stays null until one has been set. */
	String lemma
	/** Value of the "target" attribute the lemma was read with. */
	String lemma_source
	/** Grammatical category text. */
	String category
	/** Form orthographies, in reading order. */
	List<String> forms = new ArrayList<String>()
	/** Frequencies as read from the XML, kept as text; same order as {@link #forms}. */
	List<String> freqs = new ArrayList<String>()

	/** Creates an empty entry; the fields are filled in afterwards. */
	public Entry() {}

	/**
	 * Creates an entry with its lemma and category already set.
	 *
	 * @param lemma2 the lemma orthography
	 * @param category2 the grammatical category
	 */
	public Entry(String lemma2, String category2) {
		// The previous version was an empty generated stub that dropped both arguments.
		this.lemma = lemma2
		this.category = category2
	}

	/**
	 * Renders the entry as TSV rows, one per form, in the column order
	 * form, category, lemma, lemma source, frequency; every row ends with "\n".
	 *
	 * @return the rows, or an empty string when the entry is incomplete (forms
	 *         and frequencies differ in number, or lemma, lemma source or
	 *         category is missing); in that case an error line is printed
	 */
	String toString() {
		boolean incomplete = forms.size() != freqs.size() || lemma == null || lemma_source == null || category == null
		if (incomplete) {
			println("ERROR with entry="+lemma+" "+category+" "+forms+ " "+freqs)
			return ""
		}

		StringBuilder rows = new StringBuilder()
		for (int i = 0 ; i < forms.size() ; i++) {
			rows.append(forms.get(i)).append("\t")
				.append(category).append("\t")
				.append(lemma).append("\t")
				.append(lemma_source).append("\t")
				.append(freqs.get(i)).append("\n")
		}
		return rows.toString()
	}
}
75 | 470 | mdecorde | |
/**
 * Streams one XML file and prints each entry found in it to the script-level
 * 'writer'.
 *
 * A new entry starts at every <lemmatizedForm> start tag; the previous entry,
 * if any, is printed at that point and the last one after the end of the
 * document. An <orthography> closed inside <lemmatizedForm> is a lemma
 * candidate, any other <orthography> is a form. <grammaticalCategory> sets the
 * category and <frequency> appends a frequency.
 *
 * Relies on the script binding variables 'factory' and 'writer'.
 *
 * @param xmlFile the XML file to read
 * @return true when the file was read to the end, false when an error occurred
 *         while parsing (the error and its location are printed)
 * @throws IOException if the file cannot be opened or the stream cannot be closed
 * @throws XMLStreamException if the XML reader cannot be created or closed
 */
boolean processFile(File xmlFile) throws IOException, XMLStreamException {
	InputStream inputData = xmlFile.toURI().toURL().openStream()

	XMLStreamReader parser = null
	try {
		parser = factory.createXMLStreamReader(inputData)
	} finally {
		// Reader creation failed: release the stream before the exception propagates.
		if (parser == null) inputData.close()
	}

	try {
		Entry current = null
		String target = null
		boolean inLemmatizedForm = false
		String orthography = null
		boolean inOrthography = false
		String grammaticalCategory = null
		boolean inGrammaticalCategory = false
		String frequency = null
		boolean inFrequency = false

		String localname
		for (int event = parser.next() ; event != XMLStreamConstants.END_DOCUMENT ; event = parser.next()) {
			switch (event) {
				case XMLStreamConstants.START_ELEMENT:
					localname = parser.getLocalName()
					if ("lemmatizedForm".equals(localname)) {
						// A new entry begins: emit the one collected so far.
						if (current != null) {
							writer.print(current)
						}
						current = new Entry()
						inLemmatizedForm = true
					} else if ("orthography".equals(localname)) {
						inOrthography = true
						orthography = ""
						target = parser.getAttributeValue(null, "target")
					} else if ("grammaticalCategory".equals(localname)) {
						inGrammaticalCategory = true
						grammaticalCategory = ""
					} else if ("frequency".equals(localname)) {
						inFrequency = true
						frequency = ""
					}
					break
				case XMLStreamConstants.CHARACTERS:
					// Text may arrive in several chunks, hence the accumulation.
					if (inFrequency) frequency += parser.getText()
					else if (inGrammaticalCategory) grammaticalCategory += parser.getText()
					else if (inOrthography) orthography += parser.getText()
					break
				case XMLStreamConstants.END_ELEMENT:
					localname = parser.getLocalName()
					if ("lemmatizedForm".equals(localname)) {
						inLemmatizedForm = false
					} else if ("orthography".equals(localname)) {
						inOrthography = false
						orthography = orthography.trim()
						if (inLemmatizedForm) { // lemma orthography
							// Constant-first comparisons: 'target' is null when the
							// attribute is absent, which used to abort the whole file
							// with a NullPointerException.
							if ("DMF".equals(target)) {
								current.lemma = orthography
								current.lemma_source = target
							} else if ("LGeRM".equals(target) && !"DMF".equals(current.lemma_source)) {
								// A LGeRM lemma never replaces a DMF lemma already stored.
								current.lemma = orthography
								current.lemma_source = target
							}
						} else { // form orthography
							current.forms.add(orthography)
						}
					} else if ("grammaticalCategory".equals(localname)) {
						inGrammaticalCategory = false
						grammaticalCategory = grammaticalCategory.trim()
						current.category = grammaticalCategory
					} else if ("frequency".equals(localname)) {
						inFrequency = false
						frequency = frequency.trim()
						current.freqs.add(frequency)
					}
					break
			}
		}

		if (current != null) { // last entry
			writer.print(current)
		}
		return true
	} catch(Exception e) {
		println("Unexpected error while parsing file "+xmlFile+" : "+e)
		println("Location line: "+parser.getLocation().getLineNumber()+" character: "+parser.getLocation().getColumnNumber())
		org.txm.utils.logger.Log.printStackTrace(e)
		return false
	} finally {
		// Single cleanup path for success and failure; the stream is closed
		// even if closing the reader throws.
		try {
			parser.close()
		} finally {
			inputData.close()
		}
	}
}
172 | 470 | mdecorde | |
173 | 470 | mdecorde | println "Done. DMF TSV file: $tsvFile" |