root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZCompilerStep.groovy @ 2126
History | View | Annotate | Download (7.3 kB)
1 | 986 | mdecorde | package org.txm.scripts.importer.xtz;
|
---|---|---|---|
2 | 321 | mdecorde | |
3 | 321 | mdecorde | import java.io.File; |
4 | 321 | mdecorde | import javax.xml.stream.* |
5 | 321 | mdecorde | import java.net.URL |
6 | 321 | mdecorde | import java.util.HashMap; |
7 | 321 | mdecorde | import org.txm.utils.* |
8 | 1000 | mdecorde | import org.txm.utils.io.* |
9 | 1000 | mdecorde | import org.txm.importer.xtz.* |
10 | 321 | mdecorde | |
/**
 * Compiles the CQP file of ONE text.
 *
 * The XML source file is read with a StAX stream reader and written out as
 * one line per word (form, word id, then one column per declared ana type,
 * TAB separated) surrounded by the structural tags met inside the text
 * element.
 *
 * @author mdecorde
 */
public class XTZCompilerStep extends Step {

	static String FORM = "form";
	static String ANA = "ana";
	static String ID = "id";
	static String TYPE = "type";
	static String TAB = "\t";
	static String QUOTE = "\"";

	/** the XML source file to read */
	File xmlFile
	/** the CQP file to write */
	File cqpFile
	/** values written in the id, base and project attributes of the text tag */
	String textname, corpusname, projectname;
	/** if true, structure attribute values are trimmed */
	boolean normalizeAttributeValues = false;
	/** if true, each text chunk read inside an ana element is trimmed */
	boolean normalizeAnaValues = true;
	/** if true, each text chunk read inside a form element is trimmed */
	boolean normalizeFormValues = true;

	def inputData;
	XMLInputFactory factory;
	XMLStreamReader parser;
	OutputStreamWriter output;

	/** ana values of the word being read, indexed by ana type */
	def anavalues = [:];
	/** the ana types to write, in column order */
	def anatypes;

	/** local name of the word element */
	String WTAG = "w"

	public void setNormalizeAttributeValues(boolean n) {
		this.normalizeAttributeValues = n;
	}

	public void setNormalizeAnaValues(boolean n) {
		this.normalizeAnaValues = n;
	}

	public void setNormalizeFormValues(boolean n) {
		this.normalizeFormValues = n;
	}

	/**
	 * Stores the parameters and opens the StAX parser on the XML file.
	 *
	 * If the parser cannot be created the exception is only printed on the
	 * error stream; the parser field then stays null.
	 *
	 * @param xmlFile the XML source file
	 * @param cqpFile the CQP file to write
	 * @param textname the value of the text id attribute
	 * @param corpusname the value of the text base attribute
	 * @param projectname the value of the text project attribute
	 * @param anatypes the ana types to write for each word, in column order
	 * @param wtag the local name of the word element
	 */
	public XTZCompilerStep(File xmlFile, File cqpFile, String textname, String corpusname, String projectname, def anatypes, def wtag) {
		this.xmlFile = xmlFile;
		this.cqpFile = cqpFile;
		this.textname = textname
		this.corpusname = corpusname;
		this.projectname = projectname;
		this.anatypes = anatypes;
		this.WTAG = wtag

		try {
			inputData = xmlFile.toURI().toURL().openStream();
			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);
		} catch (Exception ex) {
			System.err.println("Exception while parsing $xmlFile : "+ex);
		}
	}

	/**
	 * Creates the output.
	 *
	 * @param f the file to write, encoded in UTF-8
	 * @return true, if successful
	 */
	private boolean createOutput(File f) {
		try {
			output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f)) , "UTF-8");
			return true;
		} catch (Exception e) {
			System.err.println(e);
			return false;
		}
	}

	/**
	 * Writes the attributes of the current start element as name="value" pairs.
	 * Names are lowercased; values are trimmed if normalizeAttributeValues is set.
	 *
	 * @param skipId if true the attribute named "id" is not written
	 */
	private void writeAttributes(boolean skipId) {
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
			String attrname = parser.getAttributeLocalName(i);
			String attrvalue = parser.getAttributeValue(i)

			if (normalizeAttributeValues)
				attrvalue = attrvalue.trim();

			if (skipId && attrname == ID) continue;

			output.write(" "+attrname.toLowerCase()+"=\""+attrvalue+QUOTE)
		}
	}

	/**
	 * Closes the writer, the parser and the input stream, ignoring any error:
	 * this is only cleanup, the result of the step is already decided.
	 */
	private void closeQuietly() {
		try { if (output != null) output.close(); } catch (Exception ignored) { }
		try { if (parser != null) parser.close(); } catch (Exception ignored) { }
		try { if (inputData != null) inputData.close(); } catch (Exception ignored) { }
	}

	/**
	 * Transforms the XML file into the CQP file.
	 *
	 * If an exception is raised while reading or writing, the XML file is
	 * copied in a "compiler-error" directory created next to the CQP file.
	 *
	 * @return true if at least one word element was read and no error occurred
	 */
	public boolean process() {
		if (!createOutput(cqpFile)) {
			closeQuietly(); // release the parser and stream opened by the constructor
			return false;
		}

		String vAna = "";
		String vForm = "";
		String wordid = "";

		def ncounts = [:] // contains the n values per tags with no attribute

		boolean flagWord = false;
		boolean flagForm = false;
		boolean flagAna = false;

		String anatype = "";
		String anavalue = "";
		boolean foundtei = false;
		boolean foundtext = false;
		int nWords = 0;
		try {
			String localname;
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						localname = parser.getLocalName().toLowerCase();
						if ("tei".equals(localname)) foundtei = true;
						switch (localname) {
							case "text":
								foundtext = true;
								output.write("<text id=\""+textname+"\" base=\""+corpusname+QUOTE + " project=\""+projectname+QUOTE);
								// the id attribute is already set with the text name
								writeAttributes(true);
								output.write(">\n");
								break;

							case WTAG:
								for (int i = 0 ; i < parser.getAttributeCount(); i++) {
									if (parser.getAttributeLocalName(i).equals(ID)) {
										wordid = parser.getAttributeValue(i);
									}
								}
								anavalues = [:];
								flagWord = true
								nWords++
								break;

							case FORM:
								flagForm = true;
								vForm = "";
								vAna = "";
								break;

							case ANA:
								flagAna = true;
								anavalue = "";
								for (int i = 0 ; i < parser.getAttributeCount(); i++) {
									if (TYPE.equals(parser.getAttributeLocalName(i))) {
										anatype = parser.getAttributeValue(i).substring(1);//remove the #
										break;
									}
								}
								break;

							default:
								// structures are only written inside the text element
								if (!foundtei || !foundtext) break;

								output.write("<"+localname);
								writeAttributes(false);
								if (parser.getAttributeCount() == 0) { // add the n attribute
									if (!ncounts.containsKey(localname)) ncounts.put(localname, 0);
									int ncount = ncounts.get(localname);
									ncounts.put(localname, ncount+1);
									output.write(" n=\""+ncount+QUOTE)
								}
								output.write(">\n");
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						localname = parser.getLocalName().toLowerCase();
						switch (localname) {
							case WTAG:
								// one column per declared ana type, empty if the word has no value
								for (String type : anatypes) {
									def v = anavalues.get(type);
									if (v != null) vAna +=TAB+v;
									else vAna +=TAB;
								}
								// "&" and "<" are escaped as XML entities in the written form
								vForm = vForm.replace("\n", "").replace("&", "&amp;").replace("<", "&lt;");
								output.write(vForm+TAB+wordid+vAna+"\n");
								vAna = "";
								vForm = "";
								flagWord = false;
								break;

							case "tei":
								foundtei = false;
								break;

							case "text":
								output.write("</text>\n");
								foundtext = false;
								break;

							case FORM:
								flagForm = false;
								break;

							case ANA:
								anavalues.put(anatype, anavalue)
								flagAna = false;
								break;

							default:
								if (!foundtei || !foundtext) break;

								output.write("</"+localname+">\n");
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						if (!foundtei || !foundtext) break;
						if (flagWord) {
							if (flagForm) {
								if (normalizeFormValues) {
									vForm += parser.getText().trim();
								} else {
									vForm += parser.getText();
								}
							}
							if (flagAna) {
								if (normalizeAnaValues) {
									anavalue += parser.getText().trim();
								} else {
									anavalue += parser.getText();
								}
							}
						}
						break;
				}
			}

			// closed here so that a failure to flush the last bytes is reported as an error
			output.close();
		} catch (Exception ex) {
			System.out.println("Exception while parsing " + inputData+" of Text "+textname+" : "+ex);
			File errorDir = null
			try {
				errorDir = new File(cqpFile.getParentFile(), "compiler-error")
				println "Warning: Moving $xmlFile to $errorDir"
				errorDir.mkdir();
				FileCopy.copy(xmlFile, new File(errorDir, xmlFile.getName()))
			} catch(Exception eCopy) {
				println "Error while moving "+xmlFile+" to "+errorDir+" : "+eCopy
			}
			return false;
		} finally {
			// always release the writer, the parser and the input stream
			closeQuietly();
		}

		if (nWords == 0) {
			println "** no words written."
		}
		return nWords > 0;
	}
}