root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZCompilerStep.groovy @ 2126
History | View | Annotate | Download (7.3 kB)
1 |
package org.txm.scripts.importer.xtz;
|
---|---|
2 |
|
3 |
import java.io.File; |
4 |
import javax.xml.stream.* |
5 |
import java.net.URL |
6 |
import java.util.HashMap; |
7 |
import org.txm.utils.* |
8 |
import org.txm.utils.io.* |
9 |
import org.txm.importer.xtz.* |
10 |
|
11 |
/**
 * Compiles the CQP file of ONE text.
 *
 * The XML file is read with a StAX parser and written to the CQP file as:
 * one line per word element (form, word id, then one column per declared
 * annotation type, all TAB separated) and one tag line per other element
 * found inside the TEI text.
 *
 * @author mdecorde
 *
 */
public class XTZCompilerStep extends Step {

	static String FORM = "form";
	static String ANA = "ana";
	static String ID = "id";
	static String TYPE = "type";
	static String TAB = "\t";
	static String QUOTE = "\"";

	File xmlFile
	File cqpFile
	String textname, corpusname, projectname;
	boolean normalizeAttributeValues = false;
	boolean normalizeAnaValues = true;
	boolean normalizeFormValues = true;

	def inputData;
	XMLInputFactory factory;
	XMLStreamReader parser;
	OutputStreamWriter output;

	// annotation values of the word being read, indexed by annotation type
	def anavalues = [:];
	// annotation types to write for each word; their order fixes the column order
	def anatypes;

	// local name of the word element
	String WTAG = "w"

	/**
	 * @param n if true, the attribute values of the structure tags are trimmed before being written
	 */
	public void setNormalizeAttributeValues(boolean n) {
		this.normalizeAttributeValues = n;
	}

	/**
	 * @param n if true, each text chunk read in an annotation element is trimmed before being appended
	 */
	public void setNormalizeAnaValues(boolean n) {
		this.normalizeAnaValues = n;
	}

	/**
	 * @param n if true, each text chunk read in a form element is trimmed before being appended
	 */
	public void setNormalizeFormValues(boolean n) {
		this.normalizeFormValues = n;
	}

	/**
	 * Stores the parameters and opens the StAX parser on the XML file.
	 *
	 * If the parser cannot be created, the error is printed on System.err and
	 * the parser field is left unset; process() will then fail and return false.
	 *
	 * @param xmlFile the XML file to read
	 * @param cqpFile the CQP file to write
	 * @param textname written as the "id" attribute of the text tag
	 * @param corpusname written as the "base" attribute of the text tag
	 * @param projectname written as the "project" attribute of the text tag
	 * @param anatypes the annotation types to write, in column order
	 * @param wtag the local name of the word element
	 */
	public XTZCompilerStep(File xmlFile, File cqpFile, String textname, String corpusname, String projectname, def anatypes, def wtag) {
		this.xmlFile = xmlFile;
		this.cqpFile = cqpFile;
		this.textname = textname
		this.corpusname = corpusname;
		this.projectname = projectname;
		this.anatypes = anatypes;
		this.WTAG = wtag

		try {
			inputData = xmlFile.toURI().toURL().openStream();
			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);

		} catch (Exception ex) {
			System.err.println("Exception while parsing $xmlFile : "+ex);
		}
	}

	/**
	 * Opens the UTF-8 writer of the result file.
	 *
	 * @param f the file to write
	 * @return true, if successful
	 */
	private boolean createOutput(File f) {
		try {
			output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f)) , "UTF-8");
			return true;
		} catch (Exception e) {
			System.err.println(e);
			return false;
		}
	}

	/**
	 * Writes the attributes of the current start element as ' name="value"' pairs.
	 * Names are lowercased; values are trimmed only if normalizeAttributeValues is set.
	 *
	 * @param skipId if true, the attribute named "id" is not written
	 */
	private void writeAttributes(boolean skipId) {
		// NOTE(review): values are written without escaping quotes or markup characters — confirm the CQP encoder copes with that
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
			String attrname = parser.getAttributeLocalName(i);
			if (skipId && ID.equals(attrname)) continue;

			String attrvalue = parser.getAttributeValue(i)
			if (normalizeAttributeValues)
				attrvalue = attrvalue.trim();

			output.write(" "+attrname.toLowerCase()+"=\""+attrvalue+QUOTE)
		}
	}

	/**
	 * Closes the writer, the parser and the input stream. A failure while
	 * closing one of them is reported and does not prevent closing the others.
	 */
	private void closeResources() {
		try {
			if (output != null) output.close();
		} catch (Exception e) {
			System.err.println("Error while closing "+cqpFile+" : "+e);
		}
		try {
			if (parser != null) parser.close();
		} catch (Exception e) {
			System.err.println("Error while closing the parser of "+xmlFile+" : "+e);
		}
		try {
			if (inputData != null) inputData.close();
		} catch (Exception e) {
			System.err.println("Error while closing "+xmlFile+" : "+e);
		}
	}

	/**
	 * Transforms the XML file into the CQP file.
	 *
	 * If an exception occurs while parsing or writing, the XML file is copied
	 * in the "compiler-error" directory next to the CQP file.
	 *
	 * @return true if no error occurred and at least one word element was read
	 */
	public boolean process()
	{
		if (!createOutput(cqpFile)) {
			// the constructor already opened the input stream and the parser: do not leak them
			closeResources();
			return false;
		}

		String vAna = "";
		String vForm = "";
		// NOTE(review): wordid is not reset between words, a word without id gets the id of the previous word
		String wordid = "";

		def ncounts = [:] // contains the n values per tags with no attribute

		boolean flagWord = false;
		boolean flagForm = false;
		boolean flagAna = false;

		// NOTE(review): anatype is not reset between annotations, an annotation without type reuses the previous type
		String anatype = "";
		String anavalue = "";
		boolean foundtei = false;
		boolean foundtext = false;
		int nWords = 0;
		try {
			String localname;
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
			{
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						localname = parser.getLocalName().toLowerCase();
						if ("tei".equals(localname)) foundtei = true;
						switch (localname) {
							case "text":
								foundtext = true;
								output.write("<text id=\""+textname+"\" base=\""+corpusname+QUOTE + " project=\""+projectname+QUOTE);
								// the "id" attribute is already written with the text name
								writeAttributes(true);
								output.write(">\n");
								break;

							case WTAG:
								for (int i = 0 ; i < parser.getAttributeCount(); i++) {
									if (parser.getAttributeLocalName(i).equals(ID)) {
										wordid = parser.getAttributeValue(i);
									}
								}
								anavalues = [:];
								flagWord = true
								nWords++
								break;

							case FORM:
								flagForm = true;
								vForm = "";
								vAna = "";
								break;

							case ANA:
								flagAna = true;
								anavalue = "";
								for (int i = 0 ; i < parser.getAttributeCount(); i++) {
									if (TYPE.equals(parser.getAttributeLocalName(i))) {
										anatype = parser.getAttributeValue(i).substring(1);//remove the #
										break;
									}
								}
								break;

							default:
								// structures are only written inside the text of the TEI document
								if (!foundtei || !foundtext) break;

								output.write("<"+localname);
								writeAttributes(false);
								if (parser.getAttributeCount() == 0) { // add the n attribute
									if (!ncounts.containsKey(localname)) ncounts.put(localname, 0);
									int ncount = ncounts.get(localname);
									ncounts.put(localname, ncount+1);
									output.write(" n=\""+ncount+QUOTE)
								}
								output.write(">\n");
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						localname = parser.getLocalName().toLowerCase();
						switch (localname) {
							case WTAG:
								// one column per declared annotation type, empty if the word has no value for it
								for (String type : anatypes) {
									def v = anavalues.get(type);
									if (v != null) vAna +=TAB+v;
									else vAna +=TAB;
								}
								// the form is written as XML content: "&" and "<" must be escaped
								vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;");
								output.write(vForm+TAB+wordid+vAna+"\n");
								vAna = "";
								vForm = "";
								flagWord = false;
								break;

							case "tei":
								foundtei = false;
								break;
							case "text":
								output.write("</text>\n");
								foundtext = false;
								break;
							case FORM:
								flagForm = false;
								break;
							case ANA:
								anavalues.put(anatype, anavalue)
								flagAna = false;
								break;
							default:
								if (!foundtei || !foundtext) break;

								output.write("</"+localname+">\n");
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						if (!foundtei || !foundtext) break;
						if (flagWord) {
							if (flagForm) {
								if (normalizeFormValues) {
									vForm += parser.getText().trim();
								} else {
									vForm += parser.getText();
								}
							}
							if (flagAna) {
								if (normalizeAnaValues)
									anavalue += parser.getText().trim();
								else
									anavalue += parser.getText();
							}
						}
						break;
				}
			}

			// closed here so that a failure of the final flush is reported as a compilation error
			output.close();
		} catch (Exception ex) {
			System.out.println("Exception while parsing " + inputData+" of Text "+textname+" : "+ex);
			File errorDir = null
			try {
				errorDir = new File(cqpFile.getParentFile(), "compiler-error")
				println "Warning: Moving $xmlFile to $errorDir"
				errorDir.mkdir();
				FileCopy.copy(xmlFile, new File(errorDir, xmlFile.getName()))
			} catch(Exception eCopy) {
				println "Error while moving "+xmlFile+" to "+errorDir+" : "+eCopy
			}
			return false;
		} finally {
			// also releases the writer when the parsing failed
			closeResources();
		}
		if (nWords == 0) {
			println "** no words written."
		}
		return nWords > 0;
	}
}