Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZCompilerStep.groovy @ 2126

History | View | Annotate | Download (7.3 kB)

1 986 mdecorde
package org.txm.scripts.importer.xtz;
2 321 mdecorde
3 321 mdecorde
import java.io.File;
4 321 mdecorde
import javax.xml.stream.*
5 321 mdecorde
import java.net.URL
6 321 mdecorde
import java.util.HashMap;
7 321 mdecorde
import org.txm.utils.*
8 1000 mdecorde
import org.txm.utils.io.*
9 1000 mdecorde
import org.txm.importer.xtz.*
10 321 mdecorde
11 1137 mdecorde
/**
 * Compiles the CQP file of ONE text.
 *
 * The XML source file is read with a StAX stream reader and rewritten to the
 * CQP file: structure elements are written as opening/closing tags (one per
 * line) and each word element is written as one tab-separated line holding
 * the form, the word id and one value per analysis type.
 *
 * @author mdecorde
 *
 */
public class XTZCompilerStep extends Step {

	static String FORM = "form";
	static String ANA = "ana";
	static String ID = "id";
	static String TYPE = "type";
	static String TAB = "\t";
	static String QUOTE = "\"";

	/** The XML source file of the text. */
	File xmlFile
	/** The CQP file written by {@link #process()}. */
	File cqpFile
	String textname, corpusname, projectname;
	/** When true, attribute values of structure elements are trimmed. */
	boolean normalizeAttributeValues = false;
	/** When true, the text chunks of analysis values are trimmed. */
	boolean normalizeAnaValues = true;
	/** When true, the text chunks of word forms are trimmed. */
	boolean normalizeFormValues = true;

	def inputData;
	XMLInputFactory factory;
	XMLStreamReader parser;
	OutputStreamWriter output;

	/** Analysis values of the word being read, keyed by analysis type. */
	def anavalues = [:];
	/** Analysis types to write for each word, in column order. */
	def anatypes;

	/**
	 * Name of the word element.
	 * NOTE(review): element names are lower-cased before being compared to this
	 * value, so a tag name containing upper-case letters would never match.
	 */
	String WTAG = "w"

	public void setNormalizeAttributeValues(boolean n) {
		this.normalizeAttributeValues = n;
	}

	public void setNormalizeAnaValues(boolean n) {
		this.normalizeAnaValues = n;
	}

	public void setNormalizeFormValues(boolean n) {
		this.normalizeFormValues = n;
	}

	/**
	 * Stores the parameters and opens the StAX reader on the XML file.
	 *
	 * If the reader cannot be created the error is printed on System.err and
	 * the parser is left unset; {@link #process()} then fails and returns false.
	 *
	 * @param xmlFile the XML source file
	 * @param cqpFile the CQP file to write
	 * @param textname written as the "id" attribute of the text structure
	 * @param corpusname written as the "base" attribute of the text structure
	 * @param projectname written as the "project" attribute of the text structure
	 * @param anatypes the analysis types to write for each word
	 * @param wtag the name of the word element
	 */
	public XTZCompilerStep(File xmlFile, File cqpFile, String textname, String corpusname, String projectname, def anatypes, def wtag) {
		this.xmlFile = xmlFile;
		this.cqpFile = cqpFile;
		this.textname = textname
		this.corpusname = corpusname;
		this.projectname = projectname;
		this.anatypes = anatypes;
		this.WTAG = wtag

		try {
			inputData = xmlFile.toURI().toURL().openStream();
			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);

		} catch (Exception ex) {
			System.err.println("Exception while parsing $xmlFile : "+ex);
		}
	}

	/**
	 * Creates the UTF-8 output writer.
	 *
	 * @param f the file to write
	 * @return true, if successful
	 */
	private boolean createOutput(File f) {
		try {
			output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f)) , "UTF-8");
			return true;
		} catch (Exception e) {
			System.err.println(e);
			return false;
		}
	}

	/**
	 * Closes the output writer, the parser and the input stream.
	 * A failure to close one resource is reported and does not prevent the
	 * others from being closed.
	 */
	private void releaseResources() {
		try {
			if (output != null) output.close();
		} catch (Exception e) {
			System.err.println("Warning: failed to close "+cqpFile+" : "+e);
		}
		try {
			if (parser != null) parser.close();
		} catch (Exception e) {
			System.err.println("Warning: failed to close the parser of "+xmlFile+" : "+e);
		}
		try {
			if (inputData != null) inputData.close();
		} catch (Exception e) {
			System.err.println("Warning: failed to close "+xmlFile+" : "+e);
		}
	}

	/**
	 * Writes the attributes of the current element, with lower-cased names.
	 *
	 * @param skipId when true, the attribute named "id" is not written
	 */
	private void writeAttributes(boolean skipId) {
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
			String attrname = parser.getAttributeLocalName(i);
			String attrvalue = parser.getAttributeValue(i)

			if (normalizeAttributeValues)
				attrvalue = attrvalue.trim();

			if (skipId && ID.equals(attrname))
				continue;

			output.write(" "+attrname.toLowerCase()+"=\""+attrvalue+QUOTE)
		}
	}

	/**
	 * Transforms the XML file into the CQP file.
	 *
	 * On error the message is printed, the XML file is copied to a
	 * "compiler-error" directory next to the CQP file and false is returned.
	 * All streams are closed whatever the outcome.
	 *
	 * @return true if no error occurred and at least one word element was read
	 */
	public boolean process()
	{
		if (!createOutput(cqpFile)) {
			// nothing will be read: do not keep the input stream and parser open
			releaseResources();
			return false;
		}

		String vAna = "";
		String vForm = "";
		String wordid= "";

		def ncounts = [:] // contains the n values per tags with no attribute

		boolean flagWord = false;
		boolean flagForm = false;
		boolean flagAna = false;

		String anatype = "";
		String anavalue = "";
		boolean foundtei = false;
		boolean foundtext = false;
		int nWords = 0;
		try {
			String localname;
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
			{
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						localname = parser.getLocalName().toLowerCase();
						if ("tei".equals(localname)) foundtei = true;
						switch (localname) {
							case "text":
								foundtext = true;
								output.write("<text id=\""+textname+"\" base=\""+corpusname+QUOTE + " project=\""+projectname+QUOTE);
								// the "id" attribute is already written from textname
								writeAttributes(true);
								output.write(">\n");

								break;

							case WTAG:
								for (int i = 0 ; i < parser.getAttributeCount(); i++) {
									if (parser.getAttributeLocalName(i).equals(ID)) {
										wordid = parser.getAttributeValue(i);
									}
								}
								anavalues = [:];
								flagWord = true
								nWords++
								break;
							case FORM:
								flagForm = true;
								vForm = "";
								vAna = "";
								break;

							case ANA:
								flagAna = true;
								anavalue = "";
								for (int i = 0 ; i < parser.getAttributeCount(); i++) {
									if (TYPE.equals(parser.getAttributeLocalName(i))) {
										String typeref = parser.getAttributeValue(i);
										// remove the leading # ; a value without # is kept as is
										anatype = typeref.startsWith("#") ? typeref.substring(1) : typeref;
										break;
									}
								}
								break;

							default:
								// structures are only written inside the text of a TEI document
								if (!foundtei || !foundtext) break;

								output.write("<"+localname);
								writeAttributes(false);

								if (parser.getAttributeCount() == 0) { // add the n attribute
									if (!ncounts.containsKey(localname)) ncounts.put(localname, 0);
									int ncount = ncounts.get(localname);
									ncounts.put(localname, ncount+1);
									output.write(" n=\""+ncount+QUOTE)
								}
								output.write(">\n");
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						localname = parser.getLocalName().toLowerCase();
						switch (localname) {
							case WTAG:
								// one column per analysis type, empty when the word has no value
								for (String type : anatypes) {
									def v = anavalues.get(type);
									if (v != null) vAna +=TAB+v;
									else vAna +=TAB;
								}
								vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;");
								output.write(vForm+TAB+wordid+vAna+"\n");

								vAna = "";
								vForm = "";
								flagWord = false;
								break;

							case "tei":
								foundtei = false;
								break;
							case "text":
								output.write("</text>\n");
								foundtext = false;
								break;
							case FORM:
								flagForm = false;
								break;
							case ANA:
								anavalues.put(anatype, anavalue)
								flagAna = false;
								break;
							default:
								if (!foundtei || !foundtext) break;

								output.write("</"+localname+">\n");
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						if (!foundtei || !foundtext) break;
						if (flagWord) {
							if (flagForm) {
								if (normalizeFormValues) {
									vForm += parser.getText().trim();
								} else {
									vForm += parser.getText();
								}
							}
							if (flagAna) {
								if (normalizeAnaValues)
									anavalue += parser.getText().trim();
								else
									anavalue += parser.getText();
							}
						}
						break;
				}
			}

			// closed here so that a flush failure is reported as a compilation error
			output.close();
		} catch (Exception ex) {
			System.out.println("Exception while parsing " + inputData+" of Text "+textname+" : "+ex);
			File errorDir = null
			try {
				errorDir = new File(cqpFile.getParentFile(), "compiler-error")
				println "Warning: Moving $xmlFile to $errorDir"
				errorDir.mkdir();
				FileCopy.copy(xmlFile, new File(errorDir, xmlFile.getName()))
			} catch(Exception eCopy) {
				println "Error while moving "+xmlFile+" to "+errorDir+" : "+eCopy
			}
			return false;
		} finally {
			// closing an already closed writer has no effect
			releaseResources();
		}
		if (nWords == 0) {
			println "** no words written."
		}
		return nWords > 0;
	}
}