// Source: root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZCompilerStep.groovy @ revision 2288 (7.3 kB)

1
package org.txm.scripts.importer.xtz;
2

    
3
import java.io.File;
4
import javax.xml.stream.*
5
import java.net.URL
6
import java.util.HashMap;
7
import org.txm.utils.*
8
import org.txm.utils.io.*
9
import org.txm.importer.xtz.*
10

    
11
/**
 * Compiles the CQP file of ONE text.
 *
 * The XML file given to the constructor is read with a StAX parser and a
 * line-oriented file is written to the CQP file: one line per word element
 * (form, word id, then one column per entry of {@code anatypes}, separated
 * by tabs), and one opening/closing tag line for every other element found
 * inside the "tei" and "text" elements.
 *
 * An instance is single-use: {@link #process()} closes the parser and the
 * input stream that the constructor opened.
 *
 * @author mdecorde
 */
public class XTZCompilerStep extends Step {

	static String FORM = "form";
	static String ANA = "ana";
	static String ID = "id";
	static String TYPE = "type";
	static String TAB = "\t";
	static String QUOTE = "\"";

	File xmlFile
	File cqpFile
	String textname, corpusname, projectname;
	boolean normalizeAttributeValues = false;
	boolean normalizeAnaValues = true;
	boolean normalizeFormValues = true;

	def inputData;
	XMLInputFactory factory;
	XMLStreamReader parser;
	OutputStreamWriter output;

	/** Values of the "ana" elements of the word being read, keyed by ana type. */
	def anavalues = [:];
	/** Ana types to write for each word, in column order. */
	def anatypes;

	/** Name of the element that holds one word. */
	String WTAG = "w"

	/**
	 * @param n if true, attribute values are trimmed before being written
	 */
	public void setNormalizeAttributeValues(boolean n) {
		this.normalizeAttributeValues = n;
	}

	/**
	 * @param n if true, each text chunk of an "ana" element is trimmed
	 */
	public void setNormalizeAnaValues(boolean n) {
		this.normalizeAnaValues = n;
	}

	/**
	 * @param n if true, each text chunk of a "form" element is trimmed
	 */
	public void setNormalizeFormValues(boolean n) {
		this.normalizeFormValues = n;
	}

	/**
	 * Stores the parameters and opens a StAX parser on the XML file.
	 *
	 * A failure to open the file or create the parser is only reported on
	 * System.err; in that case {@link #process()} fails and returns false.
	 *
	 * @param xmlFile the XML file to read
	 * @param cqpFile the file to write
	 * @param textname written as the "id" attribute of the text tag
	 * @param corpusname written as the "base" attribute of the text tag
	 * @param projectname written as the "project" attribute of the text tag
	 * @param anatypes the ana types to write for each word, in column order
	 * @param wtag the name of the word element
	 */
	public XTZCompilerStep(File xmlFile, File cqpFile, String textname, String corpusname, String projectname, def anatypes, def wtag) {
		this.xmlFile = xmlFile;
		this.cqpFile = cqpFile;
		this.textname = textname
		this.corpusname = corpusname;
		this.projectname = projectname;
		this.anatypes = anatypes;
		this.WTAG = wtag

		try {
			inputData = xmlFile.toURI().toURL().openStream();
			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);
		} catch (Exception ex) {
			System.err.println("Exception while parsing $xmlFile : "+ex);
		}
	}

	/**
	 * Opens the UTF-8 writer used by {@link #process()}.
	 *
	 * @param f the file to write
	 * @return true if the writer was opened, false if an exception occurred
	 */
	private boolean createOutput(File f) {
		try {
			output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f)) , "UTF-8");
			return true;
		} catch (Exception e) {
			System.err.println(e);
			return false;
		}
	}

	/**
	 * Writes the attributes of the element the parser is positioned on, as
	 * {@code  name="value"} pairs with lower-cased names.
	 *
	 * NOTE(review): values are written as returned by the parser, without
	 * escaping quotes, ampersands or angle brackets - confirm that the
	 * consumer of the CQP file tolerates this.
	 *
	 * @param skipId if true, the attribute named {@link #ID} is not written
	 */
	private void writeAttributes(boolean skipId) {
		int count = parser.getAttributeCount();
		for (int i = 0 ; i < count ; i++) {
			String attrname = parser.getAttributeLocalName(i);
			String attrvalue = parser.getAttributeValue(i);

			if (normalizeAttributeValues) {
				attrvalue = attrvalue.trim();
			}
			if (skipId && ID.equals(attrname)) {
				continue;
			}
			output.write(" "+attrname.toLowerCase(Locale.ROOT)+"=\""+attrvalue+QUOTE);
		}
	}

	/**
	 * Closes the writer, the parser and the input stream. Each close is
	 * guarded separately so that one failure does not leak the others.
	 */
	private void closeResources() {
		try {
			if (output != null) output.close();
		} catch (Exception e) {
			System.err.println("Error while closing "+cqpFile+" : "+e);
		}
		try {
			if (parser != null) parser.close();
		} catch (Exception e) {
			System.err.println("Error while closing the parser of "+xmlFile+" : "+e);
		}
		try {
			if (inputData != null) inputData.close();
		} catch (Exception e) {
			System.err.println("Error while closing "+xmlFile+" : "+e);
		}
	}

	/**
	 * Transforms the XML file into the CQP file.
	 *
	 * If an exception occurs, the XML file is copied to a "compiler-error"
	 * directory next to the CQP file. All resources are closed before
	 * returning, whatever the outcome.
	 *
	 * @return true if no exception occurred and at least one word element
	 *         was found, false otherwise
	 */
	public boolean process()
	{
		if (!createOutput(cqpFile)) {
			return false;
		}

		// element names are lower-cased before being compared, so the word tag must be too
		String wtag = (WTAG == null) ? null : WTAG.toString().toLowerCase(Locale.ROOT);

		String vForm = "";
		String wordid = "";

		def ncounts = [:] // contains the n values per tags with no attribute

		boolean flagWord = false;
		boolean flagForm = false;
		boolean flagAna = false;

		String anatype = "";
		String anavalue = "";
		boolean foundtei = false;
		boolean foundtext = false;
		int nWords = 0;
		try {
			if (parser == null) {
				// the constructor could not open the XML file; fail through the common error path
				throw new IllegalStateException("no XML parser available for "+xmlFile);
			}

			String localname;
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
			{
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						// Locale.ROOT: the default locale may lower-case "TEI" to something else than "tei"
						localname = parser.getLocalName().toLowerCase(Locale.ROOT);
						if ("tei".equals(localname)) foundtei = true;
						switch (localname) {
							case "text":
								foundtext = true;
								output.write("<text id=\""+textname+"\" base=\""+corpusname+QUOTE + " project=\""+projectname+QUOTE);
								// the id attribute is already written from textname
								writeAttributes(true);
								output.write(">\n");
								break;

							case wtag:
								// reset so that a word without id does not inherit the previous one
								wordid = "";
								for (int i = 0 ; i < parser.getAttributeCount(); i++) {
									if (ID.equals(parser.getAttributeLocalName(i))) {
										wordid = parser.getAttributeValue(i);
									}
								}
								anavalues = [:];
								flagWord = true
								nWords++
								break;

							case FORM:
								flagForm = true;
								vForm = "";
								break;

							case ANA:
								flagAna = true;
								anavalue = "";
								// reset so that an ana without type does not overwrite the previous type
								anatype = "";
								for (int i = 0 ; i < parser.getAttributeCount(); i++) {
									if (TYPE.equals(parser.getAttributeLocalName(i))) {
										String typevalue = parser.getAttributeValue(i);
										// remove the leading # ; an empty value would make substring fail
										anatype = typevalue.length() > 0 ? typevalue.substring(1) : "";
										break;
									}
								}
								break;

							default:
								if (!foundtei || !foundtext) break;

								output.write("<"+localname);
								writeAttributes(false);
								if (parser.getAttributeCount() == 0) { // add the n attribute
									if (!ncounts.containsKey(localname)) ncounts.put(localname, 0);
									int ncount = ncounts.get(localname);
									ncounts.put(localname, ncount+1);
									output.write(" n=\""+ncount+QUOTE)
								}
								output.write(">\n");
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						localname = parser.getLocalName().toLowerCase(Locale.ROOT);
						switch (localname) {
							case wtag:
								// one column per declared ana type, empty when the word has no such ana
								StringBuilder vAna = new StringBuilder();
								for (String type : anatypes) {
									def v = anavalues.get(type);
									vAna.append(TAB);
									if (v != null) vAna.append(v);
								}
								vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;");
								output.write(vForm+TAB+wordid+vAna+"\n");
								vForm = "";
								flagWord = false;
								break;

							case "tei":
								foundtei = false;
								break;
							case "text":
								output.write("</text>\n");
								foundtext = false;
								break;
							case FORM:
								flagForm = false;
								break;
							case ANA:
								anavalues.put(anatype, anavalue)
								flagAna = false;
								break;
							default:
								if (!foundtei || !foundtext) break;

								output.write("</"+localname+">\n");
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						if (!foundtei || !foundtext) break;
						if (flagWord) {
							if (flagForm) {
								vForm += normalizeFormValues ? parser.getText().trim() : parser.getText();
							}
							if (flagAna) {
								anavalue += normalizeAnaValues ? parser.getText().trim() : parser.getText();
							}
						}
						break;
				}
			}

			// closed here so that a failure to flush the last bytes is reported as an error
			output.close();
		} catch (Exception ex) {
			System.out.println("Exception while parsing " + inputData+" of Text "+textname+" : "+ex);
			File errorDir = null
			try {
				errorDir = new File(cqpFile.getParentFile(), "compiler-error")
				println "Warning: Moving $xmlFile to $errorDir"
				errorDir.mkdir();
				FileCopy.copy(xmlFile, new File(errorDir, xmlFile.getName()))
			} catch(Exception eCopy) {
				println "Error while moving "+xmlFile+" to "+errorDir+" : "+eCopy
			}
			return false;
		} finally {
			closeResources();
		}

		if (nWords == 0) {
			println "** no words written."
		}
		return nWords > 0;
	}
}