Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZCompilerStep.groovy @ 2126

History | View | Annotate | Download (7.3 kB)

1
package org.txm.scripts.importer.xtz;
2

    
3
import java.io.File;
4
import javax.xml.stream.*
5
import java.net.URL
6
import java.util.HashMap;
7
import org.txm.utils.*
8
import org.txm.utils.io.*
9
import org.txm.importer.xtz.*
10

    
11
/**
 * Compiles the CQP file of ONE text.
 *
 * The XML file is read with a StAX stream reader and a CQP source file is
 * written: one line per word element (form, word id, then one column per
 * requested ana type) and one line per opening/closing structural element.
 *
 * An instance is meant to be used once: the XML input is opened by the
 * constructor and every resource is released when process() returns.
 *
 * @author mdecorde
 */
public class XTZCompilerStep extends Step {

    static String FORM = "form";
    static String ANA = "ana";
    static String ID = "id";
    static String TYPE = "type";
    static String TAB = "\t";
    static String QUOTE = "\"";

    File xmlFile
    File cqpFile
    String textname, corpusname, projectname;
    // when true, structural attribute values are trimmed before being written
    boolean normalizeAttributeValues = false;
    // when true, each text chunk read inside an ana element is trimmed
    boolean normalizeAnaValues = true;
    // when true, each text chunk read inside a form element is trimmed
    boolean normalizeFormValues = true;

    def inputData;
    XMLInputFactory factory;
    XMLStreamReader parser;
    OutputStreamWriter output;

    // ana values of the word currently being read, indexed by ana type
    def anavalues = [:];
    // ana types to write; iterated in order to build the columns of each word line
    def anatypes;

    // name of the element holding a word; matched against the lower-cased element name
    String WTAG = "w"

    public void setNormalizeAttributeValues(boolean n) {
        this.normalizeAttributeValues = n;
    }

    public void setNormalizeAnaValues(boolean n) {
        this.normalizeAnaValues = n;
    }

    public void setNormalizeFormValues(boolean n) {
        this.normalizeFormValues = n;
    }

    /**
     * Stores the parameters and opens the XML input.
     *
     * A failure to open or parse the XML file is only reported on System.err;
     * the parser field is then left null.
     *
     * @param xmlFile the XML file to read
     * @param cqpFile the CQP file to write
     * @param textname written as the "id" attribute of the text structure
     * @param corpusname written as the "base" attribute of the text structure
     * @param projectname written as the "project" attribute of the text structure
     * @param anatypes the ana types to write, one column per type, in iteration order
     * @param wtag the name of the word element
     */
    public XTZCompilerStep(File xmlFile, File cqpFile, String textname, String corpusname, String projectname, def anatypes, def wtag) {
        this.xmlFile = xmlFile;
        this.cqpFile = cqpFile;
        this.textname = textname
        this.corpusname = corpusname;
        this.projectname = projectname;
        this.anatypes = anatypes;
        this.WTAG = wtag

        try {
            inputData = xmlFile.toURI().toURL().openStream();
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);

        } catch (Exception ex) {
            System.err.println("Exception while parsing $xmlFile : "+ex);
        }
    }

    /**
     * Opens the UTF-8 writer used to produce the CQP file.
     *
     * @param f the file to write
     * @return true if the writer was opened, false otherwise (the error is printed on System.err)
     */
    private boolean createOutput(File f) {
        try {
            output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f)) , "UTF-8");
            return true;
        } catch (Exception e) {
            System.err.println(e);
            return false;
        }
    }

    /**
     * Releases the writer, the XML parser and the XML input stream.
     *
     * Safe to call in any state: null resources are skipped and close failures
     * are ignored, so cleanup can never mask the result of process().
     */
    private void closeResources() {
        try {
            if (output != null) output.close();
        } catch (Exception ignored) {
            // best-effort cleanup: the outcome of the compilation has already been decided
        }
        try {
            if (parser != null) parser.close();
        } catch (Exception ignored) {
            // best-effort cleanup
        }
        try {
            // closed separately from the parser (the original code did the same)
            if (inputData != null) inputData.close();
        } catch (Exception ignored) {
            // best-effort cleanup
        }
    }

    /**
     * Writes the attributes of the current start element as ' name="value"' pairs.
     *
     * Attribute names are lower-cased; values are trimmed when
     * normalizeAttributeValues is set.
     * NOTE(review): values are written as-is, without escaping quotes or markup
     * characters - confirm that the CQP encoder tolerates this.
     *
     * @param skipId when true the "id" attribute is not written
     */
    private void writeAttributes(boolean skipId) {
        int count = parser.getAttributeCount();
        for (int i = 0 ; i < count ; i++) {
            String attrname = parser.getAttributeLocalName(i);
            String attrvalue = parser.getAttributeValue(i)

            if (normalizeAttributeValues) {
                attrvalue = attrvalue.trim();
            }
            if (skipId && attrname == ID) {
                continue;
            }
            output.write(" "+attrname.toLowerCase()+"=\""+attrvalue+QUOTE)
        }
    }

    /**
     * Transforms the XML file into the CQP file.
     *
     * On a parsing or writing error the XML file is copied to a
     * "compiler-error" directory created next to the CQP file.
     * The writer, the parser and the input stream are always closed before
     * returning.
     *
     * @return true if at least one word was written, false if no word was
     *         found, if the output could not be created or if an error occurred
     */
    public boolean process()
    {
        if (!createOutput(cqpFile)) {
            // the XML input opened by the constructor must not stay open
            closeResources();
            return false;
        }

        String vAna = "";
        String vForm = "";
        String wordid = "";

        def ncounts = [:] // contains the n values per tags with no attribute

        boolean flagWord = false;
        boolean flagForm = false;
        boolean flagAna = false;

        String anatype = "";
        String anavalue = "";
        boolean foundtei = false;
        boolean foundtext = false;
        int nWords = 0;
        try {
            String localname;
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
            {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        localname = parser.getLocalName().toLowerCase();
                        if ("tei".equals(localname)) foundtei = true;
                        switch (localname) {
                            case "text":
                                foundtext = true;
                                output.write("<text id=\""+textname+"\" base=\""+corpusname+QUOTE + " project=\""+projectname+QUOTE);
                                // the text id is already set from textname
                                writeAttributes(true);
                                output.write(">\n");
                                break;

                            case WTAG:
                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
                                    if (parser.getAttributeLocalName(i).equals(ID)) {
                                        wordid = parser.getAttributeValue(i);
                                    }
                                }
                                anavalues = [:];
                                flagWord = true
                                nWords++
                                break;

                            case FORM:
                                flagForm = true;
                                vForm = "";
                                vAna = "";
                                break;

                            case ANA:
                                flagAna = true;
                                anavalue = "";
                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
                                    if (TYPE.equals(parser.getAttributeLocalName(i))) {
                                        // NOTE(review): the first character is dropped unconditionally,
                                        // assuming the type always starts with '#' - confirm
                                        anatype = parser.getAttributeValue(i).substring(1);//remove the #
                                        break;
                                    }
                                }
                                break;

                            default:
                                // structures are only written inside both tei and text
                                if (!foundtei || !foundtext) break;

                                output.write("<"+localname);
                                writeAttributes(false);
                                if (parser.getAttributeCount() == 0) { // add the n attribute
                                    if (!ncounts.containsKey(localname)) ncounts.put(localname, 0);
                                    int ncount = ncounts.get(localname);
                                    ncounts.put(localname, ncount+1);
                                    output.write(" n=\""+ncount+QUOTE)
                                }
                                output.write(">\n");
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        localname = parser.getLocalName().toLowerCase();
                        switch (localname) {
                            case WTAG:
                                // one column per requested ana type, empty when the word has no such ana
                                for (String type : anatypes) {
                                    def v = anavalues.get(type);
                                    if (v != null) vAna +=TAB+v;
                                    else vAna +=TAB;
                                }
                                vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;");
                                output.write(vForm+TAB+wordid+vAna+"\n");
                                vAna = "";
                                vForm = "";
                                flagWord = false;
                                break;

                            case "tei":
                                foundtei = false;
                                break;
                            case "text":
                                output.write("</text>\n");
                                foundtext = false;
                                break;
                            case FORM:
                                flagForm = false;
                                break;
                            case ANA:
                                anavalues.put(anatype, anavalue)
                                flagAna = false;
                                break;
                            default:
                                if (!foundtei || !foundtext) break;

                                output.write("</"+localname+">\n");
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        if (!foundtei || !foundtext) break;
                        if (flagWord) {
                            if (flagForm) {
                                vForm += normalizeFormValues ? parser.getText().trim() : parser.getText();
                            }
                            if (flagAna) {
                                anavalue += normalizeAnaValues ? parser.getText().trim() : parser.getText();
                            }
                        }
                        break;
                }
            }

            // closed inside the try so that a failure to flush the last buffered
            // lines is still handled as a compilation error
            output.close();
        } catch (Exception ex) {
            System.out.println("Exception while parsing " + inputData+" of Text "+textname+" : "+ex);
            File errorDir = null
            try {
                errorDir = new File(cqpFile.getParentFile(), "compiler-error")
                println "Warning: Moving $xmlFile to $errorDir"
                errorDir.mkdir();
                FileCopy.copy(xmlFile, new File(errorDir, xmlFile.getName()))
            } catch(Exception eCopy) {
                println "Error while moving "+xmlFile+" to "+errorDir+" : "+eCopy
            }
            return false;
        } finally {
            // runs on success and on failure: the writer used to leak on the error path
            closeResources();
        }
        if (nWords == 0) {
            println "** no words written."
        }
        return nWords > 0;
    }
}