Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZCompilerStep.groovy @ 1137

History | View | Annotate | Download (6.9 kB)

1
package org.txm.scripts.importer.xtz;
2

    
3
import java.io.File;
4
import javax.xml.stream.*
5
import java.net.URL
6
import java.util.HashMap;
7
import org.txm.utils.*
8
import org.txm.utils.io.*
9
import org.txm.importer.xtz.*
10

    
11
/**
 * Compiles the CQP file of ONE text.
 *
 * The XML file is read with a StAX parser and rewritten to the CQP file:
 * structural elements are written as tags (one per line) and each word
 * element is written as one tab-separated line made of the form, the word id
 * and one column per analysis type listed in {@code anatypes}.
 *
 * @author mdecorde
 *
 */
public class XTZCompilerStep extends Step {

    // Local names of the elements and attributes handled by the parser loop.
    static String FORM = "form";
    static String ANA = "ana";
    static String ID = "id";
    static String TYPE = "type";
    static String TAB = "\t";
    static String QUOTE = "\"";

    File xmlFile
    File cqpFile
    String textname, corpusname, projectname;
    boolean normalizeAttributeValues = false;
    boolean normalizeAnaValues = true;
    boolean normalizeFormValues = true;

    def inputData;
    XMLInputFactory factory;
    XMLStreamReader parser;
    OutputStreamWriter output;

    // Analysis values of the word being read, indexed by analysis type.
    def anavalues = [:];
    // Analysis types to write, in column order.
    def anatypes;

    // Local name of the word element. It is compared with lower-cased
    // element names, so a value containing upper-case letters never matches.
    String WTAG = "w"

    /**
     * Sets whether structure attribute values are trimmed before being written.
     *
     * @param n true to trim the values
     */
    public void setNormalizeAttributeValues(boolean n) {
        this.normalizeAttributeValues = n;
    }

    /**
     * Sets whether the text chunks of the analysis values are trimmed.
     *
     * @param n true to trim the values
     */
    public void setNormalizeAnaValues(boolean n) {
        this.normalizeAnaValues = n;
    }

    /**
     * Sets whether the text chunks of the word forms are trimmed.
     *
     * @param n true to trim the values
     */
    public void setNormalizeFormValues(boolean n) {
        this.normalizeFormValues = n;
    }

    /**
     * Stores the parameters and opens a StAX parser on the XML file.
     *
     * A failure to open the file or to create the parser is only reported on
     * the error stream; {@link #process()} then fails and returns false
     * because the parser is missing.
     *
     * @param xmlFile the XML file to read
     * @param cqpFile the CQP file to write
     * @param textname the value of the "id" attribute of the written text tag
     * @param corpusname the value of the "base" attribute of the written text tag
     * @param projectname the value of the "project" attribute of the written text tag
     * @param anatypes the analysis types to write, in column order
     * @param wtag the local name of the word element
     */
    public XTZCompilerStep(File xmlFile, File cqpFile, String textname, String corpusname, String projectname, def anatypes, def wtag) {
        this.xmlFile = xmlFile;
        this.cqpFile = cqpFile;
        this.textname = textname
        this.corpusname = corpusname;
        this.projectname = projectname;
        this.anatypes = anatypes;
        this.WTAG = wtag

        try {
            inputData = xmlFile.toURI().toURL().openStream();
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);

        } catch (Exception ex) {
            System.err.println("Exception while parsing $xmlFile : "+ex);
        }
    }

    /**
     * Creates the output writer (UTF-8) on the given file.
     *
     * @param f the file to write
     * @return true, if successful
     */
    private boolean createOutput(File f) {
        try {
            output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f)) , "UTF-8");
            return true;
        } catch (Exception e) {
            System.err.println(e);
            return false;
        }
    }

    /**
     * Releases the writer, the parser and the input stream.
     *
     * Failures are ignored on purpose: this is a best-effort cleanup done
     * after the result of the processing is already known. Closing the
     * XMLStreamReader does not close its underlying input source, which is
     * why the input stream is closed explicitly.
     */
    private void closeResources() {
        try { if (output != null) output.close(); } catch (Exception ignored) { }
        try { if (parser != null) parser.close(); } catch (Exception ignored) { }
        try { if (inputData != null) inputData.close(); } catch (Exception ignored) { }
    }

    /**
     * Writes the attributes of the current element as name="value" pairs.
     * Names are lower-cased; values are trimmed when normalizeAttributeValues is set.
     *
     * @param skipId true to leave out the attribute whose local name is "id"
     */
    private void writeAttributes(boolean skipId) {
        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
            String attrname = parser.getAttributeLocalName(i);
            String attrvalue = parser.getAttributeValue(i)

            if (normalizeAttributeValues)
                attrvalue = attrvalue.trim();

            if (skipId && ID.equals(attrname)) continue;

            output.write(" "+attrname.toLowerCase()+"=\""+attrvalue+QUOTE)
        }
    }

    /**
     * Transforms the XML file into the CQP file.
     *
     * When the parsing fails, the XML file is copied to the "compiler-error"
     * directory next to the CQP file. The writer, the parser and the input
     * stream are released in every case.
     *
     * @return true, if successful
     */
    public boolean process()
    {
        if (!createOutput(cqpFile)) {
            closeResources(); // the parser and its stream were opened by the constructor
            return false;
        }

        String vAna = "";
        String vForm = "";
        String wordid= "";

        def ncounts = [:] // contains the n values per tags with no attribute

        boolean flagWord = false;
        boolean flagForm = false;
        boolean flagAna = false;

        String anatype = "";
        String anavalue = "";
        boolean foundtei = false;
        boolean foundtext = false;

        try {
            String localname;
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
            {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        localname = parser.getLocalName().toLowerCase();
                        if ("tei".equals(localname)) foundtei = true;
                        switch (localname) {
                            case "text":
                                foundtext = true;
                                output.write("<text id=\""+textname+"\" base=\""+corpusname+QUOTE + " project=\""+projectname+QUOTE);
                                // the id is already given by textname
                                writeAttributes(true);
                                output.write(">\n");
                                break;

                            case WTAG:
                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
                                    if (parser.getAttributeLocalName(i).equals(ID)) {
                                        wordid = parser.getAttributeValue(i);
                                    }
                                }
                                anavalues = [:];
                                flagWord = true
                                break;

                            case FORM:
                                flagForm = true;
                                vForm = "";
                                vAna = "";
                                break;

                            case ANA:
                                flagAna = true;
                                anavalue = "";
                                // forget the type of the previous ana: an ana without
                                // type must not overwrite the previous analysis value
                                anatype = "";
                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
                                    if (TYPE.equals(parser.getAttributeLocalName(i))) {
                                        anatype = parser.getAttributeValue(i).substring(1);//remove the #
                                        break;
                                    }
                                }
                                break;

                            default:
                                // inside a TEI document, nothing is written before the text element
                                if (foundtei && !foundtext) break;

                                output.write("<"+localname);
                                writeAttributes(false);
                                if (parser.getAttributeCount() == 0) { // add the n attribute
                                    if (!ncounts.containsKey(localname)) ncounts.put(localname, 0);
                                    int ncount = ncounts.get(localname);
                                    ncounts.put(localname, ncount+1);
                                    output.write(" n=\""+ncount+QUOTE)
                                }
                                output.write(">\n");
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        localname = parser.getLocalName().toLowerCase();
                        switch (localname) {
                            case WTAG:
                                // one column per analysis type, empty when the word has no value for it
                                for (String type : anatypes) {
                                    def v = anavalues.get(type);
                                    if (v != null) vAna +=TAB+v;
                                    else vAna +=TAB;
                                }
                                vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;");
                                output.write(vForm+TAB+wordid+vAna+"\n");
                                vAna = "";
                                vForm = "";
                                flagWord = false;
                                break;

                            case "tei":
                                break;
                            case FORM:
                                flagForm = false;
                                break;
                            case ANA:
                                anavalues.put(anatype, anavalue)
                                flagAna = false;
                                break;
                            default:
                                if (foundtei && !foundtext) break;

                                output.write("</"+localname+">\n");
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        // text is only kept inside the form and ana elements of a word
                        if (flagWord) {
                            if (flagForm) {
                                if (normalizeFormValues) {
                                    vForm += parser.getText().trim();
                                } else {
                                    vForm += parser.getText();
                                }
                            }
                            if (flagAna) {
                                if (normalizeAnaValues)
                                    anavalue += parser.getText().trim();
                                else
                                    anavalue += parser.getText();
                            }
                        }
                        break;
                }
            }

            // closed here so that a failure to flush the output is reported as an error
            output.close();
            parser.close();
        } catch (Exception ex) {
            System.out.println("Exception while parsing " + inputData+" of Text "+textname+" : "+ex);
            File errorDir = null
            try {
                errorDir = new File(cqpFile.getParentFile(), "compiler-error")
                println "Warning: Moving $xmlFile to $errorDir"
                errorDir.mkdir();
                FileCopy.copy(xmlFile, new File(errorDir, xmlFile.getName()))
            } catch(Exception eCopy) {
                println "Error while moving "+xmlFile+" to "+errorDir+" : "+eCopy
            }
            return false;
        } finally {
            closeResources();
        }
        return true;
    }
}