Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / xtz / XTZCompilerStep.groovy @ 187

History | View | Annotate | Download (6.8 kB)

1
package org.txm.importer.xtz;
2

    
3
import java.io.File;
4
import javax.xml.stream.*
5
import java.net.URL
6
import java.util.HashMap;
7
import org.txm.utils.*
8

    
9
/**
 * Converts one XML file into a "wtc" text file using a StAX pull parser.
 *
 * For every word element (tag name {@link #WTAG}) one line is written:
 * the content of its {@code form} child, the word id and one column per
 * entry of {@code anatypes} (filled from the {@code ana} children),
 * separated by tabs. Every other element is written as an opening/closing
 * tag line with a lower-cased name. When a {@code tei} element has been
 * seen, elements located before the {@code text} element are not written.
 *
 * An instance is single-use: {@link #process()} consumes the parser created
 * by the constructor and releases all resources when it returns.
 */
public class XTZCompilerStep {

    static String FORM = "form";
    static String ANA = "ana";
    static String ID = "id";
    static String TYPE = "type";
    static String TAB = "\t";
    static String QUOTE = "\"";

    File xmlFile
    File wtcFile
    String textname, corpusname, projectname;
    boolean normalizeAttributeValues = false;
    boolean normalizeAnaValues = true;
    boolean normalizeFormValues = true;

    def inputData;
    XMLInputFactory factory;
    XMLStreamReader parser;
    OutputStreamWriter output;

    /** Values of the {@code ana} elements of the current word, keyed by type (without the leading '#'). */
    def anavalues = [:];
    /** Types of the {@code ana} values to write, in column order. */
    def anatypes;

    /** Local name of the word element. */
    String WTAG = "w"

    /**
     * @param n if true, attribute values are trimmed before being written
     */
    public void setNormalizeAttributeValues(boolean n) {
        this.normalizeAttributeValues = n;
    }

    /**
     * @param n if true, each text chunk read inside an {@code ana} element is trimmed
     */
    public void setNormalizeAnaValues(boolean n) {
        this.normalizeAnaValues = n;
    }

    /**
     * @param n if true, each text chunk read inside a {@code form} element is trimmed
     */
    public void setNormalizeFormValues(boolean n) {
        this.normalizeFormValues = n;
    }

    /**
     * Stores the parameters and opens a stream parser on {@code xmlFile}.
     *
     * A failure to open or parse the file is only reported on stderr; the
     * parser is then left unset and {@link #process()} will return false.
     *
     * @param xmlFile the XML file to read
     * @param wtcFile the file to write
     * @param textname written as the {@code id} attribute of the {@code text} tag
     * @param corpusname written as the {@code base} attribute of the {@code text} tag
     * @param projectname written as the {@code project} attribute of the {@code text} tag
     * @param anatypes the {@code ana} types to write for each word, in column order
     * @param wtag the local name of the word element
     */
    public XTZCompilerStep(File xmlFile, File wtcFile, String textname, String corpusname, String projectname, def anatypes, def wtag) {
        this.xmlFile = xmlFile;
        this.wtcFile = wtcFile;
        this.textname = textname
        this.corpusname = corpusname;
        this.projectname = projectname;
        this.anatypes = anatypes;
        this.WTAG = wtag

        try {
            inputData = xmlFile.toURI().toURL().openStream();
            // NOTE(review): the factory is used with its default settings (DTD and
            // external entity support are not disabled) - confirm that input files are trusted.
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (Exception ex) {
            System.err.println("Exception while parsing $xmlFile : "+ex);
            // do not leak the stream when the reader could not be created
            closeQuietly(inputData);
        }
    }

    /**
     * Opens the UTF-8 writer used by {@link #process()}.
     *
     * @param f the file to write
     * @return true if the writer was opened, false otherwise (the error is printed on stderr)
     */
    private boolean createOutput(File f) {
        try {
            output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f)) , "UTF-8");
            return true;
        } catch (Exception e) {
            System.err.println(e);
            return false;
        }
    }

    /**
     * Closes a resource, ignoring a null reference and any error raised by close().
     *
     * @param resource any object with a close() method, may be null
     */
    private static void closeQuietly(def resource) {
        try {
            resource?.close();
        } catch (Exception ignored) {
            // nothing useful can be done when a cleanup close fails
        }
    }

    /**
     * Writes the attributes of the current start element as {@code name="value"}
     * pairs, with lower-cased names.
     *
     * NOTE(review): values are written as returned by the parser, without
     * escaping quotes or '&' - confirm the consumer of the file tolerates this.
     *
     * @param skipId if true, the attribute named {@link #ID} is not written
     */
    private void writeAttributes(boolean skipId) {
        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
            String attrname = parser.getAttributeLocalName(i);
            String attrvalue = parser.getAttributeValue(i);

            if (normalizeAttributeValues)
                attrvalue = attrvalue.trim();

            if (skipId && attrname == ID)
                continue;

            output.write(" "+attrname.toLowerCase()+"=\""+attrvalue+QUOTE)
        }
    }

    /**
     * Copies the source XML file into the "compiler-error" directory created
     * next to the output file. Failures are only printed.
     */
    private void copySourceToErrorDir() {
        File errorDir = null
        try {
            errorDir = new File(wtcFile.getParentFile(), "compiler-error")
            println "Warning: Moving $xmlFile to $errorDir"
            errorDir.mkdir();
            FileCopy.copy(xmlFile, new File(errorDir, xmlFile.getName()))
        } catch(Exception eCopy) {
            println "Error while moving "+xmlFile+" to "+errorDir+" : "+eCopy
        }
    }

    /**
     * Transforms the XML file into the wtc file.
     *
     * The output writer, the parser and the input stream are always released
     * before returning. If an exception occurs while parsing or writing, the
     * source file is copied to the "compiler-error" directory.
     *
     * @return true if the whole document was processed, false if the output
     *         could not be created or an exception occurred
     */
    public boolean process()
    {
        try {
            // inside the try block so that the parser and its stream are released on this exit too
            if (!createOutput(wtcFile))
                return false;

            String vAna = "";
            String vForm = "";
            String wordid= "";

            def ncounts = [:] // contains the n values per tags with no attribute

            boolean flagWord = false;
            boolean flagForm = false;
            boolean flagAna = false;

            String anatype = "";
            String anavalue = "";
            boolean foundtei = false;
            boolean foundtext = false;

            String localname;
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
            {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        localname = parser.getLocalName().toLowerCase();
                        if ("tei".equals(localname)) foundtei = true;
                        switch (localname) {
                            case "text":
                                foundtext = true;
                                output.write("<text id=\""+textname+"\" base=\""+corpusname+QUOTE + " project=\""+projectname+QUOTE);
                                // the id attribute is replaced by the text name written above
                                writeAttributes(true);
                                output.write(">\n");
                                break;

                            case WTAG:
                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
                                    if (parser.getAttributeLocalName(i).equals(ID)) {
                                        wordid = parser.getAttributeValue(i);
                                    }
                                }
                                anavalues = [:];
                                flagWord = true
                                break;

                            case FORM:
                                flagForm = true;
                                vForm = "";
                                vAna = "";
                                break;

                            case ANA:
                                flagAna = true;
                                anavalue = "";
                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
                                    if (TYPE.equals(parser.getAttributeLocalName(i))) {
                                        anatype = parser.getAttributeValue(i).substring(1);//remove the #
                                        break;
                                    }
                                }
                                break;

                            default:
                                // inside a TEI document, nothing is written before the text element
                                if (foundtei && !foundtext) break;

                                output.write("<"+localname);
                                writeAttributes(false);
                                if (parser.getAttributeCount() == 0) { // add the n attribute
                                    if (!ncounts.containsKey(localname)) ncounts.put(localname, 0);
                                    int ncount = ncounts.get(localname);
                                    ncounts.put(localname, ncount+1);
                                    output.write(" n=\""+ncount+QUOTE)
                                }
                                output.write(">\n");
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        localname = parser.getLocalName().toLowerCase();
                        switch (localname) {
                            case WTAG:
                                // one column per requested type, empty when the word has no such value
                                for (String type : anatypes) {
                                    def v = anavalues.get(type);
                                    if (v != null) vAna +=TAB+v;
                                    else vAna +=TAB;
                                }
                                vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;");
                                output.write(vForm+TAB+wordid+vAna+"\n");
                                vAna = "";
                                vForm = "";
                                flagWord = false;
                                break;

                            case "tei":
                                break;

                            case FORM:
                                flagForm = false;
                                break;

                            case ANA:
                                anavalues.put(anatype, anavalue)
                                flagAna = false;
                                break;

                            default:
                                if (foundtei && !foundtext) break;

                                output.write("</"+localname+">\n");
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        if (flagWord) {
                            if (flagForm) {
                                if (normalizeFormValues) {
                                    vForm += parser.getText().trim();
                                } else {
                                    vForm += parser.getText();
                                }
                            }
                            if (flagAna) {
                                if (normalizeAnaValues)
                                    anavalue += parser.getText().trim();
                                else
                                    anavalue += parser.getText();
                            }
                        }
                        break;
                }
            }

            // closed here so that a failure while flushing is reported as an error
            output.close();
            parser.close();
        } catch (Exception ex) {
            System.out.println("Exception while parsing " + inputData+" of Text "+textname+" : "+ex);
            copySourceToErrorDir();
            return false;
        } finally {
            // release everything on every path; the parser does not close its input stream itself
            closeQuietly(output);
            closeQuietly(parser);
            closeQuietly(inputData);
        }
        return true;
    }
}