Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / tmx / Tmx2XmlFiles.groovy @ 187

History | View | Annotate | Download (13 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2015-12-17 12:11:39 +0100 (Thu, 17 Dec 2015) $
// $LastChangedRevision: 3087 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.importer.tmx;
29

    
30
import javax.xml.parsers.DocumentBuilder;
31
import javax.xml.parsers.DocumentBuilderFactory;
32
import javax.xml.parsers.ParserConfigurationException;
33
import javax.xml.transform.OutputKeys;
34
import javax.xml.transform.Result;
35
import javax.xml.transform.Source;
36
import javax.xml.transform.Transformer;
37
import javax.xml.transform.TransformerFactory;
38
import javax.xml.transform.dom.DOMSource;
39
import javax.xml.transform.stream.StreamResult;
40
import javax.xml.stream.*;
41

    
42
import org.txm.objects.Base;
43
import org.w3c.dom.Document;
44
import org.w3c.dom.Element;
45
import org.w3c.dom.NodeList;
46
import org.xml.sax.SAXException;
47

    
48
import java.io.File;
49
import java.io.FileInputStream;
50
import java.io.InputStreamReader;
51

    
52

    
53
import java.net.URL;
54

    
55
import org.txm.objects.*;
56
// TODO: Auto-generated Javadoc
57

    
58
/**
 * Splits each TMX file of a directory into N XML files, N being the number of
 * &lt;tuv&gt; elements found in the first &lt;tu&gt; of the file.
 *
 * Each output file starts with a TEI root and an empty teiHeader. The TMX
 * &lt;body&gt; is written as &lt;text&gt;, each &lt;tu&gt; is copied to every output
 * file, and the i-th &lt;tuv&gt; of a &lt;tu&gt; is written as a &lt;seg&gt; in the i-th
 * output file only.
 *
 * @author mdecorde
 */
class Tmx2XmlFiles {

	/** Value of the "lang" attribute of the &lt;tuv&gt; being read; reset to null when the &lt;tuv&gt; ends. */
	String lang;

	/** Value of the "type" attribute of the last &lt;prop&gt; that was opened (null if it had none). */
	String type;

	/** The TMX file currently being processed by {@link #run(File, File)}. */
	File infile;

	/** The directory receiving the generated XML files. */
	File outdir;

	/** One writer per &lt;tuv&gt; position of the current TMX file. */
	ArrayList<XMLStreamWriter> writers;

	/** The output streams backing the writers, stored at the same indexes. */
	def outputs;

	/** Maps each generated file name to the lower-cased language of its &lt;tuv&gt;. */
	HashMap<String, String> textslangs;

	/** Maps a &lt;tuv&gt; position to the names of the files generated for that position. */
	HashMap<Integer, ArrayList<String>> langGroups;

	/** Never read nor written by this class. */
	Set<String> langs;

	/** Never read nor written by this class. */
	def header = [:];

	/**
	 * Instantiates a new TMX splitter.
	 *
	 * @param textslangs map filled with "generated file name" -> "language"
	 * @param langGroups map filled with "tuv position" -> "generated file names"
	 */
	public Tmx2XmlFiles(HashMap<String, String> textslangs, HashMap<Integer, ArrayList<String>> langGroups)
	{
		this.textslangs = textslangs;
		this.langGroups = langGroups;
	}

	/**
	 * Gets the text langs.
	 *
	 * @return the map given to the constructor: generated file name -> language
	 */
	public HashMap<String, String> getTextLangs()
	{
		return textslangs;
	}

	/**
	 * Gets the groups of generated files.
	 *
	 * @return the map given to the constructor: tuv position -> names of the files generated for that position
	 */
	public HashMap<Integer, ArrayList<String>> getLangGroups()
	{
		return langGroups;
	}

	/** Lower-cased values of the header props of type "corpusId", keyed by order of appearance. */
	def corpusIDS = [:];

	/** Next free key of {@link #corpusIDS}. */
	int noCorpusID = 0;

	/**
	 * @return the corpus identifiers read from the header props of type "corpusId"
	 */
	public def getCorpusIDS() {
		return corpusIDS;
	}

	/** Number of &lt;tuv&gt; elements found in the first &lt;tu&gt; of the current TMX file. */
	int TOTALTUV = 0;

	/**
	 * Reads the first &lt;tu&gt; of a TMX file and creates one writer per &lt;tuv&gt; it contains.
	 * The "lang" attribute of each &lt;tuv&gt; (lower-cased, "fr" when absent) is
	 * registered as the language of the corresponding output file.
	 *
	 * The output files are created by {@link #buildWriter(int, String)}, which
	 * relies on the outdir and textname fields being set beforehand.
	 *
	 * @param tmxFile the TMX file to scan
	 */
	public def initWriters(File tmxFile) {
		println "initialize writers for : $tmxFile"
		TOTALTUV = 0;
		writers = new ArrayList<XMLStreamWriter>(); // one writer per text
		outputs = []; // one output stream per text

		def inputData = tmxFile.toURI().toURL().openStream();
		XMLStreamReader reader = null;
		try {
			reader = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
			for (int event = reader.next(); event != XMLStreamConstants.END_DOCUMENT; event = reader.next()) {
				if (event == XMLStreamConstants.START_ELEMENT) {
					if (reader.getLocalName() == "tuv") {
						String declared = findAttribute(reader, "lang");
						buildWriter(TOTALTUV, declared != null ? declared.toLowerCase() : "fr");
						TOTALTUV++
					}
				} else if (event == XMLStreamConstants.END_ELEMENT) {
					if (reader.getLocalName() == "tu") {
						break; // stop at the end of the first </tu>
					}
				}
			}
		} finally {
			// release the reader and the stream even when the TMX file is malformed
			if (reader != null) reader.close()
			inputData.close();
		}
	}

	/** Prefix of the generated file names: input file name without extension, followed by "_". */
	String textname;

	/** Index of the &lt;tuv&gt; being read in the current &lt;tu&gt;; -1 means "write to every output file". */
	int noTuv = -1;

	/** Factory used to create the output writers. */
	XMLOutputFactory factory = XMLOutputFactory.newInstance();

	/**
	 * Returns the writer of the given position, creating it (and its output
	 * file "textname + no + .xml" in outdir) if it does not exist yet.
	 * A new writer immediately receives the XML declaration, the TEI start
	 * tag and an empty teiHeader element.
	 *
	 * @param no the position of the &lt;tuv&gt;
	 * @param lang the language registered for the created file
	 * @return the writer of position no
	 */
	public def buildWriter(int no, String lang) {
		if (writers.size() <= no) {
			File outfile = new File(outdir, textname+no+".xml");
			textslangs.put(outfile.getName(), lang);
			if (!langGroups.containsKey(no)) langGroups[no] = [];
			langGroups.get(no).add(outfile.getName());

			FileOutputStream output = new FileOutputStream(outfile)
			// register the stream first so that it gets closed even if the writer creation fails
			outputs.add(no, output);
			def writer = factory.createXMLStreamWriter(output, "UTF-8")
			writers.add(no, writer);

			writer.writeStartDocument("UTF-8","1.0");
			writer.writeStartElement ("TEI");
			writer.writeStartElement ("teiHeader");
			writer.writeEndElement();
		}
		return writers.get(no);
	}

	/** Reader of the TMX file being converted by {@link #run(File, File)}. */
	XMLStreamReader parser;

	/** Number of &lt;tu&gt; elements read so far; used to build the "id" attribute of the segments. */
	int tuCounter = 0;

	/**
	 * Converts every TMX file of a directory. Unreadable entries,
	 * sub-directories, files whose name starts with "import" and files whose
	 * name ends with ".css" are skipped.
	 *
	 * @param indir the directory containing the TMX files
	 * @param outdir the directory receiving the generated XML files (created if needed)
	 */
	public void run(File indir, File outdir)
	{
		boolean isProp = false;
		boolean isTu = false;
		boolean flagHeader = false;
		String propvalue;
		String localname;

		// NOTE(review): these lists (and corpusIDS/noCorpusID) are never reset between
		// input files, so with several TMX files the i-th output of every file receives
		// the metadata read from the first header - confirm whether this is intended.
		def corpusTITLES = [];
		def corpusSUBTITLES = [];
		def corpusAUTHORS = [];
		def corpusTRANSLATORS = [];

		def tuTitles = [];
		boolean endOfTuProp = false;

		File[] candidates = indir.listFiles();
		if (candidates == null) return; // indir is not a readable directory: nothing to do

		for (File tmxFile : candidates) {
			String name = tmxFile.getName();
			if (!tmxFile.canRead() || tmxFile.isDirectory() || name.startsWith("import") || name.endsWith(".css"))  {
				println "skip file : "+tmxFile;
				continue;
			}
			this.infile = tmxFile;
			this.outdir = outdir;
			outdir.mkdirs(); // ensure the directory (and its parents) exists
			textname = stripExtension(name)+"_";

			def inputData = null;
			try {
				initWriters(tmxFile);

				// create XML reader
				inputData = tmxFile.toURI().toURL().openStream();
				parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
				def headerProps = [:];
				noTuv = -1;
				for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
				{
					switch (event) {
						case XMLStreamConstants.START_ELEMENT:
							localname = parser.getLocalName();

							if (localname == "tuv") {
								noTuv++;

								if (endOfTuProp) {
									// first element after the props of the <tu>: its start tag is still
									// open in every writer, so the collected titles can be attached to it
									endOfTuProp = false;
									writeTuTitles(tuTitles);
								}

								this.writeStartElement("seg");
								lang = findAttribute(parser, "lang");
								this.writeAttribute("id", "seg_$tuCounter");
							} else if (localname == "seg") {
								// do nothing: the <tuv> itself is written as <seg>
							} else if (localname == "prop") {
								type = findAttribute(parser, "type");
								if (type == null) {
									println "ERROR NO PROP TYPE : "+parser.getLocation()
								}
								propvalue= "";
								isProp= true;
							} else if (localname == "header") {
								flagHeader = true;
								for (int i = 0 ; i < parser.getAttributeCount() ; i++) { // store all the attributes of the header
									headerProps[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
								}
							} else {

								if (endOfTuProp && localname == "note") {
									writeTuTitles(tuTitles);
									endOfTuProp = false;
								}

								if (localname == "tmx") continue;
								if (localname == "tu") {
									tuTitles = [];
									isTu= true;
									noTuv = -1;
									endOfTuProp = true; // wait for the next start element that is not note or prop
									tuCounter++;
								}
								if (localname == "body")
									localname = "text"
								this.writeStartElement(localname); // create text tag in each xml file

								for (int i= 0 ; i < parser.getAttributeCount() ;i++) {
									this.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
								}
								if (localname == "text") {
									println "add header : $headerProps"

									for (int i = 0 ; i < writers.size() ; i++) {
										if (corpusIDS.size() > i)
											writers.get(i).writeAttribute("corpusid", corpusIDS[i]);
										if (corpusTITLES.size() > i)
											writers.get(i).writeAttribute("title", corpusTITLES.get(i));
										if (corpusSUBTITLES.size() > i)
											writers.get(i).writeAttribute("subtitle", corpusSUBTITLES.get(i));
										if (corpusAUTHORS.size() > i)
											writers.get(i).writeAttribute("author", corpusAUTHORS.get(i));
										if (corpusTRANSLATORS.size() > i)
											writers.get(i).writeAttribute("translator", corpusTRANSLATORS.get(i));
									}
								}
							}
							break;

						case XMLStreamConstants.END_ELEMENT:
							localname = parser.getLocalName();
							switch (localname) {
								case "tuv":
									lang = null
									this.writeEndElement();
									break
								case "seg":
									break;
								case "prop":
									if (flagHeader) {
										if (type == "corpusId") {
											corpusIDS[noCorpusID++] = propvalue.toLowerCase();
										} else if (type == "title") {
											corpusTITLES << propvalue;
										} else if (type == "subtitle") {
											corpusSUBTITLES << propvalue;
										} else if (type == "author") {
											corpusAUTHORS << propvalue;
										} else if (type == "translator") {
											corpusTRANSLATORS << propvalue;
										}
									} else if (isTu && type == "tuTitle") {
										tuTitles << propvalue
									} else {
										writeAttribute(type, propvalue); // transform all properties to attributes
									}
									isProp= false;
									break;
								case "tmx":
									break;
								case "tu":
									isTu= false;
									endOfTuProp = false;
									noTuv = -1;
									this.writeEndElement();
									break; // do not fall through into the "header" case
								case "header":
									flagHeader = false;
									break;
								default:
									this.writeEndElement();
									break;
							}
							break;

						case XMLStreamConstants.CHARACTERS:
							if (isProp) {
								propvalue += parser.getText(); // get prop tag text
							} else {
								this.writeCharacters(parser.getText());
							}
							break;
					}
				}

				this.writeEndElement(); // close TEI
			} finally {
				// always release the reader, the input stream and every output file
				closeResources(inputData);
			}
		}
	}

	/**
	 * Returns the value of the first attribute of the current start element
	 * whose local name is the given name, or null when there is none.
	 */
	private static String findAttribute(XMLStreamReader reader, String name)
	{
		for (int i = 0 ; i < reader.getAttributeCount() ; i++) {
			if (reader.getAttributeLocalName(i) == name) {
				return reader.getAttributeValue(i);
			}
		}
		return null;
	}

	/**
	 * Returns the file name without its last extension; names without
	 * extension (or starting with their only dot) are returned unchanged.
	 */
	private static String stripExtension(String fileName)
	{
		int dot = fileName.lastIndexOf(".");
		return dot > 0 ? fileName.substring(0, dot) : fileName;
	}

	/**
	 * Writes the i-th collected tuTitle as the "title" attribute of the element
	 * currently open in the i-th writer; writers without a title are left untouched.
	 */
	private void writeTuTitles(List tuTitles)
	{
		int count = Math.min(writers.size(), tuTitles.size());
		for (int i = 0 ; i < count ; i++) {
			writers.get(i).writeAttribute("title", tuTitles.get(i));
		}
	}

	/**
	 * Closes the TMX reader, the given input stream (when not null), then all
	 * the writers and their output streams.
	 */
	private void closeResources(def inputData)
	{
		if (parser != null) parser.close();
		if (inputData != null) inputData.close();
		for (XMLStreamWriter writer : writers) { // close all xml writers
			writer.close();
		}
		for (def output : outputs) { // close all xml file output streams
			output.close();
		}
	}

	/**
	 * Calls writeEndElement on every writer, or only on the writer of
	 * position noTuv when noTuv is not -1.
	 */
	private void writeEndElement()
	{
		if (noTuv == -1) {
			for (def writer : writers) writer.writeEndElement();
		} else
			writers.get(noTuv).writeEndElement();
	}

	/**
	 * Calls writeStartElement on every writer, or only on the writer of
	 * position noTuv when noTuv is not -1.
	 *
	 * NOTE(review): the writers are created from the first &lt;tu&gt; only; a later
	 * &lt;tu&gt; holding more &lt;tuv&gt; elements makes writers.get(noTuv) fail.
	 *
	 * @param localname the localname
	 */
	private void writeStartElement(String localname)
	{
		if (noTuv == -1) {
			for (def writer : writers) writer.writeStartElement(localname);
		} else
			writers.get(noTuv).writeStartElement(localname);
	}

	/**
	 * Writes the trimmed text on every writer, or only on the writer of
	 * position noTuv when noTuv is not -1. Nothing is written when the
	 * trimmed text is empty.
	 *
	 * @param text the text
	 */
	private void writeCharacters(String text)
	{
		text = text.trim();
		if (text.length() > 0) {
			if (noTuv == -1) {
				for (def writer : writers) writer.writeCharacters(text);
			} else
				writers.get(noTuv).writeCharacters(text);
		}
	}

	/**
	 * Calls writeAttribute on every writer, or only on the writer of
	 * position noTuv when noTuv is not -1. Failures are logged together
	 * with the current location of the parser and do not stop the conversion.
	 *
	 * @param name the name
	 * @param value the value
	 */
	private void writeAttribute(String name, String value)
	{
		try {
			if (noTuv == -1) {
				for (def writer : writers) writer.writeAttribute(name, value);
			} else
				writers.get(noTuv).writeAttribute(name, value);
		} catch (Exception e) { org.txm.utils.logger.Log.printStackTrace(e); System.err.println("ERROR: "+parser.getLocation());}
	}

	/**
	 * Main: converts the TMX files of ~/xml/tmx into ~/TXM/corpora/tmx.
	 *
	 * @param args the args (not used)
	 */
	static main(args)
	{
		// run() expects the directory containing the TMX files, not a single file
		File indir = new File(System.getProperty("user.home"),"xml/tmx");
		File outdir = new File(System.getProperty("user.home"),"TXM/corpora/tmx/");
		new Tmx2XmlFiles(new HashMap<String, String>(), new HashMap<Integer, ArrayList<String>>()).run(indir, outdir);
		println "done"
	}
}
468