Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / limsi / LimsiToCQP.groovy @ 187

History | View | Annotate | Download (1.9 kB)

1
package org.txm.importer.limsi
2

    
3
import javax.xml.stream.*
4
import java.net.URL
5

    
6
/**
 * Converts an XML transcription file into a CQP input (".wtc") file.
 *
 * Only two elements of the source document are interpreted:
 * <ul>
 * <li><code>AudioDoc</code>: written as an opening/closing <code>text</code>
 * structure whose <code>id</code> is the output file name without extension;</li>
 * <li><code>Word</code>: its character content is trimmed and written on a
 * line of its own.</li>
 * </ul>
 * Every other element is ignored.
 */
class LimsiToCQP {

	/** The XML file to convert. */
	File xmlfile;

	/**
	 * @param xmlfile the XML file to convert
	 */
	public LimsiToCQP(File xmlfile) {
		this.xmlfile = xmlfile;
	}

	/**
	 * Parses the XML file and writes the result to <code>outfile</code>, encoded in UTF-8.
	 *
	 * Parsing and I/O errors are not caught: they propagate to the caller.
	 * All streams are closed whether the conversion succeeds or fails.
	 *
	 * @param outfile the file to write; its name without extension is used as the text id
	 * @return false if the XML file does not exist, true once the conversion is done
	 */
	public boolean process(File outfile) {
		if (!xmlfile.exists()) return false;

		String textId = stripExtension(outfile.getName())

		InputStream inputData = xmlfile.toURI().toURL().openStream()
		XMLStreamReader parser = null
		Writer output = null
		try {
			parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData)
			output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8")
			writeEvents(parser, output, textId)
			output.flush()
		} finally {
			// nested finally blocks: a failing close() must not prevent the next ones
			try {
				if (output != null) output.close()
			} finally {
				try {
					if (parser != null) parser.close()
				} finally {
					// XMLStreamReader.close() does not close the underlying stream
					inputData.close()
				}
			}
		}
		return true;
	}

	/**
	 * Reads the parser events up to the end of the document and writes the
	 * text structure and the words to <code>output</code>.
	 */
	private static void writeEvents(XMLStreamReader parser, Writer output, String textId) {
		boolean insideWord = false
		String word = ""

		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			if (event == XMLStreamConstants.START_ELEMENT) {
				String opened = parser.getLocalName()
				if (opened == "AudioDoc") {
					output.println "<text id=\"$textId\">"
				} else if (opened == "Word") {
					insideWord = true
					word = ""
				}
			} else if (event == XMLStreamConstants.END_ELEMENT) {
				String closed = parser.getLocalName()
				if (closed == "AudioDoc") {
					output.println "</text>"
				} else if (closed == "Word") {
					insideWord = false
					output.println word.trim()
				}
			} else if (event == XMLStreamConstants.CHARACTERS) {
				// the content of one element may be delivered in several chunks
				if (insideWord) {
					word += parser.getText();
				}
			}
		}
	}

	/**
	 * Removes the last extension of a file name ("out.wtc" gives "out").
	 * A name without extension, or starting with its only dot, is returned unchanged.
	 */
	private static String stripExtension(String name) {
		int dot = name.lastIndexOf(".")
		return dot > 0 ? name.substring(0, dot) : name
	}

	/**
	 * Manual test entry point.
	 *
	 * @param args optional: [0] the XML file to convert, [1] the file to write;
	 * the historical hard-coded paths are used for missing arguments
	 */
	public static void main(String[] args) {
		File infile = args.length > 0 ? new File(args[0]) : new File("/home/mdecorde/xml/limsi","20071220_1900_1920_inter.xml")
		File outfile = args.length > 1 ? new File(args[1]) : new File("/home/mdecorde/xml/limsi","out.wtc")
		def processor = new LimsiToCQP(infile);
		println processor.process(outfile);
	}
}