Statistics
| Revision:

root / tmp / org.txm.connlu.core / groovy / org / txm / scripts / importer / connlu / CONNLUImporter.groovy @ 2112

History | View | Annotate | Download (3 kB)

1
package org.txm.scripts.importer.connlu
2

    
3
import org.txm.Toolbox
4
import org.txm.importer.xtz.ImportModule;
5
import org.txm.metadatas.Metadatas
6
import org.txm.utils.io.FileCopy
7
import org.txm.utils.io.IOUtils
8
import org.txm.importer.xtz.*
9
import org.txm.scripts.importer.xtz.*
10
import org.txm.importer.ApplyXsl2;
11
import javax.xml.stream.*
12
import org.txm.utils.AsciiUtils
13

    
14
/**
 * Importer for CoNLL-U sources.
 *
 * The first ".connlu" file found in the input directory is converted to XML
 * files written to a "src" directory created next to the output directory.
 * The parent XTZImporter process is then run with that "src" directory as
 * its input directory.
 *
 * @author mdecorde
 */
class CONNLUImporter extends XTZImporter {

    /**
     * @param module the import module, passed unchanged to the parent constructor
     */
    public CONNLUImporter(ImportModule module) {
        super(module);
    }

    /**
     * Converts the CoNLL-U source to XML files in the "src" directory, then
     * runs the parent import process on those files.
     */
    @Override
    public void process() {

        File srcDirectory = new File(outputDirectory.getParentFile(), "src")
        srcDirectory.mkdirs();

        convert(inputDirectory, srcDirectory)

        inputDirectory = srcDirectory // switch source directory
        super.process();
    }

    /**
     * Converts the first ".connlu" file of inputDirectory to XML files.
     *
     * Token lines are grouped by the last "# text = " value seen, then by the
     * last "# sent_id = " value seen. One XML file is written per text value:
     * each sentence becomes an "s" element and each token line a "w" element
     * whose attributes p1..pN hold the tab-separated columns and whose
     * content is the second column (FORM in the CoNLL-U format).
     *
     * @param inputDirectory directory searched for a file whose name ends with ".connlu"
     * @param srcDirectory directory receiving the generated XML files
     */
    public static void convert(File inputDirectory, File srcDirectory) {
        File master = null;
        def files = inputDirectory.listFiles()
        for (def f : files) {
            if (f.getName().endsWith(".connlu")) {
                master = f;
                break;
            }
        }

        if (master == null) {
            println "Aborting. No CONNLU file found in $inputDirectory."
            // NOTE(review): isSuccessFul is not declared in this class and is
            // assigned here from a static method; confirm that it resolves to
            // a static member of a parent class.
            isSuccessFul = false;
            return
        }

        String text_id = null;
        String sent_id = null;
        def content = [:]; // text value -> (sentence id -> list of token columns)
        master.eachLine("UTF-8") { line ->
            if (line.startsWith("# text = ")) {
                text_id = line.substring("# text = ".length())
            } else if (line.startsWith("# sent_id = ")) {
                sent_id = line.substring("# sent_id = ".length())
            } else if (text_id != null && sent_id != null) {
                // Blank separator lines and other comment lines are not tokens.
                if (line.trim().length() == 0 || line.startsWith("#")) {
                    return
                }
                def columns = line.split("\t")
                // The second column is written as the word content below, so a
                // line without it cannot be turned into a "w" element.
                if (columns.length < 2) {
                    return
                }

                if (!content.containsKey(text_id)) {
                    content[text_id] = [:]
                }
                def text = content[text_id]
                if (!text.containsKey(sent_id)) {
                    text[sent_id] = []
                }
                text[sent_id] << columns
            }
        }

        XMLOutputFactory factory = XMLOutputFactory.newInstance();
        for (def text_id2 : content.keySet()) {
            // Use the loop variable: text_id only holds the last value read
            // from the file by the parsing closure above.
            File xmlFile = new File(srcDirectory, AsciiUtils.buildAttributeId(text_id2)+".xml")
            BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(xmlFile))
            XMLStreamWriter writer = null;
            try {
                writer = factory.createXMLStreamWriter(output, "UTF-8");

                writer.writeStartDocument("UTF-8","1.0");
                writer.writeStartElement ("TEI");
                writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0");
                writer.writeNamespace("txm", "http://textometrie.org/1.0");
                writer.writeStartElement ("teiHeader");
                writer.writeEndElement()
                writer.writeStartElement ("text");
                def text = content[text_id2]
                for (def sent_id2 : text.keySet()) {
                    writer.writeStartElement ("s");
                    // Use the loop variable here as well, not the last sent_id read.
                    for (def line : text[sent_id2]) {
                        writer.writeStartElement ("w");
                        for (int i = 0 ; i < line.size() ; i++) {
                            writer.writeAttribute("p"+(i+1), line[i])
                        }
                        writer.writeCharacters(line[1])
                        writer.writeEndElement() // w
                        writer.writeCharacters(" ")
                    }

                    writer.writeEndElement() // s
                    writer.writeCharacters("\n")
                }
                writer.writeEndElement() // text
                writer.writeEndElement() // TEI
                writer.writeEndDocument()
                writer.flush()
            } finally {
                // XMLStreamWriter.close() does not close the underlying
                // stream, so the output stream is closed explicitly too.
                try {
                    if (writer != null) {
                        writer.close()
                    }
                } finally {
                    output.close()
                }
            }
        }
    }
}