Statistics
| Revision:

root / tmp / org.txm.connlu.core / groovy / org / txm / scripts / importer / connlu / CONNLUImporter.groovy @ 2115

History | View | Annotate | Download (3.8 kB)

1
package org.txm.scripts.importer.connlu
2

    
3
import org.txm.Toolbox
4
import org.txm.importer.xtz.ImportModule;
5
import org.txm.metadatas.Metadatas
6
import org.txm.utils.io.FileCopy
7
import org.txm.utils.io.IOUtils
8
import org.txm.importer.xtz.*
9
import org.txm.scripts.importer.xtz.*
10
import org.txm.importer.ApplyXsl2;
11
import javax.xml.stream.*
12
import org.txm.utils.AsciiUtils
13
import org.txm.utils.ConsoleProgressBar
14

    
15
/**
 * Importer for CoNLL-U sources.
 *
 * Converts the first ".conllu" file found in the input directory into one XML
 * file per "# newdoc id" document (written in a fresh "src" directory), then
 * switches the input directory to that "src" directory and delegates to
 * {@link XTZImporter#process()}.
 *
 * @author mdecorde
 */
class CONNLUImporter extends XTZImporter {

    /** Attribute names given to the 10 tab-separated columns of a token line, in column order. */
    private static final List<String> PROPERTIES = ["n", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc"]

    /** Index of the "feats" column, whose value is written surrounded with "|". */
    private static final int FEATS_INDEX = 5

    public CONNLUImporter(ImportModule module) {
        super(module)
    }

    /**
     * Rebuilds the "src" directory next to the output directory, fills it with
     * the XML conversion of the CoNLL-U source and runs the parent import on it.
     */
    @Override
    public void process() {

        File srcDirectory = new File(outputDirectory.getParentFile(), "src")
        srcDirectory.deleteDir()
        srcDirectory.mkdirs()

        if (!convert(inputDirectory, srcDirectory)) {
            // convert() has already reported why it aborted; do not run the
            // parent import on an empty source directory.
            // NOTE(review): assumes the caller treats an importer whose parent
            // process() was not run as a failed import - confirm.
            return
        }

        inputDirectory = srcDirectory // switch source directory
        super.process()
    }

    /**
     * Converts the first ".conllu" file of inputDirectory (in file name order)
     * into one XML file per document in srcDirectory.
     *
     * Token lines read before both a "# newdoc id = " and a "# sent_id = "
     * comment have been seen are ignored. Lines that do not have exactly
     * 10 tab-separated columns are not written.
     *
     * @param inputDirectory the directory containing the ".conllu" file
     * @param srcDirectory the existing directory receiving the "&lt;text id&gt;.xml" files
     * @return true when the conversion was done, false when no ".conllu" file was found
     */
    public static def convert(File inputDirectory, File srcDirectory) {
        File master = findMasterFile(inputDirectory)
        if (master == null) {
            println "Aborting. No CONNLU file found in $inputDirectory."
            return false
        }

        def content = parseDocuments(master)

        // the factory does not depend on the text: create it once for all files
        XMLOutputFactory factory = XMLOutputFactory.newInstance()
        ConsoleProgressBar cpb_texts = new ConsoleProgressBar(content.size())
        for (def textId : content.keySet()) {
            cpb_texts.tick()
            writeTextFile(factory, new File(srcDirectory, textId + ".xml"), content[textId])
        }
        cpb_texts.done()
        return true
    }

    /**
     * Returns the first regular file of the directory whose name ends with
     * ".conllu", or null when there is none.
     */
    private static File findMasterFile(File inputDirectory) {
        File[] files = inputDirectory.listFiles()
        if (files == null) { // not a directory, or the listing failed
            return null
        }
        // listFiles() gives no ordering guarantee: sort to always pick the same file
        Arrays.sort(files)
        for (File f : files) {
            if (f.isFile() && f.getName().endsWith(".conllu")) {
                return f
            }
        }
        return null
    }

    /**
     * Reads the CoNLL-U file as UTF-8 and groups its non-comment lines by
     * document id then by sentence id, keeping the order of first appearance.
     *
     * @return a map: text id -> (sentence id -> list of lines split on tabs)
     */
    private static def parseDocuments(File master) {
        final String newdocPrefix = "# newdoc id = "
        final String sentIdPrefix = "# sent_id = "

        def content = [:] // Groovy map literal: keeps insertion order
        String text_id = null
        String sent_id = null
        master.eachLine("UTF-8") { String line ->
            if (line.startsWith(newdocPrefix)) {
                text_id = line.substring(newdocPrefix.length())
            } else if (line.startsWith(sentIdPrefix)) {
                sent_id = line.substring(sentIdPrefix.length())
            } else if (line.startsWith("#")) {
                // other comment lines carry nothing used by the conversion
            } else if (text_id != null && sent_id != null) {
                if (!content.containsKey(text_id)) {
                    content[text_id] = [:]
                }
                def text = content[text_id]
                if (!text.containsKey(sent_id)) {
                    text[sent_id] = []
                }
                // the whole String[] is appended as a single element of the sentence
                text[sent_id] << line.split("\t")
            }
        }
        return content
    }

    /**
     * Writes one document as an XML file: a TEI root with an empty teiHeader
     * and a text element containing one p/s pair per sentence and one w
     * element per token line.
     *
     * The stream is always closed, even when writing fails.
     */
    private static void writeTextFile(XMLOutputFactory factory, File xmlFile, def sentences) {
        BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(xmlFile))
        try {
            XMLStreamWriter writer = factory.createXMLStreamWriter(output, "UTF-8")
            try {
                writer.writeStartDocument("UTF-8", "1.0")
                writer.writeStartElement("TEI")
                writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0")
                writer.writeNamespace("txm", "http://textometrie.org/1.0")
                writer.writeCharacters("\n")
                writer.writeStartElement("teiHeader")
                writer.writeEndElement()
                writer.writeCharacters("\n")
                writer.writeStartElement("text")
                writer.writeCharacters("\n")

                for (def sentId : sentences.keySet()) {
                    writer.writeStartElement("p")
                    writer.writeStartElement("s")
                    writer.writeCharacters("\n")
                    for (def row : sentences[sentId]) {
                        // blank separator lines and rows without exactly one value
                        // per property are silently left out
                        if (row.length != PROPERTIES.size()) {
                            continue
                        }
                        writer.writeStartElement("w")
                        for (int i = 0; i < row.length; i++) {
                            // the parsed row is left untouched: only the written value is decorated
                            String value = (i == FEATS_INDEX) ? "|" + row[i] + "|" : row[i]
                            writer.writeAttribute(PROPERTIES[i], value)
                        }
                        writer.writeCharacters(row[1]) // the form is also the element content
                        writer.writeEndElement() // w
                        writer.writeCharacters(" ")
                    }
                    writer.writeCharacters("\n")
                    writer.writeEndElement() // s
                    writer.writeEndElement() // p
                    writer.writeCharacters("\n")
                }

                writer.writeEndElement() // text
                writer.writeCharacters("\n")
                writer.writeEndElement() // TEI
                writer.writeEndDocument()
                writer.flush()
            } finally {
                // XMLStreamWriter.close() does not close the underlying stream
                writer.close()
            }
        } finally {
            output.close()
        }
    }
}