Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / txt / CQP2XMLMacro.groovy @ 3025

History | View | Annotate | Download (2.7 kB)

1
package org.txm.macro.txt
2

    
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcp.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10

    
11
// Macro parameter: directory scanned (non-recursively) for files whose name
// ends with ".cqp"; the results are written to an "out" sub-directory of it.
@Field @Option(name="inputDirectory", usage="CQP directory", widget="Folder", required=false, def="")
12
                File inputDirectory
13

    
14
// Macro parameter: character encoding used to read the input files.
@Field @Option(name="encoding", usage="File encoding", widget="String", required=false, def="UTF-8")
15
                String encoding
16

    
17
// Macro parameter: column separator. The two-character text \t stands for a
// TAB; for any other value only its first character is used.
@Field @Option(name="separator", usage="Column separator (TAB by default)", widget="String", required=false, def="\\t")
18
                String separator
19

    
20
// Macro parameter: comma-separated column names, in file order. The column
// named "word" becomes the text of each <w> element; every other column is
// written as an attribute of that element.
@Field @Option(name="columns", usage="Column names separated by comma ('word,pos,lemma' by default)", widget="String", required=true, def="word,pos,lemma")
21
                String columns
22

    
23
// Name of the root element wrapped around the content of each converted file.
def rootTag = "text"

// Show the parameter dialog; stop the macro if the user cancels it.
if (!ParametersDialog.open(this)) return;

// The directory parameter is optional in the dialog, so it may be unset or
// point to something that is not a directory; listing it would then fail.
if (inputDirectory == null || !inputDirectory.isDirectory()) {
        println "Error: input directory does not exist: "+inputDirectory
        return
}

encoding = encoding.trim()

// The dialog delivers the two characters '\' and 't' for a TAB. An empty
// value falls back to TAB as well instead of failing on separator[0].
if (separator == null || separator.length() == 0 || separator == "\\t") {
        separator = "\t"
} else {
        // only single-character separators are supported
        separator = separator[0]
}

// Trim each name individually so that "word, pos, lemma" works as expected.
// The result stays a String[] like the plain split() result.
columnNames = columns.split(",")*.trim() as String[]

outputDirectory = new File(inputDirectory, "out")
outputDirectory.mkdir()
// mkdir() returns false both on failure and when the directory already
// exists, so test for the directory itself.
if (!outputDirectory.isDirectory()) {
        println "Error: could not create output directory: "+outputDirectory
        return
}

println "processing: "+inputDirectory
41

    
42
// Convert every "*.cqp" file of the input directory into an XML file of the
// same base name in the output directory.
//
// Input lines starting with "<" are copied verbatim (they are expected to be
// well-formed structure tags). Every other non-empty line is split into
// columns and written as one <w> element: the "word" column is the element
// text, the remaining named columns are attributes.

// The factory does not depend on the file being processed: create it once.
def xmlFactory = XMLOutputFactory.newInstance()

// String.split() interprets its argument as a regular expression; quote the
// separator so that characters such as '|' or '.' are taken literally.
def columnSplitter = java.util.regex.Pattern.compile(java.util.regex.Pattern.quote(separator))

// listFiles() returns null when the directory cannot be listed.
for (File inputfile : (inputDirectory.listFiles() ?: [])) {
        if (inputfile.isDirectory() || inputfile.isHidden() || !inputfile.getName().endsWith(".cqp")) {
                continue // ignore
        }

        println " file: "+inputfile

        // base name of the file, without its extension
        String name = inputfile.getName()
        int idx = name.lastIndexOf(".")
        if (idx > 0) {
                name = name.substring(0, idx)
        }

        File outputfile = new File(outputDirectory, name+".xml")

        // The document is built in memory and written to disk only once the
        // whole input file has been converted successfully.
        def outputStr = new StringWriter()
        def writer = xmlFactory.createXMLStreamWriter(outputStr)
        boolean warnedExtraColumns = false

        writer.writeStartDocument("UTF-8","1.0")
        writer.writeCharacters("\n")
        writer.writeStartElement(rootTag)
        writer.writeAttribute("id", name)
        writer.writeCharacters("\n")

        inputfile.withReader(encoding) { reader ->
                def line
                while ((line = reader.readLine()) != null) {
                        if (line.length() > 0) {
                                if (line[0] == "<") {
                                        // Structure line: flush what the stream writer has
                                        // buffered, then append the raw tag unescaped.
                                        writer.flush()
                                        outputStr.write(line)
                                } else {
                                        // Token line: writeAttribute()/writeCharacters()
                                        // escape XML reserved characters.
                                        writer.writeStartElement("w")
                                        def word = ""
                                        columnSplitter.split(line).eachWithIndex { columnValue, index ->
                                                if (index >= columnNames.size()) {
                                                        // more values than declared column names:
                                                        // skip them instead of failing on the index
                                                        if (!warnedExtraColumns) {
                                                                println " warning: some lines have more than "+columnNames.size()+" columns, extra values are ignored"
                                                                warnedExtraColumns = true
                                                        }
                                                } else if (columnNames[index] == "word") {
                                                        word = columnValue
                                                } else {
                                                        writer.writeAttribute(columnNames[index], columnValue)
                                                }
                                        }
                                        writer.writeCharacters(word)
                                        writer.writeEndElement() // w
                                }
                                writer.writeCharacters("\n")
                        }
                }
        }
        writer.writeEndElement() // rootTag
        writer.close()

        // Overwrite (not append to) any previous result, using the encoding
        // announced in the XML declaration rather than the platform default.
        def output = new PrintWriter(outputfile, "UTF-8")
        try {
                output.println(outputStr.toString())
        } finally {
                output.close()
        }
}