Révision 3025
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/txt/CQP2XMLMacro.groovy (revision 3025) | ||
---|---|---|
1 |
package org.txm.macro.txt |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
import org.txm.rcp.swt.widget.parameters.* |
|
7 |
import org.txm.utils.* |
|
8 |
import javax.xml.stream.* |
|
9 |
import java.net.URL |
|
10 |
|
|
11 |
// CQP2XML macro: converts every "*.cqp" file found directly in inputDirectory into an
// XML file written to the "out" sub-directory of inputDirectory.
//
// Input lines are handled as follows:
//  - empty lines are skipped;
//  - lines starting with "<" are copied verbatim to the output (no escaping);
//  - any other line is split on the column separator and written as a <w> element:
//    the column named "word" becomes the element text, every other named column
//    becomes an attribute of <w>.
// The whole content is wrapped in a <text id="FILENAME-WITHOUT-EXTENSION"> root element.

// --- Macro parameters --------------------------------------------------------------
// NOTE(review): the values are presumably filled in by ParametersDialog.open(this)
// from the @Option declarations below - confirm against the TXM macro API.

// Directory containing the .cqp files to convert.
@Field @Option(name="inputDirectory", usage="CQP directory", widget="Folder", required=false, def="")
File inputDirectory

// Character encoding used to read the .cqp files.
@Field @Option(name="encoding", usage="File encoding", widget="String", required=false, def="UTF-8")
String encoding

// Column separator; the two-character text \t stands for a TAB, otherwise only the
// first character of the value is used.
@Field @Option(name="separator", usage="Column separator (TAB by default)", widget="String", required=false, def="\\t")
String separator

// Comma-separated names of the columns, in file order.
@Field @Option(name="columns", usage="Column names separated by comma ('word,pos,lemma' by default)", widget="String", required=true, def="word,pos,lemma")
String columns

// Name of the root element wrapping the content of each converted file.
def rootTag = "text"

if (!ParametersDialog.open(this)) return;

// The option is not mandatory in the dialog, so make sure there is something to process
// before creating an "out" directory in an unexpected place.
if (inputDirectory == null || !inputDirectory.isDirectory()) {
	println "** inputDirectory is not a directory: "+inputDirectory
	return
}

// An empty encoding would make withReader() fail: fall back to the declared default.
encoding = (encoding ?: "").trim()
if (encoding.length() == 0) {
	encoding = "UTF-8"
}

// An empty separator falls back to TAB instead of failing on separator[0].
if (separator == null || separator.length() == 0 || separator == "\\t") {
	separator = "\t"
} else {
	separator = separator[0]
}
// The separator is a literal character, not a regular expression: quote it so that
// values such as "|" or "." split as expected, and compile it once for all lines.
def splitter = java.util.regex.Pattern.compile(java.util.regex.Pattern.quote(separator))

// Trim each name so that "word, pos, lemma" does not produce attribute names with spaces.
def columnNames = columns.trim().split(",").collect { it.trim() }

def outputDirectory = new File(inputDirectory, "out")
outputDirectory.mkdir()
if (!outputDirectory.isDirectory()) {
	println "** could not create output directory: "+outputDirectory
	return
}

// The factory does not depend on the file being processed: create it once.
def factory = XMLOutputFactory.newInstance()

println "processing: "+inputDirectory

// listFiles() returns null on I/O error: treat it as "nothing to process".
def inputFiles = inputDirectory.listFiles() ?: []

for (File inputfile : inputFiles) {
	if (inputfile.isDirectory() || inputfile.isHidden() || !inputfile.getName().endsWith(".cqp")) {
		continue // ignore
	}

	println " file: "+inputfile

	// output file name = input file name without its extension + ".xml"
	String name = inputfile.getName()
	int idx = name.lastIndexOf(".")
	if (idx > 0) {
		name = name.substring(0, idx)
	}
	File outputfile = new File(outputDirectory, name+".xml")

	// The document is built in memory: lines starting with "<" are written directly to
	// the StringWriter, everything else goes through the XMLStreamWriter (which escapes
	// XML reserved characters in words and attribute values).
	def outputStr = new StringWriter()
	def writer = factory.createXMLStreamWriter(outputStr)
	try {
		writer.writeStartDocument("UTF-8","1.0")
		writer.writeCharacters("\n")
		writer.writeStartElement(rootTag)
		writer.writeAttribute("id", name)
		writer.writeCharacters("\n")

		inputfile.withReader(encoding) { reader ->
			def line
			while ((line = reader.readLine()) != null) {
				if (line.length() == 0) {
					continue // skip empty lines
				}

				if (line[0] == "<") {
					// flush pending XMLStreamWriter output first so the verbatim
					// line lands at the right position in the buffer
					writer.flush()
					outputStr.write(line)
				} else {
					writer.writeStartElement("w")
					def word = ""
					splitter.split(line).eachWithIndex { columnValue, index ->
						// columns without a declared (non-empty) name are ignored
						// instead of raising an index error
						def columnName = index < columnNames.size() ? columnNames[index] : ""
						if (columnName == "word") {
							word = columnValue
						} else if (columnName.length() > 0) {
							writer.writeAttribute(columnName, columnValue)
						}
					}
					writer.writeCharacters(word)
					writer.writeEndElement() // w
				}
				writer.writeCharacters("\n")
			}
		}

		writer.writeEndElement() // rootTag
		writer.flush()
	} finally {
		writer.close()
	}

	// Overwrite (never append to) the result file and always encode it in UTF-8, as
	// announced by the XML declaration. withWriter closes the file even on error.
	outputfile.withWriter("UTF-8") { out ->
		out.write(outputStr.toString())
		out.newLine()
	}
}
Formats disponibles : Unified diff