root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / nlp / TT2XMLInDirectoryMacro.groovy @ 2769
History | View | Annotate | Download (3.4 kB)
1 |
package org.txm.macro.nlp;
|
---|---|
2 |
|
3 |
import org.kohsuke.args4j.* |
4 |
import groovy.transform.Field |
5 |
import java.nio.charset.Charset |
6 |
import org.txm.rcpapplication.swt.widget.parameters.* |
7 |
import org.txm.utils.* |
8 |
import javax.xml.stream.* |
9 |
|
10 |
// TT2XMLInDirectory macro.
//
// Converts every visible "*.tt" file found directly in inputDirectory into an
// XML file "<name>.xml" written to the "xml" sub-directory of inputDirectory.
//
// Each input line is expected to hold three tab-separated, non-empty columns.
// Column 1 becomes the text of a <w> element, column 2 its "frpos" attribute
// and column 3 its "frlemma" attribute. A line whose second column is "SENT"
// closes the current sentence. Sentences are wrapped in <p><s>...</s></p>
// inside a single <text> root element.
// NOTE(review): the file name suggests the input is TreeTagger output
// (form, POS, lemma) -- confirm against the producer of the ".tt" files.

// Macro parameters; presumably populated by ParametersDialog.open(this) below.

// Directory scanned (non-recursively) for the ".tt" files to convert.
@Field @Option(name="inputDirectory", usage="TXT directory", widget="Folder", required=true, def="dir")
File inputDirectory

// Character encoding used to read the input files.
@Field @Option(name="encoding", usage="File encoding", widget="String", required=false, def="UTF-8")
String encoding

// When true, every input line is echoed prefixed with its line number.
@Field @Option(name="debug", usage="Debug mode", widget="Boolean", required=false, def="false")
Boolean debug

if (!ParametersDialog.open(this)) return

// Fall back to the declared default when the field was left empty or blank,
// otherwise eachLine() below would fail on an invalid charset name.
encoding = encoding?.trim() ?: "UTF-8"

File outputDirectory = new File(inputDirectory, "xml")
// mkdir() also returns false when the directory already exists, so only treat
// it as a failure when the directory is really missing afterwards.
if (!outputDirectory.isDirectory() && !outputDirectory.mkdir()) {
	println "Error: could not create output directory $outputDirectory"
	return false
}

println "Processing: "+inputDirectory

XMLOutputFactory factory = XMLOutputFactory.newInstance()

// One word per line: three tab-separated non-empty columns.
// Compiled once instead of once per input line.
def wordPattern = ~/^([^\t]+)\t([^\t]+)\t([^\t]+)$/

// Writes one <w frpos=".." frlemma="..">form</w> element followed by a newline.
def writeWord = { XMLStreamWriter xml, String form, String pos, String lemma ->
	xml.writeStartElement("w")
	xml.writeAttribute("frpos", pos)
	xml.writeAttribute("frlemma", lemma)
	xml.writeCharacters(form)
	xml.writeEndElement() // close w
	xml.writeCharacters("\n")
}

def files = inputDirectory.listFiles()
if (files == null || files.length == 0) {
	println "Error: no file to process in $inputDirectory"
	return false;
}

for (File inputfile : files.sort()) {
	if (inputfile.isDirectory() || inputfile.isHidden() || !inputfile.getName().endsWith(".tt")) continue // ignore
	println " file: "+inputfile

	// Output file name = input file name with its last extension replaced by ".xml".
	String name = inputfile.getName()
	int idx = name.lastIndexOf(".")
	if (idx > 0) name = name.substring(0, idx)
	File outputfile = new File(outputDirectory, name+".xml")

	Writer output = new OutputStreamWriter(new FileOutputStream(outputfile), "UTF-8")
	try {
		XMLStreamWriter writer = factory.createXMLStreamWriter(output)
		try {
			writer.writeStartDocument("UTF-8", "1.0")
			writer.writeCharacters("\n")
			writer.writeStartElement("text")
			writer.writeCharacters("\n")

			boolean inSent = false // true while a <p><s> pair is open
			int nLine = 0          // 1-based input line counter, used in messages

			inputfile.eachLine(encoding) { line ->
				nLine++

				if (debug) {
					println nLine+": "+line
				}

				def m = wordPattern.matcher(line)
				boolean isWord = m.matches()

				// A "SENT" token only closes a sentence that is already open.
				// When no sentence is open it is written as an ordinary word
				// that opens a new sentence (behaviour kept from the original).
				boolean closesSentence = inSent && isWord && m.group(2) == "SENT"

				if (!inSent) {
					writer.writeStartElement("p")
					writer.writeStartElement("s")
					inSent = true
				}

				if (isWord) {
					// The element text is always the form (column 1), including
					// for the sentence-closing token.
					writeWord(writer, m.group(1), m.group(2), m.group(3))
				} else {
					// Unparsable line: keep its content as a word with placeholder tags.
					writeWord(writer, line, "NAM", "<unknown>")
					println "** TT2XMLInDirectory: unknown [word] line pattern, "+nLine+": <"+line+">."
				}

				if (closesSentence) {
					writer.writeEndElement() // close s
					writer.writeEndElement() // close p (sent)
					writer.writeCharacters("\n")
					inSent = false
				}
			}

			// Close a sentence left open because the file did not end with a SENT token.
			if (inSent) {
				writer.writeEndElement() // s
				writer.writeEndElement() // p
				writer.writeCharacters("\n")
			}
			writer.writeEndElement() // text
			writer.writeCharacters("\n")
			writer.flush()
		} finally {
			// XMLStreamWriter.close() does not close the underlying stream.
			writer.close()
		}
	} finally {
		// Always release the file handle, even when reading or writing failed.
		output.close()
	}
}
115 |
|
116 |
|