Revision 3025

tmp/org.txm.groovy.core/src/groovy/org/txm/macro/txt/CQP2XMLMacro.groovy (revision 3025)
1
package org.txm.macro.txt
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcp.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10

  
11
// Directory scanned for "*.cqp" files; the XML results are written to its "out" sub-directory.
@Field @Option(name="inputDirectory", usage="CQP directory", widget="Folder", required=false, def="")
		File inputDirectory

// Character encoding used to read the input files.
@Field @Option(name="encoding", usage="File encoding", widget="String", required=false, def="UTF-8")
		String encoding

// Column separator. The two-character text "\t" stands for a TAB; otherwise only the first character is used.
@Field @Option(name="separator", usage="Column separator (TAB by default)", widget="String", required=false, def="\\t")
		String separator

// Comma-separated column names. The "word" column becomes the text of each <w> element,
// every other column becomes an attribute of that element.
@Field @Option(name="columns", usage="Column names separated by comma ('word,pos,lemma' by default)", widget="String", required=true, def="word,pos,lemma")
		String columns

// Name of the root element wrapping each generated document.
def rootTag = "text"

// Presumably shows the parameters dialog and fills the @Field values above (TODO confirm);
// the macro stops when open() reports failure.
if (!ParametersDialog.open(this)) return;

// inputDirectory is optional in the dialog, but nothing can be done without a real directory.
if (inputDirectory == null || !inputDirectory.isDirectory()) {
	println "inputDirectory is not a directory: "+inputDirectory
	return
}

// Fall back to the declared default when the encoding was left blank.
encoding = (encoding ?: "").trim()
if (encoding.length() == 0) {
	encoding = "UTF-8"
}

// An empty separator falls back to TAB instead of failing on separator[0].
if (separator == null || separator.length() == 0 || separator == "\\t") {
	separator = "\t"
} else {
	separator = separator[0]
}

// Trim every name so that "word, pos, lemma" yields valid attribute names.
columnNames = columns.split(",").collect { it.trim() } as String[]

outputDirectory = new File(inputDirectory, "out")
if (!outputDirectory.isDirectory() && !outputDirectory.mkdir()) {
	println "could not create the output directory: "+outputDirectory
	return
}
39

  
40
println "processing: "+inputDirectory

// listFiles() returns null when the path cannot be listed (missing, not a directory, I/O error).
File[] inputFiles = inputDirectory?.listFiles()
if (inputFiles == null) {
	println "cannot list the files of: "+inputDirectory
	return
}

// String.split() takes a regular expression: quote the separator so that characters
// such as "|" or "." are matched literally.
String separatorRegex = java.util.regex.Pattern.quote(separator)

// Convert every visible "*.cqp" file of the input directory to "<outputDirectory>/<name>.xml".
// Lines starting with "<" are copied verbatim (structure tags); every other non-empty line
// is split into columns and written as a <w> element.
for (File inputfile : inputFiles) {
	if (inputfile.isDirectory() || inputfile.isHidden() || !inputfile.getName().endsWith(".cqp")) {
		continue // ignore
	}

	println " file: "+inputfile

	// Output name = input name without its extension
	String name = inputfile.getName()
	int idx = name.lastIndexOf(".")
	if (idx > 0) {
		name = name.substring(0, idx)
	}

	File outputfile = new File(outputDirectory, name+".xml")

	// The document is built in memory: the raw "<...>" lines are written straight into
	// the buffer, in between the events produced by the XML stream writer.
	StringWriter xmlBuffer = new StringWriter()
	XMLStreamWriter writer = XMLOutputFactory.newInstance().createXMLStreamWriter(xmlBuffer)
	int linesWithExtraValues = 0

	try {
		writer.writeStartDocument("UTF-8","1.0")
		writer.writeCharacters("\n")
		writer.writeStartElement(rootTag)
		writer.writeAttribute("id", name)
		writer.writeCharacters("\n")

		inputfile.withReader(encoding) { reader ->
			def line
			while ((line = reader.readLine()) != null) {
				if (line.length() == 0) {
					continue // skip empty lines
				}

				if (line[0] == "<") {
					// flush pending XML events so the raw line lands at the right position
					writer.flush()
					xmlBuffer.write(line)
					xmlBuffer.flush()
				} else {
					writer.writeStartElement("w")
					def word = ""
					String[] values = line.split(separatorRegex)
					if (values.length > columnNames.size()) {
						linesWithExtraValues++
					}
					// values without a declared column name are ignored
					int usable = Math.min(values.length, columnNames.size())
					for (int i = 0; i < usable; i++) {
						if (columnNames[i] == "word") {
							word = values[i]
						} else {
							// attributes must be written before the element content
							writer.writeAttribute(columnNames[i], values[i])
						}
					}
					writer.writeCharacters(word) // the stream writer escapes XML reserved characters
					writer.writeEndElement() // w
				}
				writer.writeCharacters("\n")
			}
		}

		writer.writeEndElement() // rootTag
		writer.flush()
	} finally {
		writer.close()
	}

	if (linesWithExtraValues > 0) {
		println " warning: "+linesWithExtraValues+" line(s) with more values than column names in "+inputfile
	}

	// Overwrite (never append to) the result file, using the encoding announced in the
	// XML declaration. The file is only written once the conversion succeeded.
	outputfile.withWriter("UTF-8") { out ->
		out.write(xmlBuffer.toString())
		out.newLine()
	}
}

Also available in: Unified diff