Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / nlp / TT2XMLInDirectoryMacro.groovy @ 2769

History | View | Annotate | Download (3.4 kB)

package org.txm.macro.nlp;

import org.kohsuke.args4j.*
import groovy.transform.Field
import java.nio.charset.Charset
import org.txm.rcpapplication.swt.widget.parameters.*
import org.txm.utils.*
import javax.xml.stream.*

// Macro: converts every "*.tt" file of inputDirectory into an XML file written to the
// "xml" sub-directory of inputDirectory (same base name, ".xml" extension).
//
// Input lines are expected to hold three tab-separated columns: form, pos, lemma
// (presumably TreeTagger output -- confirm against the producing tool).
// Output structure: <text> contains <p><s> ... </s></p> groups; each token is a
// <w frpos="pos" frlemma="lemma">form</w> element. A token whose pos is "SENT" closes the
// current sentence. Lines that do not have three columns are kept as
// <w frpos="NAM" frlemma="<unknown>"> and reported on the console.
//
// Returns false when there is nothing to convert or the output directory cannot be created.

@Field @Option(name="inputDirectory", usage="TXT directory", widget="Folder", required=true, def="dir")
File inputDirectory

@Field @Option(name="encoding", usage="File encoding", widget="String", required=false, def="UTF-8")
String encoding

@Field @Option(name="debug", usage="Debug mode", widget="Boolean", required=false, def="false")
Boolean debug

if (!ParametersDialog.open(this)) return

// Fall back to the declared default when the parameter is missing or blank.
encoding = (encoding ?: "UTF-8").trim()
if (encoding.isEmpty()) encoding = "UTF-8"

println "Processing: "+inputDirectory

// Select the input files BEFORE creating the output directory: otherwise the freshly
// created "xml" directory is part of the listing and the emptiness check can never fire.
def listing = inputDirectory.listFiles()
def ttFiles = []
if (listing != null) {
	ttFiles = listing.findAll { File f ->
		!f.isDirectory() && !f.isHidden() && f.getName().endsWith(".tt")
	}.sort()
}
if (ttFiles.isEmpty()) {
	println "Error: no file to process in $inputDirectory"
	return false
}

File outputDirectory = new File(inputDirectory, "xml")
outputDirectory.mkdirs()
if (!outputDirectory.isDirectory()) {
	println "Error: could not create output directory $outputDirectory"
	return false
}

XMLOutputFactory factory = XMLOutputFactory.newInstance()

// form <TAB> pos <TAB> lemma; compiled once and reused for every line of every file.
def wordPattern = ~/^([^\t]+)\t([^\t]+)\t([^\t]+)$/

for (File inputfile : ttFiles) {
	println " file: "+inputfile

	// output file name = input file name without its last extension + ".xml"
	String name = inputfile.getName()
	int idx = name.lastIndexOf(".")
	if (idx > 0) name = name.substring(0, idx)
	File outputfile = new File(outputDirectory, name+".xml")

	Writer output = new OutputStreamWriter(new FileOutputStream(outputfile), "UTF-8")
	try {
		XMLStreamWriter writer = factory.createXMLStreamWriter(output)
		try {
			writer.writeStartDocument("UTF-8", "1.0")
			writer.writeCharacters("\n")
			writer.writeStartElement("text")
			writer.writeCharacters("\n")

			boolean inSent = false // true while a <p><s> pair is open
			int nLine = 0          // 1-based line counter, used in messages only

			inputfile.eachLine(encoding) { line ->
				nLine++

				if (debug) {
					println nLine+": "+line
				}

				def m = (line =~ wordPattern)
				boolean isToken = m.find()
				boolean endsSentence = isToken && m.group(2) == "SENT"

				if (endsSentence && inSent) {
					// sentence-final token: write it, then close the sentence and the paragraph
					writer.writeStartElement("w")
					writer.writeAttribute("frpos", m.group(2))
					writer.writeAttribute("frlemma", m.group(3))
					writer.writeCharacters(m.group(1)) // the form, as for any other token
					writer.writeEndElement() // close w
					writer.writeCharacters("\n")
					writer.writeEndElement() // close s
					writer.writeEndElement() // close p
					writer.writeCharacters("\n")
					inSent = false
					return // next line
				}

				if (!inSent) {
					writer.writeStartElement("p")
					writer.writeStartElement("s")
					inSent = true
				}

				writer.writeStartElement("w")
				if (isToken) {
					writer.writeAttribute("frpos", m.group(2))
					writer.writeAttribute("frlemma", m.group(3))
					writer.writeCharacters(m.group(1))
				} else {
					// not a 3-column line: keep the raw line as the token text and report it
					writer.writeAttribute("frpos", "NAM")
					writer.writeAttribute("frlemma", "<unknown>")
					writer.writeCharacters(line)
					println "** TT2XMLInDirectory: unknown [word] line pattern, "+nLine+": <"+line+">."
				}
				writer.writeEndElement() // close w
				writer.writeCharacters("\n")
			}

			// the file ended in the middle of a sentence: close the pending s and p
			if (inSent) {
				writer.writeEndElement() // s
				writer.writeEndElement() // p
				writer.writeCharacters("\n")
			}
			writer.writeEndElement() // text
			writer.writeCharacters("\n")
		} finally {
			// XMLStreamWriter.close() does not close the underlying writer
			writer.close()
		}
	} finally {
		output.close()
	}
}
115
        
116