Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / TabulatedToXml.groovy @ 187

History | View | Annotate | Download (6.5 kB)

1
package org.txm.importer 
2
// Copyright © 2010-2013 ENS de Lyon.
3
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
4
// Lyon 2, University of Franche-Comté, University of Nice
5
// Sophia Antipolis, University of Paris 3.
6
// 
7
// The TXM platform is free software: you can redistribute it
8
// and/or modify it under the terms of the GNU General Public
9
// License as published by the Free Software Foundation,
10
// either version 2 of the License, or (at your option) any
11
// later version.
12
// 
13
// The TXM platform is distributed in the hope that it will be
14
// useful, but WITHOUT ANY WARRANTY; without even the implied
15
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
16
// PURPOSE. See the GNU General Public License for more
17
// details.
18
// 
19
// You should have received a copy of the GNU General
20
// Public License along with the TXM platform. If not, see
21
// http://www.gnu.org/licenses.
22
// 
23
// 
24
// 
25
// $LastChangedDate:$
26
// $LastChangedRevision:$
27
// $LastChangedBy:$ 
28
//
29
import java.io.BufferedReader;
30
import java.io.File;
31
import java.io.FileOutputStream;
32
import java.io.FileReader;
33
import java.io.OutputStreamWriter;
34
import java.io.Writer;
35
import java.util.ArrayList;
36

    
37
import javax.xml.stream.XMLOutputFactory;
38
import javax.xml.stream.XMLStreamWriter;
39

    
40
/**
 * Tool to convert a tabulated file to an XML file, such as CWB files or
 * TreeTagger output.
 *
 * Each non-tag input line is split on tabulations and written as a
 * {@code <w>} element: the column at index {@code noColForm} becomes the
 * text of the element, every other column becomes an attribute named after
 * the matching entry of {@code colnames}.
 *
 * The output document is always written in UTF-8.
 */
public class TabulatedToXml {

	/**
	 * Matches one {@code name=value} pair of an opening tag line.
	 * Group 1 is the name, group 2 a double-quoted value, group 3 a
	 * single-quoted value and group 4 an unquoted value (possibly empty).
	 * Quoted values may contain blanks.
	 */
	private static final java.util.regex.Pattern ATTRIBUTE =
		java.util.regex.Pattern.compile("([^\\s=]+)=(?:\"([^\"]*)\"|'([^']*)'|(\\S*))");

	/** The word property names, one per column of the tabulated file. */
	String[] colnames;

	/** The name of the element that represents a text unit. */
	String texttag;

	/** True if the tabulated file already contains XML tag lines. */
	boolean tag;

	/** The index of the column holding the word form. */
	int noColForm;

	/**
	 * Instantiates a new converter whose word form is read from the first column.
	 *
	 * @param colnames the word properties name
	 * @param texttag the element that represents a text unit
	 * @param tag if true, split the file by the texttag, if false wrap the file with an element texttag
	 */
	public TabulatedToXml(String[] colnames, String texttag, boolean tag) throws Exception
	{
		this(colnames, texttag, tag, 0);
	}

	/**
	 * Instantiates a new converter.
	 *
	 * @param colnames the word properties name
	 * @param texttag the element that represents a text unit
	 * @param tag if true, split the file by the texttag, if false wrap the file with an element texttag
	 * @param noColForm the index of the column that holds the word form
	 */
	public TabulatedToXml(String[] colnames, String texttag, boolean tag, int noColForm) throws Exception
	{
		this.colnames = colnames;
		this.texttag = texttag;
		this.tag = tag;
		this.noColForm = noColForm;
	}

	/**
	 * Converts one tabulated file to an XML file.
	 *
	 * When {@code tag} is false the whole content is wrapped in a
	 * {@code texttag} element whose {@code id} attribute is the input file
	 * name without its extension. When {@code tag} is true, lines starting
	 * with {@code </} close the current element (the name is not checked) and
	 * lines starting with {@code <} open a new element.
	 *
	 * All streams are closed before this method returns, including when an
	 * exception is thrown.
	 *
	 * @param tabulatedfile the tabulated file to read
	 * @param xmlfile the XML file to write
	 * @param encoding the charset name used to decode the tabulated file;
	 *        the platform default charset is used when null or empty
	 * @return true if the file was converted, false if no wrapping element
	 *         name is available while {@code tag} is false
	 */
	public boolean process(File tabulatedfile, File xmlfile, String encoding)
	{
		// Check the configuration before touching any file so that a refused
		// conversion neither leaks a stream nor leaves a truncated output file.
		if (!tag && (texttag == null || texttag.length() == 0))
		{
			println "no text element given"
			return false;
		}

		FileInputStream input = new FileInputStream(tabulatedfile);
		try
		{
			Reader decoder = (encoding != null && encoding.length() > 0) ?
				new InputStreamReader(input, encoding) : new InputStreamReader(input);
			BufferedReader reader = new BufferedReader(decoder);

			FileOutputStream output = new FileOutputStream(xmlfile);
			try
			{
				XMLStreamWriter writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8");
				try
				{
					writer.writeStartDocument("UTF-8", "1.0");

					if (!tag) // there is no tag, we must wrap the content in an element
					{
						writer.writeStartElement(texttag);
						writer.writeAttribute("id", stripExtension(tabulatedfile.getName()));
					}

					int noline = 0;
					String line = reader.readLine();
					while (line != null)
					{
						if (tag && line.startsWith("</")) // closing xml tag
						{
							writer.writeEndElement();
						}
						else if (tag && line.startsWith("<")) // opening xml tag
						{
							writeOpeningTag(writer, line, noline);
						}
						else // it's a word
						{
							writeWord(writer, line, noline);
						}
						noline++;
						line = reader.readLine();
					}

					if (!tag) // there is no tag, we close the wrapping element
					{
						writer.writeEndElement();
					}

					// Closes any element left open by the input and ends the document.
					writer.writeEndDocument();
					writer.flush();
				}
				finally
				{
					// XMLStreamWriter.close() does not close the underlying stream.
					writer.close();
				}
			}
			finally
			{
				output.close();
			}
		}
		finally
		{
			input.close();
		}
		return true;
	}

	/**
	 * Writes the element described by an opening tag line such as
	 * {@code <text id="a" title="b c">}. A line ending with {@code />} is
	 * written as an empty element.
	 *
	 * @param writer the XML writer
	 * @param line the tag line, starting with {@code <}
	 * @param noline the 0-based line number, used in error messages
	 */
	private void writeOpeningTag(XMLStreamWriter writer, String line, int noline)
	{
		// The line starts with "<", so trim() only removes trailing blanks.
		String content = line.trim().substring(1);
		boolean selfClosing = false;
		if (content.endsWith("/>"))
		{
			selfClosing = true;
			content = content.substring(0, content.length() - 2);
		}
		else if (content.endsWith(">"))
		{
			content = content.substring(0, content.length() - 1);
		}

		// parts[0] is the tag name, parts[1] (if any) the raw attribute list
		String[] parts = content.trim().split("\\s+", 2);
		String tagname = parts[0];
		if (tagname.length() == 0)
		{
			System.out.println("Error line "+noline+" : empty tag name");
			println "line: "+line
			return;
		}

		if (selfClosing)
		{
			writer.writeEmptyElement(tagname);
		}
		else
		{
			writer.writeStartElement(tagname);
		}

		if (parts.length > 1) // contains attributes
		{
			java.util.regex.Matcher matcher = ATTRIBUTE.matcher(parts[1]);
			while (matcher.find())
			{
				String value = matcher.group(2);
				if (value == null)
				{
					value = matcher.group(3);
				}
				if (value == null)
				{
					// unquoted value: drop stray double quotes
					value = matcher.group(4).replace("\"", "");
				}
				writer.writeAttribute(matcher.group(1), value);
			}
		}
	}

	/**
	 * Writes one tabulated line as a {@code <w>} element. Missing columns
	 * are reported on the standard output; a missing property is written as
	 * {@code N/A} and a missing form as an empty text.
	 *
	 * @param writer the XML writer
	 * @param line the tabulated line
	 * @param noline the 0-based line number, used in error messages
	 */
	private void writeWord(XMLStreamWriter writer, String line, int noline)
	{
		// Note: split() drops trailing empty columns and returns an empty
		// array for a line made only of tabulations.
		String[] split = line.split("\t");
		if (split.length != colnames.length)
		{
			System.out.println("Error line "+noline+" : missing columns :"+split.length+" instead of "+ colnames.length);
			println "line: "+line
		}

		String form = "";
		writer.writeStartElement("w");
		for (int i = 0 ; i < colnames.length ; i++)
		{
			boolean present = i < split.length;
			if (i == noColForm)
			{
				form = present ? split[i] : "";
			}
			else
			{
				writer.writeAttribute(colnames[i], present ? split[i] : "N/A");
			}
		}
		writer.writeCharacters(form);
		writer.writeEndElement();
	}

	/**
	 * Removes the last extension of a file name.
	 *
	 * @param filename the file name
	 * @return the name up to its last dot, or the whole name if it has no dot
	 */
	private static String stripExtension(String filename)
	{
		int dot = filename.lastIndexOf(".");
		return dot >= 0 ? filename.substring(0, dot) : filename;
	}

	/**
	 * Sample run: converts every ".txt" file of ~/xml/Bendinelli to an XML
	 * file in ~/xml/Bendinelli/xml, wrapping each file in a "debat" element.
	 * The output directory is deleted and recreated first.
	 *
	 * @param args the arguments (unused)
	 */
	public static void main(String[] args)
	{
		String home = System.getProperty("user.home");
		File srcdir = new File(home, "xml/Bendinelli");
		File outdir = new File(home, "xml/Bendinelli/xml");
		println "srcdir: "+srcdir
		if (!srcdir.isDirectory())
		{
			println "srcdir is not a directory: "+srcdir
			return;
		}
		outdir.deleteDir();
		outdir.mkdirs();

		String[] colnames = ["form", "pos", "lem"]; // TreeTagger
		boolean tag = false;
		String encoding = "UTF-8";

		TabulatedToXml ttx = new TabulatedToXml(colnames, "debat", tag);
		File[] candidates = srcdir.listFiles();
		if (candidates != null) // null when the directory cannot be read
		{
			for (File tabulatedfile : candidates)
			{
				if (tabulatedfile.getName().endsWith(".txt"))
				{
					File xmlfile = new File(outdir, stripExtension(tabulatedfile.getName())+".xml")
					println "process: "+tabulatedfile
					if (!ttx.process(tabulatedfile, xmlfile, encoding))
					{
						println "failed: "+tabulatedfile
					}
				}
			}
		}

		println "Done"
	}
}