Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / TabulatedToXml.groovy @ 1688

History | View | Annotate | Download (6.6 kB)

1
package org.txm.scripts.importer 
2
// Copyright © 2010-2013 ENS de Lyon.
3
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
4
// Lyon 2, University of Franche-Comté, University of Nice
5
// Sophia Antipolis, University of Paris 3.
6
// 
7
// The TXM platform is free software: you can redistribute it
8
// and/or modify it under the terms of the GNU General Public
9
// License as published by the Free Software Foundation,
10
// either version 2 of the License, or (at your option) any
11
// later version.
12
// 
13
// The TXM platform is distributed in the hope that it will be
14
// useful, but WITHOUT ANY WARRANTY; without even the implied
15
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
16
// PURPOSE. See the GNU General Public License for more
17
// details.
18
// 
19
// You should have received a copy of the GNU General
20
// Public License along with the TXM platform. If not, see
21
// http://www.gnu.org/licenses.
22
// 
23
// 
24
// 
25
// $LastChangedDate:$
26
// $LastChangedRevision:$
27
// $LastChangedBy:$ 
28
//
29
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.util.ArrayList;

import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamWriter;

import org.txm.utils.io.IOUtils
41

    
42
// TODO: Auto-generated Javadoc
43
/**
 * Tool to convert a tabulated file (one token per line, tab-separated columns)
 * to an XML file. Typical inputs are CWB files or TreeTagger output.
 *
 * Every word line is written as a {@code <w>} element: the column designated by
 * {@link #noColForm} becomes the element text, every other column becomes an
 * attribute named after the matching entry of {@link #colnames}.
 */
public class TabulatedToXml {

	/** Names of the word properties, one per column of the tabulated file. */
	String[] colnames;

	/** Name of the element that represents a text unit. */
	String texttag;

	/**
	 * If true, lines starting with "<" are copied as XML tags; if false, the whole
	 * content is wrapped in a single {@link #texttag} element.
	 */
	boolean tag;

	/** Index of the column that holds the word form. */
	int noColForm;

	/**
	 * Instantiates a new converter whose word form is read from the first column.
	 *
	 * @param colnames the word properties name
	 * @param texttag the element that represents a text unit
	 * @param tag if true, split the file by the texttag, if false wrap the file with an element texttag
	 */
	public TabulatedToXml(String[] colnames, String texttag, boolean tag) throws Exception
	{
		this(colnames, texttag, tag, 0);
	}

	/**
	 * Instantiates a new converter.
	 *
	 * @param colnames the word properties name
	 * @param texttag the element that represents a text unit
	 * @param tag if true, split the file by the texttag, if false wrap the file with an element texttag
	 * @param noColForm the index of the column that contains the word form
	 */
	public TabulatedToXml(String[] colnames, String texttag, boolean tag, int noColForm) throws Exception
	{
		this.colnames = colnames;
		this.texttag = texttag;
		this.tag = tag;
		this.noColForm = noColForm;
	}

	/**
	 * Converts one tabulated file to an XML file. The XML output is always
	 * written in UTF-8.
	 *
	 * I/O and XML stream errors are not caught here: they propagate to the
	 * caller after the opened resources have been closed.
	 *
	 * @param tabulatedfile the tabulated file to read
	 * @param xmlfile the XML file to write
	 * @param encoding the charset name used to decode the tabulated file; when
	 *        null or empty the platform default charset is used
	 * @return true if the file was converted, false if the content must be
	 *         wrapped but no text element name was given
	 */
	public boolean process(File tabulatedfile, File xmlfile, String encoding)
	{
		// Validate before opening anything so that the failure path leaks no resource.
		if (!tag && !texttag)
		{
			println "no text element given"
			return false;
		}

		FileInputStream input = new FileInputStream(tabulatedfile);
		FileOutputStream output = null;
		XMLStreamWriter writer = null;
		try
		{
			Reader decoded = encoding ? new InputStreamReader(input, encoding) : new InputStreamReader(input);
			BufferedReader reader = new BufferedReader(decoded);

			output = new FileOutputStream(xmlfile);
			writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8");
			writer.writeStartDocument("UTF-8", "1.0");

			if (!tag) // there is no tag, we must wrap the content in an element
			{
				writer.writeStartElement(texttag);
				writer.writeAttribute("id", stripExtension(tabulatedfile.getName()));
			}

			int noline = 0;
			String line = reader.readLine();
			while (line != null)
			{
				if (tag && line.startsWith("</")) // closing xml tag
				{
					writer.writeEndElement();
				}
				else if (tag && line.startsWith("<")) // opening xml tag
				{
					writeStartTag(writer, line);
				}
				else // it's a word
				{
					writeWord(writer, line, noline);
				}
				noline++;
				line = reader.readLine();
			}

			if (!tag) // there is no tag, we close the wrapping element
			{
				writer.writeEndElement();
			}

			writer.writeEndDocument();
			writer.flush();
		}
		finally
		{
			// XMLStreamWriter.close() does not close the underlying stream,
			// so the output stream is closed explicitly as well.
			try
			{
				if (writer != null) writer.close();
			}
			finally
			{
				try
				{
					if (output != null) output.close();
				}
				finally
				{
					input.close();
				}
			}
		}
		return true;
	}

	/**
	 * Writes the start of the element described by an opening tag line such as
	 * {@code <text id="t1" lang="fr">}, including its attributes.
	 *
	 * @param writer the XML writer to write to
	 * @param line the opening tag line, starting with "<"
	 */
	private void writeStartTag(XMLStreamWriter writer, String line)
	{
		// Drop the leading "<", the surrounding blanks and the trailing ">".
		String content = line.substring(1).trim();
		if (content.endsWith(">"))
		{
			content = content.substring(0, content.length() - 1);
		}

		int firstblank = content.indexOf(" ");
		if (firstblank < 0) // no attr
		{
			writer.writeStartElement(content);
			return;
		}

		writer.writeStartElement(content.substring(0, firstblank));

		// name="value" (the value may contain blanks) or name=value (no blank).
		def matcher = (content.substring(firstblank + 1) =~ /([^\s=]+)=(?:"([^"]*)"|(\S*))/);
		while (matcher.find())
		{
			String value = matcher.group(2) != null ? matcher.group(2) : matcher.group(3);
			writer.writeAttribute(matcher.group(1), value.replace("\"", ""));
		}
	}

	/**
	 * Writes one word line as a {@code <w>} element. Columns missing from the
	 * line are written as "N/A" attributes; a missing form column yields an
	 * empty element text.
	 *
	 * @param writer the XML writer to write to
	 * @param line the tab-separated word line
	 * @param noline the line number, used in the error message only
	 */
	private void writeWord(XMLStreamWriter writer, String line, int noline)
	{
		String[] split = line.split("\t");
		if (split.length != colnames.length)
		{
			System.out.println("Error line "+noline+" : missing columns :"+split.length+" instead of "+ colnames.length);
			println "line: "+line
		}

		String form = "";
		writer.writeStartElement("w");
		for (int i = 0 ; i < colnames.length ; i++)
		{
			boolean missing = split.length <= i;
			if (i == noColForm)
			{
				// the form is the text of the element, not an attribute
				form = missing ? "" : split[i];
			}
			else
			{
				writer.writeAttribute(colnames[i], missing ? "N/A" : split[i]);
			}
		}
		writer.writeCharacters(form);
		writer.writeEndElement();
	}

	/**
	 * Removes the last extension of a file name.
	 *
	 * @param filename the file name
	 * @return the name up to (excluding) its last ".", or the name unchanged
	 *         if it contains no "."
	 */
	private static String stripExtension(String filename)
	{
		int dot = filename.lastIndexOf(".");
		return dot < 0 ? filename : filename.substring(0, dot);
	}

	/**
	 * Development entry point: converts every ".txt" file of the hard-coded
	 * directory "xml/Bendinelli" of the user home into "xml/Bendinelli/xml",
	 * wrapping each file in a "debat" element.
	 *
	 * @param args the arguments (not used)
	 */
	public static void main(String[] args)
	{
		String home = System.getProperty("user.home");
		// sample_TXM_LA.vrt
		/*
		 File tabulatedfile = new File(home, "xml/tabulated/sample_TXM_LA.vrt");
		 File xmlfile = new File(home, "xml/tabulated/sample_TXM_LA.xml");
		 String[] colnames = ["form", "pos", "lem", "func"]; // TreeTagger
		 boolean tag = true;
		 String encoding = "UTF-8";
		 try {
		 TabulatedToXml ttx = new TabulatedToXml(colnames, "", tag);
		 ttx.process(tabulatedfile, xmlfile, encoding)
		 } catch (Exception e) {
		 org.txm.utils.logger.Log.printStackTrace(e);
		 }*/
		File srcdir = new File(home, "xml/Bendinelli");
		File outdir = new File(home, "xml/Bendinelli/xml");
		println "srcdir: "+srcdir
		outdir.deleteDir();
		outdir.mkdirs(); // also creates the missing parent directories

		String[] colnames = ["form", "pos", "lem"]; // TreeTagger
		boolean tag = false;
		String encoding = "UTF-8";

		TabulatedToXml ttx = new TabulatedToXml(colnames, "debat", tag);

		// NOTE(review): HIDDENFILE_FILTER presumably skips hidden files - confirm in IOUtils.
		File[] files = srcdir.listFiles(IOUtils.HIDDENFILE_FILTER);
		if (files == null) // srcdir does not exist or cannot be listed
		{
			println "cannot list: "+srcdir
			return;
		}

		for (File tabulatedfile : files)
		{
			String filename = tabulatedfile.getName();
			if (filename.endsWith(".txt"))
			{
				File xmlfile = new File(outdir, stripExtension(filename)+".xml");
				println "process: "+tabulatedfile
				if (!ttx.process(tabulatedfile, xmlfile, encoding))
				{
					println "failed: "+tabulatedfile
				}
			}
		}

		println "Done"
	}
}