Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / TabulatedToXml.groovy @ 1688

History | View | Annotate | Download (6.6 kB)

1 1000 mdecorde
package org.txm.scripts.importer
2 881 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
3 881 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
4 881 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
5 881 mdecorde
// Sophia Antipolis, University of Paris 3.
6 881 mdecorde
//
7 881 mdecorde
// The TXM platform is free software: you can redistribute it
8 881 mdecorde
// and/or modify it under the terms of the GNU General Public
9 881 mdecorde
// License as published by the Free Software Foundation,
10 881 mdecorde
// either version 2 of the License, or (at your option) any
11 881 mdecorde
// later version.
12 881 mdecorde
//
13 881 mdecorde
// The TXM platform is distributed in the hope that it will be
14 881 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
15 881 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
16 881 mdecorde
// PURPOSE. See the GNU General Public License for more
17 881 mdecorde
// details.
18 881 mdecorde
//
19 881 mdecorde
// You should have received a copy of the GNU General
20 881 mdecorde
// Public License along with the TXM platform. If not, see
21 881 mdecorde
// http://www.gnu.org/licenses.
22 881 mdecorde
//
23 881 mdecorde
//
24 881 mdecorde
//
25 881 mdecorde
// $LastChangedDate:$
26 881 mdecorde
// $LastChangedRevision:$
27 881 mdecorde
// $LastChangedBy:$
28 881 mdecorde
//
29 881 mdecorde
import java.io.BufferedReader;
30 881 mdecorde
import java.io.File;
31 881 mdecorde
import java.io.FileOutputStream;
32 881 mdecorde
import java.io.FileReader;
33 881 mdecorde
import java.io.OutputStreamWriter;
34 881 mdecorde
import java.io.Writer;
35 881 mdecorde
import java.util.ArrayList;
36 881 mdecorde
37 881 mdecorde
import javax.xml.stream.XMLOutputFactory;
38 881 mdecorde
import javax.xml.stream.XMLStreamWriter;
39 881 mdecorde
40 1370 mdecorde
import org.txm.utils.io.IOUtils
41 1370 mdecorde
42 881 mdecorde
// TODO: Auto-generated Javadoc
43 881 mdecorde
/**
44 881 mdecorde
 * Tool to convert Tabulated file to xml file. Such as CWB files or TreeTagger ouput
45 881 mdecorde
 */
46 881 mdecorde
public class TabulatedToXml {

    /**
     * Matches one <code>name=value</code> attribute of an opening tag line.
     * The value is either double-quoted (and may then contain blanks) or a
     * bare token without blanks.
     */
    private static final java.util.regex.Pattern ATTRIBUTE =
            java.util.regex.Pattern.compile('([^\\s=]+)=(?:"([^"]*)"|(\\S*))');

    /** The word property names, one per tab-separated column. */
    String[] colnames;

    /** The name of the element that represents a text unit. */
    String texttag;

    /**
     * If true, lines starting with "&lt;" are read as XML tags; if false the
     * whole file is wrapped in a single {@link #texttag} element.
     */
    boolean tag;

    /** The index of the column written as the text of the word element. */
    int noColForm;

    /**
     * Instantiates a new tabulated to xml converter whose word form is the
     * first column.
     *
     * @param colnames the word properties name
     * @param texttag the element that represents a text unit
     * @param tag if true, split the file by the texttag, if false wrap the file with an element texttag
     */
    public TabulatedToXml(String[] colnames, String texttag, boolean tag) throws Exception
    {
        this(colnames, texttag, tag, 0);
    }

    /**
     * Instantiates a new tabulated to xml converter.
     *
     * @param colnames the word properties name
     * @param texttag the element that represents a text unit
     * @param tag if true, split the file by the texttag, if false wrap the file with an element texttag
     * @param noColForm the index of the column holding the word form
     */
    public TabulatedToXml(String[] colnames, String texttag, boolean tag, int noColForm) throws Exception
    {
        this.colnames = colnames;
        this.texttag = texttag;
        this.tag = tag;
        this.noColForm = noColForm;
    }

    /**
     * Converts a tabulated file into an XML file written in UTF-8.
     *
     * Every line is either a closing tag, an opening tag (both only when
     * {@link #tag} is true) or a word made of tab-separated columns. Words
     * become <code>w</code> elements: the form column is the element text,
     * the other columns are attributes named after {@link #colnames}.
     *
     * Any I/O or XML exception is propagated to the caller after the
     * reader, the output stream and the XML writer have been closed.
     *
     * @param tabulatedfile the tabulated file to read
     * @param xmlfile the XML file to write
     * @param encoding the charset name of the tabulated file; when null or
     *                 empty the platform default charset is used
     * @return true if the file was converted, false if {@link #tag} is false
     *         and no wrapping element name was given
     */
    public boolean process(File tabulatedfile, File xmlfile, String encoding)
    {
        // Validate before opening anything, so that a refusal neither leaks
        // streams nor leaves a truncated output file behind.
        if (!tag && (texttag == null || texttag.length() == 0)) {
            println "no text element given"
            return false;
        }

        BufferedReader reader = null;
        FileOutputStream output = null;
        XMLStreamWriter writer = null;
        try {
            reader = openReader(tabulatedfile, encoding);
            output = new FileOutputStream(xmlfile);
            writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8");

            writer.writeStartDocument("UTF-8", "1.0");

            if (!tag) { // there is no tag, we must wrap the content in an element
                writer.writeStartElement(texttag);
                writer.writeAttribute("id", stripExtension(tabulatedfile.getName()));
            }

            int noline = 0;
            String line = reader.readLine();
            while (line != null) {
                if (tag && line.startsWith("</")) { // closing xml tag
                    writer.writeEndElement();
                } else if (tag && line.startsWith("<")) { // opening xml tag
                    writeOpeningTag(writer, line);
                } else { // it's a word
                    writeWord(writer, line, noline);
                }
                noline++;
                line = reader.readLine();
            }

            if (!tag) { // there is no tag, we close the wrapping element
                writer.writeEndElement();
            }
            writer.flush();
        } finally {
            // XMLStreamWriter.close() does not close the underlying stream,
            // so each resource is closed on its own, whatever happened above.
            try {
                if (writer != null) writer.close();
            } finally {
                try {
                    if (output != null) output.close();
                } finally {
                    if (reader != null) reader.close();
                }
            }
        }
        return true;
    }

    /** Opens the tabulated file with the requested charset, or the platform default when none is given. */
    private static BufferedReader openReader(File tabulatedfile, String encoding)
    {
        if (encoding == null || encoding.length() == 0) {
            return new BufferedReader(new FileReader(tabulatedfile));
        }
        return new BufferedReader(new java.io.InputStreamReader(new java.io.FileInputStream(tabulatedfile), encoding));
    }

    /** Returns the file name without its last extension, or the name itself when it has no dot. */
    private static String stripExtension(String filename)
    {
        int dot = filename.lastIndexOf(".");
        return dot < 0 ? filename : filename.substring(0, dot);
    }

    /** Writes the start element, and its attributes if any, described by an opening tag line. */
    private void writeOpeningTag(XMLStreamWriter writer, String line)
    {
        int firstblank = line.indexOf(" ");
        if (firstblank <= 0) { // no attr: keep what is between '<' and the last character
            writer.writeStartElement(line.substring(1, line.length() - 1));
            return;
        }

        writer.writeStartElement(line.substring(1, firstblank));

        // The attributes are matched rather than split on blanks, so that a
        // quoted value may contain blanks and a token without '=' is skipped
        // instead of raising a StringIndexOutOfBoundsException.
        String attributes = line.substring(firstblank + 1, line.length() - 1);
        java.util.regex.Matcher matcher = ATTRIBUTE.matcher(attributes);
        while (matcher.find()) {
            String value = matcher.group(2);
            if (value == null) { // bare value: drop stray double quotes
                value = matcher.group(3).replace("\"", "");
            }
            writer.writeAttribute(matcher.group(1), value);
        }
    }

    /** Writes one word line as a <code>w</code> element. */
    private void writeWord(XMLStreamWriter writer, String line, int noline)
    {
        // String.split drops trailing empty columns; they are then reported
        // as missing and written as "N/A".
        String[] split = line.split("\t");
        if (split.length != colnames.length) {
            System.out.println("Error line "+noline+" : missing columns :"+split.length+" instead of "+ colnames.length);
            println "line: "+line
        }

        writer.writeStartElement("w");
        for (int i = 0; i < colnames.length; i++) {
            if (i == noColForm) {
                continue; // the form is the element text, not an attribute
            }
            writer.writeAttribute(colnames[i], i < split.length ? split[i] : "N/A");
        }

        // The form is read from the configured column, not always from the
        // first one; it is left empty when that column is missing.
        String form = "";
        if (noColForm >= 0 && noColForm < colnames.length && noColForm < split.length) {
            form = split[noColForm];
        }
        writer.writeCharacters(form);
        writer.writeEndElement();
    }

    /**
     * The main method: converts every ".txt" file of ~/xml/Bendinelli into
     * an XML file of ~/xml/Bendinelli/xml, wrapping each file in a "debat"
     * element.
     *
     * For a file that already contains its structure tags, such as
     * xml/tabulated/sample_TXM_LA.vrt, the converter would instead be built
     * with colnames ["form", "pos", "lem", "func"], an empty texttag and
     * tag = true.
     *
     * @param args the arguments (not used)
     */
    public static void main(String[] args)
    {
        String home = System.getProperty("user.home");
        File srcdir = new File(home, "xml/Bendinelli");
        File outdir = new File(home, "xml/Bendinelli/xml");
        println "srcdir: "+srcdir
        // start from an empty output directory
        outdir.deleteDir();
        outdir.mkdir();

        String[] colnames = ["form", "pos", "lem"]; // TreeTagger
        boolean tag = false;
        String encoding = "UTF-8";

        TabulatedToXml ttx = new TabulatedToXml(colnames, "debat", tag);
        // NOTE(review): HIDDENFILE_FILTER presumably skips hidden files - confirm in IOUtils
        for (File tabulatedfile : srcdir.listFiles(IOUtils.HIDDENFILE_FILTER)) {
            if (tabulatedfile.getName().endsWith(".txt")) {
                File xmlfile = new File(outdir, stripExtension(tabulatedfile.getName())+".xml")
                println "process: "+tabulatedfile
                ttx.process(tabulatedfile, xmlfile, encoding)
            }
        }

        println "Done"
    }
}