root / tmp / org.txm.core / src / java / org / txm / scripts / importer / TabulatedToXml.groovy @ 1688
History | View | Annotate | Download (6.6 kB)
1 | 1000 | mdecorde | package org.txm.scripts.importer
|
---|---|---|---|
2 | 881 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
3 | 881 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
4 | 881 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
5 | 881 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
6 | 881 | mdecorde | //
|
7 | 881 | mdecorde | // The TXM platform is free software: you can redistribute it
|
8 | 881 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
9 | 881 | mdecorde | // License as published by the Free Software Foundation,
|
10 | 881 | mdecorde | // either version 2 of the License, or (at your option) any
|
11 | 881 | mdecorde | // later version.
|
12 | 881 | mdecorde | //
|
13 | 881 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
14 | 881 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
15 | 881 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
16 | 881 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
17 | 881 | mdecorde | // details.
|
18 | 881 | mdecorde | //
|
19 | 881 | mdecorde | // You should have received a copy of the GNU General
|
20 | 881 | mdecorde | // Public License along with the TXM platform. If not, see
|
21 | 881 | mdecorde | // http://www.gnu.org/licenses.
|
22 | 881 | mdecorde | //
|
23 | 881 | mdecorde | //
|
24 | 881 | mdecorde | //
|
25 | 881 | mdecorde | // $LastChangedDate:$
|
26 | 881 | mdecorde | // $LastChangedRevision:$
|
27 | 881 | mdecorde | // $LastChangedBy:$
|
28 | 881 | mdecorde | //
|
29 | 881 | mdecorde | import java.io.BufferedReader; |
30 | 881 | mdecorde | import java.io.File; |
31 | 881 | mdecorde | import java.io.FileOutputStream; |
32 | 881 | mdecorde | import java.io.FileReader; |
33 | 881 | mdecorde | import java.io.OutputStreamWriter; |
34 | 881 | mdecorde | import java.io.Writer; |
35 | 881 | mdecorde | import java.util.ArrayList; |
36 | 881 | mdecorde | |
37 | 881 | mdecorde | import javax.xml.stream.XMLOutputFactory; |
38 | 881 | mdecorde | import javax.xml.stream.XMLStreamWriter; |
39 | 881 | mdecorde | |
40 | 1370 | mdecorde | import org.txm.utils.io.IOUtils |
41 | 1370 | mdecorde | |
42 | 881 | mdecorde | // TODO: Auto-generated Javadoc
|
/**
 * Tool to convert a tabulated file to an XML file, such as CWB (.vrt) files or
 * TreeTagger output.
 *
 * Each non-tag line of the input is split on tabulations and written as a
 * {@code <w>} element: the column at index {@code noColForm} becomes the text
 * content of the element and every other column becomes an attribute named
 * after the matching entry of {@code colnames}.
 *
 * When {@code tag} is true, lines starting with {@code <} are interpreted as
 * opening or closing XML tags; when it is false the whole content is wrapped in
 * a single {@code texttag} element whose {@code id} attribute is the input file
 * name without its extension.
 */
public class TabulatedToXml {

	/** Names of the columns (word properties), in the order they appear on a line. */
	String[] colnames;

	/** Name of the element used to wrap the content when {@code tag} is false. */
	String texttag;

	/** If true the input already contains XML tag lines; if false the content is wrapped in {@code texttag}. */
	boolean tag;

	/** Index of the column holding the word form (written as element text, not as an attribute). */
	int noColForm;

	/**
	 * Instantiates a new tabulated to xml converter whose form column is the first one.
	 *
	 * @param colnames the word properties name
	 * @param texttag the element that represents a text unit
	 * @param tag if true, the tag lines of the file are converted to XML elements, if false the file is wrapped with an element texttag
	 */
	public TabulatedToXml(String[] colnames, String texttag, boolean tag) throws Exception
	{
		this(colnames, texttag, tag, 0);
	}

	/**
	 * Instantiates a new tabulated to xml converter.
	 *
	 * @param colnames the word properties name
	 * @param texttag the element that represents a text unit
	 * @param tag if true, the tag lines of the file are converted to XML elements, if false the file is wrapped with an element texttag
	 * @param noColForm the index of the column that contains the word form
	 */
	public TabulatedToXml(String[] colnames, String texttag, boolean tag, int noColForm) throws Exception
	{
		this.colnames = colnames;
		this.texttag = texttag;
		this.tag = tag;
		this.noColForm = noColForm;
	}

	/**
	 * Converts one tabulated file to an XML file. The XML file is always written in UTF-8.
	 *
	 * @param tabulatedfile the tabulated file to read
	 * @param xmlfile the XML file to write
	 * @param encoding the encoding of the tabulated file; when null or empty the
	 *        platform default charset is used
	 * @return true if the file was converted, false if no wrapping element name
	 *         was given while {@code tag} is false (nothing is written in that case)
	 */
	public boolean process(File tabulatedfile, File xmlfile, String encoding)
	{
		// Check the configuration before touching the file system so that a
		// refused conversion leaves no half-written output file behind.
		if (!tag && (texttag == null || texttag.length() == 0))
		{
			println "no text element given"
			return false;
		}

		// Resolve the charset first: an unknown encoding name then fails before any stream is opened.
		java.nio.charset.Charset charset = (encoding != null && encoding.length() > 0) ? java.nio.charset.Charset.forName(encoding) : java.nio.charset.Charset.defaultCharset();

		BufferedReader reader = null;
		FileOutputStream output = null;
		XMLStreamWriter writer = null;
		try
		{
			reader = new BufferedReader(new InputStreamReader(new FileInputStream(tabulatedfile), charset));
			output = new FileOutputStream(xmlfile);

			XMLOutputFactory factory = XMLOutputFactory.newInstance();
			writer = factory.createXMLStreamWriter(output, "UTF-8");
			writer.writeStartDocument("UTF-8", "1.0")

			if (!tag) // there is no tag, we must wrap the content in an element
			{
				writer.writeStartElement(texttag);
				writer.writeAttribute("id", stripExtension(tabulatedfile.getName()));
			}

			int noline = 0;
			String line = reader.readLine();
			while (line != null)
			{
				if (tag && line.startsWith("</")) // closing xml tag
				{
					writer.writeEndElement();
				}
				else if (tag && line.startsWith("<")) // opening xml tag
				{
					writeStartTag(writer, line);
				}
				else // it's a word
				{
					writeWord(writer, line, noline);
				}
				noline++;
				line = reader.readLine();
			}

			if (!tag) // there is no tag, we close the wrapping element
			{
				writer.writeEndElement();
			}

			// Flush inside the try block so that a write failure is reported to the caller.
			writer.flush();
			return true;
		}
		finally
		{
			// XMLStreamWriter.close() does not close the underlying stream: close each resource.
			if (writer != null)
			{
				try { writer.close(); } catch (Exception ignored) { /* best effort, the content was already flushed */ }
			}
			if (output != null)
			{
				try { output.close(); } catch (Exception ignored) { /* best effort */ }
			}
			if (reader != null)
			{
				try { reader.close(); } catch (Exception ignored) { /* best effort */ }
			}
		}
	}

	/**
	 * Returns the file name without its last extension, or the name itself when it contains no dot.
	 */
	private static String stripExtension(String filename)
	{
		int dot = filename.lastIndexOf(".");
		return dot < 0 ? filename : filename.substring(0, dot);
	}

	/**
	 * Writes the start element described by an opening tag line such as
	 * {@code <text id="a" type="b">}. Attributes are separated by blanks and
	 * double quotes are removed from their values.
	 */
	private static void writeStartTag(XMLStreamWriter writer, String line)
	{
		int firstblank = line.indexOf(" ");
		if (firstblank < 0) // no attr
		{
			writer.writeStartElement(line.substring(1, line.length() - 1));
			return;
		}

		// contains attributes
		String tagname = line.substring(1, firstblank);
		writer.writeStartElement(tagname);

		String attributes = line.substring(firstblank + 1, line.length() - 1);
		for (String namevalue : attributes.split(" "))
		{
			int egalidx = namevalue.indexOf("=");
			if (egalidx <= 0)
			{
				// Empty token (consecutive blanks) or token without a name: it cannot be
				// written as an attribute, skip it instead of failing on substring().
				if (namevalue.length() > 0)
				{
					println "Warning: ignoring malformed attribute '"+namevalue+"' of element <"+tagname+">"
				}
				continue;
			}
			String name = namevalue.substring(0, egalidx);
			String value = namevalue.substring(egalidx + 1, namevalue.length());

			writer.writeAttribute(name, value.replace("\"",""));
		}
	}

	/**
	 * Writes one token line as a {@code <w>} element: the form column as text,
	 * the other columns as attributes ("N/A" when the column is missing).
	 */
	private void writeWord(XMLStreamWriter writer, String line, int noline)
	{
		String[] split = line.split("\t");
		if (split.length != colnames.length)
		{
			System.out.println("Error line "+noline+" : missing columns :"+split.length+" instead of "+ colnames.length);
			println "line: "+line
		}

		String form = "";
		writer.writeStartElement("w");
		for (int i = 0 ; i < colnames.length ; i++)
		{
			boolean present = i < split.length;
			if (i == noColForm)
			{
				// The form is the column designated by noColForm, not necessarily the first one.
				if (present)
				{
					form = split[i];
				}
			}
			else if (present)
			{
				writer.writeAttribute(colnames[i], split[i]);
			}
			else
			{
				writer.writeAttribute(colnames[i], "N/A");
			}
		}
		// Attributes must all be written before the text content of the element.
		writer.writeCharacters(form);
		writer.writeEndElement();
	}

	/**
	 * Sample entry point: converts every ".txt" file of the "xml/Bendinelli"
	 * directory of the user home to an XML file in its "xml" sub-directory,
	 * which is deleted and recreated first.
	 *
	 * @param args the arguments (not used)
	 */
	public static void main(String[] args)
	{
		String home = System.getProperty("user.home");
		// sample_TXM_LA.vrt
		/*
		File tabulatedfile = new File(home, "xml/tabulated/sample_TXM_LA.vrt");
		File xmlfile = new File(home, "xml/tabulated/sample_TXM_LA.xml");
		String[] colnames = ["form", "pos", "lem", "func"]; // TreeTagger
		boolean tag = true;
		String encoding = "UTF-8";
		try {
			TabulatedToXml ttx = new TabulatedToXml(colnames, "", tag);
			ttx.process(tabulatedfile, xmlfile, encoding)
		} catch (Exception e) {
			org.txm.utils.logger.Log.printStackTrace(e);
		}*/
		File srcdir = new File(home, "xml/Bendinelli");
		File outdir = new File(home, "xml/Bendinelli/xml");
		println "srcdir: "+srcdir
		outdir.deleteDir();
		outdir.mkdir();

		String[] colnames = ["form", "pos", "lem"]; // TreeTagger
		boolean tag = false;
		String encoding = "UTF-8";

		TabulatedToXml ttx = new TabulatedToXml(colnames, "debat", tag);
		for (File tabulatedfile : srcdir.listFiles(IOUtils.HIDDENFILE_FILTER))
		{
			if (tabulatedfile.getName().endsWith(".txt"))
			{
				String filename = tabulatedfile.getName()
				filename = filename.substring(0, filename.lastIndexOf("."));
				File xmlfile = new File(outdir, filename+".xml")
				println "process: "+tabulatedfile
				ttx.process(tabulatedfile, xmlfile, encoding)
			}
		}

		println "Done"
	}
}