root / tmp / org.txm.core / src / java / org / txm / scripts / importer / FortranColumns2XML.groovy @ 2473
History | View | Annotate | Download (7.9 kB)
1 | 881 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 881 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 881 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 881 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 881 | mdecorde | //
|
6 | 881 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 881 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 881 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 881 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 881 | mdecorde | // later version.
|
11 | 881 | mdecorde | //
|
12 | 881 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 881 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 881 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 881 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 881 | mdecorde | // details.
|
17 | 881 | mdecorde | //
|
18 | 881 | mdecorde | // You should have received a copy of the GNU General
|
19 | 881 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 881 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 881 | mdecorde | |
22 | 881 | mdecorde | //
|
23 | 881 | mdecorde | // This file is part of the TXM platform.
|
24 | 881 | mdecorde | //
|
25 | 881 | mdecorde | // The TXM platform is free software: you can redistribute it and/or modify
|
26 | 881 | mdecorde | // it under the terms of the GNU General Public License as published by
|
27 | 881 | mdecorde | // the Free Software Foundation, either version 3 of the License, or
|
28 | 881 | mdecorde | // (at your option) any later version.
|
29 | 881 | mdecorde | //
|
30 | 881 | mdecorde | // The TXM platform is distributed in the hope that it will be useful,
|
31 | 881 | mdecorde | // but WITHOUT ANY WARRANTY; without even the implied warranty of
|
32 | 881 | mdecorde | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
33 | 881 | mdecorde | // GNU General Public License for more details.
|
34 | 881 | mdecorde | //
|
35 | 881 | mdecorde | // You should have received a copy of the GNU General Public License
|
36 | 881 | mdecorde | // along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
|
37 | 881 | mdecorde | //
|
38 | 881 | mdecorde | //
|
39 | 881 | mdecorde | //
|
40 | 881 | mdecorde | // $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
|
41 | 881 | mdecorde | // $LastChangedRevision: 3426 $
|
42 | 881 | mdecorde | // $LastChangedBy: mdecorde $
|
43 | 881 | mdecorde | //
|
44 | 1000 | mdecorde | package org.txm.scripts.importer;
|
45 | 881 | mdecorde | |
46 | 881 | mdecorde | import org.txm.utils.i18n.DetectBOM; |
47 | 881 | mdecorde | import org.w3c.dom.Document; |
48 | 881 | mdecorde | import org.w3c.dom.Element; |
49 | 881 | mdecorde | import org.w3c.dom.NodeList; |
50 | 881 | mdecorde | import org.xml.sax.SAXException; |
51 | 881 | mdecorde | |
52 | 881 | mdecorde | import java.io.File; |
53 | 881 | mdecorde | import java.io.FileInputStream; |
54 | 881 | mdecorde | import java.io.InputStreamReader; |
55 | 881 | mdecorde | |
56 | 881 | mdecorde | import javax.xml.stream.*; |
57 | 881 | mdecorde | import java.net.URL; |
58 | 881 | mdecorde | |
59 | 881 | mdecorde | // TODO: Auto-generated Javadoc
|
/**
 * Converts a text file whose fields sit at fixed character positions
 * ("Fortran columns") into an XML file.
 *
 * Every non-empty input line becomes one element named after the configured
 * word tag. The column designated as the word column is written as the text
 * of the element; every other column is written as an attribute named after
 * the corresponding entry of the column names. All elements are wrapped in a
 * single <text> root element.
 *
 * Typical use: construct, call setColumnsLinesNames() (and optionally
 * setColumnIndexes() and setAddIds()), then call process() once.
 */
class FortranColumns2XML {

	/** The fixed-width input file. */
	File infile;

	/** The XML output file. */
	File outfile;

	/** The character encoding used to decode the input file. */
	String encoding;

	/** The attribute name of each column, in column order. */
	List<String> columnNames

	/** The name of the element written for each non-empty line. */
	String wordTag;

	/** The index of the column written as element text instead of as an attribute. */
	int wordcolumn;

	/** The number of input lines handled so far (blank lines included). */
	int linecounter = 0;

	/** Column index stored by setLineCheck(); not read by the conversion in this class. */
	int checkLineNumber;

	/** Tag name stored by setLineCheck(); not read by the conversion in this class. */
	String checkLineNumberTag;

	/** Next number used to build the "id" attribute of an element when ids are enabled. */
	int wordcount = 1;

	/** Whether an "id" attribute is added to every written element. */
	boolean setIds = false;

	/** The base name of the input file (name without its last extension). */
	String textname;

	/** Start offset (inclusive) of each column in a line. */
	private List<Integer> columnindex;

	/** End offset (exclusive) of each column in a line. */
	private List<Integer> endcolumnindex;

	/** The input line currently being converted. */
	private String line;

	/** The XML writer bound to the output stream. */
	private XMLStreamWriter writer;

	/** The decoding reader over the input file, positioned after any BOM. */
	private Reader reader;

	/** The raw output stream; closed explicitly because closing the XML writer does not close it. */
	private FileOutputStream output;

	/**
	 * Opens the input file for reading and the output file for writing.
	 *
	 * @param infile the fixed-width text file to read
	 * @param outfile the XML file to create (always written as UTF-8)
	 * @param encoding the character encoding of the input file
	 */
	FortranColumns2XML(File infile, File outfile, String encoding)
	{
		this.infile = infile;
		this.outfile = outfile;
		this.encoding = encoding;

		// the text name is the file name without its last extension
		textname = infile.getName();
		int idx = textname.lastIndexOf(".");
		if (idx > 0)
			textname = textname.substring(0, idx);

		def input = new FileInputStream(infile)
		try {
			// skip the BOM bytes on the raw stream before any decoding/buffering happens
			DetectBOM bomdetector = new DetectBOM(infile);
			for (int ibom = 0 ; ibom < bomdetector.getBOMSize() ; ibom++) input.read()

			// buffered so that lines are read efficiently and CRLF counts as one line terminator
			reader = new BufferedReader(new InputStreamReader(input, encoding));

			XMLOutputFactory factory = XMLOutputFactory.newInstance();
			output = new FileOutputStream(outfile)
			writer = factory.createXMLStreamWriter(output, "UTF-8");
		} catch (Exception e) {
			// do not leak the streams opened so far when construction fails
			if (output != null) output.close()
			input.close()
			throw e
		}
	}

	/**
	 * Converts the whole input file and closes the input and output.
	 *
	 * If no column indexes were set, they are detected from the first line.
	 * The instance cannot be reused after this call, whatever the result.
	 *
	 * @return true if the XML document was written; false if the columns
	 * could not be detected (empty file) or if the number of columns differs
	 * from the number of column names
	 */
	public boolean process()
	{
		try {
			line = reader.readLine(); // use the first line to find columns

			if (columnindex == null || endcolumnindex == null)
			{
				if (line == null)
				{
					System.err.println("Cannot detect columns: input file is empty: "+infile);
					return false;
				}
				searchNumberOfColumns();
			}

			if (columnindex.size() != columnNames.size())
			{
				System.err.println("Number of column("+columnindex.size()+") different of number of column name("+columnNames.size()+")");
				return false;
			}

			//write start document
			writer.writeStartDocument("UTF-8", "1.0");
			writer.writeStartElement("text")
			writer.writeAttribute("id", infile.getName())

			// an empty file (line == null) yields an empty <text> element
			while (line != null)
			{
				processLine(); // split the line according to the column indexes
				linecounter++;
				line = reader.readLine();
			}

			writer.writeEndElement();
			writer.writeEndDocument();
			return true;
		} finally {
			closeResources();
		}
	}

	/**
	 * Closes the XML writer, the output stream and the input reader, attempting
	 * each close even if a previous one failed.
	 */
	private void closeResources()
	{
		try {
			writer.close();
		} finally {
			try {
				output.close();
			} finally {
				reader.close();
			}
		}
	}

	/**
	 * Writes the current line as one element; blank lines write nothing.
	 *
	 * A column that extends past the end of the line is reported on standard
	 * output and skipped; the remaining columns are still written.
	 */
	private void processLine()
	{
		if (line.length() == 0)
			return;

		String pcdata = null;
		writer.writeStartElement(wordTag)
		if (setIds)
			writer.writeAttribute("id", "w_"+textname+"_"+(wordcount++));

		for (int i = 0 ; i < columnindex.size() ; i++)
		{
			int start = columnindex.get(i)
			int end = endcolumnindex.get(i)
			// 'end' is exclusive, so a line of exactly 'end' characters still holds the column
			if (line.length() >= end)
			{
				String value = line.substring(start, end).trim()
				if (i != wordcolumn)
					writer.writeAttribute(columnNames.get(i), value)
				else
					pcdata = value
			}
			else
			{
				println "Error: line $linecounter: len="+line.length()+" maxidx="+end
				println line;
			}
		}

		// the word column may be missing on a short line: write no text in that case
		if (pcdata != null)
			writer.writeCharacters(pcdata);
		writer.writeEndElement();
	}

	/**
	 * Sets the column boundaries explicitly instead of detecting them from the first line.
	 *
	 * @param starts the start offset (inclusive) of each column
	 * @param ends the end offset (exclusive) of each column
	 */
	private void setColumnIndexes(List<Integer> starts, List<Integer> ends)
	{
		columnindex = starts;
		endcolumnindex = ends;
	}

	/**
	 * Enables or disables the "id" attribute on written elements.
	 *
	 * @param setIds true to add an id of the form w_<textname>_<n> to every element
	 */
	public void setAddIds(boolean setIds)
	{
		this.setIds = setIds;
	}

	/**
	 * Detects the column boundaries from the current line.
	 *
	 * A new column starts at each non-space character that follows one or more
	 * spaces; the first column starts at offset 0. The end of a column is set one
	 * character before the start of the next one, and the end of the last column
	 * is the length of the line. The detected indexes are printed on standard output.
	 */
	private void searchNumberOfColumns()
	{
		final char SPACE = ' ';
		columnindex = new ArrayList<Integer>();
		endcolumnindex = new ArrayList<Integer>();
		columnindex.add(0);

		char[] cline = line.toCharArray();
		boolean inSpaces = false; // false: inside a value, true: inside a run of spaces

		for (int ic = 0 ; ic < cline.length ; ic++)
		{
			boolean isSpace = (cline[ic] == SPACE);
			if (!inSpaces)
			{
				if (isSpace)
					inSpaces = true;
			}
			else if (!isSpace)
			{
				// first character after a run of spaces: a new column begins here
				columnindex.add(ic);
				endcolumnindex.add(ic-1);
				inSpaces = false;
			}
		}
		endcolumnindex.add(cline.length);
		println columnindex;
		println endcolumnindex;
	}

	/**
	 * Sets the column names, the element name and the word column.
	 *
	 * @param columnNames the attribute name of each column, in column order
	 * @param unitTag the name of the element written for each line
	 * @param wordcolumn the index of the column written as element text
	 */
	public void setColumnsLinesNames(List<String> columnNames, String unitTag, int wordcolumn)
	{
		this.columnNames = columnNames;
		this.wordTag = unitTag;
		this.wordcolumn = wordcolumn;
	}

	/**
	 * Stores the line check settings. The conversion in this class does not read them.
	 *
	 * @param colindex the column index to store
	 * @param tagname the tag name to store
	 */
	public void setLineCheck(int colindex, String tagname)
	{
		this.checkLineNumber = colindex;
		this.checkLineNumberTag = tagname;
	}

	/**
	 * Sample run on a file located under the user home directory.
	 *
	 * @param args the arguments (unused)
	 */
	public static void main(String[] args)
	{
		// forward slashes are accepted as separators on every platform, backslashes only on Windows
		File fortranfile = new File(System.getProperty("user.home"), "xml/lasla/lasla/L01PlaAmphi.APN");
		File outfile = new File(System.getProperty("user.home"), "xml/lasla/out.xml");
		FortranColumns2XML builder = new FortranColumns2XML(fortranfile, outfile, "iso-8859-1");
		builder.setColumnIndexes([0, 3, 4, 8, 29, 55, 67], [3, 4, 8, 28, 54, 66, 79]);
		builder.setColumnsLinesNames(["ref","s","sent","word","lemme","line","pos"], "w", 3);
		builder.setLineCheck(3, "l");

		builder.process();
	}
}
324 | 881 | mdecorde |