root / tmp / org.txm.core / src / java / org / txm / scripts / importer / FortranColumns2XML.groovy @ 2473
History | View | Annotate | Download (7.9 kB)
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
---|---|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
4 |
// Sophia Antipolis, University of Paris 3.
|
5 |
//
|
6 |
// The TXM platform is free software: you can redistribute it
|
7 |
// and/or modify it under the terms of the GNU General Public
|
8 |
// License as published by the Free Software Foundation,
|
9 |
// either version 2 of the License, or (at your option) any
|
10 |
// later version.
|
11 |
//
|
12 |
// The TXM platform is distributed in the hope that it will be
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 |
// PURPOSE. See the GNU General Public License for more
|
16 |
// details.
|
17 |
//
|
18 |
// You should have received a copy of the GNU General
|
19 |
// Public License along with the TXM platform. If not, see
|
20 |
// http://www.gnu.org/licenses.
|
21 |
|
22 |
//
|
23 |
// This file is part of the TXM platform.
|
24 |
//
|
25 |
// The TXM platform is free software: you can redistribute it and/or modify
|
26 |
// it under the terms of the GNU General Public License as published by
|
27 |
// the Free Software Foundation, either version 3 of the License, or
|
28 |
// (at your option) any later version.
|
29 |
//
|
30 |
// The TXM platform is distributed in the hope that it will be useful,
|
31 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
32 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
33 |
// GNU General Public License for more details.
|
34 |
//
|
35 |
// You should have received a copy of the GNU General Public License
|
36 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
|
37 |
//
|
38 |
//
|
39 |
//
|
40 |
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
|
41 |
// $LastChangedRevision: 3426 $
|
42 |
// $LastChangedBy: mdecorde $
|
43 |
//
|
44 |
package org.txm.scripts.importer;
|
45 |
|
46 |
import org.txm.utils.i18n.DetectBOM; |
47 |
import org.w3c.dom.Document; |
48 |
import org.w3c.dom.Element; |
49 |
import org.w3c.dom.NodeList; |
50 |
import org.xml.sax.SAXException; |
51 |
|
52 |
import java.io.File; |
53 |
import java.io.FileInputStream; |
54 |
import java.io.InputStreamReader; |
55 |
|
56 |
import javax.xml.stream.*; |
57 |
import java.net.URL; |
58 |
|
59 |
// TODO: Auto-generated Javadoc
|
60 |
/**
|
61 |
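* Converts a fixed-width column ("Fortran columns") text file into an XML file: each non-empty input line becomes one element whose attributes and text content are cut out of the line at fixed character positions.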
|
62 |
*/
|
63 |
class FortranColumns2XML { |
64 |
|
65 |
/** Fixed-width text file that is read. */
File infile;

/** XML file that is written. */
File outfile;

/** Name of the character encoding associated with the input file. */
String encoding;

/** One name per column; used as the XML attribute name of that column. */
List<String> columnNames

/** Name of the XML element written for each non-empty input line. */
String wordTag;

/** Index of the column whose value becomes the element text instead of an attribute. */
int wordcolumn;

/** Number of input lines handled so far; only used in error messages. */
int linecounter = 0;

/** Column index stored by setLineCheck(int, String). */
int checkLineNumber;

/** Tag name stored by setLineCheck(int, String). */
String checkLineNumberTag;

/** Start offset (inclusive) of each column within a line. */
private List<Integer> columnindex;

/** End offset (exclusive, as passed to String.substring) of each column within a line. */
private List<Integer> endcolumnindex;

/** Input line currently being converted. */
private String line;

/** StAX writer producing the XML output. */
private XMLStreamWriter writer;

/** Reader over the input file. */
private Reader reader;

/** Byte stream of the output file that backs the XML writer. */
private FileOutputStream output;

/** Input file name without its last extension; used to build element ids. */
String textname;
|
112 |
|
113 |
/**
 * Opens the input file for reading and the output file for XML writing.
 *
 * The text name used for element ids is the input file name without its
 * last extension. The XML output is always encoded in UTF-8.
 *
 * @param infile the fixed-width text file to read
 * @param outfile the XML file to create
 * @param encoding the name of the character encoding of infile
 */
FortranColumns2XML(File infile, File outfile, String encoding)
{
	this.infile = infile;
	this.outfile = outfile;
	this.encoding = encoding; // keep the field in sync with the constructor argument

	textname = infile.getName();
	int dot = textname.lastIndexOf(".");
	if (dot > 0)
		textname = textname.substring(0, dot);

	def input = new FileInputStream(infile)
	boolean opened = false
	try
	{
		// Skip the byte order mark on the raw stream before any character is decoded.
		// NOTE(review): getBOMSize() is presumably the BOM length in bytes - confirm against DetectBOM.
		DetectBOM bomdetector = new DetectBOM(infile);
		for (int ibom = 0 ; ibom < bomdetector.getBOMSize() ; ibom++) input.read()

		// Buffered so that readLine() works on whole lines instead of single characters.
		reader = new BufferedReader(new InputStreamReader(input, encoding));

		XMLOutputFactory factory = XMLOutputFactory.newInstance();
		output = new FileOutputStream(outfile)
		writer = factory.createXMLStreamWriter(output, "UTF-8");
		opened = true
	}
	finally
	{
		// Do not leak file handles when construction fails half-way.
		if (!opened)
		{
			if (output != null) output.close()
			input.close()
		}
	}
}
140 |
|
141 |
/**
 * Converts the whole input file to XML.
 *
 * The first line is used to detect the column boundaries when none were
 * set explicitly. Every line is then handed to processLine(), inside a
 * root "text" element whose "id" attribute is the input file name.
 * The reader, the XML writer and the output stream are closed before
 * returning, whatever the outcome.
 *
 * @return true if the document was written; false if the input file is
 * empty or if the number of columns differs from the number of column names
 */
public boolean process()
{
	try
	{
		line = reader.readLine(); // use the first line to find columns
		if (line == null)
		{
			// Empty file: there is no line to detect columns from or to convert.
			System.err.println("Empty input file: "+infile);
			return false;
		}

		if (columnindex == null || endcolumnindex == null)
			searchNumberOfColumns();

		if (columnindex.size() != columnNames.size())
		{
			System.err.println("Number of column("+columnindex.size()+") different of number of column name("+columnNames.size()+")");
			return false;
		}

		// write start document
		writer.writeStartDocument("UTF-8", "1.0");
		writer.writeStartElement("text")
		writer.writeAttribute("id", infile.getName())

		while (line != null)
		{
			processLine(); // split line according to the column indexes
			linecounter++;
			line = reader.readLine(); // continue to read the document
		}

		writer.writeEndElement();
		writer.writeEndDocument();
		return true;
	}
	finally
	{
		// XMLStreamWriter.close() does not close the underlying stream,
		// so the output stream and the reader are closed explicitly.
		try
		{
			writer.close();
		}
		finally
		{
			try
			{
				output.close();
			}
			finally
			{
				reader.close();
			}
		}
	}
}
186 |
|
187 |
/** Running number appended to the "id" attribute of each written element; starts at 1. */
int wordcount = 1;

/**
 * Writes the current line as one XML element named after wordTag.
 *
 * Each column is cut out of the line with its start/end offsets and
 * trimmed. The column at index wordcolumn becomes the text content of
 * the element, every other column becomes an attribute named after the
 * matching entry of columnNames. Empty lines write nothing. A column
 * that extends past the end of the line is reported on standard output
 * and skipped.
 */
private void processLine()
{
	if (line.length() == 0)
		return;

	// Stays empty when the word column cannot be read from a short line.
	String pcdata = "";

	writer.writeStartElement(wordTag)
	if (setIds)
		writer.writeAttribute("id", "w_"+textname+"_"+(wordcount++));

	for (int i = 0 ; i < columnindex.size() ; i++)
	{
		int start = columnindex.get(i)
		int end = endcolumnindex.get(i)

		// The end offset is exclusive, so a column ending exactly at the end
		// of the line is valid (substring accepts end == length).
		if (line.length() >= end)
		{
			String value = line.substring(start, end).trim()
			if (i != wordcolumn)
				writer.writeAttribute(columnNames.get(i), value)
			else
				pcdata = value
		}
		else
		{
			println "Error: line $linecounter: len="+line.length()+" maxidx="+end
			println line;
		}
	}

	// Attributes must all be written before any character content.
	writer.writeCharacters(pcdata);
	writer.writeEndElement();
}
223 |
|
224 |
/**
 * Defines the column boundaries explicitly. When both lists are set,
 * process() does not detect the columns from the first line.
 *
 * @param starts start offset of each column (inclusive)
 * @param ends end offset of each column (exclusive)
 */
private void setColumnIndexes(List<Integer> starts, List<Integer> ends)
{
	this.columnindex = starts
	this.endcolumnindex = ends
}
235 |
|
236 |
/** When true, every written element gets an "id" attribute; off by default. */
boolean setIds = false;

/**
 * Switches the generation of "id" attributes on or off.
 *
 * @param setIds true to add an "id" attribute to every written element
 */
public void setAddIds(boolean setIds)
{
	this.setIds = setIds
}
241 |
|
242 |
/**
 * Detects the column boundaries from the current line.
 *
 * The first column starts at offset 0. A new column starts at each
 * non-space character that follows one or more spaces; the previous
 * column then ends one position before it. The last column ends at the
 * length of the line. Both resulting lists are printed on standard output.
 */
private void searchNumberOfColumns()
{
	columnindex = new ArrayList<Integer>();
	endcolumnindex = new ArrayList<Integer>();
	columnindex.add(0);

	char[] chars = line.toCharArray();
	boolean inGap = false; // false: inside a run of text, true: inside a run of spaces

	for (int pos = 0 ; pos < chars.length ; pos++)
	{
		boolean blank = (chars[pos] == " ");
		if (!inGap)
		{
			if (blank)
				inGap = true;
		}
		else if (!blank)
		{
			// first character after a gap: close the previous column, open a new one
			columnindex.add(pos);
			endcolumnindex.add(pos - 1);
			inGap = false;
		}
	}

	endcolumnindex.add(chars.length);
	println columnindex;
	println endcolumnindex;
}
280 |
|
281 |
/**
 * Declares how the columns are rendered in XML.
 *
 * @param columnNames attribute name of each column, in column order
 * @param unitTag name of the element written for each line
 * @param wordcolumn index of the column written as element text instead of as an attribute
 */
public void setColumnsLinesNames(List<String> columnNames, String unitTag, int wordcolumn)
{
	this.wordcolumn = wordcolumn
	this.wordTag = unitTag
	this.columnNames = columnNames
}
294 |
|
295 |
/**
 * Stores the line-check settings in checkLineNumber and checkLineNumberTag.
 *
 * @param colindex the column index to store
 * @param tagname the tag name to store
 */
public void setLineCheck(int colindex, String tagname)
{
	this.checkLineNumberTag = tagname
	this.checkLineNumber = colindex
}
306 |
|
307 |
/**
 * Sample run: converts xml/lasla/lasla/L01PlaAmphi.APN from the user's
 * home directory into xml/lasla/out.xml, using explicit column offsets.
 *
 * @param args not used
 */
public static void main(String[] args)
{
	// Forward slashes are accepted as separators on every platform,
	// unlike hard-coded backslashes which only work on Windows.
	File home = new File(System.getProperty("user.home"));
	File fortranfile = new File(home, "xml/lasla/lasla/L01PlaAmphi.APN");
	File outfile = new File(home, "xml/lasla/out.xml");

	FortranColumns2XML builder = new FortranColumns2XML(fortranfile, outfile, "iso-8859-1");
	builder.setColumnIndexes([0, 3, 4, 8, 29, 55, 67], [3, 4, 8, 28, 54, 66, 79]);
	builder.setColumnsLinesNames(["ref","s","sent","word","lemme","line","pos"], "w", 3);
	builder.setLineCheck(3, "l");

	// process() reports failure through its return value only.
	if (!builder.process())
		System.err.println("Conversion failed: "+fortranfile);
}
323 |
} |
324 |
|
325 |
|