Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / FortranColumns2XML.groovy @ 1688

History | View | Annotate | Download (7.9 kB)

1 881 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 881 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 881 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 881 mdecorde
// Sophia Antipolis, University of Paris 3.
5 881 mdecorde
//
6 881 mdecorde
// The TXM platform is free software: you can redistribute it
7 881 mdecorde
// and/or modify it under the terms of the GNU General Public
8 881 mdecorde
// License as published by the Free Software Foundation,
9 881 mdecorde
// either version 2 of the License, or (at your option) any
10 881 mdecorde
// later version.
11 881 mdecorde
//
12 881 mdecorde
// The TXM platform is distributed in the hope that it will be
13 881 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 881 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 881 mdecorde
// PURPOSE. See the GNU General Public License for more
16 881 mdecorde
// details.
17 881 mdecorde
//
18 881 mdecorde
// You should have received a copy of the GNU General
19 881 mdecorde
// Public License along with the TXM platform. If not, see
20 881 mdecorde
// http://www.gnu.org/licenses.
21 881 mdecorde
22 881 mdecorde
//
23 881 mdecorde
// This file is part of the TXM platform.
24 881 mdecorde
//
25 881 mdecorde
// The TXM platform is free software: you can redistribute it and/or modify
26 881 mdecorde
// it under the terms of the GNU General Public License as published by
27 881 mdecorde
// the Free Software Foundation, either version 3 of the License, or
28 881 mdecorde
// (at your option) any later version.
29 881 mdecorde
//
30 881 mdecorde
// The TXM platform is distributed in the hope that it will be useful,
31 881 mdecorde
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32 881 mdecorde
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33 881 mdecorde
// GNU General Public License for more details.
34 881 mdecorde
//
35 881 mdecorde
// You should have received a copy of the GNU General Public License
36 881 mdecorde
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37 881 mdecorde
//
38 881 mdecorde
//
39 881 mdecorde
//
40 881 mdecorde
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
41 881 mdecorde
// $LastChangedRevision: 3426 $
42 881 mdecorde
// $LastChangedBy: mdecorde $
43 881 mdecorde
//
44 1000 mdecorde
package org.txm.scripts.importer;
45 881 mdecorde
46 881 mdecorde
import org.txm.utils.i18n.DetectBOM;
47 881 mdecorde
import org.w3c.dom.Document;
48 881 mdecorde
import org.w3c.dom.Element;
49 881 mdecorde
import org.w3c.dom.NodeList;
50 881 mdecorde
import org.xml.sax.SAXException;
51 881 mdecorde
52 881 mdecorde
import java.io.File;
53 881 mdecorde
import java.io.FileInputStream;
54 881 mdecorde
import java.io.InputStreamReader;
55 881 mdecorde
56 881 mdecorde
import javax.xml.stream.*;
57 881 mdecorde
import java.net.URL;
58 881 mdecorde
59 881 mdecorde
// TODO: Auto-generated Javadoc
60 881 mdecorde
/**
61 881 mdecorde
 * The Class FortranColumns2XML.
62 881 mdecorde
 */
63 881 mdecorde
class FortranColumns2XML {
64 881 mdecorde
65 881 mdecorde
        /** The infile. */
66 881 mdecorde
67 881 mdecorde
        File infile;
68 881 mdecorde
69 881 mdecorde
        /** The outfile. */
70 881 mdecorde
        File outfile;
71 881 mdecorde
72 881 mdecorde
        /** The encoding. */
73 881 mdecorde
        String encoding;
74 881 mdecorde
75 881 mdecorde
        /** The column names. */
76 881 mdecorde
        List<String> columnNames
77 881 mdecorde
78 881 mdecorde
        /** The word tag. */
79 881 mdecorde
        String wordTag;
80 881 mdecorde
81 881 mdecorde
        /** The wordcolumn. */
82 881 mdecorde
        int wordcolumn;
83 881 mdecorde
84 881 mdecorde
        /** The linecounter. */
85 881 mdecorde
        int linecounter = 0;
86 881 mdecorde
87 881 mdecorde
        /** The check line number. */
88 881 mdecorde
        int checkLineNumber;
89 881 mdecorde
90 881 mdecorde
        /** The check line number tag. */
91 881 mdecorde
        String checkLineNumberTag;
92 881 mdecorde
93 881 mdecorde
        /** The columnindex. */
94 881 mdecorde
        private List<Integer> columnindex;
95 881 mdecorde
96 881 mdecorde
        /** The endcolumnindex. */
97 881 mdecorde
        private List<Integer> endcolumnindex;
98 881 mdecorde
99 881 mdecorde
        /** The line. */
100 881 mdecorde
        private String line;
101 881 mdecorde
102 881 mdecorde
        /** The writer. */
103 881 mdecorde
        private XMLStreamWriter writer;
104 881 mdecorde
105 881 mdecorde
        /** The reader. */
106 881 mdecorde
        private Reader reader;
107 881 mdecorde
108 881 mdecorde
        /** The output. */
109 881 mdecorde
        private FileOutputStream output;
110 881 mdecorde
111 881 mdecorde
        String textname;
112 881 mdecorde
113 881 mdecorde
        /**
         * Opens the fixed-width input file for reading and prepares a UTF-8 XML
         * stream writer on the output file.
         *
         * The text name used for word ids is the input file name without its
         * last extension (a leading dot, as in ".name", is not treated as an
         * extension separator).
         *
         * @param infile the fixed-width column file to read
         * @param outfile the XML file to create (opened, hence truncated, immediately)
         * @param encoding the charset name used to decode {@code infile}
         */
        FortranColumns2XML(File infile, File outfile, String encoding)
        {
                this.infile = infile;
                this.outfile = outfile;
                // keep the encoding in the matching field (it was silently dropped before)
                this.encoding = encoding;

                textname = infile.getName();
                int idx = textname.lastIndexOf(".");
                if (idx > 0)
                        textname = textname.substring(0, idx);

                def input = new FileInputStream(infile)
                // Buffered so that readLine() is BufferedReader.readLine(), which treats
                // "\r\n" as a single line terminator and does not read char by char.
                // Nothing is read from 'input' until the first readLine() call, so the
                // BOM bytes can still be skipped on the raw stream below.
                reader = new BufferedReader(new InputStreamReader(input, encoding));
                DetectBOM bomdetector = new DetectBOM(infile);
                // skip as many raw bytes as DetectBOM reports for the BOM
                for (int ibom = 0 ; ibom < bomdetector.getBOMSize() ; ibom++) input.read()

                XMLOutputFactory factory = XMLOutputFactory.newInstance();
                output = new FileOutputStream(outfile)
                writer = factory.createXMLStreamWriter(output, "UTF-8");
        }
140 881 mdecorde
141 881 mdecorde
        /**
         * Converts the whole input file into an XML document.
         *
         * The first line is read up front: when no column indexes were set with
         * {@code setColumnIndexes}, it is used to detect them. Every line is then
         * written through {@code processLine()} inside a single {@code text}
         * element whose {@code id} attribute is the input file name.
         *
         * The reader, the XML writer and the output stream are always closed when
         * this method returns or throws, so it can only be called once per instance.
         *
         * @return true if the document was written; false if the columns could not
         *         be detected (empty input) or if the number of columns does not
         *         match the number of column names
         */
        public boolean process()
        {
                try
                {
                        line = reader.readLine(); // use the first line to find columns

                        if (columnindex == null || endcolumnindex == null)
                        {
                                if (line == null)
                                {
                                        // empty input: there is no line to detect the columns from
                                        System.err.println("Cannot detect columns: "+infile+" is empty");
                                        return false;
                                }
                                searchNumberOfColumns();
                        }

                        int nNames = (columnNames == null) ? 0 : columnNames.size();
                        if (columnindex.size() != nNames)
                        {
                                System.err.println("Number of column("+columnindex.size()+") different of number of column name("+nNames+")");
                                return false;
                        }

                        // write start document
                        writer.writeStartDocument("UTF-8", "1.0");
                        writer.writeStartElement("text")
                        writer.writeAttribute("id", infile.getName())

                        // 'line' is null as soon as the input is exhausted (or was empty)
                        while (line != null)
                        {
                                processLine(); // split the line according to the column indexes
                                linecounter++;
                                line = reader.readLine();
                        }

                        writer.writeEndElement();
                        writer.writeEndDocument();
                        return true;
                }
                finally
                {
                        // XMLStreamWriter.close() does not close the underlying stream,
                        // so each resource is closed explicitly; nesting guarantees that
                        // a failing close does not prevent the following ones.
                        try
                        {
                                writer.close();
                        }
                        finally
                        {
                                try
                                {
                                        output.close();
                                }
                                finally
                                {
                                        reader.close();
                                }
                        }
                }
        }
186 881 mdecorde
187 881 mdecorde
        /** Sequence number of the next word element; used to build its id when {@code setIds} is true. */
        int wordcount = 1;

        /**
         * Writes the current {@code line} as one {@code wordTag} element.
         *
         * Each column is cut out of the line with its start/end index and trimmed.
         * The column at index {@code wordcolumn} becomes the text content of the
         * element; every other column becomes an attribute named after the
         * matching entry of {@code columnNames}. Empty lines produce nothing.
         * A column that does not fit in the line is reported on standard output
         * and skipped.
         */
        private void processLine()
        {
                if (line.length() == 0)
                        return; // empty lines produce no element

                // text of the word column; stays empty when the line is too short to contain it
                String pcdata = "";

                writer.writeStartElement(wordTag)
                if (setIds)
                        writer.writeAttribute("id", "w_"+textname+"_"+(wordcount++));

                for (int i = 0 ; i < columnindex.size() ; i++)
                {
                        int start = columnindex.get(i);
                        int end = endcolumnindex.get(i);

                        // substring(start, end) takes an exclusive end index, so a column
                        // ending exactly at the end of the line (end == length) is valid
                        if (line.length() >= end)
                        {
                                String value = line.substring(start, end).trim();
                                if (i != wordcolumn)
                                        writer.writeAttribute(columnNames.get(i), value)
                                else
                                        pcdata = value
                        }
                        else
                        {
                                println "Error: line $linecounter: len="+line.length()+" maxidx="+end
                                println line;
                        }
                }

                // attributes must all be written before the character content
                writer.writeCharacters(pcdata);
                writer.writeEndElement();
        }
223 881 mdecorde
224 881 mdecorde
        /**
         * Fixes the column boundaries explicitly. When both lists are set,
         * {@code process()} does not try to detect the columns from the first line.
         *
         * @param starts start index of each column, as passed to {@code String.substring}
         * @param ends end index of each column, as passed to {@code String.substring}
         */
        private void setColumnIndexes(List<Integer> starts, List<Integer> ends) {
                this.endcolumnindex = ends;
                this.columnindex = starts;
        }
235 881 mdecorde
236 881 mdecorde
        boolean setIds = false;
237 881 mdecorde
        /**
         * Enables or disables the generation of an {@code id} attribute on each
         * word element written by {@code processLine()}.
         *
         * @param setIds true to write ids of the form "w_" + text name + "_" + counter
         */
        public void setAddIds(boolean setIds) {
                this.setIds = setIds;
        }
241 881 mdecorde
242 881 mdecorde
        /**
         * Detects the column boundaries from the current {@code line}.
         *
         * The line is scanned once: a new column starts at each non-space character
         * that follows at least one space character. The previous column then ends
         * one position before that start, and the last column ends at the line
         * length. Only the space character counts as a separator. The resulting
         * start and end lists are printed on standard output.
         */
        private void searchNumberOfColumns()
        {
                columnindex = new ArrayList<Integer>();
                endcolumnindex = new ArrayList<Integer>();
                columnindex.add(0); // the first column always starts at the beginning of the line

                char[] chars = line.toCharArray();
                boolean inSpaces = false; // false: inside column text, true: inside a run of spaces

                for (int pos = 0 ; pos < chars.length ; pos++)
                {
                        boolean isSpace = (chars[pos] == " ");
                        if (!inSpaces)
                        {
                                // text zone: the first space switches to the space zone
                                if (isSpace)
                                        inSpaces = true;
                        }
                        else if (!isSpace)
                        {
                                // space zone: the first non-space character opens a new column
                                columnindex.add(pos);
                                endcolumnindex.add(pos - 1);
                                inSpaces = false;
                        }
                }

                endcolumnindex.add(chars.length);
                println columnindex;
                println endcolumnindex;
        }
280 881 mdecorde
281 881 mdecorde
        /**
         * Configures how columns are mapped to XML.
         *
         * @param columnNames attribute name of each column, in column order
         * @param unitTag name of the element written for each non-empty line
         * @param wordcolumn index of the column written as element text instead of as an attribute
         */
        public void setColumnsLinesNames(List<String> columnNames, String unitTag, int wordcolumn) {
                this.wordcolumn = wordcolumn;
                this.wordTag = unitTag;
                this.columnNames = columnNames;
        }
294 881 mdecorde
295 881 mdecorde
        /**
         * Stores the line-check settings in {@code checkLineNumber} and
         * {@code checkLineNumberTag}. No code visible in this class reads these
         * two fields.
         *
         * @param colindex the column index to store
         * @param tagname the tag name to store
         */
        public void setLineCheck(int colindex, String tagname) {
                this.checkLineNumberTag = tagname;
                this.checkLineNumber = colindex;
        }
306 881 mdecorde
307 881 mdecorde
        /**
         * Sample run: converts a hard-coded file located under the user's home
         * directory, using explicit column boundaries, and writes the result next
         * to it. The result of {@code process()} is not checked.
         *
         * @param args ignored
         */
        public static void main(String[] args)
        {
                String home = System.getProperty("user.home");
                File fortranfile = new File(home, "\\xml\\lasla\\lasla\\L01PlaAmphi.APN");
                File outfile = new File(home, "\\xml\\lasla\\out.xml");

                // explicit column boundaries and the attribute name of each column
                List<Integer> starts = [0, 3, 4, 8, 29, 55, 67];
                List<Integer> ends = [3, 4, 8, 28, 54, 66, 79];
                List<String> names = ["ref","s","sent","word","lemme","line","pos"];

                FortranColumns2XML builder = new FortranColumns2XML(fortranfile, outfile, "iso-8859-1");
                builder.setColumnIndexes(starts, ends);
                builder.setColumnsLinesNames(names, "w", 3); // column 3 ("word") is the element text
                builder.setLineCheck(3, "l");

                builder.process();
        }
323 881 mdecorde
}
324 881 mdecorde