Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / FortranColumns2XML.groovy @ 479

History | View | Annotate | Download (7.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
// 
38
// 
39
// 
40
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
41
// $LastChangedRevision: 3426 $
42
// $LastChangedBy: mdecorde $ 
43
//
44
package org.txm.importer;
45

    
46
import org.txm.utils.i18n.DetectBOM;
47
import org.w3c.dom.Document;
48
import org.w3c.dom.Element;
49
import org.w3c.dom.NodeList;
50
import org.xml.sax.SAXException;
51

    
52
import java.io.File;
53
import java.io.FileInputStream;
54
import java.io.InputStreamReader;
55

    
56
import javax.xml.stream.*;
57
import java.net.URL;
58

    
59
// TODO: Auto-generated Javadoc
60
/**
 * Converts a fixed-width ("Fortran columns") text file into an XML file.
 *
 * Every non-empty input line becomes one element named {@code wordTag}. The
 * column selected by {@code wordcolumn} becomes the element text, every other
 * column becomes an attribute named after the matching entry of
 * {@code columnNames}. All elements are wrapped in a single {@code <text>} root.
 *
 * Column boundaries are either given with {@link #setColumnIndexes} or guessed
 * from the first line of the file (see {@link #searchNumberOfColumns}).
 *
 * An instance is single-use: {@link #process()} closes the streams opened by
 * the constructor.
 */
class FortranColumns2XML {

        /** The fixed-width input file. */
        File infile;

        /** The XML file to write. */
        File outfile;

        /** The character encoding used to decode {@code infile}. */
        String encoding;

        /** The attribute name of each column, in column order. */
        List<String> columnNames

        /** The name of the element written for each input line. */
        String wordTag;

        /** The index of the column written as element text instead of as an attribute. */
        int wordcolumn;

        /** The number of input lines handled so far (0-based index of the current line). */
        int linecounter = 0;

        /** Column index stored by {@link #setLineCheck}; not read anywhere in this class. */
        int checkLineNumber;

        /** Tag name stored by {@link #setLineCheck}; not read anywhere in this class. */
        String checkLineNumberTag;

        /** Input file name without its last extension; used to build element ids. */
        String textname;

        /** The number used for the next generated element id (starts at 1). */
        int wordcount = 1;

        /** Whether an {@code id} attribute is written on every element. */
        boolean setIds = false;

        /** The start offset (inclusive) of each column. */
        private List<Integer> columnindex;

        /** The end offset (exclusive, as passed to {@code substring}) of each column. */
        private List<Integer> endcolumnindex;

        /** The input line currently being processed. */
        private String line;

        /** The XML writer bound to {@code output}. */
        private XMLStreamWriter writer;

        /** The buffered reader over the input file. */
        private BufferedReader reader;

        /** The raw output stream; XMLStreamWriter.close() does not close it, so it is closed explicitly. */
        private FileOutputStream output;

        /**
         * Opens the input file for reading and the output file for writing.
         *
         * @param infile the fixed-width input file
         * @param outfile the XML file to create
         * @param encoding the character encoding of {@code infile}
         */
        FortranColumns2XML(File infile, File outfile, String encoding)
        {
                this.infile = infile;
                this.outfile = outfile;
                this.encoding = encoding;

                // strip the last extension, unless the name starts with the only dot
                textname = infile.getName();
                int idx = textname.lastIndexOf(".");
                if (idx > 0)
                        textname = textname.substring(0, idx);

                def input = new FileInputStream(infile)
                try {
                        // skip the BOM bytes on the raw stream before any character is decoded
                        DetectBOM bomdetector = new DetectBOM(infile);
                        for (int ibom = 0 ; ibom < bomdetector.getBOMSize() ; ibom++) input.read()
                        reader = new BufferedReader(new InputStreamReader(input, encoding));

                        XMLOutputFactory factory = XMLOutputFactory.newInstance();
                        output = new FileOutputStream(outfile)
                        writer = factory.createXMLStreamWriter(output, "UTF-8");
                } catch (Exception e) {
                        // do not leak the file handles when setup fails half-way
                        input.close();
                        if (output != null) output.close();
                        throw e;
                }
        }

        /**
         * Reads the whole input file and writes the XML document.
         *
         * The reader, the XML writer and the output stream are closed before
         * this method returns, whatever the outcome.
         *
         * @return true if the document was written; false if the input is empty
         *         or the column configuration is inconsistent
         */
        public boolean process()
        {
                try {
                        line = reader.readLine(); // the first line is also used to find the columns
                        if (line == null)
                        {
                                System.err.println("Empty input file: "+infile);
                                return false;
                        }

                        if (columnindex == null || endcolumnindex == null)
                        {
                                searchNumberOfColumns();
                        }

                        if (columnNames == null)
                        {
                                System.err.println("No column names set");
                                return false;
                        }

                        if (columnindex.size() != endcolumnindex.size())
                        {
                                System.err.println("Number of column starts("+columnindex.size()+") different of number of column ends("+endcolumnindex.size()+")");
                                return false;
                        }

                        if (columnindex.size() != columnNames.size())
                        {
                                System.err.println("Number of column("+columnindex.size()+") different of number of column name("+columnNames.size()+")");
                                return false;
                        }

                        writer.writeStartDocument("UTF-8", "1.0");
                        writer.writeStartElement("text")
                        writer.writeAttribute("id", infile.getName())

                        while (line != null)
                        {
                                processLine(); // split the line according to the column indexes
                                linecounter++;
                                line = reader.readLine();
                        }

                        writer.writeEndElement();
                        writer.writeEndDocument();
                        return true;
                } finally {
                        closeResources();
                }
        }

        /**
         * Closes the XML writer, the output stream and the reader. Each close is
         * attempted even if a previous one throws.
         */
        private void closeResources()
        {
                try {
                        if (writer != null) writer.close();
                } finally {
                        try {
                                if (output != null) output.close();
                        } finally {
                                if (reader != null) reader.close();
                        }
                }
        }

        /**
         * Writes the current line as one {@code wordTag} element. Empty lines
         * are skipped. A column that extends past the end of the line is
         * reported on standard output and left out of the element.
         */
        private void processLine()
        {
                if (line.length() == 0)
                        return;

                // stays empty when the word column is missing, so writeCharacters never gets null
                String pcdata = "";

                writer.writeStartElement(wordTag)
                if (setIds)
                        writer.writeAttribute("id", "w_"+textname+"_"+(wordcount++));

                for (int i = 0 ; i < columnindex.size() ; i++)
                {
                        int start = columnindex.get(i);
                        int end = endcolumnindex.get(i);

                        // 'end' is exclusive, so a column ending exactly at the end of the line is valid
                        if (line.length() >= end)
                        {
                                String value = line.substring(start, end).trim()
                                if (i != wordcolumn)
                                        writer.writeAttribute(columnNames.get(i), value)
                                else
                                        pcdata = value // attributes must all be written before the text
                        }
                        else
                        {
                                println "Error: line $linecounter: len="+line.length()+" maxidx="+end
                                println line;
                        }
                }
                writer.writeCharacters(pcdata);
                writer.writeEndElement();
        }

        /**
         * Sets the column boundaries explicitly, which disables the detection
         * done on the first line.
         *
         * @param starts the start offset (inclusive) of each column
         * @param ends the end offset (exclusive) of each column
         */
        private void setColumnIndexes(List<Integer> starts, List<Integer> ends)
        {
                columnindex = starts;
                endcolumnindex = ends;
        }

        /**
         * Enables or disables the generated {@code id} attribute
         * ({@code w_<textname>_<n>}) on every element.
         *
         * @param setIds true to write ids
         */
        public void setAddIds(boolean setIds)
        {
                this.setIds = setIds;
        }

        /**
         * Guesses the column boundaries from the current (first) line: a new
         * column starts at each non-space character that follows a space. The
         * detected start and end offsets are printed on standard output.
         */
        private void searchNumberOfColumns()
        {
                columnindex = new ArrayList<Integer>();
                endcolumnindex = new ArrayList<Integer>();
                columnindex.add(0);

                char space = ' ' as char;
                char[] cline = line.toCharArray();
                boolean inSpaces = false; // false: inside text, true: inside a run of spaces

                for (int ic = 0 ; ic < cline.length ; ic++)
                {
                        char c = cline[ic];
                        if (!inSpaces)
                        {
                                if (c == space)
                                        inSpaces = true;
                        }
                        else if (c != space)
                        {
                                // a column starts here; the previous one ends before the last space
                                columnindex.add(ic);
                                endcolumnindex.add(ic-1);
                                inSpaces = false;
                        }
                }
                endcolumnindex.add(cline.length);
                println columnindex;
                println endcolumnindex;
        }

        /**
         * Sets the column names, the element name and the text column.
         *
         * @param columnNames the attribute name of each column, in column order
         * @param unitTag the name of the element written for each line
         * @param wordcolumn the index of the column written as element text
         */
        public void setColumnsLinesNames(List<String> columnNames, String unitTag, int wordcolumn)
        {
                this.columnNames = columnNames;
                this.wordTag = unitTag;
                this.wordcolumn = wordcolumn;
        }

        /**
         * Stores a column index and a tag name for a line check. The values are
         * only stored; nothing in this class reads them.
         *
         * @param colindex the column index
         * @param tagname the tag name
         */
        public void setLineCheck(int colindex, String tagname)
        {
                this.checkLineNumber = colindex;
                this.checkLineNumberTag = tagname;
        }

        /**
         * Sample run: converts a file under the user's home directory using
         * hard-coded paths and column boundaries.
         *
         * @param args the arguments (unused)
         */
        public static void main(String[] args)
        {
                File fortranfile = new File(System.getProperty("user.home"),"\\xml\\lasla\\lasla\\L01PlaAmphi.APN");
                File outfile = new File(System.getProperty("user.home"),"\\xml\\lasla\\out.xml");
                FortranColumns2XML builder = new FortranColumns2XML(fortranfile, outfile, "iso-8859-1");
                builder.setColumnIndexes([0, 3, 4, 8, 29, 55, 67],        [3, 4, 8, 28, 54, 66, 79]);
                builder.setColumnsLinesNames(["ref","s","sent","word","lemme","line","pos"], "w", 3);
                builder.setLineCheck(3, "l");

                builder.process();
        }
}
324

    
325