Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / Alceste2Xml.groovy @ 187

History | View | Annotate | Download (8.6 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer;
29

    
30
import javax.xml.parsers.DocumentBuilder;
31
import javax.xml.parsers.DocumentBuilderFactory;
32
import javax.xml.parsers.ParserConfigurationException;
33
import javax.xml.transform.OutputKeys;
34
import javax.xml.transform.Result;
35
import javax.xml.transform.Source;
36
import javax.xml.transform.Transformer;
37
import javax.xml.transform.TransformerFactory;
38
import javax.xml.transform.dom.DOMSource;
39
import javax.xml.transform.stream.StreamResult;
40

    
41
import org.txm.utils.AsciiUtils;
42
import org.txm.utils.CharsetDetector;
43
import org.txm.utils.i18n.DetectBOM;
44
import org.w3c.dom.Document;
45
import org.w3c.dom.Element;
46
import org.w3c.dom.NodeList;
47
import org.xml.sax.SAXException;
48

    
49
import java.io.File;
50
import java.io.FileInputStream;
51
import java.io.InputStreamReader;
52
import java.io.Reader;
53

    
54
import javax.xml.stream.*;
55
import java.net.URL;
56

    
57

    
58
// TODO: Auto-generated Javadoc
59
/**
 * Converts an Alceste corpus file into one XML file per text.
 *
 * A text starts at a declaration line made of a marker (4 to 6 digits or 4 to
 * 6 stars), a space, and "star" metadata such as "*author_hugo *year_1862".
 * Each text is written to its own file in the output directory as a "text"
 * element carrying the metadata as attributes. Inside it, an empty line opens
 * a new "p" element and every non-empty line is preceded by an "lb" element.
 */
class Alceste2Xml {

	/** Identifier of the current text; also used as the base name of its XML file. */
	String textno = "0001";

	/** Number of text declarations met so far. It is not reset between calls to run(). */
	int textcount = 0;

	/** Length of the text marker found by guessStar(); generated ids are zero-padded to this width. */
	int nbOfStarNumbers;

	/** Text declaration whose marker is a number, e.g. "0001 *key_value". */
	static def TEXTNUMBERREGEX = /^[0-9]{4,6} .+/

	/** Text declaration whose marker is made of stars, e.g. "**** *key_value". */
	static def TEXTSTARREGEX = /^[*]{4,6} .+/

	/**
	 * Runs the conversion.
	 *
	 * @param infile the Alceste file to split
	 * @param outdir the directory receiving one XML file per text; created if missing
	 * @param encoding the encoding of infile, or "??" to let CharsetDetector guess it
	 * @return true if at least one text was found and no error interrupted the conversion
	 */
	public boolean run(File infile, File outdir, String encoding) {
		if (encoding == "??") {
			encoding = new CharsetDetector(infile).getEncoding();
			println "Guessing encoding of $infile : $encoding"
		}
		int noline = 0;
		int p_n = 1;
		int lb_n = 1;
		boolean failed = false;
		def duplicatedAttributes = new HashSet<String>();

		// Declared outside the try block so that the finally clause can release them.
		FileInputStream input = null;
		BufferedOutputStream output = null;
		XMLStreamWriter writer = null;
		try {
			outdir.mkdirs(); // unlike mkdir(), also creates missing parent directories
			boolean star = guessStar(infile, encoding);

			input = new FileInputStream(infile);

			// Skip the BOM bytes on the raw stream before the reader starts decoding.
			DetectBOM bomdetector = new DetectBOM(infile);
			for (int ibom = 0 ; ibom < bomdetector.getBOMSize() ; ibom++) input.read()

			// BufferedReader.readLine() handles "\r\n" as ONE line end. Groovy's
			// Reader.readLine() fallback for readers without mark() support does not,
			// which produced a spurious empty line (hence a new paragraph) per line.
			BufferedReader reader = new BufferedReader(new InputStreamReader(input, encoding));

			XMLOutputFactory factory = XMLOutputFactory.newInstance();

			String line = reader.readLine();
			while (line != null) {
				line = CleanFile.clean(line);

				if (((line ==~ TEXTNUMBERREGEX) || line ==~ TEXTSTARREGEX) && line.contains("*")) { // text declaration with star attributes
					if (writer != null) { // end previous text if any
						endText(writer, output);
						writer = null;
						output = null;
					}

					lb_n = 1; // reset line counter
					p_n = 1; // reset paragraph counter
					int idx = line.indexOf(" "); // star metadata starts after the first ' '

					if (!star) {
						textno = line.substring(0, idx) // the numeric marker is the text id
						textcount++;
					} else {
						this.buildNextId(); // update textno and textcount
					}

					String filename = textno.trim();
					def attributes = parseStarAttributes(filename, line.substring(idx + 1), duplicatedAttributes);

					output = new BufferedOutputStream(new FileOutputStream(new File(outdir, filename+".xml")))
					writer = factory.createXMLStreamWriter(output, "UTF-8"); // create a new file
					writer.writeStartDocument("UTF-8", "1.0")
					writer.writeStartElement("text");
					for (def attribute : attributes.entrySet()) {
						writer.writeAttribute(attribute.getKey(), attribute.getValue());
					}
					writer.writeCharacters("\n");
					writer.writeStartElement("p");
					writer.writeAttribute("n", ""+(p_n++));
				} else if (line.startsWith("-*")) {
					println "ignored locution : "+line;
				} else if (writer != null) { // content met before the first declaration is dropped
					if (line.length() > 0) {
						writer.writeEmptyElement("lb");
						writer.writeAttribute("n", ""+(lb_n++))
						writer.writeCharacters("\n");
					} else { // an empty line starts a new paragraph
						writer.writeEndElement(); // p
						writer.writeStartElement("p");
						writer.writeAttribute("n", ""+(p_n++))
						writer.writeCharacters("\n");
					}
					writer.writeCharacters(line);
				}
				line = reader.readLine();
				noline++;
			}

			if (writer != null) { // end last text
				endText(writer, output);
				writer = null;
				output = null;
			}
		} catch(Exception e) {
			failed = true;
			println "Error line : "+noline;
			org.txm.utils.logger.Log.printStackTrace(e);
		} finally {
			// Only non-null here when the conversion was interrupted.
			closeQuietly(writer);
			closeQuietly(output);
			// Closing the raw stream releases the file handle held by the readers.
			closeQuietly(input);
		}

		if (duplicatedAttributes.size() > 0){
			println "Some attributes were duplicated: "+duplicatedAttributes;
		}

		if (textcount == 0) {
			println "Error: no text found in "+infile;
			return false;
		}

		// An interrupted conversion leaves truncated output: do not report it as a success.
		return !failed;
	}

	/**
	 * Builds the XML attributes of a text from its star metadata.
	 *
	 * The metadata is split on '*'. An item "name_value" gives the attribute
	 * name (lower-cased) and its value; an item without '_' is accepted only
	 * when it ends with "0", which is removed, and gets the value "N/A".
	 * Names are passed through AsciiUtils.buildId() and a numeric suffix is
	 * appended when a name is already used.
	 *
	 * @param id the value of the "id" attribute, always written first
	 * @param metadata the declaration line without its text marker
	 * @param duplicatedAttributes receives the names that had to be suffixed
	 * @return the attributes in writing order
	 */
	private LinkedHashMap<String, String> parseStarAttributes(String id, String metadata, Set<String> duplicatedAttributes) {
		def attributes = new LinkedHashMap<String, String>();
		attributes["id"] = id

		for (String item : metadata.split("\\*")) { // for each text star metadata
			item = item.trim();
			if (item.length() == 0) continue;

			// With a limit of 2 a non-empty item always yields 1 or 2 parts.
			String[] nameAndValue = item.split("_", 2);
			if (nameAndValue.length == 1) { // attribute without value
				if (item.endsWith("0")) {
					String rawName = item.substring(0, item.length()-1); // remove "0"
					String attrname = uniqueAttributeName(AsciiUtils.buildId(rawName), attributes, duplicatedAttributes);
					attributes.put(attrname, "N/A")
				} else {
					println "Error: attribute: "+item+ " with line "+metadata
				}
			} else {
				String attrname = uniqueAttributeName(AsciiUtils.buildId(nameAndValue[0].toLowerCase()), attributes, duplicatedAttributes);
				attributes.put(attrname, nameAndValue[1])
			}
		}
		return attributes;
	}

	/**
	 * Returns base, or base followed by the first counter (starting at 2) that
	 * makes it an unused key of attributes. A suffixed name is also recorded in
	 * duplicatedAttributes.
	 */
	private String uniqueAttributeName(String base, Map<String, String> attributes, Set<String> duplicatedAttributes) {
		String name = base;
		int c = 2;
		while (attributes.containsKey(name)) {
			name = base+(c++)
		}
		if (c > 2) duplicatedAttributes.add(name);
		return name;
	}

	/**
	 * Closes the open "p" and "text" elements, then the writer and its stream.
	 * XMLStreamWriter.close() does not close the underlying stream, hence the
	 * explicit output.close().
	 */
	private void endText(XMLStreamWriter writer, OutputStream output) {
		writer.writeEndElement(); // p
		writer.writeEndElement(); // text
		writer.flush();
		writer.close();
		output.close();
	}

	/**
	 * Best-effort close used while cleaning up after a failure: the original
	 * error has already been reported, so a secondary close error is ignored.
	 */
	private static void closeQuietly(def resource) {
		if (resource == null) return;
		try {
			resource.close();
		} catch (Exception ignored) {
			// nothing more can be done for this resource
		}
	}

	/**
	 * Builds the next id: increments textcount and stores it in textno,
	 * left-padded with zeros to nbOfStarNumbers characters.
	 *
	 * @return the new value of textno
	 */
	private String buildNextId() {
		textcount++;
		textno = String.valueOf(textcount).padLeft(nbOfStarNumbers, "0");
		return textno;
	}

	/**
	 * Guesses whether the file uses the "****" or the "000x" text marker by
	 * looking for the first line starting with one of them. As a side effect,
	 * nbOfStarNumbers is set to the length of that marker.
	 *
	 * @param infile the infile
	 * @param encoding the file encoding
	 * @return true if the text marker is "****"; false if it is a number or if
	 *         no marker was found
	 */
	private boolean guessStar(File infile, String encoding)
	{
		FileInputStream stream = new FileInputStream(infile);
		try {
			BufferedReader input = new BufferedReader(new InputStreamReader(stream, encoding));
			String line = input.readLine();
			// run() skips the BOM before reading; do the same here, otherwise a
			// declaration on the very first line would not be recognized.
			if (line != null && line.startsWith("\uFEFF")) {
				line = line.substring(1);
			}
			while (line != null) {
				if (line ==~ /^[0-9]{4,6} .*/) { // 0000
					nbOfStarNumbers = line.indexOf(" ");
					return false;
				} else if (line ==~ /^[*]{4,6} .*/) { // ****
					nbOfStarNumbers = line.indexOf(" ");
					return true;
				}
				line = input.readLine();
			}
			return false; // no marker at all: run() will report that no text was found
		} finally {
			stream.close();
		}
	}

	/**
	 * Main: converts a sample file of the user's home directory.
	 *
	 * @param args the args (unused)
	 */
	static main(args)
	{
		//File infile = new File(System.getProperty("user.home")+"/xml/nature/Nature.txt");
		File infile = new File(System.getProperty("user.home")+"/xml/TESTS2/alceste/Voeux.txt");
		//File infile = new File("~/xml/geo/src/rivières.txt");
		File outdir = new File(infile.getParentFile(), "out");
		outdir.mkdir()
		new Alceste2Xml().run(infile, outdir, "cp1252");
		println "done: $infile -> $outdir"
	}
}
294