// Extracted from the repository browser page for:
// root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / alceste / Alceste2Xml.groovy @ 479
// (page navigation text: Statistics | Revision | History | View | Annotate | Download (8.6 kB))
// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
// $LastChangedRevision: 3451 $
// $LastChangedBy: mdecorde $
//
package org.txm.importer.alceste;
29

    
30
import javax.xml.parsers.DocumentBuilder;
31
import javax.xml.parsers.DocumentBuilderFactory;
32
import javax.xml.parsers.ParserConfigurationException;
33
import javax.xml.transform.OutputKeys;
34
import javax.xml.transform.Result;
35
import javax.xml.transform.Source;
36
import javax.xml.transform.Transformer;
37
import javax.xml.transform.TransformerFactory;
38
import javax.xml.transform.dom.DOMSource;
39
import javax.xml.transform.stream.StreamResult;
40

    
41
import org.txm.importer.CleanFile;
42
import org.txm.utils.AsciiUtils;
43
import org.txm.utils.CharsetDetector;
44
import org.txm.utils.i18n.DetectBOM;
45
import org.w3c.dom.Document;
46
import org.w3c.dom.Element;
47
import org.w3c.dom.NodeList;
48
import org.xml.sax.SAXException;
49

    
50
import java.io.File;
51
import java.io.FileInputStream;
52
import java.io.InputStreamReader;
53
import java.io.Reader;
54

    
55
import javax.xml.stream.*;
56

    
57
import java.net.URL;
58

    
59

    
60
/**
 * Splits an Alceste file into one XML file per text.
 *
 * A text starts at a declaration line: a marker made of 4 to 6 digits or of
 * 4 to 6 stars, a space, then star-prefixed metadata such as
 * {@code *name_value}. Each text is written to {@code <id>.xml} in the output
 * directory as a {@code text} element carrying the metadata as attributes.
 * Its content is split into numbered {@code p} elements (a new one at each
 * empty line) and every non-empty line is preceded by a numbered {@code lb}
 * element.
 */
class Alceste2Xml {

	/** Identifier of the current text; also used as the base name of its XML file. */
	String textno = "0001";

	/** Number of text declarations met so far (never reset by {@link #run}). */
	int textcount = 0;

	/** Length of the first text marker found by {@link #guessStar}; width of generated ids. */
	int nbOfStarNumbers;

	/** Text declaration introduced by a 4 to 6 digit marker. */
	static def TEXTNUMBERREGEX = /^[0-9]{4,6} .+/

	/** Text declaration introduced by a 4 to 6 star marker. */
	static def TEXTSTARREGEX = /^[*]{4,6} .+/

	/**
	 * Runs the conversion.
	 *
	 * Errors are printed and logged, not thrown: the texts written before the
	 * failure are kept and every stream is closed.
	 *
	 * @param infile the Alceste file to split
	 * @param outdir the directory receiving one XML file per text (created if missing)
	 * @param encoding the encoding of infile, or "??" to have it guessed
	 * @return false if no text declaration has been counted, true otherwise
	 */
	public boolean run(File infile, File outdir, String encoding) {
		if (encoding == "??") {
			encoding = new CharsetDetector(infile).getEncoding();
			println "Guessing encoding of $infile : $encoding"
		}
		int noline = 0;
		int p_n = 1;
		int lb_n = 1;
		def duplicatedAttributes = new HashSet<String>();

		FileInputStream input = null;
		BufferedOutputStream output = null;
		XMLStreamWriter writer = null;
		try {
			outdir.mkdirs(); // also creates missing parent directories
			boolean star = guessStar(infile, encoding);

			input = new FileInputStream(infile);
			// the BOM must be skipped on the raw stream, before the reader starts buffering
			input.skip(new DetectBOM(infile).getBOMSize())
			// BufferedReader.readLine() treats "\n", "\r" and "\r\n" as ONE line terminator;
			// readLine() on a bare InputStreamReader would report an extra empty line
			// (hence an extra paragraph) for each "\r\n"
			BufferedReader reader = new BufferedReader(new InputStreamReader(input, encoding));

			XMLOutputFactory factory = XMLOutputFactory.newInstance();

			String line = reader.readLine();
			while (line != null) {
				line = CleanFile.clean(line); // remove ctrl and surrogate chars

				if (((line ==~ TEXTNUMBERREGEX) || line ==~ TEXTSTARREGEX) && line.contains("*")) { // text declaration with star attributes
					if (writer != null) { // end previous text if any
						endText(writer, output);
						writer = null;
						output = null;
					}

					lb_n = 1; // reset line counter
					p_n = 1; // reset paragraph counter
					int idx = line.indexOf(" "); // star metadata starts after the first ' '

					if (!star) {
						textno = line.substring(0, idx) // the marker itself is the id
						textcount++;
					} else {
						this.buildNextId(); // update textno
					}

					String filename = textno.trim();
					output = new BufferedOutputStream(new FileOutputStream(new File(outdir, filename+".xml")))
					writer = factory.createXMLStreamWriter(output, "UTF-8"); // create a new file
					writer.writeStartDocument("UTF-8", "1.0")
					writer.writeStartElement("text");

					// "id" first, then the star metadata found after the text marker
					def attributes = parseStarAttributes(line.substring(idx + 1), filename, duplicatedAttributes);
					attributes.each { attname, attvalue -> writer.writeAttribute(attname, attvalue) }

					writer.writeCharacters("\n");
					writer.writeStartElement("p"); // first paragraph of the text
					writer.writeAttribute("n", ""+(p_n++));
				} else if (line.startsWith("-*")) {
					println "ignored locution : "+line;
				} else if (writer != null) { // lines found before the first text are dropped
					if (line.length() > 0) {
						writer.writeEmptyElement("lb");
						writer.writeAttribute("n", ""+(lb_n++))
						writer.writeCharacters("\n");
					} else { // empty line: start a new paragraph
						writer.writeEndElement(); // p
						writer.writeStartElement("p");
						writer.writeAttribute("n", ""+(p_n++))
						writer.writeCharacters("\n");
					}

					writer.writeCharacters(line);
				}
				line = reader.readLine();
				noline++;
			}

			if (writer != null) { // end last text
				endText(writer, output);
				writer = null;
				output = null;
			}
		} catch(Exception e) {
			println "Error line : "+noline;
			org.txm.utils.logger.Log.printStackTrace(e);
		} finally {
			// only non-null here when the conversion was interrupted by an error
			if (writer != null) {
				try {
					writer.close();
				} catch (Exception ignored) {
					// the original error has already been reported
				}
			}
			closeQuietly(output);
			closeQuietly(input);
		}

		if (duplicatedAttributes.size() > 0){
			println "Some attributes were duplicated: "+duplicatedAttributes;
		}

		if (textcount == 0) {
			println "Error: no text found in "+infile;
			return false;
		}

		return true;
	}

	/**
	 * Closes the open paragraph and text elements, then the writer and its stream.
	 *
	 * @param writer the XML writer of the text to finish
	 * @param output the stream the writer writes to (not closed by the writer itself)
	 */
	private static void endText(XMLStreamWriter writer, OutputStream output) {
		writer.writeEndElement(); // p
		writer.writeEndElement(); // text
		writer.close();
		output.close();
	}

	/**
	 * Closes a stream, ignoring any failure; does nothing for null.
	 *
	 * @param closeable the stream to close, may be null
	 */
	private static void closeQuietly(Closeable closeable) {
		if (closeable == null) return;
		try {
			closeable.close();
		} catch (IOException ignored) {
			// nothing useful can be done at this point
		}
	}

	/**
	 * Builds the XML attributes of a text from its star metadata.
	 *
	 * Metadata are separated by '*'. "name_value" gives attribute "name"
	 * (lower-cased) with value "value"; "name0" (no '_') gives attribute
	 * "name" with value "N/A"; anything else is reported and skipped.
	 * Names are normalized with AsciiUtils.buildId and numbered when already used.
	 *
	 * @param metadata the declaration line without its text marker
	 * @param id the value of the "id" attribute, always the first attribute
	 * @param duplicatedAttributes receives the names that had to be numbered
	 * @return the attributes, in declaration order
	 */
	private Map<String, String> parseStarAttributes(String metadata, String id, Set<String> duplicatedAttributes) {
		def attributes = new LinkedHashMap<String, String>();
		attributes["id"] = id

		for (String s1 : metadata.split("\\*")) { // for each text star metadata
			s1 = s1.trim();
			if (s1.length() == 0) continue;

			// with a limit of 2 the result has exactly 1 or 2 elements
			String[] splited2 = s1.split("_", 2);
			String attrname;
			String attrvalue;
			if (splited2.length == 1) { // attribute without value
				if (!s1.endsWith("0")) {
					println "Error: attribute: "+s1+ " with line "+metadata
					continue;
				}
				attrname = AsciiUtils.buildId(s1.substring(0, s1.length()-1)); // remove "0"
				attrvalue = "N/A";
			} else {
				attrname = AsciiUtils.buildId(splited2[0].toLowerCase());
				attrvalue = splited2[1];
			}

			if (attrname == null || attrname.length() == 0) { // an empty name is not a valid XML attribute name
				println "Error: attribute: "+s1+ " with line "+metadata
				continue;
			}

			attributes.put(uniqueAttributeName(attrname, attributes, duplicatedAttributes), attrvalue)
		}
		return attributes;
	}

	/**
	 * Returns base, or base followed by the first number from 2 that makes it unused.
	 *
	 * @param base the wanted attribute name
	 * @param attributes the attributes already declared for the text
	 * @param duplicatedAttributes receives the returned name when it had to be numbered
	 * @return a name that is not yet a key of attributes
	 */
	private static String uniqueAttributeName(String base, Map<String, String> attributes, Set<String> duplicatedAttributes) {
		String name = base;
		int c = 2;
		while (attributes.containsKey(name)) {
			name = base+(c++)
		}
		if (c > 2) duplicatedAttributes.add(name);
		return name;
	}

	/**
	 * Builds the next id: the incremented text count, left-padded with '0'
	 * up to nbOfStarNumbers characters. Updates textcount and textno.
	 *
	 * @return the new value of textno
	 */
	private String buildNextId() {
		textcount++;
		textno = ""+textcount;
		while (textno.length() < nbOfStarNumbers) {
			textno="0"+textno;
		}
		return textno;
	}

	/**
	 * Guesses whether the file uses the "****" or the "000x" text marker by
	 * looking for the first line starting with one of them. Also stores the
	 * length of that marker in nbOfStarNumbers.
	 *
	 * @param infile the infile
	 * @param encoding the file encoding
	 * @return true if the first marker found is made of stars; false if it is
	 *         made of digits or if no marker is found
	 */
	private boolean guessStar(File infile, String encoding)
	{
		FileInputStream stream = new FileInputStream(infile);
		try {
			// skip the BOM exactly as run() does, or a marker on the first line is missed
			stream.skip(new DetectBOM(infile).getBOMSize())
			BufferedReader input = new BufferedReader(new InputStreamReader(stream, encoding));

			String line = input.readLine();
			while (line != null) {
				if (line ==~ /^[0-9]{4,6} .*/) { // 0000
					nbOfStarNumbers = line.indexOf(" ");
					return false;
				} else if (line ==~ /^[*]{4,6} .*/) { // ****
					nbOfStarNumbers = line.indexOf(" ");
					return true;
				}
				line = input.readLine();
			}
			return false; // no marker at all
		} finally {
			stream.close();
		}
	}

	/**
	 * Manual test entry point: converts a hard-coded sample file located
	 * under the user's home directory.
	 *
	 * @param args the args (unused)
	 */
	static main(args)
	{
		//File infile = new File(System.getProperty("user.home")+"/xml/nature/Nature.txt");
		File infile = new File(System.getProperty("user.home")+"/xml/TESTS2/alceste/Voeux.txt");
		//File infile = new File("~/xml/geo/src/rivières.txt");
		File outdir = new File(infile.getParentFile(), "out");
		outdir.mkdir()
		new Alceste2Xml().run(infile, outdir, "cp1252");
		println "done: $infile -> $outdir"
	}
}
295