Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / Hyperbase2Xml.groovy @ 625

History | View | Annotate | Download (5.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
25
// $LastChangedRevision: 3451 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer
29

    
30
import org.txm.utils.CharsetDetector
31
import org.txm.utils.i18n.DetectBOM;
32
import org.w3c.dom.Document;
33
import org.w3c.dom.Element;
34
import org.w3c.dom.NodeList;
35
import org.xml.sax.SAXException;
36

    
37
import java.io.File;
38
import java.io.FileInputStream;
39
import java.io.InputStreamReader;
40

    
41
import javax.xml.stream.*;
42
import java.net.URL;
43

    
44
// TODO: Auto-generated Javadoc
45
/**
 * Creates one XML file per text declared in a Hyperbase file (old format).
 *
 * The input is read line by line and each line is dispatched on its prefix:
 * <ul>
 * <li><code>&&&</code> : text declaration of the form
 *     <code>&&& longtitle, title, shorttitle &&&</code>; the second
 *     comma-separated field, when present, becomes the text name</li>
 * <li><code>%%%%</code> : text declaration without a title; the text name is
 *     generated from the input file name and a counter</li>
 * <li><code>$</code> : paragraph declaration; the rest of the line is the
 *     paragraph id</li>
 * <li>anything else : raw text, written as character data</li>
 * </ul>
 * Each text is written to <code>&lt;outdir&gt;/&lt;textname&gt;.xml</code> as a
 * <code>text</code> element (attribute <code>id</code>) containing
 * <code>p</code> elements (attribute <code>id</code>).
 *
 * @author mdecorde
 */
class Hyperbase2Xml {

    /**
     * Splits the Hyperbase file into one XML file per declared text.
     *
     * Lines found before the first text declaration are ignored because there
     * is no output file to attach them to.
     *
     * @param infile the hyperbase file (old format)
     * @param outdir where the xml files are created (created if missing)
     * @param encoding the encoding of the hyperbase file, or "??" to let
     *        CharsetDetector guess it
     * @return true, if successful (failures are reported as exceptions)
     */
    public boolean run(File infile, File outdir, String encoding)
    {
        if (encoding == "??") {
            encoding = new CharsetDetector(infile).getEncoding();
            println "Guessing encoding of $infile : $encoding"
        }
        // mkdirs() rather than mkdir(): also creates missing parent directories
        outdir.mkdirs();

        String textname = null;
        String partname = null;   // id of the currently open <p>, null when none is open
        int textcount = 1;        // suffix for generated text names

        XMLOutputFactory factory = XMLOutputFactory.newInstance();
        BufferedOutputStream output = null;
        XMLStreamWriter writer = null;

        FileInputStream input = new FileInputStream(infile);
        BufferedReader reader = null;
        try {
            // Skip the BOM bytes on the raw stream before the decoder reads anything.
            int bomSize = new DetectBOM(infile).getBOMSize();
            for (int ibom = 0 ; ibom < bomSize ; ibom++) {
                input.read();
            }
            reader = new BufferedReader(new InputStreamReader(input, encoding));

            String line;
            while ((line = reader.readLine()) != null) {
                // NOTE(review): CleanFile.clean is defined elsewhere in the project;
                // presumably it removes characters that are illegal in XML - confirm.
                line = CleanFile.clean(line);

                boolean titledText = line.startsWith("&&&");
                if (titledText || line.startsWith("%%%%")) { // text declaration
                    if (writer != null) { // close previous part and text
                        closeText(writer, output, partname != null);
                        writer = null;
                        output = null;
                        partname = null;
                    }

                    textname = titledText ? parseTextName(line) : null;
                    if (textname == null || textname.isEmpty()) {
                        // untitled (%%%%) or malformed/empty (&&&) declaration
                        textname = baseName(infile) + (textcount++);
                    }

                    // create a new text
                    File textfile = new File(outdir, textname + ".xml");
                    output = new BufferedOutputStream(new FileOutputStream(textfile));
                    writer = factory.createXMLStreamWriter(output, "UTF-8");
                    writer.writeStartElement("text");
                    writer.writeAttribute("id", textname);
                    println "text : " + textfile;
                }
                else if (line.startsWith("\$")) { // paragraph declaration
                    // A paragraph before any text declaration has no file to go to:
                    // ignore it, like raw text in the same situation.
                    if (writer != null) {
                        if (partname != null) { // close previous part
                            writer.writeEndElement();
                        }
                        partname = line.substring(1);
                        writer.writeStartElement("p");
                        writer.writeAttribute("id", partname);
                    }
                }
                else { // brut text > write chars
                    if (writer != null) {
                        writer.writeCharacters(line + "\n");
                    }
                }
            }

            // Normal end of input: properly terminate the last text. Closing the
            // XML writer flushes its buffered content before the stream is closed.
            if (writer != null) {
                closeText(writer, output, partname != null);
                writer = null;
                output = null;
            }
        }
        finally {
            // Only still open here when an exception interrupted the conversion.
            if (output != null) {
                try {
                    output.close();
                } catch (IOException ignored) {
                    // keep the exception that caused the failure
                }
            }
            if (reader != null) {
                reader.close(); // also closes the underlying input stream
            } else {
                input.close();
            }
        }

        return true;
    }

    /**
     * Extracts the text name from a <code>&&&</code> declaration line.
     *
     * The three leading and three trailing characters are removed; if the
     * remainder contains commas and has at least two fields, the trimmed
     * second field is returned.
     *
     * @param line a line starting with "&&&"
     * @return the text name, or null if the line is too short to hold both delimiters
     */
    private static String parseTextName(String line)
    {
        if (line.length() < 6) {
            return null;
        }
        String name = line.substring(3, line.length() - 3);
        if (name.contains(",")) {
            String[] fields = name.split(",");
            if (fields.length > 1) {
                name = fields[1].trim();
            }
        }
        return name;
    }

    /**
     * Returns the file name without its extension (the part after the last dot).
     * Names without a dot, or starting with their only dot, are returned unchanged.
     *
     * @param file the file
     * @return the base name
     */
    private static String baseName(File file)
    {
        String name = file.getName();
        int dot = name.lastIndexOf(".");
        return dot > 0 ? name.substring(0, dot) : name;
    }

    /**
     * Terminates the current text: closes the open paragraph if any, closes the
     * text element, then closes the XML writer and its output stream.
     *
     * @param writer the XML writer of the current text
     * @param output the stream the writer writes to
     * @param partOpen true if a p element is currently open
     */
    private static void closeText(XMLStreamWriter writer, OutputStream output, boolean partOpen)
    {
        try {
            if (partOpen) {
                writer.writeEndElement(); // </p>
            }
            writer.writeEndElement(); // </text>
            writer.close(); // flushes, but does not close the underlying stream
        }
        finally {
            output.close();
        }
    }

    /**
     * Main: converts a hard-coded sample corpus located under the user's home
     * directory, reading it as ISO-8859-1.
     *
     * @param args the args (unused)
     */
    static main(args)
    {
        File infile = new File(System.getProperty("user.home")+"/xml/presdiscfra/src/PresDiscFranc");
        File outfile = new File(System.getProperty("user.home")+"/xml/presdiscfra/split/");
        new Hyperbase2Xml().run(infile, outfile,"ISO-8859-1");
        println "done"
    }
}