Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / Hyperbase2Xml.groovy @ 187

History | View | Annotate | Download (5.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2016-12-16 17:28:35 +0100 (Fri, 16 Dec 2016) $
25
// $LastChangedRevision: 3377 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer
29

    
30
import org.txm.utils.CharsetDetector
31
import org.txm.utils.i18n.DetectBOM;
32
import org.w3c.dom.Document;
33
import org.w3c.dom.Element;
34
import org.w3c.dom.NodeList;
35
import org.xml.sax.SAXException;
36

    
37
import java.io.File;
38
import java.io.FileInputStream;
39
import java.io.InputStreamReader;
40

    
41
import javax.xml.stream.*;
42
import java.net.URL;
43

    
44
// TODO: Auto-generated Javadoc
45
/**
 * Splits a Hyperbase file (old format) into one XML file per declared text.
 *
 * The input is read line by line and each line is interpreted as follows:
 * <ul>
 * <li>a line starting with <code>&&&</code> declares a new text whose name is
 * taken from the declaration (<code>&&& longtitle, title, shorttitle &&&</code>);</li>
 * <li>a line starting with <code>%%%%</code> declares a new text whose name is
 * generated from the input file name and a counter;</li>
 * <li>a line starting with <code>$</code> opens a new <code>p</code> element
 * in the current text;</li>
 * <li>any other line is written as character data in the current text.</li>
 * </ul>
 *
 * @author mdecorde
 */
class Hyperbase2Xml {

	/**
	 * Converts the Hyperbase file into one XML file per text.
	 *
	 * Each output file is named <code>&lt;text name&gt;.xml</code> and contains a
	 * <code>text</code> root element with an <code>id</code> attribute, holding
	 * <code>p</code> elements (one per <code>$</code> line) and raw characters.
	 * Lines found before the first text declaration are ignored.
	 *
	 * @param infile the hyperbase file (old format)
	 * @param outdir where the xml files are created (created if missing)
	 * @param encoding the encoding of the hyperbase file, or "??" to let
	 *        CharsetDetector guess it
	 * @return true, if successful
	 */
	public boolean run(File infile, File outdir, String encoding)
	{
		if (encoding == "??") {
			encoding = new CharsetDetector(infile).getEncoding();
			println "Guessing encoding of $infile : $encoding"
		}
		// mkdirs() also creates missing parent directories
		outdir.mkdirs();

		String partname = null; // id of the currently opened <p>, null if none
		int textcount = 1;      // counter used to name "%%%%" texts

		XMLOutputFactory factory = XMLOutputFactory.newInstance();
		BufferedOutputStream output = null;
		XMLStreamWriter writer = null;

		def input = new FileInputStream(infile)
		try {
			// skip the BOM bytes on the raw stream, before the reader decodes anything
			DetectBOM bomdetector = new DetectBOM(infile);
			for (int ibom = 0 ; ibom < bomdetector.getBOMSize() ; ibom++) input.read()

			BufferedReader reader = new BufferedReader(new InputStreamReader(input, encoding));

			String line;
			while ((line = reader.readLine()) != null)
			{
				line = CleanFile.clean(line);

				boolean namedText = line.startsWith("&&&");   // &&& longtitle, title, shorttitle &&&
				boolean countedText = !namedText && line.startsWith("%%%%"); // an other type of text declaration

				if (namedText || countedText)
				{
					// close the previous part and the previous text, if any
					closeText(writer, output, partname != null);
					writer = null;
					output = null;
					partname = null;

					String textname;
					if (namedText) {
						// NOTE(review): assumes the declaration ends with "&&&" and is at
						// least 6 characters long; a shorter line makes substring() throw
						textname = line.substring(3, line.length()-3);
						if (textname.contains(",")) {
							String[] textnames = textname.split(",");
							if (textnames.length > 1)
								textname = textnames[1].trim(); // keep the "title" field
						}
					} else {
						// NOTE(review): drops the last 4 characters of the file name,
						// presumably a ".xxx" extension -- confirm against callers
						String filename = infile.getName();
						textname = filename.substring(0, filename.length()-4)+(textcount++);
					}

					// create a new text
					File textfile = new File(outdir, textname+".xml");
					output = new BufferedOutputStream(new FileOutputStream(textfile))
					writer = factory.createXMLStreamWriter(output, "UTF-8");
					writer.writeStartElement("text");
					writer.writeAttribute("id", textname);
					println "text : "+textfile;
				}
				else if (line.startsWith("\$")) // paragraph declaration
				{
					// a paragraph declared before any text has nowhere to go: ignore it,
					// like the raw text below
					if (writer != null)
					{
						if (partname != null) // close previous part
							writer.writeEndElement();

						partname = line.substring(1);
						writer.writeStartElement("p");
						writer.writeAttribute("id", partname);
					}
				}
				else // brut text > write chars
				{
					if (writer != null)
						writer.writeCharacters(line+"\n");
				}
			}

			// close the last part and the last text; closing the writer flushes
			// what it still holds before the stream is closed
			closeText(writer, output, partname != null);
			writer = null;
			output = null;
		}
		finally {
			// output is still set only when an exception interrupted the conversion:
			// release the file handle without masking that exception
			if (output != null) {
				try { output.close() } catch (IOException ignored) { }
			}
			input.close();
		}

		return true;
	}

	/**
	 * Ends the current text: closes the opened <code>p</code> element if any,
	 * closes the <code>text</code> element, then closes the XML writer and the
	 * underlying stream. Does nothing when no text is opened.
	 *
	 * @param writer the XML writer of the current text, may be null
	 * @param output the stream the writer writes to, may be null
	 * @param partOpen true if a <code>p</code> element is currently opened
	 */
	private static void closeText(XMLStreamWriter writer, OutputStream output, boolean partOpen)
	{
		try {
			if (writer != null) {
				if (partOpen)
					writer.writeEndElement(); // </p>
				writer.writeEndElement(); // </text>
				writer.close(); // flushes the writer; does not close the stream
			}
		}
		finally {
			if (output != null)
				output.close();
		}
	}

	/**
	 * Runs the conversion on a hard-coded sample corpus located in the user
	 * home directory, reading it as ISO-8859-1.
	 *
	 * @param args the args (unused)
	 */
	static main(args)
	{
		File infile = new File(System.getProperty("user.home")+"/xml/presdiscfra/src/PresDiscFranc");
		File outfile = new File(System.getProperty("user.home")+"/xml/presdiscfra/split/");
		new Hyperbase2Xml().run(infile, outfile,"ISO-8859-1");
		println "done"
	}
}