Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / Hyperbase2Xml.groovy @ 1000

History | View | Annotate | Download (5.1 kB)

1 881 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 881 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 881 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 881 mdecorde
// Sophia Antipolis, University of Paris 3.
5 881 mdecorde
//
6 881 mdecorde
// The TXM platform is free software: you can redistribute it
7 881 mdecorde
// and/or modify it under the terms of the GNU General Public
8 881 mdecorde
// License as published by the Free Software Foundation,
9 881 mdecorde
// either version 2 of the License, or (at your option) any
10 881 mdecorde
// later version.
11 881 mdecorde
//
12 881 mdecorde
// The TXM platform is distributed in the hope that it will be
13 881 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 881 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 881 mdecorde
// PURPOSE. See the GNU General Public License for more
16 881 mdecorde
// details.
17 881 mdecorde
//
18 881 mdecorde
// You should have received a copy of the GNU General
19 881 mdecorde
// Public License along with the TXM platform. If not, see
20 881 mdecorde
// http://www.gnu.org/licenses.
21 881 mdecorde
//
22 881 mdecorde
//
23 881 mdecorde
//
24 881 mdecorde
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
25 881 mdecorde
// $LastChangedRevision: 3451 $
26 881 mdecorde
// $LastChangedBy: mdecorde $
27 881 mdecorde
//
28 1000 mdecorde
package org.txm.scripts.importer
29 881 mdecorde
30 881 mdecorde
import org.txm.utils.CharsetDetector
31 881 mdecorde
import org.txm.utils.i18n.DetectBOM;
32 881 mdecorde
import org.w3c.dom.Document;
33 881 mdecorde
import org.w3c.dom.Element;
34 881 mdecorde
import org.w3c.dom.NodeList;
35 881 mdecorde
import org.xml.sax.SAXException;
36 881 mdecorde
37 881 mdecorde
import java.io.File;
38 881 mdecorde
import java.io.FileInputStream;
39 881 mdecorde
import java.io.InputStreamReader;
40 881 mdecorde
41 881 mdecorde
import javax.xml.stream.*;
42 881 mdecorde
import java.net.URL;
43 881 mdecorde
44 881 mdecorde
// TODO: Auto-generated Javadoc
45 881 mdecorde
/**
 * Creates one XML file per text declared in a Hyperbase file (old format).
 *
 * The input is processed line by line (each line first goes through
 * CleanFile.clean):
 * <ul>
 * <li>a line starting with "&&&" declares a text: "&&& longtitle, title, shorttitle &&&";</li>
 * <li>a line starting with "%%%%" declares a text whose name is generated from the input file name;</li>
 * <li>a line starting with "$" opens a paragraph, the rest of the line is its id;</li>
 * <li>any other line is written as character data of the current text.</li>
 * </ul>
 * Each text is written to "&lt;outdir&gt;/&lt;textname&gt;.xml" as a "text" element
 * containing "p" elements.
 *
 * @author mdecorde
 */
class Hyperbase2Xml {

        /**
         * Splits the Hyperbase file into one XML file per text.
         *
         * @param infile the hyperbase file (old format)
         * @param outdir where the xml files are created (created if missing)
         * @param encoding the encoding of the hyperbase file, or "??" to guess it with CharsetDetector
         * @return true, if successful (failures are reported as exceptions)
         */
        public boolean run(File infile, File outdir, String encoding)
        {
                if (encoding == "??") {
                        encoding = new CharsetDetector(infile).getEncoding();
                        println "Guessing encoding of $infile : $encoding"
                }
                // mkdirs() also creates missing parent directories; mkdir() failed silently in that case
                outdir.mkdirs();

                String textname = null;
                String partname = null;
                int textcount = 1;
                XMLOutputFactory factory = XMLOutputFactory.newInstance();
                OutputStream output = null;
                XMLStreamWriter writer = null;

                def input = new FileInputStream(infile)
                try {
                        // skip the BOM bytes on the raw stream before the decoder reads anything
                        DetectBOM bomdetector = new DetectBOM(infile);
                        for (int ibom = 0 ; ibom < bomdetector.getBOMSize() ; ibom++) input.read()
                        BufferedReader reader = new BufferedReader(new InputStreamReader(input, encoding));

                        String rawline;
                        while ((rawline = reader.readLine()) != null) {
                                String line = CleanFile.clean(rawline);
                                boolean titled = line.startsWith("&&&"); // &&& longtitle, title, shorttitle &&&
                                if (titled || line.startsWith("%%%%")) // text declaration
                                {
                                        // close the previous part and text, if any
                                        closeText(writer, output, partname != null);
                                        writer = null;
                                        output = null;
                                        partname = null;

                                        if (titled) {
                                                textname = parseTextName(line);
                                        } else {
                                                // generated name: the file name without its last 4 characters
                                                // (assumes a ".ext" suffix — TODO confirm), plus a counter
                                                String basename = infile.getName();
                                                textname = basename.substring(0, Math.max(0, basename.length() - 4)) + (textcount++);
                                        }

                                        // create a new text
                                        // NOTE(review): textname comes from the input file content and is
                                        // used as a file name without sanitization.
                                        File textfile = new File(outdir, textname + ".xml");
                                        output = new BufferedOutputStream(new FileOutputStream(textfile));
                                        writer = factory.createXMLStreamWriter(output, "UTF-8");
                                        writer.writeStartElement("text");
                                        writer.writeAttribute("id", textname);
                                        println "text : "+textfile;
                                }
                                else if (line.startsWith("\$")) // paragraph declaration
                                {
                                        if (writer == null) {
                                                // no text declared yet: nowhere to write, same policy as plain text lines
                                                println "Warning: paragraph declared before any text in $infile, ignored: $line"
                                        } else {
                                                if (partname != null) writer.writeEndElement(); // close previous part
                                                partname = line.substring(1);
                                                writer.writeStartElement("p");
                                                writer.writeAttribute("id", partname);
                                        }
                                }
                                else if (writer != null) // raw text > write chars (ignored before the first text)
                                {
                                        writer.writeCharacters(line + "\n");
                                }
                        }

                        // close the last part and text; this also closes (and flushes) the XML writer,
                        // otherwise the end of the last file could be lost
                        closeText(writer, output, partname != null);
                        writer = null;
                        output = null;
                } finally {
                        // output is only still set here when an exception interrupted the conversion
                        if (output != null) {
                                try {
                                        output.close();
                                } catch (IOException ignored) {
                                        // do not mask the exception that interrupted the conversion
                                }
                        }
                        input.close();
                }

                return true;
        }

        /**
         * Extracts the text name from a "&&&" declaration line.
         *
         * The three leading and three trailing characters are dropped; when the
         * remainder contains several comma separated titles, the second one
         * (trimmed) is used.
         *
         * @param line a line starting with "&&&"
         * @return the text name
         */
        private static String parseTextName(String line)
        {
                // a declaration shorter than "&&&" + "&&&" has no closing marker to strip
                String name = (line.length() >= 6) ? line.substring(3, line.length() - 3) : line.substring(3);
                if (name.contains(",")) {
                        String[] names = name.split(",");
                        if (names.length > 1) name = names[1].trim();
                }
                return name;
        }

        /**
         * Ends the current text: closes the open "p" element if any, the "text"
         * element, the XML writer and then the underlying stream.
         * Does nothing when no text is open.
         *
         * @param writer the XML writer of the current text, or null
         * @param output the stream the writer writes to, or null
         * @param partOpen true if a "p" element is currently open
         */
        private static void closeText(XMLStreamWriter writer, OutputStream output, boolean partOpen)
        {
                if (writer == null) return;
                try {
                        if (partOpen) writer.writeEndElement();
                        writer.writeEndElement();
                        // XMLStreamWriter.close() does not close the underlying stream
                        writer.close();
                } finally {
                        if (output != null) output.close();
                }
        }

        /**
         * Runs the conversion on a hard-coded sample corpus located under the
         * user home directory.
         *
         * @param args the args (unused)
         */
        static main(args)
        {
                File infile = new File(System.getProperty("user.home")+"/xml/presdiscfra/src/PresDiscFranc");
                File outfile = new File(System.getProperty("user.home")+"/xml/presdiscfra/split/");
                new Hyperbase2Xml().run(infile, outfile,"ISO-8859-1");
                println "done"
        }
}