root / tmp / org.txm.core / src / java / org / txm / scripts / importer / Hyperbase2Xml.groovy @ 1000
History | View | Annotate | Download (5.1 kB)
1 | 881 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 881 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 881 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 881 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 881 | mdecorde | //
|
6 | 881 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 881 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 881 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 881 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 881 | mdecorde | // later version.
|
11 | 881 | mdecorde | //
|
12 | 881 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 881 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 881 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 881 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 881 | mdecorde | // details.
|
17 | 881 | mdecorde | //
|
18 | 881 | mdecorde | // You should have received a copy of the GNU General
|
19 | 881 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 881 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 881 | mdecorde | //
|
22 | 881 | mdecorde | //
|
23 | 881 | mdecorde | //
|
24 | 881 | mdecorde | // $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
|
25 | 881 | mdecorde | // $LastChangedRevision: 3451 $
|
26 | 881 | mdecorde | // $LastChangedBy: mdecorde $
|
27 | 881 | mdecorde | //
|
28 | 1000 | mdecorde | package org.txm.scripts.importer
|
29 | 881 | mdecorde | |
30 | 881 | mdecorde | import org.txm.utils.CharsetDetector |
31 | 881 | mdecorde | import org.txm.utils.i18n.DetectBOM; |
32 | 881 | mdecorde | import org.w3c.dom.Document; |
33 | 881 | mdecorde | import org.w3c.dom.Element; |
34 | 881 | mdecorde | import org.w3c.dom.NodeList; |
35 | 881 | mdecorde | import org.xml.sax.SAXException; |
36 | 881 | mdecorde | |
37 | 881 | mdecorde | import java.io.File; |
38 | 881 | mdecorde | import java.io.FileInputStream; |
39 | 881 | mdecorde | import java.io.InputStreamReader; |
40 | 881 | mdecorde | |
41 | 881 | mdecorde | import javax.xml.stream.*; |
42 | 881 | mdecorde | import java.net.URL; |
43 | 881 | mdecorde | |
44 | 881 | mdecorde | // TODO: Auto-generated Javadoc
|
45 | 881 | mdecorde | /**
|
46 | 881 | mdecorde | * Create a xml file per text declared in a Hyperbase file (old format).
|
47 | 881 | mdecorde | *
|
48 | 881 | mdecorde | * @author mdecorde
|
49 | 881 | mdecorde | */
|
class Hyperbase2Xml {

	/**
	 * Splits a Hyperbase file (old format) into one XML file per declared text.
	 *
	 * The input is read line by line; each line is first passed through
	 * CleanFile.clean, then interpreted from its prefix:
	 * <ul>
	 * <li>"&&&" : text declaration "&&& longtitle, title, shorttitle &&&". When the
	 * declaration contains commas, the second field is used as the text name,
	 * otherwise the whole content between the markers is used.</li>
	 * <li>"%%%%" : anonymous text declaration; the name is generated from the
	 * input file name and a counter.</li>
	 * <li>"$" : paragraph declaration; the rest of the line becomes the "id" of a
	 * "p" element.</li>
	 * <li>anything else : raw text, written as character data followed by "\n".</li>
	 * </ul>
	 * Raw text and paragraph declarations found before the first text declaration
	 * are ignored, since there is no output file to write them to.
	 *
	 * I/O and XML stream errors are not caught: they propagate to the caller after
	 * the opened streams have been closed.
	 *
	 * @param infile the hyperbase file (old format)
	 * @param outdir where the xml files are created (created with its parents if missing)
	 * @param encoding the encoding of the hyperbase file, or "??" to guess it with CharsetDetector
	 * @return true, if successful
	 */
	public boolean run(File infile, File outdir, String encoding)
	{
		if (encoding == "??") {
			encoding = new CharsetDetector(infile).getEncoding();
			println "Guessing encoding of $infile : $encoding"
		}
		// mkdirs() also creates missing parent directories, mkdir() would silently fail
		outdir.mkdirs();

		String textname = null;
		String partname = null;
		int textcount = 1;

		XMLOutputFactory factory = XMLOutputFactory.newInstance();
		OutputStream output = null;
		XMLStreamWriter writer = null;

		FileInputStream input = new FileInputStream(infile);
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(input, encoding));
			// Skip the BOM bytes on the raw stream. This must happen before the first
			// read on the reader, which has not consumed anything from the stream yet.
			DetectBOM bomdetector = new DetectBOM(infile);
			for (int ibom = 0 ; ibom < bomdetector.getBOMSize() ; ibom++) input.read();

			String line = reader.readLine();
			while (line != null) {
				line = CleanFile.clean(line);
				boolean namedText = line.startsWith("&&&"); // &&& longtitle, title, shorttitle &&&
				if (namedText || line.startsWith("%%%%")) { // "%%%%" is an other type of text declaration
					// close the previous part and text before opening the new text
					closeText(writer, output, partname != null);
					writer = null;
					output = null;
					partname = null;

					textname = namedText ? declaredTextName(line) : "";
					if (textname.length() == 0) {
						// anonymous declaration, or a "&&&" line with no usable name
						textname = generatedTextName(infile, textcount++);
					}

					// create a new text
					File xmlfile = new File(outdir, textname+".xml");
					output = new BufferedOutputStream(new FileOutputStream(xmlfile));
					writer = factory.createXMLStreamWriter(output, "UTF-8");
					writer.writeStartElement("text");
					writer.writeAttribute("id", textname);
					println "text : "+xmlfile;
				}
				else if (line.startsWith("\$")) { // paragraph declaration
					if (writer != null) { // a paragraph outside of any text cannot be written
						if (partname != null) {
							writer.writeEndElement(); // close the previous part
						}
						partname = line.substring(1);
						writer.writeStartElement("p");
						writer.writeAttribute("id", partname);
					}
				}
				else { // brut text > write chars
					if (writer != null) {
						writer.writeCharacters(line+"\n");
					}
				}
				line = reader.readLine();
			}

			// close the last part and the last text; the XML writer must be flushed
			// and closed here too, otherwise the end of the last file may be lost
			closeText(writer, output, partname != null);
			writer = null;
			output = null;
		} finally {
			// only reached with an open output when an error interrupted the conversion
			if (output != null) {
				try { output.close(); } catch (IOException ignored) { /* the original error matters more */ }
			}
			if (reader != null) {
				reader.close();
			} else {
				input.close();
			}
		}

		return true;
	}

	/**
	 * Ends the current text: closes the open "p" element if any, closes the "text"
	 * element, flushes and closes the XML writer, then closes the output stream.
	 * Does nothing when no text is open.
	 *
	 * @param writer the XML writer of the current text, or null if no text is open
	 * @param output the stream the writer writes to, or null if no text is open
	 * @param partOpen true if a "p" element is currently open
	 */
	private static void closeText(XMLStreamWriter writer, OutputStream output, boolean partOpen)
	{
		if (writer != null) {
			if (partOpen) {
				writer.writeEndElement(); // </p>
			}
			writer.writeEndElement(); // </text>
			writer.flush();
			writer.close(); // does not close the underlying stream
		}
		if (output != null) {
			output.close();
		}
	}

	/**
	 * Extracts the text name from a "&&& ... &&&" declaration line: the content
	 * between the 3-character markers, or its second comma-separated field
	 * (trimmed) when the content has several fields.
	 *
	 * @param line a cleaned line starting with "&&&"
	 * @return the text name, empty if the line is too short to contain one
	 */
	private static String declaredTextName(String line)
	{
		// the max() guards against lines shorter than the two 3-character markers
		String name = line.substring(3, Math.max(3, line.length()-3));
		if (name.contains(",")) {
			String[] names = name.split(",");
			if (names.length > 1) {
				name = names[1].trim();
			}
		}
		return name;
	}

	/**
	 * Builds the name of an anonymous text: the input file name without its last
	 * 4 characters (presumably a 3-letter extension and its dot), followed by the
	 * text index. File names shorter than 4 characters are used unchanged.
	 *
	 * @param infile the hyperbase file
	 * @param index the index of the text in the file
	 * @return the generated text name
	 */
	private static String generatedTextName(File infile, int index)
	{
		String filename = infile.getName();
		String base = (filename.length() >= 4) ? filename.substring(0, filename.length()-4) : filename;
		return base+index;
	}

	/**
	 * Main: converts a sample corpus located in the user home directory.
	 *
	 * @param args the args (not used)
	 */
	static main(args)
	{
		File infile = new File(System.getProperty("user.home")+"/xml/presdiscfra/src/PresDiscFranc");
		File outfile = new File(System.getProperty("user.home")+"/xml/presdiscfra/split/");
		new Hyperbase2Xml().run(infile, outfile,"ISO-8859-1");
		println "done"
	}
}