root / tmp / org.txm.core / src / java / org / txm / scripts / importer / SplitBy.groovy @ 2473
History | View | Annotate | Download (5.5 kB)
1 |
package org.txm.scripts.importer
|
---|---|
2 |
|
3 |
import java.io.Writer; |
4 |
import java.io.File; |
5 |
import javax.xml.stream.*; |
6 |
import java.net.URL; |
7 |
import org.txm.importer.filters.*; |
8 |
|
9 |
/**
 * Splits an XML document into several files, one per occurrence of a given
 * element.
 *
 * The source document is read with a StAX stream reader. Each time an element
 * whose local name equals the requested name is opened, a new output file is
 * created in the output directory; the file is named after the value of the
 * element's identifier attribute followed by ".xml". The element and all of
 * its content (child elements, attributes, namespace declarations and text)
 * are copied to that file.
 *
 * An instance reads its source document once: {@link #process} consumes and
 * closes the reader created by the constructor.
 */
class SplitBy
{
	/** Factory used to create one XMLStreamWriter per output file. */
	XMLOutputFactory factory = XMLOutputFactory.newInstance();

	/** Stream of the output file currently being written, or null. */
	FileOutputStream output;

	/** StAX writer wrapping {@link #output}, or null when no file is open. */
	XMLStreamWriter writer;

	/**
	 * StAX reader over the source document, and the raw stream it reads from.
	 * Both stay null when the constructor could not open the document.
	 */
	def parser, inputData;

	/** Files created so far by {@link #process}, in creation order. */
	ArrayList<File> newfiles = [];

	/** True while the reader is positioned inside a split element. */
	boolean inby = false;

	/**
	 * Opens the source document and creates the StAX reader.
	 *
	 * Failures are reported on standard output and are not rethrown; in that
	 * case {@link #process} returns false.
	 *
	 * @param xmlfile the XML document to split
	 */
	public SplitBy(File xmlfile)
	{
		try {
			inputData = xmlfile.toURI().toURL().openStream();
			def inputFactory = XMLInputFactory.newInstance();
			parser = inputFactory.createXMLStreamReader(inputData);
		} catch (XMLStreamException ex) {
			System.out.println(ex);
			discardInput();
		} catch (IOException ex) {
			System.out.println("IOException while parsing " + xmlfile + ": " + ex);
			discardInput();
		}
	}

	/**
	 * Reads the whole source document and writes one file per split element.
	 *
	 * Only what lies inside a split element is copied; markup and text located
	 * outside of any split element are skipped, so that every output file
	 * holds exactly one balanced root element.
	 *
	 * Every stream opened by this object is closed before the method returns,
	 * whatever the outcome.
	 *
	 * @param outdir the directory receiving the created files, created if needed
	 * @param by the local name of the element to split on
	 * @param idAttribute the local name of the attribute whose value names the output file
	 * @return true if the document was read to its end; false if the document
	 *         could not be opened, if two split elements resolve to the same
	 *         file, or if a read/write error occurred
	 */
	public boolean process(File outdir, String by, String idAttribute)
	{
		if (parser == null) {
			// the constructor already reported why the document could not be opened
			println "XML Error: the source document is not opened";
			return false;
		}

		outdir.mkdirs();
		try {
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						if (parser.getLocalName() == by) {
							inby = true;
							if (!openNextFile(outdir, idAttribute))
								return false;
						}
						if (inby && writer != null) {
							writeEvent(event);
							if (parser.getLocalName() == "TEI") { // write namespaces
								writeNamespaceIfMissing("tei", "http://www.tei-c.org/ns/1.0");
								writeNamespaceIfMissing("txm", "http://textometrie.org/1.0");
							}
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						if (inby && writer != null)
							writer.writeEndElement();
						// the end tag of the split element itself is written just above
						if (parser.getLocalName() == by)
							inby = false;
						break;

					case XMLStreamConstants.CHARACTERS:
						if (inby && writer != null)
							writer.writeCharacters(parser.getText());
						break;
				}
			}
			// a failure to flush the last file must be reported as a failure
			finishCurrentFile();
			return true;
		}
		catch (Exception e) {
			println "XML Error at location: " + parser.getLocation()
			e.printStackTrace()
			return false
		}
		finally {
			releaseResources();
		}
	}

	/**
	 * Closes the file being written, if any, then creates the output file of
	 * the split element the reader is positioned on.
	 *
	 * @param outdir the directory receiving the created file
	 * @param idAttribute the local name of the attribute whose value names the file
	 * @return false if a file of the same name was already created by this object
	 */
	private boolean openNextFile(File outdir, String idAttribute)
	{
		finishCurrentFile();

		// without the identifier attribute the name stays empty and the file is ".xml"
		String filename = "";
		for (int i = 0; i < parser.getAttributeCount(); i++) {
			if (parser.getAttributeLocalName(i) == idAttribute) {
				filename = parser.getAttributeValue(i);
				break;
			}
		}

		File newfile = new File(outdir, filename + ".xml")
		if (newfiles.contains(newfile)) {
			println "Item declared twice " + filename + " at location " + parser.getLocation();
			return false;
		}
		newfiles.add(newfile);
		output = new FileOutputStream(newfile)
		writer = factory.createXMLStreamWriter(output, "UTF-8"); // create a new file
		return true;
	}

	/**
	 * Closes the writer and the stream of the file being written, letting any
	 * error propagate to the caller.
	 */
	private void finishCurrentFile()
	{
		if (writer != null) {
			writer.close();
			writer = null;
		}
		if (output != null) {
			output.close();
			output = null;
		}
	}

	/**
	 * Closes the writer, the output stream, the reader and the input stream.
	 * A failure to close one of them is reported on standard output and does
	 * not prevent the others from being closed.
	 */
	private void releaseResources()
	{
		for (def resource : [writer, output, parser, inputData]) {
			if (resource == null)
				continue;
			try {
				resource.close();
			} catch (Exception e) {
				println "Error while closing " + resource.getClass().getName() + ": " + e;
			}
		}
		writer = null;
		output = null;
	}

	/**
	 * Closes the input stream after the reader could not be created, and
	 * forgets both so that {@link #process} reports the failure.
	 */
	private void discardInput()
	{
		try {
			if (inputData != null)
				inputData.close();
		} catch (IOException ignored) {
			// the original failure has already been reported
		}
		inputData = null;
		parser = null;
	}

	/**
	 * Declares a namespace on the element being written unless the source
	 * element already declares that prefix; writing it twice would produce a
	 * duplicated xmlns attribute.
	 *
	 * @param prefix the namespace prefix
	 * @param uri the namespace URI bound to the prefix
	 */
	private void writeNamespaceIfMissing(String prefix, String uri)
	{
		for (int i = 0; i < parser.getNamespaceCount(); i++) {
			if (parser.getNamespacePrefix(i) == prefix)
				return;
		}
		writer.writeNamespace(prefix, uri);
	}

	/**
	 * Builds the name to write for an element or an attribute.
	 *
	 * @param prefix the namespace prefix, possibly null or empty
	 * @param localname the local name
	 * @return "prefix:localname" when there is a prefix, else the local name
	 */
	private static String qualifiedName(String prefix, String localname)
	{
		if (prefix != null && prefix.length() > 0)
			return prefix + ":" + localname;
		return localname;
	}

	/**
	 * write the current event.
	 *
	 * Start elements are written with their namespace declarations and their
	 * attributes. End elements, text and comments are copied as is. Any other
	 * event is ignored.
	 *
	 * @param event the stax event
	 */
	private void writeEvent(int event)
	{
		if (event == XMLStreamConstants.START_ELEMENT) {
			// write element
			writer.writeStartElement(qualifiedName(parser.getPrefix(), parser.getLocalName()));

			// write namespaces
			for (int i = 0; i < parser.getNamespaceCount(); i++) {
				writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
			}

			// write attributes
			for (int i = 0; i < parser.getAttributeCount(); i++) {
				String attrname = qualifiedName(parser.getAttributePrefix(i), parser.getAttributeLocalName(i));
				writer.writeAttribute(attrname, parser.getAttributeValue(i));
			}
		}
		else if (event == XMLStreamConstants.END_ELEMENT) {
			writer.writeEndElement();
		}
		else if (event == XMLStreamConstants.CHARACTERS) {
			writer.writeCharacters(parser.getText());
		}
		else if (event == XMLStreamConstants.COMMENT) {
			writer.writeComment(parser.getText());
		}
	}

	/**
	 * @return the files created by {@link #process}, in creation order
	 */
	public ArrayList<File> getFiles()
	{
		return newfiles;
	}

	/**
	 * Manual test: splits a corpus file on its "texte" elements, using the
	 * "id" attribute to name the files. The output directory is deleted and
	 * recreated first.
	 *
	 * @param args not used
	 */
	public static void main(String[] args)
	{
		File xmlfile = new File("C:\\Documents and Settings\\mdecorde\\Bureau\\MISAT2011\\corpus\\tiempo\\ELTIEMPO.2002-2010.MIN.xml");
		File outdir = new File("C:\\Documents and Settings\\mdecorde\\Bureau\\MISAT2011\\corpus\\tiempo\\tiempo");
		outdir.deleteDir();
		outdir.mkdir();

		def splitter = new SplitBy(xmlfile);
		if (splitter.process(outdir, "texte", "id"))
			println "success";
		else
			println "failed !"
	}
}