root / tmp / org.txm.core / src / java / org / txm / scripts / importer / SplitBy.groovy @ 2473
History | View | Annotate | Download (5.5 kB)
1 | 1000 | mdecorde | package org.txm.scripts.importer
|
---|---|---|---|
2 | 881 | mdecorde | |
3 | 881 | mdecorde | import java.io.Writer; |
4 | 881 | mdecorde | import java.io.File; |
5 | 881 | mdecorde | import javax.xml.stream.*; |
6 | 881 | mdecorde | import java.net.URL; |
7 | 881 | mdecorde | import org.txm.importer.filters.*; |
8 | 881 | mdecorde | |
/**
 * Splits an XML file into several XML files: each element whose local name
 * equals the {@code by} argument of {@link #process} starts a new output file,
 * named after the value of one of its attributes.
 *
 * The input is read with a StAX stream reader opened by the constructor and
 * consumed by a single call to {@link #process}.
 */
class SplitBy
{

	/** Factory used to create one stream writer per output file. */
	XMLOutputFactory factory = XMLOutputFactory.newInstance();
	/** Stream of the output file currently being written, or null. */
	FileOutputStream output;
	/** StAX writer wrapping {@link #output}, or null when no file is open. */
	XMLStreamWriter writer;
	/** StAX reader over the input file and the raw stream it reads from. */
	def parser, inputData;
	/** Output files created so far, in creation order. */
	ArrayList<File> newfiles = [];
	/** True while the parser is inside a {@code by} element. */
	boolean inby = false;

	/**
	 * Opens the XML file and creates the StAX reader.
	 *
	 * Errors are reported on standard output and leave {@code parser} null;
	 * {@link #process} then returns false.
	 *
	 * @param xmlfile the XML file to split
	 */
	public SplitBy(File xmlfile)
	{
		try {
			inputData = xmlfile.toURI().toURL().openStream();
			def inputFactory = XMLInputFactory.newInstance();
			parser = inputFactory.createXMLStreamReader(inputData);
		} catch (XMLStreamException ex) {
			System.out.println(ex);
			// the stream was opened but no reader owns it: do not leak it
			closeInput();
		} catch (IOException ex) {
			System.out.println("IOException while parsing "+xmlfile+": "+ex);
		}
	}

	/**
	 * Reads the whole input and writes one file per {@code by} element.
	 *
	 * The output file is {@code <outdir>/<value of idAttribute>.xml}; when the
	 * element has no such attribute the value is the empty string. Processing
	 * stops and fails when two elements map to the same file. The parser, the
	 * input stream and the current output file are always closed on return.
	 *
	 * @param outdir the directory receiving the files (created if needed)
	 * @param by the local name of the element starting a new file
	 * @param idAttribute the local name of the attribute giving the file name
	 * @return true on success, false on any error or duplicated item
	 */
	public boolean process(File outdir, String by, String idAttribute)
	{
		outdir.mkdirs();
		if (parser == null)
		{
			// the constructor already reported why the input could not be opened
			println "XML Error: the input file was not opened"
			closeInput();
			return false;
		}

		try
		{
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {

				switch (event)
				{
					case XMLStreamConstants.START_ELEMENT:
						if (parser.getLocalName() == by)
						{
							inby = true;
							closeOutput(); // finish the previous file, if any

							String filename = "";
							for (int i = 0 ; i < parser.getAttributeCount() ; i++)
							{
								if (parser.getAttributeLocalName(i) == idAttribute)
								{
									filename = parser.getAttributeValue(i);
									break;
								}
							}
							File newfile = new File(outdir, filename+".xml")
							if (newfiles.contains(newfile))
							{
								println "Item declared twice "+filename+" at location "+parser.getLocation();
								return false;
							}
							newfiles.add(newfile);
							output = new FileOutputStream(newfile)
							writer = factory.createXMLStreamWriter(output, "UTF-8"); // create a new file
						}
						// NOTE(review): start tags found after a closed `by` element are
						// still written to the current file, while their end tags are not
						// (inby is false by then) - confirm this is intended.
						if (writer != null)
						{
							writeEvent(event);
							if (parser.getLocalName() == "TEI")
							{ // write namespaces
								writer.writeNamespace("tei", "http://www.tei-c.org/ns/1.0")
								writer.writeNamespace("txm", "http://textometrie.org/1.0")
							}
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						if (inby && writer != null)
						{
							writer.writeEndElement();
						}
						if (parser.getLocalName() == by)
						{
							inby = false; // the end element $by has been written just before
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						if (writer != null)
						{
							writer.writeCharacters(parser.getText());
						}
						break;
				}
			}
			// closed inside the try so that a failure here still makes process() fail
			closeOutput();
			return true;
		}
		catch (Exception e) {
			println "XML Error at location: "+parser.getLocation()
			e.printStackTrace()
			return false
		}
		finally {
			// no-op when the output was already closed above
			try {
				closeOutput();
			} catch (Exception e) {
				println "Failed to close the output file: "+e
			}
			closeInput();
		}
	}

	/**
	 * Closes the current writer and its file stream, then forgets both so
	 * that they are never closed twice. Does nothing when no file is open.
	 */
	private void closeOutput()
	{
		try {
			if (writer != null) writer.close();
		} finally {
			writer = null;
			try {
				if (output != null) output.close();
			} finally {
				output = null;
			}
		}
	}

	/**
	 * Closes the parser and the input stream; failures are only reported.
	 */
	private void closeInput()
	{
		try {
			if (parser != null) parser.close();
		} catch (Exception e) {
			println "Failed to close the XML parser: "+e
		}
		try {
			if (inputData != null) inputData.close();
		} catch (Exception e) {
			println "Failed to close the input stream: "+e
		}
	}

	/**
	 * Writes the current parser event to the current writer.
	 *
	 * Handles start elements (with their namespace declarations and
	 * attributes), end elements, characters and comments; other events are
	 * ignored.
	 *
	 * @param event the stax event
	 */
	private void writeEvent(int event)
	{
		if (event == XMLStreamConstants.START_ELEMENT)
		{
			def prefix = parser.getPrefix();
			def localname = parser.getLocalName(); // write element
			if (prefix != null && prefix.length() > 0)
				writer.writeStartElement(prefix+":"+localname);
			else
				writer.writeStartElement(localname);

			for (int i = 0 ; i < parser.getNamespaceCount() ; i++) // write namespaces
			{
				writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
			}

			String attrprefix; // write attributes
			for (int i = 0 ; i < parser.getAttributeCount() ; i++)
			{
				attrprefix = parser.getAttributePrefix(i);
				// && (not &) so that a null prefix is never dereferenced
				if (attrprefix != null && attrprefix.length() > 0)
					writer.writeAttribute(attrprefix+":"+parser.getAttributeLocalName(i), parser.getAttributeValue(i))
				else
					writer.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i))
			}
		}
		else if (event == XMLStreamConstants.END_ELEMENT)
		{
			writer.writeEndElement();
		}
		else if (event == XMLStreamConstants.CHARACTERS)
		{
			writer.writeCharacters(parser.getText());
		}
		else if (event == XMLStreamConstants.COMMENT)
		{
			writer.writeComment(parser.getText());
		}
	}

	/**
	 * @return the files created by {@link #process}, in creation order
	 */
	public ArrayList<File> getFiles()
	{
		return newfiles;
	}

	/**
	 * Manual test entry point: splits a hard-coded corpus file by its
	 * "texte" elements, using their "id" attribute as file name.
	 *
	 * @param args unused
	 */
	public static void main(String[] args)
	{
		File xmlfile = new File("C:\\Documents and Settings\\mdecorde\\Bureau\\MISAT2011\\corpus\\tiempo\\ELTIEMPO.2002-2010.MIN.xml");
		File outdir = new File("C:\\Documents and Settings\\mdecorde\\Bureau\\MISAT2011\\corpus\\tiempo\\tiempo");
		// start from an empty output directory
		outdir.deleteDir();
		outdir.mkdir();

		// A former preprocessing step was disabled here: it re-encoded the
		// source from ISO-8859-1 to UTF-8 and repaired doubled closing quotes
		// in attribute values before splitting.

		def splitter = new SplitBy(xmlfile);
		if (splitter.process(outdir, "texte", "id"))
			println "success";
		else
			println "failed !"
	}
}