Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / SplitBy.groovy @ 2473

History | View | Annotate | Download (5.5 kB)

1 1000 mdecorde
package org.txm.scripts.importer
2 881 mdecorde
3 881 mdecorde
import java.io.Writer;
4 881 mdecorde
import java.io.File;
5 881 mdecorde
import javax.xml.stream.*;
6 881 mdecorde
import java.net.URL;
7 881 mdecorde
import org.txm.importer.filters.*;
8 881 mdecorde
9 881 mdecorde
/**
 * Splits an XML file into several files: one output file is created for each
 * occurrence of a given element, and named after one of its attributes.
 *
 * The input is read with a StAX stream reader opened by the constructor and
 * consumed by {@link #process(File, String, String)}; a SplitBy instance is
 * therefore meant to be used for a single call to process().
 *
 * Limitation: nested occurrences of the split element are not supported, an
 * inner occurrence closes the file of the outer one and starts a new file.
 */
class SplitBy
{
	/** Factory used to create one stream writer per output file. */
	XMLOutputFactory factory = XMLOutputFactory.newInstance();
	/** Stream of the output file currently being written, null if none. */
	FileOutputStream output;
	/** StAX writer wrapping {@link #output}, null if none. */
	XMLStreamWriter writer;
	/** StAX reader over the input file and its underlying input stream. */
	def parser, inputData;
	/** Output files created so far, in creation order. */
	ArrayList<File> newfiles = [];
	/** True while the reader is inside the element the file is split by. */
	boolean inby = false;

	/**
	 * Opens the XML file to split.
	 *
	 * Failures are reported on standard output and leave the parser null;
	 * process() then returns false.
	 *
	 * @param xmlfile the XML file to split
	 */
	public SplitBy(File xmlfile)
	{
		try {
			inputData = xmlfile.toURI().toURL().openStream();
			def inputFactory = XMLInputFactory.newInstance();
			parser = inputFactory.createXMLStreamReader(inputData);
		} catch (XMLStreamException ex) {
			System.out.println(ex);
			// the stream was opened but no reader owns it: do not leak it
			closeQuietly(inputData);
			inputData = null;
		} catch (IOException ex) {
			System.out.println("IOException while parsing " + xmlfile + ": " + ex);
		}
	}

	/**
	 * Writes each element named <code>by</code> (start tag, content, end tag)
	 * to its own file <code>outdir/&lt;value of idAttribute&gt;.xml</code>.
	 * Content located outside of these elements is not copied.
	 *
	 * The reader, the input stream and the current output file are always
	 * closed when this method returns.
	 *
	 * @param outdir the directory receiving the files, created if needed
	 * @param by the local name of the element to split by
	 * @param idAttribute the local name of the attribute giving the file name;
	 *        when the attribute is missing the file is named ".xml"
	 * @return true on success, false if the input could not be opened, if two
	 *         elements give the same file name or if any error occurs
	 */
	public boolean process(File outdir, String by, String idAttribute)
	{
		if (parser == null) { // the constructor failed to open the input
			println "SplitBy: no parser available, the input file could not be opened";
			return false;
		}

		outdir.mkdirs();
		try {
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						if (parser.getLocalName() == by) {
							inby = true;
							closeCurrentOutput(); // finish the previous file, if any
							if (!openOutputFor(outdir, idAttribute)) {
								return false; // resources are released by the finally block
							}
						}
						// only copy what is inside the split element, otherwise start
						// tags would be appended after the root of the previous file
						if (inby && writer != null) {
							writeEvent(event);
							if (parser.getLocalName() == "TEI") { // write namespaces
								writer.writeNamespace("tei", "http://www.tei-c.org/ns/1.0")
								writer.writeNamespace("txm", "http://textometrie.org/1.0")
							}
						}
						break;
					case XMLStreamConstants.END_ELEMENT:
						if (inby && writer != null) {
							writer.writeEndElement();
						}
						if (parser.getLocalName() == by) {
							inby = false; // reset only after the end tag of $by is written
						}
						break;
					case XMLStreamConstants.CHARACTERS:
						if (inby && writer != null) {
							writer.writeCharacters(parser.getText());
						}
						break;
				}
			}
			// close here, not only in finally, so that a flush error is reported as a failure
			closeCurrentOutput();
			return true;
		}
		catch (Exception e) {
			println "XML Error at location: "+parser.getLocation()
			e.printStackTrace()
			return false
		}
		finally {
			closeQuietly(writer);
			closeQuietly(output);
			closeQuietly(parser);
			closeQuietly(inputData);
			writer = null;
			output = null;
		}
	}

	/**
	 * Creates the output file of the split element the parser is positioned on
	 * and points {@link #output} and {@link #writer} to it.
	 *
	 * @param outdir the output directory
	 * @param idAttribute the local name of the attribute giving the file name
	 * @return false if a file of the same name was already created by this splitter
	 */
	private boolean openOutputFor(File outdir, String idAttribute)
	{
		String filename = "";
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
			if (parser.getAttributeLocalName(i) == idAttribute) {
				filename = parser.getAttributeValue(i);
				break;
			}
		}

		// NOTE(review): the attribute value is used as a file name without any sanitizing
		File newfile = new File(outdir, filename+".xml")
		if (newfiles.contains(newfile)) {
			println "Item declared twice "+filename+" at location "+parser.getLocation();
			return false;
		}
		newfiles.add(newfile);
		output = new FileOutputStream(newfile)
		writer = factory.createXMLStreamWriter(output, "UTF-8"); // create a new file
		return true;
	}

	/**
	 * Flushes and closes the current output file, if any. Errors are propagated
	 * to the caller; the writer and output fields are reset in every case.
	 */
	private void closeCurrentOutput()
	{
		try {
			if (writer != null) {
				writer.flush();
				writer.close(); // does not close the underlying stream
			}
		} finally {
			writer = null;
			try {
				if (output != null) output.close();
			} finally {
				output = null;
			}
		}
	}

	/**
	 * Closes a resource, ignoring null values and errors; used for cleanup
	 * only, where a failure to close must not hide the result of the process.
	 *
	 * @param resource any object with a close() method, may be null
	 */
	private void closeQuietly(def resource)
	{
		if (resource == null) return;
		try {
			resource.close();
		} catch (Exception ignored) {
			// best effort cleanup
		}
	}

	/**
	 * write the current event.
	 *
	 * Start elements are written with their prefix, the namespaces they
	 * declare and their attributes; end elements, characters and comments are
	 * copied as they are. Other events are ignored.
	 *
	 * @param event the stax event
	 */
	private void writeEvent(int event)
	{
		def prefix = parser.getPrefix();
		if (event == XMLStreamConstants.START_ELEMENT) {
			def localname = parser.getLocalName(); // write element
			if (prefix != null && prefix.length() > 0)
				writer.writeStartElement(prefix+":"+localname);
			else
				writer.writeStartElement(localname);

			for (int i = 0 ; i < parser.getNamespaceCount() ; i++) { // write namespaces
				writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
			}

			String attrprefix; // write attributes
			for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
				attrprefix = parser.getAttributePrefix(i);
				// && and not & : length() must not be called on a null prefix
				if (attrprefix != null && attrprefix.length() > 0)
					writer.writeAttribute(attrprefix+":"+parser.getAttributeLocalName(i), parser.getAttributeValue(i))
				else
					writer.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i))
			}
		}
		else if (event == XMLStreamConstants.END_ELEMENT) {
			writer.writeEndElement();
		}
		else if (event == XMLStreamConstants.CHARACTERS) {
			writer.writeCharacters(parser.getText());
		}
		else if (event == XMLStreamConstants.COMMENT) {
			writer.writeComment(parser.getText());
		}
	}

	/**
	 * @return the files created by process(), in creation order
	 */
	public ArrayList<File> getFiles()
	{
		return newfiles;
	}

	/**
	 * Manual test: splits a corpus file by its "texte" elements, using their
	 * "id" attribute as file name.
	 *
	 * @param args not used
	 */
	public static void main(String[] args)
	{
		File xmlfile = new File("C:\\Documents and Settings\\mdecorde\\Bureau\\MISAT2011\\corpus\\tiempo\\ELTIEMPO.2002-2010.MIN.xml");
		File outdir = new File("C:\\Documents and Settings\\mdecorde\\Bureau\\MISAT2011\\corpus\\tiempo\\tiempo");
		outdir.deleteDir();
		outdir.mkdir();

		// NOTE: an earlier version first rewrote the input from ISO-8859-1 to UTF-8,
		// removing doubled quotes at the end of attribute values, and validated
		// the result before splitting; that step is disabled.

		def splitter = new SplitBy(xmlfile);
		if (splitter.process(outdir, "texte", "id"))
			println "success";
		else
			println "failed !"
	}
}