Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / SplitBy.groovy @ 1688

History | View | Annotate | Download (5.5 kB)

1
package org.txm.scripts.importer
2

    
3
import java.io.Writer;
4
import java.io.File;
5
import javax.xml.stream.*;
6
import java.net.URL;
7
import org.txm.importer.filters.*;
8

    
9
/**
 * Splits one XML file into several XML files, one per occurrence of a given
 * element (the "by" element).
 *
 * Each output file is named after the value of an identifying attribute of
 * the "by" element and contains that element with its whole content.
 * Content located outside of any "by" element is not copied.
 *
 * Nested "by" elements are not supported: when a "by" element starts inside
 * another one, the file of the outer element is closed as it is and a new
 * file is started for the inner element.
 */
class SplitBy
{
	/** Factory used to create one XML writer per output file. */
	XMLOutputFactory factory = XMLOutputFactory.newInstance();
	/** Stream of the output file currently being written, or null. */
	FileOutputStream output;
	/** XML writer wrapping {@link #output}, or null when no file is open. */
	XMLStreamWriter writer;
	/** StAX reader over the input file and the raw stream it reads from. */
	def parser, inputData;
	/** Files created so far, in creation order. */
	ArrayList<File> newfiles = [];
	/** True while the parser is inside a "by" element. */
	boolean inby = false;

	/**
	 * Opens the XML file to split.
	 *
	 * Errors are reported on standard output and leave the instance without
	 * a parser; {@link #process} then returns false.
	 *
	 * @param xmlfile the XML file to split
	 */
	public SplitBy(File xmlfile)
	{
		try {
			inputData = xmlfile.toURI().toURL().openStream();
			def inputFactory = XMLInputFactory.newInstance();
			parser = inputFactory.createXMLStreamReader(inputData);
		} catch (XMLStreamException ex) {
			System.out.println(ex);
			closeInput(); // do not leak the stream opened just above
		} catch (IOException ex) {
			System.out.println("IOException while parsing "+xmlfile+": "+ex);
			closeInput();
		}
	}

	/**
	 * Reads the whole input and writes one file per "by" element.
	 *
	 * The input is always closed when this method returns, so it can only be
	 * called once per instance.
	 *
	 * @param outdir the directory receiving the created files (created if missing)
	 * @param by the local name of the element to split on
	 * @param idAttribute the local name of the attribute of the "by" element
	 *        whose value gives the output file name (without ".xml")
	 * @return true on success; false if the input could not be opened, if two
	 *         "by" elements resolve to the same file, or if an error occurred
	 */
	public boolean process(File outdir, String by, String idAttribute)
	{
		if (parser == null) {
			println "Cannot split: the input file could not be opened";
			closeInput();
			return false;
		}

		outdir.mkdirs();
		try {
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						if (parser.getLocalName() == by) {
							// only reached with a file still open when "by" elements are nested
							closeOutput();

							String filename = "";
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								if (parser.getAttributeLocalName(i) == idAttribute) {
									filename = parser.getAttributeValue(i);
									break;
								}
							}

							File newfile = new File(outdir, filename+".xml");
							if (newfiles.contains(newfile)) {
								println "Item declared twice "+filename+" at location "+parser.getLocation();
								return false; // resources are released by the finally block
							}
							newfiles.add(newfile);
							output = new FileOutputStream(newfile);
							writer = factory.createXMLStreamWriter(output, "UTF-8"); // create a new file
							inby = true;
						}

						if (inby && writer != null) {
							writeEvent(event);
							if (parser.getLocalName() == "TEI") { // write namespaces
								// skip the ones the element already declares: writing
								// them twice would produce a duplicated attribute
								if (!declaresPrefix("tei"))
									writer.writeNamespace("tei", "http://www.tei-c.org/ns/1.0");
								if (!declaresPrefix("txm"))
									writer.writeNamespace("txm", "http://textometrie.org/1.0");
							}
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						if (inby && writer != null) {
							writer.writeEndElement();
						}
						if (parser.getLocalName() == by) {
							// the end tag of the "by" element has been written just
							// above: the file is complete, close it so that nothing
							// located after the element can leak into it
							inby = false;
							closeOutput();
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						if (inby && writer != null) {
							writer.writeCharacters(parser.getText());
						}
						break;
				}
			}
			closeOutput(); // inside the try: a failure to flush the last file is an error
			return true;
		} catch (Exception e) {
			println "XML Error at location: "+parser.getLocation();
			e.printStackTrace();
			return false;
		} finally {
			try {
				closeOutput();
			} catch (Exception ignored) {
				// an error has already been reported; this is best-effort cleanup
			}
			closeInput();
		}
	}

	/**
	 * Closes the XML writer and the file stream of the current output file, if any.
	 *
	 * The stream is closed explicitly because XMLStreamWriter.close() does not
	 * close the underlying stream.
	 */
	private void closeOutput()
	{
		try {
			if (writer != null) writer.close();
		} finally {
			writer = null;
			if (output != null) {
				try {
					output.close();
				} finally {
					output = null;
				}
			}
		}
	}

	/**
	 * Closes the parser and the input stream, reporting but not propagating errors.
	 */
	private void closeInput()
	{
		try {
			if (parser != null) parser.close();
		} catch (Exception e) {
			println "Error while closing the XML parser: "+e;
		}
		try {
			if (inputData != null) inputData.close();
		} catch (Exception e) {
			println "Error while closing the input stream: "+e;
		}
	}

	/**
	 * Tells if the current start element declares a namespace prefix.
	 *
	 * @param prefix the namespace prefix to look for
	 * @return true if the element the parser is positioned on declares this prefix
	 */
	private boolean declaresPrefix(String prefix)
	{
		for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
			if (prefix == parser.getNamespacePrefix(i)) return true;
		}
		return false;
	}

	/**
	 * Writes the current parser event to the current output file.
	 *
	 * Start elements are written with their namespace declarations and their
	 * attributes; end elements, character data and comments are copied as is.
	 * Other events are ignored.
	 *
	 * @param event the stax event
	 */
	private void writeEvent(int event)
	{
		def prefix = parser.getPrefix();
		if (event == XMLStreamConstants.START_ELEMENT) {
			def localname = parser.getLocalName(); // write element
			if (prefix != null && prefix.length() > 0)
				writer.writeStartElement(prefix+":"+localname);
			else
				writer.writeStartElement(localname);

			for (int i = 0 ; i < parser.getNamespaceCount() ; i++) { // write namespaces
				writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
			}

			String attrprefix; // write attributes
			for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
				attrprefix = parser.getAttributePrefix(i);
				// '&&' and not '&': the length must not be read on a null prefix
				if (attrprefix != null && attrprefix.length() > 0)
					writer.writeAttribute(attrprefix+":"+parser.getAttributeLocalName(i), parser.getAttributeValue(i));
				else
					writer.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
			}
		}
		else if (event == XMLStreamConstants.END_ELEMENT) {
			writer.writeEndElement();
		}
		else if (event == XMLStreamConstants.CHARACTERS) {
			writer.writeCharacters(parser.getText());
		}
		else if (event == XMLStreamConstants.COMMENT) {
			writer.writeComment(parser.getText());
		}
	}

	/**
	 * @return the files created by {@link #process}, in creation order; this is
	 *         the internal list, not a copy
	 */
	public ArrayList<File> getFiles()
	{
		return newfiles;
	}

	/**
	 * Manual test: splits a hard-coded corpus file on its "texte" elements,
	 * naming the created files after their "id" attribute.
	 *
	 * A former pre-processing step (re-encoding the file from ISO-8859-1 to
	 * UTF-8 and repairing doubled closing quotes in attributes) was disabled
	 * and has been removed from here.
	 *
	 * @param args not used
	 */
	public static void main(String[] args)
	{
		File xmlfile = new File("C:\\Documents and Settings\\mdecorde\\Bureau\\MISAT2011\\corpus\\tiempo\\ELTIEMPO.2002-2010.MIN.xml");
		File outdir = new File("C:\\Documents and Settings\\mdecorde\\Bureau\\MISAT2011\\corpus\\tiempo\\tiempo");
		outdir.deleteDir();
		outdir.mkdir();

		def splitter = new SplitBy(xmlfile);
		if (splitter.process(outdir, "texte", "id"))
			println "success";
		else
			println "failed !"
	}
}