Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / SplitBy.groovy @ 1000

History | View | Annotate | Download (5.3 kB)

1
package org.txm.scripts.importer
2

    
3
import java.io.Writer;
4
import java.io.File;
5
import javax.xml.stream.*;
6
import java.net.URL;
7
import org.txm.importer.filters.*;
8

    
9
/**
 * Splits one XML file into several XML files: one output file is created per
 * occurrence of a given element (the "by" element). Each output file contains
 * that element and everything nested inside it; content located outside of any
 * "by" element is not copied.
 *
 * Usage: build the splitter with the source file, call
 * {@link #process(File, String, String)} once, then read the created files
 * with {@link #getFiles()}.
 *
 * Nested "by" elements (a "by" element inside another one) are not supported.
 */
class SplitBy
{
    /** Factory used to create one stream writer per output file. */
    XMLOutputFactory factory = XMLOutputFactory.newInstance();
    /** Stream of the output file currently being written, null when none is open. */
    FileOutputStream output;
    /** StAX writer wrapping {@link #output}, null when no output file is open. */
    XMLStreamWriter writer;
    /** StAX reader over the source file; stays null if the source could not be opened. */
    def parser;
    /** Output files created so far, in creation order. */
    ArrayList<File> newfiles = [];
    /** True while the parser is positioned inside a "by" element. */
    boolean inby = false;
    /** Raw stream of the source file, kept so that it can be closed after processing. */
    private InputStream inputData;

    /**
     * Opens the source file and creates the StAX reader.
     *
     * Failures are reported on standard output and not rethrown; in that case
     * {@link #process(File, String, String)} returns false.
     *
     * @param xmlfile the XML file to split
     */
    public SplitBy(File xmlfile)
    {
        try {
            inputData = xmlfile.toURI().toURL().openStream();
            // NOTE(review): the factory is used with its default settings; if the
            // source files can be untrusted, consider disabling DTD/external entities.
            def inputFactory = XMLInputFactory.newInstance();
            parser = inputFactory.createXMLStreamReader(inputData);
        } catch (XMLStreamException ex) {
            System.out.println(ex);
            closeInput(); // the stream was opened but no reader will ever own it
        } catch (IOException ex) {
            System.out.println("IOException while parsing " + xmlfile + ": " + ex);
        }
    }

    /**
     * Reads the source file and writes one file per "by" element into outdir.
     * The output file name is the value of the idAttribute attribute of the
     * "by" element followed by ".xml" (just ".xml" if the attribute is absent).
     *
     * The source can only be processed once: the reader and the input stream
     * are closed when this method returns.
     *
     * @param outdir the directory receiving the created files (created if needed)
     * @param by the local name of the element that starts a new output file
     * @param idAttribute the local name of the attribute holding the file name
     * @return true on success; false if the source could not be opened, if two
     *         "by" elements lead to the same file, or if any error occurs
     */
    public boolean process(File outdir, String by, String idAttribute)
    {
        if (parser == null) {
            // the constructor already reported why the source could not be read
            println "XML Error: the source file could not be opened";
            return false;
        }

        outdir.mkdirs();
        try {
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        if (parser.getLocalName() == by) {
                            if (!openNewFile(outdir, idAttribute))
                                return false;
                            inby = true;
                        }
                        // only copy what is inside a "by" element, so that each
                        // output file holds exactly one well-formed element
                        if (inby) {
                            writeEvent(event);
                            if (parser.getLocalName() == "TEI")
                                writeTeiNamespaces();
                        }
                        break;
                    case XMLStreamConstants.END_ELEMENT:
                        if (inby) {
                            writer.writeEndElement();
                            if (parser.getLocalName() == by) {
                                // the end tag of the "by" element has just been
                                // written: this output file is complete
                                inby = false;
                                closeCurrentFile();
                            }
                        }
                        break;
                    case XMLStreamConstants.CHARACTERS:
                        if (inby)
                            writer.writeCharacters(parser.getText());
                        break;
                }
            }
            return true;
        }
        catch (Exception e) {
            println "XML Error at location: " + parser.getLocation()
            e.printStackTrace()
            return false
        }
        finally {
            // release every resource whatever the outcome
            try {
                closeCurrentFile();
            } catch (Exception e) {
                println "Error while closing the output file: " + e
            }
            try {
                parser.close();
            } catch (Exception e) {
                println "Error while closing the XML parser: " + e
            }
            closeInput();
        }
    }

    /**
     * Creates the output file for the "by" element the parser is positioned on
     * and makes it the current output.
     *
     * @param outdir the directory receiving the file
     * @param idAttribute the local name of the attribute holding the file name
     * @return false if a file with the same name was already created, true otherwise
     */
    private boolean openNewFile(File outdir, String idAttribute)
    {
        // a previous file may still be open if its "by" element was not closed
        closeCurrentFile();

        String filename = "";
        for (int i = 0; i < parser.getAttributeCount(); i++) {
            if (parser.getAttributeLocalName(i) == idAttribute) {
                filename = parser.getAttributeValue(i);
                break;
            }
        }

        // NOTE(review): the attribute value is used as a file name without any
        // sanitizing (path separators, "..") — confirm that sources are trusted.
        File newfile = new File(outdir, filename + ".xml")
        if (newfiles.contains(newfile)) {
            println "Item declared twice "+filename+" at location "+parser.getLocation();
            return false;
        }
        newfiles.add(newfile);
        output = new FileOutputStream(newfile)
        writer = factory.createXMLStreamWriter(output, "UTF-8"); // create a new file
        return true;
    }

    /**
     * Closes the current output writer and stream, if any, and forgets them.
     * The stream is closed even if closing the writer fails.
     */
    private void closeCurrentFile()
    {
        try {
            if (writer != null)
                writer.close(); // does not close the underlying stream
        } finally {
            writer = null;
            try {
                if (output != null)
                    output.close();
            } finally {
                output = null;
            }
        }
    }

    /** Closes the source stream, ignoring failures (nothing useful can be done then). */
    private void closeInput()
    {
        try {
            if (inputData != null)
                inputData.close();
        } catch (IOException ignored) {
            // the stream was only read from: a close failure loses no data
        }
        inputData = null;
    }

    /**
     * Declares the "tei" and "txm" namespaces on the element just written,
     * unless the source element already declares the prefix (declaring it
     * twice would produce a duplicated xmlns attribute).
     */
    private void writeTeiNamespaces()
    {
        def declared = [];
        for (int i = 0; i < parser.getNamespaceCount(); i++)
            declared << parser.getNamespacePrefix(i);

        if (!declared.contains("tei"))
            writer.writeNamespace("tei", "http://www.tei-c.org/ns/1.0")
        if (!declared.contains("txm"))
            writer.writeNamespace("txm", "http://textometrie.org/1.0")
    }

    /**
     * Writes the current parser event to the current output writer.
     * Handles start elements (with their namespace declarations and
     * attributes), end elements, character data and comments; other events
     * are ignored.
     *
     * @param event the StAX event type the parser is positioned on
     */
    private void writeEvent(int event)
    {
        if (event == XMLStreamConstants.START_ELEMENT)
        {
            def prefix = parser.getPrefix();
            def localname = parser.getLocalName(); // write element
            if (prefix != null && prefix.length() > 0)
                writer.writeStartElement(prefix+":"+localname);
            else
                writer.writeStartElement(localname);

            for (int i = 0; i < parser.getNamespaceCount(); i++) // write namespaces
            {
                writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
            }

            String attrprefix; // write attributes
            for (int i = 0; i < parser.getAttributeCount(); i++)
            {
                attrprefix = parser.getAttributePrefix(i);
                // short-circuit "&&": the prefix may be null for unprefixed attributes
                if (attrprefix != null && attrprefix.length() > 0)
                    writer.writeAttribute(attrprefix+":"+parser.getAttributeLocalName(i), parser.getAttributeValue(i))
                else
                    writer.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i))
            }
        }
        else if (event == XMLStreamConstants.END_ELEMENT)
        {
            writer.writeEndElement();
        }
        else if (event == XMLStreamConstants.CHARACTERS)
        {
            writer.writeCharacters(parser.getText());
        }
        else if (event == XMLStreamConstants.COMMENT)
        {
            writer.writeComment(parser.getText());
        }
    }

    /**
     * @return the files created by {@link #process(File, String, String)},
     *         in creation order (empty before processing)
     */
    public ArrayList<File> getFiles()
    {
        return newfiles;
    }

    /**
     * Manual test entry point: splits a hard-coded corpus file by its "texte"
     * elements, using their "id" attribute as file name.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        File xmlfile = new File("C:\\Documents and Settings\\mdecorde\\Bureau\\MISAT2011\\corpus\\tiempo\\ELTIEMPO.2002-2010.MIN.xml");
        File okfile = new File("C:\\Documents and Settings\\mdecorde\\Bureau\\MISAT2011\\corpus\\tiempo\\");
        File outdir = new File("C:\\Documents and Settings\\mdecorde\\Bureau\\MISAT2011\\corpus\\tiempo\\tiempo");
        outdir.deleteDir(); // start from an empty output directory
        outdir.mkdir();

        // Disabled pre-processing step: re-encode the source from ISO-8859-1 to
        // UTF-8 into okfile and repair attributes ending with a doubled quote.
        //                Reader reader = new InputStreamReader(new FileInputStream(xmlfile) , "ISO-8859-1");
        //                Writer writer = new OutputStreamWriter(new FileOutputStream(okfile) , "UTF-8");
        //                String line = reader.readLine();
        //                while(line != null)
        //                {
        //                        line = line.replaceAll(' ([a-zA-Z0-9]+)="(.*)""', ' $1="$2"')
        //                        writer.write(line+"\n");
        //                        line = reader.readLine();
        //                }
        //                reader.close();
        //                writer.close();
        //
        //
        ////                if(!ValidateXml.test(okfile))
        ////                {
        ////                        println "XML not valid";
        ////                        return;
        ////                }

        def splitter = new SplitBy(xmlfile);
        if (splitter.process(outdir, "texte", "id"))
            println "success";
        else
            println "failed !"
    }
}