Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / teitxm / InlineCleaner.groovy @ 1000

History | View | Annotate | Download (5.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2016-03-29 09:51:35 +0200 (mar. 29 mars 2016) $
25
// $LastChangedRevision: 3185 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts.scripts.teitxm;
29

    
30
import java.text.DateFormat;
31
import java.util.Date;
32
import java.util.ArrayList;
33
import javax.xml.stream.*;
34
import java.net.URL;
35
import org.txm.importer.scripts.filters.*;
36

    
37
// TODO: Auto-generated Javadoc
38
/**
39
 * The Class InlineCleaner.
40
 *
41
 * @author mdecorde
42
 * Removes all inline ana tags from an XML-TEI-TXM file; it assumes you
 * already have the stand-off version.
44
 */
45

    
46
public class InlineCleaner {

	/** The URL of the source XML-TEI-TXM file. */
	private def url;

	/** The input stream opened on the source URL. */
	private def inputData;

	/** The StAX input factory used to create the parser. */
	private def factory;

	/** The StAX parser reading the source file. */
	private XMLStreamReader parser;

	/** The writer receiving the cleaned XML (UTF-8). */
	private def output;

	/** Local names of the elements that are written as self-closing tags. */
	ArrayList<String> solotags;

	/**
	 * Instantiates a new inline cleaner and immediately runs the
	 * transformation: the file at {@code url} is read and its cleaned
	 * version is written to {@code outfile}.
	 *
	 * XML and I/O errors are reported on standard output and are not
	 * rethrown.
	 *
	 * @param url the URL of the XML file to clean
	 * @param outfile the file the cleaned XML is written to
	 * @param solotags local names of the elements to write as self-closing
	 *        tags; {@code null} is treated as an empty list
	 */
	public InlineCleaner(URL url, File outfile, ArrayList<String> solotags) {
		try {
			this.url = url;
			this.solotags = (solotags != null) ? solotags : new ArrayList<String>();
			inputData = url.openStream();
			factory = XMLInputFactory.newInstance();
			// Coalescing delivers each text node as one CHARACTERS event (CDATA
			// included), so trim() is applied once per text node and not to
			// arbitrary chunks split at entity references or buffer limits.
			// NOTE(review): DTD / external entity handling is left at the
			// factory defaults - harden it if untrusted files can be cleaned.
			factory.setProperty(XMLInputFactory.IS_COALESCING, true);

			parser = factory.createXMLStreamReader(inputData);

			this.transformFile(outfile);

		} catch (XMLStreamException ex) {
			System.out.println(ex);
		} catch (IOException ex) {
			System.out.println("IOException while parsing " + url + ": " + ex);
		} finally {
			// XMLStreamReader.close() does not close the underlying stream
			if (inputData != null) {
				try {
					inputData.close();
				} catch (IOException ignored) {
					// nothing useful can be done if closing the input fails
				}
			}
		}
	}

	/**
	 * Creates the output writer (UTF-8) on the given file.
	 *
	 * @param outfile the outfile
	 * @return true, if successful (failures are signalled by an exception)
	 */
	private boolean createOutput(File outfile) {
		output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
		return true;
	}

	/**
	 * Copies the parsed document to {@code outfile}, leaving out the
	 * {@code ana} elements found inside {@code w} elements together with
	 * their text. Elements listed in {@link #solotags} are written as
	 * self-closing tags. The output writer and the parser are closed when
	 * the method returns, whether it succeeds or not.
	 *
	 * @param outfile the outfile
	 * @return true, if successful
	 */
	public boolean transformFile(File outfile) {
		boolean shouldwrite = true; // false while inside a skipped <ana>
		boolean isW = false; // true while inside a <w>
		String lastopenlocalname = "";

		if (!createOutput(outfile))
			return false;

		try {
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				if (event == XMLStreamConstants.START_ELEMENT) {
					String localname = parser.getLocalName();
					if (localname.equals("w")) {
						isW = true;
					}

					if (localname.equals("ana") && isW) {
						shouldwrite = false;
					} else {
						lastopenlocalname = localname;
						output.write("\n<" + qualifier(parser.getPrefix()) + localname);
						if (localname.equals("TEI"))
							output.write(" xmlns:txm=\"http://textometrie.ens-lyon.fr/1.0\"");
						// NOTE(review): attribute prefixes are dropped (xml:id is
						// written as id), as before - confirm this is intended.
						for (int i = 0; i < parser.getAttributeCount(); i++) {
							output.write(" " + parser.getAttributeLocalName(i) + "=\""
									+ escape(parser.getAttributeValue(i), true) + "\"");
						}
						output.write(solotags.contains(localname) ? "/>" : ">");
					}
				} else if (event == XMLStreamConstants.END_ELEMENT) {
					String localname = parser.getLocalName();
					if (localname.equals("ana") && isW) {
						// end of a skipped <ana>: resume writing text
						shouldwrite = true;
					} else {
						// an <ana> outside <w> was opened in the output, so it
						// is closed here like any other element
						if (localname.equals("w")) {
							isW = false;
						}
						if (!solotags.contains(localname)) {
							String closing = "</" + qualifier(parser.getPrefix()) + localname + ">";
							// keep the closing tag on the same line when the
							// element has no child element
							if (lastopenlocalname.equals(localname))
								output.write(closing);
							else
								output.write("\n" + closing);
						}
					}
				} else if (event == XMLStreamConstants.CHARACTERS) {
					if (shouldwrite) {
						output.write(escape(parser.getText().trim(), false));
					}
				}
			}
		} finally {
			try {
				output.close();
			} finally {
				parser.close();
			}
		}
		return true;
	}

	/**
	 * Builds the "prefix:" part of a qualified name.
	 *
	 * @param prefix the prefix reported by the parser, may be null or empty
	 * @return the prefix followed by ":", or "" when there is no prefix
	 */
	private static String qualifier(String prefix) {
		return (prefix == null || prefix.isEmpty()) ? "" : prefix + ":";
	}

	/**
	 * Escapes the XML special characters of a text or attribute value. The
	 * parser returns unescaped values, so they must be escaped again before
	 * being written.
	 *
	 * @param value the raw value, may be null
	 * @param inAttribute true to also escape double quotes
	 * @return the escaped value, "" when value is null
	 */
	private static String escape(String value, boolean inAttribute) {
		if (value == null)
			return "";
		String escaped = value.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
		if (inAttribute)
			escaped = escaped.replace("\"", "&quot;");
		return escaped;
	}

	/**
	 * The main method: cleans a sample file located under the user's home
	 * directory.
	 *
	 * @param args the arguments (unused)
	 */
	public static void main(String[] args) {

		// "~" is a shell shortcut that java.io.File does not expand
		File rootDir = new File(System.getProperty("user.home"), "xml/rgaqcj");
		File cleanerDir = new File(rootDir, "cleaner");
		cleanerDir.mkdirs();

		ArrayList<String> milestones = new ArrayList<String>();// the tags that stay milestones
		milestones.add("tagUsage");
		milestones.add("pb");
		milestones.add("lb");
		milestones.add("catRef");

		File srcfile = new File(new File(rootDir, "anainline"), "roland-ana.xml");
		File cleanfile = new File(cleanerDir, "roland-off.xml");

		System.out.println("clean file " + srcfile);
		// the constructor performs the whole transformation
		new InlineCleaner(srcfile.toURI().toURL(), cleanfile, milestones);
		return;
	}

}