Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / importer / scripts / xmltxm / InlineCleaner.groovy @ 1688

History | View | Annotate | Download (5.1 kB)

1 986 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 986 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 986 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 986 mdecorde
// Sophia Antipolis, University of Paris 3.
5 986 mdecorde
//
6 986 mdecorde
// The TXM platform is free software: you can redistribute it
7 986 mdecorde
// and/or modify it under the terms of the GNU General Public
8 986 mdecorde
// License as published by the Free Software Foundation,
9 986 mdecorde
// either version 2 of the License, or (at your option) any
10 986 mdecorde
// later version.
11 986 mdecorde
//
12 986 mdecorde
// The TXM platform is distributed in the hope that it will be
13 986 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 986 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 986 mdecorde
// PURPOSE. See the GNU General Public License for more
16 986 mdecorde
// details.
17 986 mdecorde
//
18 986 mdecorde
// You should have received a copy of the GNU General
19 986 mdecorde
// Public License along with the TXM platform. If not, see
20 986 mdecorde
// http://www.gnu.org/licenses.
21 986 mdecorde
//
22 986 mdecorde
//
23 986 mdecorde
//
24 986 mdecorde
// $LastChangedDate: 2016-03-29 09:51:35 +0200 (mar. 29 mars 2016) $
25 986 mdecorde
// $LastChangedRevision: 3185 $
26 986 mdecorde
// $LastChangedBy: mdecorde $
27 986 mdecorde
//
28 986 mdecorde
package org.txm.importer.scripts.xmltxm
29 986 mdecorde
30 986 mdecorde
import java.text.DateFormat;
31 986 mdecorde
import java.util.Date;
32 986 mdecorde
import java.util.ArrayList;
33 986 mdecorde
import javax.xml.stream.*;
34 986 mdecorde
import java.net.URL;
35 986 mdecorde
import org.txm.importer.filters.*;
36 986 mdecorde
37 986 mdecorde
// TODO: Auto-generated Javadoc
38 986 mdecorde
/**
39 986 mdecorde
 * The Class InlineCleaner.
40 986 mdecorde
 *
41 986 mdecorde
 * @author mdecorde
42 986 mdecorde
 * Removes all ana tags from an XML-TEI-TXM file; it assumes you already have
43 986 mdecorde
 * the stand-off version of the annotations.
44 986 mdecorde
 */
45 986 mdecorde
46 986 mdecorde
public class InlineCleaner {

        /** The url of the source document. */
        private def url;

        /** The input stream opened on the url. */
        private def inputData;

        /** The StAX factory used to create the parser. */
        private def factory;

        /** The StAX parser reading the input stream. */
        private XMLStreamReader parser;

        /** The writer on the cleaned output file (created by createOutput). */
        private def output;

        /** Local names of the elements that are written as self-closing tags. */
        ArrayList<String> solotags;

        /**
         * Instantiates a new inline cleaner and immediately runs the
         * transformation of <code>url</code> into <code>outfile</code>.
         *
         * XML and I/O errors are reported on standard output and not rethrown;
         * every stream opened here is closed before the constructor returns.
         *
         * @param url the url of the document to clean
         * @param outfile the file receiving the cleaned document
         * @param solotags local names of the elements to write as self-closing
         *            tags; <code>null</code> is treated as an empty list
         */
        public InlineCleaner(URL url, File outfile, ArrayList<String> solotags) {
                try {
                        this.url = url;
                        this.solotags = (solotags != null) ? solotags : new ArrayList<String>();
                        inputData = url.openStream();

                        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
                        // Deliver each text node as ONE event: transformFile trims every
                        // CHARACTERS event, so split chunks (e.g. around "&amp;") would
                        // otherwise lose their inner spaces, and CDATA would be dropped.
                        inputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
                        factory = inputFactory;

                        parser = factory.createXMLStreamReader(inputData);

                        this.transformFile(outfile);

                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                } catch (IOException ex) {
                        System.out.println("IOException while parsing " + url + ": " + ex);
                } finally {
                        // covers the failures that happen before transformFile is reached
                        closeResources();
                }
        }

        /**
         * Creates the UTF-8 writer on the output file.
         *
         * @param outfile the outfile
         * @return true, if successful; false if the file could not be opened
         */
        private boolean createOutput(File outfile) {
                try {
                        output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
                        return true;
                } catch (IOException ex) {
                        System.out.println("Cannot create output file " + outfile + ": " + ex);
                        return false;
                }
        }

        /**
         * Copies the parsed document to <code>outfile</code>, dropping every
         * <code>ana</code> element (tags and text) found inside a <code>w</code>
         * element. Elements listed in <code>solotags</code> are written as
         * self-closing tags, and the <code>txm</code> namespace declaration is
         * added on the <code>TEI</code> element. Text nodes are trimmed.
         *
         * The parser, the input stream and the output writer are always closed
         * when this method returns, whether it succeeds or not.
         *
         * @param outfile the outfile
         * @return true, if successful; false if the output could not be created
         */
        public boolean transformFile(File outfile) {
                boolean shouldwrite = true; // false while inside an <ana> that is being removed
                boolean isW = false;        // true while inside a <w> element
                String lastopenlocalname = "";

                try {
                        if (!createOutput(outfile))
                                return false;

                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event == XMLStreamConstants.START_ELEMENT) {
                                        String localname = parser.getLocalName();
                                        if ("w".equals(localname))
                                                isW = true;

                                        if ("ana".equals(localname) && isW) {
                                                shouldwrite = false;
                                        } else {
                                                lastopenlocalname = localname;
                                                output.write("\n<" + elementPrefix() + localname);
                                                if ("TEI".equals(localname))
                                                        output.write(" xmlns:txm=\"http://textometrie.ens-lyon.fr/1.0\"");
                                                // NOTE(review): attribute prefixes are dropped here, as in
                                                // the previous version (xml:id is written as id) - confirm
                                                // this is what the downstream importers expect.
                                                for (int i = 0; i < parser.getAttributeCount(); i++) {
                                                        output.write(" " + parser.getAttributeLocalName(i) + "=\""
                                                                        + escapeXml(parser.getAttributeValue(i), true) + "\"");
                                                }
                                                output.write(solotags.contains(localname) ? "/>" : ">");
                                        }
                                } else if (event == XMLStreamConstants.END_ELEMENT) {
                                        String localname = parser.getLocalName();
                                        if ("ana".equals(localname) && isW) {
                                                // end of a removed <ana>: resume copying text
                                                shouldwrite = true;
                                        } else {
                                                // an <ana> outside of <w> had its start tag written, so
                                                // its end tag must be written too
                                                if ("w".equals(localname))
                                                        isW = false;
                                                if (!solotags.contains(localname)) {
                                                        // keep the end tag on the same line when it closes
                                                        // the element that was opened last
                                                        String lead = lastopenlocalname.equals(localname) ? "" : "\n";
                                                        output.write(lead + "</" + elementPrefix() + localname + ">");
                                                }
                                        }
                                } else if (event == XMLStreamConstants.CHARACTERS) {
                                        if (shouldwrite)
                                                output.write(escapeXml(parser.getText().trim(), false));
                                }
                        }
                        // flush here so that a write failure is reported to the caller
                        // instead of being hidden by the best-effort close below
                        output.flush();
                        return true;
                } finally {
                        closeResources();
                }
        }

        /**
         * Builds the "prefix:" part of the current element name.
         *
         * @return the prefix followed by a colon, or an empty string when the
         *         parser reports no prefix (null or empty)
         */
        private String elementPrefix() {
                String prefix = parser.getPrefix();
                return (prefix == null || prefix.isEmpty()) ? "" : prefix + ":";
        }

        /**
         * Escapes the XML markup characters of a text.
         *
         * @param text the raw text, may be null
         * @param inAttribute true to also escape double quotes
         * @return the escaped text, empty if <code>text</code> is null
         */
        private static String escapeXml(String text, boolean inAttribute) {
                if (text == null)
                        return "";
                String escaped = text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
                if (inAttribute)
                        escaped = escaped.replace("\"", "&quot;");
                return escaped;
        }

        /**
         * Closes the writer, the parser and the input stream, ignoring the
         * ones that were never opened. Closing failures are only reported.
         */
        private void closeResources() {
                try {
                        if (output != null)
                                output.close();
                } catch (IOException ex) {
                        System.out.println("Failed to close the output: " + ex);
                }
                try {
                        if (parser != null)
                                parser.close();
                } catch (XMLStreamException ex) {
                        System.out.println("Failed to close the parser: " + ex);
                }
                try {
                        if (inputData != null)
                                inputData.close();
                } catch (IOException ex) {
                        System.out.println("Failed to close the input: " + ex);
                }
        }

        /**
         * The main method: cleans a sample file located under the user home.
         *
         * @param args the arguments (unused)
         */
        public static void main(String[] args) {

                // "~" is a shell shortcut that java.io.File does not expand
                String rootDir = System.getProperty("user.home") + "/xml/rgaqcj/";
                new File(rootDir + "/cleaner/").mkdirs();

                ArrayList<String> milestones = new ArrayList<String>();// the tags who stay milestones
                milestones.add("tagUsage");
                milestones.add("pb");
                milestones.add("lb");
                milestones.add("catRef");

                File srcfile = new File(rootDir + "/anainline/", "roland-ana.xml");
                File cleanfile = new File(rootDir + "/cleaner/", "roland-off.xml");

                System.out.println("clean file " + srcfile);
                new InlineCleaner(srcfile.toURI().toURL(), cleanfile, milestones);
        }

}