Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / WordCounter.groovy @ 1688

History | View | Annotate | Download (5.5 kB)

1 881 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 881 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 881 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 881 mdecorde
// Sophia Antipolis, University of Paris 3.
5 881 mdecorde
//
6 881 mdecorde
// The TXM platform is free software: you can redistribute it
7 881 mdecorde
// and/or modify it under the terms of the GNU General Public
8 881 mdecorde
// License as published by the Free Software Foundation,
9 881 mdecorde
// either version 2 of the License, or (at your option) any
10 881 mdecorde
// later version.
11 881 mdecorde
//
12 881 mdecorde
// The TXM platform is distributed in the hope that it will be
13 881 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 881 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 881 mdecorde
// PURPOSE. See the GNU General Public License for more
16 881 mdecorde
// details.
17 881 mdecorde
//
18 881 mdecorde
// You should have received a copy of the GNU General
19 881 mdecorde
// Public License along with the TXM platform. If not, see
20 881 mdecorde
// http://www.gnu.org/licenses.
21 881 mdecorde
//
22 881 mdecorde
//
23 881 mdecorde
//
24 881 mdecorde
// $LastChangedDate: 2015-12-17 12:11:39 +0100 (jeu. 17 déc. 2015) $
25 881 mdecorde
// $LastChangedRevision: 3087 $
26 881 mdecorde
// $LastChangedBy: mdecorde $
27 881 mdecorde
//
28 1000 mdecorde
package org.txm.scripts.importer;
29 881 mdecorde
30 881 mdecorde
import java.io.BufferedReader;
31 881 mdecorde
import java.util.ArrayList;
32 881 mdecorde
import javax.xml.parsers.SAXParserFactory
33 881 mdecorde
import org.xml.sax.helpers.DefaultHandler
34 881 mdecorde
import org.xml.sax.*
35 881 mdecorde
import javax.xml.stream.*;
36 881 mdecorde
import java.net.URL;
37 881 mdecorde
38 881 mdecorde
// TODO: Auto-generated Javadoc
39 881 mdecorde
/**
 * SAX handler that copies an XML file while numbering the elements whose
 * qualified name matches {@code focus}.
 *
 * Each matching element receives an {@code n} attribute (its 1-based rank in
 * the document) and an {@code xml:id} attribute built as
 * {@code focus + txt + "_" + n}; any pre-existing {@code n}, {@code xml:id} or
 * {@code id} attribute of that element is dropped. Every other element and all
 * character data are copied unchanged. Only start tags, end tags and character
 * data are forwarded to the output.
 */
class WordCounter extends DefaultHandler {

    /** Text identifier inserted in each generated {@code xml:id}. */
    String txt;

    /** Pattern tested against element qualified names with {@link String#matches(String)}. */
    String focus;

    /** Number of focus elements met so far. */
    int counter = 0;

    /** The solotags. Neither read nor written inside this class. */
    ArrayList<String> solotags;

    /** StAX writer receiving the rewritten document. */
    XMLStreamWriter writer;

    /**
     * The constructor does all the work: it parses {@code infile}, writes the
     * numbered copy to a temporary file and then replaces {@code infile} with it.
     *
     * The temporary file is named {@code infile.getName() + "counter"} and is
     * created in the same directory as {@code infile}, so that the final rename
     * does not depend on the process working directory. If parsing or writing
     * fails, the exception is propagated, the temporary file is removed and
     * {@code infile} is left untouched.
     *
     * @param infile the XML file to process; replaced by the result
     * @param focus the pattern selecting the elements to number
     * @param txt the text identifier used to build the {@code xml:id} values
     */
    WordCounter(File infile, String focus, String txt)
    {
        this.focus = focus;
        this.txt = txt;

        String tmpName = infile.getName() + "counter";
        File parentDir = infile.getAbsoluteFile().getParentFile();
        // Avoid calling new File(null, name): with a null first argument Groovy
        // cannot choose between the (String, String) and (File, String) constructors.
        File outfile = (parentDir == null) ? new File(tmpName) : new File(parentDir, tmpName);

        def reader = SAXParserFactory.newInstance().newSAXParser();

        FileInputStream input = null;
        FileOutputStream output = null;
        boolean done = false;
        try {
            input = new FileInputStream(infile);
            output = new FileOutputStream(outfile);

            writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8");
            writer.writeStartDocument("UTF-8", "1.0");
            // Go through the writer (not the raw stream) so the line break is
            // emitted after the XML declaration, whatever the writer buffers.
            writer.writeCharacters("\n");

            reader.parse(input, this); // process !!

            writer.close();
            output.close();
            done = true;
        } finally {
            // Closing twice is harmless; this covers the failure path.
            closeQuietly(output);
            closeQuietly(input);
            if (!done) outfile.delete(); // do not leave a partial temporary file behind
        }

        if (!(infile.delete() && outfile.renameTo(infile))) println "Warning can't rename file "+outfile+" to "+infile
    }

    /**
     * Closes a stream, ignoring a null reference and any I/O error raised by close().
     *
     * @param resource the stream to close, may be null
     */
    private static void closeQuietly(Closeable resource)
    {
        if (resource == null) return;
        try {
            resource.close();
        } catch (IOException ignored) {
            // best effort: used only for cleanup
        }
    }

    /**
     * Writes the start tag of the current element. For a focus element the
     * counter is incremented and the {@code n} and {@code xml:id} attributes are
     * written first, then the original attributes except {@code n},
     * {@code xml:id} and {@code id}. For any other element all attributes are copied.
     *
     * @param ns the namespace URI
     * @param localName the local name of the current tag
     * @param qName the qualified name of the current tag
     * @param atts the attributes of the current tag
     */
    @Override
    void startElement(String ns, String localName, String qName, Attributes atts)
    {
        writer.writeStartElement(qName);

        boolean focused = qName.matches(focus);
        if (focused) {
            counter++;
            writer.writeAttribute("n", "" + counter);
            writer.writeAttribute("xml:id", focus + txt + "_" + counter);
        }

        for (int i = 0; i < atts.getLength(); i++) {
            String attName = atts.getQName(i);
            // the generated n / xml:id replace any numbering already present
            boolean replaced = focused && (attName == "n" || attName == "xml:id" || attName == "id");
            if (!replaced) {
                writer.writeAttribute(attName, atts.getValue(i));
            }
        }
    }

    /**
     * Copies character data to the output.
     *
     * @param chars the buffer holding the characters
     * @param offset where the text begins in the buffer
     * @param length the number of characters to copy
     */
    @Override
    void characters(char[] chars, int offset, int length)
    {
        writer.writeCharacters(chars, offset, length);
    }

    /**
     * Writes the end tag of the current element.
     *
     * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
     */
    @Override
    void endElement(String ns, String localName, String qName)
    {
        writer.writeEndElement();
    }

    /**
     * Finds the text identifier stored in the id of the first {@code focus}
     * element of a file. The id value is expected to look like
     * {@code focus + textId + "_" + number}: the first {@code focus.length()}
     * characters are removed and what precedes the first underscore is returned.
     *
     * @param infile the XML file to read
     * @param focus the local name of the element to look for
     * @return the text identifier; an empty string if the first focus element
     *         has no attribute whose local name is {@code id} (or if its value
     *         is shorter than the focus name); {@code null} if the file
     *         contains no focus element
     */
    public static String findTextId(File infile, String focus)
    {
        def inputData = infile.toURI().toURL().openStream();
        XMLStreamReader parser = null;
        try {
            parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                if (event != XMLStreamConstants.START_ELEMENT) continue;
                if (!parser.getLocalName().equals(focus)) continue;

                String sid = "";
                for (int i = 0; i < parser.getAttributeCount(); i++) {
                    if (parser.getAttributeLocalName(i) == "id") {
                        sid = parser.getAttributeValue(i); // xxx_000
                        // strip the focus name; never cut past the end of a short id
                        sid = sid.substring(Math.min(focus.length(), sid.length()));
                        // keep what precedes the first underscore
                        int cut = sid.indexOf("_");
                        if (cut >= 0) sid = sid.substring(0, cut);
                        break;
                    }
                }
                return sid;
            }
            return null; // no focus element in the document
        } finally {
            if (parser != null) {
                try {
                    parser.close();
                } catch (XMLStreamException ignored) {
                    // best effort: used only for cleanup
                }
            }
            closeQuietly(inputData);
        }
    }

    /**
     * Sample run: numbers the "s" then the "q" elements of
     * {@code xml/quote/processme-q.xml} located in the user's home directory.
     *
     * @param args the arguments (unused)
     */
    static void main(String[] args)
    {
        // "~" is a shell shortcut that the JVM does not expand: resolve the home directory explicitly
        File infile = new File(System.getProperty("user.home"), "xml/quote/processme-q.xml")
        new WordCounter(infile, "s", "01");
        new WordCounter(infile, "q", "01");
    }
}