Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / WordCounter.groovy @ 187

History | View | Annotate | Download (5.4 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2015-12-17 12:11:39 +0100 (Thu, 17 Dec 2015) $
25
// $LastChangedRevision: 3087 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer;
29

    
30
import java.io.BufferedReader;
31
import java.util.ArrayList;
32
import javax.xml.parsers.SAXParserFactory
33
import org.xml.sax.helpers.DefaultHandler
34
import org.xml.sax.*
35
import javax.xml.stream.*;
36
import java.net.URL;
37

    
38
// TODO: Auto-generated Javadoc
39
/**
 * SAX handler that copies an XML file while numbering the elements whose
 * qualified name matches {@code focus}.
 *
 * Every matching element receives an {@code n} attribute (its 1-based rank in
 * the document) and an {@code xml:id} attribute built as
 * {@code focus + txt + "_" + n}. All other elements, attributes and character
 * data are copied unchanged.
 */
class WordCounter extends DefaultHandler {

    /** Text identifier inserted in the generated {@code xml:id} values. */
    String txt;

    /**
     * Name of the element to number. It is tested with {@link String#matches},
     * so it is interpreted as a regular expression over the qualified name.
     */
    String focus;

    /** Number of focused elements met so far. */
    int counter = 0;

    /** The solotags (not used by the code of this class). */
    ArrayList<String> solotags;

    /** Writer producing the rewritten document. */
    XMLStreamWriter writer;

    /**
     * The constructor does the whole work: it parses {@code infile}, writes the
     * numbered copy to a temporary file and then replaces {@code infile} with it.
     *
     * If the final replacement fails a warning is printed; if parsing or writing
     * fails the exception is propagated, the temporary file is removed and
     * {@code infile} is left untouched.
     *
     * @param infile the XML file to process; it is replaced by the result
     * @param focus the name of the element to number
     * @param txt the text identifier inserted in the generated ids
     */
    WordCounter(File infile, String focus, String txt)
    {
        this.focus = focus;
        this.txt = txt;

        // The temporary file is created next to infile (not in the current
        // working directory) so that the final rename stays in one directory.
        File outfile = new File(infile.getAbsoluteFile().getParentFile(), infile.getName()+"counter");

        def reader = SAXParserFactory.newInstance().newSAXParser();
        FileInputStream input = null;
        FileOutputStream output = null;
        boolean parsed = false;
        try {
            input = new FileInputStream(infile);
            output = new FileOutputStream(outfile);
            writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8");
            writer.writeStartDocument("UTF-8", "1.0");
            // The line break goes through the writer so that it cannot be
            // emitted before the XML declaration.
            writer.writeCharacters("\n");

            reader.parse(input, this); // process !!

            writer.flush();
            parsed = true;
        } finally {
            // The writer and the stream it writes to are closed separately,
            // each in its own finally so one failure does not leak the others.
            try {
                if (writer != null) writer.close();
            } finally {
                try {
                    if (output != null) output.close();
                } finally {
                    try {
                        if (input != null) input.close();
                    } finally {
                        // do not leave a partial result behind
                        if (!parsed) outfile.delete();
                    }
                }
            }
        }

        if (!(infile.delete() && outfile.renameTo(infile))) println "Warning can't rename file "+outfile+" to "+infile
    }

    /**
     * Copies a start tag. When the element is a focused one, its {@code n} and
     * {@code xml:id} attributes are generated and any existing {@code n},
     * {@code xml:id} or {@code id} attribute is dropped.
     *
     * @param ns the namespace URI
     * @param localName the local name of the current tag
     * @param qName the qualified name of the current tag
     * @param atts the attributes of the current tag
     */
    @Override
    void startElement(String ns, String localName, String qName, Attributes atts)
    {
        writer.writeStartElement(qName)

        boolean focused = qName.matches(focus)
        if (focused) {
            counter++;
            writer.writeAttribute("n", ""+counter)
            writer.writeAttribute("xml:id", focus+txt+"_"+counter)
        }

        for (int i = 0; i < atts.getLength(); i++) {
            String name = atts.getQName(i)
            // Groovy '==' compares String contents
            boolean regenerated = (name == "n" || name == "xml:id" || name == "id")
            if (focused && regenerated) continue; // replaced by the generated values
            writer.writeAttribute(name, atts.getValue(i));
        }
    }

    /**
     * Copies character data to the output.
     *
     * @param chars the buffer holding the characters
     * @param offset where the text begins in the buffer
     * @param length the number of characters to copy
     */
    @Override
    void characters(char[] chars, int offset, int length)
    {
        writer.writeCharacters(chars, offset, length)
    }

    /**
     * Closes the element currently open in the output.
     *
     * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
     */
    @Override
    void endElement(String ns, String localName, String qName)
    {
        writer.writeEndElement();
    }

    /**
     * Finds the text id stored in the id of the first {@code focus} element of
     * a file: the first {@code focus.length()} characters of the id are removed
     * and the result is cut at the first underscore, e.g. for focus "s" the id
     * "s01_12" gives "01".
     *
     * @param infile the XML file to read
     * @param focus the local name of the element to look for
     * @return the text id; an empty string if the first focus element has no
     *         attribute whose local name is "id"; null if the file contains no
     *         focus element
     */
    public static String findTextId(File infile, String focus)
    {
        def inputData = infile.toURI().toURL().openStream();
        XMLStreamReader parser = null;
        try {
            parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                if (event != XMLStreamConstants.START_ELEMENT) continue;
                if (parser.getLocalName() != focus) continue;

                String sid = ""
                for (int i = 0; i < parser.getAttributeCount(); i++) {
                    if (parser.getAttributeLocalName(i) == "id") {
                        sid = parser.getAttributeValue(i); //xxx_000
                        // remove the focus name; an id shorter than it yields ""
                        sid = sid.substring(Math.min(focus.length(), sid.length()));
                        int underscore = sid.indexOf("_");
                        if (underscore >= 0) sid = sid.substring(0, underscore);
                        break;
                    }
                }
                return sid;
            }
            return null; // no focus element in the document
        } finally {
            // released on every path: found, not found, or parse error
            try {
                if (parser != null) parser.close();
            } finally {
                inputData.close();
            }
        }
    }

    /**
     * The main method: numbers the "s" then the "q" elements of a sample file
     * located in the user's home directory.
     *
     * @param args the arguments (not used)
     */
    static void main(String[] args)
    {
        // java.io.File does not expand "~", so the home directory is resolved explicitly
        File infile = new File(System.getProperty("user.home"), "xml/quote/processme-q.xml")
        new WordCounter(infile, "s", "01");
        new WordCounter(infile, "q", "01");
    }
}