root / tmp / org.txm.core / src / java / org / txm / scripts / importer / WordCounter.groovy @ 1688
History | View | Annotate | Download (5.5 kB)
1 | 881 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 881 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 881 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 881 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 881 | mdecorde | //
|
6 | 881 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 881 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 881 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 881 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 881 | mdecorde | // later version.
|
11 | 881 | mdecorde | //
|
12 | 881 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 881 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 881 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 881 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 881 | mdecorde | // details.
|
17 | 881 | mdecorde | //
|
18 | 881 | mdecorde | // You should have received a copy of the GNU General
|
19 | 881 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 881 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 881 | mdecorde | //
|
22 | 881 | mdecorde | //
|
23 | 881 | mdecorde | //
|
24 | 881 | mdecorde | // $LastChangedDate: 2015-12-17 12:11:39 +0100 (jeu. 17 déc. 2015) $
|
25 | 881 | mdecorde | // $LastChangedRevision: 3087 $
|
26 | 881 | mdecorde | // $LastChangedBy: mdecorde $
|
27 | 881 | mdecorde | //
|
28 | 1000 | mdecorde | package org.txm.scripts.importer;
|
29 | 881 | mdecorde | |
30 | 881 | mdecorde | import java.io.BufferedReader; |
31 | 881 | mdecorde | import java.util.ArrayList; |
32 | 881 | mdecorde | import javax.xml.parsers.SAXParserFactory |
33 | 881 | mdecorde | import org.xml.sax.helpers.DefaultHandler |
34 | 881 | mdecorde | import org.xml.sax.* |
35 | 881 | mdecorde | import javax.xml.stream.*; |
36 | 881 | mdecorde | import java.net.URL; |
37 | 881 | mdecorde | |
38 | 881 | mdecorde | // TODO: Auto-generated Javadoc
|
/**
 * SAX handler that copies an XML file while numbering every element whose
 * qualified name matches {@code focus}.
 *
 * Each matching element receives an {@code n} attribute (running counter) and
 * an {@code xml:id} attribute of the form {@code focus + txt + "_" + n}; any
 * pre-existing {@code n}, {@code xml:id} or {@code id} attribute on such an
 * element is dropped. All other elements, attributes and character data are
 * copied through unchanged.
 *
 * Note: {@code focus} is applied with {@link String#matches(String)} while
 * copying (so it is interpreted as a regular expression), but it is compared
 * literally in {@link #findTextId(File, String)}.
 */
class WordCounter extends DefaultHandler {

	/** The text identifier inserted between the focus name and the counter in generated ids. */
	String txt;

	/** The name (regular expression) of the elements to number. */
	String focus;

	/** The number of focus elements seen so far. */
	int counter = 0;

	/** The solotags (not used by the code in this class). */
	ArrayList<String> solotags;

	/** The writer receiving the rewritten document. */
	XMLStreamWriter writer;

	/**
	 * Does all the work: parses {@code infile}, writes the numbered copy to a
	 * temporary file next to it, then replaces {@code infile} with the result.
	 *
	 * If parsing or writing fails, the exception is propagated, the temporary
	 * file is removed and {@code infile} is left untouched.
	 *
	 * @param infile the XML file to rewrite in place
	 * @param focus the name of the elements to number
	 * @param txt the text identifier to embed in the generated ids
	 */
	WordCounter(File infile, String focus, String txt)
	{
		this.focus = focus;
		this.txt = txt;

		// Keep the temporary file beside infile: renameTo() is not guaranteed
		// to work across filesystems, and the working directory may be elsewhere.
		File outfile = new File(infile.getPath() + "counter");

		FileInputStream input = null;
		FileOutputStream output = null;
		try {
			try {
				input = new FileInputStream(infile);
				output = new FileOutputStream(outfile);

				writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8");
				writer.writeStartDocument("UTF-8", "1.0");
				// Go through the writer (not the raw stream) so the line break
				// cannot be reordered relative to buffered writer output.
				writer.writeCharacters("\n");

				def reader = SAXParserFactory.newInstance().newSAXParser();
				reader.parse(input, this); // process !!

				writer.writeEndDocument();
				writer.flush();
			} finally {
				// Release everything even when the parse fails.
				try {
					if (writer != null) writer.close();
				} finally {
					try {
						if (output != null) output.close();
					} finally {
						if (input != null) input.close();
					}
				}
			}
		} catch (Throwable failure) {
			// Do not leave a half-written temporary file behind.
			outfile.delete();
			throw failure;
		}

		if (!(infile.delete() && outfile.renameTo(infile))) println "Warning can't rename file "+outfile+" to "+infile
	}

	/**
	 * Copies a start tag to the output, numbering it when it matches the focus.
	 *
	 * @param ns the namespace URI (unused)
	 * @param localName the local name of the current tag (unused)
	 * @param qName the qualified name of the current tag
	 * @param atts the attributes of the current tag
	 */
	void startElement(String ns, String localName, String qName, Attributes atts) {
		writer.writeStartElement(qName);

		boolean focused = qName.matches(focus);
		if (focused) {
			counter++;
			writer.writeAttribute("n", "" + counter);
			writer.writeAttribute("xml:id", focus + txt + "_" + counter);
		}

		for (int i = 0; i < atts.getLength(); i++) {
			String name = atts.getQName(i);
			// The generated n / xml:id replace any numbering or id already present.
			if (focused && (name == "n" || name == "xml:id" || name == "id")) continue;
			writer.writeAttribute(name, atts.getValue(i));
		}
	}

	/**
	 * Copies character data to the output.
	 *
	 * @param chars the buffer holding the characters
	 * @param offset where the text begins in the buffer
	 * @param length the number of characters to copy
	 */
	void characters(char[] chars, int offset, int length)
	{
		writer.writeCharacters(chars, offset, length);
	}

	/**
	 * Closes the element opened by the matching {@code startElement} call.
	 *
	 * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
	 */
	void endElement(String ns, String localName, String qName)
	{
		writer.writeEndElement();
	}

	/**
	 * Finds the text identifier encoded in the id of the first {@code focus}
	 * element of a file, i.e. the part of the id located between the focus
	 * name and the first {@code "_"}.
	 *
	 * @param infile the XML file to read
	 * @param focus the local name of the element to look for
	 * @return the text identifier; an empty string if the first focus element
	 *         has no id attribute (or an id too short to hold one);
	 *         {@code null} if the file contains no focus element
	 */
	public static String findTextId(File infile, String focus)
	{
		InputStream inputData = null;
		XMLStreamReader parser = null;
		try {
			inputData = infile.toURI().toURL().openStream();
			parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);

			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				if (event != XMLStreamConstants.START_ELEMENT || parser.getLocalName() != focus) continue;

				// Only the first focus element is inspected.
				for (int i = 0; i < parser.getAttributeCount(); i++) {
					if (parser.getAttributeLocalName(i) == "id") {
						return extractTextId(parser.getAttributeValue(i), focus); // xxx_000
					}
				}
				return "";
			}
			return null;
		} finally {
			try {
				if (parser != null) parser.close();
			} finally {
				if (inputData != null) inputData.close();
			}
		}
	}

	/** Strips the leading focus name and the trailing "_counter" part from an id value. */
	private static String extractTextId(String id, String focus)
	{
		// Remove the focus name; an id shorter than the focus name carries no text id.
		String rest = id.length() >= focus.length() ? id.substring(focus.length()) : "";
		int sep = rest.indexOf("_");
		return sep < 0 ? rest : rest.substring(0, sep);
	}

	/**
	 * The main method: numbers the "s" then the "q" elements of a sample file
	 * located under the user's home directory.
	 *
	 * @param args the arguments (unused)
	 */
	static void main(String[] args)
	{
		// The JVM does not expand "~"; resolve the home directory explicitly.
		File infile = new File(System.getProperty("user.home"), "xml/quote/processme-q.xml")
		new WordCounter(infile, "s","01");
		new WordCounter(infile, "q","01");
	}
}