Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / filters / Tokeniser / OneTagPerLine.groovy @ 479

History | View | Annotate | Download (7.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $
27
//
28
package filters.Tokeniser;
29

    
30
import java.text.DateFormat;
31
import java.util.Date;
32
import java.util.ArrayList;
33
import javax.xml.stream.*;
34
import java.net.URL;
35
import org.txm.importer.filters.*;
36
import java.net.URLEncoder;
37

    
38
// TODO: Auto-generated Javadoc
39
/**
 * Used by the tokenizer: rewrites an XML document so that each tag sits on
 * its own line.
 * <p>
 * Only START_ELEMENT, END_ELEMENT and CHARACTERS events are copied to the
 * output; every other event reported by the parser is ignored. Everything
 * found between the start and the end of a {@code w} element is kept on the
 * same line as that element.
 * <p>
 * NOTE(review): only the attributes reported by the parser are copied; with a
 * namespace-aware reader the xmlns declarations are presumably not among
 * them, so prefixed names may be written without their declaration - confirm
 * whether the consumers of the output need them.
 *
 * @author mdecorde
 */
public class OneTagPerLine {

        /** The url of the XML document to read. */
        private URL url;

        /** The stream opened on {@link #url}. */
        private InputStream inputData;

        /** The factory used to create {@link #parser}. */
        private XMLInputFactory factory;

        /** The parser reading {@link #inputData}; null if the constructor failed. */
        private XMLStreamReader parser;

        /** The writer currently receiving the result. */
        private Writer output;

        /** Local names of the elements written as empty tags ({@code <x/>}). */
        ArrayList<String> solotags;

        /** True while the parser is between the start and the end of a {@code w} element. */
        boolean wordtag = false;

        /**
         * Instantiates a new one tag per line.
         * <p>
         * Failures while opening the url or creating the parser are reported on
         * standard output and leave the instance without parser; in that case
         * {@link #process(File)} returns false.
         *
         * @param url the url of the XML document to read
         * @param solotags the local names of the elements to write as empty tags
         */
        public OneTagPerLine(URL url, ArrayList<String> solotags) {
                this.url = url;
                this.solotags = solotags;
                try {
                        inputData = url.openStream();
                        factory = XMLInputFactory.newInstance();
                        factory.setProperty(XMLInputFactory.IS_VALIDATING, false);
                        parser = factory.createXMLStreamReader(inputData);
                } catch (XMLStreamException ex) {
                        System.out.println("XMLStreamException while parsing " + url + ": " + ex);
                        // the stream was opened but no parser will ever consume it
                        closeQuietly(inputData);
                } catch (IOException ex) {
                        System.out.println("IOException while parsing " + url + ": " + ex);
                }
        }

        /**
         * Creates the output.
         *
         * @param outfile the file to write, encoded in UTF-8
         * @return true, if successful
         */
        private boolean createOutput(File outfile) {
                try {
                        output = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
                        return true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        return false;
                }
        }

        /**
         * Moves the parser to its next event, retrying when the parser fails.
         * <p>
         * A failing call is retried while the attempt counter is not greater
         * than 10; after that, one last call is made outside of any handler so
         * that its exception reaches the caller.
         *
         * @param n the initial value of the attempt counter
         * @return the type of the next event, as returned by the parser
         */
        public def readNext(int n) {
                int attempt = n;
                while (true) {
                        try {
                                return parser.next();
                        } catch (Exception e) {
                                if (attempt > 10) {
                                        return parser.next();
                                }
                                attempt++;
                        }
                }
        }

        /**
         * Reads the whole document and writes it to outfile, one tag per line,
         * then removes the empty lines from the result.
         * <p>
         * The parser and the input stream are closed when this method has
         * started reading, whatever the outcome. If the output file cannot be
         * created nothing is read and the input is left open.
         *
         * @param outfile the file to write
         * @return true if the document was read and written, false if the
         *         parser is missing, the output could not be created or an
         *         XMLStreamException/IOException occurred
         */
        public boolean process(File outfile) {
                if (parser == null) {
                        System.out.println("No parser available for " + url);
                        return false;
                }
                if (!createOutput(outfile)) {
                        return false;
                }

                // text met since the last tag; each chunk is followed by one space
                StringBuilder chars = new StringBuilder();
                try {
                        output.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = readNext(0)) {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                flushText(chars);
                                                writeStartElement();
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                flushText(chars);
                                                writeEndElement();
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                bufferText(chars);
                                                break;
                                }
                        }
                        // closed here, inside the try, so that a failing flush is reported
                        output.close();
                        removeEmptyLines(outfile);
                        return true;
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                        return false;
                } catch (IOException ex) {
                        System.out.println("IOException while parsing " + url + ": " + ex);
                        return false;
                } finally {
                        closeQuietly(output);
                        closeQuietly(parser);
                        // closing the XMLStreamReader does not close the stream it reads
                        closeQuietly(inputData);
                }
        }

        /** Writes the buffered text, without its trailing separator, and empties the buffer. */
        private void flushText(StringBuilder chars) {
                if (chars.length() > 0) {
                        output.write(chars.substring(0, chars.length() - 1));
                        chars.setLength(0);
                }
        }

        /** Appends the text of the current CHARACTERS event, escaped and trimmed, followed by a space. */
        private void bufferText(StringBuilder chars) {
                String text = parser.getText();
                if (text.length() > 0) {
                        // replace & by &amp; ; < by &lt; ; no-break space by space
                        String cleaned = text.replace("\t", " ").replace("&", "&amp;").replace("<", "&lt;").replace("\u00A0", " ").trim();
                        chars.append(cleaned + " ");
                }
        }

        /** Writes the start tag of the current element with its attributes. */
        private void writeStartElement() {
                String localname = parser.getLocalName();
                String qname = currentPrefix() + localname;

                // inside a w element the tags stay on the line of the word
                output.write(wordtag ? "<" + qname : "\n<" + qname);

                for (int i = 0; i < parser.getAttributeCount(); i++) {
                        String ns = parser.getAttributePrefix(i);
                        String name = (ns ? ns + ":" : "") + parser.getAttributeLocalName(i);
                        output.write(" " + name + "=\"" + escapeAttribute(parser.getAttributeValue(i)) + "\"");
                }

                output.write(solotags.contains(localname) ? "/>" : ">");

                if (localname == "w") {
                        wordtag = true;
                } else if (!wordtag) {
                        output.write("\n");
                }
        }

        /** Writes the end tag of the current element, unless it was written as an empty tag. */
        private void writeEndElement() {
                String localname = parser.getLocalName();
                if (solotags.contains(localname)) {
                        return;
                }

                String tag = "</" + currentPrefix() + localname + ">";
                if (localname == "w") {
                        wordtag = false;
                        output.write(tag + "\n");
                } else if (wordtag) {
                        output.write(tag);
                } else {
                        output.write("\n" + tag + "\n");
                }
        }

        /** Returns the prefix of the current element followed by ':', or "" when it has none. */
        private String currentPrefix() {
                String prefix = parser.getPrefix();
                return prefix ? prefix + ":" : "";
        }

        /** Escapes the characters that cannot appear as such in a double-quoted attribute value. */
        private static String escapeAttribute(String value) {
                return value.replace("&", "&amp;").replace("<", "&lt;").replace("\"", "&quot;");
        }

        /**
         * Rewrites outfile without its empty lines, through a temporary file
         * created in the same directory.
         */
        private void removeEmptyLines(File outfile) {
                File directory = outfile.getAbsoluteFile().getParentFile();
                // unique name: a fixed one could collide with another run in the same directory
                File temp = File.createTempFile("OneTagPerLine", ".tmp", directory);

                outfile.withReader("UTF-8") { reader ->
                        temp.withWriter("UTF-8") { writer ->
                                String line;
                                while ((line = reader.readLine()) != null) {
                                        if (line.length() > 0) {
                                                writer.write(line + "\n");
                                        }
                                }
                        }
                }

                if (!(outfile.delete() && temp.renameTo(outfile))) {
                        println "Warning can't rename file " + temp + " to " + outfile;
                }
        }

        /** Closes the resource if there is one; a failure to close is deliberately ignored. */
        private static void closeQuietly(def resource) {
                try {
                        resource?.close();
                } catch (Exception ignored) {
                        // best effort: nothing useful can be done at this point
                }
        }

        /**
         * The main method: processes the sample corpus file.
         *
         * @param args optionally, the directory containing the corpus file;
         *        defaults to xml/Hyperprince in the home directory of the user
         */
        public static void main(String[] args) {

                // "~" is not expanded by the JVM, the home directory has to be resolved explicitly
                File rootDir = args.length > 0 ? new File(args[0]) : new File(System.getProperty("user.home"), "xml/Hyperprince");
                new File(rootDir, "identity").mkdirs();

                // the tags that must stay milestones
                ArrayList<String> milestones = new ArrayList<String>();
                milestones.add("tagUsage");
                milestones.add("pb");
                milestones.add("lb");

                File srcfile = new File(rootDir, "Corpus-Hyperprince_2009-06-10.xml");
                File resultfile = new File(rootDir, "Corpus-Hyperprince_2009-06-10-T.xml");
                println("prepare tokenizer file : " + srcfile + " to : " + resultfile);

                // File.toURL() is deprecated: it does not escape the characters that are illegal in a URL
                def builder = new OneTagPerLine(srcfile.toURI().toURL(), milestones);
                builder.process(resultfile);
        }

}