Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / cqp / pager.groovy @ 2246

History | View | Annotate | Download (8.7 kB)

1
package org.txm.scripts.importer.cqp;
2
//Copyright © - ANR Textométrie - http://textometrie.ens-lyon.fr
3
//
4
//This file is part of the TXM platform.
5
//
6
//The TXM platform is free software: you can redistribute it and/or modify
7
//it under the terms of the GNU General Public License as published by
8
//the Free Software Foundation, either version 3 of the License, or
9
//(at your option) any later version.
10
//
11
//The TXM platform is distributed in the hope that it will be useful,
12
//but WITHOUT ANY WARRANTY; without even the implied warranty of
13
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
//GNU General Public License for more details.
15
//
16
//You should have received a copy of the GNU General Public License
17
//along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
18
//
19
//
20
//
21
//$LastChangedDate: 2012-06-01 17:47:31 +0200 (ven., 01 juin 2012) $
22
//$LastChangedRevision: 2185 $
23
//$LastChangedBy: mdecorde $
24
//
25

    
26
import org.txm.scripts.importer.*;
27
import org.xml.sax.Attributes;
28
import org.txm.importer.scripts.filters.*;
29
import java.io.File;
30
import java.io.IOException;
31
import java.util.ArrayList;
32
import javax.xml.parsers.SAXParserFactory;
33
import javax.xml.parsers.ParserConfigurationException;
34
import javax.xml.parsers.SAXParser;
35
import javax.xml.stream.*;
36
import java.net.URL;
37
import org.xml.sax.InputSource;
38
import org.xml.sax.helpers.DefaultHandler;
39

    
40
/**
 * Builds a simple paginated HTML edition from a CQP source file.
 *
 * The CQP file is read line by line: lines starting with "<" are treated as
 * tags, every other non-empty line is treated as a tab-separated word line
 * (first column = word form, remaining columns = annotations shown in the
 * span title). Pages are written to the output directory as
 * "&lt;text id&gt;_&lt;page number&gt;.html".
 *
 * All the work is done by the constructor, which calls {@link #process()}.
 *
 * @author mdecorde
 */
class pager {
	/** Word forms that must not be preceded by a space. */
	List<String> NoSpaceBefore;

	/** Word forms (or final characters) that must not be followed by a space. */
	List<String> NoSpaceAfter;

	/** The wordcount (reset each time a page is created). */
	int wordcount = 0;

	/** Number of pages created so far for the current text. */
	int pagecount = 0;

	/** Maximum number of words written to one page. */
	int wordmax = 0;

	/** The basename, used upper-cased in the page title. */
	String basename = "";

	/** Name of the text being written; prefix of the page file names. */
	String txtname = "";

	/** Directory receiving the page files. */
	File outdir;

	/** The wordid. */
	String wordid;

	/** True until the first word of the current page has been written. */
	boolean firstWord = true;

	/** The wordvalue. */
	String wordvalue;

	/** The interpvalue. */
	String interpvalue;

	/** The lastword. */
	String lastword = " ";

	/** The wordtype. */
	String wordtype;

	/** The flagform. */
	boolean flagform = false;

	/** The flaginterp. */
	boolean flaginterp = false;

	/** The url. */
	private def url;

	/** The input data. */
	private def inputData;

	/** The factory. */
	private def factory;

	/** The parser. */
	private XMLStreamReader parser;

	/** The writer. */
	OutputStreamWriter writer;

	/** Writer of the page currently open, or null when no page is open. */
	OutputStreamWriter pagedWriter = null;

	/** The CQP source file. */
	File cqpFile;

	/** The outfile. */
	File outfile;

	/** Page files, keyed by text name: txtname -> list of File. */
	def pages = [:];

	/** Page start word ids, keyed by text name. */
	def idxstart = [:]
	String editionPage;
	ArrayList<Integer> splitTUs; // contains the tu ids used to split pages
	boolean shouldSplit = false;
	boolean useSplitTUs = false;

	/** True when word ids are read from a column of the CQP file. */
	boolean hasWordId = false;

	/** Index of the column holding the word id (column 0 is the word form). */
	int noWordIdProperty = -1;

	/** Ids of the texts found in the CQP file, in reading order. */
	def texts = []

	/**
	 * Instantiates a new pager and immediately processes the CQP file.
	 *
	 * @param cqpFile the CQP source file
	 * @param outdir the directory receiving the HTML pages
	 * @param NoSpaceBefore the word forms not preceded by a space
	 * @param NoSpaceAfter the word forms not followed by a space
	 * @param max the maximum number of words per page
	 * @param basename the basename written in the page titles
	 * @param hasWordId true if the word ids should be read from the "id" column
	 * @param pAttrs the names of the annotation columns following the word form
	 */
	pager(File cqpFile, File outdir, List<String> NoSpaceBefore,
	List<String> NoSpaceAfter, int max, String basename, boolean hasWordId, List pAttrs) {
		this.basename = basename;
		this.txtname = "text1";
		this.outdir = outdir;
		this.wordmax = max;
		this.NoSpaceBefore = NoSpaceBefore;
		this.NoSpaceAfter = NoSpaceAfter;
		this.cqpFile = cqpFile;
		this.hasWordId = hasWordId;
		if (hasWordId) {
			// +1 because column 0 of a word line is the word form itself
			int idPosition = (pAttrs == null) ? -1 : pAttrs.indexOf("id");
			noWordIdProperty = idPosition + 1;
			if (noWordIdProperty == 0) {
				// no "id" column declared: fall back to generated ids.
				// "this." is required, the parameter shadows the field.
				this.hasWordId = false;
			}
		}

		process();
	}

	/**
	 * Writes the page footer and closes the current page, if one is open.
	 * Does nothing when no page is open.
	 */
	private void closeMultiWriter()
	{
		if (pagedWriter == null) {
			return;
		}
		try {
			if (firstWord) { // there was no words
				if (this.idxstart[txtname] == null) {
					this.idxstart[txtname] = ["w_0"]
				} else {
					// keep the start ids of the previous pages of this text;
					// process() stores each entry as a one-element list
					this.idxstart[txtname] << ["w_0"]
				}
				pagedWriter.write("<span id=\"w_0\"/>");
			}
			// NOTE(review): no matching "<p>" is written by createNextOutput() — confirm intended
			pagedWriter.write("</p>\n")
			pagedWriter.write("</body>");
			pagedWriter.write("</html>");
		} finally {
			pagedWriter.close();
			pagedWriter = null;
		}
	}

	/**
	 * Closes the current page and opens the next page of the current text.
	 *
	 * @return true, if successful
	 */
	private boolean createNextOutput()
	{
		wordcount = 0;
		shouldSplit = false;
		try {
			closeMultiWriter();
			File pageFile = new File(outdir, txtname+"_"+(++pagecount)+".html")

			pagedWriter = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(pageFile)) , "UTF-8");

			// register the page only once its file could be opened
			if (pages[txtname] == null) pages[txtname] = []
			pages[txtname] << pageFile;
			firstWord = true; // waiting for next word

			pagedWriter.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
			pagedWriter.write("<!DOCTYPE html>\n")
			pagedWriter.write("<html>");
			pagedWriter.write("<head>");
			pagedWriter.write("<title>"+basename.toUpperCase()+" $txtname Edition - Page "+pagecount+"</title>");
			pagedWriter.write("<link rel=\"stylesheet\" type=\"text/css\" href=\"tmx.css\"/>");
			pagedWriter.write("<link rel=\"stylesheet\" type=\"text/css\" href=\"style.css\"/>");
			pagedWriter.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");
			pagedWriter.write("</head>");
			pagedWriter.write("<body>");

			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}

	/**
	 * Creates the output.
	 *
	 * @return true, if successful
	 */
	private boolean createOutput() {
		// createNextOutput() already reports and swallows its own exceptions
		return createNextOutput();
	}

	/**
	 * Opens the next page and fails fast when it cannot be created, instead
	 * of letting later writes hit a closed or missing writer.
	 */
	private void openNextPage() {
		if (!createNextOutput()) {
			throw new IOException("Could not create page "+pagecount+" of text '"+txtname+"' in "+outdir);
		}
	}

	/**
	 * Extracts the value of the id attribute of a tag line.
	 *
	 * @return the non-empty id value, or null if there is none
	 */
	private static String extractTextId(String line) {
		// leading whitespace avoids matching attributes whose name only ends with "id"
		def matcher = (line =~ /\sid="([^"]+)"/);
		return matcher.find() ? matcher.group(1) : null;
	}

	/** Escapes the ampersand, less-than and double quote characters. */
	private static String escapeXml(String value) {
		return value.replace("&", "&amp;").replace("<", "&lt;").replace("\"","&quot;");
	}

	/**
	 * Gets the page files.
	 *
	 * @return the page files, keyed by text name
	 */
	public def getPageFiles() {
		return pages;
	}

	/**
	 * Gets the idx.
	 *
	 * @return the page start word ids, keyed by text name
	 */
	public def getIdx() {
		return idxstart;
	}

	/**
	 * Reads the CQP file and writes the HTML pages.
	 *
	 * A new page is started for each text line carrying an id, for each
	 * "&lt;pb/&gt;" or "&lt;pb&gt;" line, and when the current page already
	 * holds wordmax words. The reader and the last page are always closed,
	 * even when an error interrupts the processing.
	 */
	void process() {
		String previousWord = "";
		int wcounter = 1; // generated word ids, never reset
		int txtwcounter = 1; // words of the current page, plus one
		int cline = 0;

		openNextPage();

		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(new FileInputStream(cqpFile) , "UTF-8"));
			String line;
			while ((line = reader.readLine()) != null) {
				cline++;
				if (line.startsWith("<")) {
					if (line.startsWith("<lb/") || line.startsWith("<br/") || line.startsWith("</p>") ||
						line.startsWith("<lb>") || line.startsWith("<br>")) {
						pagedWriter.write("<br/>");
					}
					if (line.startsWith("<text>") || line.startsWith("<text ")) {
						String textId = extractTextId(line);
						if (textId != null) {
							txtname = textId
							texts << txtname;
							pagecount = 0;
							txtwcounter = 1; // a new text starts with a full-size page
							openNextPage();
						}
					} else if (line.startsWith("<s>") || line.startsWith("<s ")) {
						pagedWriter.write(line.replaceAll("<s", "<sent "));
					} else if (line.startsWith("</s>")) {
						pagedWriter.write("</sent>");
					}  else if (line.startsWith("<pb/>") || line.startsWith("<pb>")) {
						pagedWriter.write(line);
						txtwcounter = 1;
						openNextPage();
					} else {
						pagedWriter.write(line);
					}
					continue;
				}

				// word line
				if (line.isEmpty()) {
					// "".split("\t") yields [""], so test the line itself
					continue;
				}
				def split = line.split("\t");
				if (split == null || split.size() == 0) {
					// line made of tabs only
					continue;
				}

				if (txtwcounter > wordmax) {
					txtwcounter = 1;
					openNextPage();
				}

				String currentId = null;
				if (hasWordId) {
					if (split.size() <= noWordIdProperty) { // +1 since line contains word
						// NOTE(review): the word is still written, with id "null"
						println "Error: line $cline"
					} else {
						currentId = split[noWordIdProperty];
					}
				} else {
					currentId = "w_"+(wcounter++);
				}
				txtwcounter++;
				String form = split[0];

				if (firstWord) {
					firstWord = false;
					if (this.idxstart[txtname] == null) {
						this.idxstart[txtname] = []
					}
					this.idxstart[txtname] << [currentId];
				}

				String endOfLastWord = "";
				int previousLength = previousWord.length();
				if (previousLength > 0) {
					endOfLastWord = previousWord.substring(previousLength - 1);
				}

				String annotations = "";
				if (split.size() > 1) {
					annotations = escapeXml(split[1..split.size()-1].join(" "));
				}

				boolean glued = NoSpaceBefore.contains(form) ||
						NoSpaceAfter.contains(previousWord) ||
						form.startsWith("-") ||
						NoSpaceAfter.contains(endOfLastWord);
				if (!glued) {
					pagedWriter.write(" ");
				}
				pagedWriter.write("<span title=\""+annotations+"\" id=\""+currentId+"\">");
				pagedWriter.write(escapeXml(form)+"</span>\n");
				previousWord = form;
			}
		} finally {
			try {
				if (reader != null) {
					reader.close();
				}
			} finally {
				// closing the page also flushes it
				closeMultiWriter();
			}
		}
	}

	/**
	 * Gets the text names.
	 *
	 * @return the ids of the texts found in the CQP file
	 */
	def getTextNames() {
		return texts;
	}
}