Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / filters / TagSentences / TEISentencer.groovy @ 187

History | View | Annotate | Download (10.6 kB)

1
package filters.TagSentences
2

    
3
import org.txm.importer.StaxIdentityParser;
4
import org.txm.importer.ValidateXml;
5

    
6
import java.net.URL;
7
import filters.Tokeniser.TEITokenizer.STATES;
8

    
9
class TEISentencer extends StaxIdentityParser {
10

    
11
        /** Debug switch; it is not read by any method visible in this class. */
        boolean DEBUG = false
        /** Name of the input file, cut at its first dot by the constructor. */
        String filename

        /** Document-level states; only IGNORE, WORK and NOTE are assigned in this class. */
        enum STATES { IGNORE, NOTE, WORK, W }
        /** Tells whether a sentence element is currently open (IN) or not (OUT). */
        enum SENT { IN, OUT }
        /** Current document-level state; processing starts outside of the text. */
        STATES state = STATES.IGNORE
        /** Current sentence state. */
        SENT sentence = SENT.OUT

        /** Element name that switches the state from IGNORE to WORK (and back when it closes). */
        String STARTTAGS = "text"
        /** Element name handled as a note: written, then skipped with goToEnd(). */
        String NOTETAGS = "note"
        /** Pattern (used with String.matches) of word elements. */
        String WORDTAGS = "w"
        /** Pattern (used with String.matches) of division elements. */
        String DIVTAGS = "div"
        /** Pattern (used with String.matches) of quotation elements. */
        String QTAGS = "q"
        /** Pattern (used with String.matches) of editorial elements. */
        String EDTAGS = "supplied"
        /** Stack of the tags opened inside the current sentence, outermost first. */
        def openTags = []
26
        
27
        /**
         * Builds a sentencer for the given input and derives {@code filename} from it.
         *
         * The file name is the last path segment of the URL; when it contains a dot at
         * an index greater than zero, everything from the first dot on is dropped.
         *
         * @param inputurl location of the document, handed over to the superclass constructor
         */
        public TEISentencer(URL inputurl) {
                super(inputurl)

                String baseName = new File(inputurl.getFile()).getName()
                int firstDot = baseName.indexOf(".")
                // No dot, or a leading dot, leaves the name untouched.
                this.filename = (firstDot > 0) ? baseName.substring(0, firstDot) : baseName
        }
35

    
36
        /**
         * Dispatches a start tag according to the document state and the sentence state.
         *
         * While the state is IGNORE the tag is delegated to the superclass unchanged, and
         * the state becomes WORK as soon as the STARTTAGS element is met. Afterwards a
         * NOTETAGS element is delegated to the superclass and then passed to goToEnd();
         * every other tag goes to the IN or OUT handler depending on {@code sentence}.
         */
        protected void processStartElement() {
                if (state == STATES.IGNORE) {
                        // Outside of the text no sentence can be open.
                        sentence = SENT.OUT
                        if (localname == STARTTAGS) {
                                state = STATES.WORK
                        }
                        super.processStartElement()
                        return
                }

                if (localname == NOTETAGS) {
                        state = STATES.NOTE
                        super.processStartElement()
                        // goToEnd() is not defined in this class; presumably it consumes
                        // the note up to its end tag -- confirm in the superclass.
                        goToEnd(localname)
                        return
                }

                if (sentence == SENT.IN) {
                        processStartElementIN()
                } else if (sentence == SENT.OUT) {
                        processStartElementOUT()
                }
        }
61
        
62
        /**
         * Dispatches an end tag according to the document state and the sentence state.
         *
         * While the state is IGNORE the tag is delegated to the superclass. The end of
         * the STARTTAGS element switches the state back to IGNORE; any other end tag
         * goes to the IN or OUT handler depending on {@code sentence}.
         */
        @Override
        protected void processEndElement() {
                if (state == STATES.IGNORE) {
                        super.processEndElement()
                        return
                }

                if (localname == STARTTAGS) {
                        // NOTE(review): a sentence that is still open here is not closed
                        // (closeRemainingSentence() is never called in this class) --
                        // confirm whether that is intended.
                        state = STATES.IGNORE
                        super.processEndElement()
                        return
                }

                if (sentence == SENT.IN) {
                        processEndElementIN()
                } else if (sentence == SENT.OUT) {
                        processEndElementOUT()
                }
        }
82
        
83
        /**
         * Splits the current sentence around the start tag being processed.
         *
         * Writes one end element per tag in {@code openTags}, closes the sentence,
         * delegates the current start tag to the superclass, opens a new sentence and
         * writes the start of every tag in {@code openTags} again, outermost first.
         */
        protected void cutSentence() {
                int depth = openTags.size()
                while (depth > 0) {
                        writer.writeEndElement()
                        depth--
                }
                endSentence() // close sent
                super.processStartElement()
                startSentence()
                for (Tag reopened in openTags) {
                        reopened.writeStart(writer)
                }
        }
94
        
95
        /**
         * Splits the current sentence around the end tag being processed.
         *
         * Writes one end element per tag in {@code openTags}, closes the sentence,
         * delegates the current end tag to the superclass, opens a new sentence and
         * writes the start of every tag in {@code openTags} again, outermost first.
         */
        protected void cutSentenceAndClose() {
                int depth = openTags.size()
                while (depth > 0) {
                        writer.writeEndElement()
                        depth--
                }
                endSentence() // close sent
                super.processEndElement()
                startSentence()
                for (Tag reopened in openTags) {
                        reopened.writeStart(writer)
                }
        }
106
        
107
        /**
         * Closes every tag still recorded in {@code openTags}, then the sentence itself.
         *
         * One end element is written per recorded tag and the tag is dropped from the
         * stack; a final end element closes the sentence and the state becomes OUT.
         */
        protected void closeRemainingSentence() {
                while (openTags.size() > 0) {
                        writer.writeEndElement()
                        // Index-based removal of the last entry: List.pop() removes the
                        // last element before Groovy 2.5 and the first one afterwards.
                        openTags.remove(openTags.size() - 1)
                }
                writer.writeEndElement() // close sent
                sentence = SENT.OUT
        }
116
        
117
        /**
         * Handles a start tag met while a sentence is open.
         *
         * <ul>
         * <li>DIVTAGS / QTAGS: the sentence is closed before the tag is written; when
         *     tags are still open inside the sentence it is cut with cutSentence().</li>
         * <li>EDTAGS and unclassified tags: see openTrackedTagIN().</li>
         * <li>WORDTAGS: the word is written and passed to goToEnd(); an end-of-sentence
         *     word closes the sentence when no tag is open, otherwise only a warning
         *     is printed.</li>
         * </ul>
         */
        protected void processStartElementIN() {
                if (localname.matches(DIVTAGS)) {
                        if (openTags.size() == 0) {
                                endSentence()
                                super.processStartElement()
                                println "WARNING: start DIVTAG in sentence at "+getLocation()
                        } else {
                                cutSentence()
                                println "WARNING: DIVTAG opepened inside EDTAG at "+getLocation()
                        }
                } else if (localname.matches(QTAGS)) {
                        if (openTags.size() == 0) {
                                endSentence()
                                super.processStartElement()
                        } else {
                                cutSentence()
                                println "WARNING: QTAG opepened inside EDTAG at "+getLocation()
                        }
                } else if (localname.matches(EDTAGS)) {
                        openTrackedTagIN("WARNING: EDTAG opened  inside EDTAG", "EDTAG")
                } else if (localname.matches(WORDTAGS)) {
                        // Both values must be read before goToEnd() moves the parser on.
                        boolean shouldClose = isWEndPunct()
                        boolean insideOpenTag = openTags.size() > 0
                        super.processStartElement()
                        goToEnd(localname)
                        if (shouldClose) {
                                if (insideOpenTag) {
                                        println "WARNING: Strong ponctuation found inside EDTAG at "+getLocation()
                                } else {
                                        endSentence()
                                }
                        }
                } else { // OTHER TAGS
                        openTrackedTagIN("WARNING: multi_s tag opens inside EDTAG", "TAG")
                }
        }

        /**
         * Opens an editorial or unclassified tag inside a sentence.
         *
         * With no tag open: a multi_s tag closes the sentence, any other tag is pushed
         * on {@code openTags}; the tag is then delegated to the superclass. With tags
         * already open: a multi_s tag cuts the sentence and prints {@code multiWarning};
         * any other tag is written, checked against the outermost open tag and pushed.
         *
         * @param multiWarning message printed when a multi_s tag opens inside an open tag
         * @param kind         label ("EDTAG" or "TAG") used in the duplicate-tag message
         */
        private void openTrackedTagIN(String multiWarning, String kind) {
                if (openTags.size() == 0) {
                        // No duplicate check here: there is no open tag to compare with.
                        if (hasMULTIS()) {
                                endSentence()
                        } else {
                                openTags << new Tag(parser)
                        }
                        super.processStartElement()
                        return
                }

                if (hasMULTIS()) {
                        cutSentence()
                        println multiWarning
                        return
                }

                super.processStartElement()
                // NOTE(review): only the outermost open tag (index 0) is compared.
                if (openTags[0].localname == localname) {
                        println "STOP: openning same $kind $localname at "+getLocation()
                        closeForError()
                }
                openTags << new Tag(parser)
        }
202

    
203
        /**
         * Handles an end tag met while a sentence is open.
         *
         * <ul>
         * <li>DIVTAGS / QTAGS: the sentence is closed before the end tag is written;
         *     when tags are still open the sentence is cut with cutSentenceAndClose().</li>
         * <li>EDTAGS with no open tag: the sentence is closed, then the tag.</li>
         * <li>Unclassified tag with no open tag: delegated to the superclass.</li>
         * <li>EDTAGS / unclassified tag with open tags: see closeTrackedTagIN().</li>
         * <li>WORDTAGS: only an error is printed; nothing is written.</li>
         * </ul>
         */
        protected void processEndElementIN() {
                if (localname.matches(DIVTAGS)) {
                        if (openTags.size() == 0) {
                                endSentence()
                                super.processEndElement()
                        } else {
                                cutSentenceAndClose()
                                println "WARNING: DIVTAG closed  inside EDTAG"
                        }
                } else if (localname.matches(QTAGS)) {
                        if (openTags.size() == 0) {
                                endSentence()
                                super.processEndElement()
                        } else {
                                cutSentenceAndClose()
                                println "WARNING: QTAG closed  inside EDTAG"
                        }
                } else if (localname.matches(EDTAGS)) {
                        if (openTags.size() == 0) { // it was a multi_s EDTAG
                                endSentence()
                                super.processEndElement()
                        } else {
                                closeTrackedTagIN("STOP: closing EDTAG does not match last opened EDTAG ")
                        }
                } else if (localname.matches(WORDTAGS)) {
                        println "ERROR: processEndElementIN: </w> at "+getLocation();
                } else {
                        if (openTags.size() == 0) {
                                super.processEndElement()
                        } else {
                                closeTrackedTagIN("STOP: closing tag does not match last opened EDTAG ")
                        }
                }
        }

        /**
         * Closes the innermost open tag when its name matches the current end tag;
         * otherwise prints the mismatch and calls closeForError().
         *
         * Must only be called when {@code openTags} is not empty.
         *
         * @param mismatchPrefix beginning of the message printed on a name mismatch
         */
        private void closeTrackedTagIN(String mismatchPrefix) {
                def innermost = openTags[-1]
                if (innermost.localname == localname) {
                        super.processEndElement()
                        // Drop the LAST entry explicitly: List.pop() removes the last
                        // element before Groovy 2.5 but the first one from 2.5 on.
                        openTags.remove(openTags.size() - 1)
                } else {
                        println mismatchPrefix+localname+" != "+innermost.localname+" at "+getLocation()
                        closeForError()
                }
        }
249

    
250
        /**
         * Handles a start tag met while no sentence is open.
         *
         * DIVTAGS and QTAGS are delegated to the superclass. A word opens a sentence
         * (and closes it at once, with a warning, when it is an end-of-sentence word).
         * An editorial or unclassified tag carrying multi_s is only written; without
         * multi_s it opens a sentence, is written and is pushed on {@code openTags}.
         */
        protected void processStartElementOUT() {
                if (localname.matches(DIVTAGS) || localname.matches(QTAGS)) {
                        super.processStartElement()
                        return
                }

                if (localname.matches(EDTAGS)) {
                        if (hasMULTIS()) {
                                super.processStartElement()
                                return
                        }
                        startSentence()
                        super.processStartElement()
                        boolean sameAsOutermost = openTags.size() > 0 && openTags[0].localname == localname
                        if (sameAsOutermost) {
                                println "STOP: openning same EDTAG name at "+getLocation()
                                closeForError()
                        }
                        openTags << new Tag(parser)
                        return
                }

                if (localname.matches(WORDTAGS)) {
                        // Read the attributes before goToEnd() moves the parser on.
                        boolean endsSentence = isWEndPunct()
                        startSentence()
                        super.processStartElement()
                        goToEnd(localname)
                        if (endsSentence) {
                                endSentence()
                                println "WARNING: empty sentence at "+getLocation();
                        }
                        return
                }

                // Unclassified tags
                if (hasMULTIS()) {
                        super.processStartElement()
                        return
                }
                startSentence()
                super.processStartElement()
                boolean sameAsOutermost = openTags.size() > 0 && openTags[0].localname == localname
                if (sameAsOutermost) {
                        println "STOP: openning same TAG $localname name at "+getLocation()
                        closeForError()
                }
                openTags << new Tag(parser)
                println "WARNING: unclassified tag: "+localname+" at "+getLocation();
        }
296

    
297
        /**
         * Handles an end tag met while no sentence is open.
         *
         * Every end tag is delegated to the superclass, except a word end tag that is
         * not also matched by DIVTAGS, QTAGS or EDTAGS: that one is only reported.
         */
        protected void processEndElementOUT() {
                boolean classifiedBeforeWord = localname.matches(DIVTAGS) ||
                                localname.matches(QTAGS) ||
                                localname.matches(EDTAGS)

                if (!classifiedBeforeWord && localname.matches(WORDTAGS)) {
                        println "ERROR: processEndElementOUT: </w> at "+getLocation();
                        return
                }
                super.processEndElement()
        }
310

    
311
        /** Name of the element written around each sentence. */
        private static String STAG = "s"

        /**
         * Marks the sentence as open, then writes the start of the sentence element.
         */
        protected void startSentence() {
                this.sentence = SENT.IN
                writer.writeStartElement(STAG)
        }
316

    
317
        /**
         * Marks the sentence as closed, then writes one end element.
         * The writer call carries no name; it is expected to close the sentence element.
         */
        protected void endSentence() {
                this.sentence = SENT.OUT
                writer.writeEndElement()
        }
321

    
322
        /** Attribute value that marks a word as sentence-ending punctuation. */
        private static String ENDOFSENTENCE = "PONfrt"

        /**
         * Tells whether the element the parser is positioned on has any attribute,
         * whatever its name, whose value equals ENDOFSENTENCE.
         *
         * @return true when such an attribute exists
         */
        protected boolean isWEndPunct() {
                int idx = 0
                while (idx < parser.getAttributeCount()) {
                        if (parser.getAttributeValue(idx) == ENDOFSENTENCE) {
                                return true
                        }
                        idx++
                }
                return false
        }
329

    
330
        /** Attribute value that marks a tag as spanning several sentences. */
        private static String MULTI_S = "multi_s"

        /**
         * Tells whether the element the parser is positioned on has any attribute,
         * whatever its name, whose value equals MULTI_S.
         *
         * @return true when such an attribute exists
         */
        protected boolean hasMULTIS() {
                int idx = 0
                while (idx < parser.getAttributeCount()) {
                        if (parser.getAttributeValue(idx) == MULTI_S) {
                                return true
                        }
                        idx++
                }
                return false
        }
337

    
338
        /**
         * Runs the sentencer over every regular file of a directory.
         *
         * @param args optional: args[0] is the input directory, args[1] the output
         *             directory; without arguments the historical hard-coded paths are
         *             used, and with one argument the output goes to its "s" subdirectory
         */
        public static void main(String[] args) {
                File dir = new File(args.length > 0 ? args[0] : "/home/mdecorde/TXM/corpora/alceste/tokenized")
                File outdir = args.length > 1 ? new File(args[1]) : new File(dir, "s")
                outdir.mkdirs()

                File[] inputs = dir.listFiles()
                if (inputs == null) {
                        // listFiles() returns null when dir is not a readable directory.
                        println "ERROR: cannot list input directory: "+dir
                        return
                }

                for (File infile : inputs) {
                        // The default output directory lives inside the input directory:
                        // skip it (and any other non-regular file) instead of parsing it.
                        if (!infile.isFile()) {
                                continue
                        }
                        long start = System.currentTimeMillis()
                        println infile
                        File outfile = new File(outdir, infile.getName())

                        TEISentencer sentencer = new TEISentencer(infile.toURI().toURL());
                        if (sentencer.process(outfile)) {
                                println "DONE - OK: "+ValidateXml.test(outfile);
                        } else {
                                println "DONE - FAIL"
                        }
                        println "time: "+(System.currentTimeMillis()-start)/1000
                }
        }
356

    
357
        /**
         * Snapshot of a start tag (name, prefix and attributes) taken from the parser,
         * so that the tag can be written again later with writeStart().
         *
         * The class can also accumulate text contents and child tags; those members
         * are not used by the enclosing class.
         */
        public class Tag {
                /** Attribute local names, values and prefixes, index-aligned. */
                public String[] attnames, attvalues, attprefix;
                /** Local name and prefix of the element. */
                public String localname, prefix
                /** Number of attributes captured. */
                int count;
                /** Text segments; a new empty segment is started after each child. */
                ArrayList<String> contents = [""];
                /** Direct children, in insertion order. */
                public ArrayList<Tag> children = [];
                /** True while the last child is still open. */
                boolean inChild = false;

                /**
                 * Captures the element the parser is currently positioned on.
                 *
                 * @param parser object offering getPrefix(), getLocalName(),
                 *               getAttributeCount(), getAttributeLocalName(int),
                 *               getAttributePrefix(int) and getAttributeValue(int)
                 */
                public Tag(def parser) {
                        prefix = parser.getPrefix()
                        localname = parser.getLocalName();
                        count = parser.getAttributeCount()
                        attnames =  new String[count]
                        attvalues = new String[count]
                        attprefix = new String[count]
                        for (int i = 0 ; i < count ; i++) {
                                attnames[i] = parser.getAttributeLocalName(i)
                                // Safe navigation: in Groovy null.toString() yields the string
                                // "null", which writeStart() would emit as a "null:" prefix.
                                attprefix[i] = parser.getAttributePrefix(i)?.toString()
                                attvalues[i] = parser.getAttributeValue(i).toString()
                        }
                }

                /**
                 * Appends text to the innermost open tag: the last child while a child
                 * is open, otherwise the last content segment of this tag.
                 */
                public void appendContent(String content) {
                        if (inChild) {
                                children[-1].appendContent(content);
                        } else {
                                this.contents[-1] += content;
                        }
                }

                /**
                 * Signals that the innermost open child has ended.
                 *
                 * When the last child of this tag itself has an open child, the call is
                 * passed down, mirroring addChild() and appendContent(); otherwise a new
                 * content segment is started here and the child is marked closed.
                 */
                public void endOfChild() {
                        if (inChild && children[-1].inChild) {
                                children[-1].endOfChild();
                                return;
                        }
                        contents << "";
                        inChild = false;
                }

                /**
                 * Adds a child to the innermost open tag and marks that child as open.
                 */
                public void addChild(Tag child) {
                        if (inChild) {
                                children[-1].addChild(child);
                        } else {
                                children.add(child)
                                inChild = true;
                        }
                }

                /**
                 * Writes the start tag and its attributes; the element is left open.
                 *
                 * @param writer object offering writeStartElement(String) and
                 *               writeAttribute(String, String)
                 */
                public writeStart(def writer) {
                        if (prefix != null && prefix.length() > 0)
                                writer.writeStartElement(prefix+":"+localname)
                        else
                                writer.writeStartElement(localname)

                        for (int i = 0 ; i < count ; i++) {
                                if (attprefix[i] != null && attprefix[i].length() > 0) {
                                        writer.writeAttribute(attprefix[i]+":"+attnames[i], attvalues[i])
                                } else {
                                        writer.writeAttribute(attnames[i], attvalues[i])
                                }
                        }
                }

                /**
                 * @return the string form of the list of content segments
                 */
                public String getContent() {
                        return contents.toString();
                }
        }
426
}