Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / transcriber / ValidateTRS.groovy @ 479

History | View | Annotate | Download (15.3 kB)

1
package org.txm.importer.transcriber
2

    
3
import java.text.DecimalFormat;
4

    
5
import org.txm.utils.xml.DomUtils;
6
import org.txm.utils.ExecTimer;
7
import org.w3c.dom.*;
8

    
9
/**
 * Consistency checker for Transcriber (.trs) transcription files of the
 * STDDAD master corpus.
 *
 * A file is parsed once in the constructor. The instance then exposes:
 * <ul>
 * <li>{@link #timeCheck()}: ordering/nesting checks of Section, Turn and Sync
 *     times, plus duration and word-count statistics;</li>
 * <li>{@link #speakersCheck()} / {@link #topicsCheck()}: every speaker/topic id
 *     used in the body must be declared in the header;</li>
 * <li>static drivers ({@link #checkTRS(File)}, {@link #checkDirectory(File)})
 *     that print a report on standard output.</li>
 * </ul>
 * Each check replaces the content of {@code messages}; call
 * {@link #printErrors()} right after a check to display its findings.
 *
 * @author mdecorde
 */
class ValidateTRS {
    /** Never assigned in this class; kept because it is part of the public properties. */
    def doc;
    /** Root node of the parsed document. */
    def root;
    /** {@code Episode} children of the root. */
    def episodeElem;
    /** {@code Topics} children of the root. */
    def topicsElem;
    /** {@code Speakers} children of the root. */
    def speakersElem;

    /** {@code Topic} declaration nodes found in the header. */
    def declaredTopics = [];
    /** One entry per section: its resolved topic (see sectionTopic), sorted. */
    def topics = [];

    /** Not read anywhere in this class (the static {@code okTopics} is used instead). */
    def mandatoryTopics = ["parcours", "ressources", "débouchés", "acteurs", "métier", "images", "autres"]
    /** {@code Speaker} declaration nodes found in the header. */
    def declaredSpeakers = [];
    /** One entry per turn: its raw {@code speaker} attribute (or "nospk"), sorted. */
    def speakers = [];
    /** Speaker id -> speaker name, from the declarations. */
    def spkDict = [:];
    /** Topic id -> topic description, from the declarations. */
    def topicDict = [:];

    /** All {@code Section} nodes of the episode. */
    def sections = [];
    /** Section node -> its {@code Turn} nodes. */
    def turns = new LinkedHashMap();
    /** Turn node -> its {@code Sync} nodes. */
    def syncs = new LinkedHashMap();
    /** {@code desc} attribute of every {@code Event} whose type is "entities". */
    def entities = [];

    /** Findings of the most recent check. */
    def messages = [];

    // Section statistics, filled by timeCheck().
    float sectionsTime = 0;
    int numberOfSection = 0;
    HashMap<String, Float> sectionsTimesByTopic = new HashMap<String, Float>();
    HashMap<String, Integer> numberOfSectionByTopic = new HashMap<String, Integer>();

    // Turn statistics, filled by timeCheck() (numberOfTurn is set by initTurns()).
    float turnsTime = 0;
    int numberOfTurn = 0;
    HashMap<String, Float> turnsTimesBySpk = new HashMap<String, Float>();
    HashMap<String, Integer> numberOfTurnsBySpk = new HashMap<String, Integer>();

    // Word statistics, filled by timeCheck().
    int totalWords = 0;
    HashMap<String, Integer> totalWordsByTopic = new HashMap<String, Integer>();
    HashMap<String, Integer> totalWordsBySpk = new HashMap<String, Integer>();

    /** Topic descriptions accepted by checkTRS(). */
    def static okTopics = ["débouchés", "parcours","ressources",
        "métier", "acteurs", "autres", "images"];

    /** Accepted speaker names: "int"/"enq" + 2 digits, or "ext" + 3 digits. */
    def static spkPattern = "((int|enq)[0-9][0-9])|(ext[0-9][0-9][0-9])"

    /**
     * Tells whether a speaker name follows the corpus naming convention.
     *
     * @param spk the speaker name; may be null (a Speaker without a name attribute)
     * @return true if {@code spk} fully matches {@code spkPattern}, false otherwise
     *         (including for null)
     */
    public static boolean checkSpk(String spk)
    {
        return spk != null && spk.matches(spkPattern);
    }

    /**
     * Parses the given TRS file and indexes its declarations, sections,
     * turns, syncs and entities.
     *
     * @param trsFile the transcription file to load
     */
    public ValidateTRS(File trsFile)
    {
        def parser = new XmlParser();
        root = parser.parse(trsFile.toURI().toString())
        episodeElem = root.Episode;
        speakersElem = root.Speakers;
        topicsElem = root.Topics
        init();
    }

    /**
     * Placeholder. The time check it used to run is disabled, so the method
     * performs no validation.
     *
     * @return always false
     */
    public boolean test() {
        return false;
    }

    /**
     * Builds every index. Order matters: sections must be known before turns,
     * turns before syncs, and the speaker/topic/entity lists are derived from
     * the sections and turns.
     */
    public void init() {
        initSpeakersDecl();
        initTopicsDecl();

        initSections();
        initTurns();
        initSyncs();

        initSpeakers();
        initTopics();
        initEntities();
    }

    /** Collects the description of every "entities" Event found in the turns. */
    public void initEntities() {
        for (def section : sections) {
            for (def turn : turns.get(section)) {
                for (def event : turn.Event) {
                    if (event.@type == "entities") {
                        entities.add(event.@desc)
                    }
                }
            }
        }
    }

    /** Reads the Topic declarations and fills {@code topicDict} (id -> desc). */
    public void initTopicsDecl(){
        declaredTopics = topicsElem.Topic;
        for (def topic : declaredTopics) {
            topicDict[topic.@id] = topic.@desc
        }
    }

    /** Reads the Speaker declarations and fills {@code spkDict} (id -> name). */
    public void initSpeakersDecl(){
        declaredSpeakers = speakersElem.Speaker;
        for (def spk : declaredSpeakers) {
            spkDict[spk.@id] = spk.@name
        }
    }

    /** Collects the Section nodes and counts them. */
    public void initSections() {
        sections = episodeElem.Section;
        numberOfSection = sections.size()
    }

    /** Indexes the Turn nodes of each section and counts them. */
    public void initTurns() {
        for (def section : sections) {
            def sectionTurns = section.Turn
            turns[section] = sectionTurns
            numberOfTurn += sectionTurns.size()
        }
    }

    /** Indexes the Sync nodes of each turn. */
    public void initSyncs() {
        for (def section : sections) {
            for (def turn : turns.get(section)) {
                syncs[turn] = turn.Sync
            }
        }
    }

    /** Fills {@code topics} with the resolved topic of each section, then sorts it. */
    public void initTopics(){
        for (def section : sections) {
            topics.add(sectionTopic(section));
        }
        topics.sort()
    }

    /**
     * Fills {@code speakers} with the raw speaker attribute of each turn
     * ("nospk" when the attribute is absent), then sorts it. A turn shared by
     * several speakers keeps its space-separated id list as a single entry.
     */
    public void initSpeakers(){
        for (def section : sections) {
            for (def turn : turns.get(section)) {
                String spk = turn.@speaker;
                speakers.add(spk == null ? "nospk" : spk);
            }
        }
        speakers.sort();
    }

    /**
     * Checks that every topic used by a section is declared. "nontrans" and
     * "notopic" are exempt. Findings replace the content of {@code messages}.
     *
     * @return true if at least one undeclared topic was found
     */
    public boolean topicsCheck() {
        messages = []
        boolean ret = false;
        def topicsIds = declaredTopics.collect { it.@id }
        for (def topic : topics) {
            if (topic == null || topic == "nontrans" || topic == "notopic") continue;
            if (!topicsIds.contains(topic)) {
                messages << "missing topic declaration: "+topic
                ret = true;
            }
        }
        return ret;
    }

    /**
     * Checks that every speaker id used by a turn is declared. "nospk" is
     * exempt. Findings replace the content of {@code messages}, so that
     * printErrors() shows them as it does for topicsCheck().
     *
     * @return true if at least one undeclared speaker id was found
     */
    public def speakersCheck() {
        messages = []
        boolean ret = false;
        def spksIds = declaredSpeakers.collect { it.@id }
        for (def spk : speakers) {
            if (spk == null) continue; // ignore null spk
            // a turn may list several speaker ids separated by spaces
            for (def sspk : spk.split(" ")) {
                if (sspk != "nospk" && !spksIds.contains(sspk)) {
                    messages << "missing spk declaration: "+sspk
                    ret = true;
                }
            }
        }
        return ret;
    }

    /** Key under which null values are counted by printSortedList(). */
    String NULL = "null"

    /**
     * Prints the frequency of each value of {@code list}, sorted by ascending
     * frequency, as tab-separated lines.
     *
     * @param list values to count; null entries are counted under "null"
     * @param dict optional id -> label map. When given, each value is split on
     *             spaces before counting and lines are "id, label, count" (the
     *             id is repeated when it has no label). When null, values are
     *             counted whole and lines are "value, count".
     */
    public void printSortedList(def list, def dict) {
        def counts = [:]
        for (def value : list) {
            if (value == null) {
                counts[NULL] = (counts.containsKey(NULL) ? counts[NULL] : 0) + 1
            } else if (dict != null) {
                for (def id : value.split(" ")) {
                    counts[id] = (counts.containsKey(id) ? counts[id] : 0) + 1
                }
            } else {
                counts[value] = (counts.containsKey(value) ? counts[value] : 0) + 1
            }
        }

        def rows = [];
        for (def key : counts.keySet()) {
            rows.add([key, counts[key]]);
        }
        rows = rows.sort { it.get(1) }

        for (def row : rows) {
            def key = row.get(0)
            def count = row.get(1)
            if (dict == null) {
                println key+"\t"+count
            } else if (dict[key] != null) {
                println key+"\t"+dict[key]+"\t"+count
            } else {
                println key+"\t"+key+"\t"+count
            }
        }
    }

    /**
     * Resolves the topic of a section: its {@code type} when the section is
     * not a "report", its {@code topic} attribute otherwise, and "notopic"
     * when the selected attribute is absent.
     */
    private static String sectionTopic(def section) {
        String topic = (section.@type != "report") ? section.@type : section.@topic
        return topic == null ? "notopic" : topic
    }

    /** Clears every accumulator written by timeCheck() so it can be re-run. */
    private void resetStatistics() {
        sectionsTime = 0;
        turnsTime = 0;
        totalWords = 0;
        sectionsTimesByTopic.clear();
        numberOfSectionByTopic.clear();
        totalWordsByTopic.clear();
        turnsTimesBySpk.clear();
        numberOfTurnsBySpk.clear();
        totalWordsBySpk.clear();
    }

    /**
     * Validates the time attributes of sections, turns and syncs, and
     * computes the duration and word statistics.
     *
     * Section and turn problems are errors; sync problems are warnings.
     * Findings replace the content of {@code messages}. Statistics are reset
     * at the start, so calling the method again does not double-count.
     *
     * A turn with no speaker (attribute absent or literally "null") is
     * attributed to "nospk" and its duration is not added to
     * {@code turnsTime}. Undeclared speaker ids are also attributed to "nospk".
     *
     * @return a two-element list {@code [error, warning]} of booleans
     */
    public def timeCheck() {
        messages = [];
        resetStatistics();

        boolean warning = false;
        boolean error = false
        def previousSectionStart = -1.0f;
        def previousSectionEnd = -1.0f;

        // turn times are compared across section boundaries as well
        def previousTurnStart = -1.0f;
        def previousTurnEnd = -1.0f;

        for (def section : sections) {
            def sectionStart = Float.parseFloat(section.'@startTime')
            def sectionEnd =  Float.parseFloat(section.'@endTime')
            String topic = sectionTopic(section)

            def time = sectionEnd - sectionStart;
            if (!sectionsTimesByTopic.containsKey(topic)) {
                sectionsTimesByTopic.put(topic, 0);
                numberOfSectionByTopic.put(topic, 0);
                totalWordsByTopic.put(topic, 0);
            }
            if (topic != "null" && topic.length() > 0) {
                sectionsTime += time;
            }
            sectionsTimesByTopic.put(topic, sectionsTimesByTopic.get(topic) + time);
            numberOfSectionByTopic.put(topic, numberOfSectionByTopic.get(topic) + 1);

            boolean sectionBug = false;

            if (sectionEnd <= sectionStart) { sectionBug = true;
                messages << "sectionEnd <= sectionStart : $sectionEnd <= $sectionStart"
            }
            if (sectionStart < previousSectionStart) { sectionBug = true;
                messages << "sectionStart < previousSectionStart : $sectionStart < $previousSectionStart"
            }
            if (sectionStart < previousSectionEnd) { sectionBug = true;
                messages << "sectionStart < previousSectionEnd : $sectionStart < $previousSectionEnd"
            }
            if (sectionEnd < previousSectionStart) { sectionBug = true;
                messages << "sectionEnd < previousSectionStart : $sectionEnd < $previousSectionStart"
            }
            if (sectionEnd < previousSectionEnd) { sectionBug = true;
                messages << "sectionEnd < previousSectionEnd : $sectionEnd < $previousSectionEnd"
            }

            for (def turn : turns.get(section)) {
                def turnStart = Float.parseFloat(turn.'@startTime')
                def turnEnd =  Float.parseFloat(turn.'@endTime')
                def rawSpeaker = turn.@speaker
                boolean hasSpeaker = rawSpeaker != null && rawSpeaker != "null"
                String spk = hasSpeaker ? rawSpeaker : "nospk"
                def turnTime = turnEnd - turnStart;
                // approximate: whitespace-separated tokens of the turn text
                int nbwords = turn.text().split().length

                if (hasSpeaker) turnsTime += turnTime;
                totalWords += nbwords

                // a turn shared by several speakers counts fully for each of them
                for (String subspk : spk.split(" ")) {
                    String spkname = spkDict[subspk];
                    if (spkname == null) spkname = "nospk"
                    if (!totalWordsBySpk.containsKey(spkname)) {
                        totalWordsBySpk.put(spkname, 0);
                        turnsTimesBySpk.put(spkname, 0);
                        numberOfTurnsBySpk.put(spkname, 0);
                    }
                    totalWordsBySpk.put(spkname, totalWordsBySpk.get(spkname) + nbwords)
                    numberOfTurnsBySpk.put(spkname, numberOfTurnsBySpk.get(spkname) + 1);
                    turnsTimesBySpk.put(spkname, turnsTimesBySpk.get(spkname) + turnTime);
                }
                totalWordsByTopic.put(topic, totalWordsByTopic.get(topic) + nbwords);

                boolean bug = false;

                if (turnEnd <= turnStart) { bug = true;
                    messages << " turnEnd <= turnStart : $turnEnd <= $turnStart"
                }

                // ordering relative to the previous turn
                if (turnStart < previousTurnStart) { bug = true;
                    messages << " turnStart < previousTurnStart : $turnStart < $previousTurnStart"
                }
                if (turnStart < previousTurnEnd) { bug = true;
                    messages << " turnStart < previousTurnEnd : $turnStart < $previousTurnEnd"
                }
                if (turnEnd < previousTurnStart) { bug = true;
                    messages << " turnEnd < previousTurnStart : $turnEnd < $previousTurnStart"
                }
                if (turnEnd < previousTurnEnd) { bug = true;
                    messages << " turnEnd < previousTurnEnd : $turnEnd < $previousTurnEnd"
                }

                // nesting inside the enclosing section
                if (turnStart < sectionStart) { bug = true;
                    messages << " turnStart < sectionStart : $turnStart < $sectionStart"
                }
                if (turnStart > sectionEnd) { bug = true;
                    messages << " turnStart > sectionEnd : $turnStart > $sectionEnd"
                }
                if (turnEnd < sectionStart) { bug = true;
                    messages << " turnEnd < sectionStart : $turnEnd < $sectionStart"
                }
                if (turnEnd > sectionEnd) { bug = true;
                    messages << " turnEnd > sectionEnd : $turnEnd > $sectionEnd"
                }

                // syncs must fall inside their turn; violations are only warnings
                for (def sync : syncs.get(turn)) {
                    def syncTime = Float.parseFloat(sync.'@time')
                    if (syncTime < turnStart) { warning = true;
                        messages << "  syncTime < turnStart : $syncTime < $turnStart"
                    }
                    if (syncTime > turnEnd) { warning = true;
                        messages << "  syncTime > turnEnd : $syncTime > $turnEnd"
                    }
                }

                previousTurnStart = turnStart;
                previousTurnEnd = turnEnd;
                if (bug) {
                    error = true;
                }
            }

            previousSectionStart = sectionStart;
            previousSectionEnd = sectionEnd;
            if (sectionBug) {
                error = true;
            }
        }

        return [error, warning];
    }

    /** Prints the findings of the most recent check, one per line. */
    public void printErrors()
    {
        for (String mess : messages) {
            println mess;
        }
    }

    /** Prints the raw content of the main indexes (debugging aid). */
    public void printAll()
    {
        println "Declared topics: $declaredTopics"
        println "Topics: $topics"

        println "Declared speakers: $declaredSpeakers"
        println "Speakers: $speakers"

        println "Sections: "+sections.size()
        // syncs has one entry per turn, so its size is the number of turns
        println "Turns: "+syncs.size()
        println "Entities: $entities"
    }

    /** Topic descriptions declared by the files seen since the last checkDirectory() call. */
    public static HashSet<String> allTopicDesc = new HashSet<String>();
    /** Speaker names declared by the files seen since the last checkDirectory() call. */
    public static HashSet<String> allSpkName = new HashSet<String>();

    /**
     * Runs checkTRS() on every ".trs" file of a directory, in sorted order.
     * The static collectors {@code allTopicDesc} and {@code allSpkName} are
     * reset first.
     *
     * @param dir the directory to scan; when it cannot be listed (null, missing
     *            or not a directory) a message is printed and nothing is checked
     */
    public static void checkDirectory(File dir) {
        allTopicDesc = new HashSet<String>();
        allSpkName = new HashSet<String>();

        File[] files = (dir == null) ? null : dir.listFiles()
        if (files == null) {
            println "Cannot list TRS directory: $dir"
            return;
        }
        for (File trsFile : files.sort()) {
            if (!trsFile.getName().endsWith(".trs")) continue; // ignore
            checkTRS(trsFile)
        }
    }

    /**
     * Validates one TRS file and prints a full report: time errors, speaker
     * and topic declarations with frequencies, entities, naming errors and
     * statistics.
     *
     * @param trsFile the transcription file to check
     * @return false if a time error, an undeclared speaker or an undeclared
     *         topic was found; true otherwise (warnings and naming errors do
     *         not affect the result)
     */
    public static boolean checkTRS(File trsFile) {
        boolean ret = true;
        println "\n***** $trsFile.name *****"
        def tester = new ValidateTRS(trsFile);

        def (error, warning) = tester.timeCheck();
        if (error) {
            print "\nTIMES: "
            println "errors: "+error+" warnings: "+warning;
            tester.printErrors();
            ret = false;
        }

        println "\nSPEAKERS: id, name, frequency"
        if (tester.speakersCheck()) {
            println "errors:"
            tester.printErrors();
            ret = false;
        }
        tester.printSortedList(tester.speakers, tester.spkDict);

        println "\nSECTIONS: id, title, frequency"
        if (tester.topicsCheck()) {
            println "errors:"
            tester.printErrors();
            ret = false;
        }
        tester.printSortedList(tester.topics, tester.topicDict);

        println "\nENTITIES: value, frequency"
        tester.printSortedList(tester.entities, null);

        println ""
        // a topic/speaker that is used but whose label is not accepted is reported
        for (def topic : tester.declaredTopics) {
            allTopicDesc.add(topic.@desc)
            if (!okTopics.contains(topic.@desc) && tester.topics.contains(topic.@id)) {
                println "topic error: "+topic.@desc
            }
        }
        for (def spk : tester.declaredSpeakers) {
            allSpkName.add(spk.@name)
            if (!checkSpk(spk.@name) && tester.speakers.contains(spk.@id)) {
                println "spk error: "+spk.@name
            }
        }
        DecimalFormat formater = new DecimalFormat("0.00");

        println "Statistics:"
        println " Approx nb words: "+tester.totalWords;
        println " Total time: "+ExecTimer.formatSecs(tester.turnsTime)
        println " Approx word rate (word/sec): "+formater.format(tester.totalWords/tester.turnsTime)
        println " Approx word rate (word/min): "+formater.format(60*(tester.totalWords/tester.turnsTime))
        println ""

        def meanSections = [:];
        for (String key : tester.sectionsTimesByTopic.keySet())
            meanSections.put(key, ExecTimer.formatSecs(tester.sectionsTimesByTopic.get(key) / tester.numberOfSectionByTopic.get(key)))
        def meanTurns = [:];
        for (String key : tester.turnsTimesBySpk.keySet())
            meanTurns.put(key, ExecTimer.formatSecs(tester.turnsTimesBySpk.get(key) / tester.numberOfTurnsBySpk.get(key)))

        // formatted copies for display; the tester's numeric maps are left intact
        def sectionsTimesDisplay = [:];
        for (String key : tester.sectionsTimesByTopic.keySet())
            sectionsTimesDisplay.put(key, ExecTimer.formatSecs(tester.sectionsTimesByTopic.get(key)))
        def turnsTimesDisplay = [:];
        for (String key : tester.turnsTimesBySpk.keySet())
            turnsTimesDisplay.put(key, ExecTimer.formatSecs(tester.turnsTimesBySpk.get(key)))

        println " Number of sections: "+tester.numberOfSection
        println " Sections time: "+ExecTimer.formatSecs(tester.sectionsTime);
        println " Mean time of sections: "+ExecTimer.formatSecs(tester.sectionsTime/tester.numberOfSection)

        println " Sections time by topic: "+sectionsTimesDisplay
        println " Number of sections by topic: "+tester.numberOfSectionByTopic
        println " Mean time of sections by topic: "+meanSections
        println ""

        println " Number of turns: "+tester.numberOfTurn
        println " Turns time: "+ExecTimer.formatSecs(tester.turnsTime);
        println " Mean time of turns: "+ExecTimer.formatSecs(tester.turnsTime/tester.numberOfTurn)

        println " Turns time by spk: "+turnsTimesDisplay
        println " Number of turns by spk: "+tester.numberOfTurnsBySpk
        println " Mean time of turns by spk: "+meanTurns
        println ""

        return ret;
    }

    /**
     * Command-line entry point.
     *
     * @param args optional: args[0] is the directory of TRS files to check;
     *             without it the historical default directory is used
     */
    public static void main(String[] args) {
        File dir = (args != null && args.length > 0) ?
                new File(args[0]) : new File("/home/mdecorde/xml/concattrs/ready")
        ValidateTRS.checkDirectory(dir);
    }
}