Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / transcriber / ValidateTRS.groovy @ 187

History | View | Annotate | Download (15.2 kB)

1
package org.txm.importer.transcriber
2

    
3
import java.text.DecimalFormat
4

    
5
import org.txm.utils.ExecTimer
6
import org.w3c.dom.*
7

    
8
/**
9
 * TRS file time validation for the STDDAD master corpus
10
 * @author mdecorde
11
 *
12
 */
13
class ValidateTRS {
14
        // --- Parsed XML handles ---------------------------------------------
        // NOTE(review): 'doc' is never assigned in the code visible here.
        def doc;
        def root;
        def episodeElem;
        def topicsElem;
        def speakersElem;

        // --- Topics -----------------------------------------------------------
        // declaredTopics: <Topic> declarations; topics: one label per section.
        def declaredTopics = [];
        def topics = [];
        def mandatoryTopics = ["parcours", "ressources", "débouchés", "acteurs", "métier", "images", "autres"]

        // --- Speakers ---------------------------------------------------------
        // declaredSpeakers: <Speaker> declarations; speakers: one entry per turn.
        def declaredSpeakers = [];
        def speakers = [];

        // Declaration lookups: speaker id -> name, topic id -> desc.
        def spkDict = [:];
        def topicDict = [:];

        // --- Document structure -----------------------------------------------
        // turns is keyed by section node, syncs is keyed by turn node.
        def sections = [];
        def turns = new LinkedHashMap();
        def syncs = new LinkedHashMap();
        def entities = [];

        // Messages produced by the most recent check.
        def messages = [];

        // --- Section statistics (filled by timeCheck) ---------------------------
        float sectionsTime = 0;
        int numberOfSection = 0;
        HashMap<String, Float> sectionsTimesByTopic = new HashMap<String, Float>();
        HashMap<String, Integer> numberOfSectionByTopic = new HashMap<String, Integer>();

        // --- Turn statistics (filled by timeCheck) ------------------------------
        float turnsTime = 0;
        int numberOfTurn = 0;
        HashMap<String, Float> turnsTimesBySpk = new HashMap<String, Float>();
        HashMap<String, Integer> numberOfTurnsBySpk = new HashMap<String, Integer>();

        // --- Word statistics (filled by timeCheck) ------------------------------
        int totalWords = 0;
        HashMap<String, Integer> totalWordsByTopic = new HashMap<String, Integer>();
        HashMap<String, Integer> totalWordsBySpk = new HashMap<String, Integer>();

        // Topic descriptions accepted by checkTRS.
        def static okTopics = [
                "débouchés", "parcours", "ressources",
                "métier", "acteurs", "autres", "images"
        ];
52

    
53
        /**
         * Regular expression a speaker name must match in full:
         * "int" or "enq" followed by two digits, or "ext" followed by three digits.
         */
        def static spkPattern = "((int|enq)[0-9][0-9])|(ext[0-9][0-9][0-9])"

        /**
         * Tells whether a speaker name follows the naming convention of {@code spkPattern}.
         *
         * @param spk the speaker name to test, may be null
         * @return true when spk is non-null and matches {@code spkPattern} entirely
         */
        public static boolean checkSpk(String spk)
        {
                // A null name is simply invalid; the shared pattern replaces the duplicated literal.
                return spk != null && spk.matches(spkPattern.toString());
        }
59

    
60
        /**
         * Parses a TRS file and builds every index used by the checks.
         *
         * @param trsFile the TRS file to load
         */
        public ValidateTRS(File trsFile)
        {
                String location = trsFile.toURI().toString()
                root = new XmlParser().parse(location)

                // Keep handles on the three children of the root used by the init* methods.
                topicsElem = root.Topics
                speakersElem = root.Speakers
                episodeElem = root.Episode;

                init();
        }
69

    
70
        /**
         * Placeholder: the time check call is disabled, so nothing is tested.
         *
         * @return always false
         */
        public boolean test() {
                //checkTimes();
                return false
        }
73

    
74
        /**
         * Fills every collection of this validator from the parsed document.
         * Sections are read before turns, turns before syncs, and the
         * speaker/topic/entity lists are read last because they walk sections and turns.
         */
        public void init() {
                // declarations
                initTopicsDecl();
                initSpeakersDecl();

                // document structure, outermost first
                initSections();
                initTurns();
                initSyncs();

                // values actually used in the transcription
                initSpeakers();
                initTopics();
                initEntities();
        }
86

    
87
        /**
         * Collects the 'desc' attribute of every Event of type "entities"
         * found in the turns of all sections.
         */
        public void initEntities() {
                for (def section : sections) {
                        for (def turn : turns.get(section)) {
                                def entityEvents = turn.Event.findAll { event -> event.@type == "entities" }
                                entities.addAll(entityEvents.collect { event -> event.@desc })
                        }
                }
        }
94

    
95
        /**
         * Reads the Topic declarations and indexes their 'desc' by 'id' in topicDict.
         */
        public void initTopicsDecl() {
                declaredTopics = topicsElem.Topic;
                declaredTopics.each { decl ->
                        topicDict.put(decl.@id, decl.@desc)
                }
        }
101

    
102
        /**
         * Reads the Speaker declarations and indexes their 'name' by 'id' in spkDict.
         */
        public void initSpeakersDecl() {
                declaredSpeakers = speakersElem.Speaker;
                declaredSpeakers.each { decl ->
                        spkDict.put(decl.@id, decl.@name)
                }
        }
108

    
109
        /**
         * Stores the Section elements of the episode and their count.
         */
        public void initSections() {
                def found = episodeElem.Section
                sections = found;
                numberOfSection = found.size()
        }
113

    
114
        /**
         * Registers the Turn elements of each section (keyed by the section node)
         * and adds their count to numberOfTurn.
         */
        public void initTurns() {
                for (def section : sections) {
                        def sectionTurns = section.Turn
                        turns.put(section, sectionTurns)
                        numberOfTurn = numberOfTurn + sectionTurns.size()
                }
        }
121

    
122
        /**
         * Registers the Sync elements of each turn, keyed by the turn node.
         */
        public void initSyncs() {
                for (def section : sections) {
                        def sectionTurns = turns.get(section)
                        for (def turn : sectionTurns) {
                                syncs.put(turn, turn.Sync)
                        }
                }
        }
127

    
128
        /**
         * Builds the sorted list of section labels: the 'topic' attribute for
         * sections of type "report", the 'type' attribute otherwise, and
         * "notopic" when the chosen attribute is missing.
         */
        public void initTopics() {
                for (def section : sections) {
                        String label = (section.@type != "report") ? section.@type : section.@topic
                        topics.add(label == null ? "notopic" : label);
                }

                topics.sort()
        }
139

    
140
        /**
         * Builds the sorted list of 'speaker' attribute values, one entry per turn;
         * turns without the attribute are recorded as "nospk".
         */
        public void initSpeakers() {
                for (def section : sections) {
                        for (def turn : turns.get(section)) {
                                String id = turn.@speaker;
                                // only a missing attribute is replaced, an empty one is kept as is
                                speakers.add(id == null ? "nospk" : id);
                        }
                }
                speakers.sort();
        }
149

    
150
        /**
         * Looks for section topics that have no Topic declaration.
         * The labels "nontrans" and "notopic" are not expected to be declared.
         * Resets 'messages' and stores one message per undeclared occurrence.
         *
         * @return true when at least one undeclared topic was found
         */
        public boolean topicsCheck() {
                messages = []
                def declaredIds = declaredTopics.collect { decl -> decl.@id }
                def exempted = ["nontrans", "notopic"]

                def undeclared = topics.findAll { label ->
                        label != null && !declaredIds.contains(label) && !exempted.contains(label)
                }
                for (def label : undeclared) {
                        messages << "missing topic declaration: "+label
                }
                return !undeclared.isEmpty();
        }
161

    
162
        /**
         * Looks for speaker ids used by turns that have no Speaker declaration.
         * A turn may list several ids separated by spaces; each one is checked.
         * The placeholder "nospk" is not expected to be declared.
         * Resets 'messages' and stores one message per undeclared occurrence, so
         * the result can be displayed with printErrors() like the other checks.
         *
         * @return true when at least one undeclared speaker id was found
         */
        public def speakersCheck() {
                messages = []
                boolean ret = false;
                def spksIds = declaredSpeakers.collect { spk -> spk.@id }
                for (def spk : speakers) {
                        if (spk == null) continue; // ignore null spk
                        for (def sspk : spk.split(" ")) {
                                if (!spksIds.contains(sspk) && sspk != "nospk") {
                                        // recorded instead of printed: the method resets 'messages' and callers rely on printErrors()
                                        messages << "missing spk declaration: "+sspk
                                        ret = true;
                                }
                        }
                }
                return ret;
        }
178

    
179
        // Key under which null values are counted by printSortedList.
        String NULL = "null"

        /**
         * Prints a frequency table of the given values, sorted by ascending frequency.
         * With a dictionary, each value is split on spaces, every part is counted
         * separately, and lines are "id TAB label TAB count" where the label falls
         * back to the id when the dictionary has no entry. Without a dictionary,
         * values are counted whole and lines are "value TAB count".
         *
         * @param list the values to count, may contain nulls
         * @param dict id-to-label lookup, or null
         */
        public void printSortedList(def list, def dict) {
                def counts = [:]
                for (def value : list) {
                        def keys
                        if (value == null) {
                                keys = [NULL]
                        } else {
                                def parts = value.split(" ")
                                keys = (dict != null) ? (parts as List) : [value]
                        }
                        for (def key : keys) {
                                def seen = counts.containsKey(key) ? counts[key] : 0
                                counts[key] = seen + 1
                        }
                }

                def rows = counts.collect { key, count -> [key, count] }
                rows = rows.sort { row -> row[1] }

                for (def row : rows) {
                        def id = row[0]
                        def count = row[1]
                        if (dict == null) {
                                println(id+"\t"+count)
                        } else {
                                def label = dict[id]
                                println(id+"\t"+(label != null ? label : id)+"\t"+count)
                        }
                }
        }
219

    
220
        /**
         * Checks the time consistency of sections, turns and syncs, and
         * accumulates the time and word statistics of this validator.
         * <ul>
         * <li>a section or turn must end after it starts and must not start or end
         *     before the previous one started or ended;</li>
         * <li>a turn must lie within its section;</li>
         * <li>a sync must lie within its turn.</li>
         * </ul>
         * Section and turn problems raise the error flag, sync problems raise the
         * warning flag. Resets 'messages' and stores one message per problem.
         * Statistics fields are added to, not reset, on each call.
         *
         * @return a two-element list [error, warning] of booleans
         */
        public def timeCheck() {
                messages = [];
                boolean warning = false;
                boolean error = false
                def previousSectionStart = -1.0f;
                def previousSectionEnd = -1.0f;

                // turn bounds are carried across sections
                def previousTurnStart = -1.0f;
                def previousTurnEnd = -1.0f;

                for (def section : sections) {
                        def sectionStart = Float.parseFloat(section.'@startTime')
                        def sectionEnd = Float.parseFloat(section.'@endTime')
                        // same labelling rule as initTopics()
                        def topic = section.@topic;
                        if (section.@type != "report")
                                topic = section.@type;
                        if (topic == null) topic = "notopic"

                        def time = sectionEnd - sectionStart;
                        if (!sectionsTimesByTopic.containsKey(topic)) {
                                sectionsTimesByTopic.put(topic, 0);
                                numberOfSectionByTopic.put(topic, 0);
                                totalWordsByTopic.put(topic, 0);
                        }
                        if (topic != null && topic != "null" && topic.length() > 0) {
                                sectionsTime += time;
                        }
                        sectionsTimesByTopic.put(topic, sectionsTimesByTopic.get(topic) + time);
                        numberOfSectionByTopic.put(topic, numberOfSectionByTopic.get(topic) + 1);

                        boolean sectionBug = false;

                        if (sectionEnd <= sectionStart) { sectionBug = true;
                                messages << "sectionEnd <= sectionStart : $sectionEnd <= $sectionStart"
                        }

                        if (sectionStart < previousSectionStart) { sectionBug = true;
                                messages << "sectionStart < previousSectionStart : $sectionStart < $previousSectionStart"
                        }
                        if (sectionStart < previousSectionEnd) { sectionBug = true;
                                messages << "sectionStart < previousSectionEnd : $sectionStart < $previousSectionEnd"
                        }
                        if (sectionEnd < previousSectionStart) { sectionBug = true;
                                messages << "sectionEnd < previousSectionStart : $sectionEnd < $previousSectionStart"
                        }
                        if (sectionEnd < previousSectionEnd) { sectionBug = true;
                                messages << "sectionEnd < previousSectionEnd : $sectionEnd < $previousSectionEnd"
                        }

                        for (def turn : turns.get(section)) {
                                def turnStart = Float.parseFloat(turn.'@startTime')
                                def turnEnd = Float.parseFloat(turn.'@endTime')
                                def spk = ""+turn.@speaker;
                                // NOTE(review): this compares the attribute with the text "null"; presumably a
                                // missing attribute makes spk the text "null" through the concatenation above,
                                // which is what the turnsTime guard below tests. Confirm the intent.
                                if (turn.@speaker == "null") spk = "nospk"
                                time = turnEnd - turnStart;
                                // approximate word count: whitespace-separated tokens of the turn text
                                int nbwords = turn.text().split().length

                                // make sure every speaker of the turn has its counters
                                for (String subspk : spk.split(" ")) {
                                        String spkname = spkDict[subspk];
                                        if (spkname == null) spkname = "nospk"
                                        if (!totalWordsBySpk.containsKey(spkname)) {
                                                totalWordsBySpk.put(spkname, 0);
                                                turnsTimesBySpk.put(spkname, 0);
                                                numberOfTurnsBySpk.put(spkname, 0);
                                        }
                                }

                                if (spk != "null") turnsTime += time;
                                totalWords += nbwords
                                // the whole turn is credited to each of its speakers
                                for (String subspk : spk.split(" ")) {
                                        String spkname = spkDict[subspk];
                                        if (spkname == null) spkname = "nospk"
                                        totalWordsBySpk.put(spkname, totalWordsBySpk.get(spkname) + nbwords)
                                        numberOfTurnsBySpk.put(spkname, numberOfTurnsBySpk.get(spkname) + 1);
                                        turnsTimesBySpk.put(spkname, turnsTimesBySpk.get(spkname) + time);
                                }
                                totalWordsByTopic.put(topic, totalWordsByTopic.get(topic) + nbwords);

                                boolean bug = false;

                                if (turnEnd <= turnStart) { bug = true;
                                        messages << " turnEnd <= turnStart : $turnEnd <= $turnStart"
                                }

                                if (turnStart < previousTurnStart) { bug = true;
                                        messages << " turnStart < previousTurnStart : $turnStart < $previousTurnStart"
                                }
                                if (turnStart < previousTurnEnd) { bug = true;
                                        messages << " turnStart < previousTurnEnd : $turnStart < $previousTurnEnd"
                                }
                                // compared with the previous turn, as the message states
                                // (the condition used to test previousSectionStart)
                                if (turnEnd < previousTurnStart) { bug = true;
                                        messages << " turnEnd < previousTurnStart : $turnEnd < $previousTurnStart"
                                }
                                if (turnEnd < previousTurnEnd) { bug = true;
                                        messages << " turnEnd < previousTurnEnd : $turnEnd < $previousTurnEnd"
                                }
                                if (turnStart < sectionStart) { bug = true;
                                        messages << " turnStart < sectionStart : $turnStart < $sectionStart"
                                }
                                if (turnStart > sectionEnd) { bug = true;
                                        messages << " turnStart > sectionEnd : $turnStart > $sectionEnd"
                                }
                                if (turnEnd < sectionStart) { bug = true;
                                        messages << " turnEnd < sectionStart : $turnEnd < $sectionStart"
                                }
                                if (turnEnd > sectionEnd) { bug = true;
                                        messages << " turnEnd > sectionEnd : $turnEnd > $sectionEnd"
                                }

                                for (def sync : syncs.get(turn)) {
                                        boolean syncBug = false;
                                        def syncTime = Float.parseFloat(sync.'@time')
                                        if (syncTime < turnStart) { syncBug = true;
                                                messages << "  syncTime < turnStart : $syncTime < $turnStart"
                                        }
                                        if (syncTime > turnEnd) { syncBug = true;
                                                messages << "  syncTime > turnEnd : $syncTime > $turnEnd"
                                        }

                                        // sync problems are only warnings
                                        if (syncBug) {
                                                warning = true;
                                        }
                                }

                                previousTurnStart = turnStart;
                                previousTurnEnd = turnEnd;
                                if (bug) {
                                        error = true;
                                }
                        }

                        previousSectionStart = sectionStart;
                        previousSectionEnd = sectionEnd;
                        if (sectionBug) {
                                error = true;
                        }
                }

                return [error, warning];
        }
373

    
374
        /**
         * Prints the messages stored by the most recent check, one per line.
         */
        public void printErrors()
        {
                for (def message : messages) {
                        println String.valueOf(message);
                }
        }
379

    
380
        /**
         * Prints the declared and used topics and speakers, the number of
         * sections and turns, and the entities.
         */
        public void printAll()
        {
                def report = [
                        "Declared topics: $declaredTopics",
                        "Topics: $topics",
                        "Declared speakers: $declaredSpeakers",
                        "Speakers: $speakers",
                        "Sections: "+sections.size(),
                        // syncs has one entry per turn
                        "Turns: "+syncs.size(),
                        "Entities: $entities"
                ]
                for (def line : report) {
                        println line
                }
        }
392

    
393
        /** Topic descriptions declared by the files of the last checked directory. */
        public static HashSet<String> allTopicDesc = new HashSet<String>();
        /** Speaker names declared by the files of the last checked directory. */
        public static HashSet<String> allSpkName = new HashSet<String>();

        /**
         * Runs checkTRS on every ".trs" file of a directory, in sorted order.
         * The allTopicDesc and allSpkName sets are reset first.
         *
         * @param dir the directory to scan; when it is null, not a directory or
         *            cannot be listed, a message is printed on the error stream and
         *            nothing is checked
         */
        public static void checkDirectory(File dir) {
                allTopicDesc = new HashSet<String>();
                allSpkName = new HashSet<String>();

                // File.listFiles() returns null when the path is not a directory or on I/O error
                File[] files = (dir == null) ? null : dir.listFiles()
                if (files == null) {
                        System.err.println("Cannot list TRS directory: " + dir)
                        return
                }

                for (File trsFile : files.sort()) {
                        if (!trsFile.getName().endsWith(".trs")) continue; // ignore
                        checkTRS(trsFile)
                }
        }
409

    
410
        /**
         * Validates one TRS file and prints a report: time errors, speaker and
         * topic declaration errors with their frequency tables, entity
         * frequencies, naming errors, and time/word statistics.
         * Declared topic descriptions and speaker names are added to
         * allTopicDesc and allSpkName.
         *
         * @param trsFile the TRS file to validate
         * @return false when a time error, an undeclared speaker or an
         *         undeclared topic was found, true otherwise
         */
        public static boolean checkTRS(File trsFile) {
                boolean ret = true;
                println "\n***** $trsFile.name *****"
                def tester = new ValidateTRS(trsFile);

                // sync-only problems (warnings) are not reported on their own
                def (error, warning) = tester.timeCheck();
                if (error) {
                        print "\nTIMES: "
                        println "errors: "+error+" warnings: "+warning;
                        tester.printErrors();
                        ret = false;
                }

                println "\nSPEAKERS: id, name, frequency"
                if (tester.speakersCheck()) {
                        println "errors:"
                        tester.printErrors();
                        ret = false;
                }
                tester.printSortedList(tester.speakers, tester.spkDict);

                println "\nSECTIONS: id, title, frequency"
                if (tester.topicsCheck()) {
                        println "errors:"
                        tester.printErrors();
                        ret = false;
                }
                tester.printSortedList(tester.topics, tester.topicDict);

                println "\nENTITIES: value, frequency"
                tester.printSortedList(tester.entities, null);

                println ""
                // a declared topic with an unexpected description is reported only when used
                for (def topic : tester.declaredTopics) {
                        allTopicDesc.add(topic.@desc)
                        if (!okTopics.contains(topic.@desc)) {
                                if (tester.topics.contains(topic.@id))
                                        println "topic error: "+topic.@desc
                        }
                }
                // a declared speaker with a missing or malformed name is reported only when used
                for (def spk : tester.declaredSpeakers) {
                        def name = spk.@name
                        allSpkName.add(name)
                        if (name == null || !checkSpk(name)) {
                                if (tester.speakers.contains(spk.@id))
                                        println "spk error: "+name
                        }
                }
                DecimalFormat formater = new DecimalFormat("0.00");

                println "Statistics:"
                println " Approx nb words: "+tester.totalWords;
                println " Total time: "+ExecTimer.formatSecs(tester.turnsTime)
                println " Approx word rate (word/sec): "+formater.format(tester.totalWords/tester.turnsTime)
                println " Approx word rate (word/min): "+formater.format(60*(tester.totalWords/tester.turnsTime))
                println ""

                // Formatted values go into local maps so the tester's numeric statistics stay numeric.
                def meanSections = [:];
                def sectionsTimesText = [:];
                for (String key : tester.sectionsTimesByTopic.keySet()) {
                        def total = tester.sectionsTimesByTopic.get(key)
                        meanSections.put(key, ExecTimer.formatSecs(total / tester.numberOfSectionByTopic.get(key)))
                        sectionsTimesText.put(key, ExecTimer.formatSecs(total))
                }
                def meanTurns = [:];
                def turnsTimesText = [:];
                for (String key : tester.turnsTimesBySpk.keySet()) {
                        def total = tester.turnsTimesBySpk.get(key)
                        meanTurns.put(key, ExecTimer.formatSecs(total / tester.numberOfTurnsBySpk.get(key)))
                        turnsTimesText.put(key, ExecTimer.formatSecs(total))
                }

                println " Number of sections: "+tester.numberOfSection
                println " Sections time: "+ExecTimer.formatSecs(tester.sectionsTime);
                println " Mean time of sections: "+ExecTimer.formatSecs(tester.sectionsTime/tester.numberOfSection)

                println " Sections time by topic: "+sectionsTimesText
                println " Number of sections by topic: "+tester.numberOfSectionByTopic
                println " Mean time of sections by topic: "+meanSections
                println ""

                println " Number of turns: "+tester.numberOfTurn
                println " Turns time: "+ExecTimer.formatSecs(tester.turnsTime);
                println " Mean time of turns: "+ExecTimer.formatSecs(tester.turnsTime/tester.numberOfTurn)

                println " Turns time by spk: "+turnsTimesText
                println " Number of turns by spk: "+tester.numberOfTurnsBySpk
                println " Mean time of turns by spk: "+meanTurns
                println ""

                return ret;
        }
502

    
503
        /**
         * Command-line entry point: validates every ".trs" file of a directory.
         *
         * @param args optional; args[0] is the directory to check. Without
         *             arguments the historical default directory is used.
         */
        public static void main(String[] args) {
                String path = (args != null && args.length > 0) ? args[0] : "/home/mdecorde/xml/concattrs/ready"
                File dir = new File(path)
                ValidateTRS.checkDirectory(dir);
        }
509
}