Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / exploit / UnitsInterdistanceMacro.groovy @ 2144

History | View | Annotate | Download (4.8 kB)

1
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author mdecorde
4
// @author sheiden
5
// STANDARD DECLARATIONS
6
package org.txm.macro.urs.exploit
7

    
8
import org.kohsuke.args4j.*
9
import groovy.transform.Field
10
import org.txm.*
11
import org.txm.macro.urs.AnalecUtils
12
import visuAnalec.elements.*
13
import org.txm.rcp.swt.widget.parameters.*
14
import org.txm.annotation.urs.*
15
import org.txm.searchengine.cqp.corpus.*
16
import org.apache.commons.lang.StringUtils;
17

    
18
// Collect the corpora to process from the Corpus view selection:
// a corpus is taken as is, a partition contributes each of its parts,
// any other kind of selected object is ignored.
def selection = []
for (def item : corpusViewSelections) {
    switch (item) {
        case CQPCorpus:
            selection.add(item)
            break
        case Partition:
            selection.addAll(item.getParts())
            break
    }
}
23

    
24
// Nothing usable was selected: tell the user and stop the macro.
if (selection.isEmpty()) {
    println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
    return false
}

// Otherwise call compute(false) on every selected corpus before going further.
selection.each { it.compute(false) }
30

    
31
// BEGINNING OF PARAMETERS
32
// URSQL query (TYPE@PROP=REGEX) selecting the schemas to process.
@Field
@Option(name="schema_ursql", usage="TYPE@PROP=REGEX", widget="String", required=true, def="CHAINE")
String schema_ursql

// Minimal schema size; forwarded to AnalecUtils.selectSchemasInCorpus.
@Field
@Option(name="minimum_schema_size", usage="minimal schema size", widget="Integer", required=true, def="3")
int minimum_schema_size

// Maximal schema size; forwarded to AnalecUtils.selectSchemasInCorpus.
@Field
@Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=true, def="9999999")
int maximum_schema_size

// NOTE(review): this parameter is not referenced in the visible part of the script.
@Field
@Option(name="schema_display_property_name",usage="", widget="String", required=false, def="REF")
String schema_display_property_name

// URSQL query (TYPE@PROP=REGEX) filtering the units of each selected schema.
@Field
@Option(name="unit_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION")
String unit_ursql

// Whether units must be strictly included into corpus matches.
@Field
@Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
boolean strict_inclusion

// Verbosity label chosen in the dialog; the script converts it to an integer level afterwards.
@Field
@Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF        ON        ALL        REALLY ALL", required=true, def="OFF")
debug
48

    
49
// Ask the user for the parameter values; stop the macro when the dialog does not return true.
if (!ParametersDialog.open(this)) {
    return;
}

// Translate the debug label into a numeric level; any other value is left untouched.
switch (debug) {
    case "OFF":
        debug = 0
        break
    case "ON":
        debug = 1
        break
    case "ALL":
        debug = 2
        break
    case "REALLY ALL":
        debug = 3
        break
}
51

    
52
// For each selected corpus: select the schemas matching schema_ursql, keep in each schema the
// units matching unit_ursql, sort them, and measure the gap between consecutive units
// (getDeb() of a unit minus getFin() of the previous one, minus 1).
//  - "distances" collects every gap between two consecutive units (one value per pair);
//  - "cadences" collects, for every unit, the smallest of its gaps to the previous and to the
//    next unit, with negative gaps counted as 0.
// The mean, the median and the quartiles of both series are then printed.
for (def corpus : selection) {
    def analecCorpus = URSCorpora.getCorpus(corpus)

    // Both URSQL queries are checked against the corpus before any computation.
    // As before, a failed check stops the whole macro, not only the current corpus.
    def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
    if (errors.size() > 0) {
        println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
        return;
    }

    errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
    if (errors.size() > 0) {
        println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
        return;
    }

    def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion);
    def distances = []
    def nDistances = 0
    def cadences = []

    // Placeholder for "no neighbour on that side"; never recorded because every processed
    // unit has at least one real neighbour (see the size check below).
    final int noNeighbour = Integer.MAX_VALUE

    for (def schema : schemas) {
        def allUnites = schema.getUnitesSousjacentesNonTriees()
        def units = AnalecUtils.filterElements(debug, allUnites, unit_ursql)

        // A schema left with fewer than two units has no inter-unit gap to measure.
        // Skipping it keeps the "no neighbour" placeholder out of the cadences.
        if (units.size() < 2) continue

        Collections.sort(units)

        for (int i = 0 ; i < units.size() ; i++) {
            int d1 = noNeighbour // gap to the next unit
            int d2 = noNeighbour // gap to the previous unit

            if (i < units.size() - 1) {
                d1 = units[i+1].getDeb() - units[i].getFin() - 1
                if (d1 < 0) d1 = 0 // this unit runs past the start of the next one
            }

            if (i > 0) {
                d2 = units[i].getDeb() - units[i-1].getFin() - 1
                // The raw gap is recorded in the distances (it may be negative),
                // only the cadence uses the value clamped to 0.
                distances << d2
                nDistances++
                if (d2 < 0) d2 = 0
            }

            cadences << Math.min(d1, d2)
        }
    }

    // Without any pair of consecutive units there is nothing to average: sum() of an empty
    // list is null and the means would divide by zero.
    if (nDistances == 0) {
        println "** $corpus: no pair of consecutive units found for schema URSQL '$schema_ursql' and unit URSQL '$unit_ursql': no statistics to display."
        continue
    }

    distances = distances.sort()
    cadences = cadences.sort()

    int distances_total = distances.sum()
    int cadences_total = cadences.sum()
    // coef and cadence are assigned without declaration, so they are stored in the script binding.
    coef = (distances_total / nDistances)
    // There is one cadence per unit, so the mean is taken over cadences.size(),
    // which is also what the printed formula shows.
    cadence = (cadences_total / cadences.size())

    println "$corpus distances:"
    if (debug > 0) println "distances $distances"
    println "distance moyenne : $distances_total / ${distances.size()} = $coef"
    println "distance medianne : "+distances[(int)(distances.size() / 2)]
    println "distance quartils : "+distances[0]+" "+distances[(int)(distances.size() / 4)] + " "+distances[(int)(distances.size() / 2)]+" "+distances[(int)(3*distances.size() / 4)]+" "+distances[(int)(distances.size() -1)]
    if (debug > 0) println "cadences $cadences"
    println "cadence moyenne : $cadences_total / ${cadences.size()} = $cadence"
    println "cadence medianne : "+cadences[(int)(cadences.size() / 2)]
    println "cadence quartils : "+cadences[0]+" "+cadences[(int)(cadences.size() / 4)] + " "+cadences[(int)(cadences.size() / 2)]+" "+cadences[(int)(3*cadences.size() / 4)]+" "+cadences[(int)(cadences.size() -1)]

    //return ["result":coef, "result2":cadence, "data":["distances":distances, "nDistances":nDistances, "cadences":cadences]]
}