Révision 2094
| tmp/org.txm.analec.rcp/src/org/txm/macroprototypes/urs/misc/ChercherRemplacer.txt (revision 2094) | ||
|---|---|---|
| 1 |
Macro ChercherRemplacer |
|
| 2 |
Auteur : Matthieu QUIGNARD |
|
| 3 |
Version : 05 Février 2019 |
|
| 4 |
|
|
| 5 |
Retouche la valeur d'une propriété pour la remplacer par une autre. |
|
| 6 |
Par exemple : CATEGORIE=PRO.CHECK => CATEGORIE=ERREUR |
|
| 7 |
|
|
| 8 |
Possibilité d'inclure aussi les mentions dont la valeur initiale est vide. |
|
| 9 |
Par exemple : CATEGORIE= => CATEGORIE=ERREUR |
|
| 10 |
|
|
| 11 |
NB : on peut utiliser cette macro pour retoucher le nom des référents. |
|
| 12 |
Exemple : REF=roi de france => REF=Le Roi de France |
|
| 13 |
|
|
| 0 | 14 | |
| tmp/org.txm.analec.rcp/src/org/txm/macroprototypes/urs/misc/ChercherSupprimerMacro.groovy (revision 2094) | ||
|---|---|---|
| 1 |
// ChercherSupprimer |
|
| 2 |
// Auteur Matthieu Quignard |
|
| 3 |
// Date : 14 janvier 2019 |
|
| 4 |
|
|
| 5 |
/********** |
|
| 6 |
Sert à supprimer des mentions qui ont une valeur particulière attribuée |
|
| 7 |
Par exemple : CATEGORIE=ERREUR |
|
| 8 |
ou bien : REF=NON_REF |
|
| 9 |
ou encore : CHECK= |
|
| 10 |
|
|
| 11 |
ATTENTION : CETTE MACRO N'EST PAS REVERSIBLE |
|
| 12 |
***********/ |
|
| 13 |
|
|
| 14 |
package org.txm.macroprototypes.urs.misc |
|
| 15 |
|
|
| 16 |
import org.apache.commons.lang.* |
|
| 17 |
import org.kohsuke.args4j.* |
|
| 18 |
import groovy.transform.* |
|
| 19 |
import org.txm.* |
|
| 20 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 21 |
import org.txm.analec.* |
|
| 22 |
import org.txm.searchengine.cqp.* |
|
| 23 |
import org.txm.searchengine.cqp.corpus.* |
|
| 24 |
import visuAnalec.Message.* |
|
| 25 |
import visuAnalec.donnees.* |
|
| 26 |
import visuAnalec.elements.* |
|
| 27 |
import visuAnalec.vue.* |
|
| 28 |
|
|
| 29 |
// CORPS DU SCRIPT |
|
| 30 |
|
|
| 31 |
if (!(corpusViewSelection instanceof MainCorpus)) {
|
|
| 32 |
println "Corpora selection is not a Corpus" |
|
| 33 |
return |
|
| 34 |
} |
|
| 35 |
|
|
| 36 |
// BEGINNING OF PARAMETERS |
|
| 37 |
@Field @Option(name="unit_type", usage="Unité", widget="String", required=true, def="MENTION") |
|
| 38 |
def unit_type |
|
| 39 |
@Field @Option(name="prop_name", usage="Propriété", widget="String", required=true, def="CATEGORIE") |
|
| 40 |
def prop_name |
|
| 41 |
@Field @Option(name="val_cherche", usage="Valeur recherchée", widget="String", required=true, def="") |
|
| 42 |
def val_cherche |
|
| 43 |
@Field @Option(name="inclureVides", usage="Inclure les valeurs vides", widget="Boolean", required=true, def="true") |
|
| 44 |
def inclureVides |
|
| 45 |
|
|
| 46 |
if (!ParametersDialog.open(this)) return |
|
| 47 |
|
|
| 48 |
corpus = corpusViewSelection |
|
| 49 |
analecCorpus = AnalecCorpora.getCorpus(corpus.getName()) |
|
| 50 |
vue = AnalecCorpora.getVue(corpus.getName()) |
|
| 51 |
structure = analecCorpus.getStructure() |
|
| 52 |
|
|
| 53 |
if (!structure.getUnites().contains(unit_type)) { // check if the structure contains the unit_type units
|
|
| 54 |
println "Erreur : le corpus ne contient d'unité de type $unit_type" |
|
| 55 |
println "Script terminé" |
|
| 56 |
return |
|
| 57 |
} |
|
| 58 |
|
|
| 59 |
if (!structure.getUniteProperties(unit_type).contains(prop_name)) {
|
|
| 60 |
println "Erreur : les unités $unit_type n'ont pas de propriété $prop_name" |
|
| 61 |
println "Script terminé" |
|
| 62 |
return |
|
| 63 |
} |
|
| 64 |
|
|
| 65 |
println "Option 'inclure les valeurs vides' : $inclureVides" |
|
| 66 |
|
|
| 67 |
def nDeleted = 0 |
|
| 68 |
def nIgnored = 0 |
|
| 69 |
|
|
| 70 |
def units = analecCorpus.getUnites(unit_type) |
|
| 71 |
units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
|
|
| 72 |
|
|
| 73 |
def garbageBin = [] |
|
| 74 |
|
|
| 75 |
for (Unite unit : units) { // process all units
|
|
| 76 |
def val = unit.getProp( prop_name ) |
|
| 77 |
|
|
| 78 |
if ( (val == val_cherche) || ( inclureVides && (val == "")) ) {
|
|
| 79 |
garbageBin.add( unit ) |
|
| 80 |
nDeleted++ |
|
| 81 |
} else {
|
|
| 82 |
nIgnored++ |
|
| 83 |
} |
|
| 84 |
} |
|
| 85 |
|
|
| 86 |
// Suppression effective des unités ciblées |
|
| 87 |
garbageBin.each {
|
|
| 88 |
analecCorpus.supUnite( it ) |
|
| 89 |
} |
|
| 90 |
|
|
| 91 |
if (nDeleted > 0) corpus.setIsModified(true); |
|
| 92 |
|
|
| 93 |
println "Result:" |
|
| 94 |
println "- $nDeleted units of type $unit_type have been deleted." |
|
| 95 |
println "- $nIgnored units of type $unit_type have not been modified." |
|
| 96 |
|
|
| 97 |
AnalecCorpora.getVue(analecCorpus).retablirVueParDefaut() |
|
| tmp/org.txm.analec.rcp/src/org/txm/macroprototypes/urs/misc/ChercherRemplacerMacro.groovy (revision 2094) | ||
|---|---|---|
| 1 |
// ChercherRemplacer |
|
| 2 |
// Auteur Matthieu Quignard |
|
| 3 |
// Date : 14 janvier 2019 |
|
| 4 |
|
|
| 5 |
/********** |
|
| 6 |
Sert à retoucher une valeur attribuée à une mention et la remplacer par une autre |
|
| 7 |
Par exemple : CATEGORIE=PRO.CHECK => CATEGORIE=PRO.PER |
|
| 8 |
ou bien : REF=roi de France => REF=Le Roi de France |
|
| 9 |
ou encore : REF= => REF=<EMPTY> |
|
| 10 |
***********/ |
|
| 11 |
|
|
| 12 |
package org.txm.macroprototypes.urs.misc |
|
| 13 |
|
|
| 14 |
import org.apache.commons.lang.* |
|
| 15 |
import org.kohsuke.args4j.* |
|
| 16 |
import groovy.transform.* |
|
| 17 |
import org.txm.* |
|
| 18 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 19 |
import org.txm.analec.* |
|
| 20 |
import org.txm.searchengine.cqp.* |
|
| 21 |
import org.txm.searchengine.cqp.corpus.* |
|
| 22 |
import visuAnalec.Message.* |
|
| 23 |
import visuAnalec.donnees.* |
|
| 24 |
import visuAnalec.elements.* |
|
| 25 |
import visuAnalec.vue.* |
|
| 26 |
|
|
| 27 |
// CORPS DU SCRIPT |
|
| 28 |
|
|
| 29 |
if (!(corpusViewSelection instanceof MainCorpus)) {
|
|
| 30 |
println "Corpora selection is not a Corpus" |
|
| 31 |
return |
|
| 32 |
} |
|
| 33 |
|
|
| 34 |
// BEGINNING OF PARAMETERS |
|
| 35 |
@Field @Option(name="unit_type", usage="Unité", widget="String", required=true, def="MENTION") |
|
| 36 |
def unit_type |
|
| 37 |
@Field @Option(name="prop_name", usage="Propriété", widget="String", required=true, def="CATEGORIE") |
|
| 38 |
def prop_name |
|
| 39 |
@Field @Option(name="val_cherche", usage="Valeur recherchée", widget="String", required=true, def="") |
|
| 40 |
def val_cherche |
|
| 41 |
@Field @Option(name="val_remplace", usage="Valeur de remplacement", widget="String", required=true, def="") |
|
| 42 |
def val_remplace |
|
| 43 |
@Field @Option(name="inclureVides", usage="Inclure les valeurs vides", widget="Boolean", required=true, def="true") |
|
| 44 |
def inclureVides |
|
| 45 |
|
|
| 46 |
if (!ParametersDialog.open(this)) return |
|
| 47 |
|
|
| 48 |
corpus = corpusViewSelection |
|
| 49 |
analecCorpus = AnalecCorpora.getCorpus(corpus.getName()) |
|
| 50 |
vue = AnalecCorpora.getVue(corpus.getName()) |
|
| 51 |
structure = analecCorpus.getStructure() |
|
| 52 |
|
|
| 53 |
if (!structure.getUnites().contains(unit_type)) { // check if the structure contains the unit_type units
|
|
| 54 |
println "Erreur : le corpus ne contient d'unité de type $unit_type" |
|
| 55 |
println "Script terminé" |
|
| 56 |
return |
|
| 57 |
} |
|
| 58 |
|
|
| 59 |
if (!structure.getUniteProperties(unit_type).contains(prop_name)) {
|
|
| 60 |
println "Erreur : les unités $unit_type n'ont pas de propriété $prop_name" |
|
| 61 |
println "Script terminé" |
|
| 62 |
return |
|
| 63 |
} |
|
| 64 |
|
|
| 65 |
println "Option 'inclure les valeurs vides' : $inclureVides" |
|
| 66 |
|
|
| 67 |
def nModified = 0 |
|
| 68 |
def nIgnored = 0 |
|
| 69 |
|
|
| 70 |
def units = analecCorpus.getUnites(unit_type) |
|
| 71 |
units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
|
|
| 72 |
|
|
| 73 |
for (Unite unit : units) { // process all units
|
|
| 74 |
def val = unit.getProp( prop_name ) |
|
| 75 |
|
|
| 76 |
if ( (val == val_cherche) || ( inclureVides && (val == "")) ) {
|
|
| 77 |
vue.setValeurChamp(unit, prop_name, val_remplace) |
|
| 78 |
nModified++ |
|
| 79 |
} else {
|
|
| 80 |
nIgnored++ |
|
| 81 |
} |
|
| 82 |
} |
|
| 83 |
|
|
| 84 |
|
|
| 85 |
|
|
| 86 |
if (nModified > 0) corpus.setIsModified(true); |
|
| 87 |
|
|
| 88 |
println "Result:" |
|
| 89 |
println "- $nModified units of type $unit_type have been modified." |
|
| 90 |
println "- $nIgnored units of type $unit_type have not been modified." |
|
| 91 |
|
|
| 92 |
AnalecCorpora.getVue(analecCorpus).retablirVueParDefaut() |
|
| tmp/org.txm.analec.rcp/src/org/txm/macroprototypes/urs/misc/ChercherSupprimer.txt (revision 2094) | ||
|---|---|---|
| 1 |
Macro ChercherSupprimer |
|
| 2 |
Auteur : Matthieu QUIGNARD |
|
| 3 |
Version : 05 Février 2019 |
|
| 4 |
|
|
| 5 |
Supprime de façon **DEFINITIVE** les mentions qui ont une certaine valeur de propriété. |
|
| 6 |
Comme pour ChercherRemplacer, on peut choisir d'inclure aussi |
|
| 7 |
les mentions qui ont une valeur vide pour la propriété donnée. |
|
| 8 |
|
|
| 9 |
Par exemple : |
|
| 10 |
CATEGORIE=ERREUR |
|
| 11 |
CHECK= |
|
| 12 |
REF=NON_REF (suppression des mentions non référentielles) |
|
| 13 |
REF=SI (suppression de tous les singletons) |
|
| 14 |
|
|
| 15 |
|
|
| 0 | 16 | |
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/UnitsSummaryMacro.groovy (revision 2094) | ||
|---|---|---|
| 43 | 43 |
int maximum_schema_size |
| 44 | 44 |
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION") |
| 45 | 45 |
String unit_ursql |
| 46 |
@Field @Option(name="limit_distance_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 47 |
int limit_distance_in_schema
|
|
| 48 |
@Field @Option(name="limit_cql", usage="CQL to build structure limits", widget="Query", required=false, def="")
|
|
| 49 |
limit_cql
|
|
| 46 |
@Field @Option(name="position_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 47 |
int position_in_schema
|
|
| 48 |
@Field @Option(name="cql_limit", usage="CQL to build structure limits", widget="Query", required=false, def="")
|
|
| 49 |
cql_limit
|
|
| 50 | 50 |
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=false, def="true") |
| 51 | 51 |
boolean strict_inclusion |
| 52 |
@Field @Option(name="limit_distance", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 53 |
int limit_distance
|
|
| 52 |
@Field @Option(name="position", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 53 |
int position
|
|
| 54 | 54 |
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF") |
| 55 | 55 |
debug |
| 56 | 56 |
if (!ParametersDialog.open(this)) return |
| ... | ... | |
| 70 | 70 |
props.addAll(analecCorpus.getStructure().getUniteProperties(type)); |
| 71 | 71 |
|
| 72 | 72 |
def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, |
| 73 |
unit_ursql, limit_distance_in_schema, limit_cql, strict_inclusion, limit_distance);
|
|
| 73 |
unit_ursql, position_in_schema, cql_limit, strict_inclusion, position);
|
|
| 74 | 74 |
|
| 75 | 75 |
allresults[corpus] = selectedUnits; |
| 76 | 76 |
} |
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/UnitsReferentialDensityMacro.groovy (revision 2094) | ||
|---|---|---|
| 20 | 20 |
|
| 21 | 21 |
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION") |
| 22 | 22 |
String unit_ursql |
| 23 |
@Field @Option(name="limit_cql", usage="CQL to build structure limits", widget="Query", required=true, def="<div> [] expand to div")
|
|
| 24 |
limit_cql
|
|
| 23 |
@Field @Option(name="cql_limit", usage="CQL to build structure limits", widget="Query", required=true, def="<div> [] expand to div")
|
|
| 24 |
cql_limit
|
|
| 25 | 25 |
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true") |
| 26 | 26 |
strict_inclusion |
| 27 |
@Field @Option(name="limit_distance", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 28 |
limit_distance
|
|
| 27 |
@Field @Option(name="position", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 28 |
position
|
|
| 29 | 29 |
|
| 30 | 30 |
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF") |
| 31 | 31 |
debug |
| ... | ... | |
| 40 | 40 |
int nMots = corpus.getSize(); |
| 41 | 41 |
|
| 42 | 42 |
def units = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, "", 0, 0, |
| 43 |
unit_ursql, 0, limit_cql, strict_inclusion, limit_distance);
|
|
| 43 |
unit_ursql, 0, cql_limit, strict_inclusion, position);
|
|
| 44 | 44 |
|
| 45 | 45 |
int nUnites = units.size(); |
| 46 | 46 |
|
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/UnitsStabilityScoreMacro.groovy (revision 2094) | ||
|---|---|---|
| 36 | 36 |
@Field @Option(name="word_property", usage="", widget="String", required=false, def="word") |
| 37 | 37 |
String word_property |
| 38 | 38 |
|
| 39 |
@Field @Option(name="show_values", usage="", widget="Boolean", required=false, def="false") |
|
| 40 |
boolean show_values |
|
| 41 |
|
|
| 39 | 42 |
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF") |
| 40 | 43 |
debug |
| 41 | 44 |
|
| ... | ... | |
| 60 | 63 |
def CQI = CQPSearchEngine.getCqiClient() |
| 61 | 64 |
|
| 62 | 65 |
def prop = corpus.getProperty(word_property) |
| 63 |
|
|
| 66 |
if (prop == null) { // no CQP property called $word_property
|
|
| 67 |
errors=AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, AnalecUtils.getFilterParameters(unit_ursql)[0], word_property) |
|
| 68 |
if (errors.size() > 0) {
|
|
| 69 |
println "** $word_property unit property cannot be computed in the corpus with types: $errors." |
|
| 70 |
return; |
|
| 71 |
} |
|
| 72 |
} |
|
| 64 | 73 |
def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, 999999); |
| 65 | 74 |
allFormesSet = new HashSet(); |
| 66 |
nUnitesGrandTotal = 0; |
|
| 67 | 75 |
def coefs = [] |
| 68 | 76 |
int n = 1 |
| 77 |
|
|
| 78 |
int nUnitesAllSchemas = 0 |
|
| 79 |
int nUnitesTotalSchemas = 0 |
|
| 80 |
|
|
| 69 | 81 |
for (def schema : schemas) {
|
| 70 | 82 |
def formesSet = new HashSet(); // contient toutes les formes du CR courant |
| 71 | 83 |
nUnitesTotal = 0; |
| ... | ... | |
| 73 | 85 |
def allUnites = schema.getUnitesSousjacentesNonTriees() |
| 74 | 86 |
|
| 75 | 87 |
def units = AnalecUtils.filterElements(debug, allUnites, unit_ursql) |
| 76 |
def nUnites = units.size() |
|
| 88 |
def nUnites = schema.getUnitesSousjacentes().size() |
|
| 89 |
def nUnitesTotal = units.size() |
|
| 77 | 90 |
for (def unit : units) {
|
| 78 | 91 |
|
| 79 | 92 |
String forme = null; |
| ... | ... | |
| 88 | 101 |
} |
| 89 | 102 |
|
| 90 | 103 |
formesSet.add(forme) |
| 91 |
|
|
| 92 |
nUnitesTotal++ |
|
| 93 | 104 |
} |
| 94 |
if (formesSet.size() == 0) {
|
|
| 95 |
coef = -1 |
|
| 105 |
|
|
| 106 |
if (formesSet.size() == 0 || nUnitesTotal == 0) {
|
|
| 107 |
coef = "NA" |
|
| 96 | 108 |
} else {
|
| 97 | 109 |
coef = (nUnitesTotal/formesSet.size()) |
| 98 | 110 |
} |
| 99 | 111 |
coefs << coef |
| 100 |
nUnitesGrandTotal += nUnitesTotal; |
|
| 101 | 112 |
allFormesSet.addAll(formesSet) |
| 102 | 113 |
|
| 103 |
if (schema_display_property_name != null) {
|
|
| 114 |
if (schema_display_property_name != null && schema_display_property_name.length() > 0) {
|
|
| 104 | 115 |
print schema.getProp(schema_display_property_name) |
| 105 | 116 |
} else {
|
| 106 | 117 |
print schema_ursql+"-"+n+" : " |
| 107 | 118 |
} |
| 108 | 119 |
|
| 109 |
println " ($nUnites units) : $nUnitesTotal selected units / ${formesSet.size()} forms = $coef"
|
|
| 120 |
println " ($nUnites units) : $nUnitesTotal selected units / ${formesSet.size()} ${word_property}s = $coef"
|
|
| 121 |
if (show_values) {
|
|
| 122 |
println "\t${word_property}s="+formesSet
|
|
| 123 |
} |
|
| 110 | 124 |
n++ |
| 125 |
|
|
| 126 |
nUnitesAllSchemas += nUnites |
|
| 127 |
nUnitesTotalSchemas += nUnitesTotal |
|
| 111 | 128 |
} |
| 112 | 129 |
|
| 113 |
return ["result":coefs, "data":["nUnitesTotal":nUnitesGrandTotal, "allFormesSet":allFormesSet], "coef":(nUnitesGrandTotal/allFormesSet.size())] |
|
| 130 |
coef = nUnitesTotalSchemas/allFormesSet.size() |
|
| 131 |
//println "ALL : ($nUnitesAllSchemas units) : $nUnitesTotalSchemas selected units / ${allFormesSet.size()} ${word_property}s = $coef"
|
|
| 132 |
|
|
| 133 |
return ["result":coefs, "data":["nUnitesTotal":nUnitesTotalSchemas, "allFormesSet":allFormesSet], "coef":(coef)] |
|
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/UnitsIndexMacro.groovy (revision 2094) | ||
|---|---|---|
| 51 | 51 |
String unit_property_display |
| 52 | 52 |
@Field @Option(name="cqp_property_display", usage="Word property to display instead of the unit property", widget="String", required=false, def="") |
| 53 | 53 |
String cqp_property_display |
| 54 |
@Field @Option(name="limit_distance_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 55 |
int limit_distance_in_schema
|
|
| 56 |
@Field @Option(name="limit_cql", usage="CQL to build structure limits", widget="Query", required=true, def="<div> [] expand to div")
|
|
| 57 |
limit_cql
|
|
| 54 |
@Field @Option(name="position_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 55 |
int position_in_schema
|
|
| 56 |
@Field @Option(name="cql_limit", usage="CQL to build structure limits", widget="Query", required=true, def="<div> [] expand to div")
|
|
| 57 |
cql_limit
|
|
| 58 | 58 |
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true") |
| 59 | 59 |
strict_inclusion |
| 60 |
@Field @Option(name="limit_distance", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 61 |
limit_distance
|
|
| 62 |
@Field @Option(name="output_2D", usage="output barplot or 3D plot", widget="Boolean", required=true, def="true") |
|
| 63 |
output_2D |
|
| 60 |
@Field @Option(name="position", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 61 |
position
|
|
| 62 |
//@Field @Option(name="output_2D", usage="output barplot or 3D plot", widget="Boolean", required=true, def="true")
|
|
| 63 |
output_2D = true
|
|
| 64 | 64 |
@Field @Option(name="output_showlegend", usage="output barplot or 3D plot", widget="Boolean", required=true, def="true") |
| 65 | 65 |
output_showlegend |
| 66 | 66 |
@Field @Option(name="output_fmin", usage="minimal frequency displayed", widget="Integer", required=true, def="0") |
| ... | ... | |
| 115 | 115 |
} |
| 116 | 116 |
|
| 117 | 117 |
def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, |
| 118 |
unit_ursql, limit_distance_in_schema, limit_cql, strict_inclusion, limit_distance);
|
|
| 118 |
unit_ursql, position_in_schema, cql_limit, strict_inclusion, position);
|
|
| 119 | 119 |
selectedUnits = new HashSet(selectedUnits) |
| 120 | 120 |
def counts = null |
| 121 | 121 |
if (cqp_property_display != null && cqp_property_display.length() > 0) {
|
| ... | ... | |
| 163 | 163 |
else { corpus = corpusViewSelection }
|
| 164 | 164 |
|
| 165 | 165 |
def title = "${corpus.getMainCorpus()}.${corpusViewSelection}\n${unit_ursql}"
|
| 166 |
title += "[${limit_distance}]."
|
|
| 166 |
title += "[${position}]."
|
|
| 167 | 167 |
if (cqp_property_display.length() > 0) title += "${cqp_property_display} frequencies"
|
| 168 | 168 |
else if (unit_property_display.length() > 0) title += "${unit_property_display} frequencies"
|
| 169 |
if (limit_cql != null && !limit_cql.getQueryString().equals("\"\"")) title += "\n(${limit_cql} limits)"
|
|
| 169 |
if (cql_limit != null && !cql_limit.getQueryString().equals("\"\"")) title += "\n(${cql_limit} limits)"
|
|
| 170 | 170 |
|
| 171 |
if (cqp_property_display.length() > 0) println "Index de la propriété $cqp_property_display des mots des unités $unit_ursql[$limit_distance] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
|
|
| 172 |
else println "Index de la propriété $unit_property_display des unités $unit_ursql[$limit_distance] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
|
|
| 171 |
if (cqp_property_display.length() > 0) println "Index de la propriété $cqp_property_display des mots des unités $unit_ursql[$position] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
|
|
| 172 |
else println "Index de la propriété $unit_property_display des unités $unit_ursql[$position] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
|
|
| 173 | 173 |
println "$unit_property_display\t"+selection.join("\t")
|
| 174 | 174 |
|
| 175 | 175 |
keys.eachWithIndex { prop_val, i ->
|
| ... | ... | |
| 223 | 223 |
library(latticeExtra) |
| 224 | 224 |
library(lattice) |
| 225 | 225 |
library(RColorBrewer) |
| 226 |
cloud(value~col+row, m, panel.3d.cloud=panel.3dbars, col.facet='grey', xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1), par.settings = list(axis.line = list(col = "transparent")))
|
|
| 226 |
cloud(value~col+row, ..., panel.3d.cloud=panel.3dbars, col.facet='grey', xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1), par.settings = list(axis.line = list(col = "transparent")))
|
|
| 227 | 227 |
""" |
| 228 | 228 |
} |
| 229 | 229 |
|
| ... | ... | |
| 260 | 260 |
@Override |
| 261 | 261 |
public void run() { try {
|
| 262 | 262 |
if (UnitsIndexMacro.this.output_histogram) {
|
| 263 |
OpenSVGGraph.OpenSVGFile(file.getAbsolutePath(), selection.toString()+" Units") |
|
| 263 |
//OpenSVGGraph.OpenSVGFile(file.getAbsolutePath(), selection.toString()+" Units") |
|
| 264 |
OpenBrowser.openfile(file.getAbsolutePath()) |
|
| 264 | 265 |
} |
| 265 | 266 |
if (UnitsIndexMacro.this.output_lexicaltable) {
|
| 266 | 267 |
CorporaView.refreshObject(corpus) |
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/SchemasProgressionMacro.groovy (revision 2094) | ||
|---|---|---|
| 46 | 46 |
String schema_property_display |
| 47 | 47 |
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="") |
| 48 | 48 |
String unit_ursql |
| 49 |
@Field @Option(name="limit_distance_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 50 |
int limit_distance_in_schema
|
|
| 51 |
@Field @Option(name="limit_cql", usage="CQL to build structure limits", widget="Query", required=false, def="")
|
|
| 52 |
limit_cql
|
|
| 49 |
@Field @Option(name="position_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 50 |
int position_in_schema
|
|
| 51 |
@Field @Option(name="cql_limit", usage="CQL to build structure limits", widget="Query", required=false, def="")
|
|
| 52 |
cql_limit
|
|
| 53 | 53 |
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true") |
| 54 | 54 |
boolean strict_inclusion |
| 55 |
@Field @Option(name="limit_distance", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 56 |
int limit_distance
|
|
| 55 |
@Field @Option(name="position", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 56 |
int position
|
|
| 57 | 57 |
@Field @Option(name="struct_name", usage="Structure to display", widget="String", required=true, def="div") |
| 58 | 58 |
String struct_name |
| 59 | 59 |
@Field @Option(name="struct_prop", usage="Structure property to display", widget="String", required=true, def="n") |
| ... | ... | |
| 77 | 77 |
|
| 78 | 78 |
def CQI = CQPSearchEngine.getCqiClient() |
| 79 | 79 |
|
| 80 |
def limit_cql_matches = null;
|
|
| 81 |
if (limit_cql != null && !limit_cql.getQueryString().equals("\"\"")) {
|
|
| 82 |
def limitssubcorpus = parent.createSubcorpus(limit_cql, parent.getName().toUpperCase())
|
|
| 83 |
limit_cql_matches = limitssubcorpus.getMatches();
|
|
| 80 |
def cql_limit_matches = null;
|
|
| 81 |
if (cql_limit != null && !cql_limit.getQueryString().equals("\"\"")) {
|
|
| 82 |
def limitssubcorpus = parent.createSubcorpus(cql_limit, parent.getName().toUpperCase())
|
|
| 83 |
cql_limit_matches = limitssubcorpus.getMatches();
|
|
| 84 | 84 |
limitssubcorpus.delete(); |
| 85 | 85 |
} else {
|
| 86 |
limit_cql_matches = parent.getMatches()
|
|
| 86 |
cql_limit_matches = parent.getMatches()
|
|
| 87 | 87 |
} |
| 88 | 88 |
|
| 89 | 89 |
def queries = [] |
| ... | ... | |
| 121 | 121 |
} |
| 122 | 122 |
|
| 123 | 123 |
allUnits = AnalecUtils.groupAllUnitesInElements(debug, allSchemas) |
| 124 |
if ((unit_ursql != null && unit_ursql.length() > 0) || (limit_cql != null && !limit_cql.getQueryString().equals("\"\""))) {
|
|
| 124 |
if ((unit_ursql != null && unit_ursql.length() > 0) || (cql_limit != null && !cql_limit.getQueryString().equals("\"\""))) {
|
|
| 125 | 125 |
if (debug) println "Building selection of units to highlight..." |
| 126 | 126 |
allHighlightedUnits = AnalecUtils.groupAllUnitesInElements(debug, allSchemas, unit_ursql) |
| 127 | 127 |
} |
| 128 | 128 |
|
| 129 |
if (limit_distance_in_schema > 0) allHighlightedUnits = AnalecUtils.filterUniteByInclusionInSchema(debug, allSchemas, limit_distance_in_schema)
|
|
| 129 |
if (position_in_schema > 0) allHighlightedUnits = AnalecUtils.filterUniteByInclusionInSchema(debug, allSchemas, position_in_schema)
|
|
| 130 | 130 |
|
| 131 | 131 |
if (allUnits.size() == 0) {
|
| 132 | 132 |
println "No unit match for '$unit_ursql' selection. Aborting" |
| ... | ... | |
| 150 | 150 |
def selectedAndHighlightedUnits = new HashSet() // faster to find items |
| 151 | 151 |
if (allHighlightedUnits != null && allHighlightedUnits.containsKey(schema)) selectedAndHighlightedUnits.addAll(allHighlightedUnits[schema]) |
| 152 | 152 |
|
| 153 |
if (limit_cql_matches != null) {
|
|
| 153 |
if (cql_limit_matches != null) {
|
|
| 154 | 154 |
if (debug) println "corpus matches: "+parent.getMatches() |
| 155 |
if (debug) println "filter limit_cql_matches=${limit_cql_matches} with "+selectedAndHighlightedUnits.size()+" units."
|
|
| 156 |
selectedAndHighlightedUnits = AnalecUtils.filterUniteByInclusion(debug, selectedAndHighlightedUnits, limit_cql_matches, strict_inclusion, limit_distance)
|
|
| 155 |
if (debug) println "filter cql_limit_matches=${cql_limit_matches} with "+selectedAndHighlightedUnits.size()+" units."
|
|
| 156 |
selectedAndHighlightedUnits = AnalecUtils.filterUniteByInclusion(debug, selectedAndHighlightedUnits, cql_limit_matches, strict_inclusion, position)
|
|
| 157 | 157 |
if (debug) println "selectedAndHighlightedUnits=${selectedAndHighlightedUnits.size()}"
|
| 158 | 158 |
} |
| 159 | 159 |
|
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/UnitsListMacro.groovy (revision 2094) | ||
|---|---|---|
| 48 | 48 |
int maximum_schema_size |
| 49 | 49 |
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION") |
| 50 | 50 |
String unit_ursql |
| 51 |
@Field @Option(name="limit_distance_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 52 |
int limit_distance_in_schema
|
|
| 53 |
@Field @Option(name="limit_cql", usage="CQL to build structure limits", widget="Query", required=false, def="")
|
|
| 54 |
limit_cql
|
|
| 51 |
@Field @Option(name="position_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 52 |
int position_in_schema
|
|
| 53 |
@Field @Option(name="cql_limit", usage="CQL to build structure limits", widget="Query", required=false, def="")
|
|
| 54 |
cql_limit
|
|
| 55 | 55 |
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true") |
| 56 | 56 |
boolean strict_inclusion |
| 57 |
@Field @Option(name="limit_distance", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 58 |
int limit_distance
|
|
| 57 |
@Field @Option(name="position", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 58 |
int position
|
|
| 59 | 59 |
@Field @Option(name="output_mode", usage="If selected units properties and words are shown", widget="StringArray", metaVar="COUNT TABULATED FORMATED CONCORDANCE CQL", required=true, def="FORMATED") |
| 60 | 60 |
output_mode |
| 61 | 61 |
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF") |
| ... | ... | |
| 76 | 76 |
def analecCorpus = URSCorpora.getCorpus(mainCorpus) |
| 77 | 77 |
|
| 78 | 78 |
def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, |
| 79 |
unit_ursql, limit_distance_in_schema, limit_cql, strict_inclusion, limit_distance);
|
|
| 79 |
unit_ursql, position_in_schema, cql_limit, strict_inclusion, position);
|
|
| 80 | 80 |
|
| 81 | 81 |
def n = 1 |
| 82 | 82 |
|
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/edit/ConcordanceToUnitMacro.groovy (revision 2094) | ||
|---|---|---|
| 1 |
// STANDARD DECLARATIONS |
|
| 2 |
package org.txm.macro.urs.edit |
|
| 3 |
|
|
| 4 |
import org.kohsuke.args4j.* |
|
| 5 |
import groovy.transform.Field |
|
| 6 |
import org.txm.rcp.swt.widget.parameters.* |
|
| 7 |
import org.txm.rcp.editors.concordances.* |
|
| 8 |
import org.txm.searchengine.cqp.corpus.CQPCorpus |
|
| 9 |
import org.txm.searchengine.cqp.corpus.MainCorpus |
|
| 10 |
import org.txm.functions.concordances.* |
|
| 11 |
import org.txm.annotation.urs.* |
|
| 12 |
import org.txm.concordance.core.functions.Concordance |
|
| 13 |
import org.txm.concordance.rcp.editors.ConcordanceEditor |
|
| 14 |
import visuAnalec.elements.Unite |
|
| 15 |
|
|
| 16 |
// BEGINNING OF PARAMETERS |
|
| 17 |
|
|
| 18 |
@Field @Option(name="unit_type", usage="The unit type to create", widget="String", required=true, def="MENTION") |
|
| 19 |
def unit_type |
|
| 20 |
|
|
| 21 |
@Field @Option(name="create_only_if_new", usage="Create the unit if not already annotated", widget="Boolean", required=true, def="true") |
|
| 22 |
def create_only_if_new |
|
| 23 |
|
|
| 24 |
@Field @Option(name="prop", usage="prop", widget="String", required=true, def="REF") |
|
| 25 |
def prop |
|
| 26 |
|
|
| 27 |
@Field @Option(name="value", usage="default value", widget="String", required=true, def="NAME") |
|
| 28 |
def value |
|
| 29 |
|
|
| 30 |
// END OF PARAMETERS |
|
| 31 |
|
|
| 32 |
// get a Concordance from 1) current Concordance editor or 2) CorporaView selection |
|
| 33 |
Concordance concordance |
|
| 34 |
if (editor instanceof ConcordanceEditor) {
|
|
| 35 |
concordance = editor.getConcordance() |
|
| 36 |
} else if (corpusViewSelection instanceof Concordance) {
|
|
| 37 |
concordance = corpusViewSelection |
|
| 38 |
} else {
|
|
| 39 |
println "You must select a concordance or open a concordance result to run this macro." |
|
| 40 |
return false |
|
| 41 |
} |
|
| 42 |
|
|
| 43 |
if (concordance == null) {
|
|
| 44 |
println "You must compute a concordance before." |
|
| 45 |
return |
|
| 46 |
} |
|
| 47 |
|
|
| 48 |
// check the analec corpus is ready |
|
| 49 |
CQPCorpus corpus = concordance.getCorpus().getMainCorpus(); |
|
| 50 |
String name = corpus.getID() |
|
| 51 |
if (!URSCorpora.isAnnotationStructureReady(corpus)) {
|
|
| 52 |
println "Annotation structure is not ready." |
|
| 53 |
return |
|
| 54 |
} |
|
| 55 |
|
|
| 56 |
// Open the parameters input dialog box |
|
| 57 |
if (!ParametersDialog.open(this)) return; |
|
| 58 |
|
|
| 59 |
// make sure the annotation structure knows the unit type, the property and the value used below; whatever is missing is added |
|
| 60 |
def analecCorpus = URSCorpora.getCorpus(corpus) |
|
| 61 |
if (!analecCorpus.getStructure().getUnites().contains(unit_type)) {
|
|
| 62 |
//println "The corpus structure does not contain units with type=$unit_type" |
|
| 63 |
//return; |
|
| 64 |
analecCorpus.getStructure().ajouterType(Unite.class, unit_type); // add the missing unit type instead of aborting |
|
| 65 |
} |
|
| 66 |
|
|
| 67 |
if (!analecCorpus.getStructure().getNomsProps(Unite.class, unit_type).contains(prop)) {
|
|
| 68 |
//println "The corpus structure does not contain the property $prop for units with type=$unit_type" |
|
| 69 |
//return; |
|
| 70 |
analecCorpus.getStructure().ajouterProp(Unite.class, unit_type, prop) // add the missing property to the unit type |
|
| 71 |
} |
|
| 72 |
|
|
| 73 |
if (!analecCorpus.getStructure().getValeursProp(Unite.class, unit_type, prop).contains(value)) {
|
|
| 74 |
//println "The corpus structure does not contain the value $value for the property $prop of units with type=$unit_type" |
|
| 75 |
//return; |
|
| 76 |
analecCorpus.getStructure().ajouterVal(Unite.class, unit_type, prop, value) // add the missing value to the property |
|
| 77 |
} |
|
| 78 |
|
|
| 79 |
// browse lines and check |
|
| 80 |
def units = analecCorpus.getUnites(unit_type) |
|
| 81 |
def lines = concordance.getLines() |
|
| 82 |
|
|
| 83 |
int n = 0 |
|
| 84 |
for (int iLine = 0 ; iLine < lines.size() ; iLine++) {
|
|
| 85 |
int iUnit = 0 |
|
| 86 |
def line = lines[iLine] |
|
| 87 |
def m = line.getMatch() |
|
| 88 |
def do_create = true |
|
| 89 |
if (create_only_if_new && iUnit < units.size()) { // duplicate check, only when create_only_if_new == true: look for an existing unit with exactly the same bounds as the match
|
|
| 90 |
def unit = null |
|
| 91 |
// TODO: avoid the linear scan over all units for every concordance line |
|
| 92 |
while (iUnit < units.size() ) { //&& units[iUnit].getDeb() < m.getStart()) {
|
|
| 93 |
if (iUnit < units.size()) { // same test as the loop condition, always true here
|
|
| 94 |
unit = units[iUnit++] |
|
| 95 |
if (unit.getDeb() == m.getStart() && unit.getFin() == m.getEnd()) { // skip and print the line
|
|
| 96 |
println("skiping concordance line '"+line.keywordToString()+"' at "+line.getViewRef().toString()+" ("+unit.getDeb()+ ", "+unit.getFin()+")")
|
|
| 97 |
do_create = false |
|
| 98 |
break // one matching unit is enough: stop scanning so the message is printed only once per line |
|
| 99 |
} |
|
| 100 |
} |
|
| 101 |
} |
|
| 102 |
} |
|
| 103 |
if (do_create) {
|
|
| 104 |
n++ |
|
| 105 |
def props = [:] |
|
| 106 |
props[prop] = value |
|
| 107 |
Unite u = analecCorpus.addUniteSaisie(unit_type, m.getStart(), m.getEnd(), props) |
|
| 108 |
// println "$props -> "+u.getProps() |
|
| 109 |
} |
|
| 110 |
} |
|
| 111 |
println "$n $unit_type created." |
|
| 112 |
if (n > 0) corpus.setIsModified(true); |
|
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/edit/ConcordanceToUnitsMacro.groovy (revision 2094) | ||
|---|---|---|
| 1 |
// STANDARD DECLARATIONS |
|
| 2 |
package org.txm.macro.urs.edit |
|
| 3 |
|
|
| 4 |
import org.kohsuke.args4j.* |
|
| 5 |
import groovy.transform.Field |
|
| 6 |
import org.txm.rcp.swt.widget.parameters.* |
|
| 7 |
import org.txm.rcp.editors.concordances.* |
|
| 8 |
import org.txm.searchengine.cqp.corpus.CQPCorpus |
|
| 9 |
import org.txm.searchengine.cqp.corpus.MainCorpus |
|
| 10 |
import org.txm.functions.concordances.* |
|
| 11 |
import org.txm.annotation.urs.* |
|
| 12 |
import org.txm.concordance.core.functions.Concordance |
|
| 13 |
import org.txm.concordance.rcp.editors.ConcordanceEditor |
|
| 14 |
import visuAnalec.elements.Unite |
|
| 15 |
|
|
| 16 |
// BEGINNING OF PARAMETERS |
|
| 17 |
|
|
| 18 |
@Field @Option(name="unit_type", usage="The unit type to create", widget="String", required=true, def="MENTION") |
|
| 19 |
def unit_type |
|
| 20 |
|
|
| 21 |
@Field @Option(name="create_only_if_new", usage="Create the unit if not already annotated", widget="Boolean", required=true, def="true") |
|
| 22 |
def create_only_if_new |
|
| 23 |
|
|
| 24 |
@Field @Option(name="prop", usage="prop", widget="String", required=true, def="REF") |
|
| 25 |
def prop |
|
| 26 |
|
|
| 27 |
@Field @Option(name="value", usage="default value", widget="String", required=true, def="NAME") |
|
| 28 |
def value |
|
| 29 |
|
|
| 30 |
// END OF PARAMETERS |
|
| 31 |
|
|
| 32 |
// get a Concordance from 1) current Concordance editor or 2) CorporaView selection |
|
| 33 |
Concordance concordance |
|
| 34 |
if (editor instanceof ConcordanceEditor) {
|
|
| 35 |
concordance = editor.getConcordance() |
|
| 36 |
} else if (corpusViewSelection instanceof Concordance) {
|
|
| 37 |
concordance = corpusViewSelection |
|
| 38 |
} else {
|
|
| 39 |
println "You must select a concordance or open a concordance result to run this macro." |
|
| 40 |
return false |
|
| 41 |
} |
|
| 42 |
|
|
| 43 |
if (concordance == null) {
|
|
| 44 |
println "You must compute a concordance before." |
|
| 45 |
return |
|
| 46 |
} |
|
| 47 |
|
|
| 48 |
// check the analec corpus is ready |
|
| 49 |
CQPCorpus corpus = concordance.getCorpus().getMainCorpus(); |
|
| 50 |
String name = corpus.getID() |
|
| 51 |
if (!URSCorpora.isAnnotationStructureReady(corpus)) {
|
|
| 52 |
println "Annotation structure is not ready." |
|
| 53 |
return |
|
| 54 |
} |
|
| 55 |
|
|
| 56 |
// Open the parameters input dialog box |
|
| 57 |
if (!ParametersDialog.open(this)) return; |
|
| 58 |
|
|
| 59 |
// make sure the annotation structure knows the unit type, the property and the value used below; whatever is missing is added |
|
| 60 |
def analecCorpus = URSCorpora.getCorpus(corpus) |
|
| 61 |
if (!analecCorpus.getStructure().getUnites().contains(unit_type)) {
|
|
| 62 |
//println "The corpus structure does not contain units with type=$unit_type" |
|
| 63 |
//return; |
|
| 64 |
analecCorpus.getStructure().ajouterType(Unite.class, unit_type); // add the missing unit type instead of aborting |
|
| 65 |
} |
|
| 66 |
|
|
| 67 |
if (!analecCorpus.getStructure().getNomsProps(Unite.class, unit_type).contains(prop)) {
|
|
| 68 |
//println "The corpus structure does not contain the property $prop for units with type=$unit_type" |
|
| 69 |
//return; |
|
| 70 |
analecCorpus.getStructure().ajouterProp(Unite.class, unit_type, prop) // add the missing property to the unit type |
|
| 71 |
} |
|
| 72 |
|
|
| 73 |
if (!analecCorpus.getStructure().getValeursProp(Unite.class, unit_type, prop).contains(value)) {
|
|
| 74 |
//println "The corpus structure does not contain the value $value for the property $prop of units with type=$unit_type" |
|
| 75 |
//return; |
|
| 76 |
analecCorpus.getStructure().ajouterVal(Unite.class, unit_type, prop, value) // add the missing value to the property |
|
| 77 |
} |
|
| 78 |
|
|
| 79 |
// browse lines and check |
|
| 80 |
def units = analecCorpus.getUnites(unit_type) |
|
| 81 |
def lines = concordance.getLines() |
|
| 82 |
|
|
| 83 |
int n = 0 |
|
| 84 |
for (int iLine = 0 ; iLine < lines.size() ; iLine++) {
|
|
| 85 |
int iUnit = 0 |
|
| 86 |
def line = lines[iLine] |
|
| 87 |
def m = line.getMatch() |
|
| 88 |
def do_create = true |
|
| 89 |
if (create_only_if_new && iUnit < units.size()) { // duplicate check, only when create_only_if_new == true: look for an existing unit with exactly the same bounds as the match
|
|
| 90 |
def unit = null |
|
| 91 |
// TODO: avoid the linear scan over all units for every concordance line |
|
| 92 |
while (iUnit < units.size() ) { //&& units[iUnit].getDeb() < m.getStart()) {
|
|
| 93 |
if (iUnit < units.size()) { // same test as the loop condition, always true here
|
|
| 94 |
unit = units[iUnit++] |
|
| 95 |
if (unit.getDeb() == m.getStart() && unit.getFin() == m.getEnd()) { // skip and print the line
|
|
| 96 |
println("skiping concordance line '"+line.keywordToString()+"' at "+line.getViewRef().toString()+" ("+unit.getDeb()+ ", "+unit.getFin()+")")
|
|
| 97 |
do_create = false |
|
| 98 |
break // one matching unit is enough: stop scanning so the message is printed only once per line |
|
| 99 |
} |
|
| 100 |
} |
|
| 101 |
} |
|
| 102 |
} |
|
| 103 |
if (do_create) {
|
|
| 104 |
n++ |
|
| 105 |
def props = [:] |
|
| 106 |
props[prop] = value |
|
| 107 |
Unite u = analecCorpus.addUniteSaisie(unit_type, m.getStart(), m.getEnd(), props) |
|
| 108 |
// println "$props -> "+u.getProps() |
|
| 109 |
} |
|
| 110 |
} |
|
| 111 |
println "$n $unit_type created." |
|
| 112 |
if (n > 0) corpus.setIsModified(true); |
|
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/edit/UnitsDeleteMacro.groovy (revision 2094) | ||
|---|---|---|
| 43 | 43 |
int minimum_schema_size |
| 44 | 44 |
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION") |
| 45 | 45 |
String unit_ursql |
| 46 |
@Field @Option(name="limit_distance_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 47 |
int limit_distance_in_schema
|
|
| 48 |
@Field @Option(name="limit_cql", usage="CQL to build structure limits", widget="Query", required=true, def="<div> [] expand to div")
|
|
| 49 |
limit_cql
|
|
| 46 |
@Field @Option(name="position_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 47 |
int position_in_schema
|
|
| 48 |
@Field @Option(name="cql_limit", usage="CQL to build structure limits", widget="Query", required=true, def="<div> [] expand to div")
|
|
| 49 |
cql_limit
|
|
| 50 | 50 |
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true") |
| 51 | 51 |
boolean strict_inclusion |
| 52 |
@Field @Option(name="limit_distance", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 53 |
int limit_distance
|
|
| 52 |
@Field @Option(name="position", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 53 |
int position
|
|
| 54 | 54 |
|
| 55 | 55 |
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF") |
| 56 | 56 |
debug |
| ... | ... | |
| 69 | 69 |
Structure structure = analecCorpus.getStructure() |
| 70 | 70 |
|
| 71 | 71 |
def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, Integer.MAX_VALUE, |
| 72 |
unit_ursql, limit_distance_in_schema, limit_cql, strict_inclusion, limit_distance);
|
|
| 72 |
unit_ursql, position_in_schema, cql_limit, strict_inclusion, position);
|
|
| 73 | 73 |
|
| 74 | 74 |
def n = 0 |
| 75 | 75 |
def nerrors = 0 |
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/edit/UnitsAnnotateMacro.groovy (revision 2094) | ||
|---|---|---|
| 43 | 43 |
int minimum_schema_size |
| 44 | 44 |
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION") |
| 45 | 45 |
String unit_ursql |
| 46 |
@Field @Option(name="limit_distance_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 47 |
int limit_distance_in_schema
|
|
| 48 |
@Field @Option(name="limit_cql", usage="CQL to build structure limits", widget="Query", required=true, def="<div> [] expand to div")
|
|
| 49 |
limit_cql
|
|
| 46 |
@Field @Option(name="position_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 47 |
int position_in_schema
|
|
| 48 |
@Field @Option(name="cql_limit", usage="CQL to build structure limits", widget="Query", required=false, def="<div> [] expand to div")
|
|
| 49 |
cql_limit
|
|
| 50 | 50 |
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true") |
| 51 | 51 |
boolean strict_inclusion |
| 52 |
@Field @Option(name="limit_distance", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 53 |
int limit_distance
|
|
| 52 |
@Field @Option(name="position", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
|
|
| 53 |
int position
|
|
| 54 | 54 |
|
| 55 | 55 |
@Field @Option(name="unit_property_to_set", usage="PROP", widget="String", required=false, def="TESTPROP") |
| 56 | 56 |
String unit_property_to_set |
| ... | ... | |
| 80 | 80 |
analecView.initVueParDefaut() |
| 81 | 81 |
|
| 82 | 82 |
def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, Integer.MAX_VALUE, |
| 83 |
unit_ursql, limit_distance_in_schema, limit_cql, strict_inclusion, limit_distance);
|
|
| 83 |
unit_ursql, position_in_schema, cql_limit, strict_inclusion, position);
|
|
| 84 | 84 |
|
| 85 | 85 |
println " "+selectedUnits.size()+" units to annotate..." |
| 86 | 86 |
def n = 0 |
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/AnalecUtils.groovy (revision 2094) | ||
|---|---|---|
| 93 | 93 |
* @param minimum_schema_size |
| 94 | 94 |
* @param maximum_schema_size |
| 95 | 95 |
* @param unit_ursql |
| 96 |
* @param limit_cql
|
|
| 96 |
* @param cql_limit
|
|
| 97 | 97 |
* @param strict_inclusion |
| 98 |
* @param limit_distance
|
|
| 98 |
* @param position
|
|
| 99 | 99 |
* @return |
| 100 | 100 |
*/ |
| 101 | 101 |
static def selectUnitsInSchema(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus, |
| 102 | 102 |
String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size, |
| 103 |
String unit_ursql, Integer limit_distance_in_schema, CQLQuery limit_cql, Boolean strict_inclusion, int limit_distance) {
|
|
| 103 |
String unit_ursql, Integer position_in_schema, CQLQuery cql_limit, Boolean strict_inclusion, int position) {
|
|
| 104 | 104 |
def groupedUnits = [] |
| 105 | 105 |
if (schema_ursql != null && schema_ursql.length() > 0 || minimum_schema_size > 1) {
|
| 106 | 106 |
def allSchema = null; |
| ... | ... | |
| 114 | 114 |
|
| 115 | 115 |
groupedUnits = AnalecUtils.groupAllUnitesInElements(debug, allSchema, unit_ursql) |
| 116 | 116 |
|
| 117 |
if (limit_distance_in_schema >= 0) groupedUnits = AnalecUtils.filterUniteByInclusionInSchema(debug, groupedUnits, limit_distance_in_schema)
|
|
| 117 |
if (position_in_schema >= 0) groupedUnits = AnalecUtils.filterUniteByInclusionInSchema(debug, groupedUnits, position_in_schema)
|
|
| 118 | 118 |
|
| 119 | 119 |
} else {
|
| 120 | 120 |
groupedUnits = ["all":AnalecUtils.findAllInCorpus(debug, analecCorpus, Unite.class, unit_ursql)] |
| ... | ... | |
| 122 | 122 |
if (debug >= 2) println "groupedUnits=${groupedUnits.size()}"
|
| 123 | 123 |
|
| 124 | 124 |
def matches = null |
| 125 |
if (limit_cql != null && !limit_cql.getQueryString().equals("\"\"")) {
|
|
| 126 |
Subcorpus limitssubcorpus = corpus.createSubcorpus(limit_cql, corpus.getID().toUpperCase())
|
|
| 125 |
if (cql_limit != null && !cql_limit.getQueryString().equals("\"\"")) {
|
|
| 126 |
Subcorpus limitssubcorpus = corpus.createSubcorpus(cql_limit, corpus.getID().toUpperCase())
|
|
| 127 | 127 |
matches = limitssubcorpus.getMatches(); |
| 128 | 128 |
limitssubcorpus.delete(); |
| 129 | 129 |
} else {
|
| ... | ... | |
| 132 | 132 |
if (debug >= 2) println "matches=${matches}"
|
| 133 | 133 |
def allUnits = [] |
| 134 | 134 |
for (def k : groupedUnits.keySet()) {
|
| 135 |
def selectedUnits = AnalecUtils.filterUniteByInclusion(debug, groupedUnits[k], matches, strict_inclusion, limit_distance)
|
|
| 135 |
def selectedUnits = AnalecUtils.filterUniteByInclusion(debug, groupedUnits[k], matches, strict_inclusion, position)
|
|
| 136 | 136 |
allUnits.addAll(selectedUnits) |
| 137 | 137 |
} |
| 138 | 138 |
if (debug >= 2) println "selectedUnits=${allUnits.size()}"
|
| ... | ... | |
| 306 | 306 |
return selectedUnitsPerMatch |
| 307 | 307 |
} |
| 308 | 308 |
|
| 309 |
static def filterUniteByInclusion(def debug, def allUnites, def matches, boolean strict_inclusion, int limit_distance) {
|
|
| 309 |
static def filterUniteByInclusion(def debug, def allUnites, def matches, boolean strict_inclusion, int position) {
|
|
| 310 | 310 |
|
| 311 | 311 |
def selectedUnitsPerMatch = groupByMatch(debug, allUnites, matches, strict_inclusion); |
| 312 | 312 |
//println "selectedUnitsPerMatch size="+selectedUnitsPerMatch.size() |
| 313 | 313 |
def selectedUnits = [] |
| 314 |
if (limit_distance != 0) {
|
|
| 315 |
if (limit_distance > 0) limit_distance--
|
|
| 314 |
if (position != 0) {
|
|
| 315 |
if (position > 0) position--
|
|
| 316 | 316 |
|
| 317 | 317 |
for (def m : selectedUnitsPerMatch.keySet()) {
|
| 318 |
if (selectedUnitsPerMatch[m].size() > limit_distance && selectedUnitsPerMatch[m].size() > 0) {
|
|
| 318 |
if (selectedUnitsPerMatch[m].size() > position && selectedUnitsPerMatch[m].size() > 0) {
|
|
| 319 | 319 |
def units = selectedUnitsPerMatch[m] |
| 320 | 320 |
//println "$m -> "+units.collect() {it -> it.getDeb()}
|
| 321 | 321 |
units = units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
|
| 322 | 322 |
//println "$m -> "+units.collect() {it -> it.getDeb()}
|
| 323 |
selectedUnits << units[limit_distance]
|
|
| 324 |
if (debug >=3) println "dist select: "+units[limit_distance].getDeb()
|
|
| 323 |
selectedUnits << units[position]
|
|
| 324 |
if (debug >=3) println "dist select: "+units[position].getDeb()
|
|
| 325 | 325 |
} |
| 326 | 326 |
} |
| 327 | 327 |
} else {
|
| ... | ... | |
| 387 | 387 |
int atidx = URSQL.indexOf("@");
|
| 388 | 388 |
int equalidx = URSQL.indexOf("=");
|
| 389 | 389 |
|
| 390 |
if (atidx >= 0 && equalidx >= 0 && atidx < equalidx) {
|
|
| 390 |
if (atidx >= 0 && equalidx >= 0 && atidx < equalidx) { // TYPE@PROP=VALUE
|
|
| 391 | 391 |
type = URSQL.substring(0, atidx) |
| 392 | 392 |
prop = URSQL.substring(atidx+1, equalidx) |
| 393 | 393 |
value = URSQL.substring(equalidx+1) |
| 394 |
} else if (atidx >= 0) {
|
|
| 394 |
} else if (atidx >= 0) { // TYPE@PROP
|
|
| 395 | 395 |
type = URSQL.substring(0, atidx) |
| 396 | 396 |
prop = URSQL.substring(atidx+1) |
| 397 |
} else if (equalidx >= 0) {
|
|
| 397 |
} else if (equalidx >= 0) { // TYPE=VALUE -> not well formed
|
|
| 398 | 398 |
type = URSQL.substring(0, equalidx) |
| 399 | 399 |
value = URSQL.substring(equalidx+1) |
| 400 |
} else {
|
|
| 400 |
} else { // TYPE
|
|
| 401 | 401 |
type = URSQL; |
| 402 | 402 |
} |
| 403 | 403 |
// println(["'"+type+"'", "'"+prop+"'", "'"+value+"'"]) |
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/check/CheckDuplicatesInSchemasMacro.groovy (revision 2094) | ||
|---|---|---|
| 96 | 96 |
} |
| 97 | 97 |
|
| 98 | 98 |
if (duplicates.size() > 0) {
|
| 99 |
println "Duplicates found"
|
|
| 99 |
println "${duplicates.size()} duplicates found"
|
|
| 100 | 100 |
for (def unit : duplicates.keySet()) {
|
| 101 | 101 |
println AnalecUtils.toString(CQI, word, unit)+" in: " |
| 102 | 102 |
for (Schema schema : duplicates[unit]) {
|
| ... | ... | |
| 104 | 104 |
} |
| 105 | 105 |
} |
| 106 | 106 |
} else {
|
| 107 |
println "No duplicates found in $schema_ursql units" |
|
| 107 |
println "No duplicates found in $schema_ursql schema units"
|
|
| 108 | 108 |
} |
| 109 | 109 |
|
| 110 | 110 |
return duplicates |
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/VerificationsMacro.groovy (revision 2094) | ||
|---|---|---|
| 1 |
// Auteur Matthieu Quignard |
|
| 2 |
// Date : 14 janvier 2019 |
|
| 3 |
|
|
| 4 |
/********** |
|
| 5 |
Vérifications automatiques |
|
| 6 |
1. Repère les mentions sans catégorie : CHECK > CAT |
|
| 7 |
2. Repère les mentions sans référent : CHECK > REF |
|
| 8 |
3. Supprime les ponctuations en début et en fin de mention : CHECK > BORNES |
|
| 9 |
4. Supprime les prépositions autres que 'de' en début de mention : CHECK > BORNES |
|
| 10 |
5. Supprime automatiquement toutes les mentions vides = sans aucun mot = de longueur 0 |
|
| 11 |
6. Détecter les mentions qui ont exactement les mêmes bornes : CHECK > DOUBLON |
|
| 12 |
7 (option). Détecter les pronoms hors mention : CHECK > NEW |
|
| 13 |
***********/ |
|
| 14 |
|
|
| 15 |
package org.txm.macro.urs.democrat |
|
| 16 |
|
|
| 17 |
import org.apache.commons.lang.* |
|
| 18 |
import org.kohsuke.args4j.* |
|
| 19 |
import groovy.transform.* |
|
| 20 |
import org.txm.* |
|
| 21 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 22 |
import org.txm.analec.* |
|
| 23 |
import org.txm.searchengine.cqp.* |
|
| 24 |
import org.txm.searchengine.cqp.corpus.* |
|
| 25 |
import visuAnalec.Message.* |
|
| 26 |
import visuAnalec.donnees.* |
|
| 27 |
import visuAnalec.elements.* |
|
| 28 |
import visuAnalec.vue.* |
|
| 29 |
|
|
| 30 |
// TODO : ajouter les étiquettes équivalentes issues du tagset TreeTagger |
|
| 31 |
// Ponctuations et Prépositions |
|
| 32 |
def interditsAuDebut = ["PONfbl", "PONfrt", "PONpxx", "PRE"] |
|
| 33 |
// Ponctuations |
|
| 34 |
def interditsALaFin = ["PONfbl", "PONfrt", "PONpxx"] |
|
| 35 |
// Pronoms en tous genres |
|
| 36 |
def listePronoms = ["PROadv", "PROcar", "PROdem", "PROimp", "PROind", "PROint", "PROper", "PROpos", "PROrel"] |
|
| 37 |
|
|
| 38 |
// CORPS DU SCRIPT |
|
| 39 |
|
|
| 40 |
if (!(corpusViewSelection instanceof MainCorpus)) {
|
|
| 41 |
println "Corpora selection is not a Corpus" |
|
| 42 |
return |
|
| 43 |
} |
|
| 44 |
|
|
| 45 |
// BEGINNING OF PARAMETERS |
|
| 46 |
@Field @Option(name="unit_type", usage="Unité", widget="String", required=true, def="MENTION") |
|
| 47 |
def unit_type |
|
| 48 |
@Field @Option(name="pos_property_name", usage="Etiquette de morphosyntaxe", widget="String", required=true, def="frpos") |
|
| 49 |
def pos_property_name |
|
| 50 |
@Field @Option(name="cat_name", usage="Propriété CATEGORIE", widget="String", required=true, def="CATEGORIE") |
|
| 51 |
def cat_name |
|
| 52 |
@Field @Option(name="ref_name", usage="Propriété REF", widget="String", required=true, def="REF") |
|
| 53 |
def ref_name |
|
| 54 |
@Field @Option(name="checkPronouns", usage="Vérifier les pronoms oubliés", widget="Boolean", required=true, def="true") |
|
| 55 |
def checkPronouns |
|
| 56 |
|
|
| 57 |
if (!ParametersDialog.open(this)) return |
|
| 58 |
|
|
| 59 |
corpus = corpusViewSelection |
|
| 60 |
CQI = Toolbox.getCqiClient() |
|
| 61 |
word = corpus.getWordProperty() |
|
| 62 |
posProperty = corpus.getProperty(pos_property_name) |
|
| 63 |
if (posProperty == null) {
|
|
| 64 |
println "Error: CQP corpus does not contains the word property with name=$pos_property_name" |
|
| 65 |
return |
|
| 66 |
} |
|
| 67 |
|
|
| 68 |
analecCorpus = AnalecCorpora.getCorpus(corpus.getName()) |
|
| 69 |
vue = AnalecCorpora.getVue(corpus.getName()) |
|
| 70 |
structure = analecCorpus.getStructure() |
|
| 71 |
|
|
| 72 |
if (!structure.getUnites().contains(unit_type)) { // check if the structure contains the unit_type units
|
|
| 73 |
println "Error: corpus structure does not contains unit with name=$unit_type" |
|
| 74 |
return |
|
| 75 |
} |
|
| 76 |
|
|
| 77 |
if (!structure.getUniteProperties(unit_type).contains(cat_name)) {
|
|
| 78 |
println "Erreur : les unités $unit_type n'ont pas de propriété $cat_name" |
|
| 79 |
return |
|
| 80 |
} |
|
| 81 |
|
|
| 82 |
if (!structure.getUniteProperties(unit_type).contains(ref_name)) {
|
|
| 83 |
println "Erreur : les unités $unit_type n'ont pas de propriété $ref_name" |
|
| 84 |
return |
|
| 85 |
} |
|
| 86 |
|
|
| 87 |
println "Détection des pronoms oubliés : $checkPronouns" |
|
| 88 |
|
|
| 89 |
// Reinitialiser la propriété CHECK |
|
| 90 |
if (!structure.getUniteProperties(unit_type).contains("CHECK")) {
|
|
| 91 |
analecCorpus.ajouterProp(Unite.class, unit_type, "CHECK") |
|
| 92 |
} else {
|
|
| 93 |
println "Nettoyage des anciennes annotations CHECK" |
|
| 94 |
def tmpvalues = new HashSet() |
|
| 95 |
tmpvalues.addAll(structure.getValeursProp(Unite.class, unit_type, "CHECK")); |
|
| 96 |
for (String val : tmpvalues) {
|
|
| 97 |
structure.supprimerVal(Unite.class, unit_type, "CHECK", val); |
|
| 98 |
//println "suppression de l'étiquette $val" |
|
| 99 |
} |
|
| 100 |
} |
|
| 101 |
|
|
| 102 |
structure.ajouterVal(Unite.class, unit_type, "CHECK", "DONE") |
|
| 103 |
|
|
| 104 |
|
|
| 105 |
|
|
| 106 |
def nModified = 0 |
|
| 107 |
def nIgnored = 0 |
|
| 108 |
def nDeleted = 0 |
|
| 109 |
def nAdded = 0 |
|
| 110 |
|
|
| 111 |
def garbageBin = [] |
|
| 112 |
|
|
| 113 |
def nToks = corpus.getSize() |
|
| 114 |
def tokenIndex = new int[nToks] |
|
| 115 |
|
|
| 116 |
def i = 0 |
|
| 117 |
for (i=0 ; i< nToks ; i++) tokenIndex[i] = 1 |
|
| 118 |
|
|
| 119 |
errors = new HashMap() |
|
| 120 |
def units = analecCorpus.getUnites(unit_type) |
|
| 121 |
units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
|
|
| 122 |
|
|
| 123 |
// pour les doublons |
|
| 124 |
def lastUnit = null |
|
| 125 |
|
|
| 126 |
for (Unite unit : units) { // process all units
|
|
| 127 |
def erreur = "" |
|
| 128 |
|
|
| 129 |
// 1. Catégories vides |
|
| 130 |
def cat = unit.getProp( cat_name ); |
|
| 131 |
if (cat == "") erreur += "CAT " |
|
| 132 |
|
|
| 133 |
// 2. Référents vides (plus grave) ; pas besoin de catégories |
|
| 134 |
def ref = unit.getProp( ref_name ); |
|
| 135 |
if (ref == "") erreur += "REF " |
|
| 136 |
|
|
| 137 |
// 3. Remove the errors at the start of the unit (forbidden leading tokens) ; requires categories |
|
| 138 |
int[] positions = null |
|
| 139 |
if (unit.getDeb() == unit.getFin()) positions = [unit.getDeb()] |
|
| 140 |
else positions = (unit.getDeb()..unit.getFin()) |
|
| 141 |
def Mention = CQI.cpos2Str(posProperty.getQualifiedName(), positions) // values of the pos_property_name word property for every position of the unit |
|
| 142 |
|
|
| 143 |
def isOK = false |
|
| 144 |
while (isOK == false) { // loop until the first token is no longer one of interditsAuDebut
|
|
| 145 |
if (interditsAuDebut.contains(Mention[0])) { // NOTE(review): the header says prepositions other than 'de' are removed, but only the POS tag is tested here, not the word form - confirm
|
|
| 146 |
if (positions.size() == 1) { // single-token unit: the bounds cannot be moved, the unit is only flagged
|
|
| 147 |
erreur += "SUPPR" |
|
| 148 |
isOK = true |
|
| 149 |
} else {
|
|
| 150 |
def debut = unit.getDeb() |
|
| 151 |
unit.setDeb( debut + 1 ) // move the left bound one token to the right |
|
| 152 |
|
|
| 153 |
if (unit.getDeb() == unit.getFin()) positions = [unit.getDeb()] |
|
| 154 |
else positions = (unit.getDeb()..unit.getFin()) |
|
| 155 |
Mention = CQI.cpos2Str(posProperty.getQualifiedName(), positions) // re-read the tags for the new bounds |
|
| 156 |
if (!erreur.contains("BORNESG")) erreur += "BORNESG " // flag the left-bound change only once
|
|
| 157 |
} |
|
| 158 |
} else {
|
|
| 159 |
isOK = true |
|
| 160 |
} |
|
| 161 |
} |
|
| 162 |
|
|
| 163 |
|
|
| 164 |
// 4. Suppression des erreurs de borne de fin ; besoin de catégories |
|
| 165 |
if (unit.getDeb() == unit.getFin()) positions = [unit.getDeb()] |
|
| 166 |
else positions = (unit.getDeb()..unit.getFin()) |
|
| 167 |
Mention = CQI.cpos2Str(posProperty.getQualifiedName(), positions) |
|
| 168 |
|
|
| 169 |
isOK = false |
|
| 170 |
while ((isOK == false) && (erreur != "remove")) {
|
|
| 171 |
def n = Mention.size() |
|
| 172 |
if (interditsALaFin.contains(Mention[ n-1 ])) {
|
|
| 173 |
if (positions.size() == 1) {
|
|
| 174 |
if (!erreur.contains("SUPPR")) erreur += "SUPPR"
|
|
| 175 |
isOK = true |
|
| 176 |
} else {
|
|
| 177 |
def fin = unit.getFin() |
|
| 178 |
unit.setFin( fin - 1 ) |
|
| 179 |
|
|
| 180 |
if (unit.getDeb() == unit.getFin()) positions = [unit.getDeb()] |
|
| 181 |
else positions = (unit.getDeb()..unit.getFin()) |
|
| 182 |
Mention = CQI.cpos2Str(posProperty.getQualifiedName(), positions) |
|
| 183 |
if (!erreur.contains("BORNESD")) erreur += "BORNESD "
|
|
| 184 |
} |
|
| 185 |
} else isOK = true |
|
| 186 |
} |
|
| 187 |
|
|
| 188 |
// 5. Suppression des unités problématiques (bornes incohérentes ou mot vide) ; pas besoin de catégories |
|
| 189 |
def forme = CQI.cpos2Str(word.getQualifiedName(), positions)[0].trim().toLowerCase() |
|
| 190 |
|
|
| 191 |
if (erreur == "remove") {}
|
|
| 192 |
else if (unit.getFin() < unit.getDeb()) {
|
|
| 193 |
println "ERREUR GRAVE : segmentation incohérente" |
|
| 194 |
erreur = "remove" |
|
| 195 |
} else if ( forme.length() == 0 ) {
|
|
| 196 |
println "ERREUR GRAVE : unité sans mot" |
|
| 197 |
erreur = "remove" |
|
| 198 |
} |
|
| 199 |
|
|
| 200 |
// 6. Détection des doublons ; pas besoin de catégories |
|
| 201 |
if (lastUnit != null) {
|
|
| 202 |
if ((unit.getDeb() == lastUnit.getDeb()) && (unit.getFin() == lastUnit.getFin()) ) {
|
|
| 203 |
erreur += " DOUBLON " |
|
| 204 |
} |
|
| 205 |
} |
|
| 206 |
lastUnit = unit |
|
| 207 |
|
|
| 208 |
|
|
| 209 |
erreur = erreur.trim() |
|
| 210 |
if (erreur == "remove") {
|
|
| 211 |
garbageBin.add( unit ) |
|
| 212 |
nDeleted++ |
|
| 213 |
} else if (erreur != "") {
|
|
| 214 |
vue.setValeurChamp(unit, "CHECK", erreur) |
|
| 215 |
nModified++ |
|
| 216 |
} else {
|
|
| 217 |
nIgnored++ |
|
| 218 |
} |
|
| 219 |
|
|
| 220 |
// mise à jour des tokens couverts |
|
| 221 |
for (int p=unit.getDeb() ; p <= unit.getFin() ; p++) {
|
|
| 222 |
tokenIndex[p] = 0 |
|
| 223 |
} |
|
| 224 |
} |
|
| 225 |
|
|
| 226 |
// Suppression effective des unités incohérentes |
|
| 227 |
garbageBin.each {
|
|
| 228 |
analecCorpus.supUnite( it ) |
|
| 229 |
} |
|
| 230 |
|
|
| 231 |
// 7. Ajouter les pronoms non couverts par une annotation ; besoin de catégories |
|
| 232 |
// Parcourir les tokens non couverts ; obtenir leur POS et créer une unité si c'est un pronom |
|
| 233 |
|
|
| 234 |
if (checkPronouns) {
|
|
| 235 |
println "Détection des pronoms oubliés" |
|
| 236 |
for (i=0 ; i < nToks ; i++) {
|
|
| 237 |
if (tokenIndex[i] > 0) {
|
|
| 238 |
Mention = CQI.cpos2Str(posProperty.getQualifiedName(), i) |
|
| 239 |
if (listePronoms.contains(Mention[0])) {
|
|
| 240 |
def props = [:] |
|
| 241 |
props["CHECK"] = "NEW" |
|
| 242 |
Unite u = analecCorpus.addUniteSaisie(unit_type, i, i, props) |
|
| 243 |
vue.setValeurChamp(u, "CHECK", "NEW") |
|
| 244 |
nAdded++ |
|
| 245 |
} |
|
| 246 |
} |
|
| 247 |
} |
|
| 248 |
} |
|
| 249 |
|
|
| 250 |
if (nAdded + nModified + nDeleted > 0) corpus.setIsModified(true); |
|
| 251 |
|
|
| 252 |
println "Result:" |
|
| 253 |
println "- $nModified units of type $unit_type have been modified." |
|
| 254 |
println "- $nDeleted units of type $unit_type have been deleted." |
|
| 255 |
println "- $nIgnored units of type $unit_type have not been modified." |
|
| 256 |
println "- $nAdded forgotten pronominal units of type $unit_type have been added.\n" |
|
| 257 |
|
|
| 258 |
AnalecCorpora.getVue(analecCorpus).retablirVueParDefaut() |
|
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/RetoucheComplementDuNomMacro.groovy (revision 2094) | ||
|---|---|---|
| 5 | 5 |
import org.kohsuke.args4j.* |
| 6 | 6 |
import groovy.transform.* |
| 7 | 7 |
import org.txm.* |
| 8 |
import org.txm.rcp.swt.widget.parameters.* |
|
| 9 |
import org.txm.annotation.urs.*
|
|
| 8 |
import org.txm.rcpapplication.swt.widget.parameters.*
|
|
| 9 |
import org.txm.analec.*
|
|
| 10 | 10 |
import org.txm.searchengine.cqp.* |
| 11 | 11 |
import org.txm.searchengine.cqp.corpus.* |
| 12 | 12 |
import visuAnalec.Message.* |
| ... | ... | |
| 16 | 16 |
|
| 17 | 17 |
|
| 18 | 18 |
/* MACRO pour corriger une erreur d'annotation |
| 19 |
Retirer le "De" du complément du nom |
|
| 20 |
Algo : |
|
| 21 |
POUR CHAQUE MENTION dont le premier mot est "de" (en minuscules) |
|
| 22 |
SI il existe une autre MENTION dans laquelle celle-ci est totalement incluse |
|
| 23 |
ALORS incrémenter d'un mot la frontière gauche de la mention |
|
| 24 |
Ajouter la categorie CDN.CHECK pour qu'on puisse verifier facilement le job. |
|
| 25 |
*/ |
|
| 19 |
Retirer le "de" ou le "d'" (forme élidée) du complément du nom |
|
| 26 | 20 |
|
| 21 |
Algo : |
|
| 22 |
POUR CHAQUE MENTION dont le premier mot est "de" ou "d'" (en minuscules) |
|
| 23 |
SI il existe une autre MENTION dans laquelle celle-ci est totalement incluse |
|
| 24 |
ALORS incrémenter d'un mot la frontière gauche de la mention |
|
| 25 |
Ajouter la catégorie CDN.CHECK pour qu'on puisse vérifier facilement le job. |
|
| 26 |
|
|
| 27 |
Cette macro ne s'appuie pas sur des catégories morphosyntaxiques. |
|
| 28 |
On peut donc la faire tourner avant les frpos2cattex ou fropos2cattex |
|
| 29 |
|
|
| 30 |
Auteur : Matthieu QUIGNARD |
|
| 31 |
Date : 18 janvier 2019 |
|
| 32 |
*/ |
|
| 33 |
|
|
| 27 | 34 |
// BEGINNING OF PARAMETERS |
| 28 |
if (!(corpusViewSelection instanceof org.txm.searchengine.cqp.corpus.CQPCorpus)) {
|
|
| 29 |
println "Selection must be a Corpus: "+corpusViewSelection |
|
| 30 |
return; |
|
| 31 |
} |
|
| 35 |
|
|
| 32 | 36 |
// Declare each parameter here |
| 33 | 37 |
// BEGINNING OF PARAMETERS |
| 34 | 38 |
@Field @Option(name="unit_type", usage="", widget="String", required=true, def="MENTION") |
| 35 |
def unit_type
|
|
| 39 |
def unit_type |
|
| 36 | 40 |
@Field @Option(name="category_name", usage="", widget="String", required=true, def="CATEGORIE") |
| 37 |
def category_name
|
|
| 41 |
def category_name |
|
| 38 | 42 |
if (!ParametersDialog.open(this)) return |
| 39 | 43 |
|
| 40 |
corpus = corpusViewSelection.getMainCorpus()
|
|
| 41 |
CQI = CQPSearchEngine.getCqiClient()
|
|
| 44 |
corpus = corpusViewSelection
|
|
| 45 |
CQI = Toolbox.getCqiClient()
|
|
| 42 | 46 |
word = corpus.getWordProperty() |
| 43 | 47 |
|
| 44 |
analecCorpus = URSCorpora.getCorpus(corpus)
|
|
| 45 |
vue = URSCorpora.getVue(corpus)
|
|
| 48 |
analecCorpus = AnalecCorpora.getCorpus(corpus.getName())
|
|
| 49 |
vue = AnalecCorpora.getVue(corpus.getName())
|
|
| 46 | 50 |
structure = analecCorpus.getStructure() |
| 47 | 51 |
if (!structure.getUnites().contains(unit_type)) { // check if the structure contains the unit_type units
|
| 48 | 52 |
println "Error: corpus structure does not contains unit with name=$unit_type" |
| ... | ... | |
| 51 | 55 |
|
| 52 | 56 |
|
| 53 | 57 |
// Si la structure d'annotation ne contient pas CATEGORIE, on la crée avec ses valeurs |
| 54 |
if (!structure.getUniteProperties(unit_type).contains(category_name)) {
|
|
| 55 |
structure.ajouterProp(Unite.class, unit_type, category_name)
|
|
| 58 |
if (!structure.getUniteProperties(unit_type).contains(category_name)) {
|
|
| 59 |
structure.ajouterProp(Unite.class, unit_type, category_name)
|
|
| 56 | 60 |
} |
| 57 | 61 |
|
| 58 | 62 |
def check_cat = "CDN.CHECK" |
| ... | ... | |
| 69 | 73 |
units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
|
| 70 | 74 |
|
| 71 | 75 |
/* Test sur la premiere mention : |
| 72 |
def debut1 = units[1].getDeb()
|
|
| 73 |
def fin1 = units[1].getFin()
|
|
| 74 |
println "$debut1 - $fin1"
|
|
| 75 |
units[1].setDeb( debut1 + 1)
|
|
| 76 |
units[1].setFin( fin1 + 1)
|
|
| 77 |
URSCorpora.getVue(analecCorpus).retablirVueParDefaut()
|
|
| 78 |
def debut2 = units[1].getDeb()
|
|
| 79 |
def fin2 = units[1].getFin()
|
|
| 80 |
println "$debut2 - $fin2"
|
|
| 81 |
*/
|
|
| 76 |
def debut1 = units[1].getDeb() |
|
| 77 |
def fin1 = units[1].getFin() |
|
| 78 |
println "$debut1 - $fin1" |
|
| 79 |
units[1].setDeb( debut1 + 1) |
|
| 80 |
units[1].setFin( fin1 + 1) |
|
| 81 |
AnalecCorpora.getVue(analecCorpus).retablirVueParDefaut()
|
|
| 82 |
def debut2 = units[1].getDeb() |
|
| 83 |
def fin2 = units[1].getFin() |
|
| 84 |
println "$debut2 - $fin2" |
|
| 85 |
*/ |
|
| 82 | 86 |
|
| 83 | 87 |
|
| 84 | 88 |
for (Unite unit : units) { // process all units
|
| 85 |
|
|
| 89 |
|
|
| 86 | 90 |
def debut = unit.getDeb() |
| 87 | 91 |
def fin = unit.getFin() |
| 88 | 92 |
def premierMot = CQI.cpos2Str(word.getQualifiedName(), debut)[0] |
| 89 |
|
|
| 90 |
if (premierMot != "de") {
|
|
| 93 |
|
|
| 94 |
if ((premierMot != "de") && (premierMot !="d'")) {
|
|
| 91 | 95 |
nIgnored1++ |
| 92 | 96 |
compteur++ |
| 93 | 97 |
continue |
| 94 | 98 |
} else {
|
| 95 |
for (i = compteur-1; i >= 0 ; i--) {
|
|
| 99 |
for (i = compteur-1; i>=0 ; i--) {
|
|
| 96 | 100 |
def u = units[i] |
| 97 | 101 |
def udeb = u.getDeb() |
| 98 | 102 |
def ufin = u.getFin() |
| 99 | 103 |
if (ufin >= fin) {
|
| 100 |
println "\nAVANT => Unit $compteur : $debut - $fin"
|
|
| 101 |
if (fin > debut) unit.setDeb( debut++ )
|
|
| 102 |
else println "not resizing"
|
|
| 103 |
def debut2 = unit.getDeb()
|
|
| 104 |
def fin2 = unit.getFin()
|
|
| 105 |
URSCorpora.getVue(analecCorpus).retablirVueParDefaut()
|
|
| 106 |
println "APRES => Unit $compteur : $debut2 - $fin2"
|
|
| 107 |
unit.getProps().put(category_name, check_cat)
|
|
| 108 |
break
|
|
| 104 |
println "\nAVANT => Unit $compteur : $debut - $fin"
|
|
| 105 |
if (fin > debut) unit.setDeb( ++debut )
|
|
| 106 |
else println "not resizing"
|
|
| 107 |
def debut2 = unit.getDeb()
|
|
| 108 |
def fin2 = unit.getFin()
|
|
| 109 |
AnalecCorpora.getVue(analecCorpus).retablirVueParDefaut()
|
|
| 110 |
println "APRES => Unit $compteur : $debut2 - $fin2"
|
|
| 111 |
unit.getProps().put(category_name, check_cat)
|
|
| 112 |
break
|
|
| 109 | 113 |
} |
| 110 | 114 |
} |
| 111 |
if (i < 0) nIgnored2++
|
|
| 115 |
if (i <0) nIgnored2++ |
|
| 112 | 116 |
else nModified++ |
| 113 | 117 |
compteur++ |
| 114 |
} |
|
| 118 |
}
|
|
| 115 | 119 |
} |
| 116 | 120 |
|
| 117 | 121 |
println "\nResult:" |
| ... | ... | |
| 121 | 125 |
println "Total ($compteur)." |
| 122 | 126 |
|
| 123 | 127 |
// END OF PARAMETERS |
| 124 |
URSCorpora.getVue(analecCorpus).retablirVueParDefaut()
|
|
| 128 |
AnalecCorpora.getVue(analecCorpus).retablirVueParDefaut()
|
|
| 125 | 129 |
println "corpora selection: "+corpusViewSelection |
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/SUJ-PARTINF.txt (revision 2094) | ||
|---|---|---|
| 1 |
Macro : SUJ-PARTINF |
|
| 2 |
Auteur : Matthieu QUIGNARD |
|
| 3 |
Date : 05 Février 2019 |
|
| 4 |
|
|
| 5 |
Recatégorise les mentions étiquetées en SUJ.ZERO pour distinguer les cas |
|
| 6 |
des verbes conjugués (qui restent en SUJ.ZERO) |
|
| 7 |
des verbes à l'infinitif ou au participe passé/présent qui, eux, vont |
|
| 8 |
passer sous la catégorie SUJ.PARTINF |
|
| 9 |
|
|
| 10 |
Cette macro est réversible. |
|
| 11 |
Il suffit d'invoquer la macro ChercherRemplacer et de substituer |
|
| 12 |
CATEGORIE=SUJ.PARTINF |
|
| 13 |
en |
|
| 14 |
CATEGORIE=SUJ.ZERO |
|
| 15 |
|
|
| 16 |
NB : fonctionne avec le tagset fro (ancien français) et le tagset TreeTagger. |
|
| 17 |
Infinitif : VERinf VER:infi |
|
| 18 |
Part.Pass : VERppa VER:pper |
|
| 19 |
Part.Pres : VERppe VER:ppre |
|
| 0 | 20 | |
| tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/Verifications_alMacro.groovy (revision 2094) | ||
|---|---|---|
| 1 |
// Auteur Matthieu Quignard |
|
| 2 |
// Date : 14 janvier 2019 |
|
| 3 |
|
|
| 4 |
/********** |
|
| 5 |
Vérifications automatiques |
|
| 6 |
1. Repère les mentions sans catégorie : CHECK > CAT |
|
| 7 |
2. Repère les mentions sans référent : CHECK > REF |
|
| 8 |
3. Supprime les ponctuations en début et en fin de mention : CHECK > BORNES |
|
| 9 |
4. Supprime les prépositions autres que 'de' en début de mention : CHECK > BORNES |
|
| 10 |
5. Supprime automatiquement toutes les mentions vides = sans aucun mot = de longueur 0 |
|
| 11 |
6. Détecte les mentions qui ont exactement les mêmes bornes : CHECK > DOUBLON |
|
| 12 |
7 (option). Détecte les pronoms hors mention : CHECK > NEW |
|
| 13 |
***********/ |
|
| 14 |
|
|
| 15 |
package org.txm.macro.urs.democrat |
|
| 16 |
|
|
| 17 |
import org.apache.commons.lang.* |
|
| 18 |
import org.kohsuke.args4j.* |
|
| 19 |
import groovy.transform.* |
|
| 20 |
import org.txm.* |
|
| 21 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 22 |
import org.txm.analec.* |
|
| 23 |
import org.txm.searchengine.cqp.* |
|
Formats disponibles : Unified diff