// Diff - Plateforme TXM - Forge du Centre Blaise Pascal
//
// Révision 2633

     package org.txm.macro.cqp
     // Copyright © 2016 ENS de Lyon
     //
     // Authors:
     // - Serge Heiden
     //
     // Licence:
     // This file is part of the TXM platform.
     // The TXM platform is free software: you can redistribute it
     // and/or modify it under the terms of the GNU General Public
     // License as published by the Free Software Foundation,
     // either version 2 of the License, or (at your option) any
     // later version.
     //
     // The TXM platform is distributed in the hope that it will be
     // useful, but WITHOUT ANY WARRANTY; without even the implied
     // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     // PURPOSE. See the GNU General Public License for more
     // details.
     //
     // You should have received a copy of the GNU General
     // Public License along with the TXM platform. If not, see
     // http://www.gnu.org/licenses.
     //
     // Version:
     // $LastChangedDate: 2014-11-01 16:00:01 +0100 (sam., 1 nov. 2014) $
     // $LastChangedRevision: XXXX $
     // $LastChangedBy: sheiden $
     //
     //FR:
     // Macro affichant les statistiques de différentes structures d'un corpus
     //
     // Exemple de résultat :
     // struct	prop	start	end		t		v		fmin	fmax
     // act		I		153		5824	5672	1396	1		400
     // scene	I.I		155		2415	2261	736		1		159
     // scene	I.II	2416	3216	686		382		1		15
     // scene	I.III	3217	5824	2206	775		1		78
     // act		II		5825	13444	6397	1667	1		199
     // scene	II.I	5827	8017	1855	786		1		47
     // scene	II.II	8018	8754	601		267		1		19
     // scene	II.III	8755	11898	2642	893		1		88
     // scene	II.IV	11899	12490	496		232		1		13
     // scene	II.V	12491	13444	801		371		1		39
     // ...
     //
     // Pour le corpus SHAKESPEARE (All's Well That Ends Well) avec les paramètres :
     // - corpus : SHAKESPEARE
     // - structures : act,scene
     // - structProperties : n,n
     // - query : [word!='\p{P}']
     // - wordProperty : word
     //
     // Paramètres de la macro :
     // - corpus : nom du corpus à interroger
     // - structures : liste des structures à interroger. Séparer les noms par une virgule
     // - structProperties : liste des propriétés de structures. Séparer les noms par une virgule.
     //						Il doit y avoir autant de propriétés de structures que de structures indiquées dans le paramètre structures.
     //						Les structures doivent posséder la propriété demandée.
     //						Ce paramètre peut être laissé vide, dans ce cas la colonne 'prop' n'est pas affichée.
     // - query : requête CQL de mot exprimée obligatoirement en format complet : [...]
     // - wordProperty : propriété de mot utilisée pour calculer le vocabulaire et les fréquences
     //
     // Résultat :
     // Le résultat est un tableau TSV affiché dans la console.
     // On peut l'exploiter avec un copier/coller dans Calc.
     // Chaque ligne correspond à une structure du corpus.
     // Les lignes sont ordonnées par ordre hiérarchique des structures.
     // Les colonnes sont :
     // - struct : nom de la structure
     // - prop : valeur de la propriété de la structure
     //			(si le paramètre structProperties est vide, cette colonne est absente du résultat)
     // - start : position du premier mot de la structure dans le corpus
     //			(les positions du corpus sont numérotées à partir de 0 - la position du premier mot du corpus est 0).
     //			Les colonnes start et end sont pratiques quand on n'a pas de propriété de structure à afficher pour se repérer dans le corpus.
     // - end : position du dernier mot de la structure
     // - t : nombre de mots de la structure
     // - v : nombre de valeurs différentes de la propriété de mot dans la structure
     // - fmin : fréquence minimale des valeurs de la propriété de mot dans la structure
     // - fmax : fréquence maximale des valeurs de la propriété de mot dans la structure
     // Déclarations
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.Toolbox
     import org.eclipse.ui.console.*
     // Field selector passed to corpusEngine.dumpSubCorpus() by print_index and print_freq.
     // NOTE(review): presumably the CQi code for the "match" anchor of a query result — confirm against the CQi spec.
     byte CQI_CONST_FIELD_MATCH = (byte) 0x10
     // BEGINNING OF PARAMETERS
     // Name of the corpus to query
     @Field @Option(name="corpus", usage="SHAKESPEARE", widget="String", required=true, def="SHAKESPEARE")
     def corpus
     // Comma-separated list of the structures to describe
     @Field @Option(name="structures", usage="act,scene", widget="String", required=true, def="act,scene")
     def structures
     // Comma-separated list of structure properties, paired by position with 'structures';
     // when empty, the start and end positions are displayed instead of the property value
     @Field @Option(name="structProperties", usage="n,n", widget="String", required=false, def="n,n")
     def structProperties
     // CQL word query; it is embedded as "a:<query> :: a>=start & a<=end" for each structure
     @Field @Option(name="query", usage="[word!='\\p{P}']", widget="String", required=true, def="[word!='\\p{P}']")
     def query
     // Word property used to compute the vocabulary and the frequencies
     @Field @Option(name="wordProperty", usage="word", widget="String", required=true, def="word")
     def wordProperty
     // When true, print_freq appends the most frequent values of the word property to each row
     @Field @Option(name="displayIndex", usage="display a hierarchical index", widget="Boolean", required=true, def="true")
     def displayIndex
     // Number of values kept in the index displayed when displayIndex is true
     @Field @Option(name="Vmax", usage="size of index", widget="Integer", required=false, def="20")
     def Vmax
     // Open the parameters input dialog box
     // (the script stops here when ParametersDialog.open() returns false)
     if (!ParametersDialog.open(this)) return;
     // END OF PARAMETERS
     // Clears the first console registered with the Eclipse console manager.
     // Does nothing when no console is registered, instead of failing on an
     // out-of-bounds access.
     def clearConsole = { ->
     	def consoles = ConsolePlugin.getDefault().getConsoleManager().getConsoles()
     	if (consoles) {
     		consoles[0].clearConsole()
     	}
     }
     clearConsole()
     // Corpus engine client used for every CQi call below
     def corpusEngine = Toolbox.getCqiClient()
     def corpusName = corpus 							// "SHAKESPEARE"
     // Structure names; spaces around the commas are ignored
     def corpusStructs = (structures.split(",")*.trim()) as String[]	// ["act", "scene"]
     // structProperties is an optional parameter: a missing value is handled like an empty one
     structProperties = (structProperties ?: "").trim()
     if (structProperties.size() > 0) {
     	// propParam, corpusStructPropNames and corpusStructProps are intentionally left
     	// undeclared (script binding variables), as in the original script
     	propParam = true
     	corpusStructPropNames = (structProperties.split(",")*.trim()) as String[]	// ["n", "n"]
     	// map each structure name to its property name, paired by position
     	corpusStructProps = [corpusStructs, corpusStructPropNames].transpose().collectEntries()
     } else {
     	propParam = false
     }
     // Keep only the requested structures among the structural attributes of the corpus:
     // attributes containing '_' and the "txmcorpus" attribute are discarded first
     def struct_names = (corpusEngine.corpusStructuralAttributes(corpusName) as List)
     struct_names.removeAll { it.contains('_') }
     struct_names=(struct_names-"txmcorpus").grep(corpusStructs)
     //println "struct_names = "+struct_names
     if (struct_names.size() == 0) {
     	println "** Impossible to find the structures (${corpusStructs}), aborting."
     	return
     }
     /*
     class Node {
         String  	name
         Integer 	start
         Integer 	end
         Node 	parent
         List<Node> 	children
         Node(String n, Integer s, Integer e) {
             name  = n
             start = s
             end   = e
     	children = new LinkedList<Node>()
+        }
         public int compareTo(Node n) {
     /* Possible combinations
        this = []
           n = {}
        an interval is not empty
     [  {  }  ] -> this is parent 1
     [  {  }]   -> this is parent 1
     [  {  ]  } -> *overlap 0
     [  ]{  }   -> this is left sibling 2
     [  ]  {  } -> this is left sibling 2
     [{  }  ]   -> this is parent 1
     [{  }]     -> *duplicate 0
     [{  ]  }   -> this is child -1
     {  [  ]  } -> n is parent -1
     {  [  ]}   -> n is parent -1
     {  [  }  ] -> *overlap 0
     {  }[  ]   -> n is left sibling -2
     {  }  [  ] -> n is left sibling -2
     {[  ]  }   -> n is parent -1
     {[  ]}     -> *duplicate 0
     {[  }  ]   -> n is child 1
     //
     if (start < n.start) {
     // [  {  }  ] -> this is parent 1
     // [  {  }]   -> this is parent 1
     // [  {  ]  } -> *overlap 0
     // [  ]{  }   -> this is left sibling 2
     // [  ]  {  } -> this is left sibling 2
     	if (end > n.end) {
     // [  {  }  ] -> this is parent 1
     		return 1
     	} else if (end == n.end) {
     // [  {  }]   -> this is parent 1
     			return 1
     	} else if (end < n.end && end > n.start) {
     // [  {  ]  } -> *overlap 0
     			println "** Error: overlapping intervals, [  {  ]  } should not happen, "+this.toString()+", "+n.toString()
     			return 0
     	} else if (end == n.start) {
     // [  ]{  }   -> this is left sibling 2
     			return 2
     	} else if (end < n.start) {
     // [  ]  {  } -> this is left sibling 2
     			return 2
     	} else {
     // should not happen
     		println "** Error: should not happen, "+this.toString()+", "+n.toString()
     		return -10
+    	}
     } else if (start > n.start) {
     // {  [  ]  } -> n is parent -1
     // {  [  ]}   -> n is parent -1
     // {  [  }  ] -> *overlap 0
     // {  }[  ]   -> n is left sibling -2
     // {  }  [  ] -> n is left sibling -2
     	if (end < n.end) {
     // {  [  ]  } -> n is parent -1
     		return -1
     	} else if (end == n.end) {
     // {  [  ]}   -> n is parent -1
     		return -1
     	} else if (end > n.end && n.end > start) {
     // {  [  }  ] -> *overlap 0
     			println "** Error: overlapping intervals, {  [  }  ] should not happen, "+this.toString()+", "+n.toString()
     			return 0
     	} else if (n.end == start) {
     // {  }[  ]   -> n is left sibling -2
     		return -2
     	} else if (n.end < start)
     // {  }  [  ] -> n is left sibling -2
     			return -2
     	} else {
     // should not happen
     		println "** Error: should not happen, "+this.toString()+", "+n.toString()
     		return -10
+    	}
     } else {
     // [{  }  ]   -> this is parent 1
     // [{  }]     -> *duplicate 0
     // [{  ]  }   -> this is child -1
     // {[  ]  }   -> n is parent -1
     // {[  ]}     -> *duplicate 0
     // {[  }  ]   -> n is child 1
     	if (end > n.end) {
     // [{  }  ]   -> this is parent 1
     // {[  }  ]   -> n is child 1
     		return 1
     	} else if (end < n.end) {
     // [{  ]  }   -> this is child -1
     // {[  ]  }   -> n is parent -1
     		return -1
     	} else if (end == n.end) {
     // [{  }]     -> *duplicate 0
     // {[  ]}     -> *duplicate 0
     			println "** Error: duplicate intervals, [{  }] should not happen, "+this.toString()+", "+n.toString()
     			return 0
     	} else {
     // should not happen
     		println "** Error: should not happen, "+this.toString()+", "+n.toString()
     		return -10
+    	}
+     }
+    }
     	public Node add(Node n) {
     	switch (this.compareTo(n)) {
     	case 1:
             childNode = new Node(n, s, e)
             childNode.parent = this
             this.children.add(childNode)
             return childNode
+        }
         public toString(Node n) {
     		sprintf("%s[%d, %d]", n.name, n.start, n.end)
+        }
         public print(Node n) {
     		print(n.toString())
+        }
+    }
     */
     // First define the order theory over corpus structures intervals
     // by defining a binary comparator that will be used to build the
     // TreeSet of intervals
     /**
      * One occurrence of a corpus structure, described by the structure name and
      * the positions of its first and last word.
      *
      * The natural order lists the intervals in depth-first order:
      * - the interval starting first comes first;
      * - for the same start, the longer interval (the container) comes first;
      * - for the same interval, the lexicographic order of the names is used.
      * Two instances compare as equal only when name, start and end are all equal.
      */
     class Struct implements Comparable<Struct> {
         String  name
         Integer start
         Integer end

         /**
          * @param n name of the structure
          * @param s position of the first word of the structure
          * @param e position of the last word of the structure
          */
         Struct(String n, Integer s, Integer e) {
             name  = n
             start = s
             end   = e
         }

         /**
          * Compares this interval with s, see the class comment for the order.
          *
          * @param s the interval to compare with
          * @return a negative value when this comes before s, a positive value
          *         when it comes after, 0 when name, start and end are equal
          */
         public int compareTo(Struct s) {
             if (start != s.start) { 					// interval starting on the left comes first : [ { ...
                 return start < s.start ? -1 : 1
             }
             if (end != s.end) { 						// same start, interval ending on the right comes first : [{ } ]
                 // the shorter interval must come AFTER its container, otherwise
                 // a < b and b < a would both hold and the TreeSet order is undefined
                 return end > s.end ? -1 : 1
             }
             return name.compareTo(s.name) 				// same interval : use the lexicographic order of the structure names
         }

         /** @return this interval formatted as name[start, end] */
         @Override
         String toString() {
             return toString(this)
         }

         /**
          * @param s the interval to format
          * @return s formatted as name[start, end]
          */
         public toString(Struct s) {
             sprintf("%s[%d, %d]", s.name, s.start, s.end)
         }

         /**
          * Prints s formatted as name[start, end] on the standard output, without newline.
          *
          * @param s the interval to print
          */
         public print(Struct s) {
             System.out.print(toString(s))
         }
     }
     // Now build the TreeSet of corpus structures intervals:
     // one Struct per occurrence of each selected structure
     def h = new TreeSet<Struct>()
     struct_names.each { structName ->
     	def attribute = "${corpusName}.${structName}"
     	def count = corpusEngine.attributeSize(attribute)
     	// half-open range: 0..count-1 would be the reverse range [0, -1] when count is 0
     	for (i in 0..<count) {
     		def (start, end) = corpusEngine.struc2Cpos(attribute, i)
     		//println sprintf("Adding %s[%d, %d]", structName, start, end)
     		h.add(new Struct(structName, start, end))
     	}
     }
     // function to print the hierarchical index of a query
     // - c : name of the corpus
     // - q : CQL query
     // - p : word property whose values are counted
     // - cut : number of entries to print, all the entries are printed when cut <= 0
     // The result is printed as a map value -> frequency, by decreasing frequency.
     def print_index = { c, q, p, cut ->
     	corpusEngine.cqpQuery(c, "RES1", q)
     	try {
     		def size = corpusEngine.subCorpusSize("${c}:RES1")
     		def matches_target_p = corpusEngine.cpos2Str("${c}.${p}", corpusEngine.dumpSubCorpus("${c}:RES1", CQI_CONST_FIELD_MATCH, 0, size-1))
     		def index = matches_target_p.countBy { it }.sort { -it.value }
     		println(cut > 0 ? index.take(cut) : index)
     	} finally {
     		// always release the temporary subcorpus, even when a call above fails
     		corpusEngine.dropSubCorpus("${c}:RES1")
     	}
     }
     // function to print the statistics of an index of a query
     // - c : name of the corpus
     // - q : CQL query
     // - p : word property used to compute the vocabulary and the frequencies
     // Prints "t<TAB>v<TAB>fmin<TAB>fmax" (number of matches, number of different
     // values, minimal and maximal frequency of a value) and ends the line. When
     // displayIndex is set, the Vmax most frequent values are appended to the line.
     def print_freq = { c, q, p ->
     	def subcorpus = "${c}:RES1"
     	def property = "${c}.${p}"
     	// call the corpus engine
     	corpusEngine.cqpQuery(c, "RES1", q)
     	try {
     		def tC = corpusEngine.subCorpusSize(subcorpus)
     		// codes of the property values at the positions of the matches
     		def matches_target_p = corpusEngine.cpos2Id(property, corpusEngine.dumpSubCorpus(subcorpus, CQI_CONST_FIELD_MATCH, 0, tC-1))
     		// compute the frequency of each code
     		// (collect { it } is kept from the original; presumably it turns the engine result into a List before countBy — confirm)
     		def index = matches_target_p.collect { it }.countBy { it }
     		def freqs = index.values()
     		//def tF = freqs.sum() // control value
     		def v = freqs.size()
     		def fmin = freqs.min()
     		def fmax = freqs.max()
     		print sprintf("%d\t%d\t%d\t%d", tC, v, fmin, fmax)
     		if (displayIndex) {
     			// most frequent codes first; Vmax is optional, keep every code when it is not set
     			def ranked = index.sort { -it.value }
     			def heads = (Vmax != null ? ranked.take(Vmax) : ranked).keySet()
     			// display the values corresponding to the codes
     			println "\t"+heads.collect { corpusEngine.id2Str(property, it)[0] }
     		} else {
     			println ""
     		}
     	} finally {
     		// always release the temporary subcorpus, even when a call above fails
     		corpusEngine.dropSubCorpus(subcorpus)
     	}
     }
     // Header row of the TSV result: the 'prop' column replaces the 'start' and 'end'
     // columns when structure properties were given, and the 'index' column is
     // present only when displayIndex is set
     print(propParam ? "struct\tprop\tt\tv\tfmin\tfmax" : "struct\tstart\tend\tt\tv\tfmin\tfmax")
     println(displayIndex ? "\tindex" : "")
     // Output directory <home>/Documents/d3test; the user.home system property is
     // used when the HOME environment variable is not set
     def homeDir = System.getenv("HOME") ?: System.getProperty("user.home")
     def localPath = homeDir+"/Documents/d3test"
     new File(localPath).mkdirs()
     // (re)create the output file with the beginning of the HTML page;
     // the page is left open on the declaration of the 'tree' JavaScript variable
     def resultFile = new File(localPath, "desc-partition.html")
     resultFile.text = '''\
     <!DOCTYPE html>
     <html>
       <head>
         <meta http-equiv="Content-Type" content="text/html;charset=utf-8" charset="UTF-8"/>
         <link type="text/css" rel="stylesheet" href="style.css"/>
         <script type="text/javascript" src="d3/d3.v3.js" charset="utf-8"></script>
         <script type="text/javascript" src="d3/layout/partition.js" charset="utf-8"></script>
         <style type="text/css">
     .chart {
       display: block;
       margin: auto;
       margin-top: 60px;
       font-size: 11px;
     }
     rect {
       stroke: #eee;
       fill: #aaa;
       fill-opacity: .8;
     }
     rect.parent {
       cursor: pointer;
       fill: steelblue;
     }
     text {
       pointer-events: none;
     }
         </style>
       </head>
       <body>
         <div id="body">
           <div id="footer">
             Structures hierarchy
             <div class="hint">click or shift-alt-click to zoom-in or out</div>
           </div>
         </div>
         <script type="text/javascript">
     var w = 1120,
         h = 600,
         x = d3.scale.linear().range([0, w]),
         y = d3.scale.linear().range([0, h]);
     var vis = d3.select("#body").append("div")
         .attr("class", "chart")
         .style("width", w + "px")
         .style("height", h + "px")
       .append("svg:svg")
         .attr("width", w)
         .attr("height", h);
     var partition = d3.layout.partition()
         .value(function(d) { return d.size; }).sort(null);
     var tree = `{'''
     // Now iterate on the TreeSet to get a depth first search on the structure intervals
     // Pattern matching a non-digit prefix followed by digits; group 1 captures the prefix.
     // It is applied to structure names in the output loop to get the name without its number.
     // NOTE(review): presumably meant for numbered recursive structures such as "div1" — confirm.
     def rec_struct_regex = /([^0-9]+)[0-9]+/
     /*
      "name": "sha-hamlet",
      "children": [
+      {
        "name": "sha-hamcast",
        "children": [
+        {
          "name": "sha-ham1",
          "children": [
           {"name": "sha-ham102", "size": 855},
           {"name": "sha-ham103", "size": 464},
           {"name": "sha-ham104", "size": 296},
           {"name": "sha-ham105", "size": 635}
+         ]
+        }
+       ]
+      }
+     ]
     }`;
     */
     // Debugging helper: starting from 'head', prints on one line the intervals that
     // follow it in the TreeSet, then does the same from the next interval; the last
     // interval of the set is printed alone.
     // The variable is declared before the closure is assigned so that the closure
     // can call itself.
     def displayTree
     displayTree = { head ->
     	if (head) {
     		// strictly greater elements: with the inclusive tailSet(head) the tail
     		// always contains 'head' and is never empty
     		def subtree = h.tailSet(head, false)
     		subtree.each { print sprintf("%s[%d, %d], ", it.name, it.start, it.end) }
     		println ""
     		if (subtree.size() == 0) {
     			println sprintf("%s[%d, %d]", head.name, head.start, head.end)
     		} else {
     			// continue from the next interval (an element, not the whole tail set)
     			displayTree(subtree.first())
     		}
     	}
     }
     //displayTree(h.first())
     // Print one TSV row per structure interval, in the order of the TreeSet:
     // the structure name, then either the value of its property or its start and
     // end positions, then the statistics printed by print_freq
     h.each { st ->
     	//println sprintf("Displaying %s[%d, %d]", st.name, st.start, st.end)
     	if (propParam) {
     		// use the name without its trailing number when the name is a prefix followed by digits
     		def rec_match = (st.name =~ rec_struct_regex)
     		def istruct_name = (rec_match.size() == 1) ? rec_match[0][1] : st.name
     		// structural attribute holding the property: <corpus>.<structure>_<property>
     		def struct_name = "${corpusName}.${istruct_name}_${corpusStructProps[st.name]}"
     		// value of the property for the structure containing the first position of the interval
     		def prop_value = corpusEngine.struc2Str(struct_name, corpusEngine.cpos2Struc(struct_name, [st.start] as int[]))[0]
     		print sprintf("%s\t%s\t", st.name, prop_value)
     	} else {
     		print sprintf("%s\t%d\t%d\t", st.name, st.start, st.end)
     	}
     	// restrict the word query to the positions of the interval
     	print_freq(corpusName, sprintf("a:%s :: a>=%d & a<=%d", query, st.start, st.end), wordProperty)
     }

// Formats disponibles : Unified diff
//
// Laboratoire ICAR » Plateforme TXM
//
// Révision 2633