Révision 2633

tmp/org.txm.groovy.core/src/groovy/org/txm/macro/prototypes/commands/TreeMapDescriptionMacro.groovy (revision 2633)
1
package org.txm.macro.cqp
2
// Copyright © 2016 ENS de Lyon
3
//
4
// Authors:
5
// - Serge Heiden
6
//
7
// Licence:
8
// This file is part of the TXM platform.
9
// The TXM platform is free software: you can redistribute it
10
// and/or modify it under the terms of the GNU General Public
11
// License as published by the Free Software Foundation,
12
// either version 2 of the License, or (at your option) any
13
// later version.
14
//
15
// The TXM platform is distributed in the hope that it will be
16
// useful, but WITHOUT ANY WARRANTY; without even the implied
17
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
18
// PURPOSE. See the GNU General Public License for more
19
// details.
20
//
21
// You should have received a copy of the GNU General
22
// Public License along with the TXM platform. If not, see
23
// http://www.gnu.org/licenses.
24
//
25
// Version:
26
// $LastChangedDate: 2014-11-01 16:00:01 +0100 (sam., 1 nov. 2014) $
27
// $LastChangedRevision: XXXX $
28
// $LastChangedBy: sheiden $
29
//
30

  
31
//FR:
32
// Macro affichant les statistiques de différentes structures d'un corpus
33
//
34
// Exemple de résultat :
35
// struct	prop	start	end		t		v		fmin	fmax
36
// act		I		153		5824	5672	1396	1		400
37
// scene	I.I		155		2415	2261	736		1		159
38
// scene	I.II	2416	3216	686		382		1		15
39
// scene	I.III	3217	5824	2206	775		1		78
40
// act		II		5825	13444	6397	1667	1		199
41
// scene	II.I	5827	8017	1855	786		1		47
42
// scene	II.II	8018	8754	601		267		1		19
43
// scene	II.III	8755	11898	2642	893		1		88
44
// scene	II.IV	11899	12490	496		232		1		13
45
// scene	II.V	12491	13444	801		371		1		39
46
// ...
47
// 
48
// Pour le corpus SHAKESPEARE (All's Well That Ends Well) avec les paramètres :
49
// - corpus : SHAKESPEARE
50
// - structures : act,scene
51
// - structProperties : n,n
52
// - query : [word!='\p{P}']
53
// - wordProperty : word
54
//
55
// Paramètres de la macro :
56
// - corpus : nom du corpus à interroger
57
// - structures : liste des structures à interroger. Séparer les noms par une virgule
58
// - structProperties : liste des propriétés de structures. Séparer les noms par une virgule.
59
//						Il doit y avoir autant de propriétés de structures que de structures indiquées dans le paramètre structure.
60
//						Les structures doivent posséder la propriété demandée.
61
//						Ce paramètre peut être laissé vide, dans ce cas la colonne 'prop' n'est pas affichée.
62
// - query : requête CQL de mot exprimée obligatoirement en format complet : [...]
63
// - wordProperty : propriété de mot utilisée pour calculer le vocabulaire et les fréquences
64
//
65
// Résultat :
66
// Le résultat est un tableau TSV affiché dans la console.
67
// On peut l'exploiter avec un copier/coller dans Calc.
68
// Chaque ligne correspond à une structure du corpus.
69
// Les lignes sont ordonnées par ordre hiérarchique des structures.
70
// Les colonnes sont :
71
// - struct : nom de la structure
72
// - prop : valeur de la propriété de la structure
73
//			(si le paramètre structProperties est vide, cette colonne est absente du résultat)
74
// - start : position du premier mot de la structure dans le corpus
75
//			(les positions du corpus sont numérotées à partir de 0 - la position du premier mot du corpus est 0).
76
//			Les colonnes start et end sont pratiques quand on n'a pas de propriété de structure à afficher pour se repérer dans le corpus.
77
// - end : position du dernier mot de la structure
78
// - t : nombre de mots de la structure
79
// - v : nombre de valeurs différentes de la propriété de mot dans la structure
80
// - fmin : fréquence minimale des valeurs de la propriété de mot dans la structure
81
// - fmax : fréquence maximale des valeurs de la propriété de mot dans la structure
82

  
83
// Déclarations
84

  
85
import org.kohsuke.args4j.*
86
import groovy.transform.Field
87
import org.txm.rcpapplication.swt.widget.parameters.*
88

  
89
import org.txm.Toolbox
90

  
91
import org.eclipse.ui.console.*
92

  
93
// Field selector handed to corpusEngine.dumpSubCorpus() by the query helpers below
// (presumably the CQi "match" field - confirm against the CQi constants).
byte CQI_CONST_FIELD_MATCH = (byte) 0x10

// BEGINNING OF PARAMETERS

// Name of the corpus to query.
@Field
@Option(name="corpus", usage="SHAKESPEARE", widget="String", required=true, def="SHAKESPEARE")
def corpus

// Comma-separated list of the structures to describe.
@Field
@Option(name="structures", usage="act,scene", widget="String", required=true, def="act,scene")
def structures

// Comma-separated list of structure properties, one per structure; may be left empty.
@Field
@Option(name="structProperties", usage="n,n", widget="String", required=false, def="n,n")
def structProperties

// CQL word query, written in full form: [...]
@Field
@Option(name="query", usage="[word!='\\p{P}']", widget="String", required=true, def="[word!='\\p{P}']")
def query

// Word property used to compute the vocabulary and the frequencies.
@Field
@Option(name="wordProperty", usage="word", widget="String", required=true, def="word")
def wordProperty

// When true, the most frequent values are appended to each result line.
@Field
@Option(name="displayIndex", usage="display a hierarchical index", widget="Boolean", required=true, def="true")
def displayIndex

// Number of values kept in the index displayed for each structure.
@Field
@Option(name="Vmax", usage="size of index", widget="Integer", required=false, def="20")
def Vmax

// Open the parameters input dialog box; stop here when open() returns a false value
def parametersAccepted = ParametersDialog.open(this)
if (!parametersAccepted) {
	return
}
120

  
121
// END OF PARAMETERS
122

  
123
// Wipes the first console registered in the Eclipse console manager,
// so that the table printed by this macro starts on an empty console.
def clearConsole = { ->
	def consoleManager = ConsolePlugin.getDefault().getConsoleManager()
	def registeredConsoles = consoleManager.getConsoles()
	registeredConsoles[0].clearConsole()
}

clearConsole()
129

  
130
// Resolve the macro parameters into the values used by the rest of the script:
//  - corpusEngine      : the CQi client used for every corpus request
//  - corpusStructs     : requested structure names
//  - propParam         : true when structure properties were given
//  - corpusStructProps : map structure name -> property name (only when propParam)
//  - struct_names      : requested structures that actually exist in the corpus
def corpusEngine = Toolbox.getCqiClient()

def corpusName = corpus 							// "SHAKESPEARE"

// Trim every item so that "act, scene" is accepted, and drop empty items ("act,,scene")
def corpusStructs = structures.split(",").collect { it.trim() }.findAll { it } 	// ["act", "scene"]

// The parameter is optional: treat a missing value like an empty one
structProperties = structProperties?.trim() ?: ""

if (structProperties.size() > 0) {
	propParam = true
	corpusStructPropNames = structProperties.split(",").collect { it.trim() }	// ["n", "n"]
	// transpose() silently truncates to the shortest list, which would later build
	// an attribute name ending in "_null": refuse the mismatch up front instead
	if (corpusStructPropNames.size() != corpusStructs.size()) {
		println "** There must be as many structure properties (${corpusStructPropNames}) as structures (${corpusStructs}), aborting."
		return
	}
	corpusStructProps = [corpusStructs, corpusStructPropNames].transpose().collectEntries()
} else {
	propParam = false
}

// Keep only the plain structures (names containing '_' are dropped),
// leave out "txmcorpus" and restrict the list to the requested structures
def struct_names = (corpusEngine.corpusStructuralAttributes(corpusName) as List)
struct_names.removeAll { it.contains('_') }
struct_names=(struct_names-"txmcorpus").grep(corpusStructs)
//println "struct_names = "+struct_names

if (struct_names.size() == 0) {
	println "** Impossible to find the structures (${corpusStructs}), aborting."
	return
}

def level = [:]
155

  
156
/*
157
class Node {
158
 
159
    String  	name
160
    Integer 	start
161
    Integer 	end
162
    Node 	parent
163
    List<Node> 	children
164

  
165
    Node(String n, Integer s, Integer e) {
166
        name  = n
167
        start = s
168
        end   = e
169
	children = new LinkedList<Node>()
170
    }
171

  
172
    public int compareTo(Node n) {
173

  
174
/* Possible combinations
175
   this = []
176
      n = {}
177
   an interval is not empty
178

  
179
[  {  }  ] -> this is parent 1
180
[  {  }]   -> this is parent 1
181
[  {  ]  } -> *overlap 0
182
[  ]{  }   -> this is left sibling 2
183
[  ]  {  } -> this is left sibling 2
184
[{  }  ]   -> this is parent 1
185
[{  }]     -> *duplicate 0
186
[{  ]  }   -> this is child -1
187

  
188
{  [  ]  } -> n is parent -1
189
{  [  ]}   -> n is parent -1
190
{  [  }  ] -> *overlap 0
191
{  }[  ]   -> n is left sibling -2
192
{  }  [  ] -> n is left sibling -2
193
{[  ]  }   -> n is parent -1
194
{[  ]}     -> *duplicate 0
195
{[  }  ]   -> n is child 1
196

  
197
//
198

  
199
if (start < n.start) {
200
// [  {  }  ] -> this is parent 1
201
// [  {  }]   -> this is parent 1
202
// [  {  ]  } -> *overlap 0
203
// [  ]{  }   -> this is left sibling 2
204
// [  ]  {  } -> this is left sibling 2
205
	if (end > n.end) {
206
// [  {  }  ] -> this is parent 1
207
		return 1
208
	} else if (end == n.end) {
209
// [  {  }]   -> this is parent 1
210
			return 1
211
	} else if (end < n.end && end > n.start) {
212
// [  {  ]  } -> *overlap 0
213
			println "** Error: overlapping intervals, [  {  ]  } should not happen, "+this.toString()+", "+n.toString()
214
			return 0
215
	} else if (end == n.start) {
216
// [  ]{  }   -> this is left sibling 2
217
			return 2
218
	} else if (end < n.start) {
219
// [  ]  {  } -> this is left sibling 2
220
			return 2
221
	} else {
222
// should not happen
223
		println "** Error: should not happen, "+this.toString()+", "+n.toString()
224
		return -10
225
	}
226
} else if (start > n.start) {
227
// {  [  ]  } -> n is parent -1
228
// {  [  ]}   -> n is parent -1
229
// {  [  }  ] -> *overlap 0
230
// {  }[  ]   -> n is left sibling -2
231
// {  }  [  ] -> n is left sibling -2
232
	if (end < n.end) {
233
// {  [  ]  } -> n is parent -1
234
		return -1
235
	} else if (end == n.end) {
236
// {  [  ]}   -> n is parent -1
237
		return -1
238
	} else if (end > n.end && n.end > start) {
239
// {  [  }  ] -> *overlap 0
240
			println "** Error: overlapping intervals, {  [  }  ] should not happen, "+this.toString()+", "+n.toString()
241
			return 0
242
	} else if (n.end == start) {
243
// {  }[  ]   -> n is left sibling -2
244
		return -2
245
	} else if (n.end < start)
246
// {  }  [  ] -> n is left sibling -2
247
			return -2
248
	} else {
249
// should not happen
250
		println "** Error: should not happen, "+this.toString()+", "+n.toString()
251
		return -10
252
	}	
253
} else {
254
// [{  }  ]   -> this is parent 1
255
// [{  }]     -> *duplicate 0
256
// [{  ]  }   -> this is child -1
257
// {[  ]  }   -> n is parent -1
258
// {[  ]}     -> *duplicate 0
259
// {[  }  ]   -> n is child 1
260
	if (end > n.end) {
261
// [{  }  ]   -> this is parent 1
262
// {[  }  ]   -> n is child 1
263
		return 1
264
	} else if (end < n.end) {
265
// [{  ]  }   -> this is child -1
266
// {[  ]  }   -> n is parent -1
267
		return -1
268
	} else if (end == n.end) {
269
// [{  }]     -> *duplicate 0
270
// {[  ]}     -> *duplicate 0
271
			println "** Error: duplicate intervals, [{  }] should not happen, "+this.toString()+", "+n.toString()
272
			return 0
273
	} else {
274
// should not happen
275
		println "** Error: should not happen, "+this.toString()+", "+n.toString()
276
		return -10
277
	}
278
 }
279
}
280

  
281
	public Node add(Node n) {
282

  
283
	switch (this.compareTo(n)) {
284

  
285
	case 1:
286
		
287

  
288
        childNode = new Node(n, s, e)
289
        childNode.parent = this
290
        this.children.add(childNode)
291
        return childNode
292
    }
293

  
294
    public toString(Node n) {
295
		sprintf("%s[%d, %d]", n.name, n.start, n.end)
296
    }
297
    
298
    public print(Node n) {
299
		print(n.toString())
300
    }
301
}
302

  
303
*/
304

  
305
// First define the order theory over corpus structures intervals
306
// by defining a binary comparator that will be used to build the
307
// TreeSet of intervals
308

  
309
/**
 * One occurrence of a corpus structure, described by its name and by the
 * positions of its first and last word.
 *
 * The natural order is the depth-first order of the structure hierarchy:
 * <ul>
 *   <li>the interval starting first comes first;</li>
 *   <li>for the same start, the longer interval (the container) comes first;</li>
 *   <li>for identical intervals, names are compared lexicographically.</li>
 * </ul>
 * The order is total and antisymmetric, as required by TreeSet.
 */
class Struct implements Comparable<Struct> {

    String  name
    Integer start
    Integer end

    /**
     * @param n structure name
     * @param s position of the first word of the structure
     * @param e position of the last word of the structure
     */
    Struct(String n, Integer s, Integer e) {
        name  = n
        start = s
        end   = e
    }

    /**
     * Compares two structures according to the depth-first order described on the class.
     *
     * @param s the structure to compare to
     * @return a negative value when this structure comes first, a positive value
     *         when s comes first, 0 for the same interval and the same name
     */
    public int compareTo(Struct s) {
        // Ascending start, then DESCENDING end (a container precedes its content),
        // then the name. The previous version answered -1 for both "ends later"
        // and "ends earlier" when the starts were equal, so a.compareTo(b) and
        // b.compareTo(a) could both be negative.
        return (start <=> s.start) ?: (s.end <=> end) ?: name.compareTo(s.name)
    }

    /** @return the structure formatted as name[start, end] */
    @Override
    String toString() {
        return String.format("%s[%d, %d]", name, start, end)
    }

    /**
     * Kept for backward compatibility.
     *
     * @param s the structure to format
     * @return s formatted as name[start, end]
     */
    public toString(Struct s) {
        return s.toString()
    }

    /**
     * Prints a structure as name[start, end], without a line break.
     *
     * @param s the structure to print
     */
    public print(Struct s) {
        print(s.toString())
    }
}
393

  
394
// Now build the TreeSet of corpus structures intervals
395

  
396
// Sorted set of every occurrence of the selected structures.
// Iterating over it follows the order defined by Struct.compareTo.
def h = new TreeSet<Struct>()

struct_names.each { structName ->
	def attribute = "${corpusName}.${structName}"
	def structCount = corpusEngine.attributeSize(attribute)
	// Half-open range: "0..structCount-1" is the DESCENDING range (0, -1) when
	// structCount is 0, which would request the structures number 0 and -1
	for (i in 0..<structCount) {
		// struc2Cpos gives the first and last word positions of structure number i
		def bounds = corpusEngine.struc2Cpos(attribute, i)
		//println sprintf("Adding %s[%d, %d]", structName, bounds[0], bounds[1])
		h.add(new Struct(structName, bounds[0], bounds[1]))
	}
}
405

  
406
// function to print the hierarchical index of a query
407
def print_index = { c, q, p, cut ->
	// Runs query q on corpus c and prints the frequency index of the values of
	// word property p over the matches, by decreasing frequency.
	//   c   : corpus name
	//   q   : CQL query
	//   p   : word property name
	//   cut : number of entries to print; 0 or less prints the whole index
	// The temporary sub-corpus "RES1" is always dropped, even when a request fails.

	def subcorpus = "${c}:RES1"
	corpusEngine.cqpQuery(c, "RES1", q)
	try {
		def matchCount = corpusEngine.subCorpusSize(subcorpus)
		def matches_target_p = []
		// Without any match there is nothing to dump (the range would be 0..-1)
		if (matchCount > 0) {
			def positions = corpusEngine.dumpSubCorpus(subcorpus, CQI_CONST_FIELD_MATCH, 0, matchCount - 1)
			matches_target_p = corpusEngine.cpos2Str("${c}.${p}", positions)
		}
		def index = matches_target_p.countBy { it }.sort { -it.value }
		println(cut > 0 ? index.take(cut) : index)
	} finally {
		corpusEngine.dropSubCorpus(subcorpus)
	}
}
418

  
419
// function to print the statistics of an index of a query
420
def print_freq = { c, q, p ->
	// Runs query q on corpus c and prints, on the current line and separated by
	// tabulations: the number of matches (t), the number of different values of
	// word property p (v), and the lowest and highest frequencies of these values
	// (fmin, fmax). When displayIndex is set, the Vmax most frequent values follow.
	//   c : corpus name
	//   q : CQL query
	//   p : word property name
	// A query without any match prints 0 for the four figures.
	// The temporary sub-corpus "RES1" is always dropped, even when a request fails.

	def subcorpus = "${c}:RES1"
	def attribute = "${c}.${p}"

	// call the engine
	corpusEngine.cqpQuery(c, "RES1", q)
	try {
		def tC = corpusEngine.subCorpusSize(subcorpus)

		// codes of the property values of the matches
		def matches_target_p = []
		// Without any match there is nothing to dump (the range would be 0..-1)
		if (tC > 0) {
			def positions = corpusEngine.dumpSubCorpus(subcorpus, CQI_CONST_FIELD_MATCH, 0, tC - 1)
			matches_target_p = corpusEngine.cpos2Id(attribute, positions)
		}

		// frequency of each value code, then the frequencies alone
		def index = matches_target_p.collect { it }.countBy { it }
		def freqs = index.values()

		def v = freqs.size()
		// min() and max() answer null on an empty collection
		def fmin = freqs.min() ?: 0
		def fmax = freqs.max() ?: 0
		print sprintf("%d\t%d\t%d\t%d", tC, v, fmin, fmax)

		// display the most frequent values of the property
		if (displayIndex) {
			def ranked = index.sort { -it.value }
			// Vmax is optional: without it the whole index is displayed
			def heads = (Vmax != null ? ranked.take(Vmax) : ranked).keySet()
			println "\t"+heads.collect { corpusEngine.id2Str(attribute, it)[0] }
		} else {
			println ""
		}
	} finally {
		corpusEngine.dropSubCorpus(subcorpus)
	}
}
460

  
461
// Header line of the table: the columns identifying a structure depend on
// whether structure properties were requested, and the "index" column is
// only announced when the index is displayed.
def headerColumns = propParam ? "struct\tprop\tt\tv\tfmin\tfmax" : "struct\tstart\tend\tt\tv\tfmin\tfmax"
print headerColumns
println(displayIndex ? "\tindex" : "")
472

  
473
// Output directory: <home>/Documents/d3test.
// HOME is not defined on every platform (e.g. Windows), where the previous code
// built the path "null/Documents/d3test": fall back to the JVM "user.home" property.
def homeDir = System.getenv("HOME") ?: System.getProperty("user.home")
def localPath = homeDir+"/Documents/d3test"
new File(localPath).mkdirs()

// (Re)create the output file with the beginning of the HTML page.
// setText() replaces any previous content, and the encoding is the one
// declared by the page itself.
def resultFile = new File(localPath, "desc-partition.html")
resultFile.setText('''\
<!DOCTYPE html>
<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html;charset=utf-8" charset="UTF-8"/>
    <link type="text/css" rel="stylesheet" href="style.css"/>
    <script type="text/javascript" src="d3/d3.v3.js" charset="utf-8"></script>
    <script type="text/javascript" src="d3/layout/partition.js" charset="utf-8"></script>
    <style type="text/css">

.chart {
  display: block;
  margin: auto;
  margin-top: 60px;
  font-size: 11px;
}

rect {
  stroke: #eee;
  fill: #aaa;
  fill-opacity: .8;
}

rect.parent {
  cursor: pointer;
  fill: steelblue;
}

text {
  pointer-events: none;
}

    </style>
  </head>
  <body>
    <div id="body">
      <div id="footer">
        Structures hierarchy
        <div class="hint">click or shift-alt-click to zoom-in or out</div>
      </div>
    </div>
    <script type="text/javascript">

var w = 1120,
    h = 600,
    x = d3.scale.linear().range([0, w]),
    y = d3.scale.linear().range([0, h]);

var vis = d3.select("#body").append("div")
    .attr("class", "chart")
    .style("width", w + "px")
    .style("height", h + "px")
  .append("svg:svg")
    .attr("width", w)
    .attr("height", h);

var partition = d3.layout.partition()
    .value(function(d) { return d.size; }).sort(null);

var tree = `{''', "UTF-8")

// Now iterate on the TreeSet to get a depth first search on the structure intervals

// Name followed by digits: the first group captures the name without its digits
def rec_struct_regex = /([^0-9]+)[0-9]+/
547

  
548
/*
549
 "name": "sha-hamlet",
550
 "children": [
551
  {
552
   "name": "sha-hamcast",
553
   "children": [
554
    {
555
     "name": "sha-ham1",
556
     "children": [
557
      {"name": "sha-ham102", "size": 855},
558
      {"name": "sha-ham103", "size": 464},
559
      {"name": "sha-ham104", "size": 296},
560
      {"name": "sha-ham105", "size": 635}
561
     ]
562
    }
563
   ]
564
  }
565
 ]
566
}`;
567
*/
568

  
569
// Debugging helper: prints the structures contained in the interval of head,
// in the order of the TreeSet, as "name[start, end], " items on one line.
// A structure without any content is printed alone, as "name[start, end]".
// Nothing is printed when head is null.
//
// The previous version could not run: it called itself while its own variable
// was not yet in scope, handed a whole SortedSet to tailSet() where a Struct is
// expected, and used the inclusive tailSet(), which always contains head and
// therefore never reached its stop condition.
def displayTree = { head ->
	if (head) {
		// Structures located after head in the set and nested inside its interval
		def subtree = h.tailSet(head, false).findAll { it.start >= head.start && it.end <= head.end }
		if (subtree.size() == 0) {
			println sprintf("%s[%d, %d]", head.name, head.start, head.end)
		} else {
			subtree.each { print sprintf("%s[%d, %d], ", it.name, it.start, it.end) }
			println ""
		}
	}
}
581

  
582
//displayTree(h.first())
583

  
584

  
585
// One result line per structure, in the order of the TreeSet:
// the columns identifying the structure, then the figures printed by print_freq.
h.each { struct ->
	//println sprintf("Displaying %s[%d, %d]", struct.name, struct.start, struct.end)
	if (propParam) {
		// A name matching rec_struct_regex exactly once is reduced to its first
		// captured group before building the attribute name
		def recMatch = (struct.name =~ rec_struct_regex)
		istruct_name = (recMatch.size() == 1) ? recMatch[0][1] : struct.name

		// Attribute holding the requested property: CORPUS.structure_property
		def attribute = "${corpusName}.${istruct_name}_${corpusStructProps[struct.name]}"
		def structIds = corpusEngine.cpos2Struc(attribute, [struct.start] as int[])
		def propValue = corpusEngine.struc2Str(attribute, structIds)[0]
		print sprintf("%s\t%s\t", struct.name, propValue)
	} else {
		print sprintf("%s\t%d\t%d\t", struct.name, struct.start, struct.end)
	}

	// Restrict the word query to the positions of the structure
	def structQuery = sprintf("a:%s :: a>=%d & a<=%d", query, struct.start, struct.end)
	print_freq(corpusName, structQuery, wordProperty)
}

  

Formats disponibles : Unified diff