/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 2087

     package org.txm.macroprototypes.urs.cqp
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.annotation.urs.*
     import org.txm.importer.ValidateXml
     import org.txm.rcp.swt.widget.parameters.*
     import org.txm.searchengine.cqp.corpus.*
     import visuAnalec.elements.*
     // BEGINNING OF PARAMETERS
     // Open the parameters input dialog box
     //if (!ParametersDialog.open(this)) return;
     // This macro rewrites the XML-TXM files of a main corpus: refuse any other selection
     if (!(corpusViewSelection instanceof MainCorpus)) {
     	println "Selection must be a Corpus"
     	return
     }
     MainCorpus corpus = corpusViewSelection
     def analecCorpus = URSCorpora.getCorpus(corpus)
     def texts = corpus.getCorpusTextIdsList();
     def texts_startlimits = corpus.getTextStartLimits()
     def texts_endlimits = corpus.getTextEndLimits()
     // Parks a rejected copy under <project>/error/<source file name> so that it can be inspected
     def moveToErrorDirectory = { File copy, File source ->
     	File error = new File(corpus.getProject().getProjectDirectory(), "error/"+source.getName())
     	error.getParentFile().mkdirs()
     	println "	moving created file to $error"
     	error.delete()
     	copy.renameTo(error)
     }
     // For each text, insert the properties of the not yet written "word" units in its XML-TXM file
     for (int i = 0 ; i < texts.size() ; i++) {
     	println "Processing annotations of "+texts[i]+"..."
     	def text_id = texts[i]
     	def text_start = texts_startlimits[i]
     	def text_end = texts_endlimits[i]
     	File xmltxmFile = new File(corpus.getProject().getProjectDirectory(), "txm/"+corpus.getID()+"/"+text_id+".xml")
     	File xmltxmFileCopy = new File(corpus.getProject().getProjectDirectory(), text_id+"_copy.xml")
     	if (!xmltxmFile.exists()) {
     		println "Warning: no text file found: "+xmltxmFile
     		continue
     	}
     	for (String unit_type : analecCorpus.getStructure().getTypes(Unite.class)) {
     		ArrayList<Unite> all_units = analecCorpus.getUnites(unit_type)
     		// keep the units located in this text, typed "word" and not flagged written="true" yet
     		def corpus_units = all_units.findAll() {
     			"word".equals(it.getProp("type")) && text_start <= it.getDeb() && it.getFin() < text_end && !("true".equals(it.getProp("written")))
     		}
     		if (corpus_units.size() == 0) continue;
     		try {
     			println "processing word Units ${text_id} and its units "+corpus_units.size()
     			WordUnitsInserter inserter = new WordUnitsInserter(corpus, xmltxmFile, corpus_units);
     			if (inserter.process(xmltxmFileCopy) && ValidateXml.test(xmltxmFileCopy)) {
     				// Files.move throws when the replacement fails: the units are then NOT flagged
     				// as written and the catch block below parks the copy in the error directory
     				java.nio.file.Files.move(xmltxmFileCopy.toPath(), xmltxmFile.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)
     				for (Unite unit : corpus_units) {
     					unit.getProps()["written"] = "true"
     				}
     				URSCorpora.saveCorpus(corpus);
     				println "Done, "+corpus_units.size()+ " units written"
     			} else {
     				println "Error while processing word units $xmltxmFile file"
     				moveToErrorDirectory(xmltxmFileCopy, xmltxmFile)
     			}
     		} catch(Exception e) {
     			println "Error while processing word units $xmltxmFile file: "+e
     			e.printStackTrace();
     			moveToErrorDirectory(xmltxmFileCopy, xmltxmFile)
     		}
     	}
     }

     // STANDARD DECLARATIONS
     package org.txm.macroprototypes.urs.cqp
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.annotation.urs.*
     import org.txm.importer.ValidateXml
     import org.txm.rcp.swt.widget.parameters.*
     import org.txm.searchengine.cqp.corpus.*
     import visuAnalec.elements.*
     // BEGINNING OF PARAMETERS
     // Open the parameters input dialog box
     //if (!ParametersDialog.open(this)) return;
     // This macro rewrites the XML-TXM files of a main corpus: refuse any other selection
     if (!(corpusViewSelection instanceof MainCorpus)) {
     	println "Selection must be a Corpus"
     	return
     }
     MainCorpus corpus = corpusViewSelection
     def analecCorpus = URSCorpora.getCorpus(corpus)
     def texts = corpus.getCorpusTextIdsList();
     def texts_startlimits = corpus.getTextStartLimits()
     def texts_endlimits = corpus.getTextEndLimits()
     // Parks a rejected copy under <project>/error/<source file name> so that it can be inspected
     def moveToErrorDirectory = { File copy, File source ->
     	File error = new File(corpus.getProject().getProjectDirectory(), "error/"+source.getName())
     	error.getParentFile().mkdirs()
     	println "	moving created file to $error"
     	error.delete()
     	copy.renameTo(error)
     }
     // Two passes per text: first the milestone units (empty elements), then the other units
     for (int i = 0 ; i < texts.size() ; i++) {
     	println "Processing annotations of "+texts[i]+"..."
     	def text_id = texts[i]
     	def text_start = texts_startlimits[i]
     	def text_end = texts_endlimits[i]
     	File xmltxmFile = new File(corpus.getProject().getProjectDirectory(), "txm/"+corpus.getID()+"/"+text_id+".xml")
     	File xmltxmFileCopy = new File(corpus.getProject().getProjectDirectory(), text_id+"_copy.xml")
     	if (!xmltxmFile.exists()) {
     		println "Warning: no text file found: "+xmltxmFile
     		continue
     	}
     	// WRITE MILESTONES UNITS
     	println "-> MILESTONES UNITS"
     	def units = []
     	for (String unit_type : analecCorpus.getStructure().getTypes(Unite.class)) {
     		ArrayList<Unite> all_units = analecCorpus.getUnites(unit_type)
     		// typed units of this text flagged milestone="yes" and not written yet
     		def corpus_units = all_units.findAll() {
     			it.getProp("type") != null && it.getProp("type").trim().length() > 0 && "yes".equals(it.getProp("milestone")) && text_start <= it.getDeb() && it.getFin() < text_end && !("true".equals(it.getProp("written")))
     		}
     		units.addAll(corpus_units)
     	}
     	if (units.size()== 0) {
     		println "No milestones to write"
     	} else {
     		try {
     			println "processing milestones Units ${text_id} and its units "+units.size()
     			MileStoneInserter inserter = new MileStoneInserter(corpus, xmltxmFile, units);
     			if (inserter.process(xmltxmFileCopy) && ValidateXml.test(xmltxmFileCopy)) {
     				// Files.move throws when the replacement fails: the units are then NOT flagged
     				// as written and the catch block below parks the copy in the error directory
     				java.nio.file.Files.move(xmltxmFileCopy.toPath(), xmltxmFile.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)
     				for (Unite unit : units) {
     					unit.getProps()["written"] = "true"
     				}
     				URSCorpora.saveCorpus(corpus);
     				println "Done, "+units.size()+ " milestones written"
     			} else {
     				println "Error while processing milestones $xmltxmFile file"
     				moveToErrorDirectory(xmltxmFileCopy, xmltxmFile)
     			}
     		} catch(Exception e) {
     			println "Error while processing milestones $xmltxmFile file: "+e
     			moveToErrorDirectory(xmltxmFileCopy, xmltxmFile)
     		}
     	}
     	// WRITE NON MILESTONES UNITS
     	println "-> OTHER UNITS"
     	for (String unit_type : analecCorpus.getStructure().getTypes(Unite.class)) {
     		ArrayList<Unite> all_units = analecCorpus.getUnites(unit_type)
     		// Exact complement of the milestone filter above: every typed, unwritten unit of this
     		// text that is NOT flagged milestone="yes" (this includes milestone="no" and unset)
     		def corpus_units = all_units.findAll() {
     			it.getProp("type") != null && it.getProp("type").trim().length() > 0 && !"yes".equals(it.getProp("milestone")) && text_start <= it.getDeb() && it.getFin() < text_end && !("true".equals(it.getProp("written")))
     		}
     		if (corpus_units.size() == 0) continue;
     		println "processing Units ${text_id} and its $unit_type units "+corpus_units.size()
     		UnitsInserter inserter2 = new UnitsInserter(corpus, xmltxmFile, corpus_units, unit_type);
     		try {
     			if (inserter2.process(xmltxmFileCopy) && ValidateXml.test(xmltxmFileCopy)) {
     				// same replacement policy as for the milestones pass
     				java.nio.file.Files.move(xmltxmFileCopy.toPath(), xmltxmFile.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)
     				for (Unite unit : corpus_units) {
     					unit.getProps()["written"] = "true"
     				}
     				URSCorpora.saveCorpus(corpus);
     				println "Done, "+corpus_units.size()+ " units written"
     			} else {
     				println "Error while processing units $xmltxmFile file"
     				moveToErrorDirectory(xmltxmFileCopy, xmltxmFile)
     			}
     		} catch(Exception e) {
     			println "Error while processing units $xmltxmFile file: "+e
     			moveToErrorDirectory(xmltxmFileCopy, xmltxmFile)
     		}
     	}
     }

     package org.txm.macroprototypes.urs.cqp
     import java.io.IOException
     import java.util.Date
     import java.util.LinkedHashMap
     import javax.xml.stream.XMLStreamException
     import org.txm.Toolbox
     import org.txm.importer.StaxIdentityParser
     import org.txm.macro.urs.AnalecUtils
     import org.txm.scripts.importer.GetAttributeValue
     import org.txm.searchengine.cqp.CQPSearchEngine
     import org.txm.searchengine.cqp.corpus.MainCorpus
     import visuAnalec.elements.Unite
     /**
      * Identity parser that copies an XML-TXM file while adding the properties of URS units
      * to the words they cover.
      *
      * For each w element whose id is covered by at least one unit, every unit property except
      * "type", "written" and "milestone" becomes a txm:ana child (type "#" + property name).
      * When the word already holds an ana of the same type, both values are joined with a space
      * and the resp attribute is set to "#txm".
      */
     public class WordUnitsInserter extends StaxIdentityParser {
     	List<Unite> units
     	String[] ids
     	File inputFile
     	/** word id -> units covering this word */
     	def id2Units = [:]
     	/** annotation type -> value still to be written for the current word */
     	LinkedHashMap<String, String> anaValues = new LinkedHashMap<String, String>();
     	/**
     	 * @param corpus corpus whose "id" word property is used to resolve unit positions
     	 * @param inputFile the XML-TXM file to read
     	 * @param units the units to insert; the list is sorted in place by start then end position
     	 */
     	public WordUnitsInserter(MainCorpus corpus, File inputFile, List<Unite> units) {
     		super(inputFile)
     		this.inputFile = inputFile
     		this.units = units
     		this.units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
     		for (int i = 0 ; i < units.size() ; i++) {
     			Unite u = units[i]
     			// every corpus position covered by the unit, from getDeb() to getFin() included
     			int[] positions = u.getDeb()..u.getFin()
     			ids = CQPSearchEngine.getCqiClient().cpos2Str(corpus.getProperty("id").getQualifiedName(), positions)
     			for (String id : ids) {
     				// skip unresolved positions: a null key would otherwise match every <w> without id
     				if (id == null) continue;
     				if (!id2Units.containsKey(id)) {
     					id2Units[id] = []
     				}
     				id2Units[id] << u
     			}
     		}
     	}
     	boolean inAna = false
     	String ana_type, ana_resp, ana_value
     	boolean inW = false
     	String word_id = null
     	/**
     	 * On w: collects the property values of the units covering the word, then writes the tag.
     	 * On ana inside a w: remembers its type (first character dropped) and resp; the element
     	 * itself is written later, by processEndElement.
     	 */
     	@Override
     	protected void processStartElement() throws XMLStreamException, IOException {
     		if ("w".equals(localname)) {
     			inW = true
     			word_id = this.getParserAttributeValue("id")
     			if (id2Units.containsKey(word_id)) {
     				for (Unite u : id2Units[word_id]) {
     					def props = u.getProps();
     					for (String p : props.keySet()) {
     						// bookkeeping properties are not word annotations
     						if ("type".equals(p)) continue;
     						if ("written".equals(p)) continue;
     						if ("milestone".equals(p)) continue;
     						if (!anaValues.containsKey(p)) {
     							anaValues[p] = ""
     							ana_resp = "#txm"
     						}
     						// several units may set the same property: join the values with a space
     						anaValues[p] = (anaValues[p]+" "+props.get(p)).trim()
     					}
     				}
     				if (anaValues.size() > 0) println anaValues
     			}
     			super.processStartElement(); // write the tag
     		} else if ("ana".equals(localname) && inW) {
     			inAna = true
     			ana_type = this.getParserAttributeValue("type").substring(1)
     			ana_resp = this.getParserAttributeValue("resp")
     			ana_value = ""
     		} else {
     			super.processStartElement()
     		}
     	}
     	/** Buffers the text of an ana element instead of copying it; other text is copied as is. */
     	@Override
     	public void processCharacters() throws XMLStreamException {
     		if (inAna) {
     			ana_value += parser.getText().trim()
     		} else {
     			super.processCharacters()
     		}
     	}
     	/**
     	 * On ana inside a w: writes the txm:ana element, merged with the pending unit value of
     	 * the same type if any. On w: writes the pending values left as new txm:ana elements
     	 * (resp "#txm") before closing the word.
     	 */
     	@Override
     	protected void processEndElement() throws XMLStreamException {
     		if ("w".equals(localname)) {
     			// write the last values
     			for (String ana_type : anaValues.keySet()) {
     				writer.writeStartElement("txm:ana")
     				writer.writeAttribute("type", "#" + ana_type)
     				writer.writeAttribute("resp", "#txm") // change
     				writer.writeCharacters(anaValues[ana_type])
     				writer.writeEndElement()
     			}
     			anaValues.clear()
     			super.processEndElement() // finally write word then close annotations
     			inW = false
     		} else if ("ana".equals(localname) && inW) {
     			if (!anaValues.containsKey(ana_type)) {
     				anaValues[ana_type] = ana_value.trim()
     			} else {
     				ana_resp = "#txm" // set the resp to txm since anaValues update the ana value
     				anaValues[ana_type] = (anaValues[ana_type]+" "+ana_value.trim()).trim()
     			}
     			String value = anaValues[ana_type]
     			writer.writeStartElement("txm:ana")
     			writer.writeAttribute("type", "#" + ana_type)
     			writer.writeAttribute("resp", ana_resp) // change
     			writer.writeCharacters(value)
     			writer.writeEndElement()
     			// this type is done for the current word: do not write it again at the end of w
     			anaValues.remove(ana_type)
     			inAna = false
     			ana_type = null
     			ana_resp = null
     			ana_value = null
     		} else {
     			super.processEndElement()
     		}
     	}
     }

     package org.txm.macroprototypes.urs.cqp
     import java.io.IOException
     import javax.xml.stream.XMLStreamException
     import org.txm.importer.StaxIdentityParser
     import org.txm.macro.urs.AnalecUtils
     import org.txm.searchengine.cqp.CQPSearchEngine
     import org.txm.searchengine.cqp.corpus.MainCorpus
     import visuAnalec.elements.Unite
     /**
      * Identity parser that copies an XML-TXM file while inserting URS units as milestones,
      * i.e. empty elements written right before or right after the word at the unit start
      * position, depending on the unit "position" property ("before" or "after").
      *
      * The element name is the unit "type" property; all other properties become attributes.
      * Words are only considered once a text element has been opened.
      */
     public class MileStoneInserter extends StaxIdentityParser {
     	List<Unite> units
     	String[] ids
     	File inputFile
     	/** word id -> units starting at this word */
     	def id2Units = [:]
     	/**
     	 * @param corpus corpus whose "id" word property is used to resolve unit start positions
     	 * @param inputFile the XML-TXM file to read
     	 * @param units the units to insert; the list is sorted in place by start then end position
     	 */
     	public MileStoneInserter(MainCorpus corpus, File inputFile, List<Unite> units) {
     		super(inputFile);
     		this.inputFile = inputFile;
     		this.units = units;
     		this.units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
     		int[] positions = new int[units.size()];
     		for( int i = 0 ; i < units.size() ; i++) {
     			positions[i] = units.get(i).getDeb();
     		}
     		// one word id per unit, in the same order as the sorted units
     		ids = CQPSearchEngine.getCqiClient().cpos2Str(corpus.getProperty("id").getQualifiedName(), positions);
     		for (int i = 0 ; i < ids.length ; i++) {
     			String id = ids[i]
     			if (id != null) {
     				if (!id2Units.containsKey(id)) id2Units[id] = []
     				id2Units[id] << units[i]
     			}
     		}
     	}
     	boolean start = false;
     	String word_id = null;
     	/** Writes the "before" milestones of a word, then copies the start tag. */
     	@Override
     	protected void processStartElement() throws XMLStreamException, IOException {
     		if ("text".equals(localname)) {
     			start = true;
     		} else if ("w".equals(localname) && start) {
     			word_id = getParserAttributeValue("id");
     			if (word_id == null) {
     				println "Warning: found <w> without id at line "+parser.getLocation().getLineNumber()+" in "+inputFile
     			} else {
     				writeAllUnits(word_id, "before")
     			}
     		}
     		super.processStartElement();
     	}
     	/**
     	 * Writes, as empty elements, the units attached to the given word whose "position"
     	 * property equals the requested position.
     	 *
     	 * @param id the word id
     	 * @param position "before" or "after"
     	 */
     	protected void writeAllUnits(String id, String position) {
     		def units = id2Units[id]
     		if (units == null) return; // no units to write
     		for (Unite currentUnit : units) {
     			// skip only this unit: the next ones of the same word may match the position
     			if (!position.equals(currentUnit.getProp("position"))) continue;
     			writer.writeStartElement(currentUnit.getProp("type"));
     			HashMap props = currentUnit.getProps();
     			for (String p : props.keySet()) {
     				if (p.equals("type")) continue; // ignore the type since written in tag name
     				writer.writeAttribute(p, ""+props.get(p));
     			}
     			writer.writeEndElement();
     		}
     	}
     	/** Copies the end tag, then writes the "after" milestones of the word just closed. */
     	@Override
     	protected void processEndElement() throws XMLStreamException {
     		super.processEndElement();
     		if ("w".equals(localname) && start && word_id != null) {
     			writeAllUnits(word_id, "after")
     		}
     		if ("w".equals(localname)) {
     			word_id = null;
     		}
     	}
     }

     package org.txm.macroprototypes.urs.cqp
     import java.io.IOException
     import javax.xml.stream.XMLStreamException
     import org.txm.importer.StaxIdentityParser
     import org.txm.macro.urs.AnalecUtils
     import org.txm.searchengine.cqp.CQPSearchEngine
     import org.txm.searchengine.cqp.corpus.MainCorpus
     import visuAnalec.elements.Unite
     /**
      * Identity parser that copies an XML-TXM file while writing URS units as elements:
      * a start tag named after the unit "type" property is written before the word found
      * at the unit start position, and an end tag is written after the word found at the
      * unit end position.
      *
      * NOTE(review): the constructor parameter "type" and the field "writing_end" are not
      * used anywhere in this class.
      */
     public class UnitsInserter extends StaxIdentityParser {
     	List<Unite> units
     	String[] ids
     	File inputFile
     	// word id -> units to open before this word (list, in sorted unit order)
     	def open_id2Units = [:]
     	// word id -> units to close after this word (HashSet)
     	def close_id2Units = [:]
     	// units currently opened in the output, and a parallel list used for stack matching
     	def writing_units = []
     	def writing_stacks = []
     	// "/"-separated path of the elements currently opened in the input
     	def stack = "";
     	// NOTE(review): never assigned in this class, both stay null
     	def writing_start, writing_end;
     	def positions2id = [:] // used to relocate end of units
     	/**
     	 * Resolves the word ids of the start and end positions of the units.
     	 *
     	 * @param corpus corpus whose "id" word property is used to resolve positions
     	 * @param inputFile the XML-TXM file to read
     	 * @param units the units to write; sorted in place by start position, then by
     	 *        decreasing end position
     	 * @param type not used
     	 */
     	public UnitsInserter(MainCorpus corpus, File inputFile, List<Unite> units, String type) {
     		super(inputFile);
     		this.inputFile = inputFile;
     		this.units = units;
     		this.units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: b.getFin() <=> a.getFin() }
     		// get words id limits
     		int[] positions = new int[units.size()];
     		for( int i = 0 ; i < units.size() ; i++) {
     			positions[i] = units.get(i).getDeb();
+    		}
     		ids = CQPSearchEngine.getCqiClient().cpos2Str(corpus.getProperty("id").getQualifiedName(), positions);
     		for (int i = 0 ; i < ids.length ; i++) {
     			String id = ids[i]
     			if (id != null) {
     				positions2id[positions[i]] = id
     				if (!open_id2Units.containsKey(id)) open_id2Units[id] = []
     				open_id2Units[id] << units[i]
+    			}
+    		}
     		// same resolution for the end positions
     		positions = new int[units.size()];
     		for( int i = 0 ; i < units.size() ; i++) {
     			positions[i] = units.get(i).getFin();
+    		}
     		ids = CQPSearchEngine.getCqiClient().cpos2Str(corpus.getProperty("id").getQualifiedName(), positions);
     		for (int i = 0 ; i < ids.length ; i++) {
     			String id = ids[i]
     			if (id != null) {
     				positions2id[positions[i]] = id
     				if (!close_id2Units.containsKey(id)) close_id2Units[id] = new HashSet<Unite>()
     				close_id2Units[id] << units[i]
+    			}
+    		}
+    	}
     	// becomes true once a text element has been opened; words before it are ignored
     	boolean start = false;
     	String word_id = null;
     	/** Tracks the element path and opens the units starting at a word before copying its start tag. */
     	@Override
     	protected void processStartElement() throws XMLStreamException, IOException {
     		stack += "/"+localname
     		if ("text".equals(localname)) {
     			start = true;
     		} else if ("w".equals(localname) && start) {
     			word_id = getParserAttributeValue("id");
     			if (word_id == null) {
     				println "Warning: found <w> without id at line "+parser.getLocation().getLineNumber()+" in "+inputFile
     			} else {
     				writeOpenUnits()
+    			}
+    		}
     		super.processStartElement();
+    	}
     	/**
     	 * Opens the units starting at the current word. A new unit ending after an already
     	 * opened unit is also registered to be closed at the end word of that opened unit.
     	 */
     	protected void writeOpenUnits() {
     		def toWrite = open_id2Units[word_id]
     		if (toWrite != null) {
     			for (Unite unite : toWrite) {
     				for (int i = 0 ; i < writing_units.size() ; i++) {
     					Unite u = writing_units.get(i);
     					if (unite.getFin() > u.getFin()) {
     						// add unite to close_id2Units
     						// NOTE(review): fails if positions2id has no entry for u.getFin() - confirm it cannot happen
     						String id = positions2id[u.getFin()]
     						close_id2Units[id] << unite // close the unite at the same moment
+    					}
+    				}
     				writeUnit(unite);
+    			}
+    		}
+    	}
     	/**
     	 * Writes one end tag per opened unit to close: the units registered for the current
     	 * word when word_id is set, otherwise the entries of writing_stacks equal to the
     	 * current element path.
     	 */
     	protected void writeCloseUnits() {
     		if (word_id != null) {
     			def toClose = close_id2Units[word_id]
     			if (toClose != null) {
     				for (int i = 0 ; i < writing_units.size() ; i++) {
     					Unite u = writing_units.get(i);
     					if (toClose.contains(u)) {
     						writing_stacks.remove(i)
     						writing_units.remove(i)
     						writer.writeEndElement();
     						// stay on the same index: the lists have shifted after the removal
     						i--;
+    					}
+    				}
+    			}
     		} else {
     			for (int i = 0 ; i < writing_stacks.size() ; i++) {
     				if (writing_stacks[i].equals(stack)) {
     					writing_stacks.remove(i)
     					writing_units.remove(i)
     					writer.writeEndElement();
     					i--
+    				}
+    			}
+    		}
+    	}
     	/**
     	 * Registers the unit as opened and writes its start tag: the tag name is the unit
     	 * "type" property, the other properties are written as attributes.
     	 */
     	protected void writeUnit(Unite currentUnit) {
     		writing_units << currentUnit
     		// NOTE(review): a Unite is pushed here while writing_stacks entries are compared with
     		// the String "stack" elsewhere in this class; "stack" was presumably intended - confirm
     		writing_stacks << currentUnit
     		// NOTE(review): this brace-less "if" only guards the writeStartElement call below, and
     		// writing_start is never assigned - confirm the intended condition
     		if (currentUnit.getDeb() > writing_start)
     		writer.writeStartElement(currentUnit.getProp("type"));
     		HashMap props = currentUnit.getProps();
     		for (String p : props.keySet()) {
     			if (p.equals("type")) continue; // ignore the type since written in tag name
     			writer.writeAttribute(p, ""+props.get(p));
+    		}
+    	}
     	/**
     	 * Closes units before copying the end tag when the last writing_stacks entry equals
     	 * the current path, pops the path, then closes the units ending at the word just closed.
     	 */
     	@Override
     	protected void processEndElement() throws XMLStreamException {
     //		println "writing_stacks=$writing_stacks"
     //		println "stack=$stack"
     		if (writing_stacks.size() > 0 && writing_stacks[-1].equals(stack)) {
     			writeCloseUnits()
+    		}
     		super.processEndElement();
     		// pop "/" + localname from the element path
     		stack = stack.substring(0, stack.length() - localname.length() - 1);
     		if ("w".equals(localname)) {
     			if (start && word_id != null) {
     				writeCloseUnits()
+    			}
     			word_id = null;
+    		}
+    	}
+    }

      org.txm.annotation.urs.view,
      org.txm.annotation.urs.widgets,
      org.txm.macro.urs,
      org.txm.macro.urs.cqp,
      org.txm.macro.urs.democrat,
      org.txm.macro.urs.edit,
      org.txm.macro.urs.exploit,
      org.txm.macro.urs.export,
      org.txm.macroprototypes.urs.cqp,
      org.txm.macroprototypes.urs.misc,
      visuAnalec,
      visuAnalec.chaines,

     package org.txm.macro.commands
     // Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté
     // Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
     // @author sheiden
     /*
      Macro affichant les statistiques de différentes structures d'un corpus
      Paramètres de la macro :
      - corpus : le corpus sélectionné dans la vue Corpus
      - structures : liste des structures à interroger. Séparer les noms par une virgule.
      - structProperties : liste des propriétés de structures. Séparer les noms par une virgule.
      Il doit y avoir autant de propriétés de structures que de structures indiquées dans le paramètre structures.
      Les structures doivent posséder la propriété demandée.
      Ce paramètre peut être laissé vide, dans ce cas la colonne 'prop' n'est pas affichée.
      - query : requête CQL de sélection de mots exprimée obligatoirement en format complet : [...]
      Par exemple :
      - [frpos="N.*"] pour sélectionner les noms communs et les noms propres
      - [] pour sélectionner tous les mots
      - wordProperty : propriété de mot utilisée pour calculer le vocabulaire et les fréquences
      - displayIndex : calculer l'index hiérarchique des valeurs de la propriété wordProperty pour la requête query sur chaque structure
      - Vmax : nombre maximum des mots les plus fréquents à afficher dans l'index
      Résultat :
      Le résultat est un tableau TSV affiché dans la console.
      On peut l'exploiter avec un copier/coller dans Calc.
      Chaque ligne correspond à une structure du corpus.
      Les lignes sont ordonnées par ordre hiérarchique des structures du début à la fin du corpus.
      Les colonnes sont :
      - struct : nom de la structure
      - prop : valeur de la propriété de la structure
      (si le paramètre structProperties est vide, cette colonne est absente du résultat)
      - start : position du premier mot de la structure dans le corpus
      (les positions du corpus sont numérotées à partir de 0).
      Les colonnes start et end sont pratiques quand on n'a pas de propriété de structure à afficher pour se repérer dans le corpus.
      - end : position du dernier mot de la structure
      - T : taille de la structure (end-start)
      - t : nombre de mots sélectionnés dans la structure
      - v : nombre de valeurs différentes de la propriété des mots sélectionnés dans la structure
      - fmin : fréquence minimale des valeurs de la propriété de mots sélectionnés dans la structure
      - fmax : fréquence maximale des valeurs de la propriété de mots sélectionnés dans la structure
      - index : l'index hiérarchique des valeurs de la propriété de mot choisie des mots sélectionnés par la requête CQL
      Exemple de résultats sur le texte "Essais sur la peinture" de Diderot :
      struct	prop	start	end	T	t	v	fmin	fmax	index
      text	DiderotEssais	46203	56871	10668	2011	903	1	38	[nature, couleur, homme, tableau, lumière, objets, œil, toile, art, effet, corps, artiste, ombre, ombres, deux, peintre, peinture, dessin, couleurs, tête]
      div	0	46214	49223	3009	549	327	1	16	[nature, homme, modèle, figure, deux, école, artiste, chose, âge, figures, dessin, actions, fois, professeur, action, attitude, manière, femme, col, tête]
      p	0	46220	46259	39	5	5	1	1	[nature, forme, cause, êtres, un]
      p	1	46260	46456	196	36	25	1	3	[yeux, col, épaules, gorge, femme, jeunesse, nature, accroissement, orbe, paupières, cavité, absence, organe, sourcils, joues, lèvre, mouvement, altération, parties, visage]
      p	2	46457	46578	121	28	26	1	2	[pieds, nature, regards, homme, dos, poitrine, forme, cartilages, col, vertèbres, tête, mains, articulation, poignet, coudes, arrière, membres, centre, gravité, système]
      p	3	46579	46622	43	5	4	1	2	[causes, effets, êtres, imitation]
      p	4	46623	46727	104	22	20	1	2	[ignorance, règles, effets, causes, convention, suites, peine, artiste, imitation, nature, pieds, jambes, genoux, têtes, tact, observation, phénomènes, liaison, enchaînement, difformités]
      p	5	46728	46797	69	10	6	1	4	[nez, Antinoüs, nature, difformité, altérations, reste]
      p	6	46798	46859	61	9	7	1	2	[règles, nature, homme, rue, chose, statue, proportions]
      p	7	46860	46942	82	13	11	1	2	[extrémité, pied, voile, bossu, Venus, Medicis, nature, figure, crayons, monstre, chose]
      p	8	46943	46982	39	11	11	1	1	[figure, système, suites, inconséquence, principe, production, art, mille, lieues, œuvre, nature]
      p	9	46983	47196	213	38	30	1	5	[homme, figure, âge, fonctions, mystères, art, artiste, proportions, despotisme, nature, condition, sacrifice, cent, manières, organisation, habitude, facilité, grandeur, proportion, membre]
      ... [13 paragraphes] ...
      div	1	49224	52163	2939	531	307	1	23	[couleur, nature, chair, artiste, toile, art, homme, yeux, œil, couleurs, tableau, harmonie, effet, dessin, palette, organe, ton, coloriste, vie, ami]
      p	24	49230	49258	28	7	7	1	1	[C', dessin, forme, êtres, couleur, vie, souffle]
      p	25	49259	49284	25	6	6	1	1	[maîtres, art, juges, dessin, monde, couleur]
      p	26	49285	49354	69	16	16	1	1	[dessinateurs, coloristes, littérature, Cent, froids, orateur, Dix, orateurs, poète, intérêt, homme, Helvétius, dix, bons, peine, mort]
      p	27	49355	49485	130	24	21	1	2	[artiste, besoin, échelle, ami, atelier, teintes, demi-, palette, quart, heure, travail, ordre, pendant, passage, auteur, bureau, ligne, livre, place, allure]
      p	28	49486	49680	194	46	42	1	2	[yeux, toile, chaos, œuvre, sentiment, couleur, bouche, palette, image, pinceau, création, oiseaux, nuances, plumage, fleurs, velouté, arbres, verdures, azur, ciel]
      p	29	49681	49967	286	48	43	1	3	[nature, organe, homme, arbre, artistes, chose, monde, variété, coloristes, couleur, disposition, doute, œil, couleurs, tableau, effets, rouges, blancs, tapisserie, murs]
      p	30	49968	50068	100	20	17	1	3	[fois, organe, peintre, ouvrage, littérateur, caractère, disposition, pente, homme, voix, explosion, état, silence, artiste, tableau, couleur, coloris]
      p	31	50069	50105	36	7	7	1	1	[coup, organe, affection, corps, vapeur, nature, imitation]
      p	32	50106	50267	161	26	19	1	4	[couleur, palette, artiste, effet, tableau, teintes, couleurs, idée, endroit, fois, appréciation, scène, composition, manie, travail, teinte, composé, substances, unes]
      p	33	50268	50319	51	7	7	1	1	[général, harmonie, composition, peintre, effet, pinceau, couleur]
      ... [etc.]
      Avec les paramètres :
      - structures : text,div,p
      - structProperties : id,n,n
      - query : [frpos="N.*"]
      - wordProperty : word
      - displayIndex : true
      - Vmax : 20
      */
     // Déclarations
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcp.swt.widget.parameters.*
     import org.txm.Toolbox
     import org.eclipse.ui.console.*
     import org.txm.macro.cqp.*
     import org.txm.searchengine.cqp.CQPSearchEngine
     import org.txm.searchengine.cqp.corpus.CQPCorpus
     import org.txm.searchengine.cqp.corpus.Partition
     import org.txm.searchengine.cqp.corpus.Property
     import org.txm.searchengine.cqp.corpus.QueryResult
     import org.txm.searchengine.cqp.corpus.Subcorpus;
     import org.txm.searchengine.cqp.corpus.query.CQLQuery
     import org.txm.rcp.commands.*
     import org.txm.statsengine.r.core.RWorkspace
     // Field code handed to CQI.dumpSubCorpus by the closures of this macro
     byte CQI_CONST_FIELD_MATCH = (byte) 0x10
     def scriptName = this.class.getSimpleName()
     // Flatten the Corpus view selection: corpora are kept as is, partitions contribute their parts
     def selection = []
     for (def selected : corpusViewSelections) {
     	switch (selected) {
     		case CQPCorpus:
     			selection.add(selected)
     			break
     		case Partition:
     			selection.addAll(selected.getParts())
     			break
     	}
     }
     // Nothing usable selected: report and stop the macro
     if (selection.isEmpty()) {
     	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
     	return false
     }
     println "WORKING WITH $selection"
     // BEGINNING OF PARAMETERS
     // Comma-separated names of the structures to report on
     @Field @Option(name="structures", usage="act,scene", widget="String", required=true, def="text,div,p")
     		def structures
     // Comma-separated structure property names, paired by position with the structures; may be empty
     @Field @Option(name="structProperties", usage="n,n", widget="String", required=false, def="id,n,n")
     		def structProperties
     // Query handed to CQLQuery by the closures of this macro
     @Field @Option(name="query", usage="[word!='\\p{P}']", widget="String", required=true, def="[pos=\"NOM.*\"|frpos=\"N.*\"]")
     		def query
     // Name of the word property whose values are counted
     @Field @Option(name="wordProperty", usage="word", widget="String", required=true, def="word")
     		def wordProperty
     // When true, print_freq appends the list of the most frequent values to each line
     @Field @Option(name="displayIndex", usage="display a hierarchical index", widget="Boolean", required=true, def="true")
     		def displayIndex
     // Number of values kept in that list
     @Field @Option(name="Vmax", usage="size of index", widget="Integer", required=false, def="20")
     		def Vmax
     // END OF PARAMETERS
     // Open the parameters input dialog box
     if (!ParametersDialog.open(this)) return;
     def CQI = CQPSearchEngine.getCqiClient()
     def corpusStructs = structures.split(",") 			// ["act", "scene"]
     structProperties = structProperties.trim()
     // propParam, corpusStructPropNames and corpusStructProps are assigned without "def":
     // they are script binding variables, not locals
     if (structProperties.size() > 0) {
     	propParam = true
     	corpusStructPropNames = structProperties.split(",")	// ["n", "n"]
     	// pair each structure name with the property name found at the same index
     	corpusStructProps = [corpusStructs, corpusStructPropNames].transpose().collectEntries()
     } else {
     	propParam = false
+    }
     // First define the order theory over corpus structures intervals
     // by defining a binary comparator that will be used to build the
     // TreeSet of intervals
     // function to print the hierarchical index of a query
     /*
      * Prints the values of property p over the matches of query q in corpus c, counted and
      * ordered by decreasing frequency. When cut is greater than 0 only the first cut entries
      * are printed. The temporary "RES1" subcorpus is deleted afterwards.
      */
     def print_index = { c, q, p, cut ->
     	QueryResult result = c.query(new CQLQuery(q), "RES1", false)
     	Subcorpus matchCorpus = c.createSubcorpus("RES1", result)
     	def property = matchCorpus.getProperty(p)
     	def lastMatch = matchCorpus.getSize() - 1
     	def propertyName = property.getQualifiedName()
     	def matchPositions = CQI.dumpSubCorpus(result.getQualifiedCqpId(), CQI_CONST_FIELD_MATCH, 0, lastMatch)
     	def values = CQI.cpos2Str(propertyName, matchPositions)
     	// value -> frequency, most frequent first
     	def ranked = values.countBy { it }.sort { entry -> -entry.value }
     	println(cut > 0 ? ranked.take(cut) : ranked)
     	matchCorpus.delete()
     }
     // function to print the statistics of an index of a query
     /*
      * Prints the frequency statistics of property p over the matches of query q in corpus c,
      * as tab-separated columns: t (number of matches), v (number of distinct values),
      * fmin and fmax (lowest and highest value frequency). When displayIndex is set, the
      * Vmax most frequent values are appended as a last column.
      * A line of zeros is printed when the corpus is empty or when the query matches nothing.
      * The temporary "RES1" subcorpus is deleted afterwards.
      */
     def print_freq = { CQPCorpus c, q, p ->
     	// run the query
     	QueryResult qr = c.query(new CQLQuery(q), "RES1", false);
     	Subcorpus subcorpus = c.createSubcorpus("RES1", qr);
     	p = subcorpus.getProperty(p)
     	int csize = c.getSize()
     	// number of matches; the search engine is only asked when the corpus is not empty
     	def tC = (csize == 0) ? 0 : CQI.subCorpusSize(subcorpus.getQualifiedCqpId())
     	if (tC == 0) {
     		// no match: dumping positions 0..-1 and taking the min/max of nothing is meaningless
     		if (displayIndex) {
     			println "0\t0\t0\t0\t[]"
     		} else {
     			println "0\t0\t0\t0"
     		}
     	} else {
     		// codes of the property values found at the match positions
     		def matches_target_p = CQI.cpos2Id(p.getQualifiedName(), CQI.dumpSubCorpus(subcorpus.getQualifiedCqpId(), CQI_CONST_FIELD_MATCH, 0, tC-1))
     		// compute the frequency of each value code
     		def index = matches_target_p.collect { it }.countBy { it }
     		def freqs = index.values()
     		def v = freqs.size()
     		def fmin = freqs.min()
     		def fmax = freqs.max()
     		print sprintf("%d\t%d\t%d\t%d", tC, v, fmin, fmax)
     		// print the most frequent values, decoded from their codes
     		if (displayIndex) {
     			def heads = index.sort { -it.value }.take(Vmax).keySet()
     			println "\t"+heads.collect { CQI.id2Str(p.getQualifiedName(), it)[0] }
     		} else {
     			println ""
     		}
     	}
     	subcorpus.delete()
     }
     // Handle on the R workspace; used further down to push vectors to R and to evaluate the ggplot2 script
     def r = RWorkspace.getRWorkspaceInstance()
     /**
      * Selects the units that lie entirely inside one of the CQP matches.
      *
      * The units are ordered by start then end position on a sorted copy, so
      * the caller's list is left untouched. Units and matches are then walked
      * together in a single pass; the walk never returns to an earlier match,
      * so the matches are expected to come in ascending position order.
      *
      * @param allUnites list of [start, end] pairs
      * @param matches list of objects exposing getStart() and getEnd()
      * @return the units, in sorted order, whose [start, end] span is contained in a match
      */
     static def inter(def allUnites, def matches) {
     	// sort(false, ...) returns a sorted copy instead of reordering the argument in place
     	def sortedUnits = allUnites.sort(false) { a, b -> a[0] <=> b[0] ?: a[1] <=> b[1] }
     	def unitsSize = sortedUnits.size()
     	def iCurrentUnit = 0
     	def selectedUnits = []
     	def matchesSize = matches.size()
     	def iCurrentMatch = 0
     	while (iCurrentMatch < matchesSize && iCurrentUnit < unitsSize) {
     		def unit = sortedUnits[iCurrentUnit]
     		def match = matches[iCurrentMatch]
     		if (unit[1] < match.getStart()) {
     			// unit ends before the match starts: try the next unit
     			iCurrentUnit++
     		} else if (unit[0] > match.getEnd()) {
     			// unit starts after the match ends: try the next match
     			iCurrentMatch++
     		} else {
     			// unit and match overlap: keep the unit only when it is fully contained
     			if (match.getStart() <= unit[0] && unit[1] <= match.getEnd()) {
     				selectedUnits << unit
     			}
     			iCurrentUnit++
     		}
     	}
     	return selectedUnits
     }
     // For each selected corpus: collect the intervals of the requested structures that fall
     // inside the corpus matches, print one tab-separated statistics row per interval,
     // start an HTML/d3 output file, then plot the lengths of the "div" structures with R/ggplot2.
     selection.each { corpus ->
     	// corpusName and mainCorpusName are assigned without `def` (not local declarations)
     	corpusName = corpus.getID()
     	mainCorpusName = corpus.getMainCorpus().getID()
     	println "Corpus = "+corpusName
     	println "Corpus QualifiedCqpId = "+corpus.getCqpId()
     	println "MainCorpus = "+mainCorpusName
     	println "Corpus QualifiedCqpId = "+corpus.getMainCorpus().getCqpId()
     	// structural attributes of the main corpus, minus names containing '_', minus "txmcorpus",
     	// restricted to the ones selected by corpusStructs
     	def struct_names = (CQI.corpusStructuralAttributes(corpus.getMainCorpus().getCqpId()) as List)
     	struct_names.removeAll { it.contains('_') }
     	struct_names=(struct_names-"txmcorpus").grep(corpusStructs)
     	//println "struct_names = "+struct_names
     	if (struct_names.size() == 0) {
     		println "** Impossible to find the structures (${corpusStructs}), aborting."
     		// leaves the closure for this corpus only; each() goes on with the next selected corpus
     		return
+    	}
     	// NOTE(review): `level` is not referenced again within this closure
     	def level = [:]
     	// Now build the TreeSet of corpus structures intervals
     	def h = new TreeSet<Struct>()
     	struct_names.each {
     		// [start, end] positions of every region of the structure `it` in the main corpus
     		def matches = []
     		// NOTE(review): if attributeSize returns 0 the range 0..-1 is a reverse range (0, -1), not an empty one - confirm this cannot happen
     		for (i in 0..CQI.attributeSize("${mainCorpusName}.${it}")-1) {
     			// start and end are assigned without `def`
     			(start, end) = CQI.struc2Cpos("${mainCorpusName}.${it}", i)
     			matches << [start, end]
     			//println sprintf("Adding %s[%d, %d]", it, start, end)
+    		}
     		// keep only the regions fully contained in a match of the selected corpus
     		def intersection = inter(matches, corpus.getMatches())
     		for (def item : intersection)
     			h.add(new Struct(it, item[0], item[1]))
+    	}
     	// header row of the tab-separated table printed below
     	if (propParam) {
     		print sprintf("struct\tprop\tstart\tend\tT\tt\tv\tfmin\tfmax")
     	} else {
     		print sprintf("struct\tstart\tend\tT\tt\tv\tfmin\tfmax")
+    	}
     	if (displayIndex) {
     		println sprintf("\tindex")
     	} else {
     		println ""
+    	}
     	// output directory is built from the HOME environment variable
     	def env = System.getenv()
     	def localPath = env["HOME"]+"/Documents/d3test"
     	new File(localPath).mkdirs()
     	// reset output file
     	def resultFile = new File(localPath, "desc-partition.html")
     	def result = new PrintWriter(resultFile)
     	result.print("")
     	result.close()
     	// append the HTML/d3 preamble; the text stops right after opening the JavaScript `tree` literal
     	// NOTE(review): nothing in this closure writes the tree content or the closing markup
     	resultFile << '''\
     <!DOCTYPE html>
     <html>
       <head>
         <meta http-equiv="Content-Type" content="text/html;charset=utf-8" charset="UTF-8"/>
         <link type="text/css" rel="stylesheet" href="style.css"/>
         <script type="text/javascript" src="d3/d3.v3.js" charset="utf-8"></script>
         <script type="text/javascript" src="d3/layout/partition.js" charset="utf-8"></script>
         <style type="text/css">
     .chart {
       display: block;
       margin: auto;
       margin-top: 60px;
       font-size: 11px;
+    }
     rect {
       stroke: #eee;
       fill: #aaa;
       fill-opacity: .8;
+    }
     rect.parent {
       cursor: pointer;
       fill: steelblue;
+    }
     text {
       pointer-events: none;
+    }
         </style>
       </head>
       <body>
         <div id="body">
           <div id="footer">
             Structures hierarchy
             <div class="hint">click or shift-alt-click to zoom-in or out</div>
           </div>
         </div>
         <script type="text/javascript">
     var w = 1120,
         h = 600,
         x = d3.scale.linear().range([0, w]),
         y = d3.scale.linear().range([0, h]);
     var vis = d3.select("#body").append("div")
         .attr("class", "chart")
         .style("width", w + "px")
         .style("height", h + "px")
       .append("svg:svg")
         .attr("width", w)
         .attr("height", h);
     var partition = d3.layout.partition()
         .value(function(d) { return d.size; }).sort(null);
     var tree = `{'''
     	// Now iterate on the TreeSet to get a depth first search on the structure intervals
     	// captures the non-digit prefix of a structure name that ends with digits
     	def rec_struct_regex = /([^0-9]+)[0-9]+/
     	/*
     	 "name": "sha-hamlet",
     	 "children": [
+    	 {
     	 "name": "sha-hamcast",
     	 "children": [
+    	 {
     	 "name": "sha-ham1",
     	 "children": [
     	 {"name": "sha-ham102", "size": 855},
     	 {"name": "sha-ham103", "size": 464},
     	 {"name": "sha-ham104", "size": 296},
     	 {"name": "sha-ham105", "size": 635}
+    	 ]
+    	 }
+    	 ]
+    	 }
+    	 ]
     	 }`;
     	 */
     	// Debug helper printing the intervals of the TreeSet from `head` onwards; its only call is commented out below
     	// NOTE(review): the recursive call passes `subtree` (a set) where a single element is expected
     	def displayTree = { head ->
     		if (head) {
     			subtree = h.tailSet(head)
     			subtree.each { print sprintf("%s[%d, %d], ", it.name, it.start, it.end) }
     			println ""
     			if (subtree.size() == 0) {
     				println sprintf("%s[%d, %d]", head.name, head.start, head.end)
     			} else {
     				displayTree(subtree)
+    			}
+    		}
+    	}
     	//displayTree(h.first())
     	// accumulators: property values and lengths of the "div" intervals met since the last "text"
     	// interval, plus one list of those per "text" interval
     	def divPropVals = []
     	def divLengths = []
     	def textDivPropVals = []
     	def textDivLengths = []
     	// one output row per interval, in TreeSet order
     	h.each {
     		//println sprintf("Displaying %s[%d, %d]", it.name, it.start, it.end)
     		if (propParam) {
     			// use the name without its trailing digits when the regex matches exactly once
     			def rec_match = (it.name =~ rec_struct_regex)
     			if (rec_match.size() == 1) {
     				println "Rec struct match = "+rec_match[0][1]
     				istruct_name = rec_match[0][1]
     			} else {
     				//println "Struct match = "+it.name
     				istruct_name = it.name
+    			}
     			// attribute "<main corpus>.<structure>_<property>", read at the start position of the interval
     			def struct_name = "${mainCorpusName}.${istruct_name}_${corpusStructProps[it.name]}"
     			def propVal = CQI.struc2Str(struct_name, CQI.cpos2Struc(struct_name, [it.start] as int[]))[0]
     			if (it.name == "text") {
     				// a new "text" begins: store the divs gathered so far and start new lists
     				textDivPropVals.push(divPropVals)
     				divPropVals = []
     				textDivLengths.push(divLengths)
     				divLengths = []
     			} else if (it.name == "div") {
     				divPropVals.push(propVal)
     				divLengths.push(it.end-it.start)
+    			}
     			print sprintf("%s\t%s\t%d\t%d\t%d\t", it.name, propVal, it.start, it.end, it.end-it.start)
     		} else {
     			// NOTE(review): struct_name is not used in this branch
     			def struct_name = "${mainCorpusName}.${it.name}"
     			print sprintf("%s\t%d\t%d\t%d\t", it.name, it.start, it.end, it.end-it.start)
+    		}
     		// complete the row with the frequency statistics of `query` restricted to [start, end]
     		print_freq(corpus, sprintf("a:%s :: a>=%d & a<=%d", query, it.start, it.end), wordProperty)
+    	}
     	// store the lists gathered after the last "text", then drop the first stored entry
     	// (presumably the empty lists pushed when the first "text" was met - confirm)
     	textDivPropVals.push(divPropVals)
     	textDivPropVals.remove(0)
     	textDivLengths.push(divLengths)
     	textDivLengths.remove(0)
     	println textDivPropVals
     	println textDivLengths
     	// only the first stored list of each kind is sent to R
     	def textDivPropVals1 = textDivPropVals[0] as String[]
     	r.addVectorToWorkspace("textDivPropVals1", textDivPropVals1)
     	def textDivLengths1 = textDivLengths[0] as int[]
     	r.addVectorToWorkspace("textDivLengths1", textDivLengths1)
     	// temporary PNG and SVG files created in the "results" directory of the TXM home path
     	def PNGFile = File.createTempFile("txm", ".png", new File(Toolbox.getTxmHomePath(), "results"))
     	def PNGFilePath = PNGFile.getAbsolutePath()
     	println "PNG file: "+PNGFilePath
     	def SVGFile = File.createTempFile("txm", ".svg", new File(Toolbox.getTxmHomePath(), "results"))
     	def SVGFilePath = SVGFile.getAbsolutePath()
     	println "SVG file: "+SVGFilePath
     	/// BEGINNING OF R SCRIPT
     	def script ="""
     df <- data.frame(structure=textDivPropVals1,
                      longueur=textDivLengths1)
     p<-ggplot(data=df, aes(x=structure, y=longueur)) +
       geom_bar(stat="identity", fill="steelblue") +
       geom_text(aes(label=longueur), vjust=1.6, color="white", size=3.5) +
       labs(title="${corpusName}", x="Structure div", y = "Longueur") +
       theme_minimal()
     """
     	/// END OF R SCRIPT
     	// execute R script
     	try {
     		r.eval("library(ggplot2)")
     		try {
     			// the plot script is evaluated twice, once per output file
     			r.eval(script+"ggsave(file=\"${PNGFilePath}\", plot=p)")
     			r.eval(script+"ggsave(file=\"${SVGFilePath}\", plot=p)")
     			//display the SVG results graphic
     			monitor.syncExec(new Runnable() {
     						@Override
     						public void run() { try { OpenSVGGraph.OpenSVGFile(SVGFilePath, "Longueur des structures de "+corpusName) } catch(Exception e) {e.printStackTrace()} }
     					})
     		} catch (Exception e) {
     			println "** Error: "+e
+    		}
     	} catch (Exception e) {
     		// exceptions from the plotting step are caught by the inner try, so this one is reached when loading ggplot2 fails
     		println "** The 'ggplot2' R package is not installed. Start R ("+RWorkspace.getExecutablePath()+") and run 'install.packages(\"ggplot2\");'."
+    	}
+    }

     package org.txm.macro.commands;
     /**
      * One occurrence of a corpus structure: the structure name and the
      * [start, end] interval it covers.
      *
      * Instances are ordered by ascending start, then by descending end, then
      * by name, so that an interval always sorts before the intervals it contains.
      */
     class Struct implements Comparable<Struct> {
     	String  name
     	Integer start
     	Integer end
     	Struct(String n, Integer s, Integer e) {
     		name  = n
     		start = s
     		end   = e
     	}
     	/**
     	 * Compares this interval with another one.
     	 *
     	 * @param s the interval to compare with
     	 * @return a negative value when this interval comes first, a positive value
     	 *         when s comes first, and the comparison of the names when both
     	 *         intervals have the same start and end
     	 */
     	public int compareTo(Struct s) {
     		// the interval starting on the left comes first
     		int byStart = start <=> s.start
     		if (byStart != 0) {
     			return byStart
     		}
     		// same start: the longer interval (greater end) comes first, the shorter one after it;
     		// both directions must agree so that a.compareTo(b) == -b.compareTo(a)
     		int byEnd = s.end <=> end
     		if (byEnd != 0) {
     			return byEnd
     		}
     		// same start, same end: use the lexicographic order of the structure names
     		return name.compareTo(s.name)
     	}
     	/**
     	 * Formats an interval as name[start, end].
     	 *
     	 * @param s the interval to format
     	 * @return the formatted text
     	 */
     	public toString(Struct s) {
     		sprintf("%s[%d, %d]", s.name, s.start, s.end)
     	}
     	/**
     	 * Prints the name[start, end] form of an interval.
     	 *
     	 * @param s the interval to print
     	 */
     	public print(Struct s) {
     		// use the formatting method above; s.toString() would be the default Object text
     		print(toString(s))
     	}
     }

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 2087