Révision 2087
tmp/org.txm.analec.rcp/src/org/txm/macroprototypes/urs/cqp/URSUnits2CQPWordsMacro.groovy (revision 2087) | ||
---|---|---|
1 |
package org.txm.macroprototypes.urs.cqp |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import org.txm.annotation.urs.* |
|
6 |
import org.txm.importer.ValidateXml |
|
7 |
import org.txm.rcp.swt.widget.parameters.* |
|
8 |
import org.txm.searchengine.cqp.corpus.* |
|
9 |
import visuAnalec.elements.* |
|
10 |
|
|
11 |
// BEGINNING OF PARAMETERS |
|
12 |
|
|
13 |
// Open the parameters input dialog box |
|
14 |
//if (!ParametersDialog.open(this)) return; |
|
15 |
|
|
16 |
// The macro rewrites the XML-TXM source files of the selection, so a MainCorpus is required.
if (!(corpusViewSelection instanceof MainCorpus)) {
	println "Selection must be a Corpus"
	return
}

MainCorpus corpus = corpusViewSelection
def analecCorpus = URSCorpora.getCorpus(corpus)

def texts = corpus.getCorpusTextIdsList();
def texts_startlimits = corpus.getTextStartLimits()
def texts_endlimits = corpus.getTextEndLimits()

// Best-effort helper: parks a rejected working copy in <project>/error/ under the
// name of the source file, replacing any previous error file of the same name.
def moveToErrorDirectory = { File copy, File source ->
	File error = new File(corpus.getProject().getProjectDirectory(), "error/"+source.getName())
	error.getParentFile().mkdirs()
	println " moving created file to $error"
	error.delete()
	copy.renameTo(error)
}

// For each text: inject the not-yet-written "word" units into its XML-TXM file.
for (int i = 0 ; i < texts.size() ; i++) {

	println "Processing annotations of "+texts[i]+"..."

	def text_id = texts[i]
	def text_start = texts_startlimits[i]
	def text_end = texts_endlimits[i]

	File xmltxmFile = new File(corpus.getProject().getProjectDirectory(), "txm/"+corpus.getID()+"/"+text_id+".xml")
	File xmltxmFileCopy = new File(corpus.getProject().getProjectDirectory(), text_id+"_copy.xml")

	if (!xmltxmFile.exists()) {
		println "Warning: no text file found: "+xmltxmFile
		continue
	}

	for (String unit_type : analecCorpus.getStructure().getTypes(Unite.class)) {
		ArrayList<Unite> all_units = analecCorpus.getUnites(unit_type)

		// Keep the units of this text whose "type" property is "word" and that are not flagged as written.
		// NOTE(review): the strict '<' drops a unit ending exactly at text_end - confirm whether
		// getTextEndLimits() returns an inclusive or exclusive position.
		def corpus_units = all_units.findAll() {
			"word".equals(it.getProp("type")) && text_start <= it.getDeb() && it.getFin() < text_end && !("true".equals(it.getProp("written")))
		}

		if (corpus_units.size() == 0) continue;

		try {
			println "processing word Units ${text_id} and its units "+corpus_units.size()
			WordUnitsInserter inserter = new WordUnitsInserter(corpus, xmltxmFile, corpus_units);
			if (inserter.process(xmltxmFileCopy) && ValidateXml.test(xmltxmFileCopy)) {
				// Replace the source in a single move: the original is only lost once the new
				// file is in place, and a failure raises an exception handled below
				// (the previous delete() + unchecked renameTo() could silently lose the text).
				java.nio.file.Files.move(xmltxmFileCopy.toPath(), xmltxmFile.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)

				// Flag the units so that a later run does not write them again.
				for (Unite unit : corpus_units) {
					unit.getProps()["written"] = "true"
				}
				URSCorpora.saveCorpus(corpus);
				println "Done, "+corpus_units.size()+ " units written"

			} else {
				println "Error while processing word units $xmltxmFile file"
				moveToErrorDirectory(xmltxmFileCopy, xmltxmFile)
			}
		} catch(Exception e) {
			println "Error while processing word units $xmltxmFile file: "+e
			e.printStackTrace();
			moveToErrorDirectory(xmltxmFileCopy, xmltxmFile)
		}
	}
}
tmp/org.txm.analec.rcp/src/org/txm/macroprototypes/urs/cqp/URSUnits2CQPStructMacro.groovy (revision 2087) | ||
---|---|---|
1 |
// STANDARD DECLARATIONS |
|
2 |
package org.txm.macroprototypes.urs.cqp |
|
3 |
|
|
4 |
import org.kohsuke.args4j.* |
|
5 |
import groovy.transform.Field |
|
6 |
import org.txm.annotation.urs.* |
|
7 |
import org.txm.importer.ValidateXml |
|
8 |
import org.txm.rcp.swt.widget.parameters.* |
|
9 |
import org.txm.searchengine.cqp.corpus.* |
|
10 |
import visuAnalec.elements.* |
|
11 |
|
|
12 |
// BEGINNING OF PARAMETERS |
|
13 |
|
|
14 |
// Open the parameters input dialog box |
|
15 |
//if (!ParametersDialog.open(this)) return; |
|
16 |
|
|
17 |
// The macro rewrites the XML-TXM source files of the selection, so a MainCorpus is required.
if (!(corpusViewSelection instanceof MainCorpus)) {
	println "Selection must be a Corpus"
	return
}

MainCorpus corpus = corpusViewSelection
def analecCorpus = URSCorpora.getCorpus(corpus)

def texts = corpus.getCorpusTextIdsList();
def texts_startlimits = corpus.getTextStartLimits()
def texts_endlimits = corpus.getTextEndLimits()

// Best-effort helper: parks a rejected working copy in <project>/error/ under the
// name of the source file, replacing any previous error file of the same name.
def moveToErrorDirectory = { File copy, File source ->
	File error = new File(corpus.getProject().getProjectDirectory(), "error/"+source.getName())
	error.getParentFile().mkdirs()
	println " moving created file to $error"
	error.delete()
	copy.renameTo(error)
}

// Installs a validated working copy as the new source file, then flags the units as written
// and saves the URS corpus. Files.move throws on failure, so the original file is never
// deleted unless the replacement is in place.
def commitCopy = { File copy, File source, List written ->
	java.nio.file.Files.move(copy.toPath(), source.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)
	for (Unite unit : written) {
		unit.getProps()["written"] = "true"
	}
	URSCorpora.saveCorpus(corpus);
}

for (int i = 0 ; i < texts.size() ; i++) {

	println "Processing annotations of "+texts[i]+"..."

	def text_id = texts[i]
	def text_start = texts_startlimits[i]
	def text_end = texts_endlimits[i]

	File xmltxmFile = new File(corpus.getProject().getProjectDirectory(), "txm/"+corpus.getID()+"/"+text_id+".xml")
	File xmltxmFileCopy = new File(corpus.getProject().getProjectDirectory(), text_id+"_copy.xml")

	if (!xmltxmFile.exists()) {
		println "Warning: no text file found: "+xmltxmFile
		continue
	}

	// WRITE MILESTONES UNITS

	println "-> MILESTONES UNITS"

	// Milestones of every unit type are gathered and written in a single pass over the file.
	def units = []
	for (String unit_type : analecCorpus.getStructure().getTypes(Unite.class)) {
		ArrayList<Unite> all_units = analecCorpus.getUnites(unit_type)
		// NOTE(review): the strict '<' drops a unit ending exactly at text_end - confirm whether
		// getTextEndLimits() returns an inclusive or exclusive position.
		def corpus_units = all_units.findAll() {
			it.getProp("type") != null && it.getProp("type").trim().length() > 0 && "yes".equals(it.getProp("milestone")) && text_start <= it.getDeb() && it.getFin() < text_end && !("true".equals(it.getProp("written")))
		}
		units.addAll(corpus_units)
	}

	if (units.size()== 0) {
		println "No milestones to write"
	} else {
		try {
			println "processing milestones Units ${text_id} and its units "+units.size()
			MileStoneInserter inserter = new MileStoneInserter(corpus, xmltxmFile, units);
			if (inserter.process(xmltxmFileCopy) && ValidateXml.test(xmltxmFileCopy)) {
				commitCopy(xmltxmFileCopy, xmltxmFile, units)
				println "Done, "+units.size()+ " milestones written"

			} else {
				println "Error while processing milestones $xmltxmFile file"
				moveToErrorDirectory(xmltxmFileCopy, xmltxmFile)
			}
		} catch(Exception e) {
			println "Error while processing milestones $xmltxmFile file: "+e
			e.printStackTrace();
			moveToErrorDirectory(xmltxmFileCopy, xmltxmFile)
		}
	}

	// WRITE NON MILESTONES UNITS
	println "-> OTHER UNITS"

	// Unlike milestones, regular units are written one unit type at a time.
	for (String unit_type : analecCorpus.getStructure().getTypes(Unite.class)) {

		ArrayList<Unite> all_units = analecCorpus.getUnites(unit_type)
		// Everything that is NOT a milestone: the previous test (!"no".equals(...)) selected the
		// milestone="yes" units again and skipped the units explicitly marked milestone="no".
		def corpus_units = all_units.findAll() {
			it.getProp("type") != null && it.getProp("type").trim().length() > 0 && !"yes".equals(it.getProp("milestone")) && text_start <= it.getDeb() && it.getFin() < text_end && !("true".equals(it.getProp("written")))
		}

		if (corpus_units.size() == 0) continue;

		println "processing Units ${text_id} and its $unit_type units "+corpus_units.size()
		UnitsInserter inserter2 = new UnitsInserter(corpus, xmltxmFile, corpus_units, unit_type);
		try {
			if (inserter2.process(xmltxmFileCopy) && ValidateXml.test(xmltxmFileCopy)) {
				commitCopy(xmltxmFileCopy, xmltxmFile, corpus_units)
				println "Done, "+corpus_units.size()+ " units written"

			} else {
				println "Error while processing units $xmltxmFile file"
				moveToErrorDirectory(xmltxmFileCopy, xmltxmFile)
			}
		} catch(Exception e) {
			println "Error while processing units $xmltxmFile file: "+e
			e.printStackTrace();
			moveToErrorDirectory(xmltxmFileCopy, xmltxmFile)
		}
	}
}
tmp/org.txm.analec.rcp/src/org/txm/macroprototypes/urs/cqp/WordUnitsInserter.groovy (revision 2087) | ||
---|---|---|
1 |
package org.txm.macroprototypes.urs.cqp |
|
2 |
|
|
3 |
import java.io.IOException |
|
4 |
import java.util.Date |
|
5 |
import java.util.LinkedHashMap |
|
6 |
|
|
7 |
import javax.xml.stream.XMLStreamException |
|
8 |
|
|
9 |
import org.txm.Toolbox |
|
10 |
import org.txm.importer.StaxIdentityParser |
|
11 |
import org.txm.macro.urs.AnalecUtils |
|
12 |
import org.txm.scripts.importer.GetAttributeValue |
|
13 |
import org.txm.searchengine.cqp.CQPSearchEngine |
|
14 |
import org.txm.searchengine.cqp.corpus.MainCorpus |
|
15 |
import visuAnalec.elements.Unite |
|
16 |
|
|
17 |
/**
 * Copies an XML-TXM file while merging the properties of URS units into the
 * {@code txm:ana} elements of the words they cover.
 *
 * For each {@code <w>} whose id is covered by at least one unit, every unit property
 * except "type", "written" and "milestone" is collected. When the word already has an
 * {@code ana} of the same type, the existing text is appended to the unit value(s) and
 * the element is rewritten with resp="#txm"; otherwise a new {@code txm:ana} is added
 * just before the closing {@code </w>}.
 */
public class WordUnitsInserter extends StaxIdentityParser {

	List<Unite> units
	String[] ids
	File inputFile
	/** word id -> list of the units covering that word */
	def id2Units = [:]

	/** values pending for the current word, keyed by ana type (without the leading '#') */
	LinkedHashMap<String, String> anaValues = new LinkedHashMap<String, String>();

	/**
	 * @param corpus corpus whose "id" word property resolves CQP positions to word ids
	 * @param inputFile the XML-TXM file to read
	 * @param units the units to write; the list is sorted in place by start then end position
	 */
	public WordUnitsInserter(MainCorpus corpus, File inputFile, List<Unite> units) {
		super(inputFile)
		this.inputFile = inputFile

		this.units = units
		this.units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }

		for (int i = 0 ; i < units.size() ; i++) {
			Unite u = units[i]
			// every position covered by the unit, both bounds included
			int[] positions = u.getDeb()..u.getFin()

			ids = CQPSearchEngine.getCqiClient().cpos2Str(corpus.getProperty("id").getQualifiedName(), positions)

			for (String id : ids) {
				// a position without word id cannot be matched in the XML: skip it,
				// as the other inserters of this package do
				if (id == null) continue;

				if (!id2Units.containsKey(id)) {
					id2Units[id] = []
				}
				id2Units[id] << u
			}
		}
	}

	boolean inAna = false
	String ana_type, ana_resp, ana_value
	boolean inW = false
	String word_id = null

	/**
	 * On {@code <w>}: collects the unit properties to merge, then writes the tag.
	 * On an {@code <ana>} inside a word: buffers it (nothing is written until its end tag).
	 * Any other element is copied unchanged.
	 */
	@Override
	protected void processStartElement() throws XMLStreamException, IOException {

		if ("w".equals(localname)) {

			inW = true
			word_id = this.getParserAttributeValue("id")

			if (id2Units.containsKey(word_id)) {
				for (Unite u : id2Units[word_id]) {
					def props = u.getProps();
					for (String p : props.keySet()) {
						// technical properties are not word annotations
						if ("type".equals(p)) continue;
						if ("written".equals(p)) continue;
						if ("milestone".equals(p)) continue;

						if (!anaValues.containsKey(p)) {
							anaValues[p] = ""
						}
						// several units on the same word: values are space-separated
						anaValues[p] = (anaValues[p]+" "+props.get(p)).trim()
					}
				}
				if (anaValues.size() > 0) println anaValues
			}

			super.processStartElement(); // write the tag

		} else if ("ana".equals(localname) && inW) {

			String type = this.getParserAttributeValue("type")
			if (type == null || type.length() == 0) {
				// no type to merge on (previously a NullPointerException): copy the element as is
				super.processStartElement()
				return
			}

			inAna = true
			ana_type = type.substring(1) // drop the leading '#'
			ana_resp = this.getParserAttributeValue("resp")
			ana_value = ""

		} else {
			super.processStartElement()
		}
	}

	/** Buffers the (trimmed) text of a buffered {@code ana}; copies any other text unchanged. */
	@Override
	public void processCharacters() throws XMLStreamException {
		if (inAna) {
			ana_value += parser.getText().trim()
		} else {
			super.processCharacters()
		}
	}

	/**
	 * On {@code </w>}: writes the pending values that matched no existing {@code ana},
	 * then closes the word. On the end of a buffered {@code ana}: writes it back,
	 * merged with the pending value of the same type if there is one.
	 */
	@Override
	protected void processEndElement() throws XMLStreamException {

		if ("w".equals(localname)) {

			// write the values that were not merged into an existing ana
			for (String ana_type : anaValues.keySet()) {
				writer.writeStartElement("txm:ana")
				writer.writeAttribute("type", "#" + ana_type)
				writer.writeAttribute("resp", "#txm")
				writer.writeCharacters(anaValues[ana_type])
				writer.writeEndElement()
			}

			anaValues.clear()
			super.processEndElement() // finally close the word
			inW = false

		} else if ("ana".equals(localname) && inAna) {

			if (!anaValues.containsKey(ana_type)) {
				anaValues[ana_type] = ana_value.trim()
			} else {
				ana_resp = "#txm" // set the resp to txm since anaValues update the ana value
				anaValues[ana_type] = (anaValues[ana_type]+" "+ana_value.trim()).trim()
			}

			String value = anaValues[ana_type]

			writer.writeStartElement("txm:ana")
			writer.writeAttribute("type", "#" + ana_type)
			writer.writeAttribute("resp", ana_resp)
			writer.writeCharacters(value)
			writer.writeEndElement()

			// written: must not be emitted again at the end of the word
			anaValues.remove(ana_type)

			inAna = false
			ana_type = null
			ana_resp = null
			ana_value = null

		} else {
			super.processEndElement()
		}
	}
}
tmp/org.txm.analec.rcp/src/org/txm/macroprototypes/urs/cqp/MileStoneInserter.groovy (revision 2087) | ||
---|---|---|
1 |
package org.txm.macroprototypes.urs.cqp |
|
2 |
|
|
3 |
import java.io.IOException |
|
4 |
|
|
5 |
import javax.xml.stream.XMLStreamException |
|
6 |
|
|
7 |
import org.txm.importer.StaxIdentityParser |
|
8 |
import org.txm.macro.urs.AnalecUtils |
|
9 |
import org.txm.searchengine.cqp.CQPSearchEngine |
|
10 |
import org.txm.searchengine.cqp.corpus.MainCorpus |
|
11 |
import visuAnalec.elements.Unite |
|
12 |
|
|
13 |
/**
 * Copies an XML-TXM file while inserting URS units as empty (milestone) elements
 * next to the word located at the start position of each unit.
 *
 * The "position" property of a unit selects the insertion point: "before" writes the
 * milestone ahead of the {@code <w>} start tag, "after" writes it behind the
 * {@code </w>} end tag. Units with any other "position" value are not written.
 */
public class MileStoneInserter extends StaxIdentityParser {

	List<Unite> units
	String[] ids
	File inputFile
	/** word id -> list of the units anchored on that word */
	def id2Units = [:]

	/**
	 * @param corpus corpus whose "id" word property resolves CQP positions to word ids
	 * @param inputFile the XML-TXM file to read
	 * @param units the milestone units; the list is sorted in place by start then end position
	 */
	public MileStoneInserter(MainCorpus corpus, File inputFile, List<Unite> units) {
		super(inputFile);
		this.inputFile = inputFile;

		this.units = units;
		this.units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }

		// a milestone is anchored on the first word of its unit
		int[] positions = new int[units.size()];
		for( int i = 0 ; i < units.size() ; i++) {
			positions[i] = units.get(i).getDeb();
		}

		ids = CQPSearchEngine.getCqiClient().cpos2Str(corpus.getProperty("id").getQualifiedName(), positions);

		for (int i = 0 ; i < ids.length ; i++) {
			String id = ids[i]
			if (id != null) {
				if (!id2Units.containsKey(id)) id2Units[id] = []

				id2Units[id] << units[i]
			}
		}
	}

	/** true once the {@code <text>} element has been entered */
	boolean start = false;
	/** id of the word being processed, null outside of a word */
	String word_id = null;

	/** Writes the "before" milestones of a word ahead of its start tag, then copies the tag. */
	@Override
	protected void processStartElement() throws XMLStreamException, IOException {

		if ("text".equals(localname)) {
			start = true;
		} else if ("w".equals(localname) && start) {
			word_id = getParserAttributeValue("id");
			if (word_id == null) {
				println "Warning: found <w> without id at line "+parser.getLocation().getLineNumber()+" in "+inputFile
			} else {
				writeAllUnits(word_id, "before")
			}
		}

		super.processStartElement();
	}

	/**
	 * Writes, as empty elements, the units anchored on a word for one insertion point.
	 * The element name is the unit "type" property; every other property becomes an attribute.
	 *
	 * @param id the word id
	 * @param position "before" or "after"; only units whose "position" property equals it are written
	 */
	protected void writeAllUnits(String id, String position) {

		def units = id2Units[id]
		if (units == null) return; // no units to write

		for (Unite currentUnit : units) {

			// skip this unit only: a 'return' here dropped every remaining unit of the word
			// as soon as one unit targeted the other insertion point
			if (!position.equals(currentUnit.getProp("position"))) continue;

			writer.writeStartElement(currentUnit.getProp("type"));
			HashMap props = currentUnit.getProps();
			for (String p : props.keySet()) {
				if (p.equals("type")) continue; // ignore the type since written in tag name
				writer.writeAttribute(p, ""+props.get(p));
			}
			writer.writeEndElement();
		}
	}

	/** Copies the end tag, then writes the "after" milestones of the word that just closed. */
	@Override
	protected void processEndElement() throws XMLStreamException {
		super.processEndElement();

		if ("w".equals(localname) && start && word_id != null) {
			writeAllUnits(word_id, "after")
		}

		if ("w".equals(localname)) {
			word_id = null;
		}
	}
}
tmp/org.txm.analec.rcp/src/org/txm/macroprototypes/urs/cqp/UnitsInserter.groovy (revision 2087) | ||
---|---|---|
1 |
package org.txm.macroprototypes.urs.cqp |
|
2 |
|
|
3 |
import java.io.IOException |
|
4 |
|
|
5 |
import javax.xml.stream.XMLStreamException |
|
6 |
|
|
7 |
import org.txm.importer.StaxIdentityParser |
|
8 |
import org.txm.macro.urs.AnalecUtils |
|
9 |
import org.txm.searchengine.cqp.CQPSearchEngine |
|
10 |
import org.txm.searchengine.cqp.corpus.MainCorpus |
|
11 |
import visuAnalec.elements.Unite |
|
12 |
|
|
13 |
/**
 * Copies an XML-TXM file while wrapping the words covered by URS units into XML elements.
 *
 * An element named after the unit "type" property is opened before the {@code <w>} found at
 * the unit start position and closed after the {@code </w>} found at its end position;
 * every other unit property is written as an attribute of the element.
 */
public class UnitsInserter extends StaxIdentityParser {

	List<Unite> units
	String[] ids
	File inputFile
	/** word id -> units starting on that word (list, in sorted order) */
	def open_id2Units = [:]
	/** word id -> units to close after that word (set) */
	def close_id2Units = [:]
	/** units whose element is currently open */
	def writing_units = []
	// NOTE(review): writing_stacks receives Unite objects in writeUnit() but is compared with
	// the String 'stack' in writeCloseUnits() and processEndElement(), so those comparisons
	// presumably never match. It looks like the element path was meant to be stored - confirm
	// the intended behaviour before changing it.
	def writing_stacks = []

	/** path of the element being read, e.g. "/TEI/text/p/w" */
	def stack = "";

	// never assigned in this class
	def writing_start, writing_end;

	def positions2id = [:] // used to relocate end of units

	/**
	 * @param corpus corpus whose "id" word property resolves CQP positions to word ids
	 * @param inputFile the XML-TXM file to read
	 * @param units the units to write; the list is sorted in place by start position, and by
	 *              decreasing end position for equal starts
	 * @param type the unit type (not used by this class)
	 */
	public UnitsInserter(MainCorpus corpus, File inputFile, List<Unite> units, String type) {
		super(inputFile);
		this.inputFile = inputFile;
		this.units = units;

		this.units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: b.getFin() <=> a.getFin() }

		// get words id limits
		// 1) ids of the words on which units start
		int[] positions = new int[units.size()];
		for( int i = 0 ; i < units.size() ; i++) {
			positions[i] = units.get(i).getDeb();
		}
		ids = CQPSearchEngine.getCqiClient().cpos2Str(corpus.getProperty("id").getQualifiedName(), positions);
		for (int i = 0 ; i < ids.length ; i++) {
			String id = ids[i]

			if (id != null) {
				positions2id[positions[i]] = id
				if (!open_id2Units.containsKey(id)) open_id2Units[id] = []
				open_id2Units[id] << units[i]
			}
		}

		// 2) ids of the words on which units end
		positions = new int[units.size()];
		for( int i = 0 ; i < units.size() ; i++) {
			positions[i] = units.get(i).getFin();
		}
		ids = CQPSearchEngine.getCqiClient().cpos2Str(corpus.getProperty("id").getQualifiedName(), positions);
		for (int i = 0 ; i < ids.length ; i++) {
			String id = ids[i]
			if (id != null) {
				positions2id[positions[i]] = id
				if (!close_id2Units.containsKey(id)) close_id2Units[id] = new HashSet<Unite>()
				close_id2Units[id] << units[i]
			}
		}
	}

	/** true once the {@code <text>} element has been entered */
	boolean start = false;
	/** id of the word being processed, null outside of a word */
	String word_id = null;

	/** Tracks the element path and opens the units starting on a word ahead of its start tag. */
	@Override
	protected void processStartElement() throws XMLStreamException, IOException {

		stack += "/"+localname

		if ("text".equals(localname)) {
			start = true;
		} else if ("w".equals(localname) && start) {
			word_id = getParserAttributeValue("id");
			if (word_id == null) {
				println "Warning: found <w> without id at line "+parser.getLocation().getLineNumber()+" in "+inputFile
			} else {
				writeOpenUnits()
			}
		}

		super.processStartElement();
	}

	/**
	 * Opens the elements of the units starting on the current word. A unit that ends after an
	 * already open unit is also registered to be closed together with that open unit.
	 */
	protected void writeOpenUnits() {

		def toWrite = open_id2Units[word_id]

		if (toWrite != null) {
			for (Unite unite : toWrite) {

				for (int i = 0 ; i < writing_units.size() ; i++) {
					Unite u = writing_units.get(i);
					if (unite.getFin() > u.getFin()) {
						// add unite to close_id2Units
						String id = positions2id[u.getFin()]
						def closing = close_id2Units[id]
						// guard: '<<' on a missing entry raised a NullPointerException
						if (closing != null) closing << unite // close the unite at the same moment
					}
				}

				writeUnit(unite);
			}
		}
	}

	/**
	 * Inside a word: closes the open units registered to end on that word.
	 * Outside a word: closes the open units whose writing_stacks entry equals the current path.
	 */
	protected void writeCloseUnits() {
		if (word_id != null) {
			def toClose = close_id2Units[word_id]
			if (toClose != null) {
				for (int i = 0 ; i < writing_units.size() ; i++) {
					Unite u = writing_units.get(i);
					if (toClose.contains(u)) {
						writing_stacks.remove(i)
						writing_units.remove(i)
						writer.writeEndElement();
						i--; // the lists shrank: stay on the same index
					}
				}
			}
		} else {
			for (int i = 0 ; i < writing_stacks.size() ; i++) {
				if (writing_stacks[i].equals(stack)) {
					writing_stacks.remove(i)
					writing_units.remove(i)
					writer.writeEndElement();
					i-- // the lists shrank: stay on the same index
				}
			}
		}
	}

	/**
	 * Opens the element of a unit and records the unit as open. The element is closed later
	 * by writeCloseUnits().
	 *
	 * @param currentUnit the unit to open; its "type" property is the element name
	 */
	protected void writeUnit(Unite currentUnit) {

		writing_units << currentUnit
		writing_stacks << currentUnit

		// A dangling "if (currentUnit.getDeb() > writing_start)" used to guard the next statement
		// by accident; writing_start is never assigned, so the element was always opened.
		writer.writeStartElement(currentUnit.getProp("type"));
		HashMap props = currentUnit.getProps();
		for (String p : props.keySet()) {
			if (p.equals("type")) continue; // ignore the type since written in tag name
			writer.writeAttribute(p, ""+props.get(p));
		}
	}

	/** Copies the end tag, maintains the element path and closes the units ending on a word. */
	@Override
	protected void processEndElement() throws XMLStreamException {

		if (writing_stacks.size() > 0 && writing_stacks[-1].equals(stack)) {
			writeCloseUnits()
		}

		super.processEndElement();

		// pop "/<localname>" from the path
		stack = stack.substring(0, stack.length() - localname.length() - 1);

		if ("w".equals(localname)) {
			if (start && word_id != null) {
				writeCloseUnits()
			}
			word_id = null;
		}
	}
}
tmp/org.txm.analec.rcp/META-INF/MANIFEST.MF (revision 2087) | ||
---|---|---|
167 | 167 |
org.txm.annotation.urs.view, |
168 | 168 |
org.txm.annotation.urs.widgets, |
169 | 169 |
org.txm.macro.urs, |
170 |
org.txm.macro.urs.cqp, |
|
171 | 170 |
org.txm.macro.urs.democrat, |
172 | 171 |
org.txm.macro.urs.edit, |
173 | 172 |
org.txm.macro.urs.exploit, |
174 | 173 |
org.txm.macro.urs.export, |
174 |
org.txm.macroprototypes.urs.cqp, |
|
175 | 175 |
org.txm.macroprototypes.urs.misc, |
176 | 176 |
visuAnalec, |
177 | 177 |
visuAnalec.chaines, |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/commands/StructuresIndexMacro.groovy (revision 2087) | ||
---|---|---|
1 |
package org.txm.macro.commands |
|
2 |
// Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté |
|
3 |
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses) |
|
4 |
// @author sheiden |
|
5 |
|
|
6 |
/* |
|
7 |
Macro affichant les statistiques de différentes structures d'un corpus |
|
8 |
Paramètres de la macro : |
|
9 |
- corpus : le corpus sélectionné dans la vue Corpus |
|
10 |
- structures : liste des structures à interroger. Séparer les noms par une virgule. |
|
11 |
- structProperties : liste des propriétés de structures. Séparer les noms par une virgule. |
|
12 |
Il doit y avoir autant de propriétés de structures que de structures indiquées dans le paramètre structures. |
|
13 |
Les structures doivent posséder la propriété demandée. |
|
14 |
Ce paramètre peut être laissé vide, dans ce cas la colonne 'prop' n'est pas affichée. |
|
15 |
- query : requête CQL de sélection de mots exprimée obligatoirement en format complet : [...] |
|
16 |
Par exemple : |
|
17 |
- [frpos="N.*"] pour sélectionner les noms communs et les noms propres |
|
18 |
- [] pour sélectionner tous les mots |
|
19 |
- wordProperty : propriété de mot utilisée pour calculer le vocabulaire et les fréquences |
|
20 |
- displayIndex : calculer l'index hiérarchique des valeurs de la propriété wordProperty pour la requête query sur chaque structure |
|
21 |
- Vmax : nombre maximum des mots les plus fréquents à afficher dans l'index |
|
22 |
Résultat : |
|
23 |
Le résultat est un tableau TSV affiché dans la console. |
|
24 |
On peut l'exploiter avec un copier/coller dans Calc. |
|
25 |
Chaque ligne correspond à une structure du corpus. |
|
26 |
Les lignes sont ordonnées par ordre hiérarchique des structures du début à la fin du corpus. |
|
27 |
Les colonnes sont : |
|
28 |
- struct : nom de la structure |
|
29 |
- prop : valeur de la propriété de la structure |
|
30 |
(si le paramètre structProperties est vide, cette colonne est absente du résultat) |
|
31 |
- start : position du premier mot de la structure dans le corpus |
|
32 |
(les positions du corpus sont numérotées à partir de 0). |
|
33 |
Les colonnes start et end sont pratiques quand on n'a pas de propriété de structure à afficher pour se repérer dans le corpus. |
|
34 |
- end : position du dernier mot de la structure |
|
35 |
- T : taille de la structure (end-start) |
|
36 |
- t : nombre de mots sélectionnés dans la structure |
|
37 |
- v : nombre de valeurs différentes de la propriété des mots sélectionnés dans la structure |
|
38 |
- fmin : fréquence minimale des valeurs de la propriété de mots sélectionnés dans la structure |
|
39 |
- fmax : fréquence maximale des valeurs de la propriété de mots sélectionnés dans la structure |
|
40 |
- index : l'index hiérarchique des valeurs de la propriété de mot choisie des mots sélectionnés par la requête CQL |
|
41 |
Exemple de résultats sur le texte "Essais sur la peinture" de Diderot : |
|
42 |
struct prop start end T t v fmin fmax index |
|
43 |
text DiderotEssais 46203 56871 10668 2011 903 1 38 [nature, couleur, homme, tableau, lumière, objets, œil, toile, art, effet, corps, artiste, ombre, ombres, deux, peintre, peinture, dessin, couleurs, tête] |
|
44 |
div 0 46214 49223 3009 549 327 1 16 [nature, homme, modèle, figure, deux, école, artiste, chose, âge, figures, dessin, actions, fois, professeur, action, attitude, manière, femme, col, tête] |
|
45 |
p 0 46220 46259 39 5 5 1 1 [nature, forme, cause, êtres, un] |
|
46 |
p 1 46260 46456 196 36 25 1 3 [yeux, col, épaules, gorge, femme, jeunesse, nature, accroissement, orbe, paupières, cavité, absence, organe, sourcils, joues, lèvre, mouvement, altération, parties, visage] |
|
47 |
p 2 46457 46578 121 28 26 1 2 [pieds, nature, regards, homme, dos, poitrine, forme, cartilages, col, vertèbres, tête, mains, articulation, poignet, coudes, arrière, membres, centre, gravité, système] |
|
48 |
p 3 46579 46622 43 5 4 1 2 [causes, effets, êtres, imitation] |
|
49 |
p 4 46623 46727 104 22 20 1 2 [ignorance, règles, effets, causes, convention, suites, peine, artiste, imitation, nature, pieds, jambes, genoux, têtes, tact, observation, phénomènes, liaison, enchaînement, difformités] |
|
50 |
p 5 46728 46797 69 10 6 1 4 [nez, Antinoüs, nature, difformité, altérations, reste] |
|
51 |
p 6 46798 46859 61 9 7 1 2 [règles, nature, homme, rue, chose, statue, proportions] |
|
52 |
p 7 46860 46942 82 13 11 1 2 [extrémité, pied, voile, bossu, Venus, Medicis, nature, figure, crayons, monstre, chose] |
|
53 |
p 8 46943 46982 39 11 11 1 1 [figure, système, suites, inconséquence, principe, production, art, mille, lieues, œuvre, nature] |
|
54 |
p 9 46983 47196 213 38 30 1 5 [homme, figure, âge, fonctions, mystères, art, artiste, proportions, despotisme, nature, condition, sacrifice, cent, manières, organisation, habitude, facilité, grandeur, proportion, membre] |
|
55 |
... [13 paragraphes] ... |
|
56 |
div 1 49224 52163 2939 531 307 1 23 [couleur, nature, chair, artiste, toile, art, homme, yeux, œil, couleurs, tableau, harmonie, effet, dessin, palette, organe, ton, coloriste, vie, ami] |
|
57 |
p 24 49230 49258 28 7 7 1 1 [C', dessin, forme, êtres, couleur, vie, souffle] |
|
58 |
p 25 49259 49284 25 6 6 1 1 [maîtres, art, juges, dessin, monde, couleur] |
|
59 |
p 26 49285 49354 69 16 16 1 1 [dessinateurs, coloristes, littérature, Cent, froids, orateur, Dix, orateurs, poète, intérêt, homme, Helvétius, dix, bons, peine, mort] |
|
60 |
p 27 49355 49485 130 24 21 1 2 [artiste, besoin, échelle, ami, atelier, teintes, demi-, palette, quart, heure, travail, ordre, pendant, passage, auteur, bureau, ligne, livre, place, allure] |
|
61 |
p 28 49486 49680 194 46 42 1 2 [yeux, toile, chaos, œuvre, sentiment, couleur, bouche, palette, image, pinceau, création, oiseaux, nuances, plumage, fleurs, velouté, arbres, verdures, azur, ciel] |
|
62 |
p 29 49681 49967 286 48 43 1 3 [nature, organe, homme, arbre, artistes, chose, monde, variété, coloristes, couleur, disposition, doute, œil, couleurs, tableau, effets, rouges, blancs, tapisserie, murs] |
|
63 |
p 30 49968 50068 100 20 17 1 3 [fois, organe, peintre, ouvrage, littérateur, caractère, disposition, pente, homme, voix, explosion, état, silence, artiste, tableau, couleur, coloris] |
|
64 |
p 31 50069 50105 36 7 7 1 1 [coup, organe, affection, corps, vapeur, nature, imitation] |
|
65 |
p 32 50106 50267 161 26 19 1 4 [couleur, palette, artiste, effet, tableau, teintes, couleurs, idée, endroit, fois, appréciation, scène, composition, manie, travail, teinte, composé, substances, unes] |
|
66 |
p 33 50268 50319 51 7 7 1 1 [général, harmonie, composition, peintre, effet, pinceau, couleur] |
|
67 |
... [etc.] |
|
68 |
Avec les paramètres : |
|
69 |
- structures : text,div,p |
|
70 |
- structProperties : id,n,n |
|
71 |
- query : [frpos="N.*"] |
|
72 |
- wordProperty : word |
|
73 |
- displayIndex : true |
|
74 |
- Vmax : 20 |
|
75 |
*/ |
|
76 |
|
|
77 |
// Déclarations |
|
78 |
|
|
79 |
import org.kohsuke.args4j.* |
|
80 |
|
|
81 |
import groovy.transform.Field |
|
82 |
|
|
83 |
import org.txm.rcp.swt.widget.parameters.* |
|
84 |
import org.txm.Toolbox |
|
85 |
import org.eclipse.ui.console.* |
|
86 |
import org.txm.macro.cqp.* |
|
87 |
import org.txm.searchengine.cqp.CQPSearchEngine |
|
88 |
import org.txm.searchengine.cqp.corpus.CQPCorpus |
|
89 |
import org.txm.searchengine.cqp.corpus.Partition |
|
90 |
import org.txm.searchengine.cqp.corpus.Property |
|
91 |
import org.txm.searchengine.cqp.corpus.QueryResult |
|
92 |
import org.txm.searchengine.cqp.corpus.Subcorpus; |
|
93 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery |
|
94 |
import org.txm.rcp.commands.* |
|
95 |
import org.txm.statsengine.r.core.RWorkspace |
|
96 |
|
|
97 |
byte CQI_CONST_FIELD_MATCH = (byte) 0x10

def scriptName = this.class.getSimpleName()

// Corpora to work on: the selected corpora, plus the parts of the selected partitions.
def selection = []
for (def s : corpusViewSelections) {
	if (s instanceof CQPCorpus) selection << s
	else if (s instanceof Partition) selection.addAll(s.getParts())
}

if (selection.size() == 0) {
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
	return false
}
println "WORKING WITH $selection"
// BEGINNING OF PARAMETERS

@Field @Option(name="structures", usage="act,scene", widget="String", required=true, def="text,div,p")
def structures
@Field @Option(name="structProperties", usage="n,n", widget="String", required=false, def="id,n,n")
def structProperties
@Field @Option(name="query", usage="[word!='\\p{P}']", widget="String", required=true, def="[pos=\"NOM.*\"|frpos=\"N.*\"]")
def query
@Field @Option(name="wordProperty", usage="word", widget="String", required=true, def="word")
def wordProperty
@Field @Option(name="displayIndex", usage="display a hierarchical index", widget="Boolean", required=true, def="true")
def displayIndex
@Field @Option(name="Vmax", usage="size of index", widget="Integer", required=false, def="20")
def Vmax
// END OF PARAMETERS

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;

def CQI = CQPSearchEngine.getCqiClient()

def corpusStructs = structures.split(",") // ["act", "scene"]
// structProperties is optional: treat an unset value like an empty one instead of failing on trim()
structProperties = (structProperties == null) ? "" : structProperties.trim()

if (structProperties.size() > 0) {
	propParam = true
	corpusStructPropNames = structProperties.split(",") // ["n", "n"]
	// one property per structure is required; transpose() would otherwise silently
	// drop the structures (or properties) in excess
	if (corpusStructPropNames.size() != corpusStructs.size()) {
		println "** $scriptName: structProperties must contain exactly one property per structure: structures=$structures structProperties=$structProperties"
		return false
	}
	// structure name -> structure property name
	corpusStructProps = [corpusStructs, corpusStructPropNames].transpose().collectEntries()
} else {
	propParam = false
}
|
143 |
|
|
144 |
// First define the order theory over corpus structures intervals |
|
145 |
// by defining a binary comparator that will be used to build the |
|
146 |
// TreeSet of intervals |
|
147 |
|
|
148 |
// Prints the hierarchical index of a query: the values of property p over the
// matches of query q in corpus c, counted and listed by decreasing frequency.
// When cut is greater than 0 only the first `cut` entries are printed.
def print_index = { c, q, p, cut ->

	QueryResult hits = c.query(new CQLQuery(q), "RES1", false)
	Subcorpus hitsCorpus = c.createSubcorpus("RES1", hits)
	def property = hitsCorpus.getProperty(p)
	def lastHit = hitsCorpus.getSize() - 1

	// resolve the property name first, then dump the match positions (same order as before)
	def propertyName = property.getQualifiedName()
	def positions = CQI.dumpSubCorpus(hits.getQualifiedCqpId(), CQI_CONST_FIELD_MATCH, 0, lastHit)
	def values = CQI.cpos2Str(propertyName, positions)

	// value -> frequency, most frequent first
	def ranked = values.countBy { it }.sort { -it.value }
	if (cut > 0) {
		println ranked.take(cut)
	} else {
		println ranked
	}

	// the temporary "RES1" subcorpus is only needed for this computation
	hitsCorpus.delete()
}
|
163 |
|
|
164 |
// Prints the statistics of the index of a query on one line:
//   t    = number of matches of query q in corpus c
//   v    = number of distinct values of property p over those matches
//   fmin = lowest frequency of a value, fmax = highest frequency of a value
// followed, when displayIndex is set, by the Vmax most frequent values.
// An all-zero line is printed when the corpus is empty or the query has no match.
def print_freq = { CQPCorpus c, q, p ->

	// call the search engine
	QueryResult qr = c.query(new CQLQuery(q), "RES1", false);
	Subcorpus subcorpus = c.createSubcorpus("RES1", qr);
	try {
		p = subcorpus.getProperty(p)

		// number of matches; an empty corpus cannot have any
		def tC = (c.getSize() == 0) ? 0 : CQI.subCorpusSize(subcorpus.getQualifiedCqpId())
		if (tC == 0) {
			// without this guard the engine would be asked for the position range 0..-1
			// and fmin/fmax would be the null min()/max() of an empty collection
			if (displayIndex) {
				println "0\t0\t0\t0\t[]"
			} else {
				println "0\t0\t0\t0"
			}
			return
		}

		// property value codes of the matching positions
		def matches_target_p = CQI.cpos2Id(p.getQualifiedName(), CQI.dumpSubCorpus(subcorpus.getQualifiedCqpId(), CQI_CONST_FIELD_MATCH, 0, tC-1))

		// compute the frequency of each value code and keep only the frequencies
		def index = matches_target_p.collect { it }.countBy { it }
		def freqs = index.values()

		def v = freqs.size()
		def fmin = freqs.min()
		def fmax = freqs.max()
		print sprintf("%d\t%d\t%d\t%d", tC, v, fmin, fmax)

		// display the most frequent values of the property
		if (displayIndex) {
			def heads = index.sort { -it.value }.take(Vmax).keySet()
			println "\t"+heads.collect { CQI.id2Str(p.getQualifiedName(), it)[0] }
		} else {
			println ""
		}
	} finally {
		// always drop the temporary "RES1" subcorpus, even when an engine call fails
		subcorpus.delete()
	}
}
|
218 |
|
|
219 |
def r = RWorkspace.getRWorkspaceInstance() |
|
220 |
|
|
221 |
/**
 * Selects the units that are entirely covered by a CQP match.
 *
 * Units are sorted by start then end position so that units and matches can be
 * walked together in a single pass (matches are presumably already ordered by
 * position - TODO confirm against the callers).
 *
 * @param allUnites list of [start, end] position pairs
 * @param matches list of objects exposing getStart() and getEnd()
 * @return the units whose [start, end] lies within a match, in sorted order
 */
static def inter(def allUnites, def matches) {
	allUnites = allUnites.sort() { left, right -> left[0] <=> right[0] ?: left[1] <=> right[1] }

	def kept = []
	def unitCount = allUnites.size()
	def matchCount = matches.size()
	def u = 0
	def m = 0

	while (m < matchCount && u < unitCount) {
		def candidate = allUnites[u]
		def span = matches[m]

		// unit ends before the match starts: try the next unit
		if (candidate[1] < span.getStart()) {
			u++
			continue
		}
		// unit starts after the match ends: try the next match
		if (candidate[0] > span.getEnd()) {
			m++
			continue
		}

		// the unit overlaps the match: keep it only when fully included
		def included = span.getStart() <= candidate[0] && candidate[1] <= span.getEnd()
		if (included) {
			kept << candidate
		}
		u++
	}
	return kept
}
|
262 |
|
|
263 |
selection.each { corpus -> |
|
264 |
|
|
265 |
corpusName = corpus.getID() |
|
266 |
mainCorpusName = corpus.getMainCorpus().getID() |
|
267 |
println "Corpus = "+corpusName |
|
268 |
println "Corpus QualifiedCqpId = "+corpus.getCqpId() |
|
269 |
println "MainCorpus = "+mainCorpusName |
|
270 |
println "Corpus QualifiedCqpId = "+corpus.getMainCorpus().getCqpId() |
|
271 |
|
|
272 |
def struct_names = (CQI.corpusStructuralAttributes(corpus.getMainCorpus().getCqpId()) as List) |
|
273 |
struct_names.removeAll { it.contains('_') } |
|
274 |
struct_names=(struct_names-"txmcorpus").grep(corpusStructs) |
|
275 |
//println "struct_names = "+struct_names |
|
276 |
|
|
277 |
if (struct_names.size() == 0) { |
|
278 |
println "** Impossible to find the structures (${corpusStructs}), aborting." |
|
279 |
return |
|
280 |
} |
|
281 |
|
|
282 |
def level = [:] |
|
283 |
|
|
284 |
// Now build the TreeSet of corpus structures intervals |
|
285 |
|
|
286 |
def h = new TreeSet<Struct>() |
|
287 |
|
|
288 |
// For each selected structure: collect the [start, end] positions of all its
// regions in the main corpus, keep the regions included in the matches of the
// selected (sub)corpus and register them in the TreeSet h.
struct_names.each {
	def matches = []
	// half-open range: the inclusive `0..n-1` becomes the reverse range 0..-1
	// (values 0 and -1) when the attribute has no region
	for (i in 0..<CQI.attributeSize("${mainCorpusName}.${it}")) {
		(start, end) = CQI.struc2Cpos("${mainCorpusName}.${it}", i)
		matches << [start, end]
	}
	def intersection = inter(matches, corpus.getMatches())
	for (def item : intersection) {
		h.add(new Struct(it, item[0], item[1]))
	}
}
|
299 |
|
|
300 |
if (propParam) { |
|
301 |
print sprintf("struct\tprop\tstart\tend\tT\tt\tv\tfmin\tfmax") |
|
302 |
} else { |
|
303 |
print sprintf("struct\tstart\tend\tT\tt\tv\tfmin\tfmax") |
|
304 |
} |
|
305 |
|
|
306 |
if (displayIndex) { |
|
307 |
println sprintf("\tindex") |
|
308 |
} else { |
|
309 |
println "" |
|
310 |
} |
|
311 |
|
|
312 |
def env = System.getenv() |
|
313 |
def localPath = env["HOME"]+"/Documents/d3test" |
|
314 |
new File(localPath).mkdirs() |
|
315 |
|
|
316 |
// reset output file |
|
317 |
def resultFile = new File(localPath, "desc-partition.html") |
|
318 |
def result = new PrintWriter(resultFile) |
|
319 |
result.print("") |
|
320 |
result.close() |
|
321 |
|
|
322 |
resultFile << '''\ |
|
323 |
<!DOCTYPE html> |
|
324 |
<html> |
|
325 |
<head> |
|
326 |
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" charset="UTF-8"/> |
|
327 |
<link type="text/css" rel="stylesheet" href="style.css"/> |
|
328 |
<script type="text/javascript" src="d3/d3.v3.js" charset="utf-8"></script> |
|
329 |
<script type="text/javascript" src="d3/layout/partition.js" charset="utf-8"></script> |
|
330 |
<style type="text/css"> |
|
331 |
|
|
332 |
.chart { |
|
333 |
display: block; |
|
334 |
margin: auto; |
|
335 |
margin-top: 60px; |
|
336 |
font-size: 11px; |
|
337 |
} |
|
338 |
|
|
339 |
rect { |
|
340 |
stroke: #eee; |
|
341 |
fill: #aaa; |
|
342 |
fill-opacity: .8; |
|
343 |
} |
|
344 |
|
|
345 |
rect.parent { |
|
346 |
cursor: pointer; |
|
347 |
fill: steelblue; |
|
348 |
} |
|
349 |
|
|
350 |
text { |
|
351 |
pointer-events: none; |
|
352 |
} |
|
353 |
|
|
354 |
</style> |
|
355 |
</head> |
|
356 |
<body> |
|
357 |
<div id="body"> |
|
358 |
<div id="footer"> |
|
359 |
Structures hierarchy |
|
360 |
<div class="hint">click or shift-alt-click to zoom-in or out</div> |
|
361 |
</div> |
|
362 |
</div> |
|
363 |
<script type="text/javascript"> |
|
364 |
|
|
365 |
var w = 1120, |
|
366 |
h = 600, |
|
367 |
x = d3.scale.linear().range([0, w]), |
|
368 |
y = d3.scale.linear().range([0, h]); |
|
369 |
|
|
370 |
var vis = d3.select("#body").append("div") |
|
371 |
.attr("class", "chart") |
|
372 |
.style("width", w + "px") |
|
373 |
.style("height", h + "px") |
|
374 |
.append("svg:svg") |
|
375 |
.attr("width", w) |
|
376 |
.attr("height", h); |
|
377 |
|
|
378 |
var partition = d3.layout.partition() |
|
379 |
.value(function(d) { return d.size; }).sort(null); |
|
380 |
|
|
381 |
var tree = `{''' |
|
382 |
|
|
383 |
// Now iterate on the TreeSet to get a depth first search on the structure intervals |
|
384 |
|
|
385 |
def rec_struct_regex = /([^0-9]+)[0-9]+/ |
|
386 |
|
|
387 |
/* |
|
388 |
"name": "sha-hamlet", |
|
389 |
"children": [ |
|
390 |
{ |
|
391 |
"name": "sha-hamcast", |
|
392 |
"children": [ |
|
393 |
{ |
|
394 |
"name": "sha-ham1", |
|
395 |
"children": [ |
|
396 |
{"name": "sha-ham102", "size": 855}, |
|
397 |
{"name": "sha-ham103", "size": 464}, |
|
398 |
{"name": "sha-ham104", "size": 296}, |
|
399 |
{"name": "sha-ham105", "size": 635} |
|
400 |
] |
|
401 |
} |
|
402 |
] |
|
403 |
} |
|
404 |
] |
|
405 |
}`; |
|
406 |
*/ |
|
407 |
|
|
408 |
def displayTree = { head -> |
|
409 |
if (head) { |
|
410 |
subtree = h.tailSet(head) |
|
411 |
subtree.each { print sprintf("%s[%d, %d], ", it.name, it.start, it.end) } |
|
412 |
println "" |
|
413 |
if (subtree.size() == 0) { |
|
414 |
println sprintf("%s[%d, %d]", head.name, head.start, head.end) |
|
415 |
} else { |
|
416 |
displayTree(subtree) |
|
417 |
} |
|
418 |
} |
|
419 |
} |
|
420 |
|
|
421 |
//displayTree(h.first()) |
|
422 |
|
|
423 |
def divPropVals = [] |
|
424 |
def divLengths = [] |
|
425 |
def textDivPropVals = [] |
|
426 |
def textDivLengths = [] |
|
427 |
|
|
428 |
h.each { |
|
429 |
|
|
430 |
//println sprintf("Displaying %s[%d, %d]", it.name, it.start, it.end) |
|
431 |
if (propParam) { |
|
432 |
|
|
433 |
def rec_match = (it.name =~ rec_struct_regex) |
|
434 |
if (rec_match.size() == 1) { |
|
435 |
println "Rec struct match = "+rec_match[0][1] |
|
436 |
istruct_name = rec_match[0][1] |
|
437 |
} else { |
|
438 |
//println "Struct match = "+it.name |
|
439 |
istruct_name = it.name |
|
440 |
} |
|
441 |
|
|
442 |
def struct_name = "${mainCorpusName}.${istruct_name}_${corpusStructProps[it.name]}" |
|
443 |
def propVal = CQI.struc2Str(struct_name, CQI.cpos2Struc(struct_name, [it.start] as int[]))[0] |
|
444 |
if (it.name == "text") { |
|
445 |
textDivPropVals.push(divPropVals) |
|
446 |
divPropVals = [] |
|
447 |
textDivLengths.push(divLengths) |
|
448 |
divLengths = [] |
|
449 |
} else if (it.name == "div") { |
|
450 |
divPropVals.push(propVal) |
|
451 |
divLengths.push(it.end-it.start) |
|
452 |
} |
|
453 |
|
|
454 |
|
|
455 |
print sprintf("%s\t%s\t%d\t%d\t%d\t", it.name, propVal, it.start, it.end, it.end-it.start) |
|
456 |
} else { |
|
457 |
def struct_name = "${mainCorpusName}.${it.name}" |
|
458 |
print sprintf("%s\t%d\t%d\t%d\t", it.name, it.start, it.end, it.end-it.start) |
|
459 |
} |
|
460 |
print_freq(corpus, sprintf("a:%s :: a>=%d & a<=%d", query, it.start, it.end), wordProperty) |
|
461 |
} |
|
462 |
|
|
463 |
textDivPropVals.push(divPropVals) |
|
464 |
textDivPropVals.remove(0) |
|
465 |
textDivLengths.push(divLengths) |
|
466 |
textDivLengths.remove(0) |
|
467 |
|
|
468 |
println textDivPropVals |
|
469 |
println textDivLengths |
|
470 |
|
|
471 |
def textDivPropVals1 = textDivPropVals[0] as String[] |
|
472 |
r.addVectorToWorkspace("textDivPropVals1", textDivPropVals1) |
|
473 |
def textDivLengths1 = textDivLengths[0] as int[] |
|
474 |
r.addVectorToWorkspace("textDivLengths1", textDivLengths1) |
|
475 |
|
|
476 |
def PNGFile = File.createTempFile("txm", ".png", new File(Toolbox.getTxmHomePath(), "results")) |
|
477 |
def PNGFilePath = PNGFile.getAbsolutePath() |
|
478 |
println "PNG file: "+PNGFilePath |
|
479 |
|
|
480 |
def SVGFile = File.createTempFile("txm", ".svg", new File(Toolbox.getTxmHomePath(), "results")) |
|
481 |
def SVGFilePath = SVGFile.getAbsolutePath() |
|
482 |
println "SVG file: "+SVGFilePath |
|
483 |
|
|
484 |
/// BEGINNING OF R SCRIPT |
|
485 |
def script =""" |
|
486 |
df <- data.frame(structure=textDivPropVals1, |
|
487 |
longueur=textDivLengths1) |
|
488 |
p<-ggplot(data=df, aes(x=structure, y=longueur)) + |
|
489 |
geom_bar(stat="identity", fill="steelblue") + |
|
490 |
geom_text(aes(label=longueur), vjust=1.6, color="white", size=3.5) + |
|
491 |
labs(title="${corpusName}", x="Structure div", y = "Longueur") + |
|
492 |
theme_minimal() |
|
493 |
""" |
|
494 |
/// END OF R SCRIPT |
|
495 |
|
|
496 |
// execute R script |
|
497 |
try { |
|
498 |
r.eval("library(ggplot2)") |
|
499 |
try { |
|
500 |
r.eval(script+"ggsave(file=\"${PNGFilePath}\", plot=p)") |
|
501 |
r.eval(script+"ggsave(file=\"${SVGFilePath}\", plot=p)") |
|
502 |
|
|
503 |
//display the SVG results graphic |
|
504 |
monitor.syncExec(new Runnable() { |
|
505 |
@Override |
|
506 |
public void run() { try { OpenSVGGraph.OpenSVGFile(SVGFilePath, "Longueur des structures de "+corpusName) } catch(Exception e) {e.printStackTrace()} } |
|
507 |
}) |
|
508 |
} catch (Exception e) { |
|
509 |
println "** Error: "+e |
|
510 |
} |
|
511 |
} catch (Exception e) { |
|
512 |
println "** The 'ggplot2' R package is not installed. Start R ("+RWorkspace.getExecutablePath()+") and run 'install.packages(\"ggplot2\");'." |
|
513 |
} |
|
514 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/commands/Struct.groovy (revision 2087) | ||
---|---|---|
1 |
package org.txm.macro.commands; |
|
2 |
|
|
3 |
/**
 * A named corpus structure interval [start, end].
 *
 * The natural order puts an interval before the intervals it contains and
 * before the intervals starting after it, so iterating a sorted collection of
 * Struct visits the structures depth first.
 */
class Struct implements Comparable<Struct> {

	String name
	Integer start
	Integer end

	/**
	 * @param n structure name
	 * @param s start position of the interval
	 * @param e end position of the interval
	 */
	Struct(String n, Integer s, Integer e) {
		name = n
		start = s
		end = e
	}

	/**
	 * Compares two intervals:
	 * the interval starting on the left comes first; with the same start, the
	 * interval ending on the right (the container) comes first; with the same
	 * start and end, the lexicographic order of the structure names is used.
	 *
	 * @param s the interval to compare to
	 * @return a negative value, zero or a positive value when this interval comes
	 *         before, at the same rank as, or after s
	 */
	public int compareTo(Struct s) {
		if (start < s.start) { // covers "self contains s" : [ { } ] and [ { ...
			return -1
		} else if (start > s.start) { // covers "s contains self" : { [ ] } and { [ ...
			return 1
		} else if (end > s.end) { // same start, self ends on the right so it comes before : [{ } ]
			return -1
		} else if (end < s.end) { // same start, self ends on the left so it comes after : [{ ] }
			return 1 // was -1, which made both a < b and b < a hold
		} else { // same start, same end : [{ ]}
			return name.compareTo(s.name) // use the lexicographic order of the structure names
		}
	}

	/**
	 * @return the interval formatted as name[start, end]
	 */
	@Override
	public String toString() {
		return String.format("%s[%d, %d]", name, start, end)
	}

	/**
	 * @param s the interval to format
	 * @return s formatted as name[start, end]
	 */
	public toString(Struct s) {
		s.toString()
	}

	/**
	 * Prints s formatted as name[start, end].
	 *
	 * @param s the interval to print
	 */
	public print(Struct s) {
		print(s.toString())
	}
}
Formats disponibles : Unified diff