Revision 2027
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/Metadata2TeiHeaderMacro.groovy (revision 2027) | ||
---|---|---|
1 |
// STANDARD DECLARATIONS |
|
2 |
package org.txm.macro.urs.democrat |
|
3 |
|
|
4 |
import java.nio.charset.Charset |
|
5 |
|
|
6 |
import org.kohsuke.args4j.* |
|
7 |
import groovy.transform.Field |
|
8 |
|
|
9 |
import org.txm.importer.StaxIdentityParser |
|
10 |
import org.txm.objects.* |
|
11 |
import org.txm.rcp.swt.widget.parameters.* |
|
12 |
import org.txm.searchengine.cqp.corpus.* |
|
13 |
import org.txm.utils.CsvReader |
|
14 |
import org.txm.utils.io.IOUtils |
|
15 |
import org.txm.scripts.importer.XPathResult |
|
16 |
import javax.xml.xpath.XPathConstants |
|
17 |
|
|
18 |
class Metadata2TEiHeaderMacro { |
|
19 |
|
|
20 |
int debug = 0; |
|
21 |
public Metadata2TEiHeaderMacro(int debug) { |
|
22 |
this.debug = debug |
|
23 |
} |
|
24 |
//@Field @Option(name="teiHeaderTemplateFile", usage="the default teiHeader of texts", widget="FileOpen", required=true, def="teiHeaderTemplateFile.xml") |
|
25 |
//def teiHeaderTemplateFile |
|
26 |
// |
|
27 |
//@Field @Option(name="xpathFile", usage="properties file to redirect metadata column to the teiHeader locations", widget="FileOpen", required=true, def="xpathFile.properties") |
|
28 |
//def xpathFile |
|
29 |
// |
|
30 |
//@Field @Option(name="metadataFile", usage="the TSV file containing the metadata values per text", widget="FileOpen", required=true, def="metadataFile.tsv") |
|
31 |
//def metadataFile |
|
32 |
// |
|
33 |
//@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF") |
|
34 |
//debug |
|
35 |
// |
|
36 |
//if (!ParametersDialog.open(this)) return; |
|
37 |
//if (debug == "OFF") debug = 0; else if (debug == "ON") debug = 1; else if (debug == "ALL") debug = 2 else if (debug == "REALLY ALL") debug = 3 |
|
38 |
// |
|
39 |
// |
|
40 |
// |
|
41 |
//def xpathProperties = new Properties() |
|
42 |
//xpathProperties.load(IOUtils.getReader(xpathFile)) |
|
43 |
//println xpathProperties |
|
44 |
// |
|
45 |
//def csvReader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8")) |
|
46 |
//println csvReader |
|
47 |
// |
|
48 |
//csvReader.readHeaders(); |
|
49 |
//def header = csvReader.getHeaders() |
|
50 |
//if (!header.contains("id")) { |
|
51 |
// println "** 'id' column not found in $metadataFile header=$header" |
|
52 |
// return; |
|
53 |
//} |
|
54 |
//if (!header.contains("corpus_id")) { |
|
55 |
// println "** 'corpus_id' column not found in $metadataFile header=$header" |
|
56 |
// return; |
|
57 |
//} |
|
58 |
// |
|
59 |
//def corpora = [:] |
|
60 |
//for (Project project : Workspace.getInstance().getProjects()) { |
|
61 |
// for (MainCorpus corpus : project.getChildren(MainCorpus.class)) { |
|
62 |
// corpora[corpus.getID()] = corpus; |
|
63 |
// } |
|
64 |
//} |
|
65 |
// |
|
66 |
//while (csvReader.readRecord()) { |
|
67 |
// String text_id = csvReader.get("id") |
|
68 |
// if (text_id == null || text_id.length() == 0) { |
|
69 |
// //println "** @id not found for record="+csvReader.getRawRecord() |
|
70 |
// continue; |
|
71 |
// } |
|
72 |
// |
|
73 |
// String corpus_id = csvReader.get("corpus_id") |
|
74 |
// if (corpus_id == null || corpus_id.length() == 0) { |
|
75 |
// println "** @corpus_id not found for record="+csvReader.getRawRecord() |
|
76 |
// continue; |
|
77 |
// } |
|
78 |
// |
|
79 |
// MainCorpus corpus = corpora[corpus_id] |
|
80 |
// if (corpus == null) { |
|
81 |
// println "** no corpus found for ID=$corpus_id" |
|
82 |
// continue; |
|
83 |
// } |
|
84 |
// def project = corpus.getProject() |
|
85 |
// |
|
86 |
// def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID()) |
|
87 |
// |
|
88 |
// if (!txmDir.exists()) { |
|
89 |
// println "** the selected corpus has no XML-TXM files. Aborting." |
|
90 |
// } |
|
91 |
// |
|
92 |
// |
|
93 |
// File txmFile = new File(txmDir, text_id+".xml") |
|
94 |
// if (txmFile.exists()) { |
|
95 |
// |
|
96 |
// def data = [:] |
|
97 |
// |
|
98 |
// for (def h : header) data[h] = csvReader.get(h) |
|
99 |
// |
|
100 |
// String xmlteiHeaderContent = getCustomizedTEiHeader(teiHeaderTemplateFile, data, xpathProperties); |
|
101 |
// if (xmlteiHeaderContent != null && xmlteiHeaderContent.length() > 0) { |
|
102 |
// injecting(txmFile, xmlteiHeaderContent) |
|
103 |
// } else { |
|
104 |
// println "** Text header not updated: $txmFile" |
|
105 |
// } |
|
106 |
// } else { |
|
107 |
// println "** Text not found: $txmFile" |
|
108 |
// } |
|
109 |
//} |
|
110 |
|
|
111 |
def getCustomizedTEiHeader(File teiHeaderTemplateFile, def data, Properties xpathProperties) { |
|
112 |
XPathResult xpathProcessor = new XPathResult(teiHeaderTemplateFile); |
|
113 |
for (String info : data.keySet()) { |
|
114 |
String xpath = xpathProperties[info]; |
|
115 |
String value = data[info] |
|
116 |
if (xpath == null) { |
|
117 |
continue; // not a data to inject |
|
118 |
} |
|
119 |
if (debug > 1) println " injecting '$info'='$value' in '$xpath'" |
|
120 |
|
|
121 |
def expr = xpathProcessor.xpath.compile(xpath); |
|
122 |
def first = expr.evaluate(xpathProcessor.doc.getDocumentElement(), XPathConstants.NODE); |
|
123 |
if (first != null) { |
|
124 |
switch (first.getNodeType()) { |
|
125 |
case 1: // element |
|
126 |
if (debug > 1) println " patching: $info with "+value |
|
127 |
def newChild = xpathProcessor.doc.createElement(first.getTagName()) |
|
128 |
newChild.appendChild(xpathProcessor.doc.createTextNode(value)) |
|
129 |
first.getParentNode().replaceChild(newChild, first) |
|
130 |
break; |
|
131 |
case 2: // attribute |
|
132 |
if (debug > 1) println " patching attribute: $info with "+value |
|
133 |
first.setNodeValue(value) |
|
134 |
break; |
|
135 |
case 3: // text |
|
136 |
if (debug > 1) println " patching text: $info with "+value |
|
137 |
first.setNodeValue(value) |
|
138 |
break; |
|
139 |
default: |
|
140 |
break |
|
141 |
} |
|
142 |
} else { |
|
143 |
println "** not found $xpath" |
|
144 |
} |
|
145 |
} |
|
146 |
|
|
147 |
// return xpathProcessor.doc.getElementsByTagName("teiHeader").item(0).getNodeValue(); |
|
148 |
//println "DOC="+xpathProcessor.doc.getDocumentElement().getElementsByTagName("teiHeader").item(0).toString() |
|
149 |
String content = ""+xpathProcessor.doc.getDocumentElement().getElementsByTagName("teiHeader").item(0) |
|
150 |
content = content.substring('<?xml version="1.0" encoding="UTF-8"?>'.length()) |
|
151 |
} |
|
152 |
|
|
153 |
/** |
|
154 |
* Replace the teiHeader |
|
155 |
* @param txmFile |
|
156 |
* @param data |
|
157 |
* @param paths |
|
158 |
* @param xmlteiHeaderContent |
|
159 |
* @return |
|
160 |
*/ |
|
161 |
def injecting(File txmFile, String xmlteiHeaderContent) { |
|
162 |
println "Editing: $txmFile..." |
|
163 |
StaxIdentityParser sparser = new StaxIdentityParser(txmFile) { |
|
164 |
boolean start = false; |
|
165 |
public void processStartElement() { |
|
166 |
|
|
167 |
if (localname == "teiHeader") { |
|
168 |
start = true; |
|
169 |
if (debug > 2) println " replacing teiHeader" |
|
170 |
if (debug > 3) println " with $xmlteiHeaderContent" |
|
171 |
output.write(xmlteiHeaderContent.getBytes(Charset.forName("UTF-8"))); // REPLACE CONTENT ! |
|
172 |
} |
|
173 |
if (!start) { |
|
174 |
super.processStartElement(); |
|
175 |
} |
|
176 |
} |
|
177 |
|
|
178 |
public void processEndElement() { |
|
179 |
if (!start) { |
|
180 |
super.processEndElement(); |
|
181 |
} |
|
182 |
if (localname == "teiHeader") { |
|
183 |
start = false; |
|
184 |
if (debug > 2) println " replace done" |
|
185 |
} |
|
186 |
} |
|
187 |
|
|
188 |
public void processCharacters() { |
|
189 |
if (!start) { |
|
190 |
super.processCharacters(); |
|
191 |
} |
|
192 |
} |
|
193 |
|
|
194 |
public void processComment() { |
|
195 |
if (!start) { |
|
196 |
super.processComment(); |
|
197 |
} |
|
198 |
} |
|
199 |
} |
|
200 |
|
|
201 |
File tmpDirectory = new File("/home/mdecorde/TEMP"); |
|
202 |
File outfile = new File(tmpDirectory, "tmp_"+txmFile.getName()) |
|
203 |
File copyFile = new File(tmpDirectory, "copy_"+txmFile.getName()) |
|
204 |
|
|
205 |
if (sparser.process(outfile)) { // replace inputFile |
|
206 |
if (txmFile.renameTo(copyFile)) { |
|
207 |
if (outfile.renameTo(txmFile)) { |
|
208 |
println " -> SUCCESS see $txmFile" |
|
209 |
return true |
|
210 |
} else { |
|
211 |
println " -> FAIL could not replace $txmFile" |
|
212 |
println " -> see result in $outfile" |
|
213 |
|
|
214 |
} |
|
215 |
} else { |
|
216 |
println " -> FAIL could not make a copy of $txmFile in $copyFile" |
|
217 |
println " -> see result in $outfile" |
|
218 |
} |
|
219 |
|
|
220 |
} else { |
|
221 |
println " -> FAIL see $outfile" |
|
222 |
} |
|
223 |
|
|
224 |
return false; |
|
225 |
} |
|
226 |
} |
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/ComputeDemocratMeasureMetadataMacro.groovy (revision 2027) | ||
---|---|---|
1 |
package org.txm.macro.urs.democrat |
|
2 |
|
|
3 |
import org.txm.searchengine.cqp.corpus.MainCorpus |
|
4 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery |
|
5 |
import visuAnalec.donnees.Corpus |
|
6 |
|
|
7 |
/**
 * Computes the measures of a corpus (number of words, of chains, of referring expressions)
 * from its CQP content and its URS (Analec) annotations.
 */
class ComputeDemocratMeasureMetadataMacro {

	/**
	 * Computes the measures of a corpus.
	 *
	 * @param corpus the corpus to measure
	 * @return a map measure name -> value with the keys "number of words",
	 *         "number of coreference chains" (all the CHAINE schemas),
	 *         "number of referring chains" (CHAINE schemas with at least 3 underlying units) and
	 *         "number of referring expressions" (MENTION units)
	 */
	def getStats(MainCorpus corpus) {
		def data = [:]

		// the "[]" CQL query matches every token of the corpus
		data["number of words"] = corpus.query(new CQLQuery("[]"), "TMP", false).getNMatch()

		// fully qualified name: this file has no import for URSCorpora
		Corpus analecCorpus = org.txm.annotation.urs.URSCorpora.getCorpus(corpus)
		def chaines = analecCorpus.getSchemas("CHAINE")
		def refchaines = chaines.findAll { chaine -> chaine.getUnitesSousjacentes().length >= 3 }

		data["number of coreference chains"] = chaines.size()
		data["number of referring chains"] = refchaines.size()
		data["number of referring expressions"] = analecCorpus.getUnites("MENTION").size()

		// explicit return: without it the method returned the value of the last assignment
		return data
	}

	/**
	 * Placeholder: does nothing yet.
	 *
	 * @param corpus the corpus whose teiHeader should be updated
	 */
	def updateTeiHeader(MainCorpus corpus) {

	}
}
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/BuildDemocratCorpusVersion1Macro.groovy (revision 2027) | ||
---|---|---|
1 |
package org.txm.macro.urs.democrat |
|
2 |
|
|
3 |
import java.io.File |
|
4 |
import java.nio.charset.Charset |
|
5 |
|
|
6 |
import org.eclipse.core.runtime.IProgressMonitor |
|
7 |
import org.kohsuke.args4j.* |
|
8 |
import groovy.transform.Field |
|
9 |
import visuAnalec.donnees.Corpus |
|
10 |
import visuAnalec.vue.Vue |
|
11 |
|
|
12 |
import org.txm.annotation.urs.URSCorpora |
|
13 |
import org.txm.importer.StaxIdentityParser |
|
14 |
import org.txm.objects.* |
|
15 |
import org.txm.rcp.commands.workspace.LoadBinaryCorpus |
|
16 |
import org.txm.rcp.swt.widget.parameters.* |
|
17 |
import org.txm.rcp.utils.JobHandler |
|
18 |
import org.txm.searchengine.cqp.corpus.* |
|
19 |
import org.txm.utils.CsvReader |
|
20 |
import org.txm.utils.io.IOUtils |
|
21 |
import org.txm.scripts.importer.XPathResult |
|
22 |
import javax.xml.xpath.XPathConstants |
|
23 |
import org.txm.annotation.urs.commands.* |
|
24 |
|
|
25 |
// Builds the version 1 of the Democrat corpora:
// for each record of <inputDirectory>/metadata.tsv, the corpus named in the "corpus_id" column is
// loaded if necessary (from <inputDirectory>/<corpus_id>.txm), the teiHeader of the text named in
// the "id" column is rebuilt from <inputDirectory>/teiHeader.xml + <inputDirectory>/metadata.properties,
// then every corpus that had at least one text processed is exported in outputDirectory.

// NOTE(review): both directory parameters use the "FileOpen" widget - confirm that this widget
// allows selecting a directory
@Field @Option(name="inputDirectory", usage="Directory with the .txm files, teiHeader.xml, metadata.properties and metadata.tsv", widget="FileOpen", required=true, def="input directory path")
def inputDirectory

@Field @Option(name="outputDirectory", usage="Directory receiving the exported corpora", widget="FileOpen", required=true, def="output directory path")
def outputDirectory

// NOTE(review): "REALLY ALL" contains a space, so the metaVar values are presumably separated by
// tabulations in the real file - verify before applying
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF")
def debug

if (!ParametersDialog.open(this)) return;

// the dialog gives a label, Metadata2TEiHeaderMacro needs an int: unknown labels fall back to 0
def debugLevels = ["OFF":0, "ON":1, "ALL":2, "REALLY ALL":3]
debug = (debug instanceof Number) ? debug.intValue() : (debugLevels[debug?.toString()] ?: 0)

def teiHeaderTemplateFile = new File(inputDirectory, "teiHeader.xml")
def xpathFile = new File(inputDirectory, "metadata.properties")
def metadataFile = new File(inputDirectory, "metadata.tsv")

if (!inputDirectory.exists()) {
	println "no inputDirectory found: $inputDirectory"
	return false
}

outputDirectory.mkdirs()
if (!outputDirectory.exists()) {
	println "outputDirectory not created: $outputDirectory"
	return false
}

if (!teiHeaderTemplateFile.exists()) {
	println "no teiHeader template found: $teiHeaderTemplateFile"
	return false
}

if (!xpathFile.exists()) {
	println "no xpath properties found: $xpathFile"
	return false
}

if (!metadataFile.exists()) {
	println "no metadata TSV found: $metadataFile"
	return false
}

// metadata column name -> XPath of the teiHeader node receiving the value
def xpathProperties = new Properties()
def xpathReader = IOUtils.getReader(xpathFile)
try {
	xpathProperties.load(xpathReader)
} finally {
	xpathReader.close()
}
println xpathProperties

def csvReader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
println csvReader

// corpora with at least one processed text, in order of appearance; exported after the loop
def corporaToExport = [:]

try {
	csvReader.readHeaders()
	def header = csvReader.getHeaders()
	if (!header.contains("id")) {
		println "** 'id' column not found in $metadataFile header=$header"
		return false
	}
	if (!header.contains("corpus_id")) {
		println "** 'corpus_id' column not found in $metadataFile header=$header"
		return false
	}

	// corpora already loaded in the workspace, by ID
	def corpora = [:]
	for (Project project : Workspace.getInstance().getProjects()) {
		for (MainCorpus corpus : project.getChildren(MainCorpus.class)) {
			corpora[corpus.getID()] = corpus
		}
	}

	Metadata2TEiHeaderMacro mthm = new Metadata2TEiHeaderMacro(debug)

	while (csvReader.readRecord()) {
		String text_id = csvReader.get("id")
		if (text_id == null || text_id.length() == 0) {
			println "** @id not found for record="+csvReader.getRawRecord()
			continue
		}

		String corpus_id = csvReader.get("corpus_id")
		if (corpus_id == null || corpus_id.length() == 0) {
			println "** @corpus_id not found for record="+csvReader.getRawRecord()
			continue
		}

		MainCorpus corpus = corpora[corpus_id]
		Project project = null
		if (corpus == null) {
			File binCorpusFile = new File(inputDirectory, corpus_id+".txm")
			if (!binCorpusFile.exists()) {
				println "** no corpus binary file found for ID=$corpus_id : $binCorpusFile"
				continue
			}
			println "CALL load corpus from $binCorpusFile"
			JobHandler job = LoadBinaryCorpus.loadBinaryCorpusArchive(binCorpusFile)
			job.join()
			project = job.getResultObject()
			if (project == null) {
				println "** the corpus could not be loaded from $binCorpusFile"
				continue
			}
			corpus = project.getCorpusBuild(corpus_id)
			if (corpus == null) {
				println "** no corpus with ID=$corpus_id found in $binCorpusFile"
				continue
			}
			corpora[corpus.getID()] = corpus
		} else {
			println "USING loaded corpus: "+corpus
			project = corpus.getProject()
		}

		def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID())

		if (!txmDir.exists()) {
			println "** the selected corpus has no XML-TXM files. Aborting."
			continue
		}

		File txmFile = new File(txmDir, text_id+".xml")
		if (!txmFile.exists()) {
			println "** the selected corpus has no XML-TXM file: $txmFile"
			continue
		}

		println "CALL Metadata2TeiHeaderMacro"
		// metadata of the current record, by column name ('data' was never declared before)
		def data = [:]
		for (def h : header) data[h] = csvReader.get(h)

		String xmlteiHeaderContent = mthm.getCustomizedTEiHeader(teiHeaderTemplateFile, data, xpathProperties)
		if (xmlteiHeaderContent != null && xmlteiHeaderContent.length() > 0) {
			mthm.injecting(txmFile, xmlteiHeaderContent)
		} else {
			println "** Text header not updated: $txmFile"
		}

		println "CALL ComputeDemocratMeasureMetadataMacro"

		corporaToExport[corpus.getID()] = corpus
	}
} finally {
	csvReader.close()
}

// one export per corpus, once all its texts are processed (instead of one export per record)
for (MainCorpus corpus : corporaToExport.values()) {
	println "CALL URS Export"
	Corpus analecCorpus = URSCorpora.getCorpus(corpus)
	Vue vue = URSCorpora.getVue(corpus)
	if (!ExportTEICorpus.export(true, outputDirectory, true, null, corpus, analecCorpus, vue)) {
		println "FAIL TO EXPORT CORPUS$corpus"
	}
}
Also available in: Unified diff