root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / democrat / BuildDemocratCorpusVersion1Macro.groovy @ 2027
History | View | Annotate | Download (4.9 kB)
1 |
package org.txm.macro.urs.democrat
|
---|---|
2 |
|
3 |
import java.io.File |
4 |
import java.nio.charset.Charset |
5 |
|
6 |
import org.eclipse.core.runtime.IProgressMonitor |
7 |
import org.kohsuke.args4j.* |
8 |
import groovy.transform.Field |
9 |
import visuAnalec.donnees.Corpus |
10 |
import visuAnalec.vue.Vue |
11 |
|
12 |
import org.txm.annotation.urs.URSCorpora |
13 |
import org.txm.importer.StaxIdentityParser |
14 |
import org.txm.objects.* |
15 |
import org.txm.rcp.commands.workspace.LoadBinaryCorpus |
16 |
import org.txm.rcp.swt.widget.parameters.* |
17 |
import org.txm.rcp.utils.JobHandler |
18 |
import org.txm.searchengine.cqp.corpus.* |
19 |
import org.txm.utils.CsvReader |
20 |
import org.txm.utils.io.IOUtils |
21 |
import org.txm.scripts.importer.XPathResult |
22 |
import javax.xml.xpath.XPathConstants |
23 |
import org.txm.annotation.urs.commands.* |
24 |
|
25 |
// ---------------------------------------------------------------------------
// Macro parameters. Each @Field/@Option pair is filled in by the parameters
// dialog opened below.
// ---------------------------------------------------------------------------

// Directory the macro reads from: the script looks there for teiHeader.xml,
// metadata.properties, metadata.tsv and one <corpus_id>.txm archive per corpus.
@Field @Option(name="inputDirectory", usage="Directory with the .txm corpus files, teiHeader.xml, metadata.tsv and metadata.properties", widget="FileOpen", required=true, def="input directory path")
def inputDirectory

// Directory handed to the TEI export; it is created further down if it does not exist.
@Field @Option(name="outputDirectory", usage="Directory where the exported TEI corpus is written (created if missing)", widget="FileOpen", required=true, def="output directory path")
def outputDirectory

// Verbosity selector. The dialog delivers one of the labels listed in metaVar;
// it is converted to a number right after the dialog is validated.
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF")
def debug

// Stop silently when the user cancels the dialog.
if (!ParametersDialog.open(this)) return;

// Map the textual level onto 0..3. Any other value is left untouched, exactly
// as before, so downstream code receives whatever the dialog produced.
if (debug == "OFF") {
	debug = 0
} else if (debug == "ON") {
	debug = 1
} else if (debug == "ALL") {
	debug = 2
} else if (debug == "REALLY ALL") {
	debug = 3
}
36 |
|
37 |
// Resources the macro expects to find directly inside the input directory.
def teiHeaderTemplateFile = new File(inputDirectory, "teiHeader.xml")
def xpathFile = new File(inputDirectory, "metadata.properties")
def metadataFile = new File(inputDirectory, "metadata.tsv")

// The input directory itself must already exist.
if (inputDirectory.exists() == false) {
	println "no inputDirectory found: $inputDirectory"
	return false
}

// The output directory is created on demand; give up if that did not work.
outputDirectory.mkdirs()
if (outputDirectory.exists() == false) {
	println "outputDirectory not created: $outputDirectory"
	return false
}

// Each required file is paired with the message printed when it is missing.
// The checks run in the listed order and the first missing file ends the macro.
def requiredFiles = [
	[teiHeaderTemplateFile, "no teiHeader template found: $teiHeaderTemplateFile"],
	[xpathFile, "no xpath properties found: $xpathFile"],
	[metadataFile, "no metadata TSV found: $metadataFile"]
]
for (def requirement : requiredFiles) {
	if (requirement[0].exists() == false) {
		println requirement[1]
		return false
	}
}
66 |
|
67 |
// Load metadata.properties; the resulting Properties object is later passed to
// Metadata2TEiHeaderMacro.getCustomizedTEiHeader together with each record.
def xpathProperties = new Properties()
def xpathReader = IOUtils.getReader(xpathFile)
try {
	xpathProperties.load(xpathReader)
} finally {
	// Properties.load does not close its source, so release the file handle here.
	xpathReader.close()
}
println xpathProperties

// Open the metadata table: tab-separated, UTF-8.
def csvReader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
println csvReader

// The header row must provide the two columns the main loop reads for every record.
csvReader.readHeaders()
def header = csvReader.getHeaders()
for (String requiredColumn : ["id", "corpus_id"]) {
	if (!header.contains(requiredColumn)) {
		println "** '$requiredColumn' column not found in $metadataFile header=$header"
		// Nothing else will read the table: close it before leaving.
		csvReader.close()
		// Same failure value as the other validation steps of this macro.
		return false
	}
}
84 |
|
85 |
// Index every main corpus of every workspace project by its ID, so the main
// loop can tell whether a corpus named in the metadata table is already loaded.
def corpora = [:]
Workspace.getInstance().getProjects().each { Project loadedProject ->
	loadedProject.getChildren(MainCorpus.class).each { MainCorpus loadedCorpus ->
		corpora[loadedCorpus.getID()] = loadedCorpus
	}
}
91 |
|
92 |
// Main loop: one metadata record describes one text. For each record the macro
//   1. resolves (or loads from <corpus_id>.txm) the corpus the text belongs to,
//   2. builds a teiHeader from the record and injects it into the text's XML-TXM file,
//   3. runs the URS TEI export of that corpus into outputDirectory.
// A record that cannot be processed is reported and skipped; the loop goes on.
try {
	while (csvReader.readRecord()) {
		String text_id = csvReader.get("id")
		if (text_id == null || text_id.length() == 0) {
			println "** @id not found for record="+csvReader.getRawRecord()
			continue
		}

		String corpus_id = csvReader.get("corpus_id")
		if (corpus_id == null || corpus_id.length() == 0) {
			println "** @corpus_id not found for record="+csvReader.getRawRecord()
			continue
		}

		MainCorpus corpus = corpora[corpus_id]
		Project project = null
		if (corpus == null) {
			// Not in the workspace yet: load it from the binary archive of the input directory.
			File binCorpusFile = new File(inputDirectory, corpus_id+".txm")
			if (!binCorpusFile.exists()) {
				println "** no corpus binary file found for ID=$corpus_id : $binCorpusFile"
				continue
			}
			println "CALL load corpus from $binCorpusFile"
			JobHandler job = LoadBinaryCorpus.loadBinaryCorpusArchive(binCorpusFile)
			job.join()
			project = job.getResultObject()
			if (project == null) {
				// Guard against a failed load instead of hitting a NullPointerException below.
				println "** corpus binary file could not be loaded for ID=$corpus_id : $binCorpusFile"
				continue
			}
			corpus = project.getCorpusBuild(corpus_id)
			if (corpus == null) {
				println "** no corpus with ID=$corpus_id found in loaded binary file: $binCorpusFile"
				continue
			}
			// Remember it so the following records of the same corpus reuse it.
			corpora[corpus.getID()] = corpus
		} else {
			println "USING loaded corpus: "+corpus
			project = corpus.getProject()
		}

		def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID())

		if (!txmDir.exists()) {
			println "** the selected corpus has no XML-TXM files. Aborting."
			continue
		}

		File txmFile = new File(txmDir, text_id+".xml")
		if (!txmFile.exists()) {
			println "** the selected corpus has no XML-TXM file: $txmFile"
			continue
		}

		println "CALL Metadata2TeiHeaderMacro"
		Metadata2TEiHeaderMacro mthm = new Metadata2TEiHeaderMacro(debug)

		// Column name -> value of the current record. This map was never declared
		// before, which made the assignment below fail with MissingPropertyException.
		def data = [:]
		for (def h : header) data[h] = csvReader.get(h)

		String xmlteiHeaderContent = mthm.getCustomizedTEiHeader(teiHeaderTemplateFile, data, xpathProperties)
		if (xmlteiHeaderContent != null && xmlteiHeaderContent.length() > 0) {
			mthm.injecting(txmFile, xmlteiHeaderContent)
		} else {
			println "** Text header not updated: $txmFile"
		}

		// Only announced for now: no measure computation is invoked here.
		println "CALL ComputeDemocratMeasureMetadataMacro"

		// NOTE(review): the export runs once per metadata record, so a corpus with
		// several texts is exported several times — confirm this is intended.
		println "CALL URS Export"
		Corpus analecCorpus = URSCorpora.getCorpus(corpus)
		Vue vue = URSCorpora.getVue(corpus)
		if (!ExportTEICorpus.export(true, outputDirectory, true, null, corpus, analecCorpus, vue)) {
			println "FAIL TO EXPORT CORPUS$corpus"
		}
	}
} finally {
	// Release the metadata table whatever happened while processing the records.
	csvReader.close()
}