Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / democrat / BuildDemocratCorpusVersion1Macro.groovy @ 2027

History | View | Annotate | Download (4.9 kB)

1
package org.txm.macro.urs.democrat
2

    
3
import java.io.File
4
import java.nio.charset.Charset
5

    
6
import org.eclipse.core.runtime.IProgressMonitor
7
import org.kohsuke.args4j.*
8
import groovy.transform.Field
9
import visuAnalec.donnees.Corpus
10
import visuAnalec.vue.Vue
11

    
12
import org.txm.annotation.urs.URSCorpora
13
import org.txm.importer.StaxIdentityParser
14
import org.txm.objects.*
15
import org.txm.rcp.commands.workspace.LoadBinaryCorpus
16
import org.txm.rcp.swt.widget.parameters.*
17
import org.txm.rcp.utils.JobHandler
18
import org.txm.searchengine.cqp.corpus.*
19
import org.txm.utils.CsvReader
20
import org.txm.utils.io.IOUtils
21
import org.txm.scripts.importer.XPathResult
22
import javax.xml.xpath.XPathConstants
23
import org.txm.annotation.urs.commands.*
24

    
25
// ---- Macro parameters -------------------------------------------------------
// Each @Field/@Option pair declares a script-level field that
// ParametersDialog.open(this) is asked to fill in below.

// Directory the macro reads from: teiHeader.xml, metadata.properties,
// metadata.tsv and the "<corpus_id>.txm" binary corpus files are looked up here.
@Field @Option(name="inputDirectory", usage="Directory with .txm files, metadata.xslx, metadata.properties...", widget="FileOpen", required=true, def="input directory path")
def inputDirectory

// Directory created if needed and handed to ExportTEICorpus.export().
// NOTE(review): the usage text below talks about a teiHeader and looks
// copy-pasted from another parameter — confirm and reword.
@Field @Option(name="outputDirectory", usage="the default teiHeader of texts", widget="FileOpen", required=true, def="output directory path")
def outputDirectory

// Verbosity label chosen by the user; converted to an integer level below.
// The declaration needs "def": without it the annotations have no declaration
// to attach to and the script does not compile.
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF        ON        ALL        REALLY ALL", required=true, def="OFF")
def debug

// Stop silently when the parameters dialog reports failure/cancellation.
if (!ParametersDialog.open(this)) return;

// Map the label to a numeric level (OFF=0, ON=1, ALL=2, REALLY ALL=3).
// Any other value is left untouched.
if (debug == "OFF") {
    debug = 0
} else if (debug == "ON") {
    debug = 1
} else if (debug == "ALL") {
    debug = 2
} else if (debug == "REALLY ALL") {
    debug = 3
}
36

    
37
// The three resources this macro expects inside inputDirectory.
def teiHeaderTemplateFile = new File(inputDirectory, "teiHeader.xml")
def xpathFile = new File(inputDirectory, "metadata.properties")
def metadataFile = new File(inputDirectory, "metadata.tsv")

// Prints the reason and yields the script's failure result (false), so every
// pre-flight check below reads as a single guard line.
def stopWith = { reason ->
    println reason
    false
}

// Pre-flight checks, in order: input directory, output directory (created on
// demand), TEI header template, XPath properties, metadata table.
if (!inputDirectory.exists()) return stopWith("no inputDirectory found: $inputDirectory")

outputDirectory.mkdirs()
if (!outputDirectory.exists()) return stopWith("outputDirectory not created: $outputDirectory")

if (!teiHeaderTemplateFile.exists()) return stopWith("no teiHeader template found: $teiHeaderTemplateFile")

if (!xpathFile.exists()) return stopWith("no xpath properties found: $xpathFile")

if (!metadataFile.exists()) return stopWith("no metadata TSV found: $metadataFile")
66

    
67
// Load metadata.properties. Properties.load() leaves the reader open, so it is
// closed explicitly once loading is done (or has failed).
def xpathProperties = new Properties()
def xpathReader = IOUtils.getReader(xpathFile)
try {
    xpathProperties.load(xpathReader)
} finally {
    xpathReader.close()
}
println xpathProperties

// Open the metadata table as a tab-separated, UTF-8 encoded file.
def csvReader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
println csvReader

// The header row must provide both the text identifier column and the corpus
// identifier column; otherwise the macro stops here.
csvReader.readHeaders();
def header = csvReader.getHeaders()
for (String requiredColumn : ["id", "corpus_id"]) {
    if (!header.contains(requiredColumn)) {
        println "** '$requiredColumn' column not found in $metadataFile header=$header"
        // Release the metadata file before leaving the script early.
        csvReader.close()
        return;
    }
}
84

    
85
// Index, by corpus ID, every MainCorpus found in the projects of the current
// workspace, so records that reference one of them can reuse it instead of
// loading a binary corpus file.
def corpora = [:]
Workspace.getInstance().getProjects().each { Project loadedProject ->
    loadedProject.getChildren(MainCorpus.class).each { MainCorpus loadedCorpus ->
        corpora.put(loadedCorpus.getID(), loadedCorpus)
    }
}
91

    
92
// Main loop: one metadata record per text. For each record the script
//   1. resolves the corpus named by "corpus_id" (already in the workspace, or
//      loaded from "<corpus_id>.txm" in inputDirectory),
//   2. locates the text's XML-TXM file "txm/<corpus id>/<id>.xml" in the
//      project directory,
//   3. builds a TEI header from the record and injects it into that file,
//   4. exports the corpus to outputDirectory.
// A record that cannot be processed is reported with a "** ..." message and
// skipped. The reader is closed once the loop ends, whatever the outcome.
try {
    while (csvReader.readRecord()) {
        String text_id = csvReader.get("id")
        if (text_id == null || text_id.length() == 0) {
            println "** @id not found for record="+csvReader.getRawRecord()
            continue
        }

        String corpus_id = csvReader.get("corpus_id")
        if (corpus_id == null || corpus_id.length() == 0) {
            println "** @corpus_id not found for record="+csvReader.getRawRecord()
            continue
        }

        MainCorpus corpus = corpora[corpus_id]
        Project project = null
        if (corpus == null) {
            // Corpus not indexed yet: load it from its binary file.
            File binCorpusFile = new File(inputDirectory, corpus_id+".txm")
            if (!binCorpusFile.exists()) {
                println "** no corpus binary file found for ID=$corpus_id : $binCorpusFile"
                continue
            }
            println "CALL load corpus from $binCorpusFile"
            JobHandler job = LoadBinaryCorpus.loadBinaryCorpusArchive(binCorpusFile)
            job.join()
            project = job.getResultObject()
            if (project == null) {
                // The load job produced no project: skip instead of failing on null.
                println "** corpus not loaded from binary file: $binCorpusFile"
                continue
            }
            corpus = project.getCorpusBuild(corpus_id)
            if (corpus == null) {
                println "** no corpus with ID=$corpus_id found in the project loaded from $binCorpusFile"
                continue
            }
            // Remember it so later records of the same corpus do not reload it.
            corpora[corpus.getID()] = corpus
        } else {
            println "USING loaded corpus: "+corpus
            project = corpus.getProject()
        }

        def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID())

        if (!txmDir.exists()) {
            // The message says "Aborting" but only this record is skipped.
            println "** the selected corpus has no XML-TXM files. Aborting."
            continue
        }

        File txmFile = new File(txmDir, text_id+".xml")
        if (!txmFile.exists()) {
            println "** the selected corpus has no XML-TXM file: $txmFile"
            continue
        }

        println "CALL Metadata2TeiHeaderMacro"
        Metadata2TEiHeaderMacro mthm = new Metadata2TEiHeaderMacro(debug)
        // Column name -> value for the current record. Declared per record:
        // the map was previously never declared, so the first assignment
        // failed with a MissingPropertyException.
        def data = [:]
        for (def h : header) data[h] = csvReader.get(h)

        String xmlteiHeaderContent = mthm.getCustomizedTEiHeader(teiHeaderTemplateFile, data, xpathProperties)
        if (xmlteiHeaderContent != null && xmlteiHeaderContent.length() > 0) {
            mthm.injecting(txmFile, xmlteiHeaderContent)
        } else {
            println "** Text header not updated: $txmFile"
        }

        // Only announced: no measure computation is invoked here.
        println "CALL ComputeDemocratMeasureMetadataMacro"

        // NOTE(review): the export runs for every record, so a corpus with
        // several texts is exported once per text — confirm this is intended.
        println "CALL URS Export"
        Corpus analecCorpus = URSCorpora.getCorpus(corpus)
        Vue vue = URSCorpora.getVue(corpus)
        if (!ExportTEICorpus.export(true, outputDirectory, true, null, corpus, analecCorpus, vue)) {
            println "FAIL TO EXPORT CORPUS$corpus"
        }
    }
} finally {
    // Release the metadata file handle.
    csvReader.close()
}