Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZCompiler.groovy @ 2288

History | View | Annotate | Download (8.9 kB)

1
package org.txm.scripts.importer.xtz
2

    
3
import java.io.File;
4

    
5
import org.txm.*
6
import org.txm.scripts.importer.SAttributesListener
7
import org.txm.importer.cwb.*
8
import org.txm.utils.ConsoleProgressBar
9
import org.txm.utils.logger.Log
10
import org.txm.core.preferences.TXMPreferences
11
import org.txm.core.preferences.TBXPreferences
12
import org.txm.libs.cqp.CQPLibPreferences
13

    
14
import javax.xml.stream.*
15
import org.txm.importer.xtz.*
16
import org.txm.objects.*
17
import org.txm.searchengine.cqp.corpus.*
18

    
19
class XTZCompiler extends Compiler {
20

    
21
        // Assigned by doScanStep() from SAttributesListener.scanFiles(); doCWBEncodeStep() reads its
        // getStructs() and getProfs() results to declare the structures.
        SAttributesListener sattrsListener; // store scanned structures
22
        // Filled by getAnaTypes() with the @type value of the <ana> elements (first character dropped);
        // each entry is passed to cwb-encode as a word property and to every XTZCompilerStep.
        private def anatypes = new HashSet<String>() // store scanned word attributes
23

    
24
        // <binary directory>/registry/<corpus name in lower case>, computed in the constructor
        String regPath;
25
        // Name of the project, read in the constructor
        String corpusname;
26
        // Local name of the word element, taken from the project's tokenizer word element setting
        String wtag;
27

    
28
        // Both flags are overwritten in the constructor from the "import" preferences node and are
        // forwarded to each XTZCompilerStep in doCQPStep().
        boolean doNormalizeAttributeValues = false;
29
        boolean doNormalizeAnaValues = true;
30

    
31
        /**
         * Creates the compiler of an import module and caches the project settings the steps rely on:
         * corpus name, registry file path, word element name and the two normalisation flags.
         *
         * @param module the import module; its project and binary directory are read here
         */
        public XTZCompiler(ImportModule module) {
                super(module);

                def project = module.getProject()

                corpusname = project.getName();

                // registry file: <binary directory>/registry/<corpus name in lower case>
                String binaryPath = module.getBinaryDirectory().getAbsolutePath()
                regPath = binaryPath + "/registry/" + corpusname.toLowerCase()

                wtag = project.getTokenizerWordElement();

                // The flags are read as strings from the "import" node, with "false" given as the default;
                // only the exact value "true" enables a flag.
                def importPrefs = project.getPreferencesScope().getNode("import")
                String attributeFlag = importPrefs.get(TBXPreferences.NORMALISEATTRIBUTEVALUES, "false")
                String anaFlag = importPrefs.get(TBXPreferences.NORMALISEANAVALUES, "false")
                doNormalizeAttributeValues = "true".equals(attributeFlag)
                doNormalizeAnaValues = "true".equals(anaFlag)
        }
42

    
43
        /**
         * Runs the whole compilation: prepares the main corpus of the project, then chains the scan,
         * CQP, cwb-encode and cwb-makeall steps. The method returns silently as soon as a step fails;
         * <code>isSuccessFul</code> is only set to true when all four steps succeeded.
         *
         * @param orderedTextIDs the IDs of the texts, in processing order; when null the IDs returned
         *                       by the project (<code>getTextsID()</code>) are used
         */
        @Override
        public void process(List<String> orderedTextIDs) {
                Project project = module.getProject();

                // Resolve the default BEFORE calling the parent. doCQPStep() reads the orderedTextIDs
                // member, which super.process() is in charge of setting: defaulting only the local
                // parameter afterwards left that member null and no text was compiled.
                // NOTE(review): assumes super.process() has no special handling of a null list - confirm.
                if (orderedTextIDs == null) {
                        orderedTextIDs = project.getTextsID();
                }
                super.process(orderedTextIDs); // set member

                CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
                if (corpus == null) {
                        // first build: create the main corpus, named after the project
                        corpus = new MainCorpus(project);
                        corpus.setID(project.getName());
                        corpus.setName(project.getName());
                } else if (project.getDoUpdate()) {
                        corpus.clean(); // remove old files
                } else {
                        corpus.delete(); // remove old files and TXMResult children
                }
                corpus.setDescription("Built with the XTZ import module");

                if (!doScanStep()) return;
                if (!doCQPStep()) return;
                if (!doCWBEncodeStep()) return;
                if (!doCWBMakeAllStep()) return;

                // the intermediate "cqp" directory is kept when updating
                if (project.getCleanAfterBuild() && !project.getDoUpdate()) {
                        new File(module.getBinaryDirectory(), "cqp").deleteDir()
                }

                isSuccessFul = true;
        }
75

    
76
        /**
         * First step: collects what will have to be declared to cwb-encode.
         * The structures are taken from SAttributesListener.scanFiles() called on the input directory;
         * the word properties are gathered by getAnaTypes() from the XML-TXM file of every text of the project.
         *
         * @return false as soon as the XML-TXM file of a text cannot be scanned, true otherwise
         */
        public boolean doScanStep() {
                sattrsListener = SAttributesListener.scanFiles(inputDirectory, wtag)

                def projectTexts = module.getProject().getTexts()
                int total = projectTexts.size()
                println "-- Scanning structures&properties to create for "+total+" texts..."
                ConsoleProgressBar progress = new ConsoleProgressBar(total)

                for (Text current : projectTexts) {
                        try {
                                progress.tick();
                                getAnaTypes(current.getXMLTXMFile())
                        } catch (Exception failure) {
                                // one unreadable file aborts the whole step
                                println "Error while processing $current text XML-TXM file : "+current.getSource()+". Error: "+failure
                                failure.printStackTrace();
                                return false;
                        }
                }

                println ""
                return true;
        }
98

    
99
        /**
         * Reads one XML-TXM file and registers in <code>anatypes</code> the <code>type</code> attribute
         * of every <code>ana</code> element found inside a word element (<code>wtag</code>).
         * The first character of the value is dropped (it is expected to be a '#').
         *
         * The stream and the StAX reader are closed on every path, including when parsing fails.
         *
         * @param xmlFile the XML-TXM file to scan
         * @throws Exception any I/O or XML parsing error, left to the caller
         */
        private void getAnaTypes(File xmlFile) {
                final String ANA = "ana"
                final String TYPE = "type"

                def inputData = xmlFile.toURI().toURL().openStream();
                def parser = null
                try {
                        parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
                        boolean start = false; // true while the cursor is inside a word element

                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event == XMLStreamConstants.START_ELEMENT) { // start elem
                                        if (wtag.equals(parser.getLocalName())) {
                                                start = true;
                                        } else if (start && ANA.equals(parser.getLocalName())) { // ana elem
                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
                                                        if (TYPE.equals(parser.getAttributeLocalName(i))) { // @type
                                                                anatypes.add(parser.getAttributeValue(i).substring(1)); //remove the #
                                                                break;
                                                        }
                                                }
                                        }
                                } else if (event == XMLStreamConstants.END_ELEMENT) { // end elem
                                        if (wtag.equals(parser.getLocalName())) {
                                                start = false;
                                        }
                                }
                        }
                } finally {
                        // XMLStreamReader.close() does not close the underlying stream: release both,
                        // and make sure the stream is closed even if closing the reader fails
                        try {
                                if (parser != null) parser.close();
                        } finally {
                                inputData.close();
                        }
                }
        }
132

    
133
        def cqpFiles = [] // ordered cqp files to concat before calling cwb-encode
        int cqpFilesUpdated = 0; // number of cqp files (re)built by the last doCQPStep() call

        /**
         * Second step: writes one .cqp file per text with an XTZCompilerStep.
         * A text is skipped when its .cqp file exists, is strictly more recent than its XML-TXM file
         * and the text is not dirty. Skipped or not, every text of <code>orderedTextIDs</code>
         * contributes its .cqp file to <code>cqpFiles</code>, in order.
         *
         * @return false (with <code>reason</code> set) as soon as one text cannot be processed, true otherwise
         */
        public boolean doCQPStep() {

                cqpDirectory.mkdir(); // if not created

                // start from a clean state: both members describe the current call only, so that
                // running the step again does not list every file twice for the concatenation
                cqpFiles = []
                cqpFilesUpdated = 0;

                def texts = orderedTextIDs.collect() { id -> module.getProject().getText(id) }

                for (Text text : texts) {
                        cqpFiles << new File(cqpDirectory, text.getName() + ".cqp") // insert cqp files to concat later
                }

                def textsToProcess = texts.findAll() { text ->
                        File cqpFile = new File(cqpDirectory, text.getName() + ".cqp")

                        // rebuild when the cqp file is missing or is not more recent than the XML-TXM file
                        if (!cqpFile.exists() || text.getXMLTXMFile().lastModified() >= cqpFile.lastModified()) {
                                return true
                        }
                        // an up-to-date file is still rebuilt when the text is flagged dirty
                        if (text.isDirty()) {
                                return true
                        }

                        Log.finer("skipping .cqp step of $text");
                        return false
                }
                println "-- Building CQP files ${textsToProcess.size()}/${texts.size()}..."

                ConsoleProgressBar cpb = new ConsoleProgressBar(textsToProcess.size())
                for (Text text : textsToProcess) {
                        cpb.tick();

                        File xmlFile = text.getXMLTXMFile()
                        String textname = text.getName()
                        File cqpFile = new File(cqpDirectory, textname + ".cqp")

                        cqpFilesUpdated++

                        XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", anatypes, wtag)
                        step.setNormalizeAnaValues(doNormalizeAnaValues)
                        step.setNormalizeAttributeValues(doNormalizeAttributeValues)
                        if (!step.process()) {
                                reason = "Fail to process $xmlFile."
                                return false;
                        }
                }
                println ""
                return true;
        }
184

    
185
        /**
         * Third step: declares the word properties and the structures found by the scan step,
         * concatenates the .cqp files into <code>all.cqp</code> and runs cwb-encode on it.
         *
         * Declarations:
         * - word properties: "id" followed by every entry of <code>anatypes</code>;
         * - structures: "name:depth+attr1+attr2...", attribute names in lower case, each structure
         *   always declaring an "n" attribute;
         * - "text" always declares id, base and project first, "txmcorpus" declares lang.
         *
         * The temporary <code>all.cqp</code> file is removed whatever the outcome.
         *
         * @return true when all.cqp was written and cwb-encode succeeded, false otherwise
         */
        public boolean doCWBEncodeStep() {
                println "-- Running cwb-encode..."
                CwbEncode cwbEn = new CwbEncode()
                cwbEn.setDebug(debug)

                List<String> pargs = []
                pargs.add("id")
                for (String ana : anatypes) {
                        pargs.add(ana)
                }
                String[] pAttributes = pargs

                def structs = sattrsListener.getStructs()
                def structsProf = sattrsListener.getProfs()

                if (debug) {
                        println structs
                        println structsProf
                }

                List<String> sargs = new ArrayList<String>()
                def tmpTextAttrs = []
                for (String name : structs.keySet()) {
                        if (name == "text") {
                                for (String value : structs.get(name)) { // append the attributes
                                        tmpTextAttrs << value // added after
                                }
                                continue;
                        }

                        // Names are declared in lower case, so the "n" test must be done on the
                        // lower-cased names too: an "N" attribute would otherwise be declared twice.
                        List<String> attributeNames = new ArrayList<String>()
                        for (String attributeName : structs.get(name)) {
                                attributeNames.add(attributeName.toLowerCase())
                        }
                        if (!attributeNames.contains("n")) {
                                attributeNames.add("n")
                        }

                        // name:depth+attr1+attr2...
                        sargs.add(name + ":" + structsProf.get(name) + "+" + attributeNames.join("+"))
                }

                // id, base and project are always declared first; same lower-case comparison as above
                // so that e.g. "ID" does not produce a second "id" declaration
                def reservedTextAttrs = ["id", "base", "project"]
                String textSAttributes = "text:0+id+base+project";
                for (String name : tmpTextAttrs) {
                        String lowered = name.toLowerCase()
                        if (!reservedTextAttrs.contains(lowered)) {
                                textSAttributes += "+" + lowered
                        }
                }

                sargs.add(textSAttributes)
                sargs.add("txmcorpus:0+lang")

                sargs.sort()

                String[] sAttributes = sargs
                println " Word properties: "+pAttributes.join(', ')
                println " Structures: "+sargs.join(', ')

                File allcqpFile = new File(cqpDirectory, "all.cqp");
                allcqpFile.delete()
                try {
                        if (!CwbEncode.concat(cqpFiles, allcqpFile)) {
                                println "Fail to write the master cqp file: "+allcqpFile
                                return false;
                        }

                        if (!cwbEn.run(outputDirectory.getAbsolutePath() + "/$corpusname",
                                allcqpFile.getAbsolutePath(), regPath, pAttributes, sAttributes, false)) {
                                println "** cwb-encode did not ends well. Activate finer logs to see details."
                                return false;
                        }
                } catch (Exception e) {
                        println "Error while running cwb-encode: "+e
                        e.printStackTrace()
                        return false;
                } finally {
                        // clean on every path: success, failure of concat/cwb-encode, exception
                        allcqpFile.delete();
                }
                println ""
                return true;
        }
275

    
276
        /**
         * Last step: checks that the registry file exists, runs cwb-makeall on the corpus, then lets
         * FixMilestoneDeclarations process the registry file and the corpus data directory.
         *
         * @return true when both cwb-makeall and the milestone fix succeeded; false otherwise,
         *         including when the registry file is missing or an exception is raised
         */
        public boolean doCWBMakeAllStep() {
                println "-- Running cwb-makeall..."
                try {
                        CwbMakeAll makeAll = new CwbMakeAll();
                        makeAll.setDebug(debug);

                        File registryFile = new File(regPath)
                        if (!registryFile.exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }

                        // cwb-makeall is given the directory that contains the registry file
                        boolean indexed = makeAll.run(corpusname, registryFile.getParent())
                        if (!indexed) {
                                println "** cwb-makeall did not ends well. Activate finer logs to see details."
                                return false;
                        }

                        // remove milestones from CWB registry and data files
                        File dataDirectory = new File(outputDirectory.getAbsolutePath(), corpusname)
                        FixMilestoneDeclarations milestoneFixer = new FixMilestoneDeclarations(registryFile, dataDirectory);
                        boolean fixed = milestoneFixer.process()
                        if (!fixed) {
                                println "Fail to verify&fix milestone declarations"
                                return false
                        }
                } catch (Exception e) {
                        println "Error while running cwb-makeall: "+e
                        return false;
                }
                return true;
        }
304
}