Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZCompiler.groovy @ 1804

History | View | Annotate | Download (8.1 kB)

1
package org.txm.scripts.importer.xtz
2

    
3
import java.io.File;
4

    
5
import org.txm.*
6
import org.txm.scripts.importer.SAttributesListener
7
import org.txm.importer.cwb.*
8
import org.txm.utils.ConsoleProgressBar
9
import org.txm.core.preferences.TXMPreferences
10
import org.txm.core.preferences.TBXPreferences
11
import org.txm.libs.cqp.CQPLibPreferences
12

    
13
import javax.xml.stream.*
14
import org.txm.importer.xtz.*
15
import org.txm.objects.*
16
import org.txm.searchengine.cqp.corpus.*
17

    
18
/**
 * Compiler of the XTZ import module.
 *
 * Builds the CWB indexes of a project from its XML-TXM files in four steps,
 * run in this order by {@link #process(ArrayList)}:
 * <ol>
 * <li>{@link #doScanStep()}: scan the files for structures and word properties,</li>
 * <li>{@link #doCQPStep()}: write one ".cqp" file per XML-TXM file,</li>
 * <li>{@link #doCWBEncodeStep()}: concatenate the ".cqp" files and run cwb-encode,</li>
 * <li>{@link #doCWBMakeAllStep()}: run cwb-makeall and fix the milestone declarations.</li>
 * </ol>
 * Each step returns false on failure, which stops the build.
 */
class XTZCompiler extends Compiler {

	SAttributesListener sattrsListener; // store scanned structures
	// Store scanned word attributes. The very same set instance is handed to each
	// XTZCompilerStep and iterated again to declare the p-attributes.
	private def anatypes = new HashSet<String>()

	String regPath;
	String corpusname;
	String wtag;

	// Both flags are overwritten by the constructor from the "import" preferences
	// (each preference defaults to "false" there).
	boolean doNormalizeAttributeValues = false;
	boolean doNormalizeAnaValues = true;

	// ".cqp" files of the current build, filled by doCQPStep() and read by doCWBEncodeStep()
	def cqpFiles = []

	/**
	 * Reads the corpus name, the registry path, the word element name and the
	 * normalisation flags from the project of the given module.
	 *
	 * @param module the import module this compiler works for
	 */
	public XTZCompiler(ImportModule module) {
		super(module);

		corpusname = module.getProject().getName();
		regPath = module.getBinaryDirectory().getAbsolutePath() + "/registry/"+corpusname.toLowerCase()

		wtag = module.getProject().getTokenizerWordElement();

		def importPreferences = module.getProject().getPreferencesScope().getNode("import")
		doNormalizeAttributeValues = "true".equals(importPreferences.get(TBXPreferences.NORMALISEATTRIBUTEVALUES, "false"))
		doNormalizeAnaValues = "true".equals(importPreferences.get(TBXPreferences.NORMALISEANAVALUES, "false"))
	}

	/**
	 * Runs the whole compilation and sets isSuccessFul to true when every step succeeded.
	 *
	 * @param files the XML-TXM files to compile; when null, the content of inputDirectory is used
	 */
	public void process(ArrayList<File> files) {
		super.process(files); // set member

		if (files == null) {
			// listFiles() returns null when the directory cannot be listed
			File[] found = inputDirectory.listFiles();
			files = new ArrayList<File>();
			if (found != null) files.addAll(Arrays.asList(found));
			// The step methods read the inherited member, not this parameter, so the
			// fallback list must be stored there too.
			// NOTE(review): assumes the inherited member is named "files" and is writable
			// from this subclass, as its unqualified use in the step methods suggests.
			this.files = files;
		}

		Project project = module.getProject();
		CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
		if (corpus != null) {
			if (project.getDoUpdate()) {
				corpus.clean(); // remove old files
			} else {
				corpus.delete(); // remove old files and TXMResult children
			}
		} else {
			corpus = new MainCorpus(project);
			corpus.setID(project.getName());
			corpus.setName(project.getName());
		}
		corpus.setDescription("Built with the XTZ import module");

		if (!doScanStep()) return;
		if (!doCQPStep()) return;
		if (!doCWBEncodeStep()) return;
		if (!doCWBMakeAllStep()) return;

		if (module.getProject().getCleanAfterBuild()) {
			new File(module.getBinaryDirectory(), "cqp").deleteDir()
		}

		isSuccessFul = true;
	}

	/**
	 * Scan all XML-TXM files to find out structures and word properties
	 *
	 * @return false as soon as one file cannot be scanned, true otherwise
	 */
	public boolean doScanStep() {
		// get all anatypes
		sattrsListener = SAttributesListener.scanFiles(inputDirectory, wtag)
		println "-- Listing structures&properties to create for "+files.size()+" XML-TXM files..."
		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
		for (File f : files) {
			try {
				cpb.tick();
				getAnaTypes(f)
			} catch (Exception e) {
				println "Error while processing $f text: "+e
				e.printStackTrace();
				return false;
			}
		}
		println ""
		return true;
	}

	/**
	 * Adds to anatypes the @type value, minus its first character, of every "ana"
	 * element found inside a word element of the given file.
	 *
	 * @param xmlFile the XML-TXM file to read
	 */
	private void getAnaTypes(File xmlFile) {
		String ANA = "ana"
		String TYPE = "type"

		def inputData = xmlFile.toURI().toURL().openStream();
		def parser = null;
		try {
			def factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);
			boolean start = false; // true while the parser is inside a word element
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				if (event == XMLStreamConstants.START_ELEMENT) { // start elem
					if (wtag.equals(parser.getLocalName())) {
						start = true;
					} else if (start && ANA.equals(parser.getLocalName())) { // ana elem
						for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
							if (TYPE.equals(parser.getAttributeLocalName(i))) { // @type
								anatypes.add(parser.getAttributeValue(i).substring(1)); //remove the #
								break;
							}
						}
					}
				} else if (event == XMLStreamConstants.END_ELEMENT) { // end elem
					if (wtag.equals(parser.getLocalName())) {
						start = false;
					}
				}
			}
		} finally {
			// Release both resources even when parsing fails. Closing the reader does
			// not close the underlying stream, so the stream is closed explicitly.
			try {
				if (parser != null) parser.close();
			} finally {
				inputData.close();
			}
		}
	}

	/**
	 * Writes one ".cqp" file per XML-TXM file into cqpDirectory and records it in cqpFiles.
	 * A file is not rebuilt when its ".cqp" file already exists and is not older than it.
	 *
	 * @return false as soon as one file cannot be processed (reason is then set), true otherwise
	 */
	public boolean doCQPStep() {
		println "-- Building CQP files $inputDirectory..."
		cqpDirectory.mkdir(); // if not created

		// start from an empty list so that running the step again does not
		// concatenate the same ".cqp" file twice
		cqpFiles.clear()

		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
		for (File xmlFile : files) {
			cpb.tick();
			String textname = xmlFile.getName();
			int idx = textname.lastIndexOf(".")
			if (idx > 0) textname = textname.substring(0, idx)

			File cqpFile = new File(cqpDirectory, textname + ".cqp")
			cqpFiles << cqpFile
			// skip step if cqpFile is more recent than xmlFile
			if (cqpFile.exists() && cqpFile.lastModified() >= xmlFile.lastModified()) continue;

			XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", anatypes, wtag)
			step.setNormalizeAnaValues(doNormalizeAnaValues)
			step.setNormalizeAttributeValues(doNormalizeAttributeValues)
			if (!step.process()) {
				reason = "Fail to process $xmlFile."
				return false;
			}
		}
		println ""
		return true;
	}

	/**
	 * Declares the word properties and the structures found by the scan step,
	 * concatenates the ".cqp" files into "all.cqp" and runs cwb-encode on it.
	 *
	 * @return true if the master file was written and cwb-encode succeeded
	 */
	public boolean doCWBEncodeStep() {
		println "-- Running cwb-encode..."
		CwbEncode cwbEn = new CwbEncode()
		cwbEn.setDebug(debug)

		// word properties: "id" first, then the scanned ana types
		List<String> pargs = []
		pargs.add("id")
		for (String ana : anatypes)
			pargs.add(ana)

		def structs = sattrsListener.getStructs()
		def structsProf = sattrsListener.getProfs()

		if (debug) {
			println structs
			println structsProf
		}

		List<String> sargs = new ArrayList<String>()
		def tmpTextAttrs = []
		for (String name : structs.keySet()) {
			if (name == "text") {
				for (String value : structs.get(name)) // append the attributes
					tmpTextAttrs << value // added after
				continue;
			}

			String concat = name+":"+structsProf.get(name); // append the depth
			boolean hasN = false
			for (String attributeName : structs.get(name)) { // append the attributes
				String lowered = attributeName.toLowerCase()
				// test the lowercased name: it is the one that is declared
				if ("n".equals(lowered)) hasN = true
				concat += "+"+lowered;
			}

			// every structure gets a "n" attribute, declared only once
			if (!hasN) concat += "+n"

			sargs.add(concat)
		}

		String textSAttributes = "text:0+id+base+project";
		for (String name : tmpTextAttrs) {
			String lowered = name.toLowerCase()
			// id, base and project are already declared above, whatever their case in the source
			if (!("id".equals(lowered) || "base".equals(lowered) || "project".equals(lowered)))
				textSAttributes += "+"+lowered
		}

		sargs.add(textSAttributes)
		sargs.add("txmcorpus:0+lang")

		sargs.sort()

		String[] sAttributes = sargs
		String[] pAttributes = pargs
		println " Word properties: "+pAttributes
		println " Structures: "+sargs
		File allcqpFile = new File(cqpDirectory, "all.cqp");
		allcqpFile.delete()
		try {
			if (!CwbEncode.concat(cqpFiles, allcqpFile)) {
				println "Fail to write the master cqp file: "+allcqpFile
				return false;
			}

			if (!cwbEn.run(outputDirectory.getAbsolutePath() + "/$corpusname",
					allcqpFile.getAbsolutePath(),
					regPath, pAttributes, sAttributes, false)) {
				println "** cwb-encode did not ends well. Activate finer logs to see details."
				return false;
			}

			allcqpFile.delete(); // clean
		} catch (Exception e) {
			println "Error while running cwb-encode: "+e
			e.printStackTrace()
			allcqpFile.delete(); // clean
			return false;
		}
		println ""
		return true;
	}

	/**
	 * Runs cwb-makeall on the encoded corpus, then verifies and fixes the
	 * milestone declarations of the registry and data files.
	 *
	 * @return true if the registry file exists and both operations succeeded
	 */
	public boolean doCWBMakeAllStep() {
		println "-- Running cwb-makeall..."
		try {
			CwbMakeAll cwbMa = new CwbMakeAll();
			cwbMa.setDebug(debug);

			File registryFile = new File(regPath)
			if (!registryFile.exists()) {
				println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
				return false;
			}
			if (!cwbMa.run(corpusname, registryFile.getParent())) {
				println "** cwb-makeall did not ends well. Activate finer logs to see details."
				return false;
			}

			// remove milestones from CWB registry and data files
			FixMilestoneDeclarations fm = new FixMilestoneDeclarations(
				registryFile, new File(outputDirectory.getAbsolutePath(), corpusname));
			if (!fm.process()) {
				println "Fail to verify&fix milestone declarations"
				return false
			}
		} catch (Exception e) {
			println "Error while running cwb-makeall: "+e
			e.printStackTrace() // same reporting as the other steps
			return false;
		}
		return true;
	}
}