Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xtz / XTZCompiler.groovy @ 479

History | View | Annotate | Download (7.3 kB)

1
package org.txm.importer.xtz
2

    
3
import java.io.File;
4

    
5
import org.txm.*
6
import org.txm.importer.SAttributesListener
7
import org.txm.importer.cwb.*
8
import org.txm.stat.utils.ConsoleProgressBar
9

    
10
import javax.xml.stream.*
11

    
12

    
13
/**
 * Compiler of the XTZ import: builds a CWB corpus from the XML-TXM files of a corpus.
 *
 * The work is split in four steps run in sequence by {@link #process}:
 * scan of the word properties and structures, creation of one WTC file per
 * XML-TXM file, cwb-encode and cwb-makeall. Each step returns false on failure,
 * which stops the pipeline.
 */
class XTZCompiler extends Compiler {

	SAttributesListener sattrsListener; // store scanned structures
	private def anatypes = new HashSet<String>() // store scanned word attributes

	String regPath;
	String cwbLoc;
	String corpusname;
	String wtag;

	boolean doNormalizeAttributeValues = false;
	boolean doNormalizeAnaValues = true;

	/** WTC files of the current run, in the order they must be concatenated. */
	def wtcFiles = []

	/**
	 * Reads the corpus name, registry path, CWB binaries location, word element
	 * name and normalisation options from the import module.
	 *
	 * @param module the import module that owns this compiler
	 */
	public XTZCompiler(ImportModule module) {
		super(module);

		corpusname = module.getParameters().getCorpusName();
		regPath = module.getBinaryDirectory().getAbsolutePath() + "/registry/"+corpusname.toLowerCase()
		cwbLoc = Toolbox.getParam(Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

		wtag = module.getParameters().getWordElement().getTextContent();

		// NOTE(review): both flags are overwritten here, so a missing key yields false
		// even though doNormalizeAnaValues is initialised to true -- confirm this is intended.
		doNormalizeAttributeValues = "true".equals(module.getParameters().getKeyValueParameters().get(ImportKeys.NORMALISEATTRIBUTEVALUES))
		doNormalizeAnaValues = "true".equals(module.getParameters().getKeyValueParameters().get(ImportKeys.NORMALISEANAVALUES))
	}

	/**
	 * Runs the four compilation steps, then removes the "wtc" directory when the
	 * CLEAN parameter is "true". isSuccessFul is set to true only when every step succeeded.
	 *
	 * @param files the XML-TXM files to compile; when null the content of inputDirectory is used
	 */
	public void process(ArrayList<File> files) {
		super.process(files); // set member

		// The steps read the "files" member, not the parameter: make sure the member
		// is filled even if the parent class left it null.
		if (this.files == null) {
			if (files == null) {
				File[] found = inputDirectory.listFiles();
				if (found == null) { // not a directory or I/O error
					reason = "Fail to list the files of $inputDirectory."
					println reason
					return;
				}
				files = new ArrayList<File>(Arrays.asList(found));
			}
			this.files = files;
		}

		if (!doScanStep()) return;
		if (!doWTCStep()) return;
		if (!doCWBEncodeStep()) return;
		if (!doCWBMakeAllStep()) return;

		String cleanDirectories = module.getParameters().getKeyValueParameters().get(ImportKeys.CLEAN);
		if ("true".equals(cleanDirectories)) {
			new File(module.getBinaryDirectory(), "wtc").deleteDir()
		}

		isSuccessFul = true;
	}

	/**
	 * Scan all XML-TXM files to find out structures and word properties
	 *
	 * @return false as soon as one file cannot be scanned, true otherwise
	 */
	public boolean doScanStep() {
		// structures are scanned from the whole input directory
		sattrsListener = SAttributesListener.scanFiles(inputDirectory, wtag)
		println "-- Listing structures&properties to create for "+files.size()+" XML-TXM files..."
		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
		for (File f : files) {
			try {
				cpb.tick();
				getAnaTypes(f) // get all anatypes
			} catch (Exception e) {
				println "Error while processing $f text: "+e
				e.printStackTrace();
				return false;
			}
		}
		println ""
		return true;
	}

	/**
	 * Collects in anatypes the @type values (minus their first character) of the
	 * "ana" elements found inside the word elements of a file.
	 *
	 * @param xmlFile the XML-TXM file to read
	 */
	private void getAnaTypes(File xmlFile) {
		String ANA = "ana"
		String TYPE = "type"

		def inputData = xmlFile.toURI().toURL().openStream();
		def parser = null;
		try {
			parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
			boolean insideWord = false;
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				if (event == XMLStreamConstants.START_ELEMENT) {
					if (wtag.equals(parser.getLocalName())) {
						insideWord = true;
					} else if (insideWord && ANA.equals(parser.getLocalName())) { // ana elem
						for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
							if (TYPE.equals(parser.getAttributeLocalName(i))) {
								anatypes.add(parser.getAttributeValue(i).substring(1)); // remove the #
								break;
							}
						}
					}
				} else if (event == XMLStreamConstants.END_ELEMENT) {
					if (wtag.equals(parser.getLocalName())) {
						insideWord = false;
					}
				}
			}
		} finally {
			// closing the reader does not close the underlying stream: close both,
			// also when the parsing failed
			try {
				if (parser != null) parser.close()
			} finally {
				inputData.close()
			}
		}
	}

	/**
	 * Creates one WTC file per XML-TXM file in wtcDirectory. A WTC file that is at
	 * least as recent as its XML-TXM file is kept as is.
	 *
	 * @return false as soon as one file cannot be converted, true otherwise
	 */
	public boolean doWTCStep() {
		println "-- Building WTC files $inputDirectory..."
		wtcDirectory.mkdirs(); // if not created
		wtcFiles.clear(); // a previous run must not leave its files in the list

		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
		for (File xmlFile : files) {
			cpb.tick();
			// the text name is the file name without its extension
			String textname = xmlFile.getName();
			int idx = textname.lastIndexOf(".")
			if (idx > 0) textname = textname.substring(0, idx)

			File wtcFile = new File(wtcDirectory, textname + ".wtc")
			wtcFiles << wtcFile
			// skip step if wtcFile is more recent than xmlFile
			if (wtcFile.exists() && wtcFile.lastModified() >= xmlFile.lastModified()) continue;

			XTZCompilerStep step = new XTZCompilerStep(xmlFile, wtcFile, textname, corpusname, "default", anatypes, wtag)
			step.setNormalizeAnaValues(doNormalizeAnaValues)
			step.setNormalizeAttributeValues(doNormalizeAttributeValues)
			if (!step.process()) {
				reason = "Fail to process $xmlFile."
				return false;
			}
		}
		println ""
		return true;
	}

	/**
	 * Concatenates the WTC files in a temporary "all.wtc" file and runs cwb-encode on it.
	 *
	 * Word properties are "id" followed by the scanned ana types. Every structure
	 * is declared with its depth, its lower-cased attributes and a "n" attribute;
	 * the "text" structure always starts with "id", "base" and "project".
	 *
	 * @return false if the master WTC file cannot be written or if cwb-encode raised an exception
	 */
	public boolean doCWBEncodeStep() {
		println "-- Running cwb-encode..."
		CwbEncode cwbEn = new CwbEncode()
		cwbEn.setDebug(debug)

		List<String> pargs = []
		pargs.add("id")
		for (String ana : anatypes)
			pargs.add(ana)
		String[] pAttributes = pargs

		def structs = sattrsListener.getStructs()
		def structsProf = sattrsListener.getProfs()

		if (debug) {
			println structs
			println structsProf
		}

		List<String> sargs = new ArrayList<String>()
		def tmpTextAttrs = []
		for (String name : structs.keySet()) {
			if (name == "text") {
				for (String value : structs.get(name))
					tmpTextAttrs << value // declared after the loop
				continue;
			}

			// Names are declared lower-cased: de-duplicate on the lower-cased form
			// so that e.g. "N" does not end up declared twice with the mandatory "n".
			Set<String> attributes = new LinkedHashSet<String>()
			for (String attributeName : structs.get(name))
				attributes.add(attributeName.toLowerCase())
			attributes.add("n") // every structure gets a "n" attribute

			// name:depth+attr1+attr2...
			sargs.add(name+":"+structsProf.get(name)+"+"+attributes.join("+"))
		}

		String textSAttributes = "text:0+id+base+project";
		Set<String> textAttributes = new HashSet<String>(["id", "base", "project"])
		for (String name : tmpTextAttrs) {
			String lowered = name.toLowerCase()
			if (textAttributes.add(lowered)) // not declared yet
				textSAttributes += "+"+lowered
		}

		sargs.add(textSAttributes)
		sargs.add("txmcorpus:0+lang")

		sargs.sort()

		String[] sAttributes = sargs
		println " Word properties: "+pAttributes
		println " Structures: "+sargs
		File allwtcFile = new File(wtcDirectory, "all.wtc");
		try {
			if (!CwbEncode.concat(wtcFiles, allwtcFile)) {
				println "Fail to write the master wtc file: "+allwtcFile
				return false;
			}

			cwbEn.run(cwbLoc + "cwb-encode", outputDirectory.getAbsolutePath() + "/$corpusname",
					allwtcFile.getAbsolutePath(),
					regPath, pAttributes, sAttributes, false);
		} catch (Exception e) {
			println "Error while running cwb-encode: "+e
			e.printStackTrace()
			return false;
		} finally {
			allwtcFile.delete(); // clean, whatever the outcome
		}
		println ""
		return true;
	}

	/**
	 * Runs cwb-makeall on the encoded corpus, then fixes the milestone
	 * declarations of the CWB registry and data files.
	 *
	 * @return false if the registry file is missing, if the milestone fix failed or if an exception was raised
	 */
	public boolean doCWBMakeAllStep() {
		println "-- Running cwb-makeall..."
		try {
			CwbMakeAll cwbMa = new CwbMakeAll();
			cwbMa.setDebug(debug);

			if (!new File(regPath).exists()) {
				println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
				return false;
			}
			cwbMa.run(new File(cwbLoc,"cwb-makeall").getAbsolutePath(), corpusname, module.getBinaryDirectory().getAbsolutePath() + "/registry");

			// remove milestones from CWB registry and data files
			FixMilestoneDeclarations fm = new FixMilestoneDeclarations(
				new File(regPath), new File(outputDirectory.getAbsolutePath(), corpusname));
			if (!fm.process()) {
				println "Fail to verify&fix milestone declarations"
				return false
			}
		} catch (Exception e) {
			println "Error while running cwb-makeall: "+e
			e.printStackTrace() // same diagnostics as the cwb-encode step
			return false;
		}
		return true;
	}
}