Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZCompiler.groovy @ 2126

History | View | Annotate | Download (8.1 kB)

1 986 mdecorde
package org.txm.scripts.importer.xtz
2 321 mdecorde
3 321 mdecorde
import java.io.File;
4 321 mdecorde
5 321 mdecorde
import org.txm.*
6 986 mdecorde
import org.txm.scripts.importer.SAttributesListener
7 1000 mdecorde
import org.txm.importer.cwb.*
8 1613 mdecorde
import org.txm.utils.ConsoleProgressBar
9 714 mdecorde
import org.txm.core.preferences.TXMPreferences
10 1137 mdecorde
import org.txm.core.preferences.TBXPreferences
11 714 mdecorde
import org.txm.libs.cqp.CQPLibPreferences
12 321 mdecorde
13 321 mdecorde
import javax.xml.stream.*
14 1000 mdecorde
import org.txm.importer.xtz.*
15 1115 mdecorde
import org.txm.objects.*
16 1115 mdecorde
import org.txm.searchengine.cqp.corpus.*
17 321 mdecorde
18 321 mdecorde
class XTZCompiler extends Compiler {
19 321 mdecorde
20 321 mdecorde
        // structures found by SAttributesListener.scanFiles(); set in doScanStep(), read in doCWBEncodeStep()
        SAttributesListener sattrsListener; // store scanned structures
21 321 mdecorde
        // names of the word properties found in the "ana" elements (their @type value without its first character)
        private def anatypes = new HashSet<String>() // store scanned word attributes
22 321 mdecorde
23 321 mdecorde
        // path of the corpus registry file: <binary directory>/registry/<lowercased corpus name> (built in the constructor)
        String regPath;
24 321 mdecorde
        // name of the project, used as the corpus name
        String corpusname;
25 321 mdecorde
        // local name of the element that wraps a word, read from the project's tokenizer word element setting
        String wtag;
26 321 mdecorde
27 321 mdecorde
        // both flags are overwritten by the constructor from the "import" preferences node,
        // where a missing entry is read as "false" (so the 'true' initial value below does not survive construction)
        boolean doNormalizeAttributeValues = false;
28 321 mdecorde
        boolean doNormalizeAnaValues = true;
29 321 mdecorde
30 321 mdecorde
        /**
         * Creates the compiler of an XTZ import and reads its settings from the project of the module.
         *
         * @param module the import module giving access to the project and to the binary directory
         */
        public XTZCompiler(ImportModule module) {
                super(module);

                Project project = module.getProject()
                this.corpusname = project.getName()
                this.wtag = project.getTokenizerWordElement()

                // the registry file is named after the lowercased corpus name
                String binaryPath = module.getBinaryDirectory().getAbsolutePath()
                this.regPath = binaryPath + "/registry/" + corpusname.toLowerCase()

                // the two normalisation flags are stored as strings; a missing entry reads as "false"
                def importPreferences = project.getPreferencesScope().getNode("import")
                String normalizeAttributes = importPreferences.get(TBXPreferences.NORMALISEATTRIBUTEVALUES, "false")
                String normalizeAnas = importPreferences.get(TBXPreferences.NORMALISEANAVALUES, "false")
                this.doNormalizeAttributeValues = "true".equals(normalizeAttributes)
                this.doNormalizeAnaValues = "true".equals(normalizeAnas)
        }
41 321 mdecorde
42 321 mdecorde
        /**
         * Runs the whole compilation: prepares the MainCorpus of the project, then chains the
         * scan, CQP, cwb-encode and cwb-makeall steps. The chain stops at the first failing step;
         * isSuccessFul is set to true only when every step succeeded.
         *
         * @param files the XML-TXM files to compile; when null, the content of inputDirectory is used
         */
        public void process(ArrayList<File> files) {
                super.process(files); // set member

                if (files == null) files = inputDirectory.listFiles();
                // The steps below read the 'files' member, not this parameter: publish the fallback
                // listing when the member is still unset, otherwise the fallback has no effect.
                if (this.files == null) this.files = files;
                if (this.files == null) { // listFiles() returns null when the directory cannot be listed
                        reason = "Cannot list the files of $inputDirectory."
                        println reason
                        return;
                }

                Project project = module.getProject();
                CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
                if (corpus != null) {
                        if (project.getDoUpdate()) {
                                corpus.clean(); // remove old files
                        } else {
                                corpus.delete(); // remove old files and TXMResult children
                        }
                } else {
                        corpus = new MainCorpus(project);
                        corpus.setID(project.getName());
                        corpus.setName(project.getName());
                }
                corpus.setDescription("Built with the XTZ import module");

                if (!doScanStep()) return;
                if (!doCQPStep()) return;
                if (!doCWBEncodeStep()) return;
                if (!doCWBMakeAllStep()) return;

                if (module.getProject().getCleanAfterBuild()) {
                        // the intermediate CQP files are not needed once the indexes are built
                        new File(module.getBinaryDirectory(), "cqp").deleteDir()
                }

                isSuccessFul = true;
        }
73 321 mdecorde
74 321 mdecorde
        /**
         * Scans the XML-TXM files to list the structures and the word properties to declare.
         * Structures are scanned from inputDirectory; word properties are collected file by file.
         *
         * @return false as soon as one file raises an exception, true when every file was scanned
         */
        public boolean doScanStep() {
                sattrsListener = SAttributesListener.scanFiles(inputDirectory, wtag)

                println "-- Listing structures&properties to create for "+files.size()+" XML-TXM files..."
                ConsoleProgressBar progress = new ConsoleProgressBar(files.size())

                // collect the word property names of each file
                for (File xmlFile : files) {
                        try {
                                progress.tick();
                                getAnaTypes(xmlFile)
                        } catch (Exception failure) {
                                // one unreadable file aborts the whole step
                                println "Error while processing $xmlFile text: "+failure
                                failure.printStackTrace();
                                return false;
                        }
                }

                println ""
                return true;
        }
95 321 mdecorde
96 321 mdecorde
        /**
         * Reads one XML file with a StAX parser and adds to anatypes the @type value of every
         * "ana" element found inside a word element (local name equal to wtag). The first
         * character of the value is dropped (it is expected to be the leading '#').
         *
         * The input stream and the parser are always closed, including when parsing fails.
         *
         * @param xmlFile the XML-TXM file to scan
         */
        private void getAnaTypes(File xmlFile) {
                final String ANA = "ana"
                final String TYPE = "type"

                def inputData = xmlFile.toURI().toURL().openStream();
                def parser = null
                try {
                        def factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);

                        boolean start = false; // true while the parser is inside a word element
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event == XMLStreamConstants.START_ELEMENT) { // start elem
                                        if (wtag.equals(parser.getLocalName())) {
                                                start = true;
                                        } else if (start && ANA.equals(parser.getLocalName())) { // ana elem
                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
                                                        if (TYPE.equals(parser.getAttributeLocalName(i))) { // @type
                                                                anatypes.add(parser.getAttributeValue(i).substring(1)); //remove the #
                                                                break;
                                                        }
                                                }
                                        }
                                } else if (event == XMLStreamConstants.END_ELEMENT) { // end elem
                                        if (wtag.equals(parser.getLocalName())) {
                                                start = false;
                                        }
                                }
                        }
                } finally {
                        // closing the parser does not close the stream it reads: release both
                        try {
                                if (parser != null) parser.close();
                        } finally {
                                if (inputData != null) inputData.close();
                        }
                }
        }
129 321 mdecorde
130 803 mdecorde
        // CQP files of the last doCQPStep() run, in processing order; concatenated by doCWBEncodeStep()
        def cqpFiles = []

        /**
         * Builds one ".cqp" file per XML-TXM file in cqpDirectory. A file whose CQP file already
         * exists and is at least as recent as its source is not rebuilt, but is still listed in
         * cqpFiles.
         *
         * @return false (with 'reason' set) as soon as one file cannot be processed, true otherwise
         */
        public boolean doCQPStep() {
                println "-- Building CQP files $inputDirectory..."
                cqpDirectory.mkdir(); // if not created

                // start from an empty list: when the step runs again on the same instance,
                // every CQP file would otherwise be concatenated twice by the encode step
                cqpFiles.clear()

                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                for (File xmlFile : files) {
                        cpb.tick();

                        // text name = file name without its last extension
                        String textname = xmlFile.getName();
                        int idx = textname.lastIndexOf(".")
                        if (idx > 0) textname = textname.substring(0, idx)

                        File cqpFile = new File(cqpDirectory, textname + ".cqp")
                        cqpFiles << cqpFile
                        // skip step if cqpFile is more recent than xmlFile
                        if (cqpFile.exists() && cqpFile.lastModified() >= xmlFile.lastModified()) continue;

                        XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", anatypes, wtag)
                        step.setNormalizeAnaValues(doNormalizeAnaValues)
                        step.setNormalizeAttributeValues(doNormalizeAttributeValues)
                        if (!step.process()) {
                                reason = "Fail to process $xmlFile."
                                return false;
                        }
                }
                println ""
                return true;
        }
158 321 mdecorde
159 321 mdecorde
        /**
         * Declares the word properties and the structures found by the scan step, concatenates
         * the CQP files into "all.cqp" and runs cwb-encode on it.
         *
         * Attribute names are lowercased before being declared. Each name is declared once per
         * structure, so attributes that differ only by case (for instance "N" and "n", or "ID"
         * and the built-in "id" of "text") no longer produce a duplicated declaration.
         *
         * @return true when cwb-encode ended well, false otherwise
         */
        public boolean doCWBEncodeStep() {
                println "-- Running cwb-encode..."
                CwbEncode cwbEn = new CwbEncode()
                cwbEn.setDebug(debug)

                // word properties: "id" first, then every scanned property
                List<String> pargs = []
                pargs.add("id")
                pargs.addAll(anatypes)
                String[] pAttributes = pargs

                def structs = sattrsListener.getStructs()
                def structsProf = sattrsListener.getProfs()

                if (debug) {
                        println structs
                        println structsProf
                }

                List<String> sargs = new ArrayList<String>()
                // "text" always declares id, base and project first; scanned attributes come after
                Set<String> textAttributes = new LinkedHashSet<String>(["id", "base", "project"])
                for (String name : structs.keySet()) {
                        if (name == "text") {
                                for (String value : structs.get(name)) {
                                        textAttributes.add(value.toLowerCase())
                                }
                                continue;
                        }

                        // insertion-ordered set: keeps the scan order and drops case-only duplicates
                        Set<String> attributes = new LinkedHashSet<String>()
                        for (String attributeName : structs.get(name)) {
                                attributes.add(attributeName.toLowerCase())
                        }
                        attributes.add("n") // every structure declares "n", appended last when not scanned

                        // <name>:<depth>+<attribute>+<attribute>...
                        sargs.add(name + ":" + structsProf.get(name) + "+" + attributes.join("+"))
                }

                sargs.add("text:0+" + textAttributes.join("+"))
                sargs.add("txmcorpus:0+lang")

                sargs.sort()

                String[] sAttributes = sargs
                println " Word properties: "+pAttributes
                println " Structures: "+sargs
                File allcqpFile = new File(cqpDirectory, "all.cqp");
                allcqpFile.delete() // remove the file of a previous run
                try {
                        // NOTE: when one of the two calls below returns false, all.cqp is left in place
                        if (!CwbEncode.concat(cqpFiles, allcqpFile)) {
                                println "Fail to write the master cqp file: "+allcqpFile
                                return false;
                        }

                        if (!cwbEn.run(outputDirectory.getAbsolutePath() + "/$corpusname",
                                        allcqpFile.getAbsolutePath(),
                                        regPath, pAttributes, sAttributes, false)) {
                                println "** cwb-encode did not ends well. Activate finer logs to see details."
                                return false;
                        }

                        allcqpFile.delete(); // clean
                } catch (Exception e) {
                        println "Error while running cwb-encode: "+e
                        e.printStackTrace()
                        allcqpFile.delete(); // clean
                        return false;
                }
                println ""
                return true;
        }
246 321 mdecorde
247 321 mdecorde
        /**
         * Runs cwb-makeall on the encoded corpus, then runs FixMilestoneDeclarations on the
         * registry file and the corpus data directory.
         *
         * @return true when both operations succeeded; false when the registry file is missing,
         *         when one of them fails or when an exception is raised
         */
        public boolean doCWBMakeAllStep() {
                println "-- Running cwb-makeall..."
                try {
                        CwbMakeAll cwbMa = new CwbMakeAll();
                        cwbMa.setDebug(debug);

                        File registryFile = new File(regPath)
                        if (!registryFile.exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        // cwb-makeall is given the directory that contains the registry file
                        if (!cwbMa.run(corpusname, registryFile.getParent())) {
                                println "** cwb-makeall did not ends well. Activate finer logs to see details."
                                return false;
                        }

                        // remove milestones from CWB registry and data files
                        FixMilestoneDeclarations fm = new FixMilestoneDeclarations(
                                registryFile, new File(outputDirectory.getAbsolutePath(), corpusname));
                        if (!fm.process()) {
                                println "Fail to verify&fix milestone declarations"
                                return false
                        }
                } catch (Exception e) {
                        println "Error while running cwb-makeall: "+e
                        e.printStackTrace() // same diagnostics as the scan and encode steps
                        return false;
                }
                return true;
        }
275 321 mdecorde
}