// Source: root/tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZCompiler.groovy @ revision 1688
1 | 986 | mdecorde | package org.txm.scripts.importer.xtz
|
---|---|---|---|
2 | 321 | mdecorde | |
3 | 321 | mdecorde | import java.io.File; |
4 | 321 | mdecorde | |
5 | 321 | mdecorde | import org.txm.* |
6 | 986 | mdecorde | import org.txm.scripts.importer.SAttributesListener |
7 | 1000 | mdecorde | import org.txm.importer.cwb.* |
8 | 1613 | mdecorde | import org.txm.utils.ConsoleProgressBar |
9 | 714 | mdecorde | import org.txm.core.preferences.TXMPreferences |
10 | 1137 | mdecorde | import org.txm.core.preferences.TBXPreferences |
11 | 714 | mdecorde | import org.txm.libs.cqp.CQPLibPreferences |
12 | 321 | mdecorde | |
13 | 321 | mdecorde | import javax.xml.stream.* |
14 | 1000 | mdecorde | import org.txm.importer.xtz.* |
15 | 1115 | mdecorde | import org.txm.objects.* |
16 | 1115 | mdecorde | import org.txm.searchengine.cqp.corpus.* |
17 | 321 | mdecorde | |
18 | 321 | mdecorde | class XTZCompiler extends Compiler { |
19 | 321 | mdecorde | |
/** Listener assigned by doScanStep(); stores the scanned structures. */
SAttributesListener sattrsListener

/** Word attribute names collected by getAnaTypes() during the scan step. */
private def anatypes = new HashSet<String>()

/** Path of the registry file: binary directory + "/registry/" + lower-cased corpus name. */
String regPath

/** Name of the corpus, taken from the project name. */
String corpusname

/** Local name of the XML element holding a word, taken from the project. */
String wtag

/** Forwarded to every XTZCompilerStep through setNormalizeAttributeValues(). */
boolean doNormalizeAttributeValues = false

/** Forwarded to every XTZCompilerStep through setNormalizeAnaValues(). */
boolean doNormalizeAnaValues = true
29 | 321 | mdecorde | |
/**
 * Creates the compiler and reads its settings from the project of the import module:
 * corpus name, registry file path, word element name and the two normalisation options.
 *
 * @param module the import module this compiler belongs to
 */
public XTZCompiler(ImportModule module) {
	super(module)

	def project = module.getProject()

	corpusname = project.getName()
	String binaryPath = module.getBinaryDirectory().getAbsolutePath()
	regPath = binaryPath + "/registry/" + corpusname.toLowerCase()

	wtag = project.getTokenizerWordElement()

	// Both options are stored as text in the "import" node: only the exact value "true" enables them.
	def importNode = project.getPreferencesScope().getNode("import")
	String attributeOption = importNode.get(TBXPreferences.NORMALISEATTRIBUTEVALUES, "false")
	String anaOption = importNode.get(TBXPreferences.NORMALISEANAVALUES, "false")
	doNormalizeAttributeValues = "true".equals(attributeOption)
	doNormalizeAnaValues = "true".equals(anaOption)
}
41 | 321 | mdecorde | |
/**
 * Runs the build steps in order (scan, CQP files, cwb-encode, cwb-makeall) and stops
 * at the first step that returns false. When every step succeeds, the previous
 * CorpusBuild registered under the corpus name (if any) is deleted, a new MainCorpus is
 * created, the "cqp" directory is removed if the project asks for cleaning, and
 * isSuccessFul is set to true.
 *
 * @param files the files to compile; when null, the content of inputDirectory is used
 */
public void process(ArrayList<File> files) {
	super.process(files) // set member

	// The steps read the 'files' member, which the parameter of the same name shadows here.
	// Assigning the fallback listing to the parameter (as before) never reached the steps,
	// so the member itself is populated, and only when super.process() left it null.
	// NOTE(review): assumes the 'files' member inherited from Compiler is writable from here - confirm.
	if (this.files == null) {
		if (files != null) {
			this.files = files
		} else {
			File[] listing = inputDirectory.listFiles()
			if (listing == null) { // not a directory, or it cannot be read
				println "Error: cannot list the files of $inputDirectory"
				return
			}
			this.files = new ArrayList<File>(Arrays.asList(listing))
		}
	}

	if (!doScanStep()) return
	if (!doCQPStep()) return
	if (!doCWBEncodeStep()) return
	if (!doCWBMakeAllStep()) return

	CorpusBuild corpus = module.getProject().getCorpusBuild(corpusname)
	if (corpus != null) {
		corpus.delete() // remove old files
	}

	// make new one
	corpus = new MainCorpus(module.getProject())
	corpus.setID(corpusname)
	corpus.setName(corpusname)
	corpus.setDescription("Built with the XTZ import module")

	if (module.getProject().getCleanAfterBuild()) {
		new File(module.getBinaryDirectory(), "cqp").deleteDir()
	}

	isSuccessFul = true
}
70 | 321 | mdecorde | |
/**
 * Scans the XML-TXM files to list the structures and the word properties to declare.
 * Structures are collected by SAttributesListener.scanFiles() over the input directory;
 * word properties are collected file by file with getAnaTypes().
 *
 * @return false as soon as one file raises an exception, true otherwise
 */
public boolean doScanStep() {
	sattrsListener = SAttributesListener.scanFiles(inputDirectory, wtag)

	int total = files.size()
	println "-- Listing structures&properties to create for "+total+" XML-TXM files..."
	ConsoleProgressBar progress = new ConsoleProgressBar(total)

	for (File source in files) {
		try {
			progress.tick()
			getAnaTypes(source) // get all anatypes
		} catch (Exception failure) {
			println "Error while processing $source text: "+failure
			failure.printStackTrace()
			return false
		}
	}

	println ""
	return true
}
92 | 321 | mdecorde | |
/**
 * Reads one XML file and records, in anatypes, the @type value of every "ana" element
 * found inside a word element (the element whose local name is wtag).
 * The first character of each value is dropped (the leading '#').
 *
 * The stream and the parser are released in a finally block, so they are also closed when
 * the parser throws; the exception itself is still propagated to the caller.
 *
 * @param xmlFile the XML file to scan
 */
private void getAnaTypes(File xmlFile) {
	final String ANA = "ana"
	final String TYPE = "type"

	def inputData = xmlFile.toURI().toURL().openStream()
	def parser = null
	try {
		parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData)

		boolean insideWord = false // true between the start and the end tag of a word element
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			if (event == XMLStreamConstants.START_ELEMENT) {
				String localName = parser.getLocalName()
				if (wtag.equals(localName)) {
					insideWord = true
				} else if (insideWord && ANA.equals(localName)) {
					for (int i = 0; i < parser.getAttributeCount(); i++) { // find @type
						if (TYPE.equals(parser.getAttributeLocalName(i))) {
							anatypes.add(parser.getAttributeValue(i).substring(1)) // remove the #
							break
						}
					}
				}
			} else if (event == XMLStreamConstants.END_ELEMENT) {
				if (wtag.equals(parser.getLocalName())) {
					insideWord = false
				}
			}
		}
	} finally {
		// closing the parser does not close the stream it reads from: both are closed explicitly
		try {
			if (parser != null) parser.close()
		} finally {
			inputData.close()
		}
	}
}
126 | 321 | mdecorde | |
/** CQP files of the current build, in the order of the source files; rebuilt by doCQPStep(). */
def cqpFiles = []

/**
 * Builds one ".cqp" file per source file in cqpDirectory, using a XTZCompilerStep.
 * A source file is skipped when its CQP file exists and is at least as recent.
 *
 * @return false when a file could not be processed (the 'reason' member is then set), true otherwise
 */
public boolean doCQPStep() {
	println "-- Building CQP files $inputDirectory..."
	cqpDirectory.mkdir() // if not created

	// Start from an empty list: without this, running the step twice on the same instance
	// registers every CQP file twice.
	cqpFiles.clear()

	ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
	for (File xmlFile : files) {
		cpb.tick()

		// text name = file name without its last extension
		String textname = xmlFile.getName()
		int idx = textname.lastIndexOf(".")
		if (idx > 0) textname = textname.substring(0, idx)

		File cqpFile = new File(cqpDirectory, textname + ".cqp")
		cqpFiles << cqpFile

		// skip step if cqpFile is more recent than xmlFile
		if (cqpFile.exists() && cqpFile.lastModified() >= xmlFile.lastModified()) continue

		XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", anatypes, wtag)
		step.setNormalizeAnaValues(doNormalizeAnaValues)
		step.setNormalizeAttributeValues(doNormalizeAttributeValues)
		if (!step.process()) {
			// Remove whatever the failed step wrote: being newer than its source, the file
			// would otherwise be considered up to date and skipped by the next run.
			cqpFile.delete()
			reason = "Fail to process $xmlFile."
			return false
		}
	}
	println ""
	return true
}
155 | 321 | mdecorde | |
/**
 * Concatenates the CQP files into "all.cqp" and runs cwb-encode on it, declaring the word
 * properties ("id" followed by the scanned ana types) and the structures found by the scan step.
 * "all.cqp" is deleted when the step ends, whatever the outcome.
 *
 * @return false when the master CQP file could not be written or when an exception is raised,
 *         true otherwise
 */
public boolean doCWBEncodeStep() {
	println "-- Running cwb-encode..."
	CwbEncode cwbEn = new CwbEncode()
	cwbEn.setDebug(debug)

	// word properties: "id" first, then the scanned ana types
	List<String> pargs = []
	pargs.add("id")
	for (String ana : anatypes) {
		pargs.add(ana)
	}
	String[] pAttributes = pargs

	def structs = sattrsListener.getStructs()
	def structsProf = sattrsListener.getProfs()
	if (debug) {
		println structs
		println structsProf
	}

	List<String> sargs = buildStructureDeclarations(structs, structsProf)
	String[] sAttributes = sargs

	println " Word properties: "+pAttributes
	println " Structures: "+sargs

	File allcqpFile = new File(cqpDirectory, "all.cqp")
	allcqpFile.delete()
	try {
		if (!CwbEncode.concat(cqpFiles, allcqpFile)) {
			println "Fail to write the master cqp file: "+allcqpFile
			return false
		}

		cwbEn.run(outputDirectory.getAbsolutePath() + "/$corpusname",
				allcqpFile.getAbsolutePath(),
				regPath, pAttributes, sAttributes, false)
	} catch (Exception e) {
		println "Error while running cwb-encode: "+e
		e.printStackTrace()
		return false
	} finally {
		allcqpFile.delete() // clean, on every exit path
	}
	println ""
	return true
}

/**
 * Builds the sorted list of structure declarations, one "name:depth+attr1+attr2..." entry
 * per structure. Attribute names are lower-cased and listed once: names differing only by
 * case no longer produce a duplicated attribute in the declaration.
 * Every structure gets an "n" attribute; "text" is declared at depth 0 with "id", "base" and
 * "project" first; a "txmcorpus:0+lang" declaration is always added.
 *
 * @param structs attribute names per structure name, as returned by sattrsListener.getStructs()
 * @param structsProf value appended after "name:" per structure name, as returned by sattrsListener.getProfs()
 * @return the sorted declarations
 */
private List<String> buildStructureDeclarations(def structs, def structsProf) {
	List<String> declarations = new ArrayList<String>()
	Set<String> textAttributes = new LinkedHashSet<String>(["id", "base", "project"])

	for (String name : structs.keySet()) {
		if (name == "text") {
			// declared after the loop, behind the three mandatory attributes
			for (String value : structs.get(name)) {
				textAttributes.add(value.toLowerCase())
			}
			continue
		}

		Set<String> attributes = new LinkedHashSet<String>()
		for (String attributeName : structs.get(name)) {
			attributes.add(attributeName.toLowerCase())
		}
		attributes.add("n") // appended last unless the structure already has it

		declarations.add(name + ":" + structsProf.get(name) + "+" + attributes.join("+"))
	}

	declarations.add("text:0+" + textAttributes.join("+"))
	declarations.add("txmcorpus:0+lang")
	declarations.sort()
	return declarations
}
240 | 321 | mdecorde | |
/**
 * Runs cwb-makeall on the corpus, then runs FixMilestoneDeclarations on the registry file
 * and the corpus data directory.
 *
 * @return false when the registry file is missing, when the milestone fix fails or when an
 *         exception is raised; true otherwise
 */
public boolean doCWBMakeAllStep() {
	println "-- Running cwb-makeall..."
	try {
		CwbMakeAll cwbMa = new CwbMakeAll()
		cwbMa.setDebug(debug)

		File registryFile = new File(regPath)
		if (!registryFile.exists()) {
			println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
			return false
		}
		// cwb-makeall is given the directory that contains the registry file
		cwbMa.run(corpusname, registryFile.getParent())

		// remove milestones from CWB registry and data files
		FixMilestoneDeclarations fm = new FixMilestoneDeclarations(
				registryFile, new File(outputDirectory.getAbsolutePath(), corpusname))
		if (!fm.process()) {
			println "Fail to verify&fix milestone declarations"
			return false
		}
	} catch (Exception e) {
		println "Error while running cwb-makeall: "+e
		e.printStackTrace() // same reporting as the scan and cwb-encode steps
		return false
	}
	return true
}
266 | 321 | mdecorde | } |