root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZCompiler.groovy @ 2126
History | View | Annotate | Download (8.1 kB)
1 | 986 | mdecorde | package org.txm.scripts.importer.xtz
|
---|---|---|---|
2 | 321 | mdecorde | |
3 | 321 | mdecorde | import java.io.File; |
4 | 321 | mdecorde | |
5 | 321 | mdecorde | import org.txm.* |
6 | 986 | mdecorde | import org.txm.scripts.importer.SAttributesListener |
7 | 1000 | mdecorde | import org.txm.importer.cwb.* |
8 | 1613 | mdecorde | import org.txm.utils.ConsoleProgressBar |
9 | 714 | mdecorde | import org.txm.core.preferences.TXMPreferences |
10 | 1137 | mdecorde | import org.txm.core.preferences.TBXPreferences |
11 | 714 | mdecorde | import org.txm.libs.cqp.CQPLibPreferences |
12 | 321 | mdecorde | |
13 | 321 | mdecorde | import javax.xml.stream.* |
14 | 1000 | mdecorde | import org.txm.importer.xtz.* |
15 | 1115 | mdecorde | import org.txm.objects.* |
16 | 1115 | mdecorde | import org.txm.searchengine.cqp.corpus.* |
17 | 321 | mdecorde | |
/**
 * Compiler of the XTZ import module.
 *
 * Runs four steps in order: scan the XML-TXM files for structures and word
 * properties, write one CQP file per text, run cwb-encode on the concatenated
 * CQP files, then run cwb-makeall and fix the milestone declarations.
 */
class XTZCompiler extends Compiler {

    /** Structures and structure attributes found by the scan step. */
    SAttributesListener sattrsListener

    /** Word property names collected from the "type" attribute of the "ana" elements. */
    private def anatypes = new HashSet<String>()

    /** Path of the registry file: binary directory + "/registry/" + lower-cased corpus name. */
    String regPath

    /** Name of the project, used as the corpus name. */
    String corpusname

    /** Local name of the XML element that holds a word. */
    String wtag

    boolean doNormalizeAttributeValues = false
    boolean doNormalizeAnaValues = true

    /** CQP files of the current run, in processing order; concatenated by the cwb-encode step. */
    def cqpFiles = []

    /**
     * Reads the corpus name, registry path, word element name and the two
     * normalisation preferences from the project of the given module.
     *
     * @param module the import module this compiler belongs to
     */
    public XTZCompiler(ImportModule module) {
        super(module)

        def project = module.getProject()
        corpusname = project.getName()
        regPath = module.getBinaryDirectory().getAbsolutePath() + "/registry/" + corpusname.toLowerCase()

        wtag = project.getTokenizerWordElement()

        // NOTE(review): both preferences default to "false" here although the
        // doNormalizeAnaValues field is initialised to true -- confirm which default is intended.
        def importPreferences = project.getPreferencesScope().getNode("import")
        doNormalizeAttributeValues = "true".equals(importPreferences.get(TBXPreferences.NORMALISEATTRIBUTEVALUES, "false"))
        doNormalizeAnaValues = "true".equals(importPreferences.get(TBXPreferences.NORMALISEANAVALUES, "false"))
    }

    /**
     * Prepares the main corpus of the project and runs the compilation steps.
     * The success flag is set only when every step succeeded.
     *
     * @param files the XML-TXM files to compile; when null, the content of the
     *              input directory is used
     */
    public void process(ArrayList<File> files) {
        super.process(files) // set member

        // The parameter shadows the inherited 'files' member read by the steps below,
        // so the fallback has to be stored in the member: reassigning the parameter
        // would not be seen by doScanStep()/doCQPStep().
        if (this.files == null) {
            if (files != null) {
                this.files = files
            } else {
                File[] listed = inputDirectory.listFiles() // null when the directory cannot be listed
                this.files = (listed == null) ? new ArrayList<File>() : new ArrayList<File>(Arrays.asList(listed))
            }
        }

        Project project = module.getProject()
        CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class)
        if (corpus == null) {
            corpus = new MainCorpus(project)
            corpus.setID(project.getName())
            corpus.setName(project.getName())
        } else if (project.getDoUpdate()) {
            corpus.clean() // remove old files
        } else {
            corpus.delete() // remove old files and TXMResult children
        }
        corpus.setDescription("Built with the XTZ import module")

        if (!doScanStep()) return
        if (!doCQPStep()) return
        if (!doCWBEncodeStep()) return
        if (!doCWBMakeAllStep()) return

        if (module.getProject().getCleanAfterBuild()) {
            new File(module.getBinaryDirectory(), "cqp").deleteDir()
        }

        isSuccessFul = true
    }

    /**
     * Scan all XML-TXM files to find out structures and word properties.
     *
     * @return false as soon as one file cannot be scanned, true otherwise
     */
    public boolean doScanStep() {
        sattrsListener = SAttributesListener.scanFiles(inputDirectory, wtag)
        println "-- Listing structures&properties to create for "+files.size()+" XML-TXM files..."
        ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
        for (File f : files) {
            try {
                cpb.tick()
                getAnaTypes(f) // get all anatypes
            } catch (Exception e) {
                println "Error while processing $f text: "+e
                e.printStackTrace()
                return false
            }
        }
        println ""
        return true
    }

    /**
     * Collects into {@code anatypes} the value of the "type" attribute of every
     * "ana" element found inside a word element of the given file.
     *
     * @param xmlFile the XML-TXM file to read
     */
    private void getAnaTypes(File xmlFile) {
        final String ANA = "ana"
        final String TYPE = "type"

        def inputData = xmlFile.toURI().toURL().openStream()
        def parser = null
        try {
            parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData)
            boolean insideWord = false
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                if (event == XMLStreamConstants.START_ELEMENT) {
                    String localName = parser.getLocalName()
                    if (wtag.equals(localName)) {
                        insideWord = true
                    } else if (insideWord && ANA.equals(localName)) {
                        for (int i = 0; i < parser.getAttributeCount(); i++) { // find @type
                            if (TYPE.equals(parser.getAttributeLocalName(i))) {
                                // drops the first character, expected to be the '#' prefix
                                anatypes.add(parser.getAttributeValue(i).substring(1))
                                break
                            }
                        }
                    }
                } else if (event == XMLStreamConstants.END_ELEMENT) {
                    if (wtag.equals(parser.getLocalName())) {
                        insideWord = false
                    }
                }
            }
        } finally {
            // release both resources even when parsing fails
            try {
                if (parser != null) parser.close()
            } finally {
                if (inputData != null) inputData.close()
            }
        }
    }

    /**
     * Writes one CQP file per XML-TXM file in the CQP directory. A file is not
     * rebuilt when its CQP file exists and is at least as recent as its source.
     *
     * @return false when a file could not be processed (the reason is stored), true otherwise
     */
    public boolean doCQPStep() {
        println "-- Building CQP files $inputDirectory..."
        cqpDirectory.mkdir() // if not created
        cqpFiles.clear() // a second run must not concatenate the same CQP files twice

        ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
        for (File xmlFile : files) {
            cpb.tick()

            // text name = file name without its last extension
            String textname = xmlFile.getName()
            int idx = textname.lastIndexOf(".")
            if (idx > 0) textname = textname.substring(0, idx)

            File cqpFile = new File(cqpDirectory, textname + ".cqp")
            cqpFiles << cqpFile

            // skip step if cqpFile is more recent than xmlFile
            if (cqpFile.exists() && cqpFile.lastModified() >= xmlFile.lastModified()) continue

            XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", anatypes, wtag)
            step.setNormalizeAnaValues(doNormalizeAnaValues)
            step.setNormalizeAttributeValues(doNormalizeAttributeValues)
            if (!step.process()) {
                reason = "Fail to process $xmlFile."
                return false
            }
        }
        println ""
        return true
    }

    /**
     * Concatenates the CQP files into "all.cqp" and runs cwb-encode on it with
     * the word properties and structures found by the scan step. The temporary
     * "all.cqp" file is removed whatever the outcome.
     *
     * @return true when cwb-encode ended well, false otherwise
     */
    public boolean doCWBEncodeStep() {
        println "-- Running cwb-encode..."
        CwbEncode cwbEn = new CwbEncode()
        cwbEn.setDebug(debug)

        // word properties: "id" first, then the scanned ana types
        List<String> pargs = []
        pargs.add("id")
        for (String ana : anatypes) {
            pargs.add(ana)
        }
        String[] pAttributes = pargs

        def structs = sattrsListener.getStructs()
        def structsProf = sattrsListener.getProfs()

        if (debug) {
            println structs
            println structsProf
        }

        List<String> sargs = new ArrayList<String>()
        // the "text" structure always declares id, base and project first
        def textAttributes = new LinkedHashSet<String>(["id", "base", "project"])
        for (String name : structs.keySet()) {
            def attributeNames = structs.get(name)
            if (name == "text") {
                for (String attributeName : attributeNames) {
                    textAttributes.add(attributeName.toLowerCase())
                }
                continue // declared after the loop
            }

            // Names are de-duplicated after lower-casing so that two attributes that
            // differ only by case are declared once; every structure declares "n".
            def declared = new LinkedHashSet<String>()
            for (String attributeName : attributeNames) {
                declared.add(attributeName.toLowerCase())
            }
            declared.add("n")

            String concat = name + ":" + structsProf.get(name) // append the depth
            for (String attributeName : declared) {
                concat += "+" + attributeName
            }
            sargs.add(concat)
        }

        String textSAttributes = "text:0"
        for (String attributeName : textAttributes) {
            textSAttributes += "+" + attributeName
        }
        sargs.add(textSAttributes)
        sargs.add("txmcorpus:0+lang")

        sargs.sort()

        String[] sAttributes = sargs
        println " Word properties: "+pAttributes
        println " Structures: "+sargs

        File allcqpFile = new File(cqpDirectory, "all.cqp")
        allcqpFile.delete()
        try {
            if (!CwbEncode.concat(cqpFiles, allcqpFile)) {
                println "Fail to write the master cqp file: "+allcqpFile
                return false
            }

            if (!cwbEn.run(outputDirectory.getAbsolutePath() + "/$corpusname",
                    allcqpFile.getAbsolutePath(),
                    regPath, pAttributes, sAttributes, false)) {
                println "** cwb-encode did not ends well. Activate finer logs to see details."
                return false
            }
        } catch (Exception e) {
            println "Error while running cwb-encode: "+e
            e.printStackTrace()
            return false
        } finally {
            allcqpFile.delete() // clean, on success and on every failure path
        }
        println ""
        return true
    }

    /**
     * Runs cwb-makeall on the corpus, then verifies and fixes the milestone
     * declarations in the registry and data files.
     *
     * @return true when both operations succeeded, false otherwise
     */
    public boolean doCWBMakeAllStep() {
        println "-- Running cwb-makeall..."
        try {
            CwbMakeAll cwbMa = new CwbMakeAll()
            cwbMa.setDebug(debug)

            File registryFile = new File(regPath)
            if (!registryFile.exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false
            }
            if (!cwbMa.run(corpusname, registryFile.getParent())) {
                println "** cwb-makeall did not ends well. Activate finer logs to see details."
                return false
            }

            // remove milestones from CWB registry and data files
            FixMilestoneDeclarations fm = new FixMilestoneDeclarations(
                    new File(regPath), new File(outputDirectory.getAbsolutePath(), corpusname))
            if (!fm.process()) {
                println "Fail to verify&fix milestone declarations"
                return false
            }
        } catch (Exception e) {
            println "Error while running cwb-makeall: "+e
            e.printStackTrace() // same diagnostics as the other steps
            return false
        }
        return true
    }
}