3 |
3 |
import java.io.File;
|
4 |
4 |
|
5 |
5 |
import org.txm.*
|
6 |
|
import org.txm.scripts.importer.SAttributesListener
|
|
6 |
import org.txm.importer.SAttributesListener
|
7 |
7 |
import org.txm.importer.cwb.*
|
8 |
8 |
import org.txm.utils.ConsoleProgressBar
|
9 |
9 |
import org.txm.utils.logger.Log
|
... | ... | |
19 |
19 |
class XTZCompiler extends Compiler {
|
20 |
20 |
|
21 |
21 |
SAttributesListener sattrsListener; // store scanned structures
|
22 |
|
private def anatypes = new HashSet<String>() // store scanned word attributes
|
23 |
22 |
|
24 |
23 |
String regPath;
|
25 |
24 |
String corpusname;
|
... | ... | |
40 |
39 |
doNormalizeAnaValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEANAVALUES, "false"))
|
41 |
40 |
}
|
42 |
41 |
|
|
42 |
/**
|
|
43 |
* the Text list
|
|
44 |
*/
|
|
45 |
def texts;
|
|
46 |
/**
|
|
47 |
* the list of Texts to process (dirty or newer than the cqp files)
|
|
48 |
*/
|
|
49 |
def textsToProcess;
|
43 |
50 |
@Override
|
44 |
51 |
public void process(List<String> orderedTextIDs) {
|
45 |
52 |
super.process(orderedTextIDs); // set member
|
... | ... | |
47 |
54 |
if (orderedTextIDs == null) orderedTextIDs = module.getProject().getTextsID() ;
|
48 |
55 |
|
49 |
56 |
Project project = module.getProject();
|
|
57 |
|
|
58 |
texts = orderedTextIDs.collect() { id -> module.getProject().getText(id) }
|
|
59 |
textsToProcess = getTextsToProcess(texts)
|
|
60 |
|
|
61 |
// get all structures
|
|
62 |
sattrsListener = new SAttributesListener() // will store the structure and properties declaration
|
|
63 |
sattrsListener.W = wtag
|
|
64 |
|
|
65 |
// File regFile = new File(regPath) // The properties recovery must be done using each Texts property declarations
|
|
66 |
// if (project.getDoUpdate() && regFile.exists() ) { // this optimisation must be done before clearing the corpus files
|
|
67 |
// println "Recovering structures&properties declaration from previous import registry file $regFile..."
|
|
68 |
// ReadRegistryFile rrf = new ReadRegistryFile(regFile);
|
|
69 |
//
|
|
70 |
// sattrsListener.initialize(rrf.getPAttributes(), rrf.getSAttributesMap(), rrf.getSAttributesProfs())
|
|
71 |
//
|
|
72 |
// System.out.println(" pAttributes: "+sattrsListener.getAnatypes());
|
|
73 |
// System.out.println(" sAttributes: "+sattrsListener.getStructs());
|
|
74 |
// }
|
|
75 |
|
50 |
76 |
CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
|
51 |
77 |
if (corpus != null) {
|
52 |
78 |
if (project.getDoUpdate()) {
|
... | ... | |
77 |
103 |
* Scan all XML-TXM files to find out structures and word properties
|
78 |
104 |
*/
|
79 |
105 |
// Scan step: prints a progress header, then walks 'texts' and scans the XML-TXM file of each
// text, ticking a console progress bar once per text.
// Returns false as soon as one text raises an exception (the error and its stack trace are
// printed first); returns true after printing an empty line when every text was scanned.
// NOTE(review): this span interleaves lines of two revisions (see the single/double gutter
// numbers): scanFiles(...)/getAnaTypes(...) on one side, sattrsListener.scanFile(...) on the
// other - confirm which set of statements is the live one.
public boolean doScanStep() {
|
80 |

// get all anatypes
|
81 |

sattrsListener = SAttributesListener.scanFiles(inputDirectory, wtag)
|
82 |

def texts = module.getProject().getTexts()
|

106 |


107 |

83 |
108 |
println "-- Scanning structures&properties to create for "+texts.size()+" texts..."
|

109 |


110 |
// def initialTypesValues = new HashSet<String>()
|

111 |
// initialTypesValues.addAll(sattrsListener.getAnatypes())
|

112 |


113 |
// get all word properties
|
84 |
114 |
ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size())
|
85 |
115 |
for (Text t : texts) {
|
86 |
116 |
try {
|
87 |
117 |
cpb.tick();
|
88 |

getAnaTypes(t.getXMLTXMFile())
|

118 |
sattrsListener.scanFile(t.getXMLTXMFile()); // scan results are kept by sattrsListener (read later through getAnatypes()/getStructs())
|

119 |
// println "LISTENER RESULT with ${xmlFile.getName()}: "+listener
|

120 |
// println " prof: "+listener.getStructs()
|

121 |
// println " prof: "+listener.getProfs()
|

122 |
// println " path: "+listener.structPath
|
89 |
123 |
} catch (Exception e) {
|
90 |
124 |
println "Error while processing $t text XML-TXM file : "+t.getSource()+". Error: "+e
|
91 |
125 |
e.printStackTrace();
|
92 |
126 |
return false; // one failing text aborts the whole scan step
|
93 |
127 |
}
|
94 |
128 |
}
|

129 |


130 |
// if (initialTypesValues != sattrsListener.getAnatypes()) { // the word properties changed all CQP files must be recreated
|

131 |
// textsToProcess.clear()
|

132 |
// textsToProcess.addAll(texts)
|

133 |
// }
|

134 |

95 |
135 |
println ""
|
96 |
136 |
return true;
|
97 |
137 |
}
|
98 |
138 |
|
99 |
|
/**
 * Scans one XML-TXM file and stores in 'anatypes' the @type value of every
 * 'ana' element found inside a word element (the element named by 'wtag').
 * The leading '#' of the @type value is removed before it is stored.
 *
 * The input stream and the StAX reader are released in all cases, including
 * when the parser throws on a malformed file.
 *
 * @param xmlFile the XML-TXM file to scan
 */
private void getAnaTypes(File xmlFile) {
	String ANA = "ana"
	String TYPE = "type"

	def inputData = xmlFile.toURI().toURL().openStream()
	def parser = null
	try {
		parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData)

		boolean inWord = false // true while the cursor is between the start and end tags of a word element
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			if (event == XMLStreamConstants.START_ELEMENT) {
				if (wtag.equals(parser.getLocalName())) {
					inWord = true
				} else if (inWord && ANA.equals(parser.getLocalName())) { // ana element of the current word
					for (int i = 0; i < parser.getAttributeCount(); i++) { // find @type
						if (TYPE.equals(parser.getAttributeLocalName(i))) {
							String type = parser.getAttributeValue(i)
							// remove the '#' only when it is there: a bare value keeps its first
							// character and an empty value no longer raises an index error
							if (type.startsWith("#")) type = type.substring(1)
							if (!type.isEmpty()) anatypes.add(type)
							break
						}
					}
				}
			} else if (event == XMLStreamConstants.END_ELEMENT) {
				if (wtag.equals(parser.getLocalName())) {
					inWord = false
				}
			}
		}
	} finally {
		// closing the reader does not close the underlying stream, so both are closed here;
		// the nested finally guarantees the stream is released even if parser.close() fails
		try {
			if (parser != null) parser.close()
		} finally {
			inputData.close()
		}
	}
}
|
132 |
|
|
133 |
|
def cqpFiles = [] // ordered cqp files to concat before calling cwb-encode
|
134 |
|
int cqpFilesUpdated = 0;
|
135 |
|
public boolean doCQPStep() {
|
136 |
|
|
137 |
|
cqpDirectory.mkdir(); // if not created
|
138 |
|
|
139 |
|
def texts = orderedTextIDs.collect() { id -> module.getProject().getText(id) }
|
|
139 |
def getTextsToProcess(def texts) {
|
140 |
140 |
def textsToProcess = texts.findAll() { text ->
|
141 |
141 |
File xmlFile = text.getXMLTXMFile()
|
142 |
142 |
String textname = text.getName()
|
... | ... | |
156 |
156 |
|
157 |
157 |
return true
|
158 |
158 |
}
|
|
159 |
|
|
160 |
return textsToProcess
|
|
161 |
}
|
|
162 |
|
|
163 |
def cqpFiles = [] // ordered cqp files to concat before calling cwb-encode
|
|
164 |
int cqpFilesUpdated = 0;
|
|
165 |
public boolean doCQPStep() {
|
|
166 |
|
|
167 |
cqpDirectory.mkdir(); // if not created
|
|
168 |
|
159 |
169 |
println "-- Building CQP files ${textsToProcess.size()}/${texts.size()}..."
|
160 |
170 |
|
161 |
171 |
ConsoleProgressBar cpb = new ConsoleProgressBar(textsToProcess.size())
|
... | ... | |
170 |
180 |
|
171 |
181 |
cqpFilesUpdated++
|
172 |
182 |
|
173 |
|
XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", anatypes, wtag)
|
|
183 |
XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", sattrsListener.getAnatypes(), wtag)
|
174 |
184 |
step.setNormalizeAnaValues(doNormalizeAnaValues)
|
175 |
185 |
step.setNormalizeAttributeValues(doNormalizeAttributeValues)
|
176 |
186 |
if (!step.process()) {
|
... | ... | |
187 |
197 |
CwbEncode cwbEn = new CwbEncode()
|
188 |
198 |
cwbEn.setDebug(debug)
|
189 |
199 |
|
190 |
|
List<String> pargs = []
|
191 |
|
pargs.add("id")
|
192 |
|
for (String ana : anatypes) {
|
|
200 |
List<String> pargs = ["id"]
|
|
201 |
for (String ana : sattrsListener.getAnatypes()) {
|
|
202 |
if (ana == "word") continue; // no need to be added, cwb will declared it automatically
|
|
203 |
if (ana == "id") continue; // no need to be added, we did it already
|
193 |
204 |
pargs.add(ana)
|
194 |
205 |
}
|
195 |
|
|
|
206 |
|
196 |
207 |
String[] pAttrs = pargs
|
197 |
208 |
|
198 |
209 |
def structs = sattrsListener.getStructs()
|
... | ... | |
206 |
217 |
List<String> sargs = new ArrayList<String>()
|
207 |
218 |
def tmpTextAttrs = []
|
208 |
219 |
for (String name : structs.keySet()) {
|
|
220 |
if (name == "txmcorpus") continue;
|
|
221 |
|
209 |
222 |
if (name == "text") {
|
210 |
223 |
for (String value : structs.get(name)) // append the attributes
|
211 |
224 |
tmpTextAttrs << value // added after
|
... | ... | |
256 |
269 |
return false;
|
257 |
270 |
}
|
258 |
271 |
|
|
272 |
new File(regPath).delete()// ensure the registry file is deleted
|
|
273 |
|
259 |
274 |
if (!cwbEn.run(outputDirectory.getAbsolutePath() + "/$corpusname",
|
260 |
275 |
allcqpFile.getAbsolutePath(), regPath, pAttributes, sAttributes, false)) {
|
261 |
|
println "** cwb-encode did not ends well. Activate finer logs to see details."
|
|
276 |
println "** cwb-encode did not ends well. Please activate a finer log level to see more details."
|
262 |
277 |
return false;
|
263 |
278 |
}
|
264 |
279 |
|