| 3 |
3 |
import java.io.File;
|
| 4 |
4 |
|
| 5 |
5 |
import org.txm.*
|
| 6 |
|
import org.txm.scripts.importer.SAttributesListener
|
|
6 |
import org.txm.importer.SAttributesListener
|
| 7 |
7 |
import org.txm.importer.cwb.*
|
| 8 |
8 |
import org.txm.utils.ConsoleProgressBar
|
| 9 |
9 |
import org.txm.utils.logger.Log
|
| ... | ... | |
| 19 |
19 |
class XTZCompiler extends Compiler {
|
| 20 |
20 |
|
| 21 |
21 |
SAttributesListener sattrsListener; // store scanned structures
|
| 22 |
|
private def anatypes = new HashSet<String>() // store scanned word attributes
|
| 23 |
22 |
|
| 24 |
23 |
String regPath;
|
| 25 |
24 |
String corpusname;
|
| ... | ... | |
| 40 |
39 |
doNormalizeAnaValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEANAVALUES, "false"))
|
| 41 |
40 |
}
|
| 42 |
41 |
|
|
42 |
/**
|
|
43 |
* the list of Texts built from the ordered text IDs
|
|
44 |
*/
|
|
45 |
def texts;
|
|
46 |
/**
|
|
47 |
* the list of Texts to process (those that are dirty or newer than their CQP files)
|
|
48 |
*/
|
|
49 |
def textsToProcess;
|
| 43 |
50 |
@Override
|
| 44 |
51 |
public void process(List<String> orderedTextIDs) {
|
| 45 |
52 |
super.process(orderedTextIDs); // set member
|
| ... | ... | |
| 47 |
54 |
if (orderedTextIDs == null) orderedTextIDs = module.getProject().getTextsID() ;
|
| 48 |
55 |
|
| 49 |
56 |
Project project = module.getProject();
|
|
57 |
|
|
58 |
texts = orderedTextIDs.collect() { id -> module.getProject().getText(id) }
|
|
59 |
textsToProcess = getTextsToProcess(texts)
|
|
60 |
|
|
61 |
// get all structures
|
|
62 |
sattrsListener = new SAttributesListener() // will store the structure and properties declaration
|
|
63 |
sattrsListener.W = wtag
|
|
64 |
|
|
65 |
// File regFile = new File(regPath) // The properties recovery must be done using each Texts property declarations
|
|
66 |
// if (project.getDoUpdate() && regFile.exists() ) { // this optimisation must be done before clearing the corpus files
|
|
67 |
// println "Recovering structures&properties declaration from previous import registry file $regFile..."
|
|
68 |
// ReadRegistryFile rrf = new ReadRegistryFile(regFile);
|
|
69 |
//
|
|
70 |
// sattrsListener.initialize(rrf.getPAttributes(), rrf.getSAttributesMap(), rrf.getSAttributesProfs())
|
|
71 |
//
|
|
72 |
// System.out.println(" pAttributes: "+sattrsListener.getAnatypes());
|
|
73 |
// System.out.println(" sAttributes: "+sattrsListener.getStructs());
|
|
74 |
// }
|
|
75 |
|
| 50 |
76 |
CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
|
| 51 |
77 |
if (corpus != null) {
|
| 52 |
78 |
if (project.getDoUpdate()) {
|
| ... | ... | |
| 77 |
103 |
* Scan all XML-TXM files to find out structures and word properties
|
| 78 |
104 |
*/
|
| 79 |
105 |
public boolean doScanStep() { // returns false as soon as one text fails to be scanned, true otherwise
|
| 80 |
|
// get all anatypes
|
| 81 |
|
sattrsListener = SAttributesListener.scanFiles(inputDirectory, wtag)
|
| 82 |
|
def texts = module.getProject().getTexts()
|
|
106 |
|
|
107 |
|
| 83 |
108 |
println "-- Scanning structures&properties to create for "+texts.size()+" texts..."
|
|
109 |
|
|
110 |
// def initialTypesValues = new HashSet<String>()
|
|
111 |
// initialTypesValues.addAll(sattrsListener.getAnatypes())
|
|
112 |
|
|
113 |
// get all word properties
|
| 84 |
114 |
ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size())
|
| 85 |
115 |
for (Text t : texts) {
|
| 86 |
116 |
try {
|
| 87 |
117 |
cpb.tick();
|
| 88 |
|
getAnaTypes(t.getXMLTXMFile())
|
|
118 |
sattrsListener.scanFile(t.getXMLTXMFile()); // results are accumulated in sattrsListener
|
|
119 |
// println "LISTENER RESULT with ${xmlFile.getName()}: "+listener
|
|
120 |
// println " prof: "+listener.getStructs()
|
|
121 |
// println " prof: "+listener.getProfs()
|
|
122 |
// println " path: "+listener.structPath
|
| 89 |
123 |
} catch (Exception e) {
|
| 90 |
124 |
println "Error while processing $t text XML-TXM file : "+t.getSource()+". Error: "+e
|
| 91 |
125 |
e.printStackTrace();
|
| 92 |
126 |
return false; // abort the scan at the first failing text
|
| 93 |
127 |
}
|
| 94 |
128 |
}
|
|
129 |
|
|
130 |
// if (initialTypesValues != sattrsListener.getAnatypes()) { // the word properties changed all CQP files must be recreated
|
|
131 |
// textsToProcess.clear()
|
|
132 |
// textsToProcess.addAll(texts)
|
|
133 |
// }
|
|
134 |
|
| 95 |
135 |
println "" // NOTE(review): presumably terminates the progress bar output line - confirm
|
| 96 |
136 |
return true;
|
| 97 |
137 |
}
|
| 98 |
138 |
|
| 99 |
|
private void getAnaTypes(File xmlFile) { // collects into 'anatypes' the @type values of the "ana" elements found inside wtag elements of xmlFile
|
| 100 |
|
def inputData = xmlFile.toURI().toURL().openStream(); // NOTE(review): not closed if parsing throws below (no try/finally)
|
| 101 |
|
def factory = XMLInputFactory.newInstance();
|
| 102 |
|
def parser = factory.createXMLStreamReader(inputData); // StAX pull parser reading the opened stream
|
| 103 |
|
boolean start = false; // true while the parser is inside a wtag element
|
| 104 |
|
String ANA = "ana"
|
| 105 |
|
String TYPE = "type"
|
| 106 |
|
for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
|
| 107 |
|
if (event == XMLStreamConstants.START_ELEMENT) { // start elem
|
| 108 |
|
if (wtag.equals(parser.getLocalName())) {
|
| 109 |
|
start = true;
|
| 110 |
|
} else if (start && ANA.equals(parser.getLocalName())) { // ana elem inside a wtag element
|
| 111 |
|
for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
|
| 112 |
|
if (TYPE.equals(parser.getAttributeLocalName(i))) { // @type
|
| 113 |
|
anatypes.add(parser.getAttributeValue(i).substring(1)); // drops the first character (expected to be '#'); NOTE(review): done unconditionally - confirm every @type value starts with '#'
|
| 114 |
|
break;
|
| 115 |
|
}
|
| 116 |
|
}
|
| 117 |
|
}
|
| 118 |
|
} else if (event == XMLStreamConstants.END_ELEMENT) { // end elem
|
| 119 |
|
if (wtag.equals(parser.getLocalName())) {
|
| 120 |
|
start = false;
|
| 121 |
|
}
|
| 122 |
|
}
|
| 123 |
|
}
|
| 124 |
|
|
| 125 |
|
if (parser != null) parser.close(); // release the parser, then the underlying stream
|
| 126 |
|
if (inputData != null) inputData.close();
|
| 127 |
|
|
| 128 |
|
// for (String type : types)
|
| 129 |
|
// if (!anatypes.contains(type))
|
| 130 |
|
// anatypes << type
|
| 131 |
|
}
|
| 132 |
|
|
| 133 |
|
def cqpFiles = [] // ordered cqp files to concat before calling cwb-encode
|
| 134 |
|
int cqpFilesUpdated = 0;
|
| 135 |
|
public boolean doCQPStep() {
|
| 136 |
|
|
| 137 |
|
cqpDirectory.mkdir(); // if not created
|
| 138 |
|
|
| 139 |
|
def texts = orderedTextIDs.collect() { id -> module.getProject().getText(id) }
|
|
139 |
def getTextsToProcess(def texts) {
|
| 140 |
140 |
def textsToProcess = texts.findAll() { text ->
|
| 141 |
141 |
File xmlFile = text.getXMLTXMFile()
|
| 142 |
142 |
String textname = text.getName()
|
| ... | ... | |
| 156 |
156 |
|
| 157 |
157 |
return true
|
| 158 |
158 |
}
|
|
159 |
|
|
160 |
return textsToProcess
|
|
161 |
}
|
|
162 |
|
|
163 |
def cqpFiles = [] // ordered cqp files to concat before calling cwb-encode
|
|
164 |
int cqpFilesUpdated = 0;
|
|
165 |
public boolean doCQPStep() {
|
|
166 |
|
|
167 |
cqpDirectory.mkdir(); // if not created
|
|
168 |
|
| 159 |
169 |
println "-- Building CQP files ${textsToProcess.size()}/${texts.size()}..."
|
| 160 |
170 |
|
| 161 |
171 |
ConsoleProgressBar cpb = new ConsoleProgressBar(textsToProcess.size())
|
| ... | ... | |
| 170 |
180 |
|
| 171 |
181 |
cqpFilesUpdated++
|
| 172 |
182 |
|
| 173 |
|
XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", anatypes, wtag)
|
|
183 |
XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", sattrsListener.getAnatypes(), wtag)
|
| 174 |
184 |
step.setNormalizeAnaValues(doNormalizeAnaValues)
|
| 175 |
185 |
step.setNormalizeAttributeValues(doNormalizeAttributeValues)
|
| 176 |
186 |
if (!step.process()) {
|
| ... | ... | |
| 187 |
197 |
CwbEncode cwbEn = new CwbEncode()
|
| 188 |
198 |
cwbEn.setDebug(debug)
|
| 189 |
199 |
|
| 190 |
|
List<String> pargs = []
|
| 191 |
|
pargs.add("id")
|
| 192 |
|
for (String ana : anatypes) {
|
|
200 |
List<String> pargs = ["id"]
|
|
201 |
for (String ana : sattrsListener.getAnatypes()) {
|
|
202 |
if (ana == "word") continue; // no need to add it, cwb declares it automatically
|
|
203 |
if (ana == "id") continue; // no need to be added, we did it already
|
| 193 |
204 |
pargs.add(ana)
|
| 194 |
205 |
}
|
| 195 |
|
|
|
206 |
|
| 196 |
207 |
String[] pAttrs = pargs
|
| 197 |
208 |
|
| 198 |
209 |
def structs = sattrsListener.getStructs()
|
| ... | ... | |
| 206 |
217 |
List<String> sargs = new ArrayList<String>()
|
| 207 |
218 |
def tmpTextAttrs = []
|
| 208 |
219 |
for (String name : structs.keySet()) {
|
|
220 |
if (name == "txmcorpus") continue;
|
|
221 |
|
| 209 |
222 |
if (name == "text") {
|
| 210 |
223 |
for (String value : structs.get(name)) // append the attributes
|
| 211 |
224 |
tmpTextAttrs << value // added after
|
| ... | ... | |
| 256 |
269 |
return false;
|
| 257 |
270 |
}
|
| 258 |
271 |
|
|
272 |
new File(regPath).delete()// ensure the registry file is deleted
|
|
273 |
|
| 259 |
274 |
if (!cwbEn.run(outputDirectory.getAbsolutePath() + "/$corpusname",
|
| 260 |
275 |
allcqpFile.getAbsolutePath(), regPath, pAttributes, sAttributes, false)) {
|
| 261 |
|
println "** cwb-encode did not ends well. Activate finer logs to see details."
|
|
276 |
println "** cwb-encode did not ends well. Please activate a finer log level to see more details."
|
| 262 |
277 |
return false;
|
| 263 |
278 |
}
|
| 264 |
279 |
|