Révision 2792

tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZCompiler.groovy (revision 2792)
3 3
import java.io.File;
4 4

  
5 5
import org.txm.*
6
import org.txm.scripts.importer.SAttributesListener
6
import org.txm.importer.SAttributesListener
7 7
import org.txm.importer.cwb.*
8 8
import org.txm.utils.ConsoleProgressBar
9 9
import org.txm.utils.logger.Log
......
19 19
class XTZCompiler extends Compiler {
20 20

  
21 21
	SAttributesListener sattrsListener; // store scanned structures
22
	private def anatypes = new HashSet<String>() // store scanned word attributes
23 22

  
24 23
	String regPath;
25 24
	String corpusname;
......
40 39
		doNormalizeAnaValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEANAVALUES, "false"))
41 40
	}
42 41

  
42
	/**
43
	 * the Text list
44
	 */
45
	def texts;
46
	/**
47
	 * the list of Texts to process (dirty or newer than the cqp files)
48
	 */
49
	def textsToProcess;
43 50
	@Override
44 51
	public void process(List<String> orderedTextIDs) {
45 52
		super.process(orderedTextIDs); // set member
......
47 54
		if (orderedTextIDs == null) orderedTextIDs = module.getProject().getTextsID() ;
48 55

  
49 56
		Project project = module.getProject();
57
		
58
		texts = orderedTextIDs.collect() { id -> module.getProject().getText(id) }
59
		textsToProcess = getTextsToProcess(texts)
60
		
61
		// get all structures
62
		sattrsListener = new SAttributesListener() // will store the structure and properties declaration
63
		sattrsListener.W = wtag
64
		
65
//		File regFile = new File(regPath) // The properties recovery must be done using each Texts property declarations
66
//		if (project.getDoUpdate() && regFile.exists() ) { // this optimisation must be done before clearing the corpus files
67
//			println "Recovering structures&properties declaration from previous import registry file $regFile..."
68
//			ReadRegistryFile rrf = new ReadRegistryFile(regFile);
69
//			
70
//			sattrsListener.initialize(rrf.getPAttributes(), rrf.getSAttributesMap(), rrf.getSAttributesProfs())
71
//			
72
//			System.out.println("	pAttributes: "+sattrsListener.getAnatypes());
73
//			System.out.println("	sAttributes: "+sattrsListener.getStructs());
74
//		}
75
		
50 76
		CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
51 77
		if (corpus != null) {
52 78
			if (project.getDoUpdate()) {
......
77 103
	 * Scan all XML-TXM files to find out structures and word properties
78 104
	 */
79 105
	public boolean doScanStep() {
80
		// get all anatypes
81
		sattrsListener = SAttributesListener.scanFiles(inputDirectory, wtag)
82
		def texts = module.getProject().getTexts()
106

  
107
	
83 108
		println "-- Scanning structures&properties to create for "+texts.size()+" texts..."
109
		
110
//		def initialTypesValues = new HashSet<String>()
111
//		initialTypesValues.addAll(sattrsListener.getAnatypes())
112
		
113
		// get all word properties
84 114
		ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size())
85 115
		for (Text t : texts) {
86 116
			try {
87 117
				cpb.tick();
88
				getAnaTypes(t.getXMLTXMFile())
118
				sattrsListener.scanFile(t.getXMLTXMFile()); // results saved in 'listener' data
119
				//				println "LISTENER RESULT with ${xmlFile.getName()}: "+listener
120
				//				println " prof: "+listener.getStructs()
121
				//				println " prof: "+listener.getProfs()
122
				//				println " path: "+listener.structPath
89 123
			} catch (Exception e) {
90 124
				println "Error while processing $t text XML-TXM file : "+t.getSource()+". Error: "+e
91 125
				e.printStackTrace();
92 126
				return false;
93 127
			}
94 128
		}
129
		
130
//		if (initialTypesValues != sattrsListener.getAnatypes()) { // the word properties changed all CQP files must be recreated
131
//			textsToProcess.clear()
132
//			textsToProcess.addAll(texts)
133
//		}
134
		
95 135
		println ""
96 136
		return true;
97 137
	}
98 138

  
99
	private void getAnaTypes(File xmlFile) {
100
		def inputData = xmlFile.toURI().toURL().openStream();
101
		def factory = XMLInputFactory.newInstance();
102
		def parser = factory.createXMLStreamReader(inputData);
103
		boolean start = false;
104
		String ANA = "ana"
105
		String TYPE = "type"
106
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
107
			if (event == XMLStreamConstants.START_ELEMENT) { // start elem
108
				if (wtag.equals(parser.getLocalName())) {
109
					start = true;
110
				} else if (start && ANA.equals(parser.getLocalName())) { // ana elem
111
					for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
112
						if (TYPE.equals(parser.getAttributeLocalName(i))) { // @type
113
							anatypes.add(parser.getAttributeValue(i).substring(1)); //remove the #
114
							break;
115
						}
116
					}
117
				}
118
			} else if (event == XMLStreamConstants.END_ELEMENT) { // end elem
119
				if (wtag.equals(parser.getLocalName())) {
120
					start = false;
121
				}
122
			}
123
		}
124

  
125
		if (parser != null) parser.close();
126
		if (inputData != null) inputData.close();
127

  
128
		//		for (String type : types)
129
		//			if (!anatypes.contains(type))
130
		//				anatypes << type
131
	}
132

  
133
	def cqpFiles = [] // ordered cqp files to concat before calling cwb-encode
134
	int cqpFilesUpdated = 0;
135
	public boolean doCQPStep() {
136
		
137
		cqpDirectory.mkdir(); // if not created
138

  
139
		def texts = orderedTextIDs.collect() { id -> module.getProject().getText(id) }
139
	def getTextsToProcess(def texts) {
140 140
		def textsToProcess = texts.findAll() { text ->
141 141
			File xmlFile = text.getXMLTXMFile()
142 142
			String textname = text.getName()
......
156 156
			
157 157
			return true
158 158
		}
159
		
160
		return textsToProcess
161
	}
162
	
163
	def cqpFiles = [] // ordered cqp files to concat before calling cwb-encode
164
	int cqpFilesUpdated = 0;
165
	public boolean doCQPStep() {
166
		
167
		cqpDirectory.mkdir(); // if not created
168

  
159 169
		println "-- Building CQP files ${textsToProcess.size()}/${texts.size()}..."
160 170
		
161 171
		ConsoleProgressBar cpb = new ConsoleProgressBar(textsToProcess.size())
......
170 180
			
171 181
			cqpFilesUpdated++
172 182

  
173
			XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", anatypes, wtag)
183
			XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", sattrsListener.getAnatypes(), wtag)
174 184
			step.setNormalizeAnaValues(doNormalizeAnaValues)
175 185
			step.setNormalizeAttributeValues(doNormalizeAttributeValues)
176 186
			if (!step.process()) {
......
187 197
		CwbEncode cwbEn = new CwbEncode()
188 198
		cwbEn.setDebug(debug)
189 199

  
190
		List<String> pargs = []
191
		pargs.add("id")
192
		for (String ana : anatypes) {
200
		List<String> pargs = ["id"]
201
		for (String ana : sattrsListener.getAnatypes()) {
202
			if (ana == "word") continue; // no need to add it, cwb will declare it automatically
203
			if (ana == "id") continue; // no need to be added, we did it already
193 204
			pargs.add(ana)
194 205
		}
195
		
206
			
196 207
		String[] pAttrs = pargs
197 208

  
198 209
		def structs = sattrsListener.getStructs()
......
206 217
		List<String> sargs = new ArrayList<String>()
207 218
		def tmpTextAttrs = []
208 219
		for (String name : structs.keySet()) {
220
			if (name == "txmcorpus") continue;
221
			
209 222
			if (name == "text") {
210 223
				for (String value : structs.get(name)) // append the attributes
211 224
					tmpTextAttrs << value // added after
......
256 269
				return false;
257 270
			}
258 271

  
272
			new File(regPath).delete()// ensure the registry file is deleted
273
			
259 274
			if (!cwbEn.run(outputDirectory.getAbsolutePath() + "/$corpusname",
260 275
				allcqpFile.getAbsolutePath(), regPath, pAttributes, sAttributes, false)) {
261
				println "** cwb-encode did not ends well. Activate finer logs to see details."
276
				println "** cwb-encode did not ends well. Please activate a finer log level to see more details."
262 277
				return false;
263 278
			}
264 279

  
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xml/compiler.groovy (revision 2792)
31 31

  
32 32
package org.txm.scripts.importer.xml;
33 33

  
34

  
34 35
import java.util.ArrayList
35 36
import java.util.Collections
37
import org.txm.importer.SAttributesListener
36 38
import org.txm.importer.cwb.BuildCwbEncodeArgs
37 39
import org.txm.importer.cwb.CwbEncode
38 40
import org.txm.importer.cwb.CwbMakeAll
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/bfm/compiler.groovy (revision 2792)
43 43
//
44 44
package org.txm.scripts.importer.bfm
45 45

  
46
import org.txm.Toolbox;
46
import org.txm.Toolbox;
47
import org.txm.importer.SAttributesListener
47 48
import org.txm.importer.cwb.*
48 49
import org.txm.scripts.importer.*;
49 50
import org.txm.scripts.*;
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/tmx/compiler.groovy (revision 2792)
32 32
import org.txm.importer.cwb.CwbEncode
33 33
import org.txm.importer.cwb.CwbMakeAll
34 34
import org.txm.importer.cwb.PatchCwbRegistry;
35
import org.txm.importer.SAttributesListener
35 36
import org.txm.scripts.importer.*;
36 37
import org.txm.scripts.*;
37 38
import org.txm.importer.scripts.xmltxm.*;

Formats disponibles : Unified diff