Révision 2876

tmp/org.txm.core/src/java/org/txm/importer/xtz/Compiler.java (revision 2876)
1 1
package org.txm.importer.xtz;
2 2

  
3 3
import java.io.File;
4
import java.util.ArrayList;
5 4
import java.util.List;
6 5

  
7 6
import org.txm.utils.DeleteDir;
......
12 11
 * @author mdecorde
13 12
 *
14 13
 */
15
public class Compiler extends ImportStep {
14
public abstract class Compiler extends ImportStep {
16 15

  
17 16
	protected File cqpDirectory, registryDirectory, dataDirectory;
18 17
	protected List<String> orderedTextIDs;
19
	
18

  
20 19
	/**
21 20
	 * Creates the output directories
22 21
	 * 
......
31 30
		dataDirectory = new File(outputDirectory, module.getCorpusName());
32 31
		registryDirectory = new File(module.getBinaryDirectory(), "registry");
33 32

  
34
		DeleteDir.deleteDirectory(outputDirectory);
35
		outputDirectory.mkdirs();
36
		dataDirectory.mkdirs();
37
		
38
		DeleteDir.deleteDirectory(registryDirectory);
39
		registryDirectory.mkdirs();
40
		
41 33
		if (!module.isUpdatingCorpus()) {
42 34
			DeleteDir.deleteDirectory(cqpDirectory);
43 35
		} 
......
48 40
	public void cancel() {
49 41
		// TODO Auto-generated method stub
50 42
	}
43
	
44
	/**
45
	 * Implement this method to build the CQP files, the CQP data and the registry file.
46
	 */
47
	protected abstract void _process() ;
51 48

  
49
	/**
50
	 * called by ImportModule
51
	 * 
52
	 * Takes no parameter: delegates to process(null), i.e. no text order is imposed.
53
	 */
52 54
	@Override
53 55
	public void process() {
54
		process(null); // no default files order set
56
		process(null);
55 57
	}
56 58
	
59
	/**
60
	 * called by ImportModule
61
	 * 
62
	 * @param orderedTextIDs the text IDs, in the order to process them; null when no order is imposed (see process())
63
	 */
57 64
	public void process(List<String> orderedTextIDs) {
58 65
		this.orderedTextIDs = orderedTextIDs;
66
		_process();
59 67
	}
60 68
}
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZCompiler.groovy (revision 2876)
6 6
import org.txm.importer.SAttributesListener
7 7
import org.txm.importer.cwb.*
8 8
import org.txm.utils.ConsoleProgressBar
9
import org.txm.utils.DeleteDir
9 10
import org.txm.utils.logger.Log
10 11
import org.txm.core.preferences.TXMPreferences
11 12
import org.txm.core.preferences.TBXPreferences
......
47 48
	 * the list of Texts to process (dirty or newer than their CQP files)
48 49
	 */
49 50
	def textsToProcess;
51
	def initialTypesValues;
50 52
	@Override
51
	public void process(List<String> orderedTextIDs) {
52
		super.process(orderedTextIDs); // set member
53
	public void _process() {
53 54

  
54 55
		if (orderedTextIDs == null) orderedTextIDs = module.getProject().getTextsID() ;
55 56

  
56 57
		Project project = module.getProject();
57
		
58

  
58 59
		texts = orderedTextIDs.collect() { id -> module.getProject().getText(id) }
59 60
		textsToProcess = getTextsToProcess(texts)
60
		
61

  
61 62
		// get all structures
62 63
		sattrsListener = new SAttributesListener() // will store the structure and properties declaration
63 64
		sattrsListener.W = wtag
65

  
66
		File registryFile = new File(regPath)
64 67
		
65
//		File regFile = new File(regPath) // The properties recovery must be done using each Texts property declarations
66
//		if (project.getDoUpdate() && regFile.exists() ) { // this optimisation must be done before clearing the corpus files
67
//			println "Recovering structures&properties declaration from previous import registry file $regFile..."
68
//			ReadRegistryFile rrf = new ReadRegistryFile(regFile);
69
//			
70
//			sattrsListener.initialize(rrf.getPAttributes(), rrf.getSAttributesMap(), rrf.getSAttributesProfs())
71
//			
72
//			System.out.println("	pAttributes: "+sattrsListener.getAnatypes());
73
//			System.out.println("	sAttributes: "+sattrsListener.getStructs());
74
//		}
68
		initialTypesValues = new HashSet<String>()
69
		if (registryFile.exists()) {
70
			ReadRegistryFile rrf = new ReadRegistryFile(registryFile);
71
			rrf.read()
72
			initialTypesValues.addAll(rrf.getPAttributes())
73
			initialTypesValues.remove("id")
74
			initialTypesValues.remove("word")
75
		}
75 76
		
76 77
		CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
77 78
		if (corpus != null) {
......
104 105
	 */
105 106
	public boolean doScanStep() {
106 107

  
107
	
108 108
		println "-- Scanning structures&properties to create for "+texts.size()+" texts..."
109
		
110
//		def initialTypesValues = new HashSet<String>()
111
//		initialTypesValues.addAll(sattrsListener.getAnatypes())
112
		
109

  
110

  
111

  
112
		//		def initialTypesValues = new HashSet<String>()
113
		//		initialTypesValues.addAll(sattrsListener.getAnatypes())
114

  
113 115
		// get all word properties
114 116
		ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size())
115 117
		for (Text t : texts) {
......
126 128
				return false;
127 129
			}
128 130
		}
129
		
130
//		if (initialTypesValues != sattrsListener.getAnatypes()) { // the word properties changed all CQP files must be recreated
131
//			textsToProcess.clear()
132
//			textsToProcess.addAll(texts)
133
//		}
134
		
131

  
132
		if (initialTypesValues.size() == sattrsListener.getAnatypes().size()
133
			&& initialTypesValues.containsAll(sattrsListener.getAnatypes())) { // the word properties changed all CQP files must be recreated
134
			// word properties unchanged (none added, none removed): textsToProcess is left as is
135
		} else {
136
			println "New word properties detected. All CQP files need to be updated"
137
			textsToProcess.clear()
138
			textsToProcess.addAll(texts)
139
		}
140

  
135 141
		println ""
136 142
		return true;
137 143
	}
......
153 159
				Log.finer("skipping .cqp step of $text");
154 160
				return false
155 161
			}
156
			
162

  
157 163
			return true
158 164
		}
159
		
165

  
160 166
		return textsToProcess
161 167
	}
162
	
168

  
163 169
	def cqpFiles = [] // ordered cqp files to concat before calling cwb-encode
164 170
	int cqpFilesUpdated = 0;
165 171
	public boolean doCQPStep() {
166
		
172

  
167 173
		cqpDirectory.mkdir(); // if not created
168 174

  
169 175
		println "-- Building CQP files ${textsToProcess.size()}/${texts.size()}..."
170
		
176

  
171 177
		ConsoleProgressBar cpb = new ConsoleProgressBar(textsToProcess.size())
172 178
		cqpFilesUpdated = 0;
173 179
		for (Text text : textsToProcess) {
......
177 183
			String textname = text.getName()
178 184

  
179 185
			File cqpFile = new File(cqpDirectory, textname + ".cqp")
180
			
186

  
181 187
			cqpFilesUpdated++
182 188

  
183 189
			XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", sattrsListener.getAnatypes(), wtag)
......
194 200

  
195 201
	public boolean doCWBEncodeStep() {
196 202
		println "-- Running cwb-encode..."
203
		
204
		// clean directories
205
		DeleteDir.deleteDirectory(outputDirectory);
206
		outputDirectory.mkdirs();
207
		dataDirectory.mkdirs();
208

  
209
		DeleteDir.deleteDirectory(registryDirectory);
210
		registryDirectory.mkdirs();
211
		
197 212
		CwbEncode cwbEn = new CwbEncode()
198 213
		cwbEn.setDebug(debug)
199 214

  
......
203 218
			if (ana == "id") continue; // no need to be added, we did it already
204 219
			pargs.add(ana)
205 220
		}
206
			
221

  
207 222
		String[] pAttrs = pargs
208 223

  
209 224
		def structs = sattrsListener.getStructs()
......
218 233
		def tmpTextAttrs = []
219 234
		for (String name : structs.keySet()) {
220 235
			if (name == "txmcorpus") continue;
221
			
236

  
222 237
			if (name == "text") {
223 238
				for (String value : structs.get(name)) // append the attributes
224 239
					tmpTextAttrs << value // added after
......
229 244
			for (String attributeName : structs.get(name)) { // append the attributes
230 245
				concat += "+"+attributeName.toLowerCase();
231 246
			}
232
			
247

  
233 248
			if (structs.get(name).size() == 0) {
234 249
				concat += "+n";
235 250
			} else {
......
239 254
			}
240 255

  
241 256
			if ((name == "p" || name == "body" || name == "back" || name == "front")
242
				&& !concat.contains("+n+") && !concat.endsWith("+n")) {
257
			&& !concat.contains("+n+") && !concat.endsWith("+n")) {
243 258
				concat += "+n"
244 259
			}
245 260
			sargs.add(concat)
......
270 285
			}
271 286

  
272 287
			new File(regPath).delete()// ensure the registry file is deleted
273
			
288

  
274 289
			if (!cwbEn.run(outputDirectory.getAbsolutePath() + "/$corpusname",
275
				allcqpFile.getAbsolutePath(), regPath, pAttributes, sAttributes, false)) {
290
			allcqpFile.getAbsolutePath(), regPath, pAttributes, sAttributes, false)) {
276 291
				println "** cwb-encode did not ends well. Please activate a finer log level to see more details."
277 292
				return false;
278 293
			}

Formats disponibles : Unified diff