Revision 2246 tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZCompiler.groovy

XTZCompiler.groovy (revision 2246)
6 6
import org.txm.scripts.importer.SAttributesListener
7 7
import org.txm.importer.cwb.*
8 8
import org.txm.utils.ConsoleProgressBar
9
import org.txm.utils.logger.Log
9 10
import org.txm.core.preferences.TXMPreferences
10 11
import org.txm.core.preferences.TBXPreferences
11 12
import org.txm.libs.cqp.CQPLibPreferences
......
19 20

  
20 21
	SAttributesListener sattrsListener; // store scanned structures
21 22
	private def anatypes = new HashSet<String>() // store scanned word attributes
22
	
23

  
23 24
	String regPath;
24 25
	String corpusname;
25 26
	String wtag;
26
	
27

  
27 28
	boolean doNormalizeAttributeValues = false;
28 29
	boolean doNormalizeAnaValues = true;
29
	
30

  
30 31
	public XTZCompiler(ImportModule module) {
31 32
		super(module);
32 33

  
33 34
		corpusname = module.getProject().getName();
34 35
		regPath = module.getBinaryDirectory().getAbsolutePath() + "/registry/"+corpusname.toLowerCase()
35
		
36

  
36 37
		wtag = module.getProject().getTokenizerWordElement();
37
		
38

  
38 39
		doNormalizeAttributeValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEATTRIBUTEVALUES, "false"))
39 40
		doNormalizeAnaValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEANAVALUES, "false"))
40 41
	}
41 42

  
42
	public void process(ArrayList<File> files) {
43
		super.process(files); // set member
44
		
45
		if (files == null) files = inputDirectory.listFiles();
46
		
43
	@Override
44
	public void process(List<String> orderedTextIDs) {
45
		super.process(orderedTextIDs); // set member
46

  
47
		if (orderedTextIDs == null) orderedTextIDs = module.getProject().getTextsID() ;
48

  
47 49
		Project project = module.getProject();
48 50
		CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
49 51
		if (corpus != null) {
......
58 60
			corpus.setName(project.getName());
59 61
		}
60 62
		corpus.setDescription("Built with the XTZ import module");
61
		
63

  
62 64
		if (!doScanStep()) return;
63 65
		if (!doCQPStep()) return;
64 66
		if (!doCWBEncodeStep()) return;
65 67
		if (!doCWBMakeAllStep()) return;
66
		
67
		if (module.getProject().getCleanAfterBuild()) {
68

  
69
		if (module.getProject().getCleanAfterBuild() && !module.getProject().getDoUpdate()) {
68 70
			new File(module.getBinaryDirectory(), "cqp").deleteDir()
69 71
		}
70
		
72

  
71 73
		isSuccessFul = true;
72 74
	}
73 75

  
......
75 77
	 * Scan all XML-TXM files to find out structures and word properties
76 78
	 */
77 79
	public boolean doScanStep() {
78
		// get all anatypes		
80
		// get all anatypes
79 81
		sattrsListener = SAttributesListener.scanFiles(inputDirectory, wtag)
80
		println "-- Listing structures&properties to create for "+files.size()+" XML-TXM files..."
81
		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
82
		for (File f : files) {
82
		def texts = module.getProject().getTexts()
83
		println "-- Listing structures&properties to create for "+texts.size()+" texts..."
84
		ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size())
85
		for (Text t : texts) {
83 86
			try {
84 87
				cpb.tick();
85
				getAnaTypes(f)
88
				getAnaTypes(t.getXMLTXMFile())
86 89
			} catch (Exception e) {
87
				println "Error while processing $f text: "+e
90
				println "Error while processing $t text XML-TXM file : "+t.getSource()+". Error: "+e
88 91
				e.printStackTrace();
89 92
				return false;
90 93
			}
......
92 95
		println ""
93 96
		return true;
94 97
	}
95
	
98

  
96 99
	private void getAnaTypes(File xmlFile) {
97 100
		def inputData = xmlFile.toURI().toURL().openStream();
98 101
		def factory = XMLInputFactory.newInstance();
......
115 118
			} else if (event == XMLStreamConstants.END_ELEMENT) { // end elem
116 119
				if (wtag.equals(parser.getLocalName())) {
117 120
					start = false;
118
				}	
121
				}
119 122
			}
120 123
		}
121
		
124

  
122 125
		if (parser != null) parser.close();
123 126
		if (inputData != null) inputData.close();
124
		
125
//		for (String type : types)
126
//			if (!anatypes.contains(type))
127
//				anatypes << type
127

  
128
		//		for (String type : types)
129
		//			if (!anatypes.contains(type))
130
		//				anatypes << type
128 131
	}
129 132

  
130 133
	def cqpFiles = []
134
	int cqpFilesUpdated = 0;
131 135
	public boolean doCQPStep() {
132 136
		println "-- Building CQP files $inputDirectory..."
133 137
		cqpDirectory.mkdir(); // if not created
134
		
135
		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
136
		for (File xmlFile : files) {
138

  
139
		def texts = module.getProject().getTexts()
140
		ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size())
141
		cqpFilesUpdated = 0;
142
		for (Text text : texts) {
137 143
			cpb.tick();
138
			String textname = xmlFile.getName();
139
			int idx = textname.lastIndexOf(".")
140
			if (idx > 0) textname = textname.substring(0, idx)
141
			
144

  
145
			File xmlFile = text.getXMLTXMFile()
146
			String textname = text.getName()
147

  
142 148
			File cqpFile = new File(cqpDirectory, textname + ".cqp")
143 149
			cqpFiles << cqpFile
144
			// skip step if cqpFile is more recent than xmlFile
145
			if (cqpFile.exists() && cqpFile.lastModified() >= xmlFile.lastModified()) continue;
150

  
151
			// skip step if cqpFile exists AND is more recent than the XML-TXM File
152
			boolean mustBuild = false;
153
			if (!cqpFile.exists() || xmlFile.lastModified() >= cqpFile.lastModified()) {
154
				mustBuild = true
155
			}
156

  
157
			if (!text.isDirty() && !mustBuild) {
158
				Log.finer("skipping .cqp step of $text");
159
				continue
160
			}
146 161
			
162
			cqpFilesUpdated++
163

  
147 164
			XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", anatypes, wtag)
148 165
			step.setNormalizeAnaValues(doNormalizeAnaValues)
149 166
			step.setNormalizeAttributeValues(doNormalizeAttributeValues)
......
160 177
		println "-- Running cwb-encode..."
161 178
		CwbEncode cwbEn = new CwbEncode()
162 179
		cwbEn.setDebug(debug)
163
		
180

  
164 181
		List<String> pargs = []
165 182
		pargs.add("id")
166 183
		for (String ana : anatypes)
......
175 192
			println structs
176 193
			println structsProf
177 194
		}
178
		
195

  
179 196
		List<String> sargs = new ArrayList<String>()
180 197
		def tmpTextAttrs = []
181 198
		for (String name : structs.keySet()) {
......
188 205
			String concat = name+":"+structsProf.get(name); // append the depth
189 206
			for (String attributeName : structs.get(name)) // append the attributes
190 207
				concat += "+"+attributeName.toLowerCase();
191
			
208

  
192 209
			if (structs.get(name).size() == 0) {
193 210
				concat += "+n";
194 211
			} else {
195 212
				if (!structs.get(name).contains("n"))
196 213
					concat += "+n"
197 214
			}
198
				
215

  
199 216
			if ((name == "p" || name == "body" || name == "back" || name == "front")
200
				 && !concat.contains("+n+") && !concat.endsWith("+n"))
217
			&& !concat.contains("+n+") && !concat.endsWith("+n"))
201 218
				concat += "+n"
202
				
219

  
203 220
			sargs.add(concat)
204 221
		}
205 222

  
......
225 242
				println "Fail to write the master cqp file: "+allcqpFile
226 243
				return false;
227 244
			}
228
			
245

  
229 246
			if (!cwbEn.run(outputDirectory.getAbsolutePath() + "/$corpusname",
230
					allcqpFile.getAbsolutePath(),
231
					regPath, pAttributes, sAttributes, false)) {
247
			allcqpFile.getAbsolutePath(),
248
			regPath, pAttributes, sAttributes, false)) {
232 249
				println "** cwb-encode did not ends well. Activate finer logs to see details."
233 250
				return false;
234 251
			}
235
			
252

  
236 253
			allcqpFile.delete(); // clean
237 254
		} catch (Exception e) {
238 255
			println "Error while running cwb-encode: "+e
239 256
			e.printStackTrace()
240
			allcqpFile.delete(); // clean 
257
			allcqpFile.delete(); // clean
241 258
			return false;
242 259
		}
243 260
		println ""
......
258 275
				println "** cwb-makeall did not ends well. Activate finer logs to see details."
259 276
				return false;
260 277
			}
261
			
278

  
262 279
			// remove milestones from CWB registry and data files
263 280
			FixMilestoneDeclarations fm = new FixMilestoneDeclarations(
264
				new File(regPath), new File(outputDirectory.getAbsolutePath(), corpusname));
281
					new File(regPath), new File(outputDirectory.getAbsolutePath(), corpusname));
265 282
			if (!fm.process()) {
266 283
				println "Fail to verify&fix milestone declarations"
267 284
				return false

Also available in: Unified diff