Revision 479 tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/XTZImporter.groovy

XTZImporter.groovy (revision 479)
7 7
import net.sf.saxon.style.XSLParam;
8 8

  
9 9
import org.txm.importer.ApplyXsl2;
10
import org.txm.importer.CleanFile;
10 11
import org.txm.importer.MileStoneProjection;
11 12
import org.txm.importer.ValidateXml;
12 13
import org.txm.*;
13 14

  
15
import javax.xml.parsers.SAXParser
16
import javax.xml.parsers.SAXParserFactory
14 17
import javax.xml.stream.*;
15 18

  
16 19
import org.w3c.dom.Element
20
import org.xml.sax.XMLReader
21
import org.txm.utils.xml.DomUtils;
17 22
import org.txm.metadatas.Metadatas
18 23
import org.txm.objects.BaseParameters
19 24
import org.txm.scripts.teitxm.Xml2Ana
20 25
import org.txm.stat.utils.ConsoleProgressBar
21 26
import org.txm.utils.*;
22
import org.txm.utils.io.FileCopy;
23 27

  
24 28
class XTZImporter extends Importer {
25 29

  
......
35 39
	String lang;
36 40
	String wordTag;
37 41
	boolean skipDoTokenizeStep = false;
38
	
42

  
39 43
	Metadatas metadata = null; // text metadata
40 44

  
41 45
	public XTZImporter(ImportModule module) {
42 46
		super(module);
43 47
	}
44
	
48

  
45 49
	public Metadatas getMetadata() {
46 50
		return metadata;
47 51
	}
......
92 96
		xslParams = params.getXsltParams(corpusElem);
93 97

  
94 98
		lang = corpusElem.getAttribute("lang");
95
		
99

  
96 100
		wordTag = module.getParameters().getWordElement().getTextContent()
97 101
		this.skipDoTokenizeStep = module.getParameters().getSkipTokenization()
98 102

  
......
105 109
				return;
106 110
			}
107 111
			metadata = new Metadatas(copy,
108
					Toolbox.getParam(Toolbox.METADATA_ENCODING),
109
					Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR),
110
					Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR), 1)
112
			Toolbox.getParam(Toolbox.METADATA_ENCODING),
113
			Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR),
114
			Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR), 1)
111 115
		}
112 116

  
113 117
		String cleanDirectories = module.getParameters().getKeyValueParameters().get(ImportKeys.CLEAN);
114

  
118
		
119
		if (!doFixSurrogates()) return;
115 120
		if (!doSplitMergeXSLStep()) return;
116 121
		if (!doFrontXSLStep()) return;
117 122
		if (!doCheckXMLFileStep()) return;
......
130 135
	}
131 136

  
132 137
	/**
138
	 * Read the files of the source directory and remove the surrogate characters, which are not handled well;
139
	 * save the result in $bindir/src (non-XML files are copied there unchanged)
140
	 *
141
	 * 
142
	 *
143
	 */
144
	public boolean doFixSurrogates() {
145
		File srcDirectory = new File(module.getBinaryDirectory(), "src")
146
		srcDirectory.mkdir()
147
		def files = inputDirectory.listFiles()
148
		if (files == null) {
149
			reason = "No file to process in $inputDirectory"
150
			return false;
151
		}
152

  
153
		for (File file : files) {
154
			def outputFile = new File(srcDirectory, file.getName());
155
			if (file.getName().toLowerCase().endsWith(".xml")) {
156
				CleanFile.removeSurrogateFromXmlFile(file, outputFile)
157
			} else {
158
				FileCopy.copy(file, outputFile)
159
			}
160
		}
161

  
162
		inputDirectory = srcDirectory; // the files to process are now in the "src" directory
163
		return true;
164
	}
165

  
166
	/**
133 167
	 * Read from the source directory and write the result in $bindir/txm
134 168
	 * 
135 169
	 * only one XSL is applied
......
144 178
		println "-- Split-Merge XSL Step with $xslDirectory"
145 179
		def xslFiles = xslDirectory.listFiles()
146 180
		if (xslDirectory.exists() && xslFiles != null && xslFiles.size() > 0) {
147
			
181

  
148 182
			xslFiles.sort()
149 183
			for (File xslFile : xslFiles) {
150 184
				if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;
151 185
				//if (!xslFile.getName().matches("[1-9]-.+")) continue;
152
				
153
				if (ApplyXsl2.processImportSources(xslFile, inputDirectory, srcDirectory, xslParams, true)) {
186

  
187
				if (ApplyXsl2.processImportSources(xslFile, inputDirectory.listFiles(), xslParams, false)) {
154 188
					println ""
155 189
				} else {
156 190
					reason = "Fail to apply split-merge XSL: $xslPath"
......
163 197
		}
164 198
		return true;
165 199
	}
166
	
200

  
167 201
	public boolean doEncodeMileStonesStep() {
168 202
		filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
169 203
		def milestonesString = module.getParameters().getMilestonesElement().getTextContent().trim()
170 204
		if (milestonesString.length() == 0) return true;
171
		
205

  
172 206
		def milestones = milestonesString.split(",")
173 207
		//def milestones = ["lb", "pb", "cb"]
174 208
		//println "milestones=$milestones type="+milestones.getClass()+" size="+milestones.size()
175 209
		if (milestones.size() == 0) return true;
176
		
210

  
177 211
		println "-- Encoding milestone $milestones into XML files..."
178 212
		ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
179 213
		for (def milestone : milestones) {
180 214
			milestone = milestone.trim()// just in case
181 215
			cpb.tick()
182
			
216

  
183 217
			for (File inputFile : filesToProcess) {
184 218
				File outputFile = File.createTempFile("tmp", inputFile.getName(),inputFile.getParentFile());
185
				MileStoneProjection msp = new MileStoneProjection(inputFile, "body", wordTag, milestone)
219
				println "$inputFile, 'text', $wordTag, $milestone, $outputFile"
220
				MileStoneProjection msp = new MileStoneProjection(inputFile, "text", wordTag, milestone)
186 221
				if (!msp.process(outputFile)) {
187 222
					println "Fail to encode $milestone in $inputFile"
188 223
					return false
......
212 247
		println "-- Front XSL Step with $frontXSLdirectory"
213 248
		def xslFiles = frontXSLdirectory.listFiles()
214 249
		if (frontXSLdirectory.exists() && xslFiles != null && xslFiles.size() > 0) {
215
			
250

  
216 251
			xslFiles.sort()
217 252
			for (File xslFile : xslFiles) {
218 253
				if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;
......
250 285
				if (!f.isHidden() && f.canRead() && !f.getName().endsWith(".properties")
251 286
				&& !f.getName().startsWith("metadata") && !f.isDirectory())
252 287
			{
253
				
288

  
254 289
				if (!ValidateXml.test(f)) {
255 290
					continue;
256 291
				} else {
......
289 324
			if (outSideTextTagsRegex.trim().length() > 0) {
290 325
				outSideTextTagsRegex = outSideTextTagsRegex.substring(1) // remove the first "|"
291 326
			}
292
			
327

  
293 328
			Element e2 = module.getParameters().getOutSideTextTagsAndKeepContentElement()
294 329
			def split2  = e2.getTextContent().split(",")
295 330
			for (String s : split2) {
......
298 333
			if (outSideTextTagsAndKeepContentRegex.trim().length() > 0) {
299 334
				outSideTextTagsAndKeepContentRegex = outSideTextTagsAndKeepContentRegex.substring(1) // remove the first "|"
300 335
			}
301
			
336

  
302 337
			Element e3 = module.getParameters().getNoteElement()
303 338
			def split3  = e3.getTextContent().split(",")
304 339
			for (String s : split3) {
......
327 362
				SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
328 363
				if (wordTag.length() > 0)
329 364
					tokenizer.setWord_tags(wordTag);
365
				tokenizer.setWord_element_to_create(wordTag)
330 366

  
331 367
				if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) {
332 368
					tokenizer.setOutSideTextTags(outSideTextTagsRegex)
333 369
				}
334
				
370

  
335 371
				if (outSideTextTagsAndKeepContentRegex != null && outSideTextTagsAndKeepContentRegex.trim().length() > 0) {
336 372
					tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
337 373
				}
338
				
374

  
339 375
				if (noteRegex != null && noteRegex.trim().length() > 0) {
340 376
					tokenizer.setNote(noteRegex)
341 377
				}
......
347 383
				}
348 384
			}
349 385
		}
350
		
386

  
351 387
		filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles()
352 388
		println ""
353 389

  

Also available in: Unified diff