Révision 3345

TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/importer.groovy (revision 3345)
52 52
import java.util.HashMap;
53 53
import org.txm.scripts.importer.*;
54 54
import org.txm.utils.*;
55
import org.txm.utils.io.*
55 56
import org.txm.objects.Project;
56 57
import org.txm.metadatas.*;
57 58

  
......
75 76
	/** The outdir. */
76 77
	File txmDir;
77 78
	File binDir;
79
	
80
	public boolean doTokenizeStep = true;
78 81

  
79 82
	/** The trsfiles. */
80 83
	ArrayList<String> trsfiles;
......
100 103
		this.metadatas = metadatas;
101 104
		this.lang = lang;
102 105
		this.project = project;
106
		this.doTokenizeStep = project.getDoTokenizerStep()
103 107
	}
104 108

  
105 109
	/**
......
133 137
		cpb.done()
134 138

  
135 139
		if (metadatas != null) {
140
			
136 141
			if (metadatas.getHeadersList().size() > 0) {
137 142

  
138 143
				println "Injecting metadata "+metadatas.getHeadersList()+" in "+trsfiles.size()+" files"
......
165 170
		println "Tokenizing entities "+files.length+" files"
166 171
		cpb = new ConsoleProgressBar(files.length)
167 172
		for (File pfile : files) {
173
			
168 174
			cpb.tick()
169 175
			TokenizeEntities tokenizer = new TokenizeEntities(pfile.toURI().toURL());
170 176
			File outfile = File.createTempFile("tok", ".xml", pfile.getParentFile());
......
176 182
		cpb.done()
177 183

  
178 184
		//TOKENIZE
179
		println "Tokenizing "+files.length+" files from $txmDir"
180 185
		File tokenizedDir = new File(binDir, "tokenized")
181 186
		tokenizedDir.deleteDir() // delete previously output files
182 187
		tokenizedDir.mkdir()
183
		cpb = new ConsoleProgressBar(files.length)
184
		for (File pfile : files) {
185
			cpb.tick()
186
			String filename = pfile.getName().substring(0, pfile.getName().length()-4)
187
			File tfile = new File(tokenizedDir, pfile.getName())
188
			try {
189
				TranscriberTokenizer tokenizer = new TranscriberTokenizer(pfile, tfile, lang)
190
				tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false"))
191
				tokenizer.setDoBuildWordIDs("true" == project.getTokenizerParameter("doBuildWordIds", "true"))
192
				if (!tokenizer.process()) {
193
					println("Failed to tokenize "+pfile)
188
		
189
		if (!doTokenizeStep) {
190
			
191
			println "No tokenization do to."
192
			cpb = new ConsoleProgressBar(files.length)
193
			for (File f : files) {
194
				File outfile = new File(tokenizedDir, f.getName());
195
				FileCopy.copy(f, outfile);
196
				cpb.tick()
197
			}
198
			cpb.done()
199
		} else {
200
			
201
			println "Tokenizing "+files.length+" files from $txmDir"
202
			cpb = new ConsoleProgressBar(files.length)
203
			for (File pfile : files) {
204
				cpb.tick()
205
				String filename = pfile.getName().substring(0, pfile.getName().length()-4)
206
				File tfile = new File(tokenizedDir, pfile.getName())
207
				try {
208
					TranscriberTokenizer tokenizer = new TranscriberTokenizer(pfile, tfile, lang)
209
					tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false"))
210
					tokenizer.setDoBuildWordIDs("true" == project.getTokenizerParameter("doBuildWordIds", "true"))
211
					if (!tokenizer.process()) {
212
						println("Failed to tokenize "+pfile)
213
					}
214
				} catch (Exception e) {
215
					println "Error tokenizer: "+pfile
216
					org.txm.utils.logger.Log.printStackTrace(e);
217
					return false;
194 218
				}
195
			} catch (Exception e) {
196
				println "Error tokenizer: "+pfile
197
				org.txm.utils.logger.Log.printStackTrace(e);
198
				return false;
199 219
			}
220
			cpb.done()
200 221
		}
201
		cpb.done()
202

  
222
		
203 223
		//TRANSFORM INTO XML-TEI-TXM
204 224
		files = tokenizedDir.listFiles()
205 225
		println("Building ${files.length} XML-TXM file"+(files.length > 1?"s":""))
206 226
		cpb = new ConsoleProgressBar(files.length)
207 227
		for (File tfile : files) {
228
			
208 229
			cpb.tick()
209 230
			String filename = tfile.getName().substring(0, tfile.getName().length()-4)
210 231
			File xmlfile = new File(txmDir, tfile.getName())
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/transcriberLoader.groovy (revision 3345)
177 177
		if (MONITOR != null) MONITOR.worked(20)
178 178

  
179 179
		println "-- Xml Validation"
180
		if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
180
		if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
181
		ConsoleProgressBar cpb = new ConsoleProgressBar(txmDir.listFiles().length)
181 182
		for (File infile : txmDir.listFiles()) {
182 183
			if (!ValidateXml.test(infile)) {
183 184
				println "$infile : Validation failed";
184 185
				infile.delete();
185
			}
186
		}
186
			}
187
			cpb.tick()
188
		}
189
		cpb.done()
187 190

  
188 191
		if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
189 192
		if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")
190

  
193

  
191 194
		boolean annotationSuccess = false;
192 195
		if (annotate) {
193 196
			println "-- ANNOTATE - Running NLP tools"
TXM/trunk/org.txm.core/src/java/org/txm/importer/scripts/xmltxm/Xml2Ana.groovy (revision 3345)
167 167
					if (correspType.containsKey(type)) { // check if txm:ana
168 168
						String corresptype = correspType.get(type);
169 169
						String ref = correspRef.get(type);
170
						anabalises.add([
171
							"#"+ref,
172
							"#"+corresptype,
173
							value
174
						]);
170
						anabalises.add(["#"+ref, "#"+corresptype, value]);
175 171
					} else if (type == ID) { // keep id attribute
176 172
						String wordid = value
177 173
						if (wordid.startsWith("w")) {

Formats disponibles : Unified diff