Revision 967 tmp/org.txm.groovy.core/src/groovy/org/txm/importer/transcriber/importer.groovy

importer.groovy (revision 967)
2 2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 3
// Lyon 2, University of Franche-Comté, University of Nice
4 4
// Sophia Antipolis, University of Paris 3.
5
// 
5
//
6 6
// The TXM platform is free software: you can redistribute it
7 7
// and/or modify it under the terms of the GNU General Public
8 8
// License as published by the Free Software Foundation,
9 9
// either version 2 of the License, or (at your option) any
10 10
// later version.
11
// 
11
//
12 12
// The TXM platform is distributed in the hope that it will be
13 13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 15
// PURPOSE. See the GNU General Public License for more
16 16
// details.
17
// 
17
//
18 18
// You should have received a copy of the GNU General
19 19
// Public License along with the TXM platform. If not, see
20 20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
21
//
22
//
23
//
24 24
// $LastChangedDate:$
25 25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
26
// $LastChangedBy:$
27 27
//
28 28

  
29 29
package org.txm.importer.transcriber
......
81 81

  
82 82
	/** The metadatas. */
83 83
	Metadatas metadatas;
84
	
84

  
85 85
	String lang; // language used by the tokenizer
86 86

  
87 87
	/**
......
93 93
	 */
94 94
	public importer(ArrayList<File> trsfiles, File binDir, File txmDir, Metadatas metadatas, lang) {
95 95
		this.trsfiles = trsfiles;
96
		this.txmDir = txmDir; 
96
		this.txmDir = txmDir;
97 97
		this.binDir = binDir;
98 98
		this.metadatas = metadatas;
99 99
		this.lang = lang;
......
113 113
		if (!txmDir.exists()) {
114 114
			println "can't create txmDir: "+txmDir.getAbsolutePath()
115 115
		}
116
		
116

  
117 117
		// TRS -> TEI
118 118
		println "Converting TRS to TEI "+trsfiles.size()+" files"
119 119
		for (File infile : trsfiles) {
......
131 131
		println ""
132 132

  
133 133
		if (metadatas != null) {
134
			if (metadatas.getHeadersList().size() == 0) {
135
				println "Malformed metadata file. Check column and text separator. Columns: "+metadatas.getHeadersList()
136
				return false;
137
			}
138
			println "Injecting metadatas "+metadatas.getHeadersList()+" in "+trsfiles.size()+" files"
139
		}
140
		trsfiles = txmDir.listFiles();
141
		trsfiles.sort()
142
		for (File infile : trsfiles) {
143
			File outfile = new File(txmDir, "tmp.xml")
144
			if (metadatas != null && metadatas.isInitialized()) {
145
				print "."
146
				if (!metadatas.injectMetadatasInXml(infile, outfile, "text")) {
147
					println("Failed to inject metadatas in "+infile)
148
					outfile.delete()
134
			if (metadatas.getHeadersList().size() > 0) {
135

  
136
				println "Injecting metadatas "+metadatas.getHeadersList()+" in "+trsfiles.size()+" files"
137

  
138
				trsfiles = txmDir.listFiles();
139
				trsfiles.sort()
140
				for (File infile : trsfiles) {
141
					File outfile = new File(txmDir, "tmp.xml")
142
					if (metadatas != null && metadatas.isInitialized()) {
143
						print "."
144
						if (!metadatas.injectMetadatasInXml(infile, outfile, "text")) {
145
							println("Failed to inject metadatas in "+infile)
146
							outfile.delete()
147
						}
148
						if (!infile.delete()) {
149
							println "ERROR: could not delete $infile"
150
							return false
151
						}
152
						outfile.renameTo(infile)
153
					}
149 154
				}
150
				if (!infile.delete()) {
151
					println "ERROR: could not delete $infile"
152
					return false
153
				}
154
				outfile.renameTo(infile)
155 155
			}
156 156
		}
157
		
157

  
158 158
		println ""
159
		
159

  
160 160
		// TOKENIZER ENTITIES
161 161
		println "Tokenizing entities "+txmDir.listFiles().length+" files"
162 162
		for (File pfile : txmDir.listFiles()) {
......
165 165
			File outfile = File.createTempFile("tok", ".xml", pfile.getParentFile());
166 166
			if (tokenizer.process(outfile)) {
167 167
				if (!(pfile.delete() && outfile.renameTo(pfile))) println "Warning can't rename file "+outfile+" to "+pfile
168
			} 
168
			}
169 169
			outfile.delete();
170 170
		}
171 171
		println ""
172
		
172

  
173 173
		//TOKENIZE
174 174
		println "Tokenizing "+txmDir.listFiles().length+" files from $txmDir"
175 175
		File tokenizedDir = new File(binDir, "tokenized")
......
190 190
			}
191 191
		}
192 192
		println ""
193
		
193

  
194 194
		//TRANSFORM INTO XML-TEI-TXM
195 195
		println("Building XML-TXM ("+txmDir.listFiles().length+" files)")
196 196
		for (File tfile : tokenizedDir.listFiles()) {
197 197
			print "."
198 198
			String filename = tfile.getName().substring(0, tfile.getName().length()-4)
199 199
			File xmlfile = new File(txmDir, tfile.getName())
200
			
200

  
201 201
			def correspType = new HashMap<String,String>()
202 202
			correspType.put("event","event");
203 203
			correspType.put("audio","audio");
......
227 227
			def resps = new HashMap<String,String[]>();
228 228
			resps.put("trs", ["Transcriber annotations","TXM","",""])
229 229
			String wordprefix = "w_";
230
			
230

  
231 231
			Xml2Ana builder = new Xml2Ana(tfile);
232 232
			builder.setConvertAllAtrtibutes true;
233 233
			builder.setCorrespondances(correspRef, correspType);
......
237 237
				xmlfile.delete();
238 238
			}
239 239
		}
240
		
240

  
241 241
		println ""
242 242
		return txmDir.listFiles() != null;
243 243
	}
......
251 251
	 * @return true, if successful
252 252
	 */
253 253
	public boolean process(File infile, File outfile, ArrayList<Pair<String, String>> metas) {
254
		//inject metadatas into 	
254
		// inject the metadata pairs into the XML nodes matched by the XPath expression
255 255
		this.infile = infile;
256 256
		this.outfile = outfile;
257 257
		def factory = DocumentBuilderFactory.newInstance()
......
272 272
		println ("insert $pairs into $xpath")
273 273
		def expr = XPathFactory.newInstance().newXPath().compile(xpath)
274 274
		def nodes = expr.evaluate(doc, XPathConstants.NODESET)
275
		
275

  
276 276
		for (Node node : nodes) {
277 277
			Element elem = (Element)node;
278 278
			for (Pair<String, String> p : pairs) {
......
290 290
		try {
291 291
			// Create the DOM source
292 292
			Source source = new DOMSource(doc);
293
			
293

  
294 294
			// Create the output file
295
			Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8")); 
295
			Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"));
296 296
			Result resultat = new StreamResult(writer);
297
			
297

  
298 298
			// Configure the transformer
299 299
			TransformerFactory fabrique = new net.sf.saxon.TransformerFactoryImpl();
300 300
			Transformer transformer = fabrique.newTransformer();
301 301
			transformer.setOutputProperty(OutputKeys.METHOD, "xml");
302
			transformer.setOutputProperty(OutputKeys.INDENT, "yes"); 
303
			transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); 
304
			
302
			transformer.setOutputProperty(OutputKeys.INDENT, "yes");
303
			transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
304

  
305 305
			// Transformation
306 306
			transformer.transform(source, resultat);
307 307
			writer.close();

Also available in: Unified diff