Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / importer.groovy @ 2554

History | View | Annotate | Download (9.2 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$
27
//
28

    
29
package org.txm.scripts.importer.transcriber
30

    
31
import java.util.ArrayList;
32
import org.txm.utils.Pair;
33

    
34
import javax.xml.transform.*;
35
import javax.xml.transform.dom.DOMSource;
36
import javax.xml.transform.stream.StreamResult;
37

    
38
import org.txm.utils.logger.Log;
39
import org.txm.importer.scripts.xmltxm.*;
40

    
41
import java.io.BufferedWriter;
42
import java.io.File;
43
import java.io.FileOutputStream;
44
import java.io.IOException;
45
import java.io.OutputStreamWriter;
46
import java.io.Writer;
47
import org.w3c.dom.*;
48
import org.xml.sax.SAXException;
49
import javax.xml.parsers.*;
50
import javax.xml.xpath.*;
51

    
52
import java.util.HashMap;
53
import org.txm.scripts.importer.*;
54
import org.txm.utils.*;
55
import org.txm.metadatas.*;
56

    
57
/**
58
 * The Class importer.
59
 */
60
class importer {
    // Drives the Transcriber import pipeline implemented by run():
    //   1. TRS -> TEI conversion (TRSToTEI), written to txmDir
    //   2. optional metadata injection (Metadatas)
    //   3. entity tokenization (TokenizeEntities), in place in txmDir
    //   4. word tokenization (TranscriberTokenizer), written to binDir/tokenized
    //   5. XML-TXM construction (Xml2Ana), written back to txmDir
    // process()/insert()/save() form a separate, DOM based utility that sets
    // attributes on the elements selected by an XPath expression.

    /** Named XPath expressions; "trans" is the one process() hands to insert(). */
    HashMap<String, String> trans = ["trans":"//Trans"];

    /** DOM document parsed by process(), modified by insert() and written by save(). */
    def doc;

    /** Input file of the last process() call. */
    File infile;

    /** Output file of the last process() call; save() writes to it. */
    File outfile;

    /** Directory receiving the converted XML files (one "textid.xml" per input file). */
    File txmDir;

    /** Directory under which run() (re)creates the "tokenized" sub-directory. */
    File binDir;

    /** The Transcriber files to import (File objects, as passed to the constructor). */
    ArrayList<File> trsfiles;

    /** Metadata source; may be null, in which case run() skips the injection step. */
    Metadatas metadatas;

    /** Language handed to the tokenizer. */
    String lang;

    /**
     * Instantiates a new importer.
     *
     * @param trsfiles the Transcriber files to import; run() returns false when null
     * @param binDir the directory in which the "tokenized" sub-directory is created
     * @param txmDir the directory receiving the converted XML files
     * @param metadatas the metadata to inject, or null for none
     * @param lang the language handed to the tokenizer
     */
    public importer(ArrayList<File> trsfiles, File binDir, File txmDir, Metadatas metadatas, lang) {
        this.trsfiles = trsfiles;
        this.txmDir = txmDir;
        this.binDir = binDir;
        this.metadatas = metadatas;
        this.lang = lang;
    }

    /**
     * Runs the whole import pipeline on the files given to the constructor.
     *
     * A file that fails a conversion step is reported on the console and
     * skipped; the run is only aborted (false) when a working directory
     * cannot be created or listed, when a file cannot be replaced during
     * metadata injection, or when the tokenizer throws.
     *
     * @return true, if successful
     */
    public boolean run() {
        if (trsfiles == null) {
            println "no files to process"
            return false;
        }
        // mkdirs() also creates missing parent directories
        txmDir.mkdirs();
        if (!txmDir.isDirectory()) {
            println "can't create txmDir: "+txmDir.getAbsolutePath()
            // every later step reads or writes txmDir: going on would only
            // end in a NullPointerException on txmDir.listFiles()
            return false;
        }

        convertTrsToTei()

        if (!injectMetadata()) {
            return false
        }

        File[] files = txmDir.listFiles()
        if (files == null) {
            println "can't list txmDir: "+txmDir.getAbsolutePath()
            return false
        }
        tokenizeEntities(files)

        File tokenizedDir = new File(binDir, "tokenized")
        if (!tokenize(files, tokenizedDir)) {
            return false
        }

        if (!buildXmlTxm(tokenizedDir)) {
            return false
        }

        return txmDir.listFiles() != null;
    }

    /**
     * Step 1: converts each TRS file to "textid.xml" in txmDir, textid being
     * the file name cut at its first ".trs". Failures are reported and the
     * conversion goes on with the next file (best effort).
     */
    private void convertTrsToTei() {
        println "Converting TRS to TEI "+trsfiles.size()+" files"
        ConsoleProgressBar cpb = new ConsoleProgressBar(trsfiles.size())
        for (File trsFile : trsfiles) {
            cpb.tick()
            String textid = trsFile.getName()
            int idx = textid.indexOf(".trs")
            if (idx > 0) textid = textid.substring(0, idx)
            File teiFile = new File(txmDir, textid+".xml")
            TRSToTEI converter = new TRSToTEI(trsFile);
            if (!converter.process(teiFile)) {
                println "Error while converting TRS to TEI: "+trsFile
            }
        }
        cpb.done()
    }

    /**
     * Step 2: injects the metadata into every file of txmDir, going through
     * the temporary file "tmp.xml". Does nothing when there is no metadata
     * or no metadata header.
     *
     * @return false when a file could not be replaced by its injected version
     */
    private boolean injectMetadata() {
        if (metadatas == null || metadatas.getHeadersList().size() <= 0) {
            return true
        }

        println "Injecting metadata "+metadatas.getHeadersList()+" in "+trsfiles.size()+" files"

        File[] listed = txmDir.listFiles()
        if (listed == null) {
            println "can't list txmDir: "+txmDir.getAbsolutePath()
            return false
        }
        // Work on a local sorted copy: the trsfiles field keeps describing
        // the input files, so that run() can be called again.
        List<File> teiFiles = new ArrayList<File>(Arrays.asList(listed))
        Collections.sort(teiFiles)

        ConsoleProgressBar cpb = new ConsoleProgressBar(teiFiles.size())
        for (File teiFile : teiFiles) {
            if (!metadatas.isInitialized()) {
                continue
            }
            cpb.tick()
            File tmpFile = new File(txmDir, "tmp.xml")
            if (!metadatas.injectMetadatasInXml(teiFile, tmpFile, "text")) {
                println("Failed to inject metadata in "+teiFile)
                tmpFile.delete()
                // keep the file without metadata: deleting it here would
                // lose the text, since there is nothing to rename onto it
                continue
            }
            if (!teiFile.delete()) {
                println "ERROR: could not delete $teiFile"
                return false
            }
            if (!tmpFile.renameTo(teiFile)) {
                println "ERROR: could not rename $tmpFile to $teiFile"
                return false
            }
        }
        cpb.done()
        return true
    }

    /**
     * Step 3: runs TokenizeEntities on each file and, on success, replaces
     * the file with the result. The result is first written to a temporary
     * "tok*.xml" file created next to the processed file.
     *
     * @param files the files to process
     */
    private void tokenizeEntities(File[] files) {
        println "Tokenizing entities "+files.length+" files"
        ConsoleProgressBar cpb = new ConsoleProgressBar(files.length)
        for (File pfile : files) {
            cpb.tick()
            TokenizeEntities tokenizer = new TokenizeEntities(pfile.toURI().toURL());
            File tmpFile = File.createTempFile("tok", ".xml", pfile.getParentFile());
            try {
                if (tokenizer.process(tmpFile)) {
                    if (!(pfile.delete() && tmpFile.renameTo(pfile))) println "Warning can't rename file "+tmpFile+" to "+pfile
                }
            } finally {
                // no-op after a successful rename; removes the temporary
                // file in every other case, including when process() throws
                tmpFile.delete();
            }
        }
        cpb.done()
    }

    /**
     * Step 4: tokenizes each file into tokenizedDir (same file name), after
     * having emptied that directory.
     *
     * @param files the files to tokenize
     * @param tokenizedDir the output directory, deleted and recreated
     * @return false when the directory can't be created or the tokenizer throws
     */
    private boolean tokenize(File[] files, File tokenizedDir) {
        println "Tokenizing "+files.length+" files from $txmDir"
        tokenizedDir.deleteDir() // delete the files produced by a previous run
        tokenizedDir.mkdirs()
        if (!tokenizedDir.isDirectory()) {
            println "can't create tokenized directory: "+tokenizedDir.getAbsolutePath()
            return false
        }
        ConsoleProgressBar cpb = new ConsoleProgressBar(files.length)
        for (File pfile : files) {
            cpb.tick()
            File tfile = new File(tokenizedDir, pfile.getName())
            try {
                TranscriberTokenizer tokenizer = new TranscriberTokenizer(pfile, tfile, lang)
                if (!tokenizer.process()) {
                    println("Failed to tokenize "+pfile)
                }
            } catch (Exception e) {
                println "Error tokenizer: "+pfile
                org.txm.utils.logger.Log.printStackTrace(e);
                return false;
            }
        }
        cpb.done()
        return true
    }

    /**
     * Step 5: builds the XML-TXM version of each tokenized file with Xml2Ana
     * and writes it to txmDir under the same file name. The output of a file
     * that fails is deleted.
     *
     * @param tokenizedDir the directory holding the tokenized files
     * @return false when tokenizedDir can't be listed
     */
    private boolean buildXmlTxm(File tokenizedDir) {
        File[] files = tokenizedDir.listFiles()
        if (files == null) {
            println "can't list tokenized directory: "+tokenizedDir.getAbsolutePath()
            return false
        }
        println("Building XML-TXM ("+files.length+" files)")
        ConsoleProgressBar cpb = new ConsoleProgressBar(files.length)
        for (File tfile : files) {
            cpb.tick()
            File xmlfile = new File(txmDir, tfile.getName())

            Xml2Ana builder = new Xml2Ana(tfile);
            // NOTE: "Atrtibutes" is the spelling used by the Xml2Ana API
            builder.setConvertAllAtrtibutes(true);
            configureHeader(builder)
            if (!builder.process(xmlfile)) {
                println("Failed to process "+tfile)
                xmlfile.delete();
            }
        }
        cpb.done()
        return true
    }

    /**
     * Gives the builder its correspondances and header informations: three
     * annotation types ("event", "audio", "notation"), all referring to the
     * "trs" responsibility/application ("Transcriber"). Fresh collections
     * are created for each builder.
     *
     * @param builder the builder to configure
     */
    private void configureHeader(Xml2Ana builder) {
        def correspType = new HashMap<String,String>()
        correspType.put("event","event");
        correspType.put("audio","audio");
        correspType.put("notation","notation");

        def correspRef = new HashMap<String,String>();
        correspRef.put("event","trs");
        correspRef.put("audio","trs");
        correspRef.put("notation","trs");

        def respId = ["trs"];

        def applications = new HashMap<String,ArrayList<String>>();
        applications.put("trs",new ArrayList<String>());
        applications.get("trs").add("Transcriber");//app ident
        applications.get("trs").add("");//app version
        applications.get("trs").add(null);//app report file path

        // the values of the two maps below are Groovy lists
        def taxonomiesUtilisees = new HashMap<String,List<String>>();
        taxonomiesUtilisees.put("ctx1",["event", "audio", "notation"]);

        def itemsURI = new HashMap<String,HashMap<String,String>>();
        itemsURI.put("event",new HashMap<String,String>());
        itemsURI.get("event").put("tagset","orth|corr");
        itemsURI.get("event").put("website","");
        itemsURI.put("audio",new HashMap<String,String>());
        itemsURI.get("audio").put("tagset","present|absent|partiel");
        itemsURI.get("audio").put("website","");
        itemsURI.put("notation",new HashMap<String,String>());
        itemsURI.get("notation").put("tagset","");
        itemsURI.get("notation").put("website","");

        def resps = new HashMap<String,List<String>>();
        resps.put("trs", ["Transcriber annotations","TXM","",""])

        builder.setCorrespondances(correspRef, correspType);
        builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
    }

    /**
     * Parses infile, sets the given attributes on every element selected by
     * the "trans" XPath expression ("//Trans" by default) and writes the
     * result to outfile.
     *
     * Parsing errors are not caught: they are thrown to the caller. Only the
     * errors of the final write are turned into a false result.
     *
     * @param infile the XML file to read
     * @param outfile the file to write
     * @param metas the attributes to set, as (name, value) pairs
     * @return true, if the result has been written
     */
    public boolean process(File infile, File outfile, ArrayList<Pair<String, String>> metas) {
        this.infile = infile;
        this.outfile = outfile;
        def factory = DocumentBuilderFactory.newInstance()
        factory.setXIncludeAware(true);
        def builder = factory.newDocumentBuilder()
        doc = builder.parse(infile)
        insert(trans.get("trans"), metas);
        return save();
    }

    /**
     * Sets each pair as an attribute (first = name, second = value) on every
     * element of the current document selected by the XPath expression.
     * Selected nodes that are not elements can't carry attributes and are
     * skipped.
     *
     * @param xpath the XPath expression selecting the elements to update
     * @param pairs the attributes to set
     */
    public void insert(String xpath, List<Pair<String, String>> pairs) {
        println ("insert $pairs into $xpath")
        def expr = XPathFactory.newInstance().newXPath().compile(xpath)
        NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET)

        for (int i = 0; i < nodes.getLength(); i++) {
            def node = nodes.item(i)
            if (!(node instanceof Element)) {
                continue
            }
            Element elem = (Element) node;
            for (Pair<String, String> p : pairs) {
                elem.setAttribute(p.getFirst(), p.getSecond());
            }
        }
    }

    /**
     * Writes the current document to outfile as indented UTF-8 XML. Any
     * error is logged and reported through the result.
     *
     * @return true, if successful
     */
    private boolean save() {
        Writer writer = null;
        try {
            // DOM source
            Source source = new DOMSource(doc);

            // output file
            writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"));
            Result resultat = new StreamResult(writer);

            // transformer configuration
            TransformerFactory fabrique = new net.sf.saxon.TransformerFactoryImpl();
            Transformer transformer = fabrique.newTransformer();
            transformer.setOutputProperty(OutputKeys.METHOD, "xml");
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");

            // transformation
            transformer.transform(source, resultat);
            // closed here so that a failing flush is reported as an error
            writer.close();
            return true;
        } catch (Exception e) {
            org.txm.utils.logger.Log.printStackTrace(e);
            return false;
        } finally {
            // releases the file when the transformation failed; closing an
            // already closed writer has no effect
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException ignored) {
                    // nothing more can be done for this file
                }
            }
        }
    }
}