Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / importer.groovy @ 2246

History | View | Annotate | Download (8.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$
27
//
28

    
29
package org.txm.scripts.importer.transcriber
30

    
31
import java.util.ArrayList;
32
import org.txm.utils.Pair;
33

    
34
import javax.xml.transform.*;
35
import javax.xml.transform.dom.DOMSource;
36
import javax.xml.transform.stream.StreamResult;
37

    
38
import org.txm.utils.logger.Log;
39
import org.txm.importer.scripts.xmltxm.*;
40

    
41
import java.io.BufferedWriter;
42
import java.io.File;
43
import java.io.FileOutputStream;
44
import java.io.IOException;
45
import java.io.OutputStreamWriter;
46
import java.io.Writer;
47
import org.w3c.dom.*;
48
import org.xml.sax.SAXException;
49
import javax.xml.parsers.*;
50
import javax.xml.xpath.*;
51

    
52
import java.util.HashMap;
53
import org.txm.scripts.importer.*;
54
import org.txm.utils.*;
55
import org.txm.metadatas.*;
56

    
57
// TODO: Auto-generated Javadoc
58
/**
 * First stage of the Transcriber import: turns a list of Transcriber (.trs)
 * files into XML-TXM files stored in {@code txmDir}.
 *
 * The {@link #run()} pipeline is: TRS -> TEI conversion, optional metadata
 * injection, entity tokenization, word tokenization (into
 * {@code binDir/tokenized}) and finally the XML-TXM build.
 *
 * The class can also be used on a single DOM document through
 * {@link #process(File, File, ArrayList)}.
 */
class importer {

        /** Named XPath expressions; "trans" selects the elements that receive metadata attributes in process(). */
        HashMap<String, String> trans = ["trans":"//Trans"];

        /** The DOM document parsed by process() and written by save(). */
        def doc;

        /** The file parsed by process(). */
        File infile;

        /** The file written by save(). */
        File outfile;

        /** The directory receiving the TEI then XML-TXM files. */
        File txmDir;

        /** The parent directory of the "tokenized" working directory. */
        File binDir;

        /** The Transcriber files to import. */
        ArrayList<File> trsfiles;

        /** The metadata to inject in the TEI files, may be null. */
        Metadatas metadatas;

        /** The language used by the tokenizer. */
        String lang;

        /**
         * Instantiates a new importer.
         *
         * @param trsfiles the Transcriber files to import
         * @param binDir the directory in which the "tokenized" directory is created
         * @param txmDir the directory receiving the result files
         * @param metadatas the metadata to inject, or null for none
         * @param lang the language given to the tokenizer
         */
        public importer(ArrayList<File> trsfiles, File binDir, File txmDir, Metadatas metadatas, lang) {
                this.trsfiles = trsfiles;
                this.txmDir = txmDir;
                this.binDir = binDir;
                this.metadatas = metadatas;
                this.lang = lang;
        }

        /**
         * Runs the whole import pipeline on {@code trsfiles}.
         *
         * A file that fails in one stage is reported on the console and the
         * pipeline goes on with the other files; only the failures listed below
         * abort the run.
         *
         * @return false if there is no file list, if {@code txmDir} cannot be
         *         created, if a TEI file cannot be replaced by its metadata-enriched
         *         version or if the tokenizer throws an exception; true otherwise
         */
        public boolean run() {
                if (trsfiles == null) {
                        println "no files to process"
                        return false;
                }
                txmDir.mkdir();
                if (!txmDir.exists()) {
                        println "can't create txmDir: "+txmDir.getAbsolutePath()
                        // nothing can be written: every following stage would fail
                        return false;
                }

                convertTrsToTei()

                if (!injectMetadata()) return false
                println ""

                tokenizeEntities()

                File tokenizedDir = new File(binDir, "tokenized")
                if (!tokenize(tokenizedDir)) return false

                buildXmlTxm(tokenizedDir)

                println ""
                return txmDir.listFiles() != null;
        }

        /**
         * Lists the content of a directory without ever returning null.
         *
         * @param dir the directory to list
         * @return the files of dir, or an empty array when listFiles() returns null
         */
        private static File[] listFilesOrEmpty(File dir) {
                File[] files = dir.listFiles()
                return files != null ? files : new File[0]
        }

        /** Stage 1: converts each TRS file to a TEI file named "textid.xml" in txmDir. */
        private void convertTrsToTei() {
                println "Converting TRS to TEI "+trsfiles.size()+" files"
                for (File trsFile : trsfiles) {
                        print "."
                        // the text id is the file name cut at the first ".trs"
                        String textid = trsFile.getName()
                        int idx = textid.indexOf(".trs")
                        if (idx > 0) textid = textid.substring(0, idx)

                        File teiFile = new File(txmDir, textid+".xml")
                        TRSToTEI converter = new TRSToTEI(trsFile);
                        if (!converter.process(teiFile)) {
                                // best effort: report and go on with the next file
                                println "Error while converting TRS to TEI: "+trsFile
                        }
                }
                println ""
        }

        /**
         * Stage 2: injects the metadata in every file of txmDir.
         *
         * Each file is rewritten to "tmp.xml" which then replaces the original.
         * When the injection fails the original file is kept untouched.
         *
         * @return false if an original file could not be replaced by its
         *         enriched version, true otherwise
         */
        private boolean injectMetadata() {
                if (metadatas == null || metadatas.getHeadersList().size() == 0) return true

                println "Injecting metadata "+metadatas.getHeadersList()+" in "+trsfiles.size()+" files"
                if (!metadatas.isInitialized()) return true

                // local list: the trsfiles field keeps pointing at the TRS inputs
                File[] teiFiles = listFilesOrEmpty(txmDir)
                Arrays.sort(teiFiles)
                for (File teiFile : teiFiles) {
                        File tmpFile = new File(txmDir, "tmp.xml")
                        print "."
                        if (!metadatas.injectMetadatasInXml(teiFile, tmpFile, "text")) {
                                println("Failed to inject metadata in "+teiFile)
                                tmpFile.delete()
                                // keep the original file instead of losing the text
                                continue
                        }
                        if (!teiFile.delete()) {
                                println "ERROR: could not delete $teiFile"
                                tmpFile.delete()
                                return false
                        }
                        if (!tmpFile.renameTo(teiFile)) {
                                println "ERROR: could not rename $tmpFile to $teiFile"
                                return false
                        }
                }
                return true
        }

        /** Stage 3: runs TokenizeEntities on every file of txmDir, replacing the file on success. */
        private void tokenizeEntities() {
                File[] teiFiles = listFilesOrEmpty(txmDir)
                println "Tokenizing entities "+teiFiles.length+" files"
                for (File teiFile : teiFiles) {
                        print "."
                        TokenizeEntities tokenizer = new TokenizeEntities(teiFile.toURI().toURL());
                        File tmpFile = File.createTempFile("tok", ".xml", teiFile.getParentFile());
                        if (tokenizer.process(tmpFile)) {
                                if (!(teiFile.delete() && tmpFile.renameTo(teiFile))) println "Warning can't rename file "+tmpFile+" to "+teiFile
                        }
                        // no-op after a successful rename, cleanup otherwise
                        tmpFile.delete();
                }
                println ""
        }

        /**
         * Stage 4: tokenizes every file of txmDir into tokenizedDir, keeping the file names.
         *
         * @param tokenizedDir the directory receiving the tokenized files, created if needed
         * @return false if the tokenizer threw an exception, true otherwise
         */
        private boolean tokenize(File tokenizedDir) {
                File[] teiFiles = listFilesOrEmpty(txmDir)
                println "Tokenizing "+teiFiles.length+" files from $txmDir"
                tokenizedDir.mkdir()
                for (File teiFile : teiFiles) {
                        print "."
                        File tokenizedFile = new File(tokenizedDir, teiFile.getName())
                        try {
                                TranscriberTokenizer tokenizer = new TranscriberTokenizer(teiFile, tokenizedFile, lang)
                                if (!tokenizer.process()) {
                                        println("Failed to tokenize "+teiFile)
                                }
                        } catch (Exception e) {
                                println "Error tokenizer: "+teiFile
                                org.txm.utils.logger.Log.printStackTrace(e);
                                return false;
                        }
                }
                println ""
                return true
        }

        /**
         * Stage 5: builds, for every file of tokenizedDir, the XML-TXM file of
         * the same name in txmDir. The result file is deleted when the build fails.
         *
         * @param tokenizedDir the directory containing the tokenized files
         */
        private void buildXmlTxm(File tokenizedDir) {
                File[] tokenizedFiles = listFilesOrEmpty(tokenizedDir)
                println("Building XML-TXM ("+tokenizedFiles.length+" files)")
                for (File tokenizedFile : tokenizedFiles) {
                        print "."
                        File xmlfile = new File(txmDir, tokenizedFile.getName())

                        // header data is rebuilt for each file: the builder may keep the references
                        def correspType = new HashMap<String,String>()
                        correspType.put("event","event");
                        correspType.put("audio","audio");
                        correspType.put("notation","notation");

                        def correspRef = new HashMap<String,String>();
                        correspRef.put("event","trs");
                        correspRef.put("audio","trs");
                        correspRef.put("notation","trs");

                        def respId = ["trs"];

                        def applications = new HashMap<String,ArrayList<String>>();
                        applications.put("trs",new ArrayList<String>());
                        applications.get("trs").add("Transcriber");//app ident
                        applications.get("trs").add("");//app version
                        applications.get("trs").add(null);//app report file path

                        def taxonomiesUtilisees = new HashMap<String,String[]>();
                        taxonomiesUtilisees.put("ctx1",["event", "audio", "notation"]);//,"lemma","lasla","grace"]);

                        def itemsURI = new HashMap<String,HashMap<String,String>>();
                        itemsURI.put("event",new HashMap<String,String>());
                        itemsURI.get("event").put("tagset","orth|corr");
                        itemsURI.get("event").put("website","");
                        itemsURI.put("audio",new HashMap<String,String>());
                        itemsURI.get("audio").put("tagset","present|absent|partiel");
                        itemsURI.get("audio").put("website","");
                        itemsURI.put("notation",new HashMap<String,String>());
                        itemsURI.get("notation").put("tagset","");
                        itemsURI.get("notation").put("website","");

                        def resps = new HashMap<String,String[]>();
                        resps.put("trs", ["Transcriber annotations","TXM","",""])

                        Xml2Ana builder = new Xml2Ana(tokenizedFile);
                        builder.setConvertAllAtrtibutes(true);
                        builder.setCorrespondances(correspRef, correspType);
                        builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
                        if (!builder.process(xmlfile)) {
                                println("Failed to process "+tokenizedFile)
                                xmlfile.delete();
                        }
                }
        }

        /**
         * Parses infile, sets the given name/value pairs as attributes of the
         * elements selected by the "trans" XPath, then writes the document to outfile.
         *
         * Parsing errors are not caught here and reach the caller.
         *
         * @param infile the XML file to read
         * @param outfile the XML file to write
         * @param metas the attribute name/value pairs to set
         * @return true if the document was written, false if saving failed
         */
        public boolean process(File infile, File outfile, ArrayList<Pair<String, String>> metas) {
                this.infile = infile;
                this.outfile = outfile;

                def factory = DocumentBuilderFactory.newInstance()
                factory.setXIncludeAware(true);
                doc = factory.newDocumentBuilder().parse(infile)

                insert(trans.get("trans"), metas);
                return save();
        }

        /**
         * Sets every pair as an attribute (name = first, value = second) of each
         * node of the current document selected by the XPath expression.
         *
         * @param xpath the XPath expression, expected to select elements only
         * @param pairs the attribute name/value pairs to set
         */
        public void insert(String xpath, List<Pair<String, String>> pairs) {
                println ("insert $pairs into $xpath")
                def expr = XPathFactory.newInstance().newXPath().compile(xpath)
                def nodes = expr.evaluate(doc, XPathConstants.NODESET)

                for (Node node : nodes) {
                        Element elem = (Element)node;
                        for (Pair<String, String> p : pairs) {
                                elem.setAttribute(p.getFirst(), p.getSecond());
                        }
                }
        }

        /**
         * Serializes the current document to outfile as indented UTF-8 XML.
         *
         * @return true if the file was written, false if any exception occurred
         *         (the exception is logged)
         */
        private boolean save() {
                Writer writer = null;
                try {
                        // DOM source
                        Source source = new DOMSource(doc);

                        // output file
                        writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"));
                        Result result = new StreamResult(writer);

                        // transformer configuration
                        TransformerFactory factory = new net.sf.saxon.TransformerFactoryImpl();
                        Transformer transformer = factory.newTransformer();
                        transformer.setOutputProperty(OutputKeys.METHOD, "xml");
                        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
                        transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");

                        // transformation
                        transformer.transform(source, result);
                        // closed here so that a flush failure is still reported as a failed save
                        writer.close();
                        return true;
                } catch (Exception e) {
                        org.txm.utils.logger.Log.printStackTrace(e);
                        return false;
                } finally {
                        // releases the file handle when the transformation failed;
                        // closing an already closed writer has no effect
                        if (writer != null) {
                                try {
                                        writer.close();
                                } catch (IOException ignored) {
                                        // the failure, if any, was already reported above
                                }
                        }
                }
        }
}