Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / teitxm / Annotate.groovy @ 479

History | View | Annotate | Download (16.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-10-03 15:30:36 +0200 (lun. 03 oct. 2016) $
25
// $LastChangedRevision: 3313 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.teitxm
29

    
30
import java.io.File
31
import java.text.DateFormat
32
import java.util.Date
33
import java.util.concurrent.*
34

    
35
import org.txm.Toolbox
36
import org.txm.importer.*
37
import org.txm.importer.cwb.*
38
import org.txm.objects.*
39
import org.txm.stat.utils.ConsoleProgressBar;
40
import org.txm.utils.LangDetector;
41
import org.txm.utils.logger.Log;
42
import org.txm.utils.treetagger.TreeTagger
43

    
44
/**
 * Annotates the TEI-TXM files of a directory with TreeTagger and writes the
 * annotations back into those files.
 *
 * Working directories are (re)created under the binary directory given to the
 * run methods: "annotations" (stand-off files), "treetagger" (TreeTagger
 * result files) and, for the parallel run only, "ptreetagger" (TreeTagger
 * source files).
 *
 * NOTE(review): cancelNow is written by setCancelNow() and read by the worker
 * threads of the parallel run without any synchronization; consider declaring
 * it volatile.
 */
class Annotate
{
	/** Suffix appended to a TEI-TXM file name to name its TreeTagger source file (sequential run). */
	private static final String TT_SRC_SUFFIX = ".tt"

	/** Suffix appended to a TreeTagger source file name to name its result file (sequential run). */
	private static final String TT_OUT_SUFFIX = "-out.tt"

	/** Suffix appended to a TreeTagger result file name to name its stand-off file (sequential run). */
	private static final String STDOFF_SUFFIX = "-STOFF.xml"

	/** Set by setCancelNow(); the run methods stop between two steps (or before a file) when true. */
	boolean cancelNow = false;

	/** The report file; contains the txm:application tag content. */
	File reportFile;

	/** The responsible person (the "user.name" system property). */
	String respPerson;

	/** The responsibility id. */
	String respId;

	/** The responsibility description. */
	String respDesc;

	/** The responsibility date (short UK format). */
	String respDate;

	/** The responsibility date (full UK format). */
	String respWhen;

	/** The application identifier. */
	String appIdent;

	/** The application version. */
	String appVersion;

	/** The distributor. */
	String distributor;

	/** The publication statement. */
	String publiStmt;

	/** The source statement. */
	String sourceStmt;

	/** The names of the two word properties to create (pos, lemma). */
	def types;

	/** The titles of the two word properties. */
	def typesTITLE;

	/** The descriptions of the two word properties. */
	def typesDesc;

	/** The tagset references of the two word properties. */
	def typesTAGSET;

	/** The web sites of the two word properties. */
	def typesWEB;

	/** The id of the form the annotations point to. */
	String idform;

	/** When true, extra traces are printed and TreeTagger is run in debug mode. */
	boolean debug = false;

	/** Directory the TreeTagger model files (*.par) are read from. */
	File modelsDirectory;

	/** Prefix of the word property names, derived from the model file name. */
	String id;

	/**
	 * Creates an annotator whose models directory is the one set in the Toolbox preferences.
	 */
	public Annotate() {
		modelsDirectory = new File(Toolbox.getParam(Toolbox.TREETAGGER_MODELS_PATH));
	}

	/**
	 * Enables the debug traces.
	 */
	public void setDebug() {
		debug = true;
	}

	/**
	 * Initializes the stand-off file informations, naming the word properties after the model file name.
	 *
	 * @param rootDirFile the directory where the report file "NLPToolsParameters.xml" is located
	 * @param modelfile the TreeTagger model file
	 * @param modelfilename the model file name, used to build the word property names
	 */
	public void initTTOutfileInfos(File rootDirFile, File modelfile, String modelfilename)
	{
		initTTOutfileInfos(rootDirFile, modelfile, modelfilename, null);
	}

	/**
	 * Initializes the stand-off file informations.
	 *
	 * @param rootDirFile the directory where the report file "NLPToolsParameters.xml" is located
	 * @param modelfile the TreeTagger model file
	 * @param modelfilename the model file name; its part before the first dot prefixes the default property names
	 * @param properties 2 element array that contains the word properties to create.
	 *        If null or not of length 2, the names are built from the model file name
	 */
	public void initTTOutfileInfos(File rootDirFile, File modelfile, String modelfilename, String[] properties)
	{
		// keep only what precedes the first dot: "fr.par" -> "fr"
		id = modelfilename;
		String[] split = id.split("\\.");
		if (split.length > 0) id = split[0];
		// "??" (language to be guessed) is replaced by "xx" in the property names
		if (id.equals("??")) id = "xx"

		reportFile = new File(rootDirFile, "NLPToolsParameters.xml");

		respPerson = System.getProperty("user.name");
		respId = "txm";
		respDesc = "NLP annotation tool";
		Date now = new Date();
		respDate = DateFormat.getDateInstance(DateFormat.SHORT, Locale.UK).format(now);
		respWhen = DateFormat.getDateInstance(DateFormat.FULL, Locale.UK).format(now);

		appIdent = "TreeTagger";
		appVersion = "3.2";

		distributor = "";
		publiStmt = "";
		sourceStmt = "";

		if (properties != null && properties.length == 2) {
			types = [properties[0], properties[1]];
			typesTITLE = [properties[0], properties[1]];
		} else {
			types = [id+"pos", id+"lemma"];
			typesTITLE = [id+"pos", id+"lemma"];
		}

		//TODO: the tagset, website and description should be referenced in the model catalog
		if ("rgaqcj.par".equals(modelfile.getName())) {
			typesDesc = ["CATTEX pos tagset built with BFM texts", "fr lemma of the model "+modelfile+" - "]
			typesTAGSET = ["http://bfm.ens-lyon.fr/IMG/pdf/Cattex2009_Manuel.pdf", ""]
			typesWEB = ["http://bfm.ens-lyon.fr/", ""]
		} else {
			typesDesc = ["pos tagset built from model "+modelfile, id+" lemma of the model "+modelfile+" - "]
			typesTAGSET = ["", ""]
			typesWEB = ["", ""]
		}

		idform = "w";
	}

	/**
	 * Applies TreeTagger with the options set in the Toolbox preferences.
	 *
	 * @param ttsrcfile the TreeTagger source file; deleted once tagged
	 * @param ttoutfile the TreeTagger result file
	 * @param modelfile the TreeTagger model file
	 * @return true, if successful
	 */
	public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile)
	{
		return applyTT(ttsrcfile, ttoutfile, modelfile, null)
	}

	/**
	 * Applies TreeTagger.
	 *
	 * @param ttsrcfile the TreeTagger source file; deleted once tagged
	 * @param ttoutfile the TreeTagger result file
	 * @param modelfile the TreeTagger model file
	 * @param options the TreeTagger options; if null the value set in the Toolbox preferences is used
	 * @return true, if successful; false if an exception was raised
	 */
	public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile, String[] options)
	{
		try {
			// TODO: why org.txm.utils.treetagger.TreeTagger tt = ..., throw IllegalAccessError ???
			String opt = Toolbox.getParam(Toolbox.TREETAGGER_OPTIONS);
			if (options == null) {
				if (opt == null) opt = "";
				options = opt.split("  ");
			}

			def tt = new TreeTagger(Toolbox.getParam(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/", options);
			tt.settoken();
			tt.setlemma();
			tt.setquiet();
			tt.setsgml();
			tt.setnounknown();
			tt.seteostag("<s>");
			tt.debug(debug);
			tt.treetagger(modelfile.getAbsolutePath(), ttsrcfile.getAbsolutePath(), ttoutfile.getAbsolutePath())
			ttsrcfile.delete();
		} catch(Exception e) {
			Log.printStackTrace(e);
			// the message used to reference an undefined "$f", which raised a
			// MissingPropertyException from inside this handler
			System.out.println("Failed to apply TreeTagger on $ttsrcfile input file with the $modelfile model file.");
			return false;
		}
		return true;
	}

	/**
	 * Writes the stand-off file of a TreeTagger result file, using the
	 * informations set by initTTOutfileInfos.
	 *
	 * @param ttoutfile the TreeTagger result file
	 * @param posfile the stand-off file to write
	 * @return true, if successful
	 */
	public boolean writeStandoffFile(File ttoutfile, File posfile)
	{
		def encoding = "UTF-8";
		def transfo = new CSV2W_ANA();
		transfo.setAnnotationTypes(types, typesDesc, typesTAGSET, typesWEB, idform);
		transfo.setResp(respId, respDesc, respDate, respPerson, respWhen);
		transfo.setApp(appIdent, appVersion);
		transfo.setTarget(ttoutfile.getAbsolutePath(), reportFile);
		transfo.setInfos(distributor, publiStmt, sourceStmt);
		return transfo.process(ttoutfile, posfile, encoding);
	}

	/**
	 * Runs step by step: builds the TreeTagger source files, runs TreeTagger,
	 * builds the XML stand-off files, then injects the stand-off annotations
	 * into the TEI-TXM files.
	 *
	 * @param binDir the directory where the "annotations" and "treetagger" working directories are (re)created
	 * @param txmDir the directory of the TEI-TXM files to annotate
	 * @param modelfilename the model file name; if it starts with "??" the language of each file is guessed
	 * @return true, if successful; false on error or cancellation
	 */
	public boolean run(File binDir, File txmDir, String modelfilename)
	{
		if (debug) {
			println "rootDirFile "+binDir
			println "txmDir "+txmDir
			println "TREETAGGER INSTALL PATH : "+Toolbox.getParam(Toolbox.TREETAGGER_INSTALL_PATH)
			println "TREETAGGER MODELS PATH : "+Toolbox.getParam(Toolbox.TREETAGGER_MODELS_PATH)
		}

		//test if the Toolbox know TreeTagger
		if (!new File(Toolbox.getParam(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/").exists()) {
			println("Could not find TreeTagger binaries in "+Toolbox.getParam(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/")
			return false;
		}

		// language used for the files too small to be guessed individually
		String langAll = null
		if (modelfilename.startsWith("??")) {
			langAll = new LangDetector(binDir).getLang();
			println "General lang $langAll"
		}

		//cleaning
		File annotDir = new File(binDir, "annotations")
		annotDir.deleteDir();
		annotDir.mkdir();
		File ttDir = new File(binDir, "treetagger")
		ttDir.deleteDir();
		ttDir.mkdir();

		//BUILD TT FILE READY TO BE TAGGED
		File[] txmFiles = txmDir.listFiles()
		if (txmFiles == null) { // listFiles() returns null when txmDir is not a readable directory
			println "No file to annotate in "+txmDir.getAbsolutePath()
			return false;
		}

		println("Building TT source files ("+txmFiles.length+") from directory "+txmDir)
		ConsoleProgressBar cpb = new ConsoleProgressBar(txmFiles.length)
		for (File srcfile : txmFiles) {
			cpb.tick()
			File resultfile = new File(ttDir, srcfile.getName()+TT_SRC_SUFFIX);
			if (debug)
				println "build tt src : "+srcfile+" >> "+resultfile
			def ttsrcbuilder = new BuildTTSrc(srcfile.toURI().toURL())
			if (!ttsrcbuilder.process(resultfile, null))
				System.out.println("Failed to build tt src file of "+srcfile);
		}

		if (cancelNow) return false;

		// check the directory the model files are really read from
		if (!modelsDirectory.exists()) {
			println "Skipping ANNOTATE: TreeTagger language model file directory not found: "+modelsDirectory.getAbsolutePath();
			return false;
		} else if (!modelsDirectory.canRead()) {
			println "Skipping ANNOTATE: impossible to access the TreeTagger language model file directory: "+modelsDirectory.getAbsolutePath();
			return false;
		}
		println("")

		//APPLY TREETAGGER
		File[] ttSrcFiles = ttDir.listFiles()
		if (ttSrcFiles == null)
			return false;
		println("Applying $modelfilename TreeTagger model on dir: "+ttDir+ " ("+ttSrcFiles.length+" files)")
		if (ttSrcFiles.length == 0)
			return false;

		File modelfile = null;
		cpb = new ConsoleProgressBar(ttSrcFiles.length)
		for (File infile : ttSrcFiles) {
			String tmpModelFileName = modelfilename
			if (modelfilename.startsWith("??")) {
				String lang = langAll;
				if (infile.length() > LangDetector.MINIMALSIZE) {
					lang = new LangDetector(infile).getLang();
				}
				tmpModelFileName = lang+".par"
			}
			modelfile = new File(modelsDirectory, tmpModelFileName);
			if (debug)
				println "model file : "+modelfile;

			if (!modelfile.exists()) {
				println "Skipping ANNOTATE: '$modelfile' TreeTagger language model file not found."
				if (System.getProperty("os.name").startsWith("Windows") || System.getProperty("os.name").startsWith("Mac"))
					println "Windows&Mac users: the operating system might be hiding file extensions. Use your file explorer to check the file name."
				return false;
			} else if (!modelfile.canRead()) {
				println "Skipping ANNOTATE: impossible to access the '$modelfile' TreeTagger language model file."
				return false;
			}

			cpb.tick()
			File outfile = new File(infile.getParent(), infile.getName()+TT_OUT_SUFFIX);
			if (!applyTT(infile, outfile, modelfile)) {
				return false;
			}
		}
		println("")

		if (cancelNow) return false;

		//BUILD STAND-OFF FILES
		if (modelfile == null) {
			println "no model applied"
			return false;
		}

		initTTOutfileInfos(binDir, modelfile, modelfilename);
		File[] ttResultFiles = ttDir.listFiles() // now contains the result files of TT
		if (ttResultFiles == null)
			return false;
		println "Building stdoff files ("+ttResultFiles.length+") from dir:"+ttDir+" to "+annotDir;
		if (ttResultFiles.length == 0)
			return false;
		cpb = new ConsoleProgressBar(ttResultFiles.length)
		for (File ttrezfile : ttResultFiles) {
			cpb.tick()
			File annotfile = new File(annotDir, ttrezfile.getName()+STDOFF_SUFFIX);
			if (!writeStandoffFile(ttrezfile, annotfile))
				println("Failed to build standoff file of "+ttrezfile);
		}
		println("")

		if (cancelNow) return false;

		//INJECT ANNOTATIONS
		List<File> interpfiles = annotDir.listFiles();
		if (interpfiles == null)
			return false;
		interpfiles.sort(); // deterministic processing order
		println "Injecting stdoff files ("+interpfiles.size()+") data from "+annotDir+ " to xml-txm files of "+txmDir;
		if (interpfiles.size() == 0)
			return false;

		// A stand-off file is named after its TEI-TXM file plus the suffixes added
		// by the previous steps. Pairing by name (rather than by position in two
		// sorted lists) stays correct when a file failed in a previous step.
		String annotSuffix = TT_SRC_SUFFIX + TT_OUT_SUFFIX + STDOFF_SUFFIX
		cpb = new ConsoleProgressBar(interpfiles.size())
		for (File pos1file : interpfiles) {
			cpb.tick()
			String annotName = pos1file.getName()
			if (!annotName.endsWith(annotSuffix)) {
				println "Warning: skipping unexpected stdoff file "+pos1file
				continue;
			}
			File srcfile = new File(txmDir, annotName.substring(0, annotName.length() - annotSuffix.length()));
			if (!srcfile.exists()) {
				println "Warning: no xml-txm file found for stdoff file "+pos1file
				continue;
			}

			File temp = File.createTempFile("Annotate", "temp", srcfile.getParentFile());
			def builder = new AnnotationInjection(srcfile.toURI().toURL(), pos1file.toURI().toURL());
			if (!builder.process(temp)) {
				temp.delete(); // do not leave the partial result in the TXM directory
				return false;
			}
			builder = null;

			// replace the source file with the annotated one
			if (!(srcfile.delete() && temp.renameTo(srcfile)))
				println "Warning can't rename file "+temp+" to "+srcfile
		}
		println("")
		return true;
	}

	/**
	 * Sets the directory the TreeTagger model files are read from.
	 *
	 * @param modelsDirectory the models directory
	 */
	public void setModelsDirectory(File modelsDirectory) {
		this.modelsDirectory = modelsDirectory;
	}

	/**
	 * Runs file by file. Allows to have one different lang per file.
	 * Default behavior: new word properties are added.
	 *
	 * @param binDir the directory where the working directories are (re)created
	 * @param txmDir the directory of the TEI-TXM files to annotate
	 * @param langs associates a file name with a model name (without the ".par" extension)
	 * @return true, if successful
	 */
	public boolean run(File binDir, File txmDir, HashMap<String, String> langs)
	{
		return run(binDir, txmDir, langs, false, new String[0], new String[0]);
	}

	/**
	 * Runs file by file, in parallel. Allows to have one different lang per file.
	 *
	 * A file for which a step fails is reported and left untouched; the other
	 * files are still processed.
	 *
	 * NOTE(review): returns true when the TreeTagger binaries are not found,
	 * unlike the step by step run which returns false; kept as is since callers
	 * may rely on it - confirm.
	 *
	 * @param binDir the directory where the working directories are (re)created
	 * @param txmDir the directory of the TEI-TXM files to annotate
	 * @param langs associates a file name with a model name (without the ".par" extension)
	 * @param replace passed to AnnotationInjection: replace or create the word properties
	 * @param properties the 2 word properties to create, see initTTOutfileInfos
	 * @param options the TreeTagger options, see applyTT
	 * @return true once all the files have been processed; false if txmDir can not be listed
	 */
	public boolean run(File binDir, File txmDir, HashMap<String, String> langs, boolean replace, String[] properties, String[] options)
	{
		if (!new File(Toolbox.getParam(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/").exists()) {
			println("Path to TreeTagger is wrong "+Toolbox.getParam(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/")
			return true;
		}

		//cleaning
		File annotDir = new File(binDir, "annotations");
		annotDir.deleteDir();
		annotDir.mkdir();
		File ptreetaggerDir = new File(binDir, "ptreetagger");
		ptreetaggerDir.deleteDir();
		ptreetaggerDir.mkdir();
		File treetaggerDir = new File(binDir, "treetagger");
		treetaggerDir.deleteDir();
		treetaggerDir.mkdir();

		File[] files = txmDir.listFiles()
		if (files == null) { // listFiles() returns null when txmDir is not a readable directory
			println "No file to annotate in "+txmDir.getAbsolutePath()
			return false;
		}

		int cores = Runtime.getRuntime().availableProcessors()
		int coresToUse = Math.max(1.0, cores * 0.7)
		ExecutorService pool = Executors.newFixedThreadPool(coresToUse)

		// initTTOutfileInfos() stores its result in fields shared by all the
		// workers and writeStandoffFile() reads them back: the two calls must
		// not be interleaved between workers.
		final Object standoffLock = new Object()

		ConsoleProgressBar cpb = new ConsoleProgressBar(files.length)
		int counter = 1; // declared outside of the loop so each task gets its own name
		for (File teiFile : files) {
			ThreadFile t = new ThreadFile("TT_"+counter++, teiFile) {

						public void run() {

							if (cancelNow) return;

							String model = langs.get(f.getName());
							if (model == null) {
								println "Error: no lang defined for file $f"
								return;
							}

							File modelfile = new File(modelsDirectory, model+".par");
							if (!modelfile.exists()) {
								println "Error: No Modelfile available for lang "+modelfile+". Continue import process ";
								return;
							}
							File annotfile = new File(annotDir, f.getName()+"-STDOFF.xml");
							File ttsrcfile = new File(ptreetaggerDir, f.getName()+"-src.tt");
							File ttrezfile = new File(treetaggerDir, f.getName()+"-out.tt");

							//BUILD TT FILE READY TO BE TAGGED
							def builder = new BuildTTSrc(f.toURL());
							if (!builder.process(ttsrcfile, null)) {
								println "Error: failed to build the TreeTagger source file of $f"
								return;
							}

							//Apply TT (applyTT reports its own failure)
							if (!applyTT(ttsrcfile, ttrezfile, modelfile, options)) return;

							//CREATE STANDOFF FILES
							boolean standoffWritten
							synchronized (standoffLock) {
								initTTOutfileInfos(binDir, modelfile, model, properties);
								standoffWritten = writeStandoffFile(ttrezfile, annotfile)
							}
							if (!standoffWritten) {
								println "Error: failed to build the standoff file of $f"
								return;
							}

							//INJECT ANNOTATIONS
							File tmpFile = new File(txmDir, "temp_"+f.getName())
							builder = new AnnotationInjection(f.toURL(), annotfile.toURL(), replace);
							if (!builder.process(tmpFile)) {
								// keep the source file: deleting it now would lose the document
								println "Error: failed to inject the annotations in $f"
								tmpFile.delete();
								return;
							}
							if (!(f.delete() && tmpFile.renameTo(f))) println "Warning can't rename file "+tmpFile+" to "+f

							cpb.tick();
						}
					};

			pool.execute(t)
		}

		pool.shutdown()
		pool.awaitTermination(10, TimeUnit.HOURS)
		println ""
		return true;
	}

	/**
	 * Asks the running annotation to stop as soon as possible.
	 */
	public void setCancelNow() {
		cancelNow = true;
	}

	/**
	 * A named task bound to the file it processes; run() is defined by the
	 * anonymous subclasses created in the parallel run.
	 */
	public class ThreadFile extends Thread {
		/** The file to process. */
		File f;

		/**
		 * @param name the thread name
		 * @param f the file to process
		 */
		public ThreadFile(String name, File f) {
			this.setName(name)
			this.f = f;
		}
	}
}