Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / teitxm / AnnotateWTC.groovy @ 479

History | View | Annotate | Download (5.1 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2013-12-06 16:57:35 +0100 (Fri, 06 Dec 2013) $
// $LastChangedRevision: 2583 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.scripts.teitxm
29

    
30
import java.io.File
31
import java.text.DateFormat
32
import java.util.Date
33

    
34
import org.txm.Toolbox
35
import org.txm.importer.*
36
import org.txm.importer.cwb.*
37
import org.txm.objects.*
38
import org.txm.stat.utils.ConsoleProgressBar
39
import org.txm.utils.DeleteDir;
40
import org.txm.utils.LangDetector;
41
import org.txm.utils.treetagger.TreeTagger
42

    
43
// TODO: Auto-generated Javadoc
44
/**
 * Runs TreeTagger on every entry of a source directory and writes each tagged
 * result, under the same file name, into the "annotations" sub-directory of a
 * binary directory.
 *
 * The source directory is left untouched: results are only written to the
 * "annotations" directory.
 */
class AnnotateWTC {

    /** When true, run() prints the directories and TreeTagger paths it works with. Enabled by default. */
    boolean debug = true;

    /**
     * Turns the debug output on.
     */
    public void setDebug() { debug = true; }

    /** Identifier; neither read nor written by the code of this class. */
    String id;

    /**
     * Applies TreeTagger to one file.
     *
     * The tagger is created from the TREETAGGER_INSTALL_PATH and
     * TREETAGGER_OPTIONS Toolbox parameters. Any exception raised while
     * configuring or running it is printed and reported as a failure.
     *
     * @param ttsrcfile the file given to TreeTagger as input
     * @param ttoutfile the file TreeTagger is asked to write
     * @param modelfile the TreeTagger model file
     * @return true if no exception was raised, false otherwise
     */
    public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile)
    {
        try {
            // TODO: why org.txm.utils.treetagger.TreeTagger tt = ..., throw IllegalAccessError ???
            String opt = Toolbox.getParam(Toolbox.TREETAGGER_OPTIONS);
            if (opt == null) opt = "";
            // The option string is split on TWO consecutive spaces.
            // NOTE(review): an empty option string yields one empty option ("") here;
            // confirm that the TreeTagger wrapper tolerates it.
            String[] options = opt.split("  ");
            def tt = new TreeTagger(Toolbox.getParam(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/", options);

            // Fixed tagger configuration, applied before every run.
            tt.settoken();
            tt.setlemma();
            tt.setquiet();
            tt.setsgml();
            tt.setnounknown();
            tt.seteostag("<s>");

            tt.treetagger(modelfile.getAbsolutePath(), ttsrcfile.getAbsolutePath(), ttoutfile.getAbsolutePath())
        }
        catch(Exception e)
        {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Tags every entry of txmDir with the given model and stores the results in
     * the "annotations" sub-directory of binDir.
     *
     * An entry is skipped when its result file already exists, is at least as
     * recent as the source and is strictly bigger than it. The run stops at
     * the first entry that fails.
     *
     * @param binDir the directory in which the "annotations" directory is created
     * @param txmDir the directory whose entries are tagged
     * @param modelfilename name of the model file, resolved against the
     *        TREETAGGER_MODELS_PATH Toolbox parameter
     * @return true if every entry was tagged or skipped; false if the
     *         TreeTagger binaries, the model file or the "annotations"
     *         directory are missing, if txmDir cannot be listed or is empty,
     *         or if tagging an entry failed
     */
    public boolean run(File binDir, File txmDir,  String modelfilename)
    {
        if (debug) {
            println "rootDirFile "+binDir
            println "txmDir "+txmDir
            println "TREETAGGER INSTALL PATH : "+Toolbox.getParam(Toolbox.TREETAGGER_INSTALL_PATH)
            println "TREETAGGER MODELS PATH : "+Toolbox.getParam(Toolbox.TREETAGGER_MODELS_PATH)
        }

        // test if the Toolbox knows TreeTagger
        String ttBinPath = Toolbox.getParam(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/"
        if (!new File(ttBinPath).exists()) {
            println("Could not find TreeTagger binaries in "+ttBinPath)
            return false;
        }

        // Output directory. mkdir() also returns false when the directory already
        // exists, so the check is on the resulting state, not on its return value.
        File annotDir = new File(binDir, "annotations")
        annotDir.mkdir();
        if (!annotDir.isDirectory()) {
            println "Skipping ANNOTATE: could not create directory: "+annotDir
            return false;
        }

        // listFiles() returns null when txmDir is not a readable directory
        List<File> files = txmDir.listFiles()

        // get model file and check it
        File modelfile = new File(Toolbox.getParam(Toolbox.TREETAGGER_MODELS_PATH), modelfilename);
        if (debug)
            println "model file : "+modelfile;

        if (!modelfile.exists()) {
            println "Skipping ANNOTATE: Incorrect modelfile path: "+modelfile;
            if(System.getProperty("os.name").startsWith("Windows"))
                println "Windows users: Windows might be hiding files extension. To see them, in the explorer parameters."
            return false;
        }

        // Must be tested BEFORE files.size() is used in the message below.
        if (files == null) {
            println "Skipping ANNOTATE: could not list files of directory: "+txmDir
            return false;
        }

        // APPLY TREETAGGER
        println("Applying $modelfilename TreeTagger model on dir: "+txmDir+ " ("+files.size()+" files)")
        if (files.isEmpty())
            return false;

        ConsoleProgressBar cpb = new ConsoleProgressBar(files.size());
        for (File f : files) {
            cpb.tick();

            File outfile = new File(annotDir, f.getName());
            boolean upToDate = outfile.exists() &&          // result already there
                    outfile.lastModified() >= f.lastModified() && // and not older than the source
                    outfile.length() > f.length();                // and bigger than the source
            if (upToDate)
                continue;

            if (!applyTT(f, outfile, modelfile)) {
                System.out.println("Failed to apply treetagger on file "+f);
                return false;
            }
        }
        println("")

        // The results stay in the "annotations" directory; txmDir is not replaced.
        return true;
    }
}