Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xml / importer.groovy @ 1000

History | View | Annotate | Download (6.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
25
// $LastChangedRevision: 3451 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.xml
29

    
30
//import org.txm.scripts.filters.TabulatedToXml.*;
31
import java.io.File
32

    
33
import javax.xml.stream.*
34

    
35
import org.txm.*
36
import org.txm.scripts.importer.*
37
import org.txm.importer.scripts.filters.*
38
import org.txm.objects.*
39
import org.txm.scripts.*
40
import org.txm.importer.scripts.xmltxm.*
41

    
42
import org.txm.scripts.filters.CutHeader.*
43
import org.txm.scripts.filters.FusionHeader.*
44
import org.txm.scripts.filters.TagSentences.*
45
import org.txm.scripts.filters.Tokeniser.*
46
import org.txm.importer.ValidateXml
47

    
48
/**
 * Imports a directory of XML files into XML-TXM files.
 *
 * {@link #run} cleans the working directory, validates every XML file with
 * ValidateXml, rewrites the valid ones through CleanFile to remove surrogate
 * characters, optionally tokenizes them with SimpleTokenizerXml and finally
 * builds the XML-TXM files with Xml2Ana.
 */
class importer {

    /** Validation flag. It is stored by {@link #doValidation} and is not read by {@link #run}. */
    boolean validation

    /** When true, the files are tokenized before the XML-TXM files are built. */
    boolean tokenize

    /** When true, the tokenization step runs even if {@link #tokenize} is false. */
    boolean sentence = false

    /**
     * When true, {@link #run} aborts as soon as a file is rejected by ValidateXml;
     * when false the rejected file is simply skipped.
     */
    boolean stopIfMalformed = false

    /**
     * Enables or disables the tokenization step.
     *
     * @param b true to tokenize the files
     */
    public void doTokenize(boolean b) { tokenize = b }

    /**
     * Sets the validation flag.
     *
     * @param b the new value of the validation flag
     */
    public void doValidation(boolean b) { validation = b }

    /**
     * Sets whether a malformed XML file aborts the whole import.
     *
     * @param b true to abort at the first malformed file, false to skip it
     */
    public void setStopIfMalformed(boolean b) {
        stopIfMalformed = b
    }

    /**
     * Runs the import.
     *
     * @param srcDir not used by this method
     * @param binDir working directory; its "tokenized" sub-directory is recreated
     *               and its "src" sub-directory receives the surrogate-free copies
     * @param txmDir directory holding the XML files to import; every hidden or
     *               non ".xml" entry is deleted from it, and the XML-TXM results
     *               are written into it
     * @param basename not used by this method
     * @param ignoredElements passed to the tokenizer's
     *               setOutSideTextTagsAndKeepContent when not null and not blank
     * @param lang language passed to the tokenizer
     * @return true if txmDir contains at least one file when the import ends
     */
    public boolean run(File srcDir, File binDir, File txmDir, String basename, String ignoredElements, String lang)
    {
        File tokenizedDir = new File(binDir, "tokenized")
        tokenizedDir.deleteDir()
        tokenizedDir.mkdir()

        println "Sources cleaning & validation"
        if (!cleanDirectory(txmDir)) {
            return false
        }

        File[] files = txmDir.listFiles()
        if (files == null || files.length == 0) {
            println "No XML file (*.xml) to process. Aborting"
            return false
        }
        // validate in a stable order so that the file stopping the import
        // (see stopIfMalformed) does not depend on the file system
        files = files.sort()

        List<File> okfiles = validateFiles(files)
        if (okfiles == null) { // aborted on a malformed file
            return false
        }
        println ""

        okfiles.sort()
        println "Files processed: "+okfiles
        if (okfiles.size() == 0) {
            println "No file. Check if the file extensions are '.xml'"
            return false
        }

        okfiles = fixSurrogates(okfiles, binDir)

        List<File> tokenfiles
        if (tokenize || sentence) {
            tokenfiles = tokenizeFiles(okfiles, tokenizedDir, ignoredElements, lang)
        } else {
            tokenfiles = okfiles
        }

        if (tokenfiles.size() == 0) {
            return false
        }

        buildXmlTxm(tokenfiles, txmDir)

        File[] results = txmDir.listFiles()
        return results != null && results.length > 0
    }

    /**
     * Deletes every hidden entry and every entry whose name does not end with ".xml".
     *
     * @return false as soon as one of those entries cannot be deleted
     */
    private boolean cleanDirectory(File dir) {
        File[] entries = dir.listFiles()
        if (entries == null) { // nothing to clean; the caller reports the missing files
            return true
        }
        for (File f : entries) {
            String name = f.getName()
            if (f.isHidden() || !name.toLowerCase().endsWith(".xml")) {
                if (!f.delete()) {
                    println "WARNING: could not clean $dir directory: TXM could not delete $f"
                    return false
                }
            }
        }
        return true
    }

    /**
     * Keeps the readable ".xml" files accepted by ValidateXml, "import.xml" excepted.
     *
     * @return the accepted files, or null if a file was rejected while
     *         stopIfMalformed is set
     */
    private List<File> validateFiles(File[] files) {
        List<File> accepted = []
        for (File f : files) {
            String name = f.getName()
            if (!name.toLowerCase().endsWith(".xml")) continue
            if (f.isHidden() || !f.canRead() || f.isDirectory() || name.startsWith("metadata")) continue

            print "."
            if (ValidateXml.test(f)) {
                if (!name.equals("import.xml")) {
                    accepted.add(f)
                }
            } else if (stopIfMalformed) {
                return null
            }
        }
        return accepted
    }

    /**
     * Writes a surrogate-free copy of each file into the "src" sub-directory of binDir.
     *
     * @return the sorted content of that directory (empty if it cannot be listed)
     */
    private List<File> fixSurrogates(List<File> files, File binDir) {
        File srcDirectory = new File(binDir, "src")
        srcDirectory.mkdir()
        for (File f : files) {
            File outputFile = new File(srcDirectory, f.getName())
            CleanFile.removeSurrogateFromXmlFile(f, outputFile)
        }
        return sortedListing(srcDirectory)
    }

    /**
     * Tokenizes each file into tokenizedDir; the output of a failed tokenization is deleted.
     *
     * @return the sorted content of tokenizedDir (empty if it cannot be listed)
     */
    private List<File> tokenizeFiles(List<File> files, File tokenizedDir, String ignoredElements, String lang) {
        println "Tokenizing "+files.size()+" files"
        boolean hasIgnoredElements = ignoredElements != null && ignoredElements.trim().length() > 0
        for (File infile : files) {
            print "."
            File outfile = new File(tokenizedDir, infile.getName())
            SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
            if (hasIgnoredElements) {
                tokenizer.setOutSideTextTagsAndKeepContent(ignoredElements)
            }
            if (!tokenizer.process()) {
                println("Failed to process "+infile)
                outfile.delete()
            }
        }
        println ""
        return sortedListing(tokenizedDir)
    }

    /**
     * Builds one XML-TXM file in txmDir per input file; the output of a failed build is deleted.
     */
    private void buildXmlTxm(List<File> tokenfiles, File txmDir) {
        println("Building XML-TXM ("+tokenfiles.size()+" files)")
        for (File f : tokenfiles) {
            print "."
            File target = new File(txmDir, f.getName())

            def correspType = new HashMap<String,String>()
            def correspRef = new HashMap<String,String>()
            // the ids of all the respStmt must be listed
            def respId = []
            // maps a respId to the execution report of the tool
            def applications = new HashMap<String,HashMap<String,String>>()
            // maps a respId to the type attributes of the ana property of the TXM w,
            // used to build the references to the taxonomies
            def taxonomiesUtilisees = new HashMap<String,String[]>()
            // maps an item id to its description and its URI
            def itemsURI = new HashMap<String,HashMap<String,String>>()
            // respStmt informations: resps (respId <see above>, [description, person, date])
            def resps = new HashMap<String,String[]>()

            // run the processing
            def builder = new Xml2Ana(f)
            builder.setConvertAllAtrtibutes true
            builder.setCorrespondances(correspRef, correspType)
            builder.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI)
            if (!builder.process(target)) {
                println("Failed to process "+f)
                target.delete()
            }
        }
        println ""
    }

    /**
     * Lists a directory in sorted order.
     *
     * @return the sorted entries, or an empty list when File.listFiles() returns null
     */
    private List<File> sortedListing(File dir) {
        File[] entries = dir.listFiles()
        if (entries == null) {
            return []
        }
        List<File> listing = entries.toList()
        listing.sort()
        return listing
    }
}