Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xml / importer.groovy @ 479

History | View | Annotate | Download (6.3 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
25
// $LastChangedRevision: 3451 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.xml
29

    
30
//import filters.TabulatedToXml.*;
31
import java.io.File
32

    
33
import javax.xml.stream.*
34

    
35
import org.txm.*
36
import org.txm.importer.*
37
import org.txm.importer.filters.*
38
import org.txm.objects.*
39
import org.txm.scripts.*
40
import org.txm.scripts.teitxm.*
41

    
42
import filters.CutHeader.*
43
import filters.FusionHeader.*
44
import filters.TagSentences.*
45
import filters.Tokeniser.*
46

    
47
// TODO: Auto-generated Javadoc
48
/**
 * First step of the XML import: cleans the working directory, validates the
 * XML sources, optionally tokenizes them and converts each file with
 * {@link Xml2Ana}.
 *
 * Configure the instance with {@link #doTokenize(boolean)},
 * {@link #doValidation(boolean)} and {@link #setStopIfMalformed(boolean)}
 * before calling {@link #run}.
 */
class importer {

        /** Flag stored by {@link #doValidation(boolean)}; it is not read by {@link #run} in this class. */
        boolean validation;

        /** When true, {@link #run} tokenizes every source file with SimpleTokenizerXml. */
        boolean tokenize;

        /** When true, {@link #run} also goes through the tokenization step (same effect as {@link #tokenize} here). */
        boolean sentence = false;

        /**
         * When true, {@link #run} aborts and returns false as soon as ValidateXml.test
         * rejects a file; when false the rejected file is skipped.
         */
        boolean stopIfMalformed = false;

        /**
         * Enables or disables the tokenization step.
         *
         * @param b true to tokenize the sources
         */
        public void doTokenize(boolean b) { tokenize = b; }

        /**
         * Sets the validation flag.
         *
         * @param b the new value of {@link #validation}
         */
        public void doValidation(boolean b) { validation = b; }

        /**
         * Chooses what happens when a source file is rejected by ValidateXml.test.
         *
         * @param b true to abort the import, false to skip the file
         */
        public void setStopIfMalformed(boolean b) {
                stopIfMalformed = b;
        }

        /**
         * Runs the import.
         *
         * Steps, in order:
         * <ol>
         * <li>recreate the "tokenized" directory in binDir;</li>
         * <li>delete from txmDir every hidden entry and every entry whose name
         *     does not end with ".xml" (case-insensitive);</li>
         * <li>keep the readable, non-"metadata*" files accepted by ValidateXml.test
         *     ("import.xml" is validated but not kept);</li>
         * <li>copy the kept files to binDir/src through CleanFile.removeSurrogateFromXmlFile;</li>
         * <li>if {@link #tokenize} or {@link #sentence} is set, tokenize them into binDir/tokenized;</li>
         * <li>process each resulting file with Xml2Ana, writing the result in txmDir
         *     under the same file name.</li>
         * </ol>
         *
         * @param srcDir not used by this method
         * @param binDir directory receiving the "src" and "tokenized" working directories
         * @param txmDir directory holding the XML files to import; it also receives the Xml2Ana output
         * @param basename not used by this method
         * @param ignoredElements passed to SimpleTokenizerXml.setOutSideTextTagsAndKeepContent
         *        when not null and not blank
         * @param lang language passed to the SimpleTokenizerXml constructor
         * @return true if txmDir contains at least one entry at the end of the process,
         *         false if the import was aborted or produced nothing
         */
        public boolean run(File srcDir, File binDir, File txmDir, String basename, String ignoredElements, String lang)
        {
                // Always start from an empty "tokenized" working directory.
                File tokenizedDir = new File(binDir, "tokenized")
                tokenizedDir.deleteDir()
                tokenizedDir.mkdir()

                println "Sources cleaning & validation";

                // Clean txmDir: only visible *.xml entries may stay.
                // listFiles() returns null when txmDir is not a listable directory.
                File[] entries = txmDir.listFiles()
                if (entries != null) {
                        for (File f : entries) {
                                String name = f.getName()
                                if (f.isHidden() || !name.toLowerCase().endsWith(".xml")) {
                                        if (!f.delete()) {
                                                println "WARNING: could not clean $txmDir directory: TXM could not delete $f"
                                                return false;
                                        }
                                }
                        }
                }

                File[] files = txmDir.listFiles()
                if (files == null || files.size() == 0) {
                        println "No XML file (*.xml) to process. Aborting"
                        return false;
                }
                // Validate in a deterministic (sorted) order.
                Arrays.sort(files)

                //TEST EACH XML FILE IF VALID
                List<File> okfiles = []
                for (File f : files) {
                        String name = f.getName()
                        if (!name.toLowerCase().endsWith(".xml")) continue;
                        if (f.isHidden() || !f.canRead() || name.startsWith("metadata") || f.isDirectory()) continue;

                        print "."
                        if (!ValidateXml.test(f)) {
                                if (stopIfMalformed) {
                                        // explicit result: the original bare "return" only yielded false through null coercion
                                        return false;
                                }
                                continue;
                        }
                        // "import.xml" is validated like the others but is not a source text
                        if (!name.equals("import.xml")) {
                                okfiles.add(f);
                        }
                }
                println ""

                okfiles.sort()
                println "Files processed: "+okfiles
                if (okfiles.size() == 0) {
                        println "No file. Check if the file extensions are '.xml'"
                        return false;
                }

                // Fix surrogates: the cleaned copies are written in binDir/src
                File srcDirectory = new File(binDir, "src")
                srcDirectory.mkdir()
                for (File f : okfiles) {
                        File outputFile = new File(srcDirectory, f.getName())
                        CleanFile.removeSurrogateFromXmlFile(f, outputFile);
                }
                // From now on work on everything found in binDir/src.
                File[] cleanedFiles = srcDirectory.listFiles()
                if (cleanedFiles == null) {
                        // the directory could not be created or listed: nothing can be processed
                        println "Failed to list the files of "+srcDirectory
                        return false;
                }
                Arrays.sort(cleanedFiles)
                okfiles = cleanedFiles.toList()

                //TOKENIZE FILES
                List<File> tokenfiles;
                if (tokenize || sentence) {
                        println "Tokenizing "+okfiles.size()+" files"
                        boolean hasIgnoredElements = ignoredElements != null && ignoredElements.trim().length() > 0
                        for (File f : okfiles) {
                                print "."
                                File outfile = new File(tokenizedDir, f.getName());
                                SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(f, outfile, lang)
                                if (hasIgnoredElements) {
                                        tokenizer.setOutSideTextTagsAndKeepContent(ignoredElements)
                                }
                                if (!tokenizer.process()) {
                                        println("Failed to process "+f)
                                        // do not let a partial result reach the next step
                                        outfile.delete()
                                }
                        }
                        File[] tokenized = tokenizedDir.listFiles()
                        tokenfiles = (tokenized == null) ? [] : tokenized.toList()
                        println ""
                } else {
                        tokenfiles = okfiles;
                }

                if (tokenfiles.size() == 0) {
                        return false;
                }

                //TRANSFORM INTO XML-TEI-TXM
                println("Building XML-TXM ("+tokenfiles.size()+" files)")
                for (File f : tokenfiles) {
                        print "."
                        File target = new File(txmDir, f.getName())

                        def correspType = new HashMap<String,String>()
                        def correspRef = new HashMap<String,String>()
                        // ids of all the respStmt elements
                        def respId = [];
                        // maps a respId to the execution report of the tool
                        def applications = new HashMap<String,HashMap<String,String>>();
                        // maps a respId to the "type" attributes of the "ana" property of the TXM words,
                        // used to build the references to the taxonomies
                        def taxonomiesUtilisees = new HashMap<String,String[]>();
                        // maps an item id to its description and URI
                        def itemsURI = new HashMap<String,HashMap<String,String>>();
                        // respStmt information: respId -> [description, person, date]
                        def resps = new HashMap<String,String[]>();

                        // run the conversion
                        def builder = new Xml2Ana(f);
                        builder.setConvertAllAtrtibutes true;
                        builder.setCorrespondances(correspRef, correspType);
                        builder.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI)
                        if (!builder.process(target)) {
                                println("Failed to process "+f);
                                target.delete();
                        }
                }
                println ""

                // The import succeeds when txmDir is not empty at the end.
                File[] results = txmDir.listFiles()
                return results != null && results.size() > 0;
        }
}