root / tmp / org.txm.core / src / groovy / org / txm / importer / xml / importer.groovy @ 187

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2016-12-01 17:48:31 +0100 (Thu, 01 Dec 2016) $
// $LastChangedRevision: 3351 $
// $LastChangedBy: mdecorde $
//
package org.txm.importer.xml

//import filters.TabulatedToXml.*;
import java.io.File

import javax.xml.stream.*

import org.txm.*
import org.txm.importer.*
import org.txm.importer.filters.*
import org.txm.objects.*
import org.txm.scripts.*
import org.txm.scripts.teitxm.*

import filters.CutHeader.*
import filters.FusionHeader.*
import filters.TagSentences.*
import filters.Tokeniser.*

/**
 * Importer for generic XML sources: cleans and validates the source files,
 * optionally tokenizes them, then converts them to XML-TEI-TXM.
 */
class importer {

    /** Whether XML validation is enabled. */
    boolean validation;

    /** Whether tokenization is enabled. */
    boolean tokenize;

    /** Whether sentence tagging is enabled. */
    boolean sentence = false;

    /**
     * Enables or disables tokenization.
     *
     * @param b true to tokenize the source files
     */
    public void doTokenize(boolean b) { tokenize = b; }

    /**
     * Enables or disables XML validation.
     *
     * @param b true to validate the source files
     */
    public void doValidation(boolean b) { validation = b; }

    /**
     * Runs the import: cleans the XML directory, validates the files,
     * tokenizes them if requested and converts them to XML-TEI-TXM.
     *
     * @param srcDir the source directory
     * @param binDir the binary (working) directory where intermediate files are written
     * @param txmDir the directory containing the XML files to process
     * @param basename the corpus base name
     * @param ignoredElements elements the tokenizer treats as out-of-text tags (their content is kept)
     * @param lang the main language of the corpus
     * @return true, if successful
     */
    public boolean run(File srcDir, File binDir, File txmDir, String basename, String ignoredElements, String lang)
    {
        new File(binDir, "tokenized").deleteDir();
        new File(binDir, "tokenized").mkdir();

        ArrayList<String> milestones = new ArrayList<String>(); // tags that must remain milestones

        // TEST EACH XML FILE IF VALID
        List<File> srcfiles = txmDir.listFiles();
        List<File> okfiles = []

        println "Sources cleaning & validation";
        def files = txmDir.listFiles()
        for (File f : txmDir.listFiles()) { // clean directory
            String name = f.getName()
            if (f.isHidden() || !name.toLowerCase().endsWith(".xml")) {
                if (!f.delete()) {
                    println "WARNING: could not clean $txmDir directory: TXM could not delete $f"
                    return false;
                }
            }
        }

        files = txmDir.listFiles()
        if (files == null || files.size() == 0) {
            println "No XML file (*.xml) to process. Aborting"
            return false;
        }
        files = files.sort()
        for (File f : files) {
            if (!f.getName().toLowerCase().endsWith(".xml")) continue;

            if (!f.isHidden() && f.canRead() && !f.getName().endsWith(".properties") && !f.getName().startsWith("metadata") && !f.isDirectory()) {
                print "."
                if (!ValidateXml.test(f)) {
                    if (stopIfMalformed)
                        return false;
                    else
                        continue;
                } else {
                    if (!f.getName().equals("import.xml"))
                        okfiles.add(f);
                }
            }
        }
        println ""

        okfiles.sort()
        println "Files processed: "+okfiles
        if (okfiles.size() == 0) {
            println "No file to process. Check that the file extensions are '.xml'"
            return false;
        }

        //println "ptokenfiles "+ptokenfiles
        // TOKENIZE FILES
        List<File> tokenfiles;
        if (tokenize || sentence) {
            println "Tokenizing "+okfiles.size()+" files"
            for (File f : okfiles) {
                print "."
                File infile = f;
                File outfile = new File(binDir, "tokenized/"+f.getName());
                SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
                if (ignoredElements != null && ignoredElements.trim().length() > 0)
                    tokenizer.setOutSideTextTagsAndKeepContent(ignoredElements)
                if (!tokenizer.process()) {
                    println("Failed to process "+f)
                    outfile.delete()
                }
            }
            tokenfiles = new File(binDir, "tokenized").listFiles()
            println ""
        } else {
            tokenfiles = okfiles;
        }

//        Change XML tag "text" to "textunit"
//        for (File f : tokenfiles) {
//            if (!RenameTag.rename(f, "text", "textunit")) {
//                println "Failed to rename <text> tag to <textunit> in file "+f
//                return false;
//            }
//        }

        // TRANSFORM INTO XML-TEI-TXM
        println("Building XML-TXM ("+tokenfiles.size()+" files)")
        for (File f : tokenfiles) {
            //ArrayList<String> milestones = new ArrayList<String>();
            print "."
            File file = f;
            String txmfile = f.getName();

            def correspType = new HashMap<String,String>()
            def correspRef = new HashMap<String,String>()
            // list the ids of all the respStmt elements
            def respId = [];
            // maps a respId to the execution report of the corresponding tool
            def applications = new HashMap<String,HashMap<String,String>>();
            // maps a respId to the "type" attributes of the "ana" property of the txm w element,
            // used to build the references to the taxonomies
            def taxonomiesUtilisees = new HashMap<String,String[]>();
            // maps an item id to its description and its URI
            def itemsURI = new HashMap<String,HashMap<String,String>>();
            // respStmt information:
            // resps (respId <see above>, [description, person, date])
            def resps = new HashMap<String,String[]>();
            // start the processing
            def builder = new Xml2Ana(file);
            builder.setConvertAllAtrtibutes(true);
            builder.setCorrespondances(correspRef, correspType);
            builder.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI)
            //builder.setAddTEIHeader();
            if (!builder.process(new File(txmDir, txmfile))) {
                println("Failed to process "+f);
                new File(txmDir, txmfile).delete();
            }
        }
        println ""
        return true;
    }

    /** Whether the import stops at the first malformed XML file. */
    boolean stopIfMalformed = false;
    public void setStopIfMalformed(boolean b) {
        stopIfMalformed = b;
    }
}
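
A minimal usage sketch (not part of the file above), showing how this class might be driven from a Groovy import script, assuming the TXM classes imported above are on the classpath. The directory paths, corpus name, ignored-element value and language code below are hypothetical placeholders.

def imp = new importer()
imp.doValidation(true)        // set the validation flag
imp.doTokenize(true)          // tokenize the sources before building XML-TXM
imp.setStopIfMalformed(false) // skip malformed files instead of aborting the import

File srcDir = new File("/path/to/corpus/src") // hypothetical source directory
File binDir = new File("/path/to/corpus/bin") // hypothetical binary (working) directory
File txmDir = new File(binDir, "txm")         // hypothetical directory holding the XML files to process

if (!imp.run(srcDir, binDir, txmDir, "MYCORPUS", "teiHeader", "fr")) {
    println "Import failed"
}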