Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / quick / importer.groovy @ 187

History | View | Annotate | Download (9.2 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-12-16 17:28:35 +0100 (Fri, 16 Dec 2016) $
25
// $LastChangedRevision: 3377 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.quick
29

    
30
import javax.xml.stream.*
31

    
32
import org.txm.*
33
import org.txm.importer.*
34
import org.txm.importer.filters.*
35
import org.txm.scripts.*
36
import org.txm.scripts.teitxm.*
37
import org.txm.utils.*
38
import org.txm.utils.i18n.DetectBOM
39

    
40
import filters.CutHeader.*
41
import filters.FusionHeader.*
42
import filters.TagSentences.*
43
import filters.Tokeniser.*
44

    
45
/**
 * Quick import module for plain text sources.
 *
 * Turns every matching source file into a minimal TEI document, tokenizes it,
 * tags its sentences and finally builds the XML-TEI-TXM files.
 */
class importer {

        /**
         * Runs the whole import pipeline.
         *
         * Steps, in order:
         * 1. (re)create the "ptokenized" and "tokenized" directories in binDir;
         * 2. keep only the files of srcDir whose extension is listed in suffixes;
         * 3. write one simple TEI file per source file in txmDir (one "lb" milestone per line);
         * 4. tokenize every file of txmDir into binDir/tokenized;
         * 5. tag sentences into binDir/stokenized;
         * 6. build the XML-TEI-TXM files back into txmDir.
         *
         * A failure on one file is reported on the standard output and the file is
         * skipped; it does not stop the import.
         *
         * @param srcDir the directory containing the source files (scanned with DeleteDir.scanDirectory)
         * @param binDir the directory receiving the "ptokenized", "tokenized" and "stokenized" directories
         * @param txmDir the directory receiving the simple XML files, then the XML-TEI-TXM files
         * @param encoding the source files encoding, or "??" to guess it with CharsetDetector
         * @param suffixes the accepted file extensions (text after the first "." of the file name)
         * @param basename the corpus name (not used by this method)
         * @param lang the language given to the tokenizer
         * @return false if no source file matches the suffixes, true otherwise
         */
        public static boolean run(File srcDir, File binDir, File txmDir, String encoding, List<String> suffixes, String basename, String lang)
        {
                File ptokenizedDir = new File(binDir,"ptokenized");
                ptokenizedDir.deleteDir();
                ptokenizedDir.mkdir();
                File tokenizedDir = new File(binDir,"tokenized");
                tokenizedDir.deleteDir();
                tokenizedDir.mkdir();

                // Build the list of src files: drop the files whose extension is not accepted
                List<File> srcfiles = DeleteDir.scanDirectory(srcDir, true)
                if (srcfiles != null) {
                        Iterator<File> it = srcfiles.iterator();
                        while (it.hasNext()) {
                                String fileext = splitName(it.next().getName())[1];
                                if (!suffixes.contains(fileext)) {
                                        it.remove();
                                }
                        }
                }

                if (srcfiles == null || srcfiles.size() == 0) {
                        println "No 'txt' file to process in directory $srcDir"
                        return false;
                }

                //CREATE SIMPLE XML FILE
                HashSet<String> existingfiles = new HashSet<String>();

                // "??" means "unknown": guess a default encoding from the whole source directory
                String encodingAll = null;
                if (encoding == "??") {
                        encodingAll = new CharsetDetector(srcDir).getEncoding();
                        println "Guessed encoding: $encodingAll"
                }

                println("Create simple XML files ("+srcfiles.size()+" files)")
                for (File srcfile : srcfiles) {
                        print "."
                        File xmlfile = null; // stays null if the failure happens before the target is known
                        try {
                                String filename = splitName(srcfile.getName())[0];
                                xmlfile = new File(txmDir, filename+".xml");

                                while (existingfiles.contains(xmlfile.getName()))//to not erase a file with the same name
                                {
                                        xmlfile = new File(txmDir, "_"+xmlfile.getName());
                                }
                                existingfiles.add(xmlfile.getName());

                                String tmpEncoding = encoding;
                                if (encodingAll != null) {
                                        tmpEncoding = encodingAll
                                        // the file is big enough: guess its own encoding instead of the directory one
                                        if (srcfile.length() > CharsetDetector.MINIMALSIZE) {
                                                tmpEncoding = new CharsetDetector(srcfile).getEncoding();
                                        }
                                }

                                writeSimpleXml(srcfile, xmlfile, filename, tmpEncoding);
                        }
                        catch(Exception e) {
                                println("Failed to process file "+srcfile+": "+e);
                                if (xmlfile != null) {
                                        xmlfile.delete() // streams are already closed by writeSimpleXml
                                }
                        }
                }
                println ""

                println("Tokenizing "+txmDir.listFiles().length+" files")
                for (File pfile : txmDir.listFiles()) {
                        print "."
                        File tfile = new File(tokenizedDir, pfile.getName());

                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(pfile, tfile, lang);
                        tokenizer.setStartTag("text")
                        if (!tokenizer.process()) {
                                println("Failed to tokenize file "+pfile)
                                tfile.delete()
                        }
                }
                println ""

                // Tag sentences
                List<File> stokenfiles = tokenizedDir.listFiles()
                File stokenizeDir = new File(tokenizedDir.getParentFile(), "stokenized")
                stokenizeDir.mkdir();
                println("Tagging sentences of "+stokenfiles.size()+" files")
                for (File f : stokenfiles) {
                        print "."
                        Sequence S = new Sequence();
                        Filter F1 = new CutHeader();
                        Filter F7 = new TagSentences();
                        Filter F11 = new FusionHeader();
                        S.add(F1);
                        S.add(F7);
                        S.add(F11);
                        File infile = new File(tokenizedDir, f.getName());
                        File xmlfile = new File(stokenizeDir, f.getName());
                        // temporary file shared by CutHeader and FusionHeader
                        File headerfile = new File(tokenizedDir, f.getName()+"header.xml");

                        S.SetInFileAndOutFile(infile.getPath(), xmlfile.getPath());
                        S.setEncodages("UTF-8","UTF-8");
                        Object[] arguments1 = [headerfile.getAbsolutePath()];
                        F1.SetUsedParam(arguments1);
                        Object[] arguments2 = [headerfile.getAbsolutePath(), F1];
                        F11.SetUsedParam(arguments2);
                        if (!S.proceed()) {
                                println "Failed to tag file: "+f
                        }

                        S.clean();
                        headerfile.delete();//remove the prepared file to clean
                }
                println ""

                //TRANSFORM INTO XML-TEI-TXM
                println("Building xml-tei-txm ("+stokenizeDir.listFiles().length+" files)")
                for (File tfile : stokenizeDir.listFiles()) {
                        print  "."
                        File xmlfile = new File(txmDir, tfile.getName());

                        // no correspondence nor header information to declare: everything is empty
                        def correspType = new HashMap<String,String>()
                        def correspRef = new HashMap<String,String>()
                        def respId = [];
                        def applications = new HashMap<String,HashMap<String,String>>();
                        def taxonomiesUtilisees = new HashMap<String,String[]>();
                        def itemsURI = new HashMap<String,HashMap<String,String>>();
                        def resps = new HashMap<String,String[]>();

                        Xml2Ana builder = new Xml2Ana(tfile);
                        builder.setCorrespondances(correspRef, correspType);
                        builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
                        if (!builder.process(xmlfile)) {
                                println("Failed to process file "+tfile)
                                xmlfile.delete()
                        }
                }
                println ""

                return true;
        }

        /**
         * Splits a file name at its FIRST dot.
         *
         * A name without dot, or starting with a dot, has an empty extension.
         *
         * @param name the file name, without its directory
         * @return a 2 cells array: the name before the first dot, then the extension after it
         */
        private static String[] splitName(String name)
        {
                int point = name.indexOf(".");
                if (point >= 1) {
                        return [name.substring(0, point), name.substring(point+1)] as String[];
                }
                return [name, ""] as String[];
        }

        /**
         * Writes the simple TEI file of one source file: a minimal teiHeader, then
         * a "text" element containing each source line preceded by a numbered "lb" milestone.
         *
         * The input and output streams are closed whatever happens, so the caller
         * can safely delete the target file after a failure.
         *
         * @param srcfile the text file to read
         * @param xmlfile the XML file to create
         * @param textId the value of the "id" attribute of the "text" element
         * @param encoding the encoding used to read srcfile
         * @throws Exception if the source cannot be read or the result cannot be written
         */
        private static void writeSimpleXml(File srcfile, File xmlfile, String textId, String encoding) throws Exception
        {
                InputStream input = null;
                OutputStream output = null;
                try {
                        input = new FileInputStream(srcfile)
                        Reader reader = new InputStreamReader(input , encoding);
                        // skip the BOM bytes on the raw stream, before the reader decodes anything
                        DetectBOM bomdetector = new DetectBOM(srcfile);
                        for (int ibom = 0 ; ibom < bomdetector.getBOMSize() ; ibom++) input.read()

                        output = new BufferedOutputStream(new FileOutputStream(xmlfile))
                        XMLOutputFactory factory = XMLOutputFactory.newInstance();
                        XMLStreamWriter writer = factory.createXMLStreamWriter(output, "UTF-8");

                        writer.writeStartDocument("UTF-8","1.0");
                        writer.writeStartElement ("TEI");
                        writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0");
                        writer.writeNamespace("txm", "http://textometrie.org/1.0");
                        writer.writeStartElement ("teiHeader");
                        writer.writeStartElement("fileDesc");
                        writer.writeStartElement("titleStmt");
                        writer.writeStartElement("title")
                        writer.writeCharacters(xmlfile.getName())
                        writer.writeEndElement(); // title
                        writer.writeEndElement(); //titleStmt
                        writer.writeStartElement("publicationStmt");
                        writer.writeEndElement(); // publicationStmt
                        writer.writeStartElement("sourceDesc");
                        writer.writeStartElement("p");
                        writer.writeCharacters("Generated by TXM TXT+CSV import module - TXM project - http://textometrie.org");
                        writer.writeEndElement(); // p
                        writer.writeEndElement(); // sourceDesc
                        writer.writeEndElement(); // fileDesc
                        writer.writeStartElement("encodingDesc")
                        writer.writeStartElement("http://textometrie.org/1.0", "applicationDesc")
                        writer.writeEndElement(); // applicationDesc
                        writer.writeEndElement(); // encodingDesc
                        writer.writeEndElement (); // teiHeader
                        writer.writeStartElement ("text");
                        writer.writeAttribute("id",textId)

                        int lb = 1; // number of the current line
                        reader.eachLine{String line ->
                                line = CleanFile.clean(line);
                                line = FixString.normalize(line);
                                writer.writeStartElement("lb")
                                writer.writeAttribute("n",""+lb++)
                                writer.writeEndElement(); // close lb
                                writer.writeCharacters(line.replace("\t"," "))
                                writer.writeCharacters("\n"); // XML readability
                        }

                        writer.writeEndElement();//text
                        writer.writeEndElement();// TEI

                        // XMLStreamWriter.close() does not close the underlying stream
                        writer.close();
                        output.close();
                }
                finally {
                        // cleanup only: a write error has already been thrown by the code above
                        if (output != null) {
                                try { output.close() } catch (IOException ignored) { }
                        }
                        if (input != null) {
                                try { input.close() } catch (IOException ignored) { }
                        }
                }
        }
}