Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / txt / importer.groovy @ 2554

History | View | Annotate | Download (9.6 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
25
// $LastChangedRevision: 3451 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.txt
29

    
30
import javax.xml.stream.*
31

    
32
import org.txm.*
33
import org.txm.scripts.importer.*
34
import org.txm.importer.scripts.filters.*
35
import org.txm.scripts.*
36
import org.txm.importer.scripts.xmltxm.*
37
import org.txm.utils.*
38
import org.txm.utils.i18n.DetectBOM
39

    
40
import org.txm.scripts.filters.CutHeader.*
41
import org.txm.scripts.filters.FusionHeader.*
42
import org.txm.scripts.filters.TagSentences.*
43
import org.txm.scripts.filters.Tokeniser.*
44

    
45
// TODO: Auto-generated Javadoc
46
/**
47
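 * Importer for plain-text sources: wraps each file in a minimal TEI document, tokenizes it, tags sentences and builds the XML-TEI-TXM files.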
48
 */
49
class importer {

    /**
     * Converts the text files of a source directory into XML-TEI-TXM files.
     *
     * Steps, in order:
     * 1. recreate the "stokenized", "ptokenized" and "tokenized" directories under binDir;
     * 2. keep the files of srcDir whose extension is listed in suffixes;
     * 3. write one minimal TEI file per non-empty source file into txmDir
     *    (one "lb" milestone followed by the line text, per source line);
     * 4. tokenize every file of txmDir into binDir/tokenized;
     * 5. tag sentences into binDir/stokenized;
     * 6. build the XML-TEI-TXM files back into txmDir.
     *
     * A failure on a single file is reported on standard output and does not stop the import.
     *
     * @param srcDir the directory scanned for source files
     * @param binDir the directory receiving the intermediate working directories
     * @param txmDir the directory receiving the simple XML files, then the final XML-TEI-TXM files
     * @param encoding the charset name of the source files, or "??" to guess it with CharsetDetector
     * @param suffixes the accepted file extensions, without the leading dot
     * @param basename not used by this method
     * @param lang the language code passed to the tokenizer
     * @return true if txmDir contains at least one file at the end of the process
     */
    public static boolean run(File srcDir, File binDir, File txmDir, String encoding, List<String> suffixes, String basename, String lang)
    {
        // Start from empty working directories. "ptokenized" is not used below
        // but is still recreated, as before.
        File stokenizedDir = resetDir(new File(binDir, "stokenized"))
        resetDir(new File(binDir, "ptokenized"))
        File tokenizedDir = resetDir(new File(binDir, "tokenized"))

        // Build the list of src files. A null scan result is treated as "no file"
        // instead of failing with a NullPointerException.
        List<File> scanned = DeleteDir.scanDirectory(srcDir, true)
        List<File> srcfiles = new ArrayList<File>()
        if (scanned != null) {
            for (File candidate : scanned) {
                if (suffixes.contains(splitName(candidate.getName())[1])) {
                    srcfiles.add(candidate)
                }
            }
        }

        if (srcfiles.size() == 0) {
            println "No 'txt' file to process in directory $srcDir"
            return false
        }

        // CREATE SIMPLE XML FILE
        Set<String> existingfiles = new HashSet<String>()

        // With "??" a corpus-wide encoding is guessed once; files larger than
        // CharsetDetector.MINIMALSIZE get their own guess in the loop below.
        String encodingAll = null
        if (encoding == "??") {
            encodingAll = new CharsetDetector(srcDir).getEncoding()
            println "Guessed encoding: $encodingAll"
        }

        println("Create simple XML files ("+srcfiles.size()+" files)")
        for (File srcfile : srcfiles) {
            if (srcfile.length() == 0) {
                println "Skipping empty file: "+srcfile
                continue
            }
            print "."
            File xmlfile = null
            try {
                String filename = splitName(srcfile.getName())[0]
                xmlfile = new File(txmDir, filename+".xml")

                // Prefix with "_" until the name is free, so that two source files
                // sharing a name do not overwrite each other.
                while (existingfiles.contains(xmlfile.getName())) {
                    xmlfile = new File(txmDir, "_"+xmlfile.getName())
                }
                existingfiles.add(xmlfile.getName())

                String tmpEncoding = encoding
                if (encodingAll != null) {
                    tmpEncoding = encodingAll
                    if (srcfile.length() > CharsetDetector.MINIMALSIZE) {
                        tmpEncoding = new CharsetDetector(srcfile).getEncoding()
                    }
                }

                writeSimpleXml(srcfile, xmlfile, filename, tmpEncoding)
            }
            catch (Exception e) {
                // Report the cause and drop the partial output; xmlfile is still null
                // when the failure happened before the target was chosen.
                println("Failed to process file "+srcfile+": "+e)
                if (xmlfile != null) {
                    xmlfile.delete()
                }
            }
        }
        println ""

        List<File> simpleXmlFiles = listFilesOf(txmDir)
        println("Tokenizing "+simpleXmlFiles.size()+" files")
        for (File pfile : simpleXmlFiles) {
            print "."
            File tfile = new File(tokenizedDir, pfile.getName())

            SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(pfile, tfile, lang)
            tokenizer.setStartTag("text")
            if (!tokenizer.process()) {
                println("Failed to tokenize file "+pfile)
                tfile.delete()
            }
        }
        println ""

        // Tag sentences
        List<File> stokenfiles = listFilesOf(tokenizedDir)
        println("Tagging sentences of "+stokenfiles.size()+" files")
        for (File f : stokenfiles) {
            print "."
            Sequence S = new Sequence()
            Filter F1 = new CutHeader()
            Filter F7 = new TagSentences()
            Filter F11 = new FusionHeader()
            S.add(F1)
            S.add(F7)
            S.add(F11)
            File infile = new File(tokenizedDir, f.getName())
            File xmlfile = new File(stokenizedDir, f.getName())
            // Temporary file whose path is handed to both CutHeader and FusionHeader.
            File headerfile = new File(tokenizedDir, f.getName()+"header.xml")

            S.SetInFileAndOutFile(infile.getPath(), xmlfile.getPath())
            S.setEncodages("UTF-8","UTF-8")
            Object[] arguments1 = [headerfile.getAbsolutePath()]
            F1.SetUsedParam(arguments1)
            Object[] arguments2 = [headerfile.getAbsolutePath(), F1]
            F11.SetUsedParam(arguments2)
            if (!S.proceed()) {
                println "Failed to tag file: "+f
            }

            S.clean()
            headerfile.delete() // remove the prepared file to clean
        }
        println ""

        // TRANSFORM INTO XML-TEI-TXM
        List<File> taggedFiles = listFilesOf(stokenizedDir)
        println("Building xml-tei-txm ("+taggedFiles.size()+" files)")
        for (File tfile : taggedFiles) {
            print  "."
            File xmlfile = new File(txmDir, tfile.getName())

            // This importer has no header information to pass on: all structures are empty.
            def correspType = new HashMap<String,String>()
            def correspRef = new HashMap<String,String>()
            def respId = []
            def applications = new HashMap<String,HashMap<String,String>>()
            def taxonomiesUtilisees = new HashMap<String,String[]>()
            def itemsURI = new HashMap<String,HashMap<String,String>>()
            def resps = new HashMap<String,String[]>()

            Xml2Ana builder = new Xml2Ana(tfile)
            builder.setCorrespondances(correspRef, correspType)
            builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
            if (!builder.process(xmlfile)) {
                println("Failed to process file "+tfile)
                xmlfile.delete()
            }
        }
        println ""

        return !listFilesOf(txmDir).isEmpty()
    }

    /**
     * Deletes the directory with its content, then creates it again empty.
     *
     * @param dir the directory to reset
     * @return dir, for chaining
     */
    private static File resetDir(File dir) {
        dir.deleteDir()
        dir.mkdir()
        return dir
    }

    /**
     * Lists the content of a directory, returning an empty list where
     * File.listFiles() returns null.
     *
     * @param dir the directory to list
     * @return the files of the directory, never null
     */
    private static List<File> listFilesOf(File dir) {
        File[] found = dir.listFiles()
        if (found == null) {
            return new ArrayList<File>()
        }
        return new ArrayList<File>(Arrays.asList(found))
    }

    /**
     * Splits a file name at its FIRST dot.
     *
     * The split is only done when the dot is not the first character; otherwise
     * the whole name is the base name and the extension is empty.
     * NOTE(review): with this rule "a.b.txt" has the extension "b.txt" and is
     * not matched by the suffix "txt" - confirm whether this is intended.
     *
     * @param name the file name, without its directory
     * @return a two element array: base name, extension
     */
    private static String[] splitName(String name) {
        int point = name.indexOf(".")
        if (point >= 1) {
            return [name.substring(0, point), name.substring(point + 1)] as String[]
        }
        return [name, ""] as String[]
    }

    /**
     * Writes the minimal TEI file of one source file: a fixed teiHeader, then a
     * "text" element holding, for each source line, a numbered "lb" milestone
     * followed by the cleaned line text.
     *
     * All streams opened here are closed before returning, also on failure, so
     * that the caller can delete a partial output file.
     *
     * @param srcfile the text file to read
     * @param xmlfile the XML file to write
     * @param textId the value of the "id" attribute of the "text" element
     * @param charset the charset name used to decode srcfile
     * @throws Exception if reading, decoding or writing fails
     */
    private static void writeSimpleXml(File srcfile, File xmlfile, String textId, String charset) throws Exception {
        int bomSize = new DetectBOM(srcfile).getBOMSize()
        InputStream input = new FileInputStream(srcfile)
        OutputStream output = null
        XMLStreamWriter writer = null
        try {
            // Skip the byte order mark on the raw stream, before any decoding starts.
            for (int ibom = 0 ; ibom < bomSize ; ibom++) {
                input.read()
            }
            Reader reader = new InputStreamReader(input, charset)

            output = new BufferedOutputStream(new FileOutputStream(xmlfile))
            writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")

            writer.writeStartDocument("UTF-8","1.0")
            writer.writeStartElement("TEI")
            writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0")
            writer.writeNamespace("txm", "http://textometrie.org/1.0")
            writer.writeStartElement("teiHeader")
            writer.writeStartElement("fileDesc")
            writer.writeStartElement("titleStmt")
            writer.writeStartElement("title")
            writer.writeCharacters(xmlfile.getName())
            writer.writeEndElement() // title
            writer.writeEndElement() // titleStmt
            writer.writeStartElement("publicationStmt")
            writer.writeEndElement() // publicationStmt
            writer.writeStartElement("sourceDesc")
            writer.writeStartElement("p")
            writer.writeCharacters("Generated by TXM TXT+CSV import module - TXM project - http://textometrie.org")
            writer.writeEndElement() // p
            writer.writeEndElement() // sourceDesc
            writer.writeEndElement() // fileDesc
            writer.writeStartElement("encodingDesc")
            writer.writeStartElement("http://textometrie.org/1.0", "applicationDesc")
            writer.writeEndElement() // applicationDesc
            writer.writeEndElement() // encodingDesc
            writer.writeEndElement() // teiHeader
            writer.writeStartElement("text")
            writer.writeAttribute("id", textId)

            int lb = 1
            reader.eachLine { String line ->
                line = CleanFile.clean(line) // remove ctrl and surrogate chars
                line = FixString.normalize(line)
                writer.writeStartElement("lb")
                writer.writeAttribute("n", ""+lb++)
                writer.writeEndElement() // close lb
                writer.writeCharacters(line.replace("\t"," "))
                writer.writeCharacters("\n") // XML readability
            }

            writer.writeEndElement() // text
            writer.writeEndElement() // TEI

            // Flush here so that a write error is raised to the caller; the
            // closes in the finally block are best-effort only.
            writer.flush()
            output.flush()
        }
        finally {
            if (writer != null) {
                try { writer.close() } catch (Exception ignored) { }
            }
            if (output != null) {
                try { output.close() } catch (Exception ignored) { }
            }
            try { input.close() } catch (Exception ignored) { }
        }
    }
}