Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / discours / importer.groovy @ 479

History | View | Annotate | Download (8.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
25
// $LastChangedRevision: 3451 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.discours
29

    
30
//import filters.TabulatedToXml.*;
31
import org.txm.importer.*;
32
import org.txm.importer.filters.*;
33
import org.txm.scripts.*;
34
import org.txm.importer.cwb.CwbEncode
35
import org.txm.importer.cwb.CwbMakeAll
36
import org.txm.metadatas.Metadatas
37
import org.txm.scripts.teitxm.*;
38

    
39
import javax.xml.stream.*;
40

    
41
import java.io.BufferedWriter;
42
import java.io.File;
43
import java.io.FileInputStream;
44
import java.io.FileOutputStream;
45
import java.io.OutputStreamWriter;
46
import java.io.PrintStream;
47
import java.net.URL;
48
import java.util.ArrayList;
49
import java.util.Collections;
50
import java.util.List;
51

    
52
import org.txm.*;
53
import org.txm.utils.*;
54
import org.txm.utils.i18n.DetectBOM;
55

    
56
// TODO: Auto-generated Javadoc
57
/**
 * Converts the CNR files of a source directory into TEI-TXM XML files.
 *
 * The texts to convert are the keys of the "metadata.csv" file found in the source
 * directory. For each key "id", the tab-separated file "id.cnr" (one word per line, with the
 * columns para, sent, form, lem, pos and func) is rewritten as "id.xml" in the TXM directory.
 */
class importer {

    /** Names of the columns expected on every CNR line, in file order. */
    private static final List<String> CNR_COLUMNS = ["para", "sent", "form", "lem", "pos", "func"]

    /** Column separator of the CNR files. */
    private static final String CNR_SEPARATOR = "\t"

    /** Value of the encoding parameter that asks for the source encoding to be guessed. */
    private static final String GUESS_ENCODING = "??"

    /**
     * Backing list of {@link #getMetadatas()}.
     * NOTE(review): nothing in this class fills this list; it was previously undeclared, which
     * made getMetadatas() fail at runtime. Confirm what callers expect it to contain.
     */
    private List<String> metaDatas = new ArrayList<String>()

    /**
     * Converts every CNR file declared in the metadata file of the source directory.
     *
     * @param srcDir the directory containing the "*.cnr" files and the "metadata.csv" file
     * @param binDir the directory receiving a copy of "metadata.csv"
     * @param txmDir the directory receiving one "id.xml" file per converted text
     * @param encoding the encoding of the CNR files, or "??" to guess it
     * @param basename the corpus base name (not used by the conversion itself)
     * @return true if the conversion ran; false if the source directory is empty or unreadable,
     *         if there is no metadata file, or if the metadata file could not be copied
     */
    public boolean run(File srcDir, File binDir, File txmDir, String encoding, String basename)
    {
        File[] list = srcDir.listFiles();
        if (list == null || list.length == 0) return false;

        // get the headers
        File csvfile = new File(srcDir, "metadata.csv");
        println "Trying to read metadata from: "+csvfile
        if (!csvfile.exists()) {
            println "No metadata file: "+csvfile
            println "Aborting"
            return false;
        }

        File copy = new File(binDir, "metadata.csv")
        if (!FileCopy.copy(csvfile, copy)) {
            println "Error: could not create a copy of metadata file "+csvfile.getAbsoluteFile();
            return false; // explicit failure value instead of a bare "return"
        }
        Metadatas metadatas = new Metadatas(copy, Toolbox.getParam(Toolbox.METADATA_ENCODING), Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR), Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR), 1)

        // directory-wide guess, used as fallback for the files too small to be guessed alone
        String encodingAll = null;
        if (encoding == GUESS_ENCODING) {
            encodingAll = new CharsetDetector(srcDir).getEncoding();
            println "Guessed encoding: $encodingAll"
        }

        print "Converting CNR to XML:"
        for (String id : metadatas.keySet()) { // loop on text nodes
            String filename = id + ".cnr";
            File srcfile = new File(srcDir, filename);
            if (!srcfile.exists()) {
                println "Can't find CNR file : "+filename
                continue
            }
            File teifile = new File(txmDir, id + ".xml");

            List<Integer> errors = convertFile(id, metadatas.get(id), srcfile, teifile, encoding, encodingAll)
            if (errors.size() > 0) {
                println "Errors in file $filename. Wrong number of columns at lines: "+errors
            } else print "."
        }
        println ""
        return true;
    }

    /**
     * Converts one CNR file into one TEI-TXM XML file.
     *
     * @param id the text identifier
     * @param metas the metadata entries of the text, written as attributes of the text element
     * @param srcfile the CNR file to read
     * @param teifile the XML file to write (UTF-8)
     * @param encoding the encoding requested by the caller of run()
     * @param encodingAll the encoding guessed for the whole source directory, or null
     * @return the numbers (1-based) of the lines skipped because of a wrong number of columns
     */
    private List<Integer> convertFile(String id, List<org.txm.metadatas.Entry> metas, File srcfile, File teifile, String encoding, String encodingAll)
    {
        List<Integer> errors = []
        Writer writer = null
        BufferedReader reader = null
        try {
            writer = new OutputStreamWriter(new FileOutputStream(teifile), "UTF-8");
            writeHeader(writer, id, metas)
            reader = openReader(srcfile, encoding, encodingAll)
            errors = writeBody(writer, reader, id)
            writer.write("</text>\n");
            writer.write("</TEI>\n");
        } finally {
            // release both streams even when the conversion fails half-way
            try {
                if (writer != null) writer.close()
            } finally {
                if (reader != null) reader.close()
            }
        }
        return errors
    }

    /**
     * Writes the XML declaration, the TEI header and the opening tag of the text element.
     *
     * @param writer the output
     * @param id the text identifier
     * @param metas the metadata entries of the text
     */
    private void writeHeader(Writer writer, String id, List<org.txm.metadatas.Entry> metas)
    {
        writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        writer.write("<TEI xmlns=\"http://www.tei-c.org/ns/1.0\" xmlns:txm=\"http://textometrie.org/1.0\">\n");
        writer.write("<teiHeader type=\"text\">\n");
        writer.write("<fileDesc>\n");
        writer.write("<titleStmt>\n");
        writer.write("<title>"+escapeText(id)+"</title>\n");
        writer.write("<respStmt>\n");
        writer.write("<resp id=\"cordial\">initial tagging</resp>")
        writer.write("</respStmt>\n");
        writer.write("</titleStmt>\n");
        writer.write("</fileDesc>\n");
        writer.write("<encodingDesc>\n");
        writer.write("<classDecl>\n");
        writer.write("<taxonomy id=\"pos\"><bibl type=\"tagset\"/></taxonomy>\n")
        writer.write("<taxonomy id=\"func\"><bibl type=\"tagset\"/></taxonomy>\n")
        writer.write("<taxonomy id=\"lemma\"><bibl type=\"tagset\"/></taxonomy>\n")
        writer.write("</classDecl>\n");
        writer.write("</encodingDesc>\n");
        writer.write("</teiHeader>\n");

        writer.write("<text ")
        for (org.txm.metadatas.Entry metadata : metas) {
            String name = metadata.getId()
            String value = metadata.getValue()
            // the "id" attribute is always written last, from the text identifier
            if (name != "id") {
                writer.write(" "+name+"=\""+escapeAttribute(value)+"\"");
            }
        }
        writer.write(" id=\""+escapeAttribute(id)+"\"");
        writer.write(">\n");
    }

    /**
     * Opens a CNR file for reading, positioned after its byte order mark if it has one.
     *
     * When the encoding has to be guessed, the file's own guess is used if the file is larger
     * than CharsetDetector.MINIMALSIZE, the directory-wide guess otherwise.
     *
     * @param srcfile the CNR file
     * @param encoding the encoding requested by the caller of run()
     * @param encodingAll the encoding guessed for the whole source directory, or null
     * @return a reader on the content of the file
     */
    private BufferedReader openReader(File srcfile, String encoding, String encodingAll)
    {
        String tmpEncoding = encoding;
        if (encodingAll != null) {
            tmpEncoding = encodingAll
            if (srcfile.length() > CharsetDetector.MINIMALSIZE) {
                tmpEncoding = new CharsetDetector(srcfile).getEncoding();
            }
        }

        FileInputStream input = new FileInputStream(srcfile)
        try {
            // skip the BOM bytes on the raw stream before anything is decoded
            DetectBOM bomdetector = new DetectBOM(srcfile);
            for (int ibom = 0 ; ibom < bomdetector.getBOMSize() ; ibom++) input.read()
            return new BufferedReader(new InputStreamReader(input, tmpEncoding));
        } catch (Exception e) {
            input.close() // do not leak the stream when the reader cannot be built
            throw e
        }
    }

    /**
     * Writes the paragraph, sentence and word elements built from the lines of a CNR file.
     *
     * A new paragraph (resp. sentence) element is opened each time the value of the para
     * (resp. sent) column changes. Lines without the expected number of columns are skipped.
     *
     * @param writer the output
     * @param reader the CNR content
     * @param id the text identifier, used to build the word identifiers and references
     * @return the numbers (1-based) of the skipped lines
     */
    private List<Integer> writeBody(Writer writer, BufferedReader reader, String id)
    {
        List<Integer> errors = []
        String idAttribute = escapeAttribute(id)
        String idText = escapeText(id)

        int sentence = -1, paragraph = -1, idword = 0;
        int para = -1, sent = -1;
        int nline = 0;

        String line = reader.readLine();
        while (line != null) {
            line = CleanFile.clean(line); // remove ctrl and surrogate chars
            nline++;

            String[] fields = line.split(CNR_SEPARATOR);
            if (fields.length == CNR_COLUMNS.size()) {
                idword++;

                String paraRaw = fields[0].replaceAll("\\p{C}", "")
                String sentRaw = fields[1].replaceAll("\\p{C}", "")
                String paraV = escapeText(paraRaw)
                String sentV = escapeText(sentRaw)
                String formV = escapeText(fields[2].replaceAll("\\p{C}", ""))
                String lemV = escapeText(fields[3].replaceAll("\\p{C}", ""))
                String posV = escapeText(fields[4].replaceAll("\\p{C}", ""))
                String funcV = escapeText(fields[5].replaceAll("\\p{C}", ""))

                try {
                    para = Integer.parseInt(paraRaw);
                } catch (NumberFormatException ignored) {
                    // not a number: the word stays in the current paragraph
                }
                if (para != paragraph) {
                    if (paragraph != -1) {
                        if (sentence != -1) {
                            writer.write("</s>\n")
                            sentence = -1;
                        }
                        writer.write("</p>\n")
                    }
                    writer.write("<p id=\"p_${para}\">\n");
                    paragraph = para;
                }

                try {
                    sent = Integer.parseInt(sentRaw);
                } catch (NumberFormatException ignored) {
                    // not a number: the word stays in the current sentence
                }
                if (sent != sentence) {
                    if (sentence != -1) {
                        writer.write("</s>\n")
                    }
                    writer.write("<s id=\"s_${sent}\">\n");
                    sentence = sent;
                }

                writer.write("<w id=\"w_${idAttribute}_${idword}\">\n");
                writer.write(" <txm:form>$formV</txm:form>\n");
                writer.write(" <txm:ana resp=\"#cordial\" type=\"#pos\">$posV</txm:ana>\n");
                writer.write(" <txm:ana resp=\"#cordial\" type=\"#func\">$funcV</txm:ana>\n");
                writer.write(" <txm:ana resp=\"#cordial\" type=\"#lemma\">$lemV</txm:ana>\n");
                writer.write(" <txm:ana resp=\"#cordial\" type=\"#sent\">$sentV</txm:ana>\n");
                writer.write(" <txm:ana resp=\"#cordial\" type=\"#para\">$paraV</txm:ana>\n");
                writer.write(" <txm:ana resp=\"#txm\" type=\"#ref\">${idText}, §${paraV}</txm:ana>\n");
                writer.write("</w>\n");
            } else {
                errors << nline
            }
            line = reader.readLine();
        }

        // close the elements left open by the last word
        if (sentence != -1) { writer.write("</s>\n") }
        if (paragraph != -1) { writer.write("</p>\n") }
        return errors
    }

    /**
     * Escapes the characters that cannot appear as such in XML text content.
     *
     * @param value the raw text
     * @return the text with "&" and "<" replaced by their entities
     */
    private static String escapeText(String value)
    {
        return value.replace("&", "&amp;").replace("<", "&lt;")
    }

    /**
     * Escapes the characters that cannot appear as such in a double-quoted XML attribute value.
     *
     * @param value the raw attribute value
     * @return the value with "&", "<" and the double quote replaced by their entities
     */
    private static String escapeAttribute(String value)
    {
        return escapeText(value).replace("\"", "&quot;")
    }

    /**
     * Gets the metadata.
     *
     * @return the metadata list; empty unless it has been filled (see the field note)
     */
    public List<String> getMetadatas()
    {
        return metaDatas;
    }

    /**
     * Runs the conversion from the command line.
     *
     * NOTE(review): run() reads its metadata settings through Toolbox.getParam(); this entry
     * point presumably needs the Toolbox to be initialised first - confirm before relying on it.
     * NOTE(review): the output directories, encoding and base name below are assumed defaults,
     * since the previous one-argument call to run() did not supply them - adjust as needed.
     *
     * @param args optional: args[0] is the source directory, default "xml/discours/" in the
     *             user home directory
     */
    public static void main(String[] args)
    {
        File srcDir = (args != null && args.length > 0) ? new File(args[0]) : new File(System.getProperty("user.home"), "xml/discours/");
        File binDir = new File(srcDir, "bin");
        File txmDir = new File(binDir, "txm");
        txmDir.mkdirs(); // also creates binDir, which receives the metadata copy

        // run() takes five arguments; the former call passed only the source directory
        new importer().run(srcDir, binDir, txmDir, "UTF-8", "discours");
    }
}