Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / discours / importer.groovy @ 945

History | View | Annotate | Download (9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
25
// $LastChangedRevision: 3451 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.discours
29

    
30
//import filters.TabulatedToXml.*;
31
import org.txm.importer.*;
32
import org.txm.importer.filters.*;
33
import org.txm.scripts.*;
34
import org.txm.importer.cwb.CwbEncode
35
import org.txm.importer.cwb.CwbMakeAll
36
import org.txm.metadatas.Metadatas
37
import org.txm.importer.xmltxm.*;
38

    
39
import javax.xml.stream.*;
40

    
41
import org.txm.core.preferences.TBXPreferences
42
import java.io.BufferedWriter;
43
import java.io.File;
44
import java.io.FileInputStream;
45
import java.io.FileOutputStream;
46
import java.io.OutputStreamWriter;
47
import java.io.PrintStream;
48
import java.net.URL;
49
import java.util.ArrayList;
50
import java.util.Collections;
51
import java.util.List;
52

    
53
import org.txm.*;
54
import org.txm.core.engines.*;
55
import org.txm.utils.*
56
import org.txm.utils.io.*;
57
import org.txm.utils.i18n.DetectBOM;
58

    
59
/**
 * Importer for the "discours" module: converts tab-separated CNR files
 * (one token per line, columns para/sent/form/lem/pos/func) into XML-TXM
 * files, one per text declared in the "metadata.csv" file of the source
 * directory.
 */
class importer {

	/** Column separator used in the CNR files. */
	private static final String SEPARATOR = "\t"

	/** Expected number of columns per token line: para, sent, form, lem, pos, func. */
	private static final int COLUMN_COUNT = 6

	/**
	 * Names of the metadata written as attributes of the &lt;text&gt; elements
	 * during the last call to {@link #run}, in first-seen order.
	 */
	private final List<String> metaDatas = new ArrayList<String>()

	/**
	 * Converts every text listed in "metadata.csv" from its CNR file to an
	 * XML-TXM file.
	 *
	 * @param srcDir the directory containing the "*.cnr" files and "metadata.csv"
	 * @param binDir the directory receiving a copy of "metadata.csv"
	 * @param txmDir the directory receiving the generated "*.xml" files
	 * @param encoding the encoding of the CNR files, or "??" to guess it
	 * @param basename the corpus name (not used by this method)
	 * @return true if successful; false if the source directory is empty or
	 *         unreadable, or if the metadata file is missing or cannot be copied
	 */
	public boolean run(File srcDir, File binDir, File txmDir, String encoding, String basename)
	{
		metaDatas.clear()

		File[] list = srcDir.listFiles();
		if (list == null || list.length == 0) return false;

		// get the headers
		def csvfile = new File(srcDir, "metadata.csv");
		Metadatas metadatas; // text metadata

		println "Trying to read metadata from: "+csvfile
		if (csvfile.exists()) {
			File copy = new File(binDir, "metadata.csv")
			if (!FileCopy.copy(csvfile, copy)) {
				println "Error: could not create a copy of metadata file "+csvfile.getAbsoluteFile();
				return false;
			}
			metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(),
					Toolbox.getMetadataColumnSeparator(),
					Toolbox.getMetadataTextSeparator(), 1)
		} else {
			println "No metadata file: "+csvfile
			println "Aborting"
			return false;
		}

		// "??" means the encoding must be guessed: first for the whole
		// directory, then refined per file in convertText()
		String encodingAll = null;
		if (encoding == "??") {
			encodingAll = new CharsetDetector(srcDir).getEncoding();
			println "Guessed encoding: $encodingAll"
		}

		print "Converting CNR to XML:"
		for (String id : metadatas.keySet()) // loop on text nodes
		{
			ArrayList<org.txm.metadatas.Entry> metas = metadatas.get(id);

			String filename = id +".cnr";
			File srcfile = new File(srcDir, filename);
			if (!srcfile.exists()) {
				println "Can't find CNR file : "+filename
				continue
			}
			File teifile = new File(txmDir, id+".xml");

			def errors = convertText(id, metas, srcfile, teifile, encoding, encodingAll)

			if (errors.size() > 0) {
				println "Errors in file $filename. Wrong number of columns at lines: "+errors
			} else print "."
		}
		println ""
		return true;
	}

	/**
	 * Converts one CNR file into one XML-TXM file.
	 *
	 * @param id the text identifier
	 * @param metas the metadata entries of the text
	 * @param srcfile the CNR file to read
	 * @param teifile the XML-TXM file to write
	 * @param encoding the encoding requested by the caller
	 * @param encodingAll the encoding guessed for the whole source directory,
	 *        or null when no guessing was requested
	 * @return the numbers of the non-empty lines that do not have the expected
	 *         number of columns
	 */
	private List convertText(String id, List<org.txm.metadatas.Entry> metas, File srcfile, File teifile, String encoding, String encodingAll)
	{
		def errors = [];
		String idAttribute = escapeAttribute(id)
		String idText = escapeText(id)

		OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(teifile), "UTF-8");
		try {
			writeHeader(writer, id)
			writeTextStartTag(writer, id, metas)

			String tmpEncoding = encoding;
			if (encodingAll != null) {
				tmpEncoding = encodingAll
				// only files that are big enough get their own detection
				if (srcfile.length() > CharsetDetector.MINIMALSIZE) {
					tmpEncoding = new CharsetDetector(srcfile).getEncoding();
				}
			}

			def input = new FileInputStream(srcfile)
			try {
				def reader = new BufferedReader(new InputStreamReader(input, tmpEncoding));
				// skip the BOM bytes directly on the stream, before the
				// reader has pulled anything from it
				DetectBOM bomdetector = new DetectBOM(srcfile);
				for (int ibom = 0 ; ibom < bomdetector.getBOMSize() ; ibom++) input.read()

				// BODY
				int sentence = -1, paragraph = -1, idword = 0;
				int para = -1, sent = -1;
				int nline = 0;
				String line = reader.readLine();
				while (line != null) {
					line = CleanFile.cleanAllButTabs(line); // remove ctrl and surrogate chars
					nline++;

					String[] fields = line.split(SEPARATOR);
					if (fields.size() == COLUMN_COUNT) {
						idword++;

						String paraV = stripControlChars(fields[0])
						String sentV = stripControlChars(fields[1])
						String formV = escapeText(stripControlChars(fields[2]))
						String lemV = escapeText(stripControlChars(fields[3]))
						String posV = escapeText(stripControlChars(fields[4]))
						String funcV = escapeText(stripControlChars(fields[5]))

						try {
							para = Integer.parseInt(paraV);
						} catch (NumberFormatException ignored) {
							// best effort: the token stays in the current paragraph
						}
						if (para != paragraph) {
							if (paragraph != -1) {
								if (sentence != -1) {
									writer.write("</s>\n")
									sentence = -1;
								}
								writer.write("</p>\n")
							}
							writer.write("<p id=\"p_${para}\">\n");
							paragraph = para;
						}

						try {
							sent = Integer.parseInt(sentV);
						} catch (NumberFormatException ignored) {
							// best effort: the token stays in the current sentence
						}
						if (sent != sentence) {
							if (sentence != -1)
								writer.write("</s>\n")
							writer.write("<s id=\"s_${sent}\">\n");
							sentence = sent;
						}

						// the raw para/sent values are escaped too: they are
						// not guaranteed to be numbers
						String paraText = escapeText(paraV)
						String sentText = escapeText(sentV)

						writer.write("<w id=\"w_${idAttribute}_${idword}\">\n");
						writer.write(" <txm:form>$formV</txm:form>\n");
						writer.write(" <txm:ana resp=\"#cordial\" type=\"#pos\">$posV</txm:ana>\n");
						writer.write(" <txm:ana resp=\"#cordial\" type=\"#func\">$funcV</txm:ana>\n");
						writer.write(" <txm:ana resp=\"#cordial\" type=\"#lemma\">$lemV</txm:ana>\n");
						writer.write(" <txm:ana resp=\"#cordial\" type=\"#sent\">$sentText</txm:ana>\n");
						writer.write(" <txm:ana resp=\"#cordial\" type=\"#para\">$paraText</txm:ana>\n");
						writer.write(" <txm:ana resp=\"#txm\" type=\"#ref\">${idText}, §${paraText}</txm:ana>\n");
						writer.write("</w>\n");
					} else if (line.length() > 0) {
						// empty lines are silently skipped
						errors << nline
					}
					line = reader.readLine();
				}

				// CLOSE
				if (sentence != -1) { writer.write("</s>\n") }
				if (paragraph != -1) { writer.write("</p>\n") }
			} finally {
				input.close()
			}

			writer.write("</text>\n");
			writer.write("</TEI>\n");
		} finally {
			writer.close();
		}
		return errors
	}

	/**
	 * Writes the XML declaration, the TEI root start tag and the teiHeader.
	 *
	 * @param writer the writer of the XML-TXM file
	 * @param id the text identifier, used as title
	 */
	private static void writeHeader(Writer writer, String id)
	{
		writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
		writer.write("<TEI xmlns=\"http://www.tei-c.org/ns/1.0\" xmlns:txm=\"http://textometrie.org/1.0\">\n");
		writer.write("<teiHeader type=\"text\">\n");
		writer.write("<fileDesc>\n");
		writer.write("<titleStmt>\n");
		writer.write("<title>"+escapeText(id)+"</title>\n");
		writer.write("<respStmt>\n");
		writer.write("<resp id=\"cordial\">initial tagging</resp>")
		writer.write("</respStmt>\n");
		writer.write("</titleStmt>\n");
		writer.write("</fileDesc>\n");
		writer.write("<encodingDesc>\n");
		writer.write("<classDecl>\n");
		writer.write("<taxonomy id=\"pos\"><bibl type=\"tagset\"/></taxonomy>\n")
		writer.write("<taxonomy id=\"func\"><bibl type=\"tagset\"/></taxonomy>\n")
		writer.write("<taxonomy id=\"lemma\"><bibl type=\"tagset\"/></taxonomy>\n")
		writer.write("</classDecl>\n");
		writer.write("</encodingDesc>\n");
		writer.write("</teiHeader>\n");
	}

	/**
	 * Writes the &lt;text&gt; start tag: one attribute per metadata entry
	 * (except "id"), then the "id" attribute. The attribute names are
	 * recorded for {@link #getMetadatas()}.
	 *
	 * @param writer the writer of the XML-TXM file
	 * @param id the text identifier
	 * @param metas the metadata entries of the text
	 */
	private void writeTextStartTag(Writer writer, String id, List<org.txm.metadatas.Entry> metas)
	{
		writer.write("<text ")
		for (org.txm.metadatas.Entry metadata : metas) {
			String name = metadata.getId()
			String value = metadata.getValue()
			if (name != "id") {
				writer.write(" "+name+"=\""+escapeAttribute(value)+"\"");
				if (!metaDatas.contains(name)) metaDatas.add(name)
			}
		}
		writer.write(" id=\""+escapeAttribute(id)+"\"");
		writer.write(">\n");
	}

	/** Removes the characters of the Unicode "Other" categories (control, format...). */
	private static String stripControlChars(String value)
	{
		return value.replaceAll("\\p{C}", "")
	}

	/** Escapes "&amp;" and "&lt;" so the value can be written as XML element content. */
	private static String escapeText(String value)
	{
		return value.replace("&", "&amp;").replace("<", "&lt;")
	}

	/** Escapes "&amp;", "&lt;" and the double quote so the value can be written in a double-quoted XML attribute. */
	private static String escapeAttribute(String value)
	{
		return escapeText(value).replace("\"", "&quot;")
	}

	/**
	 * Gets the names of the metadata written on the &lt;text&gt; elements
	 * during the last call to {@link #run}.
	 *
	 * @return a copy of the metadata names; empty if run() has not been
	 *         called or wrote no metadata
	 */
	public List<String> getMetadatas()
	{
		return new ArrayList<String>(metaDatas);
	}

	/**
	 * The main method: imports the "xml/discours" directory of the user home.
	 *
	 * @param args the arguments (not used)
	 */
	public static void main(String[] args)
	{
		File dir = new File(System.getProperty("user.home"), "xml/discours/");
		// NOTE(review): output locations, encoding and corpus name are
		// development defaults - confirm against the real import workflow
		File binDir = new File(dir, "bin");
		File txmDir = new File(binDir, "txm");
		txmDir.mkdirs();
		new importer().run(dir, binDir, txmDir, "UTF-8", "discours");
	}
}