Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / alceste / importer.groovy @ 1000

History | View | Annotate | Download (5.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
25
// $LastChangedRevision: 3451 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.alceste
29

    
30

    
31
//import org.txm.scripts.filters.TabulatedToXml.*;
32
import org.txm.scripts.importer.*;
33
import org.txm.importer.scripts.filters.*;
34
import org.txm.scripts.*;
35
import org.txm.importer.cwb.CwbEncode
36
import org.txm.importer.cwb.CwbMakeAll
37
import org.txm.utils.io.FileCopy;
38
import org.txm.importer.scripts.xmltxm.*;
39

    
40
import javax.xml.stream.*;
41

    
42
import java.io.BufferedWriter;
43
import java.io.File;
44
import java.io.FileOutputStream;
45
import java.io.OutputStreamWriter;
46
import java.io.PrintStream;
47
import java.net.URL;
48

    
49
import org.txm.scripts.filters.CutHeader.*;
50
import org.txm.scripts.filters.Tokeniser.*;
51
import org.txm.scripts.filters.FusionHeader.*;
52
import org.txm.scripts.filters.TagSentences.*;
53

    
54
import org.txm.Toolbox;
55

    
56
// TODO: Auto-generated Javadoc
57
/**
 * Entry point of the Alceste import module.
 *
 * The single static method {@link #run} chains four steps: conversion of the
 * Alceste source file into XML files, tokenization, sentence tagging and
 * construction of the XML-TEI-TXM files.
 */
class importer {

        /**
         * Runs the import chain on the first usable file of the source directory.
         *
         * Three work directories ("split", "tokenized" and "stokenized") are
         * deleted and recreated under <code>binDir</code>; the final files are
         * written to <code>txmDir</code> under the same names as the
         * intermediate files.
         *
         * @param rootDirFile the directory containing the Alceste source file
         * @param binDir the directory under which the work directories are created
         * @param txmDir the directory receiving the XML-TEI-TXM files
         * @param encoding the encoding handed to Alceste2Xml for reading the source file
         * @param basename the basename (not used by this method)
         * @param lang the language handed to the tokenizer
         * @return false if no source file is available, if the Alceste
         *         conversion fails or if the tokenizer throws; true otherwise
         *         (a file that merely fails a later step is reported and skipped)
         */
        public static boolean run(File rootDirFile, File binDir, File txmDir, String encoding, String basename, String lang)
        {
                // List the directory once so that the emptiness check and the
                // selection below work on the same snapshot.
                File[] candidates = rootDirFile.listFiles();
                if (candidates == null || candidates.size() == 0) {
                        println "Error: no file to process in "+rootDirFile;
                        return false;
                }

                // Without this guard a directory holding only import.xml, hidden
                // files, backups or sub-directories would hand a null source
                // file to Alceste2Xml.
                File srcfile = findSourceFile(candidates);
                if (srcfile == null) {
                        println "Error: no usable source file found in "+rootDirFile;
                        return false;
                }

                // Start from empty work directories
                File splitDir = new File(binDir, "split");
                File tokenizeDir = new File(binDir, "tokenized");
                File stokenizeDir = new File(binDir, "stokenized");
                stokenizeDir.deleteDir();
                stokenizeDir.mkdir();
                tokenizeDir.deleteDir();
                tokenizeDir.mkdir();
                splitDir.deleteDir();
                splitDir.mkdir();

                // Build xml files from the alceste file
                Alceste2Xml transformer = new Alceste2Xml();
                if (!transformer.run(srcfile, splitDir, encoding)) {
                        return false;
                }

                // Tokenize xml files
                List<File> srcfiles = splitDir.listFiles();
                println (srcfiles.size()+ " texts found in "+srcfile)

                println("Tokenizing files ("+srcfiles.size()+")")
                for (File f : srcfiles) {
                        print "."
                        File resultfile = new File(tokenizeDir, f.getName());
                        try {
                                def builder = new SimpleTokenizerXml(f, resultfile, lang);
                                builder.setStartTag("text")
                                if (!builder.process()) {
                                        // A failed file is dropped; the other files are still processed
                                        println "Failed to tokenize: "+f;
                                        resultfile.delete();
                                }
                        }
                        catch (Exception e) {
                                // An exception, unlike a plain failure, aborts the whole import
                                org.txm.utils.logger.Log.printStackTrace(e);
                                println "Failed to tokenize: "+f;
                                return false;
                        }
                }
                println ""

                // Tag sentences of the tokenized files
                List<File> stokenfiles = tokenizeDir.listFiles();
                println("Tagging sentences of "+stokenfiles.size()+" files")
                for (File f : stokenfiles) {
                        print "."
                        Sequence S = new Sequence();
                        Filter F1 = new CutHeader();
                        Filter F7 = new TagSentences();
                        Filter F11 = new FusionHeader();
                        S.add(F1);
                        S.add(F7);
                        S.add(F11);
                        File xmlfile = new File(stokenizeDir, f.getName());
                        // Temporary file shared by CutHeader and FusionHeader, created next to the input file
                        File headerfile = new File(f.getParentFile(), f.getName()+"header.xml");

                        S.SetInFileAndOutFile(f.getPath(), xmlfile.getPath());
                        S.setEncodages("UTF-8","UTF-8");
                        Object[] arguments1 = [headerfile.getAbsolutePath()];
                        F1.SetUsedParam(arguments1);
                        Object[] arguments2 = [headerfile.getAbsolutePath(),F1];
                        F11.SetUsedParam(arguments2);
                        if (!S.proceed()) {
                                println "Failed to tag file: "+f
                        }
                        S.clean();
                        headerfile.delete(); // remove the temporary header file
                }
                println ""

                // Transform into XML-TEI-TXM
                List<File> tokenfiles = stokenizeDir.listFiles()
                println("Building xml-tei-txm ("+tokenfiles.size()+" files)")
                for (File f : tokenfiles) {
                        print "."
                        File txmfile = new File(txmDir, f.getName());

                        def correspType = new HashMap<String, String>()
                        def correspRef = new HashMap<String, String>()
                        // the ids of all the respStmt elements must be listed
                        def respId = [];
                        // maps each respId to the execution report of the tool
                        def applications = new HashMap<String,HashMap<String,String>>();
                        // maps each respId to the type attributes of the ana property of the TXM w element,
                        // used to build the references to the taxonomies
                        def taxonomiesUtilisees = new HashMap<String,String[]>();
                        // associates an item id with its description and its URI
                        def itemsURI = new HashMap<String,HashMap<String,String>>();
                        // respStmt information
                        // resps (respId <see above>, [description, person, date])
                        def resps = new HashMap<String,String[]>();
                        // run the processing; all the maps above are passed empty
                        def builder = new Xml2Ana(f);
                        // the spelling of this method name is kept exactly as the original call had it
                        builder.setConvertAllAtrtibutes true;
                        builder.setCorrespondances(correspRef, correspType);
                        builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
                        if (!builder.process(txmfile)) {
                                println "Failed to build xml-txm of file: "+f;
                                txmfile.delete();
                        }
                }
                println ""
                return true;
        }

        /**
         * Returns the first entry that is a readable, non-hidden regular file,
         * is not named "import.xml" and does not end with "~".
         *
         * The entries are examined in the order given; File.listFiles() does
         * not guarantee any order, so when several files qualify the one
         * selected may vary between platforms.
         *
         * @param candidates the entries of the source directory
         * @return the selected file, or null if no entry qualifies
         */
        private static File findSourceFile(File[] candidates)
        {
                for (File f : candidates) {
                        String name = f.getName();
                        if (!f.isHidden() && f.canRead() && f.isFile() && !"import.xml".equals(name) && !name.endsWith("~")) {
                                return f;
                        }
                }
                return null;
        }
}