Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / alceste / importer.groovy @ 1000

History | View | Annotate | Download (5.8 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 479 mdecorde
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
25 479 mdecorde
// $LastChangedRevision: 3451 $
26 321 mdecorde
// $LastChangedBy: mdecorde $
27 321 mdecorde
//
28 986 mdecorde
package org.txm.scripts.importer.alceste
29 321 mdecorde
30 321 mdecorde
31 986 mdecorde
//import org.txm.scripts.filters.TabulatedToXml.*;
32 986 mdecorde
import org.txm.scripts.importer.*;
33 1000 mdecorde
import org.txm.importer.scripts.filters.*;
34 321 mdecorde
import org.txm.scripts.*;
35 1000 mdecorde
import org.txm.importer.cwb.CwbEncode
36 1000 mdecorde
import org.txm.importer.cwb.CwbMakeAll
37 321 mdecorde
import org.txm.utils.io.FileCopy;
38 1000 mdecorde
import org.txm.importer.scripts.xmltxm.*;
39 321 mdecorde
40 321 mdecorde
import javax.xml.stream.*;
41 321 mdecorde
42 321 mdecorde
import java.io.BufferedWriter;
43 321 mdecorde
import java.io.File;
44 321 mdecorde
import java.io.FileOutputStream;
45 321 mdecorde
import java.io.OutputStreamWriter;
46 321 mdecorde
import java.io.PrintStream;
47 321 mdecorde
import java.net.URL;
48 321 mdecorde
49 986 mdecorde
import org.txm.scripts.filters.CutHeader.*;
50 986 mdecorde
import org.txm.scripts.filters.Tokeniser.*;
51 986 mdecorde
import org.txm.scripts.filters.FusionHeader.*;
52 986 mdecorde
import org.txm.scripts.filters.TagSentences.*;
53 321 mdecorde
54 321 mdecorde
import org.txm.Toolbox;
55 321 mdecorde
56 321 mdecorde
// TODO: Auto-generated Javadoc
57 321 mdecorde
/**
 * Import script for Alceste sources.
 *
 * Takes the first usable file of a source directory and runs it through
 * four steps: Alceste to XML conversion, tokenization, sentence tagging and
 * XML-TXM construction.
 */
class importer {

    /**
     * Runs the import on the first eligible file found in {@code rootDirFile}.
     *
     * A file is eligible when it is a readable, non-hidden regular file that is
     * not named "import.xml" and whose name does not end with "~". Only the
     * first eligible file (in {@code listFiles()} order) is processed.
     *
     * @param rootDirFile the directory containing the Alceste source file
     * @param binDir the directory under which the "split", "tokenized" and
     *        "stokenized" working directories are (re)created; it must already exist
     * @param txmDir the directory receiving the resulting XML-TXM files
     * @param encoding the encoding handed to Alceste2Xml for the source file
     * @param basename the basename (not used by this method)
     * @param lang the language handed to the tokenizer
     * @return true if all steps were executed; false if there is no file to
     *         process, a working directory cannot be created, the Alceste to XML
     *         conversion fails or the tokenizer throws an exception. A file that
     *         merely fails tokenizing, tagging or XML-TXM building is reported and
     *         skipped without changing the result.
     */
    public static boolean run(File rootDirFile, File binDir, File txmDir, String encoding, String basename, String lang)
    {
        // listFiles() returns null when rootDirFile is not a readable directory
        File[] candidates = rootDirFile.listFiles();
        if (candidates == null || candidates.length == 0) {
            println "Error: no file to process in "+rootDirFile;
            return false;
        }

        // The directory may contain only hidden files, sub-directories or
        // "import.xml": do not hand a null source file to Alceste2Xml.
        File srcfile = findSourceFile(candidates);
        if (srcfile == null) {
            println "Error: no file to process in "+rootDirFile;
            return false;
        }

        File splitDir = new File(binDir, "split");
        File tokenizeDir = new File(binDir, "tokenized");
        File stokenizeDir = new File(binDir, "stokenized");
        for (File workDir : [stokenizeDir, tokenizeDir, splitDir]) {
            if (!resetDir(workDir)) {
                println "Error: could not create working directory "+workDir;
                return false;
            }
        }

        // Build xml files from the alceste file
        Alceste2Xml transformer = new Alceste2Xml()
        if (!transformer.run(srcfile, splitDir, encoding)) {
            return false;
        }

        // Tokenize xml files
        List<File> srcfiles = splitDir.listFiles();
        println (srcfiles.size()+ " texts found in "+srcfile)

        println("Tokenizing files ("+srcfiles.size()+")")
        for (File f : srcfiles) {
            print "."
            File resultfile = new File(tokenizeDir, f.getName());
            try {
                def builder = new SimpleTokenizerXml(f, resultfile, lang);
                builder.setStartTag("text")
                if (!builder.process()) {
                    // a failed file is dropped from the next steps, the import goes on
                    println "Failed to tokenize: "+f;
                    resultfile.delete();
                }
            }
            catch (Exception e) {
                // an exception, unlike a plain failure, aborts the whole import
                org.txm.utils.logger.Log.printStackTrace(e);
                println "Failed to tokenize: "+f;
                return false;
            }
        }
        println ""

        // Tag sentences of the tokenized files
        List<File> stokenfiles = tokenizeDir.listFiles();
        println("Tagging sentences of "+stokenfiles.size()+" files")
        for (File f : stokenfiles) {
            print "."
            // filter chain applied in this order: CutHeader, TagSentences, FusionHeader
            Sequence S = new Sequence();
            Filter F1 = new CutHeader();
            Filter F7 = new TagSentences();
            Filter F11 = new FusionHeader();
            S.add(F1);
            S.add(F7);
            S.add(F11);
            File infile = f;
            File xmlfile = new File(stokenizeDir, f.getName());
            // temporary file shared by CutHeader and FusionHeader
            // (presumably holds the header cut from the text -- confirm in the filters)
            File headerfile = new File(f.getParentFile(), f.getName()+"header.xml");

            S.SetInFileAndOutFile(infile.getPath(), xmlfile.getPath());
            S.setEncodages("UTF-8","UTF-8");
            Object[] arguments1 = [headerfile.getAbsolutePath()];
            F1.SetUsedParam(arguments1);
            Object[] arguments2 = [headerfile.getAbsolutePath(),F1];
            F11.SetUsedParam(arguments2);
            if (!S.proceed()) {
                println "Failed to tag file: "+f
            }
            S.clean();
            headerfile.delete();//remove the prepared file to clean
        }
        println ""

        //TRANSFORM INTO XML-TEI-TXM
        List<File> tokenfiles = stokenizeDir.listFiles()
        println("Building xml-tei-txm ("+tokenfiles.size()+" files)")
        for (File f : tokenfiles) {
            print "."
            File file = f;
            File txmfile = new File(txmDir, f.getName());

            def correspType = new HashMap<String, String>()
            def correspRef = new HashMap<String, String>()
            // the ids of all the respStmt must be listed
            def respId = [];
            // maps a respId to the execution report of the tool
            def applications = new HashMap<String,HashMap<String,String>>();
            // maps a respId to the type attributes of the ana property of the txm w,
            // used to build the refs to the taxonomies
            def taxonomiesUtilisees = new HashMap<String,String[]>();
            // maps an item id to its description and its URI
            def itemsURI = new HashMap<String,HashMap<String,String>>();
            // respStmt informations
            // resps (respId <see above>, [description, person, date])
            def resps = new HashMap<String,String[]>();
            // run the processing; all the header structures above are passed empty
            def builder = new Xml2Ana(file);
            builder.setConvertAllAtrtibutes true;
            builder.setCorrespondances(correspRef, correspType);
            builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
            if (!builder.process(txmfile)) {
                println "Failed to build xml-txm of file: "+file;
                txmfile.delete();
            }
        }
        println ""
        return true;
    }

    /**
     * Returns the first eligible source file of the given directory listing.
     *
     * @param candidates the content of the source directory
     * @return the first readable, non-hidden regular file that is neither
     *         "import.xml" nor a "~" backup file, or null if there is none
     */
    private static File findSourceFile(File[] candidates)
    {
        for (File f : candidates) {
            // Groovy != on Strings compares values (equals), not references
            if (!f.isHidden() && f.canRead() && f.isFile() && f.getName() != "import.xml" && !f.getName().endsWith("~")) {
                return f;
            }
        }
        return null;
    }

    /**
     * Deletes the given directory with its content and creates it again, empty.
     *
     * @param dir the working directory to reset
     * @return true if the directory exists once the reset is done
     */
    private static boolean resetDir(File dir)
    {
        dir.deleteDir();
        dir.mkdir();
        return dir.isDirectory();
    }
}