Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / hyperbase / importer.groovy @ 479

History | View | Annotate | Download (6.1 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 479 mdecorde
// $LastChangedDate: 2015-06-03 15:04:53 +0200 (mer. 03 juin 2015) $
25 321 mdecorde
// $LastChangedRevision: 2984 $
26 321 mdecorde
// $LastChangedBy: mdecorde $
27 321 mdecorde
//
28 321 mdecorde
package org.txm.importer.hyperbase
29 321 mdecorde
30 321 mdecorde
//import filters.TabulatedToXml.*;
31 321 mdecorde
import org.txm.importer.*;
32 321 mdecorde
import org.txm.importer.filters.*;
33 321 mdecorde
import org.txm.scripts.*;
34 321 mdecorde
import org.txm.importer.cwb.CwbEncode
35 321 mdecorde
import org.txm.importer.cwb.CwbMakeAll
36 321 mdecorde
import org.txm.scripts.teitxm.BuildTTSrc;
37 321 mdecorde
import org.txm.scripts.teitxm.*;
38 321 mdecorde
//import org.txm.utils.treetagger.TreeTagger;
39 321 mdecorde
import javax.xml.stream.*;
40 321 mdecorde
import java.io.BufferedWriter;
41 321 mdecorde
import java.io.File;
42 321 mdecorde
import java.io.FileOutputStream;
43 321 mdecorde
import java.io.OutputStreamWriter;
44 321 mdecorde
import java.io.PrintStream;
45 321 mdecorde
import java.net.URL;
46 321 mdecorde
47 321 mdecorde
import filters.CutHeader.*;
48 321 mdecorde
import filters.Tokeniser.*;
49 321 mdecorde
import filters.FusionHeader.*;
50 321 mdecorde
import filters.TagSentences.*;
51 321 mdecorde
52 321 mdecorde
import org.txm.*;
53 321 mdecorde
import org.txm.utils.*;
54 321 mdecorde
// TODO: Auto-generated Javadoc
55 321 mdecorde
56 321 mdecorde
/**
 * Import module for a Hyperbase corpus file.
 *
 * The import splits the corpus file into texts, tokenizes each text, tags
 * sentences and finally writes one XML-TXM file per text.
 */
class importer {

        /**
         * Runs the import pipeline.
         *
         * Steps: (1) wipe and re-create the working directories under binDir,
         * (2) pick the first eligible file of srcDir as the corpus file,
         * (3) split it with Hyperbase2Xml into binDir/split,
         * (4) tokenize every split file into binDir/tokenized,
         * (5) tag sentences into binDir/stokenized,
         * (6) build the XML-TXM files into txmDir.
         *
         * A failure on a single file in steps 4 to 6 is reported on standard
         * output and does not abort the import.
         *
         * @param srcDir the directory containing the corpus file
         * @param binDir the working directory; its "ptokenized", "tokenized", "split" and "stokenized" sub-directories are deleted and re-created
         * @param txmDir the directory receiving the XML-TXM files
         * @param encoding the encoding handed to Hyperbase2Xml
         * @param basename the basename (not used by this method)
         * @param lang the language handed to the tokenizer
         * @return false if no corpus file is found or if the split step fails, true otherwise
         */
        public static boolean run(File srcDir, File binDir, File txmDir, String encoding, String basename, String lang)
        {
                //cleaning
                //NOTE(review): "ptokenized" is re-created here but never used by this method
                for (String dirname : ["ptokenized", "tokenized", "split"]) {
                        new File(binDir, dirname).deleteDir();
                        new File(binDir, dirname).mkdir();
                }

                //build xml files
                File corpusfile = findCorpusFile(srcDir);
                if (corpusfile == null) {
                        // covers a missing/empty directory as well as a directory without any eligible file
                        println "No corpus file in: "+srcDir;
                        return false;
                }

                System.out.println("Corpus file: "+corpusfile);
                if (!new Hyperbase2Xml().run(corpusfile, new File(binDir, "split"), encoding)) {
                        println("Failed to split the corpus file into text: "+corpusfile);
                        return false;
                }

                tokenizeFiles(binDir, lang);
                tagSentences(binDir);
                buildXmlTxm(binDir, txmDir);
                return true;
        }

        /**
         * Lists the children of a directory without ever returning null.
         *
         * @param dir the directory to list
         * @return the children of dir, or an empty list when File.listFiles() returns null
         */
        private static List<File> listChildren(File dir)
        {
                File[] children = dir.listFiles();
                if (children == null) {
                        return new ArrayList<File>();
                }
                return Arrays.asList(children);
        }

        /**
         * Finds the corpus file: the first visible, readable, regular file of
         * srcDir that is not named "import.xml".
         *
         * @param srcDir the directory to search
         * @return the corpus file, or null if there is none
         */
        private static File findCorpusFile(File srcDir)
        {
                for (File f : listChildren(srcDir)) {
                        if (!f.isHidden() && f.canRead() && f.isFile() && !"import.xml".equals(f.getName())) {
                                return f;
                        }
                }
                return null;
        }

        /**
         * Tokenizes every file of binDir/split into binDir/tokenized.
         *
         * @param binDir the working directory
         * @param lang the language handed to the tokenizer
         */
        private static void tokenizeFiles(File binDir, String lang)
        {
                //TOKENIZE ALL FILES
                List<File> srcfiles = listChildren(new File(binDir, "split"));
                println("Tokenizing "+srcfiles.size()+" files");
                for (File pfile : srcfiles) {
                        print "."
                        //drops the last 4 characters of the name -- presumably a 3-letter extension such as ".xml"; TODO confirm
                        String filename = pfile.getName().substring(0, pfile.getName().length()-4)
                        File tfile = new File(binDir, "tokenized/"+filename+".xml");

                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(pfile, tfile, lang);
                        if (!tokenizer.process()) {
                                println("Failed to tokenize file: "+pfile);
                                tfile.delete(); //do not leave a file for the next steps after a failed tokenization
                        }
                }
                println ""
        }

        /**
         * Tags the sentences of every file of binDir/tokenized into
         * binDir/stokenized.
         *
         * @param binDir the working directory
         */
        private static void tagSentences(File binDir)
        {
                //Tag sentences
                List<File> stokenfiles = listChildren(new File(binDir, "tokenized"));
                //wipe the directory like the other working directories, so that files left by
                //a previous import are not picked up by the XML-TXM step
                new File(binDir,"stokenized").deleteDir();
                new File(binDir,"stokenized").mkdir();
                println("Tag sentences of "+stokenfiles.size()+" files")
                for (File f : stokenfiles) {
                        print "."
                        try {
                                Sequence S = new Sequence();
                                Filter F1 = new CutHeader();
                                Filter F7 = new TagSentences();
                                Filter F11 = new FusionHeader();
                                S.add(F1);
                                S.add(F7);
                                S.add(F11);
                                File infile = new File(binDir, "tokenized/"+f.getName());
                                File xmlfile = new File(binDir, "stokenized/"+f.getName());
                                File headerfile = new File(binDir, "tokenized/"+f.getName()+"header.xml");

                                S.SetInFileAndOutFile(infile.getPath(), xmlfile.getPath());
                                S.setEncodages("UTF-8","UTF-8");
                                //the header file path is given to both CutHeader and FusionHeader
                                Object[] arguments1 = [headerfile.getAbsolutePath()];
                                F1.SetUsedParam(arguments1);
                                Object[] arguments2 = [headerfile.getAbsolutePath(),F1];
                                F11.SetUsedParam(arguments2);
                                S.proceed();
                                S.clean();
                                headerfile.delete();//remove the prepared file to clean
                        } catch (Exception e) {
                                //best effort: report the failure and go on with the next file
                                println "Failed to sentence file "+f;
                                println e;
                        }
                }
                println ""
        }

        /**
         * Builds one XML-TXM file in txmDir for every file of
         * binDir/stokenized.
         *
         * @param binDir the working directory
         * @param txmDir the directory receiving the XML-TXM files
         */
        private static void buildXmlTxm(File binDir, File txmDir)
        {
                //TRANSFORM INTO XML-TEI-TXM
                List<File> tokenfiles = listChildren(new File(binDir, "stokenized"));
                println("Building xml-txm ("+tokenfiles.size()+" files)")
                for (File f : tokenfiles) {
                        print "."
                        String txmfile = f.getName();

                        def correspType = new HashMap<String,String>()
                        def correspRef = new HashMap<String,String>()
                        //the ids of all the respStmt must be listed
                        def respId = [];
                        //maps a respId to the execution report of the tool
                        def applications = new HashMap<String,HashMap<String,String>>();
                        //maps a respId to the type attributes of the ana property of the txm w,
                        //to build the references to the taxonomies
                        def taxonomiesUtilisees = new HashMap<String,String[]>();
                        //maps an item id to its description and its URI
                        def itemsURI = new HashMap<String,HashMap<String,String>>();
                        //respStmt informations
                        //resps (respId <see above>, [description, person, date])
                        def resps = new HashMap<String,String[]>();
                        //start the processing
                        def builder = new Xml2Ana(f);
                        builder.setCorrespondances(correspRef, correspType);
                        builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
                        //builder.setAddTEIHeader();
                        if (!builder.process(new File(txmDir,txmfile))) {
                                println("Failed to process "+f);
                                new File(txmDir, txmfile).delete(); //do not leave a file in txmDir after a failed build
                        }
                }
                println ""
        }
}