Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / hyperbase / importer.groovy @ 187

History | View | Annotate | Download (6.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2015-06-03 15:04:53 +0200 (Wed, 03 Jun 2015) $
25
// $LastChangedRevision: 2984 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.hyperbase
29

    
30
//import filters.TabulatedToXml.*;
31
import org.txm.importer.*;
32
import org.txm.importer.filters.*;
33
import org.txm.scripts.*;
34
import org.txm.importer.cwb.CwbEncode
35
import org.txm.importer.cwb.CwbMakeAll
36
import org.txm.scripts.teitxm.BuildTTSrc;
37
import org.txm.scripts.teitxm.*;
38
//import org.txm.utils.treetagger.TreeTagger;
39
import javax.xml.stream.*;
40
import java.io.BufferedWriter;
41
import java.io.File;
42
import java.io.FileOutputStream;
43
import java.io.OutputStreamWriter;
44
import java.io.PrintStream;
45
import java.net.URL;
46

    
47
import filters.CutHeader.*;
48
import filters.Tokeniser.*;
49
import filters.FusionHeader.*;
50
import filters.TagSentences.*;
51

    
52
import org.txm.*;
53
import org.txm.utils.*;
54
// TODO: Auto-generated Javadoc
55

    
56
/**
57
 * The Class importer.
58
 */
59
/**
 * Hyperbase corpus importer.
 *
 * Pipeline driven by {@link #run}: split the single Hyperbase corpus file
 * into per-text XML files, tokenize each text, tag sentences, then convert
 * the result into XML-TEI-TXM files in the target directory.
 */
class importer {

	/**
	 * Run the full Hyperbase import pipeline.
	 *
	 * @param srcDir   directory containing the Hyperbase corpus source file
	 * @param binDir   working directory for intermediate results
	 *                 (split/, tokenized/, stokenized/, ptokenized/)
	 * @param txmDir   output directory for the final XML-TEI-TXM files
	 * @param encoding character encoding of the source corpus file
	 * @param basename corpus base name (currently unused by this method)
	 * @param lang     language code handed to the tokenizer
	 * @return true if every stage succeeded, false otherwise
	 */
	public static boolean run(File srcDir, File binDir, File txmDir, String encoding, String basename, String lang)
	{
		// NOTE(review): rootDir is never read below — kept for compatibility
		// with sibling importer scripts; candidate for removal.
		String rootDir = srcDir.getAbsolutePath()+"/"

		// Clean intermediate directories left over from a previous run.
		new File(binDir, "ptokenized").deleteDir();
		new File(binDir, "ptokenized").mkdir();
		new File(binDir, "tokenized").deleteDir();
		new File(binDir, "tokenized").mkdir();
		new File(binDir, "split").deleteDir();
		new File(binDir, "split").mkdir();

		// Tags that must stay milestones (empty elements) during processing.
		// NOTE(review): this list is never consumed in this method — presumably
		// a leftover from the shared importer template; confirm before removal.
		ArrayList<String> milestones = new ArrayList<String>();
		milestones.add("tagUsage");
		milestones.add("pb");
		milestones.add("lb");

		// Locate the corpus source file.
		// FIX: listFiles() was called twice for the emptiness test — hoisted.
		File[] candidates = srcDir.listFiles();
		if (candidates == null || candidates.size() == 0) {
			println "No corpus file in: "+srcDir;
			return false;
		}
		File corpusfile = null;
		for (File f : candidates) {
			// Skip hidden entries, directories and the import parameter file.
			if (!f.isHidden() && f.canRead() && f.isFile() && f.getName() != "import.xml") {
				corpusfile = f;
				break;
			}
		}
		// FIX: previously, when only hidden/unreadable files or "import.xml"
		// were present, corpusfile stayed null and crashed inside
		// Hyperbase2Xml.run(). Fail early with a diagnostic instead.
		if (corpusfile == null) {
			println "No readable corpus file found in: "+srcDir;
			return false;
		}

		System.out.println("Corpus file: "+corpusfile);
		if (!new Hyperbase2Xml().run(corpusfile, new File(binDir, "split"), encoding)) {
			println("Failed to split the corpus file into text: "+corpusfile);
			return false;
		}

		// TOKENIZE ALL FILES
		List<File> srcfiles = new File(binDir, "split").listFiles();
		println("Tokenizing "+srcfiles.size()+" files");
		for (File pfile : srcfiles) {
			print "."
			// Strip the 4-character extension — assumes every split file
			// ends in ".xml" (which Hyperbase2Xml produces).
			String filename = pfile.getName().substring(0, pfile.getName().length()-4)
			File tfile = new File(binDir, "tokenized/"+filename+".xml");

			SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(pfile, tfile, lang);
			if (!tokenizer.process()) {
				// Best effort: report and drop the partial output, keep going.
				println("Failed to tokenize file: "+pfile);
				tfile.delete();
			}
		}
		println ""

		// TAG SENTENCES
		List<File> stokenfiles = new File(binDir, "tokenized").listFiles()
		// FIX: "stokenized" was the only work directory not cleaned before
		// mkdir(), so stale files from a previous run could leak into the
		// TEI-conversion stage below. Clean it like the others.
		new File(binDir, "stokenized").deleteDir();
		new File(binDir, "stokenized").mkdir();
		println("Tag sentences of "+stokenfiles.size()+" files")
		for (File f : stokenfiles) {
			print "."
			try {
				// Filter pipeline: cut the header out, tag sentences in the
				// body, then fuse the header back in.
				Sequence S = new Sequence();
				Filter F1 = new CutHeader();
				Filter F7 = new TagSentences();
				Filter F11 = new FusionHeader();
				S.add(F1);
				S.add(F7);
				S.add(F11);
				File infile = new File(binDir, "tokenized/"+f.getName());
				File xmlfile = new File(binDir, "stokenized/"+f.getName());
				File headerfile = new File(binDir, "tokenized/"+f.getName()+"header.xml");

				S.SetInFileAndOutFile(infile.getPath(), xmlfile.getPath());
				S.setEncodages("UTF-8","UTF-8");
				// CutHeader writes the header aside; FusionHeader needs both
				// the header file and the CutHeader instance to restore it.
				Object[] arguments1 = [headerfile.getAbsolutePath()];
				F1.SetUsedParam(arguments1);
				Object[] arguments2 = [headerfile.getAbsolutePath(),F1];
				F11.SetUsedParam(arguments2);
				S.proceed();
				S.clean();
				headerfile.delete();//remove the prepared file to clean
			} catch (Exception e) {
				// Best effort: a failed file is reported but does not abort
				// the whole import.
				println "Failed to sentence file "+f;println e;
			}
		}
		println ""

		// TRANSFORM INTO XML-TEI-TXM
		List<File> tokenfiles = new File(binDir, "stokenized").listFiles()
		println("Building xml-txm ("+tokenfiles.size()+" files)")
		for (File f : tokenfiles) {
			print "."
			File file = f;
			String txmfile = f.getName();

			def correspType = new HashMap<String,String>()
			def correspRef = new HashMap<String,String>()
			// All respStmt ids must be listed.
			def respId = [];
			// Maps a respId to the execution report of the tool.
			def applications = new HashMap<String,HashMap<String,String>>();
			// Maps a respId to the type attributes of the txm w "ana"
			// property, used to build the references to the taxonomies.
			def taxonomiesUtilisees = new HashMap<String,String[]>();
			// Associates an item id with its description and its URI.
			def itemsURI = new HashMap<String,HashMap<String,String>>();
			// respStmt information:
			// resps (respId <see above>, [description, person, date])
			def resps = new HashMap<String,String[]>();
			// Run the conversion.
			def builder = new Xml2Ana(file);
			builder.setCorrespondances(correspRef, correspType);
			builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
			//builder.setAddTEIHeader();
			if (!builder.process(new File(txmDir,txmfile))) {
				// Best effort: report and drop the partial output, keep going.
				println("Failed to process "+f);
				new File(txmDir, txmfile).delete();
			}
		}
		println ""
		return true;
	}
}