Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / bvh / importer.groovy @ 187

History | View | Annotate | Download (5.4 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun., 06 mai 2013) $
// $LastChangedRevision: 2386 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.importer.bvh
29

    
30
import org.txm.importer.*;
31
import org.txm.importer.filters.*;
32
import org.txm.scripts.*;
33
import org.txm.importer.cwb.CwbEncode
34
import org.txm.importer.cwb.CwbMakeAll
35
import org.txm.scripts.teitxm.*;
36
import org.txm.utils.treetagger.TreeTagger;
37
import javax.xml.stream.*;
38
import java.io.File;
39
import java.net.URL;
40

    
41
import org.txm.*;
42

    
43
import filters.CutHeader.*;
44
import filters.Tokeniser.*;
45
import filters.FusionHeader.*;
46
import filters.TagSentences.TagSentences;
47

    
48
// TODO: Auto-generated Javadoc
49
/**
50
 * Convert TEI-BVH to TEI-TXM <br/>
51
 * 1- tokenize<br/>
52
 * 2- launch XML2TEITXM with no annotation import <br/>.
53
 *
54
 * @author mdecorde
55
 */
56
class importer {
57
        
58
        /**
59
         * Run.
60
         *
61
         * @param rootDirFile contains the source files
62
         */
63
        public static void run(File rootDirFile, String basename)
64
        {
65
                
66
                File fullfile;
67
                String rootDir =rootDirFile.getAbsolutePath();
68
                
69
                ArrayList<String> milestones = new ArrayList<String>();//the tags who you want them to stay milestones
70
                milestones.add("tagUsage");
71
                milestones.add("pb");
72
                milestones.add("lb");
73
                
74
                File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME),"corpora/"+basename);
75
                binDir.deleteDir();
76
                binDir.mkdir();
77
                new File(binDir,"ptokenized").deleteDir();
78
                new File(binDir,"ptokenized").mkdir();
79
                new File(binDir,"tokenized").deleteDir();
80
                new File(binDir,"tokenized").mkdir();
81
                new File(binDir,"txm").deleteDir();
82
                new File(binDir,"txm").mkdir();
83
                
84
                //PREPARE EACH SPLITED FILE TO BE TOKENIZED
85
                List<File> srcfiles = rootDirFile.listFiles();
86
                println("Preparing Tokenizing "+srcfiles.size()+" files")
87
                for(File f : srcfiles)
88
                {
89
                        print "."
90
                        File srcfile = f;
91
                        File resultfile = new File(binDir,"ptokenized/"+f.getName());
92
                        //println("prepare tokenizer file : "+srcfile+" to : "+resultfile );
93
                        def builder = new OneTagPerLine(srcfile.toURL(), milestones);
94
                        builder.process(resultfile);
95
                }
96
                println ""
97
                rootDir = binDir.getAbsolutePath();
98
                
99
                //TOKENIZE FILES
100
                List<File> ptokenfiles = new File(rootDir,"ptokenized").listFiles()        
101
                println("Tokenizing "+ptokenfiles.size()+" files")
102
                for(File f : ptokenfiles)
103
                {
104
                        print "."
105
                        Sequence S = new Sequence();
106
                        Filter F1 = new CutHeader();
107
                        Filter F6 = new Tokeniser(f);
108
                        Filter F7 = new TagSentences();
109
                        Filter F11 = new FusionHeader();
110
                        S.add(F1);
111
                        S.add(F6);
112
                        S.add(F7);
113
                        S.add(F11);
114
                        File infile = new File(rootDir+"/ptokenized",f.getName());
115
                        File xmlfile = new File(rootDir+"/tokenized",f.getName());
116
                        File headerfile = new File(rootDir+"/ptokenized/",f.getName()+"header.xml");
117
                        
118
                        S.SetInFileAndOutFile(infile.getPath(), xmlfile.getPath());
119
                        S.setEncodages("UTF-8","UTF-8");
120
                        Object[] arguments1 = [headerfile.getAbsolutePath()];
121
                        F1.SetUsedParam(arguments1);
122
                        Object[] arguments2 = [headerfile.getAbsolutePath(),F1];
123
                        F11.SetUsedParam(arguments2);
124
                        S.proceed();
125
                        
126
                        S.clean();
127
                        //headerfile.delete();//remove the prepared file to clean
128
                }
129
                println ""
130
                
131
                //TRANSFORM INTO XML-TEI-TXM
132
                List<File> tokenfiles = new File(rootDir,"tokenized").listFiles()        
133
                println("Building "+tokenfiles.size()+" xml-tei-txm files")
134
                for(File f : tokenfiles)
135
                {
136
                        print "."
137
                        File file = f; 
138
                        String txmfile = f.getName();
139
                        
140
                        def correspType = new HashMap<String,String>()
141
                        def correspRef = new HashMap<String,String>()
142
                        //il faut lister les id de tous les respStmt
143
                        def respId = [];
144
                        //fait la correspondance entre le respId et le rapport d'execution de l'outil
145
                        def applications = new HashMap<String,HashMap<String,String>>();        
146
                        //fait la correspondance entre le respId et les attributs type de la propriété ana du w txm
147
                                        //pour construire les ref vers les taxonomies
148
                        def taxonomiesUtilisees = new HashMap<String,String[]>();
149
                        //associe un id d'item avec sa description et son URI
150
                        def itemsURI = new HashMap<String,HashMap<String,String>>();
151
                        //informations de respStmt
152
                        //resps (respId <voir ci-dessus>, [description, person, date])
153
                        def resps = new HashMap<String,String[]>();
154
                        //lance le traitement
155
                        String wordprefix = "w_";
156
                        def builder = new Xml2Ana(file);
157
                        builder.setCorrespondances(correspRef, correspType);
158
                        builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
159
                        builder.process(new File(new File(rootDir,"txm"),txmfile));
160
                }
161
                println ""
162
        }
163
        
164
        /**
165
         * The main method.
166
         *
167
         * @param args the arguments
168
         */
169
        public static void main(String[] args)
170
        {
171
                new importer().run(new File("~/xml/bvh"));
172
        }
173
}