Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / corptef / importer.groovy @ 1000

History | View | Annotate | Download (5.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun. 06 mai 2013) $
25
// $LastChangedRevision: 2386 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts.importer.corptef;
29

    
30
import org.txm.scripts.importer.*;
31
import org.txm.importer.scripts.filters.*;
32
import org.txm.scripts.*;
33
import org.txm.importer.scripts.xmltxm.*;
34
import org.txm.utils.treetagger.TreeTagger;
35
import javax.xml.stream.*;
36
import java.net.URL;
37

    
38
import org.txm.Toolbox;
39

    
40
import org.txm.scripts.filters.CutHeader.*;
41
import org.txm.scripts.filters.ReunitBrokenWords.*;
42
import org.txm.scripts.filters.Tokeniser.*;
43
import org.txm.scripts.filters.TagSentences.*;
44
import org.txm.scripts.filters.FusionHeader.*;
45

    
46
// TODO: Auto-generated Javadoc
47
/**
 * Import script for the "corptef" corpus.
 *
 * Runs three stages over the files of a source directory, writing every
 * intermediate and final result below {@code <TXM home>/corpora/corptef}:
 * <ol>
 * <li>each source file is passed through {@code OneTagPerLine} into {@code ptokenized/};</li>
 * <li>each file of {@code ptokenized/} is run through a filter sequence
 *     (CutHeader, ReunitBrokenWords, Tokeniser, FusionHeader) into {@code tokenized/},
 *     the cut header being stored in {@code headers/};</li>
 * <li>each file of {@code tokenized/} is converted with {@code Xml2Ana} into {@code txm/}.</li>
 * </ol>
 */
class importer {

    /** Source directory used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_SOURCE_DIR = "C:/TXM/corpora/corptef/"

    /** Names of the per-stage sub-directories created inside the working directory. */
    private static final List<String> WORK_SUBDIRS = ["ptokenized", "tokenized", "headers", "txm"]

    /**
     * Imports every regular file found directly inside {@code dir}.
     *
     * The working directory {@code <TXM home>/corpora/corptef} is deleted and
     * recreated on every call, so any previous import result is lost.
     *
     * @param dir the directory containing the source files; must not be the
     *            working directory itself nor be located inside it
     * @throws IllegalArgumentException if {@code dir} is (or is inside) the working directory
     * @throws IOException if a working sub-directory cannot be created
     */
    public void run(File dir)
    {
        File binDir = new File(Toolbox.getTxmHomePath(), "corpora/corptef")

        // Must happen BEFORE the working directory is wiped, otherwise the
        // sources would be deleted together with the previous results.
        ensureSourceOutsideWorkDir(dir, binDir)
        resetWorkDirs(binDir)

        prepareFiles(dir.getAbsoluteFile(), binDir)

        // From here on, every path is relative to the working directory.
        String rootDir = binDir.getAbsolutePath()
        tokenizeFiles(rootDir)
        buildXmlTxmFiles(rootDir)
    }

    /** Refuses a source directory that would be erased when the working directory is reset. */
    private static void ensureSourceOutsideWorkDir(File sourceDir, File binDir)
    {
        String sourcePath = sourceDir.getCanonicalPath()
        String binPath = binDir.getCanonicalPath()
        if (sourcePath == binPath || sourcePath.startsWith(binPath + File.separator)) {
            throw new IllegalArgumentException("Source directory " + sourcePath
                    + " is inside the working directory " + binPath
                    + ", which is deleted at the start of the import")
        }
    }

    /** Deletes the working directory recursively and recreates it with its empty stage sub-directories. */
    private static void resetWorkDirs(File binDir)
    {
        binDir.deleteDir()
        for (String name : WORK_SUBDIRS) {
            File subDir = new File(binDir, name)
            // mkdirs() also creates binDir and any missing parent such as "corpora".
            if (!subDir.mkdirs() && !subDir.isDirectory()) {
                throw new IOException("Could not create working directory: " + subDir)
            }
        }
    }

    /** Returns the regular files directly inside {@code directory}; empty if it cannot be listed. */
    private static List<File> listRegularFiles(File directory)
    {
        List<File> result = new ArrayList<File>()
        File[] entries = directory.listFiles() // null when the directory is missing or unreadable
        if (entries != null) {
            for (File entry : entries) {
                if (entry.isFile()) {
                    result.add(entry)
                }
            }
        }
        return result
    }

    /** Stage 1: runs OneTagPerLine on every source file, writing the result into {@code ptokenized/}. */
    private static void prepareFiles(File sourceDir, File binDir)
    {
        // Passed as-is to OneTagPerLine; this script never adds any milestone name.
        ArrayList<String> milestones = new ArrayList<String>()

        for (File srcfile : listRegularFiles(sourceDir)) {
            File resultfile = new File(binDir, "ptokenized/" + srcfile.getName())
            println("prepare tokenizer file : "+srcfile+" to : "+resultfile )

            // toURI().toURL() escapes characters that are illegal in URLs,
            // which the deprecated File.toURL() does not.
            def builder = new OneTagPerLine(srcfile.toURI().toURL(), milestones)
            builder.process(resultfile)
        }
    }

    /** Stage 2: runs the filter sequence on every file of {@code ptokenized/}, writing into {@code tokenized/}. */
    private static void tokenizeFiles(String rootDir)
    {
        for (File infile : listRegularFiles(new File(rootDir, "ptokenized"))) {
            Filter cutHeader = new CutHeader()
            Filter reunitBrokenWords = new ReunitBrokenWords()
            Filter tokeniser = new Tokeniser(infile)
            Filter fusionHeader = new FusionHeader()

            // The TagSentences filter is intentionally not part of the sequence.
            Sequence sequence = new Sequence()
            sequence.add(cutHeader)
            sequence.add(reunitBrokenWords)
            sequence.add(tokeniser)
            sequence.add(fusionHeader)

            File xmlfile = new File(rootDir, "tokenized/" + infile.getName())
            println("Tokenizing "+xmlfile)
            sequence.SetInFileAndOutFile(infile.getAbsolutePath(), xmlfile.getAbsolutePath())
            sequence.setEncodages("UTF-8", "UTF-8")

            // Both header filters are given the same header file path;
            // FusionHeader additionally receives the CutHeader instance.
            String headerPath = rootDir + "/headers/" + infile.getName() + "header.xml"
            Object[] cutHeaderArgs = [headerPath]
            cutHeader.SetUsedParam(cutHeaderArgs)
            Object[] fusionHeaderArgs = [headerPath, cutHeader]
            fusionHeader.SetUsedParam(fusionHeaderArgs)

            sequence.proceed()
            sequence.clean()
        }
    }

    /** Stage 3: converts every file of {@code tokenized/} with Xml2Ana, writing into {@code txm/}. */
    private static void buildXmlTxmFiles(String rootDir)
    {
        for (File f : listRegularFiles(new File(rootDir, "tokenized"))) {
            println("Building xml-tei-txm "+f+ " >> "+f.getName())
            String txmfile = f.getName()

            // All header structures below are handed over empty.
            def correspType = new HashMap<String,String>()
            def correspRef = new HashMap<String,String>()

            // ids of all the respStmt elements
            def respId = []
            // maps a respId to the execution report of the tool
            def applications = new HashMap<String,HashMap<String,String>>()
            // maps a respId to the type attributes of the "ana" property of the
            // TXM w element, used to build the references to the taxonomies
            def taxonomiesUtilisees = new HashMap<String,String[]>()
            // maps an item id to its description and its URI
            def itemsURI = new HashMap<String,HashMap<String,String>>()
            // respStmt information: respId -> [description, person, date]
            def resps = new HashMap<String,String[]>()

            // run the conversion
            def builder = new Xml2Ana(f)
            builder.setCorrespondances(correspRef, correspType)
            builder.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI)
            builder.transformFile(rootDir, "txm/" + txmfile)
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args optional; {@code args[0]} is the source directory. When absent,
     *             {@code C:/TXM/corpora/corptef/} is used.
     */
    public static void main(String[] args)
    {
        String sourcePath = (args != null && args.length > 0) ? args[0] : DEFAULT_SOURCE_DIR
        File dir = new File(sourcePath)
        new importer().run(dir)
    }
}