Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / hyperprince / importer.groovy @ 479

History | View | Annotate | Download (6 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
// 
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
// 
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
// 
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
// 
// 
// 
// $LastChangedDate: 2015-06-03 15:04:53 +0200 (mer. 03 juin 2015) $
// $LastChangedRevision: 2984 $
// $LastChangedBy: mdecorde $ 
//
28
package org.txm.importer.hyperprince
29

    
30
import org.txm.importer.*;
31
import org.txm.importer.filters.*;
32
import org.txm.scripts.*;
33
import org.txm.importer.cwb.CwbEncode
34
import org.txm.importer.cwb.CwbMakeAll
35
import org.txm.scripts.teitxm.*;
36
import org.txm.utils.treetagger.TreeTagger;
37
import javax.xml.stream.*;
38
import java.net.URL;
39

    
40
import filters.CutHeader.*;
41
import filters.Tokeniser.*;
42
import filters.TagSentences.*;
43
import filters.FusionHeader.*;
44

    
45
/**
 * Import script of the {@code org.txm.importer.hyperprince} package.
 *
 * {@link #run(String, String)} splits the corpus source file with an XSL
 * stylesheet, prepares and tokenizes every resulting file, converts the
 * tokenized files to XML-TEI-TXM and finally adds the alignment information
 * to each converted file.
 *
 * Original author's notes (kept as written, not verifiable from this class alone):
 * simple import in cwb of RGAQCJ;
 * structunits : initial, s, lb ...;
 * wordproperties : form, pos and n.
 *
 * @author mdecorde
 */
class importer
{
    /** Result of {@code AnalyzeAlignement.process()} on the corpus source file; passed back to {@code addAlignement}. */
    HashSet<String> alignList;

    /** Alignment analyzer built on the corpus source file. */
    AnalyzeAlignement aa;

    /**
     * Runs the whole import.
     *
     * Working directories {@code ptokenized}, {@code tokenized}, {@code split}
     * and {@code txm} are deleted and re-created next to the corpus source file,
     * so any previous content of these directories is lost.
     *
     * @param rootDir the directory against which {@code corpusSrc} is resolved;
     *        the effective working directory is the parent directory of the resolved file
     * @param corpusSrc the path of the corpus source file, relative to {@code rootDir}
     */
    public void run(String rootDir, String corpusSrc)
    {
        // Step 0: split, because the source is a whole corpus with several headers spread through it.
        // Tags handed to OneTagPerLine as the ones which stay milestones.
        ArrayList<String> milestones = new ArrayList<String>();
        milestones.add("tagUsage");
        milestones.add("pb");
        milestones.add("lb");

        File corpussrc = new File(rootDir, corpusSrc);
        // From here on everything is relative to the directory holding the corpus file.
        rootDir = corpussrc.getParent() + "/";

        // Start from empty working directories.
        for (String dirName : ["ptokenized", "tokenized", "split", "txm"]) {
            File workDir = new File(rootDir + dirName);
            workDir.deleteDir();
            workDir.mkdir();
        }

        // rootDir already ends with "/", so no extra separator is needed.
        String xslfile = rootDir + "xsl/splitcorpus.xsl";
        String outfile = rootDir + "split_temp.xml";

        aa = new AnalyzeAlignement(corpussrc);
        alignList = aa.process();

        // The direct output of the transformation is discarded right away.
        // NOTE(review): presumably the stylesheet itself writes the files found in "split" below - confirm.
        ApplyXsl a = new ApplyXsl(xslfile);
        a.process(corpussrc.getAbsolutePath(), outfile);
        new File(outfile).delete();

        // PREPARE EACH FILE TO BE TOKENIZED
        List<File> files = new File(rootDir, "split").listFiles();
        for (File splitFile : files) {
            File resultfile = new File(rootDir + "ptokenized", splitFile.getName());
            println("prepare tokenizer file : " + splitFile + " to : " + resultfile);

            // File.toURL() is deprecated because it does not escape characters
            // that are illegal in URLs; go through the URI instead.
            def builder = new OneTagPerLine(splitFile.toURI().toURL(), milestones);
            builder.process(resultfile);
        }

        // TOKENIZE FILES
        files = new File(rootDir, "ptokenized").listFiles();
        for (File infile : files) {
            Sequence sequence = new Sequence();
            Filter cutHeader = new CutHeader();
            Filter tokeniser = new Tokeniser(infile);
            Filter fusionHeader = new FusionHeader();
            // The TagSentences filter is intentionally not part of the sequence.
            sequence.add(cutHeader);
            sequence.add(tokeniser);
            sequence.add(fusionHeader);

            File xmlfile = new File(rootDir + "tokenized", infile.getName());
            println("Tokenizing " + xmlfile);
            sequence.setEncodages("UTF-8", "UTF-8");
            sequence.SetInFileAndOutFile(infile.getAbsolutePath(), xmlfile.getAbsolutePath());

            // Both header filters are given the same side file path.
            String headerPath = infile.getParent() + "/" + infile.getName() + "header.xml";
            Object[] cutHeaderArgs = [headerPath];
            cutHeader.SetUsedParam(cutHeaderArgs);
            Object[] fusionHeaderArgs = [headerPath, cutHeader];
            fusionHeader.SetUsedParam(fusionHeaderArgs);

            sequence.proceed();
        }

        // TRANSFORM INTO XML-TEI-TXM
        files = new File(rootDir, "tokenized").listFiles();
        for (File tokenizedFile : files) {
            // The output file is named after the first two characters of the input name
            // (bounded so that a shorter name does not raise an exception).
            String name = tokenizedFile.getName();
            String txmfile = name.substring(0, Math.min(2, name.length())) + ".xml";
            println("Building xml-tei-txm " + tokenizedFile + " >> " + txmfile);

            def correspType = new HashMap<String, String>();
            def correspRef = new HashMap<String, String>();

            // the ids of all the respStmt elements must be listed here
            def respId = [];
            // maps each respId to the execution report of the tool
            def applications = new HashMap<String, HashMap<String, String>>();
            // maps each respId to the type attributes of the "ana" property of the txm w element,
            // used to build the refs to the taxonomies
            def taxonomiesUtilisees = new HashMap<String, String[]>();
            // associates an item id with its description and its URI
            def itemsURI = new HashMap<String, HashMap<String, String>>();
            // respStmt information: resps (respId <see above>, [description, person, date])
            def resps = new HashMap<String, String[]>();

            // run the conversion (all the header information above is left empty)
            def builder = new Xml2Ana(tokenizedFile);
            builder.setCorrespondances(correspRef, correspType);
            builder.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI);
            builder.transformFile(rootDir + "txm/", txmfile);
        }

        // Add the alignment information, then give each resulting file its final name:
        // the part of the original name before the first "." followed by ".xml".
        files = new File(rootDir, "txm").listFiles();
        for (File txmFile : files) {
            String finalName = txmFile.getName().tokenize(".").get(0) + ".xml";
            File aligned = aa.addAlignement(txmFile, alignList);
            File target = new File(aligned.getParent(), finalName);
            // renameTo reports failure through its return value only; do not lose it silently.
            if (!aligned.equals(target) && !aligned.renameTo(target)) {
                println("Warning: could not rename " + aligned + " to " + target);
            }
        }
    }
}