Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / perrault / importer.groovy @ 187

History | View | Annotate | Download (7.2 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun., 06 mai 2013) $
25
// $LastChangedRevision: 2386 $
26
// $LastChangedBy: mdecorde $ 
27
//
28

    
29
package org.txm.importer.perrault
30

    
31
import org.txm.importer.*;
32
import org.txm.importer.filters.*;
33
import org.txm.scripts.*;
34
import org.txm.importer.cwb.CwbEncode
35
import org.txm.importer.cwb.CwbMakeAll
36
import org.txm.scripts.teitxm.*;
37
import org.txm.utils.treetagger.TreeTagger;
38
import javax.xml.stream.*;
39
import java.net.URL;
40

    
41
import filters.CutHeader.*;
42
import filters.Tokeniser.*;
43
import filters.FusionHeader.*;
44

    
45
/**
 * Import script for the "perrault" corpus.
 *
 * The pipeline implemented by {@link #run(File[])} is:
 * <ol>
 *  <li>apply <code>xsl/splitcorpus.xsl</code> to the source file and collect
 *      the files found in the sibling <code>split</code> directory,</li>
 *  <li>wrap every split file with the header of the source file,</li>
 *  <li>prepare the files for tokenization (<code>ptokenized</code>),</li>
 *  <li>tokenize them (<code>tokenized</code>),</li>
 *  <li>build the XML-TEI-TXM files (<code>txm</code>) and rename them.</li>
 * </ol>
 * All working directories are created next to the source file.
 */
class importer {

	/**
	 * Applies <code>xsl/splitcorpus.xsl</code> (looked up next to the given
	 * file) to the file and lists the content of the sibling
	 * <code>split</code> directory.
	 *
	 * The direct output of the transformation (<code>split_temp.xml</code>)
	 * is deleted; the stylesheet is presumably the one writing into
	 * <code>split</code> - that part is not visible from this script.
	 *
	 * @param file the source file to split
	 * @return the files found in the <code>split</code> directory; an empty
	 *         list when the directory does not exist or cannot be read
	 */
	public static List<File> split(File file)
	{
		println "split file "+file;
		String rootDir = file.getParent()+"/";
		String xslfile = rootDir+"xsl/splitcorpus.xsl";
		String outfile = rootDir+"split_temp.xml";

		ApplyXsl a = new ApplyXsl(xslfile);
		a.process(file.getPath(),outfile);

		// only the files produced as a side effect are of interest
		new File(outfile).delete();

		// listFiles() returns null when the directory is missing:
		// return an empty list instead, and return it explicitly
		File[] parts = new File(file.getParent(),"split").listFiles();
		List<File> files = new ArrayList<File>();
		if (parts != null)
			files.addAll(Arrays.asList(parts));
		return files;
	}

	/**
	 * Runs the whole import pipeline.
	 *
	 * NOTE(review): the working directories are wiped for every source file
	 * but only the split files, header and directory of the LAST source file
	 * are used by the steps that follow the first loop.
	 *
	 * @param srcfiles the source files; nothing is done when null or empty
	 * @throws IllegalStateException if the last source file has no line
	 *         containing <code>&lt;text&gt;</code>
	 */
	public static void run(File[] srcfiles)
	{
		if (srcfiles == null || srcfiles.length == 0) {
			println "No source file to import";
			return;
		}

		List<File> files = null;
		File fullfile = null;
		String rootDir = "";
		ArrayList<String> milestones = new ArrayList<String>();//the tags which must stay milestones
		milestones.add("tagUsage");
		milestones.add("pb");
		milestones.add("lb");

		for (File f : srcfiles)
		{
			rootDir = f.getParent()+"/";
			fullfile = f;

			// reset the working directories BEFORE splitting, so that the
			// list returned by split() only names files that still exist
			for (String dirname : ["ptokenized", "tokenized", "split", "txm"])
			{
				File dir = new File(rootDir+dirname);
				dir.deleteDir();
				dir.mkdir();
			}

			files = importer.split(f);
		}

		//get header: every line located before the first one containing <text>
		StringBuilder header = new StringBuilder();
		boolean textFound = false;
		fullfile.withReader("UTF-8") { reader ->
			String cline = reader.readLine();
			while (cline != null)
			{
				if (cline.trim().contains("<text>")) {
					textFound = true;
					break;
				}
				header.append(cline).append("\n");
				cline = reader.readLine();
			}
		}
		if (!textFound)
			throw new IllegalStateException("No <text> line found in "+fullfile);

		//put each split file into a complete tei file
		for (File f : files)
		{
			File temp = new File(f.getParent(),"temp");
			// withWriter closes the writer even if a write fails
			temp.withWriter("UTF-8") { writer ->
				writer.write(header.toString());
				writer.write("<text>\n<body>\n");
				f.eachLine("UTF-8") { String line ->
					// drop the XML declaration of the split file
					if (!line.startsWith("<?xml"))
						writer.write(line+"\n");
				}
				writer.write("</body>\n</text>\n</TEI>");
			}

			if (!(f.delete() && temp.renameTo(f))) println "Warning can't rename file "+temp+" to "+f
		}

		//PREPARE EACH SPLIT FILE TO BE TOKENIZED
		println files
		for (File f : files)
		{
			File resultfile = new File(rootDir+"ptokenized",f.getName()+"-src.xml");
			println("prepare tokenizer file : "+f+" to : "+resultfile );
			def builder = new OneTagPerLine(f.toURL(), milestones);
			builder.process(resultfile);
		}

		//TOKENIZE FILES
		files = new File(rootDir,"ptokenized").listFiles()
		for (File f : files)
		{
			Sequence sequence = new Sequence();
			Filter cutHeader = new CutHeader();
			Filter tokeniser = new Tokeniser(f);
			Filter fusionHeader = new FusionHeader();
			sequence.add(cutHeader);
			sequence.add(tokeniser);
			sequence.add(fusionHeader);
			File infile = new File(rootDir+"ptokenized",f.getName());
			File xmlfile = new File(rootDir+"tokenized",f.getName()+"-out.xml");
			File headerfile = new File(rootDir+"/ptokenized/",f.getName()+"header.xml");
			println("Tokenizing "+xmlfile)
			sequence.SetInFileAndOutFile(infile.getPath(), xmlfile.getPath());
			sequence.setEncodages("UTF-8","UTF-8");
			Object[] arguments1 = [headerfile.getAbsolutePath()];
			cutHeader.SetUsedParam(arguments1);
			Object[] arguments2 = [headerfile.getAbsolutePath(),cutHeader];
			fusionHeader.SetUsedParam(arguments2);
			sequence.proceed();

			System.out.println("exits src tokenized : "+infile.exists());
			System.out.println("delete src tokenized : "+infile.delete());//remove the prepared file to clean
			headerfile.delete();//remove the prepared file to clean
		}

		//TRANSFORM INTO XML-TEI-TXM
		files = new File(rootDir,"tokenized").listFiles()
		for (File f : files)
		{
			println("Building xml-tei-txm "+f+ " >> "+f.getName()+"-TXM.xml")
			String txmfile = f.getName()+"-TXM.xml";

			def correspType = new HashMap<String,String>()
			def correspRef = new HashMap<String,String>()
			//the ids of all the respStmt must be listed
			def respId = [];
			//maps a respId to the execution report of the tool
			def applications = new HashMap<String,HashMap<String,String>>();
			//maps a respId to the type attributes of the ana property of the txm w,
			//used to build the references to the taxonomies
			def taxonomiesUtilisees = new HashMap<String,String[]>();
			//maps an item id to its description and its URI
			def itemsURI = new HashMap<String,HashMap<String,String>>();
			//respStmt informations
			//resps (respId <see above>, [description, person, date])
			def resps = new HashMap<String,String[]>();
			//start the processing
			def builder = new Xml2Ana(f);
			builder.setCorrespondances(correspRef, correspType);
			builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
			builder.transformFile(rootDir+"txm/",txmfile);
		}

		//rename files correctly
		files = new File(rootDir,"txm").listFiles();
		for (File file : files)
		{
			// keep what precedes the first '.', drop its last character and
			// append ".xml"
			// NOTE(review): dropping the last character presumably matches
			// the names produced by the split stylesheet - confirm
			String txmfile = file.getName();
			txmfile = txmfile.tokenize(".").get(0);
			txmfile = txmfile.substring(0, txmfile.length()-1);
			txmfile += ".xml"
			File renamed = new File(file.getParent(),txmfile);
			if (!file.renameTo(renamed)) println "Warning can't rename file "+file+" to "+renamed
		}
	}

	/**
	 * The main method.
	 *
	 * @param args optional: args[0] is the path of the source file; when
	 *        absent, <code>xml/perrault/perrault.xml</code> in the home
	 *        directory of the user is imported
	 */
	public static void main(String[] args)
	{
		// java.io.File does not expand "~": resolve the home directory explicitly
		File src;
		if (args != null && args.length > 0)
			src = new File(args[0]);
		else
			src = new File(System.getProperty("user.home"), "xml/perrault/perrault.xml");

		File[] files = [src];
		importer.run(files);
	}
}