Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / tmx / importer.groovy @ 1000

History | View | Annotate | Download (4.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2016-12-01 17:48:31 +0100 (jeu. 01 déc. 2016) $
25
// $LastChangedRevision: 3351 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts.importer.tmx
29

    
30

    
31

    
32
//import org.txm.scripts.filters.TabulatedToXml.*;
33
import org.txm.scripts.importer.*;
34
import org.txm.importer.scripts.filters.*;
35
import org.txm.scripts.*;
36
import org.txm.importer.cwb.CwbEncode
37
import org.txm.importer.cwb.CwbMakeAll
38
import org.txm.importer.scripts.xmltxm.*;
39
import org.txm.utils.io.FileCopy;
40
//import org.txm.utils.treetagger.TreeTagger;
41
import javax.xml.stream.*;
42
import java.io.BufferedWriter;
43
import java.io.File;
44
import java.io.FileOutputStream;
45
import java.io.OutputStreamWriter;
46
import java.io.PrintStream;
47
import java.net.URL;
48

    
49
import org.txm.scripts.filters.CutHeader.*;
50
import org.txm.scripts.filters.Tokeniser.*;
51
import org.txm.scripts.filters.FusionHeader.*;
52
import org.txm.scripts.filters.TagSentences.*;
53

    
54
import org.txm.objects.*;
55
import org.txm.*;
56
import org.txm.core.engines.*;
57

    
58
// TODO: Auto-generated Javadoc
59
/**
 * Entry point of the TMX import: splits the TMX sources into XML files,
 * tokenizes them and builds the XML-TEI-TXM files.
 */
class importer {
        /**
         * Converter created by the last call to {@link #run} that found source
         * files; it stays unset (or keeps its previous value) when run()
         * returns early. Read by {@link #getCorpusIDS()}.
         */
        static Tmx2XmlFiles converter;

        /**
         * Runs the import steps in order:
         * <ol>
         * <li>empties and recreates the "ptokenized", "tokenized" and "split" directories of binDir;</li>
         * <li>converts the content of srcDir into binDir/split with Tmx2XmlFiles;</li>
         * <li>tokenizes every file of binDir/split into binDir/tokenized;</li>
         * <li>copies srcDir/style.css to binDir/style.css when it exists;</li>
         * <li>builds one XML-TEI-TXM file in txmDir per tokenized file with Xml2Ana.</li>
         * </ol>
         * Returns right after step 1 when srcDir cannot be listed or is empty.
         * A file that fails a step is reported on standard output and its
         * output file is deleted; the remaining files are still processed.
         *
         * @param srcDir the directory containing the source files
         * @param binDir the working directory receiving the intermediate directories and the CSS copy
         * @param txmDir the directory receiving the XML-TEI-TXM files
         * @param langs passed to Tmx2XmlFiles, then read here to get the language of each split file (keyed by file name)
         * @param originalTexts passed to Tmx2XmlFiles; not read by this method
         */
        public static void run(File srcDir, File binDir, File txmDir, HashMap<String, String> langs, HashMap<String, String> originalTexts)
        {
                // always start from empty working directories
                File splitDir = new File(binDir, "split");
                File tokenizedDir = new File(binDir, "tokenized");
                resetDir(new File(binDir, "ptokenized"));
                resetDir(tokenizedDir);
                resetDir(splitDir);

                // nothing to import: srcDir is unreadable, not a directory, or empty
                File[] sources = srcDir.listFiles();
                if (sources == null || sources.length == 0) {
                        return;
                }

                // build the XML files; according to the original author's note the
                // converter fills the langs and originalTexts dictionaries
                converter = new Tmx2XmlFiles(langs, originalTexts)
                converter.run(srcDir, splitDir);

                List<File> srcfiles = listOrEmpty(splitDir);
                println("Tokenizing "+srcfiles.size()+" files")
                for (File f : srcfiles) {
                        String lang = langs.get(f.getName());

                        print "."
                        File outfile = new File(binDir,"tokenized/"+f.getName());
                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(f, outfile, lang)
                        tokenizer.setOutSideTextTagsAndKeepContent("note");
                        if (!tokenizer.process()) {
                                println("Failed to tokenize $f")
                                // do not keep a partial result for the next step
                                outfile.delete()
                        }
                }
                println ""

                // copy css file if any
                File cssSrc = new File(srcDir, "style.css");
                File cssBin = new File(binDir, "style.css");
                if (cssSrc.exists()) {
                        FileCopy.copy(cssSrc, cssBin)
                }

                // TRANSFORM INTO XML-TEI-TXM
                List<File> tokenfiles = listOrEmpty(tokenizedDir);
                println("Building xml-tei-txm ("+tokenfiles.size()+" files)")
                for (File f : tokenfiles) {
                        print "."
                        String txmfile = f.getName();

                        def correspType = new HashMap<String, String>()
                        def correspRef = new HashMap<String, String>()
                        // the ids of all the respStmt must be listed
                        def respId = [];
                        // maps a respId to the execution report of the tool
                        def applications = new HashMap<String, HashMap<String,String>>();
                        // maps a respId to the type attributes of the ana property of the txm w,
                        // used to build the references to the taxonomies
                        def taxonomiesUtilisees = new HashMap<String, String[]>();
                        // maps an item id to its description and its URI
                        def itemsURI = new HashMap<String, HashMap<String, String>>();
                        // respStmt information
                        // resps (respId <see above>, [description, person, date])
                        def resps = new HashMap<String, String[]>();

                        // run the conversion; all the header structures above are passed empty
                        def builder = new Xml2Ana(f);
                        builder.setCorrespondances(correspRef, correspType);
                        builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
                        //builder.setAddTEIHeader();
                        File txmOutput = new File(txmDir, txmfile);
                        if (!builder.process(txmOutput)) {
                                println("Failed to process file "+f)
                                // remove the failed output (the file that was just written in txmDir),
                                // never a file of the source directory
                                txmOutput.delete()
                        }
                }
                println ""
        }

        /**
         * Deletes the directory with its content, then creates it again empty.
         */
        private static void resetDir(File dir) {
                dir.deleteDir();
                dir.mkdir();
        }

        /**
         * Lists the children of a directory; an empty list is returned when
         * File.listFiles() yields null (not a directory or I/O error).
         */
        private static List<File> listOrEmpty(File dir) {
                File[] children = dir.listFiles();
                return children == null ? new ArrayList<File>() : Arrays.asList(children);
        }

        /**
         * Delegates to the getCorpusIDS() method of the converter created by run().
         * Fails with a NullPointerException if no converter has been created yet.
         *
         * @return whatever Tmx2XmlFiles.getCorpusIDS() returns
         */
        public def getCorpusIDS() {
                converter.getCorpusIDS()
        }
}