Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / lasla / importer.groovy @ 187

History | View | Annotate | Download (5.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun., 06 mai 2013) $
25
// $LastChangedRevision: 2386 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.lasla
29

    
30
import javax.xml.stream.*
31

    
32
import org.txm.*
33
import org.txm.importer.*
34
import org.txm.importer.filters.*
35
import org.txm.objects.*
36
import org.txm.scripts.*
37
import org.txm.scripts.teitxm.*
38

    
39
import filters.CutHeader.*
40
import filters.FusionHeader.*
41
import filters.TagSentences.*
42
import filters.Tokeniser.*
43

    
44
// TODO: Auto-generated Javadoc
45
/**
 * Import script for column-formatted source files.
 *
 * Each file of the source directory is first converted to a simple XML file
 * with FortranColumns2XML, then each resulting XML file is processed by
 * Xml2Ana. Results are written under the "corpora/&lt;basename&gt;" directory
 * of the TXM user home, in the "src" and "txm" sub-directories respectively.
 */
class importer {

    /**
     * Runs the two import passes on every file of the source directory.
     *
     * The binary directory of the corpus is deleted and recreated before
     * anything else is done. A failure on an individual file is reported on
     * standard output and does not stop the import nor change the result.
     *
     * @param rootDirFile the directory containing the source files
     * @param encoding the encoding passed to FortranColumns2XML to read the source files
     * @param basename the corpus name, used as the binary directory name
     * @return false if the source directory or the intermediate "src"
     *         directory cannot be listed, true otherwise
     */
    public boolean run(File rootDirFile, String encoding, String basename)
    {
        // cleaning: start from an empty binary directory for this corpus
        File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME),"corpora/"+basename);
        binDir.deleteDir();
        // mkdirs() and not mkdir(): the parent "corpora" directory may not exist yet
        binDir.mkdirs();
        for(String subDirName : ["src", "txm"])
        {
            File subDir = new File(binDir, subDirName);
            subDir.deleteDir();
            subDir.mkdirs();
        }

        // separates values and build simple xml
        List<File> srcfiles = new File(rootDirFile.getAbsolutePath()+"/").listFiles();
        if(srcfiles == null) // not a directory, or it cannot be read
        {
            println "No file to process"
            return false;
        }
        println("Converting to xml "+srcfiles.size()+ " files")
        for(File f : srcfiles)
        {
            print "."
            File outfile = new File(binDir,"src/"+f.getName()+".xml")
            FortranColumns2XML builder = new FortranColumns2XML(f, outfile, encoding);
            // the two lists are parallel: presumably the start and end offset of each column -- TODO confirm
            builder.setColumnIndexes([0, 3, 4, 8, 29, 55, 67],        [3, 4, 8, 28, 54, 66, 78]);
            // NOTE(review): "w" and 4 look like the element name and the index of the
            // column ("word") used as element content -- confirm against FortranColumns2XML
            builder.setColumnsLinesNames(["ref","s","sent","lemme","word","line","pos"], "w", 4);
            builder.setAddIds(true);
            //builder.setLineCheck(3, "l");
            if(!builder.process())
            {
                println "Failed to convert file: "+f
            }
        }
        println ""

        // TRANSFORM INTO XML-TEI-TXM
        String rootDir = binDir.getAbsolutePath();
        File xmlDir = new File(rootDir, "src");
        File txmDir = new File(rootDir, "txm");
        srcfiles = xmlDir.listFiles();
        if(srcfiles == null) // the "src" directory could not be created or read
        {
            println "Failed to list converted files of directory: "+xmlDir
            return false;
        }
        println("Building xml-tei-txm "+srcfiles.size()+ " files")
        for(File f : srcfiles)
        {
            print "."
            def builder = createXml2Ana(f);
            // the result keeps the name of the converted file
            if(!builder.process(new File(txmDir, f.getName())))
            {
                println "Failed to build xml-txm of file: "+f;
            }
        }
        println ""
        return true;
    }

    /**
     * Creates the Xml2Ana processor of one converted XML file and gives it
     * the property correspondences and the header informations.
     *
     * All the maps are built anew on every call, so no state is shared
     * between the processors of different files.
     *
     * @param xmlFile the converted XML file to process
     * @return the configured Xml2Ana instance
     */
    private def createXml2Ana(File xmlFile)
    {
        // every property is mapped to a type of the same name and to the "lasla" responsibility id
        def correspType = new HashMap<String,String>()
        def correspRef = new HashMap<String,String>()
        for(String property : ["ref", "s", "sent", "lemme", "line", "pos", "form"])
        {
            correspType.put(property, property);
            correspRef.put(property, "lasla");
        }

        // the ids of all the respStmt must be listed
        def respId = ["lasla"];

        // maps each respId to the execution report of the tool
        def applications = new HashMap<String, ArrayList<String>>();
        applications.put("lasla", new ArrayList<String>());
        applications.get("lasla").add("txm");//app ident
        applications.get("lasla").add("0.6");//app version
        applications.get("lasla").add(null);//app report file path

        // maps each respId to the type attributes of the ana property of the txm w,
        // used to build the references to the taxonomies
        def taxonomiesUtilisees = new HashMap<String,String[]>();
        taxonomiesUtilisees.put("lasla",["LASLA"]);//,"lemma","lasla","grace"]);

        // associates an item id with its description and its URI
        def itemsURI = new HashMap<String,HashMap<String,String>>();
        itemsURI.put("LASLA",new HashMap<String,String>());
        itemsURI.get("LASLA").put("tagset","http://bfm.ens-lsh.fr/IMG/xml/cattex2009.xml");
        itemsURI.get("LASLA").put("website","http://bfm.ens-lsh.fr/article.php3?id_article=176");

        // respStmt informations
        // resps (respId <see above>, [description, person, date])
        def resps = new HashMap<String,String[]>();
        resps.put("lasla", ["initial tagging","lasla","2010-03-02",""])

        def builder = new Xml2Ana(xmlFile);
        builder.setCorrespondances(correspRef, correspType);
        builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
        //builder.setAddTEIHeader();
        return builder;
    }
}