Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / lasla / laslaLoader.groovy @ 187

History | View | Annotate | Download (5 kB)

1
/**
2
 * Main.
3
 *
4
 * @param args the args
5
 */
6
// Copyright © 2010-2013 ENS de Lyon.
7
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
8
// Lyon 2, University of Franche-Comté, University of Nice
9
// Sophia Antipolis, University of Paris 3.
10
// 
11
// The TXM platform is free software: you can redistribute it
12
// and/or modify it under the terms of the GNU General Public
13
// License as published by the Free Software Foundation,
14
// either version 2 of the License, or (at your option) any
15
// later version.
16
// 
17
// The TXM platform is distributed in the hope that it will be
18
// useful, but WITHOUT ANY WARRANTY; without even the implied
19
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
20
// PURPOSE. See the GNU General Public License for more
21
// details.
22
// 
23
// You should have received a copy of the GNU General
24
// Public License along with the TXM platform. If not, see
25
// http://www.gnu.org/licenses.
26
// 
27
// 
28
// 
29
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun., 06 mai 2013) $
30
// $LastChangedRevision: 2386 $
31
// $LastChangedBy: mdecorde $ 
32
//
33
package org.txm.importer.lasla;
34

    
35
import java.io.File;
36
import org.txm.importer.lasla.importer;
37
import org.txm.importer.xml.compiler;
38
import org.txm.importer.xml.pager_old;
39
import org.txm.objects.*;
40
import org.txm.utils.*;
41
import org.txm.*;
42
import org.txm.utils.i18n.*;
43

    
44
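/*
 * LASLA corpus loader script (its statements form the body of
 * groovy.lang.Script#run()).
 *
 * Steps, in order: read the import parameters, run the importer on the
 * source directory, build the search engine indexes, register the corpus
 * in the default project of the workspace, then build the HTML edition.
 */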
48
// Import parameters. They are expected to come from the script binding
// (the *Binding variables below); userDir is only used for the developer defaults.
String userDir = System.getProperty("user.home");
String rootDir, lang, encoding, model, basename;

try {
    rootDir = rootDirBinding;
    lang = langBinding;
    encoding = encodingBinding;
    model = modelBinding;
    basename = basenameBinding;
} catch (Exception ignored) {
    // Any failure while reading the bindings is treated as "running in development".
    println "DEV MODE";

    // Developer defaults are only installed when the Toolbox has not been initialized yet.
    if (!Toolbox.isInitialized()) {
        File txmHome = new File(userDir, "TXM");
        File treeTaggerHome = new File(userDir, "treetagger");

        rootDir = userDir + "/xml/lasla/";
        basename = "latin";
        lang = "fr";
        encoding = "ISO-8859-1";
        model = "rgaqcj"; // not used

        Toolbox.workspace = new Workspace(new File(txmHome, "workspaces/default.xml"));
        Toolbox.setParam(Toolbox.INSTALL_DIR, "/usr/lib/TXM");
        Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, treeTaggerHome);
        Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(treeTaggerHome, "models"));
        Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
        Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
        Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
        Toolbox.setParam(Toolbox.USER_TXM_HOME, txmHome);
    }
}
74

    
75
//String basename = new File(rootDir).getName().toLowerCase()
76
// Step 1: run the LASLA importer over the source directory (rootDir).
// A falsy result from the importer aborts the whole script.
println "-- IMPORTER - Reading source files"
def laslaImporter = new importer();
File sourceDir = new File(rootDir);
def imported = laslaImporter.run(sourceDir, encoding, basename);
if (!imported) {
    println "import process stopped";
    return;
}
83

    
84
// Binary corpus directory: <USER_TXM_HOME>/corpora/<basename>.
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "corpora/" + basename);

// NOTE(review): `files` is not declared here, so this assignment stores the
// listing of binDir/txm in the script binding. The compile step below works on
// the listing of binDir/src instead. This looks like a leftover -- confirm that
// no caller reads the `files` binding before removing it.
File txmfiles = new File(binDir, "txm");
files = txmfiles.listFiles()

// Step 2: build the search engine indexes from the files of binDir/src.
println "-- COMPILING - Building Search Engine indexes"

// File.listFiles() returns null when the directory does not exist or cannot be
// read; treat that the same way as an empty directory instead of failing on null.
File[] srcFiles = new File(binDir, "src").listFiles()
if (srcFiles == null || srcFiles.length == 0) {
    println "No XML file (extension '.xml') has been found in directory "+binDir
    return;
}

def indexCompiler = new compiler();
indexCompiler.setDebug();
//indexCompiler.setCwbPath("~/TXM/cwb/bin");
indexCompiler.setAnnotationSuccess(false)
indexCompiler.setLang(lang);
// A falsy result from the compiler aborts the whole script.
if (!indexCompiler.run(binDir, basename, null, srcFiles, null)) {
    println "import process stopped";
    return;
}
110

    
111
// Copy the registry file produced for this corpus (binDir/registry/<basename>)
// into <USER_TXM_HOME>/registry, if the compile step created one.
File builtRegistry = new File(binDir, "registry/" + basename);
if (builtRegistry.exists()) {
    File sharedRegistry = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "registry/" + basename);
    FileCopy.copy(builtRegistry, sharedRegistry)
}

// Step 3: (re)register the corpus in the "default" project of the workspace.
// Any base previously registered under the same name is removed first.
Workspace w = Toolbox.workspace;
Project defaultProject = w.getProject("default")
defaultProject.removeBase(basename)
Base b = defaultProject.addBase(basename);

File txmDir = new File(binDir, "txm");
b.addDirectory(txmDir);
b.setAttribute("lang", lang)
b.propagateAttribute("lang")
123

    
124
// Step 4: build the "default" HTML edition of every text of the corpus,
// then save the workspace.
println "-- EDITION - Building edition"

// Start from an empty binDir/HTML/default directory tree.
File htmlDir = new File(binDir, "HTML");
htmlDir.deleteDir();
new File(htmlDir, "default").mkdirs();

// Number of texts already announced on the console; used only to format the progress output.
def textCount = 0

println "Paginating text: "
for (String textname : b.getTextsID()) {
    Text text = b.getText(textname);
    File srcfile = text.getSource();

    // Result file name = source file name with its extension replaced by ".html".
    // The extension is located with lastIndexOf('.') rather than assumed to be
    // four characters long, so short or extension-less names no longer fail.
    String srcName = srcfile.getName();
    int dot = srcName.lastIndexOf('.');
    String stem = (dot > 0) ? srcName.substring(0, dot) : srcName;
    File resultfile = new File(htmlDir, stem + ".html");

    List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
    List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

    // Progress output: comma separated file names, with a line break after every five names.
    if (textCount) { print(", ") }
    if (textCount > 0 && (textCount % 5) == 0) println ""
    print(srcName);
    textCount++

    def ed = new pager_old(srcfile, resultfile, NoSpaceBefore, NoSpaceAfter, 500, basename, "br");

    // Register each page file produced by the pager, together with the index
    // entry found at the same position in ed.getIdx().
    Edition editionweb = text.addEdition("default", "html", resultfile);
    def pageFiles = ed.getPageFiles();
    def pageIndexes = ed.getIdx();
    for (int i = 0; i < pageFiles.size(); i++) {
        File f = pageFiles.get(i);
        String idx = pageIndexes.get(i);
        editionweb.addPage(f, idx);
    }

//        Edition editionbp = text.addEdition("onepage","html",resultfile);
//        editionbp.addPage(resultfile,ed.getIdx().get(0));
}

w.save()
println "done"
164

    
165