Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / factiva / factivamailLoader.groovy @ 479

History | View | Annotate | Download (5.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2012-07-20 17:33:42 +0200 (ven., 20 juil. 2012) $
25
// $LastChangedRevision: 2221 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.factiva;
29

    
30
import org.txm.importer.alceste.importer;
31
import org.txm.importer.alceste.compiler;
32
import org.txm.importer.xml.pager_old;
33
import org.txm.objects.*;
34
import org.txm.utils.*;
35
import org.txm.*;
36
import org.txm.scripts.teitxm.*;
37
import org.txm.utils.i18n.*;
38
import org.w3c.dom.Element
39
import org.txm.utils.xml.DomUtils;
40

    
41
// Home directory of the current user; base of every developer-mode path below.
String userDir = System.getProperty("user.home");

// MONITOR stays null when no progress monitor is available.
def MONITOR;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
        // "paramsBinding" and "monitor" are not declared in this script:
        // they are expected to be supplied through the script binding.
        params = paramsBinding;
        MONITOR = monitor;
} catch (Exception devModeTrigger) {
        // Any failure to read the binding means the script is being debugged:
        // fall back to a hard-coded developer configuration.
        println "DEV MODE";
        debug = true
        params = new BaseParameters(new File(userDir, "xml/factivatxt/import.xml"))
        params.load()
        if (!org.txm.Toolbox.isInitialized()) {
                // Minimal Toolbox settings for a standalone run.
                Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
                Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, new File(userDir, "treetagger"));
                Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(userDir, "treetagger/models"));
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
        }
}

// Nothing can be imported without parameters.
if (params == null) {
        println "no parameters. Aborting";
        return;
}
63

    
64
// Look up the DOM element describing the corpus named in the parameters.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);

// Global import settings.
String basename = params.name;
String rootDir = params.rootDir;

// Per-corpus settings stored as attributes of the corpus element.
String encoding = corpusElem.getAttribute("encoding");
String lang = corpusElem.getAttribute("lang");
// The annotation model is named after the corpus language
// (".par" is appended where the annotation step is run).
String model = lang
// Annotation is enabled only when the attribute is exactly "true".
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));

// XSLT settings; NOTE(review): not referenced by the rest of this script — confirm they are still needed.
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);
75

    
76
// Safety guard: with an empty base name, binDir below would resolve to the
// "corpora" directory itself and the recursive deleteDir() would erase
// everything stored under it.
if (basename == null || basename.trim().length() == 0) {
        println "Invalid corpus base name '"+basename+"'. Aborting"
        return;
}

// Source directory given by the import parameters.
File srcDir = new File(rootDir);

// Binary corpus directory: <USER_TXM_HOME>/corpora/<basename>, rebuilt from scratch.
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME),"corpora/"+basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
        println "Could not create binDir "+binDir
        return;
}

// Directory <binDir>/txm/<corpusname>, handed to the import, annotate and compile steps.
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
88

    
89
// FACTIVA -> ALCESTE
// The content of srcDir is converted into a single file,
// <binDir>/src/<name of srcDir>.txt, which becomes the input of the importer.
File newSrcDir = new File(binDir, "src")
newSrcDir.mkdir();
File alcesteFile = new File(newSrcDir, srcDir.getName()+".txt");
FactivaMail2Alceste converter = new FactivaMail2Alceste(srcDir, alcesteFile, "UTF-8", "UTF-8")
boolean converted = converter.doparse()
if (!converted) {
        println "Failed to convert files in $srcDir to one Alceste file"
        return false;
}

// Report progress, then stop here if the monitor was canceled.
if (MONITOR != null) {
        MONITOR.worked(1, "IMPORTER")
        if (MONITOR.isCanceled()) {
                return MONITOR.done();
        }
}
100
// IMPORTER step: run the Alceste importer on the converted source directory.
println "-- IMPORTER - Reading source files"

boolean imported = new importer().run(newSrcDir, binDir, txmDir, encoding, basename, lang)
if (!imported) {
        // The importer reported a failure: abort the whole import.
        println "import process stopped";
        return;
}
106

    
107
// ANNOTATE step.
if (MONITOR != null) {
        MONITOR.worked(20, "ANNOTATE")
}
println "-- ANNOTATE - Running NLP tools"

// Annotate is only run when annotation was requested (short-circuit on "annotate");
// the flag is true only if it was requested AND the run succeeded.
boolean annotationSuccess = annotate && new Annotate().run(binDir, txmDir, model+".par");
113

    
114
// COMPILING step.
if (MONITOR != null) {
        MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"

// Configure the compiler with the outcome of the annotation step,
// the debug flag and the corpus language, then run it.
def corpusCompiler = new compiler();
corpusCompiler.setAnnotationSuccess(annotationSuccess)
if (debug) {
        corpusCompiler.setDebug();
}
corpusCompiler.setLang(lang);

boolean compiled = corpusCompiler.run(binDir, txmDir, corpusname)
if (!compiled) {
        println "import process stopped";
        return;
}
124

    
125
// EDITION step: paginate every file of txmDir into <binDir>/HTML/<corpusname>/default
// and register the texts, their "default" edition and its pages in the parameters.
if (MONITOR != null) MONITOR.worked(20, "EDITION")
println "-- EDITION - Building edition"

// Rebuild the HTML directory of the corpus from scratch.
File htmlDir = new File(binDir,"HTML/$corpusname");
htmlDir.deleteDir();
htmlDir.mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();

// File.listFiles() returns null when the directory cannot be read;
// abort with a message instead of failing later with a NullPointerException.
File[] txmFiles = txmDir.listFiles();
if (txmFiles == null) {
        println "Could not list the files of "+txmDir
        return;
}
// Process the files in the natural order of java.io.File.
List<File> filelist = new ArrayList<File>(Arrays.asList(txmFiles));
Collections.sort(filelist);

println "Paginating texts: "
for (File srcfile : filelist) {
        print "."

        // Text name = file name without its last extension
        // (a leading dot is not treated as an extension separator).
        String txtname = srcfile.getName();
        int dot = txtname.lastIndexOf(".");
        if (dot > 0) txtname = txtname.substring(0, dot);

        // NOTE(review): these two lists look loop-invariant; they could be hoisted
        // if pager_old is confirmed not to modify them.
        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

        Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

        // NOTE(review): 500 is presumably the page size and "pb" the page-break tag — confirm against pager_old.
        def ed = new pager_old(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
        Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

        // Register one page per produced page file: pages are numbered from 1
        // and paired with the entry of ed.getIdx() at the same position.
        int pageCount = ed.getPageFiles().size();
        for (int p = 0; p < pageCount; p++) {
                String wordid = ed.getIdx().get(p);
                params.addPage(edition, ""+(p + 1), wordid);
        }
}
156

    
157
// FINALIZING step.
if (MONITOR != null) {
        MONITOR.worked(20, "FINALIZING")
}

// Save the parameters document as <binDir>/import.xml.
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);

// "readyToLoad" is not declared in this script, so this assignment sets a
// script binding variable; presumably the caller reads it to know the import completed.
readyToLoad = true;