root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / frantext / frantextLoader.groovy @ 1000
History | View | Annotate | Download (7.8 kB)
1 | 321 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 321 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 321 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 321 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 321 | mdecorde | //
|
6 | 321 | mdecorde | //
|
7 | 321 | mdecorde | // This file is part of the TXM platform.
|
8 | 321 | mdecorde | //
|
9 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it and/or modify
|
10 | 321 | mdecorde | // it under the terms of the GNU General Public License as published by
|
11 | 321 | mdecorde | // the Free Software Foundation, either version 3 of the License, or
|
12 | 321 | mdecorde | // (at your option) any later version.
|
13 | 321 | mdecorde | //
|
14 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be useful,
|
15 | 321 | mdecorde | // but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16 | 321 | mdecorde | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17 | 321 | mdecorde | // GNU General Public License for more details.
|
18 | 321 | mdecorde | //
|
19 | 321 | mdecorde | // You should have received a copy of the GNU General Public License
|
20 | 321 | mdecorde | // along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
|
21 | 321 | mdecorde | //
|
22 | 321 | mdecorde | //
|
23 | 321 | mdecorde | //
|
24 | 321 | mdecorde | //
|
25 | 321 | mdecorde | //
|
26 | 321 | mdecorde | // $LastChangedDate: 2012-02-13 11:42:24 +0100 (lun., 13 févr. 2012) $
|
27 | 321 | mdecorde | // $LastChangedRevision: 2126 $
|
28 | 321 | mdecorde | // $LastChangedBy: mdecorde $
|
29 | 321 | mdecorde | //
|
30 | 986 | mdecorde | package org.txm.scripts.importer.frantext;
|
31 | 321 | mdecorde | |
32 | 927 | mdecorde | import javax.xml.stream.XMLStreamReader; |
33 | 1000 | mdecorde | import org.txm.importer.ApplyXsl2; |
34 | 986 | mdecorde | import org.txm.scripts.importer.bfm.importer; |
35 | 986 | mdecorde | import org.txm.scripts.importer.bfm.compiler; |
36 | 986 | mdecorde | import org.txm.scripts.importer.bfm.pager; |
37 | 321 | mdecorde | import org.txm.objects.*; |
38 | 321 | mdecorde | import org.txm.*; |
39 | 927 | mdecorde | import org.txm.core.engines.*; |
40 | 321 | mdecorde | import org.txm.utils.i18n.*; |
41 | 1000 | mdecorde | import org.txm.importer.scripts.xmltxm.*; |
42 | 986 | mdecorde | import org.txm.scripts.importer.*; |
43 | 321 | mdecorde | import org.w3c.dom.Element |
44 | 479 | mdecorde | import org.txm.utils.xml.DomUtils; |
45 | 927 | mdecorde | import org.txm.utils.* |
46 | 928 | mdecorde | import org.txm.utils.io.*; |
47 | 321 | mdecorde | |
// Bootstrap: resolve the import parameters and the progress monitor.
// "paramsBinding" and "monitor" are not declared in this script, so they are
// looked up in the script binding; if either lookup fails the script switches
// to a hard-coded developer configuration.
String userDir = System.getProperty("user.home")
boolean debug = org.txm.utils.logger.Log.isPrintingErrors()
def MONITOR
BaseParameters params
try {
	params = paramsBinding
	MONITOR = monitor
} catch (Exception ignored) {
	// reaching this point means the script is being run standalone
	println "DEV MODE"
	debug = true
	params = new BaseParameters(new File(userDir, "xml/TESTS/frantext/import.xml"))
	params.load()
	if (!Toolbox.isInitialized()) {
		// minimal Toolbox configuration for a standalone run
		Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"))
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8")
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",")
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"")
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"))
	}
}
if (params == null) {
	println "no parameters. Aborting"
	return
}
68 | 321 | mdecorde | |
// Corpus settings read from the import parameters.
String corpusname = params.getCorpusName()
Element corpusElem = params.corpora.get(corpusname)
String basename = params.name
String rootDir = params.rootDir

// attributes carried by the corpus element
String lang = corpusElem.getAttribute("lang")
String model = lang // the annotation model is selected by the corpus language
String encoding = corpusElem.getAttribute("encoding")
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"))

// XSL configuration declared for the corpus
def xsltElem = params.getXsltElement(corpusElem)
String xsl = xsltElem.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem)

// settings of the "default" edition
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")
81 | 321 | mdecorde | |
// Working directories: sources are read from srcDir, and the binary corpus is
// rebuilt from scratch under <TXM home>/corpora/<basename>.
File srcDir = new File(rootDir)
File binDir = new File(Toolbox.getTxmHomePath(), "corpora/" + basename)
binDir.deleteDir()
binDir.mkdirs()
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return
}

// directory receiving the validated source files of this corpus
File txmDir = new File(binDir, "txm/$corpusname")
txmDir.deleteDir()
txmDir.mkdirs()
94 | 321 | mdecorde | |
// BFM XPATH PARAMETERS
// Loads the optional "import.properties" file found in the source directory
// into metadataXPath. The properties are later handed to the importer, the
// compiler and the pager; when the file is missing they stay empty.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(1, "READ XPATH PARAMETERS")
File paramDeclarationFile = new File(srcDir, "import.properties");
Properties metadataXPath = new Properties();
if (paramDeclarationFile.exists() && paramDeclarationFile.canRead()) {
	// withReader closes the reader even when Properties.load() throws
	paramDeclarationFile.withReader("UTF-8") { reader ->
		metadataXPath.load(reader)
	}

	if (!metadataXPath.containsKey("titre")) {
		println "Warning: parameters property file does not contain the 'titre' metadata"
	}
	if (!metadataXPath.containsKey("forme")) {
		println "Warning: parameters property file does not contain the 'forme' metadata in param values. The default value is 'prose'"
	}
} else {
	println "No '$paramDeclarationFile' file found"
}
112 | 321 | mdecorde | |
// Apply XSL
// The Frantext-to-BFM stylesheet is always used here, whatever stylesheet the
// import parameters declare; its output becomes the new source directory.
if (MONITOR != null) MONITOR.worked(5, "APPLYING XSL")
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
File xslFile = new File(Toolbox.getTxmHomePath(), "xsl/txm-filter-teifrantext-teibfm.xsl") // force xsl
File xslOutputDir = new File(binDir, "xsl")

if (!ApplyXsl2.processImportSources(xslFile, srcDir, xslOutputDir)) {
	// report the stylesheet that was actually applied, not the configured one
	println "Error: failed to apply xsl $xslFile. Aborting import."
	return; // error during process
}
srcDir = xslOutputDir;
123 | 321 | mdecorde | |
// copy txm files
// Every well-formed XML file of srcDir is copied into txmDir; hidden files,
// "import.xml", metadata files and property files are not corpus texts and
// are skipped.
println "-- VALIDATION - checking XML source files well-formedness"
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
List<File> srcfiles = srcDir.listFiles();
for (File f : srcfiles) { // check XML format, and copy file into binDir
	String fileName = f.getName()
	if (f.isHidden() || fileName.equals("import.xml") || fileName.matches("metadata\\.....?") || fileName.endsWith(".properties")) {
		continue;
	}
	if (ValidateXml.test(f)) {
		FileCopy.copy(f, new File(txmDir, fileName));
	} else {
		println "Won't process file "+f;
	}
}
if (MONITOR != null) MONITOR.worked(5)

// File.listFiles() yields an empty array (not null) for an empty directory,
// so both cases must be tested to detect that nothing was copied
File[] txmFiles = txmDir.listFiles()
if (txmFiles == null || txmFiles.length == 0) {
	println "No txm file to process"
	return;
}
143 | 321 | mdecorde | |
// IMPORTER step: run the BFM importer on the prepared files and abort the
// whole import when it reports a failure.
MONITOR?.worked(1, "IMPORTER")
if (MONITOR?.isCanceled()) { return MONITOR.done() }
println "-- IMPORTER - Reading source files"
boolean importerOk = new importer().run(srcDir, binDir, txmDir, basename, metadataXPath)
if (!importerOk) {
	println "import process stopped"
	return
}
152 | 321 | mdecorde | |
// ANNOTATE step: optionally run the TreeTagger annotation engine on txmDir.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")
println "-- ANNOTATE - Running NLP tools - $model model"
// Stays false unless the annotation was requested AND succeeded; the value
// is later passed to the compiler through setAnnotationDone().
boolean annotate_status = false;
if (annotate) {
	def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
	if (engine.processDirectory(txmDir, binDir, ["lang":model])) {
		annotate_status = true;
	}
}
163 | 321 | mdecorde | |
// COMPILING step: configure the BFM compiler and run it; the import is
// aborted when the compiler reports a failure.
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler()
if (debug) {
	c.setDebug()
}
//c.setCwbPath("~/TXM/cwb/bin");
c.setLang(lang)
c.setAnnotationDone(annotate_status)
boolean compilerOk = c.run(binDir, txmDir, corpusname, metadataXPath)
if (!compilerOk) {
	println "import process stopped"
	return
}
176 | 321 | mdecorde | |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// EDITION step. The HTML/<corpus> directory is always recreated empty; the
// "default" edition pages are only produced when build_edition is set.
File htmlDir = new File(binDir, "HTML/$corpusname")
htmlDir.deleteDir();
htmlDir.mkdirs();
if (build_edition) {

	println "-- EDITION"
	if (MONITOR != null) MONITOR.worked(25, "EDITION")

	File outdir = new File(binDir,"/HTML/$corpusname/default/");
	outdir.mkdirs();
	// declared locally so the value does not leak into the script binding
	def files = c.getOrderedTxmFiles();
	println("Building editions: "+files.size()+" files" );

	for (File txmFile : files) {
		print "."
		// text name = file name without its last extension
		String txtname = txmFile.getName();
		int idx = txtname.lastIndexOf(".");
		if (idx > 0) txtname = txtname.substring(0, idx);
		List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
		List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);//["'","(","[","{","«"];

		// register the text, then its edition, in the import parameters
		Element text = params.addText(corpusElem, txtname, txmFile);

		def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, corpusname, metadataXPath);
		Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

		// one page entry per generated page file; page names are 1-based
		for (int i = 0 ; i < ed.getPageFiles().size() ; i++) {
			String wordid = ed.getIdx().get(i);
			params.addPage(edition, ""+(i + 1), wordid);
		}
	}
}
// FINALIZING step: save the (possibly enriched) import parameters next to the
// binary corpus.
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml")
DomUtils.save(params.root.getOwnerDocument(), paramFile)
// left undeclared so the flag is stored in the script binding
// (presumably read back by the caller -- TODO confirm)
readyToLoad = true