1 |
1 |
package org.txm.importer.xtz;
|
2 |
2 |
|
3 |
3 |
import java.io.File;
|
|
4 |
import java.io.FileFilter;
|
|
5 |
import java.util.ArrayList;
|
|
6 |
import java.util.Arrays;
|
|
7 |
import java.util.Collections;
|
4 |
8 |
import java.util.List;
|
5 |
9 |
import java.util.logging.Level;
|
6 |
10 |
|
... | ... | |
14 |
18 |
import org.txm.utils.logger.Log;
|
15 |
19 |
|
16 |
20 |
public class ImportModule {
|
17 |
|
|
|
21 |
|
18 |
22 |
public Project project;
|
19 |
|
|
|
23 |
|
20 |
24 |
public String corpusVersionProduced;
|
21 |
|
|
|
25 |
|
22 |
26 |
public File sourceDirectory;
|
|
27 |
|
23 |
28 |
public File binaryDirectory;
|
24 |
|
|
|
29 |
|
25 |
30 |
public Importer importer;
|
|
31 |
|
26 |
32 |
public Annotater annotater;
|
|
33 |
|
27 |
34 |
public Compiler compiler;
|
|
35 |
|
28 |
36 |
public Pager pager;
|
29 |
|
|
|
37 |
|
30 |
38 |
/**
|
31 |
39 |
* set the variable to false to stop the import process at next step
|
32 |
40 |
*/
|
33 |
41 |
public boolean isSuccessful = true;
|
|
42 |
|
34 |
43 |
public String reason = "none";
|
|
44 |
|
35 |
45 |
public boolean debug = false;
|
|
46 |
|
36 |
47 |
public boolean multithread = false;
|
|
48 |
|
37 |
49 |
public boolean updateCorpus = false;
|
|
50 |
|
38 |
51 |
public String corpusName;
|
39 |
|
|
|
52 |
|
40 |
53 |
IProgressMonitor monitor;
|
41 |
|
|
|
54 |
|
42 |
55 |
public void setMonitor(IProgressMonitor monitor) {
|
43 |
56 |
this.monitor = monitor;
|
44 |
57 |
}
|
45 |
|
|
46 |
|
|
|
58 |
|
|
59 |
|
47 |
60 |
public boolean isMultiThread() {
|
48 |
61 |
return multithread;
|
49 |
62 |
}
|
50 |
|
|
|
63 |
|
51 |
64 |
public boolean isDebugging() {
|
52 |
65 |
return debug;
|
53 |
66 |
}
|
54 |
|
|
|
67 |
|
55 |
68 |
/**
 * Creates the module and immediately configures it from the given project
 * by delegating to {@link #init(Project)}.
 *
 * @param p the project to import
 */
public ImportModule(Project p) {
	this.init(p);
}
|
58 |
|
|
|
71 |
|
59 |
72 |
public boolean isUpdatingCorpus() {
|
60 |
73 |
return updateCorpus;
|
61 |
74 |
}
|
62 |
|
|
|
75 |
|
63 |
76 |
public void init(Project p) {
|
64 |
77 |
this.project = p;
|
65 |
|
|
|
78 |
|
66 |
79 |
corpusName = project.getName();
|
67 |
|
//this.debug = "true".equals(project.getKeyValueParameters().get(ImportKeys.DEBUG));
|
68 |
|
|
|
80 |
// this.debug = "true".equals(project.getKeyValueParameters().get(ImportKeys.DEBUG));
|
|
81 |
|
69 |
82 |
if (Log.getLevel().intValue() < Level.INFO.intValue()) {
|
70 |
83 |
debug = true;
|
71 |
84 |
}
|
72 |
85 |
this.multithread = project.getDoMultiThread();
|
73 |
86 |
this.updateCorpus = project.getDoUpdate();
|
74 |
|
|
75 |
|
|
|
87 |
|
|
88 |
|
76 |
89 |
this.sourceDirectory = project.getSrcdir();
|
77 |
90 |
this.binaryDirectory = project.getProjectDirectory();
|
78 |
|
|
79 |
|
if (!updateCorpus) { // clean directories only if it's a new import
|
|
91 |
|
|
92 |
if (!updateCorpus) { // clean directories only if it's a new import
|
80 |
93 |
File txmDir = new File(binaryDirectory, "txm");
|
81 |
|
//DeleteDir.deleteDirectory(binaryDirectory);
|
|
94 |
// DeleteDir.deleteDirectory(binaryDirectory);
|
82 |
95 |
try {
|
83 |
96 |
p.getRCPProject().getFolder("HTML").delete(true, new LogMonitor("XTZ delete project content"));
|
84 |
97 |
p.getRCPProject().getFolder("cqp").delete(true, new LogMonitor("XTZ delete project content"));
|
... | ... | |
92 |
105 |
DeleteDir.deleteDirectory(new File(binaryDirectory, "registry"));
|
93 |
106 |
DeleteDir.deleteDirectory(new File(binaryDirectory, "tokenized"));
|
94 |
107 |
DeleteDir.deleteDirectory(new File(binaryDirectory, "txm"));
|
95 |
|
} catch (CoreException e) {
|
|
108 |
}
|
|
109 |
catch (CoreException e) {
|
96 |
110 |
e.printStackTrace();
|
97 |
111 |
}
|
98 |
112 |
txmDir.mkdirs();
|
99 |
113 |
}
|
100 |
114 |
}
|
101 |
|
|
|
115 |
|
102 |
116 |
public void start() throws InterruptedException {
|
103 |
|
|
|
117 |
|
104 |
118 |
binaryDirectory.mkdirs(); // ensure output exists
|
105 |
|
//System.out.println("ImportModule.start");
|
|
119 |
// System.out.println("ImportModule.start");
|
106 |
120 |
if (!updateCorpus) { // create XML-TXM files and annotate
|
107 |
121 |
System.out.println(TXMCoreMessages.creatingCorpus);
|
108 |
122 |
if (importer != null) {
|
109 |
|
//System.out.println("ImportModule.start: importer: "+importer);
|
|
123 |
// System.out.println("ImportModule.start: importer: "+importer);
|
110 |
124 |
if (monitor != null) monitor.subTask("-- IMPORTER - Reading source files");
|
111 |
|
|
|
125 |
|
112 |
126 |
System.out.println("-- IMPORTER - Reading source files");
|
113 |
127 |
importer.process();
|
114 |
|
//importer.checkFiles();
|
|
128 |
// importer.checkFiles();
|
115 |
129 |
isSuccessful = isSuccessful & importer.isSuccessFul();
|
116 |
130 |
if (!isSuccessful) {
|
117 |
|
System.out.println("Error while importing corpus during 'importer' step, reason="+importer.getReason());
|
|
131 |
System.out.println("Error while importing corpus during 'importer' step, reason=" + importer.getReason());
|
118 |
132 |
return;
|
119 |
133 |
}
|
120 |
|
} else {
|
121 |
|
System.out.println("XML-TXM files already produced in "+new File(binaryDirectory, "txm/"+corpusName));
|
122 |
134 |
}
|
|
135 |
else {
|
|
136 |
System.out.println("XML-TXM files already produced in " + new File(binaryDirectory, "txm/" + corpusName));
|
|
137 |
}
|
123 |
138 |
|
124 |
|
//System.out.println("GET FILES ORDER");
|
|
139 |
// System.out.println("GET FILES ORDER");
|
125 |
140 |
final List<String> orderedTextIDs = getTXMFilesOrder();
|
126 |
|
|
127 |
|
//declare in the right order the new texts produced in the "txm" directory
|
|
141 |
|
|
142 |
// declare in the right order the new texts produced in the "txm" directory
|
128 |
143 |
for (File build : new File(binaryDirectory, "txm").listFiles()) {
|
129 |
144 |
if (!build.isDirectory()) continue;
|
130 |
145 |
|
131 |
146 |
for (String name : orderedTextIDs) {
|
132 |
|
File xmltxmFile = new File(build, name+".xml");
|
|
147 |
File xmltxmFile = new File(build, name + ".xml");
|
133 |
148 |
if (xmltxmFile.isDirectory()) continue;
|
134 |
149 |
if (xmltxmFile.isHidden()) continue;
|
135 |
|
|
136 |
|
if (project.getText(name) != null) {
|
|
150 |
|
|
151 |
if (project.getText(name) == null) { // if text does not exists create it
|
137 |
152 |
Text t = new Text(project);
|
138 |
153 |
t.setName(name);
|
139 |
154 |
t.setTXMFile(xmltxmFile);
|
... | ... | |
141 |
156 |
}
|
142 |
157 |
}
|
143 |
158 |
}
|
144 |
|
|
|
159 |
|
145 |
160 |
boolean annotate = project.getAnnotate();
|
146 |
161 |
if (annotate && annotater != null) {
|
147 |
162 |
if (monitor != null) monitor.subTask("-- ANNOTATE - Running NLP tools");
|
... | ... | |
149 |
164 |
annotater.process();
|
150 |
165 |
isSuccessful = isSuccessful & annotater.isSuccessFul();
|
151 |
166 |
if (!isSuccessful) {
|
152 |
|
System.out.println("Error while importing corpus during 'annotate' step, reason="+annotater.getReason());
|
|
167 |
System.out.println("Error while importing corpus during 'annotate' step, reason=" + annotater.getReason());
|
153 |
168 |
return;
|
154 |
169 |
}
|
155 |
|
} else {
|
156 |
|
//System.out.println("XML-TXM files already annotated.");
|
157 |
170 |
}
|
158 |
|
} else { // updating the corpus
|
|
171 |
else {
|
|
172 |
// System.out.println("XML-TXM files already annotated.");
|
|
173 |
}
|
|
174 |
}
|
|
175 |
else { // updating the corpus
|
159 |
176 |
System.out.println(TXMCoreMessages.updatingCorpus);
|
160 |
177 |
// fixing Text XML-TXM configurations
|
161 |
178 |
for (Text text : project.getTexts()) {
|
162 |
179 |
File f = text.getXMLTXMFile();
|
163 |
180 |
if (f == null || !f.exists()) { // ensure the XML-TXM file path is set
|
164 |
|
f = new File(project.getProjectDirectory(), "txm/"+project.getName()+"/"+text.getName()+".xml");
|
|
181 |
f = new File(project.getProjectDirectory(), "txm/" + project.getName() + "/" + text.getName() + ".xml");
|
165 |
182 |
text.setTXMFile(f);
|
166 |
183 |
}
|
167 |
184 |
}
|
168 |
185 |
}
|
169 |
|
|
|
186 |
|
170 |
187 |
// XML-TXM files are ready to be compiled
|
171 |
188 |
final List<String> orderedTextIDs = getTXMFilesOrder();
|
172 |
|
Thread Tcompiler = new Thread("XTZ Compiler - "+project.getSrcdir().getName()) {
|
|
189 |
Thread Tcompiler = new Thread("XTZ Compiler - " + project.getSrcdir().getName()) {
|
|
190 |
|
|
191 |
@Override
|
173 |
192 |
public void run() {
|
174 |
193 |
if (compiler != null) {
|
175 |
|
if (monitor != null) monitor.subTask("-- COMPILING - Building Search Engine indexes");
|
|
194 |
if (monitor != null) monitor.subTask("-- COMPILING - Building Search Engine indexes");
|
176 |
195 |
|
177 |
196 |
System.out.println("-- COMPILING - Building Search Engine indexes");
|
178 |
197 |
compiler.process(orderedTextIDs);
|
179 |
198 |
isSuccessful = isSuccessful & compiler.isSuccessFul();
|
180 |
199 |
if (!isSuccessful) {
|
181 |
|
System.out.println("Error while importing corpus during 'compiler' step, reason="+compiler.getReason());
|
|
200 |
System.out.println("Error while importing corpus during 'compiler' step, reason=" + compiler.getReason());
|
182 |
201 |
return;
|
183 |
202 |
}
|
184 |
|
} else {
|
|
203 |
}
|
|
204 |
else {
|
185 |
205 |
System.out.println("No CQP index created.");
|
186 |
206 |
}
|
187 |
207 |
}
|
188 |
208 |
};
|
189 |
209 |
|
190 |
|
Thread Tpager = new Thread("XTZ Pager - "+project.getSrcdir().getName()) {
|
|
210 |
Thread Tpager = new Thread("XTZ Pager - " + project.getSrcdir().getName()) {
|
|
211 |
|
|
212 |
@Override
|
191 |
213 |
public void run() {
|
192 |
|
|
|
214 |
|
193 |
215 |
if (pager != null) {
|
194 |
216 |
if (monitor != null) monitor.subTask("-- EDITION - Building editions");
|
195 |
217 |
|
... | ... | |
197 |
219 |
pager.process(orderedTextIDs);
|
198 |
220 |
isSuccessful = isSuccessful & pager.isSuccessFul();
|
199 |
221 |
if (!isSuccessful) {
|
200 |
|
System.out.println("Error while importing corpus during 'pager' step, reason="+pager.getReason());
|
|
222 |
System.out.println("Error while importing corpus during 'pager' step, reason=" + pager.getReason());
|
201 |
223 |
return;
|
202 |
224 |
}
|
203 |
|
} else {
|
|
225 |
}
|
|
226 |
else {
|
204 |
227 |
System.out.println("No edition produced.");
|
205 |
228 |
}
|
206 |
229 |
}
|
207 |
230 |
};
|
208 |
|
|
|
231 |
|
209 |
232 |
Tcompiler.start();
|
210 |
|
if (!multithread) { // && !updateCorpus
|
|
233 |
if (!multithread) { // && !updateCorpus
|
211 |
234 |
Tcompiler.join(); // wait for the end if not multithreaded
|
212 |
235 |
if (!isSuccessful) { // don't call pager is compiler step failed
|
213 |
236 |
return;
|
214 |
237 |
}
|
215 |
238 |
}
|
216 |
|
|
|
239 |
|
217 |
240 |
Tpager.start();
|
218 |
241 |
if (multithread) Tcompiler.join(); // wait for both threads to end
|
219 |
242 |
Tpager.join();
|
220 |
|
|
|
243 |
|
221 |
244 |
if (isSuccessful) { // all done TODO remove this code when Text._compute() will be implemented
|
222 |
245 |
for (Text t : project.getTexts()) {
|
223 |
246 |
t.setDirty(false);
|
... | ... | |
226 |
249 |
project.setDoUpdate(false);
|
227 |
250 |
}
|
228 |
251 |
}
|
229 |
|
|
|
252 |
|
230 |
253 |
protected List<String> getTXMFilesOrder() {
|
231 |
|
// //System.out.println("DEFAULT FILES ORDER");
|
232 |
|
// File txmDirectory = new File(binaryDirectory, "txm/"+corpusName);
|
233 |
|
// ArrayList<File> files = new ArrayList<File>(Arrays.asList(txmDirectory.listFiles(new FileFilter() {
|
234 |
|
// @Override
|
235 |
|
// public boolean accept(File file) {
|
236 |
|
// return file.isFile() && file.getName().endsWith(".xml");
|
237 |
|
// }
|
238 |
|
// })));
|
239 |
|
//
|
240 |
|
// Collections.sort(files);
|
241 |
|
return project.getTextsID();
|
242 |
|
// return files;
|
|
254 |
// //System.out.println("DEFAULT FILES ORDER");
|
|
255 |
File txmDirectory = new File(binaryDirectory, "txm/" + corpusName);
|
|
256 |
ArrayList<File> files = new ArrayList<>(Arrays.asList(txmDirectory.listFiles(new FileFilter() {
|
|
257 |
|
|
258 |
@Override
|
|
259 |
public boolean accept(File file) {
|
|
260 |
return file.isFile() && file.getName().endsWith(".xml");
|
|
261 |
}
|
|
262 |
})));
|
|
263 |
|
|
264 |
Collections.sort(files);
|
|
265 |
ArrayList<String> ids = new ArrayList<>();
|
|
266 |
for (File f : files) {
|
|
267 |
String name = f.getName();
|
|
268 |
ids.add(name.substring(0, name.length() - 4));
|
|
269 |
}
|
|
270 |
|
|
271 |
return ids;
|
|
272 |
// return project.getTextsID();
|
|
273 |
// return files;
|
243 |
274 |
}
|
244 |
|
|
|
275 |
|
245 |
276 |
public void end() {
|
246 |
277 |
File paramFile = new File(binaryDirectory, "import.xml");
|
247 |
278 |
try {
|
248 |
|
//DomUtils.save(project.root.getOwnerDocument(), paramFile);
|
|
279 |
// DomUtils.save(project.root.getOwnerDocument(), paramFile);
|
249 |
280 |
project.saveParameters(true);
|
250 |
281 |
isSuccessful = true;
|
251 |
|
} catch (Exception e) {
|
|
282 |
}
|
|
283 |
catch (Exception e) {
|
252 |
284 |
// TODO Auto-generated catch block
|
253 |
285 |
e.printStackTrace();
|
254 |
286 |
isSuccessful = false;
|
255 |
287 |
}
|
256 |
288 |
}
|
257 |
|
|
|
289 |
|
258 |
290 |
public String getCorpusName() {
|
259 |
291 |
return corpusName;
|
260 |
292 |
}
|
261 |
|
|
|
293 |
|
262 |
294 |
public String getReason() {
|
263 |
295 |
return reason;
|
264 |
296 |
}
|
265 |
|
|
|
297 |
|
266 |
298 |
public boolean isSuccessFul() {
|
267 |
299 |
return isSuccessful;
|
268 |
300 |
}
|
269 |
|
|
|
301 |
|
270 |
302 |
public Project getProject() {
|
271 |
303 |
return project;
|
272 |
304 |
}
|
273 |
|
|
|
305 |
|
274 |
306 |
public File getSourceDirectory() {
|
275 |
307 |
return sourceDirectory;
|
276 |
308 |
}
|
277 |
|
|
|
309 |
|
278 |
310 |
public File getBinaryDirectory() {
|
279 |
311 |
return binaryDirectory;
|
280 |
312 |
}
|
281 |
|
|
|
313 |
|
282 |
314 |
public void process() throws InterruptedException {
|
283 |
315 |
start();
|
284 |
316 |
if (isSuccessful)
|
285 |
317 |
end();
|
286 |
318 |
}
|
287 |
|
|
|
319 |
|
288 |
320 |
public static void main(String[] args) {
|
289 |
|
// File projectFile = new File("/home/mdecorde/xml/brown/import.xml");
|
|
321 |
// File projectFile = new File("/home/mdecorde/xml/brown/import.xml");
|
290 |
322 |
//
|
291 |
|
// ImportModule module = new ImportModule(projectFile);
|
292 |
|
// System.out.println("Parameters: "+module.getParameters());
|
293 |
|
// try {
|
294 |
|
// module.start();
|
|
323 |
// ImportModule module = new ImportModule(projectFile);
|
|
324 |
// System.out.println("Parameters: "+module.getParameters());
|
|
325 |
// try {
|
|
326 |
// module.start();
|
295 |
327 |
//
|
296 |
|
// if (module.isSuccessful) {
|
297 |
|
// System.out.println("Import sucessful. reloading corpora...");
|
298 |
|
// } else {
|
299 |
|
// System.out.println("Import failed, reason = "+module.getReason());
|
300 |
|
// }
|
301 |
|
// } catch (Exception e) {
|
302 |
|
// e.printStackTrace();
|
303 |
|
// }
|
|
328 |
// if (module.isSuccessful) {
|
|
329 |
// System.out.println("Import sucessful. reloading corpora...");
|
|
330 |
// } else {
|
|
331 |
// System.out.println("Import failed, reason = "+module.getReason());
|
|
332 |
// }
|
|
333 |
// } catch (Exception e) {
|
|
334 |
// e.printStackTrace();
|
|
335 |
// }
|
304 |
336 |
}
|
305 |
337 |
}
|