Revision 2473

tmp/org.txm.core/src/java/org/txm/importer/xtz/ImportModule.java (revision 2473)
1 1
package org.txm.importer.xtz;
2 2

  
3 3
import java.io.File;
4
import java.io.FileFilter;
5
import java.util.ArrayList;
6
import java.util.Arrays;
7
import java.util.Collections;
4 8
import java.util.List;
5 9
import java.util.logging.Level;
6 10

  
......
14 18
import org.txm.utils.logger.Log;
15 19

  
16 20
public class ImportModule {
17

  
21
	
18 22
	public Project project;
19

  
23
	
20 24
	public String corpusVersionProduced;
21

  
25
	
22 26
	public File sourceDirectory;
27
	
23 28
	public File binaryDirectory;
24

  
29
	
25 30
	public Importer importer;
31
	
26 32
	public Annotater annotater;
33
	
27 34
	public Compiler compiler;
35
	
28 36
	public Pager pager;
29

  
37
	
30 38
	/**
31 39
	 * set the variable to false to stop the import process at next step
32 40
	 */
33 41
	public boolean isSuccessful = true;
42
	
34 43
	public String reason = "none";
44
	
35 45
	public boolean debug = false;
46
	
36 47
	public boolean multithread = false;
48
	
37 49
	public boolean updateCorpus = false;
50
	
38 51
	public String corpusName;
39

  
52
	
40 53
	IProgressMonitor monitor;
41

  
54
	
42 55
	public void setMonitor(IProgressMonitor monitor) {
43 56
		this.monitor = monitor;
44 57
	}
45

  
46

  
58
	
59
	
47 60
	public boolean isMultiThread() {
48 61
		return multithread;
49 62
	}
50

  
63
	
51 64
	public boolean isDebugging() {
52 65
		return debug;
53 66
	}
54

  
67
	
55 68
	public ImportModule(Project p) {
56 69
		init(p);
57 70
	}
58

  
71
	
59 72
	public boolean isUpdatingCorpus() {
60 73
		return updateCorpus;
61 74
	}
62

  
75
	
63 76
	public void init(Project p) {
64 77
		this.project = p;
65

  
78
		
66 79
		corpusName = project.getName();
67
		//this.debug = "true".equals(project.getKeyValueParameters().get(ImportKeys.DEBUG));
68

  
80
		// this.debug = "true".equals(project.getKeyValueParameters().get(ImportKeys.DEBUG));
81
		
69 82
		if (Log.getLevel().intValue() < Level.INFO.intValue()) {
70 83
			debug = true;
71 84
		}
72 85
		this.multithread = project.getDoMultiThread();
73 86
		this.updateCorpus = project.getDoUpdate();
74

  
75

  
87
		
88
		
76 89
		this.sourceDirectory = project.getSrcdir();
77 90
		this.binaryDirectory = project.getProjectDirectory();
78

  
79
		if (!updateCorpus) { // clean directories only if it's a new import	
91
		
92
		if (!updateCorpus) { // clean directories only if it's a new import
80 93
			File txmDir = new File(binaryDirectory, "txm");
81
			//DeleteDir.deleteDirectory(binaryDirectory);
94
			// DeleteDir.deleteDirectory(binaryDirectory);
82 95
			try {
83 96
				p.getRCPProject().getFolder("HTML").delete(true, new LogMonitor("XTZ delete project content"));
84 97
				p.getRCPProject().getFolder("cqp").delete(true, new LogMonitor("XTZ delete project content"));
......
92 105
				DeleteDir.deleteDirectory(new File(binaryDirectory, "registry"));
93 106
				DeleteDir.deleteDirectory(new File(binaryDirectory, "tokenized"));
94 107
				DeleteDir.deleteDirectory(new File(binaryDirectory, "txm"));
95
			} catch (CoreException e) {
108
			}
109
			catch (CoreException e) {
96 110
				e.printStackTrace();
97 111
			}
98 112
			txmDir.mkdirs();
99 113
		}
100 114
	}
101

  
115
	
102 116
	public void start() throws InterruptedException {
103

  
117
		
104 118
		binaryDirectory.mkdirs(); // ensure output exists
105
		//System.out.println("ImportModule.start");
119
		// System.out.println("ImportModule.start");
106 120
		if (!updateCorpus) { // create XML-TXM files and annotate
107 121
			System.out.println(TXMCoreMessages.creatingCorpus);
108 122
			if (importer != null) {
109
				//System.out.println("ImportModule.start: importer: "+importer);
123
				// System.out.println("ImportModule.start: importer: "+importer);
110 124
				if (monitor != null) monitor.subTask("-- IMPORTER - Reading source files");
111
					
125
				
112 126
				System.out.println("-- IMPORTER - Reading source files");
113 127
				importer.process();
114
				//importer.checkFiles();
128
				// importer.checkFiles();
115 129
				isSuccessful = isSuccessful & importer.isSuccessFul();
116 130
				if (!isSuccessful) {
117
					System.out.println("Error while importing corpus during 'importer' step, reason="+importer.getReason());
131
					System.out.println("Error while importing corpus during 'importer' step, reason=" + importer.getReason());
118 132
					return;
119 133
				}
120
			} else {
121
				System.out.println("XML-TXM files already produced in "+new File(binaryDirectory, "txm/"+corpusName));
122 134
			}
135
			else {
136
				System.out.println("XML-TXM files already produced in " + new File(binaryDirectory, "txm/" + corpusName));
137
			}
123 138
			
124
			//System.out.println("GET FILES ORDER");
139
			// System.out.println("GET FILES ORDER");
125 140
			final List<String> orderedTextIDs = getTXMFilesOrder();
126

  
127
			//declare in the right order the new texts produced in the "txm" directory
141
			
142
			// declare in the right order the new texts produced in the "txm" directory
128 143
			for (File build : new File(binaryDirectory, "txm").listFiles()) {
129 144
				if (!build.isDirectory()) continue;
130 145
				
131 146
				for (String name : orderedTextIDs) {
132
					File xmltxmFile = new File(build, name+".xml");
147
					File xmltxmFile = new File(build, name + ".xml");
133 148
					if (xmltxmFile.isDirectory()) continue;
134 149
					if (xmltxmFile.isHidden()) continue;
135

  
136
					if (project.getText(name) != null) {
150
					
151
					if (project.getText(name) == null) { // if the text does not exist yet, create it
137 152
						Text t = new Text(project);
138 153
						t.setName(name);
139 154
						t.setTXMFile(xmltxmFile);
......
141 156
					}
142 157
				}
143 158
			}
144

  
159
			
145 160
			boolean annotate = project.getAnnotate();
146 161
			if (annotate && annotater != null) {
147 162
				if (monitor != null) monitor.subTask("-- ANNOTATE - Running NLP tools");
......
149 164
				annotater.process();
150 165
				isSuccessful = isSuccessful & annotater.isSuccessFul();
151 166
				if (!isSuccessful) {
152
					System.out.println("Error while importing corpus during 'annotate' step, reason="+annotater.getReason());
167
					System.out.println("Error while importing corpus during 'annotate' step, reason=" + annotater.getReason());
153 168
					return;
154 169
				}
155
			} else {
156
				//System.out.println("XML-TXM files already annotated.");
157 170
			}
158
		} else { // updating the corpus
171
			else {
172
				// System.out.println("XML-TXM files already annotated.");
173
			}
174
		}
175
		else { // updating the corpus
159 176
			System.out.println(TXMCoreMessages.updatingCorpus);
160 177
			// fixing Text XML-TXM configurations
161 178
			for (Text text : project.getTexts()) {
162 179
				File f = text.getXMLTXMFile();
163 180
				if (f == null || !f.exists()) { // ensure the XML-TXM file path is set
164
					f = new File(project.getProjectDirectory(), "txm/"+project.getName()+"/"+text.getName()+".xml");
181
					f = new File(project.getProjectDirectory(), "txm/" + project.getName() + "/" + text.getName() + ".xml");
165 182
					text.setTXMFile(f);
166 183
				}
167 184
			}
168 185
		}
169

  
186
		
170 187
		// XML-TXM files are ready to be compiled
171 188
		final List<String> orderedTextIDs = getTXMFilesOrder();
172
		Thread Tcompiler = new Thread("XTZ Compiler - "+project.getSrcdir().getName()) {
189
		Thread Tcompiler = new Thread("XTZ Compiler - " + project.getSrcdir().getName()) {
190
			
191
			@Override
173 192
			public void run() {
174 193
				if (compiler != null) {
175
					if (monitor != null) monitor.subTask("-- COMPILING - Building Search Engine indexes"); 
194
					if (monitor != null) monitor.subTask("-- COMPILING - Building Search Engine indexes");
176 195
					
177 196
					System.out.println("-- COMPILING - Building Search Engine indexes");
178 197
					compiler.process(orderedTextIDs);
179 198
					isSuccessful = isSuccessful & compiler.isSuccessFul();
180 199
					if (!isSuccessful) {
181
						System.out.println("Error while importing corpus during 'compiler' step, reason="+compiler.getReason());
200
						System.out.println("Error while importing corpus during 'compiler' step, reason=" + compiler.getReason());
182 201
						return;
183 202
					}
184
				} else {
203
				}
204
				else {
185 205
					System.out.println("No CQP index created.");
186 206
				}
187 207
			}
188 208
		};
189 209
		
190
		Thread Tpager = new Thread("XTZ Pager - "+project.getSrcdir().getName()) {
210
		Thread Tpager = new Thread("XTZ Pager - " + project.getSrcdir().getName()) {
211
			
212
			@Override
191 213
			public void run() {
192

  
214
				
193 215
				if (pager != null) {
194 216
					if (monitor != null) monitor.subTask("-- EDITION - Building editions");
195 217
					
......
197 219
					pager.process(orderedTextIDs);
198 220
					isSuccessful = isSuccessful & pager.isSuccessFul();
199 221
					if (!isSuccessful) {
200
						System.out.println("Error while importing corpus during 'pager' step, reason="+pager.getReason());
222
						System.out.println("Error while importing corpus during 'pager' step, reason=" + pager.getReason());
201 223
						return;
202 224
					}
203
				} else {
225
				}
226
				else {
204 227
					System.out.println("No edition produced.");
205 228
				}
206 229
			}
207 230
		};
208

  
231
		
209 232
		Tcompiler.start();
210
		if (!multithread) { //  && !updateCorpus
233
		if (!multithread) { // && !updateCorpus
211 234
			Tcompiler.join(); // wait for the end if not multithreaded
212 235
			if (!isSuccessful) { // don't call the pager if the compiler step failed
213 236
				return;
214 237
			}
215 238
		}
216

  
239
		
217 240
		Tpager.start();
218 241
		if (multithread) Tcompiler.join(); // wait for both threads to end
219 242
		Tpager.join();
220

  
243
		
221 244
		if (isSuccessful) { // all done TODO remove this code when Text._compute() will be implemented
222 245
			for (Text t : project.getTexts()) {
223 246
				t.setDirty(false);
......
226 249
			project.setDoUpdate(false);
227 250
		}
228 251
	}
229

  
252
	
230 253
	protected List<String> getTXMFilesOrder() {
231
		//		//System.out.println("DEFAULT FILES ORDER");
232
		//		File txmDirectory = new File(binaryDirectory, "txm/"+corpusName);
233
		//		ArrayList<File> files = new ArrayList<File>(Arrays.asList(txmDirectory.listFiles(new FileFilter() {
234
		//			@Override
235
		//			public boolean accept(File file) {
236
		//				return file.isFile() && file.getName().endsWith(".xml");
237
		//			}
238
		//		})));
239
		//		
240
		//		Collections.sort(files);
241
		return project.getTextsID();
242
		//		return files;
254
		// //System.out.println("DEFAULT FILES ORDER");
255
		File txmDirectory = new File(binaryDirectory, "txm/" + corpusName);
256
		ArrayList<File> files = new ArrayList<>(Arrays.asList(txmDirectory.listFiles(new FileFilter() {
257
			
258
			@Override
259
			public boolean accept(File file) {
260
				return file.isFile() && file.getName().endsWith(".xml");
261
			}
262
		})));
263
		
264
		Collections.sort(files);
265
		ArrayList<String> ids = new ArrayList<>();
266
		for (File f : files) {
267
			String name = f.getName();
268
			ids.add(name.substring(0, name.length() - 4));
269
		}
270
		
271
		return ids;
272
		// return project.getTextsID();
273
		// return files;
243 274
	}
244

  
275
	
245 276
	public void end() {
246 277
		File paramFile = new File(binaryDirectory, "import.xml");
247 278
		try {
248
			//DomUtils.save(project.root.getOwnerDocument(), paramFile);
279
			// DomUtils.save(project.root.getOwnerDocument(), paramFile);
249 280
			project.saveParameters(true);
250 281
			isSuccessful = true;
251
		} catch (Exception e) {
282
		}
283
		catch (Exception e) {
252 284
			// TODO Auto-generated catch block
253 285
			e.printStackTrace();
254 286
			isSuccessful = false;
255 287
		}
256 288
	}
257

  
289
	
258 290
	public String getCorpusName() {
259 291
		return corpusName;
260 292
	}
261

  
293
	
262 294
	public String getReason() {
263 295
		return reason;
264 296
	}
265

  
297
	
266 298
	public boolean isSuccessFul() {
267 299
		return isSuccessful;
268 300
	}
269

  
301
	
270 302
	public Project getProject() {
271 303
		return project;
272 304
	}
273

  
305
	
274 306
	public File getSourceDirectory() {
275 307
		return sourceDirectory;
276 308
	}
277

  
309
	
278 310
	public File getBinaryDirectory() {
279 311
		return binaryDirectory;
280 312
	}
281

  
313
	
282 314
	public void process() throws InterruptedException {
283 315
		start();
284 316
		if (isSuccessful)
285 317
			end();
286 318
	}
287

  
319
	
288 320
	public static void main(String[] args) {
289
		//		File projectFile = new File("/home/mdecorde/xml/brown/import.xml");
321
		// File projectFile = new File("/home/mdecorde/xml/brown/import.xml");
290 322
		//
291
		//		ImportModule module = new ImportModule(projectFile);
292
		//		System.out.println("Parameters: "+module.getParameters());
293
		//		try {
294
		//			module.start();
323
		// ImportModule module = new ImportModule(projectFile);
324
		// System.out.println("Parameters: "+module.getParameters());
325
		// try {
326
		// module.start();
295 327
		//
296
		//			if (module.isSuccessful) {
297
		//				System.out.println("Import sucessful. reloading corpora...");
298
		//			} else {
299
		//				System.out.println("Import failed, reason = "+module.getReason());
300
		//			}
301
		//		} catch (Exception e) {
302
		//			e.printStackTrace();
303
		//		}
328
		// if (module.isSuccessful) {
329
		// System.out.println("Import sucessful. reloading corpora...");
330
		// } else {
331
		// System.out.println("Import failed, reason = "+module.getReason());
332
		// }
333
		// } catch (Exception e) {
334
		// e.printStackTrace();
335
		// }
304 336
	}
305 337
}
tmp/org.txm.core/src/java/org/txm/importer/ValidateXml.java (revision 2473)
50 50
 * @author mdecorde
51 51
 */
52 52
public class ValidateXml {
53

  
53
	
54 54
	/**
55 55
	 * @throws XMLStreamException test a String, not implemented.
56 56
	 *
......
75 75
			XMLInputFactory factory = XMLInputFactory.newInstance();
76 76
			XMLStreamReader parser = factory.createXMLStreamReader(inputData);
77 77
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
78

  
78
				
79 79
			}
80 80
			parser.close();
81 81
			inputData.close();
82
		} catch (IOException e) {
82
		}
83
		catch (IOException e) {
83 84
			// TODO Auto-generated catch block
84 85
			org.txm.utils.logger.Log.printStackTrace(e);
85 86
			return false;
86 87
		}
87 88
		return true;
88 89
	}
89

  
90
	
90 91
	/**
91 92
	 * Tests XML content stored in a String (delegates to testAndGetError and succeeds when no error text is returned).
92 93
	 *
......
97 98
	public static boolean test(String xmldata) {
98 99
		return testAndGetError(xmldata).length() == 0;
99 100
	}
100

  
101
	
101 102
	/**
102 103
	 * Tests XML content stored in a String by parsing it; returns an empty String on success, otherwise the error description.
103 104
	 *
......
109 110
		try {
110 111
			Node records = new XmlParser().parseText(xmldata);
111 112
			return "";
112
		} catch (Exception e) {
113
		}
114
		catch (Exception e) {
113 115
			System.out.println(e.getLocalizedMessage());
114 116
			return e.toString();
115 117
		}
116 118
	}
117

  
119
	
118 120
	/**
119 121
	 * test a file.
120 122
	 *
......
141 143
			XMLInputFactory factory = XMLInputFactory.newInstance();
142 144
			XMLStreamReader parser = factory.createXMLStreamReader(inputData);
143 145
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
144

  
146
				
145 147
			}
146 148
			parser.close();
147 149
			inputData.close();
148 150
			return true;
149
		} catch (Exception e) {
151
		}
152
		catch (Exception e) {
150 153
			System.out.println(e.getLocalizedMessage());
151 154
			return false;
152 155
		}
153 156
	}
154

  
157
	
155 158
	/**
159
	 * Small test to verify that a file is an XML-TEI file.
160
	 *
161
	 * @param infile
162
	 *            the file to test
163
	 * @return false if the file is not valid
164
	 */
165
	public static boolean teiTest(File infile) {
166
		if (infile.isDirectory()) {
167
			System.out.println(NLS.bind(TXMCoreMessages.xmlValidationColonP0IsADirectory, infile));
168
			return false;
169
		}
170
		if (!infile.exists()) {
171
			System.out.println(NLS.bind(TXMCoreMessages.xmlValidationColonP0DoesNotExists, infile));
172
			return false;
173
		}
174
		if (!infile.canRead()) {
175
			System.out.println(NLS.bind(TXMCoreMessages.xmlValidationColonP0IsNotReadable, infile));
176
			return false;
177
		}
178
		try {
179
			URL url = infile.toURI().toURL();
180
			InputStream inputData = url.openStream();
181
			XMLInputFactory factory = XMLInputFactory.newInstance();
182
			XMLStreamReader parser = factory.createXMLStreamReader(inputData);
183
			int n = 0;
184
			int tei = -1;
185
			int text = -1;
186
			int teiHeader = -1;
187
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
188
				if (event == XMLStreamConstants.START_ELEMENT) {
189
					String name = parser.getLocalName();
190
					if ("TEI".equals(name)) {
191
						tei = n++;
192
					}
193
					else if ("text".equals(name)) {
194
						text = n++;
195
					}
196
					else if ("teiHeader".equals(name)) {
197
						teiHeader = n++;
198
					}
199
				}
200
			}
201
			parser.close();
202
			inputData.close();
203
			
204
			boolean test = n == 3 && tei < teiHeader && teiHeader < text;
205
			if (test) {
206
				return true;
207
			}
208
			else {
209
				if (n != 3) {
210
					if (tei == -1) {
211
						System.out.println(infile.getName() + ": No 'TEI' tag found");
212
					}
213
					if (teiHeader == -1) {
214
						System.out.println(infile.getName() + ": No 'teiHeader' tag found");
215
					}
216
					if (text == -1) {
217
						System.out.println(infile.getName() + ": No 'text' tag found");
218
					}
219
				}
220
				else {
221
					System.out.println(infile.getName() + ": Malformed TEI XML");
222
				}
223
				return false;
224
			}
225
		}
226
		catch (Exception e) {
227
			System.out.println(e.getLocalizedMessage());
228
			return false;
229
		}
230
	}
231
	
232
	/**
156 233
	 * The main method.
157 234
	 *
158 235
	 * @param args
159 236
	 *            the arguments
160 237
	 */
161 238
	public static void main(String[] args) {
162
		if (ValidateXml.test(new File("~/TXM/corpora/discours/txm/01_DeGaulle.xml"))) //$NON-NLS-1$
163
			System.out.println("OK"); //$NON-NLS-1$
164
		else
165
			System.out.println("ERROR"); //$NON-NLS-1$
239
		for (File file : new File(System.getProperty("user.home"), "xml/teierrors").listFiles()) {
240
			if (ValidateXml.teiTest(file)) {
241
				System.out.println("OK: " + file); //$NON-NLS-1$
242
			}
243
			else {
244
				System.out.println("ERROR: " + file); //$NON-NLS-1$
245
			}
246
		}
166 247
	}
167
}
248
}

Also available in: Unified diff