Revision 945

tmp/org.txm.core/src/java/org/txm/importer/xtz/Importer.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
import java.io.File;
4

  
5
/**
6
 * Takes any form of source files
7
 * 
8
 * After this step, the XML-TXM files are created.
9
 * 
10
 * they are validated before continuing 
11
 * @author mdecorde
12
 *
13
 */
14
public abstract class Importer extends ImportStep {
15

  
16
	public Importer(ImportModule module) {
17
		super(module);
18
		inputDirectory = module.getSourceDirectory();
19
		outputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName());
20
		outputDirectory.mkdirs();
21
	}
22

  
23
	public abstract void checkFiles();
24
}
0 25

  
tmp/org.txm.core/src/java/org/txm/importer/xtz/ImportKeys.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
/**
 * Names of the key/value import parameters.
 */
public class ImportKeys {

	// -- import process options

	/** Key of the "multithread" parameter. */
	public static final String MULTITHREAD = "multithread";
	/** Key of the "debug" parameter. */
	public static final String DEBUG = "debug";
	/** Key of the "corpus.update" parameter. */
	public static final String UPDATECORPUS = "corpus.update";
	/** Key of the "clean.directories" parameter. */
	public static final String CLEAN = "clean.directories";

	// -- annotation and language options

	/** Key of the "annotate.model" parameter. */
	public static final String TTMODEL = "annotate.model";
	/** Key of the "annotate.run" parameter. */
	public static final String TTANNOTATE = "annotate.run";
	/** Key of the "lang" parameter. */
	public static final String LANG = "lang";

	// -- normalisation options

	/** Key of the "normalize.ana.values" parameter. */
	public static final String NORMALISEANAVALUES = "normalize.ana.values";
	/** Key of the "normalize.attribute.values" parameter. */
	public static final String NORMALISEATTRIBUTEVALUES = "normalize.attribute.values";
}
0 17

  
tmp/org.txm.core/src/java/org/txm/importer/xtz/ImportStep.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
import java.io.File;
4
import java.util.HashMap;
5

  
6
/**
7
 * One of the step of an import module
8
 * 
9
 * @author mdecorde
10
 *
11
 */
12
public abstract class ImportStep {
13

  
14
	protected File inputDirectory, outputDirectory;
15
	protected ImportModule module;
16
	
17
	protected HashMap<String, Object> stepProperties = new HashMap<String, Object>();
18
	protected boolean isSuccessFul = false;
19
	protected String reason = "not set.";
20
	protected boolean stopAtFirstError = true;
21
	protected boolean debug = true;
22
	
23
	public ImportStep(ImportModule module) {
24
		this.module = module;
25
		debug = module.debug;
26
	}
27
	
28
	public File getInputDirectory() {
29
		return inputDirectory;
30
	}
31
	
32
	public File getOutputDirectory() {
33
		return outputDirectory;
34
	}
35
	
36
	public ImportModule getImportModule() {
37
		return module;
38
	}
39
	
40
	public boolean isSuccessFul() {
41
		return isSuccessFul;
42
	}
43
	
44
	public String getReason() {
45
		return reason;
46
	}
47
	
48
	/**
49
	 * Called when a step is interrupted to clean streams and stuff
50
	 */
51
	public abstract void cancel();
52
	
53
	public abstract void process();
54
}
0 55

  
tmp/org.txm.core/src/java/org/txm/importer/xtz/Compiler.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
import java.io.File;
4
import java.util.ArrayList;
5

  
6
import org.txm.utils.DeleteDir;
7

  
8
/**
9
 * Takes XML-TXM files, build the CQP files and call cwb utils
10
 * 
11
 * @author mdecorde
12
 *
13
 */
14
public class Compiler extends ImportStep {
15

  
16
	protected File cqpDirectory, registryDirectory, dataDirectory;
17
	protected ArrayList<File> files;
18
	
19
	/**
20
	 * Creates the output directories
21
	 * 
22
	 * @param module
23
	 */
24
	public Compiler(ImportModule module) {
25
		super(module);
26

  
27
		inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName());
28
		cqpDirectory = new File(module.getBinaryDirectory(), "cqp");		
29
		outputDirectory = new File(module.getBinaryDirectory(), "data");
30
		registryDirectory = new File(module.getBinaryDirectory(), "registry");
31
		dataDirectory = new File(outputDirectory, module.getCorpusName());
32

  
33
		DeleteDir.deleteDirectory(outputDirectory);
34
		outputDirectory.mkdirs();
35
		
36
		DeleteDir.deleteDirectory(dataDirectory);
37
		dataDirectory.mkdirs();
38
		
39
		DeleteDir.deleteDirectory(registryDirectory);
40
		registryDirectory.mkdirs();
41
		
42
		if (!module.isUpdatingCorpus()) {
43
			DeleteDir.deleteDirectory(cqpDirectory);
44
			cqpDirectory.mkdir();
45
		}
46
	}
47

  
48
	@Override
49
	public void cancel() {
50
		// TODO Auto-generated method stub
51
	}
52

  
53
	@Override
54
	public void process() {
55
		process(null); // no default files order set
56
	}
57
	
58
	public void process(ArrayList<File> files) {
59
		this.files = files;
60
	}
61
}
0 62

  
tmp/org.txm.core/src/java/org/txm/importer/xtz/Annotater.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
import java.io.File;
4

  
5

  
6
/**
7
 * 
8
 * Takes the XML-TXM files and wrap a TAL Tool to update the XML-TXM files
9
 * 
10
 * @author mdecorde
11
 *
12
 */
13
public abstract class Annotater extends ImportStep {
14

  
15
	public Annotater(ImportModule module) {
16
		super(module);
17
		
18
		inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.corpusName);
19
		outputDirectory = new File(module.getBinaryDirectory(), "txm");
20
	}
21
}
0 22

  
tmp/org.txm.core/src/java/org/txm/importer/xtz/Step.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
/**
 * Minimal step whose {@link #process()} does no work and reports success.
 */
public class Step {

	/**
	 * Creates a step; there is no state to initialise.
	 */
	public Step() {
		// TODO Auto-generated constructor stub
	}

	/**
	 * Runs the step.
	 *
	 * @return always true
	 */
	public boolean process() {
		final boolean done = true;
		return done;
	}
}
0 13

  
tmp/org.txm.core/src/java/org/txm/importer/xtz/Pager.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
import java.io.File;
4
import java.util.ArrayList;
5

  
6
import org.txm.utils.DeleteDir;
7

  
8
/**
9
 * Takes the XML-TXM files and build an edition
10
 * 
11
 * @author mdecorde
12
 *
13
 */
14
public class Pager extends ImportStep {
15

  
16
	protected File htmlDirectory;
17
	protected String corpusname;
18
	protected ArrayList<File> files;
19

  
20
	public Pager(ImportModule module, String editionName) {
21
		super(module);
22

  
23
		corpusname = module.getCorpusName();
24

  
25
		inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName());
26
		htmlDirectory = new File(module.getBinaryDirectory(), "HTML/"+corpusname);
27
		outputDirectory  = new File(htmlDirectory, editionName);
28

  
29
		if (!module.isUpdatingCorpus()) {
30
			DeleteDir.deleteDirectory(outputDirectory);
31
			outputDirectory.mkdirs();
32
		}
33
	}
34

  
35
	@Override
36
	public void cancel() {
37
		// TODO Auto-generated method stub
38

  
39
	}
40

  
41
	@Override
42
	public void process() {
43
		process(null); // no default files order set
44
	}
45
	
46
	public void process(ArrayList<File> files) {
47
		this.files = files;
48
	}
49
}
0 50

  
tmp/org.txm.core/src/java/org/txm/importer/xtz/ImportModule.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
import java.io.File;
4
import java.io.FileFilter;
5
import java.util.ArrayList;
6
import java.util.Arrays;
7
import java.util.Collections;
8
import java.util.logging.Level;
9

  
10
import org.txm.Toolbox;
11
import org.txm.core.preferences.TBXPreferences;
12
import org.eclipse.core.runtime.IProgressMonitor;
13
import org.txm.utils.xml.DomUtils;
14
import org.txm.objects.BaseParameters;
15
import org.txm.utils.DeleteDir;
16
import org.txm.utils.logger.Log;
17

  
18
public class ImportModule {
19

  
20
	public BaseParameters importParameters;
21
	
22
	public String corpusVersionProduced;
23

  
24
	public File sourceDirectory;
25
	public File binaryDirectory;
26

  
27
	public Importer importer;
28
	public Annotater annotater;
29
	public Compiler compiler;
30
	public Pager pager;
31

  
32
	/**
33
	 * set the variable to false to stop the import process at next step
34
	 */
35
	public boolean isSuccessful = true;
36
	public String reason = "none";
37
	public boolean debug = false;
38
	public boolean multithread = false;
39
	public boolean updateCorpus = false;
40
	public String corpusName;
41

  
42
	IProgressMonitor monitor;
43
	
44
	public void setMonitor(IProgressMonitor monitor) {
45
		this.monitor = monitor;
46
	}
47

  
48
	
49
	public boolean isMultiThread() {
50
		return multithread;
51
	}
52

  
53
	public boolean isDebugging() {
54
		return debug;
55
	}
56

  
57
	public ImportModule(File importParametersFile) {
58
		try {
59
			BaseParameters b = new BaseParameters(importParametersFile);
60
			init(b);
61
		} catch (Exception e) {
62
			e.printStackTrace();
63
		}
64
	}
65

  
66
	public ImportModule(BaseParameters p) {
67
		init(p);
68
	}
69
	
70
	public boolean isUpdatingCorpus() {
71
		return updateCorpus;
72
	}
73

  
74
	protected void init(BaseParameters p) {
75
		this.importParameters = p;
76
		this.importParameters.load();
77
		corpusName = importParameters.name;
78
		//this.debug = "true".equals(importParameters.getKeyValueParameters().get(ImportKeys.DEBUG));
79
		
80
		if (Log.getLevel().intValue() < Level.WARNING.intValue()) {
81
			debug = true;
82
		}
83
		this.multithread = "true".equals(importParameters.getKeyValueParameters().get(ImportKeys.MULTITHREAD));
84
		this.updateCorpus = "true".equals(importParameters.getKeyValueParameters().get(ImportKeys.UPDATECORPUS));
85

  
86
		
87
		this.sourceDirectory = importParameters.paramFile.getParentFile();
88
		this.binaryDirectory = new File(Toolbox.getTxmHomePath(), "corpora/"+corpusName.toUpperCase());
89

  
90
		if (!updateCorpus) { // clean directories only if it's a new import
91
			DeleteDir.deleteDirectory(binaryDirectory);
92
			binaryDirectory.mkdir();
93
			
94
			File txmDir = new File(binaryDirectory, "txm");
95
			txmDir.mkdir();
96
		}
97
	}
98

  
99
	public void start() throws InterruptedException {
100

  
101
		binaryDirectory.mkdirs(); // ensure output exists
102
		//System.out.println("ImportModule.start");
103
		if (!updateCorpus) { // create XML-TXM files and annotate
104
			//System.out.println("ImportModule.start: not updating");
105
			if (importer != null) {
106
				//System.out.println("ImportModule.start: importer: "+importer);
107
				if (monitor != null) System.out.println("-- IMPORTER - Reading source files");
108
				importer.process();
109
				//importer.checkFiles();
110
				isSuccessful = isSuccessful & importer.isSuccessFul();
111
				if (!isSuccessful) {
112
					System.out.println("Error while importing corpus during 'importer' step, reason="+importer.getReason());
113
					return;
114
				}
115
			} else {
116
				System.out.println("XML-TXM files already produced in "+new File(binaryDirectory, "txm/"+corpusName));
117
			}
118

  
119
			boolean annotate = "true".equals(importParameters.getCorpusElement().getAttribute("annotate"));
120
			if (annotate && annotater != null) {
121
				if (monitor != null) System.out.println("-- ANNOTATE - Running NLP tools");
122
				annotater.process();
123
				isSuccessful = isSuccessful & annotater.isSuccessFul();
124
				if (!isSuccessful) {
125
					System.out.println("Error while importing corpus during 'annotate' step, reason="+annotater.getReason());
126
					return;
127
				}
128
			} else {
129
				//System.out.println("XML-TXM files already annotated.");
130
			}
131
		} else {
132
			System.out.println("Updating corpus...");
133
		}
134
		
135
		//System.out.println("GET FILES ORDER");
136
		final ArrayList<File> files = getTXMFilesOrder();
137
		
138
		Thread Tcompiler = new Thread() {
139
			public void run() {
140
				if (compiler != null) {
141
					if (monitor != null) System.out.println("-- COMPILING - Building Search Engine indexes");
142
					compiler.process(files);
143
					isSuccessful = isSuccessful & compiler.isSuccessFul();
144
					if (!isSuccessful) {
145
						System.out.println("Error while importing corpus during 'compiler' step, reason="+compiler.getReason());
146
						return;
147
					}
148
				} else {
149
					System.out.println("No CQP index created.");
150
				}
151
			}
152
		};
153

  
154
		Thread Tpager = new Thread() {
155
			public void run() {
156

  
157
				if (pager != null) {
158
					if (monitor != null) System.out.println("-- EDITION - Building edition");
159
					pager.process(files);
160
					isSuccessful = isSuccessful & pager.isSuccessFul();
161
					if (!isSuccessful) {
162
						System.out.println("Error while importing corpus during 'pager' step, reason="+pager.getReason());
163
						return;
164
					}
165
				} else {
166
					System.out.println("No edition produced.");
167
				}
168
			}
169
		};
170

  
171
		Tcompiler.start();
172
		if (!multithread) {
173
			Tcompiler.join(); // wait for the end if not multithreaded
174
			if (!isSuccessful) { // don't call pager is compiler step failed
175
				return;
176
			}
177
		}
178
		
179
		Tpager.start();
180
		if (multithread) Tcompiler.join(); // wait for both thread to end
181
		Tpager.join();		
182
	}
183
	
184
	protected ArrayList<File> getTXMFilesOrder() {
185
		//System.out.println("DEFAULT FILES ORDER");
186
		File txmDirectory = new File(binaryDirectory, "txm/"+corpusName);
187
		ArrayList<File> files = new ArrayList<File>(Arrays.asList(txmDirectory.listFiles(new FileFilter() {
188
			@Override
189
			public boolean accept(File file) {
190
				return file.isFile() && file.getName().endsWith(".xml");
191
			}
192
		})));
193
		
194
		Collections.sort(files);
195
		
196
		return files;
197
	}
198

  
199

  
200
	public void end() {
201
		File paramFile = new File(binaryDirectory, "import.xml");
202
		try {
203
			DomUtils.save(importParameters.root.getOwnerDocument(), paramFile);
204
			isSuccessful = true;
205
		} catch (Exception e) {
206
			// TODO Auto-generated catch block
207
			e.printStackTrace();
208
			isSuccessful = false;
209
		}
210
	}
211
	
212
	public String getCorpusName() {
213
		return corpusName;
214
	}
215

  
216
	public String getReason() {
217
		return reason;
218
	}
219

  
220
	public boolean isSuccessFul() {
221
		return isSuccessful;
222
	}
223

  
224
	public BaseParameters getParameters() {
225
		return importParameters;
226
	}
227

  
228
	public File getSourceDirectory() {
229
		return sourceDirectory;
230
	}
231

  
232
	public File getBinaryDirectory() {
233
		return binaryDirectory;
234
	}
235
	
236
	public void process() throws InterruptedException {
237
		start();
238
		if (isSuccessful)
239
			end();
240
	}
241

  
242
	public static void main(String[] args) {
243
		File importParametersFile = new File("/home/mdecorde/xml/brown/import.xml");
244

  
245
		ImportModule module = new ImportModule(importParametersFile);
246
		System.out.println("Parameters: "+module.getParameters());
247
		try {
248
			module.start();
249

  
250
			if (module.isSuccessful) {
251
				System.out.println("Import sucessful. reloading corpora...");
252
			} else {
253
				System.out.println("Import failed, reason = "+module.getReason());
254
			}
255
		} catch (Exception e) {
256
			e.printStackTrace();
257
		}
258
	}
259
}
0 260

  
tmp/org.txm.core/src/java/org/txm/metadatas/Metadatas.java (revision 945)
235 235
	 */
236 236
	public static boolean convertCsvToXml(File csvfile, File xmlFile, String encoding, String separator, String txtseparator, int nbheaderline) throws Exception
237 237
	{	
238
		if (separator == null || separator.length() == 0) {
239
			separator = "\t";
240
		}
241
		if (encoding == null || encoding.length() == 0) {
242
			encoding = "UTF-8";
243
		}
238 244
		xmlFile.createNewFile();
239 245

  
240 246
		if(!csvfile.exists())
tmp/org.txm.core/src/java/org/txm/Toolbox.java (revision 945)
84 84
	private static boolean state = false;
85 85

  
86 86
	public static Workspace workspace;
87

  
88 87
	
89 88
	/**
90 89
	 * 
tmp/org.txm.core/META-INF/MANIFEST.MF (revision 945)
390 390
 org.txm.importer.filters,
391 391
 org.txm.importer.scripting,
392 392
 org.txm.importer.xmltxm,
393
 org.txm.importer.xtz,
393 394
 org.txm.js,
394 395
 org.txm.js.viewer,
395 396
 org.txm.metadatas,
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/discours/importer.groovy (revision 945)
98 98
				println "Error: could not create a copy of metadata file "+csvfile.getAbsoluteFile();
99 99
				return;
100 100
			}
101
			metadatas = new Metadatas(copy, Toolbox.getPreference(TBXPreferences.METADATA_ENCODING), Toolbox.getPreference(Toolbox.METADATA_COLSEPARATOR), Toolbox.getPreference(Toolbox.METADATA_TXTSEPARATOR), 1)
101
			metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(), 
102
		Toolbox.getMetadataColumnSeparator(), 
103
		Toolbox.getMetadataTextSeparator(), 1)
102 104
		} else {
103 105
			println "No metadata file: "+csvfile
104 106
			println "Aborting"
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/transcriber/transcriberLoader.groovy (revision 945)
122 122
		println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile();
123 123
		return;
124 124
	}
125
	metadatas = new Metadatas(copy, Toolbox.getPreference(Toolbox.METADATA_ENCODING), Toolbox.getPreference(Toolbox.METADATA_COLSEPARATOR), Toolbox.getPreference(Toolbox.METADATA_TXTSEPARATOR), 1)
125
	metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(), 
126
		Toolbox.getMetadataColumnSeparator(), 
127
		Toolbox.getMetadataTextSeparator(), 1)
126 128
}
127 129
else
128 130
	println "no metadata file: "+allmetadatasfile
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/txt/txtLoader.groovy (revision 945)
97 97
		println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile();
98 98
		return;
99 99
	}
100
	metadatas = new Metadatas(copy, Toolbox.getPreference(Toolbox.METADATA_ENCODING), Toolbox.getPreference(Toolbox.METADATA_COLSEPARATOR), Toolbox.getPreference(Toolbox.METADATA_TXTSEPARATOR), 1)
100
	metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(), 
101
		Toolbox.getMetadataColumnSeparator(), 
102
		Toolbox.getMetadataTextSeparator(), 1)
101 103
} else {
102 104
	println "No metadata file: "+allmetadatasfile
103 105
}
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/Compiler.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
import java.io.File;
4
import java.util.ArrayList;
5

  
6
import org.txm.utils.DeleteDir;
7

  
8
/**
9
 * Takes XML-TXM files, build the CQP files and call cwb utils
10
 * 
11
 * @author mdecorde
12
 *
13
 */
14
public class Compiler extends ImportStep {
15

  
16
	protected File cqpDirectory, registryDirectory, dataDirectory;
17
	protected ArrayList<File> files;
18
	
19
	/**
20
	 * Creates the output directories
21
	 * 
22
	 * @param module
23
	 */
24
	public Compiler(ImportModule module) {
25
		super(module);
26

  
27
		inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName());
28
		cqpDirectory = new File(module.getBinaryDirectory(), "cqp");		
29
		outputDirectory = new File(module.getBinaryDirectory(), "data");
30
		registryDirectory = new File(module.getBinaryDirectory(), "registry");
31
		dataDirectory = new File(outputDirectory, module.getCorpusName());
32

  
33
		DeleteDir.deleteDirectory(outputDirectory);
34
		outputDirectory.mkdirs();
35
		
36
		DeleteDir.deleteDirectory(dataDirectory);
37
		dataDirectory.mkdirs();
38
		
39
		DeleteDir.deleteDirectory(registryDirectory);
40
		registryDirectory.mkdirs();
41
		
42
		if (!module.isUpdatingCorpus()) {
43
			DeleteDir.deleteDirectory(cqpDirectory);
44
			cqpDirectory.mkdir();
45
		}
46
	}
47

  
48
	@Override
49
	public void cancel() {
50
		// TODO Auto-generated method stub
51
	}
52

  
53
	@Override
54
	public void process() {
55
		process(null); // no default files order set
56
	}
57
	
58
	public void process(ArrayList<File> files) {
59
		this.files = files;
60
	}
61
}
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/Step.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
/**
 * Minimal step whose {@link #process()} does no work and reports success.
 */
public class Step {

	/**
	 * Creates a step; there is no state to initialise.
	 */
	public Step() {
		// TODO Auto-generated constructor stub
	}

	/**
	 * Runs the step.
	 *
	 * @return always true
	 */
	public boolean process() {
		final boolean done = true;
		return done;
	}
}
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/ImportModule.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
import java.io.File;
4
import java.io.FileFilter;
5
import java.util.ArrayList;
6
import java.util.Arrays;
7
import java.util.Collections;
8
import java.util.logging.Level;
9

  
10
import org.txm.Toolbox;
11
import org.txm.core.preferences.TBXPreferences;
12
import org.eclipse.core.runtime.IProgressMonitor;
13
import org.txm.utils.xml.DomUtils;
14
import org.txm.objects.BaseParameters;
15
import org.txm.utils.DeleteDir;
16
import org.txm.utils.logger.Log;
17

  
18
public class ImportModule {
19

  
20
	public BaseParameters importParameters;
21
	
22
	public String corpusVersionProduced;
23

  
24
	public File sourceDirectory;
25
	public File binaryDirectory;
26

  
27
	public Importer importer;
28
	public Annotater annotater;
29
	public Compiler compiler;
30
	public Pager pager;
31

  
32
	/**
33
	 * set the variable to false to stop the import process at next step
34
	 */
35
	public boolean isSuccessful = true;
36
	public String reason = "none";
37
	public boolean debug = false;
38
	public boolean multithread = false;
39
	public boolean updateCorpus = false;
40
	public String corpusName;
41

  
42
	IProgressMonitor monitor;
43
	
44
	public void setMonitor(IProgressMonitor monitor) {
45
		this.monitor = monitor;
46
	}
47

  
48
	
49
	public boolean isMultiThread() {
50
		return multithread;
51
	}
52

  
53
	public boolean isDebugging() {
54
		return debug;
55
	}
56

  
57
	public ImportModule(File importParametersFile) {
58
		try {
59
			BaseParameters b = new BaseParameters(importParametersFile);
60
			init(b);
61
		} catch (Exception e) {
62
			e.printStackTrace();
63
		}
64
	}
65

  
66
	public ImportModule(BaseParameters p) {
67
		init(p);
68
	}
69
	
70
	public boolean isUpdatingCorpus() {
71
		return updateCorpus;
72
	}
73

  
74
	protected void init(BaseParameters p) {
75
		this.importParameters = p;
76
		this.importParameters.load();
77
		corpusName = importParameters.name;
78
		//this.debug = "true".equals(importParameters.getKeyValueParameters().get(ImportKeys.DEBUG));
79
		
80
		if (Log.getLevel().intValue() < Level.WARNING.intValue()) {
81
			debug = true;
82
		}
83
		this.multithread = "true".equals(importParameters.getKeyValueParameters().get(ImportKeys.MULTITHREAD));
84
		this.updateCorpus = "true".equals(importParameters.getKeyValueParameters().get(ImportKeys.UPDATECORPUS));
85

  
86
		
87
		this.sourceDirectory = importParameters.paramFile.getParentFile();
88
		this.binaryDirectory = new File(Toolbox.getTxmHomePath(), "corpora/"+corpusName.toUpperCase());
89

  
90
		if (!updateCorpus) { // clean directories only if it's a new import
91
			DeleteDir.deleteDirectory(binaryDirectory);
92
			binaryDirectory.mkdir();
93
			
94
			File txmDir = new File(binaryDirectory, "txm");
95
			txmDir.mkdir();
96
		}
97
	}
98

  
99
	public void start() throws InterruptedException {
100

  
101
		binaryDirectory.mkdirs(); // ensure output exists
102
		//System.out.println("ImportModule.start");
103
		if (!updateCorpus) { // create XML-TXM files and annotate
104
			//System.out.println("ImportModule.start: not updating");
105
			if (importer != null) {
106
				//System.out.println("ImportModule.start: importer: "+importer);
107
				if (monitor != null) System.out.println("-- IMPORTER - Reading source files");
108
				importer.process();
109
				//importer.checkFiles();
110
				isSuccessful = isSuccessful & importer.isSuccessFul();
111
				if (!isSuccessful) {
112
					System.out.println("Error while importing corpus during 'importer' step, reason="+importer.getReason());
113
					return;
114
				}
115
			} else {
116
				System.out.println("XML-TXM files already produced in "+new File(binaryDirectory, "txm/"+corpusName));
117
			}
118

  
119
			boolean annotate = "true".equals(importParameters.getCorpusElement().getAttribute("annotate"));
120
			if (annotate && annotater != null) {
121
				if (monitor != null) System.out.println("-- ANNOTATE - Running NLP tools");
122
				annotater.process();
123
				isSuccessful = isSuccessful & annotater.isSuccessFul();
124
				if (!isSuccessful) {
125
					System.out.println("Error while importing corpus during 'annotate' step, reason="+annotater.getReason());
126
					return;
127
				}
128
			} else {
129
				//System.out.println("XML-TXM files already annotated.");
130
			}
131
		} else {
132
			System.out.println("Updating corpus...");
133
		}
134
		
135
		//System.out.println("GET FILES ORDER");
136
		final ArrayList<File> files = getTXMFilesOrder();
137
		
138
		Thread Tcompiler = new Thread() {
139
			public void run() {
140
				if (compiler != null) {
141
					if (monitor != null) System.out.println("-- COMPILING - Building Search Engine indexes");
142
					compiler.process(files);
143
					isSuccessful = isSuccessful & compiler.isSuccessFul();
144
					if (!isSuccessful) {
145
						System.out.println("Error while importing corpus during 'compiler' step, reason="+compiler.getReason());
146
						return;
147
					}
148
				} else {
149
					System.out.println("No CQP index created.");
150
				}
151
			}
152
		};
153

  
154
		Thread Tpager = new Thread() {
155
			public void run() {
156

  
157
				if (pager != null) {
158
					if (monitor != null) System.out.println("-- EDITION - Building edition");
159
					pager.process(files);
160
					isSuccessful = isSuccessful & pager.isSuccessFul();
161
					if (!isSuccessful) {
162
						System.out.println("Error while importing corpus during 'pager' step, reason="+pager.getReason());
163
						return;
164
					}
165
				} else {
166
					System.out.println("No edition produced.");
167
				}
168
			}
169
		};
170

  
171
		Tcompiler.start();
172
		if (!multithread) {
173
			Tcompiler.join(); // wait for the end if not multithreaded
174
			if (!isSuccessful) { // don't call pager is compiler step failed
175
				return;
176
			}
177
		}
178
		
179
		Tpager.start();
180
		if (multithread) Tcompiler.join(); // wait for both thread to end
181
		Tpager.join();		
182
	}
183
	
184
	protected ArrayList<File> getTXMFilesOrder() {
185
		//System.out.println("DEFAULT FILES ORDER");
186
		File txmDirectory = new File(binaryDirectory, "txm/"+corpusName);
187
		ArrayList<File> files = new ArrayList<File>(Arrays.asList(txmDirectory.listFiles(new FileFilter() {
188
			@Override
189
			public boolean accept(File file) {
190
				return file.isFile() && file.getName().endsWith(".xml");
191
			}
192
		})));
193
		
194
		Collections.sort(files);
195
		
196
		return files;
197
	}
198

  
199

  
200
	public void end() {
201
		File paramFile = new File(binaryDirectory, "import.xml");
202
		try {
203
			DomUtils.save(importParameters.root.getOwnerDocument(), paramFile);
204
			isSuccessful = true;
205
		} catch (Exception e) {
206
			// TODO Auto-generated catch block
207
			e.printStackTrace();
208
			isSuccessful = false;
209
		}
210
	}
211
	
212
	public String getCorpusName() {
213
		return corpusName;
214
	}
215

  
216
	public String getReason() {
217
		return reason;
218
	}
219

  
220
	public boolean isSuccessFul() {
221
		return isSuccessful;
222
	}
223

  
224
	public BaseParameters getParameters() {
225
		return importParameters;
226
	}
227

  
228
	public File getSourceDirectory() {
229
		return sourceDirectory;
230
	}
231

  
232
	public File getBinaryDirectory() {
233
		return binaryDirectory;
234
	}
235
	
236
	public void process() throws InterruptedException {
237
		start();
238
		if (isSuccessful)
239
			end();
240
	}
241

  
242
	public static void main(String[] args) {
243
		File importParametersFile = new File("/home/mdecorde/xml/brown/import.xml");
244

  
245
		ImportModule module = new ImportModule(importParametersFile);
246
		System.out.println("Parameters: "+module.getParameters());
247
		try {
248
			module.start();
249

  
250
			if (module.isSuccessful) {
251
				System.out.println("Import sucessful. reloading corpora...");
252
			} else {
253
				System.out.println("Import failed, reason = "+module.getReason());
254
			}
255
		} catch (Exception e) {
256
			e.printStackTrace();
257
		}
258
	}
259
}
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/Annotater.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
import java.io.File;
4

  
5

  
6
/**
7
 * 
8
 * Takes the XML-TXM files and wrap a TAL Tool to update the XML-TXM files
9
 * 
10
 * @author mdecorde
11
 *
12
 */
13
public abstract class Annotater extends ImportStep {
14

  
15
	public Annotater(ImportModule module) {
16
		super(module);
17
		
18
		inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.corpusName);
19
		outputDirectory = new File(module.getBinaryDirectory(), "txm");
20
	}
21
}
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/Pager.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
import java.io.File;
4
import java.util.ArrayList;
5

  
6
import org.txm.utils.DeleteDir;
7

  
8
/**
9
 * Takes the XML-TXM files and build an edition
10
 * 
11
 * @author mdecorde
12
 *
13
 */
14
public class Pager extends ImportStep {
15

  
16
	protected File htmlDirectory;
17
	protected String corpusname;
18
	protected ArrayList<File> files;
19

  
20
	public Pager(ImportModule module, String editionName) {
21
		super(module);
22

  
23
		corpusname = module.getCorpusName();
24

  
25
		inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName());
26
		htmlDirectory = new File(module.getBinaryDirectory(), "HTML/"+corpusname);
27
		outputDirectory  = new File(htmlDirectory, editionName);
28

  
29
		if (!module.isUpdatingCorpus()) {
30
			DeleteDir.deleteDirectory(outputDirectory);
31
			outputDirectory.mkdirs();
32
		}
33
	}
34

  
35
	@Override
36
	public void cancel() {
37
		// TODO Auto-generated method stub
38

  
39
	}
40

  
41
	@Override
42
	public void process() {
43
		process(null); // no default files order set
44
	}
45
	
46
	public void process(ArrayList<File> files) {
47
		this.files = files;
48
	}
49
}
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/Importer.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
import java.io.File;
4

  
5
/**
6
 * Takes any form of source files
7
 * 
8
 * After this step, the XML-TXM files are created.
9
 * 
10
 * they are validated before continuing 
11
 * @author mdecorde
12
 *
13
 */
14
public abstract class Importer extends ImportStep {
15

  
16
	public Importer(ImportModule module) {
17
		super(module);
18
		inputDirectory = module.getSourceDirectory();
19
		outputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName());
20
		outputDirectory.mkdirs();
21
	}
22

  
23
	public abstract void checkFiles();
24
}
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/ImportKeys.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
/**
 * Names of the key/value import parameters.
 */
public class ImportKeys {

	// -- import process options

	/** Key of the "multithread" parameter. */
	public static final String MULTITHREAD = "multithread";
	/** Key of the "debug" parameter. */
	public static final String DEBUG = "debug";
	/** Key of the "corpus.update" parameter. */
	public static final String UPDATECORPUS = "corpus.update";
	/** Key of the "clean.directories" parameter. */
	public static final String CLEAN = "clean.directories";

	// -- annotation and language options

	/** Key of the "annotate.model" parameter. */
	public static final String TTMODEL = "annotate.model";
	/** Key of the "annotate.run" parameter. */
	public static final String TTANNOTATE = "annotate.run";
	/** Key of the "lang" parameter. */
	public static final String LANG = "lang";

	// -- normalisation options

	/** Key of the "normalize.ana.values" parameter. */
	public static final String NORMALISEANAVALUES = "normalize.ana.values";
	/** Key of the "normalize.attribute.values" parameter. */
	public static final String NORMALISEATTRIBUTEVALUES = "normalize.attribute.values";
}
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/ImportStep.java (revision 945)
1
package org.txm.importer.xtz;
2

  
3
import java.io.File;
4
import java.util.HashMap;
5

  
6
/**
7
 * One of the step of an import module
8
 * 
9
 * @author mdecorde
10
 *
11
 */
12
public abstract class ImportStep {
13

  
14
	protected File inputDirectory, outputDirectory;
15
	protected ImportModule module;
16
	
17
	protected HashMap<String, Object> stepProperties = new HashMap<String, Object>();
18
	protected boolean isSuccessFul = false;
19
	protected String reason = "not set.";
20
	protected boolean stopAtFirstError = true;
21
	protected boolean debug = true;
22
	
23
	public ImportStep(ImportModule module) {
24
		this.module = module;
25
		debug = module.debug;
26
	}
27
	
28
	public File getInputDirectory() {
29
		return inputDirectory;
30
	}
31
	
32
	public File getOutputDirectory() {
33
		return outputDirectory;
34
	}
35
	
36
	public ImportModule getImportModule() {
37
		return module;
38
	}
39
	
40
	public boolean isSuccessFul() {
41
		return isSuccessFul;
42
	}
43
	
44
	public String getReason() {
45
		return reason;
46
	}
47
	
48
	/**
49
	 * Called when a step is interrupted to clean streams and stuff
50
	 */
51
	public abstract void cancel();
52
	
53
	public abstract void process();
54
}
tmp/org.txm.searchengine.cqp.core/src/org/txm/searchengine/cqp/corpus/Corpus.java (revision 945)
1107 1107
					this.getQualifiedCqpId(), queryResultId,
1108 1108
					query.getQueryString());
1109 1109
			queryResult = new QueryResult(queryResultId, queryResultName, this,	query);
1110
			
1110 1111
			if (save) super.addQueryLog(query.toString(), new ArrayList<String>());
1111 1112
		} catch (Exception e) {
1112 1113
			org.txm.utils.logger.Log.printStackTrace(e);
tmp/org.txm.statsengine.r.core/src/org/txm/statsengine/r/core/StartRserve.java (revision 945)
162 162
			Thread.sleep(200);
163 163
		} catch (InterruptedException ix) { }
164 164

  
165
		int attempts = 20;
165
		int attempts = 10;
166 166
		while (attempts > 0) {
167 167
			try {
168 168
				System.out.print("."); //$NON-NLS-1$
......
171 171
				return true;
172 172
			} catch (Exception e2) {
173 173
				try {
174
					Thread.sleep(2000);
174
					Thread.sleep(1500);
175 175
				} catch (InterruptedException ix) { }
176 176
			}
177 177

  

Also available in: Unified diff