Révision 945
tmp/org.txm.statsengine.r.core/src/org/txm/statsengine/r/core/StartRserve.java (revision 945) | ||
---|---|---|
162 | 162 |
Thread.sleep(200); |
163 | 163 |
} catch (InterruptedException ix) { } |
164 | 164 |
|
165 |
int attempts = 20;
|
|
165 |
int attempts = 10;
|
|
166 | 166 |
while (attempts > 0) { |
167 | 167 |
try { |
168 | 168 |
System.out.print("."); //$NON-NLS-1$ |
... | ... | |
171 | 171 |
return true; |
172 | 172 |
} catch (Exception e2) { |
173 | 173 |
try { |
174 |
Thread.sleep(2000);
|
|
174 |
Thread.sleep(1500);
|
|
175 | 175 |
} catch (InterruptedException ix) { } |
176 | 176 |
} |
177 | 177 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/discours/importer.groovy (revision 945) | ||
---|---|---|
98 | 98 |
println "Error: could not create a copy of metadata file "+csvfile.getAbsoluteFile(); |
99 | 99 |
return; |
100 | 100 |
} |
101 |
metadatas = new Metadatas(copy, Toolbox.getPreference(TBXPreferences.METADATA_ENCODING), Toolbox.getPreference(Toolbox.METADATA_COLSEPARATOR), Toolbox.getPreference(Toolbox.METADATA_TXTSEPARATOR), 1) |
|
101 |
metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(), |
|
102 |
Toolbox.getMetadataColumnSeparator(), |
|
103 |
Toolbox.getMetadataTextSeparator(), 1) |
|
102 | 104 |
} else { |
103 | 105 |
println "No metadata file: "+csvfile |
104 | 106 |
println "Aborting" |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/transcriber/transcriberLoader.groovy (revision 945) | ||
---|---|---|
122 | 122 |
println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile(); |
123 | 123 |
return; |
124 | 124 |
} |
125 |
metadatas = new Metadatas(copy, Toolbox.getPreference(Toolbox.METADATA_ENCODING), Toolbox.getPreference(Toolbox.METADATA_COLSEPARATOR), Toolbox.getPreference(Toolbox.METADATA_TXTSEPARATOR), 1) |
|
125 |
metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(), |
|
126 |
Toolbox.getMetadataColumnSeparator(), |
|
127 |
Toolbox.getMetadataTextSeparator(), 1) |
|
126 | 128 |
} |
127 | 129 |
else |
128 | 130 |
println "no metadata file: "+allmetadatasfile |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/txt/txtLoader.groovy (revision 945) | ||
---|---|---|
97 | 97 |
println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile(); |
98 | 98 |
return; |
99 | 99 |
} |
100 |
metadatas = new Metadatas(copy, Toolbox.getPreference(Toolbox.METADATA_ENCODING), Toolbox.getPreference(Toolbox.METADATA_COLSEPARATOR), Toolbox.getPreference(Toolbox.METADATA_TXTSEPARATOR), 1) |
|
100 |
metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(), |
|
101 |
Toolbox.getMetadataColumnSeparator(), |
|
102 |
Toolbox.getMetadataTextSeparator(), 1) |
|
101 | 103 |
} else { |
102 | 104 |
println "No metadata file: "+allmetadatasfile |
103 | 105 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/Annotater.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
|
|
5 |
|
|
6 |
/** |
|
7 |
* |
|
8 |
* Takes the XML-TXM files and wrap a TAL Tool to update the XML-TXM files |
|
9 |
* |
|
10 |
* @author mdecorde |
|
11 |
* |
|
12 |
*/ |
|
13 |
public abstract class Annotater extends ImportStep { |
|
14 |
|
|
15 |
public Annotater(ImportModule module) { |
|
16 |
super(module); |
|
17 |
|
|
18 |
inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.corpusName); |
|
19 |
outputDirectory = new File(module.getBinaryDirectory(), "txm"); |
|
20 |
} |
|
21 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/Pager.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.util.ArrayList; |
|
5 |
|
|
6 |
import org.txm.utils.DeleteDir; |
|
7 |
|
|
8 |
/** |
|
9 |
* Takes the XML-TXM files and build an edition |
|
10 |
* |
|
11 |
* @author mdecorde |
|
12 |
* |
|
13 |
*/ |
|
14 |
public class Pager extends ImportStep { |
|
15 |
|
|
16 |
protected File htmlDirectory; |
|
17 |
protected String corpusname; |
|
18 |
protected ArrayList<File> files; |
|
19 |
|
|
20 |
public Pager(ImportModule module, String editionName) { |
|
21 |
super(module); |
|
22 |
|
|
23 |
corpusname = module.getCorpusName(); |
|
24 |
|
|
25 |
inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName()); |
|
26 |
htmlDirectory = new File(module.getBinaryDirectory(), "HTML/"+corpusname); |
|
27 |
outputDirectory = new File(htmlDirectory, editionName); |
|
28 |
|
|
29 |
if (!module.isUpdatingCorpus()) { |
|
30 |
DeleteDir.deleteDirectory(outputDirectory); |
|
31 |
outputDirectory.mkdirs(); |
|
32 |
} |
|
33 |
} |
|
34 |
|
|
35 |
@Override |
|
36 |
public void cancel() { |
|
37 |
// TODO Auto-generated method stub |
|
38 |
|
|
39 |
} |
|
40 |
|
|
41 |
@Override |
|
42 |
public void process() { |
|
43 |
process(null); // no default files order set |
|
44 |
} |
|
45 |
|
|
46 |
public void process(ArrayList<File> files) { |
|
47 |
this.files = files; |
|
48 |
} |
|
49 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/Importer.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
|
|
5 |
/** |
|
6 |
* Takes any form of source files |
|
7 |
* |
|
8 |
* After this step, the XML-TXM files are created. |
|
9 |
* |
|
10 |
* they are validated before continuing |
|
11 |
* @author mdecorde |
|
12 |
* |
|
13 |
*/ |
|
14 |
public abstract class Importer extends ImportStep { |
|
15 |
|
|
16 |
public Importer(ImportModule module) { |
|
17 |
super(module); |
|
18 |
inputDirectory = module.getSourceDirectory(); |
|
19 |
outputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName()); |
|
20 |
outputDirectory.mkdirs(); |
|
21 |
} |
|
22 |
|
|
23 |
public abstract void checkFiles(); |
|
24 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/ImportKeys.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
/**
 * Names of the key/value import parameters shared by the XTZ import classes.
 */
public class ImportKeys {

	// --- keys read by ImportModule.init() ---

	/** When the stored value is "true", compiler and pager steps run in parallel. */
	public static final String MULTITHREAD = "multithread";
	/** When the stored value is "true", an existing corpus is updated instead of rebuilt. */
	public static final String UPDATECORPUS = "corpus.update";
	/** Debug switch; its lookup in ImportModule.init() is currently commented out. */
	public static final String DEBUG = "debug";

	// --- keys not read in the classes visible here ---
	// NOTE(review): meanings below are inferred from the key names only - confirm.

	public static final String CLEAN = "clean.directories";
	public static final String TTMODEL = "annotate.model";
	public static final String TTANNOTATE = "annotate.run";
	public static final String LANG = "lang";

	public static final String NORMALISEANAVALUES = "normalize.ana.values";
	public static final String NORMALISEATTRIBUTEVALUES = "normalize.attribute.values";
}
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/ImportStep.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.util.HashMap; |
|
5 |
|
|
6 |
/** |
|
7 |
* One of the step of an import module |
|
8 |
* |
|
9 |
* @author mdecorde |
|
10 |
* |
|
11 |
*/ |
|
12 |
public abstract class ImportStep { |
|
13 |
|
|
14 |
protected File inputDirectory, outputDirectory; |
|
15 |
protected ImportModule module; |
|
16 |
|
|
17 |
protected HashMap<String, Object> stepProperties = new HashMap<String, Object>(); |
|
18 |
protected boolean isSuccessFul = false; |
|
19 |
protected String reason = "not set."; |
|
20 |
protected boolean stopAtFirstError = true; |
|
21 |
protected boolean debug = true; |
|
22 |
|
|
23 |
public ImportStep(ImportModule module) { |
|
24 |
this.module = module; |
|
25 |
debug = module.debug; |
|
26 |
} |
|
27 |
|
|
28 |
public File getInputDirectory() { |
|
29 |
return inputDirectory; |
|
30 |
} |
|
31 |
|
|
32 |
public File getOutputDirectory() { |
|
33 |
return outputDirectory; |
|
34 |
} |
|
35 |
|
|
36 |
public ImportModule getImportModule() { |
|
37 |
return module; |
|
38 |
} |
|
39 |
|
|
40 |
public boolean isSuccessFul() { |
|
41 |
return isSuccessFul; |
|
42 |
} |
|
43 |
|
|
44 |
public String getReason() { |
|
45 |
return reason; |
|
46 |
} |
|
47 |
|
|
48 |
/** |
|
49 |
* Called when a step is interrupted to clean streams and stuff |
|
50 |
*/ |
|
51 |
public abstract void cancel(); |
|
52 |
|
|
53 |
public abstract void process(); |
|
54 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/Compiler.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.util.ArrayList; |
|
5 |
|
|
6 |
import org.txm.utils.DeleteDir; |
|
7 |
|
|
8 |
/** |
|
9 |
* Takes XML-TXM files, build the CQP files and call cwb utils |
|
10 |
* |
|
11 |
* @author mdecorde |
|
12 |
* |
|
13 |
*/ |
|
14 |
public class Compiler extends ImportStep { |
|
15 |
|
|
16 |
protected File cqpDirectory, registryDirectory, dataDirectory; |
|
17 |
protected ArrayList<File> files; |
|
18 |
|
|
19 |
/** |
|
20 |
* Creates the output directories |
|
21 |
* |
|
22 |
* @param module |
|
23 |
*/ |
|
24 |
public Compiler(ImportModule module) { |
|
25 |
super(module); |
|
26 |
|
|
27 |
inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName()); |
|
28 |
cqpDirectory = new File(module.getBinaryDirectory(), "cqp"); |
|
29 |
outputDirectory = new File(module.getBinaryDirectory(), "data"); |
|
30 |
registryDirectory = new File(module.getBinaryDirectory(), "registry"); |
|
31 |
dataDirectory = new File(outputDirectory, module.getCorpusName()); |
|
32 |
|
|
33 |
DeleteDir.deleteDirectory(outputDirectory); |
|
34 |
outputDirectory.mkdirs(); |
|
35 |
|
|
36 |
DeleteDir.deleteDirectory(dataDirectory); |
|
37 |
dataDirectory.mkdirs(); |
|
38 |
|
|
39 |
DeleteDir.deleteDirectory(registryDirectory); |
|
40 |
registryDirectory.mkdirs(); |
|
41 |
|
|
42 |
if (!module.isUpdatingCorpus()) { |
|
43 |
DeleteDir.deleteDirectory(cqpDirectory); |
|
44 |
cqpDirectory.mkdir(); |
|
45 |
} |
|
46 |
} |
|
47 |
|
|
48 |
@Override |
|
49 |
public void cancel() { |
|
50 |
// TODO Auto-generated method stub |
|
51 |
} |
|
52 |
|
|
53 |
@Override |
|
54 |
public void process() { |
|
55 |
process(null); // no default files order set |
|
56 |
} |
|
57 |
|
|
58 |
public void process(ArrayList<File> files) { |
|
59 |
this.files = files; |
|
60 |
} |
|
61 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/Step.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
/**
 * Placeholder step with no state; {@link #process()} always reports success.
 */
public class Step {

	/** Creates a step; there is nothing to initialise. */
	public Step() {
	}

	/**
	 * @return always {@code true}
	 */
	public boolean process() {
		final boolean succeeded = true;
		return succeeded;
	}
}
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/ImportModule.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.io.FileFilter; |
|
5 |
import java.util.ArrayList; |
|
6 |
import java.util.Arrays; |
|
7 |
import java.util.Collections; |
|
8 |
import java.util.logging.Level; |
|
9 |
|
|
10 |
import org.txm.Toolbox; |
|
11 |
import org.txm.core.preferences.TBXPreferences; |
|
12 |
import org.eclipse.core.runtime.IProgressMonitor; |
|
13 |
import org.txm.utils.xml.DomUtils; |
|
14 |
import org.txm.objects.BaseParameters; |
|
15 |
import org.txm.utils.DeleteDir; |
|
16 |
import org.txm.utils.logger.Log; |
|
17 |
|
|
18 |
public class ImportModule { |
|
19 |
|
|
20 |
public BaseParameters importParameters; |
|
21 |
|
|
22 |
public String corpusVersionProduced; |
|
23 |
|
|
24 |
public File sourceDirectory; |
|
25 |
public File binaryDirectory; |
|
26 |
|
|
27 |
public Importer importer; |
|
28 |
public Annotater annotater; |
|
29 |
public Compiler compiler; |
|
30 |
public Pager pager; |
|
31 |
|
|
32 |
/** |
|
33 |
* set the variable to false to stop the import process at next step |
|
34 |
*/ |
|
35 |
public boolean isSuccessful = true; |
|
36 |
public String reason = "none"; |
|
37 |
public boolean debug = false; |
|
38 |
public boolean multithread = false; |
|
39 |
public boolean updateCorpus = false; |
|
40 |
public String corpusName; |
|
41 |
|
|
42 |
IProgressMonitor monitor; |
|
43 |
|
|
44 |
public void setMonitor(IProgressMonitor monitor) { |
|
45 |
this.monitor = monitor; |
|
46 |
} |
|
47 |
|
|
48 |
|
|
49 |
public boolean isMultiThread() { |
|
50 |
return multithread; |
|
51 |
} |
|
52 |
|
|
53 |
public boolean isDebugging() { |
|
54 |
return debug; |
|
55 |
} |
|
56 |
|
|
57 |
public ImportModule(File importParametersFile) { |
|
58 |
try { |
|
59 |
BaseParameters b = new BaseParameters(importParametersFile); |
|
60 |
init(b); |
|
61 |
} catch (Exception e) { |
|
62 |
e.printStackTrace(); |
|
63 |
} |
|
64 |
} |
|
65 |
|
|
66 |
public ImportModule(BaseParameters p) { |
|
67 |
init(p); |
|
68 |
} |
|
69 |
|
|
70 |
public boolean isUpdatingCorpus() { |
|
71 |
return updateCorpus; |
|
72 |
} |
|
73 |
|
|
74 |
protected void init(BaseParameters p) { |
|
75 |
this.importParameters = p; |
|
76 |
this.importParameters.load(); |
|
77 |
corpusName = importParameters.name; |
|
78 |
//this.debug = "true".equals(importParameters.getKeyValueParameters().get(ImportKeys.DEBUG)); |
|
79 |
|
|
80 |
if (Log.getLevel().intValue() < Level.WARNING.intValue()) { |
|
81 |
debug = true; |
|
82 |
} |
|
83 |
this.multithread = "true".equals(importParameters.getKeyValueParameters().get(ImportKeys.MULTITHREAD)); |
|
84 |
this.updateCorpus = "true".equals(importParameters.getKeyValueParameters().get(ImportKeys.UPDATECORPUS)); |
|
85 |
|
|
86 |
|
|
87 |
this.sourceDirectory = importParameters.paramFile.getParentFile(); |
|
88 |
this.binaryDirectory = new File(Toolbox.getTxmHomePath(), "corpora/"+corpusName.toUpperCase()); |
|
89 |
|
|
90 |
if (!updateCorpus) { // clean directories only if it's a new import |
|
91 |
DeleteDir.deleteDirectory(binaryDirectory); |
|
92 |
binaryDirectory.mkdir(); |
|
93 |
|
|
94 |
File txmDir = new File(binaryDirectory, "txm"); |
|
95 |
txmDir.mkdir(); |
|
96 |
} |
|
97 |
} |
|
98 |
|
|
99 |
public void start() throws InterruptedException { |
|
100 |
|
|
101 |
binaryDirectory.mkdirs(); // ensure output exists |
|
102 |
//System.out.println("ImportModule.start"); |
|
103 |
if (!updateCorpus) { // create XML-TXM files and annotate |
|
104 |
//System.out.println("ImportModule.start: not updating"); |
|
105 |
if (importer != null) { |
|
106 |
//System.out.println("ImportModule.start: importer: "+importer); |
|
107 |
if (monitor != null) System.out.println("-- IMPORTER - Reading source files"); |
|
108 |
importer.process(); |
|
109 |
//importer.checkFiles(); |
|
110 |
isSuccessful = isSuccessful & importer.isSuccessFul(); |
|
111 |
if (!isSuccessful) { |
|
112 |
System.out.println("Error while importing corpus during 'importer' step, reason="+importer.getReason()); |
|
113 |
return; |
|
114 |
} |
|
115 |
} else { |
|
116 |
System.out.println("XML-TXM files already produced in "+new File(binaryDirectory, "txm/"+corpusName)); |
|
117 |
} |
|
118 |
|
|
119 |
boolean annotate = "true".equals(importParameters.getCorpusElement().getAttribute("annotate")); |
|
120 |
if (annotate && annotater != null) { |
|
121 |
if (monitor != null) System.out.println("-- ANNOTATE - Running NLP tools"); |
|
122 |
annotater.process(); |
|
123 |
isSuccessful = isSuccessful & annotater.isSuccessFul(); |
|
124 |
if (!isSuccessful) { |
|
125 |
System.out.println("Error while importing corpus during 'annotate' step, reason="+annotater.getReason()); |
|
126 |
return; |
|
127 |
} |
|
128 |
} else { |
|
129 |
//System.out.println("XML-TXM files already annotated."); |
|
130 |
} |
|
131 |
} else { |
|
132 |
System.out.println("Updating corpus..."); |
|
133 |
} |
|
134 |
|
|
135 |
//System.out.println("GET FILES ORDER"); |
|
136 |
final ArrayList<File> files = getTXMFilesOrder(); |
|
137 |
|
|
138 |
Thread Tcompiler = new Thread() { |
|
139 |
public void run() { |
|
140 |
if (compiler != null) { |
|
141 |
if (monitor != null) System.out.println("-- COMPILING - Building Search Engine indexes"); |
|
142 |
compiler.process(files); |
|
143 |
isSuccessful = isSuccessful & compiler.isSuccessFul(); |
|
144 |
if (!isSuccessful) { |
|
145 |
System.out.println("Error while importing corpus during 'compiler' step, reason="+compiler.getReason()); |
|
146 |
return; |
|
147 |
} |
|
148 |
} else { |
|
149 |
System.out.println("No CQP index created."); |
|
150 |
} |
|
151 |
} |
|
152 |
}; |
|
153 |
|
|
154 |
Thread Tpager = new Thread() { |
|
155 |
public void run() { |
|
156 |
|
|
157 |
if (pager != null) { |
|
158 |
if (monitor != null) System.out.println("-- EDITION - Building edition"); |
|
159 |
pager.process(files); |
|
160 |
isSuccessful = isSuccessful & pager.isSuccessFul(); |
|
161 |
if (!isSuccessful) { |
|
162 |
System.out.println("Error while importing corpus during 'pager' step, reason="+pager.getReason()); |
|
163 |
return; |
|
164 |
} |
|
165 |
} else { |
|
166 |
System.out.println("No edition produced."); |
|
167 |
} |
|
168 |
} |
|
169 |
}; |
|
170 |
|
|
171 |
Tcompiler.start(); |
|
172 |
if (!multithread) { |
|
173 |
Tcompiler.join(); // wait for the end if not multithreaded |
|
174 |
if (!isSuccessful) { // don't call pager is compiler step failed |
|
175 |
return; |
|
176 |
} |
|
177 |
} |
|
178 |
|
|
179 |
Tpager.start(); |
|
180 |
if (multithread) Tcompiler.join(); // wait for both thread to end |
|
181 |
Tpager.join(); |
|
182 |
} |
|
183 |
|
|
184 |
protected ArrayList<File> getTXMFilesOrder() { |
|
185 |
//System.out.println("DEFAULT FILES ORDER"); |
|
186 |
File txmDirectory = new File(binaryDirectory, "txm/"+corpusName); |
|
187 |
ArrayList<File> files = new ArrayList<File>(Arrays.asList(txmDirectory.listFiles(new FileFilter() { |
|
188 |
@Override |
|
189 |
public boolean accept(File file) { |
|
190 |
return file.isFile() && file.getName().endsWith(".xml"); |
|
191 |
} |
|
192 |
}))); |
|
193 |
|
|
194 |
Collections.sort(files); |
|
195 |
|
|
196 |
return files; |
|
197 |
} |
|
198 |
|
|
199 |
|
|
200 |
public void end() { |
|
201 |
File paramFile = new File(binaryDirectory, "import.xml"); |
|
202 |
try { |
|
203 |
DomUtils.save(importParameters.root.getOwnerDocument(), paramFile); |
|
204 |
isSuccessful = true; |
|
205 |
} catch (Exception e) { |
|
206 |
// TODO Auto-generated catch block |
|
207 |
e.printStackTrace(); |
|
208 |
isSuccessful = false; |
|
209 |
} |
|
210 |
} |
|
211 |
|
|
212 |
public String getCorpusName() { |
|
213 |
return corpusName; |
|
214 |
} |
|
215 |
|
|
216 |
public String getReason() { |
|
217 |
return reason; |
|
218 |
} |
|
219 |
|
|
220 |
public boolean isSuccessFul() { |
|
221 |
return isSuccessful; |
|
222 |
} |
|
223 |
|
|
224 |
public BaseParameters getParameters() { |
|
225 |
return importParameters; |
|
226 |
} |
|
227 |
|
|
228 |
public File getSourceDirectory() { |
|
229 |
return sourceDirectory; |
|
230 |
} |
|
231 |
|
|
232 |
public File getBinaryDirectory() { |
|
233 |
return binaryDirectory; |
|
234 |
} |
|
235 |
|
|
236 |
public void process() throws InterruptedException { |
|
237 |
start(); |
|
238 |
if (isSuccessful) |
|
239 |
end(); |
|
240 |
} |
|
241 |
|
|
242 |
public static void main(String[] args) { |
|
243 |
File importParametersFile = new File("/home/mdecorde/xml/brown/import.xml"); |
|
244 |
|
|
245 |
ImportModule module = new ImportModule(importParametersFile); |
|
246 |
System.out.println("Parameters: "+module.getParameters()); |
|
247 |
try { |
|
248 |
module.start(); |
|
249 |
|
|
250 |
if (module.isSuccessful) { |
|
251 |
System.out.println("Import sucessful. reloading corpora..."); |
|
252 |
} else { |
|
253 |
System.out.println("Import failed, reason = "+module.getReason()); |
|
254 |
} |
|
255 |
} catch (Exception e) { |
|
256 |
e.printStackTrace(); |
|
257 |
} |
|
258 |
} |
|
259 |
} |
tmp/org.txm.core/src/java/org/txm/metadatas/Metadatas.java (revision 945) | ||
---|---|---|
235 | 235 |
*/ |
236 | 236 |
public static boolean convertCsvToXml(File csvfile, File xmlFile, String encoding, String separator, String txtseparator, int nbheaderline) throws Exception |
237 | 237 |
{ |
238 |
if (separator == null || separator.length() == 0) { |
|
239 |
separator = "\t"; |
|
240 |
} |
|
241 |
if (encoding == null || encoding.length() == 0) { |
|
242 |
encoding = "UTF-8"; |
|
243 |
} |
|
238 | 244 |
xmlFile.createNewFile(); |
239 | 245 |
|
240 | 246 |
if(!csvfile.exists()) |
tmp/org.txm.core/src/java/org/txm/Toolbox.java (revision 945) | ||
---|---|---|
84 | 84 |
private static boolean state = false; |
85 | 85 |
|
86 | 86 |
public static Workspace workspace; |
87 |
|
|
88 | 87 |
|
89 | 88 |
/** |
90 | 89 |
* |
tmp/org.txm.core/src/java/org/txm/importer/xtz/ImportKeys.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
/**
 * Names of the key/value import parameters shared by the XTZ import classes.
 */
public class ImportKeys {

	// --- keys read by ImportModule.init() ---

	/** When the stored value is "true", compiler and pager steps run in parallel. */
	public static final String MULTITHREAD = "multithread";
	/** When the stored value is "true", an existing corpus is updated instead of rebuilt. */
	public static final String UPDATECORPUS = "corpus.update";
	/** Debug switch; its lookup in ImportModule.init() is currently commented out. */
	public static final String DEBUG = "debug";

	// --- keys not read in the classes visible here ---
	// NOTE(review): meanings below are inferred from the key names only - confirm.

	public static final String CLEAN = "clean.directories";
	public static final String TTMODEL = "annotate.model";
	public static final String TTANNOTATE = "annotate.run";
	public static final String LANG = "lang";

	public static final String NORMALISEANAVALUES = "normalize.ana.values";
	public static final String NORMALISEATTRIBUTEVALUES = "normalize.attribute.values";
}
|
0 | 17 |
tmp/org.txm.core/src/java/org/txm/importer/xtz/ImportStep.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.util.HashMap; |
|
5 |
|
|
6 |
/** |
|
7 |
* One of the step of an import module |
|
8 |
* |
|
9 |
* @author mdecorde |
|
10 |
* |
|
11 |
*/ |
|
12 |
public abstract class ImportStep { |
|
13 |
|
|
14 |
protected File inputDirectory, outputDirectory; |
|
15 |
protected ImportModule module; |
|
16 |
|
|
17 |
protected HashMap<String, Object> stepProperties = new HashMap<String, Object>(); |
|
18 |
protected boolean isSuccessFul = false; |
|
19 |
protected String reason = "not set."; |
|
20 |
protected boolean stopAtFirstError = true; |
|
21 |
protected boolean debug = true; |
|
22 |
|
|
23 |
public ImportStep(ImportModule module) { |
|
24 |
this.module = module; |
|
25 |
debug = module.debug; |
|
26 |
} |
|
27 |
|
|
28 |
public File getInputDirectory() { |
|
29 |
return inputDirectory; |
|
30 |
} |
|
31 |
|
|
32 |
public File getOutputDirectory() { |
|
33 |
return outputDirectory; |
|
34 |
} |
|
35 |
|
|
36 |
public ImportModule getImportModule() { |
|
37 |
return module; |
|
38 |
} |
|
39 |
|
|
40 |
public boolean isSuccessFul() { |
|
41 |
return isSuccessFul; |
|
42 |
} |
|
43 |
|
|
44 |
public String getReason() { |
|
45 |
return reason; |
|
46 |
} |
|
47 |
|
|
48 |
/** |
|
49 |
* Called when a step is interrupted to clean streams and stuff |
|
50 |
*/ |
|
51 |
public abstract void cancel(); |
|
52 |
|
|
53 |
public abstract void process(); |
|
54 |
} |
|
0 | 55 |
tmp/org.txm.core/src/java/org/txm/importer/xtz/Compiler.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.util.ArrayList; |
|
5 |
|
|
6 |
import org.txm.utils.DeleteDir; |
|
7 |
|
|
8 |
/** |
|
9 |
* Takes XML-TXM files, build the CQP files and call cwb utils |
|
10 |
* |
|
11 |
* @author mdecorde |
|
12 |
* |
|
13 |
*/ |
|
14 |
public class Compiler extends ImportStep { |
|
15 |
|
|
16 |
protected File cqpDirectory, registryDirectory, dataDirectory; |
|
17 |
protected ArrayList<File> files; |
|
18 |
|
|
19 |
/** |
|
20 |
* Creates the output directories |
|
21 |
* |
|
22 |
* @param module |
|
23 |
*/ |
|
24 |
public Compiler(ImportModule module) { |
|
25 |
super(module); |
|
26 |
|
|
27 |
inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName()); |
|
28 |
cqpDirectory = new File(module.getBinaryDirectory(), "cqp"); |
|
29 |
outputDirectory = new File(module.getBinaryDirectory(), "data"); |
|
30 |
registryDirectory = new File(module.getBinaryDirectory(), "registry"); |
|
31 |
dataDirectory = new File(outputDirectory, module.getCorpusName()); |
|
32 |
|
|
33 |
DeleteDir.deleteDirectory(outputDirectory); |
|
34 |
outputDirectory.mkdirs(); |
|
35 |
|
|
36 |
DeleteDir.deleteDirectory(dataDirectory); |
|
37 |
dataDirectory.mkdirs(); |
|
38 |
|
|
39 |
DeleteDir.deleteDirectory(registryDirectory); |
|
40 |
registryDirectory.mkdirs(); |
|
41 |
|
|
42 |
if (!module.isUpdatingCorpus()) { |
|
43 |
DeleteDir.deleteDirectory(cqpDirectory); |
|
44 |
cqpDirectory.mkdir(); |
|
45 |
} |
|
46 |
} |
|
47 |
|
|
48 |
@Override |
|
49 |
public void cancel() { |
|
50 |
// TODO Auto-generated method stub |
|
51 |
} |
|
52 |
|
|
53 |
@Override |
|
54 |
public void process() { |
|
55 |
process(null); // no default files order set |
|
56 |
} |
|
57 |
|
|
58 |
public void process(ArrayList<File> files) { |
|
59 |
this.files = files; |
|
60 |
} |
|
61 |
} |
|
0 | 62 |
tmp/org.txm.core/src/java/org/txm/importer/xtz/Annotater.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
|
|
5 |
|
|
6 |
/** |
|
7 |
* |
|
8 |
* Takes the XML-TXM files and wrap a TAL Tool to update the XML-TXM files |
|
9 |
* |
|
10 |
* @author mdecorde |
|
11 |
* |
|
12 |
*/ |
|
13 |
public abstract class Annotater extends ImportStep { |
|
14 |
|
|
15 |
public Annotater(ImportModule module) { |
|
16 |
super(module); |
|
17 |
|
|
18 |
inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.corpusName); |
|
19 |
outputDirectory = new File(module.getBinaryDirectory(), "txm"); |
|
20 |
} |
|
21 |
} |
|
0 | 22 |
tmp/org.txm.core/src/java/org/txm/importer/xtz/Step.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
/**
 * Placeholder step with no state; {@link #process()} always reports success.
 */
public class Step {

	/** Creates a step; there is nothing to initialise. */
	public Step() {
	}

	/**
	 * @return always {@code true}
	 */
	public boolean process() {
		final boolean succeeded = true;
		return succeeded;
	}
}
|
0 | 13 |
tmp/org.txm.core/src/java/org/txm/importer/xtz/Pager.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.util.ArrayList; |
|
5 |
|
|
6 |
import org.txm.utils.DeleteDir; |
|
7 |
|
|
8 |
/** |
|
9 |
* Takes the XML-TXM files and build an edition |
|
10 |
* |
|
11 |
* @author mdecorde |
|
12 |
* |
|
13 |
*/ |
|
14 |
public class Pager extends ImportStep { |
|
15 |
|
|
16 |
protected File htmlDirectory; |
|
17 |
protected String corpusname; |
|
18 |
protected ArrayList<File> files; |
|
19 |
|
|
20 |
public Pager(ImportModule module, String editionName) { |
|
21 |
super(module); |
|
22 |
|
|
23 |
corpusname = module.getCorpusName(); |
|
24 |
|
|
25 |
inputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName()); |
|
26 |
htmlDirectory = new File(module.getBinaryDirectory(), "HTML/"+corpusname); |
|
27 |
outputDirectory = new File(htmlDirectory, editionName); |
|
28 |
|
|
29 |
if (!module.isUpdatingCorpus()) { |
|
30 |
DeleteDir.deleteDirectory(outputDirectory); |
|
31 |
outputDirectory.mkdirs(); |
|
32 |
} |
|
33 |
} |
|
34 |
|
|
35 |
@Override |
|
36 |
public void cancel() { |
|
37 |
// TODO Auto-generated method stub |
|
38 |
|
|
39 |
} |
|
40 |
|
|
41 |
@Override |
|
42 |
public void process() { |
|
43 |
process(null); // no default files order set |
|
44 |
} |
|
45 |
|
|
46 |
public void process(ArrayList<File> files) { |
|
47 |
this.files = files; |
|
48 |
} |
|
49 |
} |
|
0 | 50 |
tmp/org.txm.core/src/java/org/txm/importer/xtz/ImportModule.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.io.FileFilter; |
|
5 |
import java.util.ArrayList; |
|
6 |
import java.util.Arrays; |
|
7 |
import java.util.Collections; |
|
8 |
import java.util.logging.Level; |
|
9 |
|
|
10 |
import org.txm.Toolbox; |
|
11 |
import org.txm.core.preferences.TBXPreferences; |
|
12 |
import org.eclipse.core.runtime.IProgressMonitor; |
|
13 |
import org.txm.utils.xml.DomUtils; |
|
14 |
import org.txm.objects.BaseParameters; |
|
15 |
import org.txm.utils.DeleteDir; |
|
16 |
import org.txm.utils.logger.Log; |
|
17 |
|
|
18 |
public class ImportModule { |
|
19 |
|
|
20 |
public BaseParameters importParameters; |
|
21 |
|
|
22 |
public String corpusVersionProduced; |
|
23 |
|
|
24 |
public File sourceDirectory; |
|
25 |
public File binaryDirectory; |
|
26 |
|
|
27 |
public Importer importer; |
|
28 |
public Annotater annotater; |
|
29 |
public Compiler compiler; |
|
30 |
public Pager pager; |
|
31 |
|
|
32 |
/** |
|
33 |
* set the variable to false to stop the import process at next step |
|
34 |
*/ |
|
35 |
public boolean isSuccessful = true; |
|
36 |
public String reason = "none"; |
|
37 |
public boolean debug = false; |
|
38 |
public boolean multithread = false; |
|
39 |
public boolean updateCorpus = false; |
|
40 |
public String corpusName; |
|
41 |
|
|
42 |
IProgressMonitor monitor; |
|
43 |
|
|
44 |
public void setMonitor(IProgressMonitor monitor) { |
|
45 |
this.monitor = monitor; |
|
46 |
} |
|
47 |
|
|
48 |
|
|
49 |
public boolean isMultiThread() { |
|
50 |
return multithread; |
|
51 |
} |
|
52 |
|
|
53 |
public boolean isDebugging() { |
|
54 |
return debug; |
|
55 |
} |
|
56 |
|
|
57 |
public ImportModule(File importParametersFile) { |
|
58 |
try { |
|
59 |
BaseParameters b = new BaseParameters(importParametersFile); |
|
60 |
init(b); |
|
61 |
} catch (Exception e) { |
|
62 |
e.printStackTrace(); |
|
63 |
} |
|
64 |
} |
|
65 |
|
|
66 |
public ImportModule(BaseParameters p) { |
|
67 |
init(p); |
|
68 |
} |
|
69 |
|
|
70 |
public boolean isUpdatingCorpus() { |
|
71 |
return updateCorpus; |
|
72 |
} |
|
73 |
|
|
74 |
protected void init(BaseParameters p) { |
|
75 |
this.importParameters = p; |
|
76 |
this.importParameters.load(); |
|
77 |
corpusName = importParameters.name; |
|
78 |
//this.debug = "true".equals(importParameters.getKeyValueParameters().get(ImportKeys.DEBUG)); |
|
79 |
|
|
80 |
if (Log.getLevel().intValue() < Level.WARNING.intValue()) { |
|
81 |
debug = true; |
|
82 |
} |
|
83 |
this.multithread = "true".equals(importParameters.getKeyValueParameters().get(ImportKeys.MULTITHREAD)); |
|
84 |
this.updateCorpus = "true".equals(importParameters.getKeyValueParameters().get(ImportKeys.UPDATECORPUS)); |
|
85 |
|
|
86 |
|
|
87 |
this.sourceDirectory = importParameters.paramFile.getParentFile(); |
|
88 |
this.binaryDirectory = new File(Toolbox.getTxmHomePath(), "corpora/"+corpusName.toUpperCase()); |
|
89 |
|
|
90 |
if (!updateCorpus) { // clean directories only if it's a new import |
|
91 |
DeleteDir.deleteDirectory(binaryDirectory); |
|
92 |
binaryDirectory.mkdir(); |
|
93 |
|
|
94 |
File txmDir = new File(binaryDirectory, "txm"); |
|
95 |
txmDir.mkdir(); |
|
96 |
} |
|
97 |
} |
|
98 |
|
|
99 |
public void start() throws InterruptedException { |
|
100 |
|
|
101 |
binaryDirectory.mkdirs(); // ensure output exists |
|
102 |
//System.out.println("ImportModule.start"); |
|
103 |
if (!updateCorpus) { // create XML-TXM files and annotate |
|
104 |
//System.out.println("ImportModule.start: not updating"); |
|
105 |
if (importer != null) { |
|
106 |
//System.out.println("ImportModule.start: importer: "+importer); |
|
107 |
if (monitor != null) System.out.println("-- IMPORTER - Reading source files"); |
|
108 |
importer.process(); |
|
109 |
//importer.checkFiles(); |
|
110 |
isSuccessful = isSuccessful & importer.isSuccessFul(); |
|
111 |
if (!isSuccessful) { |
|
112 |
System.out.println("Error while importing corpus during 'importer' step, reason="+importer.getReason()); |
|
113 |
return; |
|
114 |
} |
|
115 |
} else { |
|
116 |
System.out.println("XML-TXM files already produced in "+new File(binaryDirectory, "txm/"+corpusName)); |
|
117 |
} |
|
118 |
|
|
119 |
boolean annotate = "true".equals(importParameters.getCorpusElement().getAttribute("annotate")); |
|
120 |
if (annotate && annotater != null) { |
|
121 |
if (monitor != null) System.out.println("-- ANNOTATE - Running NLP tools"); |
|
122 |
annotater.process(); |
|
123 |
isSuccessful = isSuccessful & annotater.isSuccessFul(); |
|
124 |
if (!isSuccessful) { |
|
125 |
System.out.println("Error while importing corpus during 'annotate' step, reason="+annotater.getReason()); |
|
126 |
return; |
|
127 |
} |
|
128 |
} else { |
|
129 |
//System.out.println("XML-TXM files already annotated."); |
|
130 |
} |
|
131 |
} else { |
|
132 |
System.out.println("Updating corpus..."); |
|
133 |
} |
|
134 |
|
|
135 |
//System.out.println("GET FILES ORDER"); |
|
136 |
final ArrayList<File> files = getTXMFilesOrder(); |
|
137 |
|
|
138 |
Thread Tcompiler = new Thread() { |
|
139 |
public void run() { |
|
140 |
if (compiler != null) { |
|
141 |
if (monitor != null) System.out.println("-- COMPILING - Building Search Engine indexes"); |
|
142 |
compiler.process(files); |
|
143 |
isSuccessful = isSuccessful & compiler.isSuccessFul(); |
|
144 |
if (!isSuccessful) { |
|
145 |
System.out.println("Error while importing corpus during 'compiler' step, reason="+compiler.getReason()); |
|
146 |
return; |
|
147 |
} |
|
148 |
} else { |
|
149 |
System.out.println("No CQP index created."); |
|
150 |
} |
|
151 |
} |
|
152 |
}; |
|
153 |
|
|
154 |
Thread Tpager = new Thread() { |
|
155 |
public void run() { |
|
156 |
|
|
157 |
if (pager != null) { |
|
158 |
if (monitor != null) System.out.println("-- EDITION - Building edition"); |
|
159 |
pager.process(files); |
|
160 |
isSuccessful = isSuccessful & pager.isSuccessFul(); |
|
161 |
if (!isSuccessful) { |
|
162 |
System.out.println("Error while importing corpus during 'pager' step, reason="+pager.getReason()); |
|
163 |
return; |
|
164 |
} |
|
165 |
} else { |
|
166 |
System.out.println("No edition produced."); |
|
167 |
} |
|
168 |
} |
|
169 |
}; |
|
170 |
|
|
171 |
Tcompiler.start(); |
|
172 |
if (!multithread) { |
|
173 |
Tcompiler.join(); // wait for the end if not multithreaded |
|
174 |
if (!isSuccessful) { // don't call pager is compiler step failed |
|
175 |
return; |
|
176 |
} |
|
177 |
} |
|
178 |
|
|
179 |
Tpager.start(); |
|
180 |
if (multithread) Tcompiler.join(); // wait for both thread to end |
|
181 |
Tpager.join(); |
|
182 |
} |
|
183 |
|
|
184 |
protected ArrayList<File> getTXMFilesOrder() { |
|
185 |
//System.out.println("DEFAULT FILES ORDER"); |
|
186 |
File txmDirectory = new File(binaryDirectory, "txm/"+corpusName); |
|
187 |
ArrayList<File> files = new ArrayList<File>(Arrays.asList(txmDirectory.listFiles(new FileFilter() { |
|
188 |
@Override |
|
189 |
public boolean accept(File file) { |
|
190 |
return file.isFile() && file.getName().endsWith(".xml"); |
|
191 |
} |
|
192 |
}))); |
|
193 |
|
|
194 |
Collections.sort(files); |
|
195 |
|
|
196 |
return files; |
|
197 |
} |
|
198 |
|
|
199 |
|
|
200 |
public void end() { |
|
201 |
File paramFile = new File(binaryDirectory, "import.xml"); |
|
202 |
try { |
|
203 |
DomUtils.save(importParameters.root.getOwnerDocument(), paramFile); |
|
204 |
isSuccessful = true; |
|
205 |
} catch (Exception e) { |
|
206 |
// TODO Auto-generated catch block |
|
207 |
e.printStackTrace(); |
|
208 |
isSuccessful = false; |
|
209 |
} |
|
210 |
} |
|
211 |
|
|
212 |
public String getCorpusName() { |
|
213 |
return corpusName; |
|
214 |
} |
|
215 |
|
|
216 |
public String getReason() { |
|
217 |
return reason; |
|
218 |
} |
|
219 |
|
|
220 |
public boolean isSuccessFul() { |
|
221 |
return isSuccessful; |
|
222 |
} |
|
223 |
|
|
224 |
public BaseParameters getParameters() { |
|
225 |
return importParameters; |
|
226 |
} |
|
227 |
|
|
228 |
public File getSourceDirectory() { |
|
229 |
return sourceDirectory; |
|
230 |
} |
|
231 |
|
|
232 |
public File getBinaryDirectory() { |
|
233 |
return binaryDirectory; |
|
234 |
} |
|
235 |
|
|
236 |
public void process() throws InterruptedException { |
|
237 |
start(); |
|
238 |
if (isSuccessful) |
|
239 |
end(); |
|
240 |
} |
|
241 |
|
|
242 |
public static void main(String[] args) { |
|
243 |
File importParametersFile = new File("/home/mdecorde/xml/brown/import.xml"); |
|
244 |
|
|
245 |
ImportModule module = new ImportModule(importParametersFile); |
|
246 |
System.out.println("Parameters: "+module.getParameters()); |
|
247 |
try { |
|
248 |
module.start(); |
|
249 |
|
|
250 |
if (module.isSuccessful) { |
|
251 |
System.out.println("Import sucessful. reloading corpora..."); |
|
252 |
} else { |
|
253 |
System.out.println("Import failed, reason = "+module.getReason()); |
|
254 |
} |
|
255 |
} catch (Exception e) { |
|
256 |
e.printStackTrace(); |
|
257 |
} |
|
258 |
} |
|
259 |
} |
|
0 | 260 |
tmp/org.txm.core/src/java/org/txm/importer/xtz/Importer.java (revision 945) | ||
---|---|---|
1 |
package org.txm.importer.xtz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
|
|
5 |
/** |
|
6 |
* Takes any form of source files |
|
7 |
* |
|
8 |
* After this step, the XML-TXM files are created. |
|
9 |
* |
|
10 |
* they are validated before continuing |
|
11 |
* @author mdecorde |
|
12 |
* |
|
13 |
*/ |
|
14 |
public abstract class Importer extends ImportStep { |
|
15 |
|
|
16 |
public Importer(ImportModule module) { |
|
17 |
super(module); |
|
18 |
inputDirectory = module.getSourceDirectory(); |
|
19 |
outputDirectory = new File(module.getBinaryDirectory(), "txm/"+module.getCorpusName()); |
|
20 |
outputDirectory.mkdirs(); |
|
21 |
} |
|
22 |
|
|
23 |
public abstract void checkFiles(); |
|
24 |
} |
|
0 | 25 |
tmp/org.txm.core/META-INF/MANIFEST.MF (revision 945) | ||
---|---|---|
390 | 390 |
org.txm.importer.filters, |
391 | 391 |
org.txm.importer.scripting, |
392 | 392 |
org.txm.importer.xmltxm, |
393 |
org.txm.importer.xtz, |
|
393 | 394 |
org.txm.js, |
394 | 395 |
org.txm.js.viewer, |
395 | 396 |
org.txm.metadatas, |
tmp/org.txm.searchengine.cqp.core/src/org/txm/searchengine/cqp/corpus/Corpus.java (revision 945) | ||
---|---|---|
1107 | 1107 |
this.getQualifiedCqpId(), queryResultId, |
1108 | 1108 |
query.getQueryString()); |
1109 | 1109 |
queryResult = new QueryResult(queryResultId, queryResultName, this, query); |
1110 |
|
|
1110 | 1111 |
if (save) super.addQueryLog(query.toString(), new ArrayList<String>()); |
1111 | 1112 |
} catch (Exception e) { |
1112 | 1113 |
org.txm.utils.logger.Log.printStackTrace(e); |
Formats disponibles : Unified diff