52 |
52 |
import java.util.HashMap;
|
53 |
53 |
import org.txm.scripts.importer.*;
|
54 |
54 |
import org.txm.utils.*;
|
|
55 |
import org.txm.utils.io.*
|
55 |
56 |
import org.txm.objects.Project;
|
56 |
57 |
import org.txm.metadatas.*;
|
57 |
58 |
|
... | ... | |
75 |
76 |
/** The TXM directory; the XML-TXM file paths are resolved against it. */
|
76 |
77 |
File txmDir;
|
77 |
78 |
File binDir;
|
|
79 |
|
|
80 |
public boolean doTokenizeStep = true;
|
78 |
81 |
|
79 |
82 |
/** The trs files, stored as a list of strings. */
|
80 |
83 |
ArrayList<String> trsfiles;
|
... | ... | |
100 |
103 |
this.metadatas = metadatas;
|
101 |
104 |
this.lang = lang;
|
102 |
105 |
this.project = project;
|
|
106 |
this.doTokenizeStep = project.getDoTokenizerStep()
|
103 |
107 |
}
|
104 |
108 |
|
105 |
109 |
/**
|
... | ... | |
133 |
137 |
cpb.done()
|
134 |
138 |
|
135 |
139 |
if (metadatas != null) {
|
|
140 |
|
136 |
141 |
if (metadatas.getHeadersList().size() > 0) {
|
137 |
142 |
|
138 |
143 |
println "Injecting metadata "+metadatas.getHeadersList()+" in "+trsfiles.size()+" files"
|
... | ... | |
165 |
170 |
println "Tokenizing entities "+files.length+" files"
|
166 |
171 |
cpb = new ConsoleProgressBar(files.length)
|
167 |
172 |
for (File pfile : files) {
|
|
173 |
|
168 |
174 |
cpb.tick()
|
169 |
175 |
TokenizeEntities tokenizer = new TokenizeEntities(pfile.toURI().toURL());
|
170 |
176 |
File outfile = File.createTempFile("tok", ".xml", pfile.getParentFile());
|
... | ... | |
176 |
182 |
cpb.done()
|
177 |
183 |
|
178 |
184 |
//TOKENIZE
|
179 |
|
println "Tokenizing "+files.length+" files from $txmDir"
|
180 |
185 |
File tokenizedDir = new File(binDir, "tokenized")
|
181 |
186 |
tokenizedDir.deleteDir() // delete the files output by a previous run
|
182 |
187 |
tokenizedDir.mkdir()
|
183 |
|
cpb = new ConsoleProgressBar(files.length)
|
184 |
|
for (File pfile : files) {
|
185 |
|
cpb.tick()
|
186 |
|
String filename = pfile.getName().substring(0, pfile.getName().length()-4)
|
187 |
|
File tfile = new File(tokenizedDir, pfile.getName())
|
188 |
|
try {
|
189 |
|
TranscriberTokenizer tokenizer = new TranscriberTokenizer(pfile, tfile, lang)
|
190 |
|
tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false"))
|
191 |
|
tokenizer.setDoBuildWordIDs("true" == project.getTokenizerParameter("doBuildWordIds", "true"))
|
192 |
|
if (!tokenizer.process()) {
|
193 |
|
println("Failed to tokenize "+pfile)
|
|
188 |
|
|
189 |
if (!doTokenizeStep) {
|
|
190 |
|
|
191 |
println "No tokenization do to."
|
|
192 |
cpb = new ConsoleProgressBar(files.length)
|
|
193 |
for (File f : files) {
|
|
194 |
File outfile = new File(tokenizedDir, f.getName());
|
|
195 |
FileCopy.copy(f, outfile);
|
|
196 |
cpb.tick()
|
|
197 |
}
|
|
198 |
cpb.done()
|
|
199 |
} else {
|
|
200 |
|
|
201 |
println "Tokenizing "+files.length+" files from $txmDir"
|
|
202 |
cpb = new ConsoleProgressBar(files.length)
|
|
203 |
for (File pfile : files) {
|
|
204 |
cpb.tick()
|
|
205 |
String filename = pfile.getName().substring(0, pfile.getName().length()-4)
|
|
206 |
File tfile = new File(tokenizedDir, pfile.getName())
|
|
207 |
try {
|
|
208 |
TranscriberTokenizer tokenizer = new TranscriberTokenizer(pfile, tfile, lang)
|
|
209 |
tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false"))
|
|
210 |
tokenizer.setDoBuildWordIDs("true" == project.getTokenizerParameter("doBuildWordIds", "true"))
|
|
211 |
if (!tokenizer.process()) {
|
|
212 |
println("Failed to tokenize "+pfile)
|
|
213 |
}
|
|
214 |
} catch (Exception e) {
|
|
215 |
println "Error tokenizer: "+pfile
|
|
216 |
org.txm.utils.logger.Log.printStackTrace(e);
|
|
217 |
return false;
|
194 |
218 |
}
|
195 |
|
} catch (Exception e) {
|
196 |
|
println "Error tokenizer: "+pfile
|
197 |
|
org.txm.utils.logger.Log.printStackTrace(e);
|
198 |
|
return false;
|
199 |
219 |
}
|
|
220 |
cpb.done()
|
200 |
221 |
}
|
201 |
|
cpb.done()
|
202 |
|
|
|
222 |
|
203 |
223 |
//TRANSFORM INTO XML-TEI-TXM
|
204 |
224 |
files = tokenizedDir.listFiles()
|
205 |
225 |
println("Building ${files.length} XML-TXM file"+(files.length > 1?"s":""))
|
206 |
226 |
cpb = new ConsoleProgressBar(files.length)
|
207 |
227 |
for (File tfile : files) {
|
|
228 |
|
208 |
229 |
cpb.tick()
|
209 |
230 |
String filename = tfile.getName().substring(0, tfile.getName().length()-4)
|
210 |
231 |
File xmlfile = new File(txmDir, tfile.getName())
|