Révision 3010
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/limsi/limsiLoader.groovy (revision 3010) | ||
---|---|---|
65 | 65 |
|
66 | 66 |
|
67 | 67 |
//PARAMETERS |
68 |
boolean removeInterviewer = false;//if true the transcription of speakers (en1 and enq2) defined in metadatas file will be ignored |
|
69 | 68 |
boolean includeComments = false; |
70 | 69 |
boolean ignoreTranscriberMetadata = false; |
71 | 70 |
int csvHeaderNumber = 1; |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/compiler.groovy (revision 3010) | ||
---|---|---|
73 | 73 |
/** The debug. */ |
74 | 74 |
boolean debug = false; |
75 | 75 |
|
76 |
/** The removeinterviewers. */
|
|
77 |
boolean removeinterviewers = false;
|
|
76 |
/** The indexInterviewer: index interviewer speech if true. */
|
|
77 |
boolean indexInterviewer = true;
|
|
78 | 78 |
|
79 | 79 |
/** The trans. */ |
80 | 80 |
HashMap<String, ArrayList<Pair<String, String>>> trans; |
... | ... | |
89 | 89 |
HashMap<String, ArrayList<Pair<String, String>>> topics; |
90 | 90 |
|
91 | 91 |
|
92 |
/** The interviewers. */
|
|
92 |
/** The interviewers regex */
|
|
93 | 93 |
def interviewers = null |
94 | 94 |
static HashSet<String> sectionAttrs; |
95 | 95 |
|
... | ... | |
103 | 103 |
* @param value the value |
104 | 104 |
* @return the java.lang. object |
105 | 105 |
*/ |
106 |
public removeInterviewers(boolean value) {
|
|
107 |
this.removeinterviewers = value;
|
|
106 |
public setIndexInterviewer(boolean value) {
|
|
107 |
this.indexInterviewer = value;
|
|
108 | 108 |
} |
109 | 109 |
|
110 | 110 |
File cqpFile |
... | ... | |
510 | 510 |
|
511 | 511 |
vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;"); |
512 | 512 |
|
513 |
if (removeinterviewers) {
|
|
514 |
if (!interviewers.matches(u_name)) |
|
513 |
if (!indexInterviewer) {
|
|
514 |
if (!interviewers.matches(u_name)) {
|
|
515 | 515 |
output.write(vForm+"\t"+wordid+vAna+"\n"); |
516 |
} |
|
516 | 517 |
} else { |
517 | 518 |
output.write(vForm+"\t"+wordid+vAna+"\n"); |
518 | 519 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/importer.groovy (revision 3010) | ||
---|---|---|
146 | 146 |
cpb.tick() |
147 | 147 |
if (!metadatas.injectMetadatasInXml(infile, outfile, "text")) { |
148 | 148 |
println("Failed to inject metadata in "+infile) |
149 |
outfile.delete() |
|
149 |
//outfile.delete()
|
|
150 | 150 |
} |
151 | 151 |
if (!infile.delete()) { |
152 | 152 |
println "ERROR: could not delete $infile" |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/transcriberLoader.groovy (revision 3010) | ||
---|---|---|
58 | 58 |
import org.txm.utils.xml.DomUtils; |
59 | 59 |
|
60 | 60 |
//PARAMETERS |
61 |
boolean removeInterviewer = false;//if true the transcription of speakers (en1 and enq2) defined in metadatas file will be ignored
|
|
61 |
boolean indexInterviewer = true;//if false the transcription of speakers (en1 and enq2) defined in metadatas file will be ignored
|
|
62 | 62 |
boolean includeComments = false; |
63 | 63 |
boolean ignoreTranscriberMetadata = false; |
64 | 64 |
//int csvHeaderNumber = 1; |
... | ... | |
137 | 137 |
props.load(input); |
138 | 138 |
input.close(); |
139 | 139 |
|
140 |
if (props.getProperty("removeInterviewer") != null)
|
|
141 |
removeInterviewer = Boolean.parseBoolean(props.get("removeInterviewer").toString());
|
|
140 |
if (props.getProperty("indexInterviewer") != null)
|
|
141 |
indexInterviewer = Boolean.parseBoolean(props.get("indexInterviewer").toString());
|
|
142 | 142 |
if (props.getProperty("ignoreTranscriberMetadata") != null) |
143 | 143 |
ignoreTranscriberMetadata = Boolean.parseBoolean(props.get("ignoreTranscriberMetadata").toString()); |
144 | 144 |
// if (props.getProperty("metadataList") != null) |
... | ... | |
149 | 149 |
// includeComments = props.get("includeComments").toString(); |
150 | 150 |
|
151 | 151 |
println "import properties: " |
152 |
println " removeInterviewer: "+removeInterviewer
|
|
152 |
println " indexInterviewer: "+indexInterviewer
|
|
153 | 153 |
println " ignoreTranscriberMetadata: "+ignoreTranscriberMetadata |
154 | 154 |
// println " metadataToKeep: "+metadatasToKeep |
155 | 155 |
// println " ignored csvHeaderSize: "+csvHeaderNumber |
... | ... | |
213 | 213 |
|
214 | 214 |
if (MONITOR != null) MONITOR.worked(5) |
215 | 215 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
216 |
println "-- Remove interviewer: "+removeInterviewer
|
|
217 |
if (removeInterviewer) {
|
|
218 |
if (metadatas == null) { |
|
219 |
println "Can't remove interviewer without a metadata.csv file defining who are the interviewers." |
|
216 |
println "-- Remove interviewer: "+(!indexInterviewer)
|
|
217 |
if (!indexInterviewer) {
|
|
218 |
if (metadatas == null || !metadatas.headersList.contains("interviewer-id-regex")) {
|
|
219 |
println "Can't remove interviewer without a metadata.csv file defining who are the interviewers. Add the 'interviewer-id-regex' column"
|
|
220 | 220 |
} else { |
221 | 221 |
println "Removing some speakers in "+txmDir.listFiles().length+" file(s)" |
222 | 222 |
for (File infile : txmDir.listFiles()) { |
... | ... | |
225 | 225 |
ArrayList<Pair<String, String>> metas = metadatas.get(filename) |
226 | 226 |
//println "filename=$filename metas= $metas" |
227 | 227 |
for (Pair p : metas) { |
228 |
if (p.getFirst().startsWith("out-of-text-to-edit-locutor")) {
|
|
228 |
if (p.getFirst().startsWith("interviewer-id-regex")) {
|
|
229 | 229 |
new RemoveSpeaker(infile, infile, p.getSecond()) |
230 | 230 |
} |
231 | 231 |
} |
... | ... | |
281 | 281 |
|
282 | 282 |
def comp = new compiler() |
283 | 283 |
if(debug) comp.setDebug(); |
284 |
comp.removeInterviewers(removeInterviewer);
|
|
284 |
comp.setIndexInterviewer(indexInterviewer);
|
|
285 | 285 |
comp.setIgnoreTranscriberMetadata(ignoreTranscriberMetadata); |
286 | 286 |
if (!comp.run(project, xmltxmFiles, corpusname, "default", binDir)) { |
287 | 287 |
println "Failed to compile files"; |
tmp/org.txm.core/src/java/org/txm/metadatas/Metadatas.java (revision 3010) | ||
---|---|---|
66 | 66 |
import org.txm.utils.AsciiUtils; |
67 | 67 |
import org.txm.utils.CsvReader; |
68 | 68 |
import org.txm.utils.Pair; |
69 |
import org.txm.utils.io.FileCopy; |
|
69 | 70 |
import org.txm.utils.logger.Log; |
70 | 71 |
import org.txm.utils.xml.DomUtils; |
71 | 72 |
import org.w3c.dom.Document; |
... | ... | |
108 | 109 |
/** The ns context. */ |
109 | 110 |
NamespaceContext nsContext = new PersonalNamespaceContext(); |
110 | 111 |
|
112 |
private File inputMetadataFile; |
|
113 |
|
|
111 | 114 |
/** |
112 | 115 |
* Instantiates a new metadatas. |
113 | 116 |
* |
... | ... | |
151 | 154 |
*/ |
152 | 155 |
public Metadatas(File inputFile, String encoding, String separator, String txtseparator, int nbheaderline) { |
153 | 156 |
|
157 |
this.inputMetadataFile = inputFile; |
|
154 | 158 |
File xmlfile = new File(inputFile.getParent(), inputFile.getName() + ".xml"); |
155 | 159 |
// println "create xml file version of "+csvfile+" : "+xmlfile |
156 | 160 |
try { |
... | ... | |
615 | 619 |
public boolean injectMetadatasInXml(File infile, File outfile, String tag, String namespace) throws MalformedURLException, IOException, XMLStreamException { |
616 | 620 |
|
617 | 621 |
String key = infile.getName(); |
618 |
if (key.lastIndexOf(".") > 0) |
|
622 |
if (key.lastIndexOf(".") > 0) {
|
|
619 | 623 |
key = key.substring(0, key.lastIndexOf(".")); |
620 |
|
|
624 |
} |
|
621 | 625 |
ArrayList<org.txm.metadatas.Entry> metas = get(key); |
622 | 626 |
|
623 | 627 |
if (metas == null) { |
624 |
System.out.println("\nError: can't find metadata for text of id=" + key); |
|
625 |
System.out.println("Maybe the metadata file doesn't have the right format (comma or tab separated values?)"); |
|
626 |
return false; |
|
628 |
System.out.println("\nWarning: can't find metadata for text of id=" + key); |
|
629 |
if (this.inputMetadataFile != null && inputMetadataFile.getName().toLowerCase().endsWith(".csv")) { |
|
630 |
System.out.println("Maybe the metadata file doesn't have the right format (comma or tab separated values?)"); |
|
631 |
} |
|
632 |
if (!infile.equals(outfile)) { |
|
633 |
FileCopy.copy(infile, outfile); |
|
634 |
} |
|
635 |
return true; |
|
627 | 636 |
} |
628 | 637 |
|
629 | 638 |
// ensure attribute names format |
Formats disponibles : Unified diff