Révision 3020
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/FixTranscriptionsMacro.groovy (revision 3020) | ||
---|---|---|
1 |
package org.txm.macro.projects.nov13 |
|
2 |
|
|
3 |
import java.time.LocalTime |
|
4 |
import java.time.format.DateTimeFormatter |
|
5 |
import org.txm.utils.* |
|
6 |
import org.txm.utils.logger.* |
|
7 |
|
|
8 |
// Macro: applies FixTranscription to a single TRS file or to every ".trs" file
// of a directory, and writes the fixed files (same base name, ".trs" extension)
// into the result directory.

// PARAMETERS

@Field @Option(name="trsFile", usage="A single vocapia XML file", widget="FileOpen", required=false, def="")
File trsFile

@Field @Option(name="trsDirectory", usage="A Vocapia XML files directory to process", widget="Folder", required=false, def="")
File trsDirectory

@Field @Option(name="resultDirectory", usage="The result directory", widget="Folder", required=false, def="")
File resultDirectory

@Field @Option(name="primarySpeakerIdPrefix", usage="speaker ID of the primary speaker", widget="String", required=false, def="")
String primarySpeakerIdPrefix

@Field @Option(name="otherNonPrimaryLocutor", usage="other non primary id of the other turns", widget="String", required=false, def="")
String otherNonPrimaryLocutor

@Field @Option(name="debug", usage="Show debug messages", widget="Boolean", required=false, def="false")
Boolean debug

if (!ParametersDialog.open(this)) return;

// resultDirectory is not a required parameter: fail with a message instead of
// crashing on resultDirectory.mkdirs() below.
if (resultDirectory == null) {
	println "No result directory set: resultDirectory=$resultDirectory"
	return false
}

// Never write the result files over the source files.
if (resultDirectory.equals(trsDirectory) || resultDirectory.equals(trsFile?.getParentFile())) {
	println "Result directory must differ from the source directory: "+resultDirectory
	return false
}

resultDirectory.mkdirs();

// Collect the files to process: the directory wins over the single file.
def trsFiles = []
if (trsDirectory != null && trsDirectory.exists()) {

	println "Processing TRS directory: $trsDirectory"
	for (File file : trsDirectory.listFiles()) {
		if (file.getName().toLowerCase().endsWith(".trs")) {
			trsFiles << file
		}
	}
} else if (trsFile != null && trsFile.exists()) {
	println "Processing TRS file: $trsFile"
	trsFiles << trsFile
}

if (trsFiles.size() == 0) {
	println "No XML file found for parameters trsFile=$trsFile and trsDirectory=$trsDirectory"
	return false
}

ConsoleProgressBar cpb = new ConsoleProgressBar(trsFiles.size())
for (File file : trsFiles) {
	cpb.tick()

	FixTranscription fixer = new FixTranscription(file, primarySpeakerIdPrefix, otherNonPrimaryLocutor, debug)
	String name = FileUtils.stripExtension(file)
	File outFile = new File(resultDirectory, name+".trs")

	// The macro stops at the first file that cannot be processed.
	if (!fixer.process(outFile)) {
		println "WARNING: ERROR WHILE PROCESSING: "+file
		return false
	}
}
cpb.done()

println "Done: "+trsFiles.size()+" files processed. Result files in $resultDirectory"
|
69 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/FixTranscription.groovy (revision 3020) | ||
---|---|---|
1 |
package org.txm.macro.projects.nov13 |
|
2 |
|
|
3 |
import javax.xml.stream.* |
|
4 |
|
|
5 |
import org.txm.importer.PersonalNamespaceContext |
|
6 |
import org.txm.utils.FileUtils |
|
7 |
import org.txm.xml.IdentityHook |
|
8 |
import org.txm.xml.* |
|
9 |
|
|
10 |
import java.io.BufferedOutputStream |
|
11 |
import java.io.FileOutputStream |
|
12 |
import java.io.IOException |
|
13 |
import java.net.URL |
|
14 |
import java.util.* |
|
15 |
import java.util.Map.Entry |
|
16 |
import java.util.regex.Pattern |
|
17 |
|
|
18 |
/**
 * XML processor that rewrites the Turn structure of a transcription file.
 *
 * The processor hooks the "w", "Turn" and "Sync" elements:
 * - a word starting with "*" closes the current Turn and opens a new Turn that
 *   copies the attributes of the current one, adds an "orig-speaker" attribute
 *   and swaps the speaker;
 * - a word ending with "*" (while such a Turn is open) closes it and re-opens
 *   the original Turn, using the "end" attribute of the word as "startTime";
 * - the "*" markers are removed from the written words;
 * - a word whose content is "XXX" is replaced with an "event" element.
 */
class FixTranscription extends XMLProcessor {

	LocalNamesHookActivator activator;
	IdentityHook hook;

	/** Regex matching the primary speaker id, or null when no primary speaker was detected. */
	def primarySpeakerIdRegex
	/** Primary speaker id extracted from the file name (stays null when not detected). */
	String primarySpeakerId

	Boolean debug

	// NOTE(review): this value is stored by the constructor but never read in this
	// class; the hook below writes the literal "other" -- confirm this is intended.
	String otherNonPrimaryLocutor = "other"

	/**
	 * @param xmlfile the transcription file to process
	 * @param primarySpeakerIdPrefix regex prefix of the primary speaker id; when not empty it is
	 *        searched in the file name (extension stripped by FileUtils.stripExtension) and must
	 *        be found exactly once, otherwise the primary speaker detection is disabled
	 * @param otherNonPrimaryLocutor stored in the field of the same name
	 * @param debug when true, a warning is printed if the prefix is not found exactly once
	 */
	public FixTranscription(File xmlfile, String primarySpeakerIdPrefix, String otherNonPrimaryLocutor, Boolean debug) {
		super(xmlfile)
		this.debug = debug

		this.otherNonPrimaryLocutor = otherNonPrimaryLocutor
		if (primarySpeakerIdPrefix != null && primarySpeakerIdPrefix.length() > 0) {
			String id = FileUtils.stripExtension(xmlfile)

			this.primarySpeakerIdRegex = /$primarySpeakerIdPrefix.*/

			def rez = (id =~ primarySpeakerIdRegex).findAll()
			def rez2 = (id =~ /$primarySpeakerIdPrefix/).findAll()
			if (rez2.size() != 1) {
				if (debug) println "WARNING: found the ${rez2.size()} matches of primary speaker prefix in the '$id' file name"
				this.primarySpeakerIdRegex = null
			} else {
				// the id is the part of the file name starting at the prefix
				primarySpeakerId = rez[0]
				//if (debug) println "Detected primary speaker: $primarySpeakerId"
			}
		}

		// NOTE(review): 'hook' is still null when it is given to the activator; the
		// hook created below receives the activator -- confirm the wiring is done there.
		activator = new LocalNamesHookActivator<>(hook, ["w", "Turn", "Sync"]);

		hook = new IdentityHook("word_hook", activator, this) {

					boolean inTurn = false;

					/** true between the start and the end of a "w" element */
					boolean inW = false;
					/** text content of the current "w" element */
					StringBuilder wordBuffer = new StringBuilder();

					/** last time seen on a Turn ("startTime"), Sync ("time") or w ("time") element */
					String currentTime;
					/** attributes of the current source Turn */
					LinkedHashMap turnInfos = new LinkedHashMap()
					/** attributes of the current source w */
					LinkedHashMap wInfos = new LinkedHashMap()
					/** true while a Turn opened by a "*" word is being written */
					boolean other

					@Override
					public boolean deactivate() {
						return true;
					}

					@Override
					public boolean _activate() {
						return true;
					}

					@Override
					protected void processStartElement() throws XMLStreamException, IOException {
						if (localname.equals("Turn")) {
							// store values
							inTurn = true;
							turnInfos.clear()
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								turnInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
							}
							currentTime = turnInfos["startTime"]
							super.processStartElement();
						} else if (localname.equals("Sync")) {
							currentTime = parser.getAttributeValue(null, "time")
							super.processStartElement();
						} else if (localname.equals("w")) {
							// store values
							inW = true;
							wInfos.clear()
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								wInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
							}
							String time = parser.getAttributeValue(null, "time")
							if (time != null && time.length() > 0) {
								currentTime = time
							}
							wordBuffer.setLength(0);
							return; // write w later
						}
						else {
							super.processStartElement();
						}
					}

					@Override
					protected void processCharacters() throws XMLStreamException {
						if (inW) {
							// the word content is buffered and written when the w element ends
							wordBuffer.append(parser.getText())
						}
						else {
							super.processCharacters();
						}
					}

					@Override
					protected void processEndElement() throws XMLStreamException {
						if (localname.equals("w")) {

							inW = false
							String word = wordBuffer.toString().trim()
							if (!other && word.startsWith("*")) {
								//close current Turn and start a 'other' Turn
								writer.writeEndElement() // current Turn
								writer.writeCharacters("\n")

								def tmpInfos = new LinkedHashMap()
								for (String attr : turnInfos.keySet()) tmpInfos[attr] = turnInfos[attr]
								tmpInfos["orig-speaker"] = turnInfos["speaker"]

								// swap the speaker: the primary speaker gives the turn to "other",
								// any other speaker gives the turn to the primary speaker
								if (primarySpeakerIdRegex == null || turnInfos["speaker"] ==~ primarySpeakerIdRegex) {
									tmpInfos["speaker"] = "other"
								} else {
									tmpInfos["speaker"] = primarySpeakerId
								}
								tmpInfos["startTime"] = currentTime
								writer.writeStartElement("Turn")
								for (String attr : tmpInfos.keySet()) {
									writer.writeAttribute(attr, tmpInfos[attr])
								}

								other = true
								word = word.substring(1)
							}

							boolean shouldCloseOtherTurn = false;
							if (other && word.endsWith("*")) {
								shouldCloseOtherTurn = true;

								word = word.substring(0, word.length()-1)
								other = false
							}

							if ("XXX".equals(word)) { // <Event desc="XXX" type="unknown" extent="next"/>
								writer.writeStartElement("event") // replaces the initial word
								writer.writeAttribute("desc", "XXX from "+wInfos["start"] + " to "+wInfos["end"])
								writer.writeAttribute("type", "unknown")
								writer.writeAttribute("extent", "instantaneous")
								writer.writeEndElement() // event
								word = "" // don't write the word
							}

							if (word.length() > 0) {

								writer.writeStartElement("w") // start the initial word
								for (String attr : wInfos.keySet() ) {
									writer.writeAttribute(attr, wInfos[attr])
								}
								writer.writeCharacters(word)
								writer.writeEndElement() // w
							}

							if (shouldCloseOtherTurn) {
								shouldCloseOtherTurn = false;
								//close the current 'other' Turn and restart the actual Turn
								writer.writeEndElement() // current 'other' Turn

								writer.writeStartElement("Turn") // rebuild the orig Turn and fix its start-end infos
								turnInfos["startTime"] = wInfos["end"] // fix the startTime using the current word end time
								for (String attr : turnInfos.keySet()) {
									writer.writeAttribute(attr, turnInfos[attr])
								}

								other = false
							}
						} else {
							super.processEndElement();
						}
					}
				}
	}

	/** Manual test entry point: processes a hard-coded local file and prints the result of process(). */
	public static void main(String[] args) {
		File infile = new File("/home/mdecorde/xml/vocapia","test.trs")
		File outfile = new File("/home/mdecorde/xml/vocapia","test-fixed.trs")
		// the only constructor takes 4 arguments: no primary speaker prefix, "other" locutor, debug on
		def processor = new FixTranscription(infile, null, "other", true)
		println processor.process(outfile)
	}
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/CreateTheOtherTurns.groovy (revision 3020) | ||
---|---|---|
1 |
package org.txm.macro.projects.nov13 |
|
2 |
|
|
3 |
import javax.xml.stream.* |
|
4 |
|
|
5 |
import org.txm.importer.PersonalNamespaceContext |
|
6 |
import org.txm.utils.FileUtils |
|
7 |
import org.txm.xml.IdentityHook |
|
8 |
import org.txm.xml.* |
|
9 |
|
|
10 |
import java.io.BufferedOutputStream |
|
11 |
import java.io.FileOutputStream |
|
12 |
import java.io.IOException |
|
13 |
import java.net.URL |
|
14 |
import java.util.* |
|
15 |
import java.util.Map.Entry |
|
16 |
import java.util.regex.Pattern |
|
17 |
|
|
18 |
/**
 * XML processor that creates the 'other' Turns of a transcription file.
 *
 * The class is named after its source file (CreateTheOtherTurns.groovy): this is
 * the name instantiated by PrepareTranscriptionsMacro, and the FixTranscription
 * name is already declared by FixTranscription.groovy in the same package.
 *
 * The processor hooks the "w", "Turn" and "Sync" elements:
 * - a word starting with "*" closes the current Turn and opens a new Turn that
 *   copies the attributes of the current one, adds an "orig-speaker" attribute
 *   and swaps the speaker;
 * - a word ending with "*" (while such a Turn is open) closes it and re-opens
 *   the original Turn, using the "end" attribute of the word as "startTime";
 * - the "*" markers are removed from the written words;
 * - a word whose content is "XXX" is replaced with an "event" element.
 */
class CreateTheOtherTurns extends XMLProcessor {

	LocalNamesHookActivator activator;
	IdentityHook hook;

	/** Regex matching the primary speaker id, or null when no primary speaker was detected. */
	def primarySpeakerIdRegex
	/** Primary speaker id extracted from the file name (stays null when not detected). */
	String primarySpeakerId

	Boolean debug

	// NOTE(review): this value is stored by the constructor but never read in this
	// class; the hook below writes the literal "other" -- confirm this is intended.
	String otherNonPrimaryLocutor = "other"

	/**
	 * @param xmlfile the transcription file to process
	 * @param primarySpeakerIdPrefix regex prefix of the primary speaker id; when not empty it is
	 *        searched in the file name (extension stripped by FileUtils.stripExtension) and must
	 *        be found exactly once, otherwise the primary speaker detection is disabled
	 * @param otherNonPrimaryLocutor stored in the field of the same name
	 * @param debug when true, a warning is printed if the prefix is not found exactly once
	 */
	public CreateTheOtherTurns(File xmlfile, String primarySpeakerIdPrefix, String otherNonPrimaryLocutor, Boolean debug) {
		super(xmlfile)
		this.debug = debug

		this.otherNonPrimaryLocutor = otherNonPrimaryLocutor
		if (primarySpeakerIdPrefix != null && primarySpeakerIdPrefix.length() > 0) {
			String id = FileUtils.stripExtension(xmlfile)

			this.primarySpeakerIdRegex = /$primarySpeakerIdPrefix.*/

			def rez = (id =~ primarySpeakerIdRegex).findAll()
			def rez2 = (id =~ /$primarySpeakerIdPrefix/).findAll()
			if (rez2.size() != 1) {
				if (debug) println "WARNING: found the ${rez2.size()} matches of primary speaker prefix in the '$id' file name"
				this.primarySpeakerIdRegex = null
			} else {
				// the id is the part of the file name starting at the prefix
				primarySpeakerId = rez[0]
				//if (debug) println "Detected primary speaker: $primarySpeakerId"
			}
		}

		// NOTE(review): 'hook' is still null when it is given to the activator; the
		// hook created below receives the activator -- confirm the wiring is done there.
		activator = new LocalNamesHookActivator<>(hook, ["w", "Turn", "Sync"]);

		hook = new IdentityHook("word_hook", activator, this) {

					boolean inTurn = false;

					/** true between the start and the end of a "w" element */
					boolean inW = false;
					/** text content of the current "w" element */
					StringBuilder wordBuffer = new StringBuilder();

					/** last time seen on a Turn ("startTime"), Sync ("time") or w ("time") element */
					String currentTime;
					/** attributes of the current source Turn */
					LinkedHashMap turnInfos = new LinkedHashMap()
					/** attributes of the current source w */
					LinkedHashMap wInfos = new LinkedHashMap()
					/** true while a Turn opened by a "*" word is being written */
					boolean other

					@Override
					public boolean deactivate() {
						return true;
					}

					@Override
					public boolean _activate() {
						return true;
					}

					@Override
					protected void processStartElement() throws XMLStreamException, IOException {
						if (localname.equals("Turn")) {
							// store values
							inTurn = true;
							turnInfos.clear()
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								turnInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
							}
							currentTime = turnInfos["startTime"]
							super.processStartElement();
						} else if (localname.equals("Sync")) {
							currentTime = parser.getAttributeValue(null, "time")
							super.processStartElement();
						} else if (localname.equals("w")) {
							// store values
							inW = true;
							wInfos.clear()
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								wInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
							}
							String time = parser.getAttributeValue(null, "time")
							if (time != null && time.length() > 0) {
								currentTime = time
							}
							wordBuffer.setLength(0);
							return; // write w later
						}
						else {
							super.processStartElement();
						}
					}

					@Override
					protected void processCharacters() throws XMLStreamException {
						if (inW) {
							// the word content is buffered and written when the w element ends
							wordBuffer.append(parser.getText())
						}
						else {
							super.processCharacters();
						}
					}

					@Override
					protected void processEndElement() throws XMLStreamException {
						if (localname.equals("w")) {

							inW = false
							String word = wordBuffer.toString().trim()
							if (!other && word.startsWith("*")) {
								//close current Turn and start a 'other' Turn
								writer.writeEndElement() // current Turn
								writer.writeCharacters("\n")

								def tmpInfos = new LinkedHashMap()
								for (String attr : turnInfos.keySet()) tmpInfos[attr] = turnInfos[attr]
								tmpInfos["orig-speaker"] = turnInfos["speaker"]

								// swap the speaker: the primary speaker gives the turn to "other",
								// any other speaker gives the turn to the primary speaker
								if (primarySpeakerIdRegex == null || turnInfos["speaker"] ==~ primarySpeakerIdRegex) {
									tmpInfos["speaker"] = "other"
								} else {
									tmpInfos["speaker"] = primarySpeakerId
								}
								tmpInfos["startTime"] = currentTime
								writer.writeStartElement("Turn")
								for (String attr : tmpInfos.keySet()) {
									writer.writeAttribute(attr, tmpInfos[attr])
								}

								other = true
								word = word.substring(1)
							}

							boolean shouldCloseOtherTurn = false;
							if (other && word.endsWith("*")) {
								shouldCloseOtherTurn = true;

								word = word.substring(0, word.length()-1)
								other = false
							}

							if ("XXX".equals(word)) { // <Event desc="XXX" type="unknown" extent="next"/>
								writer.writeStartElement("event") // replaces the initial word
								writer.writeAttribute("desc", "XXX from "+wInfos["start"] + " to "+wInfos["end"])
								writer.writeAttribute("type", "unknown")
								writer.writeAttribute("extent", "instantaneous")
								writer.writeEndElement() // event
								word = "" // don't write the word
							}

							if (word.length() > 0) {

								writer.writeStartElement("w") // start the initial word
								for (String attr : wInfos.keySet() ) {
									writer.writeAttribute(attr, wInfos[attr])
								}
								writer.writeCharacters(word)
								writer.writeEndElement() // w
							}

							if (shouldCloseOtherTurn) {
								shouldCloseOtherTurn = false;
								//close the current 'other' Turn and restart the actual Turn
								writer.writeEndElement() // current 'other' Turn

								writer.writeStartElement("Turn") // rebuild the orig Turn and fix its start-end infos
								turnInfos["startTime"] = wInfos["end"] // fix the startTime using the current word end time
								for (String attr : turnInfos.keySet()) {
									writer.writeAttribute(attr, turnInfos[attr])
								}

								other = false
							}
						} else {
							super.processEndElement();
						}
					}
				}
	}

	/** Manual test entry point: processes a hard-coded local file and prints the result of process(). */
	public static void main(String[] args) {
		File infile = new File("/home/mdecorde/xml/vocapia","test.trs")
		File outfile = new File("/home/mdecorde/xml/vocapia","test-fixed.trs")
		// the only constructor takes 4 arguments: no primary speaker prefix, "other" locutor, debug on
		def processor = new CreateTheOtherTurns(infile, null, "other", true)
		println processor.process(outfile)
	}
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/PrepareTranscriptionsMacro.groovy (revision 3020) | ||
---|---|---|
1 |
package org.txm.macro.projects.nov13 |
|
2 |
|
|
3 |
import java.time.LocalTime |
|
4 |
import java.time.format.DateTimeFormatter |
|
5 |
import org.txm.utils.* |
|
6 |
import org.txm.utils.logger.* |
|
7 |
import org.txm.macro.transcriber.RenameSpeaker |
|
8 |
|
|
9 |
// Macro: prepares every ".trs" file of a directory in two passes:
// 1. CreateTheOtherTurns writes a copy of each file into the result directory,
// 2. RenameSpeaker is then applied in place to each file written by pass 1.

// PARAMETERS

@Field @Option(name="trsDirectory", usage="A Vocapia XML files directory to process", widget="Folder", required=false, def="")
File trsDirectory

@Field @Option(name="resultDirectory", usage="The result directory", widget="Folder", required=false, def="")
File resultDirectory

@Field @Option(name="primarySpeakerIdPrefix", usage="speaker ID of the primary speaker", widget="String", required=false, def="")
String primarySpeakerIdPrefix
@Field @Option(name="otherNonPrimaryLocutor", usage="other non primary id of the other turns", widget="String", required=false, def="")
String otherNonPrimaryLocutor

// NOTE(review): this usage text is the same as the otherNonPrimaryLocutor one -- looks copy-pasted, confirm the wording.
@Field @Option(name="nonPrimaryLocutorReplaceRegex", usage="other non primary id of the other turns", widget="String", required=false, def="")
String nonPrimaryLocutorReplaceRegex

@Field @Option(name="debug", usage="Show debug messages", widget="Boolean", required=false, def="false")
Boolean debug

if (!ParametersDialog.open(this)) return;

// Both directories are optional parameters: stop with a message instead of
// failing on a null object further down.
if (trsDirectory == null || !trsDirectory.isDirectory()) {
	println "No TRS directory found for parameter trsDirectory=$trsDirectory"
	return false
}

if (resultDirectory == null) {
	println "No result directory set: resultDirectory=$resultDirectory"
	return false
}

// Never write the result files over the source files.
if (resultDirectory.equals(trsDirectory)) {
	println "Result directory must differs from trsDirectory: "+trsDirectory
	return false;
}

resultDirectory.mkdirs();

def trsFiles = []

println "Processing TRS directory: $trsDirectory"
for (File file : trsDirectory.listFiles()) {
	if (file.getName().toLowerCase().endsWith(".trs")) {
		trsFiles << file
	}
}

if (trsFiles.size() == 0) {
	println "No XML file found for parameters trsDirectory=$trsDirectory"
	return false
}

println "CREATE THE 'OTHER' TURNS..."

// Files written by the first pass; the second pass only touches these, so
// unrelated files already present in the result directory are left alone and
// the second progress bar matches the number of files actually processed.
def outFiles = []

ConsoleProgressBar cpb = new ConsoleProgressBar(trsFiles.size())
for (File file : trsFiles) {
	cpb.tick()

	CreateTheOtherTurns fixer = new CreateTheOtherTurns(file, primarySpeakerIdPrefix, otherNonPrimaryLocutor, debug)
	String name = FileUtils.stripExtension(file)
	File outFile = new File(resultDirectory, name+".trs")

	if (!fixer.process(outFile)) {
		println "WARNING: ERROR WHILE PROCESSING: "+file
	} else {
		outFiles << outFile
	}
}
cpb.done()

// FIX LOCUTORS
println "NORMALIZING LOCUTORS..."
cpb = new ConsoleProgressBar(outFiles.size())
for (File file : outFiles) {
	cpb.tick()

	// the file is rewritten in place (same input and output file)
	RenameSpeaker fixer = new RenameSpeaker(file, file, nonPrimaryLocutorReplaceRegex, null, otherNonPrimaryLocutor, otherNonPrimaryLocutor)

	if (!fixer.process()) {
		println "WARNING: ERROR WHILE PROCESSING: "+file
	}
}
cpb.done()

println "Done: "+trsFiles.size()+" files processed. Result files in $resultDirectory"
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xml/pager.groovy (revision 3020) | ||
---|---|---|
39 | 39 |
import org.xml.sax.InputSource; |
40 | 40 |
import org.xml.sax.helpers.DefaultHandler; |
41 | 41 |
import org.txm.utils.io.FileCopy |
42 |
import org.txm.objects.Project |
|
42 | 43 |
|
43 | 44 |
/** Build a simple edition from a xml-tei. |
44 | 45 |
* @author mdecorde */ |
... | ... | |
68 | 69 |
|
69 | 70 |
/** The first word. */ |
70 | 71 |
boolean firstWord = true; |
72 |
|
|
73 |
boolean paginate = true; |
|
71 | 74 |
|
72 | 75 |
/** The wordvalue. */ |
73 | 76 |
String wordvalue = ""; |
... | ... | |
126 | 129 |
* @param basename the basename |
127 | 130 |
*/ |
128 | 131 |
pager(File infile, File outdir, String txtname, List<String> NoSpaceBefore, |
129 |
List<String> NoSpaceAfter, int max, String basename, String paginationElement) { |
|
130 |
this.paginationElement = paginationElement; |
|
132 |
List<String> NoSpaceAfter, String basename, Project project) { |
|
133 |
this.paginationElement = project.getEditionDefinition("default").getPageElement() |
|
134 |
this.paginate = project.getEditionDefinition("default").getPaginateEdition() |
|
135 |
this.wordmax = project.getEditionDefinition("default").getWordsPerPage(); |
|
136 |
|
|
131 | 137 |
this.basename = basename; |
132 | 138 |
this.txtname = txtname; |
133 | 139 |
this.outdir = outdir; |
134 |
this.wordmax = max; |
|
140 |
|
|
135 | 141 |
this.NoSpaceBefore = NoSpaceBefore; |
136 | 142 |
this.NoSpaceAfter = NoSpaceAfter; |
137 | 143 |
this.url = infile.toURI().toURL(); |
... | ... | |
288 | 294 |
case XMLStreamConstants.START_ELEMENT: |
289 | 295 |
localname = parser.getLocalName(); |
290 | 296 |
|
291 |
if (localname == paginationElement) { |
|
297 |
if (paginate && localname == paginationElement) {
|
|
292 | 298 |
createNextOutput(); |
293 | 299 |
pagedWriter.write("\n"); |
294 | 300 |
if (parser.getAttributeValue(null,"n") != null) { |
... | ... | |
355 | 361 |
wordid = parser.getAttributeValue(null,"id"); |
356 | 362 |
anaValues.clear() |
357 | 363 |
wordcount++; |
358 |
if (wordcount >= wordmax) { |
|
364 |
if (paginate && wordcount >= wordmax) {
|
|
359 | 365 |
createNextOutput(); |
360 | 366 |
} |
361 | 367 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/factiva/factivaLoader.groovy (revision 3020) | ||
---|---|---|
271 | 271 |
t.setSourceFile(txmFile) |
272 | 272 |
t.setTXMFile(txmFile) |
273 | 273 |
|
274 |
def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element);
|
|
274 |
def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, basename, project);
|
|
275 | 275 |
Edition edition = new Edition(t); |
276 | 276 |
edition.setName("default"); |
277 | 277 |
edition.setIndex(outdir.getAbsolutePath()); |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/transcriberLoader.groovy (revision 3020) | ||
---|---|---|
324 | 324 |
edition.delete(); |
325 | 325 |
edition = null; |
326 | 326 |
} |
327 |
def ed = new pager(txmFile, htmlDir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element, metadatas);
|
|
327 |
def ed = new pager(txmFile, htmlDir, txtname, NoSpaceBefore, NoSpaceAfter, basename, page_element, metadatas, project);
|
|
328 | 328 |
edition = t.getEdition("default") |
329 | 329 |
edition = new Edition(t); |
330 | 330 |
edition.setName("default"); |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/pager.groovy (revision 3020) | ||
---|---|---|
28 | 28 |
package org.txm.scripts.importer.transcriber |
29 | 29 |
|
30 | 30 |
import java.io.File; |
31 |
|
|
31 | 32 |
import java.util.ArrayList; |
32 | 33 |
|
33 | 34 |
import javax.xml.stream.* |
... | ... | |
37 | 38 |
import org.txm.metadatas.Metadatas |
38 | 39 |
import org.txm.utils.TimeFormatter |
39 | 40 |
import org.txm.utils.io.FileCopy; |
41 |
import org.txm.objects.Project |
|
40 | 42 |
|
41 |
|
|
42 | 43 |
// TODO: Auto-generated Javadoc |
43 | 44 |
/** Build Discours corpus simple edition from a xml-tei. |
44 | 45 |
* |
... | ... | |
74 | 75 |
/** The first word. */ |
75 | 76 |
boolean firstWord = true |
76 | 77 |
|
78 |
boolean paginate = true |
|
79 |
|
|
80 |
|
|
77 | 81 |
/** The wordvalue. */ |
78 | 82 |
String wordvalue; |
79 | 83 |
|
... | ... | |
156 | 160 |
* @param metadatas the metadatas |
157 | 161 |
*/ |
158 | 162 |
pager(File txmfile, File htmlDir, String txtname, List<String> NoSpaceBefore, |
159 |
List<String> NoSpaceAfter, int max, String corpusname, String cuttingTag, Metadatas metadatas) {
|
|
163 |
List<String> NoSpaceAfter, String corpusname, String cuttingTag, Metadatas metadatas, Project project) {
|
|
160 | 164 |
this.metadatas = metadatas |
161 |
this.wordmax = max;
|
|
165 |
this.wordmax = project.getEditionDefinition("default").getWordsPerPage();
|
|
162 | 166 |
this.cuttingTag = cuttingTag; |
163 | 167 |
this.corpusname = corpusname; |
164 | 168 |
this.NoSpaceBefore = NoSpaceBefore; |
... | ... | |
167 | 171 |
this.txmfile = txmfile; |
168 | 172 |
this.htmlDir = htmlDir; |
169 | 173 |
this.txtname = txtname; |
174 |
this.paginate = project.getEditionDefinition("default").getPaginateEdition() |
|
170 | 175 |
|
171 | 176 |
inputData = url.openStream(); |
172 | 177 |
factory = XMLInputFactory.newInstance(); |
... | ... | |
386 | 391 |
writer.writeAttribute("id", ""+nbBreak); |
387 | 392 |
writer.writeCharacters("\n"); |
388 | 393 |
|
389 |
pages << new File(defaultDir, "${txtname}_${nbBreak}.html") |
|
390 |
indexes << wordid |
|
394 |
if (paginate) { |
|
395 |
pages << new File(defaultDir, "${txtname}_${nbBreak}.html") |
|
396 |
indexes << wordid |
|
397 |
} |
|
391 | 398 |
|
392 | 399 |
wordcount = 0; |
393 | 400 |
shouldBreak = false; |
... | ... | |
612 | 619 |
} |
613 | 620 |
|
614 | 621 |
wordcount++; |
615 |
if (wordcount >= wordmax) { |
|
622 |
if (paginate && wordcount >= wordmax) {
|
|
616 | 623 |
shouldBreak = true; |
617 | 624 |
} |
618 | 625 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/txt/txtLoader.groovy (revision 3020) | ||
---|---|---|
188 | 188 |
t.setSourceFile(srcfile) |
189 | 189 |
t.setTXMFile(srcfile) |
190 | 190 |
|
191 |
def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, null);
|
|
191 |
def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, basename, project);
|
|
192 | 192 |
Edition edition = new Edition(t); |
193 | 193 |
edition.setName("default"); |
194 | 194 |
edition.setIndex(outdir.getAbsolutePath()); |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/hyperbase/hyperbaseLoader.groovy (revision 3020) | ||
---|---|---|
142 | 142 |
t.setSourceFile(txmFile) |
143 | 143 |
t.setTXMFile(txmFile) |
144 | 144 |
|
145 |
def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element);
|
|
145 |
def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, basename, project);
|
|
146 | 146 |
Edition edition = new Edition(t); |
147 | 147 |
edition.setName("default"); |
148 | 148 |
edition.setIndex(outdir.getAbsolutePath()); |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZPager.groovy (revision 3020) | ||
---|---|---|
28 | 28 |
String page_element; |
29 | 29 |
String wordTag; |
30 | 30 |
int wordsPerPage; |
31 |
boolean paginate; |
|
31 | 32 |
|
32 | 33 |
File cssDirectory, jsDirectory, imagesDirectory; |
33 | 34 |
|
... | ... | |
39 | 40 |
lang = project.getLang(); |
40 | 41 |
wordsPerPage = project.getEditionDefinition("default").getWordsPerPage() |
41 | 42 |
page_element = project.getEditionDefinition("default").getPageElement() |
43 |
paginate = project.getEditionDefinition("default").getPaginateEdition() |
|
42 | 44 |
wordTag = project.getTokenizerWordElement() |
43 | 45 |
|
44 | 46 |
cssDirectory = new File(module.getSourceDirectory(), "css") |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 3020) | ||
---|---|---|
87 | 87 |
/** The idxstart. */ |
88 | 88 |
ArrayList<String> idxstart = new ArrayList<String>(); |
89 | 89 |
String paginationElement; |
90 |
boolean paginate; |
|
90 | 91 |
def cssList; |
91 | 92 |
def wordTag = "w"; |
92 | 93 |
def noteElements = new HashSet<String>(); |
... | ... | |
107 | 108 |
List<String> NoSpaceAfter, def cssList) { |
108 | 109 |
this.pager = pager; |
109 | 110 |
this.paginationElement = pager.page_element; |
111 |
this.paginate = pager.paginate |
|
110 | 112 |
this.cssList = cssList; |
111 | 113 |
this.basename = pager.corpusname; |
112 | 114 |
this.txtname = txtname; |
... | ... | |
317 | 319 |
currentOutOfTextElements << localname |
318 | 320 |
} |
319 | 321 |
|
320 |
if (localname == paginationElement) { |
|
322 |
if (paginate && localname == paginationElement) {
|
|
321 | 323 |
createNextOutput() |
322 | 324 |
wordcount=0; |
323 | 325 |
pagedWriter.write("\n") |
... | ... | |
453 | 455 |
wordid = getAttributeValue(parser, null,"id"); |
454 | 456 |
anaValues.clear() |
455 | 457 |
wordcount++; |
456 |
if (wordcount >= wordmax) { |
|
458 |
if (paginate && wordcount >= wordmax) {
|
|
457 | 459 |
createNextOutput(); |
458 | 460 |
} |
459 | 461 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xmltxm/xmltxmLoader.groovy (revision 3020) | ||
---|---|---|
216 | 216 |
if (second > 0 && (second++ % 5) == 0) println "" |
217 | 217 |
cpb.tick() |
218 | 218 |
|
219 |
def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element);
|
|
219 |
def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, basename, project);
|
|
220 | 220 |
Edition edition = new Edition(t); |
221 | 221 |
edition.setName("default"); |
222 | 222 |
edition.setIndex(outdir.getAbsolutePath()); |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/discours/discoursLoader.groovy (revision 3020) | ||
---|---|---|
125 | 125 |
t.setName(txtname); |
126 | 126 |
t.setSourceFile(txmFile) |
127 | 127 |
t.setTXMFile(txmFile) |
128 |
def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, "pb");
|
|
128 |
def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, basename, project);
|
|
129 | 129 |
Edition edition = new Edition(t); |
130 | 130 |
edition.setName("default"); |
131 | 131 |
edition.setIndex(outdir.getAbsolutePath()); |
tmp/org.txm.core/src/java/org/txm/core/preferences/TBXPreferences.java (revision 3020) | ||
---|---|---|
65 | 65 |
|
66 | 66 |
public static final String EDITION_DEFINITION_BUILD = "edition_definition_build"; //$NON-NLS-1$ |
67 | 67 |
|
68 |
public static final String EDITION_DEFINITION_PAGINATE = "edition_definition_paginate"; //$NON-NLS-1$ |
|
69 |
|
|
68 | 70 |
public static final String EDITION_DEFINITION_WORDS_PER_PAGE = "edition_definition_words_per_page"; //$NON-NLS-1$ |
69 | 71 |
|
70 | 72 |
public static final String EDITION_DEFINITION_PAGE_BREAK_ELEMENT = "edition_definition_page_break_element"; //$NON-NLS-1$ |
tmp/org.txm.core/src/java/org/txm/objects/EditionDefinition.java (revision 3020) | ||
---|---|---|
46 | 46 |
return node.getBoolean(TBXPreferences.EDITION_DEFINITION_BUILD, false); |
47 | 47 |
} |
48 | 48 |
|
49 |
public boolean getPaginateEdition() { |
|
50 |
return node.getBoolean(TBXPreferences.EDITION_DEFINITION_PAGINATE, true); |
|
51 |
} |
|
52 |
|
|
49 | 53 |
public String getImagesDirectory() { |
50 | 54 |
return node.get(TBXPreferences.EDITION_DEFINITION_IMAGES_DIRECTORY, null); |
51 | 55 |
} |
... | ... | |
114 | 118 |
} |
115 | 119 |
return buffer.toString(); |
116 | 120 |
} |
121 |
|
|
122 |
public void setPaginateEdition(boolean paginate) { |
|
123 |
node.putBoolean(TBXPreferences.EDITION_DEFINITION_PAGINATE, paginate); |
|
124 |
} |
|
117 | 125 |
} |
Formats disponibles : Unified diff