Révision 3016
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2Transcriber.groovy (revision 3016) | ||
---|---|---|
20 | 20 |
this.xmlfile = xmlfile; |
21 | 21 |
} |
22 | 22 |
|
23 |
public boolean process(File outfile, boolean retokenizeWords) {
|
|
23 |
public boolean process(File outfile) { |
|
24 | 24 |
|
25 | 25 |
if (!xmlfile.exists()) return false; |
26 | 26 |
|
... | ... | |
172 | 172 |
|
173 | 173 |
// split before the word |
174 | 174 |
def puncts = [] |
175 |
if (retokenizeWords) { |
|
176 |
|
|
177 |
while (word.length() > 0 && word.matches("\\p{Punct}.+")) { |
|
178 |
puncts << word.substring(0 ,1) |
|
179 |
word = word.substring(1) |
|
180 |
} |
|
181 |
|
|
182 |
// fix "d'abord" like words |
|
183 |
Pattern reg = Pattern.compile("([^']+')(.+)") |
|
184 |
def m = reg.matcher(word) |
|
185 |
while (word.length() > 0 && m.matches()) { |
|
186 |
puncts << m.group(1) |
|
187 |
word = m.group(2) |
|
188 |
m = reg.matcher(word) |
|
189 |
} |
|
190 |
} |
|
191 |
|
|
175 |
|
|
192 | 176 |
for (def punct : puncts) { // pre-retokenize if any |
193 | 177 |
writer.writeStartElement("w") |
194 | 178 |
for (String attr : winfos.keySet()) { |
... | ... | |
198 | 182 |
writer.writeEndElement() // w |
199 | 183 |
writer.writeCharacters("\n") |
200 | 184 |
} |
201 |
|
|
202 |
puncts = [] |
|
203 |
if (retokenizeWords) { |
|
204 |
while (word.length() > 0 && word.matches(".+\\p{Punct}")) { |
|
205 |
puncts.add(0, word.substring(word.length()-1, word.length())) |
|
206 |
word = word.substring(0, word.length()-1) |
|
207 |
} |
|
208 |
} |
|
209 |
|
|
185 |
|
|
210 | 186 |
if (word.length() > 0) { |
211 | 187 |
writer.writeStartElement("w") // start the initial word |
212 | 188 |
for (String attr : winfos.keySet()) { |
... | ... | |
216 | 192 |
writer.writeEndElement() // w |
217 | 193 |
writer.writeCharacters("\n") |
218 | 194 |
} |
219 |
|
|
220 |
for (String punct : puncts) { // write post-retokenize if any |
|
221 |
writer.writeStartElement("w") |
|
222 |
for (String attr : winfos.keySet()) { |
|
223 |
writer.writeAttribute(attr, winfos[attr]) |
|
224 |
} |
|
225 |
writer.writeCharacters(punct) |
|
226 |
writer.writeEndElement() // w |
|
227 |
writer.writeCharacters("\n") |
|
228 |
} |
|
195 |
|
|
229 | 196 |
break |
230 | 197 |
} |
231 | 198 |
break |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2TranscriberMacro.groovy (revision 3016) | ||
---|---|---|
13 | 13 |
|
14 | 14 |
@Field @Option(name="resultDirectory", usage="The result directory", widget="Folder", required=false, def="") |
15 | 15 |
File resultDirectory; |
16 |
|
|
17 |
@Field @Option(name="retokenize_words", usage="retokenize words prefixed or postfixed with puunctuations", widget="Boolean", required=true, def="true") |
|
18 |
Boolean retokenize_words; |
|
19 | 16 |
|
17 |
|
|
20 | 18 |
if (!ParametersDialog.open(this)) return; |
21 | 19 |
|
22 | 20 |
resultDirectory.mkdirs(); |
... | ... | |
47 | 45 |
String name = FileUtils.stripExtension(xmlFile) |
48 | 46 |
File outFile = new File(resultDirectory, name+".trs") |
49 | 47 |
|
50 |
if (!v2t.process(outFile, retokenize_words)) {
|
|
48 |
if (!v2t.process(outFile)) { |
|
51 | 49 |
println "WARNING: ERROR WHILE PROCESSING: "+xmlFile |
52 | 50 |
return false |
53 | 51 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/FixTranscriptionsMacro.groovy (revision 3016) | ||
---|---|---|
1 |
package org.txm.macro.projects.nov13 |
|
2 |
|
|
3 |
import java.time.LocalTime |
|
4 |
import java.time.format.DateTimeFormatter |
|
5 |
import org.txm.utils.* |
|
6 |
import org.txm.utils.logger.* |
|
7 |
|
|
8 |
// Macro: runs FixTranscription on one TRS file, or on every ".trs" file of a
// directory, and writes the fixed files (same base name) into resultDirectory.
// NOTE(review): no import for @Field, @Option or ParametersDialog is visible in
// this file - presumably they are provided by the macro runner; confirm.

// Single input file; only used when trsDirectory is unset or does not exist.
@Field @Option(name="trsFile", usage="A single vocapia XML file", widget="FileOpen", required=false, def="")
File trsFile;

// Input directory; takes precedence over trsFile when it exists.
@Field @Option(name="trsDirectory", usage="A Vocapia XML files directory to process", widget="Folder", required=false, def="")
File trsDirectory;

// Output directory; created if missing.
@Field @Option(name="resultDirectory", usage="The result directory", widget="Folder", required=false, def="")
File resultDirectory;

if (!ParametersDialog.open(this)) return;

// Every result is written into resultDirectory, so it cannot be left unset.
if (resultDirectory == null) {
	println "No result directory set: aborting."
	return false
}

// Result files keep the name of their source file: refuse to write them next
// to the sources. File.equals(null) is false, so a null trsDirectory or a
// trsFile without a parent directory is handled without a NullPointerException.
File trsFileDirectory = trsFile?.getParentFile()
if (resultDirectory.equals(trsDirectory) || resultDirectory.equals(trsFileDirectory)) {
	println "The result directory must not be the directory of the source files: $resultDirectory"
	return false;
}

resultDirectory.mkdirs();

// Collect the files to process.
def trsFiles = []
if (trsDirectory != null && trsDirectory.exists()) {
	
	println "Processing TRS directory: $trsDirectory"
	for (File file : trsDirectory.listFiles()) {
		if (file.getName().toLowerCase().endsWith(".trs")) {
			trsFiles << file
		}
	}
} else if (trsFile != null && trsFile.exists()) {
	println "Processing TRS file: $trsFile"
	trsFiles << trsFile
}

if (trsFiles.size() == 0) {
	println "No XML file found for parameters trsFile=$trsFile and trsDirectory=$trsDirectory"
	return false
}

// Fix each file; the macro stops at the first file that cannot be processed.
ConsoleProgressBar cpb = new ConsoleProgressBar(trsFiles.size())
for (File file : trsFiles) {
	cpb.tick()
	FixTranscription fixer = new FixTranscription(file)
	String name = FileUtils.stripExtension(file)
	File outFile = new File(resultDirectory, name+".trs")
	
	if (!fixer.process(outFile)) {
		println "WARNING: ERROR WHILE PROCESSING: "+file
		return false
	}
}
cpb.done()

println "Done: "+trsFiles.size()+" files processed. Result files in $resultDirectory"
|
59 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/FixTranscription.groovy (revision 3016) | ||
---|---|---|
1 |
package org.txm.macro.projects.nov13 |
|
2 |
|
|
3 |
import javax.xml.stream.* |
|
4 |
|
|
5 |
import org.txm.importer.PersonalNamespaceContext |
|
6 |
import org.txm.xml.IdentityHook |
|
7 |
import org.txm.xml.* |
|
8 |
|
|
9 |
import java.io.BufferedOutputStream |
|
10 |
import java.io.FileOutputStream |
|
11 |
import java.io.IOException |
|
12 |
import java.net.URL |
|
13 |
import java.util.* |
|
14 |
import java.util.Map.Entry |
|
15 |
import java.util.regex.Pattern |
|
16 |
|
|
17 |
/**
 * XML processor that rewrites the "w" elements of a transcription file.
 *
 * Each "w" element is buffered and re-emitted when it closes:
 * - a word starting with "*" closes the current Turn and opens a Turn whose
 *   speaker is "other" (the original speaker is kept in "orig-speaker");
 * - a word ending with "*" while in such a Turn closes it and re-opens the
 *   original Turn, starting at the end time of that word;
 * - the word "XXX" is replaced by an "event" element.
 * Everything else is delegated to the parent IdentityHook implementation.
 */
class FixTranscription extends XMLProcessor {
	
	LocalNamesHookActivator activator;
	IdentityHook hook;
	
	/**
	 * @param xmlfile the transcription file to read
	 */
	public FixTranscription(File xmlfile) {
		super(xmlfile)
		
		// NOTE(review): 'hook' is still null at this point; presumably the IdentityHook
		// constructor below registers itself on the activator - confirm.
		activator = new LocalNamesHookActivator<>(hook, ["w", "Turn", "Sync"]);
		
		hook = new IdentityHook("word_hook", activator, this) {
					
					// set when a Turn starts; not read anywhere in this class
					boolean inTurn = false;
					
					// true between the start and the end of a "w" element
					boolean inW = false;
					// text content of the "w" element being read
					StringBuilder wordBuffer = new StringBuilder();
					
					// last time value seen on a Turn ("startTime"), Sync ("time") or w ("time")
					String currentTime;
					// attributes of the last Turn / w start element, in document order
					LinkedHashMap turnInfos = new LinkedHashMap()
					LinkedHashMap wInfos = new LinkedHashMap()
					// true while an 'other' Turn opened by a "*" prefixed word is still open
					boolean other
					
					@Override
					public boolean deactivate() {
						return true;
					}
					
					@Override
					public boolean _activate() {
						return true;
					}
					
					/**
					 * Records the attributes and time of Turn, Sync and w elements.
					 * Turn, Sync and any other element are written by the parent
					 * implementation; "w" is NOT written here, it is emitted later by
					 * processEndElement() once its text is known.
					 */
					@Override
					protected void processStartElement() throws XMLStreamException, IOException {
						if (localname.equals("Turn")) {
							// store values
							inTurn = true;
							turnInfos.clear()
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								turnInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
							}
							currentTime = turnInfos["startTime"]
							super.processStartElement();
						} else if (localname.equals("Sync")) {
							currentTime = parser.getAttributeValue(null, "time")
							super.processStartElement();
						} else if (localname.equals("w")) {
							// store values
							inW = true;
							wInfos.clear()
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								wInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
							}
							String time = parser.getAttributeValue(null, "time")
							if (time != null && time.length() > 0) {
								currentTime = time
							}
							wordBuffer.setLength(0);
							return; // write w later
						}
						else {
							super.processStartElement();
						}
					}
					
					/**
					 * Buffers the text found inside a "w" element; any other text is
					 * handled by the parent implementation.
					 */
					@Override
					protected void processCharacters() throws XMLStreamException {
						if (inW) {
							wordBuffer.append(parser.getText())
						}
						else {
							super.processCharacters();
						}
					}
					
					/**
					 * Emits the buffered word when a "w" element closes, opening and
					 * closing the 'other' Turn around "*" delimited words; any other
					 * end element is handled by the parent implementation.
					 */
					@Override
					protected void processEndElement() throws XMLStreamException {
						if (localname.equals("w")) {
							
							inW = false
							String word = wordBuffer.toString().trim()
							if (!other && word.startsWith("*")) {
								//close current Turn and start a 'other' Turn
								writer.writeEndElement() // current Turn
								writer.writeCharacters("\n")
								
								// same attributes as the current Turn, in the same order
								def tmpInfos = new LinkedHashMap(turnInfos)
								tmpInfos["orig-speaker"] = turnInfos["speaker"]
								tmpInfos["speaker"] = "other"
								tmpInfos["startTime"] = currentTime
								writer.writeStartElement("Turn")
								for (String attr : tmpInfos.keySet()) {
									writer.writeAttribute(attr, tmpInfos[attr])
								}
								
								other = true
								word = word.substring(1) // drop the leading "*"
							}
							
							// the 'other' Turn can only be closed after the word itself is written
							boolean shouldCloseOtherTurn = false;
							if (other && word.endsWith("*")) {
								shouldCloseOtherTurn = true;
								
								word = word.substring(0, word.length()-1) // drop the trailing "*"
								other = false
							}
							
							if ("XXX".equals(word)) { // <Event desc="XXX" type="unknown" extent="next"/>
								// NOTE(review): the element is written as lowercase "event" with
								// extent "instantaneous", unlike the example above - confirm which is expected
								writer.writeStartElement("event") // start the initial word
								writer.writeAttribute("desc", "XXX from "+wInfos["start"] + " to "+wInfos["end"])
								writer.writeAttribute("type", "unknown")
								writer.writeAttribute("extent", "instantaneous")
								writer.writeEndElement() // event
								word = "" // don't write the word
							}
							
							if (word.length() > 0) {
								
								writer.writeStartElement("w") // start the initial word
								for (String attr : wInfos.keySet() ) {
									writer.writeAttribute(attr, wInfos[attr])
								}
								writer.writeCharacters(word)
								writer.writeEndElement() // w
							}
							
							if (shouldCloseOtherTurn) {
								//close the current 'other' Turn and restart the actual Turn
								writer.writeEndElement() // current 'other' Turn
								
								writer.writeStartElement("Turn") // rebuild the orig Turn and fix its start-end infos
								turnInfos["startTime"] = wInfos["end"] // fix the startTime using the current word end time
								for (String attr : turnInfos.keySet()) {
									writer.writeAttribute(attr, turnInfos[attr])
								}
							}
						} else {
							super.processEndElement();
						}
					}
				}
	}
	
	/**
	 * Developer test entry point: processes a hard-coded test file and prints
	 * the result of process().
	 */
	public static void main(String[] args) {
		File infile = new File("/home/mdecorde/xml/vocapia","test.trs")
		File outfile = new File("/home/mdecorde/xml/vocapia","test-fixed.trs")
		// the class only declares a (File) constructor: the former (File, boolean) call could not be resolved
		def processor = new FixTranscription(infile)
		println processor.process(outfile)
	}
}
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZFacsPagerStep.groovy (revision 3016) | ||
---|---|---|
244 | 244 |
closeMultiWriter() |
245 | 245 |
if (parser != null) parser.close(); |
246 | 246 |
if (inputData != null) inputData.close(); |
247 |
|
|
247 | 248 |
pages << [htmlFile, wordid] // add the last page (no pb encountered |
248 | 249 |
|
249 | 250 |
return pages.size() > 1 |
... | ... | |
303 | 304 |
} |
304 | 305 |
} |
305 | 306 |
closeMultiWriter() |
306 |
pages << [htmlFile, wordid] // add the last page (no pb encountered |
|
307 |
pages << [htmlFile, wordid] // add the last page (no pb encountered)
|
|
307 | 308 |
|
308 |
return pages.size() > 1 |
|
309 |
return pages.size() >= 1
|
|
309 | 310 |
} |
310 | 311 |
|
311 | 312 |
public void printStartPage() { |
... | ... | |
338 | 339 |
} |
339 | 340 |
|
340 | 341 |
public static void main(String[] args) { |
341 |
File txmFile = new File("/home/mdecorde/TXM/corpora/BVHEPISTEMON2016/txm/BVHEPISTEMON2016/1546_RabelaisTL.xml")
|
|
342 |
File txmFile = new File(System.getProperty("user.home"), "TXM/corpora/BVHEPISTEMON2016/txm/BVHEPISTEMON2016/1546_RabelaisTL.xml")
|
|
342 | 343 |
String txtname = "1532_RabelaisPrnstctn" |
343 | 344 |
String corpusname = "BVH" |
344 | 345 |
File newEditionDirectory = new File("/tmp/xtzpagertest/") |
Formats disponibles : Unified diff