Révision 3016

tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2Transcriber.groovy (revision 3016)
20 20
		this.xmlfile = xmlfile;
21 21
	}
22 22
	
23
	public boolean process(File outfile, boolean retokenizeWords) {
23
	public boolean process(File outfile) {
24 24
		
25 25
		if (!xmlfile.exists()) return false;
26 26
		
......
172 172
							
173 173
							// split before the word
174 174
								def puncts = []
175
								if (retokenizeWords) {
176
									
177
									while (word.length() > 0 && word.matches("\\p{Punct}.+")) {
178
										puncts << word.substring(0 ,1)
179
										word = word.substring(1)
180
									}
181
									
182
									// fix "d'abord" like words
183
									Pattern reg = Pattern.compile("([^']+')(.+)")
184
									def m = reg.matcher(word)
185
									while (word.length() > 0 && m.matches()) {
186
										puncts << m.group(1)
187
										word = m.group(2)
188
										m = reg.matcher(word)
189
									}
190
								}
191
							
175
															
192 176
								for (def punct : puncts) { // pre-retokenize if any
193 177
									writer.writeStartElement("w")
194 178
									for (String attr : winfos.keySet()) {
......
198 182
									writer.writeEndElement() // w
199 183
									writer.writeCharacters("\n")
200 184
								}
201
							
202
								puncts = []
203
								if (retokenizeWords) {
204
									while (word.length() > 0 && word.matches(".+\\p{Punct}")) {
205
										puncts.add(0, word.substring(word.length()-1, word.length()))
206
										word = word.substring(0, word.length()-1)
207
									}
208
								}
209
							
185
														
210 186
								if (word.length() > 0) {
211 187
									writer.writeStartElement("w") // start the initial word
212 188
									for (String attr : winfos.keySet()) {
......
216 192
									writer.writeEndElement() // w
217 193
									writer.writeCharacters("\n")
218 194
								}
219
							
220
								for (String punct : puncts) {  // write post-retokenize if any
221
									writer.writeStartElement("w")
222
									for (String attr : winfos.keySet()) {
223
										writer.writeAttribute(attr, winfos[attr])
224
									}
225
									writer.writeCharacters(punct)
226
									writer.writeEndElement() // w
227
									writer.writeCharacters("\n")
228
								}
195

  
229 196
								break
230 197
						}
231 198
						break
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2TranscriberMacro.groovy (revision 3016)
13 13

  
14 14
@Field @Option(name="resultDirectory", usage="The result directory", widget="Folder", required=false, def="")
15 15
		File resultDirectory;
16
		
17
@Field @Option(name="retokenize_words", usage="retokenize words prefixed or postfixed with puunctuations", widget="Boolean", required=true, def="true")
18
		Boolean retokenize_words;
19 16

  
17

  
20 18
if (!ParametersDialog.open(this)) return;
21 19

  
22 20
resultDirectory.mkdirs();
......
47 45
	String name = FileUtils.stripExtension(xmlFile)
48 46
	File outFile = new File(resultDirectory, name+".trs")
49 47
	
50
	if (!v2t.process(outFile, retokenize_words)) {
48
	if (!v2t.process(outFile)) {
51 49
		println "WARNING: ERROR WHILE PROCESSING: "+xmlFile
52 50
		return false
53 51
	}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/FixTranscriptionsMacro.groovy (revision 3016)
1
package org.txm.macro.projects.nov13
2

  
3
import java.time.LocalTime
4
import java.time.format.DateTimeFormatter
5
import org.txm.utils.*
6
import org.txm.utils.logger.*
7

  
8
@Field @Option(name="trsFile", usage="A single vocapia XML file", widget="FileOpen", required=false, def="")
9
		File trsFile;
10

  
11
@Field @Option(name="trsDirectory", usage="A Vocapia XML files directory to process", widget="Folder", required=false, def="")
12
		File trsDirectory;
13

  
14
@Field @Option(name="resultDirectory", usage="The result directory", widget="Folder", required=false, def="")
15
		File resultDirectory;
16

  
17
if (!ParametersDialog.open(this)) return;
18

  
19
if (resultDirectory.equals(trsDirectory) || (trsFile != null && trsFile.getParentFile().equals(resultDirectory))) { // results are written as <name>.trs, so writing next to the inputs would overwrite them
20
	println "ERROR: the result directory must differ from the directory of the input TRS files: $resultDirectory"; return false; // say why the macro stops instead of aborting silently
21
}
22

  
23
resultDirectory.mkdirs();
24

  
25
def trsFiles = []
26
if (trsDirectory != null && trsDirectory.exists()) {
27
	
28
	println "Processing TRS directory: $trsDirectory"
29
	for (File file : trsDirectory.listFiles()) {
30
		if (file.getName().toLowerCase().endsWith(".trs")) {
31
			trsFiles << file
32
		}
33
	}
34
} else if (trsFile != null && trsFile.exists()) {
35
	println "Processing TRS file: $trsFile"
36
	trsFiles << trsFile
37
}
38

  
39
if (trsFiles.size() == 0) { // nothing was collected from trsDirectory nor from trsFile
40
	println "No TRS file found for parameters trsFile=$trsFile and trsDirectory=$trsDirectory" // only *.trs files are collected, so report TRS (not XML) files as missing
41
	return false
42
}
43

  
44
ConsoleProgressBar cpb = new ConsoleProgressBar(trsFiles.size())
45
for (File file : trsFiles) {
46
	cpb.tick()
47
	FixTranscription fixer = new FixTranscription(file)
48
	String name = FileUtils.stripExtension(file)
49
	File outFile = new File(resultDirectory, name+".trs")
50
	
51
	if (!fixer.process(outFile)) {
52
		println "WARNING: ERROR WHILE PROCESSING: "+file
53
		return false
54
	}
55
}
56
cpb.done()
57

  
58
println "Done: "+trsFiles.size()+" files processed. Result files in $resultDirectory"
59

  
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/FixTranscription.groovy (revision 3016)
1
package org.txm.macro.projects.nov13
2

  
3
import javax.xml.stream.*
4

  
5
import org.txm.importer.PersonalNamespaceContext
6
import org.txm.xml.IdentityHook
7
import org.txm.xml.*
8

  
9
import java.io.BufferedOutputStream
10
import java.io.FileOutputStream
11
import java.io.IOException
12
import java.net.URL
13
import java.util.*
14
import java.util.Map.Entry
15
import java.util.regex.Pattern
16

  
17
class FixTranscription extends XMLProcessor {
18
	
19
	LocalNamesHookActivator activator;
20
	IdentityHook hook;
21
	
22
	public FixTranscription(File xmlfile) {
23
		super(xmlfile)
24
		
25
		activator = new LocalNamesHookActivator<>(hook, ["w", "Turn", "Sync"]);
26
		
27
		hook = new IdentityHook("word_hook", activator, this) {
28
					
29
					boolean inTurn = false;
30
					
31
					boolean inW = false;
32
					StringBuilder wordBuffer = new StringBuilder();
33
					
34
					String currentTime;
35
					LinkedHashMap turnInfos = new LinkedHashMap()
36
					LinkedHashMap wInfos = new LinkedHashMap()
37
					boolean other
38
					
39
					// Deactivation callback of the hook: does no work and always returns true.
					@Override
40
					public boolean deactivate() {
41
						return true;
42
					}
43
					
44
					// Activation callback of the hook: does no work and always returns true.
					@Override
45
					public boolean _activate() {
46
						return true;
47
					}
48
					
49
					// Handles an element start. Turn and Sync update the current time and are passed to the
					// parent implementation; a w element only has its attributes recorded, its output is
					// deferred to processEndElement(); any other element is passed to the parent implementation.
					@Override
50
					protected void processStartElement() throws XMLStreamException, IOException {
51
						if (localname.equals("Turn")) {
52
							// remember the Turn attributes so that the Turn can be re-opened later
53
							inTurn = true; // NOTE(review): inTurn is set here but never read in this hook
54
							turnInfos.clear()
55
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
56
								turnInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
57
							}
58
							currentTime = turnInfos["startTime"] // null when the Turn has no startTime attribute
59
							super.processStartElement();
60
						} else if (localname.equals("Sync")) {
61
							currentTime = parser.getAttributeValue(null, "time") // a Sync moves the current time
62
							super.processStartElement();
63
						} else if (localname.equals("w")) {
64
							// remember the word attributes, they are written back in processEndElement()
65
							inW = true;
66
							wInfos.clear()
67
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
68
								wInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
69
							}
70
							String time = parser.getAttributeValue(null, "time")
71
							if (time != null && time.length() > 0) { // only a non-empty time attribute overrides the current time
72
								currentTime = time
73
							}
74
							wordBuffer.setLength(0); // start collecting the text of this word from scratch
75
							return; // write w later
76
						}
77
						else {
78
							super.processStartElement();
79
						}
80
					}
81
					
82
					// Handles character data: text found inside a w element is accumulated in wordBuffer
					// instead of being written; any other text is passed to the parent implementation.
					@Override
83
					protected void processCharacters() throws XMLStreamException {
84
						if (inW) {
85
							wordBuffer.append(parser.getText()) // the word text is written in processEndElement()
86
						}
87
						else {
88
							super.processCharacters();
89
						}
90
					}
91
					
92
					// Handles an element end. For a w element the buffered word is inspected and written:
					//  - a leading '*' closes the currently open element and opens a Turn whose speaker is "other",
					//  - a trailing '*' (while in such a section) closes it after the word and re-opens the original Turn,
					//  - the word "XXX" is replaced with an event element,
					//  - any remaining non-empty word is written as a w element carrying the recorded attributes.
					// Every other element end is passed to the parent implementation.
					// NOTE(review): 'other' is not reset when a Turn ends, so an unterminated '*' section
					// appears to carry over into the following Turn - confirm this is intended.
					@Override
93
					protected void processEndElement() throws XMLStreamException {
94
						if (localname.equals("w")) {
95
							
96
							inW = false
97
							String word = wordBuffer.toString().trim()
98
							if (!other && word.startsWith("*")) { // a leading '*' starts an 'other' speaker section
99
								//close current Turn and start a 'other' Turn
100
								writer.writeEndElement() // current Turn (assumes the Turn is the innermost open element - TODO confirm)
101
								writer.writeCharacters("\n")
102
								
103
								def tmpInfos = new LinkedHashMap() // copy of the Turn attributes, turnInfos itself is kept to restore the Turn later
104
								for (String attr : turnInfos.keySet()) tmpInfos[attr] = turnInfos[attr]
105
								tmpInfos["orig-speaker"] = turnInfos["speaker"] // keep a trace of the real speaker
106
								tmpInfos["speaker"] = "other"
107
								tmpInfos["startTime"] = currentTime // the 'other' Turn starts at the last known time
108
								writer.writeStartElement("Turn")
109
								for (String attr : tmpInfos.keySet()) {
110
									writer.writeAttribute(attr, tmpInfos[attr])
111
								}
112
								
113
								other = true
114
								word = word.substring(1) // drop the '*' marker
115
							}
116
							
117
							boolean shouldCloseOtherTurn = false; // the 'other' Turn is closed only after the word is written
118
							if (other && word.endsWith("*")) { // a trailing '*' ends the 'other' speaker section
119
								shouldCloseOtherTurn = true;
120
								
121
								word = word.substring(0, word.length()-1) // drop the '*' marker
122
								other = false
123
							}
124
							
125
							if ("XXX".equals(word)) { // <Event desc="XXX" type="unknown" extent="next"/>
126
								writer.writeStartElement("event") // emit an event in place of the word; NOTE(review): element name and extent differ from the sample in the comment above - confirm
127
								writer.writeAttribute("desc", "XXX from "+wInfos["start"] + " to "+wInfos["end"])
128
								writer.writeAttribute("type", "unknown")
129
								writer.writeAttribute("extent", "instantaneous")
130
								writer.writeEndElement() // event
131
								word = "" // don't write the word
132
							}
133
														
134
							if (word.length() > 0) { // nothing is written when only markers (or XXX) were found
135
								
136
								writer.writeStartElement("w") // start the initial word
137
								for (String attr : wInfos.keySet() ) {
138
									writer.writeAttribute(attr, wInfos[attr])
139
								}
140
								writer.writeCharacters(word)
141
								writer.writeEndElement() // w
142
							}
143
						
144
							if (shouldCloseOtherTurn) {
145
								shouldCloseOtherTurn = false;
146
								//close the current 'other' Turn and restart the actual Turn
147
								writer.writeEndElement() // current 'other' Turn
148
								
149
								writer.writeStartElement("Turn") // rebuild the orig Turn and fix its start-end infos
150
								turnInfos["startTime"] = wInfos["end"] // fix the startTime using the current word end time
151
								for (String attr : turnInfos.keySet()) {
152
									writer.writeAttribute(attr, turnInfos[attr])
153
								}
154
								
155
								other = false // already false at this point (reset when the trailing '*' was detected)
156
							}
157
						} else {
158
							super.processEndElement();
159
						}
160
					}
161
				}
162
	}
163
	
164
	public static void main(String[] args) { // manual test entry point: fixes one hard-coded TRS file and prints the result of process()
165
		File infile = new File("/home/mdecorde/xml/vocapia","test.trs")
166
		File outfile = new File("/home/mdecorde/xml/vocapia","test-fixed.trs")
167
		def processor = new FixTranscription(infile) // the only declared constructor takes the input file alone
168
		println processor.process(outfile)
169
	}
170
}
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZFacsPagerStep.groovy (revision 3016)
244 244
		closeMultiWriter()
245 245
		if (parser != null) parser.close();
246 246
		if (inputData != null) inputData.close();
247
		
247 248
		pages << [htmlFile, wordid] // add the last page (no pb encountered
248 249

  
249 250
		return pages.size() > 1
......
303 304
			}
304 305
		}
305 306
		closeMultiWriter()
306
		pages << [htmlFile, wordid] // add the last page (no pb encountered
307
		pages << [htmlFile, wordid] // add the last page (no pb encountered)
307 308

  
308
		return pages.size() > 1
309
		return pages.size() >= 1
309 310
	}
310 311

  
311 312
	public void printStartPage() {
......
338 339
	}
339 340
	
340 341
	public static void main(String[] args) {
341
		File txmFile = new File("/home/mdecorde/TXM/corpora/BVHEPISTEMON2016/txm/BVHEPISTEMON2016/1546_RabelaisTL.xml")
342
		File txmFile = new File(System.getProperty("user.home"), "TXM/corpora/BVHEPISTEMON2016/txm/BVHEPISTEMON2016/1546_RabelaisTL.xml")
342 343
		String txtname = "1532_RabelaisPrnstctn"
343 344
		String corpusname = "BVH"
344 345
		File newEditionDirectory = new File("/tmp/xtzpagertest/")

Formats disponibles : Unified diff