root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / pager.groovy @ 2369
History | View | Annotate | Download (26.9 kB)
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
---|---|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
4 |
// Sophia Antipolis, University of Paris 3.
|
5 |
//
|
6 |
// The TXM platform is free software: you can redistribute it
|
7 |
// and/or modify it under the terms of the GNU General Public
|
8 |
// License as published by the Free Software Foundation,
|
9 |
// either version 2 of the License, or (at your option) any
|
10 |
// later version.
|
11 |
//
|
12 |
// The TXM platform is distributed in the hope that it will be
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 |
// PURPOSE. See the GNU General Public License for more
|
16 |
// details.
|
17 |
//
|
18 |
// You should have received a copy of the GNU General
|
19 |
// Public License along with the TXM platform. If not, see
|
20 |
// http://www.gnu.org/licenses.
|
21 |
//
|
22 |
//
|
23 |
//
|
24 |
// $LastChangedDate:$
|
25 |
// $LastChangedRevision:$
|
26 |
// $LastChangedBy:$
|
27 |
//
|
28 |
package org.txm.scripts.importer.transcriber
|
29 |
|
30 |
import java.io.File; |
31 |
import java.util.ArrayList; |
32 |
|
33 |
import javax.xml.stream.* |
34 |
|
35 |
import org.txm.importer.ApplyXsl2 |
36 |
import org.txm.metadatas.MetadataGroup |
37 |
import org.txm.metadatas.Metadatas |
38 |
import org.txm.utils.io.FileCopy; |
39 |
|
40 |
|
41 |
// TODO: Auto-generated Javadoc
|
42 |
/** Builds the paginated HTML edition of a transcription from its XML source file.
|
43 |
*
|
44 |
* @author mdecorde
|
45 |
*
|
46 |
*/
|
47 |
class pager { |
48 |
|
49 |
// ---- rendering options ----

/** When true, only the "lemma" and "pos" analyses are flagged for the word tooltips. */
boolean SIMPLE_TOOLTIP = false
/** Name of the element opened around the speech of an interviewer. */
String ENQ_HIGHLIGHT_ELEMENT = "b"

/** Word forms that are written without a space before them. */
List<String> NoSpaceBefore

/** Word forms (or final characters) after which no space is written. */
List<String> NoSpaceAfter

// ---- pagination results ----

/** The page files, one per page break, in order. */
def pages = []
/** The word ids recorded at the first word and at each page break. */
def indexes = []

/** Number of words read since the last page break. */
int wordcount = 0

/** Not referenced anywhere else in this class. */
int pagecount = 0

/** Number of words after which a page break is requested at the end of the turn. */
int wordmax = 10

// ---- state of the word being read ----

/** Id of the last word element read. */
String wordid

/** Not read by process(), which declares its own local of the same name. */
boolean firstWord = true

/** Accumulated text of the form of the current word. */
String wordvalue

/** Accumulated analyses of the current word; used as the tooltip text. */
String interpvalue

/** Form of the previously written word. */
String lastword = " "

/** Not referenced anywhere else in this class. */
String wordtype

/** True while the parser is inside a form element. */
boolean flagform = false

/** True while the parser is inside an ana element whose text must be kept. */
boolean flaginterp = false

/** Set when a Comment element has been read; never reset in this class. */
boolean flagcomment = false

// ---- input ----

/** URL of the source file. */
private def url

/** Stream opened on the source URL. */
private def inputData

/** StAX factory that created the parser. */
private def factory

/** StAX reader over the source file. */
private XMLStreamReader parser

// ---- output ----

/** StAX writer producing the one-page HTML file. */
XMLStreamWriter writer
/** Byte stream underlying the writer. */
BufferedOutputStream output

/** The source file. */
File txmfile

/** The temporary one-page HTML file. */
File outfile

/** Corpus name, used for the corpus stylesheet link and the media links. */
String corpusname = ""
/** Set by the constructor; not read anywhere else in this class. */
String cuttingTag = "pb"
/** Text name, used to name the page files and in the media links. */
String txtname
/** Directory receiving the "default" and "onepage" sub-directories. */
File htmlDir
/** The "default" sub-directory of htmlDir, where the page files are written. */
File defaultDir
/** Set by the constructor; not read anywhere else in this class. */
Metadatas metadatas

// ---- transcription state ----

/** Values of the text attributes whose name starts with "enq". */
def interviewers = []
/** Event codes and the description displayed in their place. */
def eventTranslations = [
	"^^":"mot inconnu",
	"?":"orthographe incertaine",
	"()":"rupture de syntaxe",
	"b":"bruit indéterminé",
	"*":"mot corrigé",
	"bb":"bruit de bouche",
	"bg":"bruit de gorge",
	"ch":"voix chuchotée",
	"conv":"conversations de fond",
	"e":"expiration",
	"i":"inspiration",
	"mic":"bruits micro",
	"n":"reniflement",
	"nontrant":"non transcrit",
	"pap":"froissement de papiers",
	"pf":"souffle",
	"pi":"inintelligible",
	"pif":"inaudible",
	"r":"respiration",
	"rire":"rire du locuteur",
	"shh":"soufle électrique",
	"sif":"sifflement du locuteur",
	"tx":"toux"
]
/** Value of the "time" attribute of the last u element read. */
String currentTime = ""
/** True when the last speaker written is one of the interviewers. */
boolean bold = false
/** Count of words, events and comments written so far; never reset in this class. */
int writenLength = 0
/** True once the current turn contains a word, an event or a comment. */
boolean spokenTurn = false
/** Set to true at the start of each turn; not read anywhere else in this class. */
boolean firstSync = false
/** Set to true at the start of each turn; not read anywhere else in this class. */
boolean firstWho = false
142 |
/**
 * Instantiates a new pager and immediately builds the HTML pages of the text.
 *
 * Failures of process() are logged, not rethrown; failures while opening the
 * source file or creating the parser are propagated to the caller.
 *
 * @param txmfile the XML source file to read
 * @param htmlDir the directory receiving the "default" and "onepage" sub-directories
 * @param txtname the text name, used to name the page files
 * @param NoSpaceBefore the word forms written without a space before them
 * @param NoSpaceAfter the word forms after which no space is written
 * @param max the number of words after which a page break is requested
 * @param corpusname the corpus name, used for the corpus stylesheet and the media links
 * @param cuttingTag stored in the cuttingTag field
 * @param metadatas stored in the metadatas field
 */
pager(File txmfile, File htmlDir, String txtname, List<String> NoSpaceBefore,
		List<String> NoSpaceAfter, int max, String corpusname, String cuttingTag, Metadatas metadatas) {
	this.metadatas = metadatas
	this.wordmax = max
	this.cuttingTag = cuttingTag
	this.corpusname = corpusname
	this.NoSpaceBefore = NoSpaceBefore
	this.NoSpaceAfter = NoSpaceAfter
	this.url = txmfile.toURI().toURL()
	this.txmfile = txmfile
	this.htmlDir = htmlDir
	this.txtname = txtname

	inputData = url.openStream()
	factory = XMLInputFactory.newInstance()
	parser = factory.createXMLStreamReader(inputData)

	// mkdirs() also creates htmlDir itself when it does not exist yet
	defaultDir = new File(htmlDir, "default")
	defaultDir.mkdirs()
	new File(htmlDir, "onepage").mkdirs()
	outfile = new File(htmlDir, "onepage/${txtname}.html")

	if (!createOutput(outfile)) {
		// createOutput() already printed the cause; without a writer nothing can be produced
		closeQuietly(parser)
		closeQuietly(inputData)
		return
	}

	try {
		process()
	} catch (Exception e) {
		org.txm.utils.logger.Log.printStackTrace(e)
		closeQuietly(writer)
		closeQuietly(output)
	} finally {
		// process() closes these itself on success; closing again here covers the failure path
		closeQuietly(parser)
		closeQuietly(inputData)
	}
}

/** Calls close() on the given resource if it is not null, ignoring any exception. */
private void closeQuietly(def resource) {
	try {
		if (resource != null) {
			resource.close()
		}
	} catch (Exception ignored) {
		// best-effort cleanup: nothing useful can be done if closing fails
	}
}
185 |
|
186 |
/**
 * Opens the given file and creates the UTF-8 XML stream writer on it.
 * On failure the message of the exception is printed, the stream opened so far
 * is closed and both the output and writer fields are left null.
 *
 * @param outfile the file to write
 * @return true, if the writer has been created
 */
private boolean createOutput(File outfile) {
	try {
		XMLOutputFactory outfactory = XMLOutputFactory.newInstance()
		output = new BufferedOutputStream(new FileOutputStream(outfile))
		writer = outfactory.createXMLStreamWriter(output, "UTF-8") // creates a new file

		return true
	} catch (Exception e) {
		System.out.println(e.getLocalizedMessage())

		// do not leak the file handle when the writer could not be created
		if (output != null) {
			try {
				output.close()
			} catch (IOException ignored) {
				// best-effort cleanup: the original failure has already been reported
			}
			output = null
		}
		writer = null
		return false
	}
}
205 |
|
206 |
/** Descriptions of the events whose "begin" has been read and whose "end" has not. */
List<String> events = new ArrayList<String>()
/** Description of the last event read with the extent "previous". */
String previousEvent = ""
/** Description of an event read with the extent "next"; consumed by the next word. */
String nextEvent = ""
209 |
/**
 * Reads the source file with the StAX parser and writes the one-page HTML edition,
 * inserting a "pb" milestone element at each page break. The one-page file is then
 * split into the page files with the breakByMilestone.xsl stylesheet (or copied as
 * is when there is a single page) and deleted.
 *
 * A page break is written at the start of each div and, once wordmax words have
 * been read, at the end of the current sp element.
 */
void process() {

	String previousElem = ""
	boolean firstWord = true // local flag: the field of the same name is not used here
	boolean shouldBreak = false
	boolean overlapping = false
	int nbBreak = 0
	String previousSPK
	String localname = ""

	writer.writeStartDocument("UTF-8", "1.0")
	writer.writeStartElement("html")
	// <meta http-equiv="Content-Type" content="text/html" charset="UTF-8"/>
	writer.writeStartElement("meta")
	writer.writeAttribute("http-equiv", "Content-Type")
	writer.writeAttribute("content", "text/html")
	writer.writeAttribute("charset", "UTF-8")
	writer.writeEndElement() // meta
	writer.writeStartElement("head")
	// <link rel="stylesheet" type="text/css" href="transcriber.css"/>
	writer.writeStartElement("link")
	writer.writeAttribute("rel", "stylesheet")
	writer.writeAttribute("type", "text/css")
	writer.writeAttribute("href", "transcriber.css")
	writer.writeEndElement() // link
	writer.writeStartElement("link")
	writer.writeAttribute("rel", "stylesheet")
	writer.writeAttribute("type", "text/css")
	writer.writeAttribute("href", corpusname+".css")
	writer.writeEndElement() // link
	writer.writeEndElement() // head

	// first milestone: opens the first page
	nbBreak++
	writer.writeStartElement("body")
	writer.writeAttribute("class", "txmeditionpage")
	writer.writeEmptyElement("pb")
	writer.writeAttribute("id", ""+nbBreak)
	pages << new File(defaultDir, "${txtname}_${nbBreak}.html")

	for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
		switch (event) {
			case XMLStreamConstants.START_ELEMENT:
				localname = parser.getLocalName()
				switch (localname) {
					case "text":
						writer.writeStartElement("h1")
						writer.writeAttribute("class", "title")
						String title = parser.getAttributeValue(null, "title")

						if (title != null) {
							writer.writeCharacters(title)
						} else {
							// no title: use the file name without its last 4 characters
							writer.writeCharacters("Transcription "+txmfile.getName().substring(0, txmfile.getName().length() - 4))
						}

						writeMediaAccess("0.0")

						writer.writeEndElement() // h1

						String subtitle = parser.getAttributeValue(null, "subtitle")
						if (subtitle != null && subtitle.length() > 0) {
							writer.writeStartElement("h2")
							writer.writeAttribute("class", "subtitle")
							writer.writeCharacters(subtitle)
							writer.writeEndElement() // h2
						}

						// one table row per attribute of the text element
						writer.writeStartElement("table")
						writer.writeAttribute("class", "transcription-table")
						boolean grey = false
						for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
							// local name, as in the "div" branch: no namespace prefix in the table
							String name = parser.getAttributeLocalName(i)
							String value = parser.getAttributeValue(i)

							if ("title" == name) {
								continue // ignore "title" metadata
							}

							grey = !grey
							writer.writeStartElement("tr")
							if (grey) {
								writer.writeAttribute("style","background-color:lightgrey;")
							}

							if (value != null) {
								writer.writeStartElement("td")
								writer.writeCharacters(name)
								writer.writeEndElement() // td
								writer.writeStartElement("td")
								writer.writeCharacters(value)
								writer.writeEndElement() // td
							}
							// remember the interviewers so that writeSpeaker() can highlight them
							if (name.startsWith("enq")) {
								interviewers.add(value)
							}
							writer.writeEndElement() // tr
						}
						writer.writeEndElement() // table
						break

					case "Comment":
						spokenTurn = true
						writenLength++
						writer.writeStartElement("span")
						writer.writeAttribute("class", "comment")
						writer.writeCharacters(" ["+parser.getAttributeValue(0)+"] ")
						writer.writeEndElement() // span
						flagcomment = true
						break

					case "div":
						// each div starts a new page
						nbBreak++
						writer.writeEmptyElement("pb")
						writer.writeAttribute("id", ""+nbBreak)
						writer.writeCharacters("\n")

						pages << new File(defaultDir, "${txtname}_${nbBreak}.html")
						indexes << wordid

						wordcount = 0
						shouldBreak = false

						writer.writeStartElement("div")
						writer.writeAttribute("class", "section")

						String divType = parser.getAttributeValue(null, "type")
						writer.writeAttribute("type", ""+divType) // writes "null" when the div has no type

						String divTopic = parser.getAttributeValue(null, "topic")

						if (divType != null && divType.length() > 0) {
							writer.writeStartElement("h2")
							writer.writeAttribute("class", "section-title")
							writer.writeCharacters(divType)

							if (parser.getAttributeValue(null,"startTime") != null) {
								writeMediaAccess(parser.getAttributeValue(null,"startTime"))
							}

							writer.writeEndElement() // h2
						}

						if (divTopic != null && divTopic.length() > 0) {
							writer.writeStartElement("h2")
							writer.writeAttribute("class", "section-desc")
							writer.writeCharacters(divTopic)
							writer.writeEndElement() // h2
						}

						def metadata = new LinkedHashMap<String, String>() // attribute name -> value
						def metadataGroups = ["metadata":[]] // group name -> attribute names; "metadata" is the default group
						def metadataDeclared = false
						String declaredNames = parser.getAttributeValue(null, "metadata")
						String declaredGroups = parser.getAttributeValue(null, "metadata_groups")
						if (declaredNames != null && declaredGroups != null) {
							def names = declaredNames.split("\\|")
							def groups = declaredGroups.split("\\|")
							for (int i = 0 ; i < names.size() ; i++) {
								def m = names[i]
								// a name without a matching group falls back to the default group
								def g = (i < groups.size()) ? groups[i] : "metadata"
								metadata[m] = "" // forcing order of metadata by pre-declaring
								if (!metadataGroups.containsKey(g)) metadataGroups[g] = []
								metadataGroups[g] << m // declaring a metadata type
							}
							metadataDeclared = true
						}

						for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
							String name = parser.getAttributeLocalName(i)
							if ("type".equals(name) || "topic".equals(name) ||
									"startTime".equals(name) || "endTime".equals(name)) {
								continue // already rendered above, or not displayed
							}
							if (metadataDeclared) {
								if (!metadata.containsKey(name)) {
									continue // ignoring metadata since not in declared metadata
								}
								// declared metadata already belong to their declared group
							} else {
								metadataGroups["metadata"] << name
							}

							metadata[name] = parser.getAttributeValue(i)
						}

						// the default group always exists, so this block is always written
						if (metadataGroups.keySet().size() > 0) {
							writer.writeStartElement("p")
							writer.writeAttribute("class", "section-all-metadata")
							for (String groupName : metadataGroups.keySet()) {
								def group = metadataGroups[groupName]
								if (group.size() > 0) {
									if (groupName.equals("text")) {
										// the "text" group is rendered as titled paragraphs
										writer.writeStartElement("p")
										writer.writeAttribute("class", "section-"+groupName)
										for (String k : group) {
											writer.writeStartElement("p")
											writer.writeAttribute("class", ""+groupName)
											writer.writeStartElement("h4")
											writer.writeCharacters(k)
											writer.writeEndElement() // h4
											writer.writeCharacters(metadata[k])
											writer.writeEndElement() // p
										}
									} else {
										// any other group is rendered as a "name: value" list
										writer.writeStartElement("ul")
										writer.writeAttribute("class", "section-"+groupName)
										for (String k : group) {
											writer.writeStartElement("li")
											writer.writeAttribute("class", ""+groupName)
											writer.writeCharacters(""+k+": "+metadata[k])
											writer.writeEndElement() // li
										}
									}

									writer.writeEndElement() // ul or p
								}
							}
							writer.writeEndElement() // p
							writer.writeEmptyElement("hr")
						}
						break

					case "sp":
						endBoldIfNeeded()
						firstSync = true
						firstWho = true
						spokenTurn = false

						writer.writeStartElement("p")
						writer.writeAttribute("class", "turn")

						overlapping = ("true" == parser.getAttributeValue(null,"overlap"))

						if (overlapping) {
							// overlapping turn: announce all the speakers before the utterances
							writer.writeEmptyElement("br")
							writeSpeaker(parser.getAttributeValue(null,"speaker"), false)
							writer.writeEmptyElement("br")
						}
						break

					case "u":
						writer.writeCharacters("\n")
						this.currentTime = parser.getAttributeValue(null,"time")

						// if previous u had no words, it was a silence
						// NOTE(review): writenLength is never reset in this class, so this can
						// only be true before the first word, event or comment - confirm intent
						if (previousElem == "u" && writenLength == 0) {
							writeEventSpan("[silence]")
							writer.writeEmptyElement("br")
						}

						String spk = parser.getAttributeValue(null, "spk")
						if (spk != null && spk != previousSPK) {
							endBoldIfNeeded()
							writer.writeEmptyElement("br")
							writeSpeaker(spk, overlapping)
							startBoldIfNeeded()
						}

						writeCurrentTime()
						previousSPK = spk
						break

					case "event":
						spokenTurn = true
						writenLength++
						String eventDesc = translateEvent(parser.getAttributeValue(null,"desc"))
						String extent = parser.getAttributeValue(null, "extent")
						// constant first: an event without "desc" must not stop the whole edition
						if ("paroles rapportées".equals(eventDesc)) {
							// reported speech is rendered with quotation marks only
							if (extent == "end") {
								writer.writeCharacters("» ")
							} else if (extent == "begin") {
								writer.writeCharacters(" «")
							}
						} else {
							writer.writeStartElement("span")
							writer.writeAttribute("class", "event")
							if (extent == "end") {
								writer.writeCharacters(" <"+eventDesc+"] ")
								if (events.size() > 0) {
									events.remove(events.size()-1)
								}
							} else if (extent == "begin") {
								writer.writeCharacters(" ["+eventDesc+"> ")
								events.add(eventDesc)
							} else if (extent == "previous") {
								writer.writeCharacters("_["+eventDesc+"] ")
								previousEvent = eventDesc
							} else if (extent == "next") {
								writer.writeCharacters(" ["+eventDesc+"]_")
								nextEvent = eventDesc
							} else {
								writer.writeCharacters(" ["+eventDesc+"] ")
							}
							writer.writeEndElement() // span@class=event
						}
						break

					case "w":
						// wordid keeps its previous value when the word has no id attribute
						for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
							if (parser.getAttributeLocalName(i) == "id") {
								wordid = parser.getAttributeValue(i)
								break
							}
						}

						wordcount++
						if (wordcount >= wordmax) {
							shouldBreak = true // the break is written at the end of the turn
						}

						if (firstWord) {
							indexes << wordid
							firstWord = false
						}
						break

					case "ana":
						// the first character of the type is dropped
						String rawType = parser.getAttributeValue(null,"type")
						String anaType = (rawType != null && rawType.length() > 0) ? rawType.substring(1) : ""
						if (SIMPLE_TOOLTIP) {
							if (anaType.contains("lemma") || anaType.contains("pos")) {
								flaginterp = true
								interpvalue += ", "
							}
						} else {
							flaginterp = true
							interpvalue += ", "+anaType+"="
						}
						break

					case "form":
						wordvalue = ""
						interpvalue = ""
						flagform = true
						break
				}
				previousElem = localname
				break

			case XMLStreamConstants.END_ELEMENT:
				localname = parser.getLocalName()
				switch (localname) {
					case "div":
						writer.writeEndElement() // div
						writer.writeCharacters("\n")
						break

					case "sp":
						endBoldIfNeeded()
						if (!spokenTurn) {
							// nothing was written in this turn
							writeEventSpan("[silence]")
							writer.writeEmptyElement("br")
						}

						writer.writeEndElement() // p

						if (shouldBreak) {
							nbBreak++
							writer.writeEmptyElement("pb")
							writer.writeAttribute("id", ""+nbBreak)
							writer.writeCharacters("\n")

							pages << new File(defaultDir, "${txtname}_${nbBreak}.html")
							indexes << wordid

							wordcount = 0
							shouldBreak = false
						}
						writer.writeCharacters("\n")
						break

					case "form":
						flagform = false
						break

					case "ana":
						flaginterp = false
						break

					case "w":
						writenLength++
						spokenTurn = true

						// a word closed before any form was read has no value yet
						if (wordvalue == null) wordvalue = ""
						if (interpvalue == null) interpvalue = ""

						int lastLength = lastword.length()
						String endOfLastWord = (lastLength > 0) ? lastword.substring(lastLength - 1) : ""

						// NOTE(review): XMLStreamWriter.writeAttribute() escapes '"' and '&' itself,
						// so this manual entity is presumably escaped a second time - confirm
						interpvalue = interpvalue.replace("\"", "&quot;")
						if (events.size() > 0) {
							interpvalue = interpvalue.replace("event=", "event="+events.toString().replace("\"", "&quot;"))
						}

						if (nextEvent) { // neither null nor empty
							interpvalue = interpvalue.replace("event=", "event="+nextEvent+", ")
							nextEvent = ""
						}
						interpvalue = interpvalue.replace("=, ","='', ") // add '' to empty interp value
						if (interpvalue.startsWith(", ")) {
							interpvalue = interpvalue.substring(2)
						}

						boolean glued = NoSpaceBefore.contains(wordvalue) ||
								NoSpaceAfter.contains(lastword) ||
								wordvalue.startsWith("-") ||
								NoSpaceAfter.contains(endOfLastWord)
						if (!glued) {
							writer.writeCharacters(" ")
						}

						if (interpvalue.contains("rapp1")) {
							writer.writeCharacters(" «")
						} else if (wordvalue == "\"") {
							// don't write this char
						} else {
							writer.writeStartElement("span")
							writer.writeAttribute("class", "word")
							writer.writeAttribute("title", interpvalue)
							writer.writeAttribute("id", wordid)
							writer.writeCharacters(wordvalue)
							writer.writeEndElement() // span
						}
						if (interpvalue.contains("orth")) {
							writeEventSpan("_[?]")
						}
						if (interpvalue.contains("corr")) {
							writeEventSpan("_[!]")
						}

						if (interpvalue.contains("rapp2")) {
							writer.writeCharacters(" » ")
						}

						lastword = wordvalue
						break
				}
				break

			case XMLStreamConstants.CHARACTERS:
				if (flagform && parser.getText().length() > 0) {
					wordvalue += parser.getText().trim()
				}
				if (flaginterp && parser.getText().length() > 0) {
					interpvalue += parser.getText().trim()
				}
				break
		}
	}
	writer.writeEndElement() // body

	// closing milestone: ends the last page
	writer.writeEmptyElement("pb")
	nbBreak++
	writer.writeAttribute("id", ""+nbBreak)

	writer.writeEndElement() // html
	writer.close()
	output.close()
	if (parser != null) parser.close()
	if (inputData != null) inputData.close()

	File txmhome = new File(org.txm.Toolbox.getTxmHomePath())
	File xlsDir = new File(txmhome, "xsl")
	File xslfile = new File(xlsDir,"breakByMilestone.xsl")
	if (!xslfile.exists()) {
		println "Error: XSL file not found: "+xslfile
	}

	if (pages.size() > 1) {
		// page i is the content between milestones i and i+1
		for (int i = 1 ; i < nbBreak ; i++) {
			ApplyXsl2 a = new ApplyXsl2(xslfile.getAbsolutePath())
			String[] params = ["pbval1", i, "pbval2", i+1]

			File resultfile = pages[i-1]
			a.process(outfile.getAbsolutePath(), resultfile.getAbsolutePath(), params)
		}
	} else {
		File page = pages[0]
		FileCopy.copy(outfile, page)
	}
	outfile.delete() // onepage edition -> no more needed
}

/** Writes the given text in a span of class "event". */
private void writeEventSpan(String text) {
	writer.writeStartElement("span")
	writer.writeAttribute("class", "event")
	writer.writeCharacters(text)
	writer.writeEndElement() // span
}
803 |
|
804 |
/**
 * Writes the current time in a span of class "sync", followed inside the same
 * span by the media access link for that time.
 */
private void writeCurrentTime() {
	final String syncTime = currentTime

	writer.writeStartElement("span")
	writer.writeAttribute("class", "sync")
	writer.writeCharacters(syncTime)
	writeMediaAccess(syncTime)
	writer.writeEndElement() // span@class=sync
}
813 |
|
814 |
/**
 * Writes a space followed by a clickable "a" element of class "play-media" whose
 * onclick handler calls txmcommand(...) with the BackToMedia command id, the
 * corpus name, the text name and the given time.
 *
 * @param time the time passed to the command, converted to a string
 */
private void writeMediaAccess(def time) {
	// the values end up inside single-quoted JavaScript string literals:
	// escape backslashes and apostrophes so that they cannot end the literal
	def jsEscape = { value -> (""+value).replace("\\", "\\\\").replace("'", "\\'") }

	writer.writeCharacters(" ")
	writer.writeStartElement("a")
	writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+jsEscape(corpusname)+"', 'text', '"+jsEscape(txtname)+"', 'time', '"+jsEscape(time)+"')")
	writer.writeAttribute("style", "cursor: pointer;")
	writer.writeAttribute("class", "play-media")
	writer.writeCharacters("♫")
	writer.writeEndElement() // a
}
823 |
|
824 |
/**
 * Writes the speaker label in a span of class "spk" and updates the bold flag,
 * which is true when the speaker is one of the interviewers.
 *
 * @param spk the speaker name; a space is inserted before a trailing number
 * @param overlapping when true the label is prefixed with "// "
 */
private void writeSpeaker(String spk, boolean overlapping) {
	writer.writeStartElement("span")
	writer.writeAttribute("class", "spk")

	// the interviewer test uses the name as received, before it is reformatted
	bold = interviewers.contains(spk)

	String label = spk.replaceAll('^([^0-9]*)([0-9]+)$', '$1 $2')
	if (overlapping) {
		writer.writeCharacters("// ")
	}
	writer.writeCharacters(label+": ")

	writer.writeEndElement() // span@class=spk
}
839 |
|
840 |
/**
 * Returns the entry of eventTranslations for the given event code,
 * or the code itself when the map has no such key.
 *
 * @param desc the event code
 * @return the translated description, or desc unchanged
 */
private String translateEvent(String desc) {
	return eventTranslations.containsKey(desc) ? eventTranslations.get(desc) : desc
}
846 |
|
847 |
/** True while the element opened by startBoldIfNeeded() has not been closed. */
boolean boldOpenned = false

/** Opens the highlight element when the bold flag is set, and remembers that it is open. */
private void startBoldIfNeeded() {
	if (!bold) {
		return
	}
	writer.writeStartElement(ENQ_HIGHLIGHT_ELEMENT)
	boldOpenned = true
}
854 |
|
855 |
/** Closes the highlight element if startBoldIfNeeded() left one open. */
private endBoldIfNeeded() {
	if (!boldOpenned) {
		return
	}
	writer.writeEndElement() // highlight element
	boldOpenned = false
}
862 |
|
863 |
// private String formatTime(float time, boolean doshort)
|
864 |
// {
|
865 |
// String rez = " ";
|
866 |
// // if(time >= 3600) // >= 1h
|
867 |
// // {
|
868 |
// float h = time / 3600;
|
869 |
// time = time%3600;
|
870 |
// float min = (time%3600) / 60;
|
871 |
// int sec = (int)time%60;
|
872 |
//
|
873 |
// if(min < 10)
|
874 |
// rez = ""+(int)h+":0"+(int)min;//+":"+time%60;
|
875 |
// else
|
876 |
// rez = ""+(int)h+":"+(int)min;//+":"+time%60;
|
877 |
// //if (!doshort)
|
878 |
// if (sec > 9)
|
879 |
// rez += ":"+(int)time%60;
|
880 |
// else
|
881 |
// rez += ":0"+(int)time%60;
|
882 |
// // }
|
883 |
// // else if(time >= 60) // >= 1min
|
884 |
// // {
|
885 |
// // int min = time/60;
|
886 |
// // if(min < 10)
|
887 |
// // rez = "00:0"+min;//+":"+time%60;
|
888 |
// // else
|
889 |
// // rez = "00:"+min;//+":"+time%60;
|
890 |
// // if(!doshort)
|
891 |
// // rez += ":"+(int)time%60;
|
892 |
// // }
|
893 |
// // else // < 60
|
894 |
// // {
|
895 |
// // if(time < 10)
|
896 |
// // return " 0:0"+time;
|
897 |
// // else
|
898 |
// // return " 0:"+time;
|
899 |
// // }
|
900 |
// return rez;
|
901 |
// }
|
902 |
|
903 |
/**
 * Returns the list of page files filled by process(); the list itself
 * is returned, not a copy.
 *
 * @return the page files
 */
public ArrayList<File> getPageFiles() {
	return this.pages
}
911 |
|
912 |
/**
 * Returns the list of word ids filled by process(); the list itself
 * is returned, not a copy.
 *
 * @return the word ids
 */
public ArrayList<String> getIdx() {
	return this.indexes
}
920 |
} |