Révision 2268
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/transcriberLoader.groovy (revision 2268) | ||
|---|---|---|
| 112 | 112 |
Toolbox.getMetadataColumnSeparator(), |
| 113 | 113 |
Toolbox.getMetadataTextSeparator(), 1) |
| 114 | 114 |
} |
| 115 |
else |
|
| 115 |
else {
|
|
| 116 | 116 |
println "no metadata file: "+allMetadataFile |
| 117 |
} |
|
| 117 | 118 |
|
| 118 | 119 |
File propertyFile = new File(srcDir, "import.properties")//default |
| 119 | 120 |
Properties props = new Properties(); |
| ... | ... | |
| 289 | 290 |
cpb.done() |
| 290 | 291 |
|
| 291 | 292 |
//copy transcriber.css |
| 292 |
File cssfile = new File(Toolbox.getTxmHomePath(), "css/transcriber.css") |
|
| 293 |
File cssfile = new File(Toolbox.getTxmHomePath(), "css/transcriber.css") |
|
| 294 |
File cssTXMFile = new File(Toolbox.getTxmHomePath(), "css/txm.css") |
|
| 293 | 295 |
if (cssfile.exists() && htmlDir.exists()) {
|
| 294 |
FileCopy.copy(cssfile, new File(htmlDir, "transcriber.css"));
|
|
| 295 |
FileCopy.copy(cssfile, new File(htmlDir, "onepage/transcriber.css"));
|
|
| 296 |
FileCopy.copy(cssfile, new File(htmlDir, "onepage/transcriber.css"));
|
|
| 297 |
FileCopy.copy(cssfile, new File(htmlDir, "default/txm.css"));
|
|
| 296 | 298 |
FileCopy.copy(cssfile, new File(htmlDir, "default/transcriber.css")); |
| 297 | 299 |
} |
| 298 | 300 |
|
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/pager.groovy (revision 2268) | ||
|---|---|---|
| 44 | 44 |
* |
| 45 | 45 |
*/ |
| 46 | 46 |
class pager {
|
| 47 |
|
|
| 47 |
|
|
| 48 | 48 |
boolean SIMPLE_TOOLTIP = false; // show less properties in word tooltips |
| 49 | 49 |
String ENQ_HIGHLIGHT_ELEMENT = "b" |
| 50 |
|
|
| 50 |
|
|
| 51 | 51 |
List<String> NoSpaceBefore; |
| 52 |
|
|
| 52 |
|
|
| 53 | 53 |
/** The No space after. */ |
| 54 | 54 |
List<String> NoSpaceAfter; |
| 55 |
|
|
| 55 |
|
|
| 56 | 56 |
/** The pages. */ |
| 57 | 57 |
def pages = []; |
| 58 | 58 |
def indexes = []; |
| 59 |
|
|
| 59 |
|
|
| 60 | 60 |
/** The wordcount. */ |
| 61 | 61 |
int wordcount = 0; |
| 62 |
|
|
| 62 |
|
|
| 63 | 63 |
/** The pagecount. */ |
| 64 | 64 |
int pagecount = 0; |
| 65 |
|
|
| 65 |
|
|
| 66 | 66 |
/** The wordmax. */ |
| 67 | 67 |
int wordmax = 10; |
| 68 |
|
|
| 68 |
|
|
| 69 | 69 |
/** The wordid. */ |
| 70 | 70 |
String wordid; |
| 71 |
|
|
| 71 |
|
|
| 72 | 72 |
/** The first word. */ |
| 73 | 73 |
boolean firstWord = true; |
| 74 |
|
|
| 74 |
|
|
| 75 | 75 |
/** The wordvalue. */ |
| 76 | 76 |
String wordvalue; |
| 77 |
|
|
| 77 |
|
|
| 78 | 78 |
/** The interpvalue. */ |
| 79 | 79 |
String interpvalue; |
| 80 |
|
|
| 80 |
|
|
| 81 | 81 |
/** The lastword. */ |
| 82 | 82 |
String lastword = " "; |
| 83 |
|
|
| 83 |
|
|
| 84 | 84 |
/** The wordtype. */ |
| 85 | 85 |
String wordtype; |
| 86 |
|
|
| 86 |
|
|
| 87 | 87 |
/** The flagform. */ |
| 88 | 88 |
boolean flagform = false; |
| 89 |
|
|
| 89 |
|
|
| 90 | 90 |
/** The flaginterp. */ |
| 91 | 91 |
boolean flaginterp = false; |
| 92 |
|
|
| 92 |
|
|
| 93 | 93 |
boolean flagcomment = false; |
| 94 |
|
|
| 94 |
|
|
| 95 | 95 |
/** The url. */ |
| 96 | 96 |
private def url; |
| 97 |
|
|
| 97 |
|
|
| 98 | 98 |
/** The input data. */ |
| 99 | 99 |
private def inputData; |
| 100 |
|
|
| 100 |
|
|
| 101 | 101 |
/** The factory. */ |
| 102 | 102 |
private def factory; |
| 103 |
|
|
| 103 |
|
|
| 104 | 104 |
/** The parser. */ |
| 105 | 105 |
private XMLStreamReader parser; |
| 106 |
|
|
| 106 |
|
|
| 107 | 107 |
/** The writer. */ |
| 108 | 108 |
XMLStreamWriter writer; |
| 109 | 109 |
BufferedOutputStream output; |
| 110 |
|
|
| 110 |
|
|
| 111 | 111 |
File txmfile; |
| 112 |
|
|
| 112 |
|
|
| 113 | 113 |
File outfile; |
| 114 |
|
|
| 114 |
|
|
| 115 | 115 |
String corpusname =""; |
| 116 | 116 |
String cuttingTag = "pb" |
| 117 | 117 |
String txtname; |
| 118 | 118 |
File htmlDir; |
| 119 | 119 |
File defaultDir; |
| 120 | 120 |
Metadatas metadatas; |
| 121 |
|
|
| 121 |
|
|
| 122 | 122 |
def interviewers = []; |
| 123 | 123 |
def eventTranslations = ["^^":"mot inconnu", "?":"orthographe incertaine", |
| 124 | 124 |
"()":"rupture de syntaxe", "b":"bruit indéterminé", |
| ... | ... | |
| 160 | 160 |
this.txmfile = txmfile; |
| 161 | 161 |
this.htmlDir = htmlDir; |
| 162 | 162 |
this.txtname = txtname; |
| 163 |
|
|
| 163 |
|
|
| 164 | 164 |
inputData = url.openStream(); |
| 165 | 165 |
factory = XMLInputFactory.newInstance(); |
| 166 | 166 |
parser = factory.createXMLStreamReader(inputData); |
| 167 |
|
|
| 167 |
|
|
| 168 | 168 |
defaultDir = new File(htmlDir, "default") |
| 169 | 169 |
defaultDir.mkdir() |
| 170 | 170 |
new File(htmlDir, "onepage").mkdir() |
| 171 | 171 |
outfile = new File(htmlDir, "onepage/${txtname}.html");
|
| 172 | 172 |
createOutput(outfile) |
| 173 |
|
|
| 173 |
|
|
| 174 | 174 |
try {
|
| 175 | 175 |
process(); |
| 176 | 176 |
} catch(Exception e) {
|
| ... | ... | |
| 181 | 181 |
} |
| 182 | 182 |
} |
| 183 | 183 |
} |
| 184 |
|
|
| 184 |
|
|
| 185 | 185 |
/** |
| 186 | 186 |
* Creates the output. |
| 187 | 187 |
* |
| ... | ... | |
| 194 | 194 |
XMLOutputFactory outfactory = XMLOutputFactory.newInstance(); |
| 195 | 195 |
output = new BufferedOutputStream(new FileOutputStream(outfile)) |
| 196 | 196 |
writer = outfactory.createXMLStreamWriter(output, "UTF-8");//create a new file |
| 197 |
|
|
| 197 |
|
|
| 198 | 198 |
return true; |
| 199 | 199 |
} catch (Exception e) {
|
| 200 | 200 |
System.out.println(e.getLocalizedMessage()); |
| 201 | 201 |
return false; |
| 202 | 202 |
} |
| 203 | 203 |
} |
| 204 |
|
|
| 204 |
|
|
| 205 | 205 |
/** The events. */ |
| 206 | 206 |
List<String> events = []; |
| 207 | 207 |
String previousEvent = "", nextEvent = ""; |
| ... | ... | |
| 209 | 209 |
* Process. |
| 210 | 210 |
*/ |
| 211 | 211 |
void process() {
|
| 212 |
|
|
| 212 |
|
|
| 213 | 213 |
String previousElem = ""; |
| 214 | 214 |
boolean parolesRaportees = false; |
| 215 | 215 |
boolean firstWord = true; |
| ... | ... | |
| 221 | 221 |
ArrayList<String> whos = []; |
| 222 | 222 |
HashMap<String, String> speakers = new HashMap<String, String>(); |
| 223 | 223 |
HashMap<String, String> topics = new HashMap<String, String>(); |
| 224 |
|
|
| 224 |
|
|
| 225 | 225 |
writer.writeStartDocument("UTF-8","1.0");
|
| 226 | 226 |
writer.writeStartElement("html");
|
| 227 | 227 |
//<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> |
| ... | ... | |
| 231 | 231 |
writer.writeAttribute("charset", "UTF-8");
|
| 232 | 232 |
writer.writeEndElement(); |
| 233 | 233 |
writer.writeStartElement("head");
|
| 234 |
|
|
| 234 |
|
|
| 235 | 235 |
//<link rel="stylesheet" type="text/css" href="class.css" /> |
| 236 | 236 |
writer.writeStartElement("link");
|
| 237 | 237 |
writer.writeAttribute("rel", "stylesheet");
|
| ... | ... | |
| 239 | 239 |
writer.writeAttribute("href", "transcriber.css");
|
| 240 | 240 |
writer.writeEndElement(); |
| 241 | 241 |
writer.writeEndElement(); |
| 242 |
|
|
| 242 |
|
|
| 243 | 243 |
nbBreak++ |
| 244 | 244 |
writer.writeStartElement("body");
|
| 245 |
writer.writeAttribute("class", "txmeditionpage")
|
|
| 245 | 246 |
writer.writeEmptyElement("pb");
|
| 246 | 247 |
writer.writeAttribute("id", ""+nbBreak);
|
| 247 | 248 |
pages << new File(defaultDir, "${txtname}_${nbBreak}.html")
|
| 248 |
|
|
| 249 |
|
|
| 249 | 250 |
for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
|
| 250 | 251 |
switch (event) {
|
| 251 | 252 |
case XMLStreamConstants.START_ELEMENT: |
| 252 | 253 |
localname = parser.getLocalName(); |
| 253 | 254 |
switch (localname) {
|
| 254 | 255 |
case "text": |
| 256 |
|
|
| 255 | 257 |
writer.writeStartElement("h2");
|
| 256 |
writer.writeAttribute("class","titre");
|
|
| 258 |
writer.writeAttribute("class","title");
|
|
| 257 | 259 |
String title = parser.getAttributeValue(null, "title"); |
| 260 |
|
|
| 258 | 261 |
if (title != null) {
|
| 259 | 262 |
writer.writeCharacters(title); |
| 260 | 263 |
} else {
|
| 261 | 264 |
writer.writeCharacters("Transcription "+txmfile.getName().substring(0, txmfile.getName().length() - 4));
|
| 262 | 265 |
} |
| 263 |
writer.writeEndElement(); |
|
| 264 |
if(metadatas != null) {
|
|
| 266 |
|
|
| 267 |
writer.writeEmptyElement("br");
|
|
| 268 |
writer.writeStartElement("a");
|
|
| 269 |
writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '0.0')");
|
|
| 270 |
writer.writeAttribute("style", "cursor: pointer;")
|
|
| 271 |
writer.writeAttribute("class", "play-media")
|
|
| 272 |
writer.writeCharacters(" ♪♪");
|
|
| 273 |
writer.writeEndElement(); // a |
|
| 274 |
|
|
| 275 |
writer.writeEndElement(); // h2 |
|
| 276 |
|
|
| 277 |
String subtitle = parser.getAttributeValue(null, "subtitle"); |
|
| 278 |
if (subtitle != null && subtitle.length() > 0) {
|
|
| 279 |
writer.writeStartElement("h3");
|
|
| 280 |
writer.writeAttribute("class", "subtitle");
|
|
| 281 |
writer.writeCharacters(subtitle); |
|
| 282 |
writer.writeEndElement(); // h3 |
|
| 283 |
} |
|
| 284 |
|
|
| 285 |
// println "metadatas != null: "+(metadatas != null) |
|
| 286 |
// if (metadatas != null) {
|
|
| 265 | 287 |
writer.writeStartElement("table");
|
| 266 | 288 |
boolean grey = false; |
| 267 |
for (String name : metadatas.getPropertyNames()) {
|
|
| 268 |
if ("title" == name) continue; // ignore "title" metadata
|
|
| 289 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
|
|
| 290 |
String name = parser.getAttributeName(i); |
|
| 291 |
String value = parser.getAttributeValue(i); |
|
| 292 |
|
|
| 293 |
if ("title" == name) {
|
|
| 294 |
continue; // ignore "title" metadata |
|
| 295 |
} |
|
| 296 |
|
|
| 269 | 297 |
grey = !grey; |
| 270 | 298 |
writer.writeStartElement("tr");
|
| 271 |
if (grey) |
|
| 299 |
if (grey) {
|
|
| 272 | 300 |
writer.writeAttribute("style","background-color:lightgrey;")
|
| 273 |
String value = parser.getAttributeValue(null, name); |
|
| 301 |
} |
|
| 302 |
|
|
| 274 | 303 |
if (value != null) {
|
| 275 | 304 |
writer.writeStartElement("td");
|
| 276 | 305 |
writer.writeCharacters(name); |
| ... | ... | |
| 286 | 315 |
writer.writeEndElement(); |
| 287 | 316 |
} |
| 288 | 317 |
writer.writeEndElement(); |
| 289 |
} |
|
| 318 |
// }
|
|
| 290 | 319 |
break; |
| 291 | 320 |
case "Topics": |
| 292 | 321 |
/*writer.writeStartElement("h2");
|
| ... | ... | |
| 347 | 376 |
flagcomment = true; |
| 348 | 377 |
break; |
| 349 | 378 |
case "div": |
| 350 |
writer.writeStartElement("div");
|
|
| 351 |
writer.writeAttribute("class", "section");
|
|
| 352 |
String type = parser.getAttributeValue(null,"type"); |
|
| 353 |
String desc = parser.getAttributeValue(null,"topic"); |
|
| 354 |
String metadata = parser.getAttributeValue(null,"metadata"); |
|
| 379 |
|
|
| 380 |
nbBreak++ |
|
| 381 |
writer.writeEmptyElement("pb");
|
|
| 382 |
writer.writeAttribute("id", ""+nbBreak);
|
|
| 383 |
writer.writeCharacters("\n");
|
|
| 384 |
|
|
| 385 |
pages << new File(defaultDir, "${txtname}_${nbBreak}.html")
|
|
| 386 |
indexes << wordid |
|
| 387 |
|
|
| 388 |
wordcount = 0; |
|
| 389 |
shouldBreak = false; |
|
| 390 |
|
|
| 391 |
writer.writeStartElement("div")
|
|
| 392 |
writer.writeAttribute("class", "section")
|
|
| 393 |
|
|
| 394 |
String type = parser.getAttributeValue(null, "type") |
|
| 395 |
writer.writeAttribute("type", ""+type)
|
|
| 396 |
String desc = parser.getAttributeValue(null, "topic") |
|
| 397 |
|
|
| 355 | 398 |
if (type != null || desc != null) {
|
| 356 |
writer.writeStartElement("h3");
|
|
| 357 |
if (type != null || type.length() ==0) {
|
|
| 399 |
writer.writeStartElement("h2");
|
|
| 400 |
writer.writeAttribute("class", "section-title")
|
|
| 401 |
if (type != null || type.length() == 0) {
|
|
| 358 | 402 |
writer.writeCharacters(type+": "+desc); |
| 359 | 403 |
} else {
|
| 360 |
writer.writeCharacters(desc);
|
|
| 404 |
writer.writeCharacters(desc) |
|
| 361 | 405 |
} |
| 362 |
writer.writeEndElement(); // h3 |
|
| 363 | 406 |
|
| 364 |
if (metadata != null && metadata.length() > 0) { // the metadata to show
|
|
| 365 |
writer.writeStartElement("ul");
|
|
| 366 |
for (def m : metadata.split("\t")) {
|
|
| 367 |
writer.writeStartElement("li");
|
|
| 368 |
writer.writeCharacters(m); |
|
| 369 |
writer.writeEndElement(); // li |
|
| 407 |
if (parser.getAttributeValue(null,"startTime") != null) {
|
|
| 408 |
writer.writeEmptyElement("br");
|
|
| 409 |
writer.writeStartElement("a")
|
|
| 410 |
writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '"+parser.getAttributeValue(null,"startTime")+"')");
|
|
| 411 |
writer.writeAttribute("style", "cursor: pointer;")
|
|
| 412 |
writer.writeAttribute("class", "play-media")
|
|
| 413 |
writer.writeCharacters(" ♪♪")
|
|
| 414 |
writer.writeEndElement() // a |
|
| 415 |
} |
|
| 416 |
|
|
| 417 |
writer.writeEndElement(); // h2 |
|
| 418 |
} |
|
| 419 |
|
|
| 420 |
String metadata = parser.getAttributeValue(null, "metadata") |
|
| 421 |
if (metadata != null && metadata.length() > 0) { // the metadata to show
|
|
| 422 |
writer.writeStartElement("ul")
|
|
| 423 |
//println "metadata=$metadata" |
|
| 424 |
for (def m : metadata.split("<li>")) {
|
|
| 425 |
writer.writeStartElement("li")
|
|
| 426 |
writer.writeCharacters(m) |
|
| 427 |
writer.writeEndElement() // li |
|
| 428 |
} |
|
| 429 |
writer.writeEndElement() // ul |
|
| 430 |
} else if (parser.getAttributeCount() > 1) { // process all attributes
|
|
| 431 |
writer.writeStartElement("ul")
|
|
| 432 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
|
|
| 433 |
String name = parser.getAttributeLocalName(i) |
|
| 434 |
if (!"type".equals(name) |
|
| 435 |
&& !"topic".equals(name) |
|
| 436 |
&& !"startTime".equals(name) |
|
| 437 |
&& !"endTime".equals(name)) {
|
|
| 438 |
writer.writeStartElement("li")
|
|
| 439 |
writer.writeCharacters(""+name+": "+parser.getAttributeValue(i))
|
|
| 440 |
writer.writeEndElement() // li |
|
| 370 | 441 |
} |
| 371 |
writer.writeEndElement(); // ul |
|
| 372 |
} else if (parser.getAttributeCount() > 1) { // process all attributes
|
|
| 373 |
writer.writeStartElement("ul");
|
|
| 374 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
|
|
| 375 |
String name = parser.getAttributeLocalName(i); |
|
| 376 |
if (!"type".equals(name) |
|
| 377 |
&& !"topic".equals(name) |
|
| 378 |
&& !"startTime".equals(name) |
|
| 379 |
&& !"endTime".equals(name)) {
|
|
| 380 |
writer.writeStartElement("li");
|
|
| 381 |
writer.writeCharacters(""+name+": "+parser.getAttributeValue(i));
|
|
| 382 |
writer.writeEndElement(); // li |
|
| 383 |
} |
|
| 384 |
} |
|
| 385 |
writer.writeEndElement(); // ul |
|
| 386 | 442 |
} |
| 443 |
writer.writeEndElement(); // ul |
|
| 387 | 444 |
} |
| 445 |
|
|
| 388 | 446 |
break; |
| 389 | 447 |
case "sp": |
| 390 | 448 |
endBoldIfNeeded() |
| ... | ... | |
| 392 | 450 |
firstWho = true; |
| 393 | 451 |
spokenTurn = false; |
| 394 | 452 |
overlapping = false |
| 395 |
|
|
| 453 |
|
|
| 396 | 454 |
writer.writeStartElement("p");
|
| 397 | 455 |
writer.writeAttribute("class", "turn");
|
| 398 |
|
|
| 456 |
|
|
| 399 | 457 |
overlapping = ("true" == parser.getAttributeValue(null,"overlap"))
|
| 400 | 458 |
String spid = parser.getAttributeValue(null,"speaker"); |
| 459 |
|
|
| 401 | 460 |
whos = [] |
| 402 | 461 |
if (overlapping) {
|
| 403 | 462 |
writer.writeEmptyElement("br");
|
| 404 | 463 |
writeSpeaker(parser.getAttributeValue(null,"speaker"), false) |
| 405 |
|
|
| 464 |
|
|
| 406 | 465 |
writer.writeEmptyElement("br");
|
| 407 | 466 |
whos = spid.split(" ")
|
| 408 | 467 |
} |
| 409 |
|
|
| 468 |
|
|
| 410 | 469 |
break; |
| 411 | 470 |
case "u": |
| 412 | 471 |
writer.writeCharacters("\n");
|
| 413 | 472 |
this.currentTime = parser.getAttributeValue(null,"time"); |
| 414 |
|
|
| 473 |
|
|
| 415 | 474 |
if (previousElem == "u" && writenLength == 0) { // if previous u had no words, it was a silence
|
| 416 | 475 |
writer.writeStartElement("span");
|
| 417 | 476 |
writer.writeAttribute("class", "event");
|
| ... | ... | |
| 419 | 478 |
writer.writeEndElement(); // span |
| 420 | 479 |
writer.writeEmptyElement("br");
|
| 421 | 480 |
} |
| 422 |
|
|
| 481 |
|
|
| 423 | 482 |
String spk = parser.getAttributeValue(null, "spk") |
| 424 | 483 |
if (spk != null && spk != previousSPK) {
|
| 425 | 484 |
endBoldIfNeeded() |
| ... | ... | |
| 427 | 486 |
writeSpeaker(parser.getAttributeValue(null, "spk"), overlapping) |
| 428 | 487 |
startBoldIfNeeded() |
| 429 | 488 |
} |
| 430 |
|
|
| 489 |
|
|
| 431 | 490 |
writeCurrentTime() |
| 432 | 491 |
previousSPK = spk |
| 433 |
|
|
| 492 |
|
|
| 434 | 493 |
// writenLength = 0; |
| 435 | 494 |
/*writer.writeStartElement("span");
|
| 436 | 495 |
writer.writeAttribute("class", "sync");
|
| 437 | 496 |
writer.writeCharacters("["+parser.getAttributeValue(null,"time")+"]");
|
| 438 | 497 |
writer.writeEndElement();*/ |
| 439 |
|
|
| 498 |
|
|
| 440 | 499 |
break; |
| 441 | 500 |
case "event": |
| 442 | 501 |
spokenTurn = true; |
| ... | ... | |
| 458 | 517 |
events.remove(events.size()-1) |
| 459 | 518 |
} |
| 460 | 519 |
else if (parser.getAttributeValue(null, "extent") == "begin") {
|
| 461 |
|
|
| 520 |
|
|
| 462 | 521 |
writer.writeCharacters(" ["+desc+"> ");
|
| 463 | 522 |
events.add(desc) |
| 464 | 523 |
} |
| ... | ... | |
| 484 | 543 |
wordid = (parser.getAttributeValue(i)); |
| 485 | 544 |
break; |
| 486 | 545 |
} |
| 487 |
|
|
| 546 |
|
|
| 488 | 547 |
wordcount++; |
| 489 | 548 |
if (wordcount >= wordmax) {
|
| 490 | 549 |
shouldBreak = true; |
| 491 | 550 |
} |
| 492 |
|
|
| 551 |
|
|
| 493 | 552 |
if (firstWord) {
|
| 494 | 553 |
indexes << wordid |
| 495 | 554 |
firstWord = false; |
| 496 | 555 |
} |
| 497 |
|
|
| 556 |
|
|
| 498 | 557 |
break; |
| 499 |
|
|
| 558 |
|
|
| 500 | 559 |
case "ana": |
| 501 |
|
|
| 560 |
|
|
| 502 | 561 |
String type = parser.getAttributeValue(null,"type").substring(1); |
| 503 | 562 |
if (SIMPLE_TOOLTIP) {
|
| 504 | 563 |
if (type.contains("lemma") || type.contains("pos")) {
|
| ... | ... | |
| 510 | 569 |
interpvalue+=", "+type+"=" |
| 511 | 570 |
} |
| 512 | 571 |
break; |
| 513 |
|
|
| 572 |
|
|
| 514 | 573 |
case "form": |
| 515 | 574 |
wordvalue=""; |
| 516 | 575 |
interpvalue =""; |
| ... | ... | |
| 536 | 595 |
break; |
| 537 | 596 |
case "Speaker": |
| 538 | 597 |
break; |
| 539 |
|
|
| 598 |
|
|
| 540 | 599 |
case "div": |
| 541 | 600 |
//writer.writeCharacters("}");
|
| 542 |
|
|
| 601 |
|
|
| 543 | 602 |
writer.writeEndElement(); // div |
| 544 | 603 |
writer.writeCharacters("\n");
|
| 545 | 604 |
break; |
| ... | ... | |
| 553 | 612 |
writer.writeEndElement(); |
| 554 | 613 |
writer.writeEmptyElement("br");
|
| 555 | 614 |
} |
| 556 |
|
|
| 615 |
|
|
| 557 | 616 |
writer.writeEndElement(); // p |
| 558 |
|
|
| 617 |
|
|
| 559 | 618 |
if (shouldBreak) {
|
| 560 | 619 |
nbBreak++ |
| 561 | 620 |
writer.writeEmptyElement("pb");
|
| 562 | 621 |
writer.writeAttribute("id", ""+nbBreak);
|
| 563 | 622 |
writer.writeCharacters("\n");
|
| 564 |
|
|
| 623 |
|
|
| 565 | 624 |
pages << new File(defaultDir, "${txtname}_${nbBreak}.html")
|
| 566 | 625 |
indexes << wordid |
| 567 |
|
|
| 626 |
|
|
| 568 | 627 |
wordcount = 0; |
| 569 | 628 |
shouldBreak = false; |
| 570 | 629 |
} |
| ... | ... | |
| 590 | 649 |
String endOfLastWord = ""; |
| 591 | 650 |
if(l > 0) |
| 592 | 651 |
endOfLastWord = lastword.subSequence(l-1, l); |
| 593 |
|
|
| 652 |
|
|
| 594 | 653 |
if(interpvalue != null) |
| 595 | 654 |
interpvalue = interpvalue.replace("\"",""");
|
| 596 | 655 |
if(events.size() > 0) |
| 597 | 656 |
interpvalue = interpvalue.replace("event=", "event="+events.toString().replace("\"",""")); // remove ", "
|
| 598 |
|
|
| 657 |
|
|
| 599 | 658 |
if(nextEvent.length() > 0) |
| 600 | 659 |
{
|
| 601 | 660 |
interpvalue = interpvalue.replace("event=", "event="+nextEvent+", ")
|
| ... | ... | |
| 617 | 676 |
// println " SPACE" |
| 618 | 677 |
writer.writeCharacters(" ");
|
| 619 | 678 |
} |
| 620 |
|
|
| 679 |
|
|
| 621 | 680 |
if (interpvalue.contains("rapp1")) {
|
| 622 | 681 |
writer.writeCharacters(" «");
|
| 623 | 682 |
} else if (wordvalue == "\"") {
|
| ... | ... | |
| 642 | 701 |
writer.writeCharacters("_[!]");
|
| 643 | 702 |
writer.writeEndElement(); |
| 644 | 703 |
} |
| 645 |
|
|
| 704 |
|
|
| 646 | 705 |
if (interpvalue.contains("rapp2")) {
|
| 647 | 706 |
writer.writeCharacters(" » ");
|
| 648 | 707 |
} |
| 649 |
|
|
| 708 |
|
|
| 650 | 709 |
lastword=wordvalue; |
| 651 | 710 |
break; |
| 652 | 711 |
} |
| 653 |
|
|
| 712 |
|
|
| 654 | 713 |
break; |
| 655 |
|
|
| 714 |
|
|
| 656 | 715 |
case XMLStreamConstants.CHARACTERS: |
| 657 | 716 |
if(flagform) |
| 658 | 717 |
if(parser.getText().length() > 0) |
| ... | ... | |
| 664 | 723 |
} |
| 665 | 724 |
} |
| 666 | 725 |
writer.writeEndElement(); |
| 667 |
|
|
| 726 |
|
|
| 668 | 727 |
writer.writeEmptyElement("pb");
|
| 669 | 728 |
nbBreak++ |
| 670 | 729 |
writer.writeAttribute("id", ""+nbBreak);
|
| 671 |
|
|
| 730 |
|
|
| 672 | 731 |
writer.writeEndElement(); |
| 673 | 732 |
writer.close(); |
| 674 | 733 |
output.close(); |
| 675 | 734 |
if (parser != null) parser.close(); |
| 676 | 735 |
if (inputData != null) inputData.close(); |
| 677 |
|
|
| 736 |
|
|
| 678 | 737 |
File txmhome = new File(org.txm.Toolbox.getTxmHomePath()); |
| 679 | 738 |
File xlsDir = new File(txmhome, "xsl"); |
| 680 | 739 |
File xslfile = new File(xlsDir,"breakByMilestone.xsl"); |
| ... | ... | |
| 685 | 744 |
// println "html: "+outfile |
| 686 | 745 |
// println "pages: "+pages |
| 687 | 746 |
// println "words: "+indexes |
| 688 |
|
|
| 689 |
|
|
| 747 |
|
|
| 748 |
|
|
| 690 | 749 |
if (pages.size() > 1) {
|
| 691 | 750 |
for (int i = 1 ; i < nbBreak ; i++) {
|
| 692 | 751 |
ApplyXsl2 a = new ApplyXsl2(xslfile.getAbsolutePath()); |
| 693 |
String[] params = ["pbval1", i,"pbval2", i+1]; |
|
| 694 |
|
|
| 752 |
String[] params = ["pbval1", i, "pbval2", i+1];
|
|
| 753 |
|
|
| 695 | 754 |
File resultfile = pages[i-1] |
| 696 | 755 |
//println "BBmilestones: "+i+" "+(i+1)+" in file "+resultfile |
| 697 | 756 |
//println "process $outfile -> $resultfile" |
| ... | ... | |
| 702 | 761 |
FileCopy.copy(outfile, page) |
| 703 | 762 |
} |
| 704 | 763 |
} |
| 705 |
|
|
| 706 |
private void writeCurrentTime() |
|
| 707 |
{
|
|
| 764 |
|
|
| 765 |
private void writeCurrentTime() {
|
|
| 708 | 766 |
writer.writeStartElement("span");
|
| 709 | 767 |
writer.writeAttribute("class", "sync");
|
| 710 | 768 |
writer.writeCharacters(currentTime); |
| 769 |
|
|
| 770 |
writer.writeStartElement("a");
|
|
| 771 |
writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '"+currentTime+"')");
|
|
| 772 |
writer.writeAttribute("style", "cursor: pointer;")
|
|
| 773 |
writer.writeAttribute("class", "play-media")
|
|
| 774 |
writer.writeCharacters(" ♪♪");
|
|
| 775 |
writer.writeEndElement(); // a |
|
| 776 |
|
|
| 711 | 777 |
writer.writeEndElement(); |
| 712 | 778 |
} |
| 713 |
|
|
| 779 |
|
|
| 714 | 780 |
private void writeSpeaker(String spk, boolean overlapping) {
|
| 715 |
|
|
| 781 |
|
|
| 716 | 782 |
writer.writeStartElement("span");
|
| 717 | 783 |
writer.writeAttribute("class", "spk");
|
| 718 | 784 |
if(interviewers.contains(spk)) |
| ... | ... | |
| 721 | 787 |
bold = false; |
| 722 | 788 |
spk = spk.replaceAll('^([^0-9]*)([0-9]+)$', '$1 $2');
|
| 723 | 789 |
if (overlapping) writer.writeCharacters("// ")
|
| 790 |
|
|
| 724 | 791 |
writer.writeCharacters(spk+": ") |
| 792 |
|
|
| 725 | 793 |
writer.writeEndElement(); // span@class=spk |
| 726 | 794 |
} |
| 727 |
|
|
| 795 |
|
|
| 728 | 796 |
private String translateEvent(String desc) {
|
| 729 | 797 |
if(eventTranslations.containsKey(desc)) |
| 730 | 798 |
return eventTranslations.get(desc); |
| 731 | 799 |
else |
| 732 | 800 |
return desc; |
| 733 | 801 |
} |
| 734 |
|
|
| 802 |
|
|
| 735 | 803 |
boolean boldOpenned = false; |
| 736 | 804 |
private void startBoldIfNeeded() {
|
| 737 | 805 |
if (bold) {
|
| ... | ... | |
| 739 | 807 |
boldOpenned = true; |
| 740 | 808 |
} |
| 741 | 809 |
} |
| 742 |
|
|
| 810 |
|
|
| 743 | 811 |
private endBoldIfNeeded() {
|
| 744 | 812 |
if (boldOpenned) {
|
| 745 | 813 |
// println "CLOSE BOLD" |
| ... | ... | |
| 747 | 815 |
boldOpenned = false; |
| 748 | 816 |
} |
| 749 | 817 |
} |
| 750 |
|
|
| 818 |
|
|
| 751 | 819 |
// private String formatTime(float time, boolean doshort) |
| 752 | 820 |
// {
|
| 753 | 821 |
// String rez = " "; |
| ... | ... | |
| 787 | 855 |
// // } |
| 788 | 856 |
// return rez; |
| 789 | 857 |
// } |
| 790 |
|
|
| 858 |
|
|
| 791 | 859 |
/** |
| 792 | 860 |
* Gets the page files. |
| 793 | 861 |
* |
| ... | ... | |
| 796 | 864 |
public ArrayList<File> getPageFiles() {
|
| 797 | 865 |
return pages; |
| 798 | 866 |
} |
| 799 |
|
|
| 867 |
|
|
| 800 | 868 |
/** |
| 801 | 869 |
* Gets the idx. |
| 802 | 870 |
* |
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/AddSections.groovy (revision 2268) | ||
|---|---|---|
| 10 | 10 |
File trsDirectory = new File("/home/mdecorde/TEMP/ANTRACT/AF/trs")
|
| 11 | 11 |
def idTRSColumn = "Lien notice principale" |
| 12 | 12 |
def typeColumns = ["Identifiant de la notice"] |
| 13 |
def topicColumns = ["Titre propre", "Genre", "antract_debut", "antract_fin"]
|
|
| 13 |
def topicColumns = ["Titre propre", "Notes du titre"]
|
|
| 14 | 14 |
def startTimeColumn = "antract_debut" |
| 15 | 15 |
def endTimeColumn = "antract_fin" |
| 16 | 16 |
//def metadataColumns = ["Identifiant de la notice", "Titre propre", "antract_debut", "antract_fin"] |
| ... | ... | |
| 21 | 21 |
return |
| 22 | 22 |
} |
| 23 | 23 |
|
| 24 |
|
|
| 24 |
println "Loading data from $metadataFile..." |
|
| 25 | 25 |
CsvReader reader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
|
| 26 | 26 |
reader.readHeaders() |
| 27 | 27 |
def header = reader.getHeaders() |
| ... | ... | |
| 80 | 80 |
} |
| 81 | 81 |
|
| 82 | 82 |
def sectionGroupsToInsert = [:] |
| 83 |
println "Reading data..." |
|
| 83 | 84 |
while (reader.readRecord()) {
|
| 84 | 85 |
String id = reader.get(idTRSColumn).trim() |
| 85 | 86 |
if (id.endsWith(".mp4")) id = id.substring(0, id.length()-4)
|
| ... | ... | |
| 95 | 96 |
|
| 96 | 97 |
def m = [:] |
| 97 | 98 |
|
| 98 |
for (def todo : ["metadata":metadataColumns, "topic":topicColumns, "type":typeColumns]) {
|
|
| 99 |
for (def todo : ["topic":topicColumns, "type":typeColumns]) {
|
|
| 99 | 100 |
def data = [] |
| 100 | 101 |
for (def col : todo.value) {
|
| 101 | 102 |
if (reader.get(col).trim().length() > 0) {
|
| ... | ... | |
| 104 | 105 |
} |
| 105 | 106 |
m[todo.key] = data.join("\t")
|
| 106 | 107 |
} |
| 108 |
for (def col : metadataColumns) {
|
|
| 109 |
m[AsciiUtils.buildAttributeId(col)] = reader.get(col) |
|
| 110 |
} |
|
| 107 | 111 |
|
| 108 | 112 |
m["startTime"] = strTotime(reader.get(startTimeColumn)) |
| 109 | 113 |
m["endTime"] = strTotime(reader.get(endTimeColumn)) |
| ... | ... | |
| 113 | 117 |
} |
| 114 | 118 |
} |
| 115 | 119 |
|
| 116 |
println "N sections: "+sectionGroupsToInsert.size()
|
|
| 120 |
println "Inserting sections... "+sectionGroupsToInsert.size()
|
|
| 117 | 121 |
|
| 118 | 122 |
ConsoleProgressBar cpb = new ConsoleProgressBar(sectionGroupsToInsert.keySet().size()) |
| 119 | 123 |
for (String id : sectionGroupsToInsert.keySet()) {
|
| ... | ... | |
| 181 | 185 |
|
| 182 | 186 |
currentSection = found |
| 183 | 187 |
currentNode = new Node(trsEpisode, "Section", currentSection[2]) |
| 184 |
//trsEpisode.appendNode(currentNode) |
|
| 185 | 188 |
} |
| 186 | 189 |
} |
| 187 | 190 |
trsSection.remove(turn) |
| ... | ... | |
| 201 | 204 |
} |
| 202 | 205 |
} |
| 203 | 206 |
cpb.done() |
| 207 |
println "Done." |
|
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/importer.groovy (revision 2268) | ||
|---|---|---|
| 176 | 176 |
//TOKENIZE |
| 177 | 177 |
println "Tokenizing "+files.length+" files from $txmDir" |
| 178 | 178 |
File tokenizedDir = new File(binDir, "tokenized") |
| 179 |
tokenizedDir.deleteDir() |
|
| 179 | 180 |
tokenizedDir.mkdir() |
| 180 | 181 |
cpb = new ConsoleProgressBar(files.length) |
| 181 | 182 |
for (File pfile : files) {
|
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 2268) | ||
|---|---|---|
| 323 | 323 |
|
| 324 | 324 |
rend = getAttributeValue(parser, null, "rend") |
| 325 | 325 |
if (rend == null) rend = ""; |
| 326 |
|
|
| 326 | 327 |
switch (localname) {
|
| 327 | 328 |
case "text": |
| 328 | 329 |
LinkedHashMap attributes = new LinkedHashMap(); |
| tmp/org.txm.groovy.core/src/groovy/org/txm/macro/office/Table2CorpusMacro.groovy (revision 2268) | ||
|---|---|---|
| 1 |
package org.txm.macro.csv |
|
| 2 |
|
|
| 3 |
import org.kohsuke.args4j.* |
|
| 4 |
import groovy.transform.Field |
|
| 5 |
import java.nio.charset.Charset |
|
| 6 |
import org.txm.rcp.swt.widget.parameters.* |
|
| 7 |
import org.txm.utils.* |
|
| 8 |
import javax.xml.stream.* |
|
| 9 |
import java.net.URL |
|
| 10 |
|
|
| 11 |
@Field @Option(name="inputFile", usage="CSV File", widget="File", required=false, def="file.xlsx") |
|
| 12 |
File inputFile; |
|
| 13 |
|
|
| 14 |
@Field @Option(name="outputDirectory", usage="output directory", widget="File", required=false, def="directory") |
|
| 15 |
File outputDirectory; |
|
| 16 |
|
|
| 17 |
@Field @Option(name="textSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sommaire") |
|
| 18 |
def textSelector; |
|
| 19 |
|
|
| 20 |
@Field @Option(name="structureSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sujet") |
|
| 21 |
def structureSelector; |
|
| 22 |
|
|
| 23 |
// Column holding the identifier of each record; the identifier of a text record
// is also used as the name of its output file.
@Field @Option(name="textIDColumn", usage="text id column", widget="String", required=false, def="Identifiant de la notice")
def textIDColumn;

// Column whose value points to the textIDColumn value of the text a structure record belongs to.
@Field @Option(name="joinColumn", usage="jointure column, values should point to the textIDColumn values", widget="String", required=false, def="Lien notice principale")
def joinColumn;

// Comma-separated columns written as attributes of the <text> element and as items of its metadata list.
@Field @Option(name="textMetadataColumnList", usage="text metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
def textMetadataColumnList;

// Comma-separated columns written as the content of the text record.
@Field @Option(name="textContentColumnList", usage="text content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
def textContentColumnList;

// Name of the XML element opened for each structure record.
@Field @Option(name="structureTag", usage="name of the XML element written for each structure", widget="String", required=false, def="div")
def structureTag;

// Comma-separated columns written as attributes of the structure element and as items of its metadata list.
@Field @Option(name="structureMetadataColumnList", usage="structure metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
def structureMetadataColumnList;

// Comma-separated columns written as the content of each structure record.
@Field @Option(name="structureContentColumnList", usage="structure content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
def structureContentColumnList;

// Comma-separated metadata columns holding dd/MM/yyyy dates; they are written with writeMetadataDate().
@Field @Option(name="metadataDateColumnList", usage="metadata columns of type=Date", widget="String", required=false, def="")
def metadataDateColumnList;

// Comma-separated content columns written with writeContentPrefix() (three-letter prefixed items).
@Field @Option(name="prefixContentColumnList", usage="content columns containing prefixed items (written as a list of items)", widget="String", required=false, def="")
def prefixContentColumnList;

// Comma-separated content columns written with writeContentList() (one paragraph per line).
@Field @Option(name="listContentColumnList", usage="content columns containing one item per line (written as paragraphs)", widget="String", required=false, def="")
def listContentColumnList;
|
| 50 |
|
|
| 51 |
|
|
| 52 |
//@Field @Option(name="structureOrderColumn", usage="structure column coding structure order", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences") |
|
| 53 |
// def structureOrderColumn; |
|
| 54 |
|
|
| 55 |
// Ask the user for the parameter values; stop when the dialog is cancelled.
if (!ParametersDialog.open(this)) return;

// Column lists are entered as comma-separated strings. Names are trimmed because
// the header check compares trimmed names, and empty entries are dropped so that
// an empty parameter means "no column" instead of one column named "".
def splitColumnList = { value ->
	(value ?: "").toString().split(",").collect { it.trim() }.findAll { it.length() > 0 }
}
textMetadataColumnList = splitColumnList(textMetadataColumnList)
textContentColumnList = splitColumnList(textContentColumnList)
structureMetadataColumnList = splitColumnList(structureMetadataColumnList)
structureContentColumnList = splitColumnList(structureContentColumnList)
metadataDateColumnList = splitColumnList(metadataDateColumnList)
prefixContentColumnList = splitColumnList(prefixContentColumnList)
listContentColumnList = splitColumnList(listContentColumnList)

// A selector is written "column=regexp". Reject malformed values here instead of
// failing later with a StringIndexOutOfBoundsException in substring().
for (def selector : ["textSelector":textSelector, "structureSelector":structureSelector]) {
	if (selector.value == null || selector.value.toString().indexOf("=") < 1) {
		println "Error: ${selector.key} should be formatted as 'column=regexp': ${selector.value}"
		return
	}
}

// Split each selector into [column name, regular expression]; only the column
// name is trimmed, the regular expression is kept exactly as typed.
int ti = textSelector.indexOf("=")
String p1 = textSelector.substring(0, ti).trim()
String p2 = textSelector.substring(ti+1)
textSelector = [p1, p2]
int si = structureSelector.indexOf("=")
structureSelector = [structureSelector.substring(0, si).trim(), structureSelector.substring(si+1)]

// Echo the effective parameters.
println "textIDColumn, joinColumn=$textIDColumn, $joinColumn"
println "textMetadataColumnList columns: $textMetadataColumnList"
println "textContentColumnList columns: $textContentColumnList"
println "structureMetadataColumnList columns: $structureMetadataColumnList"
println "structureContentColumnList columns: $structureContentColumnList"

println "text selector="+textSelector
println "structure selector="+structureSelector
println "structureTag="+structureTag
|
| 80 |
|
|
| 81 |
// Open the input table and read its header line.
TableReader reader = new TableReader(inputFile);
if (!reader.readHeaders()) {
	println "Error: no header"
	reader.close() // do not leak the reader when aborting
	return
}
def headers = Arrays.asList(reader.getHeaders())
println "table columns: $headers"

// Check that every column named in the parameters exists in the table. All
// missing columns are reported before aborting. The selector columns are checked
// too, since they are read for every record.
def ok = true
for (def list : ["selection":[textIDColumn, joinColumn], "selector":[textSelector[0], structureSelector[0]],
		"textMetadataColumnList":textMetadataColumnList,
		"textContentColumnList":textContentColumnList, "structureMetadataColumnList":structureMetadataColumnList,
		"structureContentColumnList":structureContentColumnList]) {
	for (String m : list.value) {
		m = m.trim()
		if (!headers.contains(m)) {
			println "Error: missing ${list.key} column: $m"
			ok = false
		}
	}
}
if (!ok) {
	reader.close() // do not leak the reader when aborting
	return;
}
|
|
| 102 |
|
|
| 103 |
// Group the records by text: the key is the text identifier and the value is the
// list of records of that text. A text record is inserted at index 0 of its
// group; a structure record is appended to the group named by its join column.
// Records matching neither selector are ignored.
def texts = new LinkedHashMap()
def nRecord = 0
while (reader.readRecord()) {
	nRecord++
	//println "record="+reader.getRecord().get(textSelector[0])+" "+reader.getRecord().get(structureSelector[0])

	// Null-safe reads: calling trim() on a missing cell would throw before the
	// null checks below could take effect.
	String id = (reader.get(textIDColumn) ?: "").trim()
	String join = (reader.get(joinColumn) ?: "").trim()
	String textSelectorValue = reader.get(textSelector[0])?.trim()
	String structureSelectorValue = reader.get(structureSelector[0])?.trim()
	if (textSelectorValue != null && textSelectorValue.matches(textSelector[1])) {
		if (!texts.containsKey(id)) texts[id] = []
		texts[id].add(0, reader.getRecord())
	} else if (structureSelectorValue != null && structureSelectorValue.matches(structureSelector[1])) {
		if (!texts.containsKey(join)) texts[join] = []
		texts[join].add(reader.getRecord())
	} else {
		// ignore
	}
}
println "N lines: "+nRecord
println "N groups: "+texts.size()
if (texts.size() == 0) {
	println "No text found. Aborting."
	reader.close() // do not leak the reader when aborting
	return
}
|
| 130 |
// Create the output directory (with missing parents) and stop with a message
// when that fails, instead of failing later on the first file.
if (!outputDirectory.exists() && !outputDirectory.mkdirs()) {
	println "Error: could not create output directory: $outputDirectory"
	reader.close()
	return
}

// Write one XML file per group, named after the group identifier. The first
// record of a group must be the text record; the following records are written
// as structure elements inside <text>, each preceded by a numbered <pb/>.
for (def id : texts.keySet()) {
	def toWrite = texts[id]
	def text = toWrite[0]
	// A group made only of structure records has no text record at index 0.
	String textSelectorValue = text.get(textSelector[0])?.trim()
	if (textSelectorValue != null && textSelectorValue.matches(textSelector[1])) {
		println "Processing text: $id"

		File outputfile = new File(outputDirectory, id+".xml")
		XMLOutputFactory factory = XMLOutputFactory.newInstance()
		FileOutputStream output = new FileOutputStream(outputfile)
		try {
			XMLStreamWriter writer = factory.createXMLStreamWriter(output, "UTF-8")

			writer.writeStartDocument("UTF-8","1.0")
			writer.writeCharacters("\n") // simple XML formatting
			writer.writeStartElement("TEI")
			writer.writeCharacters("\n") // simple XML formatting
			writer.writeStartElement("teiHeader")
			writer.writeEndElement() // teiHeader
			writer.writeCharacters("\n") // simple XML formatting
			writer.writeStartElement("text")
			writer.writeAttribute("id", id)

			// text metadata as attributes of <text>
			for (String att : textMetadataColumnList) {
				if (att in metadataDateColumnList) {
					writeMetadataDate(text, att, writer)
				} else {
					writer.writeAttribute(AsciiUtils.buildAttributeId(att), text.get(att) ?: "")
				}
			}

			writer.writeCharacters("\n") // simple XML formatting

			// the same metadata again, as a list inside the text
			writer.writeStartElement("metadata")
			writer.writeStartElement("list")
			writer.writeAttribute("type", "unordered")
			writer.writeCharacters("\n")
			textMetadataColumnList.each { att ->
				writer.writeStartElement("item")
				writer.writeCharacters(att+" : "+(text.get(att) ?: "").replaceAll("\n", ";"))
				writer.writeEndElement() // item
				writer.writeCharacters("\n")
			}
			writer.writeEndElement() // list
			writer.writeEndElement() // metadata
			writer.writeCharacters("\n")

			// text content columns
			for (String att : textContentColumnList) {
				if (att in prefixContentColumnList) {
					writeContentPrefix(text, att, writer)
				} else if (att in listContentColumnList) {
					writeContentList(text, att, writer)
				} else {
					writer.writeStartElement("p");
					writer.writeAttribute("type", att.trim())
					writer.writeCharacters("\n") // simple XML formatting

					writer.writeStartElement("head")
					writer.writeStartElement("hi")
					writer.writeCharacters(att+" : ")
					writer.writeEndElement() // hi
					writer.writeEndElement() // head

					writer.writeCharacters(text.get(att) ?: "") // column content
					writer.writeEndElement() // p
					writer.writeCharacters("\n") // simple XML formatting
				}
			}

			// structure records: every record after the first one
			int pb_n = 1;
			for (int i = 1 ; i < toWrite.size() ; i++) {
				def record = toWrite[i]

				writer.writeEmptyElement("pb") // <pb/>
				writer.writeAttribute("n", ""+pb_n++)

				writer.writeStartElement(structureTag)

				// structure metadata as attributes of the structure element
				for (String att : structureMetadataColumnList) {
					if (att in metadataDateColumnList) {
						writeMetadataDate(record, att, writer)
					} else {
						writer.writeAttribute(AsciiUtils.buildAttributeId(att), record.get(att) ?: "")
					}
				}
				writer.writeCharacters("\n")

				writer.writeStartElement("metadata")
				writer.writeStartElement("list")
				writer.writeAttribute("type", "unordered")
				writer.writeCharacters("\n")
				structureMetadataColumnList.each { att ->
					writer.writeStartElement("item")
					writer.writeCharacters(att+" : "+(record.get(att) ?: "").replaceAll("\n", ";"))
					writer.writeEndElement() // item
					writer.writeCharacters("\n")
				}
				writer.writeEndElement() // list
				writer.writeEndElement() // metadata
				writer.writeCharacters("\n")

				// structure content columns: read from the structure record itself,
				// not from the parent text record
				for (String att : structureContentColumnList) {
					if (att in prefixContentColumnList) {
						writeContentPrefix(record, att, writer)
					} else if (att in listContentColumnList) {
						writeContentList(record, att, writer)
					} else {
						writer.writeStartElement("p");
						writer.writeAttribute("type", att.trim())
						writer.writeAttribute("id", record.get(textIDColumn) ?: "")
						writer.writeCharacters("\n") // simple XML formatting

						writer.writeStartElement("head")
						writer.writeStartElement("hi")
						writer.writeCharacters(att+" : ")
						writer.writeEndElement() // hi
						writer.writeEndElement() // head

						writer.writeCharacters(record.get(att) ?: "") // column content
						writer.writeEndElement() // p
						writer.writeCharacters("\n") // simple XML formatting
					}
				}

				writer.writeEndElement() // struct
				writer.writeCharacters("\n") // simple XML formatting
			}

			writer.writeEndElement() // text
			writer.writeCharacters("\n") // simple XML formatting
			writer.writeEndElement() // TEI
			writer.close()
		} finally {
			// XMLStreamWriter.close() does not close the underlying stream
			output.close()
		}
	} else {
		// error
		println "ERROR: '$id' text group with no text line"
	}
}
// All records were read before this loop; close the reader once, after every group is processed.
reader.close()
|
| 270 |
|
|
| 271 |
/**
 * Writes the value of a column as a sequence of paragraphs, one per line.
 *
 * When the value contains at least one non-empty line terminated by a newline,
 * every non-empty line is written as &lt;p rend="list"&gt;, including a last line
 * that has no trailing newline. Otherwise the whole value is written as a
 * single &lt;p rend="no-list"&gt;.
 *
 * @param record the record to read; must provide get(columnName)
 * @param att the name of the column to write
 * @param writer the XML stream writer receiving the output
 */
def writeContentList(def record, def att, def writer) {
	writer.writeCharacters("\n")
	// a missing value is written like an empty one
	String value = record.get(att) ?: ""
	// local variable: an undeclared name would be stored in the script binding
	boolean isList = (value =~ /[^\n]+\n/).find()
	if (isList) {
		// split() also returns the last line when it is not newline-terminated
		for (String line : value.split("\n")) {
			if (line.length() == 0) continue // skip blank lines
			writer.writeCharacters("\t")
			writer.writeStartElement("p")
			writer.writeAttribute("rend", "list")
			writer.writeCharacters(line)
			writer.writeEndElement() // p
			writer.writeCharacters("\n")
		}
	} else {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "no-list")
		writer.writeCharacters(value)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
	}
}
|
| 294 |
|
|
| 295 |
/**
 * Writes the value of a column as a &lt;list rend="prefixes" type="unordered"&gt;.
 *
 * Each part of the value made of three upper-case letters, an optional colon,
 * a label and at least one space followed by ';' or a newline becomes an
 * &lt;item type="PREFIX"&gt; holding the prefix in a &lt;span&gt; followed by
 * the label. When no such part is found, the raw value is written inside the
 * list instead.
 *
 * @param record the record to read; must provide get(columnName)
 * @param att the name of the column to write
 * @param writer the XML stream writer receiving the output
 */
def writeContentPrefix(def record, def att, def writer) {
	writer.writeCharacters("\n")
	writer.writeStartElement("list")
	writer.writeAttribute("rend", "prefixes")
	writer.writeAttribute("type", "unordered")
	writer.writeCharacters("\n")

	// a missing value is written like an empty one
	String value = record.get(att) ?: ""
	// One pass over the value: group 1 is the prefix, group 2 the label.
	// Locals are declared so they do not end up in the script binding.
	def matcher = (value =~ /(?s)([A-Z]{3}):? *([^;\n]+?) +[;\n]/)
	boolean found = false
	while (matcher.find()) {
		found = true
		String prefix = matcher.group(1)
		String label = matcher.group(2)
		writer.writeCharacters("\t")
		writer.writeStartElement("item")
		writer.writeAttribute("type", prefix)
		writer.writeStartElement("span")
		writer.writeCharacters(prefix+" ")
		writer.writeEndElement() // span
		writer.writeCharacters(label)
		writer.writeEndElement() // item
		writer.writeCharacters("\n")
	}
	if (!found) {
		writer.writeCharacters(value)
	}
	writer.writeEndElement() // list
	writer.writeCharacters("\n")
}
|
| 322 |
|
|
| 323 |
/**
 * Writes a dd/MM/yyyy date column as five attributes of the current element.
 *
 * The attribute names are the normalized column name followed by "jour" (day),
 * "joursemaine" (day of week, default locale), "mois" (month), "annee" (year)
 * and "tri" (yyyy-MM-dd sort key). When the value holds no dd/MM/yyyy date, a
 * warning is printed and the five attributes are written with empty values.
 * NOTE(review): empty values are meant to keep the attribute set identical
 * across elements - confirm this suits the import step that follows.
 *
 * @param record the record to read; must provide get(columnName)
 * @param att the name of the date column
 * @param writer the XML stream writer; its current start element must still accept attributes
 */
def writeMetadataDate(def record, def att, def writer) {
	String value = record.get(att)
	String att_normalized = AsciiUtils.buildAttributeId(att)

	String day = ""
	String weekday = ""
	String month = ""
	String year = ""
	String sortKey = ""

	// local matcher: an undeclared name would be stored in the script binding
	def matcher = ((value ?: "") =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
	if (matcher.find()) {
		day = matcher.group(1)
		month = matcher.group(2)
		year = matcher.group(3)
		// parse only the matched date, so surrounding text cannot break the parsing
		Date date = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(matcher.group(0))
		weekday = new java.text.SimpleDateFormat('EEEE').format(date)
		sortKey = year+"-"+month+"-"+day
	} else {
		println "Warning: no dd/MM/yyyy date found in column '$att': $value"
	}

	writer.writeAttribute(att_normalized+"jour", day)
	writer.writeAttribute(att_normalized+"joursemaine", weekday)
	writer.writeAttribute(att_normalized+"mois", month)
	writer.writeAttribute(att_normalized+"annee", year)
	writer.writeAttribute(att_normalized+"tri", sortKey)
}
|
| 333 |
|
|
| 334 |
/* |
|
| 335 |
String name = inputFile.getName() |
|
| 336 |
int idx = name.lastIndexOf(".")
|
|
| 337 |
if (idx > 0) name = name.substring(0, idx) |
|
| 338 |
*/ |
|
Formats disponibles : Unified diff