Révision 3041
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/RecodeSpeakers.groovy (revision 3041) | ||
---|---|---|
26 | 26 |
|
27 | 27 |
def idRegex, nameRegex |
28 | 28 |
String newId, newName |
29 |
boolean debug |
|
29 | 30 |
|
30 | 31 |
/** |
31 | 32 |
* Instantiates a new change speaker. |
... | ... | |
84 | 85 |
if (newName != null) { |
85 | 86 |
elem.setAttribute("name", newName); |
86 | 87 |
} |
88 |
if (debug) println "Recoding id $id -> $newId and name $name -> $newName" // interpolate newName instead of printing the literal word |
|
87 | 89 |
} |
88 | 90 |
|
89 | 91 |
if (nameRegex != null && id =~ nameRegex) { // patch Speaker@name |
... | ... | |
98 | 100 |
|
99 | 101 |
// fix speaker turns |
100 | 102 |
|
103 |
int nReplace = 0 |
|
101 | 104 |
for (def node : nodes) { |
102 | 105 |
if (node == null) continue; |
103 | 106 |
|
... | ... | |
106 | 109 |
|
107 | 110 |
if (replacedIds.contains(id)) { |
108 | 111 |
elem.setAttribute("speaker", newId); |
112 |
nReplace++ |
|
109 | 113 |
} |
110 | 114 |
} |
111 | 115 |
|
116 |
if (debug) println "$nReplace replacements" |
|
117 |
|
|
112 | 118 |
// Création de la source DOM |
113 | 119 |
Source source = new DOMSource(doc); |
114 | 120 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2Transcriber.groovy (revision 3041) | ||
---|---|---|
229 | 229 |
} |
230 | 230 |
|
231 | 231 |
public static void main(String[] args) { |
232 |
File infile = new File("/home/mdecorde/xml/vocapia","test.xml") |
|
233 |
File outfile = new File("/home/mdecorde/xml/vocapia","test.trs") |
|
232 |
File infile = new File("/home/mdecorde/xml/vocapia/testé input","test.xml") |
|
233 |
File outfile = new File("/home/mdecorde/xml/vocapia/testé oh","testé.trs") |
|
234 |
outfile.getParentFile().mkdir() |
|
234 | 235 |
def processor = new Vocapia2Transcriber(infile) |
235 |
println processor.process(outfile, true)
|
|
236 |
println processor.process(outfile) |
|
236 | 237 |
} |
237 | 238 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/AddEventsFromWordMarkersMacro.groovy (revision 3041) | ||
---|---|---|
1 |
package org.txm.macro.transcription |
|
2 |
|
|
3 |
import java.nio.charset.Charset |
|
4 |
|
|
5 |
import java.time.LocalTime |
|
6 |
import java.time.format.DateTimeFormatter |
|
7 |
import org.txm.utils.* |
|
8 |
import org.txm.utils.logger.* |
|
9 |
|
|
10 |
@Field @Option(name="trsDirectory", usage="Dossier qui contient les fichiers TRS", widget="Folder", required=true, def="") |
|
11 |
File trsDirectory; |
|
12 |
|
|
13 |
@Field @Option(name="resultDirectory", usage="Dossier résultat TRS", widget="Folder", required=true, def="") |
|
14 |
File resultDirectory; |
|
15 |
|
|
16 |
@Field @Option(name="wordElement", usage="The marker, spaces included", widget="String", required=true, def="w") |
|
17 |
def wordElement |
|
18 |
|
|
19 |
@Field @Option(name="newEventMarker", usage="The marker, spaces included", widget="String", required=true, def="") |
|
20 |
def newEventMarker |
|
21 |
|
|
22 |
@Field @Option(name="eventType", usage="The marker, spaces included", widget="String", required=true, def="") |
|
23 |
def eventType |
|
24 |
|
|
25 |
@Field @Option(name="eventDescription", usage="The marker, spaces included", widget="String", required=false, def="") |
|
26 |
def eventDescription |
|
27 |
|
|
28 |
|
|
29 |
@Field @Option(name="eventExtent", usage="The marker, spaces included", widget="String", required=true, def="instantaneous") |
|
30 |
def eventExtent |
|
31 |
|
|
32 |
@Field @Option(name="debug", usage="activate debug messages", widget="Boolean", required=true, def="false") |
|
33 |
boolean debug |
|
34 |
|
|
35 |
if (!ParametersDialog.open(this)) return; |
|
36 |
|
|
37 |
if (!trsDirectory.exists()) { |
|
38 |
println "$trsDirectory not found" |
|
39 |
return |
|
40 |
} |
|
41 |
|
|
42 |
println "Writing result to $resultDirectory..." |
|
43 |
def newEventMarkerRegex = /$newEventMarker/ |
|
44 |
try { |
|
45 |
def trsFiles = trsDirectory.listFiles().findAll() { it.getName().toLowerCase().endsWith(".trs") } |
|
46 |
|
|
47 |
if (trsFiles.size() == 0) { |
|
48 |
println "No TRS file to process in $trsDirectory" |
|
49 |
return; |
|
50 |
} |
|
51 |
|
|
52 |
ConsoleProgressBar cpb = new ConsoleProgressBar(trsFiles.size()) |
|
53 |
for (File trsFile : trsFiles) { |
|
54 |
if (debug) println "== $trsFile ==" |
|
55 |
else cpb.tick() |
|
56 |
|
|
57 |
// Open input file |
|
58 |
def slurper = new groovy.util.XmlParser(false, true, true); |
|
59 |
//slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false) // allow DTD declaration |
|
60 |
//slurper.setProperty("http://javax.xml.XMLConstants/property/accessExternalDTD", "all"); // allow to read DTD from local file |
|
61 |
def trs = slurper.parse(trsFile.toURI().toString()) |
|
62 |
def trsEpisodes = trs.Episode // 1 |
|
63 |
if (trsEpisodes.size() > 1) { |
|
64 |
println "Error: multiple Episode node in $trsFile" |
|
65 |
continue |
|
66 |
} |
|
67 |
|
|
68 |
def turns = trs.Episode.Section.Turn |
|
69 |
|
|
70 |
for (int iTurn = 0 ; iTurn < turns.size() ; iTurn++) { |
|
71 |
def turn = turns[iTurn] |
|
72 |
def children = turn.children() |
|
73 |
for (int i = 0 ; i < children.size() ; i++) { |
|
74 |
def w = children[i] |
|
75 |
if (wordElement.equals(w.name())) { |
|
76 |
String wContent = w.text() |
|
77 |
if (wContent ==~ newEventMarkerRegex) { |
|
78 |
if (debug) println "Create event $w with $wContent" |
|
79 |
Node replace = new Node(null, "event"); |
|
80 |
replace.@type = eventType |
|
81 |
replace.@desc = eventDescription |
|
82 |
replace.@extent = eventExtent |
|
83 |
w.replaceNode(replace) |
|
84 |
} |
|
85 |
} |
|
86 |
} |
|
87 |
} |
|
88 |
|
|
89 |
resultDirectory.mkdir() |
|
90 |
File outfile = new File(resultDirectory, trsFile.getName()) |
|
91 |
outfile.withWriter("UTF-8") { writer -> |
|
92 |
writer.write('<?xml version="1.0" encoding="UTF-8"?>\n') |
|
93 |
def printer = new groovy.util.XmlNodePrinter(new PrintWriter(writer)) |
|
94 |
printer.setPreserveWhitespace(true) |
|
95 |
printer.print(trs) |
|
96 |
} |
|
97 |
} |
|
98 |
cpb.done() |
|
99 |
|
|
100 |
println "Done." |
|
101 |
|
|
102 |
} catch(Exception e) { |
|
103 |
println "Error: "+e |
|
104 |
Log.printStackTrace(e) |
|
105 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/AddSectionsFromTableMacro.groovy (revision 3041) | ||
---|---|---|
34 | 34 |
@Field @Option(name="metadataColumnsGroups", usage="Colonnes des groupes de metadonnées de section", widget="String", required=true, def="metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;secondary;secondary;secondary;secondary;secondary;secondary;secondary;text;text;text;text") |
35 | 35 |
def metadataColumnsGroups |
36 | 36 |
|
37 |
@Field @Option(name="debug", usage="show debug messages", widget="Boolean", required=true, def="false") |
|
38 |
boolean debug // typed boolean: a String "false" is truthy in Groovy, which forced debug mode on |
|
39 |
|
|
37 | 40 |
if (!ParametersDialog.open(this)) return; |
38 | 41 |
typeColumns = typeColumns.split(";") |
39 | 42 |
topicColumns = topicColumns.split(";") |
... | ... | |
160 | 163 |
|
161 | 164 |
ConsoleProgressBar cpb = new ConsoleProgressBar(sectionGroupsToInsert.keySet().size()) |
162 | 165 |
for (String id : sectionGroupsToInsert.keySet()) { |
163 |
cpb.tick() |
|
166 |
if (debug) println "== $id ==" |
|
167 |
else cpb.tick() |
|
168 |
|
|
164 | 169 |
File trsFile = new File(trsDirectory, id+".trs") |
165 | 170 |
if (!trsFile.exists()) { |
166 | 171 |
continue |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/SegmentTRSInSectionFromMarkerMacro.groovy (revision 3041) | ||
---|---|---|
58 | 58 |
|
59 | 59 |
ConsoleProgressBar cpb = new ConsoleProgressBar(trsFiles.size()) |
60 | 60 |
for (File trsFile : trsFiles) { |
61 |
cpb.tick() |
|
62 | 61 |
|
62 |
if (debug) println "== $trsFile ==" |
|
63 |
else cpb.tick() |
|
64 |
|
|
63 | 65 |
// Open input file |
64 | 66 |
def slurper = new groovy.util.XmlParser(false, true, true); |
65 | 67 |
slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false) // allow DTD declaration |
... | ... | |
91 | 93 |
def start = Float.parseFloat(turn.@startTime) |
92 | 94 |
def end = Float.parseFloat(turn.@endTime) |
93 | 95 |
|
94 |
if (debug) println "TURN: " |
|
96 |
//if (debug) println "TURN: "
|
|
95 | 97 |
|
96 | 98 |
def children = turn.children() |
97 | 99 |
for (int i = 0 ; i < children.size() ; i++) { |
... | ... | |
106 | 108 |
} |
107 | 109 |
|
108 | 110 |
if (content.equals(newSectionMarker)) { |
109 |
if (debug) println "NEW SECTION !"
|
|
111 |
if (debug) println "New section at $turn with $node child node"
|
|
110 | 112 |
previousSection = currentSection |
111 | 113 |
currentSection = new Node(trsEpisode, "Section", new LinkedHashMap(["type":newSectionMarker, "startTime":turn.@startTime, "endTime":previousSection.attributes()["endTime"]])) |
112 | 114 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2TranscriberMacro.groovy (revision 3041) | ||
---|---|---|
14 | 14 |
@Field @Option(name="resultDirectory", usage="The result directory", widget="Folder", required=false, def="") |
15 | 15 |
File resultDirectory; |
16 | 16 |
|
17 |
@Field @Option(name="debug", usage="activate debug messages", widget="Boolean", required=false, def="false") |
|
18 |
boolean debug; |
|
17 | 19 |
|
18 | 20 |
if (!ParametersDialog.open(this)) return; |
19 | 21 |
|
... | ... | |
40 | 42 |
|
41 | 43 |
ConsoleProgressBar cpb = new ConsoleProgressBar(xmlFiles.size()) |
42 | 44 |
for (File xmlFile : xmlFiles) { |
43 |
cpb.tick() |
|
45 |
|
|
46 |
if (debug) println "== $xmlFile ==" |
|
47 |
else cpb.tick() |
|
48 |
|
|
44 | 49 |
Vocapia2Transcriber v2t = new Vocapia2Transcriber(xmlFile) |
45 | 50 |
String name = FileUtils.stripExtension(xmlFile) |
46 | 51 |
File outFile = new File(resultDirectory, name+".trs") |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/text/ExcelDir2XMLMacro.groovy (revision 3041) | ||
---|---|---|
1 |
package org.txm.macro.text |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
7 |
import org.txm.utils.* |
|
8 |
import javax.xml.stream.* |
|
9 |
import java.net.URL |
|
10 |
|
|
11 |
import org.apache.poi.ss.usermodel.* |
|
12 |
import org.apache.poi.hssf.usermodel.* |
|
13 |
import org.apache.poi.xssf.usermodel.* |
|
14 |
import org.apache.poi.ss.util.* |
|
15 |
|
|
16 |
@Field @Option(name="inputDirectory", usage="répertoire des fichiers Excel à traiter", widget="Folder", required=true, def="") |
|
17 |
def inputDirectory |
|
18 |
|
|
19 |
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="") |
|
20 |
def sheetName |
|
21 |
|
|
22 |
@Field @Option(name="rootTag", usage="Root tag name", widget="String", required=false, def="root") |
|
23 |
def rootTag |
|
24 |
|
|
25 |
@Field @Option(name="textTag", usage="Text tag name", widget="String", required=false, def="unit") |
|
26 |
def textTag |
|
27 |
|
|
28 |
@Field @Option(name="metadataColumnList", usage="metadataColumnList column list separated with comma", widget="String", required=false, def="meta1,meta2") |
|
29 |
def metadataColumnList |
|
30 |
|
|
31 |
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2") |
|
32 |
def dateColumnList |
|
33 |
|
|
34 |
@Field @Option(name="textColumnList", usage="textColumnList column list separated with comma", widget="String", required=false, def="textColumnList1,textColumnList2") |
|
35 |
def textColumnList |
|
36 |
|
|
37 |
if (!ParametersDialog.open(this)) return |
|
38 |
|
|
39 |
if (!inputDirectory.exists()) { |
|
40 |
println "** ExcelDir2XML: no '"+inputDirectory.name+"' directory found. Aborting." |
|
41 |
return false |
|
42 |
} |
|
43 |
|
|
44 |
if (!inputDirectory.canRead()) { |
|
45 |
println "** ExcelDir2XML: '"+inputDirectory.name+"' directory not readable. Aborting." |
|
46 |
return false |
|
47 |
} |
|
48 |
|
|
49 |
def f = [] |
|
50 |
inputDirectory.eachFileMatch(~/.*xlsx/) { f << it } |
|
51 |
|
|
52 |
if (f.size() == 0) { |
|
53 |
println "** ExcelDir2XML: no .xlsx file found. Aborting." |
|
54 |
return false |
|
55 |
} |
|
56 |
|
|
57 |
try { |
|
58 |
|
|
59 |
f.sort { it.name }.each { inputFile -> |
|
60 |
|
|
61 |
res = gse.run(Excel2XMLMacro, ["args":[ |
|
62 |
|
|
63 |
"inputFile":inputFile, |
|
64 |
"sheetName":sheetName, |
|
65 |
"metadataColumnList":metadataColumnList, |
|
66 |
"dateColumnList":dateColumnList, |
|
67 |
"textColumnList":textColumnList, |
|
68 |
"rootTag":rootTag, |
|
69 |
"textTag":textTag, |
|
70 |
|
|
71 |
"selection":selection, |
|
72 |
"selections":selections, |
|
73 |
"corpusViewSelection":corpusViewSelection, |
|
74 |
"corpusViewSelections":corpusViewSelections, |
|
75 |
"monitor":monitor]]) |
|
76 |
if (!res) println "** problem calling Excel2XMLMacro." |
|
77 |
} |
|
78 |
|
|
79 |
} catch (Exception e) { |
|
80 |
println "** ExcelDir2XML: unable to read input files. Aborting." |
|
81 |
println e.getLocalizedMessage() |
|
82 |
e.printStackTrace() // returns void: wrapping it in println printed a stray "null" |
|
83 |
return false |
|
84 |
} |
|
85 |
|
|
86 |
return true |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/text/Excel2XMLMacro.groovy (revision 3041) | ||
---|---|---|
1 |
package org.txm.macro.text |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
7 |
import org.txm.utils.* |
|
8 |
import javax.xml.stream.* |
|
9 |
import java.net.URL |
|
10 |
import java.text.SimpleDateFormat |
|
11 |
|
|
12 |
import org.apache.poi.ss.usermodel.* |
|
13 |
import org.apache.poi.hssf.usermodel.* |
|
14 |
import org.apache.poi.xssf.usermodel.* |
|
15 |
import org.apache.poi.ss.util.* |
|
16 |
|
|
17 |
def stringToIndent = { str -> org.txm.utils.AsciiUtils.buildAttributeId(org.txm.utils.AsciiUtils.convertNonAscii(str)).toLowerCase() } |
|
18 |
|
|
19 |
// from http://www.java-connect.com/apache-poi-tutorials/read-all-type-of-excel-cell-value-as-string-using-poi |
|
20 |
def getCellValueAsString = { cell -> |
|
21 |
strCellValue = null |
|
22 |
if (cell != null) { |
|
23 |
switch (cell.getCellType()) { |
|
24 |
case Cell.CELL_TYPE_STRING: |
|
25 |
strCellValue = cell.toString() |
|
26 |
break |
|
27 |
case Cell.CELL_TYPE_NUMERIC: |
|
28 |
if (DateUtil.isCellDateFormatted(cell)) { |
|
29 |
SimpleDateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy") |
|
30 |
strCellValue = dateFormat.format(cell.getDateCellValue()) |
|
31 |
} else { |
|
32 |
value = cell.getNumericCellValue() |
|
33 |
longValue = value.longValue() |
|
34 |
strCellValue = new String(longValue.toString()) |
|
35 |
} |
|
36 |
break |
|
37 |
case Cell.CELL_TYPE_BOOLEAN: |
|
38 |
strCellValue = new String(new Boolean(cell.getBooleanCellValue()).toString()) |
|
39 |
break |
|
40 |
case Cell.CELL_TYPE_BLANK: |
|
41 |
strCellValue = "" |
|
42 |
break |
|
43 |
} |
|
44 |
} |
|
45 |
if (strCellValue == null) strCellValue = "" |
|
46 |
return strCellValue |
|
47 |
} |
|
48 |
|
|
49 |
@Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="") |
|
50 |
File inputFile |
|
51 |
|
|
52 |
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="") |
|
53 |
def sheetName |
|
54 |
|
|
55 |
@Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root") |
|
56 |
def rootTag |
|
57 |
|
|
58 |
@Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit") |
|
59 |
def textTag |
|
60 |
|
|
61 |
@Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2") |
|
62 |
def metadataColumnList |
|
63 |
|
|
64 |
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2") |
|
65 |
def dateColumnList |
|
66 |
|
|
67 |
@Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2") |
|
68 |
def textColumnList |
|
69 |
|
|
70 |
if (!ParametersDialog.open(this)) return |
|
71 |
|
|
72 |
if (!inputFile.exists()) { |
|
73 |
println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting." |
|
74 |
return false |
|
75 |
} |
|
76 |
|
|
77 |
if (!inputFile.canRead()) { |
|
78 |
println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting." |
|
79 |
return false |
|
80 |
} |
|
81 |
|
|
82 |
try { |
|
83 |
|
|
84 |
metadataColumnList = metadataColumnList.split(",").collect { it.trim() } |
|
85 |
dateColumnList = dateColumnList.split(",").collect { it.trim() } |
|
86 |
textColumnList = textColumnList.split(",").collect { it.trim() } |
|
87 |
textTag = textTag.trim() |
|
88 |
rootTag = rootTag.trim() |
|
89 |
|
|
90 |
wb = WorkbookFactory.create(inputFile) |
|
91 |
|
|
92 |
if (sheetName.length() == 0) { |
|
93 |
ws = wb.getSheetAt(0) |
|
94 |
} else { |
|
95 |
ws = wb.getSheet(sheetName) |
|
96 |
if (ws == null) { |
|
97 |
println "** Excel2XML: no '"+sheetName+"' sheet found. Aborting." |
|
98 |
return false |
|
99 |
} |
|
100 |
} |
|
101 |
|
|
102 |
if (ws == null) { |
|
103 |
println "** Excel2XML: no sheet found. Aborting." |
|
104 |
return false |
|
105 |
} |
|
106 |
|
|
107 |
nRows = ws.getPhysicalNumberOfRows() |
|
108 |
println nRows+" rows." |
|
109 |
|
|
110 |
firstRow = ws.getRow(0) |
|
111 |
colMax = firstRow.getLastCellNum() |
|
112 |
|
|
113 |
headers = (0..colMax-1).collect { getCellValueAsString(firstRow.getCell(it)) } |
|
114 |
|
|
115 |
println "Headers: $headers" |
|
116 |
|
|
117 |
normalizedHeaders = headers.collect { stringToIndent(it) } |
|
118 |
|
|
119 |
ok = true |
|
120 |
metadataColumnList.each { m -> |
|
121 |
if (!headers.contains(m)) { |
|
122 |
println "** Excel2XML: missing metadataColumnList column: $m" |
|
123 |
ok = false |
|
124 |
} |
|
125 |
} |
|
126 |
textColumnList.each { t -> |
|
127 |
if (!headers.contains(t)) { |
|
128 |
println "** Excel2XML: missing textColumnList column: $t" |
|
129 |
ok = false |
|
130 |
} |
|
131 |
} |
|
132 |
|
|
133 |
if (!ok) { return false } |
|
134 |
|
|
135 |
metadataColumnIndex = metadataColumnList.collect { headers.indexOf(it) } |
|
136 |
dateColumnsIndex = dateColumnList.collect { headers.indexOf(it) } |
|
137 |
textColumnIndex = textColumnList.collect { headers.indexOf(it) } |
|
138 |
|
|
139 |
println "metadataColumnList = "+metadataColumnList |
|
140 |
println "metadataColumnIndex = "+metadataColumnIndex |
|
141 |
|
|
142 |
println "dateColumnList = "+dateColumnList |
|
143 |
println "dateColumnsIndex = "+dateColumnsIndex |
|
144 |
|
|
145 |
println "textColumnList = "+textColumnList |
|
146 |
println "textColumnIndex = "+textColumnIndex |
|
147 |
|
|
148 |
name = inputFile.getName() |
|
149 |
idx = name.lastIndexOf(".") |
|
150 |
|
|
151 |
if (idx > 0) name = name.substring(0, idx) |
|
152 |
outputFile = new File(inputFile.getParentFile(), name+".xml") |
|
153 |
|
|
154 |
factory = XMLOutputFactory.newInstance() |
|
155 |
output = new FileOutputStream(outputFile) |
|
156 |
writer = factory.createXMLStreamWriter(output, "UTF-8") |
|
157 |
|
|
158 |
writer.writeStartDocument("UTF-8","1.0") |
|
159 |
writer.writeCharacters("\n") // simple XML formating |
|
160 |
writer.writeStartElement(rootTag) |
|
161 |
writer.writeCharacters("\n") |
|
162 |
|
|
163 |
pb_n = 1 |
|
164 |
|
|
165 |
(1..<nRows).each { rowIndex -> // half-open range: empty when only the header row exists (1..0 would iterate backwards over rows 1 and 0) |
|
166 |
|
|
167 |
writer.writeCharacters(" ") |
|
168 |
writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module) |
|
169 |
writer.writeAttribute("n", ""+pb_n++) |
|
170 |
writer.writeCharacters("\n") // simple XML formating |
|
171 |
|
|
172 |
writer.writeCharacters(" ") |
|
173 |
writer.writeStartElement(textTag) |
|
174 |
metadataColumnIndex.each { colIndex -> // build an attribute for each metadata |
|
175 |
String s = getCellValueAsString(ws.getRow(rowIndex).getCell(colIndex)); |
|
176 |
if (s == null) s =""; |
|
177 |
value = s.replaceAll("\n", ";").trim() |
|
178 |
writer.writeAttribute(normalizedHeaders[colIndex], value) |
|
179 |
if (colIndex in dateColumnsIndex) { // also split date attributes in day+month+year attributes |
|
180 |
matches = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/) |
|
181 |
writer.writeAttribute(normalizedHeaders[colIndex]+"jour", matches[0][1]) |
|
182 |
writer.writeAttribute(normalizedHeaders[colIndex]+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(Date.parse("dd/MM/yyyy", value))) |
|
183 |
writer.writeAttribute(normalizedHeaders[colIndex]+"mois", matches[0][2]) |
|
184 |
writer.writeAttribute(normalizedHeaders[colIndex]+"annee", matches[0][3]) |
|
185 |
} |
|
186 |
} |
|
187 |
writer.writeCharacters("\n") |
|
188 |
|
|
189 |
writer.writeCharacters(" ") |
|
190 |
writer.writeStartElement("metadata") |
|
191 |
writer.writeStartElement("list") |
|
192 |
writer.writeAttribute("type", "unordered") |
|
193 |
writer.writeCharacters("\n") |
|
194 |
|
|
195 |
metadataColumnIndex.each { colIndex -> |
|
196 |
writer.writeStartElement("item") |
|
197 |
writer.writeCharacters(headers[colIndex]+" : "+getCellValueAsString(ws.getRow(rowIndex).getCell(colIndex)).replaceAll("\n", ";")) |
|
198 |
writer.writeEndElement() // item |
|
199 |
writer.writeCharacters("\n") |
|
200 |
} |
|
201 |
writer.writeCharacters(" ") |
|
202 |
writer.writeEndElement() // list |
|
203 |
writer.writeEndElement() // metadata |
|
204 |
writer.writeCharacters("\n") |
|
205 |
|
|
206 |
textColumnIndex.each { colIndex -> |
|
207 |
|
|
208 |
writer.writeCharacters(" ") |
|
209 |
writer.writeStartElement(normalizedHeaders[colIndex]) |
|
210 |
writer.writeStartElement("p") |
|
211 |
writer.writeStartElement("head") |
|
212 |
writer.writeStartElement("hi") |
|
213 |
writer.writeCharacters(headers[colIndex]+" : ") |
|
214 |
writer.writeEndElement() // hi |
|
215 |
writer.writeEndElement() // head |
|
216 |
value = getCellValueAsString(ws.getRow(rowIndex).getCell(colIndex)) |
|
217 |
|
|
218 |
if (value ==~ /(?s)^[A-Z]{3}: [^;\n]+? +[;\n].*/) { |
|
219 |
value.findAll( /(?s)[A-Z]{3}: ([^;\n]+?) +[;\n]/ ).each { desc -> |
|
220 |
writer.writeStartElement("descripteur") |
|
221 |
matches = (desc =~ /(?s)([A-Z]{3}): ([^;\n]+?) +[;\n]/) |
|
222 |
writer.writeAttribute("type", matches[0][1]) |
|
223 |
writer.writeCharacters(matches[0][2]) |
|
224 |
writer.writeEndElement() // descripteur |
|
225 |
} |
|
226 |
} else { |
|
227 |
writer.writeCharacters(value) |
|
228 |
} |
|
229 |
writer.writeEndElement() // p |
|
230 |
writer.writeEndElement() // textColumn |
|
231 |
writer.writeCharacters("\n") |
|
232 |
} |
|
233 |
|
|
234 |
writer.writeCharacters(" ") |
|
235 |
writer.writeEndElement() // textTag |
|
236 |
writer.writeCharacters("\n") |
|
237 |
} |
|
238 |
|
|
239 |
writer.writeEndElement() // rootTag |
|
240 |
writer.writeCharacters("\n") |
|
241 |
writer.close() |
|
242 |
output.close() |
|
243 |
println "Result file: $outputFile" |
|
244 |
|
|
245 |
} catch (Exception e) { |
|
246 |
println "** Excel2XML: unable to read input file. Aborting." |
|
247 |
println e.getLocalizedMessage() |
|
248 |
e.printStackTrace() // returns void: wrapping it in println printed a stray "null" |
|
249 |
return false |
|
250 |
} |
|
251 |
|
|
252 |
return true |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/ExcelDir2XMLMacro.groovy (revision 3041) | ||
---|---|---|
1 |
package org.txm.macro.table |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
7 |
import org.txm.utils.* |
|
8 |
import javax.xml.stream.* |
|
9 |
import java.net.URL |
|
10 |
|
|
11 |
import org.apache.poi.ss.usermodel.* |
|
12 |
import org.apache.poi.hssf.usermodel.* |
|
13 |
import org.apache.poi.xssf.usermodel.* |
|
14 |
import org.apache.poi.ss.usermodel.WorkbookFactory |
|
15 |
import org.apache.poi.ss.util.* |
|
16 |
|
|
17 |
@Field @Option(name="inputDirectory", usage="répertoire des fichiers Excel à traiter", widget="Folder", required=true, def="") |
|
18 |
def inputDirectory |
|
19 |
|
|
20 |
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="") |
|
21 |
def sheetName |
|
22 |
|
|
23 |
@Field @Option(name="rootTag", usage="Root tag name", widget="String", required=false, def="root") |
|
24 |
def rootTag |
|
25 |
|
|
26 |
@Field @Option(name="textTag", usage="Text tag name", widget="String", required=false, def="unit") |
|
27 |
def textTag |
|
28 |
|
|
29 |
@Field @Option(name="metadataColumnList", usage="metadataColumnList column list separated with comma", widget="String", required=false, def="meta1,meta2") |
|
30 |
def metadataColumnList |
|
31 |
|
|
32 |
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2") |
|
33 |
def dateColumnList |
|
34 |
|
|
35 |
@Field @Option(name="textColumnList", usage="textColumnList column list separated with comma", widget="String", required=false, def="textColumnList1,textColumnList2") |
|
36 |
def textColumnList |
|
37 |
|
|
38 |
if (!ParametersDialog.open(this)) return |
|
39 |
|
|
40 |
if (!inputDirectory.exists()) { |
|
41 |
println "** ExcelDir2XML: no '"+inputDirectory.name+"' directory found. Aborting." |
|
42 |
return false |
|
43 |
} |
|
44 |
|
|
45 |
if (!inputDirectory.canRead()) { |
|
46 |
println "** ExcelDir2XML: '"+inputDirectory.name+"' directory not readable. Aborting." |
|
47 |
return false |
|
48 |
} |
|
49 |
|
|
50 |
def f = [] |
|
51 |
inputDirectory.eachFileMatch(~/.*xlsx/) { f << it } |
|
52 |
|
|
53 |
if (f.size() == 0) { |
|
54 |
println "** ExcelDir2XML: no .xlsx file found. Aborting." |
|
55 |
return false |
|
56 |
} |
|
57 |
|
|
58 |
try { |
|
59 |
|
|
60 |
f.sort { it.name }.each { inputFile -> |
|
61 |
|
|
62 |
res = gse.run(Excel2XMLMacro, ["args":[ |
|
63 |
|
|
64 |
"inputFile":inputFile, |
|
65 |
"sheetName":sheetName, |
|
66 |
"metadataColumnList":metadataColumnList, |
|
67 |
"dateColumnList":dateColumnList, |
|
68 |
"textColumnList":textColumnList, |
|
69 |
"rootTag":rootTag, |
|
70 |
"textTag":textTag, |
|
71 |
|
|
72 |
"selection":selection, |
|
73 |
"selections":selections, |
|
74 |
"corpusViewSelection":corpusViewSelection, |
|
75 |
"corpusViewSelections":corpusViewSelections, |
|
76 |
"monitor":monitor]]) |
|
77 |
if (!res) println "** problem calling Excel2XMLMacro." |
|
78 |
} |
|
79 |
|
|
80 |
} catch (Exception e) { |
|
81 |
println "** ExcelDir2XML: unable to read input files. Aborting." |
|
82 |
println e.getLocalizedMessage() |
|
83 |
e.printStackTrace() // returns void: wrapping it in println printed a stray "null" |
|
84 |
return false |
|
85 |
} |
|
86 |
|
|
87 |
return true |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Excel2XMLMacro.groovy (revision 3041) | ||
---|---|---|
1 |
package org.txm.macro.table |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
7 |
import org.txm.utils.* |
|
8 |
import javax.xml.stream.* |
|
9 |
import java.net.URL |
|
10 |
import java.text.SimpleDateFormat |
|
11 |
|
|
12 |
import org.apache.poi.ss.usermodel.* |
|
13 |
import org.apache.poi.hssf.usermodel.* |
|
14 |
import org.apache.poi.xssf.usermodel.* |
|
15 |
import org.apache.poi.ss.usermodel.WorkbookFactory |
|
16 |
import org.apache.poi.ss.util.* |
|
17 |
|
|
18 |
def stringToIndent = { str -> org.txm.utils.AsciiUtils.buildAttributeId(org.txm.utils.AsciiUtils.convertNonAscii(str)).toLowerCase() } |
|
19 |
|
|
20 |
// from http://www.java-connect.com/apache-poi-tutorials/read-all-type-of-excel-cell-value-as-string-using-poi |
|
21 |
def getCellValueAsString = { cell -> |
|
22 |
strCellValue = null |
|
23 |
if (cell != null) { |
|
24 |
switch (cell.getCellType()) { |
|
25 |
case CellType.STRING: |
|
26 |
strCellValue = cell.toString() |
|
27 |
break |
|
28 |
case CellType.NUMERIC: |
|
29 |
if (DateUtil.isCellDateFormatted(cell)) { |
|
30 |
SimpleDateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy") |
|
31 |
strCellValue = dateFormat.format(cell.getDateCellValue()) |
|
32 |
} else { |
|
33 |
value = cell.getNumericCellValue() |
|
34 |
longValue = value.longValue() |
|
35 |
strCellValue = new String(longValue.toString()) |
|
36 |
} |
|
37 |
break |
|
38 |
case CellType.BOOLEAN: |
|
39 |
strCellValue = new String(new Boolean(cell.getBooleanCellValue()).toString()) |
|
40 |
break |
|
41 |
case CellType.BLANK: |
|
42 |
strCellValue = "" |
|
43 |
break |
|
44 |
} |
|
45 |
} |
|
46 |
if (strCellValue == null) strCellValue = "" |
|
47 |
return strCellValue |
|
48 |
} |
|
49 |
|
|
50 |
// ---------------------------------------------------------------------------
// Excel2XML macro body.
//
// Reads one sheet of an Excel workbook and writes an XML file next to the
// input file (same base name, ".xml" extension). The first row of the sheet
// holds the column headers. Every following row becomes one <textTag> element:
//   - each metadata column is written as an attribute of <textTag>, and is
//     repeated as an <item> of a <metadata><list> block;
//   - each date column (dd/MM/yyyy) additionally produces "jour",
//     "joursemaine", "mois" and "annee" attributes;
//   - each text column becomes a child element containing a <p>.
// A <pb/> element is written before each unit so that each input row ends up
// on its own page.
//
// The script returns true on success, false when the conversion is aborted,
// and nothing when the parameters dialog is cancelled.
// ---------------------------------------------------------------------------

// Excel file to convert
@Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
File inputFile

// Name of the sheet to read; the first sheet is used when empty
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
def sheetName

// Name of the XML root element
@Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
def rootTag

// Name of the element written for each data row
@Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
def textTag

// Comma-separated headers of the columns written as attributes
@Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def metadataColumnList

// Comma-separated headers of the metadata columns that contain dd/MM/yyyy dates
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def dateColumnList

// Comma-separated headers of the columns written as text content
@Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
def textColumnList

if (!ParametersDialog.open(this)) return

if (!inputFile.exists()) {
	println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
	return false
}

if (!inputFile.canRead()) {
	println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
	return false
}

// Splits a comma-separated parameter into trimmed column names. Empty entries
// are dropped so that an optional list left blank yields an empty list instead
// of a bogus "" column name.
def splitColumnList = { list ->
	(list ?: "").toString().split(",").collect { it.trim() }.findAll { it.length() > 0 }
}

// Resources are declared outside the try block so the finally clause can
// release them on every exit path (early abort, exception, success).
def wb = null
def output = null
def writer = null

try {

	metadataColumnList = splitColumnList(metadataColumnList)
	dateColumnList = splitColumnList(dateColumnList)
	textColumnList = splitColumnList(textColumnList)
	textTag = textTag.trim()
	rootTag = rootTag.trim()

	wb = WorkbookFactory.create(inputFile)

	def ws
	if (!sheetName) { // null or empty: fall back to the first sheet
		ws = wb.getSheetAt(0)
	} else {
		ws = wb.getSheet(sheetName)
		if (ws == null) {
			println "** Excel2XML: no '"+sheetName+"' sheet found. Aborting."
			return false
		}
	}

	if (ws == null) {
		println "** Excel2XML: no sheet found. Aborting."
		return false
	}

	int nRows = ws.getPhysicalNumberOfRows()
	println nRows+" rows."

	// The header row is mandatory: without it no column can be resolved.
	def firstRow = ws.getRow(0)
	int colMax = (firstRow == null) ? 0 : firstRow.getLastCellNum()
	if (colMax <= 0) {
		println "** Excel2XML: no header row found in the sheet. Aborting."
		return false
	}

	def headers = (0..<colMax).collect { getCellValueAsString(firstRow.getCell(it)) }

	println "Headers: $headers"

	// Header names turned into identifiers usable as XML names
	def normalizedHeaders = headers.collect { stringToIndent(it) }

	// Report every missing column before aborting
	boolean ok = true
	metadataColumnList.each { m ->
		if (!headers.contains(m)) {
			println "** Excel2XML: missing metadataColumnList column: $m"
			ok = false
		}
	}
	textColumnList.each { t ->
		if (!headers.contains(t)) {
			println "** Excel2XML: missing textColumnList column: $t"
			ok = false
		}
	}

	if (!ok) { return false }

	def metadataColumnIndex = metadataColumnList.collect { headers.indexOf(it) }
	def dateColumnsIndex = dateColumnList.collect { headers.indexOf(it) }
	def textColumnIndex = textColumnList.collect { headers.indexOf(it) }

	println "metadataColumnList = "+metadataColumnList
	println "metadataColumnIndex = "+metadataColumnIndex

	println "dateColumnList = "+dateColumnList
	println "dateColumnsIndex = "+dateColumnsIndex

	println "textColumnList = "+textColumnList
	println "textColumnIndex = "+textColumnIndex

	// Output file: same directory and base name as the input, ".xml" extension
	String baseName = inputFile.getName()
	int dot = baseName.lastIndexOf(".")
	if (dot > 0) baseName = baseName.substring(0, dot)
	def outputFile = new File(inputFile.getParentFile(), baseName+".xml")

	def factory = XMLOutputFactory.newInstance()
	output = new FileOutputStream(outputFile)
	writer = factory.createXMLStreamWriter(output, "UTF-8")

	writer.writeStartDocument("UTF-8","1.0")
	writer.writeCharacters("\n") // simple XML formatting
	writer.writeStartElement(rootTag)
	writer.writeCharacters("\n")

	int pb_n = 1

	// A plain counting loop is used on purpose: the Groovy range 1..nRows-1
	// turns into the reverse range 1..0 when the sheet only has a header row,
	// which would process a missing row and then the header row as data.
	for (int rowIndex = 1; rowIndex < nRows; rowIndex++) {

		def row = ws.getRow(rowIndex)
		if (row == null) continue // undefined row in the sheet: nothing to write

		writer.writeCharacters("  ")
		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
		writer.writeAttribute("n", ""+pb_n++)
		writer.writeCharacters("\n") // simple XML formatting

		writer.writeCharacters("  ")
		writer.writeStartElement(textTag)
		metadataColumnIndex.each { colIndex -> // build an attribute for each metadata
			String s = getCellValueAsString(row.getCell(colIndex))
			if (s == null) s = ""
			String attrValue = s.replaceAll("\n", ";").trim()
			writer.writeAttribute(normalizedHeaders[colIndex], attrValue)
			if (colIndex in dateColumnsIndex) { // also split date attributes in day+month+year attributes
				def dateMatcher = (attrValue =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
				// Blank or non-date cells simply get no date attributes instead of
				// aborting the whole conversion.
				if (dateMatcher.find()) {
					Date day = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(dateMatcher.group(0))
					writer.writeAttribute(normalizedHeaders[colIndex]+"jour", dateMatcher.group(1))
					writer.writeAttribute(normalizedHeaders[colIndex]+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(day))
					writer.writeAttribute(normalizedHeaders[colIndex]+"mois", dateMatcher.group(2))
					writer.writeAttribute(normalizedHeaders[colIndex]+"annee", dateMatcher.group(3))
				}
			}
		}
		writer.writeCharacters("\n")

		// Metadata repeated as a list inside the unit
		writer.writeCharacters("    ")
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")

		metadataColumnIndex.each { colIndex ->
			writer.writeStartElement("item")
			writer.writeCharacters(headers[colIndex]+" : "+getCellValueAsString(row.getCell(colIndex)).replaceAll("\n", ";"))
			writer.writeEndElement() // item
			writer.writeCharacters("\n")
		}
		writer.writeCharacters("    ")
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")

		textColumnIndex.each { colIndex ->

			writer.writeCharacters("    ")
			writer.writeStartElement(normalizedHeaders[colIndex])
			writer.writeStartElement("p")
			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(headers[colIndex]+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head
			String text = getCellValueAsString(row.getCell(colIndex))

			// A cell starting with "ABC: some text ;" style entries is split into
			// one <descripteur type="ABC"> element per entry; any other cell is
			// written as plain text.
			if (text ==~ /(?s)^[A-Z]{3}: [^;\n]+? +[;\n].*/) {
				text.findAll( /(?s)[A-Z]{3}: ([^;\n]+?) +[;\n]/ ).each { desc ->
					writer.writeStartElement("descripteur")
					def descMatcher = (desc =~ /(?s)([A-Z]{3}): ([^;\n]+?) +[;\n]/)
					writer.writeAttribute("type", descMatcher[0][1])
					writer.writeCharacters(descMatcher[0][2])
					writer.writeEndElement() // descripteur
				}
			} else {
				writer.writeCharacters(text)
			}
			writer.writeEndElement() // p
			writer.writeEndElement() // textColumn
			writer.writeCharacters("\n")
		}

		writer.writeCharacters("  ")
		writer.writeEndElement() // textTag
		writer.writeCharacters("\n")
	}

	writer.writeEndElement() // rootTag
	writer.writeCharacters("\n")

	// Close explicitly on the success path so that flush/close errors are
	// reported; the references are cleared to avoid a second close in finally.
	writer.close()
	writer = null
	output.close()
	output = null
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace()
	return false
} finally {
	// Best-effort cleanup: a failure while closing must not hide the original
	// outcome of the conversion.
	[writer, output, wb].each { resource ->
		try {
			resource?.close()
		} catch (Exception ignored) {
			// nothing more can be done for a resource that fails to close
		}
	}
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/CreateTheOtherTurns.groovy (revision 3041) | ||
---|---|---|
130 | 130 |
writer.writeEndElement() // w |
131 | 131 |
} |
132 | 132 |
|
133 |
def startOtherReg = /^(.*)\*([^ ]+.*)$/
|
|
134 |
def endOtherReg = /^(.*[^ ]+)\*(.*)$/
|
|
133 |
def startOtherReg = /^(.*)\*([^\p{Zs}]+.*)$/
|
|
134 |
def endOtherReg = /^(.*[^\p{Zs}]+)\*(.*)$/
|
|
135 | 135 |
String previousOtherStarting = "<none>" |
136 | 136 |
@Override |
137 | 137 |
protected void processEndElement() throws XMLStreamException { |
... | ... | |
142 | 142 |
def m1 = word =~ startOtherReg |
143 | 143 |
|
144 | 144 |
if (m1.matches()) { |
145 |
if (debug) println "OPEN OTHER at $word" |
|
146 | 145 |
if (other) { |
147 |
println "Warning: found a starting * when one 'other' is already started at "+getLocation() |
|
148 |
println "Previous starting 'other' at "+previousOtherStarting |
|
146 |
println "Warning at "+getLocation()+" with $word: Found a starting * when one 'other' have been started at "+previousOtherStarting |
|
149 | 147 |
} else { |
148 |
if (debug) println "- ligne "+parser.getLocation().getLine()+" : ouverture de other avec '$word' -> tours '$turnInfos'" |
|
150 | 149 |
//close current Turn and start a 'other' Turn |
151 | 150 |
previousOtherStarting = ["word="+word+ " location="+getLocation()] |
152 | 151 |
String group1 = m1.group(1) |
... | ... | |
182 | 181 |
boolean shouldCloseOtherTurn = false; |
183 | 182 |
def m2 = word =~ endOtherReg |
184 | 183 |
if (m2.matches()) { |
185 |
if (debug) println "DETECT END OTHER at $word"
|
|
184 |
if (debug) println "- ligne "+parser.getLocation().getLine()+" : fermeture de other avec '$word' -> tours '$turnInfos'"
|
|
186 | 185 |
previousOtherStarting = ["word="+word+ " location="+getLocation()] |
187 | 186 |
if (other) { |
188 | 187 |
shouldCloseOtherTurn = true; |
... | ... | |
190 | 189 |
word = m2.group(1) |
191 | 190 |
other = false |
192 | 191 |
} else { |
193 |
println "Warning: found a ending * when one 'other' is not started at "+getLocation() |
|
194 |
println "Previous closing 'other' Turn at "+previousOtherStarting |
|
192 |
println "Warning at "+getLocation()+" with $word: Found a closing * when one 'other' have been closed at "+previousOtherStarting |
|
195 | 193 |
} |
196 | 194 |
} |
197 | 195 |
|
198 |
// if ("XXX".equals(word)) { // <Event desc="XXX" type="unknown" extent="next"/> |
|
199 |
// writer.writeStartElement("event") // start the initial word |
|
200 |
// writer.writeAttribute("desc", "XXX from "+wInfos["start"] + " to "+wInfos["end"]) |
|
201 |
// writer.writeAttribute("type", "unknown") |
|
202 |
// writer.writeAttribute("extent", "instantaneous") |
|
203 |
// writer.writeEndElement() // event |
|
204 |
// word = "" // don't write the word |
|
205 |
// } |
|
206 |
|
|
207 | 196 |
if (word.length() > 0) { |
208 | 197 |
writeWord(word) |
209 | 198 |
} |
210 | 199 |
|
211 | 200 |
if (shouldCloseOtherTurn) { |
212 |
if (debug) println "CLOSE OTHER at $word" |
|
201 |
|
|
213 | 202 |
shouldCloseOtherTurn = false; |
214 | 203 |
//close the current 'other' Turn and restart the actual Turn |
215 | 204 |
writer.writeCharacters("\n") |
... | ... | |
235 | 224 |
} |
236 | 225 |
} |
237 | 226 |
} |
238 |
|
|
239 |
public static void main(String[] args) { |
|
240 |
File infile = new File("/home/mdecorde/xml/vocapia","test.trs") |
|
241 |
File outfile = new File("/home/mdecorde/xml/vocapia","test-fixed.trs") |
|
242 |
def processor = new FixTranscription(infile, true) |
|
243 |
println processor.process(outfile) |
|
244 |
} |
|
245 | 227 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/PrepareTranscriptionsMacro.groovy (revision 3041) | ||
---|---|---|
39 | 39 |
|
40 | 40 |
resultDirectory.mkdirs(); |
41 | 41 |
|
42 |
println "PREPARING VOCAPIA FILES TO TRS FILES..."
|
|
42 |
println "CONVERTING VOCAPIA FILES TO TRS FILES..."
|
|
43 | 43 |
File trsDirectory = new File(resultDirectory, "vocapia2trs") |
44 | 44 |
trsDirectory.mkdir() |
45 |
gse.runMacro(org.txm.macro.transcription.Vocapia2TranscriberMacro, ["vocapiaDirectory":vocapiaDirectory, "resultDirectory":trsDirectory]) |
|
45 |
gse.runMacro(org.txm.macro.transcription.Vocapia2TranscriberMacro, ["vocapiaDirectory":vocapiaDirectory, "resultDirectory":trsDirectory, "debug":debug])
|
|
46 | 46 |
|
47 | 47 |
|
48 |
println "DETECTING MARKED SECTIONS..."
|
|
48 |
println "SPOTTING SECTION MARKS..."
|
|
49 | 49 |
|
50 | 50 |
trsFiles = trsDirectory.listFiles().findAll(){it.getName().toLowerCase().endsWith(".trs")} |
51 | 51 |
if (trsFiles.size() == 0) { |
... | ... | |
58 | 58 |
gse.runMacro(org.txm.macro.transcription.SegmentTRSInSectionFromMarkerMacro, ["trsDirectory":trsDirectory, "resultDirectory":sectionsDirectory, "newSectionMarker":newSectionMarker, "debug":debug]) |
59 | 59 |
|
60 | 60 |
|
61 |
println "CREATING THE 'OTHER' TURNS..."
|
|
61 |
println "SPOTTING 'OTHER' TURNS..."
|
|
62 | 62 |
|
63 | 63 |
trsFiles = sectionsDirectory.listFiles().findAll(){it.getName().toLowerCase().endsWith(".trs")} |
64 | 64 |
if (trsFiles.size() == 0) { |
... | ... | |
70 | 70 |
otherDirectory.mkdir() |
71 | 71 |
cpb = new ConsoleProgressBar(trsFiles.size()) |
72 | 72 |
for (File file : trsFiles) { |
73 |
cpb.tick() |
|
74 | 73 |
|
74 |
if (debug) println "== $file ==" |
|
75 |
else cpb.tick() |
|
76 |
|
|
75 | 77 |
CreateTheOtherTurns fixer = new CreateTheOtherTurns(file, primarySpeakerIdRegex, otherNonPrimarySpeakerId, debug) |
76 | 78 |
String name = FileUtils.stripExtension(file) |
77 | 79 |
File outFile = new File(otherDirectory, name+".trs") |
... | ... | |
83 | 85 |
} |
84 | 86 |
cpb.done() |
85 | 87 |
|
88 |
println "CONVERTIGN WORD MARKERS TO EVENTS..." |
|
86 | 89 |
|
87 |
println "NORMALIZING LOCUTORS..." |
|
88 |
|
|
89 | 90 |
trsFiles = otherDirectory.listFiles().findAll(){it.getName().toLowerCase().endsWith(".trs")} |
90 | 91 |
if (trsFiles.size() == 0) { |
92 |
println "No XML file found in $trsDirectory" |
|
93 |
return false |
|
94 |
} |
|
95 |
|
|
96 |
File eventsDirectory = new File(resultDirectory, "events") |
|
97 |
eventsDirectory.mkdir() |
|
98 |
gse.runMacro(org.txm.macro.transcription.AddEventsFromWordMarkersMacro, ["trsDirectory":otherDirectory, "resultDirectory":eventsDirectory, "wordElement": "w", "newEventMarker":"XXX", "eventDescription":"termes incompréhensibles ou inaudibles", "eventType":"pi", "eventExtent": "instantaneous", "debug":debug]) |
|
99 |
|
|
100 |
|
|
101 |
println "NORMALIZING SPEAKER IDs..." |
|
102 |
|
|
103 |
trsFiles = eventsDirectory.listFiles().findAll(){it.getName().toLowerCase().endsWith(".trs")} |
|
104 |
if (trsFiles.size() == 0) { |
|
91 | 105 |
println "No XML file found in $otherDirectory" |
92 | 106 |
return false |
93 | 107 |
} |
94 | 108 |
|
95 | 109 |
cpb = new ConsoleProgressBar(trsFiles.size()) |
96 | 110 |
for (File file : trsFiles) { |
97 |
cpb.tick() |
|
111 |
if (debug) println "== $file ==" |
|
112 |
else cpb.tick() |
|
98 | 113 |
|
99 | 114 |
File outFile = new File(resultDirectory, file.getName()) |
100 | 115 |
RecodeSpeakers fixer = new RecodeSpeakers(file, outFile, nonPrimarySpeakerIdRegex, null, otherNonPrimarySpeakerId, otherNonPrimarySpeakerId) |
116 |
fixer.debug = debug |
|
101 | 117 |
|
102 | 118 |
if (!fixer.process()) { |
103 | 119 |
println "WARNING: ERROR WHILE PROCESSING: "+file |
... | ... | |
110 | 126 |
otherDirectory.deleteDir() |
111 | 127 |
trsDirectory.deleteDir() |
112 | 128 |
sectionsDirectory.deleteDir() |
129 |
eventsDirectory.deleteDir() |
|
113 | 130 |
} |
114 | 131 |
|
115 | 132 |
println "Done: "+trsFiles.size()+" files processed. Result files in $resultDirectory" |
Formats disponibles : Unified diff