Révision 3041

tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/RecodeSpeakers.groovy (revision 3041)
26 26
	
27 27
	def idRegex, nameRegex
28 28
	String newId, newName
29
	boolean debug
29 30
	
30 31
	/**
31 32
	 * Instantiates a new change speaker.
......
84 85
					if (newName != null) {
85 86
						elem.setAttribute("name", newName);
86 87
					}
88
					// Debug trace of the recoding; newName is interpolated (it was printed as the literal text "newName")
					if (debug) println "Recoding id $id -> $newId and name $name -> $newName"
87 89
				}
88 90
				
89 91
				if (nameRegex != null && id =~ nameRegex) { // patch Speaker@name
......
98 100
			
99 101
			// fix speaker turns
100 102
			
103
			int nReplace = 0
101 104
			for (def node : nodes) {
102 105
				if (node == null) continue;
103 106
				
......
106 109
				
107 110
				if (replacedIds.contains(id)) {
108 111
					elem.setAttribute("speaker", newId);
112
					nReplace++
109 113
				}
110 114
			}
111 115
			
116
			if (debug) println "$nReplace replacements"
117
			
112 118
			// Création de la source DOM
113 119
			Source source = new DOMSource(doc);
114 120
			
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2Transcriber.groovy (revision 3041)
229 229
	}
230 230
	
231 231
	public static void main(String[] args) {
232
		File infile = new File("/home/mdecorde/xml/vocapia","test.xml")
233
		File outfile = new File("/home/mdecorde/xml/vocapia","test.trs")
232
		File infile = new File("/home/mdecorde/xml/vocapia/testé input","test.xml")
233
		File outfile = new File("/home/mdecorde/xml/vocapia/testé oh","testé.trs")
234
		outfile.getParentFile().mkdir()
234 235
		def processor = new Vocapia2Transcriber(infile)
235
		println processor.process(outfile, true)
236
		println processor.process(outfile)
236 237
	}
237 238
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/AddEventsFromWordMarkersMacro.groovy (revision 3041)
1
package org.txm.macro.transcription
2

  
3
import java.nio.charset.Charset
4

  
5
import java.time.LocalTime
6
import java.time.format.DateTimeFormatter
7
import org.txm.utils.*
8
import org.txm.utils.logger.*
9

  
10
@Field @Option(name="trsDirectory", usage="Dossier qui contient les fichiers TRS", widget="Folder", required=true, def="")
11
		File trsDirectory;
12

  
13
@Field @Option(name="resultDirectory", usage="Dossier résultat TRS", widget="Folder", required=true, def="")
14
		File resultDirectory;
15
		
16
@Field @Option(name="wordElement", usage="The marker, spaces included", widget="String", required=true, def="w")
17
		def wordElement
18

  
19
@Field @Option(name="newEventMarker", usage="The marker, spaces included", widget="String", required=true, def="")
20
		def newEventMarker
21
		
22
@Field @Option(name="eventType", usage="The marker, spaces included", widget="String", required=true, def="")
23
		def eventType
24
		
25
@Field @Option(name="eventDescription", usage="The marker, spaces included", widget="String", required=false, def="")
26
		def eventDescription
27

  
28

  
29
@Field @Option(name="eventExtent", usage="The marker, spaces included", widget="String", required=true, def="instantaneous")
30
		def eventExtent
31
		
32
@Field @Option(name="debug", usage="activate debug messages", widget="Boolean", required=true, def="false")
33
		boolean debug
34

  
35
if (!ParametersDialog.open(this)) return;

if (!trsDirectory.exists()) {
	println "$trsDirectory not found"
	return
}

println "Writing result to $resultDirectory..."
// Pattern that the whole text of a word element must match for the word to be replaced by an event
def newEventMarkerRegex = /$newEventMarker/
try {
	// listFiles() returns null when trsDirectory is not a listable directory
	def trsFiles = (trsDirectory.listFiles() ?: []).findAll() { it.getName().toLowerCase().endsWith(".trs") }
	
	if (trsFiles.size() == 0) {
		println "No TRS file to process in $trsDirectory"
		return;
	}
	
	// Create the result directory (missing parents included) once, before any file is written
	if (!resultDirectory.exists() && !resultDirectory.mkdirs()) {
		println "Error: could not create $resultDirectory"
		return
	}
	
	ConsoleProgressBar cpb = new ConsoleProgressBar(trsFiles.size())
	for (File trsFile : trsFiles) {
		// the progress bar is only ticked when debug traces are off
		if (debug) println "== $trsFile =="
		else cpb.tick()
		
		// Open input file
		def slurper = new groovy.util.XmlParser(false, true, true);
		//slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false) // allow DTD declaration
		//slurper.setProperty("http://javax.xml.XMLConstants/property/accessExternalDTD", "all"); // allow to read DTD from local file
		def trs = slurper.parse(trsFile.toURI().toString())
		def trsEpisodes = trs.Episode // 1
		if (trsEpisodes.size() > 1) {
			println "Error: multiple Episode node in $trsFile"
			continue
		}
		
		def turns = trs.Episode.Section.Turn
		
		for (int iTurn = 0 ; iTurn < turns.size() ; iTurn++) {
			def turn = turns[iTurn]
			def children = turn.children()
			for (int i = 0 ; i < children.size() ; i++) {
				def w = children[i]
				// XmlParser stores text children as plain Strings, which have no name(): only element nodes can be words
				if (!(w instanceof Node)) continue
				
				if (wordElement.equals(w.name())) {
					String wContent = w.text()
					if (wContent ==~ newEventMarkerRegex) {
						if (debug) println "Create event $w with $wContent"
						// the word element is replaced in place by an <event> carrying the macro parameters
						Node replace = new Node(null, "event");
						replace.@type = eventType
						replace.@desc = eventDescription
						replace.@extent = eventExtent
						w.replaceNode(replace)
					}
				}
			}
		}
		
		// Write the patched transcription under the same file name in the result directory
		File outfile = new File(resultDirectory, trsFile.getName())
		outfile.withWriter("UTF-8") { writer ->
			writer.write('<?xml version="1.0" encoding="UTF-8"?>\n')
			def printer = new groovy.util.XmlNodePrinter(new PrintWriter(writer))
			printer.setPreserveWhitespace(true)
			printer.print(trs)
		}
	}
	cpb.done()
	
	println "Done."
	
} catch(Exception e) {
	println "Error: "+e
	Log.printStackTrace(e)
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/AddSectionsFromTableMacro.groovy (revision 3041)
34 34
@Field @Option(name="metadataColumnsGroups", usage="Colonnes des gruopes de metadonnées de section", widget="String", required=true, def="metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;secondary;secondary;secondary;secondary;secondary;secondary;secondary;text;text;text;text")
35 35
def metadataColumnsGroups
36 36

  
37
// Boolean option: as a String, the default "false" is a non-empty string and therefore truthy in `if (debug)`
@Field @Option(name="debug", usage="show debug messages", widget="Boolean", required=true, def="false")
boolean debug
39

  
37 40
if (!ParametersDialog.open(this)) return;
38 41
typeColumns = typeColumns.split(";")
39 42
topicColumns = topicColumns.split(";")
......
160 163
	
161 164
	ConsoleProgressBar cpb = new ConsoleProgressBar(sectionGroupsToInsert.keySet().size())
162 165
	for (String id : sectionGroupsToInsert.keySet()) {
163
		cpb.tick()
166
		if (debug) println "== $id =="
167
		else cpb.tick()
168
			
164 169
		File trsFile = new File(trsDirectory, id+".trs")
165 170
		if (!trsFile.exists()) {
166 171
			continue
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/SegmentTRSInSectionFromMarkerMacro.groovy (revision 3041)
58 58
	
59 59
	ConsoleProgressBar cpb = new ConsoleProgressBar(trsFiles.size())
60 60
	for (File trsFile : trsFiles) {
61
		cpb.tick()
62 61
		
62
		if (debug) println "== $trsFile =="
63
		else cpb.tick()
64
		
63 65
		// Open input file
64 66
		def slurper = new groovy.util.XmlParser(false, true, true);
65 67
		slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false) // allow DTD declaration
......
91 93
			def start = Float.parseFloat(turn.@startTime)
92 94
			def end = Float.parseFloat(turn.@endTime)
93 95
			
94
			if (debug) println "TURN: "
96
			//if (debug) println "TURN: "
95 97
			
96 98
			def children = turn.children()
97 99
			for (int i = 0 ; i < children.size() ; i++) {
......
106 108
				}
107 109
				
108 110
				if (content.equals(newSectionMarker)) {
109
					if (debug) println "NEW SECTION !"
111
					if (debug) println "New section at $turn with $node child node"
110 112
					previousSection = currentSection
111 113
					currentSection = new Node(trsEpisode, "Section", new LinkedHashMap(["type":newSectionMarker, "startTime":turn.@startTime, "endTime":previousSection.attributes()["endTime"]]))
112 114
					
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2TranscriberMacro.groovy (revision 3041)
14 14
@Field @Option(name="resultDirectory", usage="The result directory", widget="Folder", required=false, def="")
15 15
		File resultDirectory;
16 16

  
17
// When set, the name of each processed file is printed instead of ticking the progress bar
@Field @Option(name="debug", usage="Show debug messages", widget="Boolean", required=false, def="false")
		boolean debug;
17 19

  
18 20
if (!ParametersDialog.open(this)) return;
19 21

  
......
40 42

  
41 43
ConsoleProgressBar cpb = new ConsoleProgressBar(xmlFiles.size())
42 44
for (File xmlFile : xmlFiles) {
43
	cpb.tick()
45
	
46
	if (debug) println "== $xmlFile =="
47
	else cpb.tick()
48
		
44 49
	Vocapia2Transcriber v2t = new Vocapia2Transcriber(xmlFile)
45 50
	String name = FileUtils.stripExtension(xmlFile)
46 51
	File outFile = new File(resultDirectory, name+".trs")
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/text/ExcelDir2XMLMacro.groovy (revision 3041)
1
package org.txm.macro.text
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcpapplication.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10

  
11
import org.apache.poi.ss.usermodel.*
12
import org.apache.poi.hssf.usermodel.*
13
import org.apache.poi.xssf.usermodel.*
14
import org.apache.poi.ss.util.*
15

  
16
@Field @Option(name="inputDirectory", usage="répertoire des fichiers Excel à traiter", widget="Folder", required=true, def="")
17
def inputDirectory
18

  
19
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
20
def sheetName
21

  
22
@Field @Option(name="rootTag", usage="Root tag name", widget="String", required=false, def="root")
23
def rootTag
24

  
25
@Field @Option(name="textTag", usage="Text tag name", widget="String", required=false, def="unit")
26
def textTag
27

  
28
@Field @Option(name="metadataColumnList", usage="metadataColumnList column list separated with comma", widget="String", required=false, def="meta1,meta2")
29
def metadataColumnList
30

  
31
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
32
def dateColumnList
33

  
34
@Field @Option(name="textColumnList", usage="textColumnList column list separated with comma", widget="String", required=false, def="textColumnList1,textColumnList2")
35
def textColumnList
36

  
37
if (!ParametersDialog.open(this)) return

if (!inputDirectory.exists()) {
	println "** ExcelDir2XML: no '"+inputDirectory.name+"' directory found. Aborting."
	return false
}

if (!inputDirectory.canRead()) {
	println "** ExcelDir2XML: '"+inputDirectory.name+"' directory not readable. Aborting."
	return false
}

// Collect the Excel files of the directory; the dot is escaped so that only names ending in ".xlsx" are kept
def f = []
inputDirectory.eachFileMatch(~/.*\.xlsx/) { f << it }

if (f.size() == 0) {
	println "** ExcelDir2XML: no .xlsx file found. Aborting."
	return false
}

try {
	// Run Excel2XMLMacro on each file, in file name order, with the parameters of this macro
	f.sort { it.name }.each { inputFile ->
		def res = gse.run(Excel2XMLMacro, ["args":[
				"inputFile":inputFile,
				"sheetName":sheetName,
				"metadataColumnList":metadataColumnList,
				"dateColumnList":dateColumnList,
				"textColumnList":textColumnList,
				"rootTag":rootTag,
				"textTag":textTag,
				
				"selection":selection,
				"selections":selections,
				"corpusViewSelection":corpusViewSelection,
				"corpusViewSelections":corpusViewSelections,
				"monitor":monitor]])
		if (!res) println "** problem calling Excel2XMLMacro."
	}
} catch (Exception e) {
	println "** ExcelDir2XML: unable to read input files. Aborting."
	println e.getLocalizedMessage()
	// printStackTrace() returns nothing: wrapping it in println only added a "null" line
	e.printStackTrace()
	return false
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/text/Excel2XMLMacro.groovy (revision 3041)
1
package org.txm.macro.text
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcpapplication.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10
import java.text.SimpleDateFormat
11

  
12
import org.apache.poi.ss.usermodel.*
13
import org.apache.poi.hssf.usermodel.*
14
import org.apache.poi.xssf.usermodel.*
15
import org.apache.poi.ss.util.*
16

  
17
// Turns a column header into a lowercase identifier: AsciiUtils.convertNonAscii() then
// AsciiUtils.buildAttributeId() (presumably yields a valid XML name -- confirm in AsciiUtils)
def stringToIndent = { str ->
	def converted = org.txm.utils.AsciiUtils.convertNonAscii(str)
	def identifier = org.txm.utils.AsciiUtils.buildAttributeId(converted)
	return identifier.toLowerCase()
}
18

  
19
// from http://www.java-connect.com/apache-poi-tutorials/read-all-type-of-excel-cell-value-as-string-using-poi
20
/**
 * Returns the content of an Excel cell as a String.
 *
 * String cells give their text, date-formatted numeric cells are rendered as dd/MM/yyyy,
 * other numeric cells are rendered without decimals when the value is integral,
 * boolean cells give "true"/"false". A null cell, a blank cell or any other cell type gives "".
 *
 * All temporaries are local: they used to be stored in the script binding, where
 * `value` is also a name used by the main script.
 */
def getCellValueAsString = { cell ->
	String strCellValue = null
	if (cell != null) {
		switch (cell.getCellType()) {
			case Cell.CELL_TYPE_STRING:
				strCellValue = cell.toString()
				break
			case Cell.CELL_TYPE_NUMERIC:
				if (DateUtil.isCellDateFormatted(cell)) {
					SimpleDateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy")
					strCellValue = dateFormat.format(cell.getDateCellValue())
				} else {
					double numericValue = cell.getNumericCellValue()
					if (numericValue == Math.rint(numericValue) && !Double.isInfinite(numericValue)) {
						// integral value: no ".0" suffix
						strCellValue = String.valueOf((long) numericValue)
					} else {
						// keep the fractional part instead of silently truncating it
						strCellValue = String.valueOf(numericValue)
					}
				}
				break
			case Cell.CELL_TYPE_BOOLEAN:
				strCellValue = String.valueOf(cell.getBooleanCellValue())
				break
			case Cell.CELL_TYPE_BLANK:
				strCellValue = ""
				break
		}
	}
	if (strCellValue == null) strCellValue = ""
	return strCellValue
}
48

  
49
@Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
50
File inputFile
51

  
52
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
53
def sheetName
54

  
55
@Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
56
def rootTag
57

  
58
@Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
59
def textTag
60

  
61
@Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
62
def metadataColumnList
63

  
64
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
65
def dateColumnList
66

  
67
@Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
68
def textColumnList
69

  
70
if (!ParametersDialog.open(this)) return

if (!inputFile.exists()) {
	println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
	return false
}

if (!inputFile.canRead()) {
	println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
	return false
}

try {
	// Turn the comma separated parameters into lists of trimmed column names
	metadataColumnList = metadataColumnList.split(",").collect { it.trim() }
	dateColumnList = dateColumnList.split(",").collect { it.trim() }
	textColumnList = textColumnList.split(",").collect { it.trim() }
	textTag = textTag.trim()
	rootTag = rootTag.trim()

	def wb = WorkbookFactory.create(inputFile)

	// Use the named sheet, or the first one when no name is given
	def ws
	if (sheetName.length() == 0) {
		ws = wb.getSheetAt(0)
	} else {
		ws = wb.getSheet(sheetName)
		if (ws == null) {
			println "** Excel2XML: no '"+sheetName+"' found. Aborting."
			return false
		}
	}

	if (ws == null) {
		println "** Excel2XML: no sheet found. Aborting."
		return false
	}

	def nRows = ws.getPhysicalNumberOfRows()
	println nRows+" rows."

	// The first row holds the column headers
	def firstRow = ws.getRow(0)
	if (firstRow == null) {
		println "** Excel2XML: no header row found. Aborting."
		return false
	}
	def colMax = firstRow.getLastCellNum()
	if (colMax <= 0) {
		println "** Excel2XML: the header row is empty. Aborting."
		return false
	}

	def headers = (0..<colMax).collect { getCellValueAsString(firstRow.getCell(it)) }

	println "Headers: $headers"

	def normalizedHeaders = headers.collect { stringToIndent(it) }

	// Every metadata and text column must be present in the headers
	def ok = true
	metadataColumnList.each { m ->
		if (!headers.contains(m)) {
			println "** Excel2XML: missing metadataColumnList column: $m"
			ok = false
		}
	}
	textColumnList.each { t ->
		if (!headers.contains(t)) {
			println "** Excel2XML: missing textColumnList column: $t"
			ok = false
		}
	}

	if (!ok) { return false }

	def metadataColumnIndex = metadataColumnList.collect { headers.indexOf(it) }
	def dateColumnsIndex = dateColumnList.collect { headers.indexOf(it) }
	def textColumnIndex = textColumnList.collect { headers.indexOf(it) }

	println "metadataColumnList = "+metadataColumnList
	println "metadataColumnIndex = "+metadataColumnIndex

	println "dateColumnList = "+dateColumnList
	println "dateColumnsIndex = "+dateColumnsIndex

	println "textColumnList = "+textColumnList
	println "textColumnIndex = "+textColumnIndex

	// The result file is written next to the input file, with the .xml extension
	def name = inputFile.getName()
	def idx = name.lastIndexOf(".")

	if (idx > 0) name = name.substring(0, idx)
	def outputFile = new File(inputFile.getParentFile(), name+".xml")

	def factory = XMLOutputFactory.newInstance()
	// withOutputStream closes the file even when an exception interrupts the conversion
	outputFile.withOutputStream { output ->
		def writer = factory.createXMLStreamWriter(output, "UTF-8")

		writer.writeStartDocument("UTF-8","1.0")
		writer.writeCharacters("\n") // simple XML formating
		writer.writeStartElement(rootTag)
		writer.writeCharacters("\n")

		int pageNumber = 1

		// Plain index loop: the range (1..nRows-1) runs backwards (rows 1 then 0) on a header-only sheet.
		// Rows missing from the sheet (getRow() == null) are skipped.
		int lastRowIndex = ws.getLastRowNum()
		for (int rowIndex = 1 ; rowIndex <= lastRowIndex ; rowIndex++) {
			def row = ws.getRow(rowIndex)
			if (row == null) continue

			writer.writeCharacters("  ")
			writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
			writer.writeAttribute("n", ""+pageNumber++)
			writer.writeCharacters("\n") // simple XML formating

			writer.writeCharacters("  ")
			writer.writeStartElement(textTag)
			metadataColumnIndex.each { colIndex -> // build an attribute for each metadata
				String s = getCellValueAsString(row.getCell(colIndex));
				if (s == null) s ="";
				def value = s.replaceAll("\n", ";").trim()
				writer.writeAttribute(normalizedHeaders[colIndex], value)
				if (colIndex in dateColumnsIndex) { // also split date attributes in day+month+year attributes
					def matches = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
					if (matches.find()) {
						def prefix = normalizedHeaders[colIndex]
						writer.writeAttribute(prefix+"jour", matches.group(1))
						writer.writeAttribute(prefix+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(Date.parse("dd/MM/yyyy", matches.group(0))))
						writer.writeAttribute(prefix+"mois", matches.group(2))
						writer.writeAttribute(prefix+"annee", matches.group(3))
					} else {
						// an empty or malformed date no longer aborts the whole conversion
						println "** Excel2XML: row "+(rowIndex+1)+", column '"+headers[colIndex]+"': '"+value+"' is not a dd/MM/yyyy date, date attributes not written."
					}
				}
			}
			writer.writeCharacters("\n")

			writer.writeCharacters("    ")
			writer.writeStartElement("metadata")
			writer.writeStartElement("list")
			writer.writeAttribute("type", "unordered")
			writer.writeCharacters("\n")

			metadataColumnIndex.each { colIndex ->
				writer.writeStartElement("item")
				writer.writeCharacters(headers[colIndex]+" : "+getCellValueAsString(row.getCell(colIndex)).replaceAll("\n", ";"))
				writer.writeEndElement() // item
				writer.writeCharacters("\n")
			}
			writer.writeCharacters("    ")
			writer.writeEndElement() // list
			writer.writeEndElement() // metadata
			writer.writeCharacters("\n")

			textColumnIndex.each { colIndex ->

				writer.writeCharacters("    ")
				writer.writeStartElement(normalizedHeaders[colIndex])
				writer.writeStartElement("p")
				writer.writeStartElement("head")
				writer.writeStartElement("hi")
				writer.writeCharacters(headers[colIndex]+" : ")
				writer.writeEndElement() // hi
				writer.writeEndElement() // head
				def value = getCellValueAsString(row.getCell(colIndex))

				// a cell made of "XXX:  text ;" entries is split into one <descripteur type="XXX"> per entry
				if (value ==~ /(?s)^[A-Z]{3}:  [^;\n]+? +[;\n].*/) {
					value.findAll( /(?s)[A-Z]{3}:  ([^;\n]+?) +[;\n]/ ).each { desc ->
						writer.writeStartElement("descripteur")
						def matches = (desc =~ /(?s)([A-Z]{3}):  ([^;\n]+?) +[;\n]/)
						writer.writeAttribute("type", matches[0][1])
						writer.writeCharacters(matches[0][2])
						writer.writeEndElement() // descripteur
					}
				} else {
					writer.writeCharacters(value)
				}
				writer.writeEndElement() // p
				writer.writeEndElement() // textColumn
				writer.writeCharacters("\n")
			}

			writer.writeCharacters("  ")
			writer.writeEndElement() // textTag
			writer.writeCharacters("\n")
		}

		writer.writeEndElement() // rootTag
		writer.writeCharacters("\n")
		writer.close()
	}
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	// printStackTrace() returns nothing: wrapping it in println only added a "null" line
	e.printStackTrace()
	return false
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/ExcelDir2XMLMacro.groovy (revision 3041)
1
package org.txm.macro.table
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcpapplication.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10

  
11
import org.apache.poi.ss.usermodel.*
12
import org.apache.poi.hssf.usermodel.*
13
import org.apache.poi.xssf.usermodel.*
14
import org.apache.poi.ss.usermodel.WorkbookFactory
15
import org.apache.poi.ss.util.*
16

  
17
@Field @Option(name="inputDirectory", usage="répertoire des fichiers Excel à traiter", widget="Folder", required=true, def="")
18
def inputDirectory
19

  
20
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
21
def sheetName
22

  
23
@Field @Option(name="rootTag", usage="Root tag name", widget="String", required=false, def="root")
24
def rootTag
25

  
26
@Field @Option(name="textTag", usage="Text tag name", widget="String", required=false, def="unit")
27
def textTag
28

  
29
@Field @Option(name="metadataColumnList", usage="metadataColumnList column list separated with comma", widget="String", required=false, def="meta1,meta2")
30
def metadataColumnList
31

  
32
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
33
def dateColumnList
34

  
35
@Field @Option(name="textColumnList", usage="textColumnList column list separated with comma", widget="String", required=false, def="textColumnList1,textColumnList2")
36
def textColumnList
37

  
38
if (!ParametersDialog.open(this)) return

if (!inputDirectory.exists()) {
	println "** ExcelDir2XML: no '"+inputDirectory.name+"' directory found. Aborting."
	return false
}

if (!inputDirectory.canRead()) {
	println "** ExcelDir2XML: '"+inputDirectory.name+"' directory not readable. Aborting."
	return false
}

// Collect the Excel files of the directory; the dot is escaped so that only names ending in ".xlsx" are kept
def f = []
inputDirectory.eachFileMatch(~/.*\.xlsx/) { f << it }

if (f.size() == 0) {
	println "** ExcelDir2XML: no .xlsx file found. Aborting."
	return false
}

try {
	// Run Excel2XMLMacro on each file, in file name order, with the parameters of this macro
	f.sort { it.name }.each { inputFile ->
		def res = gse.run(Excel2XMLMacro, ["args":[
				"inputFile":inputFile,
				"sheetName":sheetName,
				"metadataColumnList":metadataColumnList,
				"dateColumnList":dateColumnList,
				"textColumnList":textColumnList,
				"rootTag":rootTag,
				"textTag":textTag,
				
				"selection":selection,
				"selections":selections,
				"corpusViewSelection":corpusViewSelection,
				"corpusViewSelections":corpusViewSelections,
				"monitor":monitor]])
		if (!res) println "** problem calling Excel2XMLMacro."
	}
} catch (Exception e) {
	println "** ExcelDir2XML: unable to read input files. Aborting."
	println e.getLocalizedMessage()
	// printStackTrace() returns nothing: wrapping it in println only added a "null" line
	e.printStackTrace()
	return false
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Excel2XMLMacro.groovy (revision 3041)
1
package org.txm.macro.table
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcpapplication.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10
import java.text.SimpleDateFormat
11

  
12
import org.apache.poi.ss.usermodel.*
13
import org.apache.poi.hssf.usermodel.*
14
import org.apache.poi.xssf.usermodel.*
15
import org.apache.poi.ss.usermodel.WorkbookFactory
16
import org.apache.poi.ss.util.*
17

  
18
// Turns a column header into a lowercase identifier: AsciiUtils.convertNonAscii() then
// AsciiUtils.buildAttributeId() (presumably yields a valid XML name -- confirm in AsciiUtils)
def stringToIndent = { str ->
	def converted = org.txm.utils.AsciiUtils.convertNonAscii(str)
	def identifier = org.txm.utils.AsciiUtils.buildAttributeId(converted)
	return identifier.toLowerCase()
}
19

  
20
// from http://www.java-connect.com/apache-poi-tutorials/read-all-type-of-excel-cell-value-as-string-using-poi
21
/**
 * Returns the content of an Excel cell as a String.
 *
 * String cells give their text, date-formatted numeric cells are rendered as dd/MM/yyyy,
 * other numeric cells are rendered without decimals when the value is integral,
 * boolean cells give "true"/"false". A null cell, a blank cell or any other cell type gives "".
 *
 * All temporaries are local: they used to be stored in the script binding, where
 * `value` is also a name used by the main script.
 */
def getCellValueAsString = { cell ->
	String strCellValue = null
	if (cell != null) {
		switch (cell.getCellType()) {
			case CellType.STRING:
				strCellValue = cell.toString()
				break
			case CellType.NUMERIC:
				if (DateUtil.isCellDateFormatted(cell)) {
					SimpleDateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy")
					strCellValue = dateFormat.format(cell.getDateCellValue())
				} else {
					double numericValue = cell.getNumericCellValue()
					if (numericValue == Math.rint(numericValue) && !Double.isInfinite(numericValue)) {
						// integral value: no ".0" suffix
						strCellValue = String.valueOf((long) numericValue)
					} else {
						// keep the fractional part instead of silently truncating it
						strCellValue = String.valueOf(numericValue)
					}
				}
				break
			case CellType.BOOLEAN:
				strCellValue = String.valueOf(cell.getBooleanCellValue())
				break
			case CellType.BLANK:
				strCellValue = ""
				break
		}
	}
	if (strCellValue == null) strCellValue = ""
	return strCellValue
}
49

  
50
@Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
51
File inputFile
52

  
53
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
54
def sheetName
55

  
56
@Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
57
def rootTag
58

  
59
@Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
60
def textTag
61

  
62
@Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
63
def metadataColumnList
64

  
65
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
66
def dateColumnList
67

  
68
@Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
69
def textColumnList
70

  
71
if (!ParametersDialog.open(this)) return

if (!inputFile.exists()) {
	println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
	return false
}

if (!inputFile.canRead()) {
	println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
	return false
}

try {
	// Turn the comma separated parameters into lists of trimmed column names
	metadataColumnList = metadataColumnList.split(",").collect { it.trim() }
	dateColumnList = dateColumnList.split(",").collect { it.trim() }
	textColumnList = textColumnList.split(",").collect { it.trim() }
	textTag = textTag.trim()
	rootTag = rootTag.trim()

	def wb = WorkbookFactory.create(inputFile)

	// Use the named sheet, or the first one when no name is given
	def ws
	if (sheetName.length() == 0) {
		ws = wb.getSheetAt(0)
	} else {
		ws = wb.getSheet(sheetName)
		if (ws == null) {
			println "** Excel2XML: no '"+sheetName+"' found. Aborting."
			return false
		}
	}

	if (ws == null) {
		println "** Excel2XML: no sheet found. Aborting."
		return false
	}

	def nRows = ws.getPhysicalNumberOfRows()
	println nRows+" rows."

	// The first row holds the column headers
	def firstRow = ws.getRow(0)
	if (firstRow == null) {
		println "** Excel2XML: no header row found. Aborting."
		return false
	}
	def colMax = firstRow.getLastCellNum()
	if (colMax <= 0) {
		println "** Excel2XML: the header row is empty. Aborting."
		return false
	}

	def headers = (0..<colMax).collect { getCellValueAsString(firstRow.getCell(it)) }

	println "Headers: $headers"

	def normalizedHeaders = headers.collect { stringToIndent(it) }

	// Every metadata and text column must be present in the headers
	def ok = true
	metadataColumnList.each { m ->
		if (!headers.contains(m)) {
			println "** Excel2XML: missing metadataColumnList column: $m"
			ok = false
		}
	}
	textColumnList.each { t ->
		if (!headers.contains(t)) {
			println "** Excel2XML: missing textColumnList column: $t"
			ok = false
		}
	}

	if (!ok) { return false }

	def metadataColumnIndex = metadataColumnList.collect { headers.indexOf(it) }
	def dateColumnsIndex = dateColumnList.collect { headers.indexOf(it) }
	def textColumnIndex = textColumnList.collect { headers.indexOf(it) }

	println "metadataColumnList = "+metadataColumnList
	println "metadataColumnIndex = "+metadataColumnIndex

	println "dateColumnList = "+dateColumnList
	println "dateColumnsIndex = "+dateColumnsIndex

	println "textColumnList = "+textColumnList
	println "textColumnIndex = "+textColumnIndex

	// The result file is written next to the input file, with the .xml extension
	def name = inputFile.getName()
	def idx = name.lastIndexOf(".")

	if (idx > 0) name = name.substring(0, idx)
	def outputFile = new File(inputFile.getParentFile(), name+".xml")

	def factory = XMLOutputFactory.newInstance()
	// withOutputStream closes the file even when an exception interrupts the conversion
	outputFile.withOutputStream { output ->
		def writer = factory.createXMLStreamWriter(output, "UTF-8")

		writer.writeStartDocument("UTF-8","1.0")
		writer.writeCharacters("\n") // simple XML formating
		writer.writeStartElement(rootTag)
		writer.writeCharacters("\n")

		int pageNumber = 1

		// Plain index loop: the range (1..nRows-1) runs backwards (rows 1 then 0) on a header-only sheet.
		// Rows missing from the sheet (getRow() == null) are skipped.
		int lastRowIndex = ws.getLastRowNum()
		for (int rowIndex = 1 ; rowIndex <= lastRowIndex ; rowIndex++) {
			def row = ws.getRow(rowIndex)
			if (row == null) continue

			writer.writeCharacters("  ")
			writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
			writer.writeAttribute("n", ""+pageNumber++)
			writer.writeCharacters("\n") // simple XML formating

			writer.writeCharacters("  ")
			writer.writeStartElement(textTag)
			metadataColumnIndex.each { colIndex -> // build an attribute for each metadata
				String s = getCellValueAsString(row.getCell(colIndex));
				if (s == null) s ="";
				def value = s.replaceAll("\n", ";").trim()
				writer.writeAttribute(normalizedHeaders[colIndex], value)
				if (colIndex in dateColumnsIndex) { // also split date attributes in day+month+year attributes
					def matches = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
					if (matches.find()) {
						def prefix = normalizedHeaders[colIndex]
						writer.writeAttribute(prefix+"jour", matches.group(1))
						writer.writeAttribute(prefix+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(Date.parse("dd/MM/yyyy", matches.group(0))))
						writer.writeAttribute(prefix+"mois", matches.group(2))
						writer.writeAttribute(prefix+"annee", matches.group(3))
					} else {
						// an empty or malformed date no longer aborts the whole conversion
						println "** Excel2XML: row "+(rowIndex+1)+", column '"+headers[colIndex]+"': '"+value+"' is not a dd/MM/yyyy date, date attributes not written."
					}
				}
			}
			writer.writeCharacters("\n")

			writer.writeCharacters("    ")
			writer.writeStartElement("metadata")
			writer.writeStartElement("list")
			writer.writeAttribute("type", "unordered")
			writer.writeCharacters("\n")

			metadataColumnIndex.each { colIndex ->
				writer.writeStartElement("item")
				writer.writeCharacters(headers[colIndex]+" : "+getCellValueAsString(row.getCell(colIndex)).replaceAll("\n", ";"))
				writer.writeEndElement() // item
				writer.writeCharacters("\n")
			}
			writer.writeCharacters("    ")
			writer.writeEndElement() // list
			writer.writeEndElement() // metadata
			writer.writeCharacters("\n")

			textColumnIndex.each { colIndex ->

				writer.writeCharacters("    ")
				writer.writeStartElement(normalizedHeaders[colIndex])
				writer.writeStartElement("p")
				writer.writeStartElement("head")
				writer.writeStartElement("hi")
				writer.writeCharacters(headers[colIndex]+" : ")
				writer.writeEndElement() // hi
				writer.writeEndElement() // head
				def value = getCellValueAsString(row.getCell(colIndex))

				// a cell made of "XXX:  text ;" entries is split into one <descripteur type="XXX"> per entry
				if (value ==~ /(?s)^[A-Z]{3}:  [^;\n]+? +[;\n].*/) {
					value.findAll( /(?s)[A-Z]{3}:  ([^;\n]+?) +[;\n]/ ).each { desc ->
						writer.writeStartElement("descripteur")
						def matches = (desc =~ /(?s)([A-Z]{3}):  ([^;\n]+?) +[;\n]/)
						writer.writeAttribute("type", matches[0][1])
						writer.writeCharacters(matches[0][2])
						writer.writeEndElement() // descripteur
					}
				} else {
					writer.writeCharacters(value)
				}
				writer.writeEndElement() // p
				writer.writeEndElement() // textColumn
				writer.writeCharacters("\n")
			}

			writer.writeCharacters("  ")
			writer.writeEndElement() // textTag
			writer.writeCharacters("\n")
		}

		writer.writeEndElement() // rootTag
		writer.writeCharacters("\n")
		writer.close()
	}
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	// printStackTrace() returns nothing: wrapping it in println only added a "null" line
	e.printStackTrace()
	return false
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/CreateTheOtherTurns.groovy (revision 3041)
130 130
						writer.writeEndElement() // w
131 131
					}
132 132
										
133
					def startOtherReg = /^(.*)\*([^ ]+.*)$/
134
					def endOtherReg = /^(.*[^ ]+)\*(.*)$/
133
					def startOtherReg = /^(.*)\*([^\p{Zs}]+.*)$/
134
					def endOtherReg = /^(.*[^\p{Zs}]+)\*(.*)$/
135 135
					String previousOtherStarting = "<none>"
136 136
					@Override
137 137
					protected void processEndElement() throws XMLStreamException {
......
142 142
							def m1 = word =~ startOtherReg
143 143
							
144 144
							if (m1.matches()) {
145
								if (debug) println "OPEN OTHER at $word"
146 145
								if (other) {
147
									println "Warning: found a starting * when one 'other' is already started at "+getLocation()
148
									println "Previous starting 'other' at "+previousOtherStarting
146
									println "Warning at "+getLocation()+" with $word: Found a starting * when one 'other' have been started at "+previousOtherStarting
149 147
								} else {
148
									if (debug) println "- ligne "+parser.getLocation().getLine()+" : ouverture de other avec '$word' -> tours '$turnInfos'"
150 149
									//close current Turn and start a 'other' Turn
151 150
									previousOtherStarting = ["word="+word+ " location="+getLocation()]
152 151
									String group1 = m1.group(1)
......
182 181
							boolean shouldCloseOtherTurn = false;
183 182
							def m2 = word =~ endOtherReg
184 183
							if (m2.matches()) {
185
								if (debug) println "DETECT END OTHER at $word"
184
								if (debug) println "- ligne "+parser.getLocation().getLine()+" : fermeture de other avec '$word' -> tours '$turnInfos'"
186 185
								previousOtherStarting = ["word="+word+ " location="+getLocation()]
187 186
								if (other) {
188 187
									shouldCloseOtherTurn = true;
......
190 189
									word = m2.group(1)
191 190
									other = false
192 191
								} else {
193
									println "Warning: found a ending * when one 'other' is not started at "+getLocation()
194
									println "Previous closing 'other' Turn at "+previousOtherStarting
192
									println "Warning at "+getLocation()+" with $word: Found a closing * when one 'other' have been closed at "+previousOtherStarting
195 193
								}
196 194
							}
197 195
							
198
//							if ("XXX".equals(word)) { // <Event desc="XXX" type="unknown" extent="next"/>
199
//								writer.writeStartElement("event") // start the initial word
200
//								writer.writeAttribute("desc", "XXX from "+wInfos["start"] + " to "+wInfos["end"])
201
//								writer.writeAttribute("type", "unknown")
202
//								writer.writeAttribute("extent", "instantaneous")
203
//								writer.writeEndElement() // event
204
//								word = "" // don't write the word
205
//							}
206
							
207 196
							if (word.length() > 0) {
208 197
								writeWord(word)
209 198
							}
210 199
							
211 200
							if (shouldCloseOtherTurn) {
212
								if (debug) println "CLOSE OTHER at $word"
201
								
213 202
								shouldCloseOtherTurn = false;
214 203
								//close the current 'other' Turn and restart the actual Turn
215 204
								writer.writeCharacters("\n")
......
235 224
					}
236 225
				}
237 226
	}
238
	
239
	public static void main(String[] args) {
240
		File infile = new File("/home/mdecorde/xml/vocapia","test.trs")
241
		File outfile = new File("/home/mdecorde/xml/vocapia","test-fixed.trs")
242
		def processor = new FixTranscription(infile, true)
243
		println processor.process(outfile)
244
	}
245 227
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/PrepareTranscriptionsMacro.groovy (revision 3041)
39 39

  
40 40
resultDirectory.mkdirs();
41 41

  
42
println "PREPARING VOCAPIA FILES TO TRS FILES..."
42
println "CONVERTING VOCAPIA FILES TO TRS FILES..."
43 43
File trsDirectory = new File(resultDirectory, "vocapia2trs")
44 44
trsDirectory.mkdir()
45
gse.runMacro(org.txm.macro.transcription.Vocapia2TranscriberMacro, ["vocapiaDirectory":vocapiaDirectory, "resultDirectory":trsDirectory])
45
gse.runMacro(org.txm.macro.transcription.Vocapia2TranscriberMacro, ["vocapiaDirectory":vocapiaDirectory, "resultDirectory":trsDirectory, "debug":debug])
46 46

  
47 47

  
48
println "DETECTING MARKED SECTIONS..."
48
println "SPOTTING SECTION MARKS..."
49 49

  
50 50
trsFiles = trsDirectory.listFiles().findAll(){it.getName().toLowerCase().endsWith(".trs")}
51 51
if (trsFiles.size() == 0) {
......
58 58
gse.runMacro(org.txm.macro.transcription.SegmentTRSInSectionFromMarkerMacro, ["trsDirectory":trsDirectory, "resultDirectory":sectionsDirectory, "newSectionMarker":newSectionMarker, "debug":debug])
59 59

  
60 60

  
61
println "CREATING THE 'OTHER' TURNS..."
61
println "SPOTTING 'OTHER' TURNS..."
62 62

  
63 63
trsFiles = sectionsDirectory.listFiles().findAll(){it.getName().toLowerCase().endsWith(".trs")}
64 64
if (trsFiles.size() == 0) {
......
70 70
otherDirectory.mkdir()
71 71
cpb = new ConsoleProgressBar(trsFiles.size())
72 72
for (File file : trsFiles) {
73
	cpb.tick()
74 73
	
74
	if (debug) println "== $file =="
75
	else cpb.tick()
76
	
75 77
	CreateTheOtherTurns fixer = new CreateTheOtherTurns(file, primarySpeakerIdRegex, otherNonPrimarySpeakerId, debug)
76 78
	String name = FileUtils.stripExtension(file)
77 79
	File outFile = new File(otherDirectory, name+".trs")
......
83 85
}
84 86
cpb.done()
85 87

  
88
println "CONVERTIGN WORD MARKERS TO EVENTS..."
86 89

  
87
println "NORMALIZING LOCUTORS..."
88

  
89 90
trsFiles = otherDirectory.listFiles().findAll(){it.getName().toLowerCase().endsWith(".trs")}
90 91
if (trsFiles.size() == 0) {
92
	println "No XML file found in $trsDirectory"
93
	return false
94
}
95

  
96
File eventsDirectory = new File(resultDirectory, "events")
97
eventsDirectory.mkdir()
98
gse.runMacro(org.txm.macro.transcription.AddEventsFromWordMarkersMacro, ["trsDirectory":otherDirectory, "resultDirectory":eventsDirectory, "wordElement": "w", "newEventMarker":"XXX", "eventDescription":"termes incompréhensibles ou inaudibles", "eventType":"pi", "eventExtent": "instantaneous", "debug":debug])
99

  
100

  
101
println "NORMALIZING SPEAKER IDs..."
102

  
103
trsFiles = eventsDirectory.listFiles().findAll(){it.getName().toLowerCase().endsWith(".trs")}
104
if (trsFiles.size() == 0) {
91 105
	println "No XML file found in $otherDirectory"
92 106
	return false
93 107
}
94 108

  
95 109
cpb = new ConsoleProgressBar(trsFiles.size())
96 110
for (File file : trsFiles) {
97
	cpb.tick()
111
	if (debug) println "== $file =="
112
	else cpb.tick()
98 113
	
99 114
	File outFile = new File(resultDirectory, file.getName())
100 115
	RecodeSpeakers fixer = new RecodeSpeakers(file, outFile, nonPrimarySpeakerIdRegex, null, otherNonPrimarySpeakerId, otherNonPrimarySpeakerId)
116
	fixer.debug = debug
101 117
	
102 118
	if (!fixer.process()) {
103 119
		println "WARNING: ERROR WHILE PROCESSING: "+file
......
110 126
	otherDirectory.deleteDir()
111 127
	trsDirectory.deleteDir()
112 128
	sectionsDirectory.deleteDir()
129
	eventsDirectory.deleteDir()
113 130
}
114 131

  
115 132
println "Done: "+trsFiles.size()+" files processed. Result files in $resultDirectory"

Formats disponibles : Unified diff