Revision 2027

tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/Metadata2TeiHeaderMacro.groovy (revision 2027)
1
// STANDARD DECLARATIONS
2
package org.txm.macro.urs.democrat
3

  
4
import java.nio.charset.Charset
5

  
6
import org.kohsuke.args4j.*
7
import groovy.transform.Field
8

  
9
import org.txm.importer.StaxIdentityParser
10
import org.txm.objects.*
11
import org.txm.rcp.swt.widget.parameters.*
12
import org.txm.searchengine.cqp.corpus.*
13
import org.txm.utils.CsvReader
14
import org.txm.utils.io.IOUtils
15
import org.txm.scripts.importer.XPathResult
16
import javax.xml.xpath.XPathConstants
17

  
18
class Metadata2TEiHeaderMacro {
19
	
20
	int debug = 0;
21
	/**
	 * Creates a macro helper with the given verbosity.
	 *
	 * The methods of this class print progressively more details when
	 * the level is greater than 1, 2 and 3.
	 *
	 * @param debug the verbosity level stored in the {@code debug} field
	 */
	public Metadata2TEiHeaderMacro(int debug) {
		this.debug = debug
	}
24
	//@Field @Option(name="teiHeaderTemplateFile", usage="the default teiHeader of texts", widget="FileOpen", required=true, def="teiHeaderTemplateFile.xml")
25
	//def teiHeaderTemplateFile
26
	//
27
	//@Field @Option(name="xpathFile", usage="properties file to redirect metadata column to the teiHeader locations", widget="FileOpen", required=true, def="xpathFile.properties")
28
	//def xpathFile
29
	//
30
	//@Field @Option(name="metadataFile", usage="the TSV file containing the metadata values per text", widget="FileOpen", required=true, def="metadataFile.tsv")
31
	//def metadataFile
32
	//
33
	//@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF	ON	ALL	REALLY ALL", required=true, def="OFF")
34
	//debug
35
	//
36
	//if (!ParametersDialog.open(this)) return;
37
	//if (debug == "OFF") debug = 0; else if (debug == "ON") debug = 1; else if (debug == "ALL") debug = 2 else if (debug == "REALLY ALL") debug = 3
38
	//
39
	//
40
	//
41
	//def xpathProperties = new Properties()
42
	//xpathProperties.load(IOUtils.getReader(xpathFile))
43
	//println xpathProperties
44
	//
45
	//def csvReader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
46
	//println csvReader
47
	//
48
	//csvReader.readHeaders();
49
	//def header = csvReader.getHeaders()
50
	//if (!header.contains("id")) {
51
	//	println "** 'id' column not found in $metadataFile header=$header"
52
	//	return;
53
	//}
54
	//if (!header.contains("corpus_id")) {
55
	//	println "** 'corpus_id' column not found in $metadataFile header=$header"
56
	//	return;
57
	//}
58
	//
59
	//def corpora = [:]
60
	//for (Project project : Workspace.getInstance().getProjects()) {
61
	//	for (MainCorpus corpus : project.getChildren(MainCorpus.class)) {
62
	//		corpora[corpus.getID()] = corpus;
63
	//	}
64
	//}
65
	//
66
	//while (csvReader.readRecord())	{
67
	//	String text_id = csvReader.get("id")
68
	//	if (text_id == null || text_id.length() == 0) {
69
	//		//println "** @id not found for record="+csvReader.getRawRecord()
70
	//		continue;
71
	//	}
72
	//
73
	//	String corpus_id = csvReader.get("corpus_id")
74
	//	if (corpus_id == null || corpus_id.length() == 0) {
75
	//		println "** @corpus_id not found for record="+csvReader.getRawRecord()
76
	//		continue;
77
	//	}
78
	//
79
	//	MainCorpus corpus = corpora[corpus_id]
80
	//	if (corpus == null) {
81
	//		println "** no corpus found for ID=$corpus_id"
82
	//		continue;
83
	//	}
84
	//	def project = corpus.getProject()
85
	//
86
	//	def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID())
87
	//
88
	//	if (!txmDir.exists()) {
89
	//		println "** the selected corpus has no XML-TXM files. Aborting."
90
	//	}
91
	//
92
	//
93
	//	File txmFile = new File(txmDir, text_id+".xml")
94
	//	if (txmFile.exists()) {
95
	//
96
	//		def data = [:]
97
	//
98
	//		for (def h : header) data[h] = csvReader.get(h)
99
	//
100
	//		String xmlteiHeaderContent = getCustomizedTEiHeader(teiHeaderTemplateFile, data, xpathProperties);
101
	//		if (xmlteiHeaderContent != null && xmlteiHeaderContent.length() > 0) {
102
	//			injecting(txmFile, xmlteiHeaderContent)
103
	//		} else {
104
	//			println "** Text header not updated: $txmFile"
105
	//		}
106
	//	} else {
107
	//		println "** Text not found: $txmFile"
108
	//	}
109
	//}
110

  
111
	/**
	 * Builds the teiHeader of one text from a template and the metadata of the text.
	 *
	 * For every key of {@code data} that has an XPath registered under the same key in
	 * {@code xpathProperties}, the first node selected by this XPath in the template is patched
	 * with the metadata value: an element is replaced by a new element of the same tag name
	 * containing only the value as text, an attribute or a text node receives the value.
	 * Keys without XPath are ignored; an XPath selecting nothing is reported and skipped.
	 *
	 * @param teiHeaderTemplateFile the XML file holding the teiHeader template
	 * @param data a map: metadata name -> metadata value
	 * @param xpathProperties metadata name -> XPath of the template node receiving the value
	 * @return the patched teiHeader as a String without XML declaration,
	 *         or null if the template contains no teiHeader element
	 */
	def getCustomizedTEiHeader(File teiHeaderTemplateFile, def data, Properties xpathProperties) {
		XPathResult xpathProcessor = new XPathResult(teiHeaderTemplateFile)
		def doc = xpathProcessor.doc
		def root = doc.getDocumentElement()

		for (String info : data.keySet()) {
			String xpath = xpathProperties[info]
			if (xpath == null) {
				continue // not a data to inject
			}

			// a missing value is injected as an empty string instead of a null DOM text
			String value = data[info]
			if (value == null) value = ""

			if (debug > 1) println "   injecting '$info'='$value' in '$xpath'"

			def expr = xpathProcessor.xpath.compile(xpath)
			def first = expr.evaluate(root, XPathConstants.NODE)
			if (first == null) {
				println "** not found $xpath"
				continue
			}

			switch (first.getNodeType()) {
				case org.w3c.dom.Node.ELEMENT_NODE:
					if (debug > 1) println "   patching: $info with "+value
					// NOTE(review): the replacement element keeps the tag name only; attributes and
					// children of the template element are dropped - confirm this is intended
					def newChild = doc.createElement(first.getTagName())
					newChild.appendChild(doc.createTextNode(value))
					first.getParentNode().replaceChild(newChild, first)
					break
				case org.w3c.dom.Node.ATTRIBUTE_NODE:
					if (debug > 1) println "   patching attribute: $info with "+value
					first.setNodeValue(value)
					break
				case org.w3c.dom.Node.TEXT_NODE:
					if (debug > 1) println "   patching text: $info with "+value
					first.setNodeValue(value)
					break
				default:
					break // other node types are left untouched
			}
		}

		// look for the teiHeader below the root, falling back on the root itself
		// (getElementsByTagName only searches the descendants)
		def teiHeader = root.getElementsByTagName("teiHeader").item(0)
		if (teiHeader == null && root.getTagName() == "teiHeader") {
			teiHeader = root
		}
		if (teiHeader == null) {
			println "** no teiHeader element found in $teiHeaderTemplateFile"
			return null
		}

		// the String conversion of the element is expected to be its XML serialization,
		// possibly prefixed with an XML declaration that must not be injected in the text
		String content = ""+teiHeader
		if (content.startsWith("<?xml")) {
			int declarationEnd = content.indexOf("?>")
			if (declarationEnd >= 0) {
				content = content.substring(declarationEnd + 2)
			}
		}
		return content
	}
152

  
153
	/**
	 * Replaces the teiHeader element of an XML-TXM file with the given XML content.
	 *
	 * The file is copied through a StaxIdentityParser that writes {@code xmlteiHeaderContent}
	 * when the teiHeader start tag is met and skips everything up to the matching end tag.
	 * The result is first written in the system temporary directory; the original file is then
	 * moved to a "copy_" backup in that directory and replaced with the result.
	 * If the replacement fails the backup is moved back to its original location.
	 *
	 * @param txmFile the XML-TXM file to update
	 * @param xmlteiHeaderContent the XML of the new teiHeader element
	 * @return true if txmFile has been replaced with the updated version, false otherwise
	 */
	def injecting(File txmFile, String xmlteiHeaderContent) {
		println "Editing: $txmFile..."
		StaxIdentityParser sparser = new StaxIdentityParser(txmFile) {
					// true while the parser is inside the teiHeader being replaced
					boolean start = false;

					public void processStartElement() {
						if (localname == "teiHeader") {
							start = true;
							if (debug > 2) println "   replacing teiHeader"
							if (debug > 3) println "       with $xmlteiHeaderContent"
							// NOTE(review): written directly to the output stream, not through the
							// parser's writer - confirm that nothing is still buffered at this point
							output.write(xmlteiHeaderContent.getBytes(Charset.forName("UTF-8")));  // REPLACE CONTENT !
						}
						if (!start) {
							super.processStartElement();
						}
					}

					public void processEndElement() {
						if (!start) {
							super.processEndElement();
						}
						// the teiHeader end tag itself is skipped, copying resumes after it
						if (localname == "teiHeader") {
							start = false;
							if (debug > 2) println "   replace done"
						}
					}

					public void processCharacters() {
						if (!start) {
							super.processCharacters();
						}
					}

					public void processComment() {
						if (!start) {
							super.processComment();
						}
					}
				}

		// work in the system temporary directory instead of a developer specific path
		File tmpDirectory = new File(System.getProperty("java.io.tmpdir"))
		File outfile = new File(tmpDirectory, "tmp_"+txmFile.getName())
		File copyFile = new File(tmpDirectory, "copy_"+txmFile.getName())

		// Files.move also works between file systems and over an existing target,
		// two cases where File.renameTo may fail
		def move = { File from, File to ->
			try {
				java.nio.file.Files.move(from.toPath(), to.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)
				return true
			} catch (IOException e) {
				if (debug > 0) println " -> could not move $from to $to: $e"
				return false
			}
		}

		if (!sparser.process(outfile)) {
			println " -> FAIL see $outfile"
			return false
		}

		if (!move(txmFile, copyFile)) {
			println " -> FAIL could not make a copy of $txmFile in $copyFile"
			println " -> see result in $outfile"
			return false
		}

		if (!move(outfile, txmFile)) {
			println " -> FAIL could not replace $txmFile"
			println " -> see result in $outfile"
			// put the original file back so the corpus is not left without this text
			if (!move(copyFile, txmFile)) {
				println " -> FAIL could not restore $txmFile, see the copy in $copyFile"
			}
			return false
		}

		println " -> SUCCESS see $txmFile"
		return true
	}
226
}
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/ComputeDemocratMeasureMetadataMacro.groovy (revision 2027)
1
package org.txm.macro.urs.democrat
2

  
3
import org.txm.searchengine.cqp.corpus.MainCorpus
4
import org.txm.searchengine.cqp.corpus.query.CQLQuery
5
import visuAnalec.donnees.Corpus
6

  
7
/**
 * Computes measures of a corpus and of its URS (Analec) annotations.
 */
class ComputeDemocratMeasureMetadataMacro {

	/**
	 * Counts the words of the corpus and the CHAINE schemas and MENTION units of its annotations.
	 *
	 * @param corpus the corpus to measure
	 * @return a map: measure name -> value, with the keys "number of words",
	 *         "number of coreference chains", "number of referring chains" (the CHAINE schemas
	 *         with at least 3 underlying units) and "number of referring expressions"
	 */
	def getStats(MainCorpus corpus) {
		def data = [:]

		// the "[]" query matches every token of the corpus
		data["number of words"] = corpus.query(new CQLQuery("[]"), "TMP", false).getNMatch()

		// fully qualified: URSCorpora is not in this package
		Corpus analecCorpus = org.txm.annotation.urs.URSCorpora.getCorpus(corpus)
		def chaines = analecCorpus.getSchemas("CHAINE")
		def refchaines = chaines.findAll { chaine -> chaine.getUnitesSousjacentes().length >= 3 }

		data["number of coreference chains"] = chaines.size()
		data["number of referring chains"] = refchaines.size()
		data["number of referring expressions"] = analecCorpus.getUnites("MENTION").size()

		// return the measures, not the value of the last assignment
		return data
	}

	/**
	 * Not implemented yet: does nothing.
	 *
	 * @param corpus unused
	 */
	def updateTeiHeader(MainCorpus corpus) {
	}
}
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/BuildDemocratCorpusVersion1Macro.groovy (revision 2027)
1
package org.txm.macro.urs.democrat
2

  
3
import java.io.File
4
import java.nio.charset.Charset
5

  
6
import org.eclipse.core.runtime.IProgressMonitor
7
import org.kohsuke.args4j.*
8
import groovy.transform.Field
9
import visuAnalec.donnees.Corpus
10
import visuAnalec.vue.Vue
11

  
12
import org.txm.annotation.urs.URSCorpora
13
import org.txm.importer.StaxIdentityParser
14
import org.txm.objects.*
15
import org.txm.rcp.commands.workspace.LoadBinaryCorpus
16
import org.txm.rcp.swt.widget.parameters.*
17
import org.txm.rcp.utils.JobHandler
18
import org.txm.searchengine.cqp.corpus.*
19
import org.txm.utils.CsvReader
20
import org.txm.utils.io.IOUtils
21
import org.txm.scripts.importer.XPathResult
22
import javax.xml.xpath.XPathConstants
23
import org.txm.annotation.urs.commands.*
24

  
25
// Macro script: for each record of <inputDirectory>/metadata.tsv, finds (or loads from
// <inputDirectory>/<corpus_id>.txm) the corpus of the text, rewrites the teiHeader of the
// XML-TXM file of the text using teiHeader.xml and metadata.properties, then exports the
// corpus and its URS annotations to outputDirectory.

@Field @Option(name="inputDirectory", usage="Directory with the .txm binary corpora, teiHeader.xml, metadata.properties and metadata.tsv", widget="FileOpen", required=true, def="input directory path")
def inputDirectory

@Field @Option(name="outputDirectory", usage="Directory where the corpora are exported", widget="FileOpen", required=true, def="output directory path")
def outputDirectory

@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF	ON	ALL	REALLY ALL", required=true, def="OFF")
def debug

if (!ParametersDialog.open(this)) return;

// translate the verbosity label selected in the dialog into the level expected by
// Metadata2TEiHeaderMacro; an unknown label is read as OFF
def debugLevels = ["OFF":0, "ON":1, "ALL":2, "REALLY ALL":3]
debug = debugLevels.containsKey(debug) ? debugLevels[debug] : 0

if (!inputDirectory.exists()) {
	println "no inputDirectory found: $inputDirectory"
	return false;
}

def teiHeaderTemplateFile = new File(inputDirectory, "teiHeader.xml")
def xpathFile = new File(inputDirectory, "metadata.properties")
def metadataFile = new File(inputDirectory, "metadata.tsv")

outputDirectory.mkdirs()
if (!outputDirectory.exists()) {
	println "outputDirectory not created: $outputDirectory"
	return false;
}

if (!teiHeaderTemplateFile.exists()) {
	println "no teiHeader template found: $teiHeaderTemplateFile"
	return false;
}

if (!xpathFile.exists()) {
	println "no xpath properties found: $xpathFile"
	return false;
}

if (!metadataFile.exists()) {
	println "no metadata TSV found: $metadataFile"
	return false;
}

// metadata name -> XPath of the teiHeader node receiving the value
def xpathProperties = new Properties()
IOUtils.getReader(xpathFile).withReader { reader -> xpathProperties.load(reader) } // closes the reader
println xpathProperties

def csvReader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
println csvReader

try {
	csvReader.readHeaders();
	def header = csvReader.getHeaders()
	if (!header.contains("id")) {
		println "** 'id' column not found in $metadataFile header=$header"
		return false;
	}
	if (!header.contains("corpus_id")) {
		println "** 'corpus_id' column not found in $metadataFile header=$header"
		return false;
	}

	// corpora already loaded in the workspace, by ID
	def corpora = [:]
	for (Project project : Workspace.getInstance().getProjects()) {
		for (MainCorpus corpus : project.getChildren(MainCorpus.class)) {
			corpora[corpus.getID()] = corpus;
		}
	}

	while (csvReader.readRecord())	{
		String text_id = csvReader.get("id")
		if (text_id == null || text_id.length() == 0) {
			println "** @id not found for record="+csvReader.getRawRecord()
			continue;
		}

		String corpus_id = csvReader.get("corpus_id")
		if (corpus_id == null || corpus_id.length() == 0) {
			println "** @corpus_id not found for record="+csvReader.getRawRecord()
			continue;
		}

		MainCorpus corpus = corpora[corpus_id]
		Project project = null
		if (corpus == null) {
			// the corpus is not loaded yet: load it from its binary archive
			File binCorpusFile = new File(inputDirectory, corpus_id+".txm")
			if (!binCorpusFile.exists()) {
				println "** no corpus binary file found for ID=$corpus_id : $binCorpusFile"
				continue;
			}
			println "CALL load corpus from $binCorpusFile"
			JobHandler job = LoadBinaryCorpus.loadBinaryCorpusArchive(binCorpusFile)
			job.join();
			project = job.getResultObject();
			if (project == null) {
				println "** corpus not loaded from $binCorpusFile"
				continue;
			}
			corpus = project.getCorpusBuild(corpus_id);
			if (corpus == null) {
				println "** no corpus with ID=$corpus_id found in $binCorpusFile"
				continue;
			}
			corpora[corpus.getID()] = corpus;
		} else {
			println "USING loaded corpus: "+corpus
			project = corpus.getProject()
		}

		def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID())

		if (!txmDir.exists()) {
			println "** the selected corpus has no XML-TXM files. Aborting."
			continue;
		}

		File txmFile = new File(txmDir, text_id+".xml")
		if (!txmFile.exists()) {
			println "** the selected corpus has no XML-TXM file: $txmFile"
			continue;
		}

		println "CALL Metadata2TeiHeaderMacro"
		Metadata2TEiHeaderMacro mthm = new Metadata2TEiHeaderMacro(debug);

		// metadata of the current record: column name -> value
		def data = [:]
		for (def h : header) data[h] = csvReader.get(h)

		String xmlteiHeaderContent = mthm.getCustomizedTEiHeader(teiHeaderTemplateFile, data, xpathProperties);
		if (xmlteiHeaderContent != null && xmlteiHeaderContent.length() > 0) {
			mthm.injecting(txmFile, xmlteiHeaderContent)
		} else {
			println "** Text header not updated: $txmFile"
		}

		// the measures are not computed yet, only the step is announced
		println "CALL ComputeDemocratMeasureMetadataMacro"

		println "CALL URS Export"
		Corpus analecCorpus = URSCorpora.getCorpus(corpus);
		Vue vue = URSCorpora.getVue(corpus);
		if (!ExportTEICorpus.export(true, outputDirectory, true, null, corpus, analecCorpus, vue)) {
			println "FAIL TO EXPORT CORPUS$corpus"
		}
	}
} finally {
	// release the metadata file in every case
	csvReader.close()
}

Also available in: Unified diff