Révision 3591

TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/misc/EuroPress2XMLMacro.groovy (revision 3591)
1
package org.txm.macro.misc
2
// STANDARD DECLARATIONS
3

  
4
import groovy.xml.QName
5

  
6
import java.nio.charset.Charset
7
import java.text.DecimalFormat
8
import org.txm.utils.xml.DomUtils;
9
import org.txm.importer.ValidateXml;
10
import groovy.util.XmlParser
11
import org.kohsuke.args4j.*
12
import groovy.transform.Field
13
import org.txm.rcp.swt.widget.parameters.*
14

  
15
// BEGINNING OF PARAMETERS
16
@Field @Option(name="rootDir", usage="The directory contains the 'orig' directory which contains the html files", widget="Folder", required=false, def="/path")
17
File rootDir = new File("");
18

  
19
@Field @Option(name="encoding", usage="HTML encoding", widget="String", required=false, def="iso-8859-1")
20
String encoding = "iso-8859-1" // HTML files encoding
21

  
22
@Field @Option(name="debug", usage="show debug messages. Values = true, false", widget="String", required=false, def="false")
23
def debug = "true" // set true to debug the script
24

  
25
// Open the parameters input dialog box
26
if (!ParametersDialog.open(this)) return;
27

  
28
debug = ("true" == debug)
29
// END OF PARAMETERS
30

  
31
String corpusName = rootDir.getName()
32
File srcDir = new File(rootDir, "orig");
33
File outDir = new File(rootDir, "xhtml");
34
File outDir2 = new File(rootDir, corpusName);
35
File rejected = new File(rootDir, "duplicates");
36
File tmpDir = new File(rootDir, "tmp");
37

  
38

  
39
if (!srcDir.exists()) {
40
	println "STOP, srcDir does not exists $srcDir"
41
	return;
42
}
43

  
44
outDir.deleteDir()
45
outDir.mkdir()
46
outDir2.deleteDir()
47
outDir2.mkdir()
48
rejected.deleteDir()
49
rejected.mkdir()
50
tmpDir.deleteDir()
51
tmpDir.mkdir()
52

  
53
def allTags = new HashSet<String>();
54
def allStyles = new HashSet<String>();
55
def allClasses = new HashSet<String>();
56
def newPrefix = "Numéro de document : "
57
int itext = 1;
58
def formater = new DecimalFormat("0000");
59
int LIMITDIFF = 10
60
def metadatas = ["DocPublicationName", "DocHeader"]
61
def files = []
62
srcDir.eachFileMatch(~/.*\.html/){ htmlFile -> files << htmlFile}
63
files = files.sort()
64

  
65
def done = new HashSet<String>();
66
def ignored = []
67
def allTitles = [:]
68
def dones = [:]
69
def ignoreds = []
70

  
71
/**
 * Recursively concatenates the textual content of a node.
 *
 * @param node either a String (used as is) or an object exposing children(),
 *             whose children are processed recursively
 * @return the collected text with line breaks replaced by spaces, trimmed,
 *         and prefixed with exactly one space
 */
def getText(def node) {
	def buffer = new StringBuilder(" ")
	if (node instanceof String) {
		buffer.append(" ").append(node)
	} else {
		node.children().each { kid ->
			buffer.append(" ").append(getText(kid))
		}
	}
	String flattened = buffer.toString().replace("\n", " ")
	return " " + flattened.trim()
}
83

  
84
println "Nb of HTML files: "+files.size()
85
for (File htmlFile : files) {
86
	println "Processing file $htmlFile"
87
	File tmpHTML = new File(tmpDir, htmlFile.getName())
88
	tmpHTML.withWriter("UTF-8") { writer -> 
89
		String txt = htmlFile.getText(encoding)
90
		txt = txt.replaceAll("<p></p>", " ");
91
		txt = txt.replaceAll("<p> </p>", " ");
92
		txt = txt.replaceAll("<br>", "<br> ");
93
		writer.write(txt)
94
	}
95
	
96
	String name = htmlFile.getName()
97
	name = name.substring(0, name.lastIndexOf("."));
98

  
99
	File xhtmlFile = new File(outDir, name+".xhtml")
100

  
101
	// Convert the cleaned temporary HTML file into an XML-syntax document written to xhtmlFile.
	xhtmlFile.withWriter("UTF-8") { out ->
		// Jsoup.connect() only prepares an HTTP(S) connection and exposes neither charset()
		// nor outerHtml(); the temporary file (written in UTF-8 above) must be parsed instead.
		def doc = org.jsoup.Jsoup.parse(tmpHTML, "UTF-8")
		// The result is validated and parsed as XML right after, so serialize with XML syntax.
		doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml)
		println "current charset: "+doc.charset()
		doc.charset(Charset.forName("UTF-8"))
		println "current charset: "+doc.charset()
		out.println(doc.outerHtml())
	}
108

  
109
	if (ValidateXml.test(xhtmlFile)) {
110
		def root = new XmlParser().parse(xhtmlFile)
111
		def tables = root.body.table.tbody.tr.td
112
		if (tables.size() == 0) tables = root.body.table.tr.td
113
		//println "Nb of txt : "+tables.size()
114

  
115
		for (def text : tables) {
116
			String sign = ""
117
		
118
			// Skip anything that is not an XML node. The parentheses are required:
			// "! text instanceof X" is evaluated as "(!text) instanceof X", which is never true.
			if (!(text instanceof groovy.util.Node)) { println "NOT NODE: "+text; continue; }
119
			//println "TEXT "
120
			//text.setName("text")
121
			boolean endOfText = false;
122
			def textMetadatas = [:]
123
			for (String metadata : metadatas) {
124
				textMetadatas[metadata] = ""
125
			}
126
			for (def subtable : text.table) text.remove(subtable)	
127
			
128
			for (def p : text.table.p) p.addChild(" ")		
129
			
130
			// Replace the red+bold spans by <w expFound="y"> elements holding the same text.
			for (def child : text.span) {
				if ("color:red; font-weight:bold".equals(child.@style)) {
					//text.remove(child)
					// the debug message was built but never printed
					if (debug) println "Found bold: "+child
					child.replaceNode { node -> w(expFound: "y", child.text())}
				}
			}
137
			
138
			def startIgnoringText = false
139
			def tmp =""
140
			def ichar = 0
141
			String title = "";
142
			def ignoredText = ""
143
			def children = text.children()
144
			for (int idx = 0 ; idx < children.size() ; idx++) {
145
				
146
				def child  = children[idx]
147
				if (debug) println "child: $child"
148
			
149
				if (startIgnoringText) {
150
					if (debug) println "Ignoring text : "+ignoredText
151
					if (child instanceof String) ignoredText += child
152
					else ignoredText += child.text()
153
					
154
					def t = text.children().remove(idx);
155
					//if (tmp.length() > 0) println "removing : "+t
156
					idx--
157
					continue; // next child
158
				}
159
			
160
				if (child instanceof String) {
161
					//println " "+child
162
					ichar += child.length()
163
				} else {
164
					ichar += child.text().length()
165
					//		allTags.add(child.name().getLocalPart())
166
					//		allClasses.add(child.@class)
167
					//		allStyles.add(child.@style)
168
					def nn = child.name()
169
					
170
					try {nn = nn.getLocalPart()} catch(Exception e) {}
171
					switch (nn) {
172
						case "br": break;
173
						case "span":
174
						if (debug) println "Found span $child"
175
							String classV = child.@class
176
							String style = child.@style
177
							if (classV != null) {
178
								if (metadatas.contains(classV)) {
179
									textMetadatas[classV] = (textMetadatas[classV] + " " + child.text()).trim()
180
									text.remove(child);
181
									idx--
182
									//println "METADATA: "+classV + " = "+child.text().trim().length();
183
								} else if ("TitreArticleVisu" == classV) {
184
									title += getText(child)//.text().trim().replaceAll("\n", " ")+ " "
185
									child.name = new QName(child.name().getNamespaceURI(), "head", child.name().getPrefix())
186
								} else {
187
									println "UNKNOWED CLASS: "+classV + " = "+child.text().trim().length();
188
								}
189
							} else if (style != null) {
190
								if ("color:red; font-weight:bold".equals(style)) {
191
									//child.replaceNode { node -> w(expFound: "test")	}
192
									//println "KEYWORD: "+child.text().trim();
193
								} else {
194
									println "UNKNOWED STYLE: '"+style+"' = "+child.text().trim();
195
								}
196
							} else {
197
								println "UNKNOWED SPAN: "+child.text().trim();
198
							}
199
							break;
200

  
201
						case "a": break
202
						case "w": break;
203
						case "b":
204
							startIgnoringText = true;
205
							tmp = child.text()
206
							//if (tmp.length() > 0) println "START REMOVING FROM : "+tmp
207
							text.remove(child);
208
							idx--
209
							break;
210
						case "i": break;
211
						case "font":
212
							if (debug) println "Found font $child"
213
							String style = child.@style
214
							// Map <font> styles to <i>/<b>; any other style is reported and left untouched.
							if ("font-style:italic;" == style) {
								if (debug) println "ITALIC: "+getText(child).trim();
								child.replaceNode { node -> i(getText(child))}
							} else if ("font-weight:bold;" == style) {
								// the style must be compared: a bare non-empty string literal is always true,
								// which turned every non-italic font into <b> and made the branch below unreachable
								if (debug) println "BOLD: "+getText(child).trim();
								child.replaceNode { node -> b(getText(child))}
							} else {
								println "FSTYLE: '"+style+"' = "+getText(child).trim();
							}
223
							break;
224
						default: println child.name()
225
					}
226
				}
227
			}
228
			
229
			//rename td to text
230
			text.name = new QName(text.name().getNamespaceURI(), "text", text.name().getPrefix())
231
			
232
			//Write metadatas
233
			for( String metadata : metadatas) {
234
				text.attributes().put(metadata, textMetadatas[metadata])
235
				//sign+= " "+textMetadatas[metadata].trim()
236
			}
237
			
238
			// get document number
239
			ignoredText = ignoredText.replaceAll("\n", " ")
240
			int iNo= ignoredText.indexOf(newPrefix);
241
			//println ignoredText
242
			// Store the document number found after newPrefix and the date fields derived from it.
			if (iNo >= 0) {
				String no =ignoredText.substring(iNo+newPrefix.length()).trim()
				text.attributes().put("idnews", no)
				//sign += " "+no
				// The date is read at fixed offsets: yyyy = 5..8, mm = 9..10, dd = 11..12.
				// A shorter value would make substring() throw and abort the whole macro.
				if (no.length() >= 13) {
					String yyyy = no.substring(5,9)
					String mm = no.substring(9,11)
					String dd = no.substring(11,13)
					text.attributes().put("date", yyyy+"-"+mm+"-"+dd)
					text.attributes().put("yyyymmdd", no.substring(5,13))
					text.attributes().put("yyyymm", no.substring(5,11))
					text.attributes().put("yyyy", yyyy)
					text.attributes().put("mm", mm)
					text.attributes().put("dd", dd)
				} else {
					println "WARNING: document number too short to extract a date: '"+no+"'"
				}
			}
253
			
254
			//sign += " "+ichar
255
			sign += " "+title
256
			
257
			if (allTitles[title] == null) allTitles[title] = ichar
258
			if (Math.abs(allTitles[title] - ichar) > LIMITDIFF) {
259
				sign += " "+ichar
260
			}
261
			File xmlFile;
262
			if (done.contains(sign)) {
263
				ignored << sign
264
				xmlFile = new File(rejected, name+"_"+formater.format((itext++))+".xml")
265
				ignoreds << xmlFile.getName()
266
			} else {
267
				done << sign;
268
				xmlFile = new File(outDir2, name+"_"+formater.format((itext++))+".xml")
269
				dones[sign] = xmlFile
270
			}
271
			// Write the text node as a standalone UTF-8 XML file.
			// withPrintWriter closes the file even when printing fails; the previous
			// PrintWriter was never closed, leaking one file handle per text.
			xmlFile.withPrintWriter("UTF-8") { xmlWriter ->
				xmlWriter.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
				new XmlNodePrinter(xmlWriter).print(text)
			}
274
		}
275
	}
276
}
277

  
278
if (ignored.size() > 0) {
279
	File ignoredFile = new File (rejected, "ignored.txt");
280
	ignoredFile.withWriter("UTF-8") { writer ->
281
		writer.println "TOTAL: "+ignored.size()
282
		for (int i = 0 ; i < ignored.size() ; i++) {
283
			def sign = ignored[i]
284
			writer.println "\n**DUPLICATE\n "
285
			writer.println "keeped="+dones[sign];
286
			writer.println "rejected="+ignoreds[i];
287
			writer.println "SIGN="+sign
288
			writer.println "\n"
289
		}
290
	}
291
	println "TOTAL IGNORED: "+ignored.size()
292
}
293

  
294
// itext starts at 1 and is incremented after each written text, so the count is itext - 1
println "TOTAL TEXT: ${itext - 1}"
295
if (!debug) {
296
	outDir.deleteDir()
297
	tmpDir.deleteDir()
298
}
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/misc/EuroPresse2XMLMacro.groovy (revision 3591)
1
package org.txm.macro.misc
2
// STANDARD DECLARATIONS
3

  
4
import groovy.xml.QName
5

  
6
import java.nio.charset.Charset
7
import java.text.DecimalFormat
8
import org.txm.utils.xml.DomUtils;
9
import org.txm.importer.ValidateXml;
10
import groovy.util.XmlParser
11
import org.kohsuke.args4j.*
12
import groovy.transform.Field
13
import org.txm.rcp.swt.widget.parameters.*
14

  
15
// BEGINNING OF PARAMETERS
16
@Field @Option(name="rootDir", usage="The directory contains the 'orig' directory which contains the html files", widget="Folder", required=false, def="/path")
17
File rootDir = new File("");
18

  
19
@Field @Option(name="encoding", usage="HTML encoding", widget="String", required=false, def="iso-8859-1")
20
String encoding = "iso-8859-1" // HTML files encoding
21

  
22
@Field @Option(name="debug", usage="show debug messages. Values = true, false", widget="String", required=false, def="false")
23
def debug = "true" // set true to debug the script
24

  
25
// Open the parameters input dialog box
26
if (!ParametersDialog.open(this)) return;
27

  
28
debug = ("true" == debug)
29
// END OF PARAMETERS
30

  
31
String corpusName = rootDir.getName()
32
File srcDir = new File(rootDir, "orig");
33
File outDir = new File(rootDir, "xhtml");
34
File outDir2 = new File(rootDir, corpusName);
35
File rejected = new File(rootDir, "duplicates");
36
File tmpDir = new File(rootDir, "tmp");
37

  
38

  
39
if (!srcDir.exists()) {
40
	println "STOP, srcDir does not exists $srcDir"
41
	return;
42
}
43

  
44
outDir.deleteDir()
45
outDir.mkdir()
46
outDir2.deleteDir()
47
outDir2.mkdir()
48
rejected.deleteDir()
49
rejected.mkdir()
50
tmpDir.deleteDir()
51
tmpDir.mkdir()
52

  
53
def allTags = new HashSet<String>();
54
def allStyles = new HashSet<String>();
55
def allClasses = new HashSet<String>();
56
def newPrefix = "Numéro de document : "
57
int itext = 1;
58
def formater = new DecimalFormat("0000");
59
int LIMITDIFF = 10
60
def metadatas = ["DocPublicationName", "DocHeader"]
61
def files = []
62
srcDir.eachFileMatch(~/.*\.html/){ htmlFile -> files << htmlFile}
63
files = files.sort()
64

  
65
def done = new HashSet<String>();
66
def ignored = []
67
def allTitles = [:]
68
def dones = [:]
69
def ignoreds = []
70

  
71
/**
 * Recursively concatenates the textual content of a node.
 *
 * @param node either a String (used as is) or an object exposing children(),
 *             whose children are processed recursively
 * @return the collected text with line breaks replaced by spaces, trimmed,
 *         and prefixed with exactly one space
 */
def getText(def node) {
	def buffer = new StringBuilder(" ")
	if (node instanceof String) {
		buffer.append(" ").append(node)
	} else {
		node.children().each { kid ->
			buffer.append(" ").append(getText(kid))
		}
	}
	String flattened = buffer.toString().replace("\n", " ")
	return " " + flattened.trim()
}
83

  
84
println "Nb of HTML files: "+files.size()
85
for (File htmlFile : files) {
86
	println "Processing file $htmlFile"
87
	File tmpHTML = new File(tmpDir, htmlFile.getName())
88
	tmpHTML.withWriter("UTF-8") { writer -> 
89
		String txt = htmlFile.getText(encoding)
90
		txt = txt.replaceAll("<p></p>", " ");
91
		txt = txt.replaceAll("<p> </p>", " ");
92
		txt = txt.replaceAll("<br>", "<br> ");
93
		writer.write(txt)
94
	}
95
	
96
	String name = htmlFile.getName()
97
	name = name.substring(0, name.lastIndexOf("."));
98

  
99
	File xhtmlFile = new File(outDir, name+".xhtml")
100

  
101
	// Convert the cleaned temporary HTML file into an XML-syntax document written to xhtmlFile.
	xhtmlFile.withWriter("UTF-8") { out ->
		// Jsoup.connect() only prepares an HTTP(S) connection and exposes neither charset()
		// nor outerHtml(); the temporary file (written in UTF-8 above) must be parsed instead.
		def doc = org.jsoup.Jsoup.parse(tmpHTML, "UTF-8")
		// The result is validated and parsed as XML right after, so serialize with XML syntax.
		doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml)
		println "current charset: "+doc.charset()
		doc.charset(Charset.forName("UTF-8"))
		println "current charset: "+doc.charset()
		out.println(doc.outerHtml())
	}
108

  
109
	if (ValidateXml.test(xhtmlFile)) {
110
		def root = new XmlParser().parse(xhtmlFile)
111
		def tables = root.body.table.tbody.tr.td
112
		if (tables.size() == 0) tables = root.body.table.tr.td
113
		//println "Nb of txt : "+tables.size()
114

  
115
		for (def text : tables) {
116
			String sign = ""
117
		
118
			// Skip anything that is not an XML node. The parentheses are required:
			// "! text instanceof X" is evaluated as "(!text) instanceof X", which is never true.
			if (!(text instanceof groovy.util.Node)) { println "NOT NODE: "+text; continue; }
119
			//println "TEXT "
120
			//text.setName("text")
121
			boolean endOfText = false;
122
			def textMetadatas = [:]
123
			for (String metadata : metadatas) {
124
				textMetadatas[metadata] = ""
125
			}
126
			for (def subtable : text.table) text.remove(subtable)	
127
			
128
			for (def p : text.table.p) p.addChild(" ")		
129
			
130
			// Replace the red+bold spans by <w expFound="y"> elements holding the same text.
			for (def child : text.span) {
				if ("color:red; font-weight:bold".equals(child.@style)) {
					//text.remove(child)
					// the debug message was built but never printed
					if (debug) println "Found bold: "+child
					child.replaceNode { node -> w(expFound: "y", child.text())}
				}
			}
137
			
138
			def startIgnoringText = false
139
			def tmp =""
140
			def ichar = 0
141
			String title = "";
142
			def ignoredText = ""
143
			def children = text.children()
144
			for (int idx = 0 ; idx < children.size() ; idx++) {
145
				
146
				def child  = children[idx]
147
				if (debug) println "child: $child"
148
			
149
				if (startIgnoringText) {
150
					if (debug) println "Ignoring text : "+ignoredText
151
					if (child instanceof String) ignoredText += child
152
					else ignoredText += child.text()
153
					
154
					def t = text.children().remove(idx);
155
					//if (tmp.length() > 0) println "removing : "+t
156
					idx--
157
					continue; // next child
158
				}
159
			
160
				if (child instanceof String) {
161
					//println " "+child
162
					ichar += child.length()
163
				} else {
164
					ichar += child.text().length()
165
					//		allTags.add(child.name().getLocalPart())
166
					//		allClasses.add(child.@class)
167
					//		allStyles.add(child.@style)
168
					def nn = child.name()
169
					
170
					try {nn = nn.getLocalPart()} catch(Exception e) {}
171
					switch (nn) {
172
						case "br": break;
173
						case "span":
174
						if (debug) println "Found span $child"
175
							String classV = child.@class
176
							String style = child.@style
177
							if (classV != null) {
178
								if (metadatas.contains(classV)) {
179
									textMetadatas[classV] = (textMetadatas[classV] + " " + child.text()).trim()
180
									text.remove(child);
181
									idx--
182
									//println "METADATA: "+classV + " = "+child.text().trim().length();
183
								} else if ("TitreArticleVisu" == classV) {
184
									title += getText(child)//.text().trim().replaceAll("\n", " ")+ " "
185
									child.name = new QName(child.name().getNamespaceURI(), "head", child.name().getPrefix())
186
								} else {
187
									println "UNKNOWED CLASS: "+classV + " = "+child.text().trim().length();
188
								}
189
							} else if (style != null) {
190
								if ("color:red; font-weight:bold".equals(style)) {
191
									//child.replaceNode { node -> w(expFound: "test")	}
192
									//println "KEYWORD: "+child.text().trim();
193
								} else {
194
									println "UNKNOWED STYLE: '"+style+"' = "+child.text().trim();
195
								}
196
							} else {
197
								println "UNKNOWED SPAN: "+child.text().trim();
198
							}
199
							break;
200

  
201
						case "a": break
202
						case "w": break;
203
						case "b":
204
							startIgnoringText = true;
205
							tmp = child.text()
206
							//if (tmp.length() > 0) println "START REMOVING FROM : "+tmp
207
							text.remove(child);
208
							idx--
209
							break;
210
						case "i": break;
211
						case "font":
212
							if (debug) println "Found font $child"
213
							String style = child.@style
214
							// Map <font> styles to <i>/<b>; any other style is reported and left untouched.
							if ("font-style:italic;" == style) {
								if (debug) println "ITALIC: "+getText(child).trim();
								child.replaceNode { node -> i(getText(child))}
							} else if ("font-weight:bold;" == style) {
								// the style must be compared: a bare non-empty string literal is always true,
								// which turned every non-italic font into <b> and made the branch below unreachable
								if (debug) println "BOLD: "+getText(child).trim();
								child.replaceNode { node -> b(getText(child))}
							} else {
								println "FSTYLE: '"+style+"' = "+getText(child).trim();
							}
223
							break;
224
						default: println child.name()
225
					}
226
				}
227
			}
228
			
229
			//rename td to text
230
			text.name = new QName(text.name().getNamespaceURI(), "text", text.name().getPrefix())
231
			
232
			//Write metadatas
233
			for( String metadata : metadatas) {
234
				text.attributes().put(metadata, textMetadatas[metadata])
235
				//sign+= " "+textMetadatas[metadata].trim()
236
			}
237
			
238
			// get document number
239
			ignoredText = ignoredText.replaceAll("\n", " ")
240
			int iNo= ignoredText.indexOf(newPrefix);
241
			//println ignoredText
242
			// Store the document number found after newPrefix and the date fields derived from it.
			if (iNo >= 0) {
				String no =ignoredText.substring(iNo+newPrefix.length()).trim()
				text.attributes().put("idnews", no)
				//sign += " "+no
				// The date is read at fixed offsets: yyyy = 5..8, mm = 9..10, dd = 11..12.
				// A shorter value would make substring() throw and abort the whole macro.
				if (no.length() >= 13) {
					String yyyy = no.substring(5,9)
					String mm = no.substring(9,11)
					String dd = no.substring(11,13)
					text.attributes().put("date", yyyy+"-"+mm+"-"+dd)
					text.attributes().put("yyyymmdd", no.substring(5,13))
					text.attributes().put("yyyymm", no.substring(5,11))
					text.attributes().put("yyyy", yyyy)
					text.attributes().put("mm", mm)
					text.attributes().put("dd", dd)
				} else {
					println "WARNING: document number too short to extract a date: '"+no+"'"
				}
			}
253
			
254
			//sign += " "+ichar
255
			sign += " "+title
256
			
257
			if (allTitles[title] == null) allTitles[title] = ichar
258
			if (Math.abs(allTitles[title] - ichar) > LIMITDIFF) {
259
				sign += " "+ichar
260
			}
261
			File xmlFile;
262
			if (done.contains(sign)) {
263
				ignored << sign
264
				xmlFile = new File(rejected, name+"_"+formater.format((itext++))+".xml")
265
				ignoreds << xmlFile.getName()
266
			} else {
267
				done << sign;
268
				xmlFile = new File(outDir2, name+"_"+formater.format((itext++))+".xml")
269
				dones[sign] = xmlFile
270
			}
271
			// Write the text node as a standalone UTF-8 XML file.
			// withPrintWriter closes the file even when printing fails; the previous
			// PrintWriter was never closed, leaking one file handle per text.
			xmlFile.withPrintWriter("UTF-8") { xmlWriter ->
				xmlWriter.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
				new XmlNodePrinter(xmlWriter).print(text)
			}
274
		}
275
	}
276
}
277

  
278
if (ignored.size() > 0) {
279
	File ignoredFile = new File (rejected, "ignored.txt");
280
	ignoredFile.withWriter("UTF-8") { writer ->
281
		writer.println "TOTAL: "+ignored.size()
282
		for (int i = 0 ; i < ignored.size() ; i++) {
283
			def sign = ignored[i]
284
			writer.println "\n**DUPLICATE\n "
285
			writer.println "keeped="+dones[sign];
286
			writer.println "rejected="+ignoreds[i];
287
			writer.println "SIGN="+sign
288
			writer.println "\n"
289
		}
290
	}
291
	println "TOTAL IGNORED: "+ignored.size()
292
}
293

  
294
// itext starts at 1 and is incremented after each written text, so the count is itext - 1
println "TOTAL TEXT: ${itext - 1}"
295
if (!debug) {
296
	outDir.deleteDir()
297
	tmpDir.deleteDir()
298
}
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/hyperlink/EditionHyperlinkMacro.groovy (revision 3591)
1
// STANDARD DECLARATIONS
2
package org.txm.macro.hyperlink
3

  
4
import org.kohsuke.args4j.*
5
import groovy.transform.Field
6
import org.txm.rcp.swt.widget.parameters.*
7
import org.txm.searchengine.cqp.corpus.*
8
import org.txm.edition.rcp.handlers.OpenEdition
9
import org.txm.edition.rcp.editors.RGBA
10

  
11
/**
12
 * The macro use the "stringArgs" Groovy Binding to work. The format is: parameter=value + TAB + parameter2=value2 ...
13
 * 
14
 * Parameters needed:
15
 * - corpus: corpus id
16
 * - text: text id to display
17
 * - page: page id to display
18
 * - editions: optional edition ids to display ("default" is used if not set)
19
 * - wordids: optional word ids to highlight & focus
20
 * 
21
 * This macro can be called from within TXM editions:
22
 * 
23
 * <a onclick="txmcommand('id', 'org.txm.rcp.commands.ExecuteMacro', 'org.txm.rcp.command.parameter.file', 'org/txm/macro/EditionHyperlinkMacro.groovy', 'args' , 'corpus=VOEUX    text=0002    page=3    editions=default')">Open Edition with text+page</a>.
24
 * 
25
 * <a onclick="txmcommand('id', 'org.txm.rcp.commands.ExecuteMacro', 'org.txm.rcp.command.parameter.file', 'org/txm/macro/EditionHyperlinkMacro.groovy', 'args' , 'corpus=VOEUX    text=0002    wordids=w_0002_6    editions=default')">Open Edition with text+wordid</a>.
26
 * 
27
 * <a onclick="txmcommand('id', 'org.txm.rcp.commands.ExecuteMacro', 'org.txm.rcp.command.parameter.file', 'org/txm/macro/EditionHyperlinkMacro.groovy', 'args' , 'corpus=VOEUX    text=0002    wordids=w_0002_6,w_0002_7,w_0002_8,w_0002_9    editions=default')">Open Edition with text+wordids</a>.
28
 */
29

  
30

  
31
if (stringArgs == null) {
32
	println "** Error: this macro must be called from an edition hyperlink"
33
	return
34
}
35

  
36
// Parse the TAB-separated "name=value" pairs of stringArgs into a map.
def params = stringArgs.split("\t")
def hash = [:]
for (def param : params) {
	// limit of 2 keeps any '=' characters that are part of the value
	def split = param.split("=", 2)
	if (split.length < 2) {
		// no '=' in this chunk: reading split[1] would throw, so skip it with a warning
		println "** Warning: ignoring malformed parameter '$param' (expected name=value)"
		continue
	}
	hash[split[0]] = split[1]
}
42
corpus = hash["corpus"]
43
text = hash["text"]
44
page = hash["page"]
45
editions = hash["editions"]
46
if (editions != null) editions = editions.split(",") as List
47
wordids = hash["wordids"]
48
if (wordids != null) wordids = wordids.split(",") as List
49

  
50
println "corpus=$corpus editions=$editions text=$text page=$page wordids=$wordids"
51
// Replace the corpus id by the corpus object it designates; stop here when the id is unknown
// instead of failing later inside the UI runnable.
def requestedCorpusId = corpus
corpus = CorpusManager.getCorpusManager().getCorpora()[requestedCorpusId]
if (corpus == null) {
	println "** Error: no corpus found with id '$requestedCorpusId'"
	return
}
52

  
53
monitor.syncExec(new Runnable() {
54
	public void run() {
55
		editor = OpenEdition.openEdition(corpus, editions)
56
		if (wordids != null && wordids.size() > 0) {
57
			try {
58
				editor.backToText(corpus.getProject().getText(text), wordids[0])
59
				
60
				editor.removeHighlightWords()
61
				editor.addHighlightWordsById(new RGBA(249, 208, 208), wordids)
62
				editor.updateWordStyles()
63
			}catch(Exception e) { e.printStackTrace()}
64
		} else {
65
			editor.goToText(text)
66
			editor.goToPage(page)
67
		}
68
	}
69
});
70

  
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/export/ExportTextContentMacro.groovy (revision 3591)
1
// Copyright © 2020-2022 ENS de Lyon, CNRS, University of Franche-Comté
2
// @author mdecorde
3
// @author sheiden
4

  
5
// STANDARD DECLARATIONS
6
package org.txm.macro.export
7

  
8
import org.txm.searchengine.cqp.CQPSearchEngine
9
import org.txm.searchengine.cqp.corpus.*
10
import org.txm.searchengine.cqp.corpus.query.CQLQuery
11
import org.txm.Toolbox
12
import org.txm.utils.i18n.LangFormater;
13
import org.apache.commons.lang.StringUtils;
14
import org.kohsuke.args4j.*
15
import groovy.transform.Field
16
import org.txm.rcp.swt.widget.parameters.*
17

  
18
if (!(corpusViewSelection instanceof CQPCorpus)) {
19
	println "Please select a corpus"
20
	return
21
}
22

  
23
// PARAMETERS
24

  
25
@Field @Option(name="outputDirectory", usage="results output directory", widget="Folder", required=true, def="")
26
File outputDirectory
27

  
28
@Field @Option(name="wordProperty", usage="word property to export", widget="String", required=true, def="word")
29
def wordProperty
30

  
31
@Field @Option(name="oneWordPerLine", usage="output one word per line", widget="Boolean", required=false, def="false")
32
def oneWordPerLine
33

  
34
@Field @Option(name="oneSentencePerLine", usage="output one sentence per line", widget="Boolean", required=false, def="true")
35
def oneSentencePerLine
36

  
37
@Field @Option(name="sentenceStructureName", usage="name of the structure encoding sentences", widget="String", required=false, def="")
38
def sentenceStructureName
39

  
40
if (!ParametersDialog.open(this)) return
41

  
42
// BEGINNING
43

  
44
if (!outputDirectory.exists()) outputDirectory.mkdirs()
45

  
46
def corpus = corpusViewSelection
47
def corpusName = corpus.getName()
48
def CQI = CQPSearchEngine.getCqiClient()
49

  
50
if (wordProperty == null || !(wordProperty.length() > 0)) {
51
	println "** Please provide a word property name in parameter 'wordProperty', for example 'word'. Aborting..."
52
	return 1
53
}
54

  
55
if (oneSentencePerLine && (sentenceStructureName == null || !(sentenceStructureName.length() > 0))) {
56
	println "** Please provide a name for the structure encoding sentences in parameter 'sentenceStructureName', or uncheck parameter 'oneSentencePerLine'. Aborting..."
57
	return 1
58
}
59

  
60
// When one sentence per line is requested, collect the corpus positions that end a sentence.
if (oneSentencePerLine) {

	lineSeparatorStructure = corpus.getStructuralUnit(sentenceStructureName)

	if (lineSeparatorStructure == null) {
		println "** No $sentenceStructureName structure in the $corpus corpus. Aborting..."
		return 1
	}

	// Store the end positions in a hash set: contains() is called once per corpus position
	// during the export, and a set avoids a linear scan each time. It also avoids relying on
	// how Arrays.asList() wraps the value returned by getEnds()
	// (NOTE(review): presumably an int[] - confirm).
	breaks_pos = new HashSet<Integer>()
	for (int endPosition : corpus.query(new CQLQuery("[]</"+sentenceStructureName+">"),"test", false).getEnds()) {
		breaks_pos << endPosition
	}
}
71

  
72
println "Exporting $corpus text content to $outputDirectory..."
73

  
74
def wordPropertyI = corpus.getProperty(wordProperty)
75

  
76
if (wordPropertyI == null) {
77
	println "** No '$wordProperty' word property in the $corpus corpus. Aborting..."
78
	return 1
79
}
80

  
81
def textidProperty = corpus.getStructuralUnit("text").getProperty("id")
82
def textStartBoundaries = corpus.getTextStartLimits()
83
def textEndBoundaries = corpus.getTextEndLimits()
84
int[] struct_pos = CQI.cpos2Struc(textidProperty.getQualifiedName(), textStartBoundaries)
85
String[] textids =  CQI.struc2Str(textidProperty.getQualifiedName(), struct_pos)
86

  
87
if (textStartBoundaries.size() == 1) {
88
	println "1 text"
89
	} else {
90
	println ""+textStartBoundaries.size()+" texts"
91
}
92

  
93
// Export each text to "<text id>.txt" in outputDirectory.
for (int i = 0 ; i < textStartBoundaries.size() ; i++) {
	int start = textStartBoundaries[i]
	int end = textEndBoundaries[i]

	File txtFile = new File(outputDirectory, textids[i]+".txt")
	print "."

	// corpus positions start..end (inclusive) of the current text
	int[] positions = new int[end - start + 1]
	int c = 0
	for (int p : start..end) {
		positions[c++] = p
	}
	int[] idx = CQI.cpos2Id(wordPropertyI.getQualifiedName(), positions)
	def words = CQI.id2Str(wordPropertyI.getQualifiedName(), idx)

	// withWriter closes the file even if formatting or writing throws
	txtFile.withWriter("UTF-8") { writer ->
		def tmp = [] // words waiting to be written

		// writes the pending words (one per line, or joined and formatted on one line) and empties the buffer
		def flushLine = { ->
			if (oneWordPerLine) {
				tmp.each { word -> writer.println word }
			} else {
				writer.println LangFormater.format(tmp.join(" "), corpus.getLang())
			}
			tmp.clear()
		}

		for (int j = 0 ; j < positions.length ; j++) {
			tmp << words[j]
			// a sentence ends at this position: emit what has been collected so far
			if (oneSentencePerLine && breaks_pos.contains(positions[j])) flushLine()
		}
		// remaining words after the last sentence break (or the whole text when no break applies)
		if (tmp.size() > 0) flushLine()
	}
}
129

  
130
println "\nDone, result saved in "+outputDirectory.getAbsolutePath()
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/export/ExportTextsContentMacro.groovy (revision 3591)
1
// Copyright © 2020-2022 ENS de Lyon, CNRS, University of Franche-Comté
2
// @author mdecorde
3
// @author sheiden
4

  
5
// STANDARD DECLARATIONS
6
package org.txm.macro.export
7

  
8
import org.txm.searchengine.cqp.CQPSearchEngine
9
import org.txm.searchengine.cqp.corpus.*
10
import org.txm.searchengine.cqp.corpus.query.CQLQuery
11
import org.txm.Toolbox
12
import org.txm.utils.i18n.LangFormater;
13
import org.apache.commons.lang.StringUtils;
14
import org.kohsuke.args4j.*
15
import groovy.transform.Field
16
import org.txm.rcp.swt.widget.parameters.*
17

  
18
if (!(corpusViewSelection instanceof CQPCorpus)) {
19
	println "Please select a corpus"
20
	return
21
}
22

  
23
// PARAMETERS
24

  
25
@Field @Option(name="outputDirectory", usage="results output directory", widget="Folder", required=true, def="")
26
File outputDirectory
27

  
28
@Field @Option(name="wordProperty", usage="word property to export", widget="String", required=true, def="word")
29
def wordProperty
30

  
31
@Field @Option(name="oneWordPerLine", usage="output one word per line", widget="Boolean", required=false, def="false")
32
def oneWordPerLine
33

  
34
@Field @Option(name="oneSentencePerLine", usage="output one sentence per line", widget="Boolean", required=false, def="true")
35
def oneSentencePerLine
36

  
37
@Field @Option(name="sentenceStructureName", usage="name of the structure encoding sentences", widget="String", required=false, def="")
38
def sentenceStructureName
39

  
40
if (!ParametersDialog.open(this)) return
41

  
42
// BEGINNING
43

  
44
if (!outputDirectory.exists()) outputDirectory.mkdirs()
45

  
46
def corpus = corpusViewSelection
47
def corpusName = corpus.getName()
48
def CQI = CQPSearchEngine.getCqiClient()
49

  
50
if (wordProperty == null || !(wordProperty.length() > 0)) {
51
	println "** Please provide a word property name in parameter 'wordProperty', for example 'word'. Aborting..."
52
	return 1
53
}
54

  
55
if (oneSentencePerLine && (sentenceStructureName == null || !(sentenceStructureName.length() > 0))) {
56
	println "** Please provide a name for the structure encoding sentences in parameter 'sentenceStructureName', or uncheck parameter 'oneSentencePerLine'. Aborting..."
57
	return 1
58
}
59

  
60
// When one sentence per line is requested, collect the corpus positions that end a sentence.
if (oneSentencePerLine) {

	lineSeparatorStructure = corpus.getStructuralUnit(sentenceStructureName)

	if (lineSeparatorStructure == null) {
		println "** No $sentenceStructureName structure in the $corpus corpus. Aborting..."
		return 1
	}

	// Store the end positions in a hash set: contains() is called once per corpus position
	// during the export, and a set avoids a linear scan each time. It also avoids relying on
	// how Arrays.asList() wraps the value returned by getEnds()
	// (NOTE(review): presumably an int[] - confirm).
	breaks_pos = new HashSet<Integer>()
	for (int endPosition : corpus.query(new CQLQuery("[]</"+sentenceStructureName+">"),"test", false).getEnds()) {
		breaks_pos << endPosition
	}
}
71

  
72
println "Exporting $corpus text content to $outputDirectory..."
73

  
74
def wordPropertyI = corpus.getProperty(wordProperty)
75

  
76
if (wordPropertyI == null) {
77
	println "** No '$wordProperty' word property in the $corpus corpus. Aborting..."
78
	return 1
79
}
80

  
81
def textidProperty = corpus.getStructuralUnit("text").getProperty("id")
82
def textStartBoundaries = corpus.getTextStartLimits()
83
def textEndBoundaries = corpus.getTextEndLimits()
84
int[] struct_pos = CQI.cpos2Struc(textidProperty.getQualifiedName(), textStartBoundaries)
85
String[] textids =  CQI.struc2Str(textidProperty.getQualifiedName(), struct_pos)
86

  
87
if (textStartBoundaries.size() == 1) {
88
	println "1 text"
89
	} else {
90
	println ""+textStartBoundaries.size()+" texts"
91
}
92

  
93
// Export each text to "<text id>.txt" in outputDirectory.
for (int i = 0 ; i < textStartBoundaries.size() ; i++) {
	int start = textStartBoundaries[i]
	int end = textEndBoundaries[i]

	File txtFile = new File(outputDirectory, textids[i]+".txt")
	print "."

	// corpus positions start..end (inclusive) of the current text
	int[] positions = new int[end - start + 1]
	int c = 0
	for (int p : start..end) {
		positions[c++] = p
	}
	int[] idx = CQI.cpos2Id(wordPropertyI.getQualifiedName(), positions)
	def words = CQI.id2Str(wordPropertyI.getQualifiedName(), idx)

	// withWriter closes the file even if formatting or writing throws
	txtFile.withWriter("UTF-8") { writer ->
		def tmp = [] // words waiting to be written

		// writes the pending words (one per line, or joined and formatted on one line) and empties the buffer
		def flushLine = { ->
			if (oneWordPerLine) {
				tmp.each { word -> writer.println word }
			} else {
				writer.println LangFormater.format(tmp.join(" "), corpus.getLang())
			}
			tmp.clear()
		}

		for (int j = 0 ; j < positions.length ; j++) {
			tmp << words[j]
			// a sentence ends at this position: emit what has been collected so far
			if (oneSentencePerLine && breaks_pos.contains(positions[j])) flushLine()
		}
		// remaining words after the last sentence break (or the whole text when no break applies)
		if (tmp.size() > 0) flushLine()
	}
}
129

  
130
println "\nDone, result saved in "+outputDirectory.getAbsolutePath()

Formats disponibles : Unified diff