Révision 3591
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/misc/EuroPress2XMLMacro.groovy (revision 3591) | ||
---|---|---|
1 |
package org.txm.macro.misc |
|
2 |
// STANDARD DECLARATIONS |
|
3 |
|
|
4 |
import groovy.xml.QName |
|
5 |
|
|
6 |
import java.nio.charset.Charset |
|
7 |
import java.text.DecimalFormat |
|
8 |
import org.txm.utils.xml.DomUtils; |
|
9 |
import org.txm.importer.ValidateXml; |
|
10 |
import groovy.util.XmlParser |
|
11 |
import org.kohsuke.args4j.* |
|
12 |
import groovy.transform.Field |
|
13 |
import org.txm.rcp.swt.widget.parameters.* |
|
14 |
|
|
15 |
// BEGINNING OF PARAMETERS |
|
16 |
@Field @Option(name="rootDir", usage="The directory contains the 'orig' directory which contains the html files", widget="Folder", required=false, def="/path") |
|
17 |
File rootDir = new File(""); |
|
18 |
|
|
19 |
@Field @Option(name="encoding", usage="HTML encoding", widget="String", required=false, def="iso-8859-1") |
|
20 |
String encoding = "iso-8859-1" // HTML files encoding |
|
21 |
|
|
22 |
@Field @Option(name="debug", usage="show debug messages. Values = true, false", widget="String", required=false, def="false") |
|
23 |
def debug = "true" // set true to debug the script |
|
24 |
|
|
25 |
// Open the parameters input dialog box |
|
26 |
if (!ParametersDialog.open(this)) return; |
|
27 |
|
|
28 |
debug = ("true" == debug) |
|
29 |
// END OF PARAMETERS |
|
30 |
|
|
31 |
String corpusName = rootDir.getName() |
|
32 |
File srcDir = new File(rootDir, "orig"); |
|
33 |
File outDir = new File(rootDir, "xhtml"); |
|
34 |
File outDir2 = new File(rootDir, corpusName); |
|
35 |
File rejected = new File(rootDir, "duplicates"); |
|
36 |
File tmpDir = new File(rootDir, "tmp"); |
|
37 |
|
|
38 |
|
|
39 |
if (!srcDir.exists()) { |
|
40 |
println "STOP, srcDir does not exists $srcDir" |
|
41 |
return; |
|
42 |
} |
|
43 |
|
|
44 |
outDir.deleteDir() |
|
45 |
outDir.mkdir() |
|
46 |
outDir2.deleteDir() |
|
47 |
outDir2.mkdir() |
|
48 |
rejected.deleteDir() |
|
49 |
rejected.mkdir() |
|
50 |
tmpDir.deleteDir() |
|
51 |
tmpDir.mkdir() |
|
52 |
|
|
53 |
def allTags = new HashSet<String>(); |
|
54 |
def allStyles = new HashSet<String>(); |
|
55 |
def allClasses = new HashSet<String>(); |
|
56 |
def newPrefix = "Numéro de document : " |
|
57 |
int itext = 1; |
|
58 |
def formater = new DecimalFormat("0000"); |
|
59 |
int LIMITDIFF = 10 |
|
60 |
def metadatas = ["DocPublicationName", "DocHeader"] |
|
61 |
def files = [] |
|
62 |
srcDir.eachFileMatch(~/.*\.html/){ htmlFile -> files << htmlFile} |
|
63 |
files = files.sort() |
|
64 |
|
|
65 |
def done = new HashSet<String>(); |
|
66 |
def ignored = [] |
|
67 |
def allTitles = [:] |
|
68 |
def dones = [:] |
|
69 |
def ignoreds = [] |
|
70 |
|
|
71 |
/**
 * Recursively flattens a node (or a plain String) into one line of text.
 *
 * Strings are used as-is; for any other value the text of each element of
 * node.children() is collected in order. Line breaks are replaced by spaces,
 * the result is trimmed and finally prefixed with a single space.
 *
 * @param node a String or an object exposing children()
 * @return the flattened text, always starting with one space
 */
def getText(def node) {
	def collected = new StringBuilder(" ")
	if (node instanceof String) {
		collected.append(" ").append(node)
	} else {
		// visit the children in order, each one contributes its own flattened text
		node.children().each { kid ->
			collected.append(" ").append(getText(kid))
		}
	}
	String flattened = collected.toString().replace("\n", " ").trim()
	return " " + flattened
}
|
83 |
|
|
84 |
println "Nb of HTML files: "+files.size() |
|
85 |
for (File htmlFile : files) { |
|
86 |
println "Processing file $htmlFile" |
|
87 |
File tmpHTML = new File(tmpDir, htmlFile.getName()) |
|
88 |
tmpHTML.withWriter("UTF-8") { writer -> |
|
89 |
String txt = htmlFile.getText(encoding) |
|
90 |
txt = txt.replaceAll("<p></p>", " "); |
|
91 |
txt = txt.replaceAll("<p> </p>", " "); |
|
92 |
txt = txt.replaceAll("<br>", "<br> "); |
|
93 |
writer.write(txt) |
|
94 |
} |
|
95 |
|
|
96 |
String name = htmlFile.getName() |
|
97 |
name = name.substring(0, name.lastIndexOf(".")); |
|
98 |
|
|
99 |
File xhtmlFile = new File(outDir, name+".xhtml") |
|
100 |
|
|
101 |
xhtmlFile.withWriter("UTF-8") { out -> |
|
102 |
def doc = org.jsoup.Jsoup.connect(tmpHTML.toURI().toURL().toString()); |
|
103 |
println "current charset: "+doc.charset() |
|
104 |
doc.charset(Charset.forName("UTF-8")) |
|
105 |
println "current charset: "+doc.charset() |
|
106 |
out.println(doc.outerHtml()) |
|
107 |
} |
|
108 |
|
|
109 |
if (ValidateXml.test(xhtmlFile)) { |
|
110 |
def root = new XmlParser().parse(xhtmlFile) |
|
111 |
def tables = root.body.table.tbody.tr.td |
|
112 |
if (tables.size() == 0) tables = root.body.table.tr.td |
|
113 |
//println "Nb of txt : "+tables.size() |
|
114 |
|
|
115 |
for (def text : tables) { |
|
116 |
String sign = "" |
|
117 |
|
|
118 |
// Skip anything that is not an XML node.
// The parentheses are required: "!text instanceof Node" is parsed as "(!text) instanceof Node", which is always false.
if (!(text instanceof groovy.util.Node)) { println "NOT NODE: "+text; continue; }
|
119 |
//println "TEXT " |
|
120 |
//text.setName("text") |
|
121 |
boolean endOfText = false; |
|
122 |
def textMetadatas = [:] |
|
123 |
for (String metadata : metadatas) { |
|
124 |
textMetadatas[metadata] = "" |
|
125 |
} |
|
126 |
for (def subtable : text.table) text.remove(subtable) |
|
127 |
|
|
128 |
for (def p : text.table.p) p.addChild(" ") |
|
129 |
|
|
130 |
// Replace the red+bold spans by <w expFound="y"> elements holding the same text
// (presumably these spans mark the search expression hits - TODO confirm)
for (def child : text.span) {
	if ("color:red; font-weight:bold".equals(child.@style)) {
		//text.remove(child)
		// the debug message was built but never printed: println was missing
		if (debug) println "Found bold: "+child
		child.replaceNode { node -> w(expFound: "y", child.text())}
	}
}
|
137 |
|
|
138 |
def startIgnoringText = false |
|
139 |
def tmp ="" |
|
140 |
def ichar = 0 |
|
141 |
String title = ""; |
|
142 |
def ignoredText = "" |
|
143 |
def children = text.children() |
|
144 |
for (int idx = 0 ; idx < children.size() ; idx++) { |
|
145 |
|
|
146 |
def child = children[idx] |
|
147 |
if (debug) println "child: $child" |
|
148 |
|
|
149 |
if (startIgnoringText) { |
|
150 |
if (debug) println "Ignoring text : "+ignoredText |
|
151 |
if (child instanceof String) ignoredText += child |
|
152 |
else ignoredText += child.text() |
|
153 |
|
|
154 |
def t = text.children().remove(idx); |
|
155 |
//if (tmp.length() > 0) println "removing : "+t |
|
156 |
idx-- |
|
157 |
continue; // next child |
|
158 |
} |
|
159 |
|
|
160 |
if (child instanceof String) { |
|
161 |
//println " "+child |
|
162 |
ichar += child.length() |
|
163 |
} else { |
|
164 |
ichar += child.text().length() |
|
165 |
// allTags.add(child.name().getLocalPart()) |
|
166 |
// allClasses.add(child.@class) |
|
167 |
// allStyles.add(child.@style) |
|
168 |
def nn = child.name() |
|
169 |
|
|
170 |
try {nn = nn.getLocalPart()} catch(Exception e) {} |
|
171 |
switch (nn) { |
|
172 |
case "br": break; |
|
173 |
case "span": |
|
174 |
if (debug) println "Found span $child" |
|
175 |
String classV = child.@class |
|
176 |
String style = child.@style |
|
177 |
if (classV != null) { |
|
178 |
if (metadatas.contains(classV)) { |
|
179 |
textMetadatas[classV] = (textMetadatas[classV] + " " + child.text()).trim() |
|
180 |
text.remove(child); |
|
181 |
idx-- |
|
182 |
//println "METADATA: "+classV + " = "+child.text().trim().length(); |
|
183 |
} else if ("TitreArticleVisu" == classV) { |
|
184 |
title += getText(child)//.text().trim().replaceAll("\n", " ")+ " " |
|
185 |
child.name = new QName(child.name().getNamespaceURI(), "head", child.name().getPrefix()) |
|
186 |
} else { |
|
187 |
println "UNKNOWED CLASS: "+classV + " = "+child.text().trim().length(); |
|
188 |
} |
|
189 |
} else if (style != null) { |
|
190 |
if ("color:red; font-weight:bold".equals(style)) { |
|
191 |
//child.replaceNode { node -> w(expFound: "test") } |
|
192 |
//println "KEYWORD: "+child.text().trim(); |
|
193 |
} else { |
|
194 |
println "UNKNOWED STYLE: '"+style+"' = "+child.text().trim(); |
|
195 |
} |
|
196 |
} else { |
|
197 |
println "UNKNOWED SPAN: "+child.text().trim(); |
|
198 |
} |
|
199 |
break; |
|
200 |
|
|
201 |
case "a": break |
|
202 |
case "w": break; |
|
203 |
case "b": |
|
204 |
startIgnoringText = true; |
|
205 |
tmp = child.text() |
|
206 |
//if (tmp.length() > 0) println "START REMOVING FROM : "+tmp |
|
207 |
text.remove(child); |
|
208 |
idx-- |
|
209 |
break; |
|
210 |
case "i": break; |
|
211 |
case "font": |
|
212 |
if (debug) println "Found font $child" |
|
213 |
String style = child.@style |
|
214 |
// Map the <font> element to <i> or <b> according to its style attribute;
// any other style is reported and the element is left untouched
if ("font-style:italic;" == style) {
	if (debug) println "ITALIC: "+getText(child).trim();
	child.replaceNode { node -> i(getText(child))}
} else if ("font-weight:bold;" == style) {
	// the style must be compared: a bare non-empty String literal is always true,
	// which turned every non-italic font into <b> and made the last branch unreachable
	if (debug) println "BOLD: "+getText(child).trim();
	child.replaceNode { node -> b(getText(child))}
} else {
	println "FSTYLE: '"+style+"' = "+getText(child).trim();
}
|
223 |
break; |
|
224 |
default: println child.name() |
|
225 |
} |
|
226 |
} |
|
227 |
} |
|
228 |
|
|
229 |
//rename td to text |
|
230 |
text.name = new QName(text.name().getNamespaceURI(), "text", text.name().getPrefix()) |
|
231 |
|
|
232 |
//Write metadatas |
|
233 |
for( String metadata : metadatas) { |
|
234 |
text.attributes().put(metadata, textMetadatas[metadata]) |
|
235 |
//sign+= " "+textMetadatas[metadata].trim() |
|
236 |
} |
|
237 |
|
|
238 |
// get document number |
|
239 |
ignoredText = ignoredText.replaceAll("\n", " ") |
|
240 |
int iNo= ignoredText.indexOf(newPrefix); |
|
241 |
//println ignoredText |
|
242 |
if (iNo >= 0) { |
|
243 |
String no =ignoredText.substring(iNo+newPrefix.length()).trim() |
|
244 |
text.attributes().put("idnews", no) |
|
245 |
//sign += " "+no |
|
246 |
text.attributes().put("date", no.substring(5,9)+"-"+no.substring(9,11)+"-"+no.substring(11,13)) |
|
247 |
text.attributes().put("yyyymmdd", no.substring(5,13)) |
|
248 |
text.attributes().put("yyyymm", no.substring(5,11)) |
|
249 |
text.attributes().put("yyyy", no.substring(5,9)) |
|
250 |
text.attributes().put("mm", no.substring(9,11)) |
|
251 |
text.attributes().put("dd", no.substring(11,13)) |
|
252 |
} |
|
253 |
|
|
254 |
//sign += " "+ichar |
|
255 |
sign += " "+title |
|
256 |
|
|
257 |
if (allTitles[title] == null) allTitles[title] = ichar |
|
258 |
if (Math.abs(allTitles[title] - ichar) > LIMITDIFF) { |
|
259 |
sign += " "+ichar |
|
260 |
} |
|
261 |
File xmlFile; |
|
262 |
if (done.contains(sign)) { |
|
263 |
ignored << sign |
|
264 |
xmlFile = new File(rejected, name+"_"+formater.format((itext++))+".xml") |
|
265 |
ignoreds << xmlFile.getName() |
|
266 |
} else { |
|
267 |
done << sign; |
|
268 |
xmlFile = new File(outDir2, name+"_"+formater.format((itext++))+".xml") |
|
269 |
dones[sign] = xmlFile |
|
270 |
} |
|
271 |
// Serialize the <text> element in its own UTF-8 XML file
def writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(xmlFile) , "UTF-8")); //$NON-NLS-1$
try {
	writer.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
	new XmlNodePrinter(writer).print(text)
} finally {
	// always release the file handle, one writer is opened per exported text
	writer.close()
}
|
274 |
} |
|
275 |
} |
|
276 |
} |
|
277 |
|
|
278 |
if (ignored.size() > 0) { |
|
279 |
File ignoredFile = new File (rejected, "ignored.txt"); |
|
280 |
ignoredFile.withWriter("UTF-8") { writer -> |
|
281 |
writer.println "TOTAL: "+ignored.size() |
|
282 |
for (int i = 0 ; i < ignored.size() ; i++) { |
|
283 |
def sign = ignored[i] |
|
284 |
writer.println "\n**DUPLICATE\n " |
|
285 |
writer.println "keeped="+dones[sign]; |
|
286 |
writer.println "rejected="+ignoreds[i]; |
|
287 |
writer.println "SIGN="+sign |
|
288 |
writer.println "\n" |
|
289 |
} |
|
290 |
} |
|
291 |
println "TOTAL IGNORED: "+ignored.size() |
|
292 |
} |
|
293 |
|
|
294 |
println "TOTAL TEXT: $itext" |
|
295 |
if (!debug) { |
|
296 |
outDir.deleteDir() |
|
297 |
tmpDir.deleteDir() |
|
298 |
} |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/misc/EuroPresse2XMLMacro.groovy (revision 3591) | ||
---|---|---|
1 |
package org.txm.macro.misc |
|
2 |
// STANDARD DECLARATIONS |
|
3 |
|
|
4 |
import groovy.xml.QName |
|
5 |
|
|
6 |
import java.nio.charset.Charset |
|
7 |
import java.text.DecimalFormat |
|
8 |
import org.txm.utils.xml.DomUtils; |
|
9 |
import org.txm.importer.ValidateXml; |
|
10 |
import groovy.util.XmlParser |
|
11 |
import org.kohsuke.args4j.* |
|
12 |
import groovy.transform.Field |
|
13 |
import org.txm.rcp.swt.widget.parameters.* |
|
14 |
|
|
15 |
// BEGINNING OF PARAMETERS |
|
16 |
@Field @Option(name="rootDir", usage="The directory contains the 'orig' directory which contains the html files", widget="Folder", required=false, def="/path") |
|
17 |
File rootDir = new File(""); |
|
18 |
|
|
19 |
@Field @Option(name="encoding", usage="HTML encoding", widget="String", required=false, def="iso-8859-1") |
|
20 |
String encoding = "iso-8859-1" // HTML files encoding |
|
21 |
|
|
22 |
@Field @Option(name="debug", usage="show debug messages. Values = true, false", widget="String", required=false, def="false") |
|
23 |
def debug = "true" // set true to debug the script |
|
24 |
|
|
25 |
// Open the parameters input dialog box |
|
26 |
if (!ParametersDialog.open(this)) return; |
|
27 |
|
|
28 |
debug = ("true" == debug) |
|
29 |
// END OF PARAMETERS |
|
30 |
|
|
31 |
String corpusName = rootDir.getName() |
|
32 |
File srcDir = new File(rootDir, "orig"); |
|
33 |
File outDir = new File(rootDir, "xhtml"); |
|
34 |
File outDir2 = new File(rootDir, corpusName); |
|
35 |
File rejected = new File(rootDir, "duplicates"); |
|
36 |
File tmpDir = new File(rootDir, "tmp"); |
|
37 |
|
|
38 |
|
|
39 |
if (!srcDir.exists()) { |
|
40 |
println "STOP, srcDir does not exists $srcDir" |
|
41 |
return; |
|
42 |
} |
|
43 |
|
|
44 |
outDir.deleteDir() |
|
45 |
outDir.mkdir() |
|
46 |
outDir2.deleteDir() |
|
47 |
outDir2.mkdir() |
|
48 |
rejected.deleteDir() |
|
49 |
rejected.mkdir() |
|
50 |
tmpDir.deleteDir() |
|
51 |
tmpDir.mkdir() |
|
52 |
|
|
53 |
def allTags = new HashSet<String>(); |
|
54 |
def allStyles = new HashSet<String>(); |
|
55 |
def allClasses = new HashSet<String>(); |
|
56 |
def newPrefix = "Numéro de document : " |
|
57 |
int itext = 1; |
|
58 |
def formater = new DecimalFormat("0000"); |
|
59 |
int LIMITDIFF = 10 |
|
60 |
def metadatas = ["DocPublicationName", "DocHeader"] |
|
61 |
def files = [] |
|
62 |
srcDir.eachFileMatch(~/.*\.html/){ htmlFile -> files << htmlFile} |
|
63 |
files = files.sort() |
|
64 |
|
|
65 |
def done = new HashSet<String>(); |
|
66 |
def ignored = [] |
|
67 |
def allTitles = [:] |
|
68 |
def dones = [:] |
|
69 |
def ignoreds = [] |
|
70 |
|
|
71 |
/**
 * Recursively flattens a node (or a plain String) into one line of text.
 *
 * Strings are used as-is; for any other value the text of each element of
 * node.children() is collected in order. Line breaks are replaced by spaces,
 * the result is trimmed and finally prefixed with a single space.
 *
 * @param node a String or an object exposing children()
 * @return the flattened text, always starting with one space
 */
def getText(def node) {
	def collected = new StringBuilder(" ")
	if (node instanceof String) {
		collected.append(" ").append(node)
	} else {
		// visit the children in order, each one contributes its own flattened text
		node.children().each { kid ->
			collected.append(" ").append(getText(kid))
		}
	}
	String flattened = collected.toString().replace("\n", " ").trim()
	return " " + flattened
}
|
83 |
|
|
84 |
println "Nb of HTML files: "+files.size() |
|
85 |
for (File htmlFile : files) { |
|
86 |
println "Processing file $htmlFile" |
|
87 |
File tmpHTML = new File(tmpDir, htmlFile.getName()) |
|
88 |
tmpHTML.withWriter("UTF-8") { writer -> |
|
89 |
String txt = htmlFile.getText(encoding) |
|
90 |
txt = txt.replaceAll("<p></p>", " "); |
|
91 |
txt = txt.replaceAll("<p> </p>", " "); |
|
92 |
txt = txt.replaceAll("<br>", "<br> "); |
|
93 |
writer.write(txt) |
|
94 |
} |
|
95 |
|
|
96 |
String name = htmlFile.getName() |
|
97 |
name = name.substring(0, name.lastIndexOf(".")); |
|
98 |
|
|
99 |
File xhtmlFile = new File(outDir, name+".xhtml") |
|
100 |
|
|
101 |
xhtmlFile.withWriter("UTF-8") { out -> |
|
102 |
def doc = org.jsoup.Jsoup.connect(tmpHTML.toURI().toURL().toString()); |
|
103 |
println "current charset: "+doc.charset() |
|
104 |
doc.charset(Charset.forName("UTF-8")) |
|
105 |
println "current charset: "+doc.charset() |
|
106 |
out.println(doc.outerHtml()) |
|
107 |
} |
|
108 |
|
|
109 |
if (ValidateXml.test(xhtmlFile)) { |
|
110 |
def root = new XmlParser().parse(xhtmlFile) |
|
111 |
def tables = root.body.table.tbody.tr.td |
|
112 |
if (tables.size() == 0) tables = root.body.table.tr.td |
|
113 |
//println "Nb of txt : "+tables.size() |
|
114 |
|
|
115 |
for (def text : tables) { |
|
116 |
String sign = "" |
|
117 |
|
|
118 |
// Skip anything that is not an XML node.
// The parentheses are required: "!text instanceof Node" is parsed as "(!text) instanceof Node", which is always false.
if (!(text instanceof groovy.util.Node)) { println "NOT NODE: "+text; continue; }
|
119 |
//println "TEXT " |
|
120 |
//text.setName("text") |
|
121 |
boolean endOfText = false; |
|
122 |
def textMetadatas = [:] |
|
123 |
for (String metadata : metadatas) { |
|
124 |
textMetadatas[metadata] = "" |
|
125 |
} |
|
126 |
for (def subtable : text.table) text.remove(subtable) |
|
127 |
|
|
128 |
for (def p : text.table.p) p.addChild(" ") |
|
129 |
|
|
130 |
// Replace the red+bold spans by <w expFound="y"> elements holding the same text
// (presumably these spans mark the search expression hits - TODO confirm)
for (def child : text.span) {
	if ("color:red; font-weight:bold".equals(child.@style)) {
		//text.remove(child)
		// the debug message was built but never printed: println was missing
		if (debug) println "Found bold: "+child
		child.replaceNode { node -> w(expFound: "y", child.text())}
	}
}
|
137 |
|
|
138 |
def startIgnoringText = false |
|
139 |
def tmp ="" |
|
140 |
def ichar = 0 |
|
141 |
String title = ""; |
|
142 |
def ignoredText = "" |
|
143 |
def children = text.children() |
|
144 |
for (int idx = 0 ; idx < children.size() ; idx++) { |
|
145 |
|
|
146 |
def child = children[idx] |
|
147 |
if (debug) println "child: $child" |
|
148 |
|
|
149 |
if (startIgnoringText) { |
|
150 |
if (debug) println "Ignoring text : "+ignoredText |
|
151 |
if (child instanceof String) ignoredText += child |
|
152 |
else ignoredText += child.text() |
|
153 |
|
|
154 |
def t = text.children().remove(idx); |
|
155 |
//if (tmp.length() > 0) println "removing : "+t |
|
156 |
idx-- |
|
157 |
continue; // next child |
|
158 |
} |
|
159 |
|
|
160 |
if (child instanceof String) { |
|
161 |
//println " "+child |
|
162 |
ichar += child.length() |
|
163 |
} else { |
|
164 |
ichar += child.text().length() |
|
165 |
// allTags.add(child.name().getLocalPart()) |
|
166 |
// allClasses.add(child.@class) |
|
167 |
// allStyles.add(child.@style) |
|
168 |
def nn = child.name() |
|
169 |
|
|
170 |
try {nn = nn.getLocalPart()} catch(Exception e) {} |
|
171 |
switch (nn) { |
|
172 |
case "br": break; |
|
173 |
case "span": |
|
174 |
if (debug) println "Found span $child" |
|
175 |
String classV = child.@class |
|
176 |
String style = child.@style |
|
177 |
if (classV != null) { |
|
178 |
if (metadatas.contains(classV)) { |
|
179 |
textMetadatas[classV] = (textMetadatas[classV] + " " + child.text()).trim() |
|
180 |
text.remove(child); |
|
181 |
idx-- |
|
182 |
//println "METADATA: "+classV + " = "+child.text().trim().length(); |
|
183 |
} else if ("TitreArticleVisu" == classV) { |
|
184 |
title += getText(child)//.text().trim().replaceAll("\n", " ")+ " " |
|
185 |
child.name = new QName(child.name().getNamespaceURI(), "head", child.name().getPrefix()) |
|
186 |
} else { |
|
187 |
println "UNKNOWED CLASS: "+classV + " = "+child.text().trim().length(); |
|
188 |
} |
|
189 |
} else if (style != null) { |
|
190 |
if ("color:red; font-weight:bold".equals(style)) { |
|
191 |
//child.replaceNode { node -> w(expFound: "test") } |
|
192 |
//println "KEYWORD: "+child.text().trim(); |
|
193 |
} else { |
|
194 |
println "UNKNOWED STYLE: '"+style+"' = "+child.text().trim(); |
|
195 |
} |
|
196 |
} else { |
|
197 |
println "UNKNOWED SPAN: "+child.text().trim(); |
|
198 |
} |
|
199 |
break; |
|
200 |
|
|
201 |
case "a": break |
|
202 |
case "w": break; |
|
203 |
case "b": |
|
204 |
startIgnoringText = true; |
|
205 |
tmp = child.text() |
|
206 |
//if (tmp.length() > 0) println "START REMOVING FROM : "+tmp |
|
207 |
text.remove(child); |
|
208 |
idx-- |
|
209 |
break; |
|
210 |
case "i": break; |
|
211 |
case "font": |
|
212 |
if (debug) println "Found font $child" |
|
213 |
String style = child.@style |
|
214 |
// Map the <font> element to <i> or <b> according to its style attribute;
// any other style is reported and the element is left untouched
if ("font-style:italic;" == style) {
	if (debug) println "ITALIC: "+getText(child).trim();
	child.replaceNode { node -> i(getText(child))}
} else if ("font-weight:bold;" == style) {
	// the style must be compared: a bare non-empty String literal is always true,
	// which turned every non-italic font into <b> and made the last branch unreachable
	if (debug) println "BOLD: "+getText(child).trim();
	child.replaceNode { node -> b(getText(child))}
} else {
	println "FSTYLE: '"+style+"' = "+getText(child).trim();
}
|
223 |
break; |
|
224 |
default: println child.name() |
|
225 |
} |
|
226 |
} |
|
227 |
} |
|
228 |
|
|
229 |
//rename td to text |
|
230 |
text.name = new QName(text.name().getNamespaceURI(), "text", text.name().getPrefix()) |
|
231 |
|
|
232 |
//Write metadatas |
|
233 |
for( String metadata : metadatas) { |
|
234 |
text.attributes().put(metadata, textMetadatas[metadata]) |
|
235 |
//sign+= " "+textMetadatas[metadata].trim() |
|
236 |
} |
|
237 |
|
|
238 |
// get document number |
|
239 |
ignoredText = ignoredText.replaceAll("\n", " ") |
|
240 |
int iNo= ignoredText.indexOf(newPrefix); |
|
241 |
//println ignoredText |
|
242 |
if (iNo >= 0) { |
|
243 |
String no =ignoredText.substring(iNo+newPrefix.length()).trim() |
|
244 |
text.attributes().put("idnews", no) |
|
245 |
//sign += " "+no |
|
246 |
text.attributes().put("date", no.substring(5,9)+"-"+no.substring(9,11)+"-"+no.substring(11,13)) |
|
247 |
text.attributes().put("yyyymmdd", no.substring(5,13)) |
|
248 |
text.attributes().put("yyyymm", no.substring(5,11)) |
|
249 |
text.attributes().put("yyyy", no.substring(5,9)) |
|
250 |
text.attributes().put("mm", no.substring(9,11)) |
|
251 |
text.attributes().put("dd", no.substring(11,13)) |
|
252 |
} |
|
253 |
|
|
254 |
//sign += " "+ichar |
|
255 |
sign += " "+title |
|
256 |
|
|
257 |
if (allTitles[title] == null) allTitles[title] = ichar |
|
258 |
if (Math.abs(allTitles[title] - ichar) > LIMITDIFF) { |
|
259 |
sign += " "+ichar |
|
260 |
} |
|
261 |
File xmlFile; |
|
262 |
if (done.contains(sign)) { |
|
263 |
ignored << sign |
|
264 |
xmlFile = new File(rejected, name+"_"+formater.format((itext++))+".xml") |
|
265 |
ignoreds << xmlFile.getName() |
|
266 |
} else { |
|
267 |
done << sign; |
|
268 |
xmlFile = new File(outDir2, name+"_"+formater.format((itext++))+".xml") |
|
269 |
dones[sign] = xmlFile |
|
270 |
} |
|
271 |
// Serialize the <text> element in its own UTF-8 XML file
def writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(xmlFile) , "UTF-8")); //$NON-NLS-1$
try {
	writer.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
	new XmlNodePrinter(writer).print(text)
} finally {
	// always release the file handle, one writer is opened per exported text
	writer.close()
}
|
274 |
} |
|
275 |
} |
|
276 |
} |
|
277 |
|
|
278 |
if (ignored.size() > 0) { |
|
279 |
File ignoredFile = new File (rejected, "ignored.txt"); |
|
280 |
ignoredFile.withWriter("UTF-8") { writer -> |
|
281 |
writer.println "TOTAL: "+ignored.size() |
|
282 |
for (int i = 0 ; i < ignored.size() ; i++) { |
|
283 |
def sign = ignored[i] |
|
284 |
writer.println "\n**DUPLICATE\n " |
|
285 |
writer.println "keeped="+dones[sign]; |
|
286 |
writer.println "rejected="+ignoreds[i]; |
|
287 |
writer.println "SIGN="+sign |
|
288 |
writer.println "\n" |
|
289 |
} |
|
290 |
} |
|
291 |
println "TOTAL IGNORED: "+ignored.size() |
|
292 |
} |
|
293 |
|
|
294 |
println "TOTAL TEXT: $itext" |
|
295 |
if (!debug) { |
|
296 |
outDir.deleteDir() |
|
297 |
tmpDir.deleteDir() |
|
298 |
} |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/hyperlink/EditionHyperlinkMacro.groovy (revision 3591) | ||
---|---|---|
1 |
// STANDARD DECLARATIONS |
|
2 |
package org.txm.macro.hyperlink |
|
3 |
|
|
4 |
import org.kohsuke.args4j.* |
|
5 |
import groovy.transform.Field |
|
6 |
import org.txm.rcp.swt.widget.parameters.* |
|
7 |
import org.txm.searchengine.cqp.corpus.* |
|
8 |
import org.txm.edition.rcp.handlers.OpenEdition |
|
9 |
import org.txm.edition.rcp.editors.RGBA |
|
10 |
|
|
11 |
/** |
|
12 |
* The macro uses the "stringArgs" Groovy Binding to work. The format is: parameter=value + TAB + parameter2=value2 ... |
|
13 |
* |
|
14 |
* Parameters needed: |
|
15 |
* - corpus: corpus id |
|
16 |
* - text: text id to display |
|
17 |
* - page: page id to display |
|
18 |
* - editions: optional edition ids to display ("default" is used if not set) |
|
19 |
* - wordids: optional word ids to highlight & focus |
|
20 |
* |
|
21 |
* This macro can be called from within TXM editions: |
|
22 |
* |
|
23 |
* <a onclick="txmcommand('id', 'org.txm.rcp.commands.ExecuteMacro', 'org.txm.rcp.command.parameter.file', 'org/txm/macro/EditionHyperlinkMacro.groovy', 'args' , 'corpus=VOEUX text=0002 page=3 editions=default')">Open Edition with text+page</a>. |
|
24 |
* |
|
25 |
* <a onclick="txmcommand('id', 'org.txm.rcp.commands.ExecuteMacro', 'org.txm.rcp.command.parameter.file', 'org/txm/macro/EditionHyperlinkMacro.groovy', 'args' , 'corpus=VOEUX text=0002 wordids=w_0002_6 editions=default')">Open Edition with text+wordid</a>. |
|
26 |
* |
|
27 |
* <a onclick="txmcommand('id', 'org.txm.rcp.commands.ExecuteMacro', 'org.txm.rcp.command.parameter.file', 'org/txm/macro/EditionHyperlinkMacro.groovy', 'args' , 'corpus=VOEUX text=0002 wordids=w_0002_6,w_0002_7,w_0002_8,w_0002_9 editions=default')">Open Edition with text+wordids</a>. |
|
28 |
*/ |
|
29 |
|
|
30 |
|
|
31 |
if (stringArgs == null) { |
|
32 |
println "** Error: this macro must be called from an edition hyperlink" |
|
33 |
return |
|
34 |
} |
|
35 |
|
|
36 |
def params = stringArgs.split("\t") |
|
37 |
def hash = [:] |
|
38 |
// Parse each "name=value" argument; only the first '=' separates the name from the value
for (def param : params) {
	def split = param.split("=", 2)
	if (split.length < 2) {
		// no '=' in this argument: skip it instead of failing with an ArrayIndexOutOfBoundsException
		println "** Warning: ignoring malformed argument '$param' (expected name=value)"
		continue
	}
	hash[split[0]] = split[1]
}
|
42 |
corpus = hash["corpus"] |
|
43 |
text = hash["text"] |
|
44 |
page = hash["page"] |
|
45 |
editions = hash["editions"] |
|
46 |
if (editions != null) editions = editions.split(",") as List |
|
47 |
wordids = hash["wordids"] |
|
48 |
if (wordids != null) wordids = wordids.split(",") as List |
|
49 |
|
|
50 |
println "corpus=$corpus editions=$editions text=$text page=$page wordids=$wordids" |
|
51 |
corpus = CorpusManager.getCorpusManager().getCorpora()[corpus] |
|
52 |
|
|
53 |
monitor.syncExec(new Runnable() { |
|
54 |
public void run() { |
|
55 |
editor = OpenEdition.openEdition(corpus, editions) |
|
56 |
if (wordids != null && wordids.size() > 0) { |
|
57 |
try { |
|
58 |
editor.backToText(corpus.getProject().getText(text), wordids[0]) |
|
59 |
|
|
60 |
editor.removeHighlightWords() |
|
61 |
editor.addHighlightWordsById(new RGBA(249, 208, 208), wordids) |
|
62 |
editor.updateWordStyles() |
|
63 |
}catch(Exception e) { e.printStackTrace()} |
|
64 |
} else { |
|
65 |
editor.goToText(text) |
|
66 |
editor.goToPage(page) |
|
67 |
} |
|
68 |
} |
|
69 |
}); |
|
70 |
|
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/export/ExportTextContentMacro.groovy (revision 3591) | ||
---|---|---|
1 |
// Copyright © 2020-2022 ENS de Lyon, CNRS, University of Franche-Comté |
|
2 |
// @author mdecorde |
|
3 |
// @author sheiden |
|
4 |
|
|
5 |
// STANDARD DECLARATIONS |
|
6 |
package org.txm.macro.export |
|
7 |
|
|
8 |
import org.txm.searchengine.cqp.CQPSearchEngine |
|
9 |
import org.txm.searchengine.cqp.corpus.* |
|
10 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery |
|
11 |
import org.txm.Toolbox |
|
12 |
import org.txm.utils.i18n.LangFormater; |
|
13 |
import org.apache.commons.lang.StringUtils; |
|
14 |
import org.kohsuke.args4j.* |
|
15 |
import groovy.transform.Field |
|
16 |
import org.txm.rcp.swt.widget.parameters.* |
|
17 |
|
|
18 |
if (!(corpusViewSelection instanceof CQPCorpus)) { |
|
19 |
println "Please select a corpus" |
|
20 |
return |
|
21 |
} |
|
22 |
|
|
23 |
// PARAMETERS |
|
24 |
|
|
25 |
@Field @Option(name="outputDirectory", usage="results output directory", widget="Folder", required=true, def="") |
|
26 |
File outputDirectory |
|
27 |
|
|
28 |
@Field @Option(name="wordProperty", usage="word property to export", widget="String", required=true, def="word") |
|
29 |
def wordProperty |
|
30 |
|
|
31 |
@Field @Option(name="oneWordPerLine", usage="output one word per line", widget="Boolean", required=false, def="false") |
|
32 |
def oneWordPerLine |
|
33 |
|
|
34 |
@Field @Option(name="oneSentencePerLine", usage="output one sentence per line", widget="Boolean", required=false, def="true") |
|
35 |
def oneSentencePerLine |
|
36 |
|
|
37 |
@Field @Option(name="sentenceStructureName", usage="name of the structure encoding sentences", widget="String", required=false, def="") |
|
38 |
def sentenceStructureName |
|
39 |
|
|
40 |
if (!ParametersDialog.open(this)) return |
|
41 |
|
|
42 |
// BEGINNING |
|
43 |
|
|
44 |
if (!outputDirectory.exists()) outputDirectory.mkdirs() |
|
45 |
|
|
46 |
def corpus = corpusViewSelection |
|
47 |
def corpusName = corpus.getName() |
|
48 |
def CQI = CQPSearchEngine.getCqiClient() |
|
49 |
|
|
50 |
if (wordProperty == null || !(wordProperty.length() > 0)) { |
|
51 |
println "** Please provide a word property name in parameter 'wordProperty', for example 'word'. Aborting..." |
|
52 |
return 1 |
|
53 |
} |
|
54 |
|
|
55 |
if (oneSentencePerLine && (sentenceStructureName == null || !(sentenceStructureName.length() > 0))) { |
|
56 |
println "** Please provide a name for the structure encoding sentences in parameter 'sentenceStructureName', or uncheck parameter 'oneSentencePerLine'. Aborting..." |
|
57 |
return 1 |
|
58 |
} |
|
59 |
|
|
60 |
if (oneSentencePerLine) { |
|
61 |
|
|
62 |
lineSeparatorStructure = corpus.getStructuralUnit(sentenceStructureName) |
|
63 |
|
|
64 |
if (lineSeparatorStructure == null) { |
|
65 |
println "** No $sentenceStructureName structure in the $corpus corpus. Aborting..." |
|
66 |
return 1 |
|
67 |
} |
|
68 |
|
|
69 |
breaks_pos = Arrays.asList(corpus.query(new CQLQuery("[]</"+sentenceStructureName+">"),"test", false).getEnds()) |
|
70 |
} |
|
71 |
|
|
72 |
println "Exporting $corpus text content to $outputDirectory..." |
|
73 |
|
|
74 |
def wordPropertyI = corpus.getProperty(wordProperty) |
|
75 |
|
|
76 |
if (wordPropertyI == null) { |
|
77 |
println "** No '$wordProperty' word property in the $corpus corpus. Aborting..." |
|
78 |
return 1 |
|
79 |
} |
|
80 |
|
|
81 |
def textidProperty = corpus.getStructuralUnit("text").getProperty("id") |
|
82 |
def textStartBoundaries = corpus.getTextStartLimits() |
|
83 |
def textEndBoundaries = corpus.getTextEndLimits() |
|
84 |
int[] struct_pos = CQI.cpos2Struc(textidProperty.getQualifiedName(), textStartBoundaries) |
|
85 |
String[] textids = CQI.struc2Str(textidProperty.getQualifiedName(), struct_pos) |
|
86 |
|
|
87 |
if (textStartBoundaries.size() == 1) { |
|
88 |
println "1 text" |
|
89 |
} else { |
|
90 |
println ""+textStartBoundaries.size()+" texts" |
|
91 |
} |
|
92 |
|
|
93 |
// Export each text in its own "<text id>.txt" file
for (int i = 0 ; i < textStartBoundaries.size() ; i++) {
	int start = textStartBoundaries[i]
	int end = textEndBoundaries[i]

	File txtFile = new File(outputDirectory, textids[i]+".txt")
	print "."
	def writer = txtFile.newWriter("UTF-8")
	try {
		// all the corpus positions of the text, bounds included
		int[] positions = new int[end - start + 1]
		int c = 0
		for (int p : start..end) {
			positions[c++] = p
		}
		int[] idx = CQI.cpos2Id(wordPropertyI.getQualifiedName(), positions)
		def words = CQI.id2Str(wordPropertyI.getQualifiedName(), idx)

		// writes the pending words: one per line, or joined and formatted for the corpus language
		def flush = { pending ->
			if (oneWordPerLine) {
				pending.each { word -> writer.println word }
			} else {
				writer.println LangFormater.format(pending.join(" "), corpus.getLang())
			}
		}

		def tmp = []
		for (int j = 0 ; j < positions.length ; j++) {
			int p = positions[j]
			tmp << words[j]
			// breaks_pos is only set when oneSentencePerLine is enabled
			if (oneSentencePerLine && breaks_pos.contains(p)) {
				flush(tmp)
				tmp = []
			}
		}
		// words remaining after the last sentence break (or the whole text when no break is used)
		if (tmp.size() > 0) {
			flush(tmp)
		}
	} finally {
		// release the file even if a CQI call or the formatting fails
		writer.close()
	}
}
|
129 |
|
|
130 |
println "\nDone, result saved in "+outputDirectory.getAbsolutePath() |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/export/ExportTextsContentMacro.groovy (revision 3591) | ||
---|---|---|
1 |
// Copyright © 2020-2022 ENS de Lyon, CNRS, University of Franche-Comté |
|
2 |
// @author mdecorde |
|
3 |
// @author sheiden |
|
4 |
|
|
5 |
// STANDARD DECLARATIONS |
|
6 |
package org.txm.macro.export |
|
7 |
|
|
8 |
import org.txm.searchengine.cqp.CQPSearchEngine |
|
9 |
import org.txm.searchengine.cqp.corpus.* |
|
10 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery |
|
11 |
import org.txm.Toolbox |
|
12 |
import org.txm.utils.i18n.LangFormater; |
|
13 |
import org.apache.commons.lang.StringUtils; |
|
14 |
import org.kohsuke.args4j.* |
|
15 |
import groovy.transform.Field |
|
16 |
import org.txm.rcp.swt.widget.parameters.* |
|
17 |
|
|
18 |
// The macro needs a CQP corpus as the current selection; stop otherwise.
boolean corpusSelected = (corpusViewSelection instanceof CQPCorpus)
if (!corpusSelected) {
	println "Please select a corpus"
	return
}

// PARAMETERS

// Directory in which the exported files are written.
@Field @Option(name="outputDirectory", usage="results output directory", widget="Folder", required=true, def="")
File outputDirectory

// Name of the word property whose values are exported.
@Field @Option(name="wordProperty", usage="word property to export", widget="String", required=true, def="word")
def wordProperty

// When set, each word is written on its own line.
@Field @Option(name="oneWordPerLine", usage="output one word per line", widget="Boolean", required=false, def="false")
def oneWordPerLine

// When set, output lines are cut at the end of each sentence structure.
@Field @Option(name="oneSentencePerLine", usage="output one sentence per line", widget="Boolean", required=false, def="true")
def oneSentencePerLine

// Name of the structure that encodes sentences; required when oneSentencePerLine is set.
@Field @Option(name="sentenceStructureName", usage="name of the structure encoding sentences", widget="String", required=false, def="")
def sentenceStructureName

// Open the parameters dialog; stop when it does not report success.
def dialogAccepted = ParametersDialog.open(this)
if (!dialogAccepted) return
|
41 |
|
|
42 |
// BEGINNING |
|
43 |
|
|
44 |
// Create the output directory (and any missing parent) when it does not exist yet.
if (!outputDirectory.exists()) {
	outputDirectory.mkdirs()
}

def corpus = corpusViewSelection
def corpusName = corpus.getName()
def CQI = CQPSearchEngine.getCqiClient()

// A non-empty word property name is mandatory.
if (wordProperty == null || wordProperty.length() <= 0) {
	println "** Please provide a word property name in parameter 'wordProperty', for example 'word'. Aborting..."
	return 1
}

// Splitting on sentences requires the name of the structure that encodes them.
if (oneSentencePerLine && (sentenceStructureName == null || sentenceStructureName.length() <= 0)) {
	println "** Please provide a name for the structure encoding sentences in parameter 'sentenceStructureName', or uncheck parameter 'oneSentencePerLine'. Aborting..."
	return 1
}
|
59 |
|
|
60 |
// When exporting one sentence per line, resolve the sentence structure and collect
// the corpus positions at which a sentence ends.
if (oneSentencePerLine) {

	lineSeparatorStructure = corpus.getStructuralUnit(sentenceStructureName)

	if (lineSeparatorStructure == null) {
		println "** No $sentenceStructureName structure in the $corpus corpus. Aborting..."
		return 1
	}

	// The query matches every token immediately followed by the closing tag of the
	// sentence structure; getEnds() supplies the end positions of those matches.
	// The positions are kept in a HashSet because the export loop calls
	// breaks_pos.contains(p) once per word: a list lookup there scans all sentence
	// ends for every token, while the set answers with a hash lookup.
	// The elements are the same as in the previous list, so contains() is unchanged.
	breaks_pos = new HashSet(Arrays.asList(corpus.query(new CQLQuery("[]</"+sentenceStructureName+">"),"test", false).getEnds()))
}
|
71 |
|
|
72 |
println "Exporting $corpus text content to $outputDirectory..."

// Resolve the word property to export; stop when the corpus does not have it.
def wordPropertyI = corpus.getProperty(wordProperty)
if (wordPropertyI == null) {
	println "** No '$wordProperty' word property in the $corpus corpus. Aborting..."
	return 1
}

// Text boundaries and, for each text, the value of its "id" property
// (used later to name the output files).
def textidProperty = corpus.getStructuralUnit("text").getProperty("id")
def textStartBoundaries = corpus.getTextStartLimits()
def textEndBoundaries = corpus.getTextEndLimits()
int[] struct_pos = CQI.cpos2Struc(textidProperty.getQualifiedName(), textStartBoundaries)
String[] textids = CQI.struc2Str(textidProperty.getQualifiedName(), struct_pos)

// Report how many texts will be exported, with a singular form for exactly one.
int textCount = textStartBoundaries.size()
println(textCount == 1 ? "1 text" : ""+textCount+" texts")
|
92 |
|
|
93 |
// Writes the words buffered for one output unit (a sentence, or the remainder of a text)
// to 'out': one word per line when oneWordPerLine is set, otherwise the words joined
// with spaces and passed through LangFormater for the corpus language.
def writeBufferedWords = { out, buffered ->
	if (oneWordPerLine) {
		buffered.each { word -> out.println word }
	} else {
		out.println LangFormater.format(buffered.join(" "), corpus.getLang())
	}
}

// Export every text of the corpus to its own "<text id>.txt" file in outputDirectory.
for (int i = 0 ; i < textStartBoundaries.size() ; i++) {
	int start = textStartBoundaries[i]
	int end = textEndBoundaries[i]

	File txtFile = new File(outputDirectory, textids[i]+".txt")
	print "."

	// Corpus positions start..end (both inclusive) of the current text.
	int[] positions = new int[end - start + 1]
	for (int k = 0 ; k < positions.length ; k++) {
		positions[k] = start + k
	}

	// The word values are fetched before the file is opened, so a failing lookup
	// does not leave an empty output file behind.
	int[] idx = CQI.cpos2Id(wordPropertyI.getQualifiedName(), positions)
	def words = CQI.id2Str(wordPropertyI.getQualifiedName(), idx)

	// withWriter closes the writer when the closure ends, including when it throws;
	// the previous newWriter()/close() pair leaked the file handle on any exception.
	txtFile.withWriter("UTF-8") { writer ->
		def tmp = []
		for (int j = 0 ; j < positions.length ; j++) {
			tmp << words[j]
			// A position listed in breaks_pos ends the current sentence: emit the buffer.
			if (oneSentencePerLine && breaks_pos.contains(positions[j])) {
				writeBufferedWords(writer, tmp)
				tmp = []
			}
		}
		// Emit whatever remains after the last sentence break (or the whole text
		// when oneSentencePerLine is off).
		if (tmp.size() > 0) {
			writeBufferedWords(writer, tmp)
		}
	}
}

println "\nDone, result saved in "+outputDirectory.getAbsolutePath()
Formats disponibles : Unified diff