root / tmp / org.txm.core / src / java / org / txm / scripts / importer / GetPagesFromURL.groovy @ 1688
History | View | Annotate | Download (2.4 kB)
1 |
package org.txm.scripts.importer
|
---|---|
2 |
|
3 |
import org.w3c.tidy.* |
4 |
|
5 |
// ---------------------------------------------------------------------------
// Fetches an index page, collects the links it contains, downloads every linked
// page and writes the nodes selected by 'content_xpath' of each page to a file
// of the output directory.
// ---------------------------------------------------------------------------

// Scratch directory for downloaded/intermediate files, and directory receiving the extracted content.
def workDir = new File("/home/mdecorde/TEMP")
def outdir = new File("/home/mdecorde/TEMP/out")
// URL2XHTMLFILE writes its intermediate files straight into workDir, so it must exist up front.
workDir.mkdirs()

// Index page; hrefs found in it are resolved against 'base' (the URL up to its last '/').
def src = "http://textometrie.ens-lyon.fr/html/doc/manual/manual1.xhtml";
def base = src.substring(0, src.lastIndexOf("/"));

def url_xpath = """//*/a[@href]/@href"""; // XPath selecting the href value of the links
def url_regexp = /.+/; // an href is followed only if it fully matches this pattern
def duplicates = false; // 'true' not implemented
def filename_regexp = /zer/; // not referenced anywhere below
def content_xpath = "//*/div[@id=\"content\"]"; // XPath selecting the nodes copied to the output files

// Download the index page and convert it to parsable XHTML.
def xhtmlFile = new File(workDir, "src.xhtml")
URL2XHTMLFILE(src, xhtmlFile, workDir);

println "finding urls of $xhtmlFile"
XPathResult xpathResult = new XPathResult(xhtmlFile);
// A LinkedHashSet keeps the discovery order and drops repeated hrefs.
def urls = new LinkedHashSet<String>();
for (String url : xpathResult.getXpathResponses(url_xpath)) {
	// Match the candidate href itself; matching the 'urls' set would compare
	// the set's string form against the pattern instead.
	if (url ==~ url_regexp) urls << url
}
println urls

// NOTE(review): hrefs are used verbatim both as a path relative to 'base' and as a
// local file name; absolute URLs or hrefs carrying a '#fragment' are not handled.
for (String url : urls) {
	def subxHtmlFile = new File(workDir, url)
	// An href may contain sub-directories: make sure the target directory exists.
	subxHtmlFile.parentFile.mkdirs()
	URL2XHTMLFILE(base+"/"+url, subxHtmlFile, workDir);

	// Query the page that was just downloaded, not the index page.
	def subResult = new XPathResult(subxHtmlFile)

	def txtFile = new File(outdir, url+".xhtml")
	txtFile.parentFile.mkdirs() // also creates 'outdir' on first use
	// withWriter closes the writer even if writing a node fails.
	txtFile.withWriter("UTF-8") { writer ->
		for (def node : subResult.getNodes(content_xpath)) {
			writer.println(node)
		}
	}
}
42 |
|
43 |
/**
 * Downloads the page at {@code url}, converts it to XHTML with JTidy, removes the
 * non-breaking-space entity, the XHTML namespace declaration and the DOCTYPE, and
 * writes the result to {@code outfile} as UTF-8.
 *
 * The intermediate files "base.html" and "base.xhtml" are (over)written in
 * {@code workDir} on every call.
 *
 * @param url address of the page to fetch
 * @param outfile file receiving the cleaned XHTML
 * @param workDir existing directory used for the intermediate files
 */
def URL2XHTMLFILE(String url, File outfile, File workDir) {
	println "getting src: $url"
	def htmlFile = new File(workDir, "base.html")
	// The with* methods close both streams, even when the transfer fails.
	new URL(url).withInputStream { input ->
		htmlFile.withOutputStream { out -> out << input }
	}

	println "xHTMLize HTML $htmlFile"
	//TODO replace JTidy with Jsoup
	// The instance must be declared here: every call below uses it.
	def tidy = new Tidy(); // obtain a new Tidy instance
	tidy.setXHTML(true); // set desired config options using tidy setters
	tidy.setInputEncoding("UTF-8")
	tidy.setOutputEncoding("UTF-8")
	tidy.setShowErrors(0)
	tidy.setTabsize(10)
	tidy.setShowWarnings(false)
	File xhtmlFile = new File(workDir, "base.xhtml")
	xhtmlFile.withWriter("UTF-8") { out2 ->
		htmlFile.withInputStream { input ->
			tidy.parse(input, out2); // run tidy, providing an input stream and an output writer
		}
	}

	println "remove entities and doctype"
	String text = xhtmlFile.getText("UTF-8")
	// NOTE(review): the literal previously replaced here was rendered as plain
	// whitespace; given the message above, the &nbsp; entity is assumed to be the
	// intended target (the raw U+00A0 character is normalised as well) - confirm.
	text = text.replace("&nbsp;", " ").replace("\u00A0", " ")
	text = text.replaceAll("xmlns=\"http://www.w3.org/1999/xhtml\"", " ")
	// Drop any DOCTYPE declaration, whatever its variant or internal line breaks,
	// instead of relying on one exact multi-line literal.
	text = text.replaceAll(/<!DOCTYPE[^>]*>/, "")
	outfile.withWriter("UTF-8") { writer ->
		writer.write(text);
	}
}
75 |
// Final status message: this is the last top-level statement of the script.
println "done"
|