Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / GetPagesFromURL.groovy @ 2473

History | View | Annotate | Download (2.4 kB)

1 1000 mdecorde
package org.txm.scripts.importer
2 881 mdecorde
3 927 mdecorde
import org.w3c.tidy.*
4 881 mdecorde
5 881 mdecorde
// ---------------------------------------------------------------------------
// Crawl configuration
// ---------------------------------------------------------------------------
// Scratch directory for downloaded pages, and directory receiving the extracted content.
def workDir = new File("/home/mdecorde/TEMP")
def outdir = new File("/home/mdecorde/TEMP/out")

// Index page whose links are followed; 'base' is that URL up to (excluding) its last "/".
def src = "http://textometrie.ens-lyon.fr/html/doc/manual/manual1.xhtml";
def base = src.substring(0, src.lastIndexOf("/"));

def url_xpath = """//*/a[@href]/@href"""; // href attribute of every link of the index page
def url_regexp = /.+/; // a link is followed only if it matches this pattern entirely
def duplicates = false; // 'true' not implemented
def filename_regexp = /zer/; // not read by the code below
def content_xpath = "//*/div[@id=\"content\"]"; // nodes copied into each output file

// ---------------------------------------------------------------------------
// Step 1: fetch the index page and collect the links to follow
// ---------------------------------------------------------------------------
def xhtmlFile = new File(workDir, "src.xhtml")
URL2XHTMLFILE(src, xhtmlFile, workDir);

println "finding urls of $xhtmlFile"
XPathResult xpathResult = new XPathResult(xhtmlFile);
def urls = new LinkedHashSet<String>(); // insertion-ordered, drops repeated links

for (String url : xpathResult.getXpathResponses(url_xpath)) {
        // Test the candidate link itself (the previous code matched the whole result set).
        if (url ==~ url_regexp) urls << url
}
println urls

// ---------------------------------------------------------------------------
// Step 2: fetch every linked page and write its content nodes to outdir
// ---------------------------------------------------------------------------
for (String url : urls) {
        //url = url.replaceAll()
        def subxHtmlFile = new File(workDir, url)
        // A link may contain sub-directories: make sure they exist before downloading.
        subxHtmlFile.parentFile?.mkdirs()
        URL2XHTMLFILE(base+"/"+url, subxHtmlFile, workDir);

        // Query the page that was just downloaded, not the index page.
        def pageXPath = new XPathResult(subxHtmlFile)

        def txtFile = new File(outdir, url+".xhtml")
        txtFile.parentFile?.mkdirs()
        // withWriter closes the file even when writing a node fails.
        txtFile.withWriter("UTF-8") { writer ->
                for (def node : pageXPath.getNodes(content_xpath)) {
                        writer.println(node)
                }
        }
}
42 881 mdecorde
43 881 mdecorde
/**
 * Downloads a page, converts it to XHTML with JTidy and writes a cleaned copy to a file.
 *
 * The raw download is stored as "base.html" and the JTidy output as "base.xhtml",
 * both inside workDir; these two files are overwritten on every call.
 * The cleaned copy has "&nbsp;" replaced by a space, and the XHTML namespace
 * declaration and the XHTML 1.0 Strict DOCTYPE removed.
 *
 * @param url address of the page to download
 * @param outfile file receiving the cleaned XHTML, written in UTF-8
 * @param workDir directory holding the intermediate files
 */
def URL2XHTMLFILE(String url, File outfile, File workDir) {
        println "getting src: $url"
        workDir.mkdirs()
        def htmlFile = new File(workDir, "base.html")
        // Both streams are closed by the with* helpers, even if the copy fails.
        new URL(url).withInputStream { input ->
                htmlFile.withOutputStream { output ->
                        output << input
                }
        }

        println "xHTMLize HTML $htmlFile"
        //TODO replace JTidy with Jsoup
        // 'tidy' was used without being declared (its instantiation was commented out).
        // NOTE(review): the class is loaded by name so this script still compiles when
        // JTidy is not on the classpath - presumably why the direct "new Tidy()" was
        // disabled; confirm, and switch back to "new Tidy()" if JTidy is a hard dependency.
        def tidy = getClass().getClassLoader().loadClass("org.w3c.tidy.Tidy").getDeclaredConstructor().newInstance()
        tidy.setXHTML(true); // set desired config options using tidy setters
        tidy.setInputEncoding("UTF-8")
        tidy.setOutputEncoding("UTF-8")
        tidy.setShowErrors(0)
        tidy.setTabsize(10)
        tidy.setShowWarnings(false)
        File xhtmlFile = new File(workDir, "base.xhtml")
        // Read the downloaded file through a stream that is closed once tidy is done.
        htmlFile.withInputStream { input ->
                xhtmlFile.withWriter("UTF-8") { out2 ->
                        tidy.parse(input, out2); // run tidy, providing an input and output stream
                }
        }

        println "remove entities and doctype"
        String text = xhtmlFile.getText("UTF-8")
        text = text.replaceAll("&nbsp;", " ")
        text = text.replaceAll("xmlns=\"http://www.w3.org/1999/xhtml\"", " ")
        text = text.replaceAll("""<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">""", "")
        // The target may live in a sub-directory that does not exist yet.
        outfile.parentFile?.mkdirs()
        outfile.withWriter("UTF-8") { writer ->
                writer.write(text);
        }
}
75 881 mdecorde
// Last statement of the script: report that the run reached its end.
println("done")