Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / GetPagesFromURL.groovy @ 187

History | View | Annotate | Download (2.3 kB)

1
package org.txm.importer
2

    
3
import org.w3c.tidy.Tidy
4

    
5
// ---- Script settings ----

// Scratch directory for downloaded/converted pages, and directory receiving the extracted output.
File workDir = new File('/home/mdecorde/TEMP')
File outdir = new File('/home/mdecorde/TEMP/out')

// Index page to start from; base is src with everything from its last "/" onwards removed.
String src = 'http://txm.sourceforge.net/doc/manual/manual1.xhtml'
String base = src.substring(0, src.lastIndexOf('/'))

// XPath selecting the href attribute of every <a> element that has one.
String url_xpath = '//*/a[@href]/@href'
// Regular expression used to filter the links found with url_xpath.
String url_regexp = '.+'
// 'true' not implemented
boolean duplicates = false
// Not read anywhere in this script.
String filename_regexp = 'zer'
// XPath selecting the nodes that are written to the output files.
String content_xpath = '//*/div[@id="content"]'
16

    
17
// Download the index page and convert it to XHTML so that it can be queried with XPath.
def xhtmlFile = new File(workDir, "src.xhtml")
URL2XHTMLFILE(src, xhtmlFile, workDir)

println "finding urls of $xhtmlFile"
XPathResult xpathResult = new XPathResult(xhtmlFile)
// A LinkedHashSet drops repeated links while keeping the order in which they were found.
def urls = new LinkedHashSet<String>()

for (String url : xpathResult.getXpathResponses(url_xpath)) {
        // Match the candidate link itself: matching the set would test its toString()
        // (e.g. "[]") against the pattern instead of the link.
        if (url ==~ url_regexp) {
                urls << url
        }
}
println urls
27

    
28
// Download every collected page and save the nodes selected by content_xpath
// to <outdir>/<url>.xhtml.
for (String url : urls) {
        // NOTE: url is used verbatim, both appended to base and as a path below workDir/outdir.
        def subxHtmlFile = new File(workDir, url)
        // url may contain sub-directories: make sure the target directory exists
        subxHtmlFile.parentFile?.mkdirs()
        URL2XHTMLFILE(base + "/" + url, subxHtmlFile, workDir)

        // Query the page that was just downloaded; reusing the index page's XPathResult
        // would write the index content into every output file.
        def pageResult = new XPathResult(subxHtmlFile)

        def txtFile = new File(outdir, url + ".xhtml")
        txtFile.parentFile?.mkdirs()
        // withWriter closes the writer even if extracting or printing a node fails
        txtFile.withWriter("UTF-8") { writer ->
                for (def node : pageResult.getNodes(content_xpath)) {
                        writer.println(node)
                }
        }
}
42

    
43
/**
 * Downloads the page at {@code url}, converts it to XHTML with JTidy and writes
 * the cleaned result to {@code outfile}.
 *
 * The intermediate files "base.html" (raw download) and "base.xhtml" (Tidy output)
 * are rewritten in {@code workDir} on every call. In the Tidy output, "&nbsp;"
 * and the XHTML namespace declaration are each replaced by a space, and the
 * XHTML 1.0 Strict DOCTYPE declaration is removed.
 *
 * @param url address of the page to download
 * @param outfile file receiving the cleaned XHTML, written as UTF-8
 * @param workDir directory holding the intermediate files
 */
def URL2XHTMLFILE(String url, File outfile, File workDir) {
        println "getting src: $url"
        workDir.mkdirs()
        def htmlFile = new File(workDir, "base.html")
        // with*Stream closes both the remote and the local stream, even when the copy fails
        new URL(url).withInputStream { remote ->
                htmlFile.withOutputStream { local ->
                        local << remote
                }
        }

        println "xHTMLize HTML $htmlFile"
        Tidy tidy = new Tidy() // obtain a new Tidy instance
        tidy.setXHTML(true) // set desired config options using tidy setters
        tidy.setInputEncoding("UTF-8")
        tidy.setOutputEncoding("UTF-8")
        tidy.setShowErrors(0)
        tidy.setTabsize(10)
        tidy.setShowWarnings(false)
        File xhtmlFile = new File(workDir, "base.xhtml")
        htmlFile.withInputStream { html ->
                xhtmlFile.withWriter("UTF-8") { xhtml ->
                        tidy.parse(html, xhtml) // run tidy, providing an input stream and an output writer
                }
        }

        println "remove entities and doctype"
        // replace() matches literally; replaceAll() would read "." in the markup as a regex wildcard
        String text = xhtmlFile.getText("UTF-8")
        text = text.replace("&nbsp;", " ")
        text = text.replace("xmlns=\"http://www.w3.org/1999/xhtml\"", " ")
        text = text.replace('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">', "")

        // outfile may live in a directory that does not exist yet
        outfile.parentFile?.mkdirs()
        outfile.withWriter("UTF-8") { writer ->
                writer.write(text)
        }
}
74
// Signal the end of the script.
println('done')