Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / HTML2XHTML.groovy @ 2473

History | View | Annotate | Download (1.2 kB)

1 1000 mdecorde
package org.txm.scripts.importer
2 881 mdecorde
3 881 mdecorde
import org.txm.utils.CharsetDetector;
4 881 mdecorde
import org.txm.importer.ValidateXml;
5 881 mdecorde
6 881 mdecorde
// Input HTML document and the XHTML file to generate (hard-coded local paths).
File infile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/odt.html")
File outfile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/odt.xml")

// Ask CharsetDetector for the encoding of the input file, report it, then
// load the whole document into memory using that encoding.
CharsetDetector detector = new CharsetDetector(infile)
String encoding = detector.getEncoding()
println "Encoding: $encoding"
String text = infile.getText(encoding)
12 881 mdecorde
13 881 mdecorde
// Normalise every tag except "<!...>" constructs (doctype, comments):
//  - element and attribute names are lower-cased,
//  - unquoted attribute values (TRUC=sdf234) are wrapped in double quotes,
//  - attribute values themselves keep their original case, so URLs, ids and
//    alternative texts are not altered.
text = text.replaceAll(/(<[^!][^>]*>)/) { full, tag ->
	// One attribute value: '=' followed by a double-quoted, single-quoted or
	// unquoted value. An unquoted value stops before whitespace, "/>" or ">".
	def attrValue = tag =~ /=(\s*)("[^"]*"|'[^']*'|[^\s"'>]+?(?=\s|\/?>))/
	StringBuilder normalized = new StringBuilder()
	int last = 0
	while (attrValue.find()) {
		// Everything between two values (tag name, attribute names, spaces)
		// is markup; Locale.ROOT keeps the lower-casing locale-independent.
		normalized.append(tag.substring(last, attrValue.start()).toLowerCase(Locale.ROOT))
		String value = attrValue.group(2)
		boolean quoted = value.startsWith('"') || value.startsWith("'")
		normalized.append('=').append(attrValue.group(1))
		normalized.append(quoted ? value : '"' + value + '"')
		last = attrValue.end()
	}
	normalized.append(tag.substring(last).toLowerCase(Locale.ROOT))
	normalized.toString()
}
20 881 mdecorde
21 881 mdecorde
// Lower-case tags made of a single character between the angle brackets,
// such as <B> or <I>.
text = text.replaceAll(/(<.>)/) { whole, tag -> tag.toLowerCase() }
26 881 mdecorde
27 881 mdecorde
// Replace every &nbsp; entity with a plain space character.
text = text.replace('&nbsp;', ' ')
29 881 mdecorde
30 881 mdecorde
// Self-close the elements that HTML 4 declares EMPTY so the result is
// well-formed XML. Compared with closing only <br>, <meta> and <img>:
//  - tags that are already self-closed ("<meta ... />") are left with a
//    single slash instead of becoming "<meta ...//>",
//  - the look-ahead after the name prevents matching longer element names
//    that merely start with one of these (e.g. <colgroup>, <metadata>),
//  - <br> with attributes or written "<br />" is handled too.
// Only lower-case tag names are matched, as before.
text = text.replaceAll(
	/<(area|base|basefont|br|col|frame|hr|img|input|isindex|link|meta|param)(?=[\s\/>])([^>]*?)\s*\/?>/,
	'<$1$2/>')
34 881 mdecorde
35 881 mdecorde
// Remove the doctype declaration, whatever HTML version or DTD URL it names
// (previously only the exact HTML 4.0 Transitional declaration was removed).
// NOTE(review): a doctype with an internal subset containing '>' would only
// be removed up to that character; not expected in HTML input.
text = text.replaceAll(/(?i)<!DOCTYPE[^>]*>/, '')
37 881 mdecorde
38 881 mdecorde
// Write the converted document to outfile using the same encoding as the input.
outfile.withWriter(encoding) { sink -> sink.write(text) }

// Run ValidateXml on the result and print the file when the check fails.
def valid = ValidateXml.test(outfile)
if (!valid) {
	println "FILE: $outfile"
}