Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / HTML2XHTML.groovy @ 1000

History | View | Annotate | Download (1.2 kB)

1
package org.txm.scripts.importer
2

    
3
import org.txm.utils.CharsetDetector;
4
import org.txm.importer.ValidateXml;
5

    
6
// Source HTML document and the XHTML file to produce.
// NOTE(review): both paths are hard-coded, developer-local locations.
File infile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/odt.html")
File outfile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/odt.xml")

// Ask CharsetDetector for the encoding of the input, report it, then load the
// whole document into memory decoded with that encoding.
String encoding = new CharsetDetector(infile).getEncoding()
println "Encoding: $encoding"
String text = infile.getText(encoding)
12

    
13
// Normalise every tag except those starting with "<!" (declarations, comments):
//  - bare attribute values get double quotes (WIDTH=100 -> width="100"), as XML requires;
//  - element and attribute names are lower-cased, as XHTML requires;
//  - quoted attribute values are left untouched: URLs, ids and texts are case-sensitive.
text = text.replaceAll(/(<[^!][^>]*>)/,
        { full, tag ->
                // Walk the attributes one by one. Values that are already quoted are matched
                // too, so their content is skipped instead of being mistaken for an attribute.
                tag = tag.replaceAll(/(\s[^\s=<>"']+\s*=\s*)("[^"]*"|'[^']*'|[^\s"'>][^\s">]*)/,
                        { attribute, name, value ->
                                boolean quoted = value.startsWith('"') || value.startsWith("'")
                                quoted ? attribute : name + '"' + value + '"'
                        } )
                // Lower-case everything that lies outside quotes. Locale.ROOT keeps the result
                // independent of the platform locale (e.g. the Turkish dotless i).
                tag.replaceAll(/("[^"]*"|'[^']*'|[^"']+)/,
                        { chunk, piece ->
                                boolean quoted = piece.startsWith('"') || piece.startsWith("'")
                                quoted ? piece : piece.toLowerCase(Locale.ROOT)
                        } )
        } )
20

    
21
// Lower-case tags made of a single character between the angle brackets, e.g. <B> or <P>.
// NOTE(review): the general tag pass earlier in this script already matches these tags,
// so this step looks redundant - confirm before removing it.
text = text.replaceAll(/(<.>)/) { whole, tag -> tag.toLowerCase() }
26

    
27
// &nbsp; is an HTML entity that XML does not predefine: substitute a plain space for it.
text = text.replace('&nbsp;', ' ')
29

    
30
// Self-close the elements that HTML 4 declares empty (their end tag is forbidden), so the
// output is well-formed XML. Names are matched in lower case only. The lookahead stops a
// name from matching as a mere prefix (e.g. "col" in "colgroup"), and an existing trailing
// "/" is consumed so that tags which are already closed are not given a second slash.
text = text.replaceAll(
        /<(area|base|basefont|br|col|frame|hr|img|input|isindex|link|meta|param)(?=[\s\/>])([^>]*?)\s*\/?>/,
        '<$1$2/>')
34

    
35
// Remove the DOCTYPE declaration, whatever its identifiers or letter case.
// A DOCTYPE with an internal subset ("[ ... ]" containing '>') is not handled.
text = text.replaceAll(/(?i)<!DOCTYPE[^>]*>/, '')
37

    
38
// Write the converted document using the encoding detected on the input.
// NOTE(review): no XML declaration is written, so an XML parser will presumably assume
// UTF-8 when reading the file back - confirm this is fine for non UTF-8 inputs.
outfile.withWriter(encoding) { it.write(text) }

// Run the project's XML check on the result and print the file path when it fails.
def valid = ValidateXml.test(outfile)
if (!valid) {
        println "FILE: $outfile"
}