root / tmp / org.txm.core / src / java / org / txm / scripts / importer / HTML2XHTML.groovy @ 2473
History | View | Annotate | Download (1.2 kB)
package org.txm.scripts.importer

import org.txm.utils.CharsetDetector;
import org.txm.importer.ValidateXml;

/*
 * Converts one HTML file into an XML-parsable (XHTML-like) file using plain
 * regular-expression rewriting, then checks the result with ValidateXml.
 *
 * Steps: detect the input encoding, lower-case element and attribute names,
 * quote unquoted attribute values, resolve the &nbsp; entity, self-close the
 * HTML empty elements, drop the DOCTYPE declaration, write the result using
 * the detected encoding and validate it.
 *
 * Known limits (unchanged from the previous revision): tags are located with
 * a regex that stops at the first '>', so a '>' inside an attribute value is
 * not supported; value-less attributes (e.g. "checked") and named entities
 * other than &nbsp; are left as they are.
 */

/**
 * Lower-cases attribute names and puts double quotes around unquoted values.
 *
 * Quoted values are consumed as a whole by the pattern, so their content is
 * neither rescanned for "name=value" pairs nor case-folded.
 *
 * @param attributes the text found between the element name and the closing '>'
 * @return the same text with normalized attribute names and quoted values
 */
String normalizeAttributes(String attributes) {
	return attributes.replaceAll(/([^\s"'=<>\/]+)(\s*=\s*)("[^"]*"|'[^']*'|[^\s"'>]+)/) { full, name, sep, value ->
		boolean quoted = value.startsWith('"') || value.startsWith("'")
		name.toLowerCase(Locale.ROOT) + sep + (quoted ? value : '"' + value + '"')
	}
}

/**
 * Rewrites HTML markup so that it can be read by an XML parser.
 *
 * @param html the HTML document content
 * @return the rewritten content
 */
String toXhtml(String html) {
	// Lower-case element names and normalize attributes. Declarations and
	// comments ("<!...") are skipped, as before. Only names are case-folded:
	// the earlier brute-force version lower-cased the whole tag, which also
	// altered attribute values (URLs, alt texts, ...). This single pass also
	// covers one-letter tags such as <P>, so no separate "<.>" pass is needed.
	String text = html.replaceAll(/(<\/?)([^!\s\/>][^\s\/>]*)([^>]*)>/) { full, open, name, rest ->
		open + name.toLowerCase(Locale.ROOT) + normalizeAttributes(rest) + '>'
	}

	// Resolve entities: &nbsp; is not declared in XML without a DTD.
	// NOTE(review): the replacement is a plain space as in the visible source;
	// use "&#160;" instead if the non-breaking semantics must be kept.
	text = text.replaceAll(/&nbsp;/, " ")

	// Self-close the HTML 4 empty elements. The optional "/" in the pattern
	// keeps already closed tags from being closed twice, and \b prevents
	// prefix matches such as <colgroup> for "col".
	text = text.replaceAll(/<(area|base|basefont|br|col|frame|hr|img|input|isindex|link|meta|param)\b([^>]*?)\s*\/?>/, '<$1$2/>')

	// Remove the DOCTYPE declaration, whatever HTML version it names.
	text = text.replaceAll(/(?i)<!DOCTYPE[^>]*>/, '')

	return text
}

File infile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/odt.html")
File outfile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/odt.xml")

// Read the input with its detected encoding; the same encoding is used for output.
String encoding = new CharsetDetector(infile).getEncoding();
println "Encoding: $encoding"

String text = toXhtml(infile.getText(encoding))

// Write & validate: the output path is printed when validation fails.
outfile.withWriter(encoding) { writer -> writer.write(text) }
if (!ValidateXml.test(outfile)) {
	println "FILE: $outfile"
}