root / tmp / org.txm.core / src / java / org / txm / scripts / importer / HTML2XHTML.groovy @ 1688
History | View | Annotate | Download (1.2 kB)
1 |
package org.txm.scripts.importer
|
---|---|
2 |
|
3 |
import org.txm.utils.CharsetDetector; |
4 |
import org.txm.importer.ValidateXml; |
5 |
|
6 |
// Source HTML document (developer-local path).
File infile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/odt.html")

// Ask CharsetDetector which encoding the input uses, report it, then load the
// whole document into memory with that encoding.
CharsetDetector detector = new CharsetDetector(infile)
String encoding = detector.getEncoding()
println("Encoding: $encoding")
String text = infile.getText(encoding)

// Destination of the converted document; written at the end of the script.
File outfile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/odt.xml")
|
12 |
|
13 |
// Normalise every start and end tag for XHTML:
//  - element names and attribute names are lower-cased,
//  - bare attribute values (WIDTH=50) are wrapped in double quotes.
// Declarations and comments ("<!...") are not matched and stay untouched.
//
// The previous implementation lower-cased the complete tag text, which also
// rewrote attribute values (URLs, file names, ids), and it only quoted values
// whose attribute name was written in upper case. Attribute values are now
// copied through unchanged. Single-letter tags such as <P> are handled by the
// same expression, so no separate pass is needed for them.
text = text.replaceAll(/<(\/?)([A-Za-z][^\s\/>]*)([^>]*)>/) { full, slash, name, rest ->
    // An attribute is: leading whitespace, a name, and optionally "=" followed by a
    // double-quoted, single-quoted or bare value. The value group is null for
    // attributes written without a value.
    String attributes = rest.replaceAll(/(\s+)([^\s=\/>"']+)(?:\s*=\s*("[^"]*"|'[^']*'|[^\s"'>]+))?/) { attribute, gap, attrName, value ->
        // Locale.ROOT keeps the result independent of the JVM default locale.
        String lowered = gap + attrName.toLowerCase(Locale.ROOT)
        if (value == null) {
            return lowered
        }
        boolean quoted = value.startsWith('"') || value.startsWith("'")
        return lowered + '=' + (quoted ? value : '"' + value + '"')
    }
    '<' + slash + name.toLowerCase(Locale.ROOT) + attributes + '>'
}
26 |
|
27 |
// Resolve entities.
// XML predefines only amp, lt, gt, apos and quot; since this script removes the
// DOCTYPE, a remaining "&nbsp;" would be an undeclared entity reference.
// The previous pattern contained a literal whitespace character instead of the
// entity text, so the entity was never matched. The replacement (a space) is
// unchanged. String.replace is used because the search text is a literal.
// NOTE(review): other named HTML entities (e.g. &eacute;) are still not resolved.
text = text.replace('&nbsp;', ' ')
29 |
|
30 |
// Close the HTML 4 empty elements so they are well-formed XML: <br> -> <br/>.
// Tag names are expected to be lower-case here (they are lower-cased earlier in
// the script).
//  - the lookahead ensures the element name ends right there, so longer names
//    that merely start with one of these names (e.g. <colgroup>) are not matched;
//  - the reluctant attribute group together with the optional "/" means a tag
//    that is already self-closed is rewritten to a single "/>" instead of
//    gaining a second slash, which the previous meta/img rules produced;
//  - <br> carrying attributes and the other empty elements (hr, link, input,
//    ...) are now closed as well.
text = text.replaceAll(/<(area|base|basefont|br|col|frame|hr|img|input|isindex|link|meta|param)(?=[\s\/>])([^>]*?)\s*\/?>/, '<$1$2/>')
34 |
|
35 |
// Remove the DOCTYPE declaration.
// The previous version removed only the exact HTML 4.0 Transitional string, so
// any other variant (Strict, 4.01, with a system identifier, different case or
// spacing) was left in the output. The match is now case-insensitive and runs
// up to the first ">", which also covers declarations spanning several lines.
text = text.replaceAll(/(?i)<!DOCTYPE[^>]*>/, '')
37 |
|
38 |
// Write the converted markup to the output file, using the same encoding that
// was detected for the input.
outfile.withWriter(encoding) { it.write(text) }

// Run ValidateXml on the written file and print its path when the test does
// not return a true value.
def passed = ValidateXml.test(outfile)
if (!passed) println "FILE: $outfile"