Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / misc / EuroPress2XMLMacro.groovy @ 479

History | View | Annotate | Download (9.1 kB)

1 321 mdecorde
package org.txm.macro.misc
2 321 mdecorde
// STANDARD DECLARATIONS
3 321 mdecorde
4 321 mdecorde
import groovy.xml.QName
5 321 mdecorde
import java.text.DecimalFormat
6 479 mdecorde
import org.txm.utils.xml.DomUtils;
7 479 mdecorde
import org.txm.importer.ValidateXml;
8 479 mdecorde
import org.w3c.tidy.Tidy
9 479 mdecorde
import groovy.util.XmlParser
10 321 mdecorde
import org.kohsuke.args4j.*
11 479 mdecorde
import groovy.transform.Field
12 479 mdecorde
import org.txm.rcpapplication.swt.widget.parameters.*
13 321 mdecorde
14 321 mdecorde
// BEGINNING OF PARAMETERS
// Each @Field/@Option pair declares one macro parameter shown in the parameters dialog.
@Field @Option(name="rootDir", usage="The directory contains the 'orig' directory which contains the html files", widget="Folder", required=false, def="/path")
File rootDir = new File("")

@Field @Option(name="encoding", usage="HTML encoding", widget="String", required=false, def="iso-8859-1")
String encoding = "iso-8859-1" // HTML files encoding

@Field @Option(name="debug", usage="show debug messages. Values = true, false", widget="String", required=false, def="false")
def debug = "true" // set true to debug the script

// Open the parameters input dialog box; stop the macro when it returns false
if (!ParametersDialog.open(this)) return

// From here on 'debug' is a boolean instead of the "true"/"false" string
debug = ("true" == debug)
// END OF PARAMETERS

// Working directories, all located under rootDir
String corpusName = rootDir.getName()
File srcDir = new File(rootDir, "orig")
if (!srcDir.exists()) {
    println "STOP, srcDir does not exists $srcDir"
    return
}
File outDir = new File(rootDir, "xhtml")
File outDir2 = new File(rootDir, corpusName)
File rejected = new File(rootDir, "duplicates")
File tmpDir = new File(rootDir, "tmp")

// Start from empty output directories
[outDir, outDir2, rejected, tmpDir].each { File dir ->
    dir.deleteDir()
    dir.mkdir()
}

// Collections kept for the (currently commented-out) tag/style/class inventory
def allTags = new HashSet<String>()
def allStyles = new HashSet<String>()
def allClasses = new HashSet<String>()

def newPrefix = "Numéro de document : " // label preceding the document number
int itext = 1 // index used to name the next output file
def formater = new DecimalFormat("0000")
int LIMITDIFF = 10 // character-count difference above which the count is added to the signature
def metadatas = ["DocPublicationName", "DocHeader"] // span classes copied to attributes

// Source files, sorted
def files = []
srcDir.eachFileMatch(~/.*\.html/) { files << it }
files = files.sort()

// Duplicate-detection bookkeeping
def done = new HashSet<String>() // signatures already written
def ignored = []                 // signatures of rejected texts
def allTitles = [:]              // title -> character count of its first occurrence
def dones = [:]                  // signature -> file that was kept
def ignoreds = []                // names of the rejected files, parallel to 'ignored'
69 321 mdecorde
70 321 mdecorde
/**
 * Recursively gathers the textual content of a parsed node.
 *
 * A String is used as is; for anything else the texts of all its children()
 * are gathered, each one preceded by a space. Line breaks are replaced by
 * spaces and the result is trimmed, then returned with one leading space.
 *
 * @param node a String or an object exposing children()
 * @return the flattened text, always starting with a single space
 */
def getText(def node) {
    StringBuilder buffer = new StringBuilder(" ")
    if (node instanceof String) {
        buffer.append(" ").append(node)
    } else {
        node.children().each { kid ->
            buffer.append(" ").append(getText(kid))
        }
    }
    String flattened = buffer.toString().replace("\n", " ").trim()
    return " " + flattened
}
82 321 mdecorde
83 321 mdecorde
// Main conversion loop. For every HTML file:
//  1. write a UTF-8 copy in tmpDir, replacing empty paragraphs by a space and adding a space after <br>,
//  2. convert that copy to XHTML with Tidy,
//  3. if the XHTML validates, turn every <td> found under body/table into a <text> element
//     written to its own XML file. A text whose signature was already seen goes to the
//     'rejected' directory instead of the corpus directory.
println "Nb of HTML files: "+files.size()
for (File htmlFile : files) {
    println "Processing file $htmlFile"

    File tmpHTML = new File(tmpDir, htmlFile.getName())
    tmpHTML.withWriter("UTF-8") { writer ->
        String txt = htmlFile.getText(encoding)
        txt = txt.replaceAll("<p></p>", " ")
        txt = txt.replaceAll("<p> </p>", " ")
        txt = txt.replaceAll("<br>", "<br> ")
        writer.write(txt)
    }

    // file name without its extension
    String name = htmlFile.getName()
    name = name.substring(0, name.lastIndexOf("."))

    File xhtmlFile = new File(outDir, name+".xhtml")

    Tidy tidy = new Tidy() // obtain a new Tidy instance
    tidy.setXHTML(true) // set desired config options using tidy setters
    tidy.setInputEncoding("UTF-8")
    tidy.setOutputEncoding("UTF-8")
    tidy.setShowErrors(0)
    tidy.setShowWarnings(false)
    xhtmlFile.withWriter("UTF-8") { out ->
        // withInputStream closes the stream once Tidy is done
        tmpHTML.withInputStream { input ->
            tidy.parse(input, out) // run tidy, providing an input and output stream
        }
    }

    if (ValidateXml.test(xhtmlFile)) {
        def root = new XmlParser().parse(xhtmlFile)
        def tables = root.body.table.tbody.tr.td
        if (tables.size() == 0) tables = root.body.table.tr.td

        for (def text : tables) {
            String sign = ""

            // parentheses are required: '!' binds tighter than 'instanceof'
            if (!(text instanceof groovy.util.Node)) { println "NOT NODE: "+text; continue; }

            def textMetadatas = [:]
            for (String metadata : metadatas) {
                textMetadatas[metadata] = ""
            }

            // drop nested tables
            for (def subtable : text.table) text.remove(subtable)

            // NOTE(review): the tables were just removed, so this loop is not expected
            // to run; kept unchanged because its intent is unclear - confirm or delete
            for (def p : text.table.p) p.addChild(" ")

            // spans with the red+bold style are replaced by <w expFound="y"> elements
            for (def child : text.span) {
                if ("color:red; font-weight:bold".equals(child.@style)) {
                    if (debug) println "Found bold: "+child
                    child.replaceNode { node -> w(expFound: "y", child.text())}
                }
            }

            def startIgnoringText = false // set once the first <b> child is met
            def ichar = 0                 // number of characters kept
            String title = ""
            def ignoredText = ""          // text of everything removed after the first <b>
            def children = text.children()
            // index loop: children are removed while iterating, hence the idx-- after each removal
            for (int idx = 0 ; idx < children.size() ; idx++) {

                def child = children[idx]
                if (debug) println "child: $child"

                if (startIgnoringText) {
                    if (debug) println "Ignoring text : "+ignoredText
                    if (child instanceof String) ignoredText += child
                    else ignoredText += child.text()

                    text.children().remove(idx)
                    idx--
                    continue // next child
                }

                if (child instanceof String) {
                    ichar += child.length()
                } else {
                    ichar += child.text().length()
                    def nn = child.name()

                    // the name is a QName when namespaced; keep it as is when getLocalPart() fails
                    try {nn = nn.getLocalPart()} catch(Exception e) {}
                    switch (nn) {
                        case "br": break
                        case "span":
                            if (debug) println "Found span $child"
                            String classV = child.@class
                            String spanStyle = child.@style
                            if (classV != null) {
                                if (metadatas.contains(classV)) {
                                    // metadata spans become attribute values and leave the text
                                    textMetadatas[classV] = (textMetadatas[classV] + " " + child.text()).trim()
                                    text.remove(child)
                                    idx--
                                } else if ("TitreArticleVisu" == classV) {
                                    // title spans are kept and renamed to <head>
                                    title += getText(child)
                                    child.name = new QName(child.name().getNamespaceURI(), "head", child.name().getPrefix())
                                } else {
                                    println "UNKNOWED CLASS: "+classV + " = "+child.text().trim().length();
                                }
                            } else if (spanStyle != null) {
                                if ("color:red; font-weight:bold".equals(spanStyle)) {
                                    // nothing to do
                                } else {
                                    println "UNKNOWED STYLE: '"+spanStyle+"' = "+child.text().trim();
                                }
                            } else {
                                println "UNKNOWED SPAN: "+child.text().trim();
                            }
                            break

                        case "a": break
                        case "w": break
                        case "b":
                            // the first <b> starts the part of the cell that is removed
                            // from the text and accumulated in ignoredText
                            startIgnoringText = true
                            text.remove(child)
                            idx--
                            break
                        case "i": break
                        case "font":
                            if (debug) println "Found font $child"
                            String fontStyle = child.@style
                            if ("font-style:italic;" == fontStyle) {
                                if (debug) println "ITALIC: "+getText(child).trim();
                                child.replaceNode { node -> i(getText(child))}
                            } else if ("font-weight:bold;" == fontStyle) {
                                if (debug) println "BOLD: "+getText(child).trim();
                                child.replaceNode { node -> b(getText(child))}
                            } else {
                                println "FSTYLE: '"+fontStyle+"' = "+getText(child).trim();
                            }
                            break
                        default: println child.name()
                    }
                }
            }

            // rename td to text
            text.name = new QName(text.name().getNamespaceURI(), "text", text.name().getPrefix())

            // Write metadatas
            for (String metadata : metadatas) {
                text.attributes().put(metadata, textMetadatas[metadata])
            }

            // get document number: everything following newPrefix in the ignored text
            ignoredText = ignoredText.replaceAll("\n", " ")
            int iNo = ignoredText.indexOf(newPrefix)
            if (iNo >= 0) {
                String no = ignoredText.substring(iNo+newPrefix.length()).trim()
                text.attributes().put("idnews", no)
                // characters 5 to 12 of the number are read as yyyymmdd
                if (no.length() >= 13) {
                    text.attributes().put("date", no.substring(5,9)+"-"+no.substring(9,11)+"-"+no.substring(11,13))
                    text.attributes().put("yyyymmdd", no.substring(5,13))
                    text.attributes().put("yyyymm", no.substring(5,11))
                    text.attributes().put("yyyy", no.substring(5,9))
                    text.attributes().put("mm", no.substring(9,11))
                    text.attributes().put("dd", no.substring(11,13))
                } else {
                    println "WARNING: document number too short to read a date: '"+no+"' in "+htmlFile
                }
            }

            // Signature used to detect duplicates: the title, plus the character count
            // when it differs by more than LIMITDIFF from the first text with this title
            sign += " "+title

            if (allTitles[title] == null) allTitles[title] = ichar
            if (Math.abs(allTitles[title] - ichar) > LIMITDIFF) {
                sign += " "+ichar
            }

            File xmlFile
            if (done.contains(sign)) {
                ignored << sign
                xmlFile = new File(rejected, name+"_"+formater.format((itext++))+".xml")
                ignoreds << xmlFile.getName()
            } else {
                done << sign
                xmlFile = new File(outDir2, name+"_"+formater.format((itext++))+".xml")
                dones[sign] = xmlFile
            }

            // withPrintWriter flushes and closes the file even if printing fails
            xmlFile.withPrintWriter("UTF-8") { xmlWriter ->
                xmlWriter.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
                new XmlNodePrinter(xmlWriter).print(text)
            }
        }
    }
}
278 321 mdecorde
279 321 mdecorde
// Report the rejected duplicates in 'ignored.txt': for each one, the file that
// was kept, the file that was rejected and their common signature.
if (!ignored.isEmpty()) {
    File ignoredFile = new File(rejected, "ignored.txt")
    ignoredFile.withWriter("UTF-8") { writer ->
        writer.println "TOTAL: "+ignored.size()
        // 'ignored' and 'ignoreds' are parallel lists, hence the shared index
        ignored.eachWithIndex { sign, i ->
            writer.println "\n**DUPLICATE\n "
            writer.println "keeped="+dones[sign]
            writer.println "rejected="+ignoreds[i]
            writer.println "SIGN="+sign
            writer.println "\n"
        }
    }
    println "TOTAL IGNORED: "+ignored.size()
}

// itext starts at 1 and is incremented after each written file,
// so the number of texts written is itext - 1
println "TOTAL TEXT: ${itext - 1}"

// Intermediate directories are only kept when debugging
if (!debug) {
    outDir.deleteDir()
    tmpDir.deleteDir()
}