Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / misc / EuroPress2XMLMacro.groovy @ 1000

History | View | Annotate | Download (9.3 kB)

1
package org.txm.macro.misc
2
// STANDARD DECLARATIONS
3

    
4
import groovy.xml.QName
5

    
6
import java.nio.charset.Charset
7
import java.text.DecimalFormat
8
import org.txm.utils.xml.DomUtils;
9
import org.txm.importer.ValidateXml;
10
import org.w3c.tidy.Tidy
11
import groovy.util.XmlParser
12
import org.kohsuke.args4j.*
13
import groovy.transform.Field
14
import org.txm.rcp.swt.widget.parameters.*
15

    
16
// BEGINNING OF PARAMETERS
// Root folder of the corpus: it must contain an 'orig' sub-folder holding the HTML exports.
@Field @Option(name="rootDir", usage="The directory contains the 'orig' directory which contains the html files", widget="Folder", required=false, def="/path")
File rootDir = new File("")

// Character encoding used to read the HTML source files.
@Field @Option(name="encoding", usage="HTML encoding", widget="String", required=false, def="iso-8859-1")
String encoding = "iso-8859-1"

// Debug switch, entered as the text "true" or "false" and converted to a boolean below.
@Field @Option(name="debug", usage="show debug messages. Values = true, false", widget="String", required=false, def="false")
def debug = "true"

// Show the parameters dialog box; leave the macro when it does not return true.
def dialogConfirmed = ParametersDialog.open(this)
if (!dialogConfirmed) return

// From here on 'debug' is a boolean: true only for the exact text "true".
debug = "true".equals(debug)
// END OF PARAMETERS
31

    
32
// Working directories, all located under rootDir:
//   orig        : HTML source files (input, must already exist)
//   xhtml       : XHTML version of each source file
//   <rootDir's own name> : one XML file per kept text
//   duplicates  : XML files of the texts detected as duplicates
//   tmp         : pre-processed copies of the HTML files
String corpusName = rootDir.getName()
File srcDir = new File(rootDir, "orig")
File outDir = new File(rootDir, "xhtml")
File outDir2 = new File(rootDir, corpusName)
File rejected = new File(rootDir, "duplicates")
File tmpDir = new File(rootDir, "tmp")

// Nothing can be done without the source directory.
if (!srcDir.exists()) {
        println "STOP, srcDir does not exists $srcDir"
        return
}

// Start from empty output directories: wipe then recreate each of them.
for (File workDir : [outDir, outDir2, rejected, tmpDir]) {
        workDir.deleteDir()
        workDir.mkdir()
}
53

    
54
// Sets only referenced by commented-out statistics code in the conversion loop.
def allTags = new HashSet<String>()
def allStyles = new HashSet<String>()
def allClasses = new HashSet<String>()

// Label searched in the trailing part of each text to locate the document number.
def newPrefix = "Numéro de document : "
// Running number used in the output file names, formatted on 4 digits.
int itext = 1
def formater = new DecimalFormat("0000")
// Two texts with the same title are told apart only when their lengths differ by more than this.
int LIMITDIFF = 10
// Values of the 'class' attribute of <span> whose content is stored as text metadata.
def metadatas = ["DocPublicationName", "DocHeader"]

// HTML files of the source directory, in sorted order.
def files = []
srcDir.eachFileMatch(~/.*\.html/) { files.add(it) }
files = files.sort()

// Duplicate detection bookkeeping.
def done = new HashSet<String>()  // signatures already written
def ignored = []                  // signatures of the rejected texts
def allTitles = [:]               // title -> length recorded for the first text with that title
def dones = [:]                   // signature -> XML file that was kept
def ignoreds = []                 // names of the rejected XML files, parallel to 'ignored'
71

    
72
/**
 * Flattens a parsed node to a single line of text.
 *
 * A String is taken as is; for anything else the texts of all children are
 * collected recursively, separated by spaces. Line breaks are replaced by
 * spaces and the result is trimmed, then prefixed with one space.
 *
 * @param node a String or an object exposing children()
 * @return the flattened text, always starting with a single space
 */
def getText(def node) {
        StringBuilder collected = new StringBuilder(" ")
        if (node instanceof String) {
                collected.append(" ").append(node)
        } else {
                node.children().each { kid ->
                        collected.append(" ").append(getText(kid))
                }
        }
        String singleLine = collected.toString().replace("\n", " ")
        return " " + singleLine.trim()
}
84

    
85
println "Nb of HTML files: "+files.size()

// Renames a parsed node. XmlParser names a node with a QName when the element is
// in a namespace and with a plain String otherwise: both cases are supported.
def renameNode = { node, String newName ->
        def oldName = node.name()
        if (oldName instanceof QName) {
                node.name = new QName(oldName.getNamespaceURI(), newName, oldName.getPrefix())
        } else {
                node.name = newName
        }
}

// Conversion of each HTML file:
//  1. copy the file to tmpDir as UTF-8 with a few textual fixes,
//  2. convert the copy to XHTML with JSoup,
//  3. parse the XHTML; each <td> of the top level table is one text,
//  4. clean the text, extract its metadata and write it as an XML file, either in
//     outDir2 or, when its signature has already been seen, in the 'rejected' directory.
for (File htmlFile : files) {
        println "Processing file $htmlFile"

        // 1. UTF-8 copy: empty paragraphs become a space and <br> is followed by a space
        //    so that words are not glued together when the markup is dropped.
        File tmpHTML = new File(tmpDir, htmlFile.getName())
        tmpHTML.withWriter("UTF-8") { writer ->
                String txt = htmlFile.getText(encoding)
                txt = txt.replace("<p></p>", " ")
                txt = txt.replace("<p> </p>", " ")
                txt = txt.replace("<br>", "<br> ")
                writer.write(txt)
        }

        String name = htmlFile.getName()
        name = name.substring(0, name.lastIndexOf("."))

        File xhtmlFile = new File(outDir, name+".xhtml")

        // 2. HTML -> XHTML. The file is parsed locally (Jsoup.connect only fetches remote
        //    URLs and returns a Connection, not a Document). XML syntax and the xhtml
        //    escape mode are requested so that the output can be read by an XML parser.
        xhtmlFile.withWriter("UTF-8") { out ->
                def doc = org.jsoup.Jsoup.parse(tmpHTML, "UTF-8")
                doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml)
                doc.outputSettings().escapeMode(org.jsoup.nodes.Entities.EscapeMode.xhtml)
                println "current charset: "+doc.charset()
                doc.charset(Charset.forName("UTF-8"))
                println "current charset: "+doc.charset()
                out.println(doc.outerHtml())
        }

        // 3. Only files accepted by the XML validation are processed.
        if (!ValidateXml.test(xhtmlFile)) {
                println "WARNING: $xhtmlFile did not pass the XML validation, file skipped"
                continue
        }

        def root = new XmlParser().parse(xhtmlFile)
        def tables = root.body.table.tbody.tr.td
        if (tables.size() == 0) tables = root.body.table.tr.td

        for (def text : tables) {
                String sign = ""

                if (!(text instanceof groovy.util.Node)) { println "NOT NODE: "+text; continue; }

                def textMetadatas = [:]
                for (String metadata : metadatas) {
                        textMetadatas[metadata] = ""
                }

                // Nested tables are dropped.
                for (def subtable : text.table) text.remove(subtable)

                // Highlighted (red + bold) spans become <w expFound="y"> elements.
                for (def child : text.span) {
                        if ("color:red; font-weight:bold".equals(child.@style)) {
                                if (debug) println "Found bold: "+child
                                child.replaceNode { node -> w(expFound: "y", child.text())}
                        }
                }

                def startIgnoringText = false // set when the first <b> child is met
                def ichar = 0                 // number of characters of the text, used in the signature
                String title = ""
                def ignoredText = ""          // content found after the first <b>, holds the document number
                def children = text.children()
                for (int idx = 0 ; idx < children.size() ; idx++) {

                        def child = children[idx]
                        if (debug) println "child: $child"

                        if (startIgnoringText) {
                                // Everything after the first <b> is removed from the text and
                                // accumulated in ignoredText.
                                if (debug) println "Ignoring text : "+ignoredText
                                if (child instanceof String) ignoredText += child
                                else ignoredText += child.text()

                                text.children().remove(idx)
                                idx-- // the next child now sits at the same index
                                continue
                        }

                        if (child instanceof String) {
                                ichar += child.length()
                                continue
                        }

                        ichar += child.text().length()
                        //                allTags.add(child.name().getLocalPart())
                        //                allClasses.add(child.@class)
                        //                allStyles.add(child.@style)
                        def nn = child.name()
                        if (nn instanceof QName) nn = nn.getLocalPart()

                        switch (nn) {
                                case "br": break;
                                case "span":
                                        if (debug) println "Found span $child"
                                        String classV = child.@class
                                        String style = child.@style
                                        if (classV != null) {
                                                if (metadatas.contains(classV)) {
                                                        // metadata span: its content moves to a text attribute
                                                        textMetadatas[classV] = (textMetadatas[classV] + " " + child.text()).trim()
                                                        text.remove(child)
                                                        idx--
                                                } else if ("TitreArticleVisu" == classV) {
                                                        // title span: kept in the text, renamed to <head>
                                                        title += getText(child)
                                                        renameNode(child, "head")
                                                } else {
                                                        println "UNKNOWED CLASS: "+classV + " = "+child.text().trim().length();
                                                }
                                        } else if (style != null) {
                                                // red + bold spans have already been replaced by <w> above
                                                if (!"color:red; font-weight:bold".equals(style)) {
                                                        println "UNKNOWED STYLE: '"+style+"' = "+child.text().trim();
                                                }
                                        } else {
                                                println "UNKNOWED SPAN: "+child.text().trim();
                                        }
                                        break;

                                case "a": break
                                case "w": break;
                                case "b":
                                        // first <b>: it is removed and so is everything that follows
                                        startIgnoringText = true
                                        text.remove(child)
                                        idx--
                                        break;
                                case "i": break;
                                case "font":
                                        if (debug) println "Found font $child"
                                        String fontStyle = child.@style
                                        if ("font-style:italic;" == fontStyle) {
                                                if (debug) println "ITALIC: "+getText(child).trim();
                                                child.replaceNode { node -> i(getText(child))}
                                        } else if ("font-weight:bold;" == fontStyle) {
                                                if (debug) println "BOLD: "+getText(child).trim();
                                                child.replaceNode { node -> b(getText(child))}
                                        } else {
                                                println "FSTYLE: '"+fontStyle+"' = "+getText(child).trim();
                                        }
                                        break;
                                default: println child.name()
                        }
                }

                // rename td to text
                renameNode(text, "text")

                // Write metadatas
                for (String metadata : metadatas) {
                        text.attributes().put(metadata, textMetadatas[metadata])
                }

                // get document number: whatever follows the prefix in the ignored text
                ignoredText = ignoredText.replaceAll("\n", " ")
                int iNo = ignoredText.indexOf(newPrefix)
                if (iNo >= 0) {
                        String no = ignoredText.substring(iNo+newPrefix.length()).trim()
                        text.attributes().put("idnews", no)
                        // the date is read from characters 5 to 12 of the number, as yyyymmdd
                        if (no.length() >= 13) {
                                String yyyy = no.substring(5, 9)
                                String mm = no.substring(9, 11)
                                String dd = no.substring(11, 13)
                                text.attributes().put("date", yyyy+"-"+mm+"-"+dd)
                                text.attributes().put("yyyymmdd", yyyy+mm+dd)
                                text.attributes().put("yyyymm", yyyy+mm)
                                text.attributes().put("yyyy", yyyy)
                                text.attributes().put("mm", mm)
                                text.attributes().put("dd", dd)
                        } else {
                                println "WARNING: document number too short to read a date: '$no'"
                        }
                }

                // Signature of the text: its title, completed by its length when the length
                // differs by more than LIMITDIFF from the first text seen with the same title.
                sign += " "+title

                if (allTitles[title] == null) allTitles[title] = ichar
                if (Math.abs(allTitles[title] - ichar) > LIMITDIFF) {
                        sign += " "+ichar
                }

                File xmlFile
                if (done.contains(sign)) {
                        // duplicate: written in the 'rejected' directory
                        ignored << sign
                        xmlFile = new File(rejected, name+"_"+formater.format((itext++))+".xml")
                        ignoreds << xmlFile.getName()
                } else {
                        done << sign
                        xmlFile = new File(outDir2, name+"_"+formater.format((itext++))+".xml")
                        dones[sign] = xmlFile
                }

                // withPrintWriter closes the file once the text is written
                xmlFile.withPrintWriter("UTF-8") { xmlWriter ->
                        xmlWriter.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
                        new XmlNodePrinter(xmlWriter).print(text)
                }
        }
}
284

    
285
// Report of the duplicates: for each rejected text, the file that was kept,
// the file that was rejected and the signature they share.
if (ignored.size() > 0) {
        File ignoredFile = new File(rejected, "ignored.txt")
        ignoredFile.withWriter("UTF-8") { writer ->
                writer.println "TOTAL: "+ignored.size()
                ignored.eachWithIndex { sign, i ->
                        writer.println "\n**DUPLICATE\n "
                        writer.println "keeped="+dones[sign]
                        writer.println "rejected="+ignoreds[i]
                        writer.println "SIGN="+sign
                        writer.println "\n"
                }
        }
        println "TOTAL IGNORED: "+ignored.size()
}

// itext starts at 1 and is incremented after each written text,
// so the number of texts written is itext - 1.
println "TOTAL TEXT: ${itext - 1}"

// The intermediate directories are only kept in debug mode.
if (!debug) {
        outDir.deleteDir()
        tmpDir.deleteDir()
}