Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / macro / misc / EuroPress2XMLMacro.groovy @ 148

History | View | Annotate | Download (9.1 kB)

1
package org.txm.macro.misc
2
// STANDARD DECLARATIONS
3

    
4
import groovy.transform.Field
5
import groovy.xml.QName
6

    
7
import java.text.DecimalFormat
8

    
9
import org.kohsuke.args4j.*
10
import org.txm.importer.ValidateXml
11
import org.txm.rcpapplication.swt.widget.parameters.*
12
import org.w3c.tidy.Tidy
13

    
14
// BEGINNING OF PARAMETERS
// Each @Field/@Option pair below declares one script-level parameter.
// NOTE(review): the values are presumably filled in by ParametersDialog.open(this) - confirm.

// Root directory of the corpus; the HTML sources are read from its 'orig' sub-directory.
@Field @Option(name="rootDir", usage="The directory contains the 'orig' directory which contains the html files", widget="Folder", required=false, def="/path")
File rootDir = new File("");

// Character encoding used to read the source HTML files.
@Field @Option(name="encoding", usage="HTML encoding", widget="String", required=false, def="iso-8859-1")
String encoding = "iso-8859-1" // HTML files encoding

// Debug switch, handled as text here and converted to a boolean after the dialog.
// The initial value agrees with the default declared in the option (def="false").
@Field @Option(name="debug", usage="show debug messages. Values = true, false", widget="String", required=false, def="false")
def debug = "false" // set true to debug the script

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;

// "true" is accepted whatever its case or surrounding blanks; any other value
// (null included) turns debugging off.
debug = "true".equalsIgnoreCase(debug?.toString()?.trim())
// END OF PARAMETERS
29

    
30
// ---------------------------------------------------------------------------
// Working directories and shared state of the conversion.
// ---------------------------------------------------------------------------
String corpusName = rootDir.getName()

// The result directory is named after the root directory. An empty name would
// make it resolve to rootDir itself, and a name equal to one of the working
// directories would make it alias that directory: in both cases the
// deleteDir() calls below would destroy the sources or the results.
if (corpusName.length() == 0 || ["orig", "xhtml", "duplicates", "tmp"].contains(corpusName)) {
    println "STOP, cannot use '$corpusName' as corpus name (rootDir=$rootDir)"
    return;
}

File srcDir = new File(rootDir, "orig");         // source HTML files
File outDir = new File(rootDir, "xhtml");        // XHTML produced by Tidy
File outDir2 = new File(rootDir, corpusName);    // one XML file per kept text
File rejected = new File(rootDir, "duplicates"); // XML files of the texts detected as duplicates
File tmpDir = new File(rootDir, "tmp");          // UTF-8 copies of the HTML sources

// isDirectory() is false both when srcDir is missing and when it is a plain file.
if (!srcDir.isDirectory()) {
    println "STOP, srcDir does not exists $srcDir"
    return;
}

// Start from empty output directories, and stop early when one cannot be created.
for (File dir : [outDir, outDir2, rejected, tmpDir]) {
    dir.deleteDir()
    if (!dir.mkdir()) {
        println "STOP, could not create directory $dir"
        return;
    }
}

// Only referenced by commented-out diagnostics in the conversion loop.
def allTags = new HashSet<String>();
def allStyles = new HashSet<String>();
def allClasses = new HashSet<String>();

def newPrefix = "Numéro de document : "       // marker that precedes the document number in the ignored trailing text
int itext = 1;                                // index used to number the next written text
def formater = new DecimalFormat("0000");     // formats itext in the output file names
int LIMITDIFF = 10                            // max character-count difference for two same-title texts to share a signature
def metadatas = ["DocPublicationName", "DocHeader"] // span classes copied to attributes of the text element

// HTML files to process, in sorted order.
def files = []
srcDir.eachFileMatch(~/.*\.html/){ htmlFile -> files << htmlFile}
files = files.sort()

def done = new HashSet<String>();  // signatures of the texts already kept
def ignored = []                   // signatures of the rejected texts, in rejection order
def allTitles = [:]                // title -> character count of the first text seen with that title
def dones = [:]                    // signature -> XML file of the kept text
def ignoreds = []                  // names of the rejected XML files, parallel to 'ignored'
69

    
70
/**
 * Flattens a node to a single line of text.
 *
 * A String is taken as is; for anything else the texts of its children()
 * are concatenated recursively, separated by spaces. Line breaks are replaced
 * by spaces and the result is trimmed, then prefixed with one space.
 *
 * @param node a String or an object exposing children()
 * @return the flattened text, always starting with a single space
 */
def getText(def node) {
    StringBuilder buffer = new StringBuilder(" ")
    if (node instanceof String) {
        buffer.append(" ").append(node)
    } else {
        def kids = node.children()
        for (int k = 0 ; k < kids.size() ; k++) {
            buffer.append(" ").append(getText(kids[k]))
        }
    }
    String flat = buffer.toString().replace("\n", " ")
    return " " + flat.trim()
}
82

    
83
// ---------------------------------------------------------------------------
// Conversion loop. For each HTML file:
//   1. write a UTF-8 copy with a few textual clean-ups,
//   2. convert the copy to XHTML with Tidy,
//   3. if the XHTML validates, turn every matching table cell into a <text>
//      element written to its own XML file (duplicates go to 'rejected').
// ---------------------------------------------------------------------------
println "Nb of HTML files: "+files.size()
for (File htmlFile : files) {
    println "Processing file $htmlFile"

    // 1. UTF-8 copy: empty paragraphs become a space, and a space is added
    //    after each <br>.
    File tmpHTML = new File(tmpDir, htmlFile.getName())
    tmpHTML.withWriter("UTF-8") { writer ->
        String txt = htmlFile.getText(encoding)
        txt = txt.replaceAll("<p></p>", " ");
        txt = txt.replaceAll("<p> </p>", " ");
        txt = txt.replaceAll("<br>", "<br> ");
        writer.write(txt)
    }

    // File name up to its first dot.
    String name = htmlFile.getName()
    name = name.substring(0, name.indexOf("."));

    File xhtmlFile = new File(outDir, name+".xhtml")

    // 2. HTML -> XHTML
    Tidy tidy = new Tidy(); // obtain a new Tidy instance
    tidy.setXHTML(true); // set desired config options using tidy setters
    tidy.setInputEncoding("UTF-8")
    tidy.setOutputEncoding("UTF-8")
    tidy.setShowErrors(0)
    tidy.setShowWarnings(false)
    xhtmlFile.withWriter("UTF-8") { out ->
        // withInputStream closes the input stream once Tidy is done with it
        tmpHTML.withInputStream { input ->
            tidy.parse(input, out); // run tidy, providing an input and output stream
        }
    }

    // 3. XHTML -> one XML file per text
    if (ValidateXml.test(xhtmlFile)) {
        def root = new XmlParser().parse(xhtmlFile)
        // The cells are looked up with or without an intermediate <tbody>.
        def tables = root.body.table.tbody.tr.td
        if (tables.size() == 0) tables = root.body.table.tr.td

        for (def text : tables) {
            String sign = "" // signature used to detect duplicated texts

            // '!' binds tighter than 'instanceof': the parentheses are required.
            if (!(text instanceof groovy.util.Node)) { println "NOT NODE: "+text; continue; }

            def textMetadatas = [:]
            for (String metadata : metadatas) {
                textMetadatas[metadata] = ""
            }

            // Drop the tables nested directly in the cell.
            for (def subtable : text.table) text.remove(subtable)

            // Highlighted spans (red + bold) are replaced by <w expFound="y"> elements.
            for (def child : text.span) {
                if ("color:red; font-weight:bold".equals(child.@style)) {
                    if (debug) println "Found bold: "+child
                    child.replaceNode { node -> w(expFound: "y", child.text())}
                }
            }

            def startIgnoringText = false // becomes true at the first <b> child
            def ichar = 0                 // number of characters read before the ignored part
            String title = "";
            def ignoredText = ""          // text of the children removed after the first <b>
            def children = text.children()
            for (int idx = 0 ; idx < children.size() ; idx++) {

                def child  = children[idx]
                if (debug) println "child: $child"

                if (startIgnoringText) {
                    // Everything after the first <b> is removed from the text but
                    // its content is kept: the document number is read from it below.
                    if (debug) println "Ignoring text : "+ignoredText
                    if (child instanceof String) ignoredText += child
                    else ignoredText += child.text()

                    text.children().remove(idx);
                    idx-- // the following child now sits at the same index
                    continue; // next child
                }

                if (child instanceof String) {
                    ichar += child.length()
                } else {
                    ichar += child.text().length()
                    //                allTags.add(child.name().getLocalPart())
                    //                allClasses.add(child.@class)
                    //                allStyles.add(child.@style)
                    def nn = child.name()

                    // Best effort: names that expose no getLocalPart() (plain
                    // Strings for instance) are used unchanged.
                    try {
                        nn = nn.getLocalPart()
                    } catch (Exception ignoredEx) {
                    }
                    switch (nn) {
                        case "br": break;
                        case "span":
                            if (debug) println "Found span $child"
                            String classV = child.@class
                            String spanStyle = child.@style
                            if (classV != null) {
                                if (metadatas.contains(classV)) {
                                    // metadata span: moved from the content to the metadata values
                                    textMetadatas[classV] = (textMetadatas[classV] + " " + child.text()).trim()
                                    text.remove(child);
                                    idx--
                                    //println "METADATA: "+classV + " = "+child.text().trim().length();
                                } else if ("TitreArticleVisu" == classV) {
                                    // title span: renamed to <head>, its text feeds the signature
                                    title += getText(child)//.text().trim().replaceAll("\n", " ")+ " "
                                    child.name = new QName(child.name().getNamespaceURI(), "head", child.name().getPrefix())
                                } else {
                                    println "UNKNOWED CLASS: "+classV + " = "+child.text().trim().length();
                                }
                            } else if (spanStyle != null) {
                                if ("color:red; font-weight:bold".equals(spanStyle)) {
                                    //child.replaceNode { node -> w(expFound: "test")        }
                                    //println "KEYWORD: "+child.text().trim();
                                } else {
                                    println "UNKNOWED STYLE: '"+spanStyle+"' = "+child.text().trim();
                                }
                            } else {
                                println "UNKNOWED SPAN: "+child.text().trim();
                            }
                            break;

                        case "a": break
                        case "w": break;
                        case "b":
                            // first <b>: the rest of the cell is ignored from here on
                            startIgnoringText = true;
                            text.remove(child);
                            idx--
                            break;
                        case "i": break;
                        case "font":
                            if (debug) println "Found font $child"
                            String fontStyle = child.@style
                            if ("font-style:italic;" == fontStyle) {
                                if (debug) println "ITALIC: "+getText(child).trim();
                                child.replaceNode { node -> i(getText(child))}
                            } else if ("font-weight:bold;" == fontStyle) {
                                // the style is compared explicitly: a bare string literal
                                // is always true and made the FSTYLE branch unreachable
                                if (debug) println "BOLD: "+getText(child).trim();
                                child.replaceNode { node -> b(getText(child))}
                            } else {
                                println "FSTYLE: '"+fontStyle+"' = "+getText(child).trim();
                            }
                            break;
                        default: println child.name()
                    }
                }
            }

            //rename td to text
            text.name = new QName(text.name().getNamespaceURI(), "text", text.name().getPrefix())

            //Write metadatas
            for (String metadata : metadatas) {
                text.attributes().put(metadata, textMetadatas[metadata])
                //sign+= " "+textMetadatas[metadata].trim()
            }

            // get document number
            ignoredText = ignoredText.replaceAll("\n", " ")
            int iNo= ignoredText.indexOf(newPrefix);
            //println ignoredText
            if (iNo >= 0) {
                String no =ignoredText.substring(iNo+newPrefix.length()).trim()
                text.attributes().put("idnews", no)
                //sign += " "+no
                // The date is read at fixed offsets: characters 5 to 12 are taken as yyyymmdd.
                if (no.length() >= 13) {
                    text.attributes().put("date", no.substring(5,9)+"-"+no.substring(9,11)+"-"+no.substring(11,13))
                    text.attributes().put("yyyymmdd", no.substring(5,13))
                    text.attributes().put("yyyymm", no.substring(5,11))
                    text.attributes().put("yyyy", no.substring(5,9))
                    text.attributes().put("mm", no.substring(9,11))
                    text.attributes().put("dd", no.substring(11,13))
                } else {
                    println "WARNING: document number too short to read a date: '$no' in $htmlFile"
                }
            }

            // Signature = title, plus the character count when it differs by more
            // than LIMITDIFF from the first text seen with the same title.
            //sign += " "+ichar
            sign += " "+title

            if (allTitles[title] == null) allTitles[title] = ichar
            if (Math.abs(allTitles[title] - ichar) > LIMITDIFF) {
                sign += " "+ichar
            }

            // A signature already seen sends the text to the 'rejected' directory.
            File xmlFile;
            if (done.contains(sign)) {
                ignored << sign
                xmlFile = new File(rejected, name+"_"+formater.format((itext++))+".xml")
                ignoreds << xmlFile.getName()
            } else {
                done << sign;
                xmlFile = new File(outDir2, name+"_"+formater.format((itext++))+".xml")
                dones[sign] = xmlFile
            }

            // The writer is closed even when printing fails, so that the content
            // is flushed and the file handle released.
            def xmlWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(xmlFile) , "UTF-8")); //$NON-NLS-1$
            try {
                xmlWriter.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
                new XmlNodePrinter(xmlWriter).print(text)
            } finally {
                xmlWriter.close()
            }
        }
    }
}
278

    
279
// ---------------------------------------------------------------------------
// Final report and cleanup.
// ---------------------------------------------------------------------------

// When texts were rejected, list each of them in 'ignored.txt' with the file
// that was kept for the same signature.
if (ignored.size() > 0) {
    File ignoredFile = new File (rejected, "ignored.txt");
    ignoredFile.withWriter("UTF-8") { writer ->
        writer.println "TOTAL: "+ignored.size()
        // 'ignored' and 'ignoreds' are filled together, so they share their indexes
        for (int i = 0 ; i < ignored.size() ; i++) {
            def sign = ignored[i]
            writer.println "\n**DUPLICATE\n "
            writer.println "keeped="+dones[sign];
            writer.println "rejected="+ignoreds[i];
            writer.println "SIGN="+sign
            writer.println "\n"
        }
    }
    println "TOTAL IGNORED: "+ignored.size()
}

// itext starts at 1 and is incremented after each written text, so the number
// of texts written is itext - 1.
println "TOTAL TEXT: ${itext - 1}"

// The intermediate directories are only kept in debug mode.
if (!debug) {
    outDir.deleteDir()
    tmpDir.deleteDir()
}