Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / misc / RenameFilesMacro.groovy @ 2375

History | View | Annotate | Download (9 kB)

1
package org.txm.macro.misc
2
// STANDARD DECLARATIONS
3

    
4
import groovy.xml.QName
5

    
6
import java.nio.charset.Charset
7
import java.text.DecimalFormat
8
import org.txm.utils.xml.DomUtils;
9
import org.txm.importer.ValidateXml;
10
import groovy.util.XmlParser
11
import org.kohsuke.args4j.*
12
import groovy.transform.Field
13
import org.txm.rcp.swt.widget.parameters.*
14

    
15
// BEGINNING OF PARAMETERS
16
@Field @Option(name="rootDir", usage="The directory contains the 'orig' directory which contains the html files", widget="Folder", required=false, def="/path")
17
File rootDir = new File("");
18

    
19
@Field @Option(name="encoding", usage="HTML encoding", widget="String", required=false, def="iso-8859-1")
20
String encoding = "iso-8859-1" // HTML files encoding
21

    
22
@Field @Option(name="debug", usage="show debug messages. Values = true, false", widget="String", required=false, def="false")
23
def debug = "true" // set true to debug the script
24

    
25
// Open the parameters input dialog box
26
if (!ParametersDialog.open(this)) return;
27

    
28
debug = ("true" == debug)
29
// END OF PARAMETERS
30

    
31
String corpusName = rootDir.getName()
32
File srcDir = new File(rootDir, "orig");
33
File outDir = new File(rootDir, "xhtml");
34
File outDir2 = new File(rootDir, corpusName);
35
File rejected = new File(rootDir, "duplicates");
36
File tmpDir = new File(rootDir, "tmp");
37

    
38
if (!srcDir.exists()) {
39
        println "STOP, srcDir does not exists $srcDir"
40
        return;
41
}
42

    
43
outDir.deleteDir()
44
outDir.mkdir()
45
outDir2.deleteDir()
46
outDir2.mkdir()
47
rejected.deleteDir()
48
rejected.mkdir()
49
tmpDir.deleteDir()
50
tmpDir.mkdir()
51

    
52
def allTags = new HashSet<String>();
53
def allStyles = new HashSet<String>();
54
def allClasses = new HashSet<String>();
55
def newPrefix = "Numéro de document : "
56
int itext = 1;
57
def formater = new DecimalFormat("0000");
58
int LIMITDIFF = 10
59
def metadatas = ["DocPublicationName", "DocHeader"]
60
def files = []
61
srcDir.eachFileMatch(~/.*\.html/){ htmlFile -> files << htmlFile}
62
files = files.sort()
63

    
64
def done = new HashSet<String>();
65
def ignored = []
66
def allTitles = [:]
67
def dones = [:]
68
def ignoreds = []
69

    
70
/**
 * Recursively collects the textual content of a parsed XML node.
 *
 * A String argument is taken as-is; for any other argument the texts of all
 * its children() are gathered, each separated by a space. Line breaks are
 * turned into spaces and the result is trimmed, then prefixed with exactly
 * one space.
 *
 * @param node a String, or an object exposing children() (e.g. groovy.util.Node)
 * @return the flattened text, always starting with a single space
 */
def getText(def node) {
        def buffer = new StringBuilder(" ")
        if (node instanceof String) {
                buffer << " " << node
        } else {
                // depth-first walk: every child contributes its own flattened text
                node.children().each { kid ->
                        buffer << " " << getText(kid)
                }
        }
        String flattened = buffer.toString().replace("\n", " ").trim()
        return " " + flattened
}
83

    
84
println "Nb of HTML files: "+files.size()
85
for (File htmlFile : files) {
86
        println "Processing file $htmlFile"
87
        File tmpHTML = new File(tmpDir, htmlFile.getName())
88
        tmpHTML.withWriter("UTF-8") { writer -> 
89
                String txt = htmlFile.getText(encoding)
90
                txt = txt.replaceAll("<p></p>", " ");
91
                txt = txt.replaceAll("<p> </p>", " ");
92
                txt = txt.replaceAll("<br>", "<br> ");
93
                writer.write(txt)
94
        }
95
        
96
        String name = htmlFile.getName()
97
        name = name.substring(0, name.lastIndexOf("."));
98

    
99
        File xhtmlFile = new File(outDir, name+".xhtml")
100

    
101
        // Convert the cleaned temporary HTML copy into an XHTML file encoded in UTF-8.
        xhtmlFile.withWriter("UTF-8") { out ->
                // tmpHTML is a local file written in UTF-8 above, so it is parsed directly
                // from disk instead of being fetched through an HTTP connection.
                def doc = org.jsoup.Jsoup.parse(tmpHTML, "UTF-8");
                // NOTE(review): XML output syntax is requested so that void elements are closed
                // and the result can pass the ValidateXml.test() check below - confirm intent.
                doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml)
                println "current charset: "+doc.charset()
                doc.charset(Charset.forName("UTF-8"))
                println "current charset: "+doc.charset()
                out.println(doc.outerHtml())
        }
108

    
109
        if (ValidateXml.test(xhtmlFile)) {
110
                def root = new XmlParser().parse(xhtmlFile)
111
                def tables = root.body.table.tbody.tr.td
112
                if (tables.size() == 0) tables = root.body.table.tr.td
113
                //println "Nb of txt : "+tables.size()
114

    
115
                for (def text : tables) {
116
                        String sign = ""
117
                
118
                        // Skip anything that is not an XML node. The parentheses matter: without them
                        // the negation applies to `text` alone and the instanceof test is never true.
                        if (!(text instanceof groovy.util.Node)) { println "NOT NODE: "+text; continue; }
119
                        //println "TEXT "
120
                        //text.setName("text")
121
                        boolean endOfText = false;
122
                        def textMetadatas = [:]
123
                        for (String metadata : metadatas) {
124
                                textMetadatas[metadata] = ""
125
                        }
126
                        for (def subtable : text.table) text.remove(subtable)        
127
                        
128
                        for (def p : text.table.p) p.addChild(" ")                
129
                        
130
                        // Turn every direct <span> child styled red/bold into a <w expFound="y"> element
                        // holding the span's text.
                        for (def child : text.span) {
                                if ("color:red; font-weight:bold".equals(child.@style)) {
                                        //text.remove(child)
                                        if (debug) println "Found bold: "+child
                                        child.replaceNode { node -> w(expFound: "y", child.text())}
                                }
                        }
137
                        
138
                        def startIgnoringText = false
139
                        def tmp =""
140
                        def ichar = 0
141
                        String title = "";
142
                        def ignoredText = ""
143
                        def children = text.children()
144
                        for (int idx = 0 ; idx < children.size() ; idx++) {
145
                                
146
                                def child  = children[idx]
147
                                if (debug) println "child: $child"
148
                        
149
                                if (startIgnoringText) {
150
                                        if (debug) println "Ignoring text : "+ignoredText
151
                                        if (child instanceof String) ignoredText += child
152
                                        else ignoredText += child.text()
153
                                        
154
                                        def t = text.children().remove(idx);
155
                                        //if (tmp.length() > 0) println "removing : "+t
156
                                        idx--
157
                                        continue; // next child
158
                                }
159
                        
160
                                if (child instanceof String) {
161
                                        //println " "+child
162
                                        ichar += child.length()
163
                                } else {
164
                                        ichar += child.text().length()
165
                                        //                allTags.add(child.name().getLocalPart())
166
                                        //                allClasses.add(child.@class)
167
                                        //                allStyles.add(child.@style)
168
                                        def nn = child.name()
169
                                        
170
                                        try {nn = nn.getLocalPart()} catch(Exception e) {}
171
                                        switch (nn) {
172
                                                case "br": break;
173
                                                case "span":
174
                                                if (debug) println "Found span $child"
175
                                                        String classV = child.@class
176
                                                        String style = child.@style
177
                                                        if (classV != null) {
178
                                                                if (metadatas.contains(classV)) {
179
                                                                        textMetadatas[classV] = (textMetadatas[classV] + " " + child.text()).trim()
180
                                                                        text.remove(child);
181
                                                                        idx--
182
                                                                        //println "METADATA: "+classV + " = "+child.text().trim().length();
183
                                                                } else if ("TitreArticleVisu" == classV) {
184
                                                                        title += getText(child)//.text().trim().replaceAll("\n", " ")+ " "
185
                                                                        child.name = new QName(child.name().getNamespaceURI(), "head", child.name().getPrefix())
186
                                                                } else {
187
                                                                        println "UNKNOWED CLASS: "+classV + " = "+child.text().trim().length();
188
                                                                }
189
                                                        } else if (style != null) {
190
                                                                if ("color:red; font-weight:bold".equals(style)) {
191
                                                                        //child.replaceNode { node -> w(expFound: "test")        }
192
                                                                        //println "KEYWORD: "+child.text().trim();
193
                                                                } else {
194
                                                                        println "UNKNOWED STYLE: '"+style+"' = "+child.text().trim();
195
                                                                }
196
                                                        } else {
197
                                                                println "UNKNOWED SPAN: "+child.text().trim();
198
                                                        }
199
                                                        break;
200

    
201
                                                case "a": break
202
                                                case "w": break;
203
                                                case "b":
204
                                                        startIgnoringText = true;
205
                                                        tmp = child.text()
206
                                                        //if (tmp.length() > 0) println "START REMOVING FROM : "+tmp
207
                                                        text.remove(child);
208
                                                        idx--
209
                                                        break;
210
                                                case "i": break;
211
                                                case "font":
                                                        if (debug) println "Found font $child"
                                                        String style = child.@style
                                                        // Inline font styles are mapped to <i> / <b> elements;
                                                        // any other style (or none) is only reported.
                                                        if ("font-style:italic;" == style) {
                                                                if (debug) println "ITALIC: "+getText(child).trim();
                                                                child.replaceNode { node -> i(getText(child))}
                                                        } else if ("font-weight:bold;" == style) {
                                                                if (debug) println "BOLD: "+getText(child).trim();
                                                                child.replaceNode { node -> b(getText(child))}
                                                        } else {
                                                                println "FSTYLE: '"+style+"' = "+getText(child).trim();
                                                        }
                                                        break;
224
                                                default: println child.name()
225
                                        }
226
                                }
227
                        }
228
                        
229
                        //rename td to text
230
                        text.name = new QName(text.name().getNamespaceURI(), "text", text.name().getPrefix())
231
                        
232
                        //Write metadatas
233
                        for( String metadata : metadatas) {
234
                                text.attributes().put(metadata, textMetadatas[metadata])
235
                                //sign+= " "+textMetadatas[metadata].trim()
236
                        }
237
                        
238
                        // get document number: whatever follows newPrefix in the ignored trailing text
                        ignoredText = ignoredText.replaceAll("\n", " ")
                        int iNo = ignoredText.indexOf(newPrefix);
                        //println ignoredText
                        if (iNo >= 0) {
                                String no = ignoredText.substring(iNo+newPrefix.length()).trim()
                                text.attributes().put("idnews", no)
                                //sign += " "+no
                                if (no.length() >= 13) {
                                        // characters 5 to 12 of the number are read as a yyyymmdd date
                                        String yyyy = no.substring(5,9)
                                        String mm = no.substring(9,11)
                                        String dd = no.substring(11,13)
                                        text.attributes().put("date", yyyy+"-"+mm+"-"+dd)
                                        text.attributes().put("yyyymmdd", yyyy+mm+dd)
                                        text.attributes().put("yyyymm", yyyy+mm)
                                        text.attributes().put("yyyy", yyyy)
                                        text.attributes().put("mm", mm)
                                        text.attributes().put("dd", dd)
                                } else {
                                        // a shorter number cannot hold the date part; keep idnews only
                                        // instead of failing the whole run on a substring error
                                        println "WARNING: document number too short to contain a date: '"+no+"'"
                                }
                        }
253
                        
254
                        //sign += " "+ichar
255
                        sign += " "+title
256
                        
257
                        if (allTitles[title] == null) allTitles[title] = ichar
258
                        if (Math.abs(allTitles[title] - ichar) > LIMITDIFF) {
259
                                sign += " "+ichar
260
                        }
261
                        File xmlFile;
262
                        if (done.contains(sign)) {
263
                                ignored << sign
264
                                xmlFile = new File(rejected, name+"_"+formater.format((itext++))+".xml")
265
                                ignoreds << xmlFile.getName()
266
                        } else {
267
                                done << sign;
268
                                xmlFile = new File(outDir2, name+"_"+formater.format((itext++))+".xml")
269
                                dones[sign] = xmlFile
270
                        }
271
                        // Serialise the rewritten <text> node to its own UTF-8 XML file.
                        def writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(xmlFile) , "UTF-8")); //$NON-NLS-1$
                        try {
                                writer.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
                                new XmlNodePrinter(writer).print(text)
                        } finally {
                                // one file is opened per text: always release the handle
                                writer.close()
                        }
274
                }
275
        }
276
}
277

    
278
if (ignored.size() > 0) {
279
        File ignoredFile = new File (rejected, "ignored.txt");
280
        ignoredFile.withWriter("UTF-8") { writer ->
281
                writer.println "TOTAL: "+ignored.size()
282
                for (int i = 0 ; i < ignored.size() ; i++) {
283
                        def sign = ignored[i]
284
                        writer.println "\n**DUPLICATE\n "
285
                        writer.println "keeped="+dones[sign];
286
                        writer.println "rejected="+ignoreds[i];
287
                        writer.println "SIGN="+sign
288
                        writer.println "\n"
289
                }
290
        }
291
        println "TOTAL IGNORED: "+ignored.size()
292
}
293

    
294
// itext starts at 1 and is incremented after each written text, so it is one past the count
println "TOTAL TEXT: ${itext - 1}"
295
if (!debug) {
296
        outDir.deleteDir()
297
        tmpDir.deleteDir()
298
}