Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / conversion / EuroPressToXML2018Macro.groovy @ 2769

History | View | Annotate | Download (14.9 kB)

1
package org.txm.macro.conversion
2
// STANDARD DECLARATIONS
3

    
4
import groovy.xml.QName
5
import java.text.DecimalFormat
6
import org.txm.utils.xml.DomUtils
7
import org.txm.importer.ValidateXml
8
import groovy.util.XmlParser
9
import org.kohsuke.args4j.*
10
import groovy.transform.Field
11
import org.txm.rcpapplication.swt.widget.parameters.*
12
import org.jsoup.Jsoup
13
import org.jsoup.nodes.Document.OutputSettings.Syntax
14

    
15
// README
16
// This macro needs the following library: jsoup-1.11.3.jar
17

    
18
// BEGINNING OF PARAMETERS
19

    
20
// Macro parameters. Each @Field/@Option pair declares one entry of the
// parameters dialog opened below; the annotated variable receives the value.

@Field @Option(name="inputDir", usage="The directory containing the html files, to export from the Europress portal", widget="Folder", required=true, def="")
def inputDir

@Field @Option(name="inputEncoding", usage="character encoding used in the HTML exported files", widget="String", required=false, def="iso-8859-1")
String inputEncoding

@Field @Option(name="outputDir", usage="The directory containing the result files, to import with the XTZ+CSV import module into TXM", widget="Folder", required=true, def="")
def outputDir

@Field @Option(name="corpusName", usage="corpus name", widget="String", required=true, def="")
String corpusName

// Separator written between the columns of metadata.csv
@Field @Option(name="columnSeparator",usage="", widget="String", required=false, def=",")
def columnSeparator

// Character wrapped around each text cell of metadata.csv
@Field @Option(name="txtSeparator",usage="", widget="String", required=false, def="\"")
def txtSeparator

@Field @Option(name="debug", usage="show debug messages and keep temporary results", widget="Boolean", required=false, def="false")
def debug

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return

// END OF PARAMETERS

// Validate the parameters before anything is written or deleted.
if (!inputDir.exists()) {
        println "** inputDir does not exist: $inputDir, aborting."
        return false
}
if (!inputDir.isDirectory()) {
        println "** inputDir is not a directory: $inputDir, aborting."
        return false
}
// The output directory is recursively deleted further down in this script:
// refuse to run when that would also delete the HTML sources.
if (outputDir != null && inputDir.getCanonicalFile().toPath().startsWith(outputDir.getCanonicalFile().toPath())) {
        println "** outputDir ($outputDir) is or contains inputDir ($inputDir) and would be deleted, aborting."
        return false
}
50

    
51
xslposttokContent = """<?xml version="1.0"?>
52
<xsl:stylesheet xmlns:edate="http://exslt.org/dates-and-times"
53
  xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:tei="http://www.tei-c.org/ns/1.0"
54
  xmlns:txm="http://textometrie.org/ns/1.0"
55
  exclude-result-prefixes="tei edate" xpath-default-namespace="http://www.tei-c.org/ns/1.0" version="2.0">
56

57
  <!--
58
This software is dual-licensed:
59

60
1. Distributed under a Creative Commons Attribution-ShareAlike 3.0
61
Unported License http://creativecommons.org/licenses/by-sa/3.0/ 
62

63
2. http://www.opensource.org/licenses/BSD-2-Clause
64
                
65
All rights reserved.
66

67
Redistribution and use in source and binary forms, with or without
68
modification, are permitted provided that the following conditions are
69
met:
70

71
* Redistributions of source code must retain the above copyright
72
notice, this list of conditions and the following disclaimer.
73

74
* Redistributions in binary form must reproduce the above copyright
75
notice, this list of conditions and the following disclaimer in the
76
documentation and/or other materials provided with the distribution.
77

78
This software is provided by the copyright holders and contributors
79
"as is" and any express or implied warranties, including, but not
80
limited to, the implied warranties of merchantability and fitness for
81
a particular purpose are disclaimed. In no event shall the copyright
82
holder or contributors be liable for any direct, indirect, incidental,
83
special, exemplary, or consequential damages (including, but not
84
limited to, procurement of substitute goods or services; loss of use,
85
data, or profits; or business interruption) however caused and on any
86
theory of liability, whether in contract, strict liability, or tort
87
(including negligence or otherwise) arising in any way out of the use
88
of this software, even if advised of the possibility of such damage.
89

90
     
91
This stylesheet adds a ref attribute to w elements that will be used for
92
references in TXM concordances. Can be used with TXM XTZ import module.
93

94
w ref is composed of :
95
- docpublicationname
96
- date
97

98
Written by Alexei Lavrentiev, UMR 5317 IHRIM, 2017
99
Serge Heiden, UMR 5317 IHRIM, 2018
100
  -->
101

102

103
  <xsl:output method="xml" encoding="utf-8" omit-xml-declaration="no"/> 
104
  
105
  
106
  <!-- General patterns: all elements, attributes, comments and processing instructions are copied -->
107
  
108
  <xsl:template match="*">      
109
        <xsl:copy>
110
          <xsl:apply-templates select="*|@*|processing-instruction()|comment()|text()"/>
111
        </xsl:copy>    
112
  </xsl:template>
113
  
114
  <xsl:template match="*" mode="position"><xsl:value-of select="count(preceding-sibling::*)"/></xsl:template>
115

116
  <xsl:template match="@*|comment()|processing-instruction()">
117
    <xsl:copy/>
118
  </xsl:template>
119
  
120
  <xsl:template match="*:w">
121
    <xsl:variable name="ref">
122

123
        <xsl:choose>
124
         <xsl:when test="ancestor::*:text[1]/@ref">
125
           <!-- <xsl:text>ref: </xsl:text> -->
126
           <xsl:value-of select="ancestor::*:text[1]/@ref"/>
127
         </xsl:when>
128
         <xsl:otherwise>
129
          <!-- <xsl:text>[NO docpublicationname]</xsl:text> -->
130
         </xsl:otherwise>
131
       </xsl:choose>
132
        <xsl:choose>
133
         <xsl:when test="ancestor::*:text[1]/@docpublicationname">
134
           <!-- <xsl:text>docpublicationname: </xsl:text> -->
135
           <xsl:value-of select="ancestor::*:text[1]/@docpublicationname"/>
136
         </xsl:when>
137
         <xsl:otherwise>
138
          <!-- <xsl:text>[NO docpublicationname]</xsl:text> -->
139
         </xsl:otherwise>
140
       </xsl:choose>
141
        <xsl:choose>
142
         <xsl:when test="ancestor::*:text[1]/@date">
143
           <!-- <xsl:text>date: </xsl:text> -->
144
           <xsl:value-of select="ancestor::*:text[1]/@date"/>
145
         </xsl:when>
146
         <xsl:otherwise>
147
          <!-- <xsl:text>[NO date]</xsl:text> -->
148
         </xsl:otherwise>
149
       </xsl:choose>
150

151
<!--
152
      <xsl:if test="ancestor::*:text/@*:id and preceding::*:pb[1]/@n">
153
        <xsl:text>, </xsl:text>
154
      </xsl:if>
155
-->
156
      <xsl:if test="ancestor::*:p[1]/@n">
157
        <xsl:text>§ </xsl:text>
158
        <xsl:value-of select="ancestor::*:p[1]/@n"/>
159
      </xsl:if>
160
<!--
161
      <xsl:if test="preceding::*:pb[1]/@n">
162
        <xsl:text>p. </xsl:text>
163
        <xsl:value-of select="preceding::*:pb[1]/@n"/>
164
      </xsl:if>      <xsl:if test="(ancestor::*:text/@*:id or preceding::*:pb[1]/@n) and preceding::*:lb[1]/@n">
165
        <xsl:text>, </xsl:text>
166
      </xsl:if>
167
      <xsl:if test="preceding::*:lb[1]/@n">
168
        <xsl:text>l. </xsl:text>
169
        <xsl:value-of select="preceding::*:lb[1]/@n"/>
170
      </xsl:if>
171
-->
172
    </xsl:variable>
173
        <xsl:copy>
174
          <xsl:apply-templates select="@*"/>
175
          <xsl:attribute name="ref"><xsl:value-of select="\$ref"/></xsl:attribute>
176
          <xsl:apply-templates select="*|processing-instruction()|comment()|text()"/>
177
        </xsl:copy>
178
  </xsl:template>  
179

180
</xsl:stylesheet>
181
"""
182

    
183
cssContent = """/*  
184
   Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté
185
   Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
186
   @author cbourdot
187
   @author sheiden
188

189
   TXM default CSS 06-2017
190

191
*/
192

193
.txmeditionpage {
194
        font-size: 14px;
195
        text-indent: none;
196
        text-align: justify;
197
        box-shadow: .3125em .3125em .625em 0 #888;
198
        margin: 1.25em auto;
199
        padding: 1.25em;
200
        width: 400px;
201
        min-height: 90%;
202
}
203

204
.txmeditionpb {
205
        text-align: center;
206
}
207

208
.txmeditionpb::before {
209
        content: "- ";
210
}
211

212
.txmeditionpb::after {
213
        content: " -";
214
}
215

216
.txmlettrinep:first-letter {
217
    float: left;
218
    font-size: 6em;
219
    line-height: 1;
220
    margin-right: 0.2em;
221
}
222

223
a {
224
        color:#802520;
225
}
226

227
h1 {
228
        font-size: 20px;
229
        font-variant: small-caps;
230
        text-align: center;
231
        color:#802520;
232
}
233

234
h2 {
235
        font-size: 18px;
236
        font-variant: small-caps;
237
        text-align: center;
238
        color:#802520;
239
}
240

241
h3 {
242
        font-size: 16px;
243
        font-variant: small-caps;
244
        text-align: center;
245
        color:#802520;
246
}
247

248
p {
249
            text-indent: 0.2cm;
250
        text-align: justify;
251
            text-justify: inter-word;        
252
  }
253

254
img {
255
    margin: 10px 10px 10px 10px;
256
}
257

258
td[rend="table-cell-align-right"] {
259
        text-align: right;
260
}
261

262
td[rend="table-cell-align-left"] {
263
        text-align: left;
264
}
265

266
td[rend="table-cell-align-center"] {
267
        text-align: center;
268
}
269
"""
270

    
271
// Prepare the output tree. WARNING: the selected output directory is wiped first.
outputDir.deleteDir()
outputDir.mkdirs() // mkdirs: also create missing parent directories

// From here on, outputDir designates the corpus directory <outputDir>/<corpusName>.
// Everything below lives in a freshly created tree, so no further deleteDir is needed.
outputDir = new File(outputDir, corpusName)
outputDir.mkdir()
if (!outputDir.isDirectory()) {
        println "** could not create the output directory: $outputDir, aborting."
        return false
}

// scratch directory
tmpDir = new File(outputDir, "tmp")
tmpDir.mkdir()

// intermediate XHTML version of each HTML input file
tmpXhtmlOutput = new File(outputDir, "xhtml")
tmpXhtmlOutput.mkdir()

// XML files of the articles whose title was already seen
duplicates = new File(outputDir, "duplicates")
duplicates.mkdir()

// post-tokenization stylesheet
xslDir = new File(outputDir, "xsl")
xslDir.mkdir()
xslposttokDir = new File(xslDir, "3-posttok")
xslposttokDir.mkdir()
xslposttokFile = new File(xslposttokDir, "txm-posttok-addRef-ref.xsl")

// edition stylesheet
cssDir = new File(outputDir, "css")
cssDir.mkdir()
cssFile = new File(cssDir, corpusName+".css")

// Write both resources explicitly as UTF-8: they contain non-ASCII characters
// and the stylesheet's XML declaration names no encoding (so UTF-8 is assumed by
// XML parsers); the platform default charset would corrupt them on some systems.
xslposttokFile.write(xslposttokContent, "UTF-8")
cssFile.write(cssContent, "UTF-8")

metadataFile = new File(outputDir, "metadata.csv")
metadataWriter = metadataFile.newWriter("UTF-8")
303

    
304
// Running article counter and the 4-digit formatter used to build text ids
int itext = 0
def formater = new DecimalFormat("0000")

// @class values of the HTML elements holding metadata
def metadataKeys = ["DocPublicationName", "DocHeader", "titreArticle"]

// @class value of the HTML element holding the text of an article
def textClass = "docOcurrContainer"

// Header row of metadata.csv: the id, one column per metadata element,
// then the columns derived from the DocHeader element (textorder is the date).
def derivedColumns = ["rubrique", "date", "words", "pages", "textorder"]
def headerCells = metadataKeys.collect { it.toLowerCase() } + derivedColumns
metadataWriter.print "id"
headerCells.each { cell -> metadataWriter.print columnSeparator+cell }
metadataWriter.println ""

// HTML files of the input directory, sorted
def files = []
inputDir.eachFileMatch(~/.*\.(html|HTML)/) { files.add(it) }
files = files.sort()

// Bookkeeping for duplicate detection
def done = new HashSet<String>()
def ignored = []
def allTitles = new HashSet()
def dones = [:]
def ignoreds = []
333

    
334
/**
 * Recursively collects the text content of a parsed XML node.
 *
 * @param node a String (text leaf), an object exposing children() such as
 *             groovy.util.Node, or null
 * @return null when node is null (so callers can substitute a default value);
 *         otherwise a single space followed by the concatenated text, with
 *         line breaks replaced by spaces and outer whitespace trimmed
 */
def getText(def node) {
        // a metadata element can be absent from an article: report it as null
        // instead of failing on null.children()
        if (node == null) return null

        String s = " "
        if (node instanceof String) {
                s += " "+node
        } else {
                for (def c : node.children()) {
                        def childText = getText(c)
                        if (childText != null) s += " "+childText
                }
        }
        return " "+s.replace("\n", " ").trim()
}
347

    
348
println files.size()+" files to process."
println "Creating $metadataFile"

// DocHeader layout 1: "<rubrique><weekday d month yyyy> - <n> mots, p. <pages>"
def fullDocHeaderPattern = ~/^(.*)((lundi|mardi|mercredi|jeudi|vendredi|samedi|dimanche) [0-9][0-9]? (janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre) [0-9]{4}) - ([0-9]+) mots, p\. (.*)$/
// DocHeader layout 2: only a "<weekday d month yyyy>" date can be found
def dateOnlyDocHeaderPattern = ~/^(.*)((lundi|mardi|mercredi|jeudi|vendredi|samedi|dimanche) ([0-9][0-9]?) (janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre) ([0-9]{4}))(.*)$/

def frenchDateFormat = java.text.DateFormat.getDateInstance(java.text.DateFormat.FULL, java.util.Locale.FRANCE)
def isoDateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")

// Converts a French long date to yyyy-MM-dd, or "NA" when it cannot be parsed
def toIsoDate = { String frenchDate ->
        try {
                return isoDateFormat.format(frenchDateFormat.parse(frenchDate))
        } catch (java.text.ParseException e) {
                println "can't parse date: '$frenchDate'"
                return "NA"
        }
}

// Wraps a CSV cell in the text separator, doubling any separator it contains.
// String.replace is literal: the user-supplied separator must not be read as a regex.
String quoteChar = (txtSeparator == null) ? "" : txtSeparator.toString()
def csvField = { Object content ->
        return quoteChar+String.valueOf(content).replace(quoteChar, quoteChar+quoteChar)+quoteChar
}

for (File htmlFile : files) {
        println "Processing $htmlFile"

        String name = htmlFile.getName()
        name = name.substring(0, name.lastIndexOf("."))

        File xhtmlFile = new File(tmpXhtmlOutput, name+".xhtml")

        // 1) HTML -> well-formed XHTML with jsoup
        def doc
        if (inputEncoding) { // neither null nor empty
                doc = Jsoup.parse(htmlFile, inputEncoding, "")
        } else {
                doc = Jsoup.parse(htmlFile, "UTF8")
        }
        doc.outputSettings().escapeMode(org.jsoup.nodes.Entities.EscapeMode.xhtml)
        doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml)

        xhtmlFile.withWriter("UTF-8") { out ->
                out.print doc.html().replace("\"=\"\"", "")
        }

        if (!ValidateXml.test(xhtmlFile)) {
                println "Error: $xhtmlFile is malformed."
                continue
        }

        def root = new XmlParser(false, true, true).parse(xhtmlFile)

        // 2) one <article> per text
        for (def article : root.body.article) {
                // first descendant element carrying each metadata class (null when absent)
                def textMetadata = [:]
                for (def key : metadataKeys) {
                        textMetadata[key] = article.'**'.find { node -> node instanceof groovy.util.Node && node["@class"] == key }
                }

                def textContent = article.'**'.find { node -> node instanceof groovy.util.Node && node["@class"] == textClass }
                if (textContent == null) {
                        println "Warning: an article of $htmlFile has no '$textClass' element, skipped."
                        continue
                }

                // the title identifies a text; without a title no duplicate detection is possible
                def titleNode = textMetadata["titreArticle"]
                String sign = (titleNode == null) ? null : getText(titleNode)
                boolean duplicate = (sign != null) && allTitles.contains(sign)

                // build text id
                itext++
                String textId = name+"_"+formater.format(itext)

                File xmlFile
                if (duplicate) {
                        xmlFile = new File(duplicates, textId+".xml")
                        ignored << sign
                        ignoreds << xmlFile.getName()
                } else {
                        xmlFile = new File(outputDir, textId+".xml")
                        if (sign != null) {
                                allTitles.add(sign)
                                dones[sign] = xmlFile.getName() // remembered for the duplicates report
                        }
                }

                // values derived from the DocHeader element
                String rubrique = "NA"
                String date = "NA"
                String words = "NA"
                String pages = "NA"
                String ref = "N/A"

                // write metadata
                metadataWriter.print textId
                for (def key : metadataKeys) {
                        def metadataNode = textMetadata[key]
                        String value = (metadataNode == null) ? null : getText(metadataNode)
                        if (value == null) value = "N/A"

                        if (key == "DocHeader") {
                                def full = fullDocHeaderPattern.matcher(value)
                                if (full.find()) { // rubrique, date, words, pages
                                        rubrique = full.group(1).trim()
                                        date = toIsoDate(full.group(2))
                                        words = full.group(5)
                                        pages = full.group(6)
                                } else { // date only
                                        def dateOnly = dateOnlyDocHeaderPattern.matcher(value)
                                        if (dateOnly.find()) {
                                                date = toIsoDate(dateOnly.group(2))
                                        }
                                }
                        } else if (key == "DocPublicationName") {
                                ref = value.trim()
                        }

                        metadataWriter.print columnSeparator+csvField(value.replaceAll("\n", "").trim())
                }
                ref = ref+", "+date
                metadataWriter.print columnSeparator+csvField(rubrique)
                metadataWriter.print columnSeparator+csvField(date)
                metadataWriter.print columnSeparator+csvField(words)
                metadataWriter.print columnSeparator+csvField(pages)
                metadataWriter.print columnSeparator+csvField(date) // textorder: texts are ordered by date
                metadataWriter.println ""

                // write content
                textContent.name = "text" // set root tag to "text"
                textContent["@id"] = textId // set the text id
                textContent["@ref"] = ref

                // withPrintWriter closes the file even if printing fails
                xmlFile.withPrintWriter("UTF-8") { writer ->
                        writer.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
                        new XmlNodePrinter(writer).print(textContent)
                }
        }
}
485

    
486
metadataWriter.close()

// Report the articles set aside as duplicates (same title as an earlier article)
if (ignored.size() > 0) {
        File ignoredFile = new File (duplicates, "ignored.txt")
        ignoredFile.withWriter("UTF-8") { writer ->
                writer.println "TOTAL: "+ignored.size()
                ignored.eachWithIndex { sign, i ->
                        writer.println "\n**DUPLICATE\n "
                        writer.println "keeped="+dones[sign]
                        writer.println "duplicates="+ignoreds[i] // ignoreds is filled in parallel with ignored
                        writer.println "SIGN="+sign
                        writer.println "\n"
                }
        }
        println "TOTAL IGNORED: "+ignored.size()
        println "        see $ignoredFile for text IDs"
}

println "$itext articles found."
// The debug option is documented as "keep temporary results":
// the temporary directories are therefore removed only when debug is off.
if (!debug) {
        tmpXhtmlOutput.deleteDir()
        tmpDir.deleteDir()
}