Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / NiceToXML.groovy @ 187

History | View | Annotate | Download (5.1 kB)

1
package org.txm.importer
2

    
3
import javax.xml.stream.XMLStreamException
4
import org.apache.tools.ant.types.resources.selectors.InstanceOf;
5
import groovy.xml.*
6

    
7
// Corpus root; every working directory below is resolved relative to it.
File root = new File("/home/mdecorde/xml/temoignagesnice/corpus Matrice - fichiers xmlisés/")

// Sub-directories of the corpus root used by the conversion passes of this script.
File srcdir = new File(root, "orig")
File outdir = new File(root, "tmp")
File okdir = new File(root, "ok")
File ok2dir = new File(root, "ok2")
File temoignagedir = new File(root, "temoignages")

// Reset the "ok2" directory: remove it with its content, then create it again.
ok2dir.deleteDir()
ok2dir.mkdir()
15

    
16
// Rename title -> head: for each ".xml" file listed in outdir, every <title>
// reached through body/chapter is renamed to <head>, and the document is then
// written to ok2dir under the same file name.
for (def xmlFile in outdir.listFiles()) {
    String fileName = xmlFile.getName()
    if (fileName.endsWith(".xml")) {
        def document = new XmlParser().parse(xmlFile)

        document.body.chapter.title.each { titleNode ->
            // Trace the node before it is renamed.
            println titleNode
            titleNode.name = "head"
        }

        File target = new File(ok2dir, fileName)
        target.withWriter("UTF-8") { sink ->
            def printer = new XmlNodePrinter(new PrintWriter(sink))
            printer.print(document)
        }
    }
}
30

    
31
/*
32
// ADD chapter@title
33
for (def file : outdir.listFiles()) {
34
        if (!file.getName().endsWith(".xml")) continue;
35
        def doc = new XmlParser().parse(file);
36
        
37
        for (def chapter : doc.body.chapter) {
38
                for (def title : chapter.title) {
39
                        chapter.@title = title.text()
40
                        break;
41
                }
42
        }
43
        
44
        new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
45
                new XmlNodePrinter(new PrintWriter(writer)).print(doc)
46
        }
47
}
48
*/
49
//FIX figure and caption inclusions
50
/*
51
for (def file : outdir.listFiles()) {
52
        if (!file.getName().endsWith(".xml")) continue;
53
        def doc = new XmlParser().parse(file);
54
        for (def note : doc.body."**".figure) {
55
                note.name = "note"
56
                //println "fig : $note"
57
                //if ("Image :" == note.text()) {
58
                        def children = note.parent().children()
59
                        int i = children.indexOf(note)
60
                        //println i + " < "+children.size()
61
                        def nextChild = children[i+1]
62
                        if (nextChild != null && nextChild.name().toString() == "caption") {
63
                                println nextChild
64

65
                                note.value = "Images : "+note.text()// + " "+nextChild.text()
66
                                //println note
67

68
                                children.remove(i+1)
69
                                note.append(nextChild)
70
                        }
71
                //}
72
        }
73

74
        new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
75
                new XmlNodePrinter(new PrintWriter(writer)).print(doc)
76
        }
77
}
78
*/
79
/*
80
// DOCBOOK -> DOCBOOK TEXT ONLY
81
for (def file : outdir.listFiles()) {
82
        def doc = new XmlParser().parse(file);
83

84
        def body = null
85
        def bookinfo = null
86
        def preface = null
87
        for (def e : doc.body) body = e
88

89
        for (def e : doc.bookinfo) {
90
                doc.remove(e)
91
        }
92
        for (def e : doc.preface) {
93
                doc.remove(e)
94
        }
95
        for (def e : doc.appendix) {
96
                doc.remove(e)
97
        }
98
        for (def e : doc.chapter) {
99
                doc.remove(e)
100
        }
101

102
        if (body == null) {
103
                println "error text: "+file
104
                continue
105
        }
106

107
        new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
108
                new XmlNodePrinter(new PrintWriter(writer)).print(doc)
109
        }
110
        //                writer.print XmlUtil.serialize(new StreamingMarkupBuilder().bind {
111
        //                        mkp.yield body
112
        //                  })
113
}
114
*/
115
//DOCBOOK to TEI
116
/*
117
 for (def file : outdir.listFiles()) {
118
 def doc = new XmlParser().parse(file);
119
 def body = null
120
 //def bookinfo = null
121
 for (def e : doc.body) body = e
122
 //for (def e : doc.bookinfo) bookinfo = e
123
 //println body.getClass()
124
 if (body == null) {
125
 println "error text: "+file
126
 continue
127
 }
128
 //        bookinfo.name = "teiHeader"
129
 body.name = "text"
130
 def teins = new groovy.xml.Namespace("http://www.tei-c.org/ns/1.0",'tei')
131
 //        for (def node : body."**") {
132
 //                if (node instanceof String) continue
133
 //                def name = node.name()
134
 //                if (name instanceof String)
135
 //                        node.name = teins.get(name)
136
 //                else 
137
 //                        node.name = teins.get(name.getLocalPart())
138
 //        }
139
 for (def figure : body."**".figure) {
140
 figure.name = "note"
141
 figure.value = "Image : " + figure.caption.text()
142
 }
143
 for (def chapter : body."**".chapter) {
144
 chapter.name = "div"
145
 chapter.@type = "chapter"
146
 }
147
 for (def caption : body."**".title) {
148
 caption.name = "head"
149
 }
150
 for (def para : body."**".para) {
151
 para.name = "p"
152
 }
153
 def newdoc = new Node(null, "TEI");
154
 newdoc.@xmlns="http://www.tei-c.org/ns/1.0";
155
 newdoc.append(new Node(null, "teiHeader"))
156
 newdoc.append(body)
157
 new File(okdir, file.getName()).withWriter("UTF-8") { writer ->
158
 new XmlNodePrinter(new PrintWriter(writer)).print(newdoc)
159
 }
160
 //                writer.print XmlUtil.serialize(new StreamingMarkupBuilder().bind {
161
 //                        mkp.yield body
162
 //                  })
163
 }
164
 */
165

    
166
// remove TEI
167
/*outdir.deleteDir()
168
 outdir.mkdir()
169
 def errors = []
170
 for (def file : srcdir.listFiles()) {
171
 if (file.isDirectory()) continue;
172
 //new EncodingConverter(file, "Windows-1252", "UTF-8")
173
 File outfile = new File(outdir, file.getName());
174
 outfile.withWriter("UTF-8") { writer ->
175
 file.eachLine("UTF-8") { line ->
176
 if (line.trim() == "<TEI>") {
177
 } else if (line.trim() == "</TEI>") {
178
 writer.println("</book>")
179
 } else if (line.trim() == "<book lang=\"fr\"/>") {
180
 writer.println("<book lang=\"fr\">")
181
 } else {
182
 writer.println(line)
183
 }
184
 }                
185
 }
186
 try {
187
 ValidateXml.testAndThrow(outfile);
188
 } catch (XMLStreamException e) {
189
 println file.getName() + " : "+ e.getMessage()
190
 errors << file
191
 if (e.getMessage().contains('Message: The element type "TEI" must be terminated by the matching end-tag "</TEI>"')) {
192
 println "Delete line : "+e.location.lineNumber
193
 }
194
 println ""
195
 }
196
 }
197
 */
198
// Report that the script reached its end.
println("done")
199
//if (errors.size() > 0)
200
//        println ""+errors.size()+" errors : $errors"
201
//String content = file.getText("Windows-1252")
202
//println content