Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / NiceToXML.groovy @ 1688

History | View | Annotate | Download (5.1 kB)

1 1000 mdecorde
package org.txm.scripts.importer
2 881 mdecorde
3 881 mdecorde
import javax.xml.stream.XMLStreamException
4 881 mdecorde
import org.apache.tools.ant.types.resources.selectors.InstanceOf;
5 881 mdecorde
import groovy.xml.*
6 881 mdecorde
7 881 mdecorde
// Working directories for the conversion passes below.
// The root is a hard-coded absolute path; every other directory is resolved beneath it.
def root = new File("/home/mdecorde/xml/temoignagesnice/corpus Matrice - fichiers xmlisés/")
8 881 mdecorde
// "orig": only referenced by the disabled "remove TEI" pass further down.
File srcdir = new File(root, "orig");
9 881 mdecorde
// "tmp": input directory read by the active pass (and by most of the disabled ones).
File outdir = new File(root, "tmp");
10 881 mdecorde
// "ok": only referenced by the disabled "DOCBOOK to TEI" pass.
File okdir = new File(root, "ok");
11 881 mdecorde
// "ok2": output directory written by the active pass.
File ok2dir = new File(root, "ok2");
12 881 mdecorde
// "temoignages": not referenced anywhere else in the visible script.
File temoignagedir = new File(root, "temoignages");
13 881 mdecorde
// Start from an empty output directory: delete it recursively, then recreate it.
ok2dir.deleteDir()
14 881 mdecorde
ok2dir.mkdir()
15 881 mdecorde
16 881 mdecorde
// Active pass: rename <title> elements to <head>.
// For every *.xml file in outdir, parse it, rename each node reached through
// body/chapter/title, and write the resulting tree to ok2dir under the same file name.
17 881 mdecorde
for (def file : outdir.listFiles()) {
18 881 mdecorde
        // Skip anything whose name does not end with ".xml".
        if (!file.getName().endsWith(".xml")) continue;
19 881 mdecorde
        def doc = new XmlParser().parse(file);
20 881 mdecorde
21 881 mdecorde
        // The loop variable is called "note" but it iterates over the title nodes.
        for (def note : doc.body.chapter.title) {
22 881 mdecorde
                // Trace the node on stdout before it is renamed.
                println note
23 881 mdecorde
                // NOTE(review): groovy.util.Node exposes no public setter for its name;
                // confirm this property assignment really renames the element with the
                // Groovy version in use.
                note.name = "head"
24 881 mdecorde
        }
25 881 mdecorde
26 881 mdecorde
        // Serialize the modified tree as UTF-8 into ok2dir, keeping the input file name.
        new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
27 881 mdecorde
                new XmlNodePrinter(new PrintWriter(writer)).print(doc)
28 881 mdecorde
        }
29 881 mdecorde
}
30 881 mdecorde
31 881 mdecorde
/*
32 881 mdecorde
// ADD chapter@title
33 881 mdecorde
for (def file : outdir.listFiles()) {
34 881 mdecorde
        if (!file.getName().endsWith(".xml")) continue;
35 881 mdecorde
        def doc = new XmlParser().parse(file);
36 881 mdecorde

37 881 mdecorde
        for (def chapter : doc.body.chapter) {
38 881 mdecorde
                for (def title : chapter.title) {
39 881 mdecorde
                        chapter.@title = title.text()
40 881 mdecorde
                        break;
41 881 mdecorde
                }
42 881 mdecorde
        }
43 881 mdecorde

44 881 mdecorde
        new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
45 881 mdecorde
                new XmlNodePrinter(new PrintWriter(writer)).print(doc)
46 881 mdecorde
        }
47 881 mdecorde
}
48 881 mdecorde
*/
49 881 mdecorde
//FIX figure and caption inclusions
50 881 mdecorde
/*
51 881 mdecorde
for (def file : outdir.listFiles()) {
52 881 mdecorde
        if (!file.getName().endsWith(".xml")) continue;
53 881 mdecorde
        def doc = new XmlParser().parse(file);
54 881 mdecorde
        for (def note : doc.body."**".figure) {
55 881 mdecorde
                note.name = "note"
56 881 mdecorde
                //println "fig : $note"
57 881 mdecorde
                //if ("Image :" == note.text()) {
58 881 mdecorde
                        def children = note.parent().children()
59 881 mdecorde
                        int i = children.indexOf(note)
60 881 mdecorde
                        //println i + " < "+children.size()
61 881 mdecorde
                        def nextChild = children[i+1]
62 881 mdecorde
                        if (nextChild != null && nextChild.name().toString() == "caption") {
63 881 mdecorde
                                println nextChild
64 881 mdecorde

65 881 mdecorde
                                note.value = "Images : "+note.text()// + " "+nextChild.text()
66 881 mdecorde
                                //println note
67 881 mdecorde

68 881 mdecorde
                                children.remove(i+1)
69 881 mdecorde
                                note.append(nextChild)
70 881 mdecorde
                        }
71 881 mdecorde
                //}
72 881 mdecorde
        }
73 881 mdecorde

74 881 mdecorde
        new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
75 881 mdecorde
                new XmlNodePrinter(new PrintWriter(writer)).print(doc)
76 881 mdecorde
        }
77 881 mdecorde
}
78 881 mdecorde
*/
79 881 mdecorde
/*
80 881 mdecorde
// DOCBOOK -> DOCBOOK TEXT ONLY
81 881 mdecorde
for (def file : outdir.listFiles()) {
82 881 mdecorde
        def doc = new XmlParser().parse(file);
83 881 mdecorde

84 881 mdecorde
        def body = null
85 881 mdecorde
        def bookinfo = null
86 881 mdecorde
        def preface = null
87 881 mdecorde
        for (def e : doc.body) body = e
88 881 mdecorde

89 881 mdecorde
        for (def e : doc.bookinfo) {
90 881 mdecorde
                doc.remove(e)
91 881 mdecorde
        }
92 881 mdecorde
        for (def e : doc.preface) {
93 881 mdecorde
                doc.remove(e)
94 881 mdecorde
        }
95 881 mdecorde
        for (def e : doc.appendix) {
96 881 mdecorde
                doc.remove(e)
97 881 mdecorde
        }
98 881 mdecorde
        for (def e : doc.chapter) {
99 881 mdecorde
                doc.remove(e)
100 881 mdecorde
        }
101 881 mdecorde

102 881 mdecorde
        if (body == null) {
103 881 mdecorde
                println "error text: "+file
104 881 mdecorde
                continue
105 881 mdecorde
        }
106 881 mdecorde

107 881 mdecorde
        new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
108 881 mdecorde
                new XmlNodePrinter(new PrintWriter(writer)).print(doc)
109 881 mdecorde
        }
110 881 mdecorde
        //                writer.print XmlUtil.serialize(new StreamingMarkupBuilder().bind {
111 881 mdecorde
        //                        mkp.yield body
112 881 mdecorde
        //                  })
113 881 mdecorde
}
114 881 mdecorde
*/
115 881 mdecorde
//DOCBOOK to TEI
116 881 mdecorde
/*
117 881 mdecorde
 for (def file : outdir.listFiles()) {
118 881 mdecorde
 def doc = new XmlParser().parse(file);
119 881 mdecorde
 def body = null
120 881 mdecorde
 //def bookinfo = null
121 881 mdecorde
 for (def e : doc.body) body = e
122 881 mdecorde
 //for (def e : doc.bookinfo) bookinfo = e
123 881 mdecorde
 //println body.getClass()
124 881 mdecorde
 if (body == null) {
125 881 mdecorde
 println "error text: "+file
126 881 mdecorde
 continue
127 881 mdecorde
 }
128 881 mdecorde
 //        bookinfo.name = "teiHeader"
129 881 mdecorde
 body.name = "text"
130 881 mdecorde
 def teins = new groovy.xml.Namespace("http://www.tei-c.org/ns/1.0",'tei')
131 881 mdecorde
 //        for (def node : body."**") {
132 881 mdecorde
 //                if (node instanceof String) continue
133 881 mdecorde
 //                def name = node.name()
134 881 mdecorde
 //                if (name instanceof String)
135 881 mdecorde
 //                        node.name = teins.get(name)
136 881 mdecorde
 //                else
137 881 mdecorde
 //                        node.name = teins.get(name.getLocalPart())
138 881 mdecorde
 //        }
139 881 mdecorde
 for (def figure : body."**".figure) {
140 881 mdecorde
 figure.name = "note"
141 881 mdecorde
 figure.value = "Image : " + figure.caption.text()
142 881 mdecorde
 }
143 881 mdecorde
 for (def chapter : body."**".chapter) {
144 881 mdecorde
 chapter.name = "div"
145 881 mdecorde
 chapter.@type = "chapter"
146 881 mdecorde
 }
147 881 mdecorde
 for (def caption : body."**".title) {
148 881 mdecorde
 caption.name = "head"
149 881 mdecorde
 }
150 881 mdecorde
 for (def para : body."**".para) {
151 881 mdecorde
 para.name = "p"
152 881 mdecorde
 }
153 881 mdecorde
 def newdoc = new Node(null, "TEI");
154 881 mdecorde
 newdoc.@xmlns="http://www.tei-c.org/ns/1.0";
155 881 mdecorde
 newdoc.append(new Node(null, "teiHeader"))
156 881 mdecorde
 newdoc.append(body)
157 881 mdecorde
 new File(okdir, file.getName()).withWriter("UTF-8") { writer ->
158 881 mdecorde
 new XmlNodePrinter(new PrintWriter(writer)).print(newdoc)
159 881 mdecorde
 }
160 881 mdecorde
 //                writer.print XmlUtil.serialize(new StreamingMarkupBuilder().bind {
161 881 mdecorde
 //                        mkp.yield body
162 881 mdecorde
 //                  })
163 881 mdecorde
 }
164 881 mdecorde
 */
165 881 mdecorde
166 881 mdecorde
// remove TEI
167 881 mdecorde
/*outdir.deleteDir()
168 881 mdecorde
 outdir.mkdir()
169 881 mdecorde
 def errors = []
170 881 mdecorde
 for (def file : srcdir.listFiles()) {
171 881 mdecorde
 if (file.isDirectory()) continue;
172 881 mdecorde
 //new EncodingConverter(file, "Windows-1252", "UTF-8")
173 881 mdecorde
 File outfile = new File(outdir, file.getName());
174 881 mdecorde
 outfile.withWriter("UTF-8") { writer ->
175 881 mdecorde
 file.eachLine("UTF-8") { line ->
176 881 mdecorde
 if (line.trim() == "<TEI>") {
177 881 mdecorde
 } else if (line.trim() == "</TEI>") {
178 881 mdecorde
 writer.println("</book>")
179 881 mdecorde
 } else if (line.trim() == "<book lang=\"fr\"/>") {
180 881 mdecorde
 writer.println("<book lang=\"fr\">")
181 881 mdecorde
 } else {
182 881 mdecorde
 writer.println(line)
183 881 mdecorde
 }
184 881 mdecorde
 }
185 881 mdecorde
 }
186 881 mdecorde
 try {
187 881 mdecorde
 ValidateXml.testAndThrow(outfile);
188 881 mdecorde
 } catch (XMLStreamException e) {
189 881 mdecorde
 println file.getName() + " : "+ e.getMessage()
190 881 mdecorde
 errors << file
191 881 mdecorde
 if (e.getMessage().contains('Message: The element type "TEI" must be terminated by the matching end-tag "</TEI>"')) {
192 881 mdecorde
 println "Delete line : "+e.location.lineNumber
193 881 mdecorde
 }
194 881 mdecorde
 println ""
195 881 mdecorde
 }
196 881 mdecorde
 }
197 881 mdecorde
 */
198 881 mdecorde
// Signal on stdout that the script reached its end.
println "done"
199 881 mdecorde
//if (errors.size() > 0)
200 881 mdecorde
//        println ""+errors.size()+" errors : $errors"
201 881 mdecorde
//String content = file.getText("Windows-1252")
202 881 mdecorde
//println content