root / tmp / org.txm.core / src / java / org / txm / scripts / importer / NiceToXML.groovy @ 2473
History | View | Annotate | Download (5.1 kB)
1 |
package org.txm.scripts.importer
|
---|---|
2 |
|
3 |
import javax.xml.stream.XMLStreamException |
4 |
import org.apache.tools.ant.types.resources.selectors.InstanceOf; |
5 |
import groovy.xml.* |
6 |
|
7 |
// Working directories used by the conversion passes of this script.
// NOTE(review): the root is a machine-specific absolute path; adjust it before running elsewhere.
def root = new File("/home/mdecorde/xml/temoignagesnice/corpus Matrice - fichiers xmlisés/")
File srcdir = new File(root, "orig")
File outdir = new File(root, "tmp")
File okdir = new File(root, "ok")
File ok2dir = new File(root, "ok2")
File temoignagedir = new File(root, "temoignages")

// Start from an empty "ok2" output directory. Both calls report failure through
// their boolean result; surface it here instead of letting a later write fail
// with a less obvious error.
if (!ok2dir.deleteDir()) {
	println "warning: could not delete " + ok2dir
}
if (!ok2dir.mkdir()) {
	println "warning: could not create " + ok2dir
}
15 |
|
16 |
// Pass: rename every <title> found under body/chapter to <head>,
// then write the document into ok2dir under the same file name.
for (def xmlFile in outdir.listFiles()) {
	String fileName = xmlFile.getName()
	if (fileName.endsWith(".xml")) {
		def doc = new XmlParser().parse(xmlFile)

		doc.body.chapter.title.each { titleNode ->
			println titleNode    // trace the node before it is renamed
			titleNode.name = "head"
		}

		File target = new File(ok2dir, fileName)
		target.withWriter("UTF-8") { out ->
			def printer = new XmlNodePrinter(new PrintWriter(out))
			printer.print(doc)
		}
	}
}
30 |
|
31 |
/*
// ADD chapter@title
for (def file : outdir.listFiles()) {
	if (!file.getName().endsWith(".xml")) continue;
	def doc = new XmlParser().parse(file);

	for (def chapter : doc.body.chapter) {
		for (def title : chapter.title) {
			chapter.@title = title.text()
			break;
		}
	}

	new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
		new XmlNodePrinter(new PrintWriter(writer)).print(doc)
	}
}
*/
|
49 |
//FIX figure and caption inclusions
/*
for (def file : outdir.listFiles()) {
	if (!file.getName().endsWith(".xml")) continue;
	def doc = new XmlParser().parse(file);
	for (def note : doc.body."**".figure) {
		note.name = "note"
		//println "fig : $note"
		//if ("Image :" == note.text()) {
		def children = note.parent().children()
		int i = children.indexOf(note)
		//println i + " < "+children.size()
		def nextChild = children[i+1]
		if (nextChild != null && nextChild.name().toString() == "caption") {
			println nextChild

			note.value = "Images : "+note.text()// + " "+nextChild.text()
			//println note

			children.remove(i+1)
			note.append(nextChild)
		}
		//}
	}

	new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
		new XmlNodePrinter(new PrintWriter(writer)).print(doc)
	}
}
*/
|
79 |
/*
// DOCBOOK -> DOCBOOK TEXT ONLY
for (def file : outdir.listFiles()) {
	def doc = new XmlParser().parse(file);

	def body = null
	def bookinfo = null
	def preface = null
	for (def e : doc.body) body = e

	for (def e : doc.bookinfo) {
		doc.remove(e)
	}
	for (def e : doc.preface) {
		doc.remove(e)
	}
	for (def e : doc.appendix) {
		doc.remove(e)
	}
	for (def e : doc.chapter) {
		doc.remove(e)
	}

	if (body == null) {
		println "error text: "+file
		continue
	}

	new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
		new XmlNodePrinter(new PrintWriter(writer)).print(doc)
	}
	// writer.print XmlUtil.serialize(new StreamingMarkupBuilder().bind {
	// mkp.yield body
	// })
}
*/
|
115 |
//DOCBOOK to TEI
/*
for (def file : outdir.listFiles()) {
	def doc = new XmlParser().parse(file);
	def body = null
	//def bookinfo = null
	for (def e : doc.body) body = e
	//for (def e : doc.bookinfo) bookinfo = e
	//println body.getClass()
	if (body == null) {
		println "error text: "+file
		continue
	}
	// bookinfo.name = "teiHeader"
	body.name = "text"
	def teins = new groovy.xml.Namespace("http://www.tei-c.org/ns/1.0",'tei')
	// for (def node : body."**") {
	// if (node instanceof String) continue
	// def name = node.name()
	// if (name instanceof String)
	// node.name = teins.get(name)
	// else
	// node.name = teins.get(name.getLocalPart())
	// }
	for (def figure : body."**".figure) {
		figure.name = "note"
		figure.value = "Image : " + figure.caption.text()
	}
	for (def chapter : body."**".chapter) {
		chapter.name = "div"
		chapter.@type = "chapter"
	}
	for (def caption : body."**".title) {
		caption.name = "head"
	}
	for (def para : body."**".para) {
		para.name = "p"
	}
	def newdoc = new Node(null, "TEI");
	newdoc.@xmlns="http://www.tei-c.org/ns/1.0";
	newdoc.append(new Node(null, "teiHeader"))
	newdoc.append(body)
	new File(okdir, file.getName()).withWriter("UTF-8") { writer ->
		new XmlNodePrinter(new PrintWriter(writer)).print(newdoc)
	}
	// writer.print XmlUtil.serialize(new StreamingMarkupBuilder().bind {
	// mkp.yield body
	// })
}
*/
|
165 |
|
166 |
// remove TEI
/*outdir.deleteDir()
outdir.mkdir()
def errors = []
for (def file : srcdir.listFiles()) {
	if (file.isDirectory()) continue;
	//new EncodingConverter(file, "Windows-1252", "UTF-8")
	File outfile = new File(outdir, file.getName());
	outfile.withWriter("UTF-8") { writer ->
		file.eachLine("UTF-8") { line ->
			if (line.trim() == "<TEI>") {
			} else if (line.trim() == "</TEI>") {
				writer.println("</book>")
			} else if (line.trim() == "<book lang=\"fr\"/>") {
				writer.println("<book lang=\"fr\">")
			} else {
				writer.println(line)
			}
		}
	}
	try {
		ValidateXml.testAndThrow(outfile);
	} catch (XMLStreamException e) {
		println file.getName() + " : "+ e.getMessage()
		errors << file
		if (e.getMessage().contains('Message: The element type "TEI" must be terminated by the matching end-tag "</TEI>"')) {
			println "Delete line : "+e.location.lineNumber
		}
		println ""
	}
}
*/
|
198 |
// Signal on standard output that the script reached its end.
println('done')
// Disabled leftovers: `errors` is only declared inside the commented-out "remove TEI" pass.
//if (errors.size() > 0)
//	println ""+errors.size()+" errors : $errors"
//String content = file.getText("Windows-1252")
//println content
|