// root / tmp / org.txm.core / src / java / org / txm / scripts / importer / NiceToXML.groovy @ 1688
// History | View | Annotate | Download (5.1 kB)
package org.txm.scripts.importer

import javax.xml.stream.XMLStreamException
import org.apache.tools.ant.types.resources.selectors.InstanceOf;
import groovy.xml.*

/*
 * Conversion script for the XML files stored under the hard-coded "root" directory.
 *
 * The script is a sequence of independent stages. Only the first one
 * ("rename title -> head") is active; the others are kept commented out
 * (presumably they are enabled one at a time when needed -- confirm with the author).
 *
 * Directories used by the stages, all relative to "root":
 *   orig : read by the commented "remove TEI" stage
 *   tmp  : written by "remove TEI", read by every other stage
 *   ok   : written by the commented "DOCBOOK to TEI" stage
 *   ok2  : written by the remaining stages; emptied at each run of the script
 */
def root = new File("/home/mdecorde/xml/temoignagesnice/corpus Matrice - fichiers xmlisés/")
File srcdir = new File(root, "orig");
File outdir = new File(root, "tmp");
File okdir = new File(root, "ok");
File ok2dir = new File(root, "ok2");
File temoignagedir = new File(root, "temoignages");

// Start from an empty result directory. Fail right away when it cannot be reset,
// instead of failing later on the first write or mixing stale and fresh results.
if (!ok2dir.deleteDir()) {
	throw new IOException("Could not delete result directory: " + ok2dir)
}
if (!ok2dir.mkdir() && !ok2dir.isDirectory()) {
	throw new IOException("Could not create result directory: " + ok2dir)
}

// File.listFiles() returns null (not an empty array) when the directory
// does not exist or cannot be read.
def inputFiles = outdir.listFiles()
if (inputFiles == null) {
	throw new FileNotFoundException("Input directory not found or not readable: " + outdir)
}

//rename title -> head
for (def file : inputFiles) {
	// only regular *.xml files are processed
	if (!file.isFile() || !file.getName().endsWith(".xml")) continue;
	def doc = new XmlParser().parse(file);

	// only the <title> elements found at body/chapter/title are renamed
	for (def title : doc.body.chapter.title) {
		println title
		title.name = "head"
	}

	// write the modified tree under the same file name in the result directory
	new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
		new XmlNodePrinter(new PrintWriter(writer)).print(doc)
	}
}

/*
// ADD chapter@title
for (def file : outdir.listFiles()) {
	if (!file.getName().endsWith(".xml")) continue;
	def doc = new XmlParser().parse(file);

	for (def chapter : doc.body.chapter) {
		for (def title : chapter.title) {
			chapter.@title = title.text()
			break;
		}
	}

	new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
		new XmlNodePrinter(new PrintWriter(writer)).print(doc)
	}
}
*/
//FIX figure and caption inclusions
/*
for (def file : outdir.listFiles()) {
	if (!file.getName().endsWith(".xml")) continue;
	def doc = new XmlParser().parse(file);
	for (def note : doc.body."**".figure) {
		note.name = "note"
		//println "fig : $note"
		//if ("Image :" == note.text()) {
			def children = note.parent().children()
			int i = children.indexOf(note)
			//println i + " < "+children.size()
			def nextChild = children[i+1]
			if (nextChild != null && nextChild.name().toString() == "caption") {
				println nextChild

				note.value = "Images : "+note.text()// + " "+nextChild.text()
				//println note

				children.remove(i+1)
				note.append(nextChild)
			}
		//}
	}

	new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
		new XmlNodePrinter(new PrintWriter(writer)).print(doc)
	}
}
*/
/*
// DOCBOOK -> DOCBOOK TEXT ONLY
for (def file : outdir.listFiles()) {
	def doc = new XmlParser().parse(file);

	def body = null
	def bookinfo = null
	def preface = null
	for (def e : doc.body) body = e

	for (def e : doc.bookinfo) {
		doc.remove(e)
	}
	for (def e : doc.preface) {
		doc.remove(e)
	}
	for (def e : doc.appendix) {
		doc.remove(e)
	}
	for (def e : doc.chapter) {
		doc.remove(e)
	}

	if (body == null) {
		println "error text: "+file
		continue
	}

	new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
		new XmlNodePrinter(new PrintWriter(writer)).print(doc)
	}
//	writer.print XmlUtil.serialize(new StreamingMarkupBuilder().bind {
//		mkp.yield body
//	})
}
*/
//DOCBOOK to TEI
/*
for (def file : outdir.listFiles()) {
	def doc = new XmlParser().parse(file);
	def body = null
	//def bookinfo = null
	for (def e : doc.body) body = e
	//for (def e : doc.bookinfo) bookinfo = e
	//println body.getClass()
	if (body == null) {
		println "error text: "+file
		continue
	}
//	bookinfo.name = "teiHeader"
	body.name = "text"
	def teins = new groovy.xml.Namespace("http://www.tei-c.org/ns/1.0",'tei')
//	for (def node : body."**") {
//		if (node instanceof String) continue
//		def name = node.name()
//		if (name instanceof String)
//			node.name = teins.get(name)
//		else
//			node.name = teins.get(name.getLocalPart())
//	}
	for (def figure : body."**".figure) {
		figure.name = "note"
		figure.value = "Image : " + figure.caption.text()
	}
	for (def chapter : body."**".chapter) {
		chapter.name = "div"
		chapter.@type = "chapter"
	}
	for (def caption : body."**".title) {
		caption.name = "head"
	}
	for (def para : body."**".para) {
		para.name = "p"
	}
	def newdoc = new Node(null, "TEI");
	newdoc.@xmlns="http://www.tei-c.org/ns/1.0";
	newdoc.append(new Node(null, "teiHeader"))
	newdoc.append(body)
	new File(okdir, file.getName()).withWriter("UTF-8") { writer ->
		new XmlNodePrinter(new PrintWriter(writer)).print(newdoc)
	}
//	writer.print XmlUtil.serialize(new StreamingMarkupBuilder().bind {
//		mkp.yield body
//	})
}
*/

// remove TEI
/*outdir.deleteDir()
outdir.mkdir()
def errors = []
for (def file : srcdir.listFiles()) {
	if (file.isDirectory()) continue;
	//new EncodingConverter(file, "Windows-1252", "UTF-8")
	File outfile = new File(outdir, file.getName());
	outfile.withWriter("UTF-8") { writer ->
		file.eachLine("UTF-8") { line ->
			if (line.trim() == "<TEI>") {
			} else if (line.trim() == "</TEI>") {
				writer.println("</book>")
			} else if (line.trim() == "<book lang=\"fr\"/>") {
				writer.println("<book lang=\"fr\">")
			} else {
				writer.println(line)
			}
		}
	}
	try {
		ValidateXml.testAndThrow(outfile);
	} catch (XMLStreamException e) {
		println file.getName() + " : "+ e.getMessage()
		errors << file
		if (e.getMessage().contains('Message: The element type "TEI" must be terminated by the matching end-tag "</TEI>"')) {
			println "Delete line : "+e.location.lineNumber
		}
		println ""
	}
}
*/
println "done"
//if (errors.size() > 0)
//	println ""+errors.size()+" errors : $errors"
//String content = file.getText("Windows-1252")
//println content
202 | 881 | mdecorde | //println content |