Révision 881
tmp/org.txm.specificities.core/.classpath (revision 881) | ||
---|---|---|
1 | 1 |
<?xml version="1.0" encoding="UTF-8"?> |
2 | 2 |
<classpath> |
3 | 3 |
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/> |
4 |
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/> |
|
4 |
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"> |
|
5 |
<accessrules> |
|
6 |
<accessrule kind="accessible" pattern="**"/> |
|
7 |
</accessrules> |
|
8 |
</classpathentry> |
|
5 | 9 |
<classpathentry kind="src" path="src"/> |
6 | 10 |
<classpathentry kind="output" path="bin"/> |
7 | 11 |
</classpath> |
tmp/org.txm.specificities.core/META-INF/MANIFEST.MF (revision 881) | ||
---|---|---|
1 | 1 |
Manifest-Version: 1.0 |
2 |
Require-Bundle: org.txm.utils;bundle-version="1.0.0";visibility:=reexp |
|
3 |
ort,org.eclipse.osgi;bundle-version="3.10.2";visibility:=reexport,org |
|
4 |
.eclipse.core.runtime;bundle-version="3.10.0";visibility:=reexport,or |
|
5 |
g.txm.searchengine.cqp.core;bundle-version="1.1.0";visibility:=reexpo |
|
6 |
rt,org.txm.statsengine.r.core;visibility:=reexport,org.txm.lexicaltab |
|
7 |
le.core;bundle-version="1.0.0";visibility:=reexport,org.txm.statsengi |
|
8 |
ne.core;bundle-version="1.0.0";visibility:=reexport,org.txm.core;bund |
|
9 |
le-version="0.7.0";visibility:=reexport,org.txm.progression.core;bund |
|
10 |
le-version="1.0.0";visibility:=reexport,org.txm.chartsengine.core;bun |
|
11 |
dle-version="1.0.0";visibility:=reexport,org.txm.chartsengine.jfreech |
|
12 |
art.core;bundle-version="1.0.0";visibility:=reexport,org.txm.chartsen |
|
13 |
gine.r.core;bundle-version="1.0.0";visibility:=reexport |
|
2 |
Require-Bundle: org.txm.lexicaltable.core;bundle-version="1.0.0";visibility:=reexport, |
|
3 |
org.txm.progression.core;bundle-version="1.0.0";visibility:=reexport |
|
14 | 4 |
Export-Package: org.txm.functions.contrasts, |
15 | 5 |
org.txm.specificities.core.chartsengine.jfreechart, |
16 | 6 |
org.txm.specificities.core.chartsengine.r, |
tmp/org.txm.specificities.feature/feature.xml (revision 881) | ||
---|---|---|
17 | 17 |
</license> |
18 | 18 |
|
19 | 19 |
<requires> |
20 |
<import plugin="org.txm.utils" version="1.0.0" match="greaterOrEqual"/> |
|
21 |
<import plugin="org.eclipse.osgi" version="3.10.2" match="greaterOrEqual"/> |
|
22 |
<import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/> |
|
23 |
<import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/> |
|
24 |
<import plugin="org.txm.statsengine.r.core"/> |
|
25 | 20 |
<import plugin="org.txm.lexicaltable.core" version="1.0.0" match="greaterOrEqual"/> |
26 |
<import plugin="org.txm.statsengine.core" version="1.0.0" match="greaterOrEqual"/> |
|
27 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
28 | 21 |
<import plugin="org.txm.progression.core" version="1.0.0" match="greaterOrEqual"/> |
29 |
<import plugin="org.txm.chartsengine.core" version="1.0.0" match="greaterOrEqual"/> |
|
30 |
<import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/> |
|
31 |
<import plugin="org.txm.chartsengine.r.core" version="1.0.0" match="greaterOrEqual"/> |
|
32 |
<import plugin="org.eclipse.ui" version="3.106.1" match="greaterOrEqual"/> |
|
33 |
<import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/> |
|
34 |
<import plugin="org.txm.statsengine.r.core" version="1.0.0" match="greaterOrEqual"/> |
|
35 |
<import plugin="org.txm.statsengine.r.rcp"/> |
|
36 | 22 |
<import plugin="org.txm.chartsengine.rcp"/> |
37 |
<import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/> |
|
38 |
<import plugin="org.eclipse.core.expressions" version="3.4.600" match="greaterOrEqual"/> |
|
39 | 23 |
<import plugin="org.txm.lexicaltable.rcp"/> |
40 | 24 |
</requires> |
41 | 25 |
|
tmp/org.txm.core/.settings/org.eclipse.jdt.groovy.core.prefs (revision 881) | ||
---|---|---|
1 | 1 |
eclipse.preferences.version=1 |
2 |
groovy.compiler.level=23
|
|
2 |
groovy.compiler.level=-1
|
|
3 | 3 |
groovy.script.filters=scripts/**/*.groovy,y,src/main/resources/**/*.groovy,y,src/test/resources/**/*.groovy,y |
tmp/org.txm.core/src/java/org/txm/importer/NiceToXML.groovy (revision 881) | ||
---|---|---|
1 |
package org.txm.importer |
|
2 |
|
|
3 |
import javax.xml.stream.XMLStreamException |
|
4 |
import org.apache.tools.ant.types.resources.selectors.InstanceOf; |
|
5 |
import groovy.xml.* |
|
6 |
|
|
7 |
// Working directories of the "temoignages" corpus, all located under one root folder.
// Several of them are only referenced by the disabled experiments kept further down in this script.
def root = new File("/home/mdecorde/xml/temoignagesnice/corpus Matrice - fichiers xmlisés/")
File srcdir = new File(root, "orig");
File outdir = new File(root, "tmp");
File okdir = new File(root, "ok");
File ok2dir = new File(root, "ok2");
File temoignagedir = new File(root, "temoignages");

// start from an empty result directory
ok2dir.deleteDir()
ok2dir.mkdir()

// Rename each chapter <title> element to <head> in every XML file of "tmp"
// and write the modified document, under the same file name, into "ok2".
for (def xmlFile : outdir.listFiles()) {
	if (xmlFile.getName().endsWith(".xml")) {
		def doc = new XmlParser().parse(xmlFile);

		doc.body.chapter.title.each { titleNode ->
			println titleNode
			titleNode.name = "head"
		}

		File target = new File(ok2dir, xmlFile.getName())
		target.withWriter("UTF-8") { out ->
			def printer = new XmlNodePrinter(new PrintWriter(out))
			printer.print(doc)
		}
	}
}
|
30 |
|
|
31 |
/* |
|
32 |
// ADD chapter@title |
|
33 |
for (def file : outdir.listFiles()) { |
|
34 |
if (!file.getName().endsWith(".xml")) continue; |
|
35 |
def doc = new XmlParser().parse(file); |
|
36 |
|
|
37 |
for (def chapter : doc.body.chapter) { |
|
38 |
for (def title : chapter.title) { |
|
39 |
chapter.@title = title.text() |
|
40 |
break; |
|
41 |
} |
|
42 |
} |
|
43 |
|
|
44 |
new File(ok2dir, file.getName()).withWriter("UTF-8") { writer -> |
|
45 |
new XmlNodePrinter(new PrintWriter(writer)).print(doc) |
|
46 |
} |
|
47 |
} |
|
48 |
*/ |
|
49 |
//FIX figure and caption inclusions |
|
50 |
/* |
|
51 |
for (def file : outdir.listFiles()) { |
|
52 |
if (!file.getName().endsWith(".xml")) continue; |
|
53 |
def doc = new XmlParser().parse(file); |
|
54 |
for (def note : doc.body."**".figure) { |
|
55 |
note.name = "note" |
|
56 |
//println "fig : $note" |
|
57 |
//if ("Image :" == note.text()) { |
|
58 |
def children = note.parent().children() |
|
59 |
int i = children.indexOf(note) |
|
60 |
//println i + " < "+children.size() |
|
61 |
def nextChild = children[i+1] |
|
62 |
if (nextChild != null && nextChild.name().toString() == "caption") { |
|
63 |
println nextChild |
|
64 |
|
|
65 |
note.value = "Images : "+note.text()// + " "+nextChild.text() |
|
66 |
//println note |
|
67 |
|
|
68 |
children.remove(i+1) |
|
69 |
note.append(nextChild) |
|
70 |
} |
|
71 |
//} |
|
72 |
} |
|
73 |
|
|
74 |
new File(ok2dir, file.getName()).withWriter("UTF-8") { writer -> |
|
75 |
new XmlNodePrinter(new PrintWriter(writer)).print(doc) |
|
76 |
} |
|
77 |
} |
|
78 |
*/ |
|
79 |
/* |
|
80 |
// DOCBOOK -> DOCBOOK TEXT ONLY |
|
81 |
for (def file : outdir.listFiles()) { |
|
82 |
def doc = new XmlParser().parse(file); |
|
83 |
|
|
84 |
def body = null |
|
85 |
def bookinfo = null |
|
86 |
def preface = null |
|
87 |
for (def e : doc.body) body = e |
|
88 |
|
|
89 |
for (def e : doc.bookinfo) { |
|
90 |
doc.remove(e) |
|
91 |
} |
|
92 |
for (def e : doc.preface) { |
|
93 |
doc.remove(e) |
|
94 |
} |
|
95 |
for (def e : doc.appendix) { |
|
96 |
doc.remove(e) |
|
97 |
} |
|
98 |
for (def e : doc.chapter) { |
|
99 |
doc.remove(e) |
|
100 |
} |
|
101 |
|
|
102 |
if (body == null) { |
|
103 |
println "error text: "+file |
|
104 |
continue |
|
105 |
} |
|
106 |
|
|
107 |
new File(ok2dir, file.getName()).withWriter("UTF-8") { writer -> |
|
108 |
new XmlNodePrinter(new PrintWriter(writer)).print(doc) |
|
109 |
} |
|
110 |
// writer.print XmlUtil.serialize(new StreamingMarkupBuilder().bind { |
|
111 |
// mkp.yield body |
|
112 |
// }) |
|
113 |
} |
|
114 |
*/ |
|
115 |
//DOCBOOK to TEI |
|
116 |
/* |
|
117 |
for (def file : outdir.listFiles()) { |
|
118 |
def doc = new XmlParser().parse(file); |
|
119 |
def body = null |
|
120 |
//def bookinfo = null |
|
121 |
for (def e : doc.body) body = e |
|
122 |
//for (def e : doc.bookinfo) bookinfo = e |
|
123 |
//println body.getClass() |
|
124 |
if (body == null) { |
|
125 |
println "error text: "+file |
|
126 |
continue |
|
127 |
} |
|
128 |
// bookinfo.name = "teiHeader" |
|
129 |
body.name = "text" |
|
130 |
def teins = new groovy.xml.Namespace("http://www.tei-c.org/ns/1.0",'tei') |
|
131 |
// for (def node : body."**") { |
|
132 |
// if (node instanceof String) continue |
|
133 |
// def name = node.name() |
|
134 |
// if (name instanceof String) |
|
135 |
// node.name = teins.get(name) |
|
136 |
// else |
|
137 |
// node.name = teins.get(name.getLocalPart()) |
|
138 |
// } |
|
139 |
for (def figure : body."**".figure) { |
|
140 |
figure.name = "note" |
|
141 |
figure.value = "Image : " + figure.caption.text() |
|
142 |
} |
|
143 |
for (def chapter : body."**".chapter) { |
|
144 |
chapter.name = "div" |
|
145 |
chapter.@type = "chapter" |
|
146 |
} |
|
147 |
for (def caption : body."**".title) { |
|
148 |
caption.name = "head" |
|
149 |
} |
|
150 |
for (def para : body."**".para) { |
|
151 |
para.name = "p" |
|
152 |
} |
|
153 |
def newdoc = new Node(null, "TEI"); |
|
154 |
newdoc.@xmlns="http://www.tei-c.org/ns/1.0"; |
|
155 |
newdoc.append(new Node(null, "teiHeader")) |
|
156 |
newdoc.append(body) |
|
157 |
new File(okdir, file.getName()).withWriter("UTF-8") { writer -> |
|
158 |
new XmlNodePrinter(new PrintWriter(writer)).print(newdoc) |
|
159 |
} |
|
160 |
// writer.print XmlUtil.serialize(new StreamingMarkupBuilder().bind { |
|
161 |
// mkp.yield body |
|
162 |
// }) |
|
163 |
} |
|
164 |
*/ |
|
165 |
|
|
166 |
// remove TEI |
|
167 |
/*outdir.deleteDir() |
|
168 |
outdir.mkdir() |
|
169 |
def errors = [] |
|
170 |
for (def file : srcdir.listFiles()) { |
|
171 |
if (file.isDirectory()) continue; |
|
172 |
//new EncodingConverter(file, "Windows-1252", "UTF-8") |
|
173 |
File outfile = new File(outdir, file.getName()); |
|
174 |
outfile.withWriter("UTF-8") { writer -> |
|
175 |
file.eachLine("UTF-8") { line -> |
|
176 |
if (line.trim() == "<TEI>") { |
|
177 |
} else if (line.trim() == "</TEI>") { |
|
178 |
writer.println("</book>") |
|
179 |
} else if (line.trim() == "<book lang=\"fr\"/>") { |
|
180 |
writer.println("<book lang=\"fr\">") |
|
181 |
} else { |
|
182 |
writer.println(line) |
|
183 |
} |
|
184 |
} |
|
185 |
} |
|
186 |
try { |
|
187 |
ValidateXml.testAndThrow(outfile); |
|
188 |
} catch (XMLStreamException e) { |
|
189 |
println file.getName() + " : "+ e.getMessage() |
|
190 |
errors << file |
|
191 |
if (e.getMessage().contains('Message: The element type "TEI" must be terminated by the matching end-tag "</TEI>"')) { |
|
192 |
println "Delete line : "+e.location.lineNumber |
|
193 |
} |
|
194 |
println "" |
|
195 |
} |
|
196 |
} |
|
197 |
*/ |
|
198 |
println "done" |
|
199 |
//if (errors.size() > 0) |
|
200 |
// println ""+errors.size()+" errors : $errors" |
|
201 |
//String content = file.getText("Windows-1252") |
|
202 |
//println content |
tmp/org.txm.core/src/java/org/txm/importer/WExtractWithMode.groovy (revision 881) | ||
---|---|---|
1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate: 2011-10-19 17:50:26 +0200 (mer., 19 oct. 2011) $ |
|
25 |
// $LastChangedRevision: 2038 $ |
|
26 |
// $LastChangedBy: alavrentev $ |
|
27 |
// |
|
28 |
package org.txm.importer |
|
29 |
|
|
30 |
import javax.xml.parsers.DocumentBuilder; |
|
31 |
import javax.xml.parsers.DocumentBuilderFactory; |
|
32 |
import javax.xml.parsers.ParserConfigurationException; |
|
33 |
import javax.xml.transform.OutputKeys; |
|
34 |
import javax.xml.transform.Result; |
|
35 |
import javax.xml.transform.Source; |
|
36 |
import javax.xml.transform.Transformer; |
|
37 |
import javax.xml.transform.TransformerFactory; |
|
38 |
import javax.xml.transform.dom.DOMSource; |
|
39 |
import javax.xml.transform.stream.StreamResult; |
|
40 |
|
|
41 |
import org.w3c.dom.Document; |
|
42 |
import org.w3c.dom.Element; |
|
43 |
import org.w3c.dom.NodeList; |
|
44 |
import org.xml.sax.SAXException; |
|
45 |
|
|
46 |
import javax.xml.stream.*; |
|
47 |
import java.io.File; |
|
48 |
import java.net.URL; |
|
49 |
|
|
50 |
// TODO: Auto-generated Javadoc |
|
51 |
/** |
|
52 |
* Extract w tags from a tei file |
|
53 |
* not finished. |
|
54 |
* |
|
55 |
* @author mdecorde |
|
56 |
*/ |
|
57 |
class WExtractWithMode |
|
58 |
{ |
|
59 |
|
|
60 |
/** |
|
61 |
* Process. |
|
62 |
* |
|
63 |
* @param infile the infile |
|
64 |
* @param outfile the outfile |
|
65 |
* @param max the max |
|
66 |
* @return the java.lang. object |
|
67 |
*/ |
|
68 |
public process(File infile, File outfile, String modemax)
{
	// Copies infile to outfile, keeping every element but only the <w> elements whose
	// rank falls in the ranges selected by modemax ("<mode>/<max>", e.g. "3/45000").
	// Modes: "1a" one range from the start, "1m" one range around the middle,
	// "1z" one range at the end, "2" start + end, "3" start + middle + end.
	println "Process "+infile.getName()+", keep $modemax words"
	int count = this.countW(infile);

	// A malformed modemax used to be swallowed silently (max stayed 0): report it and stop.
	String[] modeAndMax = (modemax == null) ? new String[0] : modemax.split("/")
	if (modeAndMax.length < 2)
	{
		println "modemax must be formatted as <mode>/<max>, got: "+modemax
		return
	}
	String mode = modeAndMax[0]
	int max = 0
	try {
		max = Integer.parseInt(modeAndMax[1])
	} catch (NumberFormatException e) {
		println "max must be an integer, got: "+modeAndMax[1]
		return
	}

	if (count < max)
	{
		println "can't extract $max words, the file "+infile.getName()+" contains only $count words"
		return;
	}

	// size of each extracted range
	int part = 0;
	if (mode == "3")
	{
		part = max/3
	}
	else if (mode == "2")
	{
		part = max/2
	}
	else if (mode == "1a" || mode == "1m" || mode == "1z")
	{
		part = max
	}
	else
	{
		println "mode must be 1a, 1m, 1z, 2 or 3"
		return
	}

	// first range: from the beginning (every mode except 1m and 1z)
	int from1 = 0
	int to1 = 0
	if (mode != "1m" && mode != "1z")
	{
		to1 = part
	}
	// second range: centered on the middle of the text (modes 3 and 1m)
	int from2 = 0
	int to2 = 0
	if (mode == "3" || mode == "1m")
	{
		from2 = (count/2) - (part/2);
		to2 =(count/2) + (part/2);
	}
	// third range: at the end (every mode except 1a and 1m)
	int from3 = 0
	int to3 = 0
	if (mode != "1a" && mode != "1m")
	{
		from3 = count -part;
		to3= count-1;
	}

	println " count : "+count
	println " get from "+from1+" to "+to1
	println " get from "+from2+" to "+to2
	println " get from "+from3+" to "+to3

	// true when the word rank n belongs to one of the three ranges (bounds included)
	def isKept = { int n ->
		return (n >= from1 && n <= to1) ||
				(n >= from2 && n <= to2) ||
				(n >= from3 && n <= to3)
	}

	boolean isW = false;	// set at the first <w> and never reset: later text is filtered by printW
	boolean isText = false;	// true once the <text> element has been opened
	boolean printW = true;
	int wcount = 0;
	String localname;
	String prefix;

	InputStream inputData = infile.toURI().toURL().openStream();
	try {
		FileOutputStream output = new FileOutputStream(outfile)
		try {
			XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
			XMLStreamWriter writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8");

			// writes the start tag of the current element, keeping its prefix if any
			def writeStart = { String pfx, String name ->
				if (pfx != null && pfx.length() > 0)
					writer.writeStartElement(pfx+":"+name);
				else
					writer.writeStartElement(name);
			}

			// copies the attributes of the current element; a null prefix is handled
			// like an empty one so that no "null:" prefix is written
			def copyAttributes = { ->
				for (int i = 0 ; i < parser.getAttributeCount() ; i++)
				{
					String attPrefix = parser.getAttributePrefix(i)
					if (attPrefix != null && attPrefix.length() > 0)
						writer.writeAttribute(attPrefix+":"+parser.getAttributeLocalName(i), parser.getAttributeValue(i));
					else
						writer.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
				}
			}

			writer.writeStartDocument("utf-8", "1.0");

			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
			{
				// outside <text> everything is copied
				printW = isText ? isKept(wcount) : true;

				switch (event)
				{
					case XMLStreamConstants.START_ELEMENT:
						localname = parser.getLocalName();
						prefix = parser.getPrefix();

						if (localname == "text")
							isText = true;

						if (localname == "w")
						{
							isW = true;
							wcount++;
							// decide again with the rank of this word
							printW = isText ? isKept(wcount) : true;
							if (printW)
							{
								writeStart(prefix, localname)
								copyAttributes()
							}
						}
						else
						{
							writeStart(prefix, localname)

							if (localname == "teiHeader")
							{
								writer.writeAttribute("xmlns:me", "http://www.menota.org/ns/1.0");
								writer.writeAttribute("xmlns:bfm", "http://bfm.ens-lsh.fr/ns/1.0");
							}

							if (localname == "TEI")
							{
								writer.writeAttribute("xmlns","http://www.tei-c.org/ns/1.0");
							}

							copyAttributes()
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						localname = parser.getLocalName()
						// end tags of skipped words are skipped too
						if (localname != "w" || printW)
						{
							writer.writeEndElement();
							writer.writeCharacters("\n");
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						if (!isW || printW)
						{
							writer.writeCharacters(parser.getText().trim());
						}
						break;
				}
			}
			writer.flush();
			writer.close();
			parser.close();
		} finally {
			// XMLStreamWriter.close() does not close the underlying stream
			output.close()
		}
	} finally {
		// XMLStreamReader.close() does not close the underlying stream
		inputData.close();
	}
}
|
299 |
|
|
300 |
/** |
|
301 |
* Count w. |
|
302 |
* |
|
303 |
* @param infile the infile |
|
304 |
* @return the int |
|
305 |
*/ |
|
306 |
public int countW(File infile)
{
	// Counts the start tags whose local name is "w" in the whole file.
	// The stream is closed in a finally block so that it does not leak when the
	// XML is malformed and the parser throws.
	InputStream inputData = infile.toURI().toURL().openStream();
	try {
		XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);

		int count = 0;
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
		{
			if (event == XMLStreamConstants.START_ELEMENT && parser.getLocalName() == "w")
				count++;
		}
		parser.close();
		return count;
	} finally {
		// XMLStreamReader.close() does not close the underlying stream
		inputData.close()
	}
}
|
325 |
|
|
326 |
/** |
|
327 |
* The main method. |
|
328 |
* |
|
329 |
* @param args the arguments |
|
330 |
*/ |
|
331 |
public static void main(String[] args)
{
	// Runs the extraction on the files of ~/xml/extract listed in ~/xml/extract/maxfilemode
	// and writes the results in ~/xml/extract/results.
	String userDir = System.getProperty("user.home");

	File directory = new File(userDir+"/xml/extract/");
	File outdir = new File(userDir+"/xml/extract/","results");
	outdir.mkdir();

	File maxfilemode = new File(userDir+"/xml/extract/maxfilemode");
	/*
	 * maxfilemode format, one file per line, 3 columns separated by a tabulation:
	 *
	 * filename1.xml	3	45000
	 * filename2.xml	1a	15000
	 * filename3.xml	1m	15000
	 * filename4.xml	1z	15000
	 * filename5.xml	2	22500
	 */
	if (!maxfilemode.isFile())
	{
		println "file not found: "+maxfilemode
		return
	}

	// file name -> "mode/max"; keys are file names, hence String and not File
	Map<String, String> maxperfile = new HashMap<String, String>();
	maxfilemode.eachLine { String line ->
		String[] split = line.split("\t");
		if (split.length == 3) // other lines are ignored
		{
			maxperfile.put(split[0], split[1]+"/"+split[2]);
		}
	}
	println maxperfile;

	File[] files = directory.listFiles();
	if (files == null)
	{
		println "directory not found: "+directory
		return
	}
	for (File infile : files)
	{
		String modemax = maxperfile.get(infile.getName());
		if (modemax != null)
		{
			File outfile = new File(outdir, infile.getName());
			new WExtractWithMode().process(infile, outfile, modemax)
		}
	}
}
|
376 |
} |
tmp/org.txm.core/src/java/org/txm/importer/HTML2XHTML.groovy (revision 881) | ||
---|---|---|
1 |
package org.txm.importer |
|
2 |
|
|
3 |
import org.txm.utils.CharsetDetector; |
|
4 |
import org.txm.importer.ValidateXml; |
|
5 |
|
|
6 |
// Converts one HTML file into well-formed XML using regular expressions only,
// then validates the result.
File infile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/odt.html")
File outfile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/odt.xml")

// the detected encoding is used both to read the input and to write the output
String encoding = new CharsetDetector(infile).getEncoding();
println "Encoding: $encoding"
String text = infile.getText(encoding);

// Quotes the unquoted attribute values (TRUC=sdf234) of a tag, then lower-cases the
// whole tag. Crude: attribute values are lower-cased as well.
def normalizeTag = { whole, tag ->
	String quoted = tag.replaceAll("([A-Z]+=)([^\" >]+)([ >])",'$1"$2"$3' )
	return quoted.toLowerCase()
}
text = text.replaceAll(/(<[^!][^>]*>)/, normalizeTag)

// lower-case the <.> tags
def lowerCase = { whole, tag -> tag.toLowerCase() }
text = text.replaceAll(/(<.>)/, lowerCase)

// resolve entities
// NOTE(review): as shown this replaces a space by a space; presumably an entity
// was lost from the pattern and/or the replacement - confirm against the repository.
text = text.replaceAll(/ /," ")

// self-close the tags that have no end tag
text = text.replaceAll(/<br>/,"<br/>")
		.replaceAll(/<meta([^>]*)>/,'<meta$1/>')
		.replaceAll(/<img([^>]*)>/,'<img$1/>')

// remove the doctype declaration
text = text.replace('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">', '')

// write the result, then report the file if it is not valid
outfile.withWriter(encoding) { out -> out.write(text) }
boolean valid = ValidateXml.test(outfile)
if (!valid) {
	println "FILE: $outfile"
}
tmp/org.txm.core/src/java/org/txm/importer/XMLText2TXTCSV.groovy (revision 881) | ||
---|---|---|
1 |
package org.txm.importer |
|
2 |
|
|
3 |
// Splits each XML file of "split_xml" into a plain text file plus one line of
// metadata.csv built from the attributes of its <text id=... loc=... annee=...> tag.
File dir = new File("/home/mdecorde/xml/voeux/split_xml")
File outdir = new File("/home/mdecorde/xml/voeux/split_txtcsv")
println "1) xml -> txt + write metadata.csv"
outdir.deleteDir()
outdir.mkdir()
File metadatafile = new File(outdir, "metadata.csv")
StringBuilder csvString = new StringBuilder()

def files = dir.listFiles()
if (files == null) { // listFiles() returns null when dir is missing or not a directory
	println "source directory not found: "+dir
	return
}
files.sort()
for(File f : files)
{
	File outfile = new File(outdir, f.getName()+".txt");
	String text = f.getText("UTF-8");
	String texttag = text.find("<text id.*>")

	// the text file only contains the content of the <text> element
	text = text.replaceAll("<text.*>", "").replace("</text>", "");
	outfile.withWriter("UTF-8"){writer -> writer.write(text) }

	if (texttag == null) { // find() returns null when nothing matches
		println "no <text id=...> tag in "+f+": no metadata line written"
		continue
	}
	// <text id="a" loc="b" annee="c"> becomes "a","b","c"
	csvString.append(texttag.replace("<text id=","").replace(" loc=", ",").replace(" annee=", ",").replace("\">", "\"")).append("\n")
}

println "2) write metadata.csv"
metadatafile.withWriter("UTF-8"){csvwriter ->
	csvwriter.write("\"id\",\"loc\",\"annee\"\n");
	csvwriter.write(csvString.toString())}

println "3) rename Voeux_*"
// keeps characters 6 to 9 of the file name as the new name
outdir.eachFileMatch(~/Voeux_.*/) {file-> file.renameTo(new File(outdir, file.getName().substring(6, 10)+".txt")) }
tmp/org.txm.core/src/java/org/txm/importer/xmltxm/BuildTTSrc.groovy (revision 881) | ||
---|---|---|
1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $ |
|
25 |
// $LastChangedRevision: 3400 $ |
|
26 |
// $LastChangedBy: mdecorde $ |
|
27 |
// |
|
28 |
package org.txm.importer.xmltxm |
|
29 |
|
|
30 |
import java.text.DateFormat; |
|
31 |
import java.util.Date; |
|
32 |
import java.util.ArrayList; |
|
33 |
import javax.xml.stream.*; |
|
34 |
import java.net.URL; |
|
35 |
|
|
36 |
import org.txm.Toolbox; |
|
37 |
import org.txm.importer.filters.*; |
|
38 |
|
|
39 |
// TODO: Auto-generated Javadoc |
|
40 |
/** |
|
41 |
* The Class BuildTTSrc. |
|
42 |
* |
|
43 |
* @author mdecorde |
|
44 |
* build the TT source for tigerSearch |
|
45 |
*/ |
|
46 |
|
|
47 |
public class BuildTTSrc { |
|
48 |
|
|
49 |
/** The url. */ |
|
50 |
private def url; |
|
51 |
|
|
52 |
/** The input data. */ |
|
53 |
private def inputData; |
|
54 |
|
|
55 |
/** The factory. */ |
|
56 |
private def factory; |
|
57 |
|
|
58 |
/** The parser. */ |
|
59 |
private XMLStreamReader parser; |
|
60 |
|
|
61 |
/** The output. */ |
|
62 |
private BufferedWriter output; |
|
63 |
|
|
64 |
/** |
|
65 |
* Instantiates a new builds the tt src. |
|
66 |
* uses XML-TXM V2 |
|
67 |
* |
|
68 |
* @param url the url of the file to process |
|
69 |
*/ |
|
70 |
public BuildTTSrc(URL url) {
	// Opens the url and creates the StAX reader used by process().
	// Errors are printed and not rethrown; the parser field is then left unset.
	try {
		this.url = url;
		inputData = url.openStream();
		factory = XMLInputFactory.newInstance();
		parser = factory.createXMLStreamReader(inputData);

	} catch (XMLStreamException ex) {
		System.out.println(ex);
		// the stream was opened but cannot be parsed: do not leave it open
		try {
			inputData.close();
		} catch (IOException ignored) {
			// nothing more can be done for a stream that is being discarded
		}
	} catch (IOException ex) {
		// say which resource failed and why
		System.out.println("IOException while parsing " + url + " : " + ex);
	}
}
|
83 |
|
|
84 |
/** |
|
85 |
* Creates the output. |
|
86 |
* |
|
87 |
* @param outfile the outfile |
|
88 |
* @return true, if successful |
|
89 |
*/ |
|
90 |
private boolean createOutput(File outfile) {
	// Opens outfile as the UTF-8 writer stored in the output field.
	// Any failure is printed and reported through the returned value.
	boolean opened = false;
	try {
		def bytes = new FileOutputStream(outfile);
		output = new BufferedWriter(new OutputStreamWriter(bytes, "UTF-8"));
		opened = true;
	} catch (Exception e) {
		System.out.println(e.getLocalizedMessage());
	}
	return opened;
}
|
101 |
|
|
102 |
/** |
|
103 |
* Process. |
|
104 |
* |
|
105 |
* @param outfile the outfile |
|
106 |
* @param formtype, if multiple form, use this param to choose the correct one, if null takes the first form found |
|
107 |
* @return true, if successful |
|
108 |
*/ |
|
109 |
public boolean process(File outfile, String formtype) {
	// Writes one line per <form> element (its trimmed text, or __EMPTY__), plus
	// <s> and </s> lines for the <s> elements, into outfile.
	// formtype is currently ignored: the code that selected a form by type is
	// disabled, so every <form> element is written.
	if (!createOutput(outfile))
		return false;

	boolean flagform = false; // true while inside a form element
	String form = ""; // the content of the current form element
	String localname = "";
	StringBuilder buffer = new StringBuilder();
	try {
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			switch (event) {
				case XMLStreamConstants.START_ELEMENT:
					localname = parser.getLocalName();
					switch (localname) {
						case "form":
							flagform = true;
							form = "";
							break;
						case "s": // TreeTagger can use s tags
							buffer.append("<s>\n");
							break;
					}
					break;

				case XMLStreamConstants.END_ELEMENT:
					localname = parser.getLocalName();
					switch (localname) {
						case "form":
							flagform = false;
							form = form.trim()
							if (form.length() == 0) buffer.append("__EMPTY__\n");
							// NOTE(review): replace("<", "<") changes nothing as shown;
							// presumably an entity was meant on one side - confirm.
							else buffer.append(form.replace("\n", "").replace("<", "<")+ "\n");
							break;
						case "s":
							buffer.append("</s>\n");
							break;
					}
					break;

				case XMLStreamConstants.CHARACTERS:
					if (flagform) {
						if (parser.getText().length() > 0)
							form += parser.getText();
					}
					break;
			}
		}

		String str = buffer.toString()
		if ("false".equals(Toolbox.getPreference(Toolbox.TREETAGGER_APOSTROPHE))) {
			str = str.replace("’", "'").replace("‘", "'");
		}
		output.write(str)
		output.close(); // kept inside the try: a failed flush must return false
		parser.close();
		inputData.close();
	} catch (Exception ex) {
		System.out.println(ex);
		return false;
	} finally {
		// Only matters on the failure path: closing an already closed stream has no effect.
		try { output?.close() } catch (IOException ignored) { }
		try { inputData?.close() } catch (IOException ignored) { }
	}

	return true;
}
|
187 |
|
|
188 |
/** |
|
189 |
* The main method. |
|
190 |
* |
|
191 |
* @param args the arguments |
|
192 |
*/ |
|
193 |
public static void main(String[] args) {
	// Manual test entry point: builds the TreeTagger source file of one sample text.
	// "~" is a shell shortcut that java.io.File does not expand, so the home
	// directory is resolved explicitly.
	File rootDir = new File(System.getProperty("user.home"), "xml/rgaqcj");

	// the tags that must stay milestones
	ArrayList<String> milestones = ["tagUsage", "pb", "lb", "catRef"];

	File srcfile = new File(new File(rootDir, "anainline"), "roland-p5.xml");
	File resultfile = new File(new File(rootDir, "ttsrc"), "roland-p5.tt");
	println("build ttsrc file : " + srcfile + " to : " + resultfile);

	// File.toURL() is deprecated and does not escape special characters
	def builder = new BuildTTSrc(srcfile.toURI().toURL(), milestones);
	// process(File, String): null is passed explicitly as the form type
	if (!builder.process(resultfile, null)) {
		println("build ttsrc file failed : " + srcfile);
	}
}
|
216 |
|
|
217 |
} |
tmp/org.txm.core/src/java/org/txm/importer/xmltxm/Xml2Ana.groovy (revision 881) | ||
---|---|---|
1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate: 2017-04-19 16:23:38 +0200 (mer. 19 avril 2017) $ |
|
25 |
// $LastChangedRevision: 3430 $ |
|
26 |
// $LastChangedBy: mdecorde $ |
|
27 |
// |
|
28 |
package org.txm.importer.xmltxm |
|
29 |
|
|
30 |
import org.txm.importer.HasElement; |
|
31 |
import org.txm.importer.StaxIdentityParser; |
|
32 |
|
|
33 |
import java.text.DateFormat; |
|
34 |
import java.util.ArrayList; |
|
35 |
import java.util.Date; |
|
36 |
import java.util.HashMap; |
|
37 |
import java.util.Locale; |
|
38 |
|
|
39 |
import javax.xml.stream.*; |
|
40 |
|
|
41 |
import java.net.URL; |
|
42 |
|
|
43 |
import org.txm.importer.filters.*; |
|
44 |
import org.txm.utils.AsciiUtils; |
|
45 |
|
|
46 |
/** |
|
47 |
* The Class Xml2Ana. |
|
48 |
* |
|
49 |
* @author mdecorde |
|
50 |
* transform : pre xml-tei file >> xml-tei-txm file |
|
51 |
* The pre xml-tei file must contain a minimal teiHeader with classDecl, encodingDesc and titleStmt |
|
52 |
* |
|
53 |
* you must specify the correspondence between word attributes and ana types & respStmt IDs |
|
54 |
* then the attributes of w tags will be transformed into interp tag |
|
55 |
*/ |
|
56 |
public class Xml2Ana extends StaxIdentityParser |
|
57 |
{ |
|
58 |
/** The dir. */ |
|
59 |
private def dir; |
|
60 |
|
|
61 |
/** The convert all attributes. */ |
|
62 |
private boolean convertAllAttributes = false; |
|
63 |
|
|
64 |
/** The corresp type. */ |
|
65 |
HashMap<String,String> correspType; |
|
66 |
|
|
67 |
/** The corresp ref. */ |
|
68 |
HashMap<String,String> correspRef; |
|
69 |
|
|
70 |
/** The check tags. */ |
|
71 |
HashMap<String,Boolean> checkTags = new HashMap<String,Boolean>(); |
|
72 |
|
|
73 |
/** The resp id. */ |
|
74 |
def respId = []; |
|
75 |
|
|
76 |
/** The applications. */ |
|
77 |
HashMap<String,File> applications; |
|
78 |
|
|
79 |
/** The taxonomies. */ |
|
80 |
HashMap<String,String[]> taxonomies; |
|
81 |
|
|
82 |
/** The resps. */ |
|
83 |
HashMap<String,String[]> resps; |
|
84 |
|
|
85 |
/** The items. */ |
|
86 |
HashMap<String,HashMap<String,String>> items; |
|
87 |
|
|
88 |
/** The XML headeradded. */ |
|
89 |
boolean XMLHeaderadded = false; |
|
90 |
String textname; |
|
91 |
String wtag = "w"; |
|
92 |
|
|
93 |
public static final String TEXT = "text" |
|
94 |
public static final String ID = "id" |
|
95 |
|
|
96 |
/** |
|
97 |
* Instantiates a new xml2 ana. |
|
98 |
* |
|
99 |
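* @param file the pre xml-tei source file; its name without extension is used as the text name |

100 |
* (there is no url or wordprefix parameter: the URL is derived from the file) |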
|
101 |
*/ |
|
102 |
public Xml2Ana(File file) {
	super(file.toURI().toURL());

	// the text name is the file name without its last extension
	// (a leading dot alone is not treated as an extension)
	String fileName = file.getName();
	int dot = fileName.lastIndexOf(".");
	textname = (dot > 0) ? fileName.substring(0, dot) : fileName;

	// header elements whose presence is tracked while parsing
	for (String tag : ["respStmt", "titleStmt", "appInfo"]) {
		checkTags.put(tag, false);
	}

	// result of the HasElement check for TEXT on the source file
	hasText = new HasElement(file, TEXT).process();
}
|
117 |
|
|
118 |
/** |
|
119 |
* Sets whether the word attributes that are neither mapped nor the id are converted to txm:ana elements. |
|
120 |
* |
|
121 |
* @param value the value |
|
122 |
* @return the java.lang. object |
|
123 |
*/ |
|
124 |
public setConvertAllAtrtibutes(boolean value) {
	// no declared return type: the method returns the value that was set
	this.convertAllAttributes = value;
	return value;
}
|
127 |
|
|
128 |
/** |
|
129 |
* Sets the local name of the element processed as a word (default "w"). |
|
130 |
* |
|
131 |
* @param wtag the local name of the word element |
|
132 |
* @return the java.lang. object |
|
133 |
*/ |
|
134 |
public setWordTag(String wtag) {
	// no declared return type: the method returns the name that was set
	this.wtag = wtag;
	return wtag;
}
|
137 |
|
|
138 |
int idcount = 0; |
|
139 |
boolean flagWord = false; |
|
140 |
int firstElement = 0; |
|
141 |
boolean teiElementAdded = false; |
|
142 |
boolean teiHeaderElementAdded = false; |
|
143 |
boolean hasText = false; |
|
144 |
boolean textElementAdded = false; |
|
145 |
def anabalises = []; |
|
146 |
protected void processStartElement()
{
	firstElement++; // number of start tags seen so far, this one included

	final String elementName = parser.getLocalName();
	if (this.checkTags.containsKey(elementName)) {
		this.checkTags.put(elementName, true); // remember that this tracked tag was met
	}

	if (elementName == wtag) {
		// ---- word element ----
		if (!hasText) {
			// no text element yet: open one before the first word
			writer.writeStartElement(TEXT);
			writer.writeAttribute(ID, textname);
			textElementAdded = true;
			hasText = true;
		}
		idcount++; // increment word counter
		anabalises.clear();

		writer.writeStartElement(elementName); // write w

		// copy the namespace declarations
		for (int n = 0 ; n < parser.getNamespaceCount() ; n++) {
			writer.writeNamespace(parser.getNamespacePrefix(n), parser.getNamespaceURI(n));
		}

		// transform the attributes
		for (int a = 0 ; a < parser.getAttributeCount() ; a++) {
			String attrName = parser.getAttributeLocalName(a);
			String attrValue = parser.getAttributeValue(a);

			if (correspType.containsKey(attrName)) {
				// mapped attribute: kept aside, written as txm:ana when the word ends
				anabalises.add(["#" + correspRef.get(attrName), "#" + correspType.get(attrName), attrValue]);
			} else if (attrName == ID) {
				// keep the id attribute, rewriting a "wXXX" id as "w_XXX"
				String wordid = attrValue;
				if (wordid.startsWith("w") && !wordid.startsWith("w_")) {
					wordid = "w_" + wordid.substring(1);
				}
				// remove characters not compatible with the id attribute value
				wordid = AsciiUtils.buildAttributeId(wordid);
				writer.writeAttribute(attrName, wordid);
			} else if (convertAllAttributes) {
				// unmapped attribute converted to an annotation without responsibility
				anabalises.add(["none", "#" + attrName, attrValue]);
			} else {
				// unmapped attribute copied as it was in the original word element
				writer.writeAttribute(attrName, attrValue);
			}
		}

		flagWord = true; // start to capture the form
		writer.writeStartElement(TXMNS, "form");

	} else if (elementName == "TEI") {
		// ---- TEI root: make sure the TEI and TXM namespaces are declared ----
		super.processStartElement();
		boolean teiNsDeclared = false;
		boolean txmNsDeclared = false;
		for (int n = 0 ; n < parser.getNamespaceCount() ; n++) {
			String uri = parser.getNamespaceURI(n);
			if (uri == TXMNS) {
				txmNsDeclared = true;
			} else if (uri == TEINS) {
				teiNsDeclared = true;
			}
		}
		if (!teiNsDeclared) {
			writer.writeDefaultNamespace(TEINS);
		}
		if (!txmNsDeclared) {
			writer.writeNamespace(TXM, TXMNS);
		}

	} else {
		// ---- any other element ----
		// NOTE(review): localname is not declared in this method; presumably it is
		// maintained by the parent class for the current element - confirm.
		if (TEXT.equals(localname)) {
			hasText = true;
		}

		if (firstElement == 1) { // test if first element is TEI
			if (localname != "TEI") { // "TEI" is missing
				teiElementAdded = true;
				addTEIElement();
			} else if (!hasText) {
				writer.writeStartElement(TEXT);
				writer.writeAttribute(ID, textname);
				textElementAdded = true;
				hasText = true;
			}
		}

		if (firstElement == 2 && !teiElementAdded) {
			if (localname != "teiHeader") { // teiHeader is missing
				writeTeiHeader();
				hasTeiHeader = true;
				teiHeaderElementAdded = true;
			}
		} else if (!hasText && (teiElementAdded || teiHeaderElementAdded)) {
			// a header was generated and no text element exists: open one
			writer.writeStartElement(TEXT);
			writer.writeAttribute(ID, textname);
			textElementAdded = true;
			hasText = true;
		}

		super.processStartElement();

		// a text element without a (non-empty) id receives the text name as id
		if (TEXT.equals(localname) && !parser.getAttributeValue(null, ID)) {
			writer.writeAttribute(ID, textname);
		}
	}
}
|
262 |
|
|
263 |
protected void after()
{
	// count the elements this class opened itself: text first, then TEI
	int pendingEndTags = 0;
	if (textElementAdded) {
		pendingEndTags++; // text
	}
	if (teiElementAdded) {
		pendingEndTags++; // TEI
	}
	while (pendingEndTags-- > 0) {
		writer.writeEndElement();
	}

	super.after(); // close writer, parser, etc
}
|
273 |
|
|
274 |
protected void addTEIElement()
{
	// open the missing TEI root element
	this.writer.writeStartElement("TEI");

	// declare the namespaces: TEI as default namespace, then the TXM and TEI prefixes
	this.writer.writeDefaultNamespace(TEINS);
	this.writer.writeNamespace(TXM, TXMNS);
	this.writer.writeNamespace(TEI, TEINS);

	// a generated root always gets a generated header
	this.writeTeiHeader();
}
|
282 |
|
|
283 |
protected void processCharacters()
{
	if (!flagWord) {
		// outside a word: default handling of the parent class
		super.processCharacters();
		return;
	}
	// inside a word: the text is trimmed to keep the form on one line
	String wordText = parser.getText().trim();
	writer.writeCharacters(wordText);
}
|
291 |
|
|
292 |
boolean hasClassDecl = false; |
|
293 |
boolean hasFileDesc = false; |
|
294 |
boolean hasEncodingDesc = false; |
|
295 |
boolean hasTeiHeader = false; |
|
296 |
boolean hasTEI = false; |
|
297 |
public static String ANA = "ana" |
|
298 |
public static String RESP = "resp" |
|
299 |
public static String TYPE = "type" |
|
300 |
protected void processEndElement()
{
	final String closingName = parser.getLocalName();

	if (closingName == wtag) {
		writer.writeEndElement(); // txm:form

		// one <txm:ana resp=ref type=corresptype>value</txm:ana> per collected annotation
		for (int k = 0 ; k < anabalises.size() ; k++) {
			def ana = anabalises[k];
			writer.writeStartElement(TXMNS, ANA);
			writer.writeAttribute(RESP, ana[0]);
			writer.writeAttribute(TYPE, ana[1]);
			writer.writeCharacters(ana[2]);
			writer.writeEndElement(); // txm:ana
		}

		flagWord = false; // stop capturing the form
	} else if (closingName == "fileDesc") {
		hasFileDesc = true;
		this.writeTXMResps(); // responsibilities are added just before fileDesc closes
	} else if (closingName == "classDecl") {
		hasClassDecl = true;
		this.writeTXMTaxonomies(); // taxonomies are added just before classDecl closes
	} else if (closingName == "encodingDesc") {
		hasEncodingDesc = true;
		writeContentOfEncodingDesc();
	} else if (closingName == "teiHeader") {
		hasTeiHeader = true;
		if (!hasEncodingDesc) {
			// the header had no encodingDesc: generate one before the header closes
			writer.writeStartElement("encodingDesc");
			writeContentOfEncodingDesc();
			writer.writeEndElement();
		}
	} else if (closingName == "TEI") {
		hasTEI = true;
		if (!hasTeiHeader) {
			// no header was met: generate one before the root closes
			writeTeiHeader();
		}
	}

	// in every case the parent class handles the end tag itself
	super.processEndElement();
}
|
350 |
|
|
351 |
protected void writeTeiHeader()
{
	writer.writeStartElement("teiHeader");

	// fileDesc: responsibilities, an empty title, then two empty placeholders
	writer.writeStartElement("fileDesc");
	this.writeTXMResps();
	writer.writeStartElement("titleStmt");
	writer.writeStartElement("title");
	writer.writeEndElement(); // title
	writer.writeEndElement(); // titleStmt
	for (String placeholder : ["publicationStmt", "sourceDesc"]) {
		writer.writeStartElement(placeholder);
		writer.writeEndElement();
	}
	writer.writeEndElement(); // fileDesc

	// encodingDesc: applications and, if needed, taxonomies
	writer.writeStartElement("encodingDesc");
	writeContentOfEncodingDesc();
	writer.writeEndElement(); // encodingDesc

	writer.writeEndElement(); // teiHeader
}
|
370 |
|
|
371 |
protected void writeContentOfEncodingDesc()
{
	// appInfo is always written
	writer.writeStartElement("appInfo");
	this.writeTXMApps();
	writer.writeEndElement(); // appInfo

	if (hasClassDecl) {
		// a classDecl element was already closed: nothing to add
		return;
	}

	// no classDecl met so far: generate one holding the taxonomies
	writer.writeStartElement("classDecl");
	this.writeTXMTaxonomies();
	writer.writeEndElement(); // classDecl
}
|
382 |
|
|
383 |
/** |
|
384 |
* Check resp. |
|
385 |
* |
|
386 |
* @return the string |
|
387 |
*/ |
|
388 |
public String checkResp()
{
	// Builds a report of the tracked header tags that were met while parsing.
	// Every key of checkTags is registered up front with the value false, so only
	// the entries switched to true are tags that were actually found.
	StringBuilder report = new StringBuilder("found tags : \n");
	for (Map.Entry<String, Boolean> tag : checkTags.entrySet()) {
		if (tag.getValue()) {
			report.append("\t").append(tag.getKey()).append("\n");
		}
	}
	return report.toString();
}
|
395 |
|
|
396 |
/** |
|
397 |
* Sets the correspondances. |
|
398 |
* |
|
399 |
* @param correspRef the corresp ref |
|
400 |
* @param correspType the corresp type |
|
401 |
*/ |
|
402 |
public void setCorrespondances(correspRef, correspType)
{
	// both maps are looked up by word attribute name:
	// correspType gives the ana type, correspRef gives the responsibility id
	this.correspType = correspType;
	this.correspRef = correspRef;
}
|
407 |
|
|
408 |
/** |
|
409 |
* Sets the header infos. |
|
410 |
* |
|
411 |
* @param respId the resp id |
|
412 |
* @param resps the resps |
|
413 |
* @param applications the applications |
|
414 |
* @param taxonomies the taxonomies |
|
415 |
* @param items the items |
|
416 |
*/ |
|
417 |
public void setHeaderInfos(respId,resps, applications, taxonomies, items)
{
	// ids of the responsibilities; resps, applications and taxonomies are read per id
	this.respId = respId;
	this.resps = resps;
	this.applications = applications;
	this.taxonomies = taxonomies;
	// content of each taxonomy, read when the taxonomies are written
	this.items = items;
}
|
425 |
|
|
426 |
/** |
|
427 |
* Write txm resps. |
|
428 |
*/ |
|
429 |
public void writeTXMResps()
{
	// one respStmt per responsibility id
	for (String respRef in respId) {
		// infos is read as: [0] resp text, [1] person name, [2] date "when" value, [3] date text
		String[] infos = resps.get(respRef);

		writer.writeStartElement("respStmt");

		// <resp id=...>text<date when=...>text</date></resp>
		writer.writeStartElement(RESP);
		writer.writeAttribute(ID, respRef);
		writer.writeCharacters(infos[0]);
		writer.writeStartElement("date");
		writer.writeAttribute("when", infos[2]);
		writer.writeCharacters(infos[3]);
		writer.writeEndElement(); // date
		writer.writeEndElement(); // resp

		// <name type="person">name</name>
		writer.writeStartElement("name");
		writer.writeAttribute(TYPE, "person");
		writer.writeCharacters(infos[1]);
		writer.writeEndElement(); // name

		writer.writeEndElement(); // respStmt
	}
}
|
449 |
|
|
450 |
/** |
|
451 |
* Write txm apps. |
|
452 |
*/ |
|
453 |
public void writeTXMApps()
{
	// one txm:application element per responsibility id
	for (String ref : respId) {
		// the entry is read as: [0] ident, [1] version, [2] report file (may be null)
		List<String> list= applications.get(ref);
		String ident = list.get(0);
		String version = list.get(1);
		File report = list.get(2);

		writer.writeStartElement(TXMNS, "application");
		writer.writeAttribute("ident", ident);
		writer.writeAttribute("version", version);
		writer.writeAttribute(RESP, ref);

		// get txm:commandLine from GeneratedReport
		if (report != null) {
			// The report is copied through output, not through the XML writer.
			// The empty text node and the flush come first - presumably so the
			// start tag is complete before the raw lines are written; confirm.
			writer.writeCharacters("");
			writer.flush();

			BufferedReader reader = new BufferedReader(new FileReader(report));
			try {
				String line;
				while ((line = reader.readLine()) != null) {
					if (line.length() != 0) { // empty lines are dropped
						output.write(line + "\n");
					}
				}
			} finally {
				// release the file even when reading or writing fails
				reader.close();
			}
		}

		// <ab type="annotation"> with one list/ref per taxonomy of this responsibility
		writer.writeStartElement("ab");
		writer.writeAttribute(TYPE, "annotation");
		for (String item : taxonomies.get(ref)) {
			writer.writeStartElement("list");
			writer.writeEmptyElement("ref");
			writer.writeAttribute(TYPE, "tagset");
			writer.writeAttribute("target", item);
			writer.writeEndElement(); // list
		}
		writer.writeEndElement(); // ab
		writer.writeEndElement(); // txm:application
	}
}
|
492 |
|
|
493 |
/** |
|
494 |
* Write txm taxonomies. |
|
495 |
*/ |
|
496 |
public void writeTXMTaxonomies()
{
	// one taxonomy element per entry of items
	for (Map.Entry<String, HashMap<String, String>> taxonomy : items.entrySet()) {
		String taxonomyId = taxonomy.getKey();

		writer.writeStartElement("taxonomy");
		writer.writeAttribute(ID, taxonomyId);

		// <bibl type="tagset"><title>id</title> followed by one empty ref per item
		writer.writeStartElement("bibl");
		writer.writeAttribute(TYPE, "tagset");
		writer.writeStartElement("title");
		writer.writeCharacters(taxonomyId);
		writer.writeEndElement(); // title

		for (Map.Entry<String, String> item : taxonomy.getValue().entrySet()) {
			String refType = item.getKey();
			writer.writeEmptyElement("ref");
			writer.writeAttribute(TYPE, refType);
			writer.writeAttribute("target", item.getValue());
		}

		writer.writeEndElement(); // bibl
		writer.writeEndElement(); // taxonomy
	}
}
|
517 |
|
|
518 |
/** |
|
519 |
* The main method. |
|
520 |
* |
|
521 |
* @param args the arguments |
Formats disponibles : Unified diff