Révision 3209
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/office/Table2CorpusMacro.groovy (revision 3209) | ||
---|---|---|
1 |
package org.txm.macro.csv |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
import org.txm.rcp.swt.widget.parameters.* |
|
7 |
import org.txm.utils.* |
|
8 |
import javax.xml.stream.* |
|
9 |
import java.net.URL |
|
10 |
|
|
11 |
// ---------------------------------------------------------------------------
// Macro parameters. Each @Field/@Option pair declares one entry of the
// parameters dialog opened further down with ParametersDialog.open(this).
// ---------------------------------------------------------------------------

// Input table, read with TableReader.
@Field @Option(name="inputFile", usage="CSV File", widget="File", required=false, def="file.xlsx")
File inputFile;

// Directory receiving one "<text id>.xml" file per text.
// The "Folder" widget is used because a directory, not a file, must be selected.
@Field @Option(name="outputDirectory", usage="output directory", widget="Folder", required=false, def="directory")
File outputDirectory;

// "column=regexp": a line is a text line when the value of 'column' matches 'regexp'.
@Field @Option(name="textSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sommaire")
def textSelector;

// "column=regexp": a line is a structure line when the value of 'column' matches 'regexp'.
@Field @Option(name="structureSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sujet")
def structureSelector;

// Column holding the identifier of a text line.
@Field @Option(name="textIDColumn", usage="text id column", widget="String", required=false, def="Identifiant de la notice")
def textIDColumn;

// Column of a structure line holding the identifier of the text it belongs to.
@Field @Option(name="joinColumn", usage="jointure column, values should point to the textIDColumn values", widget="String", required=false, def="Lien notice principale")
def joinColumn;

// Comma separated columns written as attributes (and as a metadata list) of the <text> element.
@Field @Option(name="textMetadataColumnList", usage="text metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
def textMetadataColumnList;

// Comma separated columns written as content of the <text> element.
@Field @Option(name="textContentColumnList", usage="text content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
def textContentColumnList;

// Name of the element created for each structure line.
@Field @Option(name="structureTag", usage="structure tag name", widget="String", required=false, def="div")
def structureTag;

// Comma separated columns written as attributes (and as a metadata list) of each structure element.
@Field @Option(name="structureMetadataColumnList", usage="structure metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
def structureMetadataColumnList;

// Comma separated columns written as content of each structure element.
@Field @Option(name="structureContentColumnList", usage="structure content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
def structureContentColumnList;

// Metadata columns holding a dd/MM/yyyy date, expanded by writeMetadataDate().
@Field @Option(name="metadataDateColumnList", usage="metadata columns of type=Date", widget="String", required=false, def="")
def metadataDateColumnList;

// Content columns rendered by writeContentPrefix().
@Field @Option(name="prefixContentColumnList", usage="content columns whose value is a list of prefixed (XXX:) entries separated by ';' or newline", widget="String", required=false, def="")
def prefixContentColumnList;

// Content columns rendered by writeContentList().
@Field @Option(name="listContentColumnList", usage="content columns whose value is a newline separated list", widget="String", required=false, def="")
def listContentColumnList;
|
50 |
|
|
51 |
|
|
52 |
//@Field @Option(name="structureOrderColumn", usage="structure column coding structure order", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences") |
|
53 |
// def structureOrderColumn; |
|
54 |
|
|
55 |
// Open the parameters dialog; stop when the user cancels it.
if (!ParametersDialog.open(this)) return;

// Turns a comma separated parameter into a list of column names.
// Names are trimmed here because the header validation compares trimmed
// names: without trimming, "a, b" would pass validation and then fail to be
// found when the records are read.
def splitColumns = { value -> (value ?: "").toString().split(",").collect { it.trim() } }

// Turns a "column=regexp" parameter into [column, regexp], or null (after
// printing an error) when the '=' separator is missing. Only the column name
// is trimmed: spaces may be significant in the regexp.
def parseSelector = { String parameterName, value ->
	String selector = (value ?: "").toString()
	int separator = selector.indexOf("=")
	if (separator < 0) {
		println "Error: $parameterName must be written 'column=regexp': $selector"
		return null
	}
	return [selector.substring(0, separator).trim(), selector.substring(separator + 1)]
}

textMetadataColumnList = splitColumns(textMetadataColumnList)
textContentColumnList = splitColumns(textContentColumnList)
structureMetadataColumnList = splitColumns(structureMetadataColumnList)
structureContentColumnList = splitColumns(structureContentColumnList)
metadataDateColumnList = splitColumns(metadataDateColumnList)
prefixContentColumnList = splitColumns(prefixContentColumnList)
listContentColumnList = splitColumns(listContentColumnList)

textIDColumn = (textIDColumn ?: "").toString().trim()
joinColumn = (joinColumn ?: "").toString().trim()

textSelector = parseSelector("textSelector", textSelector)
structureSelector = parseSelector("structureSelector", structureSelector)
if (textSelector == null || structureSelector == null) return;

// Echo the effective parameters.
println "textIDColumn, joinColumn=$textIDColumn, $joinColumn"
println "textMetadataColumnList columns: $textMetadataColumnList"
println "textContentColumnList columns: $textContentColumnList"
println "structureMetadataColumnList columns: $structureMetadataColumnList"
println "structureContentColumnList columns: $structureContentColumnList"

println "text selector="+textSelector
println "structure selector="+structureSelector
println "structureTag="+structureTag
|
80 |
|
|
81 |
// Read the table headers and check that every column the macro is going to
// read exists before any record is processed.
TableReader reader = new TableReader(inputFile);
if (!reader.readHeaders()) {
	println "Error: no header"
	return
}
def headers = Arrays.asList(reader.getHeaders())
println "table columns: $headers"

def ok = true
// The selector columns are checked too: they are read for every record, so a
// missing one would otherwise only show up as a failure on the first record.
def requiredColumns = [
	"selection": [textIDColumn, joinColumn],
	"selector": [textSelector[0], structureSelector[0]],
	"textMetadataColumnList": textMetadataColumnList,
	"textContentColumnList": textContentColumnList,
	"structureMetadataColumnList": structureMetadataColumnList,
	"structureContentColumnList": structureContentColumnList
]
for (def list : requiredColumns) {
	for (String m : list.value) {
		m = m.trim()
		if (!headers.contains(m)) {
			println "Error: missing ${list.key} column: $m"
			ok = false
		}
	}
}
if (!ok) {
	reader.close() // nothing more will be read from the table
	return;
}
|
102 |
|
|
103 |
// Group the records by text. 'texts' maps a text id to its records: the text
// record is kept first (inserted at index 0), followed by the structure
// records whose join column points to that text id.
def texts = new LinkedHashMap()
def nRecord = 0
while (reader.readRecord()) {
	nRecord++

	// A null cell is read as an empty string, so that an empty cell is a
	// non-matching or ungrouped value instead of a crash on trim().
	String id = (reader.get(textIDColumn) ?: "").trim()
	String join = (reader.get(joinColumn) ?: "").trim()
	String textSelectorValue = (reader.get(textSelector[0]) ?: "").trim()
	String structureSelectorValue = (reader.get(structureSelector[0]) ?: "").trim()

	if (textSelectorValue.matches(textSelector[1])) {
		if (!texts.containsKey(id)) texts[id] = []
		texts[id].add(0, reader.getRecord())
	} else if (structureSelectorValue.matches(structureSelector[1])) {
		if (!texts.containsKey(join)) texts[join] = []
		texts[join].add(reader.getRecord())
	}
	// any other record is ignored
}
println "N lines: "+nRecord
println "N groups: "+texts.size()
if (texts.size() == 0) {
	println "No text found. Aborting."
	reader.close() // nothing will be written: release the table now
	return
}
|
130 |
// mkdirs() also creates the missing parent directories.
outputDirectory.mkdirs()

// Value of a column of a record; a null cell is written as an empty string.
def cell = { rec, String column -> rec.get(column) ?: "" }

// Writes the metadata columns of 'rec' as attributes of the element currently open in 'out'.
def writeMetadataAttributes = { out, rec, columns ->
	for (String att : columns) {
		if (att in metadataDateColumnList) {
			writeMetadataDate(rec, att, out)
		} else {
			out.writeAttribute(AsciiUtils.buildAttributeId(att), cell(rec, att))
		}
	}
}

// Writes the metadata columns of 'rec' as a <metadata><list> of "name : value" items.
def writeMetadataList = { out, rec, columns ->
	out.writeStartElement("metadata")
	out.writeStartElement("list")
	out.writeAttribute("type", "unordered")
	out.writeCharacters("\n")
	for (String att : columns) {
		out.writeStartElement("item")
		out.writeCharacters(att+" : "+cell(rec, att).replaceAll("\n", ";"))
		out.writeEndElement() // item
		out.writeCharacters("\n")
	}
	out.writeEndElement() // list
	out.writeEndElement() // metadata
	out.writeCharacters("\n")
}

// Writes the content columns of 'rec'. Prefix and list columns are delegated
// to writeContentPrefix()/writeContentList(); any other column becomes a <p>
// with a <head>. 'paragraphId' is written as the "id" attribute of those <p>
// when it is not null.
def writeContentColumns = { out, rec, columns, String paragraphId ->
	for (String att : columns) {
		if (att in prefixContentColumnList) {
			writeContentPrefix(rec, att, out)
		} else if (att in listContentColumnList) {
			writeContentList(rec, att, out)
		} else {
			out.writeStartElement("p");
			out.writeAttribute("type", att.trim())
			if (paragraphId != null) out.writeAttribute("id", paragraphId)
			out.writeCharacters("\n") // simple XML formatting

			out.writeStartElement("head")
			out.writeStartElement("hi")
			out.writeCharacters(att+" : ")
			out.writeEndElement() // hi
			out.writeEndElement() // head

			out.writeCharacters(cell(rec, att))
			out.writeEndElement() // p
			out.writeCharacters("\n") // simple XML formatting
		}
	}
}

try {
	for (def id : texts.keySet()) {
		def toWrite = texts[id]
		def text = toWrite[0] // the text record, when the group has one

		String textSelectorValue = text.get(textSelector[0])?.trim()
		if (textSelectorValue == null || !textSelectorValue.matches(textSelector[1])) {
			// the group only contains structure records
			println "ERROR: '$id' text group with no text line"
			continue
		}

		println "Processing text: $id"

		File outputfile = new File(outputDirectory, id+".xml")
		XMLOutputFactory factory = XMLOutputFactory.newInstance()
		FileOutputStream output = new FileOutputStream(outputfile)
		try {
			XMLStreamWriter writer = factory.createXMLStreamWriter(output, "UTF-8")

			writer.writeStartDocument("UTF-8","1.0")
			writer.writeCharacters("\n") // simple XML formatting
			writer.writeStartElement("TEI")
			writer.writeCharacters("\n")
			writer.writeStartElement("teiHeader")
			writer.writeEndElement() // teiHeader
			writer.writeCharacters("\n")
			writer.writeStartElement("text")
			writer.writeAttribute("id", id)

			writeMetadataAttributes(writer, text, textMetadataColumnList)
			writer.writeCharacters("\n")

			writeMetadataList(writer, text, textMetadataColumnList)
			writeContentColumns(writer, text, textContentColumnList, null)

			// One <pb/> and one structure element per structure record; the
			// page number is the 1-based rank of the record in the group.
			for (int i = 1 ; i < toWrite.size() ; i++) {
				def record = toWrite[i]

				writer.writeEmptyElement("pb") // <pb/>
				writer.writeAttribute("n", ""+i)

				writer.writeStartElement(structureTag)
				writeMetadataAttributes(writer, record, structureMetadataColumnList)
				writer.writeCharacters("\n")

				writeMetadataList(writer, record, structureMetadataColumnList)
				// The content is read from the structure record itself,
				// including for the prefix and list columns.
				writeContentColumns(writer, record, structureContentColumnList, cell(record, textIDColumn))

				writer.writeEndElement() // structure
				writer.writeCharacters("\n")
			}

			writer.writeEndElement() // text
			writer.writeCharacters("\n")
			writer.writeEndElement() // TEI
			writer.close()
		} finally {
			// XMLStreamWriter.close() does not close the underlying stream
			output.close()
		}
	}
} finally {
	// every record has already been read: close the table once, whatever happened
	reader.close()
}
|
270 |
|
|
271 |
/**
 * Writes the value of a column as a sequence of <p> elements.
 *
 * When the value contains at least one non-empty line ended by a newline,
 * every non-empty line (including a last line without trailing newline) is
 * written as <p rend="list">. Otherwise the whole value is written in a
 * single <p rend="no-list">. A null value is written as an empty string.
 *
 * @param record the record to read; only record.get(att) is used
 * @param att the name of the column to write
 * @param writer the XMLStreamWriter receiving the elements
 */
def writeContentList(def record, def att, def writer) {
	writer.writeCharacters("\n")

	String value = record.get(att) ?: ""
	// local variable: an undeclared one would be stored in the script binding
	boolean isList = (value =~ /[^\n]\n/).find()

	if (isList) {
		// tokenize() drops the empty lines
		for (String line : value.tokenize("\n")) {
			writer.writeCharacters("\t")
			writer.writeStartElement("p")
			writer.writeAttribute("rend", "list")
			writer.writeCharacters(line)
			writer.writeEndElement() // p
			writer.writeCharacters("\n")
		}
	} else {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "no-list")
		writer.writeCharacters(value)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
	}
}
|
294 |
|
|
295 |
/**
 * Writes the value of a column as a <list rend="prefixes" type="unordered">.
 *
 * Each entry of the value made of a three uppercase letters prefix, an
 * optional colon, a label, at least one space and a ';' or newline becomes
 * an <item type="PREFIX"><span>PREFIX </span>label</item>. An entry that is
 * not followed by spaces and a separator is not recognized. When no entry is
 * recognized the raw value is written inside the list. A null value is
 * written as an empty string.
 *
 * @param record the record to read; only record.get(att) is used
 * @param att the name of the column to write
 * @param writer the XMLStreamWriter receiving the elements
 */
def writeContentPrefix(def record, def att, def writer) {
	writer.writeCharacters("\n")
	writer.writeStartElement("list")
	writer.writeAttribute("rend", "prefixes")
	writer.writeAttribute("type", "unordered")
	writer.writeCharacters("\n")

	String value = record.get(att) ?: ""
	// local variables: undeclared ones would be stored in the script binding
	def entries = (value =~ /(?s)([A-Z]{3}):? *([^;\n]+?) +[;\n]/)
	boolean found = false

	// a single pass over the value: group 1 is the prefix, group 2 the label
	while (entries.find()) {
		found = true
		writer.writeCharacters("\t")
		writer.writeStartElement("item")
		writer.writeAttribute("type", entries.group(1))
		writer.writeStartElement("span")
		writer.writeCharacters(entries.group(1)+" ")
		writer.writeEndElement() // span
		writer.writeCharacters(entries.group(2))
		writer.writeEndElement() // item
		writer.writeCharacters("\n")
	}
	if (!found) {
		writer.writeCharacters(value)
	}
	writer.writeEndElement() // list
	writer.writeCharacters("\n")
}
|
322 |
|
|
323 |
/**
 * Writes a dd/MM/yyyy date column as five attributes of the element currently
 * open in the writer. With N = AsciiUtils.buildAttributeId(att):
 * N+"jour" (day), N+"joursemaine" (day of week name, default locale),
 * N+"mois" (month), N+"annee" (year) and N+"tri" (yyyy-MM-dd sort key).
 *
 * When the cell is null or contains no dd/MM/yyyy date, a warning is printed
 * and no attribute is written.
 *
 * @param record the record to read; only record.get(att) is used
 * @param att the name of the date column
 * @param writer the XMLStreamWriter receiving the attributes
 */
def writeMetadataDate(def record, def att, def writer) {
	String value = record.get(att)
	String att_normalized = AsciiUtils.buildAttributeId(att)

	// local variable: an undeclared one would be stored in the script binding
	def dateMatcher = ((value ?: "") =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
	if (!dateMatcher.find()) {
		println "Warning: no dd/MM/yyyy date found in column '$att': $value"
		return
	}

	String day = dateMatcher.group(1)
	String month = dateMatcher.group(2)
	String year = dateMatcher.group(3)
	// Only the matched date is parsed, so text around the date in the cell
	// cannot make the parsing fail.
	Date date = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(dateMatcher.group(0))

	writer.writeAttribute(att_normalized+"jour", day)
	writer.writeAttribute(att_normalized+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(date))
	writer.writeAttribute(att_normalized+"mois", month)
	writer.writeAttribute(att_normalized+"annee", year)
	writer.writeAttribute(att_normalized+"tri", year+"-"+month+"-"+day)
}
|
333 |
|
|
334 |
/* |
|
335 |
String name = inputFile.getName() |
|
336 |
int idx = name.lastIndexOf(".") |
|
337 |
if (idx > 0) name = name.substring(0, idx) |
|
338 |
*/ |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/annotation/CQLList2WordPropertiesMacro.groovy (revision 3209) | ||
---|---|---|
29 | 29 |
@Field @Option(name="word_property", usage="The word property to annotate", widget="String", required=true, def="type") |
30 | 30 |
String word_property |
31 | 31 |
|
32 |
@Field @Option(name="do_not_update_corpus_indexes_and_editions", usage="Select to only update the XML-TXM files", widget="Boolean", required=true, def="false")
|
|
33 |
def do_not_update_corpus_indexes_and_editions
|
|
32 |
@Field @Option(name="update_corpus_indexes_and_editions", usage="Unselect to only update the XML-TXM files", widget="Boolean", required=true, def="true")
|
|
33 |
def update_corpus_indexes_and_editions |
|
34 | 34 |
|
35 | 35 |
// Open the parameters input dialog box |
36 | 36 |
if (!ParametersDialog.open(this)) return |
... | ... | |
44 | 44 |
int n = gse.run(CQLList2WordAnnotationsMacro, hash) |
45 | 45 |
|
46 | 46 |
if (n > 0) { |
47 |
println "Saving annotations..." |
|
47 |
println "Saving $n annotations..."
|
|
48 | 48 |
// monitor.syncExec(new Runnable() { |
49 | 49 |
// public void run() { |
50 | 50 |
def saveJob = SaveAnnotations.save(mcorpus); |
51 | 51 |
if (saveJob == null || saveJob.getResult() == Status.CANCEL_STATUS) { |
52 | 52 |
// update editor corpus |
53 |
System.out.println("Fail to save annotations of the corpus."); //$NON-NLS-1$
|
|
53 |
System.out.println("** Error: failed to save the annotations of the corpus."); //$NON-NLS-1$
|
|
54 | 54 |
return; |
55 | 55 |
} else { |
56 |
if (!do_not_update_corpus_indexes_and_editions) { |
|
56 |
if (update_corpus_indexes_and_editions) { |
|
57 |
println "Updating corpus indexes and editions..." |
|
57 | 58 |
UpdateCorpus.update(mcorpus) |
58 | 59 |
} |
59 | 60 |
|
... | ... | |
64 | 65 |
} else { |
65 | 66 |
println "No annotation to save." |
66 | 67 |
return false |
67 |
} |
|
68 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/annotation/WordProperty2WordMacro.groovy (revision 3209) | ||
---|---|---|
1 |
// Copyright © 2021 ENS Lyon |
|
2 |
// Licensed under the terms of the GNU General Public License version 3 (http://www.gnu.org/licenses/gpl-3.0.html) |
|
3 |
// @author mdecorde |
|
4 |
// @author sheiden |
|
1 | 5 |
|
2 | 6 |
package org.txm.macro.annotation |
3 | 7 |
|
... | ... | |
28 | 32 |
Project project = mcorpus.getProject() |
29 | 33 |
File txmDir = new File(project.getProjectDirectory(), "txm/"+mcorpus.getName()) |
30 | 34 |
|
31 |
@Field @Option(name="outputDirectory", usage="Directory of resulting XML-TXM files", widget="Folder", required=true, def="result_directory")
|
|
35 |
@Field @Option(name="outputDirectory", usage="Directory of resulting XML-TXM files", widget="Folder", required=true, def="") |
|
32 | 36 |
File outputDirectory |
33 | 37 |
|
34 |
@Field @Option(name="word_property", usage="The word property to move", widget="String", required=true, def="type")
|
|
38 |
@Field @Option(name="word_property", usage="The word property to project", widget="String", required=true, def="type")
|
|
35 | 39 |
String word_property |
36 | 40 |
|
37 |
//@Field @Option(name="word_property_value_to_ignore", usage="The annotation type", widget="String", required=true, def="__undef__")
|
|
38 |
//String word_property_value_to_ignore
|
|
41 |
@Field @Option(name="values_to_ignore_regex", usage="regex of values not to project", widget="String", required=true, def="")
|
|
42 |
String values_to_ignore_regex
|
|
39 | 43 |
|
44 |
@Field @Option(name="debug", usage="Afficher les messages de debug", widget="Boolean", required=true, def="false") |
|
45 |
def debug |
|
46 |
|
|
40 | 47 |
// Open the parameters input dialog box |
41 | 48 |
if (!ParametersDialog.open(this)) return |
42 | 49 |
|
... | ... | |
45 | 52 |
return false |
46 | 53 |
} |
47 | 54 |
|
55 |
values_to_ignore_regex = /$values_to_ignore_regex/ |
|
56 |
|
|
57 |
project.compute(); |
|
58 |
|
|
48 | 59 |
for (Text text : project.getChildren(Text.class)) { |
49 | 60 |
println "Text: "+text |
50 | 61 |
|
51 | 62 |
File orig = text.getXMLTXMFile() |
52 |
if (!orig.exists()) { |
|
53 |
println "Warning: no XML-TXM file found for "+text
|
|
54 |
continue // next !
|
|
63 |
if ((orig == null) || !orig.exists()) {
|
|
64 |
println "** Error: no XML-TXM file found for "+text
|
|
65 |
return // next !
|
|
55 | 66 |
} |
56 | 67 |
|
57 | 68 |
File result = new File(outputDirectory, orig.getName()) |
... | ... | |
62 | 73 |
} |
63 | 74 |
|
64 | 75 |
XMLProcessor xp = new XMLProcessor(orig); |
65 |
LocalNameHookActivator activator = new LocalNameHookActivator("w");
|
|
76 |
LocalNameHookActivator activator = new LocalNameHookActivator("w") |
|
66 | 77 |
new DOMIdentityHook("word", activator, xp) { |
67 | 78 |
@Override |
68 | 79 |
public void processDom() { |
69 | 80 |
//println dom |
70 | 81 |
use(groovy.xml.dom.DOMCategory) { |
71 | 82 |
def form = dom.form[0] |
72 |
def anatype = dom.ana.findAll(){ a -> a['@type'] == "#$word_property"} |
|
83 |
def anatype = dom.ana.findAll(){ a -> |
|
84 |
if (debug && (a['@type'] == "#$word_property")) { |
|
85 |
println 'a["@type"] = '+a["@type"] |
|
86 |
println 'a.text() = '+a.text() |
|
87 |
println 'a["@type"] == "#$word_property"'+" = "+(a['@type'] == "#$word_property") |
|
88 |
println "values_to_ignore_regex = "+values_to_ignore_regex |
|
89 |
println '!(a.text() ==~ values_to_ignore_regex))'+" = "+(!(a.text() ==~ values_to_ignore_regex)) |
|
90 |
println '((a["@type"] == "#$word_property") && !(a.text() ==~ values_to_ignore_regex)) = '+((a['@type'] == "#$word_property") && !(a.text() ==~ values_to_ignore_regex)) |
|
91 |
} |
|
92 |
return ((a['@type'] == "#$word_property") && !(a.text() ==~ values_to_ignore_regex)) |
|
93 |
} |
|
73 | 94 |
// println form.text()+" -> $anatype" |
74 | 95 |
if (anatype.size() > 0) { // keep the word |
75 | 96 |
anatype = anatype[0] |
... | ... | |
78 | 99 |
anatype['@type'] = "#form" |
79 | 100 |
anatype.setTextContent(s) |
80 | 101 |
} else { |
81 |
dom = null;
|
|
102 |
dom = null |
|
82 | 103 |
} |
83 | 104 |
} |
84 | 105 |
} |
85 | 106 |
} |
86 | 107 |
xp.process(result) |
87 |
} |
|
108 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Excel2XMLMacro.groovy (revision 3209) | ||
---|---|---|
1 |
package org.txm.macro.table |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
|
|
7 |
import org.txm.libs.msoffice.ReadExcel |
|
8 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
9 |
import org.txm.utils.* |
|
10 |
import javax.xml.stream.* |
|
11 |
import java.net.URL |
|
12 |
import java.text.SimpleDateFormat |
|
13 |
|
|
14 |
import org.apache.poi.ss.usermodel.* |
|
15 |
import org.apache.poi.hssf.usermodel.* |
|
16 |
import org.apache.poi.xssf.usermodel.* |
|
17 |
import org.apache.poi.ss.usermodel.WorkbookFactory |
|
18 |
import org.apache.poi.ss.util.* |
|
19 |
|
|
20 |
// Builds an element/attribute name from a column name: the non ASCII
// characters are converted, the result is passed to
// AsciiUtils.buildAttributeId() and lowercased.
def stringToIndent = { str -> org.txm.utils.AsciiUtils.buildAttributeId(org.txm.utils.AsciiUtils.convertNonAscii(str)).toLowerCase() }

// ---------------------------------------------------------------------------
// Macro parameters. Each @Field/@Option pair declares one entry of the
// parameters dialog opened further down with ParametersDialog.open(this).
// ---------------------------------------------------------------------------

// Excel file to convert; the result is written next to it with the .xml extension.
@Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
File inputFile

@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
def sheetName

// Name of the element wrapping all the lines.
@Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
def rootTag

// Name of the element created for each line of the sheet.
@Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
def textTag

// Columns written as attributes (and as a metadata list) of each line element.
@Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def metadataColumnList

// Metadata columns holding a dd/MM/yyyy date, also split into day, week day, month and year attributes.
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def dateColumnList

// Columns written as content of each line element.
@Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
def textColumnList

// When selected the result is wrapped in <TEI><teiHeader/><text>...</text></TEI>.
@Field @Option(name="EmbedInTEI", usage="embed xml content in TEI text element", widget="Boolean", required=false, def="false")
def EmbedInTEI
|
45 |
|
|
46 |
// Ask for the parameters; nothing is done when the dialog is cancelled.
def parametersAccepted = ParametersDialog.open(this)
if (!parametersAccepted) return

// The input file must exist and be readable before any conversion is attempted.
if (!inputFile.exists() || !inputFile.canRead()) {
	if (!inputFile.exists()) {
		println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
	} else {
		println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
	}
	return false
}
|
57 |
|
|
58 |
// Declared before the try block so that the finally clause can release the
// file handle when the conversion fails.
def output = null

try {

	metadataColumnList = metadataColumnList.split(",").collect { it.trim() }
	dateColumnList = dateColumnList.split(",").collect { it.trim() }
	textColumnList = textColumnList.split(",").collect { it.trim() }
	textTag = textTag.trim()
	rootTag = rootTag.trim()

	ReadExcel excel = new ReadExcel(inputFile, sheetName);
	excel.readHeaders();

	def headers = excel.getHeaders()

	println "Headers: $headers"

	// every metadata and text column must be a column of the sheet
	def ok = true
	metadataColumnList.each { m ->
		if (!headers.contains(m)) {
			println "** Excel2XML: missing metadataColumnList column: $m"
			ok = false
		}
	}
	textColumnList.each { t ->
		if (!headers.contains(t)) {
			println "** Excel2XML: missing textColumnList column: $t"
			ok = false
		}
	}

	if (!ok) { return false }

	println "metadataColumnList = "+metadataColumnList
	println "dateColumnList = "+dateColumnList
	println "textColumnList = "+textColumnList

	// the result file is the input file with its extension replaced by .xml
	def name = inputFile.getName()
	def idx = name.lastIndexOf(".")

	if (idx > 0) name = name.substring(0, idx)
	def outputFile = new File(inputFile.getParentFile(), name+".xml")

	def factory = XMLOutputFactory.newInstance()
	output = new FileOutputStream(outputFile)
	def writer = factory.createXMLStreamWriter(output, "UTF-8")

	writer.writeStartDocument("UTF-8","1.0")
	writer.writeCharacters("\n") // simple XML formatting

	if (EmbedInTEI) {
		writer.writeStartElement("TEI")
		writer.writeStartElement("teiHeader")
		writer.writeEndElement() // teiHeader
		writer.writeStartElement("text")
		writer.writeCharacters("\n")
	}

	writer.writeStartElement(rootTag)
	writer.writeCharacters("\n")

	def pb_n = 1

	def record = null
	while (excel.readRecord()) {

		record = excel.getRecord()
		if (record.isEmpty()) continue;

		writer.writeCharacters(" ")
		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
		writer.writeAttribute("n", ""+pb_n++)
		writer.writeCharacters("\n") // simple XML formatting

		writer.writeCharacters(" ")
		writer.writeStartElement(textTag)
		// build an attribute for each metadata
		for (def metadataName : metadataColumnList) {
			String s = record.get(metadataName);
			if (s == null) s = "";

			String value = s.replaceAll("\n", ";").trim()
			String attributeName = stringToIndent(metadataName)
			writer.writeAttribute(attributeName, value)

			if (metadataName in dateColumnList) { // also split date attributes in day+month+year attributes
				def dateMatcher = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
				if (dateMatcher.find()) {
					// The derived attributes use the normalized name, like the
					// main attribute: a raw column name (spaces, accents) is
					// not a valid XML attribute name.
					// Only the matched date is parsed, not the whole cell.
					Date date = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(dateMatcher.group(0))
					writer.writeAttribute(attributeName+"-jour", dateMatcher.group(1))
					writer.writeAttribute(attributeName+"-joursemaine", new java.text.SimpleDateFormat('EEEE').format(date))
					writer.writeAttribute(attributeName+"-mois", dateMatcher.group(2))
					writer.writeAttribute(attributeName+"-annee", dateMatcher.group(3))
				} else {
					println "** Excel2XML: warning: no dd/MM/yyyy date found in column '$metadataName': '$value'"
				}
			}
		}
		writer.writeCharacters("\n")

		writer.writeCharacters(" ")
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")

		for (def metadataName : metadataColumnList) {
			writer.writeStartElement("item")
			// a null cell is written as an empty value, as for the attributes above
			writer.writeCharacters(metadataName+" : "+(record.get(metadataName) ?: "").replaceAll("\n", ";"))
			writer.writeEndElement() // item
			writer.writeCharacters("\n")
		}
		writer.writeCharacters(" ")
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")

		for (def textColumnName : textColumnList) {

			writer.writeCharacters(" ")
			writer.writeStartElement(stringToIndent(textColumnName))
			writer.writeStartElement("p")
			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(textColumnName+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head

			String content = record.get(textColumnName) ?: ""

			// A value starting with a "XXX: label ;" entry is written as a
			// sequence of <descripteur type="XXX">label</descripteur>.
			if (content ==~ /(?s)^[A-Z]{3}: [^;\n]+? +[;\n].*/) {
				def descriptors = (content =~ /(?s)([A-Z]{3}): ([^;\n]+?) +[;\n]/)
				while (descriptors.find()) {
					writer.writeStartElement("descripteur")
					writer.writeAttribute("type", descriptors.group(1))
					writer.writeCharacters(descriptors.group(2))
					writer.writeEndElement() // descripteur
				}
			} else {
				writer.writeCharacters(content)
			}
			writer.writeEndElement() // p
			writer.writeEndElement() // textColumn
			writer.writeCharacters("\n")
		}

		writer.writeCharacters(" ")
		writer.writeEndElement() // textTag
		writer.writeCharacters("\n")
	}

	writer.writeEndElement() // rootTag
	writer.writeCharacters("\n")

	if (EmbedInTEI) {
		writer.writeEndElement() // text
		writer.writeCharacters("\n")
		writer.writeEndElement() // TEI
		writer.writeCharacters("\n")
	}

	writer.close()
	output.close()
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace() // returns nothing: printing its result would only add a "null" line
	return false
} finally {
	// XMLStreamWriter.close() does not close the underlying stream; closing
	// an already closed FileOutputStream has no effect.
	if (output != null) output.close()
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Excel2XMLMacro2.groovy (revision 3209) | ||
---|---|---|
1 |
package org.txm.macro.table |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
|
|
7 |
import org.txm.libs.msoffice.ReadExcel |
|
8 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
9 |
import org.txm.utils.* |
|
10 |
import javax.xml.stream.* |
|
11 |
import java.net.URL |
|
12 |
import java.text.SimpleDateFormat |
|
13 |
|
|
14 |
import org.apache.poi.ss.usermodel.* |
|
15 |
import org.apache.poi.hssf.usermodel.* |
|
16 |
import org.apache.poi.xssf.usermodel.* |
|
17 |
import org.apache.poi.ss.usermodel.WorkbookFactory |
|
18 |
import org.apache.poi.ss.util.* |
|
19 |
|
|
20 |
def stringToIndent = { str -> org.txm.utils.AsciiUtils.buildAttributeId(org.txm.utils.AsciiUtils.convertNonAscii(str)).toLowerCase() } |
|
21 |
|
|
22 |
@Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="") |
|
23 |
File inputFile |
|
24 |
|
|
25 |
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="") |
|
26 |
def sheetName |
|
27 |
|
|
28 |
@Field @Option(name="EmbedInTEI", usage="embed xml content in TEI text element", widget="Boolean", required=false, def="false") |
|
29 |
def EmbedInTEI |
|
30 |
|
|
31 |
@Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root") |
|
32 |
def rootTag |
|
33 |
|
|
34 |
@Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit") |
|
35 |
def textTag |
|
36 |
|
|
37 |
@Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2") |
|
38 |
def metadataColumnList |
|
39 |
|
|
40 |
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2") |
|
41 |
def dateColumnList |
|
42 |
|
|
43 |
@Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2") |
|
44 |
def textColumnList |
|
45 |
|
|
46 |
@Field @Option(name="prefixContentColumnList", usage="columns which content is a semi-colon separated prefixes list", widget="String", required=false, def="") |
|
47 |
def prefixContentColumnList |
|
48 |
|
|
49 |
@Field @Option(name="listContentColumnList", usage="columns which content is a semi-colon separated list", widget="String", required=false, def="") |
|
50 |
def listContentColumnList |
|
51 |
|
|
52 |
|
|
53 |
if (!ParametersDialog.open(this)) return |
|
54 |
|
|
55 |
if (!inputFile.exists()) { |
|
56 |
println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting." |
|
57 |
return false |
|
58 |
} |
|
59 |
|
|
60 |
if (!inputFile.canRead()) { |
|
61 |
println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting." |
|
62 |
return false |
|
63 |
} |
|
64 |
|
|
65 |
// Converts the selected sheet into an XML file written next to the input file.
// Every non-empty row becomes: a <pb/> milestone, an optional textTag element whose
// attributes are the metadata columns, a <metadata> summary list and one element per text column.
// The script returns false when the conversion fails and true otherwise.
def output = null
def writer = null
try {
	// The dialog delivers column lists as comma separated strings: split, trim and drop blank names
	// (an empty parameter would otherwise yield a single "" column that can never be found).
	def toNameList = { raw -> (raw == null ? "" : raw.toString()).split(",").collect { it.trim() }.findAll { it.length() > 0 } }
	metadataColumnList = toNameList(metadataColumnList)
	dateColumnList = toNameList(dateColumnList)
	textColumnList = toNameList(textColumnList)
	// NOTE(review): the two lists below are normalized but not used further down in this block.
	prefixContentColumnList = toNameList(prefixContentColumnList)
	listContentColumnList = toNameList(listContentColumnList)
	textTag = (textTag == null) ? "" : textTag.toString().trim()
	rootTag = (rootTag == null) ? "" : rootTag.toString().trim()

	ReadExcel excel = new ReadExcel(inputFile, sheetName)
	excel.readHeaders()

	def headers = excel.getHeaders()
	println "Headers: $headers"

	// Every requested column must exist in the sheet; report all missing ones before giving up.
	boolean allColumnsFound = true
	for (def columnName : metadataColumnList) {
		if (!headers.contains(columnName)) {
			println "** Excel2XML: missing metadataColumnList column: $columnName"
			allColumnsFound = false
		}
	}
	for (def columnName : textColumnList) {
		if (!headers.contains(columnName)) {
			println "** Excel2XML: missing textColumnList column: $columnName"
			allColumnsFound = false
		}
	}
	if (!allColumnsFound) { return false }

	println "metadataColumnList = "+metadataColumnList
	println "dateColumnList = "+dateColumnList
	println "textColumnList = "+textColumnList

	// Output file: same directory and base name as the input file, with the .xml extension.
	String baseName = inputFile.getName()
	int dotIndex = baseName.lastIndexOf(".")
	if (dotIndex > 0) baseName = baseName.substring(0, dotIndex)
	def outputFile = new File(inputFile.getParentFile(), baseName+".xml")

	output = new FileOutputStream(outputFile)
	writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")

	boolean hasRootTag = rootTag.length() > 0
	boolean hasTextTag = textTag.length() > 0

	writer.writeStartDocument("UTF-8","1.0")
	writer.writeCharacters("\n") // simple XML formatting

	if (EmbedInTEI) {
		writer.writeStartElement("TEI")
		writer.writeStartElement("teiHeader")
		writer.writeEndElement() // teiHeader
		writer.writeStartElement("text")
		writer.writeCharacters("\n")
	}

	if (hasRootTag) {
		writer.writeStartElement(rootTag)
		writer.writeCharacters("\n")
	}

	int pb_n = 1
	while (excel.readRecord()) {

		def record = excel.getRecord()
		if (record.isEmpty()) continue

		// Null-safe cell access: an empty cell is read as "".
		def cell = { columnName ->
			def raw = record.get(columnName)
			return (raw == null) ? "" : raw.toString()
		}

		writer.writeCharacters(" ")
		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
		writer.writeAttribute("n", ""+pb_n++)
		writer.writeCharacters("\n") // simple XML formatting

		writer.writeCharacters(" ")
		if (hasTextTag) {
			writer.writeStartElement(textTag)

			// one attribute per metadata column
			for (def metadataName : metadataColumnList) {
				String attributeValue = cell(metadataName).replaceAll("\n", ";").trim()
				String attributeName = stringToIndent(metadataName)
				writer.writeAttribute(attributeName, attributeValue)

				if (metadataName in dateColumnList) { // also split date attributes in day+weekday+month+year attributes
					def dateMatcher = (attributeValue =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
					if (dateMatcher.find()) {
						// derived attribute names reuse the normalized name so they stay valid XML names
						def parsedDate = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(dateMatcher.group(0))
						writer.writeAttribute(attributeName+"-jour", dateMatcher.group(1))
						writer.writeAttribute(attributeName+"-joursemaine", new java.text.SimpleDateFormat('EEEE').format(parsedDate))
						writer.writeAttribute(attributeName+"-mois", dateMatcher.group(2))
						writer.writeAttribute(attributeName+"-annee", dateMatcher.group(3))
					} else {
						// a row without a usable date no longer aborts the whole conversion
						println "** Excel2XML: no dd/MM/yyyy date found in column '"+metadataName+"': '"+attributeValue+"'"
					}
				}
			}
			writer.writeCharacters("\n")
		}

		// human readable summary of the metadata
		writer.writeCharacters(" ")
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")

		for (def metadataName : metadataColumnList) {
			writer.writeStartElement("item")
			writer.writeCharacters(metadataName+" : "+cell(metadataName).replaceAll("\n", ";"))
			writer.writeEndElement() // item
			writer.writeCharacters("\n")
		}
		writer.writeCharacters(" ")
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")

		// one element per text column, named after the normalized column name
		for (def textColumnName : textColumnList) {

			writer.writeCharacters(" ")
			writer.writeStartElement(stringToIndent(textColumnName))
			writer.writeStartElement("p")
			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(textColumnName+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head

			String content = cell(textColumnName)

			// A cell starting with "XXX: label ;" is split into one <descripteur type="XXX"> per entry.
			if (content ==~ /(?s)^[A-Z]{3}: [^;\n]+? +[;\n].*/) {
				def entryMatcher = (content =~ /(?s)([A-Z]{3}): ([^;\n]+?) +[;\n]/)
				while (entryMatcher.find()) {
					writer.writeStartElement("descripteur")
					writer.writeAttribute("type", entryMatcher.group(1))
					writer.writeCharacters(entryMatcher.group(2))
					writer.writeEndElement() // descripteur
				}
			} else {
				writer.writeCharacters(content)
			}
			writer.writeEndElement() // p
			writer.writeEndElement() // textColumn
			writer.writeCharacters("\n")
		}

		writer.writeCharacters(" ")

		if (hasTextTag) {
			writer.writeEndElement() // textTag
			writer.writeCharacters("\n")
		}
	}

	if (hasRootTag) {
		writer.writeEndElement() // rootTag
		writer.writeCharacters("\n")
	}

	if (EmbedInTEI) {
		writer.writeEndElement() // text
		writer.writeCharacters("\n")
		writer.writeEndElement() // TEI
		writer.writeCharacters("\n")
	}

	// Close on the success path so that a failing close is reported like any other error.
	writer.close()
	writer = null
	output.close()
	output = null
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace() // returns nothing: wrapping it in println only printed "null"
	return false
} finally {
	// Release the file handle when the conversion aborted midway; a failure here adds no information.
	try { if (writer != null) writer.close() } catch (Exception ignored) { }
	try { if (output != null) output.close() } catch (Exception ignored) { }
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Table2TextCorpusMacro.groovy (revision 3209) | ||
---|---|---|
1 |
package org.txm.macro.csv |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
import org.txm.rcp.swt.widget.parameters.* |
|
7 |
import org.txm.utils.* |
|
8 |
import javax.xml.stream.* |
|
9 |
import java.net.URL |
|
10 |
import org.txm.libs.msoffice.ReadExcel |
|
11 |
|
|
12 |
@Field @Option(name="inputFile", usage="CSV File", widget="File", required=false, def="file.xlsx") |
|
13 |
File inputFile; |
|
14 |
|
|
15 |
@Field @Option(name="outputDirectory", usage="output directory", widget="Folder", required=false, def="directory") |
|
16 |
File outputDirectory; |
|
17 |
|
|
18 |
@Field @Option(name="textsSeparator", usage="Texts", widget="Separator", required=false, def="Texts") |
|
19 |
def textsSeparator |
|
20 |
|
|
21 |
@Field @Option(name="textIDColumn", usage="<text> id column", widget="String", required=false, def="Identifiant de la notice") |
|
22 |
def textIDColumn; |
|
23 |
|
|
24 |
@Field @Option(name="textSelector", usage="<text> lines selector format is column=value", widget="String", required=false, def="Type de notice=Notice sommaire") |
|
25 |
def textSelector; |
|
26 |
|
|
27 |
@Field @Option(name="textMetadataColumnList", usage="<text> metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)") |
|
28 |
def textMetadataColumnList; |
|
29 |
|
|
30 |
@Field @Option(name="textContentColumnList", usage="<text> textual content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences") |
|
31 |
def textContentColumnList; |
|
32 |
|
|
33 |
@Field @Option(name="structuresSeparator", usage="Structures", widget="Separator", required=false, def="Structures") |
|
34 |
def structuresSeparator |
|
35 |
|
|
36 |
@Field @Option(name="structureTag", usage="structure element to create", widget="String", required=false, def="div") |
|
37 |
def structureTag; |
|
38 |
|
|
39 |
@Field @Option(name="structureSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sujet") |
|
40 |
def structureSelector; |
|
41 |
|
|
42 |
@Field @Option(name="textJoinColumn", usage="jointure column, values should point to the textIDColumn values", widget="String", required=false, def="Lien notice principale") |
|
43 |
def textJoinColumn; |
|
44 |
|
|
45 |
@Field @Option(name="structureMetadataColumnList", usage="structure metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)") |
|
46 |
def structureMetadataColumnList; |
|
47 |
|
|
48 |
@Field @Option(name="structureContentColumnList", usage="structure textual content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences") |
|
49 |
def structureContentColumnList; |
|
50 |
|
|
51 |
@Field @Option(name="typesSeparator", usage="Columns types", widget="Separator", required=false, def="Columns types") |
|
52 |
def typesSeparator |
|
53 |
|
|
54 |
@Field @Option(name="dateColumnTypeList", usage="metadata columns of type=Date", widget="String", required=false, def="") |
|
55 |
def dateColumnTypeList |
|
56 |
|
|
57 |
@Field @Option(name="prefixesColumnTypeList", usage="metadata columns of type=Prefixes", widget="String", required=false, def="") |
|
58 |
def prefixesColumnTypeList |
|
59 |
|
|
60 |
@Field @Option(name="listColumnTypeList", usage="metadata columns of type=List semi-colon separated", widget="String", required=false, def="") |
|
61 |
def listColumnTypeList |
|
62 |
|
|
63 |
// When true, the script prints the identifier of each text as it is processed.
@Field @Option(name="debug", usage="Show debug messages", widget="Boolean", required=false, def="false")
def debug
|
65 |
//@Field @Option(name="structureOrderColumn", usage="structure column coding structure order", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences") |
|
66 |
// def structureOrderColumn; |
|
67 |
|
|
68 |
if (!ParametersDialog.open(this)) return; |
|
69 |
|
|
70 |
textMetadataColumnList = textMetadataColumnList.split(",").collect { it.trim() } |
|
71 |
textContentColumnList = textContentColumnList.split(",").collect { it.trim() } |
|
72 |
structureMetadataColumnList = structureMetadataColumnList.split(",").collect { it.trim() } |
|
73 |
structureContentColumnList = structureContentColumnList.split(",").collect { it.trim() } |
|
74 |
dateColumnTypeList = dateColumnTypeList.split(",").collect { it.trim() } |
|
75 |
prefixesColumnTypeList = prefixesColumnTypeList.split(",").collect { it.trim() } |
|
76 |
listColumnTypeList = listColumnTypeList.split(",").collect { it.trim() } |
|
77 |
if (textSelector != null && textSelector.contains("=")) { |
|
78 |
textSelector = textSelector.trim() |
|
79 |
textSelector = [textSelector.substring(0, textSelector.indexOf("=")).trim(), textSelector.substring(textSelector.indexOf("=")+1).trim()] |
|
80 |
} else { |
|
81 |
textSelector = [null, null] |
|
82 |
} |
|
83 |
if (structureSelector != null && structureSelector.contains("=")) { |
|
84 |
structureSelector = structureSelector.trim() |
|
85 |
structureSelector = [structureSelector.substring(0, structureSelector.indexOf("=")).trim(), structureSelector.substring(structureSelector.indexOf("=")+1).trim()] |
|
86 |
} else { |
|
87 |
structureSelector = [null, null] |
|
88 |
} |
|
89 |
|
|
90 |
println "textIDColumn, textJoinColumn=$textIDColumn, $textJoinColumn" |
|
91 |
println "textMetadataColumnList columns: $textMetadataColumnList" |
|
92 |
println "textContentColumnList columns: $textContentColumnList" |
|
93 |
println "structureMetadataColumnList columns: $structureMetadataColumnList" |
|
94 |
println "structureContentColumnList columns: $structureContentColumnList" |
|
95 |
|
|
96 |
println "text selector="+textSelector |
|
97 |
println "structure selector="+structureSelector |
|
98 |
println "structureTag="+structureTag |
|
99 |
|
|
100 |
def reader = null |
|
101 |
try { |
|
102 |
reader = new TableReader(inputFile); |
|
103 |
println "Reading $inputFile with TableReader..." |
|
104 |
} catch(Exception e) { |
|
105 |
reader = new ReadExcel(inputFile, null); |
|
106 |
println "Reading $inputFile with ReadExcel..." |
|
107 |
} |
|
108 |
|
|
109 |
if (!reader.readHeaders()) { |
|
110 |
println "** Error: no header" |
|
111 |
return |
|
112 |
} |
|
113 |
def headers = Arrays.asList(reader.getHeaders()) |
|
114 |
println "$inputFile table column names: $headers" |
|
115 |
|
|
116 |
def ok = true |
|
117 |
def hash = ["selection":[textIDColumn, textJoinColumn], "textMetadataColumnList":textMetadataColumnList, |
|
118 |
"textContentColumnList":textContentColumnList, "structureMetadataColumnList":structureMetadataColumnList, |
|
119 |
"structureContentColumnList":structureContentColumnList] |
|
120 |
for (def key : hash.keySet()) { |
|
121 |
for (def m : hash[key]) { |
|
122 |
if (!headers.contains(m)) { |
|
123 |
println "** Error: missing $key column: $m" |
|
124 |
ok = false |
|
125 |
} |
|
126 |
} |
|
127 |
} |
|
128 |
|
|
129 |
if (!ok) { return; } |
|
130 |
|
|
131 |
// group records by text |
|
132 |
def texts = new LinkedHashMap() |
|
133 |
def nRecord = 0 |
|
134 |
def nRecordToWrite = 0 |
|
135 |
// Groups the table records by text: the text record is stored first in its group,
// the structure records pointing to it (through textJoinColumn) are appended after it.

// Null-safe cell reader: null when no column is configured, "" when the cell is empty.
def readCell = { column ->
	if (column == null || column.length() == 0) return null
	return (reader.get(column) ?: "").trim()
}

// Stores a record as the head (text line) of the group named 'key'.
def addAsText = { key, rec ->
	if (!texts.containsKey(key)) texts[key] = []
	texts[key].add(0, rec)
}

// Appends a record to the group named 'key' and counts it as a record to write.
def addAsStructure = { key, rec ->
	if (!texts.containsKey(key)) texts[key] = []
	texts[key].add(rec)
	nRecordToWrite++
}

// Records are only grouped by text when both the id and the join columns are configured.
boolean groupByText = textIDColumn != null && textJoinColumn != null && textIDColumn.length() > 0 && textJoinColumn.length() > 0

while (reader.readRecord()) {
	nRecord++

	String id = readCell(textIDColumn) ?: ""
	String join = readCell(textJoinColumn) ?: ""
	// stays null when the corresponding selector parameter is not set
	String textSelectorValue = readCell(textSelector[0])
	String structureSelectorValue = readCell(structureSelector[0])

	def record = reader.getRecord()

	if (!groupByText) { // no id & join parameter set -> regroup all records in one file
		addAsStructure("result.xml", record)
	} else if (textSelectorValue != null && structureSelectorValue != null) {
		if (textSelectorValue.matches(textSelector[1])) {
			addAsText(id, record)
		} else if (structureSelectorValue.matches(structureSelector[1])) {
			addAsStructure(join, record)
		} else {
			// ignore record
		}
	} else { // no text&structure selector set, take the record if 'id' or 'join' are set
		if (id.length() > 0) {
			addAsText(id, record)
		} else if (join.length() > 0) {
			addAsStructure(join, record)
		} else {
			// ignore record
		}
	}
}
|
175 |
|
|
176 |
println "N records: "+nRecord |
|
177 |
println "N records to write: "+nRecordToWrite |
|
178 |
println "N texts to build: "+texts.size() |
|
179 |
if (texts.size() == 0) { |
|
180 |
println "No text found. Aborting." |
|
181 |
return |
|
182 |
} |
|
183 |
outputDirectory.mkdir() |
|
184 |
|
|
185 |
// Writes one TEI file per text group into outputDirectory: the first record of a group
// is the text itself, the following records become structureTag elements inside it.

// Null-safe cell access: an unset column or an empty cell reads as "".
def cellOf = { rec, column -> (column == null) ? "" : (rec.get(column) ?: "") }

// Writes the metadata columns of 'rec' as attributes of the element just opened on 'out'.
def emitMetadataAttributes = { out, rec, columns ->
	for (String att : columns) {
		if (att in dateColumnTypeList) {
			writeMetadataDate(rec, att, out)
		} else {
			out.writeAttribute(AsciiUtils.buildAttributeId(att), cellOf(rec, att))
		}
	}
}

// Writes the <metadata><list type="unordered"> summary of the given columns of 'rec'.
def emitMetadataList = { out, rec, columns ->
	out.writeStartElement("metadata")
	out.writeStartElement("list")
	out.writeAttribute("type", "unordered")
	out.writeCharacters("\n")
	for (String att : columns) {
		out.writeStartElement("item")
		out.writeCharacters(att+" : "+cellOf(rec, att).replaceAll("\n", ";"))
		out.writeEndElement() // item
		out.writeCharacters("\n")
	}
	out.writeEndElement() // list
	out.writeEndElement() // metadata
	out.writeCharacters("\n")
}

// Writes the content columns of 'rec'; plain columns become <p type="column"> and
// carry an id attribute when 'paragraphId' is not null.
def emitContentColumns = { out, rec, columns, paragraphId ->
	for (String att : columns) {
		if (att in prefixesColumnTypeList) {
			writePrefixTextContent(rec, att, out)
		} else if (att in listColumnTypeList) {
			writeListTextContent(rec, att, out)
		} else {
			out.writeStartElement("p")
			out.writeAttribute("type", att.trim())
			if (paragraphId != null) out.writeAttribute("id", paragraphId)
			out.writeCharacters("\n") // simple XML formatting

			out.writeStartElement("head")
			out.writeStartElement("hi")
			out.writeCharacters(att+" : ")
			out.writeEndElement() // hi
			out.writeEndElement() // head

			out.writeCharacters(cellOf(rec, att)) // column content
			out.writeEndElement() // p
			out.writeCharacters("\n") // simple XML formatting
		}
	}
}

try {
	for (def id : texts.keySet()) {
		def toWrite = texts[id]
		def text = toWrite[0]

		// When a text selector is set, the first record of the group must be a text line.
		String textSelectorValue = (textSelector[0] == null) ? null : cellOf(text, textSelector[0]).trim()
		if (textSelectorValue != null && !textSelectorValue.matches(textSelector[1])) {
			println "ERROR: '$id' text group with no text line"
			continue
		}

		if (debug) println "Processing text: $id"

		File outputfile = new File(outputDirectory, id+".xml")
		FileOutputStream output = new FileOutputStream(outputfile)
		XMLStreamWriter writer = null
		try {
			writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")

			writer.writeStartDocument("UTF-8","1.0")
			writer.writeCharacters("\n") // simple XML formatting
			writer.writeStartElement("TEI")
			writer.writeCharacters("\n") // simple XML formatting
			writer.writeStartElement("teiHeader")
			writer.writeEndElement() // teiHeader
			writer.writeCharacters("\n") // simple XML formatting
			writer.writeStartElement("text")
			writer.writeAttribute("id", id)
			emitMetadataAttributes(writer, text, textMetadataColumnList)
			writer.writeCharacters("\n") // simple XML formatting

			emitMetadataList(writer, text, textMetadataColumnList)
			emitContentColumns(writer, text, textContentColumnList, null)

			// the remaining records of the group are the structures of the text
			int pb_n = 1
			for (int i = 1 ; i < toWrite.size() ; i++) {
				def record = toWrite[i]

				writer.writeEmptyElement("pb") // <pb/>
				writer.writeAttribute("n", ""+pb_n++)
				writer.writeAttribute("type", "record")

				writer.writeStartElement(structureTag)
				emitMetadataAttributes(writer, record, structureMetadataColumnList)
				writer.writeCharacters("\n")

				emitMetadataList(writer, record, structureMetadataColumnList)
				// content is read from the structure record itself, not from the text record
				emitContentColumns(writer, record, structureContentColumnList, cellOf(record, textIDColumn))

				writer.writeEndElement() // struct
				writer.writeCharacters("\n") // simple XML formatting
			}

			writer.writeEndElement() // text
			writer.writeCharacters("\n") // simple XML formatting
			writer.writeEndElement() // TEI
		} finally {
			// always release the file, even when writing this text failed
			try {
				if (writer != null) writer.close()
			} finally {
				output.close()
			}
		}
	}
} finally {
	// the table reader is closed once, after all the texts have been written
	reader.close()
}
|
328 |
|
|
329 |
/**
 * Writes the content of a "list" column: each non-empty line of the cell becomes a
 * {@code <p rend="list">} element. A cell without any line break (or without any
 * non-empty line) is written as a single {@code <p rend="no-list">} element.
 *
 * @param record the current record; only {@code record.get(att)} is used
 * @param att the name of the column to write
 * @param writer the XML stream writer receiving the elements
 */
def writeListTextContent(def record, def att, def writer) {
	writer.writeCharacters("\n")

	// an empty cell is handled like an empty string
	String value = record.get(att) ?: ""

	// The last line is kept even when it is not terminated by a line break.
	def lines = value.contains("\n") ? value.split("\n").findAll { it.length() > 0 } : []

	if (lines.isEmpty()) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "no-list")
		writer.writeCharacters(value)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
		return
	}

	for (String line : lines) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "list")
		writer.writeCharacters(line)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
	}
}
|
352 |
|
|
353 |
/**
 * Writes the content of a "prefixes" column as a {@code <list rend="prefixes" type="unordered">}.
 * Each entry of the form {@code XXX: label ;} (three capital letters, optional colon, a label,
 * at least one space, then a semi-colon or a line break) becomes an {@code <item type="XXX">}
 * holding the prefix in a {@code <span>} followed by the label. When no entry is found the raw
 * cell content is written inside the list.
 *
 * @param record the current record; only {@code record.get(att)} is used
 * @param att the name of the column to write
 * @param writer the XML stream writer receiving the elements
 */
def writePrefixTextContent(def record, def att, def writer) {
	writer.writeCharacters("\n")
	writer.writeStartElement("list")
	writer.writeAttribute("rend", "prefixes")
	writer.writeAttribute("type", "unordered")
	writer.writeCharacters("\n")

	// an empty cell is handled like an empty string
	String value = record.get(att) ?: ""

	// A single pass over the cell: group 1 is the prefix, group 2 the label.
	def entryMatcher = (value =~ /(?s)([A-Z]{3}):? *([^;\n]+?) +[;\n]/)
	boolean entryFound = false
	while (entryMatcher.find()) {
		entryFound = true
		writer.writeCharacters("\t")
		writer.writeStartElement("item")
		writer.writeAttribute("type", entryMatcher.group(1))
		writer.writeStartElement("span")
		writer.writeCharacters(entryMatcher.group(1)+" ")
		writer.writeEndElement() // span
		writer.writeCharacters(entryMatcher.group(2))
		writer.writeEndElement() // item
		writer.writeCharacters("\n")
	}

	if (!entryFound) {
		writer.writeCharacters(value)
	}
	writer.writeEndElement() // list
	writer.writeCharacters("\n")
}
|
380 |
|
|
381 |
/**
 * Writes the attributes derived from a date column on the element just opened on the writer.
 * The first dd/MM/yyyy date found in the cell is split into five attributes, all prefixed with
 * the normalized column name: "jour" (day), "joursemaine" (day of the week name), "mois" (month),
 * "annee" (year) and "tri" (yyyy-MM-dd sort key).
 * When the cell holds no such date a warning is printed and the five attributes are written
 * with empty values, so that every element carries the same attribute set.
 *
 * @param record the current record; only {@code record.get(att)} is used
 * @param att the name of the date column
 * @param writer the XML stream writer receiving the attributes
 */
def writeMetadataDate(def record, def att, def writer) {
	// an empty cell is handled like an empty string
	String value = record.get(att) ?: ""
	String att_normalized = AsciiUtils.buildAttributeId(att)

	String day = ""
	String weekday = ""
	String month = ""
	String year = ""
	String sortKey = ""

	def dateMatcher = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
	if (dateMatcher.find()) {
		day = dateMatcher.group(1)
		month = dateMatcher.group(2)
		year = dateMatcher.group(3)
		// only the matched date is parsed, so text around it cannot break the parsing
		def parsedDate = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(dateMatcher.group(0))
		weekday = new java.text.SimpleDateFormat('EEEE').format(parsedDate)
		sortKey = year+"-"+month+"-"+day
	} else {
		println "** Warning: no dd/MM/yyyy date found in column '"+att+"': '"+value+"'"
	}

	writer.writeAttribute(att_normalized+"jour", day)
	writer.writeAttribute(att_normalized+"joursemaine", weekday)
	writer.writeAttribute(att_normalized+"mois", month)
	writer.writeAttribute(att_normalized+"annee", year)
	writer.writeAttribute(att_normalized+"tri", sortKey)
}
|
391 |
|
|
392 |
/* |
|
393 |
String name = inputFile.getName() |
|
394 |
int idx = name.lastIndexOf(".") |
|
395 |
if (idx > 0) name = name.substring(0, idx) |
|
396 |
*/ |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/antract/WordProperty2WordMacro.groovy (revision 3209) | ||
---|---|---|
1 |
// Copyright © 2021 ENS Lyon |
|
2 |
// Licensed under the terms of the GNU General Public License version 3 (http://www.gnu.org/licenses/gpl-3.0.html) |
|
3 |
// @author mdecorde |
|
4 |
// @author sheiden |
|
1 | 5 |
|
2 | 6 |
package org.txm.macro.annotation |
3 | 7 |
|
... | ... | |
28 | 32 |
Project project = mcorpus.getProject() |
29 | 33 |
File txmDir = new File(project.getProjectDirectory(), "txm/"+mcorpus.getName()) |
30 | 34 |
|
31 |
@Field @Option(name="outputDirectory", usage="Directory of resulting XML-TXM files", widget="Folder", required=true, def="result_directory")
|
|
35 |
@Field @Option(name="outputDirectory", usage="Directory of resulting XML-TXM files", widget="Folder", required=true, def="") |
|
32 | 36 |
File outputDirectory |
33 | 37 |
|
34 |
@Field @Option(name="word_property", usage="The word property to move", widget="String", required=true, def="type")
|
|
38 |
@Field @Option(name="word_property", usage="The word property to project", widget="String", required=true, def="type")
|
|
35 | 39 |
String word_property |
36 | 40 |
|
37 |
@Field @Option(name="word_property_value_to_ignore", usage="The property value to ignore", widget="String", required=true, def="__undef__")
|
|
38 |
String word_property_value_to_ignore
|
|
41 |
@Field @Option(name="values_to_ignore_regex", usage="regex of values not to project", widget="String", required=true, def="")
|
|
42 |
String values_to_ignore_regex
|
|
39 | 43 |
|
44 |
@Field @Option(name="debug", usage="Afficher les messages de debug", widget="Boolean", required=true, def="false") |
|
45 |
def debug |
|
46 |
|
|
40 | 47 |
// Open the parameters input dialog box |
41 | 48 |
if (!ParametersDialog.open(this)) return |
42 | 49 |
|
... | ... | |
45 | 52 |
return false |
46 | 53 |
} |
47 | 54 |
|
55 |
values_to_ignore_regex = /$values_to_ignore_regex/ |
|
56 |
|
|
48 | 57 |
for (Text text : project.getChildren(Text.class)) { |
49 | 58 |
println "Text: "+text |
50 | 59 |
|
... | ... | |
62 | 71 |
} |
63 | 72 |
|
64 | 73 |
XMLProcessor xp = new XMLProcessor(orig); |
65 |
LocalNameHookActivator activator = new LocalNameHookActivator("w");
|
|
74 |
LocalNameHookActivator activator = new LocalNameHookActivator("w") |
|
66 | 75 |
new DOMIdentityHook("word", activator, xp) { |
67 | 76 |
@Override |
68 | 77 |
public void processDom() { |
69 | 78 |
//println dom |
70 | 79 |
use(groovy.xml.dom.DOMCategory) { |
71 | 80 |
def form = dom.form[0] |
72 |
def anatype = dom.ana.findAll(){ a -> a['@type'] == "#$word_property"} |
|
81 |
def anatype = dom.ana.findAll(){ a -> |
|
82 |
if (debug && (a['@type'] == "#$word_property")) { |
|
83 |
println 'a["@type"] = '+a["@type"] |
|
84 |
println 'a.text() = '+a.text() |
|
85 |
println 'a["@type"] == "#$word_property"'+" = "+(a['@type'] == "#$word_property") |
|
86 |
println "values_to_ignore_regex = "+values_to_ignore_regex |
|
87 |
println '!(a.text() ==~ values_to_ignore_regex))'+" = "+(!(a.text() ==~ values_to_ignore_regex)) |
|
88 |
println '((a["@type"] == "#$word_property") && !(a.text() ==~ values_to_ignore_regex)) = '+((a['@type'] == "#$word_property") && !(a.text() ==~ values_to_ignore_regex)) |
|
89 |
} |
|
90 |
return ((a['@type'] == "#$word_property") && !(a.text() ==~ values_to_ignore_regex)) |
|
91 |
} |
|
73 | 92 |
// println form.text()+" -> $anatype" |
74 | 93 |
if (anatype.size() > 0) { // keep the word |
75 | 94 |
anatype = anatype[0] |
... | ... | |
78 | 97 |
anatype['@type'] = "#form" |
79 | 98 |
anatype.setTextContent(s) |
80 | 99 |
} else { |
81 |
dom = null;
|
|
100 |
dom = null |
|
82 | 101 |
} |
83 | 102 |
} |
84 | 103 |
} |
85 | 104 |
} |
86 | 105 |
xp.process(result) |
87 |
} |
|
106 |
} |
Formats disponibles : Unified diff