tei2teiHM.py
1 |
#!/usr/bin/python
|
---|---|
2 |
# -*- coding: utf-8 -*-
|
3 |
|
4 |
#import xlrd
|
5 |
#import pandas as pd
|
6 |
import os |
7 |
import re |
8 |
import unicodedata |
9 |
import sys |
10 |
import csv |
11 |
import xml.etree.ElementTree as ET |
12 |
import prettify |
13 |
|
14 |
from os import listdir, rename |
15 |
from os.path import isfile, join |
16 |
|
17 |
# DOM parser used to read the input XML file
|
18 |
from xml.dom import minidom |
19 |
|
20 |
## les paramètres :
|
21 |
## Command-line parameters: <xml-file> <base-dir>.
if len(sys.argv) < 3:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit("usage: %s <xml-file> <base-dir>" % sys.argv[0])

myxmlfile = sys.argv[1]  # name of the input XML file, relative to mypath
mypath = sys.argv[2]     # directory both the input and the output are joined to

print("L'encodage du système python: %s" % sys.getdefaultencoding())
print("L'encodage des fichiers :  %s" % sys.getfilesystemencoding())

# Legacy Python 2 workaround (discouraged): makes implicit conversions
# between byte strings and unicode use UTF-8 instead of ASCII.
# `reload` and `sys.setdefaultencoding` do not exist on Python 3, so the
# hack is only applied where it is available.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf8')

inputxmlfile = join(mypath, myxmlfile)  # input XML file
print("InputFile XML :  %s" % inputxmlfile)
resFrfile = join(mypath, 'newFr.xml')   # output XML file
print("OutputFile XML :  %s" % resFrfile)
35 |
|
36 |
|
37 |
def findTextChild(node, strText):
    """Append the text found under a DOM node to an accumulator.

    The node and its descendants are visited in document order. Text
    nodes are appended to ``strText``; ``note`` elements are skipped
    together with everything they contain.

    node    -- a DOM node (as produced by xml.dom.minidom)
    strText -- text accumulated so far, or None when nothing was found yet

    Returns the accumulator extended with the text that was found; the
    accumulator is returned unchanged (possibly None) when there is none.
    """
    nodeName = node.nodeName

    if nodeName == 'note':
        # Footnotes are not part of the running text.
        return strText

    if nodeName == '#text':
        piece = node.nodeValue
        # Python 2 hands back `unicode`: turn it into a UTF-8 `str` as before.
        # On Python 3 the value already is a `str` and must stay one,
        # otherwise the concatenation below would mix bytes and text.
        if not isinstance(piece, str):
            piece = piece.encode('utf-8')
        if strText is None:
            return piece
        return str(strText) + piece

    # Any other node: collect from the children (none for leaf nodes).
    for child in node.childNodes:
        strText = findTextChild(child, strText)
    return strText
|
55 |
|
56 |
|
57 |
def findText(childCell, titlecell):
    """Collect the text of a cell, appended to ``titlecell``.

    Direct text children are appended as they come; every other child is
    handed to findTextChild(), which descends into it.

    childCell -- the DOM node whose content is read
    titlecell -- text accumulated so far, or None

    Returns the extended accumulator (unchanged when the cell is empty).
    """
    for child in childCell.childNodes:
        if child.nodeName == '#text':
            piece = child.nodeValue
            # Python 2 hands back `unicode`: turn it into a UTF-8 `str` as
            # before. On Python 3 it already is a `str` and must stay one,
            # otherwise the concatenation below would mix bytes and text.
            if not isinstance(piece, str):
                piece = piece.encode('utf-8')
            if titlecell is None:
                titlecell = piece
            else:
                titlecell = str(titlecell) + piece
        else:
            titlecell = findTextChild(child, titlecell)
    return titlecell
|
70 |
|
71 |
#<text name="1550_Torrentiniana">
|
72 |
# <body>
|
73 |
# <div xml:id="t1_Bo" type="Book" rend="Oeuvre">
|
74 |
# <seg xml:id="t1_Seg1" type="Segment" n="1" rend="Segment 1">PITTORE SCULTORE ET ARCHITETTO</seg>
|
75 |
|
76 |
#<text name="Azpilcueta_pt_1549">
|
77 |
# <front type="Prologue" xml:id="p1_Pr">
|
78 |
# <head type="Title" xml:id="p1_Pr-Seg0">PREFACE TO THE READER, Portuguese Edition, 1549</head>
|
79 |
# <seg n="1" type="Segment" xml:id="p1_Pr-Seg1">El doctor Martin de Azpilcueta Nauarro al lector</seg>
|
80 |
|
81 |
# xml:id="i1_Ch10-Seg12"
|
82 |
|
83 |
def buildHead(elRoot, textId, divId, textHead):
    """Append a <head type="Title"> element to ``elRoot`` and return it.

    elRoot   -- parent ElementTree element
    textId   -- identifier of the text, e.g. "it1"
    divId    -- identifier of the division, e.g. "Ch10"
    textHead -- content of the heading (converted with str())

    The xml:id is "<textId>_<divId>-Seg0".
    """
    # Tag, text and attribute values are passed as plain strings: encoding
    # them to UTF-8 first gives ElementTree bytes objects on Python 3, which
    # it cannot serialise.
    headEl = ET.SubElement(elRoot, "head")
    headEl.text = str(textHead)
    headEl.set("type", "Title")
    headEl.set("xml:id", textId + "_" + divId + "-Seg0")
    return headEl
|
101 |
|
102 |
def buildDivision(elRoot, textId, numId):
    """Append a <div type="Chapitre"> element to ``elRoot`` and return it.

    elRoot -- parent ElementTree element
    textId -- identifier of the text, e.g. "it1"
    numId  -- division number, as a string

    The element gets n="<numId>" and xml:id="<textId>_Ch<numId>".
    """
    # Plain strings throughout: UTF-8 encoding the tag and attribute values
    # produces bytes on Python 3, which ElementTree cannot serialise.
    divEl = ET.SubElement(elRoot, "div")
    divEl.set("n", numId)
    divEl.set("type", "Chapitre")
    divEl.set("xml:id", textId + "_" + "Ch" + numId)
    return divEl
|
118 |
|
119 |
def buildSegment(elRoot, textId, divId, textSeg, numSeg):
    """Append a <seg type="Segment"> element to ``elRoot`` and return it.

    elRoot  -- parent ElementTree element
    textId  -- identifier of the text, e.g. "it1"
    divId   -- identifier of the division, e.g. "Ch10"
    textSeg -- content of the segment (converted with str())
    numSeg  -- segment number, as a string

    The element gets n="<numSeg>" and
    xml:id="<textId>_<divId>-Seg<numSeg>".
    """
    # Plain strings throughout: UTF-8 encoding the tag, text and attribute
    # values produces bytes on Python 3, which ElementTree cannot serialise.
    segEl = ET.SubElement(elRoot, "seg")
    segEl.text = str(textSeg)
    segEl.set("n", numSeg)
    segEl.set("type", "Segment")
    segEl.set("xml:id", textId + "_" + divId + "-Seg" + numSeg)
    return segEl
|
139 |
|
140 |
|
141 |
def buildMetadataCorpus(elRoot):
    """Append the corpus-level <teiHeader> to ``elRoot``.

    The header holds a <fileDesc> (editionStmt, extent, publicationStmt,
    sourceDesc) followed by an <encodingDesc> (projectDesc, editorialDecl).
    All values are hard-coded, several of them placeholders.
    Nothing is returned.
    """
    def add(parent, tag, text=None):
        # Create <tag> under parent, optionally with text content.
        el = ET.SubElement(parent, tag)
        if text is not None:
            el.text = text
        return el

    headerEl = add(elRoot, 'teiHeader')
    fileDescEl = add(headerEl, 'fileDesc')

    #### <editionStmt>
    add(add(fileDescEl, 'editionStmt'), 'p', 'à donner des détails')

    #### <extent>
    add(add(fileDescEl, 'extent'), 'p', '?? segments alignés, ?? mots')

    #### <publicationStmt>
    publicationStmtEl = add(fileDescEl, 'publicationStmt')
    add(publicationStmtEl, 'date', '2017')
    add(publicationStmtEl, 'pubPlace', 'Tours')
    add(publicationStmtEl, 'publisher', 'Centre de Renaissance de Tours')

    #### <sourceDesc>
    sourceDescEl = add(fileDescEl, 'sourceDesc')
    # The tag used to be 'biblStructEl': the "El" suffix of the variable
    # name had leaked into the element name.
    biblStructEl = add(sourceDescEl, 'biblStruct')
    monogrEl = add(biblStructEl, 'monogr')

    add(monogrEl, 'author', 'Toshinori Uetano')
    add(monogrEl, 'title', 'Corpus parallèle de Arcardia, de Jacopo Sannazaro')
    respStmtEl = add(monogrEl, 'respStmt')
    add(respStmtEl, 'resp', '??')
    add(respStmtEl, 'name', '??')
    imprintEl = add(monogrEl, 'imprint')
    add(imprintEl, 'publisher', "???")
    add(imprintEl, 'date', "2017")

    #### <encodingDesc>
    encodingDescEl = add(headerEl, 'encodingDesc')
    add(add(encodingDescEl, 'projectDesc'), 'p',
        'Le corpus est le résultat d\'une compilation de textes etc...')

    editorialDeclEl = add(encodingDescEl, 'editorialDecl')
    add(add(editorialDeclEl, 'correction'), 'p',
        'Le travail éditorial consiste à ...')
    add(add(editorialDeclEl, 'segmentation'), 'p',
        'La division du texte en segments philologiques correspond à ...')
|
198 |
|
199 |
|
200 |
def buildMetadata(elRoot, type):
    """Append the <teiHeader> of one edition to ``elRoot``.

    elRoot -- parent ElementTree element (the <TEI> of the edition)
    type   -- language code of the edition: 'it' or 'fr'

    For any other code a message is printed and <monogr> is left empty;
    the rest of the header is still built, with the code copied into
    <language ident="...">. Nothing is returned.
    """
    # Bibliographic description of each known edition.
    editions = {
        'it': {
            'id': "it1",
            'title': 'Arcadia',
            'author': 'Francesco Erspamer',
            'editor': 'Alde ??',
            'publisher': "édition de Venise, Alde ?",
            'date': "1505",
        },
        'fr': {
            'id': "fr1",
            'title': 'Arcadie',
            'author': 'Jehan Martin',
            'editor': 'De Vascosan et Corrozet ?',
            'publisher': "édition de Michel de Vascosan pour luy et Gilles Corrozet ?",
            'date': "1544",
        },
    }

    def add(parent, tag, text=None):
        # Create <tag> under parent, optionally with text content.
        el = ET.SubElement(parent, tag)
        if text is not None:
            el.text = text
        return el

    headerEl = add(elRoot, 'teiHeader')
    fileDescEl = add(headerEl, 'fileDesc')

    #### <titleStmt>
    add(add(fileDescEl, 'titleStmt'), 'title', '????')

    #### <extent>
    add(fileDescEl, 'extent', 'Taille ....')

    #### <publicationStmt> -- children are created empty
    publicationStmtEl = add(fileDescEl, 'publicationStmt')
    for tag in ('date', 'publisher', 'address'):
        add(publicationStmtEl, tag)

    #### <sourceDesc>
    sourceDescEl = add(fileDescEl, 'sourceDesc')
    # The tag used to be 'biblStructEl': the "El" suffix of the variable
    # name had leaked into the element name.
    biblStructEl = add(sourceDescEl, 'biblStruct')
    monogrEl = add(biblStructEl, 'monogr')

    edition = editions.get(type)
    if edition is None:
        print("NO OTHER TEXT normally ?")
    else:
        monogrEl.set("ana", "16e")
        monogrEl.set("lang", type)
        monogrEl.set("xml:lang", type)
        monogrEl.set("xml:id", edition['id'])
        add(monogrEl, 'title', edition['title'])
        add(monogrEl, 'author', edition['author']).set("xml:lang", type)
        add(monogrEl, 'editor', edition['editor'])
        imprintEl = add(monogrEl, 'imprint')
        add(imprintEl, 'publisher', edition['publisher'])
        add(imprintEl, 'date', edition['date'])

    #### <encodingDesc>
    encodingDescEl = add(headerEl, 'encodingDesc')
    add(add(encodingDescEl, 'editorialDecl'), 'normalization')

    #### <profileDesc>
    langUsageEl = add(add(headerEl, 'profileDesc'), 'langUsage')
    add(langUsageEl, 'language').set("ident", type)
275 |
|
276 |
|
277 |
|
278 |
def buildText(elRoot, type):
    """Append <text><body/></text> to ``elRoot`` and return the <body>.

    elRoot -- parent ElementTree element
    type   -- language code of the edition: 'it' or 'fr'

    The <text> element gets the @name of the edition; for any other code
    a message is printed and no @name is set.
    """
    names = {'it': "Erspamer_1505", 'fr': "Martin_1544"}

    textEl = ET.SubElement(elRoot, 'text')
    name = names.get(type)
    if name is None:
        # Single-argument print: same output on Python 2 and Python 3.
        print("No attribute NAME here !!!!")
    else:
        textEl.set("name", name)
    return ET.SubElement(textEl, 'body')
|
290 |
|
291 |
|
292 |
def _digitRange(text):
    """Keep only the digits of each '-'-separated part of ``text``.

    The parts are glued back together with a single '-' in front of the
    second one, e.g. " vv. 1-12" gives "1-12" and " 17" gives "17".
    """
    pieces = [re.sub(r'[^0-9]+', '', piece) for piece in text.split('-')]
    if len(pieces) > 1:
        pieces[1] = '-' + pieces[1]
    return ''.join(pieces)


def findTitleSeg(title):
    """Normalise the title of a row into an identifier-like string.

    - "Name, numbers": spaces of the part before the first comma become
      "_", the part after it is reduced to its digits (and range dash),
      and both are joined with "_". Anything after a second comma is
      ignored. Example: "Ecloga II, vv. 1-12" gives "Ecloga_II_1-12".
    - no comma but a "-": only the digits and the range dash are kept.
    - otherwise: spaces are replaced by "_".

    Returns '' for an empty or None title.
    """
    if not title:
        return ''

    if ',' in title:
        # split() on a string that contains ',' always yields two parts or
        # more, so the former "fewer than two parts" fallback (which used an
        # undefined name) could never run and has been removed.
        parts = title.split(',')
        return parts[0].replace(' ', '_') + "_" + _digitRange(parts[1])

    if '-' in title:
        return _digitRange(title)

    return title.replace(' ', '_')
|
342 |
|
343 |
###### Preparing the bitext TEI output file #######
|
344 |
# Root of the output document, with the corpus-level header.
# NOTE(review): TEI P5 spells this element "teiCorpus" -- confirm whether
# the capitalised form is intentional before changing it.
elRootCorpus = ET.Element("TeiCorpus")
buildMetadataCorpus(elRootCorpus)


def _openEdition(lang):
    # Add one <TEI> (header + empty text body) to the corpus.
    # Returns the <TEI> element and its <body>.
    teiEl = ET.SubElement(elRootCorpus, "TEI")
    buildMetadata(teiEl, lang)
    return teiEl, buildText(teiEl, lang)


# Italian edition
textIdIt = "it1"
elRootIt, elTextIt = _openEdition("it")

# French edition
textIdFr = "fr1"
elRootFr, elTextFr = _openEdition("fr")
|
358 |
|
359 |
|
360 |
##############################
|
361 |
# PARSING du fichier XML #
|
362 |
# mapping entre les titres #
|
363 |
# et ajout de l'id dans XML #
|
364 |
##############################
|
365 |
|
366 |
#<row>
|
367 |
# <cell>Ecloga II, vv. 1-12</cell>
|
368 |
# <cell>
|
369 |
# <p rend="justify">
|
370 |
# <hi rend="italic">Itene</hi>
|
371 |
# <hi rend="italic">
|
372 |
# <note xml:id="ftn121" place="foot" n="121"> <hi rend="italic">itene</hi> : andatevene. Cfr. Conti, <hi rend="italic">Canzoniere</hi>, CXLIX, 7-8 : “Itene a casa, et noi lassate al bosco / pasciute pecorelle” (Erspamer).</note>
|
373 |
# </hi>
|
374 |
# <hi rend="italic"> all'ombra degli </hi>
|
375 |
# <hi rend="color(#ff0000)italic">ameni faggi</hi>
|
376 |
# <hi rend="italic">,</hi></p>
|
377 |
# <p rend="justify italic">pasciute pecorelle, omai che 'l sole</p>
|
378 |
# <p rend="justify italic">sul mezzo giorno indrizza i caldi raggi.</p>
|
379 |
# <p rend="justify">
|
380 |
# <hi rend="italic">Ivi udirete </hi>
|
381 |
# <hi rend="bold italic">l'alte</hi>
|
382 |
# <hi rend="italic">
|
383 |
# <note xml:id="ftn122" place="foot" n="122"> <hi rend="italic">alte</hi> : pronunciate a voce alta (piú che 'nobili' : lo stile pastorale è infatti umile per definizione ; cfr. P, 4 ; IIIe, 29 ; X, 19) (Erspamer).</note>
|
384 |
# </hi>
|
385 |
# <hi rend="bold italic"> mie parole</hi>
|
386 |
# </p>
|
387 |
|
388 |
|
389 |
#### READING the XML TEI file #####
|
390 |
#### Read the input XML file and copy every table into both editions.
# Each <table> becomes one <div> per edition. Inside a <row>, the cells are
# taken by position: 0 = title, 1 = Italian text, 2 = French text.
doc = minidom.parse(inputxmlfile)
root = doc.documentElement

numTable = 1
for table in root.getElementsByTagName('table'):
    divId = "Ch" + str(numTable)
    elDivIt = buildDivision(elTextIt, textIdIt, str(numTable))
    elDivFr = buildDivision(elTextFr, textIdFr, str(numTable))

    # Only direct <row> children that have content are processed and counted.
    numRow = 1
    for childRow in table.childNodes:
        if childRow.nodeName != 'row' or not childRow.hasChildNodes():
            continue

        # Only <cell> children are counted (whitespace text nodes are not).
        numCell = 0
        for childCell in childRow.childNodes:
            if childCell.nodeName != 'cell':
                continue

            if numCell == 0:  ### title
                titleorigin = findText(childCell, u"")
                ## Example: "A la Sampogna, 17-18"
                titlecell = findTitleSeg(titleorigin)
                print("TITRE ;  %s ; %s" % (titleorigin, titlecell))
                if titlecell:
                    # NOTE(review): every titled row adds a <head> whose
                    # xml:id ends in "-Seg0", so a table with several rows
                    # gets repeated ids -- confirm this is expected.
                    buildHead(elDivIt, textIdIt, divId, titlecell)
                    buildHead(elDivFr, textIdFr, divId, titlecell)

            elif numCell == 1:  ### IT
                textit = findText(childCell, u"")
                buildSegment(elDivIt, textIdIt, divId, textit, str(numRow))

            elif numCell == 2:  ### FR
                textfr = findText(childCell, u"")
                buildSegment(elDivFr, textIdFr, divId, textfr, str(numRow))

            else:
                # Unexpected extra cell: report its text, do not export it.
                print("[what cell] ???????????")
                print("OTHER ?  %s" % findText(childCell, u""))

            numCell = numCell + 1
        numRow = numRow + 1

    print("Chapitre  %s" % numTable)
    numTable = numTable + 1

# Write the corpus as UTF-8 with an XML declaration. (The former
# `resFrfile.encode("utf-8")` discarded its result and had no effect on
# the encoding of the output.)
treeFr = ET.ElementTree(elRootCorpus)
treeFr.write(resFrfile, encoding="utf-8", xml_declaration=True)
458 |
#ET.tostring(treeFr, encoding='utf-8', method="xml")
|