tei2teiHM.py

Severine Gedzelman, 26/09/2017 15:00

Télécharger (15,09 ko)

 
1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
3

    
4
#import xlrd
5
#import pandas as pd  
6
import os
7
import re
8
import unicodedata
9
import sys
10
import csv
11
import xml.etree.ElementTree as ET
12
import prettify
13

    
14
from os import listdir, rename
15
from os.path import isfile, join
16

    
17
# ouverture du fichier Excel 
18
from xml.dom import minidom
19

    
20
## Command-line parameters:
##   argv[1] -- relative path / name of the input XML file
##   argv[2] -- system path used as base directory: the input file is looked
##              up under it and the result file is written into it
if len(sys.argv) < 3:
    # Stop with a usage line instead of a bare IndexError traceback.
    sys.exit("Usage: %s <input-xml-file> <base-directory>" % sys.argv[0])
myxmlfile = sys.argv[1]
mypath = sys.argv[2]

# Diagnostic output only. Single-argument print(...) behaves the same as a
# statement (Python 2) and as a function (Python 3).
print("L'encodage du système python: %s" % sys.getdefaultencoding())
print("L'encodage des fichiers :  %s" % sys.getfilesystemencoding())

# HACK: changing the interpreter-wide default encoding is discouraged (the
# original author noted this as well), but code in this file mixes UTF-8 byte
# strings and unicode objects and relies on implicit conversions using UTF-8.
# reload() and sys.setdefaultencoding() only exist on Python 2, hence the guard.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf8')

inputxmlfile = join(mypath, myxmlfile)  # input XML file
print("InputFile XML :  %s" % inputxmlfile)
resFrfile = join(mypath, 'newFr.xml')   # result (output) XML file
print("OutputFile XML :  %s" % resFrfile)
35

    
36

    
37
def findTextChild(node, strText):
    """Append the text found in *node* to *strText* and return the result.

    - A text node (nodeName '#text') contributes its value.
    - A 'note' element is skipped together with everything it contains.
    - Any other node is searched recursively, child by child.

    node    -- DOM node; only nodeName, nodeValue, hasChildNodes() and
               childNodes are used.
    strText -- text accumulated so far, or None if nothing was collected yet.

    Returns the accumulated text. The accumulator is returned unchanged when
    *node* contributes nothing, so an empty element or a comment between two
    text runs no longer wipes out what was collected before it.
    """
    nodeKind = node.nodeName

    if nodeKind == 'note':
        # Footnotes are not part of the running text.
        return strText

    if nodeKind == '#text':
        piece = node.nodeValue
        # Work with the native str type: on Python 2 a unicode value is
        # turned into a UTF-8 byte string (as before); a value that already
        # is a str is used as it is.
        if not isinstance(piece, str):
            piece = piece.encode('utf-8')
        if strText is None:
            return piece
        return str(strText) + piece

    if node.hasChildNodes():
        for child in node.childNodes:
            strText = findTextChild(child, strText)
    # Always hand the accumulator back, also for childless nodes.
    return strText
55

    
56

    
57
def findText(childCell, titlecell):
    """Collect the text of every child of *childCell*, appended to *titlecell*.

    childCell -- DOM node whose children are scanned (the node itself is not
                 inspected, only its children).
    titlecell -- text accumulated so far, or None.

    Each child is handed to findTextChild(), which appends text nodes, skips
    'note' elements and descends into every other element. The value returned
    for the last child is the result; *titlecell* is returned unchanged when
    there are no children.
    """
    if not childCell.hasChildNodes():
        return titlecell

    collected = titlecell
    for part in childCell.childNodes:
        # findTextChild() treats a '#text' child exactly like the inline
        # branch this function used to carry, so one call covers both cases.
        collected = findTextChild(part, collected)
    return collected
70

    
71
#<text name="1550_Torrentiniana">
72
#    <body>
73
#      <div xml:id="t1_Bo" type="Book" rend="Oeuvre">
74
#          <seg xml:id="t1_Seg1" type="Segment" n="1" rend="Segment 1">PITTORE SCULTORE ET ARCHITETTO</seg>
75

    
76
#<text name="Azpilcueta_pt_1549">
77
#  <front type="Prologue" xml:id="p1_Pr">
78
#      <head type="Title" xml:id="p1_Pr-Seg0">PREFACE TO THE READER, Portuguese Edition, 1549</head>
79
#          <seg n="1" type="Segment" xml:id="p1_Pr-Seg1">El doctor Martin de Azpilcueta Nauarro al lector</seg>
80
      
81
#          xml:id="i1_Ch10-Seg12"
82

    
83
def buildHead(elRoot, textId, divId, textHead):
    """Append a <head type="Title"> element to *elRoot* and return it.

    elRoot   -- parent ElementTree element.
    textId   -- identifier of the text (string), first part of the xml:id.
    divId    -- identifier of the division (string), second part of the xml:id.
    textHead -- content of the head; converted with str().

    The xml:id is '<textId>_<divId>-Seg0'.
    NOTE(review): the id does not depend on the position of the head, so
    several heads created with the same textId/divId share one xml:id --
    confirm that this is acceptable for the consumer of the file.
    """
    headId = textId + "_" + divId + "-Seg0"

    headEl = ET.SubElement(elRoot, "head")
    # Plain strings are stored as they are. The former .encode('utf-8') calls
    # were no-op round trips on Python 2 and would store bytes objects, which
    # ElementTree cannot serialize, on Python 3.
    headEl.text = str(textHead)
    headEl.set("type", "Title")
    headEl.set("xml:id", headId)
    return headEl
101

    
102
def buildDivision(elRoot, textId, numId):
    """Append a <div type="Chapitre"> element to *elRoot* and return it.

    elRoot -- parent ElementTree element.
    textId -- identifier of the text (string), prefix of the xml:id.
    numId  -- number of the division, given as a string; used for @n and
              inside the xml:id.

    The xml:id is '<textId>_Ch<numId>'.
    """
    divId = textId + "_" + "Ch" + numId

    divEl = ET.SubElement(elRoot, "div")
    # Attribute values are stored as plain strings. Encoding them to UTF-8
    # bytes first is a no-op on Python 2 and breaks serialization on Python 3.
    divEl.set("n", numId)
    divEl.set("type", "Chapitre")
    divEl.set("xml:id", divId)
    return divEl
118

    
119
def buildSegment(elRoot, textId, divId, textSeg, numSeg):
    """Append a <seg type="Segment"> element to *elRoot* and return it.

    elRoot  -- parent ElementTree element.
    textId  -- identifier of the text (string), first part of the xml:id.
    divId   -- identifier of the division (string), second part of the xml:id.
    textSeg -- content of the segment; converted with str().
    numSeg  -- number of the segment, given as a string; used for @n and as
               the suffix of the xml:id.

    The xml:id is '<textId>_<divId>-Seg<numSeg>'.
    """
    segId = textId + "_" + divId + "-Seg" + numSeg

    segEl = ET.SubElement(elRoot, "seg")
    # Plain strings are stored as they are. The former .encode('utf-8') calls
    # were no-op round trips on Python 2 and would store bytes objects, which
    # ElementTree cannot serialize, on Python 3.
    segEl.text = str(textSeg)
    segEl.set("n", numSeg)
    segEl.set("type", "Segment")
    segEl.set("xml:id", segId)
    return segEl
139
  
140

    
141
def buildMetadataCorpus(elRoot):
    """Append the corpus-level <teiHeader> (fixed placeholder data) to *elRoot*.

    elRoot -- ElementTree element that receives the header.

    The header consists of a <fileDesc> (editionStmt, extent, publicationStmt,
    sourceDesc) followed by an <encodingDesc> (projectDesc, editorialDecl).
    Nothing is returned.
    """
    def add(parent, tag, text=None):
        # Create <tag> under parent; set its text only when one is given.
        child = ET.SubElement(parent, tag)
        if text is not None:
            child.text = text
        return child

    header = add(elRoot, 'teiHeader')
    fileDesc = add(header, 'fileDesc')

    #### <editionStmt>
    add(add(fileDesc, 'editionStmt'), 'p', 'à donner des détails')

    #### <extent>
    add(add(fileDesc, 'extent'), 'p', '?? segments alignés, ?? mots')

    #### <publicationStmt>
    publication = add(fileDesc, 'publicationStmt')
    add(publication, 'date', '2017')
    add(publication, 'pubPlace', 'Tours')
    add(publication, 'publisher', 'Centre de Renaissance de Tours')

    #### <sourceDesc>
    # NOTE(review): the tag name 'biblStructEl' looks like a slip for the TEI
    # element 'biblStruct'; kept as it is because it is part of the output.
    bibl = add(add(fileDesc, 'sourceDesc'), 'biblStructEl')
    monogr = add(bibl, 'monogr')
    add(monogr, 'author', 'Toshinori Uetano')
    add(monogr, 'title', 'Corpus parallèle de Arcardia, de Jacopo Sannazaro')
    responsibility = add(monogr, 'respStmt')
    add(responsibility, 'resp', '??')
    add(responsibility, 'name', '??')
    imprint = add(monogr, 'imprint')
    add(imprint, 'publisher', "???")
    add(imprint, 'date', "2017")

    #### <encodingDesc>
    encoding = add(header, 'encodingDesc')
    add(add(encoding, 'projectDesc'), 'p',
        "Le corpus est le résultat d'une compilation de textes etc...")

    editorial = add(encoding, 'editorialDecl')
    add(add(editorial, 'correction'), 'p',
        'Le travail éditorial consiste à ...')
    add(add(editorial, 'segmentation'), 'p',
        'La division du texte en segments philologiques correspond à ...')
198

    
199

    
200
def buildMetadata(elRoot, type):
    """Append the <teiHeader> of one text of the corpus to *elRoot*.

    elRoot -- ElementTree element (the <TEI> element of the text).
    type   -- language code of the text. 'it' and 'fr' select the
              bibliographic description written into <monogr>; for any other
              value <monogr> stays empty and a message is printed. (The name
              shadows the builtin; it is kept because it is part of the
              signature.)

    The value of *type* is also written to <language ident="...">.
    Nothing is returned.
    """
    header = ET.SubElement(elRoot, 'teiHeader')
    fileDesc = ET.SubElement(header, 'fileDesc')

    #### <titleStmt>
    titleStmt = ET.SubElement(fileDesc, 'titleStmt')
    ET.SubElement(titleStmt, 'title').text = '????'

    #### <extent>
    ET.SubElement(fileDesc, 'extent').text = 'Taille ....'

    #### <publicationStmt>: three children that are left empty
    publicationStmt = ET.SubElement(fileDesc, 'publicationStmt')
    for emptyTag in ('date', 'publisher', 'address'):
        ET.SubElement(publicationStmt, emptyTag)

    #### <sourceDesc>
    # NOTE(review): the tag name 'biblStructEl' looks like a slip for the TEI
    # element 'biblStruct'; kept as it is because it is part of the output.
    sourceDesc = ET.SubElement(fileDesc, 'sourceDesc')
    biblStruct = ET.SubElement(sourceDesc, 'biblStructEl')
    monogr = ET.SubElement(biblStruct, 'monogr')

    # (language, xml:id, title, author, editor, publisher, date)
    knownEditions = (
        ('it', "it1", 'Arcadia', 'Francesco Erspamer', 'Alde ??',
         "édition de Venise, Alde ?", "1505"),
        ('fr', "fr1", 'Arcadie', 'Jehan Martin', 'De Vascosan et Corrozet ?',
         "édition de Michel de Vascosan pour luy et Gilles Corrozet ?",
         "1544"),
    )
    for lang, xmlId, title, author, editor, publisher, year in knownEditions:
        if type == lang:
            monogr.set("ana", "16e")
            monogr.set("lang", lang)
            monogr.set("xml:lang", lang)
            monogr.set("xml:id", xmlId)
            ET.SubElement(monogr, 'title').text = title
            authorEl = ET.SubElement(monogr, 'author')
            authorEl.text = author
            authorEl.set("xml:lang", lang)
            ET.SubElement(monogr, 'editor').text = editor
            imprint = ET.SubElement(monogr, 'imprint')
            ET.SubElement(imprint, 'publisher').text = publisher
            ET.SubElement(imprint, 'date').text = year
            break
    else:
        # No edition matched the requested language.
        print("NO OTHER TEXT normally ?")

    #### <encodingDesc>
    encodingDesc = ET.SubElement(header, 'encodingDesc')
    editorialDecl = ET.SubElement(encodingDesc, 'editorialDecl')
    ET.SubElement(editorialDecl, 'normalization')

    #### <profileDesc>
    profileDesc = ET.SubElement(header, 'profileDesc')
    langUsage = ET.SubElement(profileDesc, 'langUsage')
    ET.SubElement(langUsage, 'language').set("ident", type)
275

    
276

    
277

    
278
def buildText(elRoot, type):
    """Append <text><body/></text> to *elRoot* and return the <body> element.

    elRoot -- ElementTree element (the <TEI> element of the text).
    type   -- language code; 'it' and 'fr' select the value of the @name
              attribute of <text>. For any other value the attribute is not
              set and a message is printed.
    """
    textEl = ET.SubElement(elRoot, 'text')

    for lang, textName in (('it', "Erspamer_1505"), ('fr', "Martin_1544")):
        if type == lang:
            textEl.set("name", textName)
            break
    else:
        print("No attribute NAME here !!!!")

    return ET.SubElement(textEl, 'body')
290

    
291

    
292
def findTitleSeg(title):
    """Turn the title cell of a row into a compact identifier-like string.

    title -- text of the title cell, e.g. 'Ecloga II, vv. 1-12'; may be empty
             or None.

    Rules:
    - with a comma: spaces of the part before the first comma become '_';
      of the part between the first and the second comma only the digits are
      kept (plus one '-', see below); both parts are joined with '_'.
      Anything after a second comma is ignored.
    - without a comma but with a '-': only digits and one '-' are kept.
    - otherwise: spaces become '_'.

    Returns '' for an empty or None title.
    """
    def digitsOnly(text):
        # Keep the digits of every '-'-separated part. The '-' is put back
        # between the first and the second part only; further parts are
        # appended without a separator.
        parts = [re.sub('[^0-9]+', '', part) for part in text.split('-')]
        if len(parts) == 1:
            return parts[0]
        return parts[0] + '-' + ''.join(parts[1:])

    if not title:
        return ''

    if ',' in title:
        # split() yields at least two items here, so no fallback is needed
        # (the former one was unreachable and used an undefined name).
        pieces = title.split(',')
        return pieces[0].replace(' ', '_') + "_" + digitsOnly(pieces[1])

    if '-' in title:
        return digitsOnly(title)

    return title.replace(' ', '_')
342

    
343
###### Preparing the bitext TEI output file #######
344
def _openTei(corpusEl, lang):
    # Add one <TEI> element for the given language to the corpus, fill in its
    # header and return the pair (<TEI> element, <body> element).
    teiEl = ET.SubElement(corpusEl, "TEI")
    buildMetadata(teiEl, lang)
    return teiEl, buildText(teiEl, lang)


# Root of the output document, with the corpus-level header.
# NOTE(review): TEI spells this element 'teiCorpus'; the name is kept as it
# is because it is part of the output -- confirm what the consumer expects.
elRootCorpus = ET.Element("TeiCorpus")
buildMetadataCorpus(elRootCorpus)

#### IT ####
textIdIt = "it1"
elRootIt, elTextIt = _openTei(elRootCorpus, "it")

#### FR ####
textIdFr = "fr1"
elRootFr, elTextFr = _openTei(elRootCorpus, "fr")
358

    
359

    
360
##############################
361
# PARSING du fichier XML     #
362
# mapping entre les titres   #
363
# et ajout de l'id dans XML  #
364
##############################
365

    
366
#<row>
367
#    <cell>Ecloga II, vv. 1-12</cell>
368
#    <cell>
369
#        <p rend="justify">
370
#            <hi rend="italic">Itene</hi>
371
#            <hi rend="italic">
372
#                <note xml:id="ftn121" place="foot" n="121"> <hi rend="italic">itene</hi> : andatevene. Cfr. Conti, <hi rend="italic">Canzoniere</hi>, CXLIX, 7-8 : “Itene a casa, et noi lassate al bosco / pasciute pecorelle” (Erspamer).</note>
373
#            </hi>
374
#            <hi rend="italic">  all'ombra degli </hi>
375
#            <hi rend="color(#ff0000)italic">ameni faggi</hi>
376
#            <hi rend="italic">,</hi></p>
377
#        <p rend="justify italic">pasciute pecorelle, omai che 'l sole</p>
378
#        <p rend="justify italic">sul mezzo giorno indrizza i caldi raggi.</p>
379
#        <p rend="justify">
380
#            <hi rend="italic">Ivi udirete </hi>
381
#            <hi rend="bold italic">l'alte</hi>
382
#            <hi rend="italic">
383
#                <note xml:id="ftn122" place="foot" n="122"> <hi rend="italic">alte</hi>  : pronunciate a voce alta (piú che 'nobili' : lo stile pastorale è infatti umile per definizione ; cfr. P, 4  ; IIIe, 29 ; X, 19) (Erspamer).</note>
384
#            </hi>
385
#            <hi rend="bold italic">  mie parole</hi>
386
#        </p>
387

    
388

    
389
#### READING the XML TEI file #####
390
doc = minidom.parse(inputxmlfile)
root = doc.documentElement


def _isLayoutWhitespace(node):
    # True for a text node that holds nothing but whitespace, i.e. the
    # indentation / line breaks between elements of a pretty-printed file.
    return node.nodeType == node.TEXT_NODE and not node.data.strip()


# Every <table> of the input becomes one chapter <div> in both output texts.
# Inside a table every <row> is expected to hold three <cell> elements:
#   cell 0 -- title of the passage  -> <head> in both texts
#   cell 1 -- Italian text          -> <seg> in the Italian text
#   cell 2 -- French text           -> <seg> in the French text
# Whitespace-only text nodes are skipped before counting, so that an indented
# input file yields the same row and cell numbers as an unindented one.
numTable = 1
for table in root.getElementsByTagName('table'):
    divId = "Ch" + str(numTable)
    elDivIt = buildDivision(elTextIt, textIdIt, str(numTable))
    elDivFr = buildDivision(elTextFr, textIdFr, str(numTable))

    # numRow counts every remaining child of the table, not only <row>.
    numRow = 1
    for childRow in table.childNodes:
        if _isLayoutWhitespace(childRow):
            continue

        if childRow.nodeName == 'row':
            # numCell counts every remaining child of the row.
            numCell = 0
            for childCell in childRow.childNodes:
                if _isLayoutWhitespace(childCell):
                    continue

                if childCell.nodeName == 'cell':
                    if numCell == 0:
                        ### TITLE, e.g. "A la Sampogna, 17-18"
                        titleorigin = findText(childCell, u"")
                        titlecell = findTitleSeg(titleorigin)
                        print("TITRE ;  %s ; %s" % (titleorigin, titlecell))

                        if titlecell:
                            buildHead(elDivIt, textIdIt, divId, titlecell)
                            buildHead(elDivFr, textIdFr, divId, titlecell)

                    elif numCell == 1:
                        ### IT
                        textit = findText(childCell, u"")
                        buildSegment(elDivIt, textIdIt, divId, textit,
                                     str(numRow))

                    elif numCell == 2:
                        ### FR
                        textfr = findText(childCell, u"")
                        buildSegment(elDivFr, textIdFr, divId, textfr,
                                     str(numRow))

                    else:
                        ### unexpected additional cell: only reported
                        print("[what cell] ???????????")
                        cell = u""
                        # Only the value computed for the last child is kept.
                        for child in childCell.childNodes:
                            if child.nodeName == '#text':
                                cell = child.nodeValue
                            else:
                                cell = findTextChild(child, "")
                        print("OTHER ?  %s" % cell)

                numCell = numCell + 1

        numRow = numRow + 1

    print("Chapitre  %s" % numTable)
    numTable = numTable + 1
453

    
454
# Serialize the complete corpus tree to the result file. No encoding argument
# is passed, so ElementTree's default output encoding applies.
# (The former 'resFrfile.encode("utf-8")' statement discarded its result and
# therefore had no effect on the path that is used here.)
treeFr = ET.ElementTree(elRootCorpus)
treeFr.write(resFrfile)
458
#ET.tostring(treeFr, encoding='utf-8', method="xml")