# PITTORE SCULTORE ET ARCHITETTO
#
# PREFACE TO THE READER, Portuguese Edition, 1549
# El doctor Martin de Azpilcueta Nauarro al lector
# xml:id="i1_Ch10-Seg12"


def _addChild(parent, tag, text=None):
    """Append a `tag` child to *parent*, optionally set its text, return it."""
    child = ET.SubElement(parent, tag)
    if text is not None:
        child.text = text
    return child


def buildHead(elRoot, textId, divId, textHead):
    """Append a `head` element of type "Title" to *elRoot* and return it.

    The element's ``xml:id`` is ``<textId>_<divId>-Seg0``; its text is
    ``str(textHead)`` encoded as UTF-8.
    """
    headId = textId + "_" + divId + "-Seg0"
    headEl = ET.SubElement(elRoot, u"head".encode('utf-8'))
    # NOTE(review): under Python 2, str() on a unicode value holding
    # non-ASCII characters raises UnicodeEncodeError unless the default
    # encoding was changed elsewhere in the file -- confirm.
    headEl.text = str(textHead).encode('utf-8')
    headEl.set("type", u"Title".encode('utf-8'))
    headEl.set("xml:id", headId.encode('utf-8'))
    return headEl


def buildDivision(elRoot, textId, numId):
    """Append a `div` element of type "Chapitre" to *elRoot* and return it.

    *numId* is the chapter number as a string; it becomes the ``n``
    attribute and the suffix of the ``xml:id`` (``<textId>_Ch<numId>``).
    """
    divId = textId + "_" + "Ch" + numId
    divEl = ET.SubElement(elRoot, u"div".encode('utf-8'))
    divEl.set("n", numId.encode('utf-8'))
    divEl.set("type", u"Chapitre".encode('utf-8'))
    divEl.set("xml:id", divId.encode('utf-8'))
    return divEl


def buildSegment(elRoot, textId, divId, textSeg, numSeg):
    """Append a `seg` element of type "Segment" to *elRoot* and return it.

    *numSeg* is the segment number as a string; it becomes the ``n``
    attribute and the suffix of the ``xml:id``
    (``<textId>_<divId>-Seg<numSeg>``). The element text is
    ``str(textSeg)`` encoded as UTF-8.
    """
    segId = textId + "_" + divId + "-Seg" + numSeg
    segEl = ET.SubElement(elRoot, u"seg".encode('utf-8'))
    # NOTE(review): same str()/non-ASCII caveat as in buildHead -- confirm.
    segEl.text = str(textSeg).encode('utf-8')
    segEl.set("n", numSeg.encode('utf-8'))
    segEl.set("type", u"Segment".encode('utf-8'))
    segEl.set("xml:id", segId.encode('utf-8'))
    return segEl


def buildMetadataCorpus(elRoot):
    """Append the corpus-level `teiHeader` (placeholder metadata) to *elRoot*.

    Returns None; the header is built purely for its side effect on *elRoot*.
    """
    headerEl = _addChild(elRoot, 'teiHeader')
    fileDescEl = _addChild(headerEl, 'fileDesc')

    # editionStmt
    editionStmtEl = _addChild(fileDescEl, 'editionStmt')
    _addChild(editionStmtEl, 'p', 'à donner des détails')

    # extent
    extentEl = _addChild(fileDescEl, 'extent')
    _addChild(extentEl, 'p', '?? segments alignés, ?? mots')

    # publicationStmt
    publicationStmtEl = _addChild(fileDescEl, 'publicationStmt')
    _addChild(publicationStmtEl, 'date', '2017')
    _addChild(publicationStmtEl, 'pubPlace', 'Tours')
    _addChild(publicationStmtEl, 'publisher', 'Centre de Renaissance de Tours')

    # sourceDesc
    sourceDescEl = _addChild(fileDescEl, 'sourceDesc')
    # NOTE(review): the tag is emitted as 'biblStructEl'; the TEI element is
    # named biblStruct, so the "El" suffix looks accidental. Left unchanged
    # because it alters the output -- confirm before renaming.
    biblStructEl = _addChild(sourceDescEl, 'biblStructEl')
    monogrEl = _addChild(biblStructEl, 'monogr')
    _addChild(monogrEl, 'author', 'Toshinori Uetano')
    # NOTE(review): 'Arcardia' is probably a typo for 'Arcadia' -- confirm.
    _addChild(monogrEl, 'title',
              'Corpus parallèle de Arcardia, de Jacopo Sannazaro')
    respStmtEl = _addChild(monogrEl, 'respStmt')
    _addChild(respStmtEl, 'resp', '??')
    _addChild(respStmtEl, 'name', '??')
    imprintEl = _addChild(monogrEl, 'imprint')
    _addChild(imprintEl, 'publisher', "???")
    _addChild(imprintEl, 'date', "2017")

    # encodingDesc
    encodingDescEl = _addChild(headerEl, 'encodingDesc')
    projectDescEl = _addChild(encodingDescEl, 'projectDesc')
    _addChild(projectDescEl, 'p',
              'Le corpus est le résultat d\'une compilation de textes etc...')
    editorialDeclEl = _addChild(encodingDescEl, 'editorialDecl')
    correctionEl = _addChild(editorialDeclEl, 'correction')
    _addChild(correctionEl, 'p', 'Le travail éditorial consiste à ...')
    segmentationEl = _addChild(editorialDeclEl, 'segmentation')
    _addChild(
        segmentationEl, 'p',
        'La division du texte en segments philologiques correspond à ...')


# Per-language description of the two source editions, shared by
# buildMetadata (bibliographic fields) and buildText (the `name` attribute).
_SOURCE_EDITIONS = {
    'it': {
        'lang': "it",
        'xmlId': "it1",
        'title': 'Arcadia',
        'author': 'Francesco Erspamer',
        'editor': 'Alde ??',
        'publisher': "édition de Venise, Alde ?",
        'date': "1505",
        'textName': "Erspamer_1505",
    },
    'fr': {
        'lang': "fr",
        'xmlId': "fr1",
        'title': 'Arcadie',
        'author': 'Jehan Martin',
        'editor': 'De Vascosan et Corrozet ?',
        'publisher':
            "édition de Michel de Vascosan pour luy et Gilles Corrozet ?",
        'date': "1544",
        'textName': "Martin_1544",
    },
}


def buildMetadata(elRoot, type):
    """Append the per-text `teiHeader` for language *type* to *elRoot*.

    *type* is ``'it'`` or ``'fr'``; any other value prints a warning and
    leaves the `monogr` element empty. The value is also written verbatim to
    the ``ident`` attribute of the `language` element. Returns None.

    The parameter name shadows the ``type`` builtin; it is kept so existing
    callers are unaffected.
    """
    headerEl = _addChild(elRoot, 'teiHeader')
    fileDescEl = _addChild(headerEl, 'fileDesc')

    # titleStmt
    titleStmtEl = _addChild(fileDescEl, 'titleStmt')
    _addChild(titleStmtEl, 'title', '????')

    # extent
    _addChild(fileDescEl, 'extent', 'Taille ....')

    # publicationStmt: children are created empty (no text is assigned).
    publicationStmtEl = _addChild(fileDescEl, 'publicationStmt')
    for emptyTag in ('date', 'publisher', 'address'):
        _addChild(publicationStmtEl, emptyTag)

    # sourceDesc
    sourceDescEl = _addChild(fileDescEl, 'sourceDesc')
    # NOTE(review): 'biblStructEl' is probably meant to be the TEI element
    # biblStruct; left unchanged because it alters the output -- confirm.
    biblStructEl = _addChild(sourceDescEl, 'biblStructEl')
    monogrEl = _addChild(biblStructEl, 'monogr')

    edition = _SOURCE_EDITIONS.get(type)
    if edition is None:
        print("NO OTHER TEXT normally ?")
    else:
        monogrEl.set("ana", "16e")
        monogrEl.set("lang", edition['lang'])
        monogrEl.set("xml:lang", edition['lang'])
        monogrEl.set("xml:id", edition['xmlId'])
        _addChild(monogrEl, 'title', edition['title'])
        authorEl = _addChild(monogrEl, 'author', edition['author'])
        authorEl.set("xml:lang", edition['lang'])
        _addChild(monogrEl, 'editor', edition['editor'])
        imprintEl = _addChild(monogrEl, 'imprint')
        _addChild(imprintEl, 'publisher', edition['publisher'])
        _addChild(imprintEl, 'date', edition['date'])

    # encodingDesc
    encodingDescEl = _addChild(headerEl, 'encodingDesc')
    editorialDeclEl = _addChild(encodingDescEl, 'editorialDecl')
    _addChild(editorialDeclEl, 'normalization')

    # profileDesc
    profileDescEl = _addChild(headerEl, 'profileDesc')
    langUsageEl = _addChild(profileDescEl, 'langUsage')
    languageEl = _addChild(langUsageEl, 'language')
    languageEl.set("ident", type)


def buildText(elRoot, type):
    """Append `text`/`body` to *elRoot* and return the `body` element.

    For *type* ``'it'`` or ``'fr'`` the `text` element gets a ``name``
    attribute identifying the edition; any other value prints a warning and
    no attribute is set.
    """
    textEl = ET.SubElement(elRoot, 'text')
    edition = _SOURCE_EDITIONS.get(type)
    if edition is None:
        print("No attribute NAME here !!!!")
    else:
        textEl.set("name", edition['textName'])
    return ET.SubElement(textEl, 'body')


def _joinDigitRuns(text):
    """Reduce a 'from-to' style string to its digits.

    *text* is split on '-'; every piece is stripped of non-digit characters
    and the pieces are concatenated, with a '-' inserted only before the
    second piece (so "vv. 1-12" gives "1-12").
    """
    joined = ''
    for index, piece in enumerate(text.split('-')):
        digits = re.sub('[^0-9]*', '', piece)
        if index == 1:
            joined = joined + '-' + digits
        else:
            joined = joined + digits
    return joined


def findTitleSeg(title):
    """Normalise a title cell into an identifier-friendly string.

    * ``"A la Sampogna, 17-18"`` gives ``"A_la_Sampogna_17-18"``: the part
      before the first comma has its spaces replaced by ``_``; the part
      between the first and second comma is reduced to digits (see
      ``_joinDigitRuns``). Anything after a second comma is ignored.
    * Without a comma but with a ``-``, the whole title is reduced to digits.
    * Otherwise spaces are replaced by ``_``.

    Returns ``''`` for an empty or ``None`` title.

    The former fallback for "comma present but fewer than two parts" was
    removed: ``split(',')`` on a string containing a comma always yields at
    least two items, so it could never run, and it referenced an undefined
    name.
    """
    if not title:
        return ''
    if ',' in title:
        parts = title.split(',')
        left = parts[0].replace(' ', '_')
        return left + "_" + _joinDigitRuns(parts[1])
    if "-" in title:
        return _joinDigitRuns(title)
    return title.replace(' ', '_')


###### Preparing the bitext TEI output file #######
# NOTE(review): the TEI root element is spelled teiCorpus; "TeiCorpus" is
# kept because changing it alters the output -- confirm.
elRootCorpus = ET.Element("TeiCorpus")
buildMetadataCorpus(elRootCorpus)

#### IT ####
textIdIt = "it1"
elRootIt = ET.SubElement(elRootCorpus, "TEI")
buildMetadata(elRootIt, "it")
elTextIt = buildText(elRootIt, "it")

#### FR ####
textIdFr = "fr1"
elRootFr = ET.SubElement(elRootCorpus, "TEI")
buildMetadata(elRootFr, "fr")
elTextFr = buildText(elRootFr, "fr")

##############################
# PARSING the XML file       #
# mapping between the titles #
# and adding the id to XML   #
##############################
#
# Ecloga II, vv. 1-12
#

# Itene # # itene : andatevene. Cfr. Conti, Canzoniere, CXLIX, 7-8 : “Itene a casa, et noi lassate al bosco / pasciute pecorelle” (Erspamer). # # all'ombra degli # ameni faggi # ,

pasciute pecorelle, omai che 'l sole

sul mezzo giorno indrizza i caldi raggi.

# Ivi udirete # l'alte # # alte : pronunciate a voce alta (piú che 'nobili' : lo stile pastorale è infatti umile per definizione ; cfr. P, 4 ; IIIe, 29 ; X, 19) (Erspamer). # # mie parole #

#### READING the XML TEI file ##### doc = minidom.parse(inputxmlfile) root = doc.documentElement numTable = 1 for table in root.getElementsByTagName('table'): divId = "Ch" + str(numTable) elDivIt = buildDivision(elTextIt, textIdIt, str(numTable)) elDivFr = buildDivision(elTextFr, textIdFr, str(numTable)) if table.hasChildNodes(): #print "\tchild" numRow = 1 for childRow in table.childNodes: name = childRow.nodeName #print name if name == 'row': #print "in row real" if childRow.hasChildNodes(): numCell = 0 titlecell = u"" for childCell in childRow.childNodes: nameCell = childCell.nodeName if childCell.nodeName == 'cell': cell = u"" textfr = u"" textit = u"" #print "==========Cell==========" if numCell == 0 : titlecell = findText(childCell, titlecell) titleorigin = titlecell ## Exemple : A la Sampogna, 17-18íòú titlecell = findTitleSeg(titlecell) print "TITRE ; ",titleorigin, ";", titlecell if titlecell: buildHead(elDivIt, textIdIt, divId, titlecell) buildHead(elDivFr, textIdFr, divId, titlecell) elif numCell == 1 : ### FR textit = findText(childCell, textit) #print "[it] : ",textit buildSegment(elDivIt, textIdIt, divId, textit, str(numRow)) elif numCell == 2 : ### IT textfr = findText(childCell, textfr) #print "[fr] : ",textfr buildSegment(elDivFr, textIdFr, divId, textfr, str(numRow)) else : ### print "[what cell] ???????????" if childCell.hasChildNodes(): for child in childCell.childNodes: nameChildCell = child.nodeName #print nameChildCell if nameChildCell == '#text': cell = child.nodeValue else : cell = findTextChild(child, "") print "OTHER ? ", cell numCell = numCell + 1 #print "[N]", numCell numRow = numRow + 1 print "Chapitre ",numTable numTable = numTable + 1 treeFr = ET.ElementTree(elRootCorpus) #treeFr.write(sys.stdout) resFrfile.encode("utf-8") treeFr.write(resFrfile) #ET.tostring(treeFr, encoding='utf-8', method="xml")