#!/usr/bin/python
# -*- coding: utf-8 -*-

#import xlrd
#import pandas as pd  
import os
import re
import unicodedata
import sys
import csv
import xml.etree.ElementTree as ET
import prettify

from os import listdir, rename
from os.path import isfile, join

# XML DOM parser used to read the input TEI file
from xml.dom import minidom

## Command-line parameters:
myxmlfile = sys.argv[1] # relative path/name of the input XML file
mypath = sys.argv[2]	# system path to the script; both the input and the output file are joined onto it

# Diagnostic output: report the interpreter's default and filesystem encodings.
print "L'encodage du système python:", sys.getdefaultencoding() ## ASCII (presumably the value the author observed)
print "L'encodage des fichiers : ", sys.getfilesystemencoding() ## UTF-8 (presumably the value the author observed)

#### Apparently NOT RECOMMENDED, but done anyway ???
# Re-imports sys so that setdefaultencoding is reachable again, then switches
# the process-wide default encoding to UTF-8. The implicit str/unicode
# conversions further down (e.g. str(...).encode('utf-8')) depend on this.
reload(sys)  
sys.setdefaultencoding('utf8')

inputxmlfile = join(mypath, myxmlfile); # input XML file
print "InputFile XML : ",inputxmlfile
resFrfile = join(mypath, 'newFr.xml') # result file (fixed name 'newFr.xml')
print "OutputFile XML : ", resFrfile		# output XML file


def findTextChild(node, strText):
  """Recursively append the text found under *node* to *strText*.

  Text nodes contribute their value encoded as UTF-8; <note> elements are
  skipped together with everything they contain; any other node is
  traversed depth-first through its children.

  Parameters:
    node    -- a DOM node (as produced by xml.dom.minidom).
    strText -- the text accumulated so far, or None when nothing has been
               collected yet.

  Returns the accumulated text. A node that contributes nothing (a <note>,
  or a node without children such as an empty element or a comment) returns
  *strText* unchanged.
  """
  nameNode = node.nodeName

  # Footnotes are not part of the running text: ignore the whole subtree.
  if nameNode == 'note':
    return strText

  if nameNode == '#text':
    strNode = node.nodeValue.encode('utf-8')
    if strText is None:
      return strNode
    return str(strText) + strNode

  if node.hasChildNodes():
    for child in node.childNodes:
      strText = findTextChild(child, strText)

  # Returned even when the node has no children: the previous version fell
  # through here and returned None, which discarded the text collected before
  # an empty element.
  return strText


def findText(childCell, titlecell):
  """Collect the text of a table cell, appended to *titlecell*.

  Direct text children are encoded as UTF-8 and concatenated; every other
  child is handed to findTextChild. A cell without children returns
  *titlecell* untouched.
  """
  if not childCell.hasChildNodes():
    return titlecell

  for node in childCell.childNodes:
    if node.nodeName != '#text':
      # Element (or other non-text) child: let the recursive helper decide.
      titlecell = findTextChild(node, titlecell)
      continue
    encoded = node.nodeValue.encode('utf-8')
    prefix = "" if titlecell is None else str(titlecell)
    titlecell = prefix + encoded
  return titlecell

#<text name="1550_Torrentiniana">
#    <body>
#      <div xml:id="t1_Bo" type="Book" rend="Oeuvre">
#          <seg xml:id="t1_Seg1" type="Segment" n="1" rend="Segment 1">PITTORE SCULTORE ET ARCHITETTO</seg>

#<text name="Azpilcueta_pt_1549">
#  <front type="Prologue" xml:id="p1_Pr">
#      <head type="Title" xml:id="p1_Pr-Seg0">PREFACE TO THE READER, Portuguese Edition, 1549</head>
#          <seg n="1" type="Segment" xml:id="p1_Pr-Seg1">El doctor Martin de Azpilcueta Nauarro al lector</seg>
      
#          xml:id="i1_Ch10-Seg12"

def buildHead(elRoot, textId, divId, textHead):
  """Append a <head type="Title"> child to *elRoot* and return it.

  The element's text is *textHead* and its xml:id is
  "<textId>_<divId>-Seg0". Tag, text and attribute values are all stored
  UTF-8 encoded.
  """
  identifier = textId + "_" + divId + "-Seg0"

  # The <head> element itself, carrying the title text.
  head = ET.SubElement(elRoot, u"head".encode('utf-8'))
  head.text = str(textHead).encode('utf-8')

  # Attributes: @type="Title", then @xml:id.
  head.set("type", u"Title".encode('utf-8'))
  head.set("xml:id", identifier.encode('utf-8'))
  return head

def buildDivision(elRoot, textId, numId):
  """Append a chapter <div> child to *elRoot* and return it.

  *numId* is the chapter number as a string; it becomes @n and is used in
  the xml:id "<textId>_Ch<numId>". @type is always "Chapitre". Tag and
  attribute values are stored UTF-8 encoded.
  """
  identifier = textId + "_" + "Ch" + numId

  division = ET.SubElement(elRoot, u"div".encode('utf-8'))

  # Attributes in the order @n, @type, @xml:id.
  division.set("n", numId.encode('utf-8'))
  division.set("type", u"Chapitre".encode('utf-8'))
  division.set("xml:id", identifier.encode('utf-8'))
  return division

def buildSegment(elRoot, textId, divId, textSeg, numSeg):
  """Append a <seg type="Segment"> child to *elRoot* and return it.

  *textSeg* becomes the element text, *numSeg* (a string) becomes @n, and
  the xml:id is "<textId>_<divId>-Seg<numSeg>". Tag, text and attribute
  values are stored UTF-8 encoded.
  """
  identifier = textId + "_" + divId + "-Seg" + numSeg

  # The <seg> element with its text content.
  segment = ET.SubElement(elRoot, u"seg".encode('utf-8'))
  segment.text = str(textSeg).encode('utf-8')

  # Attributes in the order @n, @type, @xml:id.
  segment.set("n", numSeg.encode('utf-8'))
  segment.set("type", u"Segment".encode('utf-8'))
  segment.set("xml:id", identifier.encode('utf-8'))
  return segment
  

def buildMetadataCorpus(elRoot):
  """Append the corpus-level <teiHeader> to *elRoot*.

  The header holds a <fileDesc> (editionStmt, extent, publicationStmt,
  sourceDesc) followed by an <encodingDesc> (projectDesc, editorialDecl).
  Most text values are placeholders. Nothing is returned; *elRoot* is
  modified in place.
  """
  def addText(parent, tag, text):
    # Create <tag> under parent, give it the text, and return the new element.
    child = ET.SubElement(parent, tag)
    child.text = text
    return child

  headerEl = ET.SubElement(elRoot, 'teiHeader')
  fileDescEl = ET.SubElement(headerEl, 'fileDesc')

  #### <editionStmt>
  editionStmtEl = ET.SubElement(fileDescEl, 'editionStmt')
  addText(editionStmtEl, 'p', 'à donner des détails')

  #### <extent>
  extentEl = ET.SubElement(fileDescEl, 'extent')
  addText(extentEl, 'p', '?? segments alignés, ?? mots')

  #### <publicationStmt>
  publicationStmtEl = ET.SubElement(fileDescEl, 'publicationStmt')
  addText(publicationStmtEl, 'date', '2017')
  addText(publicationStmtEl, 'pubPlace', 'Tours')
  addText(publicationStmtEl, 'publisher', 'Centre de Renaissance de Tours')

  #### <sourceDesc>
  sourceDescEl = ET.SubElement(fileDescEl, 'sourceDesc')
  # The TEI element is <biblStruct>; the former tag 'biblStructEl' was the
  # Python variable name leaking into the output.
  biblStructEl = ET.SubElement(sourceDescEl, 'biblStruct')
  monogrEl = ET.SubElement(biblStructEl, 'monogr')

  addText(monogrEl, 'author', 'Toshinori Uetano')
  addText(monogrEl, 'title', 'Corpus parallèle de Arcardia, de Jacopo Sannazaro')
  respStmtEl = ET.SubElement(monogrEl, 'respStmt')
  addText(respStmtEl, 'resp', '??')
  addText(respStmtEl, 'name', '??')
  imprintEl = ET.SubElement(monogrEl, 'imprint')
  addText(imprintEl, 'publisher', "???")
  addText(imprintEl, 'date', "2017")

  #### <encodingDesc>
  encodingDescEl = ET.SubElement(headerEl, 'encodingDesc')
  projectDescEl = ET.SubElement(encodingDescEl, 'projectDesc')
  addText(projectDescEl, 'p', 'Le corpus est le résultat d\'une compilation de textes etc...')

  editorialDeclEl = ET.SubElement(encodingDescEl, 'editorialDecl')
  correctionEl = ET.SubElement(editorialDeclEl, 'correction')
  addText(correctionEl, 'p', 'Le travail éditorial consiste à ...')

  segmentationEl = ET.SubElement(editorialDeclEl, 'segmentation')
  addText(segmentationEl, 'p', 'La division du texte en segments philologiques correspond à ...')


def buildMetadata(elRoot, type):
  """Append the <teiHeader> of one language edition to *elRoot*.

  Parameters:
    elRoot -- the <TEI> element receiving the header.
    type   -- language code of the edition: 'it' or 'fr'. Any other value
              prints a warning and leaves <monogr> empty; the value is still
              written to <language ident="...">.

  Nothing is returned; *elRoot* is modified in place.
  """
  # Known editions: (lang, xml:id, title, author, editor, publisher, date).
  editions = (
    ("it", "it1", 'Arcadia', 'Francesco Erspamer', 'Alde ??',
     "édition de Venise, Alde ?", "1505"),
    ("fr", "fr1", 'Arcadie', 'Jehan Martin', 'De Vascosan et Corrozet ?',
     "édition de Michel de Vascosan pour luy et Gilles Corrozet ?", "1544"),
  )

  headerEl = ET.SubElement(elRoot, 'teiHeader')
  fileDescEl = ET.SubElement(headerEl, 'fileDesc')

  #### <titleStmt>
  titleStmtEl = ET.SubElement(fileDescEl, 'titleStmt')
  ET.SubElement(titleStmtEl, 'title').text = '????'

  #### <extent>
  ET.SubElement(fileDescEl, 'extent').text = 'Taille ....'

  #### <publicationStmt> -- children are created empty (placeholders)
  publicationStmtEl = ET.SubElement(fileDescEl, 'publicationStmt')
  for placeholder in ('date', 'publisher', 'address'):
    ET.SubElement(publicationStmtEl, placeholder)

  #### <sourceDesc>
  sourceDescEl = ET.SubElement(fileDescEl, 'sourceDesc')
  # The TEI element is <biblStruct>; the former tag 'biblStructEl' was the
  # Python variable name leaking into the output.
  biblStructEl = ET.SubElement(sourceDescEl, 'biblStruct')
  monogrEl = ET.SubElement(biblStructEl, 'monogr')

  edition = next((entry for entry in editions if type == entry[0]), None)
  if edition is None:
    # Single-argument parenthesised form prints identically on Python 2.
    print("NO OTHER TEXT normally ?")
  else:
    lang, xmlId, title, author, editor, publisher, date = edition
    monogrEl.set("ana", "16e")
    monogrEl.set("lang", lang)
    monogrEl.set("xml:lang", lang)
    monogrEl.set("xml:id", xmlId)
    ET.SubElement(monogrEl, 'title').text = title
    authorEl = ET.SubElement(monogrEl, 'author')
    authorEl.text = author
    authorEl.set("xml:lang", lang)
    ET.SubElement(monogrEl, 'editor').text = editor
    imprintEl = ET.SubElement(monogrEl, 'imprint')
    ET.SubElement(imprintEl, 'publisher').text = publisher
    ET.SubElement(imprintEl, 'date').text = date

  #### <encodingDesc>
  encodingDescEl = ET.SubElement(headerEl, 'encodingDesc')
  editorialDeclEl = ET.SubElement(encodingDescEl, 'editorialDecl')
  ET.SubElement(editorialDeclEl, 'normalization')

  #### <profileDesc>
  profileDescEl = ET.SubElement(headerEl, 'profileDesc')
  langUsageEl = ET.SubElement(profileDescEl, 'langUsage')
  languageEl = ET.SubElement(langUsageEl, 'language')
  languageEl.set("ident", type)



def buildText(elRoot, type):
  """Append <text><body/></text> to *elRoot* and return the <body> element.

  The <text> element gets @name "Erspamer_1505" for type 'it' and
  "Martin_1544" for type 'fr'. For any other type a warning is printed and
  no @name attribute is set.
  """
  textEl = ET.SubElement(elRoot, 'text')
  if type == 'it':
    textEl.set("name", "Erspamer_1505")
  elif type == 'fr':
    textEl.set("name", "Martin_1544")
  else:
    # Single-argument parenthesised form prints identically on Python 2.
    print("No attribute NAME here !!!!")
  return ET.SubElement(textEl, 'body')


# Matches every run of characters that are not ASCII digits.
_NON_DIGITS = re.compile(r'[^0-9]+')


def _numericRange(text):
  """Reduce *text* to its digits, keeping only the first '-' as a separator.

  "vv. 1-12" gives "1-12"; " 5" gives "5"; "1-2-3" gives "1-23".
  """
  pieces = [_NON_DIGITS.sub('', piece) for piece in text.split('-')]
  if len(pieces) == 1:
    return pieces[0]
  # Only the boundary between the first and second piece gets its dash back.
  return pieces[0] + '-' + ''.join(pieces[1:])


def findTitleSeg(title):
  """Normalise a raw title cell into a compact segment label.

  Rules, in order:
    * empty or None title -> '';
    * title containing ',' -> the part before the first comma with spaces
      replaced by '_', then '_', then the digits (and first dash) of the part
      between the first and second comma; anything after a second comma is
      ignored;
    * title containing '-' but no ',' -> only its digits and first dash;
    * otherwise -> the title with spaces replaced by '_'.

  Examples: "Ecloga II, vv. 1-12" -> "Ecloga_II_1-12"; "17-18" -> "17-18".
  """
  if not title:
    return ''

  if ',' in title:
    # split(',') always yields at least two parts here, so the former
    # "fewer than two parts" branch (which referenced an undefined name)
    # could never run and has been removed.
    titleTable = title.split(',')
    titlecellLeft = titleTable[0].replace(' ', '_')
    return titlecellLeft + "_" + _numericRange(titleTable[1])

  if '-' in title:
    return _numericRange(title)

  return title.replace(' ', '_')

###### Preparing the bitext TEI output file #######
# Root of the output document, with the corpus-level header as first child.
# NOTE(review): the TEI Guidelines spell this element "teiCorpus" (lower-case
# initial) — confirm whether the capitalised tag is intentional.
elRootCorpus = ET.Element("TeiCorpus")
buildMetadataCorpus(elRootCorpus)

#### IT ####
# First <TEI> child: Italian text. textIdIt prefixes every xml:id generated
# for it; elTextIt is the <body> that will receive the chapter <div>s.
textIdIt = "it1"
elRootIt = ET.SubElement(elRootCorpus, "TEI")
buildMetadata(elRootIt, "it")
elTextIt = buildText(elRootIt, "it")

#### FR ####
# Second <TEI> child: French text, built the same way.
textIdFr = "fr1"
elRootFr = ET.SubElement(elRootCorpus, "TEI")
buildMetadata(elRootFr, "fr")
elTextFr = buildText(elRootFr, "fr")


##############################
# PARSING the XML file       #
# mapping between the titles #
# and adding the id to XML   #
##############################

#<row>
#    <cell>Ecloga II, vv. 1-12</cell>
#    <cell>
#        <p rend="justify">
#            <hi rend="italic">Itene</hi>
#            <hi rend="italic">
#                <note xml:id="ftn121" place="foot" n="121"> <hi rend="italic">itene</hi> : andatevene. Cfr. Conti, <hi rend="italic">Canzoniere</hi>, CXLIX, 7-8 : “Itene a casa, et noi lassate al bosco / pasciute pecorelle” (Erspamer).</note>
#            </hi>
#            <hi rend="italic">  all'ombra degli </hi>
#            <hi rend="color(#ff0000)italic">ameni faggi</hi>
#            <hi rend="italic">,</hi></p>
#        <p rend="justify italic">pasciute pecorelle, omai che 'l sole</p>
#        <p rend="justify italic">sul mezzo giorno indrizza i caldi raggi.</p>
#        <p rend="justify">
#            <hi rend="italic">Ivi udirete </hi>
#            <hi rend="bold italic">l'alte</hi>
#            <hi rend="italic">
#                <note xml:id="ftn122" place="foot" n="122"> <hi rend="italic">alte</hi>  : pronunciate a voce alta (piú che 'nobili' : lo stile pastorale è infatti umile per definizione ; cfr. P, 4  ; IIIe, 29 ; X, 19) (Erspamer).</note>
#            </hi>
#            <hi rend="bold italic">  mie parole</hi>
#        </p>


#### READING the XML TEI file #####
doc = minidom.parse(inputxmlfile)
root = doc.documentElement

# Every <table> in the input becomes one chapter <div> in both the Italian and
# the French output bodies; numTable is the 1-based chapter number.
numTable = 1
for table in root.getElementsByTagName('table'):
  divId = "Ch" + str(numTable)
  elDivIt = buildDivision(elTextIt, textIdIt, str(numTable))
  elDivFr = buildDivision(elTextFr, textIdFr, str(numTable))
  if table.hasChildNodes():
    #print "\tchild"
    # NOTE(review): numRow is incremented once per child node of the table
    # (see the end of this loop), not once per <row>, so any non-row child
    # also consumes a number — confirm this is intended.
    numRow = 1
    for childRow in table.childNodes:
      name = childRow.nodeName
      #print name
      if name == 'row':
        #print "in row real"
        if childRow.hasChildNodes():
          # NOTE(review): numCell is likewise incremented for every child node
          # of the row, whether or not it is a <cell>; the 0/1/2 positions
          # below therefore assume the row contains only <cell> children —
          # confirm against the input files.
          numCell = 0
          titlecell = u""
          for childCell in childRow.childNodes:
            nameCell = childCell.nodeName  # not used afterwards
            if childCell.nodeName == 'cell':
              cell = u""
              textfr = u""
              textit = u""
              #print "==========Cell=========="
              if numCell == 0 :
                # Position 0: the title cell. The raw text is kept for the
                # log line, the normalised form is used for the <head>.
                titlecell = findText(childCell, titlecell)
                titleorigin = titlecell
                ## Example: A la Sampogna, 17-18íòú
                titlecell = findTitleSeg(titlecell)
                print "TITRE ; ",titleorigin, ";", titlecell
                
                # NOTE(review): buildHead always generates an xml:id ending in
                # "-Seg0", so several titled rows in one table produce the
                # same id — confirm whether that matters downstream.
                if titlecell:
                  buildHead(elDivIt, textIdIt, divId, titlecell)
                  buildHead(elDivFr, textIdFr, divId, titlecell)
              elif numCell == 1 : ### IT (position 1 feeds the Italian division)
                textit = findText(childCell, textit)
                #print "[it] : ",textit
                buildSegment(elDivIt, textIdIt, divId, textit, str(numRow))
                
              elif numCell == 2 : ### FR (position 2 feeds the French division)
                textfr = findText(childCell, textfr)
                #print "[fr] : ",textfr
                buildSegment(elDivFr, textIdFr, divId, textfr, str(numRow))
                
              else : ### any further position: only reported, never written to the output
                print "[what cell] ???????????"
                if childCell.hasChildNodes():
                  
                  # 'cell' is overwritten on each iteration, so only the value
                  # derived from the last child is printed below.
                  for child in childCell.childNodes:
                    nameChildCell = child.nodeName
                    #print nameChildCell
                    if nameChildCell == '#text':
                      cell = child.nodeValue
                    else : 
                      cell = findTextChild(child, "")
                print "OTHER ? ", cell
            numCell = numCell + 1
            #print "[N]", numCell
      numRow = numRow + 1  
  print "Chapitre ",numTable
  numTable = numTable + 1

# Serialise the whole corpus (header plus both <TEI> texts) to the result
# file. No encoding argument is passed, so ElementTree's default applies.
# The former statement `resFrfile.encode("utf-8")` was dropped: its result was
# discarded, so it had no effect on the path used below.
treeFr = ET.ElementTree(elRootCorpus)
#treeFr.write(sys.stdout)
treeFr.write(resFrfile)
#ET.tostring(treeFr, encoding='utf-8', method="xml")
