doc2base.py

Severine Gedzelman, 08/08/2018 11:16

       #!/usr/bin/python
       # -*- coding: utf-8 -*-
       import os
       import re
       import unicodedata
       import sys
       import csv
       import string
       from os import listdir, rename
       from os.path import isfile, join
       ## répertoire depuis lequel on lance le script
       mypath = os.getcwd()
       mypath_results = sys.argv[1] ##''.join([mypath,'/results/'])
       mypath_input = sys.argv[2] ##''.join([mypath,'/results/'])
       mypath_input_papers = sys.argv[3]
       #mypath_files = ''.join([mypath,'/txt/'])
       respubfile = join(mypath_results, "table_publications_2.csv")
       file_publication = open(respubfile, 'w')
       strLinePub = 'id_publication ; name ; year ; comment ; id_author ; id_paper \n'
       file_publication.write(strLinePub)
       respaperfile = join(mypath_results, "table_papers_2.csv")
       file_paper = open(respaperfile, 'w')
       strLinePaper = 'id_paper ; name ; comment ; first_date ; last_date ; authors \n'
       file_paper.write(strLinePaper)
       onlyfiles = [f for f in listdir(mypath_input) if isfile(join(mypath_input, f))]
       #A partir de fichiers "acteur" (ex: "Act3_Capuana.txt"), préparé par Marco Borelli
       #Le script va compléter la table des journaux : Papers ######
       #Le script va remplir la table des re-publications : Publications ######
       def prepareListPapers():
         papers_map = {}
         mypath_input_papers
         ##############################
         # PARSING du fichier CSV     #
         ##############################
         with open(mypath_input_papers) as csvfile:
           filereader = csv.reader(csvfile, delimiter='#')
           i = 0
           for row in filereader:
             paper_line = {}
             if i == 0:
               print "First line (and first column) : ",row[0]
             else :
               name_paper = row[1]
               name_paper = name_paper.strip(" ")
               paper_line = {"id": row[0], "name": name_paper,"comment": row[2],"first_date": row[3],"last_date": row[4],"authors": row[5]}
               papers_map[name_paper] = paper_line
             i = i + 1
         return papers_map
       def prepareListDates(ldates, id_author, id_paper, nbPaper, nbPub):
         print "list of dates for [",id_paper,"]"
         #1888 (3) uno in due puntate;
         #1894
         PY = {}
         dates = ldates.split(";")
         for dateStr in dates :
           date = dateStr.strip(" ")
           date = date.strip(" ")
           ### SHOULD CHECK if multiple dates, ce serait une erreur d'écriture du fichier
           ### la liste des dates séparées par des "," au lieu de ";"
           realdate = date[0:5] #PREVIOUSLY date = dateStr[0:5]
           realdate = realdate.strip("\n")
           realdate = realdate.strip(" ")
           realdate = realdate.translate(None, string.whitespace)
           print "\tDate [", realdate,"]"
           nbpubstr = date[5:len(date)]
           num = ''
           comment = ''
           if nbpubstr :
             print "\tNb Pub =", nbpubstr
             pos_paradeb = nbpubstr.find('(')
             pos_paraend = nbpubstr.find(')')
             if (pos_paradeb != -1 and pos_paraend != -1) :
               num = nbpubstr[pos_paradeb+1:pos_paraend]
               #print "\tNumber of publications = ", num
               if num.isdigit() :
                 PY[realdate] = num
               else :
                 #print "\tNot a number !!! alors c'est 1 et ajout de commentaire"
                 comment = num
                 print "\tNb Pub = 1 and Comment : ",comment
                 PY[realdate] = 1
                 id_pub = ''.join(['Publi', str(nbPaper), '-', str(nbPub)])
                 nbPub = nbPub+1
             else:
               #print "\t!!!!!!!!! RIEN TROUVE COMME NUM !!!!!!!! alors c'est 1"
               print "\tNb Pub = 1 (a) !!!"
               PY[realdate] = 1
               id_pub = ''.join(['Publi', str(nbPaper), '-', str(nbPub)])
               nbPub = nbPub+1
             pos_dashdeb = nbpubstr.find('#')
             pos_dashend = nbpubstr.rfind('#')
             if (pos_dashdeb != -1 and pos_dashend != -1) :
               comment = nbpubstr[pos_dashdeb+1:pos_dashend]
               print "\tCommentaire = ", comment
             if num.isdigit() :
                 PY[realdate] = num
                 for n in range(0,int(num)):
                   id_pub = ''.join(['Publi', str(nbPaper), '-', str(nbPub)])
                   nbPub = nbPub+1
                   strLinePub = ''.join([id_pub, ';UNKNOWN;',str(realdate),';', comment ,';',id_author,';', id_paper,'\n'])
                   file_publication.write(strLinePub)
             else :
               if comment :
                 strLinePub = ''.join([id_pub, ';UNKNOWN;',str(realdate),';', comment ,';',id_author,';', id_paper,'\n'])
               else :
                 strLinePub = ''.join([id_pub, ';UNKNOWN;',str(realdate),';;',id_author,';', id_paper,'\n'])
               file_publication.write(strLinePub)
           else:
             #print "\t!!!!!!!!! RIEN TROUVE COMME NUM !!!!!!!! alors c'est 1"
             print "\tNb Pub = 1 (b) !!!"
             PY[realdate] = 1
             id_pub = ''.join(['Publi', str(nbPaper), '-', str(nbPub)])
             strLinePub = ''.join([id_pub, ';UNKNOWN;',realdate,';;',id_author,';', id_paper,'\n'])
             file_publication.write(strLinePub)
             nbPub = nbPub+1
         return PY
       def findAuthor(path_file) :
         #/Users/gazelledess/ownCloud/Sev/Projets/Triangle/Doctorants/ProjetMarcoBorelli/Transformation/txt//Act9_DiGiacomo.txt
         pos_backslash = path_file.rfind('/')
         if pos_backslash != -1:
           name_file = path_file[pos_backslash+1:len(path_file)]
           pos_underscore = name_file.find('_')
           prefix_id = name_file[0:pos_underscore]
           return prefix_id
         else :
           print "NO BACKSLASH ??? isn't it strange ?"
           return "UNKNOWN_ID"
       def addAuthorToPaper(id_author, paper):
         print "addAuthorToPaper === > ",id_author, " in ",paper["id"]
         authors_list = paper["authors"]
         if authors_list is list :
           print "|||[authors_list - in file : ",authors_list,"]|||"
           authors_list_str = str(authors_list)
           authors_list_str = authors_list_str.strip(",")
           authors_list_2 = authors_list_str.split(",")
           #authors_list = authors_list.split(",")
           if id_author in authors_list :
             print "ALREADY (a) in the list of authors !!!!!!"
           else :
             authors_list.append(''.join([id_author, ', ']))
             print "ADDING ",id_author," to the list of authors"
             #paper["authors"] = authors_list
         else :
           if authors_list :
             if id_author in authors_list :
               print "ALREADY (b) in the list of authors !!!!!!"
             else :
               authors_list_str = ''.join(authors_list)
               authors_list_str = authors_list_str.strip(",")
               print "NEW LIST : ",authors_list_str
               #authors_list_2 = authors_list_str.split(",")
               #authors_list_2.append(''.join([', ',id_author, ', ']))
               authors_list_2 = ''.join([authors_list_str,id_author, ', '])
               print "NEW LIST 2 : ",authors_list_2
               authors_list = authors_list_2
               paper["authors"] = authors_list
               #author_list.append(''.join([id_author, ', ']))
               #paper["authors"] = authors_list_2
           else :
             authors_list = []
             authors_list.append(''.join([id_author, ', ']))
             paper["authors"] = authors_list
             print "NO AUTHOR FOR THIS PAPER YET - ADDING ++++"
       """"""""""""""""""""""""""""""
       """ Prepare the name for the paper """
       """"""""""""""""""""""""""""""
       def preparePaperName(name, papers):
         pos_para = name.find('(')
         if pos_para != -1:
           namesubstring = name[0:pos_para]
           cleanname = namesubstring.strip(" ")
           name = cleanname.strip(" ")
           name = name.upper()
           #print "NAME : ", name
           #return cleanname
         else:
           #print "NAME STAYS LIKE THIS => ", name
           name = name.strip(" ")
           name = name.upper()
         #id_paper = -1
         name = ''.join(['[',name,"]"])
         paper_name = name
         return paper_name
       """"""""""""""""""""""""""""""
       """ Initialize a paper with this
           name """
       """"""""""""""""""""""""""""""
       def setPaper(paper_name, nbpaper, papers):
         id_paper = ''.join(['Paper',str(nbpaper)])
         print "----->> NEW PAPER ", paper_name ,"with id : ",id_paper
         papers[paper_name] = {"id": id_paper, "name": paper_name,"comment": "","first_date": "","last_date": "","authors": ""}
         paper = papers[paper_name]
         return paper
       """Fonction inutile en réalité !!!"""
       def getPaper(id_paper, papers):
         for paper_key in papers.keys() :
           paper = papers[paper_key]
           paper_id = paper["id"]
           if paper_id == id_paper:
             return paper
         return -1
       """"""""""""""""""""""""""""""
       """ Find which paper corresponds
           to that name """
       """"""""""""""""""""""""""""""
       def getPaperByName(paper_name, papers):
         for paper_key in papers.keys() :
           paper = papers[paper_key]
           name_of_paper = paper["name"]
           if paper_name == name_of_paper:
             return paper
         return -1
       """"""""""""""""""""""""""""""
       """ Set the min and max among all
           the publication dates for
           this paper """
       """"""""""""""""""""""""""""""
       def setDatesForPaper(dates, paper):
         #### min date ####
         mindate = 2000
         date_min_from_file = paper["first_date"]
         if date_min_from_file :
             mindate = int(date_min_from_file)
             print "\t\t HAS mindate => ", mindate
         #### max date ####
         maxdate = 1000
         date_max_from_file = paper["last_date"]
         if date_max_from_file :
             maxdate = int(date_max_from_file)
             print "\t\t HAS maxdate => ", maxdate
         ## Find if there are other min or max among the dates
         for date in dates.keys():
           #print "\t D = ", date
           if mindate > int(date) :
             mindate = int(date)
             print "\t\t CHANGING mindate => ", mindate
           if maxdate < int(date) :
             maxdate = int(date)
             print "\t\t CHANGING maxdate => ", maxdate
         paper["first_date"] = mindate
         paper["last_date"] = maxdate
       """"""""""""""""""""""""""""""
       """ Output the resulted table
           about Papers """
       """"""""""""""""""""""""""""""
       def writePapersTable(papers):
         #papers[paper_name]= {"id": id_paper, "name": paper_name,"comment": "","first_date": "","last_date": "","authors": ""}
         for paper_key in papers.keys():
           paper = papers[paper_key]
           name_paper = paper["name"]
           id_paper = paper["id"]
           first_date = paper["first_date"]
           last_date = paper["last_date"]
           authors = paper["authors"]
           authors = ''.join(authors)
           strLinePaper = ''.join([id_paper,' ; ', name_paper,' ; ; ',str(first_date),' ; ',str(last_date),' ; ',authors,'\n'])
           file_paper.write(strLinePaper)
       def getMaxId(papers):
         max_id = 0
         for paper_key in papers.keys():
           paper = papers[paper_key]
           paper_id = paper["id"]
           id_str = paper_id[5:len(paper_id)]
           print "id = ",id_str
           if int(id_str)> max_id:
             max_id = int(id_str)
         return max_id
       def treatFiles():
         #papers = {}
         papers = prepareListPapers()
         authors_id = []
         authorsByPaper = {}
         nbPub = 1
         nbPaper = getMaxId(papers)
         for fi in onlyfiles:
           if(fi =='.DS_Store'):
             print 'FILE Macintosh is : ', fi
           else :
               mypath_file = ''.join([mypath_input, '/', fi])
               print "############### READING FILE : ", mypath_file
               ######### Create Author ID #########
               id_author = findAuthor(mypath_file)
               authors_id.append(id_author)
               ######### Analyse each line (left str and right str of :) #########
               #exemple :
               #Rivista Minima (5 novelle): 1872; 1873; 1877 (dal ciclo Mio figlio! ma non so quante); 1879; 1883
               text_file = open(mypath_file, "r")
               lines = text_file.readlines()
               for l in lines:
                 pos_dot = l.find(':')
                 left_line = l[0:pos_dot]
                 right_line = l[pos_dot+1:len(l)]
                 id_paper = -1
                 ### Prepare name of the journal with the left string ###
                 paper_name = preparePaperName(left_line, papers)
                 ### Create the paper and add its info in the table ###
                 paper = getPaperByName(paper_name, papers)
                 if paper == -1:
                   nbPaper = nbPaper+1
                   paper = setPaper(paper_name, nbPaper, papers)
                 addAuthorToPaper(id_author, paper)
                 dates = prepareListDates(right_line, id_author, paper["id"], nbPaper, nbPub)
                 setDatesForPaper(dates, paper)
                 print "------",paper_name,"--------"
               text_file.close()
               print "##############################"
         writePapersTable(papers)
         print "Nb papers = ",len(papers)
       treatFiles()

Chantier HN Triangle » NouvellesItaliennes19eme

doc2base.py