doc2base.py

Severine Gedzelman, 08/08/2018 11:16

Télécharger (11,64 ko)

 
1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
3

    
4
import os
5
import re
6
import unicodedata
7
import sys
8
import csv
9
import string
10

    
11
from os import listdir, rename
12
from os.path import isfile, join
13

    
14
## Directory the script is launched from.
## NOTE(review): only referenced by commented-out code in the original; kept
## because it is a module-level name.
mypath = os.getcwd()

# Command line: RESULTS_DIR INPUT_DIR PAPERS_CSV
# Fail early with a usage message instead of a bare IndexError on sys.argv.
if len(sys.argv) < 4:
    sys.exit("usage: %s RESULTS_DIR INPUT_DIR PAPERS_CSV" % (sys.argv[0],))

mypath_results = sys.argv[1]       # directory receiving the two result tables
mypath_input = sys.argv[2]         # directory scanned for input files
mypath_input_papers = sys.argv[3]  # CSV file opened by prepareListPapers()

# Publications table: opened once here, rows are appended by prepareListDates().
respubfile = join(mypath_results, "table_publications_2.csv")
file_publication = open(respubfile, 'w')
strLinePub = 'id_publication ; name ; year ; comment ; id_author ; id_paper \n'
file_publication.write(strLinePub)

# Papers table: opened once here, rows are written by writePapersTable().
respaperfile = join(mypath_results, "table_papers_2.csv")
file_paper = open(respaperfile, 'w')
strLinePaper = 'id_paper ; name ; comment ; first_date ; last_date ; authors \n'
file_paper.write(strLinePaper)

# Regular files (no sub-directories) found directly in the input directory.
onlyfiles = [f for f in listdir(mypath_input) if isfile(join(mypath_input, f))]
32

    
33

    
34
# Input: "actor" files (e.g. "Act3_Capuana.txt"), prepared by Marco Borelli.
# The script completes the table of journals: Papers.
# The script fills the table of re-publications: Publications.
37

    
38
def prepareListPapers(path=None):
    """Load the already-known papers from a '#'-delimited CSV file.

    The first row is treated as a header and only echoed. Every following
    row must provide at least six columns:
    id, name, comment, first_date, last_date, authors.

    Args:
        path: CSV file to read. Defaults to the module-level
            ``mypath_input_papers`` (third command-line argument).

    Returns:
        dict mapping the paper name (surrounding spaces removed) to a dict
        with the keys "id", "name", "comment", "first_date", "last_date"
        and "authors".
    """
    if path is None:
        path = mypath_input_papers

    papers_map = {}
    with open(path) as csvfile:
        filereader = csv.reader(csvfile, delimiter='#')
        for row_number, row in enumerate(filereader):
            if row_number == 0:
                first_cell = row[0] if row else ''
                print("First line (and first column) :  %s" % (first_cell,))
                continue
            if len(row) < 6:
                # Blank or truncated rows used to raise IndexError.
                if row:
                    print("Skipping incomplete row %d : %s" % (row_number, row))
                continue
            name_paper = row[1].strip(" ")
            papers_map[name_paper] = {
                "id": row[0],
                "name": name_paper,
                "comment": row[2],
                "first_date": row[3],
                "last_date": row[4],
                "authors": row[5],
            }
    return papers_map
59

    
60
def prepareListDates(ldates, id_author, id_paper, nbPaper, nbPub, out=None):
    """Parse a ';'-separated list of dates and write one row per publication.

    Each entry looks like ``1888 (3) uno in due puntate`` or ``1894``:
      * the leading year is the publication date;
      * ``(N)`` with N numeric means N publications that year;
      * ``(text)`` with non-numeric text means one publication, with the
        text used as comment;
      * ``#text#`` gives a comment and overrides a parenthesised one;
      * anything else means one publication without comment.

    Empty entries (trailing ';', blank line) are skipped instead of
    producing a row with an empty year.

    Args:
        ldates: text found on the right of ':' in an input line.
        id_author: author id written in every row.
        id_paper: paper id (string) written in every row.
        nbPaper: number used as first part of the publication id.
        nbPub: first sequence number used as second part of the id.
            NOTE: the counter is local; the caller's value is not updated.
        out: writable file-like object receiving the rows. Defaults to the
            module-level ``file_publication``.

    Returns:
        dict mapping each date string to its publication count: the digit
        string found between parentheses, or the integer 1.
    """
    if out is None:
        out = file_publication

    print("list of dates for [ %s ]" % (id_paper,))
    PY = {}
    for dateStr in ldates.split(";"):
        date = dateStr.strip()
        if not date:
            continue

        # NOTE(review): dates separated by ',' instead of ';' are a writing
        # error in the input file and are still not detected here.
        year_match = re.match(r"\d{4}(?!\d)", date)
        if year_match:
            realdate = year_match.group(0)
            nbpubstr = date[year_match.end():].strip()
        else:
            # No leading 4-digit year: keep the historical fixed-width split
            # (5 characters, all whitespace removed).
            realdate = ''.join(date[0:5].split())
            nbpubstr = date[5:]
        print("\tDate [ %s ]" % (realdate,))

        num = ''
        comment = ''
        count = 1
        if nbpubstr:
            print("\tNb Pub = %s" % (nbpubstr,))
            pos_paradeb = nbpubstr.find('(')
            pos_paraend = nbpubstr.find(')')
            if pos_paradeb != -1 and pos_paraend != -1:
                num = nbpubstr[pos_paradeb + 1:pos_paraend]
                if num.isdigit():
                    count = int(num)
                else:
                    # Not a number: one publication, the text is a comment.
                    comment = num
                    print("\tNb Pub = 1 and Comment :  %s" % (comment,))
            else:
                print("\tNb Pub = 1 (a) !!!")

            pos_dashdeb = nbpubstr.find('#')
            pos_dashend = nbpubstr.rfind('#')
            # Two distinct '#' are required; a single one used to wipe the
            # comment found between parentheses.
            if pos_dashdeb != -1 and pos_dashend > pos_dashdeb:
                comment = nbpubstr[pos_dashdeb + 1:pos_dashend]
                print("\tCommentaire =  %s" % (comment,))
        else:
            print("\tNb Pub = 1 (b) !!!")

        PY[realdate] = num if num.isdigit() else 1

        for _ in range(count):
            id_pub = ''.join(['Publi', str(nbPaper), '-', str(nbPub)])
            nbPub = nbPub + 1
            out.write(''.join([id_pub, ';UNKNOWN;', str(realdate), ';',
                               comment, ';', id_author, ';', id_paper, '\n']))
    return PY
134
 
135
def findAuthor(path_file):
    """Return the author id encoded in an input file name.

    The id is the part of the file name located before the first '_',
    e.g. ".../txt//Act9_DiGiacomo.txt" gives "Act9".

    Args:
        path_file: path (or bare name) of the input file.

    Returns:
        The prefix of the file name, or "UNKNOWN_ID" when the file name
        contains no underscore.
    """
    name_file = os.path.basename(path_file)
    prefix_id, separator, _ = name_file.partition('_')
    if not separator:
        # Previously the name was silently truncated by one character.
        print("NO UNDERSCORE in file name [ %s ] ??? isn't it strange ?" % (name_file,))
        return "UNKNOWN_ID"
    return prefix_id
146

    
147
def addAuthorToPaper(id_author, paper):
    """Register an author on a paper unless it is already listed.

    ``paper["authors"]`` may be empty, a string such as "Act1, Act2, " or
    a list of such fragments. Ids are compared exactly, so "Act1" is no
    longer considered present because "Act10" is. When the author is
    added, the field is rewritten as a string in which every id is
    followed by ", ".

    Args:
        id_author: id of the author to add.
        paper: paper dict; its "authors" entry is updated in place.
    """
    print("addAuthorToPaper === >  %s  in  %s" % (id_author, paper["id"]))

    raw_authors = paper["authors"] or ''
    if isinstance(raw_authors, (list, tuple)):
        raw_authors = ','.join(raw_authors)

    known_ids = [part.strip() for part in raw_authors.split(',') if part.strip()]

    if id_author in known_ids:
        print("ALREADY in the list of authors !!!!!!")
        return

    if known_ids:
        print("ADDING  %s  to the list of authors" % (id_author,))
    else:
        print("NO AUTHOR FOR THIS PAPER YET - ADDING ++++")

    known_ids.append(id_author)
    paper["authors"] = ''.join([known + ', ' for known in known_ids])
    print("NEW LIST :  %s" % (paper["authors"],))
185

    
186
""""""""""""""""""""""""""""""
187
""" Prepare the name for the paper """
188
""""""""""""""""""""""""""""""
189
def preparePaperName(name, papers):
    """Build the normalised paper name used as key in the papers table.

    Everything from the first '(' onwards is dropped, surrounding
    whitespace (spaces, tabs, newlines) is removed, the text is
    upper-cased and wrapped in square brackets:
    "Rivista Minima (5 novelle)" gives "[RIVISTA MINIMA]".

    Args:
        name: raw text found on the left of ':' in an input line.
        papers: unused; kept so existing callers keep working.

    Returns:
        The bracketed, upper-cased paper name.
    """
    head, _, _ = name.partition('(')
    return ''.join(['[', head.strip().upper(), ']'])
207

    
208
""""""""""""""""""""""""""""""
209
""" Initialize a paper with this 
210
    name """
211
""""""""""""""""""""""""""""""
212
def setPaper(paper_name, nbpaper, papers):
    """Create an empty paper entry and register it under its name.

    Args:
        paper_name: normalised paper name, used as key in ``papers``.
        nbpaper: number appended to "Paper" to build the paper id.
        papers: dict of papers, updated in place.

    Returns:
        The newly created paper dict (the same object stored in ``papers``).
    """
    id_paper = ''.join(['Paper', str(nbpaper)])
    print("----->> NEW PAPER  %s with id :  %s" % (paper_name, id_paper))
    paper = {
        "id": id_paper,
        "name": paper_name,
        "comment": "",
        "first_date": "",
        "last_date": "",
        "authors": "",
    }
    papers[paper_name] = paper
    return paper
218

    
219
"""Fonction inutile en réalité !!!""" 
220
def getPaper(id_paper, papers):
    """Return the first paper whose "id" equals ``id_paper``, or -1.

    Not called by the code visible in this file.
    """
    matching = (entry for entry in papers.values() if entry["id"] == id_paper)
    return next(matching, -1)
227

    
228
""""""""""""""""""""""""""""""
229
""" Find which paper corresponds
230
    to that name """
231
""""""""""""""""""""""""""""""
232
def getPaperByName(paper_name, papers):
    """Return the first paper whose "name" equals ``paper_name``, or -1."""
    for candidate in papers.values():
        if candidate["name"] == paper_name:
            return candidate
    return -1
239

    
240
""""""""""""""""""""""""""""""
241
""" Set the min and max among all
242
    the publication dates for 
243
    this paper """
244
""""""""""""""""""""""""""""""
245
def setDatesForPaper(dates, paper):
    """Widen the first/last publication dates of a paper.

    The current "first_date" / "last_date" of ``paper`` (if numeric) are
    combined with every key of ``dates``; the smallest and largest years
    are stored back as integers. Non-numeric values are ignored instead of
    raising ValueError, and when no usable year exists at all the paper
    fields are left untouched (no artificial 2000 / 1000 bounds).

    Args:
        dates: dict whose keys are years (strings or ints).
        paper: paper dict, updated in place.
    """
    def _as_year(value):
        # int() accepts surrounding blanks; anything else means "no year".
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    #### min date ####
    mindate = _as_year(paper["first_date"])
    if mindate is not None:
        print("\t\t HAS mindate =>  %s" % (mindate,))
    elif paper["first_date"]:
        print("\t\t IGNORING non-numeric first_date [ %s ]" % (paper["first_date"],))

    #### max date ####
    maxdate = _as_year(paper["last_date"])
    if maxdate is not None:
        print("\t\t HAS maxdate =>  %s" % (maxdate,))
    elif paper["last_date"]:
        print("\t\t IGNORING non-numeric last_date [ %s ]" % (paper["last_date"],))

    ## Find if there are other min or max among the dates
    for date in dates:
        year = _as_year(date)
        if year is None:
            print("\t\t IGNORING non-numeric date [ %s ]" % (date,))
            continue
        if mindate is None or year < mindate:
            mindate = year
            print("\t\t CHANGING mindate =>  %s" % (mindate,))
        if maxdate is None or year > maxdate:
            maxdate = year
            print("\t\t CHANGING maxdate =>  %s" % (maxdate,))

    if mindate is not None:
        paper["first_date"] = mindate
    if maxdate is not None:
        paper["last_date"] = maxdate
272

    
273
""""""""""""""""""""""""""""""
274
""" Output the resulted table 
275
    about Papers """
276
""""""""""""""""""""""""""""""
277
def writePapersTable(papers, out=None):
    """Write one ' ; '-separated row per paper to the papers table.

    Row layout: id ; name ; (empty comment) ; first_date ; last_date ;
    authors. The "authors" value may be a string or a list of strings;
    it is concatenated as is.

    Args:
        papers: dict of paper dicts (keys "id", "name", "first_date",
            "last_date", "authors").
        out: writable file-like object. Defaults to the module-level
            ``file_paper``.
    """
    if out is None:
        out = file_paper

    for paper in papers.values():
        authors = ''.join(paper["authors"])
        out.write(''.join([
            paper["id"], ' ; ', paper["name"], ' ; ; ',
            str(paper["first_date"]), ' ; ', str(paper["last_date"]),
            ' ; ', authors, '\n',
        ]))
    # Push the rows to disk now; the original code never flushed the file
    # explicitly.
    out.flush()
290

    
291
def getMaxId(papers):
    """Return the highest paper number found in the papers table.

    The number is what follows the first five characters of the paper id
    (ids built by this script look like "Paper12"). Ids whose remainder is
    not an integer are reported and skipped instead of raising ValueError.

    Args:
        papers: dict of paper dicts, each with an "id" entry.

    Returns:
        The largest number found, or 0 when there is none.
    """
    max_id = 0
    for paper in papers.values():
        id_str = str(paper["id"]).strip()[5:]
        print("id =  %s" % (id_str,))
        try:
            number = int(id_str)
        except ValueError:
            print("Skipping unexpected paper id [ %s ]" % (paper["id"],))
            continue
        if number > max_id:
            max_id = number
    return max_id
301

    
302
def treatFiles():
    """Process every input file and fill the two result tables.

    For each file of ``onlyfiles`` (except '.DS_Store') the author id is
    taken from the file name, then every line of the form
    ``<paper name> (...): <date>; <date> (...); ...`` is split at the first
    ':'. The left part identifies (or creates) the paper, the right part
    is handed to prepareListDates(), which writes the publication rows.
    The papers table is written once all files have been read.

    Lines without ':' (blank lines for instance) are skipped; they used to
    be mis-sliced into a bogus paper.
    """
    papers = prepareListPapers()
    nbPub = 1
    nbPaper = getMaxId(papers)

    for fi in onlyfiles:
        if fi == '.DS_Store':
            print("FILE Macintosh is :  %s" % (fi,))
            continue

        mypath_file = ''.join([mypath_input, '/', fi])
        print("############### READING FILE :  %s" % (mypath_file,))

        ######### Create Author ID #########
        id_author = findAuthor(mypath_file)

        ######### Analyse each line (left str and right str of ':') #########
        # example:
        # Rivista Minima (5 novelle): 1872; 1873; 1877 (dal ciclo Mio figlio! ma non so quante); 1879; 1883
        with open(mypath_file, "r") as text_file:
            for l in text_file:
                pos_dot = l.find(':')
                if pos_dot == -1:
                    if l.strip():
                        print("SKIPPING line without ':' :  %s" % (l.strip(),))
                    continue
                left_line = l[:pos_dot]
                right_line = l[pos_dot + 1:]

                ### Prepare name of the journal with the left string ###
                paper_name = preparePaperName(left_line, papers)
                ### Create the paper and add its info in the table ###
                paper = getPaperByName(paper_name, papers)
                if paper == -1:
                    nbPaper = nbPaper + 1
                    paper = setPaper(paper_name, nbPaper, papers)

                addAuthorToPaper(id_author, paper)

                dates = prepareListDates(right_line, id_author, paper["id"], nbPaper, nbPub)
                # prepareListDates() cannot hand its counter back, so the
                # counter is advanced from the per-date counts it returns;
                # it used to stay at 1, giving repeated publication ids.
                # NOTE(review): a year repeated inside one line is counted
                # once here - confirm whether that occurs in the data.
                nbPub = nbPub + sum(int(count) for count in dates.values())
                setDatesForPaper(dates, paper)

                print("------ %s --------" % (paper_name,))

        print("##############################")

    writePapersTable(papers)
    print("Nb papers =  %s" % (len(papers),))
352
# Run the conversion, then close both result tables even if it fails, so
# that whatever was written reaches the disk.
try:
    treatFiles()
finally:
    file_publication.close()
    file_paper.close()
353