1
|
|
2
|
|
3
|
|
4
|
import os
|
5
|
import re
|
6
|
import unicodedata
|
7
|
import sys
|
8
|
import csv
|
9
|
import string
|
10
|
|
11
|
from os import listdir, rename
|
12
|
from os.path import isfile, join
|
13
|
|
14
|
|
15
|
# ---------------------------------------------------------------------------
# Script configuration, taken from the command line:
#   argv[1]  directory receiving the two generated CSV tables
#   argv[2]  directory containing one text file per author
#   argv[3]  path of the '#'-delimited file listing the already-known papers
# ---------------------------------------------------------------------------
mypath = os.getcwd()
mypath_results = sys.argv[1]
mypath_input = sys.argv[2]
mypath_input_papers = sys.argv[3]

# Publications table: opened once here and written to by prepareListDates().
respubfile = join(mypath_results, "table_publications_2.csv")
file_publication = open(respubfile, 'w')
strLinePub = 'id_publication ; name ; year ; comment ; id_author ; id_paper \n'
file_publication.write(strLinePub)

# Papers table: opened once here and written to by writePapersTable().
respaperfile = join(mypath_results, "table_papers_2.csv")
file_paper = open(respaperfile, 'w')
strLinePaper = 'id_paper ; name ; comment ; first_date ; last_date ; authors \n'
file_paper.write(strLinePaper)

# Regular files (sub-directories excluded) found directly in the input directory.
onlyfiles = [
    entry for entry in listdir(mypath_input)
    if isfile(join(mypath_input, entry))
]
|
32
|
|
33
|
|
34
|
|
35
|
|
36
|
|
37
|
|
38
|
def prepareListPapers():
    """Load the already-known papers from the file ``mypath_input_papers``.

    The file is read with ``csv.reader`` using '#' as the delimiter.  The
    first non-blank row is treated as a header: its first column is echoed
    and the row is otherwise ignored.  Every following non-blank row must
    provide at least six columns:
    id, name, comment, first_date, last_date, authors.

    Blank rows (which ``csv.reader`` yields as empty lists) are skipped
    instead of raising ``IndexError``.

    Returns:
        dict mapping the paper name (column 1 with surrounding spaces
        removed) to a record dict with the keys "id", "name", "comment",
        "first_date", "last_date" and "authors".  When a name occurs more
        than once, the last row wins.

    Raises:
        IndexError: if a non-blank data row has fewer than six columns.
    """
    papers_map = {}

    with open(mypath_input_papers) as csvfile:
        header_seen = False
        for row in csv.reader(csvfile, delimiter='#'):
            if not row:
                # Blank line in the file: nothing to index.
                continue

            if not header_seen:
                header_seen = True
                # Spacing reproduces the former `print a, b` output.
                print("First line (and first column) :  %s" % (row[0],))
                continue

            # Only plain spaces are trimmed, so that the key matches the
            # names produced from the author files.
            name_paper = row[1].strip(" ")
            papers_map[name_paper] = {
                "id": row[0],
                "name": name_paper,
                "comment": row[2],
                "first_date": row[3],
                "last_date": row[4],
                "authors": row[5],
            }

    return papers_map
|
59
|
|
60
|
def prepareListDates(ldates, id_author, id_paper, nbPaper, nbPub):
    """Parse the date part of an author line and write its publications.

    ``ldates`` is a ';'-separated list of entries.  In each entry the first
    five characters carry the year (all whitespace is removed from them) and
    the remainder may contain:

    * ``(N)``      - N is the number of publications for that year;
    * ``(text)``   - a single publication, ``text`` being its comment;
    * ``#text#``   - a comment, which takes precedence over ``(text)``.

    With no remainder, or without a numeric ``(N)``, exactly one publication
    is recorded.  One row per publication is written to the module-level
    ``file_publication`` in the form
    ``<id_pub>;UNKNOWN;<year>;<comment>;<id_author>;<id_paper>``, where
    ``id_pub`` is ``Publi<nbPaper>-<running nbPub>``.

    Entries whose year is empty (for instance after a trailing ';' or on a
    bare newline) are skipped: they used to produce a row without a year and
    an empty dictionary key that made the later ``int('')`` conversion fail.

    Args:
        ldates: raw text found after the ':' of an author line.
        id_author: author identifier (string) written in each row.
        id_paper: paper identifier (string) written in each row.
        nbPaper: number used as the first part of the publication id.
        nbPub: first value of the running publication counter.  It is only
            advanced locally; the caller does not get the new value back.

    Returns:
        dict mapping each year string to its publication count: the digit
        string taken from ``(N)`` when present, otherwise the integer 1.
    """
    print("list of dates for [ %s ]" % (id_paper,))

    PY = {}
    for dateStr in ldates.split(";"):
        date = dateStr.strip(" ")

        # Drop every whitespace character from the 5-character year field
        # (works on both Python 2 and 3, unlike str.translate(None, ...)).
        realdate = "".join(date[0:5].split())
        if not realdate:
            continue
        print("\tDate [ %s ]" % (realdate,))

        nbpubstr = date[5:]
        num = ''
        comment = ''
        if nbpubstr:
            print("\tNb Pub = %s" % (nbpubstr,))

            pos_paradeb = nbpubstr.find('(')
            pos_paraend = nbpubstr.find(')')
            if pos_paradeb != -1 and pos_paraend != -1:
                num = nbpubstr[pos_paradeb + 1:pos_paraend]
                if not num.isdigit():
                    # Non-numeric parenthesised text is a comment.
                    comment = num
                    print("\tNb Pub = 1 and Comment :  %s" % (comment,))
            else:
                print("\tNb Pub = 1 (a) !!!")

            # A '#...#' comment overrides a parenthesised one.
            pos_dashdeb = nbpubstr.find('#')
            pos_dashend = nbpubstr.rfind('#')
            if pos_dashdeb != -1 and pos_dashend != -1:
                comment = nbpubstr[pos_dashdeb + 1:pos_dashend]
                print("\tCommentaire =  %s" % (comment,))
        else:
            print("\tNb Pub = 1 (b) !!!")

        if num.isdigit():
            PY[realdate] = num
            nb_rows = int(num)
        else:
            PY[realdate] = 1
            nb_rows = 1

        for _ in range(nb_rows):
            id_pub = ''.join(['Publi', str(nbPaper), '-', str(nbPub)])
            nbPub = nbPub + 1
            file_publication.write(''.join([
                id_pub, ';UNKNOWN;', realdate, ';', comment, ';',
                id_author, ';', id_paper, '\n',
            ]))

    return PY
|
134
|
|
135
|
def findAuthor(path_file):
    """Return the author id encoded at the start of a file name.

    The id is the part of the file name (the text after the last '/')
    that precedes the first underscore.  When the file name contains no
    underscore the whole file name is returned; the previous slicing with
    the -1 returned by ``str.find`` silently dropped its last character.

    Args:
        path_file: path of the author file, using '/' as separator.

    Returns:
        The author id, or the string "UNKNOWN_ID" (after printing a
        warning) when ``path_file`` contains no '/'.
    """
    pos_slash = path_file.rfind('/')
    if pos_slash == -1:
        print("NO BACKSLASH ??? isn't it strange ?")
        return "UNKNOWN_ID"

    name_file = path_file[pos_slash + 1:]
    # partition() yields the full name when there is no underscore.
    return name_file.partition('_')[0]
|
146
|
|
147
|
def addAuthorToPaper(id_author, paper):
    """Register ``id_author`` in ``paper["authors"]`` unless already there.

    ``paper["authors"]`` may be empty, a string such as ``"A1, A2, "`` or a
    list of such fragments.  It is parsed into individual ids so that the
    duplicate check is an exact match.  The previous code compared the value
    with the ``list`` type itself (always false) and then relied on
    substring / raw list membership, so "A1" was wrongly considered present
    when "A10" was, and repeated authors could be appended twice.

    After an addition the field is stored as a single string in which every
    id is followed by ", " (for example ``"A1, A2, "``), which is the text
    that ``''.join(paper["authors"])`` used to produce.

    Args:
        id_author: identifier of the author to add.
        paper: paper record; only its "id" and "authors" keys are used and
            "authors" is updated in place.
    """
    # Spacing in the messages reproduces the former `print a, b` output.
    print("addAuthorToPaper === >  %s  in  %s" % (id_author, paper["id"]))

    stored = paper["authors"]
    if isinstance(stored, (list, tuple)):
        stored_text = ''.join(stored)
    else:
        stored_text = stored or ''

    known_ids = []
    for fragment in stored_text.split(','):
        fragment = fragment.strip()
        if fragment:
            known_ids.append(fragment)

    if id_author in known_ids:
        print("ALREADY (b) in the list of authors !!!!!!")
        return

    had_authors = bool(known_ids)
    if had_authors:
        print("NEW LIST :  %s" % (stored_text,))

    known_ids.append(id_author)
    updated = ''.join(['%s, ' % (author,) for author in known_ids])
    paper["authors"] = updated

    if had_authors:
        print("NEW LIST 2 :  %s" % (updated,))
    else:
        print("NO AUTHOR FOR THIS PAPER YET - ADDING ++++")
|
185
|
|
186
|
""""""""""""""""""""""""
|
187
|
|
188
|
""""""""""""""""""""""""
|
189
|
def preparePaperName(name, papers):
    """Build the canonical key of a paper from the raw text of a line.

    Everything from the first '(' onwards is discarded, surrounding spaces
    (spaces only, not tabs or newlines) are trimmed, the result is
    upper-cased and finally wrapped in square brackets.

    Args:
        name: raw paper name as found before the ':' of an author line.
        papers: not used by this function.

    Returns:
        The normalised name, e.g. ``"[LE MONDE]"``.
    """
    # partition() returns the whole text when there is no '('.
    before_parenthesis = name.partition('(')[0]
    core = before_parenthesis.strip(" ").upper()
    return '[' + core + "]"
|
207
|
|
208
|
""""""""""""""""""""""""
|
209
|
|
210
|
|
211
|
""""""""""""""""""""""""
|
212
|
def setPaper(paper_name, nbpaper, papers):
    """Create an empty record for a new paper and register it in ``papers``.

    The identifier is ``"Paper"`` followed by ``nbpaper``.  The record is
    stored under the key ``paper_name`` (replacing any record already stored
    under that key) with empty "comment", "first_date", "last_date" and
    "authors" fields.

    Args:
        paper_name: normalised paper name, used both as key and as "name".
        nbpaper: number appended to "Paper" to form the identifier.
        papers: dict of paper records, updated in place.

    Returns:
        The newly created record (the same object as the one stored).
    """
    id_paper = ''.join(['Paper', str(nbpaper)])
    # Single-argument print() behaves the same on Python 2 and 3; the
    # spacing reproduces the former `print a, b, c, d` output.
    print("----->> NEW PAPER  %s with id :  %s" % (paper_name, id_paper))

    paper = {
        "id": id_paper,
        "name": paper_name,
        "comment": "",
        "first_date": "",
        "last_date": "",
        "authors": "",
    }
    papers[paper_name] = paper
    return paper
|
218
|
|
219
|
|
220
|
def getPaper(id_paper, papers):
    """Look a paper record up by its identifier.

    Args:
        id_paper: value compared with the "id" field of every record.
        papers: dict whose values are paper records.

    Returns:
        The first record (in dict iteration order) whose "id" equals
        ``id_paper``, or -1 when none matches.
    """
    matching = (record for record in papers.values() if record["id"] == id_paper)
    return next(matching, -1)
|
227
|
|
228
|
""""""""""""""""""""""""
|
229
|
|
230
|
|
231
|
""""""""""""""""""""""""
|
232
|
def getPaperByName(paper_name, papers):
    """Look a paper record up by its "name" field.

    The dict keys are ignored; every record is inspected in iteration order.

    Args:
        paper_name: name to search for (exact match).
        papers: dict whose values are paper records.

    Returns:
        The first record whose "name" equals ``paper_name``, or -1 when no
        record matches.
    """
    found = -1
    for _key, record in papers.items():
        if record["name"] == paper_name:
            found = record
            break
    return found
|
239
|
|
240
|
""""""""""""""""""""""""
|
241
|
|
242
|
|
243
|
|
244
|
""""""""""""""""""""""""
|
245
|
def setDatesForPaper(dates, paper):
    """Widen the first/last year of ``paper`` with the years in ``dates``.

    The years already stored in ``paper["first_date"]`` and
    ``paper["last_date"]`` (if any) are combined with every key of
    ``dates``; the smallest and largest values are stored back as integers.

    The previous implementation started from the magic bounds 2000 and 1000,
    which gave a wrong first year for papers only published after 2000 (and
    a wrong last year before 1000) and stored 2000/1000 when no year was
    known at all.  A bound is now left untouched when nothing is known for
    it, and blank keys are ignored instead of failing in ``int('')``.

    Args:
        dates: dict whose keys are years (strings or integers); the values
            are not used.
        paper: paper record, updated in place.

    Raises:
        ValueError: if a non-blank year cannot be converted with ``int``.
    """
    mindate = _storedYear(paper["first_date"])
    if mindate is not None:
        # Spacing reproduces the former `print a, b` output.
        print("\t\t HAS mindate =>  %s" % (mindate,))

    maxdate = _storedYear(paper["last_date"])
    if maxdate is not None:
        print("\t\t HAS maxdate =>  %s" % (maxdate,))

    for date in dates:
        if not str(date).strip():
            continue
        year = int(date)
        if mindate is None or year < mindate:
            mindate = year
            print("\t\t CHANGING mindate =>  %s" % (mindate,))
        if maxdate is None or year > maxdate:
            maxdate = year
            print("\t\t CHANGING maxdate =>  %s" % (maxdate,))

    if mindate is not None:
        paper["first_date"] = mindate
    if maxdate is not None:
        paper["last_date"] = maxdate


def _storedYear(value):
    """Return ``value`` as an int, or None when it is None or blank."""
    if value is None:
        return None
    text = str(value).strip()
    if not text:
        return None
    return int(text)
|
272
|
|
273
|
""""""""""""""""""""""""
|
274
|
|
275
|
|
276
|
""""""""""""""""""""""""
|
277
|
def writePapersTable(papers):
    """Write one row per paper to the module-level ``file_paper``.

    Row layout (fields separated by " ; "):
    id ; name ; <empty comment column> ; first_date ; last_date ; authors

    The "comment" field of the record is not written; the comment column is
    always left empty.  The "authors" field is concatenated with
    ``''.join``.

    Args:
        papers: dict whose values are paper records.
    """
    for record in papers.values():
        name_paper = record["name"]
        id_paper = record["id"]
        first_date = str(record["first_date"])
        last_date = str(record["last_date"])
        authors = ''.join(record["authors"])

        row = [
            id_paper, ' ; ',
            name_paper, ' ; ; ',
            first_date, ' ; ',
            last_date, ' ; ',
            authors, '\n',
        ]
        file_paper.write(''.join(row))
|
290
|
|
291
|
def getMaxId(papers):
    """Return the highest numeric suffix among the paper identifiers.

    Each identifier is expected to be a five-character prefix followed by
    an integer (``setPaper`` builds them as ``"Paper<N>"``); the prefix is
    dropped without being checked.

    Args:
        papers: dict whose values are paper records with an "id" field.

    Returns:
        The largest suffix found, or 0 when ``papers`` is empty or every
        suffix is negative.

    Raises:
        ValueError: if the text after the first five characters of an
            identifier is not an integer.
    """
    max_id = 0
    for record in papers.values():
        id_str = record["id"][5:]
        # Single-argument print() behaves the same on Python 2 and 3; the
        # spacing reproduces the former `print a, b` output.
        print("id =  %s" % (id_str,))
        number = int(id_str)
        if number > max_id:
            max_id = number
    return max_id
|
301
|
|
302
|
def treatFiles():
    """Process every author file and generate the two output tables.

    Steps:
      1. load the known papers (``prepareListPapers``) and continue paper
         numbering after the highest existing id (``getMaxId``);
      2. for each file of ``onlyfiles`` except '.DS_Store', derive the
         author id from the file name and read its lines, each expected as
         ``<paper name> : <dates>``;
      3. for each line, find or create the paper, attach the author, write
         the publications (``prepareListDates``) and update the paper's
         first/last year (``setDatesForPaper``);
      4. write the papers table (``writePapersTable``).

    Changes compared with the previous version:
      * the publication counter is advanced after each line, so that
        publication ids are no longer reissued from 1 for every line;
      * lines without ':' are skipped (they used to be mis-sliced and fed
        to the date parser);
      * the input file is closed even when an error occurs.
    """
    papers = prepareListPapers()
    nbPub = 1
    nbPaper = getMaxId(papers)

    for fi in onlyfiles:
        # Spacing in the messages reproduces the former `print a, b` output.
        if fi == '.DS_Store':
            print("FILE Macintosh is :  %s" % (fi,))
            continue

        # findAuthor() relies on '/' as the path separator.
        mypath_file = ''.join([mypath_input, '/', fi])
        print("############### READING FILE :  %s" % (mypath_file,))
        id_author = findAuthor(mypath_file)

        with open(mypath_file, "r") as text_file:
            lines = text_file.readlines()

        for l in lines:
            pos_dot = l.find(':')
            if pos_dot == -1:
                if l.strip():
                    print("SKIPPING LINE WITHOUT ':' :  %s" % (l.strip(),))
                continue

            left_line = l[:pos_dot]
            right_line = l[pos_dot + 1:]

            paper_name = preparePaperName(left_line, papers)
            paper = getPaperByName(paper_name, papers)
            if paper == -1:
                nbPaper = nbPaper + 1
                paper = setPaper(paper_name, nbPaper, papers)

            addAuthorToPaper(id_author, paper)

            # NOTE(review): nbPaper is the running paper counter, not
            # necessarily the number of `paper` itself - confirm intent.
            dates = prepareListDates(right_line, id_author, paper["id"], nbPaper, nbPub)
            # prepareListDates() cannot hand its counter back, so advance
            # ours by the per-year counts it reports.  A year repeated
            # within one line is only counted once here.
            nbPub = nbPub + sum(int(count) for count in dates.values())
            setDatesForPaper(dates, paper)

            print("------ %s --------" % (paper_name,))

        print("##############################")

    writePapersTable(papers)
    print("Nb papers =  %s" % (len(papers),))
|
352
|
# Script entry point: the whole conversion runs as soon as this module is
# executed (or imported), using the paths and files prepared at module level.
treatFiles()
|
353
|
|