# findDuplicatePapers.py
#
# Severine Gedzelman, 10/08/2018 16:41
#
# Download (1.83 KB)

 
1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
3

    
4
import os
5
import re
6
import unicodedata
7
import sys
8
import csv
9
import string
10

    
11
from os import listdir, rename
12
from os.path import isfile, join
13

    
14
## Directory from which the script is launched (current working directory).
mypath = os.getcwd()

## Path of the '#'-delimited papers file, taken from the first command-line
## argument.
if len(sys.argv) < 2:
    # Fail with a readable usage message instead of a bare IndexError
    # traceback when the argument is missing.
    sys.exit("Usage: %s <papers_csv_file>" % sys.argv[0])
mypath_input_papers = sys.argv[1]
17

    
18
def prepareListPapers(path=None):
    """Parse the '#'-delimited papers file into a dict keyed by paper name.

    The first row is treated as a header: only its first column is echoed to
    stdout.  Every following row is expected to provide at least six columns,
    in this order: id, name, comment, first_date, last_date, authors.
    Blank rows are ignored; rows with fewer than six columns are skipped with
    a warning on stderr instead of aborting the whole run.

    Args:
        path: file to read.  Defaults to the module-level
            ``mypath_input_papers`` so existing zero-argument calls keep
            working.

    Returns:
        dict mapping each paper name (surrounding spaces removed) to a dict
        with the keys "id", "name", "comment", "first_date", "last_date" and
        "authors".  When two rows share the same name, the later row replaces
        the earlier one.
    """
    if path is None:
        path = mypath_input_papers

    papers_map = {}
    with open(path) as csvfile:
        for index, row in enumerate(csv.reader(csvfile, delimiter='#')):
            if index == 0:
                # Header row: echoed for information only, never stored.
                first_column = row[0] if row else ""
                print("First line (and first column) :  %s" % (first_column,))
                continue

            if len(row) < 6:
                # Blank lines come back as an empty list; anything else that
                # is too short is reported so the input file can be fixed.
                if row:
                    sys.stderr.write(
                        "Skipping malformed row %d: %r\n" % (index + 1, row))
                continue

            name_paper = row[1].strip(" ")
            # Drop every whitespace character from the id.  split()/join works
            # on both Python 2 and 3, unlike str.translate(None, ...).
            id_paper = "".join(row[0].split())

            papers_map[name_paper] = {
                "id": id_paper,
                "name": name_paper,
                "comment": row[2],
                "first_date": row[3],
                "last_date": row[4],
                "authors": row[5],
            }
            # Single pre-formatted string: prints the same text whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print("PAPER : [ %s ]" % (id_paper,))
    return papers_map
41

    
42
def _trimmed_paper_name(paper):
    """Return the paper's name without surrounding spaces and without its first and last character."""
    # Presumably the first/last characters are quote marks kept from the
    # input file -- TODO confirm against the actual data.
    return paper["name"].strip(" ")[1:-1]


def findDuplicateNames(papers, paper_key):
    """Report papers whose trimmed name is strictly contained in that of ``papers[paper_key]``.

    Names are compared after removing surrounding spaces and then the first
    and last character.  For every other paper whose trimmed name is a
    substring of (but not equal to) the reference paper's trimmed name, one
    line is printed in the form::

        # <id> # <name> # <other id> # <other name>

    Args:
        papers: dict of paper records; each record needs "id" and "name".
        paper_key: key in ``papers`` of the reference paper.

    Returns:
        list of ``(id, name, other_id, other_name)`` tuples, one per printed
        line, in the iteration order of ``papers``.
    """
    reference = papers[paper_key]
    reference_name = _trimmed_paper_name(reference)
    reference_id = reference["id"]

    matches = []
    for other in papers.values():
        other_name = _trimmed_paper_name(other)
        if not other_name:
            # An empty string is a substring of every name, so it would be
            # reported as a duplicate of every single paper.
            continue
        if other_name != reference_name and other_name in reference_name:
            match = (reference_id, reference_name, other["id"], other_name)
            matches.append(match)
            # Single pre-formatted string: prints the same text whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print("# %s # %s # %s # %s" % match)
    return matches
57

    
58
# Script driver: load every paper record, then run the duplicate-name check
# once per paper, using each paper in turn as the reference.
papers = prepareListPapers()
for paper in papers:
    findDuplicateNames(papers, paper)
61