findDuplicatePublicationIds.py

Severine Gedzelman, 10/08/2018 16:41

Télécharger (1017 octet)

 
1
#!/usr/bin/python
2
# -*- coding: utf-8 -*-
3

    
4
import os
5
import re
6
import unicodedata
7
import sys
8
import csv
9
import string
10

    
11
from os import listdir, rename
12
from os.path import isfile, join
13

    
14
## Directory from which the script is launched (current working directory).
15
mypath = os.getcwd()
16
# Path of the input file, taken from the first command-line argument.
# Evaluated at import time: running the script without an argument raises IndexError here.
mypath_input_publications = sys.argv[1]
17

    
18
def findDuplicates():
  """Report duplicated publication ids found in the input CSV file.

  Reads the file named by the module-level ``mypath_input_publications``
  as a '#'-delimited CSV. The very first row is treated as a header: its
  first column is printed and the row is otherwise ignored. For every
  following row the first column is taken as the publication id, every
  character listed in ``string.whitespace`` is removed from it, and a
  message is printed each time an already seen id shows up again.

  Rows without any column (blank lines) are skipped instead of raising
  IndexError.

  Returns:
    list: the distinct normalized ids, in order of first appearance.

  Raises:
    IOError/OSError: if the input file cannot be opened.
  """
  publications_map = []
  # Mirror of publications_map used for constant-time membership tests;
  # the list is kept because callers get the ids in first-seen order.
  seen_ids = set()

  ##############################
  # Parsing of the CSV file    #
  ##############################
  with open(mypath_input_publications) as csvfile:
    filereader = csv.reader(csvfile, delimiter='#')
    for i, row in enumerate(filereader):
      if not row:
        # csv.reader yields an empty list for a blank line: no id to read.
        continue
      if i == 0:
        # Single-argument print(...) behaves the same on Python 2 and 3;
        # the two spaces reproduce the separator the print statement added.
        print("First line (and first column) :  %s" % row[0])
        continue
      # Strip every whitespace character, not only leading/trailing ones.
      id_publi = "".join(ch for ch in row[0] if ch not in string.whitespace)
      if id_publi in seen_ids:
        print("\t\t FIND DUPLICATE [ %s ]" % id_publi)
      else:
        seen_ids.add(id_publi)
        publications_map.append(id_publi)
  return publications_map
41

    
42

    
43
# Run the duplicate scan; the returned list of ids is discarded, only the printed report is used.
findDuplicates()
44