findDuplicatePublicationIds.py
1 |
#!/usr/bin/python
|
---|---|
2 |
# -*- coding: utf-8 -*-
|
3 |
|
4 |
import os |
5 |
import re |
6 |
import unicodedata |
7 |
import sys |
8 |
import csv |
9 |
import string |
10 |
|
11 |
from os import listdir, rename |
12 |
from os.path import isfile, join |
13 |
|
14 |
## Directory the script was launched from (kept for parity with sibling scripts).
mypath = os.getcwd()

## Path of the input publications CSV, taken from the command line.
## Guard the access: a missing argument used to die with a bare
## IndexError; exit with a readable usage message instead.
if len(sys.argv) < 2:
    sys.exit("Usage: %s <publications_csv>" % sys.argv[0])
mypath_input_publications = sys.argv[1]
17 |
|
18 |
def findDuplicates(csv_path=None):
    """Scan a '#'-delimited CSV of publications and report duplicate ids.

    The first column of each data row is a publication id; every
    whitespace character is stripped from it before comparison.  The
    first row is treated as a header and only echoed.  Each duplicate id
    is reported on stdout as it is found.

    Args:
        csv_path: path of the CSV file to scan.  Defaults to the
            command-line path stored in the module-level
            ``mypath_input_publications`` (backward-compatible with the
            original zero-argument call).

    Returns:
        list of unique publication ids, in first-seen order.
    """
    if csv_path is None:
        csv_path = mypath_input_publications

    publications_map = []  # unique ids, in first-seen order (returned)
    seen = set()           # same ids, for O(1) duplicate checks
                           # (the original did `in list` -> O(n^2) overall)

    ##############################
    #   PARSING of the CSV file  #
    ##############################
    with open(csv_path) as csvfile:
        filereader = csv.reader(csvfile, delimiter='#')
        for i, row in enumerate(filereader):
            if i == 0:
                # Header row: echo it, do not treat it as data.
                print("First line (and first column) :  %s" % row[0])
                continue
            # Strip ALL whitespace from the id.  The original used
            # str.translate(None, string.whitespace), which is
            # Python-2-only; split()/join is the portable equivalent.
            id_publi = "".join(row[0].split())
            if id_publi in seen:
                print("\t\t FIND DUPLICATE [ %s ]" % id_publi)
            else:
                seen.add(id_publi)
                publications_map.append(id_publi)
    return publications_map
|
41 |
|
42 |
|
43 |
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    findDuplicates()
44 |
|