-
Notifications
You must be signed in to change notification settings - Fork 0
/
squash-csv.py
executable file
·67 lines (59 loc) · 2.1 KB
/
squash-csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
"""
Dedupe a CSV using the first column as a primary key
Specify inputfile, outputfile, and optionally the csv delimiter
and separating symbol you want to use to separate the data from
rows which are combined
"""
import csv
import sys
import getopt
# Usage text shown for -h, for unparsable options and for missing arguments.
_USAGE = ('dedupe.py -i <inputfile> -o <outputfile> '
          '[-d <csv delimiter>] [-s <separating symbol>]')


def _open_csv(path, mode):
    """Open *path* the way the csv module expects on the running interpreter."""
    if sys.version_info[0] < 3:
        # Python 2: the csv module works on binary file objects.
        return open(path, mode + 'b')
    # Python 3: text mode, with newline translation left to the csv module.
    return open(path, mode, newline='')


def _merge_row(merged, extra, symbol):
    """Fold the cells of *extra* into the list *merged*, in place."""
    for index, cell in enumerate(extra):
        if index >= len(merged):
            # Column not present in the earlier rows for this key: adopt it.
            merged.append(cell)
            continue
        current = merged[index]
        # str.split('') raises ValueError, so an empty symbol compares whole cells.
        seen = current.split(symbol) if symbol else [current]
        if cell != current and cell not in seen:
            merged[index] = current + symbol + cell


def main(argv):
    """Squash rows of a CSV file that share the same first-column value.

    The first row of the input is copied through as the header. Every
    following non-empty row is keyed on its first cell; when a key occurs
    again, each cell that differs from what is already stored for that
    column is appended to it, separated by the symbol. Keys are written
    in the order in which they first appear in the input.

    Args:
        argv: command-line arguments without the program name. Recognised
            options: -h, -i/--ifile, -o/--ofile, -d/--delimiter, -s/--symbol.

    Raises:
        SystemExit: after printing the usage text, with status 0 for -h
            and status 2 for unparsable options or a missing input/output
            file name.
    """
    # Local import so the block does not depend on a file-level import.
    from collections import OrderedDict

    inputfile = ''
    outputfile = ''
    delimiter = ','
    symbol = ' | '
    try:
        opts, _ = getopt.getopt(
            argv, "hi:o:d:s:", ["ifile=", "ofile=", "delimiter=", "symbol="])
    except getopt.GetoptError as err:
        # Exit here: continuing would reference the unassigned `opts`.
        print(str(err))
        print(_USAGE)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(_USAGE)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-d", "--delimiter"):
            delimiter = arg
        elif opt in ("-s", "--symbol"):
            symbol = arg
    if not (inputfile and outputfile):
        print(_USAGE)
        sys.exit(2)

    # Single-argument print() calls behave the same on Python 2 and 3.
    print('Deduping csv:  ' + inputfile)
    print('Into:  ' + outputfile)

    data = OrderedDict()  # key -> merged cells, in first-seen order
    with _open_csv(inputfile, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter)
        header = next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; they carry no key.
                continue
            key, cells = row[0], row[1:]
            if key in data:
                _merge_row(data[key], cells, symbol)
            else:
                data[key] = cells

    with _open_csv(outputfile, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=delimiter)
        if header is not None:  # an empty input file has no header row
            writer.writerow(header)
        for key, cells in data.items():
            writer.writerow([key] + cells)
    print("\nDone.")
# Script entry point: hand everything after the program name to main().
if __name__ == '__main__':
    cli_arguments = sys.argv[1:]
    main(cli_arguments)