-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean-csv.py
46 lines (40 loc) · 1.36 KB
/
clean-csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import csv
import sys
# Input used when no filename is given on the command line.
_DEFAULT_FILENAME = 'projects.csv'
# Columns kept for every input except the "collected" file; duplicates kept.
_DEFAULT_FIELDS = ['tel', 'legacy_id', 'proposal_id', 'scheduled', 'downtime',
                   'actual']
# The one filename that selects the alternative column set and de-duplication.
_COLLECTED_FILENAME = 'projects_collected.csv'
_COLLECTED_FIELDS = ['legacy_id', 'proposal_id', 'professional_status_type',
                     'affiliation_name', 'country', 'domestic']
# Characters removed from the right-hand end of an upper-cased legacy_id.
_TRAILING_LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'


def _log(*parts):
    """Write the space-separated parts plus a newline to stderr.

    Produces the same text as the Python 2 ``print >>sys.stderr, a, b``
    statement, but also runs on Python 3.
    """
    sys.stderr.write(' '.join(str(part) for part in parts) + '\n')


def _open_csv(path, mode):
    """Open ``path`` in the mode the csv module expects.

    Python 2's csv module wants binary files; Python 3's wants text files
    opened with ``newline=''`` so the module controls line endings itself.
    NOTE(review): no encoding is given, so Python 3 uses the locale default;
    confirm the encoding of the real input files.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + 'b')
    return open(path, mode, newline='')


def _clean_record(record, fields):
    """Return a new dict holding the cleaned values of ``fields``.

    Every value has surrounding whitespace stripped and '\\xA0' characters
    removed.  A 'legacy_id' value is additionally upper-cased and loses any
    trailing A-Z letters; a message is logged whenever that changes it
    (including a change of case only).

    Raises KeyError if ``record`` lacks one of ``fields``.
    """
    cleaned = {}
    for field in fields:
        value = record[field].strip().replace('\xA0', '')
        if field == 'legacy_id':
            shortened = value.upper().rstrip(_TRAILING_LETTERS)
            if shortened != value:
                _log('Stripped %s to %s' % (value, shortened))
            value = shortened
        cleaned[field] = value
    return cleaned


def _clean_records(records, fields, no_dupes):
    """Clean every record, optionally dropping repeated cleaned rows.

    With ``no_dupes`` true only the first occurrence of each cleaned row is
    kept; input order is preserved either way.
    """
    cleaned_rows = []
    seen = set()
    for record in records:
        cleaned = _clean_record(record, fields)
        if no_dupes:
            # Each cleaned row has exactly the keys in ``fields``, so the
            # tuple of its values identifies it; a set lookup replaces the
            # former scan over every row written so far.
            key = tuple(cleaned[field] for field in fields)
            if key in seen:
                continue
            seen.add(key)
        cleaned_rows.append(cleaned)
    return cleaned_rows


def main(argv=None):
    """Clean a CSV file in place.

    ``argv`` defaults to ``sys.argv``; ``argv[1]``, when present, names the
    file to process, otherwise 'projects.csv' is used.  A file named exactly
    'projects_collected.csv' uses its own column list and has duplicate rows
    removed; any other file uses the default column list and keeps them.
    Only the selected columns are written back.
    """
    if argv is None:
        argv = sys.argv
    # The original tested ``sys.argv > 1`` (a list against an int), which is
    # always true on Python 2 and a TypeError on Python 3, so the default
    # filename could never be used.
    filename = argv[1] if len(argv) > 1 else _DEFAULT_FILENAME

    if filename == _COLLECTED_FILENAME:
        fields, no_dupes = _COLLECTED_FIELDS, True
    else:
        fields, no_dupes = _DEFAULT_FIELDS, False

    _log('Processing', filename)
    _log('Fields:', ', '.join(fields))
    _log('Avoid duplicate rows:', no_dupes)

    with _open_csv(filename, 'r') as source:
        records = list(csv.DictReader(source))

    # All cleaning happens before the file is reopened for writing, so an
    # error such as a missing column leaves the input untouched.
    cleaned_rows = _clean_records(records, fields, no_dupes)

    with _open_csv(filename, 'w') as target:
        writer = csv.DictWriter(target, fields)
        # writeheader() ends the header with the same line terminator as the
        # data rows; the hand-written header used a bare '\n'.
        writer.writeheader()
        writer.writerows(cleaned_rows)


if __name__ == '__main__':
    main()