-
Notifications
You must be signed in to change notification settings - Fork 1
/
clean.py
191 lines (147 loc) · 7.63 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import csv
import re
from geocode import geocode_address
import time
import sys
# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
# NOTE(review): reload(sys) is presumably needed because setdefaultencoding
# is not reachable on the sys module as first imported -- confirm before
# porting; neither call is available in this form on Python 3.
reload(sys)
sys.setdefaultencoding("utf-8")
# Common delimiters of extra information in the street_address field
# (unit numbers, parenthetical notes, ...).
# NOTE(review): the alternatives carry no word boundaries, so a marker such as
# "unit" also matches inside a longer word -- confirm this is acceptable.
_UNIT_MARKER_RE = re.compile(
    r'(apt|room|#|unit|suite|bldg|\(|downstairs|upstairs|;)', re.IGNORECASE)

# Run of non-digits at the very beginning of an address.
# For example: "Islander Lodge Motel, 2428 Central Ave"
_NON_DIGIT_START_RE = re.compile(r'^\D*')

# Spaces and punctuation stripped off both ends of clean_address and extra
_PUNCT_TO_STRIP = ' .,;'

# Column order of the output CSV
_FIELDNAMES = [
    'street_address', 'city', 'unit', 'date_issued', 'clean_address',
    'extras', 'geocode_status', 'geocode_place_name', 'lat', 'lon',
]


def _split_address(street_address):
    """Split a raw street address into a ``(clean_address, extra)`` pair.

    Everything from the first unit marker onwards, plus any non-digit text
    in front of the street number, goes into ``extra``; what remains is
    ``clean_address``. ``extra`` is None when nothing was split off.
    """
    marker = _UNIT_MARKER_RE.search(street_address)
    if marker:
        clean_address = street_address[:marker.start()]
        extra = street_address[marker.start():]
    else:
        clean_address = street_address
        extra = None

    # The pattern can match the empty string, so match() always succeeds;
    # only a non-empty prefix has to be moved over to the extras.
    prefix = _NON_DIGIT_START_RE.match(clean_address).group()
    if prefix:
        extra = prefix + extra if extra else prefix
        clean_address = clean_address[len(prefix):]

    clean_address = clean_address.strip(_PUNCT_TO_STRIP)
    if extra:
        extra = extra.strip(_PUNCT_TO_STRIP)
    return clean_address, extra


def _geocode_status(geocode_result, city, proxim_lat, proxim_lon):
    """Classify a geocode result as SUCCESS, one of the CHECK states or FAILED."""
    if not geocode_result:
        return "FAILED"
    if float(geocode_result['relevance']) >= 0.9:
        # If the geocoder just picked up the city, it needs a manual check
        if geocode_result['place_name'] == city:
            return "CHECK - CITY ONLY"
        # Same if the coordinates are unreasonably far (more than 1 degree)
        # from the proximity coordinates
        if (abs(float(geocode_result['lat']) - proxim_lat) > 1
                or abs(float(geocode_result['lon']) - proxim_lon) > 1):
            return "CHECK - OUT OF RANGE"
        return "SUCCESS"
    return "CHECK"


def read_file(source_file, output_file):
    """Clean and geocode the addresses of a CSV file into a new CSV file.

    Each input row must provide the columns ``street_address``, ``city``,
    ``unit`` and ``date_issued``. Rows with an empty ``street_address`` are
    skipped. For every other row the street address is split into a clean
    address and extras, ``", <city>, CA"`` is appended, the result is passed
    to ``geocode_address`` and one output row is written with the original
    columns plus ``clean_address``, ``extras``, ``geocode_status``,
    ``geocode_place_name``, ``lat`` and ``lon``.

    Progress is printed per row and the status counts are printed at the end.

    Args:
        source_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write (overwritten if present).
    """
    # Coordinates handed to the geocoder as a proximity hint and reused for
    # the out-of-range sanity check
    proxim_lat = 37.8044
    proxim_lon = -122.2697

    # Geocode status counts
    success_count = 0
    check_count = 0
    fail_count = 0
    empty_row_count = 0

    # NOTE(review): the csv documentation asks for binary mode on Python 2
    # (and newline='' on Python 3) for the output file; the original 'w' mode
    # is kept so the produced file does not change.
    with open(source_file) as csv_source_file, \
            open(output_file, 'w') as csv_output_file:
        reader = csv.DictReader(csv_source_file)
        writer = csv.DictWriter(csv_output_file, fieldnames=_FIELDNAMES)
        writer.writeheader()

        # i is the 1-based row number, used in the progress output
        for i, row in enumerate(reader, 1):
            street_address = row['street_address']
            # If the street address field is empty, skip the row
            if not street_address:
                print("row {} is empty".format(i))
                empty_row_count += 1
                continue

            clean_address, extra = _split_address(street_address)
            clean_address = ', '.join([clean_address, row['city'], 'CA'])

            geocode_result = geocode_address(
                clean_address, proxim_lat=proxim_lat, proxim_lon=proxim_lon)
            geocode_status = _geocode_status(
                geocode_result, row['city'], proxim_lat, proxim_lon)

            if geocode_result:
                geocode_place_name = geocode_result['place_name']
                lat = geocode_result['lat']
                lon = geocode_result['lon']
            else:
                geocode_place_name = None
                lat = None
                lon = None

            if geocode_status == "SUCCESS":
                success_count += 1
            elif geocode_status == "FAILED":
                fail_count += 1
            else:
                check_count += 1

            print("row {}: {} {}".format(i, clean_address, geocode_status))

            record = {
                'street_address': row['street_address'],
                'city': row['city'],
                'unit': row['unit'],
                'date_issued': row['date_issued'],
                'clean_address': clean_address,
                'extras': extra,
                'geocode_status': geocode_status,
                'geocode_place_name': geocode_place_name,
                'lat': lat,
                'lon': lon,
            }
            try:
                writer.writerow(record)
            except Exception as e:
                # Best effort: keep the row, but record the error message in
                # place of the geocoder output that could not be written.
                writer.writerow(dict(
                    record, geocode_place_name=str(e), lat=None, lon=None))
                print(str(e))

            # Sleep for 0.1 secs to prevent exceeding the request limits for
            # the mapbox geocoder. Should result in fewer than 600 requests
            # per minute with all of the reading/writing and geocoding
            # requests themselves.
            time.sleep(0.1)

        print("Success count: {}".format(success_count))
        print("Check count: {}".format(check_count))
        print("Failure count: {}".format(fail_count))
        print("Empty row count: {}".format(empty_row_count))