Skip to content
This repository has been archived by the owner on Nov 20, 2020. It is now read-only.

Commit

Permalink
downloads contractor contribs and tries to match results to contractor list
Browse files Browse the repository at this point in the history
  • Loading branch information
gautamh committed Aug 4, 2017
1 parent eaa2d6c commit 5f42a9c
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 19 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,7 @@ nj_campfin_scraped.json
out.json
temp_filing.pdf
build/
dist/
dist/
*.csv
*.json
*.html
48 changes: 30 additions & 18 deletions njcampfin/get_contributor_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,23 +31,26 @@ def get_data_for_entity(entity):
except Exception as e:
print(traceback.format_exc())
print(entity + " FAILED")
return ''
return 'err'
except OSError as e:
if e.errno not in (errno.ECONNRESET, errno.ECONNABORTED, errno.EPIPE):
raise
else:
print(entity + " FAILED")
return ''
return 'err'
except ConnectionResetError:
print(entity + " FAILED")
return ''
return 'err'
except RemoteDisconnected:
print(entity + " FAILED")
return ''
return 'err'
except requests.exceptions.ConnectionError:
print(entity + " FAILED")
return ''
return 'err'
soup = BeautifulSoup(r.text)
if soup.find('input', {'id':'__VIEWSTATE'}) is None:
print(entity + " FAILED")
return 'err'
viewstate = soup.find('input', {'id':'__VIEWSTATE'})['value']
viewstategenerator = soup.find('input', {'id':'__VIEWSTATEGENERATOR'})['value']

Expand Down Expand Up @@ -106,27 +109,27 @@ def get_data_for_entity(entity):
except Exception as e:
print(traceback.format_exc())
print(entity + " FAILED")
return ''
return 'err'
except OSError as e:
if e.errno not in (errno.ECONNRESET, errno.ECONNABORTED, errno.EPIPE):
raise
else:
print(entity + " FAILED")
return ''
return 'err'
except ConnectionResetError:
print(entity + " FAILED")
return ''
return 'err'
except RemoteDisconnected:
print(entity + " FAILED")
return ''
return 'err'
except requests.exceptions.ConnectionError:
print(entity + " FAILED")
return ''
return 'err'
soup = BeautifulSoup(r.text)

if soup.find('input', {'id':'__VIEWSTATE'}) is None:
print(entity + " FAILED")
return ''
return 'err'
viewstate = soup.find('input', {'id':'__VIEWSTATE'})['value']
viewstategenerator = soup.find('input', {'id':'__VIEWSTATEGENERATOR'})['value']

Expand Down Expand Up @@ -185,22 +188,22 @@ def get_data_for_entity(entity):
except Exception as e:
print(traceback.format_exc())
print(entity + " FAILED")
return ''
return 'err'
except OSError as e:
if e.errno not in (errno.ECONNRESET, errno.ECONNABORTED, errno.EPIPE):
raise
else:
print(entity + " FAILED")
return ''
return 'err'
except ConnectionResetError:
print(entity + " FAILED")
return ''
return 'err'
except RemoteDisconnected:
print(entity + " FAILED")
return ''
return 'err'
except requests.exceptions.ConnectionError:
print(entity + " FAILED")
return ''
return 'err'
return r.content

def clean_organization_name(name):
Expand All @@ -218,19 +221,21 @@ def main():
entities = []
header = []
results = []
failures = []
with open(sys.argv[1], 'r') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
entities.append(clean_organization_name(row[0]))
for entity in entities:
entity_data = get_data_for_entity(entity)
if entity_data == '':
if entity_data == 'err':
failures.append(entity)
continue
f = StringIO(entity_data.decode('utf-8'))
reader = csv.reader(f)
header = next(reader)
for row in reader:
results.append(row)
results.append(row + [entity])
print(entity)

if (len(sys.argv) >= 2 and sys.argv[2] is not None and sys.argv[2] != ''):
Expand All @@ -241,6 +246,13 @@ def main():
else:
print(results)

if (len(sys.argv) >= 3 and sys.argv[3] is not None and sys.argv[3] != ''):
with open(sys.argv[3], 'w') as g:
writer = csv.writer(g)
writer.writerows(failures)
else:
print(results)


if __name__=='__main__':
main()
32 changes: 32 additions & 0 deletions njcampfin/match_contractors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import csv
from fuzzywuzzy import fuzz

# Script body: for every row of the contributions CSV, find the contract row
# whose first column is most similar (fuzz.ratio) to the contribution row's
# first column, and write the contribution rows back out with two extra
# columns: the matched contract's first column and its column at index 4.
# Rows with no contract scoring above 75 get '' and -1 in those columns.
results = []

# newline='' is what the csv module asks for on files handed to
# csv.reader/csv.writer: it keeps embedded newlines in quoted fields intact
# on read and avoids doubled line endings on platforms that write '\r\n'.
with open('/Users/208301/aggregate_property_contribs.csv', 'r', newline='') as f, \
        open('/Users/208301/property_contracts.csv', 'r', newline='') as g:
    contribs_reader = csv.reader(f)
    contracts_reader = csv.reader(g)

    # Skip the header row; the None default keeps an empty file from
    # aborting the script with StopIteration.
    next(contracts_reader, None)
    # Materialize the contracts once: they are rescanned for every
    # contribution row below.
    contracts = list(contracts_reader)

    next(contribs_reader, None)
    for row in contribs_reader:
        best_match = ''
        best_match_amount = -1
        best_score = 0
        for contract in contracts:
            score = fuzz.ratio(row[0], contract[0])
            # Keep only the strictly best candidate, and only if it clears
            # the 75 threshold; ties keep the earliest contract seen.
            if score > best_score and score > 75:
                best_match = contract[0]
                best_score = score
                best_match_amount = contract[4]

        results.append(row + [best_match, best_match_amount])

with open('/Users/208301/aggregate_property_contribs_matched.csv', 'w', newline='') as h:
    writer = csv.writer(h)
    writer.writerows(results)

0 comments on commit 5f42a9c

Please sign in to comment.