-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmysql_init_db.py
204 lines (172 loc) · 8.44 KB
/
mysql_init_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
from __future__ import print_function
import os
import zipfile
import warnings
try:
from urllib2 import urlopen # Python2
except ImportError:
from urllib.request import urlopen # Python3
import MySQLdb
warnings.filterwarnings('ignore', category=MySQLdb.Warning)
contributions_zip_file = 'Illinois-campaign-contributions.txt.zip'
contributions_txt_file = 'Illinois-campaign-contributions.txt'
if not os.path.exists(contributions_zip_file) :
print('downloading', contributions_zip_file, '(~60mb) ...')
u = urlopen('https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')
localFile = open(contributions_zip_file, 'wb')
localFile.write(u.read())
localFile.close()
if not os.path.exists(contributions_txt_file) :
zip_file = zipfile.ZipFile(contributions_zip_file, 'r')
print('extracting %s' % contributions_zip_file)
zip_file_contents = zip_file.namelist()
for f in zip_file_contents:
if ('.txt' in f):
zip_file.extract(f)
zip_file.close()
conn = MySQLdb.connect(read_default_file = os.path.abspath('.') + '/mysql.cnf',
local_infile = 1,
db='contributions')
c = conn.cursor()
print('importing raw data from csv...')
c.execute("DROP TABLE IF EXISTS raw_table")
c.execute("DROP TABLE IF EXISTS donors")
c.execute("DROP TABLE IF EXISTS recipients")
c.execute("DROP TABLE IF EXISTS contributions")
c.execute("DROP TABLE IF EXISTS processed_donors")
c.execute("CREATE TABLE raw_table "
"(reciept_id INT, last_name VARCHAR(70), first_name VARCHAR(35), "
" address_1 VARCHAR(35), address_2 VARCHAR(36), city VARCHAR(20), "
" state VARCHAR(15), zip VARCHAR(11), report_type VARCHAR(24), "
" date_recieved VARCHAR(10), loan_amount VARCHAR(12), "
" amount VARCHAR(23), receipt_type VARCHAR(23), "
" employer VARCHAR(70), occupation VARCHAR(40), "
" vendor_last_name VARCHAR(70), vendor_first_name VARCHAR(20), "
" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), "
" vendor_city VARCHAR(20), vendor_state VARCHAR(10), "
" vendor_zip VARCHAR(10), description VARCHAR(90), "
" election_type VARCHAR(10), election_year VARCHAR(10), "
" report_period_begin VARCHAR(10), report_period_end VARCHAR(33), "
" committee_name VARCHAR(70), committee_id VARCHAR(37)) "
"CHARACTER SET utf8 COLLATE utf8_unicode_ci")
conn.commit()
c.execute("LOAD DATA LOCAL INFILE %s INTO TABLE raw_table "
"FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\r\n' "
"IGNORE 1 LINES "
"(reciept_id, last_name, first_name, "
" address_1, address_2, city, state, "
" zip, report_type, date_recieved, "
" loan_amount, amount, receipt_type, "
" employer, occupation, vendor_last_name, "
" vendor_first_name, vendor_address_1, "
" vendor_address_2, vendor_city, vendor_state, "
" vendor_zip, description, election_type, "
" election_year, "
" report_period_begin, report_period_end, "
" committee_name, committee_id, @dummy)",
(contributions_txt_file,))
conn.commit()
print('creating donors table...')
c.execute("CREATE TABLE donors "
"(donor_id INTEGER PRIMARY KEY AUTO_INCREMENT, "
" last_name VARCHAR(70), first_name VARCHAR(35), "
" address_1 VARCHAR(35), address_2 VARCHAR(36), "
" city VARCHAR(20), state VARCHAR(15), "
" zip VARCHAR(11), employer VARCHAR(70), "
" occupation VARCHAR(40)) "
"CHARACTER SET utf8 COLLATE utf8_unicode_ci")
c.execute("INSERT INTO donors "
"(first_name, last_name, address_1,"
" address_2, city, state, zip, employer, occupation) "
"SELECT DISTINCT "
"TRIM(first_name), TRIM(last_name), TRIM(address_1), "
"TRIM(address_2), TRIM(city), TRIM(state), TRIM(zip), "
"TRIM(employer), TRIM(occupation) "
"FROM raw_table")
conn.commit()
print('creating indexes on donors table')
c.execute("CREATE INDEX donors_donor_info ON donors "
"(last_name, first_name, address_1, address_2, city, "
" state, zip)")
conn.commit()
print('creating recipients table...')
c.execute("CREATE TABLE recipients "
"(recipient_id INTEGER PRIMARY KEY AUTO_INCREMENT, name VARCHAR(70)) "
"CHARACTER SET utf8 COLLATE utf8_unicode_ci")
c.execute("INSERT IGNORE INTO recipients "
"SELECT DISTINCT committee_id, committee_name FROM raw_table")
conn.commit()
print('creating contributions table')
c.execute("CREATE TABLE contributions "
"(contribution_id INT, donor_id INT, recipient_id INT, "
" report_type VARCHAR(24), date_recieved DATE, "
" loan_amount VARCHAR(12), amount VARCHAR(23), "
" receipt_type VARCHAR(23), "
" vendor_last_name VARCHAR(70), "
" vendor_first_name VARCHAR(20), "
" vendor_address_1 VARCHAR(35), vendor_address_2 VARCHAR(31), "
" vendor_city VARCHAR(20), vendor_state VARCHAR(10), "
" vendor_zip VARCHAR(10), description VARCHAR(90), "
" election_type VARCHAR(10), election_year VARCHAR(10), "
" report_period_begin DATE, report_period_end DATE) "
"CHARACTER SET utf8 COLLATE utf8_unicode_ci")
c.execute("INSERT INTO contributions "
"SELECT reciept_id, donors.donor_id, committee_id, "
" report_type, STR_TO_DATE(date_recieved, '%m/%d/%Y'), "
" loan_amount, amount, "
" receipt_type, vendor_last_name , "
" vendor_first_name, vendor_address_1, vendor_address_2, "
" vendor_city, vendor_state, vendor_zip, description, "
" election_type, election_year, "
" STR_TO_DATE(report_period_begin, '%m/%d/%Y'), "
" STR_TO_DATE(report_period_end, '%m/%d/%Y') "
"FROM raw_table JOIN donors ON "
"donors.first_name = TRIM(raw_table.first_name) AND "
"donors.last_name = TRIM(raw_table.last_name) AND "
"donors.address_1 = TRIM(raw_table.address_1) AND "
"donors.address_2 = TRIM(raw_table.address_2) AND "
"donors.city = TRIM(raw_table.city) AND "
"donors.state = TRIM(raw_table.state) AND "
"donors.employer = TRIM(raw_table.employer) AND "
"donors.occupation = TRIM(raw_table.occupation) AND "
"donors.zip = TRIM(raw_table.zip)")
conn.commit()
print('creating indexes on contributions')
c.execute("ALTER TABLE contributions ADD PRIMARY KEY(contribution_id)")
c.execute("CREATE INDEX donor_idx ON contributions (donor_id)")
c.execute("CREATE INDEX recipient_idx ON contributions (recipient_id)")
conn.commit()
print('nullifying empty strings in donors')
c.execute("UPDATE donors "
"SET "
"first_name = CASE first_name WHEN '' THEN NULL ELSE first_name END, "
"last_name = CASE last_name WHEN '' THEN NULL ELSE last_name END, "
"address_1 = CASE address_1 WHEN '' THEN NULL ELSE address_1 END, "
"address_2 = CASE address_2 WHEN '' THEN NULL ELSE address_2 END, "
"city = CASE city WHEN '' THEN NULL ELSE city END, "
"state = CASE state WHEN '' THEN NULL ELSE state END, "
"employer = CASE employer WHEN '' THEN NULL ELSE employer END, "
"occupation = CASE occupation WHEN '' THEN NULL ELSE occupation END, "
"zip = CASE zip WHEN '' THEN NULL ELSE zip END")
conn.commit()
c.execute("CREATE TABLE processed_donors AS "
"(SELECT donor_id, "
" LOWER(city) AS city, "
" CASE WHEN (first_name IS NULL AND last_name IS NULL) "
" THEN NULL "
" ELSE LOWER(CONCAT_WS(' ', first_name, last_name)) "
" END AS name, "
" LOWER(zip) AS zip, "
" LOWER(state) AS state, "
" CASE WHEN (address_1 IS NULL AND address_2 IS NULL) "
" THEN NULL "
" ELSE LOWER(CONCAT_WS(' ', address_1, address_2)) "
" END AS address, "
" LOWER(occupation) AS occupation, "
" LOWER(employer) AS employer, "
" ISNULL(first_name) AS person "
" FROM donors)")
c.execute("CREATE INDEX donor_idx ON processed_donors (donor_id)")
c.close()
conn.close()
print('done')