import pandas as pd

#overall csv-related steps. i purposely made separate functions for my convenience, in case i needed to restart something
def allcsv():
    #removes unenriched rows from the original crime file, then combines the rest of the files
    df = pd.read_csv('crime.csv', header=None)
    #keep rows up to id 25470
    index_to_remove = df[df[0] == 25470].index[0]
    df = df.drop(df.index[index_to_remove+1:])
    df.to_csv('crime.csv', index=False, header=None)
    df1 = pd.read_csv('crime.csv')
    df2 = pd.read_csv('crime_25471_50000.csv')
    df3 = pd.read_csv('crime_50001_75000.csv')
    df4 = pd.read_csv('crime_75001_100000.csv')
    df5 = pd.read_csv('crime_100001_125000.csv')
    df6 = pd.read_csv('crime_125001_150000.csv')
    df7 = pd.read_csv('crime_150001_175000.csv')
    df8 = pd.read_csv('crime_175001_200000.csv')
    df9 = pd.read_csv('crime_200001_225000.csv')
    combined_df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9], axis=0)
    #drop the last column, because the enrichment generated some extra stuff
    combined_df = combined_df.iloc[:, :-1]
    #rename the autogenerated index header for convenience
    combined_df = combined_df.rename(columns={'Unnamed: 0': 'id'})
    combined_df.to_csv('crime.csv', index=False)
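#a more compact way to combine the chunk files would be glob; a minimal sketch,
#assuming the crime_<start>_<end>.csv chunks sit in the working directory.
#illustration only, not called anywhere in the pipeline
import glob
def allcsv_glob():
    #sort the chunks numerically by their start row so they stack in order
    chunk_files = sorted(glob.glob('crime_*_*.csv'), key=lambda f: int(f.split('_')[1]))
    chunks = [pd.read_csv('crime.csv')] + [pd.read_csv(f) for f in chunk_files]
    return pd.concat(chunks, axis=0)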
def replace_neighborhood():
    df = pd.read_csv("crime.csv", dtype=str)
    #replace empty neighborhood values with values from the neighbourhood_lookup column
    for i in range(len(df)):
        if pd.isna(df.at[i, 'neighborhood']):
            df.at[i, 'neighborhood'] = df.at[i, 'neighbourhood_lookup']
    df.to_csv("crime.csv", index=False)
def removecols():
    df = pd.read_csv("crime.csv")
    #fill missing values in the 'country' column with 'United States'
    df['country'] = df['country'].fillna('United States')
    #drop rows with empty cells in any of these columns
    columns_to_check = ['crime','date','neighborhood','npu','road','county','city','state','country']
    df.dropna(subset=columns_to_check, inplace=True)
    #keep only the id plus the checked columns
    cols_to_save = ['id'] + columns_to_check
    df.loc[:, cols_to_save].to_csv("crime.csv", index=False)
def removeduplicates():
    df = pd.read_csv("crime.csv")
    #treat rows as duplicates when every column except 'id' matches
    df.drop_duplicates(subset=df.columns.difference(['id']), inplace=True)
    df.to_csv("crime.csv", index=False)
#stopping at this part, i decided to manually clean some rows i didn't need in Excel
#for example, i had one row with Sandy Springs for city, and that value is insignificant, and i fixed spelling errors, like 17th St NW (northwest)
#ran the removeduplicates function one more time after this
#this is where the dim and fact table stuff starts, using the already cleaned data
#crime related
def crimedim():
    df = pd.read_csv("crime.csv")
    crime_severity = {
        'HOMICIDE': 0,
        'RAPE': 1,
        'AGG ASSAULT': 2,
        'ROBBERY-PEDESTRIAN': 3,
        'ROBBERY-RESIDENCE': 4,
        'ROBBERY-COMMERCIAL': 6,
        'BURGLARY-RESIDENCE': 7,
        'BURGLARY-NONRES': 8,
        'AUTO THEFT': 9,
        'LARCENY-FROM VEHICLE': 10,
        'LARCENY-NON VEHICLE': 11
    }
    df_severity = df[['id', 'crime']].copy()
    #map the crime severity values onto the 'crime' column
    df_severity['crime_severity'] = df_severity['crime'].map(crime_severity)
    #map returns NaN for crime types not in the dict, so drop those invalid rows
    df_severity.dropna(subset=['crime_severity'], inplace=True)
    df_severity.to_csv("crimeIDS.csv", index=False)
    #drop duplicates into the dimension table
    no_duplicates('crimeIDS.csv', 'dimcrime.csv')
def crimeid():
    #for fact table, crime_key
    dimcrime1 = pd.read_csv('dimcrime.csv')
    crime_ids = pd.read_csv('crimeIDS.csv')
    #map each (crime, crime_severity) pair to its surrogate key from the dimension table
    key_map = dict(zip(dimcrime1[['crime', 'crime_severity']].apply(tuple, axis=1), dimcrime1['key']))
    crime_ids['id'] = crime_ids[['crime', 'crime_severity']].apply(tuple, axis=1).map(key_map)
    crime_ids.to_csv('crimeIDS.csv', index=False)
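#crimeid, dateid, loc1id, and loc2id all follow the same pattern: build a dict from
#dimension-attribute tuples to surrogate keys, then map each staging row onto it.
#a minimal sketch of that pattern with made-up values, not part of the pipeline:
def _keymap_demo():
    dim = pd.DataFrame({'key': [0, 1], 'crime': ['HOMICIDE', 'RAPE'], 'crime_severity': [0, 1]})
    rows = pd.DataFrame({'crime': ['RAPE', 'HOMICIDE'], 'crime_severity': [1, 0]})
    key_map = dict(zip(dim[['crime', 'crime_severity']].apply(tuple, axis=1), dim['key']))
    #each row's attribute tuple looks up its surrogate key, giving [1, 0] here
    return rows[['crime', 'crime_severity']].apply(tuple, axis=1).map(key_map)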
#date related
def datedim():
    df = pd.read_csv('crime.csv', usecols=['id', 'date'])
    #extract year, month, and day from the date column
    df['year'] = pd.DatetimeIndex(df['date']).year
    #extract the quarter, e.g. '1Q23', then trim the year suffix so only '1Q' remains
    df['year quarter'] = pd.PeriodIndex(pd.to_datetime(df['date']), freq='Q').strftime('%qQ%y')
    df['year quarter'] = df['year quarter'].str[:-3] + 'Q'
    df['month'] = pd.DatetimeIndex(df['date']).month
    #map the month numbers to month names
    month_names = {1:'January', 2:'February', 3:'March', 4:'April', 5:'May', 6:'June', 7:'July', 8:'August', 9:'September', 10:'October', 11:'November', 12:'December'}
    df['month'] = df['month'].map(month_names)
    df['day'] = pd.DatetimeIndex(df['date']).day
    #extract day of the week
    df['day of the week'] = pd.DatetimeIndex(df['date']).day_name()
    #map day of the week to weekday/weekend
    weekend_days = ['Saturday', 'Sunday']
    df['weekday/weekend'] = df['day of the week'].apply(lambda x: 'weekend' if x in weekend_days else 'weekday')
    df.to_csv('dateIDS.csv', index=False)
    no_duplicates('dateIDS.csv', 'dimdate.csv')
def dateid():
    #for fact table, date_key
    dimdate1 = pd.read_csv('dimdate.csv')
    date_ids = pd.read_csv('dateIDS.csv')
    key_map = dict(zip(dimdate1[['date', 'year','year quarter','month','day','day of the week','weekday/weekend']].apply(tuple, axis=1), dimdate1['key']))
    date_ids['id'] = date_ids[['date', 'year','year quarter','month','day','day of the week','weekday/weekend']].apply(tuple, axis=1).map(key_map)
    date_ids.to_csv('dateIDS.csv', index=False)
#loc1 related
def loc1dim():
    df = pd.read_csv('crime.csv', usecols=['id', 'neighborhood', 'npu'])
    df.to_csv('loc1IDS.csv', index=False)
    no_duplicates('loc1IDS.csv', 'dimloc1.csv')
def loc1id():
    #for fact table, loc1_key
    dimloc11 = pd.read_csv('dimloc1.csv')
    loc1_ids = pd.read_csv('loc1IDS.csv')
    key_map = dict(zip(dimloc11[['neighborhood','npu']].apply(tuple, axis=1), dimloc11['key']))
    loc1_ids['id'] = loc1_ids[['neighborhood','npu']].apply(tuple, axis=1).map(key_map)
    loc1_ids.to_csv('loc1IDS.csv', index=False)
#loc2 related
def loc2dim():
    df = pd.read_csv('crime.csv', usecols=['id', 'country','state','city','county','road'])
    df.to_csv('loc2IDS.csv', index=False)
    no_duplicates('loc2IDS.csv', 'dimloc2.csv')
def loc2id():
    #for fact table, loc2_key
    dimloc21 = pd.read_csv('dimloc2.csv')
    loc2_ids = pd.read_csv('loc2IDS.csv')
    key_map = dict(zip(dimloc21[['country','state','city','county','road']].apply(tuple, axis=1), dimloc21['key']))
    loc2_ids['id'] = loc2_ids[['country','state','city','county','road']].apply(tuple, axis=1).map(key_map)
    loc2_ids.to_csv('loc2IDS.csv', index=False)
def fact():
    #the key files are stacked side by side, so each *IDS.csv must still have one row per crime.csv row, in the same order
    df = pd.read_csv('crime.csv', usecols=['id'])
    df = pd.concat([df, pd.read_csv('crimeIDS.csv', usecols=['id']).rename(columns={'id': 'crimekey'})], axis=1)
    df = pd.concat([df, pd.read_csv('dateIDS.csv', usecols=['id']).rename(columns={'id': 'datekey'})], axis=1)
    df = pd.concat([df, pd.read_csv('loc1IDS.csv', usecols=['id']).rename(columns={'id': 'loc1key'})], axis=1)
    df = pd.concat([df, pd.read_csv('loc2IDS.csv', usecols=['id']).rename(columns={'id': 'loc2key'})], axis=1)
    #add a count column of 1's so the fact table can be summed
    df['countn'] = 1
    df.to_csv('fact.csv', index=False, float_format='%.0f', sep=',')
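#pd.concat with axis=1 pairs rows by index; freshly read CSVs all carry a fresh
#0..n-1 RangeIndex, so the pairing is effectively positional. a minimal toy sketch
#of that behavior with made-up values, not called anywhere:
def _concat_alignment_demo():
    ids = pd.DataFrame({'id': [10, 11]})
    keys = pd.DataFrame({'crimekey': [0, 1]})
    #rows pair up 10<->0 and 11<->1 purely because they share index positions 0 and 1
    return pd.concat([ids, keys], axis=1)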
#extra functions
def no_duplicates(input_file, output_file):
    #removes all duplicates, with file parameters, and assigns each surviving row a surrogate key
    df = pd.read_csv(input_file)
    df.drop_duplicates(subset=df.columns.difference(['id']), inplace=True)
    #reset the index twice: first to renumber the rows 0..n-1, then to turn that numbering into a column
    df.reset_index(drop=True, inplace=True)
    df.reset_index(inplace=True)
    df.rename(columns={'index': 'key'}, inplace=True)
    #the dimension table keeps the key and the attributes, not the source id
    df.drop('id', axis=1, inplace=True)
    df.to_csv(output_file, index=False)
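#a minimal sketch of what the double reset_index does to a deduplicated frame,
#with made-up values, left uncalled:
def _surrogate_key_demo():
    df = pd.DataFrame({'id': [5, 9], 'neighborhood': ['Midtown', 'Downtown']})
    df.reset_index(drop=True, inplace=True)
    df.reset_index(inplace=True)  #adds an 'index' column holding 0..n-1
    df.rename(columns={'index': 'key'}, inplace=True)
    #result has columns ['key', 'neighborhood'] with keys 0 and 1
    return df.drop('id', axis=1)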
def colshift():
    #reordering columns so the key comes first
    df = pd.read_csv('dimloc1.csv')
    df1 = pd.read_csv('dimloc2.csv')
    df2 = pd.read_csv('dimdate.csv')
    df = df[['key','npu','neighborhood']]
    df1 = df1[['key','country','state','city','county','road']]
    df2 = df2[['key','year','year quarter','month','weekday/weekend','day of the week','date','day']]
    #save the reordered dataframes back to their CSV files
    df.to_csv('dimloc1.csv', index=False)
    df1.to_csv('dimloc2.csv', index=False)
    df2.to_csv('dimdate.csv', index=False)
#uncomment the calls below to run each step (Ctrl+/ toggles comments)
# allcsv()
# replace_neighborhood()
# removecols()
# removeduplicates()
# crimedim()
# crimeid()
# datedim()
# dateid()
# loc1dim()
# loc1id()
# loc2dim()
# loc2id()
# fact()
# colshift()