-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreport.py
244 lines (225 loc) · 8.49 KB
/
report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import pandas as pd
import numpy as np
import time
#q2
from datetime import datetime
from dateutil.parser import parse
#q3
import json
#higher order functions
def read_and_format_source1():
#read source1_csv
df1 = pd.read_csv("source1.csv")
#separate audience col
i = df1.columns.get_loc('audience')
df1_2 = df1['audience'].str.split("_", expand=True)
df1_2.columns = ['state', 'hair_color', 'age']
df1_3 = pd.concat([df1.iloc[:, :i], df1_2, df1.iloc[:, i+1:]], axis=1)
return df1_3.drop_duplicates()
def read_and_format_source2():
df2 = pd.read_csv("source2.csv")
df2_1 = df2.join(df2['actions'].apply(json.loads).apply(pd.Series))
return df2_1
#globals
source1_raw = read_and_format_source1()
source2_raw = read_and_format_source2()
#q1: what was the total spent against people with purple hair?
def question1(data, color):
source1 = data["one"]
#get ids of all campaign_ids with hair color as list
purple_ids = source1.loc[source1['hair_color'] == color, 'campaign_id']
source2 = data["two"]
#get all campaign_ids from source 2 that have purple_ids
source2_purples = source2[source2['campaign_id'].isin(purple_ids)]
return source2_purples["spend"].values.sum()
#q2: how many campaigns spent on more than 4 days?
def question2(data, days):
source2 = data["two"]
#group rows by campaign_id, then date
grouped_source2 = source2.groupby(["campaign_id"])["date"]
count_greater_than = 0
#iterate through and get length of groupings
for name, group in grouped_source2:
if len(group) > days:
count_greater_than += 1
return count_greater_than
#q3: how many times did source H report on clicks?
def question3(data, source):
#filter source2 for rows that contain source
source2 = data["two"].loc[data["two"]["actions"].str.contains(source), :]
source_click_count = 0
#iterate through rows
for index, row in source2.iterrows():
i = 0
#iterate through converted actions columns
while i < 5:
try:
#make sure cell is dict, has source H and action is clicks
if type(row[i]) is dict and row[i][source] and row[i]["action"] == "clicks":
source_click_count += row[i][source]
except KeyError:
pass
i += 1
return source_click_count
#q4: which sources reported more "junk" than "noise"?
def question4(data, actions_str):
#filter source2 for rows that contain junk or noise
source2 = data["two"].loc[data["two"]["actions"].str.contains(actions_str),:]
#create empty dicts
noise_obj = {}
junk_obj = {}
#iterate through rows
for index, row in source2.iterrows():
#iterate through converted actions columns
i = 0
while i < 5:
try:
if type(row[i]) is dict:
for key, value in row[i].items():
#check action noise
if row[i]["action"] == "noise":
#increment value
if key in noise_obj:
noise_obj[key] += value
#assign value
else:
noise_obj[key] = value
if row[i]["action"] == "junk":
if key in junk_obj:
junk_obj[key] += value
else:
junk_obj[key] = value
except KeyError:
pass
i += 1
return_arr = []
for key, value in noise_obj.items():
if junk_obj[key] > noise_obj[key]:
return_arr.append(key)
return return_arr
#q5: what was the total cost per view for all video ads, truncated to two decimal places?
def question5(data, ad_type, action_str):
#filter source2 for rows with ad_type video
source2 = data["two"].loc[data["two"]["ad_type"] == ad_type, :]
view_sum = 0
video_spend_sum = 0
#iterate through rows
for index, row in source2.iterrows():
#sum video spend
video_spend_sum += row["spend"]
#iterate through converted action columns
i = 0
while i < 5:
try:
if type(row[i]) is dict:
for key, value in row[i].items():
#make sure action is action_str
if row[i]["action"] == action_str:
view_sum += value
break
except KeyError:
pass
i += 1
#compute and return total cost per view of all video ads
return round(video_spend_sum/view_sum, 2)
#q6: how many source B conversions were there for campaigns targeting NY?
def question6(data, source, action_str, state):
#filter source1 and create list of ids with state param
state_ids = data["one"].loc[data["one"]["state"] == state, "campaign_id"].tolist()
#filter source2 for rows that contain conversions
source2 = data["two"].loc[data["two"]["actions"].str.contains(action_str), :]
source_state_conversion = {source: 0}
#iterate through rows
for index, row in source2.iterrows():
#check row is in state_ids
if row["campaign_id"] in state_ids:
#iterate through converted actions columns
i = 0
while i < 5:
try:
if type(row[i]) is dict:
for key, value in row[i].items():
#make sure conversion and desired source
if row[i]["action"] == action_str and key in source_state_conversion:
source_state_conversion[key] += value
except KeyError:
pass
i += 1
#return source conversions for state
return source_state_conversion[source]
#7. what combination of state and hair color had the best CPM?
#most impressions, lowest cost
def question7(data):
source1 = data["one"]
#iterate and create two dictionaires
#state_hair_impressions key state-hair combo, value sum of impressions
#state_hair_ids key state-hair combo, value array of ids
state_hair_impressions = {}
state_hair_ids = {}
for index, row in source1.iterrows():
if(row["state"] + "-" + row["hair_color"]) in state_hair_impressions:
state_hair_impressions[row["state"] + "-" + row["hair_color"]] += row["impressions"]
if row["campaign_id"] not in state_hair_ids[row["state"] + "-" + row["hair_color"]]:
state_hair_ids[row["state"] + "-" + row["hair_color"]].append(row["campaign_id"])
else:
state_hair_impressions[row["state"] + "-" + row["hair_color"]] = row["impressions"]
state_hair_ids[row["state"] + "-" + row["hair_color"]] = [row["campaign_id"]]
source2 = data["two"]
#iterate and create dictionary
#campaign_id_spend key is id, value sum of spend
campaign_id_spend = {}
for index, row in source2.iterrows():
if row["campaign_id"] in campaign_id_spend:
campaign_id_spend[row["campaign_id"]] += row["spend"]
else:
campaign_id_spend[row["campaign_id"]] = row["spend"]
#print("len(campaign_id_spend)")
#print(len(campaign_id_spend))
#Analysis
#iterate and create dictionary key is state-hair combo, value is compute cpm
state_hair_cpm = {}
number_of_ids_in_state_hair_ids = 0
#itereate through state_hair_ids and state_hair_impressions simultaneously
for (k, v), (k2, v2) in zip(state_hair_ids.items(), state_hair_impressions.items()):
number_of_ids_in_state_hair_ids += len(v)
#iterate through state_hair_ids value (array of ids) and get spend amt from campaign_id_spend
#compute cpm as spend / impressions * 1000
#add to state_hair_cpm
for i in v:
if i in campaign_id_spend:
state_hair_cpm[k] = round((campaign_id_spend[i] / v2) * 1000, 2)
#print("number_of_ids_in_state_hair_ids")
#print(number_of_ids_in_state_hair_ids) #why doesn't this value equal the length of campaign_id_spend??
#return key of minimum value of state_hair_cpm
return min(state_hair_cpm, key=state_hair_cpm.get).split("-")
#function calls and runtimes
start_time = time.time()
q1_start_time = time.time()
print('q1:', question1({"one": source1_raw, "two": source2_raw}, "purple"))
q1_end_time = time.time()
#print("--- q1 runtime %s seconds ---" % (q1_end_time - q1_start_time))
q2_start_time = time.time()
print('q2:', question2({"two": source2_raw}, 4))
q2_end_time = time.time()
#print("--- q2 runtime %s seconds ---" % (q2_end_time - q2_start_time))
q3_start_time = time.time()
print('q3:', question3({"two": source2_raw}, "H"))
q3_end_time = time.time()
#print("--- q3 runtime %s seconds ---" % (q3_end_time - q3_start_time))
q4_start_time = time.time()
print('q4:', question4({"two": source2_raw},"junk|noise"))
q4_end_time = time.time()
#print("--- q4 runtime %s seconds ---" % (q4_end_time - q4_start_time))
q5_start_time = time.time()
print('q5:', question5({"two": source2_raw}, "video", "views"))
q5_end_time = time.time()
#print("--- q5 runtime %s seconds ---" % (q5_end_time - q5_start_time))
q6_start_time = time.time()
print('q6:', question6({"one": source1_raw, "two": source2_raw}, "B", "conversions", "NY"))
q6_end_time = time.time()
#print("--- q6 runtime %s seconds ---" % (q6_end_time - q6_start_time))"""
q7_start_time = time.time()
print('q7:', question7({"one": source1_raw, "two": source2_raw}))
q7_end_time = time.time()
#print("--- q7 runtime %s seconds ---" % (q7_end_time - q7_start_time))
print("--- runtime %s seconds ---" % (time.time() - start_time))