-
Notifications
You must be signed in to change notification settings - Fork 0
/
script.R
147 lines (126 loc) · 6.06 KB
/
script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
load_packages <- function() {
library(googlesheets)
library(readr)
library(dplyr)
library(lubridate)
library(stringr)
}
suppressPackageStartupMessages(load_packages())
# download report details using googlesheets via OAuth authentication
gs_auth()
ss <- gs_title("Reports identified for back-up")
report_details <- gs_read(ss, ws = "report_details")
# in case above command fails
# becuase someone has changed the name of the sheet
# use the below back-up commands which works with document key
# ss_backup <- gs_key("1lAfyijkUGc_cvC01VhgMHTwsOxWMQyoavujcHi27SXE")
# report_details <- gs_read(ss_backup, ws = "report_details")
get_daily_dates <- function(start_period, end_period) {
diff <- interval(start_period, end_period)
diff <- as.period(diff, unit = "day")
dates <- start_period + 0:diff$day
dates <- gsub("-", "", dates)
dates <- paste0("D", dates)
return(dates)
}
get_weekly_dates <- function(start_period, end_period) {
diff <- interval(start_period, end_period)
diff <- as.period(diff, unit = "day")
num_of_weeks <- diff$day%/%7
dates <- start_period + weeks(0:num_of_weeks)
# IBM gives incomplete weekly data if year end falls in the middle of the week
# hence new year starts have to have their own weeks
dates <- append(dates, c(ymd("2018-01-01"), ymd("2019-01-01")))
dates <- gsub("-", "", dates)
dates <- paste0("W", dates)
return(dates)
}
get_monthly_dates <- function(start_period) {
dates <- start_period %m+% months(0:24)
dates = gsub("-", "", dates)
dates = paste0("M", dates)
return(dates)
}
get_period <- function(start_period, end_period, frequency) {
# returns vector of all periods that can be inserted into api link
if(frequency == "daily") {return(get_daily_dates(start_period, end_period))}
if(frequency == "weekly") {return(get_weekly_dates(start_period, end_period))}
if(frequency == "monthly") {return(get_monthly_dates(start_period))}
}
get_modified_api <- function(period, original_api) {
# return api valid for that period
gsub(pattern = "period_a=.\\d{8}", replacement = paste0("period_a=",period), original_api)
}
get_raw_report <- function(api_modified, report_name, period) {
raw_report <- read_csv(api_modified)
download_entry <- paste(report_name,nrow(raw_report), period, sep =",")
write_lines(download_entry,"download_log.csv", append = TRUE )
if(nrow(raw_report) > 49999) {warning_entry <- paste(report_name,nrow(raw_report),period, sep = ",")
write_lines(warning_entry, "warning_log.csv", append = TRUE)}
return(raw_report)
}
get_clean_report <- function(raw_report, report_name, frequency, unit_of_analysis, period) {
# returns clean report
# remove totals
raw_report <- raw_report %>%
filter(`Site ID` != "Total")
# remove 'Bounce Rate Integer' if it exists as one of the columns
if(str_detect(str_flatten(colnames(raw_report), collapse = ","), "Bounce Rate Integer")) {loc_bounce_rate_integer <- grep("Bounce Rate Integer", colnames(raw_report))
raw_report <- raw_report[,-loc_bounce_rate_integer]}
# convert average time on page to seconds
if(str_detect(str_flatten(colnames(raw_report), collapse = ","), "Average Time On Page")) {raw_report$`Average Time On Page` <- as.numeric(raw_report$`Average Time On Page`)}
# break down top_second_category into two categories
if(str_detect(str_flatten(colnames(raw_report), collapse = ","), "Page Attribute: URL Top Second Category")) {
cats <- str_split_fixed(raw_report$`Page Attribute: URL Top Second Category`,fixed("|"), 2)
cats <- as.data.frame(cats, stringsAsFactors = FALSE)
cats <- apply(cats, 2, function(col) ifelse(col == "", NA, as.character(col)))
colnames(cats) <- c("Top Level Category", "Top Second Level Category")
raw_report <- cbind(raw_report, cats)
raw_report <- raw_report %>%
select(`Site ID`,`Mobile Device Type`,`Marketing Channel`,`Page Attribute: URL Top Second Category`, `Top Level Category`, `Top Second Level Category`,`Page Attribute: Page Type`,`Page Views`,`Entry Page Views`,`Average Time On Page`,`Bounce Rate`,`Unique Visitors`)
}
# add columns for
# site url <not doing>
# site group/segment <not doing>
# add date column to match period
period <- gsub("\\D+", "", period)
raw_report$date <- period
# report frequency (daily, weekly, monthly)
raw_report$frequency <- frequency
# report unit of analysis (pageviews, unique visitors, etc.)
raw_report$unit_of_analysis <- unit_of_analysis
# report name as per google sheet
raw_report$report_name <- report_name
return(raw_report)
}
save_raw_report <- function(raw_report, report_name, unit_of_analysis, frequency, period) {
# saves report in csv format
file_name <- paste(report_name, unit_of_analysis, frequency, period, sep = "_")
file_name <- paste0(file_name, ".csv")
path_name <- paste("raw_reports", file_name, sep = "/")
write_csv(raw_report, path = path_name)
}
save_clean_report <- function(clean_report, report_name, unit_of_analysis, frequency, period) {
# saves report in csv format
# saves report in csv format
file_name <- paste(report_name, unit_of_analysis, frequency, period, sep = "_")
file_name <- paste0(file_name, ".csv")
path_name <- paste("clean_reports", file_name, sep = "/")
write_csv(clean_report, path = path_name)
}
for(i in 1:nrow(report_details)) {
report_name <- report_details$report_name[i]
frequency <- report_details$frequency[i]
unit_of_analysis <- report_details$unit_of_analysis[i]
start_period <- report_details$start_period[i]
end_period <- report_details$end_period[i]
original_api <- report_details$original_api[i]
report_periods <- get_period(start_period, end_period, frequency)
for(period in report_periods){
api_modified <- get_modified_api(period, original_api)
raw_report <- get_raw_report(api_modified, report_name, period)
save_raw_report(raw_report, report_name, unit_of_analysis, frequency, period)
clean_report <- get_clean_report(raw_report, report_name, frequency, unit_of_analysis, period)
save_clean_report(clean_report, report_name, unit_of_analysis, frequency, period)
}
}