-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path1-stats-averages.Rmd
68 lines (55 loc) · 2.51 KB
/
1-stats-averages.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
---
title: "Stats - Team Season Averages"
---
Load libraries
```{r, warning=FALSE, message=FALSE}
library(tidyverse)
```
Read in the data
```{r}
# Load the regular-season and NCAA-tournament detailed results.
# Text columns are kept as character vectors rather than factors.
reg_season_stats <- read.csv(
  file = "data/RegularSeasonDetailedResults.csv",
  stringsAsFactors = FALSE
)
tourney_stats <- read.csv(
  file = "data/NCAATourneyDetailedResults.csv",
  stringsAsFactors = FALSE
)
# Preview the first rows of the regular-season table.
head(reg_season_stats)
```
Combine regular-season and tournament results
```{r}
# Stack the tournament rows underneath the regular-season rows.
combined_stats <- reg_season_stats %>%
  bind_rows(tourney_stats)
```
Stack the winning teams' and losing teams' stats so each row holds one team's stats for one game
```{r}
# Reshape the game-level table so each row holds ONE team's stats for a game:
# the winner's columns and the loser's columns are pulled out separately,
# given identical names, and stacked.
#
# Column positions in combined_stats (as used by the original script):
#   1            Season
#   3:4,  9:21   winning team: ID, score and the 13 box-score columns
#   5:6, 22:34   losing team:  ID, score and the 13 box-score columns
winner_cols <- c(1, 3:4, 9:21)
loser_cols <- c(1, 5:6, 22:34)
winner_stats <- combined_stats[, winner_cols]
loser_stats <- combined_stats[, loser_cols]
# Strip only the LEADING "W"/"L" prefix. An unanchored gsub() would delete
# those letters anywhere in a name (e.g. "WLoc" -> "oc") and, applied to the
# whole table, leaves it with duplicated column names.
names(winner_stats) <- sub(pattern = "^W", replacement = "", x = names(winner_stats))
names(loser_stats) <- sub(pattern = "^L", replacement = "", x = names(loser_stats))
# combine WTeam and LTeam stats
combined_statsCompiled <- bind_rows(winner_stats, loser_stats)
head(combined_statsCompiled)
```
Calculate team season averages
```{r}
# Average every per-game stat for each Season/TeamID pair.
# One grouped summarise replaces the row-by-row loop, which re-filtered the
# full table once per team-season and, via 1:NROW(), iterated over c(1, 0)
# when the table was empty.
team_season_means <- combined_statsCompiled %>%
  group_by(Season, TeamID) %>%
  summarise(across(everything(), mean), .groups = "drop")
# group_by() orders its output by the grouping keys, so join the means back
# onto the distinct Season/TeamID pairs: this keeps rows in order of first
# appearance and columns in their original order, matching the loop's output.
combined_statsCompiledUnique <- combined_statsCompiled %>%
  distinct(Season, TeamID) %>%
  left_join(team_season_means, by = c("Season", "TeamID"))
```
Rename variables
```{r}
# Columns 3 to 16 hold the averaged stats; tag each of them with an "AVG_"
# prefix and leave the first two (key) columns untouched.
avg_cols <- 3:16
stat_names <- names(combined_statsCompiledUnique)[avg_cols]
names(combined_statsCompiledUnique)[avg_cols] <- paste("AVG", stat_names, sep = "_")
head(combined_statsCompiledUnique)
```
Output to file
```{r}
# Save the team-season averages as CSV, without a row-name column.
write.csv(
  x = combined_statsCompiledUnique,
  file = "output/stats-averages.csv",
  row.names = FALSE
)
```