classification.Rmd

---
title: "Classification"
author: "Jenny Li, Liz Cao, Kristy Ma"
date: '2022-04-07'
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, error=TRUE, message=FALSE, warning=FALSE)
```

```{r, library}
library(dplyr)
library(readr)
library(ggplot2)
library(tidymodels)
library(probably)
library(vip)
tidymodels_prefer()
theme_set(theme_bw())       
Sys.setlocale("LC_TIME", "English")
set.seed(74)
```

```{r, reading data}
breastCa<-read_csv(file = "breast-cancer.csv")
```
## Data Cleaning
```{r}
breastCa_Re<-breastCa %>% 
  drop_na() %>% 
  select(-c(13:22)) %>% 
  select(-1)

breastCa_Re_new<-breastCa_Re%>%
  mutate(concave_points_mean=`concave points_mean`)%>%
  select(-10)
```

## LASSO and Logistic Regression

# Implete Lasso Logistic Regression in tidymodels
```{r}

# Make sure you set reference level (to the outcome you are NOT interested in)
breastCa_Re_new2 <- breastCa_Re_new%>%
  mutate(diagnosis = relevel(factor(diagnosis ), ref='B')) #set reference level

data_cv10 <- vfold_cv(breastCa_Re_new2, v = 10)


# Logistic LASSO Regression Model Spec
logistic_lasso_spec_tune <- logistic_reg() %>%
    set_engine('glmnet') %>%
    set_args(mixture = 1, penalty = tune()) %>%
    set_mode('classification')

# Recipe
logistic_rec <- recipe(diagnosis ~ ., data = breastCa_Re_new2) %>%
    step_normalize(all_numeric_predictors()) %>% 
    step_dummy(all_nominal_predictors())

# Workflow (Recipe + Model)
log_lasso_wf <- workflow() %>% 
    add_recipe(logistic_rec) %>%
    add_model(logistic_lasso_spec_tune) 

# Tune Model (trying a variety of values of Lambda penalty)
penalty_grid <- grid_regular(
  penalty(range = c(-5, 1)), #log10 transformed  (kept moving min down from 0)
  levels = 100)

tune_output <- tune_grid( 
  log_lasso_wf, # workflow
  resamples = data_cv10, # cv folds
  metrics = metric_set(roc_auc,accuracy),
  control = control_resamples(save_pred = TRUE, event_level = 'second'),
  grid = penalty_grid # penalty grid defined above
)

# Visualize Model Evaluation Metrics from Tuning
autoplot(tune_output) + theme_classic()
```
# Inspecting the Model
```{r}
best_se_penalty <- select_by_one_std_err(tune_output, metric = 'roc_auc', desc(penalty)) # choose penalty value based on the largest penalty within 1 se of the highest CV roc_auc
final_fit_se <- finalize_workflow(log_lasso_wf, best_se_penalty) %>% # incorporates penalty value to workflow 
    fit(data = breastCa_Re_new2)

final_fit_se %>% tidy()

final_fit_se %>% tidy() %>%
  filter(estimate == 0)

#variable importance
glmnet_output <- final_fit_se %>% extract_fit_engine()
    
# Create a boolean matrix (predictors x lambdas) of variable exclusion
bool_predictor_exclude <- glmnet_output$beta==0

# Loop over each variable
var_imp <- sapply(seq_len(nrow(bool_predictor_exclude)), function(row) {
    # Extract coefficient path (sorted from highest to lowest lambda)
    this_coeff_path <- bool_predictor_exclude[row,]
    # Compute and return the # of lambdas until this variable is out forever
    ncol(bool_predictor_exclude) - which.min(this_coeff_path) + 1
})

# Create a dataset of this information and sort
var_imp_data <- tibble(
    var_name = rownames(bool_predictor_exclude),
    var_imp = var_imp
)
var_imp_data %>% arrange(desc(var_imp))
```

#Evaluation Metrics
```{r}
# CV results for "best lambda"
tune_output %>%
    collect_metrics() %>%
    filter(penalty == best_se_penalty %>% pull(penalty))

# Count up number of B and M in the training data
breastCa_Re_new2 %>%
    count(diagnosis) # Name of the outcome variable goes inside count()

#Compute the NIR
NIR<- 357/(357+212)
NIR
```

#Threshold
```{r}
# Soft Predictions on Training Data
final_output <-
  final_fit_se %>% predict(new_data = breastCa_Re_new2, type = 'prob') %>%     bind_cols(breastCa_Re_new2)


final_output %>%
  ggplot(aes(x = diagnosis, y = .pred_M)) +
  geom_boxplot()

# Use soft predictions
final_output %>%
    roc_curve(diagnosis,.pred_M,event_level = 'second') %>%
    autoplot()

# thresholds in terms of reference level
threshold_output <- final_output %>%
    threshold_perf(truth = diagnosis, estimate = .pred_B, thresholds = seq(0,1,by=.01)) 

# J-index v. threshold for not M
threshold_output %>%
    filter(.metric == 'j_index') %>%
    ggplot(aes(x = .threshold, y = .estimate)) +
    geom_line() +
    labs(y = 'J-index', x = 'threshold') +
    theme_classic()

threshold_output %>%
    filter(.metric == 'j_index') %>%
    arrange(desc(.estimate))

# Distance v. threshold for not M

threshold_output %>%
    filter(.metric == 'distance') %>%
    ggplot(aes(x = .threshold, y = .estimate)) +
    geom_line() +
    labs(y = 'Distance', x = 'threshold') +
    theme_classic()

threshold_output %>%
    filter(.metric == 'distance') %>%
    arrange(.estimate)

log_metrics <- metric_set(accuracy,sens,yardstick::spec)

final_output %>%
    mutate(.pred_class = make_two_class_pred(.pred_B, levels(diagnosis), threshold = .64)) %>%
    log_metrics(truth = diagnosis, estimate = .pred_class, event_level = 'second')
```


## Random Forest

# Building Random Forest
```{r}
# Model Specification
rf_spec <- rand_forest() %>%
  set_engine(engine = 'ranger') %>% 
  set_args(mtry = NULL, # size of random subset of variables; default is floor(sqrt(ncol(x)))
           trees = 1000, # Number of trees
           min_n = 2,
           probability = FALSE, # FALSE: hard predictions
           importance = 'impurity') %>% 
  set_mode('classification') # change this for regression tree

# Recipe
data_rec <- recipe(diagnosis ~ ., data = breastCa_Re_new2)

# Workflows
data_wf_mtry2 <- workflow() %>%
  add_model(rf_spec %>% set_args(mtry = 2)) %>%
  add_recipe(data_rec)

# Create workflows for mtry = 4 , 10, and 20
data_wf_mtry4 <- workflow() %>%
  add_model(rf_spec %>% set_args(mtry = 4)) %>%
  add_recipe(data_rec)

data_wf_mtry10 <- workflow() %>%
  add_model(rf_spec %>% set_args(mtry = 10)) %>%
  add_recipe(data_rec)

data_wf_mtry20 <- workflow() %>%
  add_model(rf_spec %>% set_args(mtry = 20)) %>%
  add_recipe(data_rec)
```

```{r}
# Fit Models

set.seed(123) # make sure to run this before each fit so that you have the same 1000 trees
data_fit_mtry2 <- fit(data_wf_mtry2, data = breastCa_Re_new2)

set.seed(123)
data_fit_mtry4 <- fit(data_wf_mtry4, data = breastCa_Re_new2)

set.seed(123) 
data_fit_mtry10 <- fit(data_wf_mtry10, data = breastCa_Re_new2)

set.seed(123)
data_fit_mtry20 <- fit(data_wf_mtry20, data = breastCa_Re_new2)
```

```{r}
# Custom Function to get OOB predictions, true observed outcomes and add a model label
rf_OOB_output <- function(fit_model, model_label, truth){
    tibble(
          .pred_diagnosis = fit_model %>% extract_fit_engine() %>% pluck('predictions'), #OOB predictions
          diagnosis = truth,
          model = model_label
      )
}

#check out the function output
rf_OOB_output(data_fit_mtry2,'mtry2', breastCa_Re_new2 %>% pull(diagnosis))
```


```{r}
# Evaluate OOB Metrics

data_rf_OOB_output <- bind_rows(
    rf_OOB_output(data_fit_mtry2,'mtry2', breastCa_Re_new2 %>% pull(diagnosis)),
    rf_OOB_output(data_fit_mtry4,'mtry4', breastCa_Re_new2 %>% pull(diagnosis)),
    rf_OOB_output(data_fit_mtry10,'mtry10', breastCa_Re_new2 %>% pull(diagnosis)),
    rf_OOB_output(data_fit_mtry20,'mtry20', breastCa_Re_new2 %>% pull(diagnosis))
)


data_rf_OOB_output %>% 
    group_by(model) %>%
    accuracy(truth = diagnosis, estimate = .pred_diagnosis)
```

#Preliminary interpretation
```{r}
data_rf_OOB_output %>% 
    group_by(model) %>%
    accuracy(truth = diagnosis, estimate =.pred_diagnosis) %>%
  mutate(mtry = as.numeric(stringr::str_replace(model,'mtry',''))) %>%
  ggplot(aes(x = mtry, y = .estimate )) + 
  geom_point() +
  geom_line() +
  theme_classic()
```

> Although mtry= 20 (all predictors) randomly sampled predictors at each split is overall baset with an overall accuracy of about 96.66%, consider of overfitting, we prefer the other two model which might not perform as well as mtry = 20 but better than mtry = 4 -- mtry = 2 and mtry = 10. In addtion, since the larger number of randomly sampled predictors at each split we decide to include, the longer computational time for the forest, we decide to choose mtry=2 as our best tuning parameter.

#Evaluating the forest
```{r}
data_fit_mtry2
```


```{r}
rf_OOB_output(data_fit_mtry2,'mtry2', breastCa_Re_new2 %>% pull(diagnosis)) %>%
    conf_mat(truth = diagnosis, estimate= .pred_diagnosis)

```

#Variable importance measures
```{r}
data_fit_mtry2 %>% 
    extract_fit_engine() %>% 
    vip(num_features = 30) + theme_classic()
```

```{r}
ggplot(breastCa_Re_new2, aes(x = diagnosis, y = area_worst)) +
    geom_violin() + theme_classic()
```

```{r}
ggplot(breastCa_Re_new2, aes(x = diagnosis, y = fractal_dimension_mean)) +
    geom_violin() + theme_classic()
```

```{r}
#intermediate important
ggplot(breastCa_Re_new2, aes(x = diagnosis, y = perimeter_mean)) +
    geom_violin() + theme_classic()
```