-
Notifications
You must be signed in to change notification settings - Fork 7
/
model-caret-in-practice.Rmd
126 lines (100 loc) · 3.2 KB
/
model-caret-in-practice.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
---
layout: page
title: xwMOOC 모형
subtitle: "`caret` 예측모형 실전코드"
author:
  name: xwMOOC
  url: https://www.facebook.com/groups/tidyverse/
  affiliation: Tidyverse Korea
date: "`r Sys.Date()`"
output:
  html_document:
    toc: yes
    toc_float: true
    highlight: tango
    code_folding: show
    number_sections: true
    self_contained: true
editor_options:
  chunk_output_type: console
---
```{r setup, include=FALSE}
# Global chunk defaults applied to every code chunk in this document:
# show the code, hide messages and warnings, and print output without
# a comment prefix.
# NOTE(review): `digits` does not look like a standard knitr chunk option;
# confirm it has the intended effect (options(digits = 3) may be what is
# wanted).
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  comment = "",
  digits = 3,
  tidy = FALSE,
  prompt = FALSE,
  fig.align = "center"
)
```
# 기계학습 예측모형 기본기 [^best-algorithm] [^applied-predictive-modeling] {#predictive-model}
```{r taste-predictive-model}
# 0. 환경설정 ------
library(caret)
library(tidyverse)
library(janitor)
library(doSNOW)
# 1. Data ------
# Load the GermanCredit data set into the workspace.
data(GermanCredit)

# 2. Data preprocessing ------
## Clean up variable names -----
# as_tibble() replaces tbl_df(), which is deprecated in dplyr.
credit_dat <- GermanCredit %>%
  clean_names() %>%
  as_tibble()

## Select the variables used by the predictive models -----
# nearZeroVar() returns the positions of near-zero-variance columns;
# those columns are dropped and every other column is kept.
all_variables <- names(credit_dat)
remove_variables <- all_variables[nearZeroVar(credit_dat)]
credit_df <- credit_dat[, setdiff(all_variables, remove_variables)]
# 3. Predictive models ------
## 3.1. Parallel backend
# detectCores() is exported from parallel, so `::` is sufficient. It can
# return NA when the core count cannot be determined; fall back to one worker.
num_cores <- parallel::detectCores()
if (is.na(num_cores)) {
  num_cores <- 1L
}
start_time <- Sys.time()
cl <- makeCluster(num_cores, type = "SOCK")
registerDoSNOW(cl)

## 3.2. Training vs. validation/test split
# Fix the RNG seed so the partition and the resampling folds below are
# reproducible from run to run.
set.seed(1234)
train_test_index <- createDataPartition(
  credit_df$class,
  p = 0.7,
  list = FALSE
)
train <- credit_df[train_test_index, ]
test <- credit_df[-train_test_index, ]

## 3.3. Resampling scheme and tuning grids ------
# The folds are generated once (10 folds, repeated 5 times) and passed via
# `index`, so every model is resampled on exactly the same rows.
cv_folds <- createMultiFolds(train$class, k = 10, times = 5)
cv_ctrl <- trainControl(
  # Label the scheme as repeated CV so it matches the supplied folds.
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  index = cv_folds,
  summaryFunction = twoClassSummary,
  classProbs = TRUE, # very important: flagged as required by the author
  verboseIter = TRUE
)

# Candidate hyper-parameters for the ranger model.
ranger_tune_grid <- expand.grid(
  .mtry = c(2, 16, 31, 48),
  .splitrule = c("gini", "extratrees"),
  .min.node.size = c(5, 10)
)

# Candidate hyper-parameters for the glmnet model.
glmnet_tune_grid <- expand.grid(
  alpha = 0:1,
  lambda = seq(0.0001, 1, length = 10)
)
## 3.4. Fit the predictive models
# Both models use the same formula, preprocessing steps, selection metric
# ("Sens") and resampling control; only the method and tuning grid differ.
# Note: `train` is both the caret function and the training data frame here;
# R resolves the call to the function.

### glmnet
gc_glmnet_model <- train(
  class ~ .,
  data = train,
  method = "glmnet",
  metric = "Sens",
  preProcess = c("zv", "center", "scale", "spatialSign"),
  tuneGrid = glmnet_tune_grid,
  trControl = cv_ctrl
)

### ranger
gc_ranger_model <- train(
  class ~ .,
  data = train,
  method = "ranger",
  metric = "Sens",
  preProcess = c("zv", "center", "scale", "spatialSign"),
  tuneGrid = ranger_tune_grid,
  # tuneLength = 7,
  trControl = cv_ctrl
)
# 4. Compare the models -----
# Collect the resampling results of both fitted models and compare them.
model_list <- list(glmnet = gc_glmnet_model, rf = gc_ranger_model)
resamps <- resamples(model_list)
summary(resamps)
dotplot(resamps, metric = "Sens")

# 5. Evaluate model performance -----
# Predict class labels for the held-out rows with the glmnet model.
gc_pred_class <- predict(gc_glmnet_model, newdata = test, type = "raw")

## Confusion matrix -----
confusionMatrix(data = gc_pred_class, reference = test$class)

# Wall-clock time elapsed since start_time was recorded.
mc_total_time <- Sys.time() - start_time
mc_total_time

# Shut down the worker processes started for parallel training.
stopCluster(cl)
```