HR Analytics and Employee Performance (IBM): using classification models to predict whether an employee is likely to quit.
library(VIM)
library(plotrix)
library(dplyr)
library(tidyr)
library(tidyverse)
library(ggplot2)
library(lubridate)
library(corrplot)
library(shiny)
library(DT)
library(shinydashboard)
library(shinythemes)
library(ggthemes)
library(ggcorrplot)
library(cowplot)
library(caret)
library(rpart)
library(pROC)
# data: the IBM HR Analytics Employee Attrition dataset loaded in an earlier step (e.g. with read.csv())
data_ibm <- data
# Stratified 80/20 train/test split on the Attrition outcome
set.seed(123)  # seed value is arbitrary; set only for reproducibility
trainIndex <- createDataPartition(data_ibm$Attrition, p=0.80, list=FALSE, times=1)
train <- data_ibm[trainIndex,]
test <- data_ibm[-trainIndex,]
# Class proportions of Attrition in each split
prop_train <- train %>% select(Attrition) %>%
  group_by(Attrition) %>% summarize(n=n()) %>%
  mutate(pct=round(prop.table(n), 2))
prop_test <- test %>% select(Attrition) %>%
  group_by(Attrition) %>% summarize(n=n()) %>%
  mutate(pct=round(prop.table(n), 2))
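Printing both tables is a quick sanity check: createDataPartition stratifies on the outcome, so the Yes/No proportions of Attrition should be nearly identical in the two splits.
prop_train
prop_test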
# Fit a decision tree and extract its variable importance scores
rpart.tree <- rpart(Attrition ~ ., data=train)
var_imp <- data.frame(rpart.tree$variable.importance)
var_imp$features <- rownames(var_imp)
var_imp <- var_imp[, c(2, 1)]
# Express importance as a share of the total so the "%" labels below are meaningful
var_imp$importance <- round(100 * var_imp$rpart.tree.variable.importance /
                              sum(var_imp$rpart.tree.variable.importance), 2)
var_imp$rpart.tree.variable.importance <- NULL
colorCount <- length(unique(var_imp$features))  # useful if a custom fill palette is wanted
feature_importance <- var_imp %>%
  ggplot(aes(x=reorder(features, importance), y=importance, fill=features)) +
  geom_bar(stat='identity') +
  coord_flip() +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5)) +
  geom_label(aes(label=paste0(importance, "%")), colour = "white", fontface = "italic", hjust=0.6) +
  labs(title="Variable importance (decision tree)", x="Features", y="Importance (%)")
- Determine the independent variables
- Incorporate the dependent variable "Attrition" into the model
- Find the variables that matter most in determining employee attrition
- Convert the formula string from "character" to "formula"
- Fit the models on the training data
# First, combine all 30 candidate independent variables into a single formula against Attrition; models are fit on the train split created above
independentvariables1=c("Age", "BusinessTravel", "DailyRate", "Department", "DistanceFromHome", "Education", "EducationField", "EnvironmentSatisfaction", "Gender", "HourlyRate", "JobInvolvement", "JobLevel", "JobRole", "JobSatisfaction", "MaritalStatus", "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked", "OverTime", "PercentSalaryHike", "PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear", "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager")
Model1=paste(independentvariables1,collapse="+")
Model_1=paste("Attrition~",Model1)
formula=as.formula(Model_1)
# Run Training Model1
Trainingmodel1=glm(formula=formula,data=train,family="binomial")
Trainingmodel1=step(object = Trainingmodel1,direction = "both")
summary(Trainingmodel1)
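As a quick way to see which terms the summary flags as significant (the basis for the 17-variable shortlist below), the coefficient table of the fitted glm can be filtered on its p-value column; this is a minimal sketch, not part of the original workflow, and the 0.05 cut-off is an assumption.
# Coefficient matrix of the stepwise-selected model: Estimate, Std. Error, z value, Pr(>|z|)
coefs1 <- coef(summary(Trainingmodel1))
# Names of terms with p-values below 0.05 (cut-off chosen for illustration)
rownames(coefs1)[coefs1[, "Pr(>|z|)"] < 0.05]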
# After the first stepwise iteration, keep the variables that remain significant based on their p-values: 17 variables
# Combine these 17 independent variables into the next model formula
independentvariables2=c("BusinessTravel", "DistanceFromHome", "EnvironmentSatisfaction","JobInvolvement","JobRole","JobSatisfaction","MaritalStatus","NumCompaniesWorked","OverTime","RelationshipSatisfaction","TotalWorkingYears", "TrainingTimesLastYear","WorkLifeBalance","YearsAtCompany","YearsInCurrentRole", "YearsSinceLastPromotion","YearsWithCurrManager")
Model2=paste(independentvariables2,collapse="+")
Model_2=paste("Attrition~",Model2)
formula=as.formula(Model_2)
# Run Training Model2
Trainingmodel2=glm(formula=formula,data=train,family="binomial")
Trainingmodel2=step(object = Trainingmodel2,direction = "both")
summary(Trainingmodel2)
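The next step drops JobRole because only one of its levels remains significant; a minimal sketch of how to inspect just the JobRole dummy coefficients in the fitted model:
# Rows of the coefficient table corresponding to the JobRole dummy variables
coefs2 <- coef(summary(Trainingmodel2))
coefs2[grep("^JobRole", rownames(coefs2)), , drop = FALSE]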
# After the second iteration, "Sales Representative" is the only JobRole level that remains significant,
# so we drop JobRole from the model.
# Combine the remaining 16 independent variables into the next model formula
independentvariables3=c("BusinessTravel", "DistanceFromHome", "EnvironmentSatisfaction","JobInvolvement","JobSatisfaction","MaritalStatus","NumCompaniesWorked","OverTime","RelationshipSatisfaction","TotalWorkingYears", "TrainingTimesLastYear","WorkLifeBalance","YearsAtCompany","YearsInCurrentRole", "YearsSinceLastPromotion","YearsWithCurrManager")
Model3=paste(independentvariables3,collapse="+")
Model_3=paste("Attrition~",Model3)
formula=as.formula(Model_3)
# Run Training Model3
Trainingmodel3=glm(formula=formula,data=train,family="binomial")
Trainingmodel3=step(object = Trainingmodel3,direction = "both")
summary(Trainingmodel3)
# After the third iteration, drop WorkLifeBalance, YearsAtCompany and YearsWithCurrManager
# Combine the remaining 13 independent variables into the final model formula
independentvariables4=c("BusinessTravel", "DistanceFromHome", "EnvironmentSatisfaction","JobInvolvement","JobSatisfaction","MaritalStatus","NumCompaniesWorked","OverTime","RelationshipSatisfaction","TotalWorkingYears", "TrainingTimesLastYear","YearsInCurrentRole", "YearsSinceLastPromotion")
Model4=paste(independentvariables4,collapse="+")
Model_4=paste("Attrition~",Model4)
formula=as.formula(Model_4)
# Run Training Model4
Trainingmodel4=glm(formula=formula,data=train,family="binomial")
Trainingmodel4=step(object = Trainingmodel4,direction = "both")
summary(Trainingmodel4)
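The test split created at the start is not used above; a minimal sketch of how the final model could be evaluated on it, assuming Attrition is coded "Yes"/"No" and using an arbitrary 0.5 probability cut-off:
# Predicted attrition probabilities on the held-out test data
test_prob <- predict(Trainingmodel4, newdata = test, type = "response")
# Classify with a 0.5 threshold (the threshold choice is an assumption)
test_pred <- ifelse(test_prob > 0.5, "Yes", "No")
# Confusion matrix of predictions against the actual labels
table(Predicted = test_pred, Actual = test$Attrition)
# Test-set AUC, comparable with the training AUCs computed next
roc(response = test$Attrition, predictor = test_prob)$auc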
# Training-set ROC curve and AUC for each candidate model
troc1=roc(response=Trainingmodel1$y,predictor = Trainingmodel1$fitted.values,plot=T)
troc1$auc
troc2=roc(response=Trainingmodel2$y,predictor = Trainingmodel2$fitted.values,plot=T)
troc2$auc
troc3=roc(response=Trainingmodel3$y,predictor = Trainingmodel3$fitted.values,plot=T)
troc3$auc
troc4=roc(response=Trainingmodel4$y,predictor = Trainingmodel4$fitted.values,plot=T)
troc4$auc
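Collecting the four training AUCs in one small table makes the comparison easier; a minimal sketch:
# Training AUCs of the four candidate models, side by side
data.frame(model = paste0("Trainingmodel", 1:4),
           auc = as.numeric(c(troc1$auc, troc2$auc, troc3$auc, troc4$auc)))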
🔗 Dataset: IBM HR Analytics Employee Attrition & Performance
- Any constructive suggestions are welcome. Please upvote this if you find it useful ✨