-
Notifications
You must be signed in to change notification settings - Fork 0
/
NaiveBayes.R
133 lines (106 loc) · 4.54 KB
/
NaiveBayes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
library(e1071)
library(caret)
library(tidyverse)
library(RColorBrewer)
library(tm)
library(SnowballC)
# Example 1: Email spam
#######################
# Load the email spam data and wrangle it.
email <- read.csv("../data/email-spam.csv") %>% tibble()
# Factorize the output variable with "Yes" and "No".
email$spam <- factor(
ifelse(email$spam == 1, "Yes", "No"),
levels = c("Yes", "No"))
# Visualize the distribution of emails.
email %>%
group_by(spam) %>%
count(name = "freq") %>%
ungroup() %>%
mutate(spam = fct_reorder(spam, levels(spam))) %>%
ggplot(aes(x = spam, y = freq, fill = spam)) +
geom_bar(stat = "identity") +
labs(
title = "Spam Emails\n",
x = "Spam\n",
y = "\n# Emails",
fill = "Spam"
) +
coord_flip() +
theme_minimal() +
scale_fill_brewer(palette = "Paired")
# Partition the data into a training set and test set.
test.index <- sample(1:nrow(email), size = floor(0.75 * nrow(jobs)))
email$test <- F
email$test[test.index] <- T
# Train a Naive Bayes classifier on the training data.
set.seed(7483)
nb <- naiveBayes(spam ~ ., data = email, subset = which(test == T))
# Use the model to predict the test data.
pred <- predict(nb, email[email$test == F,], type = "class")
# Evaluate the model.
confusionMatrix(pred, email$spam[email$test == F])
# Example 2: SMS spam
#####################
# Data source: https://www.dt.fee.unicamp.br/~tiago/smsspamcollection/
# This example uses text mining to create a corpus and applies Naive Bayes on the corpus.
# Load and wrangle the data.
sms <- read.table("../data/sms_spam_collection.txt", sep = "\t", header = F, quote = "")
names(sms) <- c("Label", "Message")
sms$Label <- factor(sms$Label)
# Find frequency and likelihood of spam messages in original data.
tibble(
SpamMessages = sum(sms$Label == "spam"),
SpamMessagesPercent = paste0(round(sum(sms$Label == "spam") / nrow(sms) * 100, 1), "%"),
HamMessages = sum(sms$Label == "ham"),
HamMessagesPercent = paste0(round(sum(sms$Label == "ham") / nrow(sms) * 100, 1), "%"),
TotalMessages = nrow(sms)
)
# Show 3 example messages.
set.seed(34389)
sms[runif(3, 1, nrow(sms)),]
# Create a corpus of all the SMS messages.
# Inspect the first 3 documents (messages).
corpus <- VCorpus(VectorSource(sms$Message))
lapply(corpus[1:3], function(x) as.character(x))
# Clean the corpus and perform stemming.
corpus.clean <- tm_map(corpus, content_transformer(tolower))
corpus.clean <- tm_map(corpus.clean, removeNumbers)
corpus.clean <- tm_map(corpus.clean, removeWords, stopwords(kind = "en"))
corpus.clean <- tm_map(corpus.clean, removePunctuation)
corpus.clean <- tm_map(corpus.clean, stemDocument)
# Show the same 3 example messages to confirm they have been cleaned.
set.seed(34389)
lapply(corpus.clean[runif(3, 1, nrow(sms))], as.character)
# Tokenize terms and create a document-term incident matrix (DTM).
# Preview the matrix.
dtm <- DocumentTermMatrix(corpus.clean)
inspect(dtm)
# Convert the DTM to a matrix for computations.
dtm.matrix <- as.matrix(dtm)
colSums(dtm.matrix)[100:110] # Number of occurrences of words 100 - 110.
# Partition the terms into a training set and test set (75%-25% split).
dtm.train <- dtm[1:floor(0.75 * nrow(dtm)), ]
dtm.test <- dtm[(floor(0.75 * nrow(dtm)) + 1):nrow(dtm), ]
train.labels <- sms$Label[1:floor(0.75 * nrow(dtm))]
test.labels <- sms$Label[(floor(0.75 * nrow(dtm)) + 1):nrow(dtm)]
# Find terms occurring in at least 10 messages. Use these to subset the training
# and test sets.
freq.words <- findFreqTerms(dtm.train, lowfreq = 10)
dtm.train.freq <- dtm.train[, freq.words]
dtm.test.freq <- dtm.test[, freq.words]
# Convert terms to categorical (Y = term occurs, N = term does not occur).
train.data <- apply(dtm.train.freq, MARGIN = 2, FUN = function(x) ifelse(x > 0, "Y", "N"))
test.data <- apply(dtm.test.freq, MARGIN = 2, FUN = function(x) ifelse(x > 0, "Y", "N"))
# Train the Naive Bayes model on the training data. Preview the first 5 terms and
# their fitted classification probabilities.
m <- naiveBayes(train.data, train.labels)
m$tables[1:5]
# Use the model to predict the test data labels.
pred <- predict(m, test.data, type = "class")
# Evaluate the model by comparing the predicted labels against the ground truth.
confusionMatrix(pred, test.labels)
# The model performs well, but can be improved by setting the Laplace estimator to 1.
m.2 <- naiveBayes(train.data, train.labels, laplace = 1)
pred.2 <- predict(m.2, test.data, type = "class")
confusionMatrix(pred.2, test.labels)