-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvsm.R
92 lines (77 loc) · 2.92 KB
/
vsm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#######
library(tm)
library(SnowballC)
# Toy corpus: seven short documents that will be ranked against a query.
doc1 <- "Stray cats are running all over the place. I see 10 a day!"
doc2 <- "Cats are killers. They kill billions of animals a year."
doc3 <- "The best food in Columbus, OH is the North Market."
doc4 <- "Brand A is the best tasting cat food around. Your cat will love it."
doc5 <- "Buy Brand C cat food for your cat. Brand C makes healthy and happy cats."
doc6 <- "The Arnold Classic came to town this weekend. It reminds us to be healthy."
doc7 <- "I have nothing to say. In summary, I have told you nothing."
#######
# Collect the documents in a list named "doc1" ... "doc7".
doc.list <- list(doc1, doc2, doc3, doc4, doc5, doc6, doc7)
N.docs <- length(doc.list)
# paste0() keeps the names free of spaces. paste() uses sep = " " and would
# produce "doc 1", for which `doc.list$doc1` below returns NULL.
names(doc.list) <- paste0("doc", seq_len(N.docs))
head(doc.list)
doc.list$doc1
# Side effect: writes the documents to "doc.csv" in the working directory
# (one column per document).
write.csv(doc.list, "doc.csv")
## Query that the documents are ranked against
query <- "Healthy cat food"
###
# Put the documents and the query into one source so that both receive the
# same preprocessing; the query is the last element (position N.docs + 1).
my.docs <- VectorSource(c(doc.list, query))
my.docs$Names <- c(names(doc.list), "query")
my.docs
names(doc.list)
my.corpus <- Corpus(my.docs)
# Address documents by position with [[ ]]; this works however the documents
# happen to be named, whereas `my.corpus$doc1` does not select a document.
as.character(my.corpus[[1]])
# NOTE(review): the former `tm_map(my.corpus, PlainTextDocument)` step was
# dropped. Documents read from a VectorSource are presumably already plain
# text documents, and re-wrapping them is reported to discard document ids or
# break the corpus in current tm releases -- confirm against the installed
# tm version.
# Transformations: normalise the text before counting terms
my.corpus <- tm_map(my.corpus, removePunctuation)
getTransformations()
inspect(my.corpus)
##########
my.corpus <- tm_map(my.corpus, removeNumbers)
# tolower() is a base R function, not a tm transformation: it returns a bare
# character vector instead of a document. content_transformer() wraps it so
# the corpus keeps holding text documents for TermDocumentMatrix() below.
my.corpus <- tm_map(my.corpus, content_transformer(tolower))
my.corpus <- tm_map(my.corpus, stripWhitespace)
# Fourth document after preprocessing (positional access; `$doc4` does not
# select a document).
my.corpus[[4]]
# Rows are terms, columns are the documents followed by the query.
term.doc.matrix.stm <- TermDocumentMatrix(my.corpus)
inspect(term.doc.matrix.stm[1:14, ])
# Dense copy used for the tf-idf arithmetic below.
term.doc.matrix <- as.matrix(term.doc.matrix.stm)
# A matrix carries dimnames (terms, documents); names() on it is always NULL.
dimnames(term.doc.matrix)
#########
get.tf.idf.weights <- function(tf.vec, df, n.docs = N.docs) {
  # Computes tf-idf weights for a single term from its term-frequency vector
  # and its document frequency.
  #
  # Args:
  #   tf.vec: numeric vector with the raw count of the term in each column
  #           (document or query).
  #   df:     scalar, number of documents that contain the term.
  #   n.docs: total number of documents in the collection. Defaults to the
  #           global N.docs, so existing two-argument calls keep working.
  #
  # Returns:
  #   Numeric vector of the same length as tf.vec holding
  #   (1 + log2(tf)) * log2(n.docs / df) where tf > 0, and 0 elsewhere.
  weight <- rep(0, length(tf.vec))
  # A term found in no document has an undefined idf (log2(n / 0) = Inf).
  # It cannot match any document, so give it weight 0 instead of Inf.
  if (df <= 0) {
    return(weight)
  }
  present <- tf.vec > 0
  weight[present] <- (1 + log2(tf.vec[present])) * log2(n.docs / df)
  weight
}
# Worked example; n.docs is passed explicitly so the numbers match the
# "4 of 6 documents" wording rather than the size of the corpus above.
cat("A word appearing in 4 of 6 documents, occuring 1, 2, 3, and 6 times, respectively: \n",
    get.tf.idf.weights(c(1, 2, 3, 0, 0, 6), 4, n.docs = 6))
#####
get.weights.per.term.vec <- function(tfidf.row) {
  # Converts one row of term counts into tf-idf weights.
  #
  # Args:
  #   tfidf.row: counts of one term across all columns; the first N.docs
  #              entries are the documents.
  #
  # Returns:
  #   The vector produced by get.tf.idf.weights() for this row.
  #
  # The document frequency is taken from the first N.docs entries only, so
  # anything after them (the query column) does not count as a document.
  appears.in.doc <- tfidf.row[1:N.docs] > 0
  get.tf.idf.weights(tfidf.row, sum(appears.in.doc))
}
# Weight each term row. apply() over rows returns the per-row results as
# columns, so transpose to get back to terms x columns.
tfidf.matrix <- apply(term.doc.matrix, 1, get.weights.per.term.vec)
tfidf.matrix <- t(tfidf.matrix)
# Carry the column labels over from the count matrix.
colnames(tfidf.matrix) <- colnames(term.doc.matrix)
# Peek at the first three terms.
tfidf.matrix[1:3, ]
###################
# Illustration: cosine of the angle between two vectors, from -pi to pi in
# steps of pi/16.
angle <- seq(from = -pi, to = pi, by = pi/16)
plot(
  x = angle,
  y = cos(angle),
  type = "b",
  xlab = "angle in radians",
  ylab = "cos(angle)",
  main = "Cosine similarity by angle"
)
############
# Scale every column (documents and query) to unit Euclidean length, so the
# dot product of two columns is their cosine similarity.
col.norms <- sqrt(colSums(tfidf.matrix^2))
# A column with no weighted terms has norm 0; dividing by it would turn the
# column, and every score computed from it, into NaN. Leave it all-zero.
col.norms[col.norms <= 0] <- 1
tfidf.matrix <- scale(tfidf.matrix, center = FALSE, scale = col.norms)
tfidf.matrix[1:3, ]
##########
# The query is the column after the N.docs documents; split it off.
query.vector <- tfidf.matrix[, (N.docs + 1)]
# drop = FALSE keeps a matrix even when there is a single document.
tfidf.matrix <- tfidf.matrix[, seq_len(N.docs), drop = FALSE]
######
# 1 x N.docs matrix of cosine similarities between the query and each document.
doc.scores <- t(query.vector) %*% tfidf.matrix
########
# One row per document: its name, its cosine score against the query, and
# its original text, ordered from best to worst match.
results.df <- data.frame(
  doc = names(doc.list),
  score = as.vector(doc.scores),
  text = unlist(doc.list)
)
results.df <- results.df[order(results.df$score, decreasing = TRUE), ]
##### Result
# Widen the console only for this print so each document text stays on one
# line, then put the previous width back instead of leaving it changed.
old.opts <- options(width = 2000)
print(results.df, row.names = FALSE, right = FALSE, digits = 2)
options(old.opts)
####