-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnaiveBayes.py
179 lines (141 loc) · 5.54 KB
/
naiveBayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
from collections import defaultdict
import numpy as np
# noinspection SpellCheckingInspection
class NaiveBayesClassifier(object):
    """Multinomial Naive Bayes text classifier with add-alpha (Laplace) smoothing.

    Follows the standard "train naive Bayes" algorithm: per-class log-priors
    plus per-word log-likelihoods estimated from training word counts.
    """
    def __init__(self, n_gram=1, printing=False):
        self.prior = defaultdict(int)        # raw per-class token counts (only filled by compute_prior_and_bigdoc)
        self.logprior = {}                   # class -> log P(c)
        self.bigdoc = defaultdict(list)      # class -> list of training documents
        self.loglikelihoods = defaultdict(defaultdict)  # class -> word -> log P(word | class)
        self.word_count = {}                 # class -> word -> count; filled by train()
        self.V = []                          # training vocabulary (a set after train())
        self.n = n_gram
    def compute_prior_and_bigdoc(self, training_set, training_labels):
        '''
        Computes the prior and the bigdoc (from the book's algorithm).
        Not called by train(); kept for API compatibility.
        NOTE(review): self.words_to_grams is not defined anywhere in this file,
        so calling this with n_gram != 1 would raise AttributeError.
        :param training_set:
            a list of all documents of the training set
        :param training_labels:
            a list of labels corresponding to the documents in the training set
        :return:
            None
        '''
        for x, y in zip(training_set, training_labels):
            all_words = x.split(" ")
            if self.n == 1:
                grams = all_words
            else:
                grams = self.words_to_grams(all_words)
            self.prior[y] += len(grams)
            self.bigdoc[y].append(x)
    def compute_vocabulary(self, documents):
        """Return the set of lower-cased whitespace tokens over all documents."""
        vocabulary = set()
        for doc in documents:
            for word in doc.split(" "):
                vocabulary.add(word.lower())
        return vocabulary
    def count_word_in_classes(self):
        """Return {class: {word: count}} with words lower-cased.

        Lower-casing fixes an inconsistency with compute_vocabulary (which
        also lower-cases): previously, capitalised tokens were counted under
        their original casing and therefore never matched the vocabulary,
        silently contributing zero counts.
        """
        counts = {}
        for c in self.bigdoc:
            counts[c] = defaultdict(int)
            for doc in self.bigdoc[c]:
                for word in doc.split(" "):
                    counts[c][word.lower()] += 1
        return counts
    def train(self, training_set, training_labels, alpha=1):
        """Fit log-priors and smoothed log-likelihoods.

        :param training_set: list of document strings
        :param training_labels: sequence of class labels (plain list or numpy array)
        :param alpha: additive smoothing constant (default 1 = Laplace)
        :return: None
        """
        # Number of training documents.
        N_doc = len(training_set)
        # Vocabulary is lower-cased; the word counts below match that casing.
        self.V = self.compute_vocabulary(training_set)
        # Group documents by class.
        for x, y in zip(training_set, training_labels):
            self.bigdoc[y].append(x)
        all_classes = set(training_labels)
        self.word_count = self.count_word_in_classes()
        for c in all_classes:
            # Generic count works for plain lists as well as numpy arrays
            # (the original `sum(training_labels == c)` required an ndarray).
            N_c = float(sum(1 for label in training_labels if label == c))
            self.logprior[c] = np.log(N_c / N_doc)
            # Total tokens observed in class c over the vocabulary.
            total_count = sum(self.word_count[c][word] for word in self.V)
            # Smoothed denominator is loop-invariant; hoisted out of the word loop.
            denom = total_count + alpha * len(self.V)
            for word in self.V:
                count = self.word_count[c][word]
                self.loglikelihoods[c][word] = np.log((count + alpha) / denom)
    def predict(self, test_doc):
        """Return {class: log-posterior score} for test_doc.

        Tokens are lower-cased to match the training vocabulary; unknown
        words are skipped.
        """
        sums = {
            0: 0,
            1: 0,
        }
        for c in self.bigdoc.keys():
            sums[c] = self.logprior[c]
            for word in test_doc.split(" "):
                word = word.lower()  # match the lower-cased vocabulary
                if word in self.V:
                    sums[c] += self.loglikelihoods[c][word]
        return sums
# Big file stuff
import string
import json
# Load the pre-parsed reviews corpus; json.load expects a list of review
# records (each with "cv", "sentiment" and "content" keys, per their use below).
with open("reviews.json", mode="r", encoding="utf-8") as f:
    reviews = json.load(f)
# Map the corpus sentiment tags to the numeric class labels used by the classifier.
sentiment_numerical_val = {
    'NEG': 0,
    'POS': 1
}
import pprint  # NOTE(review): appears unused in this file
def split_review_data(reviews, split=900, remove_punc=False, separation=" "):
    """Split parsed reviews into training and validation sets.

    :param reviews: iterable of review dicts with "cv", "sentiment", and
        "content" keys (content presumably is a list of sentences, each a
        list of (token, tag) pairs — only the token, word[0], is used)
    :param split: cross-validation cutoff; reviews with cv < split go to training
    :param remove_punc: if True, strip punctuation characters from each document
    :param separation: separator inserted after every token
    :return: (training_set, training_labels, validation_set, validation_labels);
        labels are numpy arrays of 0 (NEG) / 1 (POS)
    """
    training_set = []
    training_labels = []
    validation_set = []
    validation_labels = []
    # Hoisted out of the loop: the punctuation set never changes.
    exclude = set(string.punctuation)
    for r in reviews:
        cv = int(r["cv"])
        sent = sentiment_numerical_val[r["sentiment"]]
        # Rebuild the document as lower-cased tokens. A trailing separator is
        # kept deliberately to preserve the original tokenization downstream.
        content_string = ""
        for sentence in r["content"]:
            for word in sentence:
                content_string += word[0].lower() + separation
        if remove_punc:
            content_string = ''.join(character for character in content_string if character not in exclude)
        # Fixed off-by-one: the original `0 < cv < split` wrongly routed
        # cv == 0 into the validation set.
        if 0 <= cv < split:
            training_set.append(content_string)
            training_labels.append(sent)
        else:
            validation_set.append(content_string)
            validation_labels.append(sent)
    return training_set, np.array(training_labels), validation_set, np.array(validation_labels)
def evaluate_predictions(validation_set, validation_labels, trained_classifier):
    """Score a trained classifier on a validation set.

    Calls trained_classifier.predict(doc) for each document, picks the class
    with the higher score (ties go to class 0), prints a summary line, and
    returns a list of "+"/"-" markers (correct/incorrect per document) plus
    the accuracy as a rounded integer percentage.
    """
    marks = []
    n_correct = 0
    for doc, truth in zip(validation_set, validation_labels):
        scores = trained_classifier.predict(doc)
        # The >= comparison resolves score ties in favour of class 0.
        guess = 0 if scores[0] >= scores[1] else 1
        if guess == truth:
            n_correct += 1
            marks.append("+")
        else:
            marks.append("-")
    total = len(validation_labels)
    print("Predicted correctly {} out of {} ({}%)".format(n_correct, total, round(n_correct / total * 100, 5)))
    return marks, round(n_correct / total * 100)
# Build the train/validation split from the loaded corpus, train the
# classifier, and report validation accuracy plus wall-clock run time.
training_set, training_labels, validation_set, validation_labels = split_review_data(reviews)
import time
start = time.time()
NBclassifier = NaiveBayesClassifier()
NBclassifier.train(training_set, training_labels, alpha=1)
results, acc = evaluate_predictions(validation_set, validation_labels, NBclassifier)
end = time.time()
print('Ran in {} seconds'.format(round(end - start, 3)))