-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiment.py
71 lines (55 loc) · 1.72 KB
/
sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import nltk
import os
import sys
def main():
    """Train a sentiment classifier from a corpus and classify one sentence.

    Expects exactly one command-line argument: the corpus directory that is
    passed to ``load_data``. After training, a sentence is read from standard
    input and the probability of every label is printed with four decimals.
    Exits with a usage message when the argument count is wrong.
    """
    if len(sys.argv) != 2:
        sys.exit("Usage: python sentiment.py corpus")

    # Each corpus is a list of documents; each document is a set of words.
    positives, negatives = load_data(sys.argv[1])

    # The vocabulary is every word seen in either corpus.
    vocabulary = set().union(*positives, *negatives)

    # One labelled feature dict per document, positives first.
    samples = (
        generate_features(positives, vocabulary, "Positive")
        + generate_features(negatives, vocabulary, "Negative")
    )
    classifier = nltk.NaiveBayesClassifier.train(samples)

    # Classify a single sentence typed by the user.
    sentence = input("s: ")
    distribution = classify(classifier, sentence, vocabulary)
    for label in distribution.samples():
        print(f"{label}: {distribution.prob(label):.4f}")
def extract_words(document):
    """Return the set of lowercased word tokens found in ``document``.

    The text is tokenized with ``nltk.word_tokenize``; a token is kept only
    if it contains at least one alphabetic character.
    """
    tokens = nltk.word_tokenize(document)
    return {
        token.lower()
        for token in tokens
        if any(char.isalpha() for char in token)
    }
def load_data(directory):
    """Load the positive and negative corpora from ``directory``.

    Reads ``positives.txt`` and then ``negatives.txt`` from the directory and
    treats every line as one document.

    Returns a two-element list ``[positives, negatives]`` in which each
    element is a list holding one ``extract_words`` result per line.
    """
    corpora = []
    for name in ("positives.txt", "negatives.txt"):
        path = os.path.join(directory, name)
        with open(path) as handle:
            lines = handle.read().splitlines()
        corpora.append([extract_words(line) for line in lines])
    return corpora
def generate_features(documents, words, label):
    """Build labelled feature dictionaries for a collection of documents.

    For each document a dict is created that maps every entry of ``words``
    to ``True`` if it occurs in the document and ``False`` otherwise.

    Returns a list of ``(features, label)`` tuples, one per document, in the
    order the documents were given.
    """
    return [
        ({word: word in document for word in words}, label)
        for document in documents
    ]
def classify(classifier, document, words):
    """Return the classifier's probability distribution for ``document``.

    The raw text is reduced to its word set with ``extract_words``; each
    entry of ``words`` then becomes a boolean feature saying whether it
    occurs in that set. The result is whatever
    ``classifier.prob_classify`` returns for those features.
    """
    present = extract_words(document)
    return classifier.prob_classify(
        {word: word in present for word in words}
    )
# Run the interactive classifier only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()