init.py
# -*- coding: utf-8 -*-
"""
Main module: trains and evaluates quote vs. non-quote classifiers.

@author: Niraj Gautam
"""
import pickle
import random

from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sn

import generate_pos
import train_models
import test_models
from helper import feature_extraction

if __name__ == "__main__":
    # Generate POS tags for the quote and non-quote corpora
    generatePos = generate_pos.GeneratePos(quoteFile="quote.txt", nonQuoteFile="nonquote.txt")
    quotePos, nonQuotePos = generatePos.generate_pos()
    # Load the pickled quote and non-quote sentence lists
    with open('./data/quote.txt', "rb") as quoteFile:
        quote = pickle.load(quoteFile)
    with open('./data/nonquote.txt', "rb") as nonquoteFile:
        nonquote = pickle.load(nonquoteFile)
    # Build the word feature list from the POS-tagged sentences,
    # keeping only adjective, adverb, noun and verb tags
    allowed_word_types = ["JJ", "R", "NN", "V", "VBN", "VBP", "VB"]
    all_words = []
    documents = []
    for q in quote:
        documents.append((q, "p"))  # label quotes as positive
    for w in quotePos:
        for c in w:
            if c[1] in allowed_word_types:
                all_words.append(c[0].lower())
    for n in nonquote:
        documents.append((n, "n"))  # label non-quotes as negative
    for w in nonQuotePos:
        for c in w:
            if c[1] in allowed_word_types:
                all_words.append(c[0].lower())
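    # Note (assumption): if generate_pos relies on NLTK's Penn Treebank tagger,
    # adverbs and base verbs come back as "RB*"/"VB*" rather than bare "R"/"V",
    # so those two entries never match an exact comparison. A prefix check
    # would catch them, e.g.:
    #     if any(c[1].startswith(t) for t in allowed_word_types):
    #         all_words.append(c[0].lower())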
    # Create the feature extractor and build the labelled feature sets
    featureExtraction = feature_extraction.Featurextraction()
    featuresets = [(featureExtraction.find_feature(rev), category) for (rev, category) in documents]
    random.shuffle(featuresets)
    print(len(featuresets))
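    # Note (assumption): find_feature is presumed to return an NLTK-style
    # dict of word-presence features for a document, e.g.
    #     {"beautiful": True, "run": False, ...}
    # so each entry in featuresets is a (feature_dict, label) pair.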
    # Split into train/test sets
    training_set = featuresets[:6000]
    testing_set = featuresets[6000:]
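    # Note: the fixed 6000-sample cut-off assumes the shuffled corpus holds
    # well over 6000 documents; a proportional split is a safer default, e.g.:
    #     split = int(0.8 * len(featuresets))
    #     training_set, testing_set = featuresets[:split], featuresets[split:]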
    # Train the models on the training set
    t = train_models.TrainModel(training_set)
    t.train()
    # Evaluate the trained models against the testing set
    testModel = test_models.TestModel(testing_set)
    pred = testModel.test()
    y_test = [f[1] for f in testing_set]
    # Plot the confusion matrix to visualise the result
    cm = confusion_matrix(y_test, pred)
    sn.heatmap(cm, annot=True, fmt='d')
    plt.xlabel('Predicted')
    plt.ylabel('Truth')
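    # Note: with no explicit label order, confusion_matrix sorts the classes
    # alphabetically, so rows/columns come out as ["n", "p"]. Passing the
    # order explicitly keeps the heatmap readable, e.g.:
    #     cm = confusion_matrix(y_test, pred, labels=["p", "n"])
    #     sn.heatmap(cm, annot=True, fmt='d',
    #                xticklabels=["p", "n"], yticklabels=["p", "n"])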
    # Log the classification report and display the heatmap
    print(classification_report(y_test, pred))
    plt.show()