-
Notifications
You must be signed in to change notification settings - Fork 437
/
doc2vec.py
62 lines (50 loc) · 1.41 KB
/
doc2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn import metrics
import numpy as np
# Name of the corpus; it selects both input files under data/.
dataset = 'mr'

# NOTE(review): these three lists are never read or written again in the
# visible script, and no shuffling is performed here. They look like
# leftovers and are kept only so the module's names stay unchanged.
doc_name_list = []
doc_train_list = []
doc_test_list = []

# Document vectors, read as raw text lines. '_pvdm_200.vec' is the suffix in
# use; the original comments also list '_doc_vectors.txt' as an alternative.
# `with` closes the handle even if reading raises.
with open('data/' + dataset + '_pvdm_200.vec', 'r') as f:
    vector_lines = f.readlines()

# Per-document metadata, read as raw text lines.
with open('data/' + dataset + '.txt', 'r') as f:
    lines = f.readlines()
# Build the train/test feature lists and label lists.
# Metadata row k is paired with line k + 1 of the vector file, so the first
# line of the vector file is never used (presumably a header line -- TODO
# confirm against whatever writes that file).
train_x = []
train_y = []
test_x = []
test_y = []
for row_idx, raw_line in enumerate(lines):
    fields = raw_line.strip().split("\t")
    tokens = vector_lines[row_idx + 1].strip().split(' ')
    # The first token is dropped (presumably the document tag -- TODO
    # confirm); the remaining tokens are parsed as the vector components.
    doc_vec = [float(tok) for tok in tokens[1:]]
    split_name = fields[1]
    # Field 1 names the split and field 2 is the label. Rows whose split
    # field contains neither 'test' nor 'train' are ignored.
    if 'test' in split_name:
        test_y.append(fields[2])
        test_x.append(doc_vec)
    elif 'train' in split_name:
        train_y.append(fields[2])
        train_x.append(doc_vec)

# Convert the feature lists to arrays; the label lists stay plain lists.
train_x = np.array(train_x)
test_x = np.array(test_x)
# Fit a logistic-regression classifier on the training vectors, then label
# the test vectors with it.
clf = LogisticRegression(random_state=0)
clf.fit(train_x, train_y)
predict_y = clf.predict(test_x)

# Accuracy: the share of test documents whose predicted label equals the
# label read from the metadata file.
correct_count = sum(
    1 for predicted, expected in zip(predict_y, test_y) if predicted == expected
)
accuracy = float(correct_count) / len(test_y)
print(dataset, accuracy)

# Per-class precision / recall / F1 table, printed with 4 decimal digits.
print("Precision, Recall and F1-Score...")
print(metrics.classification_report(test_y, predict_y, digits=4))