forked from wavewangyue/text-classification
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsvm_doc2vec.py
68 lines (50 loc) · 1.74 KB
/
svm_doc2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#coding:utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# Configuration constants. None of these are referenced by the code visible
# in this script; they are kept so the module's public names stay unchanged.
VECTOR_DIR = 'vectors.bin'
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 200
TEST_SPLIT = 0.2


def _read_lines(path):
    """Return the text of *path* split on newline characters.

    The file is closed deterministically via ``with`` instead of relying on
    garbage collection. Note that a file ending in a newline produces a
    trailing empty string, exactly as ``read().split('\\n')`` does.
    """
    with open(path) as fin:
        return fin.read().split('\n')


# One entry per line of each file: documents and their labels for the
# training and test sets.
train_docs = _read_lines('train_contents.txt')
train_labels = _read_lines('train_labels.txt')
test_docs = _read_lines('test_contents.txt')
test_labels = _read_lines('test_labels.txt')
def train_d2v_model():
    """Train a Doc2Vec model on all documents and save it to 'doc2vec.model'.

    The training and test documents (module-level ``train_docs`` and
    ``test_docs``) are concatenated in that order and written, one per line,
    to 'all_contents.txt'. That file is then fed to gensim as a
    ``TaggedLineDocument`` corpus, so document vectors come out in the same
    order: training documents first, test documents after.

    Side effects: writes 'all_contents.txt' and 'doc2vec.model' in the
    current working directory and prints the number of document vectors.
    """
    # Imported lazily so that merely importing this module does not need gensim.
    import gensim

    all_docs = train_docs + test_docs
    # ``with`` guarantees the corpus file is flushed and closed even if the
    # write fails part-way.
    with open('all_contents.txt', 'w') as fout:
        fout.write('\n'.join(all_docs))

    sentences = gensim.models.doc2vec.TaggedLineDocument('all_contents.txt')
    # size=200 is the document-vector dimensionality (same value as the
    # module-level EMBEDDING_DIM constant).
    model = gensim.models.Doc2Vec(sentences, size=200, window=5, min_count=5)
    model.save('doc2vec.model')
    # Single-argument parenthesised print: same output under Python 2, and
    # still valid syntax under Python 3.
    print('num of docs: ' + str(len(model.docvecs)))
if __name__ == '__main__':
    # Script entry point: load the saved Doc2Vec model, use its document
    # vectors as features, train an RBF-kernel SVM on the training portion
    # and report how many test predictions match the test labels.

    print('(1) training doc2vec model...')
    # Training is disabled here; a previously saved 'doc2vec.model' is loaded
    # below instead. Uncomment the next line to retrain from scratch.
    # train_d2v_model()

    print('(2) load doc2vec model...')
    import gensim
    model = gensim.models.Doc2Vec.load('doc2vec.model')

    y_train = train_labels
    y_test = test_labels

    # train_d2v_model() builds the corpus as train_docs + test_docs, so the
    # first len(y_train) document vectors belong to the training set and the
    # rest to the test set. Deriving the boundary from the labels (instead of
    # a hard-coded 17600) keeps x_train and y_train the same length for any
    # dataset size.
    num_train = len(y_train)
    docvecs = list(model.docvecs)
    x_train = docvecs[:num_train]
    x_test = docvecs[num_train:]

    print('train doc shape: '+str(len(x_train))+' , '+str(len(x_train[0])))
    print('test doc shape: '+str(len(x_test))+' , '+str(len(x_test[0])))

    print('(3) SVM...')
    from sklearn.svm import SVC
    svclf = SVC(kernel='rbf')
    svclf.fit(x_train, y_train)
    preds = svclf.predict(x_test).tolist()

    # Count predictions equal to the true label. Labels are compared as
    # integers, so both predictions and labels must be integer-like.
    num = sum(1 for i, pred in enumerate(preds) if int(pred) == int(y_test[i]))

    # NOTE: despite the label, this ratio (correct / total) is overall
    # accuracy, not precision. The printed text is kept unchanged so that
    # anything parsing the output keeps working.
    print('precision_score:' + str(float(num) / len(preds)))