-
Notifications
You must be signed in to change notification settings - Fork 437
/
build_corpus.py
107 lines (81 loc) · 2.18 KB
/
build_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import re
# Build the corpus file for one dataset.
#
# Reads the index file data/<dataset>.txt, takes the first tab-separated
# field of every line as the path of a document, loads each document,
# flattens it onto a single line, and writes all documents (one per line,
# in index order) to data/corpus/<dataset>.txt.
dataset = '20ng'

docs = []
# `with` closes the index file even when a listed document cannot be read.
with open('data/' + dataset + '.txt', 'r') as f:
    lines = f.readlines()

for line in lines:
    # Drop the line terminator first: when a line has no tab, the first
    # field would otherwise end with '\n' and name a non-existent file.
    line = line.rstrip('\n')
    if not line:
        # Blank lines (typically a trailing one) carry no document path.
        continue
    temp = line.split("\t")
    with open(temp[0], 'r') as doc_file:
        doc_content = doc_file.read()
    print(temp[0], doc_content)
    # Every document must occupy exactly one line of the corpus file.
    doc_content = doc_content.replace('\n', ' ')
    docs.append(doc_content)

corpus_str = '\n'.join(docs)

with open('data/corpus/' + dataset + '.txt', 'w') as f:
    f.write(corpus_str)
# Disabled code: the triple-quoted string below is a bare expression
# statement that is never assigned or used, so nothing in it is executed.
# It holds two alternative preprocessing snippets:
#
#   1. "datasets from PTE paper": reads data/dblp/label_train.txt and
#      data/dblp/label_test.txt and writes data/dblp.txt, one line per
#      label line in the form "<doc_id>\t<train|test>\t<label line>",
#      with doc_id counting up from 0 across both files.
#   2. "TREC, R8, R52, WebKB" (dataset = 'R52'): reads
#      data/<dataset>/train.txt and data/<dataset>/test.txt, splits every
#      line at its first tab into label and content, writes the
#      "<doc_id>\t<train|test>\t<label>" lines to data/<dataset>.txt and
#      the contents to data/corpus/<dataset>.txt.
#
# NOTE(review): in snippet 2 the file data/<dataset>.txt is written twice
# with the same doc_list_str; doc_name_list_str is built but never used.
# NOTE(review): the loop bodies inside the string carry no indentation in
# this copy, so they need re-indenting before the code is re-enabled.
'''
# datasets from PTE paper
f = open('data/dblp/label_train.txt', 'r')
lines = f.readlines()
f.close()
doc_id = 0
doc_name_list = []
for line in lines:
string = str(doc_id) + '\t' + 'train' + '\t' + line.strip()
doc_name_list.append(string)
doc_id += 1
f = open('data/dblp/label_test.txt', 'r')
lines = f.readlines()
f.close()
for line in lines:
string = str(doc_id) + '\t' + 'test' + '\t' + line.strip()
doc_name_list.append(string)
doc_id += 1
doc_list_str = '\n'.join(doc_name_list)
f = open('data/dblp.txt', 'w')
f.write(doc_list_str)
f.close()
# TREC, R8, R52, WebKB
dataset = 'R52'
f = open('data/' + dataset + '/train.txt', 'r')
lines = f.readlines()
f.close()
doc_id = 0
doc_name_list = []
doc_content_list = []
for line in lines:
line = line.strip()
label = line[:line.find('\t')]
content = line[line.find('\t') + 1:]
string = str(doc_id) + '\t' + 'train' + '\t' + label
doc_name_list.append(string)
doc_content_list.append(content)
doc_id += 1
f = open('data/' + dataset + '/test.txt', 'r')
lines = f.readlines()
f.close()
for line in lines:
line = line.strip()
label = line[:line.find('\t')]
content = line[line.find('\t') + 1:]
string = str(doc_id) + '\t' + 'test' + '\t' + label
doc_name_list.append(string)
doc_content_list.append(content)
doc_id += 1
doc_list_str = '\n'.join(doc_name_list)
f = open('data/' + dataset + '.txt', 'w')
f.write(doc_list_str)
f.close()
doc_name_list_str = '\n'.join(doc_name_list)
f = open('data/' + dataset + '.txt', 'w')
f.write(doc_list_str)
f.close()
doc_content_list_str = '\n'.join(doc_content_list)
f = open('data/corpus/' + dataset + '.txt', 'w')
f.write(doc_content_list_str)
f.close()
'''