-
Notifications
You must be signed in to change notification settings - Fork 29
/
MakeOpenKP.py
77 lines (68 loc) · 2.57 KB
/
MakeOpenKP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from random import shuffle
import json
# Input files. Only the first tab-separated column of each row is read here;
# it is collected as the row's URL.
preds = 'kp80k_blingkpe.tsv'
docs = 'OpenKPDocs.tsv'


def _first_column(path):
    """Return the first tab-separated field of every line in *path*.

    Each line is stripped of surrounding whitespace before it is split, and
    the result keeps file order and duplicates.
    """
    with open(path, 'r') as handle:
        return [row.strip().split('\t')[0] for row in handle]


# URLs listed in the predictions file.
testUrls = _first_column(preds)
# URLs of every row in the documents file.
allUrls = _first_column(docs)
# De-duplicate both URL collections.
testSet = set(testUrls)
allSet = set(allUrls)

# Prediction URLs that have no row in the documents file.
excludeTest = testSet.difference(allSet)

# Held-out URLs are the prediction URLs that do have a document; every other
# document URL becomes training data.
testUrls = list(testSet.difference(excludeTest))
trainUrls = list(allSet.difference(testSet))

# The second figure is len(testSet), i.e. it still counts the URLs in
# excludeTest. NOTE(review): the message calls it "the dev set" -- confirm
# the wording is intended.
print("There are {} unique urls with {} in the dev set".format(len(allSet), len(testSet)))

# Shuffle in place: train first, then the held-out URLs.
shuffle(trainUrls)
shuffle(testUrls)

# Cut the held-out URLs just below the midpoint: dev receives one fewer than
# half of them and eval receives the rest.
index = len(testUrls) // 2 - 1
devUrls, evalUrls = testUrls[:index], testUrls[index:]

# Disabled code, kept inside a bare string literal so it never executes: it
# would dump the three URL lists to trainURLs.tsv, devURLs.tsv and
# evalURLs.tsv, one URL per line. The text inside the string has lost its
# indentation and must be re-indented before it is re-enabled.
"""
with open('trainURLs.tsv','w') as w:
for url in trainUrls:
w.write("{}\n".format(url))
with open('devURLs.tsv','w') as w:
for url in devUrls:
w.write("{}\n".format(url))
with open('evalURLs.tsv','w') as w:
for url in evalUrls:
w.write("{}\n".format(url))
"""
# Convert the documents TSV into JSON-lines files, one JSON object per row.
#
# Tab-separated columns of each input row:
#   0 URL, 1 CleanBody Tokens, 2 VDOM, 3 AllPropertyIDX, 4 KeyPhrases, 5 KP_DL
# Only columns 0, 1, 2 and 4 are used; a row with fewer than five columns
# raises IndexError.
#
# Outputs:
#   OpenKPFull.jsonl       - every row whose URL is in allUrls
#   OpenKPTrain.jsonl      - rows whose URL is in trainUrls
#   OpenKPDev.jsonl        - rows whose URL is in devUrls
#   OpenKPEval.jsonl       - rows whose URL is in evalUrls
#   OpenKPEvalPublic.jsonl - the eval rows again, without the 'KeyPhrases' key

# Membership is tested once per input row, so build each lookup set a single
# time instead of scanning the URL lists on every row.
all_lookup = set(allUrls)
train_lookup = set(trainUrls)
dev_lookup = set(devUrls)
eval_lookup = set(evalUrls)

with open(docs, 'r') as doc_file, \
        open('OpenKPFull.jsonl', 'w') as full_out, \
        open('OpenKPTrain.jsonl', 'w') as train_out, \
        open('OpenKPDev.jsonl', 'w') as dev_out, \
        open('OpenKPEval.jsonl', 'w') as eval_out, \
        open('OpenKPEvalPublic.jsonl', 'w') as eval_public_out:
    for line in doc_file:
        fields = line.strip().split('\t')
        url = fields[0]

        # Drop empty keyphrases. The previous code referenced `kp.pop`
        # without calling it, so nothing was ever removed.
        # Assumes the KeyPhrases column decodes to a JSON list -- TODO confirm.
        keyphrases = [
            phrase for phrase in json.loads(fields[4]) if len(phrase) > 0
        ]

        record = {
            'url': url,
            'text': fields[1],
            'VDOM': fields[2],
            'KeyPhrases': keyphrases,
        }
        serialized = "{}\n".format(json.dumps(record))

        # Independent checks: a row goes to the full file and to whichever
        # split its URL belongs to.
        if url in all_lookup:
            full_out.write(serialized)
        if url in train_lookup:
            train_out.write(serialized)
        if url in dev_lookup:
            dev_out.write(serialized)
        if url in eval_lookup:
            eval_out.write(serialized)
            # The public eval file carries the same record minus the
            # keyphrases.
            del record['KeyPhrases']
            eval_public_out.write('{}\n'.format(json.dumps(record)))