# resolve_qualifiers.py
import csv
import json
import re
#####################################################
#                     CONSTANTS                     #
#####################################################
DUMP_SPECIFICATION = "wikidata_clean"
PATH_TO_INPUT_FILE = "dumps/" + DUMP_SPECIFICATION + ".nt"
PATH_TO_UNIQUE_PREDICATES_DUMP = "tmp_dumps/" + DUMP_SPECIFICATION + "_unique_predicates.csv"
PATH_TO_QUALIFIER_DUMP = "tmp_dumps/" + DUMP_SPECIFICATION + "_qualifiers_resolved.csv"
PATH_TO_OUTPUT_FILE = "dumps/" + DUMP_SPECIFICATION + ".csv"
TYPE_PATTERN = re.compile(r"Q[0-9]+-[0-9]+")
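# With DUMP_SPECIFICATION = "wikidata_clean", the pipeline reads
# dumps/wikidata_clean.nt and writes dumps/wikidata_clean.csv, keeping
# intermediate files in tmp_dumps/.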
#####################################################
#                     FUNCTIONS                     #
#####################################################
def normalize_object(kg_object):
    """Strip the URL prefix from Wikidata items and the datatype/language tag from literals."""
    if "http://www.wikidata.org" in kg_object:
        # Wikidata URL: keep only the identifier after the last "/"
        return kg_object.rsplit("/", 1)[1]
    if '"^^' in kg_object:
        # typed literal: drop the datatype IRI, keep the quoted value
        kg_object = kg_object.rsplit('"^^', 1)[0]
        kg_object += '"'
    if '"@en' in kg_object:
        # language-tagged literal: drop the "@en" tag, keep the quoted value
        kg_object = kg_object.replace('"@en', "")
        kg_object += '"'
    return kg_object

def normalize_wikidata_url(kg_item):
    """Keep only the identifier after the last "/" of a Wikidata URL."""
    return kg_item.rsplit("/", 1)[1]
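
# Minimal sanity checks for the two helpers above (the input values are
# illustrative, not taken from the actual dump):
assert normalize_wikidata_url("http://www.wikidata.org/entity/Q42") == "Q42"
assert normalize_object("http://www.wikidata.org/entity/Q5") == "Q5"
assert normalize_object('"Douglas Adams"@en') == '"Douglas Adams"'
assert normalize_object('"1952"^^http://www.w3.org/2001/XMLSchema#gYear') == '"1952"'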

def create_unique_predicates():
    """First pass: assign a unique running index to every predicate occurrence
    (and to every P31 type object) and collect statement/dummy nodes."""
    predicate_nodes = dict()
    type_nodes = dict()
    intermediate_nodes = dict()
    with open(PATH_TO_INPUT_FILE, "r") as fp_in:
        with open(PATH_TO_UNIQUE_PREDICATES_DUMP, "w") as fp_out:
            line = fp_in.readline()
            rows = ""
            count = -1
            while line:
                current_line = line
                line = fp_in.readline()
                s, p, o = current_line.replace(">", "").replace("<", "").split(" ", 2)
                # remove " .\n" at the end of each triple from the object
                o = o[:-3]
                # remove the prefix URL from each kg item
                s, p, o = normalize_wikidata_url(s), normalize_wikidata_url(p), normalize_object(o)
                o = o.strip()
                # create unique type nodes
                if p == "P31":
                    if not type_nodes.get(o):
                        type_nodes[o] = 1
                        type_index = 0
                    else:
                        type_index = type_nodes[o]
                        type_nodes[o] += 1
                    o = o + "-" + str(type_index)
                # create unique predicate nodes
                if not predicate_nodes.get(p):
                    predicate_nodes[p] = 1
                    predicate_index = 0
                else:
                    predicate_index = predicate_nodes[p]
                    predicate_nodes[p] += 1
                p = p + "-" + str(predicate_index)
                # check for statements
                if "-" in o:
                    if o.startswith(s) or o.startswith("q" + s[1:]):
                        # store the dummy node and its unique predicate; the line itself is dropped
                        intermediate_nodes[o] = p
                        continue
                    if o.startswith("p" + s[1:]):
                        continue
                count += 1
                rows += str(s) + "," + str(p) + "," + str(o) + "\n"
                # flush the buffer every 1000 rows
                if count == 1000:
                    count = 0
                    fp_out.write(rows)
                    rows = ""
            fp_out.write(rows)
    with open("tmp_dumps/qualifier_intermediate_nodes.json", "w") as json_file:
        json.dump(intermediate_nodes, json_file)
    return
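
# Sketch of the first pass on hypothetical input triples (indices are assigned
# in order of appearance, so the exact suffixes depend on the dump):
#
#   Q42 P31 Q5       ->  Q42,P31-0,Q5-0
#   Q1  P31 Q5       ->  Q1,P31-1,Q5-1
#   Q42 P69 Q691283  ->  Q42,P69-0,Q691283
#
# A statement object such as "q42-abc123" starts with its subject and is not
# written out; it is kept in qualifier_intermediate_nodes.json instead.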

def resolve_qualifiers():
    """Second pass: replace statement/dummy nodes with the predicate they belong to,
    turning qualifier statements into regular triples."""
    triples = dict()
    with open("tmp_dumps/qualifier_intermediate_nodes.json", "r") as json_file:
        intermediate_nodes = json.load(json_file)
    with open(PATH_TO_UNIQUE_PREDICATES_DUMP, "r") as fp_in:
        with open(PATH_TO_QUALIFIER_DUMP, "w") as fp_out:
            line = fp_in.readline()
            rows = ""
            count = -1
            while line:
                current_line = line
                line = fp_in.readline()
                s, p, o = current_line.split(",", 2)
                o = o.strip()
                # check for statement
                if "-" in s:
                    # look up the corresponding predicate in the dictionary; "00" is returned if the key is missing
                    p_val = intermediate_nodes.get(s, "00")
                    # many dummy nodes appear as subjects but not as objects in the dump,
                    # due to earlier pruning (see csv.log for a list of them)
                    if p_val == "00":
                        continue
                    # if the qualifier predicate and the predicate from the dict share the
                    # same base id: create the corresponding direct triple
                    if p.split("-")[0] == p_val.split("-")[0]:
                        s = s.split("-")[0]
                        if s.startswith("q"):
                            s = "Q" + s[1:]
                        p = p_val
                        # store subject, object, predicate in a new dictionary
                        # (later used to prune direct triples)
                        if s not in triples:
                            triples[s] = dict()
                        triples[s][o] = p_val
                    # otherwise: use the predicate from the dict as the subject of this triple
                    else:
                        s = p_val
                count += 1
                rows += str(s) + "," + str(p) + "," + str(o) + "\n"
                # flush the buffer every 1000 rows
                if count == 1000:
                    count = 0
                    fp_out.write(rows)
                    rows = ""
            fp_out.write(rows)
    with open("tmp_dumps/qualifier_triples.json", "w") as json_file:
        json.dump(triples, json_file)
    return
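
# Sketch of the second pass, assuming a hypothetical statement node "q42-abc123"
# that the first pass mapped to the unique predicate "P69-0":
#
#   q42-abc123,P69-3,Q691283  ->  Q42,P69-0,Q691283    (same base predicate:
#                                  collapsed into a direct triple)
#   q42-abc123,P580-0,"1971"  ->  P69-0,P580-0,"1971"  (qualifier: the unique
#                                  predicate becomes the subject)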

def prune_duplicate_lines():
    """Third pass: drop direct triples that duplicate a triple reconstructed
    from a qualifier statement in the previous pass."""
    continue_flag = False
    with open("tmp_dumps/qualifier_triples.json", "r") as json_file:
        triples = json.load(json_file)
    with open(PATH_TO_QUALIFIER_DUMP, "r") as fp_in:
        with open(PATH_TO_OUTPUT_FILE, "w") as fp_out:
            line = fp_in.readline()
            rows = ""
            count = -1
            while line:
                current_line = line
                line = fp_in.readline()
                s, p, o = current_line.split(",", 2)
                o = o.strip()
                if s in triples:
                    for o_s in triples[s]:
                        if o == o_s:
                            # get the stored predicate
                            p_s = triples[s][o_s]
                            # check for a direct predicate that can be pruned
                            if p != p_s and p.split("-")[0] == p_s.split("-")[0]:
                                continue_flag = True
                        # P31 objects carry a type index, so compare them on their base id
                        elif p.split("-")[0] == "P31":
                            if o_s.split("-")[0] == o.split("-")[0]:
                                p_s = triples[s][o_s]
                                if p_s != p and p.split("-")[0] == p_s.split("-")[0]:
                                    continue_flag = True
                if continue_flag:
                    continue_flag = False
                    continue
                count += 1
                rows += str(s) + "," + str(p) + "," + str(o) + "\n"
                # flush the buffer every 1000 rows
                if count == 1000:
                    count = 0
                    fp_out.write(rows)
                    rows = ""
            fp_out.write(rows)
    return
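
# Sketch of the pruning: if the second pass stored
# triples["Q42"]["Q691283"] = "P69-0", then a direct duplicate such as
#
#   Q42,P69-3,Q691283
#
# (same base predicate "P69", different index) is dropped, while the resolved
# line "Q42,P69-0,Q691283" itself is kept (the subject/object values here are
# hypothetical).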

#####################################################
#                       MAIN                        #
#####################################################
create_unique_predicates()
resolve_qualifiers()
prune_duplicate_lines()
print("done")