Skip to content

Commit

Permalink
Merge pull request #726 from joernhees/fix_canonicalization
Browse files Browse the repository at this point in the history
fixes #725, huge thanks to @jimmccusker
  • Loading branch information
joernhees authored Apr 9, 2017
2 parents af23007 + d80b313 commit ee2c08b
Show file tree
Hide file tree
Showing 2 changed files with 211 additions and 18 deletions.
28 changes: 22 additions & 6 deletions rdflib/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,10 @@ def __init__(self, nodes, hashfunc, color=(), hash_cache=None):
self.hashfunc = hashfunc
self._hash_color = None

def __str__(self):
nodes, color = self.key()
return "Color %s (%s nodes)" % (color, nodes)

def key(self):
return (len(self.nodes), self.hash_color())

Expand Down Expand Up @@ -277,7 +281,7 @@ def _initial_color(self):
others = set()
self._neighbors = defaultdict(set)
for s, p, o in self.graph:
nodes = set([s, o])
nodes = set([s, p, o])
b = set([x for x in nodes if isinstance(x, BNode)])
if len(b) > 0:
others |= nodes - b
Expand All @@ -286,6 +290,9 @@ def _initial_color(self):
self._neighbors[s].add(o)
if isinstance(o, BNode):
self._neighbors[o].add(s)
if isinstance(p, BNode):
self._neighbors[p].add(s)
self._neighbors[p].add(p)
if len(bnodes) > 0:
return [
Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache)
Expand Down Expand Up @@ -317,7 +324,7 @@ def _refine(self, coloring, sequence):
while len(sequence) > 0 and not self._discrete(coloring):
W = sequence.pop()
for c in coloring[:]:
if len(c.nodes) > 1:
if len(c.nodes) > 1 or isinstance(c.nodes[0], BNode):
colors = sorted(c.distinguish(W, self.graph),
key=lambda x: x.key(),
reverse=True)
Expand All @@ -328,8 +335,17 @@ def _refine(self, coloring, sequence):
sequence = sequence[:si] + colors + sequence[si+1:]
except ValueError:
sequence = colors[1:] + sequence

return coloring
combined_colors = []
combined_color_map = dict()
for color in coloring:
color_hash = color.hash_color()
# This is a hash collision, and be combined into a single color for individuation.
if color_hash in combined_color_map:
combined_color_map[color_hash].nodes.extend(color.nodes)
else:
combined_colors.append(color)
combined_color_map[color_hash] = color
return combined_colors

@_runtime("to_hash_runtime")
def to_hash(self, stats=None):
Expand Down Expand Up @@ -515,14 +531,14 @@ def isomorphic(graph1, graph2):



def to_canonical_graph(g1):
def to_canonical_graph(g1, stats=None):
"""Creates a canonical, read-only graph.
Creates a canonical, read-only graph where all bnode id:s are based on
deterministical SHA-256 checksums, correlated with the graph contents.
"""
graph = Graph()
graph += _TripleCanonicalizer(g1).canonical_triples()
graph += _TripleCanonicalizer(g1).canonical_triples(stats=stats)
return ReadOnlyGraphAggregate([graph])


Expand Down
201 changes: 189 additions & 12 deletions test/test_canonicalization.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from collections import Counter
from rdflib import Graph, RDF, BNode, URIRef, Namespace, ConjunctiveGraph, Literal
from rdflib.compare import to_isomorphic, to_canonical_graph

import rdflib
from rdflib.plugins.memory import IOMemory

from six import text_type
from io import StringIO

Expand Down Expand Up @@ -220,28 +224,28 @@ def test_issue494_collapsing_bnodes():
RDF['Statement']),
]

print('graph length: %d, nodes: %d' % (len(g), len(g.all_nodes())))
print('triple_bnode degrees:')
for triple_bnode in g.subjects(RDF['type'], RDF['Statement']):
print(len(list(g.triples([triple_bnode, None, None]))))
print('all node degrees:')
# print('graph length: %d, nodes: %d' % (len(g), len(g.all_nodes())))
# print('triple_bnode degrees:')
# for triple_bnode in g.subjects(RDF['type'], RDF['Statement']):
# print(len(list(g.triples([triple_bnode, None, None]))))
# print('all node degrees:')
g_node_degs = sorted([
len(list(g.triples([node, None, None])))
for node in g.all_nodes()
], reverse=True)
print(g_node_degs)
# print(g_node_degs)

cg = to_canonical_graph(g)
print('graph length: %d, nodes: %d' % (len(cg), len(cg.all_nodes())))
print('triple_bnode degrees:')
for triple_bnode in cg.subjects(RDF['type'], RDF['Statement']):
print(len(list(cg.triples([triple_bnode, None, None]))))
print('all node degrees:')
# print('graph length: %d, nodes: %d' % (len(cg), len(cg.all_nodes())))
# print('triple_bnode degrees:')
# for triple_bnode in cg.subjects(RDF['type'], RDF['Statement']):
# print(len(list(cg.triples([triple_bnode, None, None]))))
# print('all node degrees:')
cg_node_degs = sorted([
len(list(cg.triples([node, None, None])))
for node in cg.all_nodes()
], reverse=True)
print(cg_node_degs)
# print(cg_node_degs)

assert len(g) == len(cg), \
'canonicalization changed number of triples in graph'
Expand All @@ -253,6 +257,24 @@ def test_issue494_collapsing_bnodes():
assert g_node_degs == cg_node_degs, \
'canonicalization changed node degrees'

# counter for subject, predicate and object nodes
g_pos_counts = Counter(), Counter(), Counter()
for t in g:
for i, node in enumerate(t):
g_pos_counts[i][t] += 1
g_count_signature = [sorted(c.values()) for c in g_pos_counts]

cg = to_canonical_graph(g)
cg_pos_counts = Counter(), Counter(), Counter()
for t in cg:
for i, node in enumerate(t):
cg_pos_counts[i][t] += 1
cg_count_signature = [sorted(c.values()) for c in cg_pos_counts]

assert g_count_signature == cg_count_signature, \
'canonicalization changed node position counts'


def test_issue682_signing_named_graphs():
ns = Namespace("http://love.com#")

Expand Down Expand Up @@ -282,3 +304,158 @@ def test_issue682_signing_named_graphs():
assert len(ig) == len(g)
assert len(igmary) < len(ig)
assert ig.graph_digest() != igmary.graph_digest()


def test_issue725_collapsing_bnodes_2():
g = Graph()
g += [
(BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
BNode('v2')),
(BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
BNode('v0')),
(BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
URIRef(u'urn:gp_learner:fixed_var:target')),
(BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
(BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
BNode('v1')),
(BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
BNode('v0')),
(BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
URIRef(u'urn:gp_learner:fixed_var:target')),
(BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
(BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
BNode('v5')),
(BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
BNode('v4')),
(BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
URIRef(u'urn:gp_learner:fixed_var:target')),
(BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
(BNode('N86ac7ca781f546ae939b8963895f672e'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
URIRef(u'urn:gp_learner:fixed_var:source')),
(BNode('N86ac7ca781f546ae939b8963895f672e'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
BNode('v0')),
(BNode('N86ac7ca781f546ae939b8963895f672e'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
URIRef(u'urn:gp_learner:fixed_var:target')),
(BNode('N86ac7ca781f546ae939b8963895f672e'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
(BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
BNode('v1')),
(BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
BNode('v3')),
(BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
URIRef(u'urn:gp_learner:fixed_var:target')),
(BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement'))
]

turtle = '''
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
[] a rdf:Statement ;
rdf:object [ ] ;
rdf:predicate _:v0 ;
rdf:subject <urn:gp_learner:fixed_var:target> .
[] a rdf:Statement ;
rdf:object _:v1 ;
rdf:predicate _:v0 ;
rdf:subject <urn:gp_learner:fixed_var:target> .
[] a rdf:Statement ;
rdf:object [ ] ;
rdf:predicate [ ] ;
rdf:subject <urn:gp_learner:fixed_var:target> .
[] a rdf:Statement ;
rdf:object <urn:gp_learner:fixed_var:source> ;
rdf:predicate _:v0 ;
rdf:subject <urn:gp_learner:fixed_var:target> .
[] a rdf:Statement ;
rdf:object _:v1 ;
rdf:predicate [ ] ;
rdf:subject <urn:gp_learner:fixed_var:target> .'''

# g = Graph()
# g.parse(data=turtle, format='turtle')

stats = {}
cg = rdflib.compare.to_canonical_graph(g, stats=stats)

# print ('graph g length: %d, nodes: %d' % (len(g), len(g.all_nodes())))
# print ('triple_bnode degrees:')
# for triple_bnode in g.subjects(rdflib.RDF['type'], rdflib.RDF['Statement']):
# print (len(list(g.triples([triple_bnode, None, None]))))
# print ('all node out-degrees:')
# print (sorted(
# [len(list(g.triples([node, None, None]))) for node in g.all_nodes()]))
# print ('all node in-degrees:')
# print (sorted(
# [len(list(g.triples([None, None, node]))) for node in g.all_nodes()]))
# print(g.serialize(format='n3'))
#
# print ('graph cg length: %d, nodes: %d' % (len(cg), len(cg.all_nodes())))
# print ('triple_bnode degrees:')
# for triple_bnode in cg.subjects(rdflib.RDF['type'],
# rdflib.RDF['Statement']):
# print (len(list(cg.triples([triple_bnode, None, None]))))
# print ('all node out-degrees:')
# print (sorted(
# [len(list(cg.triples([node, None, None]))) for node in cg.all_nodes()]))
# print ('all node in-degrees:')
# print (sorted(
# [len(list(cg.triples([None, None, node]))) for node in cg.all_nodes()]))
# print(cg.serialize(format='n3'))

assert (len(g.all_nodes()) == len(cg.all_nodes()))

cg = to_canonical_graph(g)
assert len(g) == len(cg), \
'canonicalization changed number of triples in graph'
assert len(g.all_nodes()) == len(cg.all_nodes()), \
'canonicalization changed number of nodes in graph'
assert len(list(g.subjects(RDF['type'], RDF['Statement']))) == \
len(list(cg.subjects(RDF['type'], RDF['Statement']))), \
'canonicalization changed number of statements'

# counter for subject, predicate and object nodes
g_pos_counts = Counter(), Counter(), Counter()
for t in g:
for i, node in enumerate(t):
g_pos_counts[i][t] += 1
g_count_signature = [sorted(c.values()) for c in g_pos_counts]

cg_pos_counts = Counter(), Counter(), Counter()
for t in cg:
for i, node in enumerate(t):
cg_pos_counts[i][t] += 1
cg_count_signature = [sorted(c.values()) for c in cg_pos_counts]

assert g_count_signature == cg_count_signature, \
'canonicalization changed node position counts'

0 comments on commit ee2c08b

Please sign in to comment.