Skip to content

Commit

Permalink
Merge pull request #496 from RDFLib/fix_canonicalization
Browse files Browse the repository at this point in the history
fixed #494 canonicalization sometimes collapses BNodes
  • Loading branch information
joernhees committed Aug 10, 2015
2 parents fbc29da + b8df01f commit 9ba4547
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 7 deletions.
9 changes: 6 additions & 3 deletions rdflib/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,9 @@ def stringify(x):
return unicode(x)
if isinstance(color, Node):
return stringify(color)
value = sum(map(self.hashfunc, ' '.join([stringify(x) for x in color])))
value = 0
for triple in color:
value += self.hashfunc(' '.join([stringify(x) for x in triple]))
val = u"%x" % value
self._hash_cache[color] = val
return val
Expand Down Expand Up @@ -290,7 +292,7 @@ def _initial_color(self):

def _individuate(self, color, individual):
new_color = list(color.color)
new_color.append((len(color.nodes)))
new_color.append((len(color.nodes),))

color.nodes.remove(individual)
c = Color([individual], self.hashfunc, tuple(new_color),
Expand Down Expand Up @@ -320,6 +322,7 @@ def _refine(self, coloring, sequence):
sequence = sequence[:si] + colors + sequence[si+1:]
except ValueError:
sequence = colors[1:] + sequence

return coloring

@_runtime("to_hash_runtime")
Expand Down Expand Up @@ -407,7 +410,6 @@ def _traces(self, coloring, stats=None, depth=[0]):
stats['prunings'] += 1
discrete = [x for x in best if self._discrete(x)]
if len(discrete) == 0:
very_best = None
best_score = None
best_depth = None
for coloring in best:
Expand All @@ -434,6 +436,7 @@ def canonical_triples(self, stats=None):
if stats is not None:
stats['initial_coloring_runtime'] = _total_seconds(datetime.now() - start_coloring)
stats['initial_color_count'] = len(coloring)

if not self._discrete(coloring):
depth = [0]
coloring = self._traces(coloring, stats=stats, depth=depth)
Expand Down
109 changes: 105 additions & 4 deletions test/test_canonicalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ def negative_graph_match_test():
True
],
[ unicode('''@prefix : <http://example.org/ns#> .
:linear_two_step_symmetry_start :related [ :related [ :related :linear_two_step_symmatry_end]],
:linear_two_step_symmetry_start :related [ :related [ :related :linear_two_step_symmatry_end]],
[ :related [ :related :linear_two_step_symmatry_end]].'''),
unicode('''@prefix : <http://example.org/ns#> .
:linear_two_step_symmetry_start :related [ :related [ :related :linear_two_step_symmatry_end]],
:linear_two_step_symmetry_start :related [ :related [ :related :linear_two_step_symmatry_end]],
[ :related [ :related :linear_two_step_symmatry_end]].'''),
True
],
Expand Down Expand Up @@ -68,7 +68,7 @@ def negative_graph_match_test():
].'''),
False
],
# This test fails because the algorithm purposefully breaks the symmetry of symetric
# This test fails because the algorithm purposefully breaks the symmetry of symmetric
[ unicode('''@prefix : <http://example.org/ns#> .
_:a :rel [
:rel [
Expand Down Expand Up @@ -144,8 +144,109 @@ def negative_graph_match_test():
def fn(rdf1, rdf2, identical):
digest1 = get_digest_value(rdf1,"text/turtle")
digest2 = get_digest_value(rdf2,"text/turtle")
print rdf1
print digest1
print rdf2
print digest2
assert (digest1 == digest2) == identical
for inputs in testInputs:
yield fn, inputs[0], inputs[1], inputs[2]
yield fn, inputs[0], inputs[1], inputs[2]

def test_issue494_collapsing_bnodes():
    """Test for https://github.com/RDFLib/rdflib/issues/494 collapsing BNodes

    Builds a graph of five reified ``rdf:Statement`` blank nodes, runs it
    through ``to_canonical_graph`` and asserts that the triple count, the
    node count, the number of statements and the sorted list of per-node
    subject degrees are all unchanged by canonicalization.
    """
    source = URIRef(u'source')
    target = URIRef(u'target')

    # One row per reified statement:
    # (statement bnode id, rdf:object, rdf:predicate, rdf:subject)
    statements = [
        ('Na1a8fbcf755f41c1b5728f326be50994',
         source, BNode('vcb3'), BNode('vcb2')),
        ('Na713b02f320d409c806ff0190db324f4',
         target, BNode('vcb0'), source),
        ('Ndb804ba690a64b3dbb9063c68d5e3550',
         BNode('vr0KcS4'), BNode('vrby3JV'), source),
        ('Ndfc47fb1cd2d4382bcb8d5eb7835a636',
         source, BNode('vcb5'), target),
        ('Nec6864ef180843838aa9805bac835c98',
         source, BNode('vcb4'), source),
    ]

    triples = []
    for bnode_id, obj, pred, subj in statements:
        stmt = BNode(bnode_id)
        triples.extend([
            (stmt, RDF['object'], obj),
            (stmt, RDF['predicate'], pred),
            (stmt, RDF['subject'], subj),
            (stmt, RDF['type'], RDF['Statement']),
        ])

    g = Graph()
    g += triples

    def report_degrees(graph):
        """Print size/degree diagnostics and return the sorted degrees.

        The degree of a node here is the number of triples in ``graph``
        that have the node as their subject. The returned list covers
        every node in ``graph.all_nodes()``, sorted in descending order.
        """
        # Single-argument print(...) emits the same text under Python 2's
        # print statement and Python 3's print function.
        print('graph length: %d, nodes: %d'
              % (len(graph), len(graph.all_nodes())))
        print('triple_bnode degrees:')
        for triple_bnode in graph.subjects(RDF['type'], RDF['Statement']):
            print(len(list(graph.triples((triple_bnode, None, None)))))
        print('all node degrees:')
        degrees = sorted(
            [len(list(graph.triples((node, None, None))))
             for node in graph.all_nodes()],
            reverse=True)
        print(degrees)
        return degrees

    g_node_degs = report_degrees(g)

    cg = to_canonical_graph(g)
    cg_node_degs = report_degrees(cg)

    assert len(g) == len(cg), \
        'canonicalization changed number of triples in graph'
    assert len(g.all_nodes()) == len(cg.all_nodes()), \
        'canonicalization changed number of nodes in graph'
    assert len(list(g.subjects(RDF['type'], RDF['Statement']))) == \
        len(list(cg.subjects(RDF['type'], RDF['Statement']))), \
        'canonicalization changed number of statements'
    assert g_node_degs == cg_node_degs, \
        'canonicalization changed node degrees'

0 comments on commit 9ba4547

Please sign in to comment.