From c0960e252a90c26b969e97018a23833e68bf258c Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 28 Jul 2021 11:48:16 +0200 Subject: [PATCH 1/2] Fix base case for cycle detection When multiple Blank Nodes referenced each other, they would erroneously be marked as being part of a cycle and hence not a chain, leading to the chain detection algorithm not working and the blank nodes being listed at the top level of the graph with a non-deterministic identifier. This fixes that; if those Blank Nodes for a chain without cycles, they will now be represented as a tree without needing to generate identifiers for them. --- src/rdf.test.ts | 2 +- src/rdfjs.internal.ts | 6 +- src/resource/solidDataset.test.ts | 109 ++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 3 deletions(-) diff --git a/src/rdf.test.ts b/src/rdf.test.ts index 3c6bb24493..ab399e9a58 100644 --- a/src/rdf.test.ts +++ b/src/rdf.test.ts @@ -587,7 +587,7 @@ describe("fromRdfJsDataset", () => { ); }); - it("can parse chained Blank Nodes with a single lnk that end in a dangling Blank Node", () => { + it("can parse chained Blank Nodes with a single link that end in a dangling Blank Node", () => { const mockDataset: ImmutableDataset = { type: "Dataset", graphs: { default: {} }, diff --git a/src/rdfjs.internal.ts b/src/rdfjs.internal.ts index e9e08d5022..4606fc4477 100644 --- a/src/rdfjs.internal.ts +++ b/src/rdfjs.internal.ts @@ -502,9 +502,11 @@ function getCycleBlankNodes( ) .map((quad) => quad.object as RdfJs.BlankNode); - // If no Blank Nodes are connected to `currentNode`, we're done: + // If no Blank Nodes are connected to `currentNode`, and `currentNode` is not + // part of a cycle, we're done; the currently traversed Nodes do not form a + // cycle: if (blankNodeObjects.length === 0) { - return traversedBlankNodes; + return []; } // Store that we've traversed `currentNode`, then move on to all the Blank diff --git a/src/resource/solidDataset.test.ts b/src/resource/solidDataset.test.ts index 33d5220309..73fa448af4 100644 --- a/src/resource/solidDataset.test.ts +++ b/src/resource/solidDataset.test.ts @@ -167,6 +167,115 @@ describe("responseToSolidDataset", () => { }); }); + it("does not include non-deterministic identifiers when it detects non-cyclic chains of Blank Nodes", async () => { + const turtle = ` + @prefix : <#>. + @prefix foaf: . + @prefix vcard: . + @prefix acl: . + + <> a foaf:PersonalProfileDocument; foaf:maker :me; foaf:primaryTopic :me. + + :me + a foaf:Person; + vcard:fn "Vincent"; + acl:trustedApp + [ + acl:mode acl:Append, acl:Control, acl:Read, acl:Write; + acl:origin + ], + [ + acl:mode acl:Append, acl:Control, acl:Read, acl:Write; + acl:origin + ]. + `; + + const response = new Response(turtle, { + headers: { + "Content-Type": "text/turtle", + }, + }); + jest + .spyOn(response, "url", "get") + .mockReturnValue("https://some.pod/resource"); + const solidDataset = await responseToSolidDataset(response); + + expect(solidDataset).toStrictEqual({ + graphs: { + default: { + "https://some.pod/resource": { + predicates: { + "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": { + namedNodes: [ + "http://xmlns.com/foaf/0.1/PersonalProfileDocument", + ], + }, + "http://xmlns.com/foaf/0.1/maker": { + namedNodes: ["https://some.pod/resource#me"], + }, + "http://xmlns.com/foaf/0.1/primaryTopic": { + namedNodes: ["https://some.pod/resource#me"], + }, + }, + type: "Subject", + url: "https://some.pod/resource", + }, + "https://some.pod/resource#me": { + predicates: { + "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": { + namedNodes: ["http://xmlns.com/foaf/0.1/Person"], + }, + "http://www.w3.org/2006/vcard/ns#fn": { + literals: { + "http://www.w3.org/2001/XMLSchema#string": ["Vincent"], + }, + }, + "http://www.w3.org/ns/auth/acl#trustedApp": { + blankNodes: [ + { + "http://www.w3.org/ns/auth/acl#mode": { + namedNodes: [ + "http://www.w3.org/ns/auth/acl#Append", + "http://www.w3.org/ns/auth/acl#Control", + "http://www.w3.org/ns/auth/acl#Read", + "http://www.w3.org/ns/auth/acl#Write", + ], + }, + "http://www.w3.org/ns/auth/acl#origin": { + namedNodes: ["http://localhost:3000"], + }, + }, + { + "http://www.w3.org/ns/auth/acl#mode": { + namedNodes: [ + "http://www.w3.org/ns/auth/acl#Append", + "http://www.w3.org/ns/auth/acl#Control", + "http://www.w3.org/ns/auth/acl#Read", + "http://www.w3.org/ns/auth/acl#Write", + ], + }, + "http://www.w3.org/ns/auth/acl#origin": { + namedNodes: ["https://penny.vincenttunru.com"], + }, + }, + ], + }, + }, + type: "Subject", + url: "https://some.pod/resource#me", + }, + }, + }, + internal_resourceInfo: { + contentType: "text/turtle", + isRawData: false, + linkedResources: {}, + sourceIri: "https://some.pod/resource", + }, + type: "Dataset", + }); + }); + it("throws a meaningful error when the server returned a 403", async () => { const response = new Response("Not allowed", { status: 403 }); jest From 61d0c89a16a39be0e54af07b6874fc2d4331a2d7 Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 28 Jul 2021 11:50:41 +0200 Subject: [PATCH 2/2] Disable chain detection for many Blank Nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chain detection algorithm is relatively expensive. Since it is only an optimisation to make our Dataset data structure deterministic in more cases, it makes sense to avoid running it when it takes a lot of time — the different representations are equivalent. --- src/resource/solidDataset.test.ts | 40 +++++++++++++++++++++++++++++++ src/resource/solidDataset.ts | 12 ++++++++-- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/src/resource/solidDataset.test.ts b/src/resource/solidDataset.test.ts index 73fa448af4..0727899ef0 100644 --- a/src/resource/solidDataset.test.ts +++ b/src/resource/solidDataset.test.ts @@ -276,6 +276,46 @@ describe("responseToSolidDataset", () => { }); }); + it("does not attempt to detect chains when there are many Blank Nodes, to avoid performance bottlenecks", async () => { + function getChainedBlankNode(iteration: number): string { + if (iteration === 1000) { + return ` "Base case"`; + } + return ` [${getChainedBlankNode( + iteration + 1 + )}]`; + } + const turtle = ` + @prefix : <#>. + @prefix vcard: . + + :me vcard:fn [${getChainedBlankNode(0)}]. + `; + + const response = new Response(turtle, { + headers: { + "Content-Type": "text/turtle", + }, + }); + jest + .spyOn(response, "url", "get") + .mockReturnValue("https://some.pod/resource"); + + const t0 = performance.now(); + const solidDataset = await responseToSolidDataset(response); + const t1 = performance.now(); + + // Parsing a document with over 1000 statements will always be somewhat slow + // (hence allowing it to take a second), but if it attempts to detect + // chains, it will take on the order of >20 seconds. + expect(t1 - t0).toBeLessThan(1000); + // Blank Nodes should be listed explicitly, rather than as properties on + // https://some.pod/resource#me: + expect(Object.keys(solidDataset.graphs.default)).not.toStrictEqual([ + "https://some.pod/resource#me", + ]); + }); + it("throws a meaningful error when the server returned a 403", async () => { const response = new Response("Not allowed", { status: 403 }); jest diff --git a/src/resource/solidDataset.ts b/src/resource/solidDataset.ts index 70caa52f76..7d3c9a37cf 100644 --- a/src/resource/solidDataset.ts +++ b/src/resource/solidDataset.ts @@ -233,11 +233,19 @@ export async function responseToSolidDataset( solidDataset = addRdfJsQuadToDataset(solidDataset, quad); } }); - parser.onComplete(() => { + parser.onComplete(async () => { + // If a Resource contains more than this number of Blank Nodes, + // we consider the detection of chains (O(n^2), I think) to be too + // expensive, and just incorporate them as regular Blank Nodes with + // non-deterministic, ad-hoc identifiers into the SolidDataset: + const maxBlankNodesToDetectChainsFor = 20; // Some Blank Nodes only serve to use a set of Quads as the Object for a // single Subject. Those Quads will be added to the SolidDataset when // their Subject's Blank Node is encountered in the Object position. - const chainBlankNodes = getChainBlankNodes(quadsWithBlankNodes); + const chainBlankNodes = + quadsWithBlankNodes.length <= maxBlankNodesToDetectChainsFor + ? getChainBlankNodes(quadsWithBlankNodes) + : []; const quadsWithoutChainBlankNodeSubjects = quadsWithBlankNodes.filter( (quad) => chainBlankNodes.every(