Dataflow cluster analysis #985

Draft · wants to merge 6 commits into base: main
22 changes: 11 additions & 11 deletions .github/workflows/qa.yaml
@@ -1,6 +1,6 @@
name: "QA"
# Runs on each push and tests flowR for the default configuration
# Depending on the targets etc. this may perform many more checks!
# Depending on the targets, etc. this may perform many more checks!

'on':
  push:
@@ -102,16 +102,16 @@ jobs:
        with:
          node-version: ${{ env.ACTION_NODE_VERSION }}
          registry-url: "https://registry.npmjs.org/"

      - name: "⬇️ Setup R"
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ env.ACTION_R_VERSION }}

      - name: "📦 Install R Packages"
        shell: Rscript {0}
        run: install.packages("xmlparsedata", repos="https://cloud.r-project.org/")

      - name: "🧪 Run the Tests"
        run: bash .github/workflows/scripts/run-flowr-command.sh "test-full -- --forbid-only"

@@ -151,16 +151,16 @@ jobs:
        with:
          node-version: ${{ env.ACTION_NODE_VERSION }}
          registry-url: "https://registry.npmjs.org/"

      - name: "⬇️ Setup R"
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ env.ACTION_R_VERSION }}

      - name: "📦 Install R Packages"
        shell: Rscript {0}
        run: install.packages("xmlparsedata", repos="https://cloud.r-project.org/")

      - name: "⏱️ Run the performance benchmarks"
        run: bash .github/workflows/scripts/run-flowr-command.sh performance-test -- 1 1 "${{ matrix.name }}"

@@ -237,7 +237,7 @@ jobs:
          gh-repository: ${{ github.repository }}
          benchmark-data-dir-path: wiki/stats/benchmark/
          auto-push: false

  deploy-doc:
    needs: [ test, performance-test ]
    name: "🚀 Build and Deploy Documentation (only on main)"
@@ -264,16 +264,16 @@ jobs:
        with:
          node-version: ${{ env.ACTION_NODE_VERSION }}
          registry-url: "https://registry.npmjs.org/"

      - name: "⬇️ Setup R"
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ env.ACTION_R_VERSION }}

      - name: "📦 Install R Packages"
        shell: Rscript {0}
        run: install.packages("xmlparsedata", repos="https://cloud.r-project.org/")

      - name: "🛠️ Build the documentation"
        run: bash .github/workflows/scripts/run-flowr-command.sh doc

33 changes: 33 additions & 0 deletions src/dataflow/cluster.ts
@@ -0,0 +1,33 @@
import type { DataflowGraph } from './graph/graph';
import type { NodeId } from '../r-bridge/lang-4.x/ast/model/processing/node-id';

export type DataflowGraphClusters = DataflowGraphCluster[];
export interface DataflowGraphCluster {
	readonly startNode: NodeId;
	readonly members: readonly NodeId[];
}

export function findAllClusters(graph: DataflowGraph): DataflowGraphClusters {
	const clusters: DataflowGraphClusters = [];
	const notReached = new Set<NodeId>([...graph.vertices(true)].map(([id]) => id));
	/* TODO: probably it is best to start from back to front ? */
	while(notReached.size > 0) {
		const [startNode] = notReached;
		notReached.delete(startNode);
		clusters.push({ startNode: startNode, members: [startNode, ...cluster(graph, startNode, notReached)] });
	}
	return clusters;
}

function cluster(graph: DataflowGraph, from: NodeId, notReached: Set<NodeId>): NodeId[] {
	const edges: NodeId[] = [];
	// TODO do we only need outgoing edges?? help
	for(const [to] of graph.outgoingEdges(from) ?? []) {
		// TODO just deleting these is insufficient, examples like: edge(0, 1) + edge(1, 0)
		if(notReached.delete(to)) {
			edges.push(to);
			edges.push(...cluster(graph, to, notReached));
		}
	}
	return edges;
}
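
The TODOs above hint that following only outgoing edges can split vertices which are connected solely through a shared edge target (e.g. `edge(0, 2)` and `edge(1, 2)`): once the first traversal claims `2`, the cluster started at `1` can no longer reach it. Below is a minimal sketch of a direction-agnostic variant; it assumes `DataflowGraph` exposes an `ingoingEdges(id)` accessor mirroring `outgoingEdges(id)`, which is an assumption about the API surface and not something this PR adds:

```ts
// Hedged sketch, not part of this PR: traverse edges in both directions so that
// vertices which only share a common target still land in the same cluster.
// Assumption: graph.ingoingEdges(id) exists and, like outgoingEdges(id), yields
// iterable [neighborId, edge] pairs (or undefined if the vertex has none).
import type { DataflowGraph } from './graph/graph';
import type { NodeId } from '../r-bridge/lang-4.x/ast/model/processing/node-id';

function clusterUndirected(graph: DataflowGraph, from: NodeId, notReached: Set<NodeId>): NodeId[] {
	const members: NodeId[] = [];
	const neighbors: NodeId[] = [
		...[...graph.outgoingEdges(from) ?? []].map(([target]) => target),
		...[...graph.ingoingEdges(from) ?? []].map(([source]) => source)
	];
	for(const neighbor of neighbors) {
		// only claim vertices that no other cluster has reached yet
		if(notReached.delete(neighbor)) {
			members.push(neighbor);
			members.push(...clusterUndirected(graph, neighbor, notReached));
		}
	}
	return members;
}
```

The sketch keeps the recursive shape of the current draft for comparability; an iterative worklist would avoid deep call stacks on large graphs.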
86 changes: 86 additions & 0 deletions test/functionality/dataflow/graph/cluster-tests.ts
@@ -0,0 +1,86 @@
import type { DataflowGraph } from '../../../../src/dataflow/graph/graph';
import type { DataflowGraphClusters } from '../../../../src/dataflow/cluster';
import { findAllClusters } from '../../../../src/dataflow/cluster';
import { assert } from 'chai';
import { emptyGraph } from '../../_helper/dataflow/dataflowgraph-builder';
import type { SingleSlicingCriterion } from '../../../../src/slicing/criterion/parse';
import { PipelineExecutor } from '../../../../src/core/pipeline-executor';
import { DEFAULT_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines';
import { requestFromInput } from '../../../../src/r-bridge/retriever';
import { deterministicCountingIdGenerator } from '../../../../src/r-bridge/lang-4.x/ast/model/processing/decorate';
import { withShell } from '../../_helper/shell';
import { slicingCriterionToId } from '../../../../src/slicing/criterion/parse';
import type { NodeId } from '../../../../src/r-bridge/lang-4.x/ast/model/processing/node-id';

describe('Graph Clustering', () => {
	describe('Simple Graph Tests', () => {
		function test(name: string, graph: DataflowGraph, expected: DataflowGraphClusters): void {
			it(name, () => compareClusters(findAllClusters(graph), expected));
		}

		test('empty', emptyGraph(), []);
		test('single vertex', emptyGraph().use(0, 'x'), [{ startNode: 0, members: [0] }]);
		test('single edge', emptyGraph().use(0, 'x').use(1, 'y').reads(0, 1), [{ startNode: 0, members: [0, 1] }]);
		test('two single-edge',
			emptyGraph().use(0, 'x').use(1, 'y').reads(0, 1).use(2, 'z').use(3, 'w').reads(2, 3),
			[{ startNode: 0, members: [0, 1] }, { startNode: 2, members: [2, 3] }]);
	});

	describe('Code Snippets', withShell(shell => {
		function test(name: string, code: string, clusters: readonly SingleSlicingCriterion[][]): void {
			it(name, async() => {
				const info = await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, {
					shell,
					request: requestFromInput(code),
					getId: deterministicCountingIdGenerator(0)
				}).allRemainingSteps();

				const graph = info.dataflow.graph;

				// resolve all criteria
				const resolved = clusters.map(c => ({
					startNode: '',
					members: c.map(s => slicingCriterionToId(s, graph.idMap ?? info.normalize.idMap))
				}));
				const actual = findAllClusters(graph);
				compareClusters(actual, resolved);
			});
		}

		test('assignment', 'x <- 3', [['1:1', '1:3', '1:6']]);
		test('two independent assignments', 'x <- 3\ny <- 4', [['1:1', '1:3', '1:6'], ['2:1', '2:3', '2:6']]);
		test('with a print call', 'x <- 3\nprint(x)', [['1:1', '1:3', '1:6', '2:1', '2:7']]);
		test('late join of clusters', 'x <- 3\ny <- 4\nprint(x + y)', [['1:1', '1:3', '1:6', '2:1', '2:3', '2:6', '3:1', '3:7', '3:9', '3:11']]);
		test('contain call target', 'y <- 42\nf <- function(x) { x * y }\nf(2)\nf(3)', [['1:1', '1:3', '1:6', '2:1', '2:3', '2:6', '2:15', '2:18', '2:20', '2:22', '2:24', '3:1', '3:3', '4:1', '4:3']]);
		test('some odd ducklings', 'y <- 42\nz <- 5\nf <- function(x) { x * y }\nf(2)\nprint(z)\nf(3)\nu', [
			['1:1', '1:3', '1:6', '3:1', '3:3', '3:6', '3:15', '3:18', '3:20', '3:22', '3:24', '4:1', '4:3', '6:1', '6:3'], /* call as before */
			['2:1', '2:3', '2:6', '5:1', '5:7'], /* print & z */
			['7:1'] /* u */
		]);
	}));
});

function compareClusters(actual: DataflowGraphClusters, expected: DataflowGraphClusters): void {
	actual = normalizeClusters(actual);
	expected = normalizeClusters(expected);

	assert.equal(actual.length, expected.length, `Different number of clusters: ${JSON.stringify(actual)} vs. wanted: ${JSON.stringify(expected)}`);
	for(let i = 0; i < actual.length; i++) {
		assert.equal(actual[i].members.length, expected[i].members.length, `Member amounts of cluster ${i} differ: ${actual[i].members.toString()} vs ${expected[i].members.toString()}`);
		for(let m = 0; m < actual[i].members.length; m++) {
			assert.equal(actual[i].members[m], expected[i].members[m], `Member ${actual[i].members[m]} of cluster ${i} differs`);
		}
	}

	function compareIds(a: NodeId | undefined, b: NodeId | undefined): number {
		return String(a ?? '').localeCompare(String(b ?? ''));
	}

	function normalizeClusters(clusters: DataflowGraphClusters): DataflowGraphClusters {
		/* sort the clusters as well as the members within each cluster */
		return clusters.map(c => ({
			startNode: c.startNode,
			members: [...c.members].sort(compareIds)
		})).sort((a, b) => compareIds(a.members[0], b.members[0]));
	}
}
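
For orientation, here is a minimal sketch of how `findAllClusters` could be driven outside the test harness, reusing the same pipeline pieces the tests above import. The `RShell` import path, its `close()` method, and the relative import paths are assumptions about the surrounding flowR setup rather than part of this PR:

```ts
import { PipelineExecutor } from './src/core/pipeline-executor';
import { DEFAULT_DATAFLOW_PIPELINE } from './src/core/steps/pipeline/default-pipelines';
import { requestFromInput } from './src/r-bridge/retriever';
import { RShell } from './src/r-bridge/shell';
import { findAllClusters } from './src/dataflow/cluster';

async function printClusters(code: string): Promise<void> {
	const shell = new RShell();
	try {
		// run parsing, normalization, and dataflow analysis on the given R code
		const info = await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, {
			shell,
			request: requestFromInput(code)
		}).allRemainingSteps();
		// every cluster lists the node ids that are transitively connected in the dataflow graph
		for(const cluster of findAllClusters(info.dataflow.graph)) {
			console.log(`cluster starting at ${cluster.startNode}: ${cluster.members.join(', ')}`);
		}
	} finally {
		shell.close();
	}
}

// e.g. 'x <- 3\nprint(x)' should report a single cluster, matching the 'with a print call' test above
void printClusters('x <- 3\nprint(x)');
```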