Dataflow cluster analysis #985

Draft · wants to merge 6 commits into base: main
22 changes: 11 additions & 11 deletions .github/workflows/qa.yaml
@@ -1,6 +1,6 @@
name: "QA"
# Runs on each push and tests flowR for the default configuration
# Depending on the targets etc. this may perform many more checks!
# Depending on the targets, etc. this may perform many more checks!

'on':
  push:
@@ -102,16 +102,16 @@ jobs:
        with:
          node-version: ${{ env.ACTION_NODE_VERSION }}
          registry-url: "https://registry.npmjs.org/"

      - name: "⬇️ Setup R"
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ env.ACTION_R_VERSION }}

      - name: "📦 Install R Packages"
        shell: Rscript {0}
        run: install.packages("xmlparsedata", repos="https://cloud.r-project.org/")

      - name: "🧪 Run the Tests"
        run: bash .github/workflows/scripts/run-flowr-command.sh "test-full -- --forbid-only"

@@ -151,16 +151,16 @@ jobs:
        with:
          node-version: ${{ env.ACTION_NODE_VERSION }}
          registry-url: "https://registry.npmjs.org/"

      - name: "⬇️ Setup R"
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ env.ACTION_R_VERSION }}

      - name: "📦 Install R Packages"
        shell: Rscript {0}
        run: install.packages("xmlparsedata", repos="https://cloud.r-project.org/")

      - name: "⏱️ Run the performance benchmarks"
        run: bash .github/workflows/scripts/run-flowr-command.sh performance-test -- 1 1 "${{ matrix.name }}"

@@ -237,7 +237,7 @@ jobs:
          gh-repository: ${{ github.repository }}
          benchmark-data-dir-path: wiki/stats/benchmark/
          auto-push: false

  deploy-doc:
    needs: [ test, performance-test ]
    name: "🚀 Build and Deploy Documentation (only on main)"
@@ -264,16 +264,16 @@ jobs:
        with:
          node-version: ${{ env.ACTION_NODE_VERSION }}
          registry-url: "https://registry.npmjs.org/"

      - name: "⬇️ Setup R"
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ env.ACTION_R_VERSION }}

      - name: "📦 Install R Packages"
        shell: Rscript {0}
        run: install.packages("xmlparsedata", repos="https://cloud.r-project.org/")

      - name: "🛠️ Build the documentation"
        run: bash .github/workflows/scripts/run-flowr-command.sh doc

33 changes: 33 additions & 0 deletions src/dataflow/cluster.ts
@@ -0,0 +1,33 @@
import type { DataflowGraph } from './graph/graph';
import type { NodeId } from '../r-bridge/lang-4.x/ast/model/processing/node-id';

export type DataflowGraphClusters = DataflowGraphCluster[];
export interface DataflowGraphCluster {
	readonly startNode: NodeId;
	readonly members: readonly NodeId[];
}

export function findAllClusters(graph: DataflowGraph): DataflowGraphClusters {
	const clusters: DataflowGraphClusters = [];
	const notReached = new Set<NodeId>([...graph.vertices(true)].map(([id]) => id));
	/* TODO: probably it is best to start from back to front ? */
	while(notReached.size > 0) {
		const [startNode] = notReached;
		notReached.delete(startNode);
		clusters.push({ startNode: startNode, members: [startNode, ...cluster(graph, startNode, notReached)] });
	}
	return clusters;
}

function cluster(graph: DataflowGraph, from: NodeId, notReached: Set<NodeId>): NodeId[] {
	const edges: NodeId[] = [];
	// TODO do we only need outgoing edges?? help
	for(const [to] of graph.outgoingEdges(from) ?? []) {
		// TODO just deleting these is insufficient, examples like: edge(0, 1) + edge(1, 0)
		if(notReached.delete(to)) {
			edges.push(to);
			edges.push(...cluster(graph, to, notReached));
		}
	}
	return edges;
}
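
The TODOs above hint that following only outgoing edges can split vertices which are connected solely through a shared edge target (e.g. `edge(0, 2)` and `edge(1, 2)`): once the first traversal claims `2`, the cluster started at `1` can no longer reach it. Below is a minimal sketch of a direction-agnostic variant; it assumes `DataflowGraph` exposes an `ingoingEdges(id)` accessor mirroring `outgoingEdges(id)`, which is an assumption about the API surface and not something this PR adds:

```ts
// Hedged sketch, not part of this PR: traverse edges in both directions so that
// vertices which only share a common target still land in the same cluster.
// Assumption: graph.ingoingEdges(id) exists and, like outgoingEdges(id), yields
// iterable [neighborId, edge] pairs (or undefined if the vertex has none).
import type { DataflowGraph } from './graph/graph';
import type { NodeId } from '../r-bridge/lang-4.x/ast/model/processing/node-id';

function clusterUndirected(graph: DataflowGraph, from: NodeId, notReached: Set<NodeId>): NodeId[] {
	const members: NodeId[] = [];
	const neighbors: NodeId[] = [
		...[...graph.outgoingEdges(from) ?? []].map(([target]) => target),
		...[...graph.ingoingEdges(from) ?? []].map(([source]) => source)
	];
	for(const neighbor of neighbors) {
		// only claim vertices that no other cluster has reached yet
		if(notReached.delete(neighbor)) {
			members.push(neighbor);
			members.push(...clusterUndirected(graph, neighbor, notReached));
		}
	}
	return members;
}
```

The sketch keeps the recursive shape of the current draft for comparability; an iterative worklist would avoid deep call stacks on large graphs.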
86 changes: 86 additions & 0 deletions test/functionality/dataflow/graph/cluster-tests.ts
@@ -0,0 +1,86 @@
import type { DataflowGraph } from '../../../../src/dataflow/graph/graph';
import type { DataflowGraphClusters } from '../../../../src/dataflow/cluster';
import { findAllClusters } from '../../../../src/dataflow/cluster';
import { assert } from 'chai';
import { emptyGraph } from '../../_helper/dataflow/dataflowgraph-builder';
import type { SingleSlicingCriterion } from '../../../../src/slicing/criterion/parse';
import { PipelineExecutor } from '../../../../src/core/pipeline-executor';
import { DEFAULT_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines';
import { requestFromInput } from '../../../../src/r-bridge/retriever';
import { deterministicCountingIdGenerator } from '../../../../src/r-bridge/lang-4.x/ast/model/processing/decorate';
import { withShell } from '../../_helper/shell';
import { slicingCriterionToId } from '../../../../src/slicing/criterion/parse';
import type { NodeId } from '../../../../src/r-bridge/lang-4.x/ast/model/processing/node-id';

describe('Graph Clustering', () => {
	describe('Simple Graph Tests', () => {
		function test(name: string, graph: DataflowGraph, expected: DataflowGraphClusters): void {
			it(name, () => compareClusters(findAllClusters(graph), expected));
		}

		test('empty', emptyGraph(), []);
		test('single vertex', emptyGraph().use(0, 'x'), [{ startNode: 0, members: [0] }]);
		test('single edge', emptyGraph().use(0, 'x').use(1, 'y').reads(0, 1), [{ startNode: 0, members: [0, 1] }]);
		test('two single-edge',
			emptyGraph().use(0, 'x').use(1, 'y').reads(0, 1).use(2, 'z').use(3, 'w').reads(2, 3),
			[{ startNode: 0, members: [0, 1] }, { startNode: 2, members: [2, 3] }]);
	});

	describe('Code Snippets', withShell(shell => {
		function test(name: string, code: string, clusters: readonly SingleSlicingCriterion[][]): void {
			it(name, async() => {
				const info = await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, {
					shell,
					request: requestFromInput(code),
					getId: deterministicCountingIdGenerator(0)
				}).allRemainingSteps();

				const graph = info.dataflow.graph;

				// resolve all criteria
				const resolved = clusters.map(c => ({
					startNode: '',
					members: c.map(s => slicingCriterionToId(s, graph.idMap ?? info.normalize.idMap))
				}));
				const actual = findAllClusters(graph);
				compareClusters(actual, resolved);
			});
		}

		test('assignment', 'x <- 3', [['1:1', '1:3', '1:6']]);
		test('two independent assignments', 'x <- 3\ny <- 4', [['1:1', '1:3', '1:6'], ['2:1', '2:3', '2:6']]);
		test('with a print call', 'x <- 3\nprint(x)', [['1:1', '1:3', '1:6', '2:1', '2:7']]);
		test('late join of clusters', 'x <- 3\ny <- 4\nprint(x + y)', [['1:1', '1:3', '1:6', '2:1', '2:3', '2:6', '3:1', '3:7', '3:9', '3:11']]);
		test('contain call target', 'y <- 42\nf <- function(x) { x * y }\nf(2)\nf(3)', [['1:1', '1:3', '1:6', '2:1', '2:3', '2:6', '2:15', '2:18', '2:20', '2:22', '2:24', '3:1', '3:3', '4:1', '4:3']]);
		test('some odd ducklings', 'y <- 42\nz <- 5\nf <- function(x) { x * y }\nf(2)\nprint(z)\nf(3)\nu', [
			['1:1', '1:3', '1:6', '3:1', '3:3', '3:6', '3:15', '3:18', '3:20', '3:22', '3:24', '4:1', '4:3', '6:1', '6:3'], /* call as before */
			['2:1', '2:3', '2:6', '5:1', '5:7'], /* print & z */
			['7:1'] /* u */
		]);
	}));
});

function compareClusters(actual: DataflowGraphClusters, expected: DataflowGraphClusters): void {
	actual = normalizeClusters(actual);
	expected = normalizeClusters(expected);

	assert.equal(actual.length, expected.length, `Different number of clusters: ${JSON.stringify(actual)} vs. wanted: ${JSON.stringify(expected)}`);
	for(let i = 0; i < actual.length; i++) {
		assert.equal(actual[i].members.length, expected[i].members.length, `Member amounts of cluster ${i} differ: ${actual[i].members.toString()} vs ${expected[i].members.toString()}`);
		for(let m = 0; m < actual[i].members.length; m++) {
			assert.equal(actual[i].members[m], expected[i].members[m], `Member ${actual[i].members[m]} of cluster ${i} differs`);
		}
	}

	function compareIds(a: NodeId | undefined, b: NodeId | undefined): number {
		return String(a ?? '').localeCompare(String(b ?? ''));
	}

	function normalizeClusters(clusters: DataflowGraphClusters): DataflowGraphClusters {
		/* sort the clusters as well as the members within each cluster */
		return clusters.map(c => ({
			startNode: c.startNode,
			members: [...c.members].sort(compareIds)
		})).sort((a, b) => compareIds(a.members[0], b.members[0]));
	}
}
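
For orientation, here is a minimal sketch of how `findAllClusters` could be driven outside the test harness, reusing the same pipeline pieces the tests above import. The `RShell` import path, its `close()` method, and the relative import paths are assumptions about the surrounding flowR setup rather than part of this PR:

```ts
import { PipelineExecutor } from './src/core/pipeline-executor';
import { DEFAULT_DATAFLOW_PIPELINE } from './src/core/steps/pipeline/default-pipelines';
import { requestFromInput } from './src/r-bridge/retriever';
import { RShell } from './src/r-bridge/shell';
import { findAllClusters } from './src/dataflow/cluster';

async function printClusters(code: string): Promise<void> {
	const shell = new RShell();
	try {
		// run parsing, normalization, and dataflow analysis on the given R code
		const info = await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, {
			shell,
			request: requestFromInput(code)
		}).allRemainingSteps();
		// every cluster lists the node ids that are transitively connected in the dataflow graph
		for(const cluster of findAllClusters(info.dataflow.graph)) {
			console.log(`cluster starting at ${cluster.startNode}: ${cluster.members.join(', ')}`);
		}
	} finally {
		shell.close();
	}
}

// e.g. 'x <- 3\nprint(x)' should report a single cluster, matching the 'with a print call' test above
void printClusters('x <- 3\nprint(x)');
```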