Add additional ":lineage" command and message (#915)

* feat: first attempt at the new getLineage command * feat: walk additional edges for determining the lineage * feat: store lineage IDs in a set * feat: prevent infinite recursion when determining the lineage IDs * feat-fix: fixed lineage network request * feat-fix: get the dfg and ast from the pipeline in the correct way * doc: added lineage request doc * feat-fix: add filetoken to error messages Co-authored-by: Florian Sihler <florian.sihler@uni-ulm.de> * feat-fix: updated comment for better clarity * feat-fix: more guards, more better * feat-fix: clearer function interface * feat-fix: removed unused import No I didn't push with --no-verify * feat-fix: removed console.log Co-authored-by: Florian Sihler <florian.sihler@uni-ulm.de> * feat-fix: criteria resolve now only requires an idmap * test(lineage): basic test setup * lint-fix: handle linter issues and empty test labels * refactor(lineage): remove redundant promise * doc(lineage): refine wiki entries --------- Co-authored-by: Florian Sihler <florian.sihler@uni-ulm.de>
flowr-analysis · Aug 30, 2024 · 429eef3 · 429eef3 · github-actions · Aug 30, 2024
1 parent 7e27718
commit 429eef3
Show file tree

Hide file tree

Showing 9 changed files with 270 additions and 18 deletions.
diff --git a/src/cli/repl/commands/commands.ts b/src/cli/repl/commands/commands.ts
@@ -13,6 +13,7 @@ import { italic , bold } from '../../../util/ansi'
 import { splitAtEscapeSensitive } from '../../../util/args'
 import { guard } from '../../../util/assert'
 import { scripts } from '../../common/scripts-info'
+import { getLineageCommand } from './lineage'
 
 function printHelpForScript(script: [string, ReplCommand], f: OutputFormatter): string {
 	const base = `  ${bold(padCmd(':' + script[0]), f)}${script[1].description}`
@@ -66,7 +67,8 @@ const _commands: Record<string, ReplCommand> = {
 	'dataflow':     dataflowCommand,
 	'dataflow*':    dataflowStarCommand,
 	'controlflow':  controlflowCommand,
-	'controlflow*': controlflowStarCommand
+	'controlflow*': controlflowStarCommand,
+	'lineage':      getLineageCommand
 }
 let commandsInitialized = false
 

diff --git a/src/cli/repl/commands/lineage.ts b/src/cli/repl/commands/lineage.ts
@@ -0,0 +1,79 @@
+import type { ReplCommand } from './main'
+import { PipelineExecutor } from '../../../core/pipeline-executor'
+import { DEFAULT_DATAFLOW_PIPELINE } from '../../../core/steps/pipeline/default-pipelines'
+import type { RShell } from '../../../r-bridge/shell'
+import { requestFromInput } from '../../../r-bridge/retriever'
+import type { SingleSlicingCriterion } from '../../../slicing/criterion/parse'
+import { slicingCriterionToId } from '../../../slicing/criterion/parse'
+import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id'
+import type { OutgoingEdges } from '../../../dataflow/graph/graph'
+import type { DataflowGraphEdge } from '../../../dataflow/graph/edge'
+import { edgeIncludesType, EdgeType } from '../../../dataflow/graph/edge'
+import type { DataflowInformation } from '../../../dataflow/info'
+import type { NormalizedAst } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'
+import { guard } from '../../../util/assert'
+
+/**
+ * Splits a string into the part before the given index and the part starting at it.
+ *
+ * @param str - The string to split
+ * @param idx - Index of the first character of the second part. It is handed to `String.prototype.slice`
+ *              unchanged, so a negative value counts from the end of the string.
+ * @returns The pair `[str.slice(0, idx), str.slice(idx)]`
+ */
+function splitAt(str: string, idx: number): [string, string] {
+	return [str.slice(0, idx), str.slice(idx)]
+}
+
+/**
+ * Executes all remaining steps of the default dataflow pipeline for the given input.
+ *
+ * @param shell         - The R shell handed to the pipeline executor
+ * @param remainingLine - The input to analyze; surrounding whitespace is trimmed before it is turned into a request
+ * @returns Whatever `allRemainingSteps` of the pipeline executor resolves to
+ */
+async function getDfg(shell: RShell, remainingLine: string) {
+	const request = requestFromInput(remainingLine.trim())
+	const executor = new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, { shell, request })
+	return await executor.allRemainingSteps()
+}
+
+/**
+ * Checks an edge against the edge types that are followed when collecting a lineage
+ * (`DefinedBy`, `DefinedByOnCall`, `Returns` and `Reads`).
+ *
+ * @param edge - The dataflow edge to check
+ * @returns The result of `edgeIncludesType` for the combined type mask and the types of the edge
+ */
+function filterRelevantEdges(edge: DataflowGraphEdge) {
+	const lineageEdgeTypes = EdgeType.DefinedBy | EdgeType.DefinedByOnCall | EdgeType.Returns | EdgeType.Reads
+	return edgeIncludesType(lineageEdgeTypes, edge.types)
+}
+
+/**
+ * Appends every outgoing edge accepted by `filterRelevantEdges` to the end of the queue,
+ * keeping the iteration order of `outgoingEdges`.
+ *
+ * @param queue         - The work list to extend (modified in place)
+ * @param outgoingEdges - The `[target, edge]` entries to select from
+ */
+function pushRelevantEdges(queue: [NodeId, DataflowGraphEdge][], outgoingEdges: OutgoingEdges) {
+	for(const entry of outgoingEdges) {
+		if(filterRelevantEdges(entry[1])) {
+			queue.push(entry)
+		}
+	}
+}
+
+/**
+ * Get the lineage of a node in the dataflow graph.
+ *
+ * Starting at the vertex the criterion resolves to, this transitively follows all outgoing edges
+ * accepted by `filterRelevantEdges` and collects the id of every vertex reached that way
+ * (the start vertex included). Ids that were already collected are not expanded again,
+ * so cycles in the graph do not cause endless iteration.
+ *
+ * @param criterion - The criterion to get the lineage of
+ * @param ast       - The normalized AST (only its id map is used to resolve the criterion)
+ * @param dfg       - The dataflow information whose graph is walked
+ * @returns The lineage of the node represented as a set of node ids
+ */
+export function getLineage(criterion: SingleSlicingCriterion, { idMap } : NormalizedAst, dfg: DataflowInformation): Set<NodeId> {
+	const src = dfg.graph.get(slicingCriterionToId(criterion, idMap))
+	guard(src !== undefined, 'The ID pointed to by the criterion does not exist in the dataflow graph')
+	const [vertex, outgoingEdges] = src
+	const result: Set<NodeId> = new Set([vertex.id])
+	const edgeQueue: [NodeId, DataflowGraphEdge][] = []
+	pushRelevantEdges(edgeQueue, outgoingEdges)
+
+	// The array iterator re-reads the length on every step, so edges appended inside the loop
+	// are visited as well. This keeps the first-in-first-out order without `shift()`,
+	// which would re-index the whole array on every dequeue.
+	for(const [target] of edgeQueue) {
+		if(result.has(target)) {
+			continue
+		}
+
+		result.add(target)
+
+		const targetEdges = dfg.graph.outgoingEdges(target)
+		if(targetEdges !== undefined) {
+			pushRelevantEdges(edgeQueue, targetEdges)
+		}
+	}
+
+	return result
+}
+
+/**
+ * REPL command that prints the lineage of a slicing criterion, one node id per line.
+ *
+ * The remaining line is expected to be a slicing criterion, followed by a space and the R code to analyze.
+ */
+export const getLineageCommand: ReplCommand = {
+	description:  'Get the lineage of an R object',
+	usageExample: ':lineage',
+	aliases:      ['lin'],
+	script:       false,
+	fn:           async(output, shell, remainingLine) => {
+		// the first space separates the slicing criterion from the R code
+		const separator = remainingLine.indexOf(' ')
+		// without a space `indexOf` yields -1, which would make `splitAt` cut the last character off
+		// the criterion and analyze that single character as the R code
+		guard(separator >= 0, 'Expected a slicing criterion followed by a space and the R code to analyze')
+		const [criterion, rest] = splitAt(remainingLine, separator)
+		const { dataflow: dfg, normalize: ast } = await getDfg(shell, rest)
+		const lineageIds = getLineage(criterion as SingleSlicingCriterion, ast, dfg)
+		output.stdout([...lineageIds].join('\n'))
+	}
+}
diff --git a/src/cli/repl/server/connection.ts b/src/cli/repl/server/connection.ts
@@ -35,6 +35,10 @@ import fs from 'fs'
 import type { RParseRequests } from '../../../r-bridge/retriever'
 import { autoSelectLibrary } from '../../../reconstruct/auto-select/auto-select-defaults'
 import { makeMagicCommentHandler } from '../../../reconstruct/auto-select/magic-comments'
+import type { LineageRequestMessage, LineageResponseMessage } from './messages/lineage'
+import { requestLineageMessage } from './messages/lineage'
+import { getLineage } from '../commands/lineage'
+import { guard } from '../../../util/assert'
 
 /**
  * Each connection handles a single client, answering to its requests.
@@ -92,6 +96,9 @@ export class FlowRServerConnection {
 			case 'request-repl-execution':
 				this.handleRepl(request.message as ExecuteRequestMessage)
 				break
+			case 'request-lineage':
+				this.handleLineageRequest(request.message as LineageRequestMessage)
+				break
 			default:
 				sendMessage<FlowrErrorMessage>(this.socket, {
 					id:     request.message.id,
@@ -277,6 +284,38 @@ export class FlowRServerConnection {
 		})
 	}
 
+	/**
+	 * Handles a lineage request: validates the message against `requestLineageMessage`, looks up the
+	 * stored analysis for the given file token, and sends the lineage of the requested criterion back
+	 * over the socket. Validation errors are passed on to `answerForValidationError`; an unknown file
+	 * token is answered with a non-fatal error message.
+	 */
+	private handleLineageRequest(base: LineageRequestMessage) {
+		const validation = validateMessage(base, requestLineageMessage)
+		if(validation.type === 'error') {
+			answerForValidationError(this.socket, validation, base.id)
+			return
+		}
+
+		const { id, filetoken, criterion } = validation.message
+		this.logger.info(`[${this.name}] Received lineage request for criterion ${criterion}`)
+
+		const analyzedFile = this.fileMap.get(filetoken)
+		if(!analyzedFile) {
+			sendMessage<FlowrErrorMessage>(this.socket, {
+				id,
+				type:   'error',
+				fatal:  false,
+				reason: `The file token ${filetoken} has never been analyzed.`
+			})
+			return
+		}
+
+		// both results are required by `getLineage`, so their presence is asserted first
+		const results = analyzedFile.pipeline.getResults(true)
+		const dfg = results.dataflow
+		const ast = results.normalize
+		guard(dfg !== undefined, `Dataflow graph must be present (request: ${filetoken})`)
+		guard(ast !== undefined, `AST must be present (request: ${filetoken})`)
+		sendMessage<LineageResponseMessage>(this.socket, {
+			type:    'response-lineage',
+			id,
+			lineage: [...getLineage(criterion, ast, dfg)]
+		})
+	}
 }
 
 export function sanitizeAnalysisResults(results: Partial<PipelineOutput<typeof DEFAULT_SLICING_PIPELINE>>): DeepPartial<PipelineOutput<typeof DEFAULT_SLICING_PIPELINE>> {

diff --git a/src/cli/repl/server/messages/lineage.ts b/src/cli/repl/server/messages/lineage.ts
@@ -0,0 +1,28 @@
+import type { SingleSlicingCriterion } from '../../../../slicing/criterion/parse'
+import type { IdMessageBase, MessageDefinition } from './messages'
+import type { NodeId } from '../../../../r-bridge/lang-4.x/ast/model/processing/node-id'
+import Joi from 'joi'
+
+/**
+ * Message a client sends to request the lineage of a slicing criterion.
+ * The server answers it with a message of type `response-lineage`, or with an error message.
+ */
+export interface LineageRequestMessage extends IdMessageBase {
+	type:      'request-lineage',
+	/** The {@link FileAnalysisRequestMessage#filetoken} of the file/data */
+	filetoken: string,
+	/** The criterion to start the lineage from */
+	criterion: SingleSlicingCriterion,
+}
+
+/**
+ * Message definition for {@link LineageRequestMessage}: the message type together with the Joi schema
+ * used to validate incoming lineage requests.
+ */
+export const requestLineageMessage: MessageDefinition<LineageRequestMessage> = {
+	type:   'request-lineage',
+	schema: Joi.object({
+		type:      Joi.string().valid('request-lineage').required(),
+		// the id is the only field a request may omit
+		id:        Joi.string().optional(),
+		filetoken: Joi.string().required(),
+		// the schema only checks that the criterion is a string, not its format
+		criterion: Joi.string().required()
+	})
+}
+
+/**
+ * Message sent in response to a lineage request, carrying the node ids that form the lineage.
+ */
+export interface LineageResponseMessage extends IdMessageBase {
+	type:    'response-lineage',
+	/** The lineage of the given criterion. With this being the representation of a set, there is no guarantee about order. */
+	lineage: NodeId[]
+}
diff --git a/src/cli/repl/server/messages/messages.ts b/src/cli/repl/server/messages/messages.ts
@@ -9,6 +9,7 @@ import type { FileAnalysisRequestMessage, FileAnalysisResponseMessageJson } from
 import type { ExecuteEndMessage, ExecuteIntermediateResponseMessage, ExecuteRequestMessage } from './repl'
 import type { SliceRequestMessage, SliceResponseMessage } from './slice'
 import type { FlowrErrorMessage } from './error'
+import type { LineageRequestMessage, LineageResponseMessage } from './lineage'
 
 /**
  * If you send a message it must *not* contain a newline but the message must be terminated by a newline.
@@ -51,4 +52,6 @@ export type FlowrMessage = FlowrHelloResponseMessage
                          | ExecuteEndMessage
                          | SliceRequestMessage
                          | SliceResponseMessage
+                         | LineageRequestMessage
+                         | LineageResponseMessage
                          | FlowrErrorMessage
diff --git a/test/functionality/_helper/label.ts b/test/functionality/_helper/label.ts
@@ -20,7 +20,7 @@ const uniqueTestId = (() => {
 })()
 
 
-const TestLabelContexts = ['parse', 'desugar', 'dataflow', 'other', 'slice', 'output'] as const
+const TestLabelContexts = ['parse', 'desugar', 'dataflow', 'other', 'slice', 'output', 'lineage'] as const
 export type TestLabelContext = typeof TestLabelContexts[number]
 
 export interface TestLabel extends MergeableRecord {

diff --git a/test/functionality/lineage/lineage.spec.ts b/test/functionality/lineage/lineage.spec.ts
@@ -0,0 +1,34 @@
+import { withShell } from '../_helper/shell'
+import { PipelineExecutor } from '../../../src/core/pipeline-executor'
+import { DEFAULT_DATAFLOW_PIPELINE } from '../../../src/core/steps/pipeline/default-pipelines'
+import { requestFromInput } from '../../../src/r-bridge/retriever'
+import type { SingleSlicingCriterion } from '../../../src/slicing/criterion/parse'
+import { getLineage } from '../../../src/cli/repl/commands/lineage'
+import type { TestLabel } from '../_helper/label'
+import { decorateLabelContext, label } from '../_helper/label'
+import type { NodeId } from '../../../src/r-bridge/lang-4.x/ast/model/processing/node-id'
+import { assert } from 'chai'
+import { setEquals } from '../../../src/util/set'
+import { OperatorDatabase } from '../../../src/r-bridge/lang-4.x/ast/model/operators'
+
+describe('Test lineage', withShell(shell => {
+
+	/**
+	 * Registers a test case that runs the default dataflow pipeline on `request`, computes the lineage of
+	 * `criterion`, and compares it with `expected` as a set (the order of the ids is irrelevant).
+	 */
+	function assertLineage(title: string | TestLabel, request: string, criterion: SingleSlicingCriterion, expected: NodeId[]) {
+		const effectiveName = decorateLabelContext(title, ['lineage'])
+
+		return it(effectiveName, async() => {
+			const result = await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, {
+				shell,
+				request: requestFromInput(request)
+			}).allRemainingSteps()
+			const lineageIds = getLineage(criterion, result.normalize, result.dataflow)
+			assert.isTrue(setEquals(lineageIds, new Set(expected)), `Expected ${JSON.stringify(expected)} but got ${JSON.stringify([...lineageIds])}`)
+		})
+	}
+
+	// a chain of three assignments, with the lineage requested for `a` in the third line
+	// NOTE(review): the expected ids 0-8 presumably cover every node of the three assignments — confirm against the normalizer's id assignment
+	assertLineage(label('The demo lineage', [
+		'name-normal', ...OperatorDatabase['<-'].capabilities, 'newlines'
+	]), `c <- x
+b <- c
+a <- b`, '3@a', [0, 1, 2, 3, 4, 5, 6, 7, 8])
+}))
diff --git a/test/functionality/slicing/static-program-slices/calls-tests.ts b/test/functionality/slicing/static-program-slices/calls-tests.ts
@@ -506,8 +506,11 @@ print(x)`, ['4@x'], 'x <- 3\nx'/*, { expectedOutput: '[1] 2' }*/)
 \`<-\` <- \`*\`
 x <- 3
 print(y = x)`, ['4@y'], 'y=x')
-		assertSliced(label('redefine in local scope', []),
-			shell, `f <- function() {
+		assertSliced(label('redefine in local scope', [
+			'newlines', ...OperatorDatabase['<-'].capabilities, ...OperatorDatabase['*'].capabilities,
+			'numbers', 'name-escaped', 'call-normal', 'function-definitions', 'redefinition-of-built-in-functions-primitives'
+		]),
+		shell, `f <- function() {
    x <- 2
    \`<-\` <- \`*\`
    x <- 3
@@ -565,7 +568,7 @@ foo(.x = f(3))`)
 			assertSliced(label('nested definition in unknown foo', capabilities), shell,
 				'x <- function() { 3 }\nfoo(.x = function(y) { c(X = x()) })', ['2@foo'],
 				'x <- function() { 3 }\nfoo(.x = function(y) { c(X = x()) })')
-			assertSliced(label('nested definition in unknown foo with reference', []), shell,
+			assertSliced(label('nested definition in unknown foo with reference', capabilities), shell,
 				'x <- function() { 3 }\ng = function(y) { c(X = x()) }\nfoo(.x = g)', ['3@foo'],
 				'x <- function() { 3 }\ng = function(y) { c(X = x()) }\nfoo(.x = g)')
 		})
Benchmark suite	Current: `429eef3`	Previous: `4fb2496`	Ratio
`Retrieve AST from R code`	`243.5575759090909` ms (`107.86543297512463`)	`238.1723235909091` ms (`96.71402866840735`)	`1.02`
`Normalize R AST`	`19.803893954545455` ms (`34.45843540804731`)	`19.4989345` ms (`33.57276767882534`)	`1.02`
`Produce dataflow information`	`38.19301140909091` ms (`82.05917244485015`)	`38.792485772727275` ms (`84.2875165952748`)	`0.98`
`Total per-file`	`803.4673899090909` ms (`1529.7960877051385`)	`799.2110945` ms (`1544.5301502222158`)	`1.01`
`Static slicing`	`1.2176619293341087` ms (`1.1120394000143086`)	`1.1726790800999163` ms (`1.0086901141796194`)	`1.04`
`Reconstruct code`	`0.25070812632072736` ms (`0.19966603837215122`)	`0.24335608198030453` ms (`0.18573862888815007`)	`1.03`
`Total per-slice`	`1.4848747471150194` ms (`1.1618629904438125`)	`1.4331761674751269` ms (`1.0536924338527542`)	`1.04`
`failed to reconstruct/re-parse`	`0` #	`0` #	`1`
`times hit threshold`	`0` #	`0` #	`1`
`reduction (characters)`	`0.7869724682442361` #	`0.786663222057468` #	`1.00`
`reduction (normalized tokens)`	`0.7640044233283717` #	`0.763664433957929` #	`1.00`
`memory (df-graph)`	`147.58589311079547` KiB (`359.2574768951678`)	`147.66770241477272` KiB (`359.55136525995476`)	`1.00`
Benchmark suite	Current: `429eef3`	Previous: `4fb2496`	Ratio
`Retrieve AST from R code`	`250.14352703999998` ms (`46.22166925640989`)	`240.9999736` ms (`45.52546037334287`)	`1.04`
`Normalize R AST`	`23.11401898` ms (`17.75707281573565`)	`22.044827100000003` ms (`17.4420619617221`)	`1.05`
`Produce dataflow information`	`71.96789064000001` ms (`86.92611070053505`)	`68.41181266` ms (`83.25046712198441`)	`1.05`
`Total per-file`	`3589.1298162199996` ms (`7924.056376688097`)	`3603.0243807399997` ms (`7958.676569737224`)	`1.00`
`Static slicing`	`7.240201848248687` ms (`20.33921754263181`)	`7.403007833130669` ms (`20.923205633042343`)	`0.98`
`Reconstruct code`	`0.27129692298715036` ms (`0.17211426023828647`)	`0.24681178006363166` ms (`0.15169934993997963`)	`1.10`
`Total per-slice`	`7.520366205749907` ms (`20.369225019376092`)	`7.657637977886095` ms (`20.95063234618626`)	`0.98`
`failed to reconstruct/re-parse`	`0` #	`0` #	`1`
`times hit threshold`	`0` #	`0` #	`1`
`reduction (characters)`	`0.9181372100742089` #	`0.9214445180065712` #	`1.00`
`reduction (normalized tokens)`	`0.884931018000862` #	`0.88847659105633` #	`1.00`
`memory (df-graph)`	`142.5410546875` KiB (`146.7038638918548`)	`142.5463671875` KiB (`146.6995040110581`)	`1.00`