Support Library Loads and Installations as Potential Side Effects (#944)

flowr-analysis · Aug 31, 2024 · 9a55656 · 9a55656 · github-actions · Aug 31, 2024
2 parents 9e75557 + 3f5ac40
commit 9a55656
Show file tree

Hide file tree

Showing 9 changed files with 46 additions and 39 deletions.
diff --git a/src/cli/repl/server/connection.ts b/src/cli/repl/server/connection.ts
@@ -33,12 +33,12 @@ import { DataflowGraph } from '../../../dataflow/graph/graph'
 import * as tmp from 'tmp'
 import fs from 'fs'
 import type { RParseRequests } from '../../../r-bridge/retriever'
-import { autoSelectLibrary } from '../../../reconstruct/auto-select/auto-select-defaults'
 import { makeMagicCommentHandler } from '../../../reconstruct/auto-select/magic-comments'
 import type { LineageRequestMessage, LineageResponseMessage } from './messages/lineage'
 import { requestLineageMessage } from './messages/lineage'
 import { getLineage } from '../commands/lineage'
 import { guard } from '../../../util/assert'
+import { doNotAutoSelect } from '../../../reconstruct/auto-select/auto-select-defaults'
 
 /**
  * Each connection handles a single client, answering to its requests.
@@ -227,7 +227,7 @@ export class FlowRServerConnection {
 
 		fileInformation.pipeline.updateRequest({
 			criterion:    request.criterion,
-			autoSelectIf: request.noMagicComments ? autoSelectLibrary : makeMagicCommentHandler(autoSelectLibrary)
+			autoSelectIf: request.noMagicComments ? doNotAutoSelect : makeMagicCommentHandler(doNotAutoSelect)
 		})
 
 		void fileInformation.pipeline.allRemainingSteps(true).then(results => {

diff --git a/src/cli/slicer-app.ts b/src/cli/slicer-app.ts
@@ -11,8 +11,8 @@ import type { SingleSlicingCriterion, SlicingCriteria } from '../slicing/criteri
 import type { ReconstructionResult } from '../reconstruct/reconstruct'
 import type { NodeId } from '../r-bridge/lang-4.x/ast/model/processing/node-id'
 import { stats2string } from '../benchmark/stats/print'
-import { autoSelectLibrary } from '../reconstruct/auto-select/auto-select-defaults'
 import { makeMagicCommentHandler } from '../reconstruct/auto-select/magic-comments'
+import { doNotAutoSelect } from '../reconstruct/auto-select/auto-select-defaults'
 
 export interface SlicerCliOptions {
 	verbose:             boolean
@@ -48,7 +48,7 @@ async function getSlice() {
 		options['input-is-text']
 			? { request: 'text', content: options.input }
 			: { request: 'file', content: options.input },
-		options['no-magic-comments'] ? autoSelectLibrary : makeMagicCommentHandler(autoSelectLibrary)
+		options['no-magic-comments'] ? doNotAutoSelect : makeMagicCommentHandler(doNotAutoSelect)
 	)
 
 	let mappedSlices: { criterion: SingleSlicingCriterion, id: NodeId }[] = []

diff --git a/src/dataflow/environments/built-in.ts b/src/dataflow/environments/built-in.ts
@@ -186,7 +186,7 @@ registerBuiltInFunctions(true,  ['[', '[['],
 registerBuiltInFunctions(true,  ['$', '@'],                                        processAccess,             { treatIndicesAsString: true }                                               )
 registerBuiltInFunctions(true,  ['if', 'ifelse'],                                  processIfThenElse,         {}                                                                           )
 registerBuiltInFunctions(true,  ['get'],                                           processGet,                {}                                                                           )
-registerBuiltInFunctions(false, ['library'],                                       processLibrary,            {}                                                                           )
+registerBuiltInFunctions(false, ['library', 'require'],                            processLibrary,            {}                                                                           )
 registerBuiltInFunctions(true,  ['<-', '='],                                       processAssignment,         { canBeReplacement: true }                                                   )
 registerBuiltInFunctions(true,  [':=', 'assign'],                                  processAssignment,         {}                                                                           )
 registerBuiltInFunctions(true,  ['delayedAssign'],                                 processAssignment,         { quoteSource: true }                                                        )
@@ -203,6 +203,12 @@ registerBuiltInFunctions(true,  ['repeat'],
 registerBuiltInFunctions(true,  ['while'],                                         processWhileLoop,          {}                                                                           )
 registerBuiltInFunctions(true,  ['options'],                                       defaultBuiltInProcessor,   { hasUnknownSideEffects: true, forceArgs: 'all' as const }                   )
 registerBuiltInFunctions(true,  ['on.exit', 'sys.on.exit'],                        defaultBuiltInProcessor,   { hasUnknownSideEffects: true }                                              )
+/* library and require is handled above */
+registerBuiltInFunctions(true,  ['requireNamespace', 'loadNamespace', 'attachNamespace', 'asNamespace'], defaultBuiltInProcessor, { hasUnknownSideEffects: true }                                         )
+/* downloader and installer functions (R, devtools, BiocManager) */
+registerBuiltInFunctions(true,  ['library.dynam', 'install.packages','install', 'install_github', 'install_gitlab', 'install_bitbucket', 'install_url', 'install_git', 'install_svn', 'install_local', 'install_version', 'update_packages'], defaultBuiltInProcessor, { hasUnknownSideEffects: true }                               )
+/* weird env attachments */
+registerBuiltInFunctions(true,  ['attach'],                                        defaultBuiltInProcessor,   { hasUnknownSideEffects: true }                                              )
 
 /* they are all mapped to `<-` but we separate super assignments */
 registerReplacementFunctions({ makeMaybe: true },  ['<-', '<<-'], '[', '[[', '$', '@', 'names', 'dimnames', 'attributes', 'attr', 'class', 'levels', 'rownames', 'colnames')
diff --git a/src/dataflow/internal/process/functions/call/built-in/built-in-library.ts b/src/dataflow/internal/process/functions/call/built-in/built-in-library.ts
@@ -12,21 +12,21 @@ import { RType } from '../../../../../../r-bridge/lang-4.x/ast/model/type'
 import { wrapArgumentsUnnamed } from '../argument/make-argument'
 
 
-/* we currently do not mark this as an unknown side effect, as we can enable/disable this with a toggle */
 export function processLibrary<OtherInfo>(
 	name: RSymbol<OtherInfo & ParentInformation>,
 	args: readonly RFunctionArgument<OtherInfo & ParentInformation>[],
 	rootId: NodeId,
 	data: DataflowProcessorInformation<OtherInfo & ParentInformation>
 ): DataflowInformation {
+	/* we do not really know what loading the library does and what side effects it causes, hence we mark it as an unknown side effect */
 	if(args.length !== 1) {
 		dataflowLogger.warn(`Currently only one-arg library-likes are allows (for ${name.content}), skipping`)
-		return processKnownFunctionCall({ name, args, rootId, data }).information
+		return processKnownFunctionCall({ name, args, rootId, data, hasUnknownSideEffect: true }).information
 	}
 	const nameToLoad = unpackArgument(args[0])
 	if(nameToLoad === undefined || nameToLoad.type !== RType.Symbol) {
 		dataflowLogger.warn('No library name provided, skipping')
-		return processKnownFunctionCall({ name, args, rootId, data }).information
+		return processKnownFunctionCall({ name, args, rootId, data, hasUnknownSideEffect: true }).information
 	}
 
 	// treat as a function call but convert the first argument to a string
@@ -40,6 +40,8 @@ export function processLibrary<OtherInfo>(
 			str:    nameToLoad.content
 		}
 	}
-
-	return processKnownFunctionCall({ name, args: wrapArgumentsUnnamed([newArg], data.completeAst.idMap), rootId, data }).information
+	return processKnownFunctionCall({
+		name, args:                 wrapArgumentsUnnamed([newArg], data.completeAst.idMap), rootId, data,
+		hasUnknownSideEffect: true
+	}).information
 }
diff --git a/src/dataflow/internal/process/functions/call/known-call-handling.ts b/src/dataflow/internal/process/functions/call/known-call-handling.ts
@@ -16,16 +16,18 @@ import { dataflowLogger } from '../../../../logger'
 import { VertexType } from '../../../../graph/vertex'
 
 export interface ProcessKnownFunctionCallInput<OtherInfo> extends ForceArguments {
-	readonly name:          RSymbol<OtherInfo & ParentInformation>
-	readonly args:          readonly (RNode<OtherInfo & ParentInformation> | RFunctionArgument<OtherInfo & ParentInformation>)[]
-	readonly rootId:        NodeId
-	readonly data:          DataflowProcessorInformation<OtherInfo & ParentInformation>
-	/* should arguments be processed from right to left? This does not affect the order recorded in the call but of the environments */
-	readonly reverseOrder?: boolean
+	readonly name:                  RSymbol<OtherInfo & ParentInformation>
+	readonly args:                  readonly (RNode<OtherInfo & ParentInformation> | RFunctionArgument<OtherInfo & ParentInformation>)[]
+	readonly rootId:                NodeId
+	readonly data:                  DataflowProcessorInformation<OtherInfo & ParentInformation>
+	/** should arguments be processed from right to left? This does not affect the order recorded in the call but of the environments */
+	readonly reverseOrder?:         boolean
 	/** which arguments are to be marked as {@link EdgeType#NonStandardEvaluation|non-standard-evaluation}? */
-	readonly markAsNSE?:    readonly number[]
-	/* allows passing a data processor in-between each argument */
-	readonly patchData?:    (data: DataflowProcessorInformation<OtherInfo & ParentInformation>, arg: number) => DataflowProcessorInformation<OtherInfo & ParentInformation>
+	readonly markAsNSE?:            readonly number[]
+	/** allows passing a data processor in-between each argument */
+	readonly patchData?:            (data: DataflowProcessorInformation<OtherInfo & ParentInformation>, arg: number) => DataflowProcessorInformation<OtherInfo & ParentInformation>
+	/** Does the call have a side effect that we do not know a lot about which may have further consequences? */
+	readonly hasUnknownSideEffect?: boolean
 }
 
 export interface ProcessKnownFunctionCallResult {
@@ -56,7 +58,7 @@ export function markNonStandardEvaluationEdges(
 }
 
 export function processKnownFunctionCall<OtherInfo>(
-	{ name,args, rootId,data, reverseOrder = false, markAsNSE = undefined, forceArgs, patchData = d => d }: ProcessKnownFunctionCallInput<OtherInfo>
+	{ name,args, rootId,data, reverseOrder = false, markAsNSE = undefined, forceArgs, patchData = d => d, hasUnknownSideEffect }: ProcessKnownFunctionCallInput<OtherInfo>
 ): ProcessKnownFunctionCallResult {
 	const functionName = processDataflowFor(name, data)
 
@@ -85,6 +87,10 @@ export function processKnownFunctionCall<OtherInfo>(
 		args:                reverseOrder ? [...callArgs].reverse() : callArgs
 	})
 
+	if(hasUnknownSideEffect) {
+		finalGraph.markIdForUnknownSideEffects(rootId)
+	}
+
 	const inIds = remainingReadInArgs
 	const fnRef = { nodeId: rootId, name: functionCallName, controlDependencies: data.controlDependencies, call: true as const }
 	inIds.push(fnRef)

diff --git a/src/reconstruct/auto-select/auto-select-defaults.ts b/src/reconstruct/auto-select/auto-select-defaults.ts
@@ -1,6 +1,5 @@
-import type { NoInfo, RNode } from '../../r-bridge/lang-4.x/ast/model/model'
+import type { RNode } from '../../r-bridge/lang-4.x/ast/model/model'
 import type { ParentInformation, NormalizedAst } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'
-import { RType } from '../../r-bridge/lang-4.x/ast/model/type'
 
 /**
  * The structure of the predicate that should be used to determine
@@ -19,16 +18,3 @@ export type AutoSelectPredicate = (node: RNode<ParentInformation>, fullAst: Norm
 export function doNotAutoSelect(_node: RNode): boolean {
 	return false
 }
-
-const libraryFunctionCall = /^(library|require|((require|load|attach)Namespace))$/
-
-/**
- * A variant of the {@link AutoSelectPredicate} which does its best
- * to select any kind of library import automatically.
- */
-export function autoSelectLibrary<Info = NoInfo>(node: RNode<Info>): boolean {
-	if(node.type !== RType.FunctionCall || !node.named) {
-		return false
-	}
-	return libraryFunctionCall.test(node.functionName.content)
-}
diff --git a/src/reconstruct/reconstruct.ts b/src/reconstruct/reconstruct.ts
@@ -31,7 +31,7 @@ import type { StatefulFoldFunctions } from '../r-bridge/lang-4.x/ast/model/proce
 import { foldAstStateful } from '../r-bridge/lang-4.x/ast/model/processing/stateful-fold'
 import type { NodeId } from '../r-bridge/lang-4.x/ast/model/processing/node-id'
 import type { AutoSelectPredicate } from './auto-select/auto-select-defaults'
-import { autoSelectLibrary } from './auto-select/auto-select-defaults'
+import { doNotAutoSelect } from './auto-select/auto-select-defaults'
 
 type Selection = ReadonlySet<NodeId>
 interface PrettyPrintLine {
@@ -491,7 +491,7 @@ function removeOuterExpressionListIfApplicable(result: PrettyPrintLine[], linesW
  *
  * @returns The number of lines for which `autoSelectIf` triggered, as well as the reconstructed code itself.
  */
-export function reconstructToCode(ast: NormalizedAst, selection: Selection, autoSelectIf: AutoSelectPredicate = autoSelectLibrary): ReconstructionResult {
+export function reconstructToCode(ast: NormalizedAst, selection: Selection, autoSelectIf: AutoSelectPredicate = doNotAutoSelect): ReconstructionResult {
 	if(reconstructLogger.settings.minLevel <= LogLevel.Trace) {
 		reconstructLogger.trace(`reconstruct ast with ids: ${JSON.stringify([...selection])}`)
 	}

diff --git a/test/functionality/benchmark/slicer.spec.ts b/test/functionality/benchmark/slicer.spec.ts
@@ -135,8 +135,8 @@ cat(d)`
 				tokensNoComments:                  { min: 13, max: 35, median: 19, mean: 22.333333333333332, std: 9.285592184789413,  total: 67 },
 				normalizedTokens:                  { min: 8,  max: 19, median: 11, mean: (8+11+19)/3,        std: 4.642796092394707,  total: 38 },
 				normalizedTokensNoComments:        { min: 8,  max: 19, median: 11, mean: (8+11+19)/3,        std: 4.642796092394707,  total: 38 },
-				dataflowNodes:                     { min: 3,  max: 14, median: 6,  mean: (3+6+14)/3,         std: 4.642796092394707,  total: 23 },
-				linesWithAutoSelected:             { min: 1,  max: 1,  median: 1,  mean: 1,                  std: 0,                  total: 3  } // always select one library statement
+				dataflowNodes:                     { min: 5,  max: 16, median: 8,  mean: (5+8+16)/3,         std: 4.642796092394707,  total: 29 },
+				linesWithAutoSelected:             { min: 0,  max: 0,  median: 0,  mean: 0,                  std: 0,                  total: 0  }
 			}, statInfo)
 
 			assert.deepStrictEqual(stats.perSliceMeasurements.sliceCriteriaSizes, {

diff --git a/test/functionality/slicing/static-program-slices/calls-tests.ts b/test/functionality/slicing/static-program-slices/calls-tests.ts
@@ -682,5 +682,12 @@ x`)
 		'x\non.exit(function() 3)', ['1@x'],
 		'x\non.exit(function() 3)'
 		)
+		assertSliced(label('Library Loads and Installations', [
+			'functions-with-global-side-effects', 'name-normal', 'strings', 'call-normal', 'unnamed-arguments', 'newlines', 'library-loading'
+		]), shell,
+		/* w should be included as it defined the package to be loaded by the library call */
+		'v\nlibrary(x)\nrequire(y)\nw <- "x"\nattachNamespace(w)\nloadNamespace("x")', ['1@v'],
+		'v\nlibrary(x)\nrequire(y)\nw <- "x"\nattachNamespace(w)\nloadNamespace("x")'
+		)
 	})
 }))
Benchmark suite	Current: `9a55656`	Previous: `f7dd663`	Ratio
`Retrieve AST from R code`	`232.9759162727273` ms (`97.24980725607145`)	`238.5188252272727` ms (`100.22506468361182`)	`0.98`
`Normalize R AST`	`19.456760227272728` ms (`34.10133909844928`)	`19.296409454545454` ms (`33.092243108777645`)	`1.01`
`Produce dataflow information`	`36.96388472727273` ms (`79.6515602036736`)	`36.9739589090909` ms (`77.82386121660545`)	`1.00`
`Total per-file`	`797.8512177272728` ms (`1419.6467316743372`)	`776.2117469545454` ms (`1469.1651040608185`)	`1.03`
`Static slicing`	`2.1742035660405294` ms (`1.311249252650579`)	`1.1872604452349385` ms (`0.9948751822483859`)	`1.83`
`Reconstruct code`	`0.22627408163345789` ms (`0.18394678082601015`)	`0.2468089451648185` ms (`0.1946140644840204`)	`0.92`
`Total per-slice`	`2.416743962146326` ms (`1.3763076282879023`)	`1.4516484087621992` ms (`1.0470370397968392`)	`1.66`
`failed to reconstruct/re-parse`	`0` #	`0` #	`1`
`times hit threshold`	`0` #	`0` #	`1`
`reduction (characters)`	`0.7869724682442361` #	`0.7869724682442361` #	`1`
`reduction (normalized tokens)`	`0.7640044233283717` #	`0.7640044233283717` #	`1`
`memory (df-graph)`	`147.58589311079547` KiB (`359.2574768951678`)	`147.58589311079547` KiB (`359.2574768951678`)	`1`
Benchmark suite	Current: `9a55656`	Previous: `f7dd663`	Ratio
`Retrieve AST from R code`	`240.82628366` ms (`45.82416232679357`)	`237.40032316` ms (`43.13005941619528`)	`1.01`
`Normalize R AST`	`22.1822889` ms (`16.845014085543838`)	`22.09362788` ms (`16.761602459541297`)	`1.00`
`Produce dataflow information`	`68.8968469` ms (`83.31038058237489`)	`68.97634276000001` ms (`82.8815602292033`)	`1.00`
`Total per-file`	`3855.0505958400004` ms (`7992.375386354442`)	`3713.15087572` ms (`7958.055524113993`)	`1.04`
`Static slicing`	`8.041008380272569` ms (`20.008916221797243`)	`7.663184753343301` ms (`20.21369881919582`)	`1.05`
`Reconstruct code`	`0.22075087219097142` ms (`0.13854816137233103`)	`0.23133881371574955` ms (`0.1459510437889604`)	`0.95`
`Total per-slice`	`8.269617661835138` ms (`20.039413848499013`)	`7.902366581545092` ms (`20.237900525090257`)	`1.05`
`failed to reconstruct/re-parse`	`0` #	`0` #	`1`
`times hit threshold`	`0` #	`0` #	`1`
`reduction (characters)`	`0.9054767484661775` #	`0.9060203516109888` #	`1.00`
`reduction (normalized tokens)`	`0.8742607285608068` #	`0.8749621691839767` #	`1.00`
`memory (df-graph)`	`142.38015625` KiB (`146.62401913756796`)	`142.4557421875` KiB (`146.62238285216054`)	`1.00`