From 55452ec7e68d0cbf06cd5840d667fe9d4b2e80b7 Mon Sep 17 00:00:00 2001 From: Brian Warner Date: Fri, 12 Apr 2024 00:42:21 -0500 Subject: [PATCH] feat(swingset): allow slow termination of vats This introduces new `runPolicy()` controls which enable "slow termination" of vats. When configured, terminated vats are immediately dead (all promises are rejected, all new messages go splat, they never run again), however the vat's state is deleted slowly, one piece at a time. This makes it safe to terminate large vats, with a long history, lots of c-list imports/exports, or large vatstore tables, without fear of causing an overload (by e.g. dropping 100k references all in a single crank). See docs/run-policy.md for details and configuration instructions. refs #8928 --- packages/SwingSet/docs/run-policy.md | 138 ++++++++++++++++- packages/SwingSet/src/kernel/kernel.js | 65 +++++++- .../SwingSet/src/kernel/state/kernelKeeper.js | 142 +++++++++++++++++- .../SwingSet/src/kernel/state/vatKeeper.js | 34 ++++- packages/SwingSet/src/lib/runPolicies.js | 41 +++++ packages/SwingSet/src/types-external.js | 9 +- packages/SwingSet/src/types-internal.js | 4 +- .../SwingSet/test/snapshots/test-state.js.md | 4 +- .../test/snapshots/test-state.js.snap | Bin 278 -> 278 bytes packages/SwingSet/test/test-state.js | 4 + .../bootstrap-slow-terminate.js | 54 +++++++ .../slow-termination/test-slow-termination.js | 139 +++++++++++++++++ .../slow-termination/vat-slow-terminate.js | 22 +++ .../vat-admin/terminate/test-terminate.js | 6 +- 14 files changed, 634 insertions(+), 28 deletions(-) create mode 100644 packages/SwingSet/test/vat-admin/slow-termination/bootstrap-slow-terminate.js create mode 100644 packages/SwingSet/test/vat-admin/slow-termination/test-slow-termination.js create mode 100644 packages/SwingSet/test/vat-admin/slow-termination/vat-slow-terminate.js diff --git a/packages/SwingSet/docs/run-policy.md b/packages/SwingSet/docs/run-policy.md index 0eba4d940d2d..058781eec68e 100644 --- a/packages/SwingSet/docs/run-policy.md +++ b/packages/SwingSet/docs/run-policy.md @@ -35,7 +35,12 @@ The kernel will invoke the following methods on the policy object (so all must e * `policy.crankFailed()` * `policy.emptyCrank()` -All methods should return `true` if the kernel should keep running, or `false` if it should stop. +All those methods should return `true` if the kernel should keep running, or `false` if it should stop. + +The following methods are optional (for backwards compatibility with policy objects created for older kernels): + +* `policy.allowCleanup()` : may return budget, see "Terminated-Vat Cleanup" below +* `policy.didCleanup({ cleanups })` (if missing, kernel pretends it returned `true` to keep running) The `computrons` argument may be `undefined` (e.g. if the crank was delivered to a non-`xs worker`-based vat, such as the comms vat). The policy should probably treat this as equivalent to some "typical" number of computrons. @@ -53,6 +58,27 @@ More arguments may be added in the future, such as: The run policy should be provided as the first argument to `controller.run()`. If omitted, the kernel defaults to `forever`, a policy that runs until the queue is empty. +## Terminated-Vat Cleanup + +Some vats may grow very large (i.e. large c-lists with lots of imported/exported objects, or lots of vatstore entries). If/when these are terminated, the burst of cleanup work might overwhelm the kernel, especially when processing all the dropped imports (which trigger GC messages to other vats). + +To protect the system against these bursts, the run policy can be configured to terminate vats slowly. Instead of doing all the cleanup work immediately, the policy allows the kernel to do a little bit of work each time `controller.run()` is called (e.g. once per block, for kernels hosted inside a blockchain). + +There are two RunPolicy methods which control this. The first is `runPolicy.allowCleanup()`. This will be invoked many times during `controller.run()`, each time the kernel tries to decide what to do next (once per step). The return value will enable (or not) a fixed amount of cleanup work. The second is `runPolicy.didCleanup({ cleanups })`, which is called later, to inform the policy of how much cleanup work was actually done. The policy can count the cleanups and switch `allowCleanup()` to return `false` when it reaches a threshold. (We need the pre-check `allowCleanup` method because the simple act of looking for cleanup work is itself a cost that we might be able to afford). + +If `allowCleanup()` exists, it must either return a falsy value, or an object. This object may have a `budget` property, which must be a number. + +A falsy return value (eg `allowCleanup: () => false`) prohibits cleanup work. This can be useful in a "only clean up during idle blocks" approach (see below), but should not be the only policy used, otherwise vat cleanup would never happen. + +A numeric `budget` limits how many cleanups are allowed to happen (if any are needed). One "cleanup" will delete one vatstore row, or one c-list entry (note that c-list deletion may trigger GC work), or one heap snapshot record, or one transcript span (and its populated transcript items). Using `{ budget: 5 }` seems to be a reasonable limit on each call, balancing overhead against doing sufficiently small units of work that we can limit the total work performed. + +If `budget` is missing or `undefined`, the kernel will perform unlimited cleanup work. This also happens if `allowCleanup()` is missing entirely, which maintains the old behavior for host applications that haven't been updated to make new policy objects. Note that cleanup is higher priority than anything else, followed by GC work, then BringOutYourDead, then message delivery. + +`didCleanup({ cleanups })` is called when the kernel actually performed some vat-termination cleanup, and the `cleanups` property is a number with the count of cleanups that took place. Each query to `allowCleanup()` might (or might not) be followed by a call to `didCleanup`, with a `cleanups` value that does not exceed the specified budget. + +To limit the work done per block (for blockchain-based applications) the host's RunPolicy objects must keep track of how many cleanups were reported, and change the behavior of `allowCleanup()` when it reaches a per-block threshold. See below for examples. + + ## Typical Run Policies A basic policy might simply limit the block to 100 cranks with deliveries and two vat creations: @@ -78,6 +104,7 @@ function make100CrankPolicy() { return true; }, }); + return policy; } ``` @@ -95,15 +122,15 @@ while(1) { Note that a new policy object should be provided for each call to `run()`. -A more sophisticated one would count computrons. Suppose that experiments suggest that one million computrons take about 5 seconds to execute. The policy would look like: +A more sophisticated one would count computrons. Suppose that experiments suggest that sixty-five million computrons take about 5 seconds to execute. The policy would look like: ```js function makeComputronCounterPolicy(limit) { - let total = 0; + let total = 0n; const policy = harden({ vatCreated() { - total += 100000; // pretend vat creation takes 100k computrons + total += 1_000_000n; // pretend vat creation takes 1M computrons return (total < limit); }, crankComplete(details) { @@ -112,18 +139,119 @@ function makeComputronCounterPolicy(limit) { return (total < limit); }, crankFailed() { - total += 1000000; // who knows, 1M is as good as anything + total += 65_000_000n; // who knows, 65M is as good as anything return (total < limit); }, emptyCrank() { return true; } }); + return policy; } ``` See `src/runPolicies.js` for examples. +To slowly terminate vats, limiting each block to 5 cleanups, the policy should start with a budget of 5, return the remaining `{ budget }` from `allowCleanup()`, and decrement it as `didCleanup` reports that budget being consumed: + +```js +function makeSlowTerminationPolicy() { + let cranks = 0; + let vats = 0; + let cleanups = 5; + const policy = harden({ + vatCreated() { + vats += 1; + return (vats < 2); + }, + crankComplete(details) { + cranks += 1; + return (cranks < 100); + }, + crankFailed() { + cranks += 1; + return (cranks < 100); + }, + emptyCrank() { + return true; + }, + allowCleanup() { + if (cleanups > 0) { + return { budget: cleanups }; + } else { + return false; + } + }, + didCleanup(spent) { + cleanups -= spent.cleanups; + }, + }); + return policy; +} +``` + +A more conservative approach might only allow cleanup in otherwise-empty blocks. To accompish this, use two separate policy objects, and two separate "runs". The first run only performs deliveries, and prohibits all cleanups: + +```js +function makeDeliveryOnlyPolicy() { + let empty = true; + const didWork = () => { empty = false; return true; }; + const policy = harden({ + vatCreated: didWork, + crankComplete: didWork, + crankFailed: didWork, + emptyCrank: didWork, + allowCleanup: () => false, + }); + const wasEmpty = () => empty; + return [ policy, wasEmpty ]; +} +``` + +The second only performs cleanup, with a limited budget, stopping the run after any deliveries occur (such as GC actions): + +```js +function makeCleanupOnlyPolicy() { + let cleanups = 5; + const stop: () => false; + const policy = harden({ + vatCreated: stop, + crankComplete: stop, + crankFailed: stop, + emptyCrank: stop, + allowCleanup() { + if (cleanups > 0) { + return { budget: cleanups }; + } else { + return false; + } + }, + didCleanup(spent) { + cleanups -= spent.cleanups; + }, + }); + return policy; +} +``` + +On each block, the host should only perform the second (cleanup) run if the first policy reports that the block was empty: + +```js +async function doBlock() { + const [ firstPolicy, wasEmpty ] = makeDeliveryOnlyPolicy(); + await controller.run(firstPolicy); + if (wasEmpty()) { + const secondPolicy = makeCleanupOnlyPolicy(); + await controller.run(secondPolicy); + } +} +``` + +Note that regardless of whatever computron/delivery budget is imposed by the first policy, the second policy will allow one additional delivery to be made (we do not yet have an `allowDelivery()` pre-check method that might inhibit this). The cleanup work, which may or may not happen, will sometimes trigger a GC delivery like `dispatch.dropExports`, but at most one such delivery will be made before the second policy returns `false` and stops `controller.run()`. If cleanup does not trigger such a delivery, or if no cleanup work needs to be done, then one normal run-queue delivery will be performed before the policy has a chance to say "stop". All other cleanup-triggered GC work will be deferred until the first run of the next block. + +Also note that `budget` and `cleanups` are plain `Number`s, whereas `comptrons` is a `BigInt`. + + ## Non-Consensus Wallclock Limits If the SwingSet kernel is not being operated in consensus mode, then it is safe to use wallclock time as a block limit: diff --git a/packages/SwingSet/src/kernel/kernel.js b/packages/SwingSet/src/kernel/kernel.js index 79b78a342e02..e923f26f2f46 100644 --- a/packages/SwingSet/src/kernel/kernel.js +++ b/packages/SwingSet/src/kernel/kernel.js @@ -271,7 +271,8 @@ export default function buildKernel( // Reject all promises decided by the vat, making sure to capture the list // of kpids before that data is deleted. const deadPromises = [...kernelKeeper.enumeratePromisesByDecider(vatID)]; - kernelKeeper.cleanupAfterTerminatedVat(vatID); + kernelKeeper.addTerminatedVat(vatID); + kernelKeeper.deleteVatID(vatID); for (const kpid of deadPromises) { resolveToError(kpid, makeError('vat terminated'), vatID); } @@ -378,7 +379,8 @@ export default function buildKernel( * abort?: boolean, // changes should be discarded, not committed * consumeMessage?: boolean, // discard the aborted delivery * didDelivery?: VatID, // we made a delivery to a vat, for run policy and save-snapshot - * computrons?: BigInt, // computron count for run policy + * computrons?: bigint, // computron count for run policy + * cleanups?: number, // cleanup budget spent * meterID?: string, // deduct those computrons from a meter * measureDirt?: [ VatID, Dirt ], // the dirt counter should increment * terminate?: { vatID: VatID, reject: boolean, info: SwingSetCapData }, // terminate vat, notify vat-admin @@ -645,6 +647,32 @@ export default function buildKernel( return deliveryCrankResults(vatID, status, false); // no meter, BOYD clears dirt } + /** + * Perform a small (budget-limited) amount of dead-vat cleanup work. + * + * @param {RunQueueEventCleanupTerminatedVat} message + * 'message' is the run-queue cleanup action, which includes a vatID and budget. + * A budget of 'undefined' allows unlimited work. Otherwise, the budget is a Number, + * and cleanup should not touch more than maybe 5*budget DB rows. + * @returns {Promise} + */ + async function processCleanupTerminatedVat(message) { + console.log(`-- processCleanupTerminatedVat`, message); + const { vatID, budget } = message; + const { done, cleanups } = kernelKeeper.cleanupAfterTerminatedVat( + vatID, + budget, + ); + if (done) { + kernelKeeper.deleteTerminatedVat(vatID); + } + // We don't perform any deliveries here, so tere are no computrons to + // report, but we do tell the runPolicy know how much kernel-side DB + // work we did, so it can decide how much was too much. + const computrons = 0n; + return harden({ computrons, cleanups }); + } + /** * The 'startVat' event is queued by `initializeKernel` for all static vats, * so that we execute their bundle imports and call their `buildRootObject` @@ -1144,6 +1172,7 @@ export default function buildKernel( * @typedef { import('../types-internal.js').RunQueueEventRetireImports } RunQueueEventRetireImports * @typedef { import('../types-internal.js').RunQueueEventNegatedGCAction } RunQueueEventNegatedGCAction * @typedef { import('../types-internal.js').RunQueueEventBringOutYourDead } RunQueueEventBringOutYourDead + * @typedef { import('../types-internal.js').RunQueueEventCleanupTerminatedVat } RunQueueEventCleanupTerminatedVat * @typedef { import('../types-internal.js').RunQueueEvent } RunQueueEvent */ @@ -1211,6 +1240,8 @@ export default function buildKernel( } else if (message.type === 'negated-gc-action') { // processGCActionSet pruned some negated actions, but had no GC // action to perform. Record the DB changes in their own crank. + } else if (message.type === 'cleanup-terminated-vat') { + deliverP = processCleanupTerminatedVat(message); } else if (gcMessages.includes(message.type)) { deliverP = processGCMessage(message); } else { @@ -1270,6 +1301,10 @@ export default function buildKernel( // sometimes happens randomly because of vat eviction policy // which should not affect the in-consensus policyInput) policyInput = ['create-vat', {}]; + } else if (message.type === 'cleanup-terminated-vat') { + const { cleanups } = crankResults; + assert(cleanups !== undefined); + policyInput = ['cleanup', { cleanups }]; } else { policyInput = ['crank', {}]; } @@ -1303,7 +1338,9 @@ export default function buildKernel( const { computrons, meterID } = crankResults; if (computrons) { assert.typeof(computrons, 'bigint'); - policyInput[1].computrons = BigInt(computrons); + if (policyInput[0] !== 'cleanup') { + policyInput[1].computrons = BigInt(computrons); + } if (meterID) { const notify = kernelKeeper.deductMeter(meterID, computrons); if (notify) { @@ -1727,12 +1764,13 @@ export default function buildKernel( * Pulls the next message from the highest-priority queue and returns it * along with a corresponding processor. * + * @param {RunPolicy} [policy] - a RunPolicy to limit the work being done * @returns {{ * message: RunQueueEvent | undefined, * processor: (message: RunQueueEvent) => Promise, * }} */ - function getNextMessageAndProcessor() { + function getNextMessageAndProcessor(policy) { const acceptanceMessage = kernelKeeper.getNextAcceptanceQueueMsg(); if (acceptanceMessage) { return { @@ -1740,7 +1778,16 @@ export default function buildKernel( processor: processAcceptanceMessage, }; } + const allowCleanup = policy?.allowCleanup ? policy.allowCleanup() : {}; + // false, or an object with optional .budget + if (allowCleanup) { + assert.typeof(allowCleanup, 'object'); + if (allowCleanup.budget) { + assert.typeof(allowCleanup.budget, 'number'); + } + } const message = + kernelKeeper.nextCleanupTerminatedVatAction(allowCleanup) || processGCActionSet(kernelKeeper) || kernelKeeper.nextReapAction() || kernelKeeper.getNextRunQueueMsg(); @@ -1817,7 +1864,8 @@ export default function buildKernel( await null; try { kernelKeeper.establishCrankSavepoint('start'); - const { processor, message } = getNextMessageAndProcessor(); + const { processor, message } = + getNextMessageAndProcessor(foreverPolicy()); // process a single message if (message) { await tryProcessMessage(processor, message); @@ -1854,7 +1902,7 @@ export default function buildKernel( kernelKeeper.startCrank(); try { kernelKeeper.establishCrankSavepoint('start'); - const { processor, message } = getNextMessageAndProcessor(); + const { processor, message } = getNextMessageAndProcessor(policy); if (!message) { break; } @@ -1876,6 +1924,11 @@ export default function buildKernel( case 'crank-failed': policyOutput = policy.crankFailed(policyInput[1]); break; + case 'cleanup': { + const { didCleanup = () => true } = policy; + policyOutput = didCleanup(policyInput[1]); + break; + } case 'none': policyOutput = policy.emptyCrank(); break; diff --git a/packages/SwingSet/src/kernel/state/kernelKeeper.js b/packages/SwingSet/src/kernel/state/kernelKeeper.js index fd6d79533f70..d71c8740f074 100644 --- a/packages/SwingSet/src/kernel/state/kernelKeeper.js +++ b/packages/SwingSet/src/kernel/state/kernelKeeper.js @@ -65,6 +65,7 @@ const enableKernelGC = true; // vat.name.$NAME = $vatID = v$NN // vat.nextID = $NN // vat.nextUpgradeID = $NN +// vats.terminated = JSON([vatIDs..]) // device.names = JSON([names..]) // device.name.$NAME = $deviceID = d$NN // device.nextID = $NN @@ -212,6 +213,15 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { insistStorageAPI(kvStore); + // upgrade from pre-"vats.terminated" storage, or populate cache + let terminatedVats; + if (kvStore.has('vats.terminated')) { + terminatedVats = JSON.parse(getRequired('vats.terminated')); + } else { + terminatedVats = []; + kvStore.set('vats.terminated', JSON.stringify(terminatedVats)); + } + /** * @param {string} key * @returns {string} @@ -679,8 +689,12 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { function ownerOfKernelObject(kernelSlot) { insistKernelType('object', kernelSlot); const owner = kvStore.get(`${kernelSlot}.owner`); - if (owner) { - insistVatID(owner); + if (!owner) { + return undefined; + } + insistVatID(owner); + if (terminatedVats.includes(owner)) { + return undefined; } return owner; } @@ -885,14 +899,49 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { kvStore.set(`${kernelSlot}.data.slots`, capdata.slots.join(',')); } - function cleanupAfterTerminatedVat(vatID) { + /** + * Perform some cleanup work for a specific (terminated but not + * fully-deleted) vat, possibly limited by a budget. Returns 'done' + * (where false means "please call me again", and true means "you + * can delete the vatID now"), and a count of how much work was done + * (so the runPolicy can decide when to stop). + * + * @param {string} vatID + * @param {number} [budget] + * @returns {{ done: boolean, cleanups: number }} + * + */ + function cleanupAfterTerminatedVat(vatID, budget = undefined) { insistVatID(vatID); + console.log(` cleanup: ${vatID} budget=${budget}`); + let cleanups = 0; + let spend = _did => false; // returns "stop now" + if (budget !== undefined) { + assert.typeof(budget, 'number'); + spend = (did = 1) => { + cleanups += did; + return cleanups >= budget; + }; + } + + // TODO: it would be slightly cheaper to walk all kvStore keys in + // order, and act on each one according to its category (c-list + // export, c-list import, vatstore, other), so we use a single + // enumeratePrefixedKeys() call each time. Until we do that, the + // last phase of the cleanup (where we've deleted all the exports + // and imports, and are working on the remaining keys) will waste + // two DB queries on each call. OTOH, those queries will probably + // hit the same SQLite index page as the successful one, so it + // probably won't cause any extra disk IO. So we can defer this + // optimization for a while. Note: when we implement it, be + // prepared to encounter the clist entries in eiher order (kref + // first or vref first), and delete the other one in the same + // call, so we don't wind up with half an entry. + const vatKeeper = provideVatKeeper(vatID); const exportPrefix = `${vatID}.c.o+`; const importPrefix = `${vatID}.c.o-`; - vatKeeper.deleteSnapshotsAndTranscript(); - // Note: ASCII order is "+,-./", and we rely upon this to split the // keyspace into the various o+NN/o-NN/etc spaces. If we were using a // more sophisticated database, we'd keep each section in a separate @@ -915,8 +964,27 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { // begin with `vMM.c.o+`. In addition to deleting the c-list entry, we // must also delete the corresponding kernel owner entry for the object, // since the object will no longer be accessible. + const vref = k.slice(`${vatID}.c.`.length); + assert(vref.startsWith('o+'), vref); const kref = kvStore.get(k); - orphanKernelObject(kref, vatID); + // we must delete the c-list entry, but we don't need to + // manipulate refcounts like the way vatKeeper/deleteCListEntry + // does, so just delete the keys directly + // vatKeeper.deleteCListEntry(kref, vref); + console.log(`cleanup -- deleting ${vatID}.c.${kref}`); + console.log(`cleanup -- deleting ${vatID}.c.${vref}`); + kvStore.delete(`${vatID}.c.${kref}`); + kvStore.delete(`${vatID}.c.${vref}`); + // if this object became unreferenced, processRefcounts will + // delete it, so don't be surprised. TODO: this results in an + // extra get() for each delete(), see if there's a way to avoid + // this. TODO: think carefully about other such races. + if (kvStore.has(`${kref}.owner`)) { + orphanKernelObject(kref, vatID); + } + if (spend()) { + return { done: false, cleanups }; + } } // then scan for imported objects, which must be decrefed @@ -925,8 +993,12 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { // drop+retire const kref = kvStore.get(k) || Fail`getNextKey ensures get`; const vref = k.slice(`${vatID}.c.`.length); + console.log(`cleanup -- deleting ${vatID} clist ${kref} <-> ${vref}`); vatKeeper.deleteCListEntry(kref, vref); // that will also delete both db keys + if (spend()) { + return { done: false, cleanups }; + } } // the caller used enumeratePromisesByDecider() before calling us, @@ -934,9 +1006,24 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { // now loop back through everything and delete it all for (const k of enumeratePrefixedKeys(kvStore, `${vatID}.`)) { + console.log(`cleanup -- deleting ${vatID} key ${k}`); kvStore.delete(k); + if (spend()) { + return { done: false, cleanups }; + } } + const dc = vatKeeper.deleteSnapshotsAndTranscripts(budget); + // last task, so increment cleanups, but dc.done is authoritative + spend(dc.cleanups); + + if (dc.done) { + console.log(` cleanup: done`); + } + return { done: dc.done, cleanups }; + } + + function deleteVatID(vatID) { // TODO: deleting entries from the dynamic vat IDs list requires a linear // scan of the list; arguably this collection ought to be represented in a // different way that makes it efficient to remove an entry from it, though @@ -1231,6 +1318,41 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { return makeUpgradeID(nextID); } + function addTerminatedVat(vatID) { + if (!terminatedVats.includes(vatID)) { + terminatedVats.push(vatID); + kvStore.set(`vats.terminated`, JSON.stringify(terminatedVats)); + } + } + + function getFirstTerminatedVat() { + if (terminatedVats.length) { + return terminatedVats[0]; + } + return undefined; + } + + function deleteTerminatedVat(vatID) { + terminatedVats = terminatedVats.filter(id => id !== vatID); + kvStore.set(`vats.terminated`, JSON.stringify(terminatedVats)); + } + + function nextCleanupTerminatedVatAction(allowCleanup) { + if (!allowCleanup) { + return undefined; + } + // budget === undefined means "unlimited" + const { budget } = allowCleanup; + // if (getGCActions().size) { + // return undefined; + // } + const vatID = getFirstTerminatedVat(); + if (vatID) { + return { type: 'cleanup-terminated-vat', vatID, budget }; + } + return undefined; + } + // As refcounts are decremented, we accumulate a set of krefs for which // action might need to be taken: // * promises which are now resolved and unreferenced can be deleted @@ -1423,7 +1545,7 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { function vatIsAlive(vatID) { insistVatID(vatID); - return kvStore.has(`${vatID}.o.nextID`); + return kvStore.has(`${vatID}.o.nextID`) && !terminatedVats.includes(vatID); } /** @@ -1722,6 +1844,12 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { getDynamicVats, getStaticVats, getDevices, + deleteVatID, + + addTerminatedVat, + getFirstTerminatedVat, + deleteTerminatedVat, + nextCleanupTerminatedVatAction, allocateUpgradeID, diff --git a/packages/SwingSet/src/kernel/state/vatKeeper.js b/packages/SwingSet/src/kernel/state/vatKeeper.js index 7cd7ff51ef58..696c414775ca 100644 --- a/packages/SwingSet/src/kernel/state/vatKeeper.js +++ b/packages/SwingSet/src/kernel/state/vatKeeper.js @@ -726,11 +726,37 @@ export function makeVatKeeper( }); } - function deleteSnapshotsAndTranscript() { + /** + * Perform some (possibly-limited) cleanup work for a vat. Returns + * 'done' (where false means "please call me again", and true means + * "you can delete the vatID now"), and a count of how much work was + * done (so the runPolicy can decide when to stop). + * + * @param {number} [budget] + * @returns {{ done: boolean, cleanups: number }} + * + */ + function deleteSnapshotsAndTranscripts(budget = undefined) { + // Each budget=1 allows us to delete one snapshot entry, or one + // transcript span and any transcript items associated with that + // span. Some nodes will have historical transcript items, some + // will not. Using budget=5 and snapshotInterval=200 means we + // delete 5 snapshot (probably only-hash) records, or 5 span + // records and maybe 1000 span items. + + let cleanups = 0; if (snapStore) { - snapStore.deleteVatSnapshots(vatID); + const dc = snapStore.deleteVatSnapshots(vatID, budget); + // initially uses 2+budget DB statements, then just 1 when done + cleanups += dc.cleanups; + if (!dc.done) { + return { done: false, cleanups }; + } } - transcriptStore.deleteVatTranscripts(vatID); + const dc = transcriptStore.deleteVatTranscripts(vatID, budget); + // initially uses 2+2*budget DB statements, then just 1 when done + cleanups += dc.cleanups; + return { done: dc.done, cleanups }; } function beginNewIncarnation() { @@ -812,7 +838,7 @@ export function makeVatKeeper( dumpState, saveSnapshot, getSnapshotInfo, - deleteSnapshotsAndTranscript, + deleteSnapshotsAndTranscripts, beginNewIncarnation, }); } diff --git a/packages/SwingSet/src/lib/runPolicies.js b/packages/SwingSet/src/lib/runPolicies.js index e3528d53c631..954784c9c1ad 100644 --- a/packages/SwingSet/src/lib/runPolicies.js +++ b/packages/SwingSet/src/lib/runPolicies.js @@ -3,6 +3,9 @@ import { assert } from '@agoric/assert'; export function foreverPolicy() { /** @type { RunPolicy } */ return harden({ + allowCleanup() { + return {}; // unlimited budget + }, vatCreated(_details) { return true; }, @@ -27,6 +30,9 @@ export function crankCounter( let vats = 0; /** @type { RunPolicy } */ const policy = harden({ + allowCleanup() { + return { budget: 100 }; // limited budget + }, vatCreated() { vats += 1; return vats < maxCreateVats; @@ -52,6 +58,9 @@ export function computronCounter(limit) { let total = 0n; /** @type { RunPolicy } */ const policy = harden({ + allowCleanup() { + return { budget: 100 }; // limited budget + }, vatCreated() { total += 100000n; // pretend vat creation takes 100k computrons return total < limit; @@ -79,6 +88,7 @@ export function wallClockWaiter(seconds) { const timeout = Date.now() + 1000 * seconds; /** @type { RunPolicy } */ const policy = harden({ + allowCleanup: () => ({}), // unlimited budget vatCreated: () => Date.now() < timeout, crankComplete: () => Date.now() < timeout, crankFailed: () => Date.now() < timeout, @@ -86,3 +96,34 @@ export function wallClockWaiter(seconds) { }); return policy; } + +export function noCleanup() { + /** @type { RunPolicy } */ + const policy = harden({ + allowCleanup: () => false, + vatCreated: () => true, + crankComplete: () => true, + crankFailed: () => true, + emptyCrank: () => true, + }); + return policy; +} + +export function someCleanup(budget) { + let once = true; + /** @type { RunPolicy } */ + const policy = harden({ + allowCleanup: () => { + if (once) { + once = false; + return { budget }; + } + return false; + }, + vatCreated: () => true, + crankComplete: () => true, + crankFailed: () => true, + emptyCrank: () => true, + }); + return policy; +} diff --git a/packages/SwingSet/src/types-external.js b/packages/SwingSet/src/types-external.js index c8f2233fb83d..3075931aae6d 100644 --- a/packages/SwingSet/src/types-external.js +++ b/packages/SwingSet/src/types-external.js @@ -218,12 +218,17 @@ export {}; * @typedef { [tag: 'create-vat', details: PolicyInputDetails ]} PolicyInputCreateVat * @typedef { [tag: 'crank', details: PolicyInputDetails ] } PolicyInputCrankComplete * @typedef { [tag: 'crank-failed', details: PolicyInputDetails ]} PolicyInputCrankFailed - * @typedef { PolicyInputNone | PolicyInputCreateVat | PolicyInputCrankComplete | PolicyInputCrankFailed } PolicyInput + * @typedef { [tag: 'cleanup', details: { cleanups: number }] } PolicyInputCleanup + * @typedef { PolicyInputNone | PolicyInputCreateVat | PolicyInputCrankComplete | + * PolicyInputCrankFailed | PolicyInputCleanup } PolicyInput * @typedef { boolean } PolicyOutput - * @typedef { { vatCreated: (details: {}) => PolicyOutput, + * @typedef { { + * allowCleanup?: () => false | { budget?: number }, + * vatCreated: (details: {}) => PolicyOutput, * crankComplete: (details: { computrons?: bigint }) => PolicyOutput, * crankFailed: (details: {}) => PolicyOutput, * emptyCrank: () => PolicyOutput, + * didCleanup?: (details: { cleanups: number }) => PolicyOutput, * } } RunPolicy * * @typedef {object} VatWarehousePolicy diff --git a/packages/SwingSet/src/types-internal.js b/packages/SwingSet/src/types-internal.js index e135ff848106..6a66bc8ed8c5 100644 --- a/packages/SwingSet/src/types-internal.js +++ b/packages/SwingSet/src/types-internal.js @@ -138,10 +138,12 @@ export {}; * @typedef { { type: 'retireImports', vatID: VatID, krefs: string[] } } RunQueueEventRetireImports * @typedef { { type: 'negated-gc-action', vatID?: VatID } } RunQueueEventNegatedGCAction * @typedef { { type: 'bringOutYourDead', vatID: VatID } } RunQueueEventBringOutYourDead + * @typedef { { type: 'cleanup-terminated-vat', vatID: VatID, + * budget: number | undefined } } RunQueueEventCleanupTerminatedVat * @typedef { RunQueueEventNotify | RunQueueEventSend | RunQueueEventCreateVat | * RunQueueEventUpgradeVat | RunQueueEventChangeVatOptions | RunQueueEventStartVat | * RunQueueEventDropExports | RunQueueEventRetireExports | RunQueueEventRetireImports | - * RunQueueEventNegatedGCAction | RunQueueEventBringOutYourDead + * RunQueueEventNegatedGCAction | RunQueueEventBringOutYourDead | RunQueueEventCleanupTerminatedVat * } RunQueueEvent */ diff --git a/packages/SwingSet/test/snapshots/test-state.js.md b/packages/SwingSet/test/snapshots/test-state.js.md index 48daa769554d..f522dbe3eddd 100644 --- a/packages/SwingSet/test/snapshots/test-state.js.md +++ b/packages/SwingSet/test/snapshots/test-state.js.md @@ -8,8 +8,8 @@ Generated by [AVA](https://avajs.dev). > initial state - '09c3651da4f2f1fc6cd4e61170aaab3954ee1ace51e6028c69361f73b0ac7272' + '9715d38dfb0dc0ba922d5740d0442616d773ef41cb62a7dab666532cbd9c9f46' > expected activityhash - '00a0e15b521f7c32726b0b2d4da425eba66fab98678975ea5b03cd07bfe3109a' + '9e3f0f0d8560c41f3d617768ad1f9760ea93e781352fd836fcc9e16a84f5073e' diff --git a/packages/SwingSet/test/snapshots/test-state.js.snap b/packages/SwingSet/test/snapshots/test-state.js.snap index 223f3081f3568f64aae27207bcb07f96082f9d16..809c77821e414072b314abe05f6b011256650f3c 100644 GIT binary patch literal 278 zcmV+x0qOohRzVqaTY100000000AZkUdTVF%X3}LJ>7L*p4caf5&#m38-nuo*Au`&2EGZ3MJfx zoRp2ID0t;-zVE$n$@5x{?eogL_L(n_mSTsm(XORw*>f80*^0l6Jg9%1vkj$q)75*V9DAA6b+45|+tS3b#4IA4yA$Xrbar9w~!3i8G%Si>wVzr9+ cCB4bzEZk(<$gRAW_HjGm2Sn8@2_OLg022v { ['kernel.snapshotInitial', '3'], ['kernel.snapshotInterval', '200'], ['meter.nextID', '1'], + ['vats.terminated', '[]'], ]); }); @@ -243,6 +244,7 @@ test('kernelKeeper vat names', async t => { ['kernel.snapshotInitial', '3'], ['kernel.snapshotInterval', '200'], ['meter.nextID', '1'], + ['vats.terminated', '[]'], ]); t.deepEqual(k.getStaticVats(), [ ['Frank', 'v2'], @@ -296,6 +298,7 @@ test('kernelKeeper device names', async t => { ['kernel.snapshotInitial', '3'], ['kernel.snapshotInterval', '200'], ['meter.nextID', '1'], + ['vats.terminated', '[]'], ]); t.deepEqual(k.getDevices(), [ ['Frank', 'd8'], @@ -481,6 +484,7 @@ test('kernelKeeper promises', async t => { ['kernel.snapshotInitial', '3'], ['kernel.snapshotInterval', '200'], ['meter.nextID', '1'], + ['vats.terminated', '[]'], ]); k.deleteKernelObject(ko); diff --git a/packages/SwingSet/test/vat-admin/slow-termination/bootstrap-slow-terminate.js b/packages/SwingSet/test/vat-admin/slow-termination/bootstrap-slow-terminate.js new file mode 100644 index 000000000000..8176de2a9248 --- /dev/null +++ b/packages/SwingSet/test/vat-admin/slow-termination/bootstrap-slow-terminate.js @@ -0,0 +1,54 @@ +import { Far, E } from '@endo/far'; + +export function buildRootObject(_vatPowers) { + let root; + let adminNode; + + const self = Far('root', { + async bootstrap(vats, devices) { + const svc = E(vats.vatAdmin).createVatAdminService(devices.vatAdmin); + // create a dynamic vat, send it a message and let it respond, to make + // sure everything is working + const dude = await E(svc).createVatByName('dude'); + root = dude.root; + adminNode = dude.adminNode; + await E(root).alive(); + + // set up 20 "bootstrap exports, dude imports" c-list entries + const myExports = []; + for (let i = 0; i < 20; i += 1) { + myExports.push(Far('bootstrap export', {})); + } + await E(root).acceptImports(harden(myExports)); + + // set up 20 "dude exports, bootstrap imports" c-list entries + /* eslint-disable-next-line no-unused-vars */ + const myImports = await E(root).sendExports(20); + + // ask dude to creates 20 vatstore entries (in addition to the + // built-in liveslots stuff) + await E(root).makeVatstore(20); + + return 'bootstrap done'; + }, + + async kill(mode) { + switch (mode) { + case 'kill': + await E(adminNode).terminateWithFailure(mode); + break; + case 'dieHappy': + await E(root).dieHappy(mode); + break; + default: + console.log('something terrible has happened'); + break; + } + // confirm the vat is dead, even though cleanup happens later + return E(root) + .ping() + .catch(err => `kill done, ${err}`); + }, + }); + return self; +} diff --git a/packages/SwingSet/test/vat-admin/slow-termination/test-slow-termination.js b/packages/SwingSet/test/vat-admin/slow-termination/test-slow-termination.js new file mode 100644 index 000000000000..08b8fd8bf08a --- /dev/null +++ b/packages/SwingSet/test/vat-admin/slow-termination/test-slow-termination.js @@ -0,0 +1,139 @@ +// @ts-nocheck +// eslint-disable-next-line import/order +import { test } from '../../../tools/prepare-test-env-ava.js'; + +import { kser } from '@agoric/kmarshal'; +import { initSwingStore } from '@agoric/swing-store'; + +import { buildVatController, buildKernelBundles } from '../../../src/index.js'; +import { enumeratePrefixedKeys } from '../../../src/kernel/state/storageHelper.js'; + +test.before(async t => { + const kernelBundles = await buildKernelBundles(); + t.context.data = { kernelBundles }; +}); + +const makeCleanupPolicy = () => { + let budget = 5; + let cleanups = 0; + const stop = () => false; + const policy = harden({ + vatCreated: stop, + crankComplete: stop, + crankFailed: stop, + emptyCrank: stop, + allowCleanup() { + if (budget > 0) { + return { budget }; + } else { + return false; + } + }, + didCleanup(spent) { + budget -= spent.cleanups; + cleanups += spent.cleanups; + }, + }); + const getCleanups = () => cleanups; + return [policy, getCleanups]; +}; + +const bfile = path => new URL(path, import.meta.url).pathname; + +async function doSlowTerminate(t, mode) { + const config = { + defaultReapInterval: 'never', + bootstrap: 'bootstrap', + bundles: { + dude: { + sourceSpec: bfile('vat-slow-terminate.js'), + }, + }, + vats: { + bootstrap: { + sourceSpec: bfile('bootstrap-slow-terminate.js'), + }, + }, + }; + const noCleanupPolicy = { + allowCleanup: () => false, + vatCreated: () => true, + crankComplete: () => true, + crankFailed: () => true, + emptyCrank: () => true, + didCleanup: () => true, + }; + + const kernelStorage = initSwingStore().kernelStorage; + const { kvStore } = kernelStorage; + const controller = await buildVatController(config, [], { + ...t.context.data, + kernelStorage, + }); + t.teardown(controller.shutdown); + t.is(controller.kpStatus(controller.bootstrapResult), 'unresolved'); + controller.pinVatRoot('bootstrap'); + await controller.run(noCleanupPolicy); + t.is(controller.kpStatus(controller.bootstrapResult), 'fulfilled'); + t.deepEqual( + controller.kpResolution(controller.bootstrapResult), + kser('bootstrap done'), + ); + + const vatID = JSON.parse(kvStore.get('vat.dynamicIDs'))[0]; + t.is(vatID, 'v6'); // change if necessary + const remaining = () => + Array.from(enumeratePrefixedKeys(kvStore, `${vatID}.`)); + + // 20*2 for imports, 20*2 for exports, 20*1 for vatstore = 100 + // plus 29 for usual liveslots stuff + t.is(remaining().length, 129); + t.false(JSON.parse(kvStore.get('vats.terminated')).includes(vatID)); + + // console.log(remaining()); + + const kpid = controller.queueToVatRoot('bootstrap', 'kill', [mode]); + await controller.run(noCleanupPolicy); + t.is(controller.kpStatus(kpid), 'fulfilled'); + t.deepEqual( + controller.kpResolution(kpid), + kser('kill done, Error: vat terminated'), + ); + + t.true(JSON.parse(kvStore.get('vats.terminated')).includes(vatID)); + // no cleanups were allowed, so nothing should be removed yet + t.truthy(kernelStorage.kvStore.get(`${vatID}.options`, undefined)); + t.is(remaining().length, 129); + + let left = remaining().length; + + while (left) { + console.log(left); + const oldLeft = left; + const [policy, getCleanups] = makeCleanupPolicy(); + await controller.run(policy); + left = remaining().length; + t.true(getCleanups() <= 5); + const removed = oldLeft - left; + console.log( + ` removed: ${removed}, cleanups: ${getCleanups()}, left: ${left}`, + ); + t.true(removed <= 10); + // t.is(remaining().length, 119); + + // TODO: use an XS worker, set snapInterval low enough to create + // multiple spans and a few snapshots, watch for slow deletion of + // spans and snapshots too + } + + t.false(JSON.parse(kvStore.get('vats.terminated')).includes(vatID)); + t.is(kernelStorage.kvStore.get(`${vatID}.options`, undefined)); +} + +test.serial('slow terminate (kill)', async t => { + await doSlowTerminate(t, 'kill'); +}); + +test.serial('slow terminate (die happy)', async t => { + await doSlowTerminate(t, 'dieHappy'); +}); diff --git a/packages/SwingSet/test/vat-admin/slow-termination/vat-slow-terminate.js b/packages/SwingSet/test/vat-admin/slow-termination/vat-slow-terminate.js new file mode 100644 index 000000000000..63749b8ff00d --- /dev/null +++ b/packages/SwingSet/test/vat-admin/slow-termination/vat-slow-terminate.js @@ -0,0 +1,22 @@ +import { Far } from '@endo/far'; + +export function buildRootObject(vatPowers, _vatParameters, baggage) { + const hold = []; + return Far('root', { + alive: () => true, + dieHappy: completion => vatPowers.exitVat(completion), + sendExports: count => { + const myExports = []; + for (let i = 0; i < count; i += 1) { + myExports.push(Far('dude export', {})); + } + return harden(myExports); + }, + acceptImports: imports => hold.push(imports), + makeVatstore: count => { + for (let i = 0; i < count; i += 1) { + baggage.init(`key-${i}`, i); + } + }, + }); +} diff --git a/packages/SwingSet/test/vat-admin/terminate/test-terminate.js b/packages/SwingSet/test/vat-admin/terminate/test-terminate.js index 750862e084b5..a4759943e477 100644 --- a/packages/SwingSet/test/vat-admin/terminate/test-terminate.js +++ b/packages/SwingSet/test/vat-admin/terminate/test-terminate.js @@ -150,8 +150,12 @@ async function doTerminateCritical( t.is(thrown.message, mode); } t.is(controller.kpStatus(kpid), dynamic ? 'unresolved' : 'fulfilled'); + // the kernel is supposed to crash before deleting vNN.options, + // although strictly speaking it doesn't matter because the host is + // supposed to crash too, abandoning the uncommitted swingstore + // changes const postProbe = kernelStorage.kvStore.get(`${deadVatID}.options`); - t.is(postProbe, undefined); + t.truthy(postProbe); } test.serial('terminate (dynamic, non-critical)', async t => {