From 8d3cc632ace5509cc2b52fb917aabba52089c231 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Tue, 26 Jan 2021 12:46:57 -0800 Subject: [PATCH 01/21] add support for running pre-migrations and caching their results This can significantly speedup state migrations. --- chain/stmgr/forks.go | 124 ++++++++++++++++++++++++++++++++++---- chain/stmgr/forks_test.go | 4 +- chain/stmgr/stmgr.go | 45 ++++++++++++-- node/builder.go | 2 +- node/modules/stmgr.go | 20 ++++++ 5 files changed, 176 insertions(+), 19 deletions(-) create mode 100644 node/modules/stmgr.go diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index e09744a67c1..8db1b9de4a7 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/binary" + "sort" "github.com/filecoin-project/go-state-types/rt" @@ -36,20 +37,68 @@ import ( "golang.org/x/xerrors" ) +// MigrationCache can be used to cache information used by a migration. This is primarily useful to +// "pre-compute" some migration state ahead of time, and make it accessible in the migration itself. +type MigrationCache interface { + Write(key string, value cid.Cid) error + Read(key string) (bool, cid.Cid, error) + Load(key string, loadFunc func() (cid.Cid, error)) (cid.Cid, error) +} + // UpgradeFunc is a migration function run at every upgrade. // +// - The cache is a per-upgrade cache, pre-populated by pre-migrations. // - The oldState is the state produced by the upgrade epoch. // - The returned newState is the new state that will be used by the next epoch. // - The height is the upgrade epoch height (already executed). // - The tipset is the tipset for the last non-null block before the upgrade. Do // not assume that ts.Height() is the upgrade height. -type UpgradeFunc func(ctx context.Context, sm *StateManager, cb ExecCallback, oldState cid.Cid, height abi.ChainEpoch, ts *types.TipSet) (newState cid.Cid, err error) +type UpgradeFunc func( + ctx context.Context, + sm *StateManager, cache MigrationCache, + cb ExecCallback, oldState cid.Cid, + height abi.ChainEpoch, ts *types.TipSet, +) (newState cid.Cid, err error) + +// PreUpgradeFunc is a function run _before_ a network upgrade to pre-compute part of the network +// upgrade and speed it up. +type PreUpgradeFunc func( + ctx context.Context, + sm *StateManager, cache MigrationCache, + oldState cid.Cid, + height abi.ChainEpoch, ts *types.TipSet, +) + +// PreUpgrade describes a pre-migration step to prepare for a network state upgrade. Pre-migrations +// are optimizations, are not guaranteed to run, and may be canceled and/or run multiple times. +type PreUpgrade struct { + // PreUpgrade is the pre-migration function to run at the specified time. This function is + // expected to run asynchronously and must abort promptly when canceled. + PreUpgrade PreUpgradeFunc + + // After specifies that this pre-migration should be started _after_ the given epoch. + // + // In case of chain reverts, the pre-migration will not be canceled and the state will not + // be reverted. + After abi.ChainEpoch + + // NotAfter specifies that this pre-migration should not be started after the given epoch. + // + // This should be set such that the pre-migration is likely to complete at least 5 epochs + // before the next pre-migration and/or upgrade epoch hits. + NotAfter abi.ChainEpoch +} type Upgrade struct { Height abi.ChainEpoch Network network.Version Expensive bool Migration UpgradeFunc + + // PreUpgrades specifies a set of pre-migration functions to run at the indicated epochs. 
+ // These functions should fill the given cache with information that can speed up the + // eventual full migration at the upgrade epoch. + PreUpgrades []PreUpgrade } type UpgradeSchedule []Upgrade @@ -164,9 +213,9 @@ func (us UpgradeSchedule) Validate() error { func (sm *StateManager) handleStateForks(ctx context.Context, root cid.Cid, height abi.ChainEpoch, cb ExecCallback, ts *types.TipSet) (cid.Cid, error) { retCid := root var err error - f, ok := sm.stateMigrations[height] - if ok { - retCid, err = f(ctx, sm, cb, root, height, ts) + u := sm.stateMigrations[height] + if u != nil && u.upgrade != nil { + retCid, err = u.upgrade(ctx, sm, u.cache, cb, root, height, ts) if err != nil { return cid.Undef, err } @@ -180,6 +229,59 @@ func (sm *StateManager) hasExpensiveFork(ctx context.Context, height abi.ChainEp return ok } +func (sm *StateManager) preMigrationWorker(ctx context.Context) { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + type op struct { + epoch abi.ChainEpoch + run func(ts *types.TipSet) + } + + // Turn each pre-migration into an operation in a schedule. + var schedule []op + for _, migration := range sm.stateMigrations { + cache := migration.cache + for _, prem := range migration.preMigrations { + // TODO: make sure the schedule makes sense after < notafter, etc. + preCtx, preCancel := context.WithCancel(ctx) + migrationFunc := prem.PreUpgrade + + // Add an op to start a pre-migration. + schedule = append(schedule, op{ + epoch: prem.After, + + // TODO: are these values correct? + run: func(ts *types.TipSet) { migrationFunc(preCtx, sm, cache, ts.ParentState(), ts.Height(), ts) }, + }) + + // Add an op to cancle the pre-migration if it's still running. + schedule = append(schedule, op{ + epoch: prem.NotAfter, + run: func(ts *types.TipSet) { preCancel() }, + }) + } + } + + // Then sort by epoch. + sort.Slice(schedule, func(i, j int) bool { + return schedule[i].epoch < schedule[j].epoch + }) + + // Finally, when the head changes, see if there's anything we need to do. 
+ for change := range sm.cs.SubHeadChanges(ctx) { + for _, head := range change { + for len(schedule) > 0 { + if head.Val.Height() < schedule[0].epoch { + break + } + schedule[0].run(head.Val) + schedule = schedule[1:] + } + } + } +} + func doTransfer(tree types.StateTree, from, to address.Address, amt abi.TokenAmount, cb func(trace types.ExecutionTrace)) error { fromAct, err := tree.GetActor(from) if err != nil { @@ -233,7 +335,7 @@ func doTransfer(tree types.StateTree, from, to address.Address, amt abi.TokenAmo return nil } -func UpgradeFaucetBurnRecovery(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { +func UpgradeFaucetBurnRecovery(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { // Some initial parameters FundsForMiners := types.FromFil(1_000_000) LookbackEpoch := abi.ChainEpoch(32000) @@ -519,7 +621,7 @@ func UpgradeFaucetBurnRecovery(ctx context.Context, sm *StateManager, cb ExecCal return tree.Flush(ctx) } -func UpgradeIgnition(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { +func UpgradeIgnition(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { store := sm.cs.Store(ctx) if build.UpgradeLiftoffHeight <= epoch { @@ -574,7 +676,7 @@ func UpgradeIgnition(ctx context.Context, sm *StateManager, cb ExecCallback, roo return tree.Flush(ctx) } -func UpgradeRefuel(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { +func UpgradeRefuel(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { store := sm.cs.Store(ctx) tree, err := sm.StateTree(root) @@ -600,7 +702,7 @@ func UpgradeRefuel(ctx context.Context, sm *StateManager, cb ExecCallback, root return tree.Flush(ctx) } -func UpgradeActorsV2(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { +func UpgradeActorsV2(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { buf := bufbstore.NewTieredBstore(sm.cs.Blockstore(), bstore.NewTemporarySync()) store := store.ActorStore(ctx, buf) @@ -646,7 +748,7 @@ func UpgradeActorsV2(ctx context.Context, sm *StateManager, cb ExecCallback, roo return newRoot, nil } -func UpgradeLiftoff(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { +func UpgradeLiftoff(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { tree, err := sm.StateTree(root) if err != nil { return cid.Undef, xerrors.Errorf("getting state tree: %w", err) @@ -660,7 +762,7 @@ func UpgradeLiftoff(ctx context.Context, sm *StateManager, cb ExecCallback, root return tree.Flush(ctx) } -func UpgradeCalico(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { +func UpgradeCalico(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { 
store := sm.cs.Store(ctx) var stateRoot types.StateRoot if err := store.Get(ctx, root, &stateRoot); err != nil { @@ -702,7 +804,7 @@ func UpgradeCalico(ctx context.Context, sm *StateManager, cb ExecCallback, root return newRoot, nil } -func UpgradeActorsV3(ctx context.Context, sm *StateManager, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { +func UpgradeActorsV3(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { buf := bufbstore.NewTieredBstore(sm.cs.Blockstore(), bstore.NewTemporarySync()) store := store.ActorStore(ctx, buf) diff --git a/chain/stmgr/forks_test.go b/chain/stmgr/forks_test.go index a2b7a179fba..80ebb491bd1 100644 --- a/chain/stmgr/forks_test.go +++ b/chain/stmgr/forks_test.go @@ -122,7 +122,7 @@ func TestForkHeightTriggers(t *testing.T) { cg.ChainStore(), UpgradeSchedule{{ Network: 1, Height: testForkHeight, - Migration: func(ctx context.Context, sm *StateManager, cb ExecCallback, + Migration: func(ctx context.Context, sm *StateManager, cache MigrationCache, cb ExecCallback, root cid.Cid, height abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { cst := ipldcbor.NewCborStore(sm.ChainStore().Blockstore()) @@ -252,7 +252,7 @@ func TestForkRefuseCall(t *testing.T) { Network: 1, Expensive: true, Height: testForkHeight, - Migration: func(ctx context.Context, sm *StateManager, cb ExecCallback, + Migration: func(ctx context.Context, sm *StateManager, cache MigrationCache, cb ExecCallback, root cid.Cid, height abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { return root, nil }}}) diff --git a/chain/stmgr/stmgr.go b/chain/stmgr/stmgr.go index 84e9e1744f0..e4dd816d1c7 100644 --- a/chain/stmgr/stmgr.go +++ b/chain/stmgr/stmgr.go @@ -20,6 +20,7 @@ import ( // Used for genesis. msig0 "github.com/filecoin-project/specs-actors/actors/builtin/multisig" + "github.com/filecoin-project/specs-actors/v3/actors/migration/nv10" // we use the same adt for all receipts blockadt "github.com/filecoin-project/specs-actors/actors/util/adt" @@ -62,15 +63,24 @@ type versionSpec struct { atOrBelow abi.ChainEpoch } +type migration struct { + upgrade UpgradeFunc + preMigrations []PreUpgrade + cache MigrationCache +} + type StateManager struct { cs *store.ChainStore + ctx context.Context + cancel context.CancelFunc + // Determines the network version at any given epoch. networkVersions []versionSpec latestVersion network.Version - // Maps chain epochs to upgrade functions. - stateMigrations map[abi.ChainEpoch]UpgradeFunc + // Maps chain epochs to migrations. + stateMigrations map[abi.ChainEpoch]*migration // A set of potentially expensive/time consuming upgrades. Explicit // calls for, e.g., gas estimation fail against this epoch with // ErrExpensiveFork. @@ -103,7 +113,7 @@ func NewStateManagerWithUpgradeSchedule(cs *store.ChainStore, us UpgradeSchedule return nil, err } - stateMigrations := make(map[abi.ChainEpoch]UpgradeFunc, len(us)) + stateMigrations := make(map[abi.ChainEpoch]*migration, len(us)) expensiveUpgrades := make(map[abi.ChainEpoch]struct{}, len(us)) var networkVersions []versionSpec lastVersion := network.Version0 @@ -111,8 +121,13 @@ func NewStateManagerWithUpgradeSchedule(cs *store.ChainStore, us UpgradeSchedule // If we have any upgrades, process them and create a version // schedule. 
for _, upgrade := range us { - if upgrade.Migration != nil { - stateMigrations[upgrade.Height] = upgrade.Migration + if upgrade.Migration != nil || upgrade.PreUpgrades != nil { + migration := &migration{ + upgrade: upgrade.Migration, + preMigrations: upgrade.PreUpgrades, + cache: nv10.NewMemMigrationCache(), + } + stateMigrations[upgrade.Height] = migration } if upgrade.Expensive { expensiveUpgrades[upgrade.Height] = struct{}{} @@ -148,6 +163,26 @@ func cidsToKey(cids []cid.Cid) string { return out } +// Start starts the state manager's optional background processes. At the moment, this schedules +// pre-migration functions to run ahead of network upgrades. +// +// This is method is not safe to invoke from multiple threads or concurrently with Stop. +func (sm *StateManager) Start(context.Context) error { + sm.ctx, sm.cancel = context.WithCancel(context.Background()) + go sm.preMigrationWorker(sm.ctx) + return nil +} + +// Stop starts the state manager's background processes. +// +// This is method is not safe to invoke from multiple threads or concurrently with Start. +func (sm *StateManager) Stop(context.Context) error { + if sm.cancel != nil { + sm.cancel() + } + return nil +} + func (sm *StateManager) TipSetState(ctx context.Context, ts *types.TipSet) (st cid.Cid, rec cid.Cid, err error) { ctx, span := trace.StartSpan(ctx, "tipSetState") defer span.End() diff --git a/node/builder.go b/node/builder.go index 8ee9b367440..1dd60ee1b32 100644 --- a/node/builder.go +++ b/node/builder.go @@ -269,7 +269,7 @@ func Online() Option { Override(new(vm.SyscallBuilder), vm.Syscalls), Override(new(*store.ChainStore), modules.ChainStore), Override(new(stmgr.UpgradeSchedule), stmgr.DefaultUpgradeSchedule()), - Override(new(*stmgr.StateManager), stmgr.NewStateManagerWithUpgradeSchedule), + Override(new(*stmgr.StateManager), modules.StateManager), Override(new(*wallet.LocalWallet), wallet.NewWallet), Override(new(wallet.Default), From(new(*wallet.LocalWallet))), Override(new(api.WalletAPI), From(new(wallet.MultiWallet))), diff --git a/node/modules/stmgr.go b/node/modules/stmgr.go new file mode 100644 index 00000000000..9d3917b856f --- /dev/null +++ b/node/modules/stmgr.go @@ -0,0 +1,20 @@ +package modules + +import ( + "go.uber.org/fx" + + "github.com/filecoin-project/lotus/chain/stmgr" + "github.com/filecoin-project/lotus/chain/store" +) + +func StateManager(lc fx.Lifecycle, cs *store.ChainStore, us stmgr.UpgradeSchedule) (*stmgr.StateManager, error) { + sm, err := stmgr.NewStateManagerWithUpgradeSchedule(cs, us) + if err != nil { + return nil, err + } + lc.Append(fx.Hook{ + OnStart: sm.Start, + OnStop: sm.Stop, + }) + return sm, nil +} From 8986a2002d686ff0ff5bb69c9f52e159f6d0eac7 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Tue, 26 Jan 2021 14:57:38 -0800 Subject: [PATCH 02/21] wire up re-migration logic for nv10 --- chain/stmgr/forks.go | 119 ++++++++++++++++++++++++++++--------------- 1 file changed, 79 insertions(+), 40 deletions(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index 8db1b9de4a7..90c6438626a 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -67,7 +67,7 @@ type PreUpgradeFunc func( sm *StateManager, cache MigrationCache, oldState cid.Cid, height abi.ChainEpoch, ts *types.TipSet, -) +) error // PreUpgrade describes a pre-migration step to prepare for a network state upgrade. Pre-migrations // are optimizations, are not guaranteed to run, and may be canceled and/or run multiple times. 
@@ -76,13 +76,11 @@ type PreUpgrade struct { // expected to run asynchronously and must abort promptly when canceled. PreUpgrade PreUpgradeFunc - // After specifies that this pre-migration should be started _after_ the given epoch. - // - // In case of chain reverts, the pre-migration will not be canceled and the state will not - // be reverted. - After abi.ChainEpoch + // When specifies that this pre-migration should be started at most When epochs before the upgrade. + When abi.ChainEpoch - // NotAfter specifies that this pre-migration should not be started after the given epoch. + // NotAfter specifies that this pre-migration should not be started NotAfter epochs before + // the final upgrade epoch. // // This should be set such that the pre-migration is likely to complete at least 5 epochs // before the next pre-migration and/or upgrade epoch hits. @@ -170,6 +168,15 @@ func DefaultUpgradeSchedule() UpgradeSchedule { Height: build.UpgradeActorsV3Height, Network: network.Version10, Migration: UpgradeActorsV3, + PreUpgrades: []PreUpgrade{{ + PreUpgrade: PreUpgradeActorsV3, + When: 120, + NotAfter: 60, + }, { + PreUpgrade: PreUpgradeActorsV3, + When: 30, + NotAfter: 20, + }}, Expensive: true, }} @@ -234,48 +241,66 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { defer cancel() type op struct { - epoch abi.ChainEpoch - run func(ts *types.TipSet) + after abi.ChainEpoch + notAfter abi.ChainEpoch + run func(ts *types.TipSet) error } // Turn each pre-migration into an operation in a schedule. var schedule []op - for _, migration := range sm.stateMigrations { + for upgradeEpoch, migration := range sm.stateMigrations { cache := migration.cache for _, prem := range migration.preMigrations { - // TODO: make sure the schedule makes sense after < notafter, etc. preCtx, preCancel := context.WithCancel(ctx) migrationFunc := prem.PreUpgrade + afterEpoch := upgradeEpoch - prem.When + notAfterEpoch := upgradeEpoch - prem.NotAfter + // Add an op to start a pre-migration. schedule = append(schedule, op{ - epoch: prem.After, + after: afterEpoch, + notAfter: notAfterEpoch, // TODO: are these values correct? - run: func(ts *types.TipSet) { migrationFunc(preCtx, sm, cache, ts.ParentState(), ts.Height(), ts) }, + run: func(ts *types.TipSet) error { + return migrationFunc(preCtx, sm, cache, ts.ParentState(), ts.Height(), ts) + }, }) - // Add an op to cancle the pre-migration if it's still running. + // Add an op to cancel the pre-migration if it's still running. schedule = append(schedule, op{ - epoch: prem.NotAfter, - run: func(ts *types.TipSet) { preCancel() }, + after: notAfterEpoch, + notAfter: -1, + run: func(ts *types.TipSet) error { + preCancel() + return nil + }, }) } } // Then sort by epoch. sort.Slice(schedule, func(i, j int) bool { - return schedule[i].epoch < schedule[j].epoch + return schedule[i].after < schedule[j].after }) // Finally, when the head changes, see if there's anything we need to do. for change := range sm.cs.SubHeadChanges(ctx) { for _, head := range change { for len(schedule) > 0 { - if head.Val.Height() < schedule[0].epoch { + op := &schedule[0] + if head.Val.Height() < op.after { break } - schedule[0].run(head.Val) + + // If we haven't passed the pre-migration height... 
+ if op.notAfter < 0 || head.Val.Height() <= op.notAfter { + err := op.run(head.Val) + if err != nil { + log.Errorw("failed to run pre-migration", "error", err) + } + } schedule = schedule[1:] } } @@ -804,12 +829,45 @@ func UpgradeCalico(ctx context.Context, sm *StateManager, _ MigrationCache, cb E return newRoot, nil } -func UpgradeActorsV3(ctx context.Context, sm *StateManager, _ MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { +func UpgradeActorsV3(ctx context.Context, sm *StateManager, cache MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { + // TODO: tune this. + config := nv10.Config{MaxWorkers: 1} + newRoot, err := upgradeActorsV3Common(ctx, sm, cache, root, epoch, ts, config) + if err != nil { + return cid.Undef, xerrors.Errorf("failed to persist new state root: %w", err) + } + + // perform some basic sanity checks to make sure everything still works. + store := store.ActorStore(ctx, sm.ChainStore().Blockstore()) + if newSm, err := state.LoadStateTree(store, newRoot); err != nil { + return cid.Undef, xerrors.Errorf("state tree sanity load failed: %w", err) + } else if newRoot2, err := newSm.Flush(ctx); err != nil { + return cid.Undef, xerrors.Errorf("state tree sanity flush failed: %w", err) + } else if newRoot2 != newRoot { + return cid.Undef, xerrors.Errorf("state-root mismatch: %s != %s", newRoot, newRoot2) + } else if _, err := newSm.GetActor(init_.Address); err != nil { + return cid.Undef, xerrors.Errorf("failed to load init actor after upgrade: %w", err) + } + + return newRoot, nil +} + +func PreUpgradeActorsV3(ctx context.Context, sm *StateManager, cache MigrationCache, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) error { + // TODO: tune this. + config := nv10.Config{MaxWorkers: 1} + _, err := upgradeActorsV3Common(ctx, sm, cache, root, epoch, ts, config) + return err +} + +func upgradeActorsV3Common( + ctx context.Context, sm *StateManager, cache MigrationCache, + root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet, + config nv10.Config, +) (cid.Cid, error) { buf := bufbstore.NewTieredBstore(sm.cs.Blockstore(), bstore.NewTemporarySync()) store := store.ActorStore(ctx, buf) // Load the state root. - var stateRoot types.StateRoot if err := store.Get(ctx, root, &stateRoot); err != nil { return cid.Undef, xerrors.Errorf("failed to decode state root: %w", err) @@ -823,18 +881,12 @@ func UpgradeActorsV3(ctx context.Context, sm *StateManager, _ MigrationCache, cb } // Perform the migration - - // TODO: store this somewhere and pre-migrate - cache := nv10.NewMemMigrationCache() - // TODO: tune this. - config := nv10.Config{MaxWorkers: 1} newHamtRoot, err := nv10.MigrateStateTree(ctx, store, stateRoot.Actors, epoch, config, migrationLogger{}, cache) if err != nil { return cid.Undef, xerrors.Errorf("upgrading to actors v2: %w", err) } // Persist the result. - newRoot, err := store.Put(ctx, &types.StateRoot{ Version: types.StateTreeVersion2, Actors: newHamtRoot, @@ -844,19 +896,6 @@ func UpgradeActorsV3(ctx context.Context, sm *StateManager, _ MigrationCache, cb return cid.Undef, xerrors.Errorf("failed to persist new state root: %w", err) } - // Check the result. - - // perform some basic sanity checks to make sure everything still works. 
- if newSm, err := state.LoadStateTree(store, newRoot); err != nil { - return cid.Undef, xerrors.Errorf("state tree sanity load failed: %w", err) - } else if newRoot2, err := newSm.Flush(ctx); err != nil { - return cid.Undef, xerrors.Errorf("state tree sanity flush failed: %w", err) - } else if newRoot2 != newRoot { - return cid.Undef, xerrors.Errorf("state-root mismatch: %s != %s", newRoot, newRoot2) - } else if _, err := newSm.GetActor(init_.Address); err != nil { - return cid.Undef, xerrors.Errorf("failed to load init actor after upgrade: %w", err) - } - // Persist the new tree. { From 8d05c5d62c88db1b98deb0145142664eefe49629 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Tue, 26 Jan 2021 15:00:55 -0800 Subject: [PATCH 03/21] validate pre-migrations --- chain/stmgr/forks.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index 90c6438626a..4756711f4fd 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -191,14 +191,20 @@ func DefaultUpgradeSchedule() UpgradeSchedule { } func (us UpgradeSchedule) Validate() error { - // Make sure we're not trying to upgrade to version 0. + // Make sure each upgrade is valid. for _, u := range us { if u.Network <= 0 { return xerrors.Errorf("cannot upgrade to version <= 0: %d", u.Network) } + + for _, m := range u.PreUpgrades { + if m.When <= m.NotAfter { + return xerrors.Errorf("pre-migration cannot end before it starts: %d <= %d", m.When, m.NotAfter) + } + } } - // Make sure all the upgrades make sense. + // Make sure the upgrade order makes sense. for i := 1; i < len(us); i++ { prev := &us[i-1] curr := &us[i] From f65d179f2c162c1ac6225947f6805a91b00c2d40 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Tue, 26 Jan 2021 15:05:12 -0800 Subject: [PATCH 04/21] make pre-migrations async --- chain/stmgr/forks.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index 4756711f4fd..f433fd68070 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -73,7 +73,7 @@ type PreUpgradeFunc func( // are optimizations, are not guaranteed to run, and may be canceled and/or run multiple times. type PreUpgrade struct { // PreUpgrade is the pre-migration function to run at the specified time. This function is - // expected to run asynchronously and must abort promptly when canceled. + // run asynchronously and must abort promptly when canceled. PreUpgrade PreUpgradeFunc // When specifies that this pre-migration should be started at most When epochs before the upgrade. @@ -249,7 +249,7 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { type op struct { after abi.ChainEpoch notAfter abi.ChainEpoch - run func(ts *types.TipSet) error + run func(ts *types.TipSet) } // Turn each pre-migration into an operation in a schedule. @@ -269,8 +269,14 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { notAfter: notAfterEpoch, // TODO: are these values correct? 
- run: func(ts *types.TipSet) error { - return migrationFunc(preCtx, sm, cache, ts.ParentState(), ts.Height(), ts) + run: func(ts *types.TipSet) { + go func() { + err := migrationFunc(preCtx, sm, cache, ts.ParentState(), ts.Height(), ts) + if err != nil { + log.Errorw("failed to run pre-migration", + "error", err) + } + }() }, }) @@ -278,10 +284,7 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { schedule = append(schedule, op{ after: notAfterEpoch, notAfter: -1, - run: func(ts *types.TipSet) error { - preCancel() - return nil - }, + run: func(ts *types.TipSet) { preCancel() }, }) } } @@ -302,10 +305,7 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { // If we haven't passed the pre-migration height... if op.notAfter < 0 || head.Val.Height() <= op.notAfter { - err := op.run(head.Val) - if err != nil { - log.Errorw("failed to run pre-migration", "error", err) - } + op.run(head.Val) } schedule = schedule[1:] } From 77117a0be5a875367029206774d885f58e7cb7d9 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Tue, 26 Jan 2021 15:11:31 -0800 Subject: [PATCH 05/21] wait for pre-upgrades to exit on stop --- chain/stmgr/forks.go | 8 ++++++++ chain/stmgr/stmgr.go | 19 +++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index f433fd68070..ffb8e70ce97 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -5,6 +5,7 @@ import ( "context" "encoding/binary" "sort" + "sync" "github.com/filecoin-project/go-state-types/rt" @@ -243,6 +244,8 @@ func (sm *StateManager) hasExpensiveFork(ctx context.Context, height abi.ChainEp } func (sm *StateManager) preMigrationWorker(ctx context.Context) { + defer close(sm.shutdown) + ctx, cancel := context.WithCancel(ctx) defer cancel() @@ -252,6 +255,9 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { run func(ts *types.TipSet) } + var wg sync.WaitGroup + defer wg.Wait() + // Turn each pre-migration into an operation in a schedule. var schedule []op for upgradeEpoch, migration := range sm.stateMigrations { @@ -270,7 +276,9 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { // TODO: are these values correct? run: func(ts *types.TipSet) { + wg.Add(1) go func() { + defer wg.Done() err := migrationFunc(preCtx, sm, cache, ts.ParentState(), ts.Height(), ts) if err != nil { log.Errorw("failed to run pre-migration", diff --git a/chain/stmgr/stmgr.go b/chain/stmgr/stmgr.go index e4dd816d1c7..4f4524d583b 100644 --- a/chain/stmgr/stmgr.go +++ b/chain/stmgr/stmgr.go @@ -72,8 +72,8 @@ type migration struct { type StateManager struct { cs *store.ChainStore - ctx context.Context - cancel context.CancelFunc + cancel context.CancelFunc + shutdown chan struct{} // Determines the network version at any given epoch. networkVersions []versionSpec @@ -168,17 +168,24 @@ func cidsToKey(cids []cid.Cid) string { // // This is method is not safe to invoke from multiple threads or concurrently with Stop. func (sm *StateManager) Start(context.Context) error { - sm.ctx, sm.cancel = context.WithCancel(context.Background()) - go sm.preMigrationWorker(sm.ctx) + var ctx context.Context + ctx, sm.cancel = context.WithCancel(context.Background()) + sm.shutdown = make(chan struct{}) + go sm.preMigrationWorker(ctx) return nil } // Stop starts the state manager's background processes. // -// This is method is not safe to invoke from multiple threads or concurrently with Start. 
-func (sm *StateManager) Stop(context.Context) error { +// This is method is not safe to invoke concurrently with Start. +func (sm *StateManager) Stop(ctx context.Context) error { if sm.cancel != nil { sm.cancel() + select { + case <-sm.shutdown: + case <-ctx.Done(): + return ctx.Err() + } } return nil } From 6362887ce3e5b6d5b32984de2ccacb1717501a71 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Tue, 26 Jan 2021 15:15:14 -0800 Subject: [PATCH 06/21] rename Upgrade to Migration where applicable This was really confusing. --- chain/stmgr/forks.go | 40 ++++++++++++++++++++-------------------- chain/stmgr/stmgr.go | 8 ++++---- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index ffb8e70ce97..c054ee6a6a8 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -46,7 +46,7 @@ type MigrationCache interface { Load(key string, loadFunc func() (cid.Cid, error)) (cid.Cid, error) } -// UpgradeFunc is a migration function run at every upgrade. +// MigrationFunc is a migration function run at every upgrade. // // - The cache is a per-upgrade cache, pre-populated by pre-migrations. // - The oldState is the state produced by the upgrade epoch. @@ -54,28 +54,28 @@ type MigrationCache interface { // - The height is the upgrade epoch height (already executed). // - The tipset is the tipset for the last non-null block before the upgrade. Do // not assume that ts.Height() is the upgrade height. -type UpgradeFunc func( +type MigrationFunc func( ctx context.Context, sm *StateManager, cache MigrationCache, cb ExecCallback, oldState cid.Cid, height abi.ChainEpoch, ts *types.TipSet, ) (newState cid.Cid, err error) -// PreUpgradeFunc is a function run _before_ a network upgrade to pre-compute part of the network +// PreMigrationFunc is a function run _before_ a network upgrade to pre-compute part of the network // upgrade and speed it up. -type PreUpgradeFunc func( +type PreMigrationFunc func( ctx context.Context, sm *StateManager, cache MigrationCache, oldState cid.Cid, height abi.ChainEpoch, ts *types.TipSet, ) error -// PreUpgrade describes a pre-migration step to prepare for a network state upgrade. Pre-migrations +// PreMigration describes a pre-migration step to prepare for a network state upgrade. Pre-migrations // are optimizations, are not guaranteed to run, and may be canceled and/or run multiple times. -type PreUpgrade struct { - // PreUpgrade is the pre-migration function to run at the specified time. This function is +type PreMigration struct { + // PreMigration is the pre-migration function to run at the specified time. This function is // run asynchronously and must abort promptly when canceled. - PreUpgrade PreUpgradeFunc + PreMigration PreMigrationFunc // When specifies that this pre-migration should be started at most When epochs before the upgrade. When abi.ChainEpoch @@ -92,12 +92,12 @@ type Upgrade struct { Height abi.ChainEpoch Network network.Version Expensive bool - Migration UpgradeFunc + Migration MigrationFunc - // PreUpgrades specifies a set of pre-migration functions to run at the indicated epochs. + // PreMigrations specifies a set of pre-migration functions to run at the indicated epochs. // These functions should fill the given cache with information that can speed up the // eventual full migration at the upgrade epoch. 
- PreUpgrades []PreUpgrade + PreMigrations []PreMigration } type UpgradeSchedule []Upgrade @@ -169,14 +169,14 @@ func DefaultUpgradeSchedule() UpgradeSchedule { Height: build.UpgradeActorsV3Height, Network: network.Version10, Migration: UpgradeActorsV3, - PreUpgrades: []PreUpgrade{{ - PreUpgrade: PreUpgradeActorsV3, - When: 120, - NotAfter: 60, + PreMigrations: []PreMigration{{ + PreMigration: PreUpgradeActorsV3, + When: 120, + NotAfter: 60, }, { - PreUpgrade: PreUpgradeActorsV3, - When: 30, - NotAfter: 20, + PreMigration: PreUpgradeActorsV3, + When: 30, + NotAfter: 20, }}, Expensive: true, }} @@ -198,7 +198,7 @@ func (us UpgradeSchedule) Validate() error { return xerrors.Errorf("cannot upgrade to version <= 0: %d", u.Network) } - for _, m := range u.PreUpgrades { + for _, m := range u.PreMigrations { if m.When <= m.NotAfter { return xerrors.Errorf("pre-migration cannot end before it starts: %d <= %d", m.When, m.NotAfter) } @@ -264,7 +264,7 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { cache := migration.cache for _, prem := range migration.preMigrations { preCtx, preCancel := context.WithCancel(ctx) - migrationFunc := prem.PreUpgrade + migrationFunc := prem.PreMigration afterEpoch := upgradeEpoch - prem.When notAfterEpoch := upgradeEpoch - prem.NotAfter diff --git a/chain/stmgr/stmgr.go b/chain/stmgr/stmgr.go index 4f4524d583b..b0018ddc01f 100644 --- a/chain/stmgr/stmgr.go +++ b/chain/stmgr/stmgr.go @@ -64,8 +64,8 @@ type versionSpec struct { } type migration struct { - upgrade UpgradeFunc - preMigrations []PreUpgrade + upgrade MigrationFunc + preMigrations []PreMigration cache MigrationCache } @@ -121,10 +121,10 @@ func NewStateManagerWithUpgradeSchedule(cs *store.ChainStore, us UpgradeSchedule // If we have any upgrades, process them and create a version // schedule. for _, upgrade := range us { - if upgrade.Migration != nil || upgrade.PreUpgrades != nil { + if upgrade.Migration != nil || upgrade.PreMigrations != nil { migration := &migration{ upgrade: upgrade.Migration, - preMigrations: upgrade.PreUpgrades, + preMigrations: upgrade.PreMigrations, cache: nv10.NewMemMigrationCache(), } stateMigrations[upgrade.Height] = migration From b08abc12ba3a83eca40901476dae0eb5a2cc7c32 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Tue, 26 Jan 2021 15:21:55 -0800 Subject: [PATCH 07/21] fix error message --- chain/stmgr/forks.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index c054ee6a6a8..268db11ed32 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -848,7 +848,7 @@ func UpgradeActorsV3(ctx context.Context, sm *StateManager, cache MigrationCache config := nv10.Config{MaxWorkers: 1} newRoot, err := upgradeActorsV3Common(ctx, sm, cache, root, epoch, ts, config) if err != nil { - return cid.Undef, xerrors.Errorf("failed to persist new state root: %w", err) + return cid.Undef, xerrors.Errorf("migrating actors v3 state: %w", err) } // perform some basic sanity checks to make sure everything still works. From 32059d0cbf009906bd500752fd13d417e9a9145d Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Tue, 26 Jan 2021 17:08:06 -0800 Subject: [PATCH 08/21] stmgr: only persist the migration cache on success Otherwise, we may end up referencing blocks we never wrote to disk. 
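In sketch form (illustrative only, not part of the diff below; MemMigrationCache, Clone and Update come from the nv10 migration package used in this series): run the migration against a clone of the shared cache and fold the clone back in only once the migration, and therefore the blockstore writes it depends on, has actually succeeded.

package example

import (
    "github.com/filecoin-project/specs-actors/v3/actors/migration/nv10"
    "github.com/ipfs/go-cid"

    "github.com/filecoin-project/lotus/chain/stmgr"
)

// withScratchCache runs fn against a clone of the shared cache and only folds
// the clone back in once fn has succeeded, so a failed run can never leave the
// cache pointing at blocks that were never written to disk.
func withScratchCache(shared *nv10.MemMigrationCache, fn func(stmgr.MigrationCache) (cid.Cid, error)) (cid.Cid, error) {
    tmp := shared.Clone()
    root, err := fn(tmp)
    if err != nil {
        return cid.Undef, err // the shared cache is left untouched on failure
    }
    shared.Update(tmp) // persist the cached entries only after success
    return root, nil
}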
--- chain/stmgr/forks.go | 18 +++++++++++++++++- chain/stmgr/stmgr.go | 2 +- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index 268db11ed32..5018653c76e 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -229,10 +229,17 @@ func (sm *StateManager) handleStateForks(ctx context.Context, root cid.Cid, heig var err error u := sm.stateMigrations[height] if u != nil && u.upgrade != nil { - retCid, err = u.upgrade(ctx, sm, u.cache, cb, root, height, ts) + // Yes, we clone the cache, even for the final upgrade epoch. Why? Reverts. We may + // have to migrate multiple times. + tmpCache := u.cache.Clone() + retCid, err = u.upgrade(ctx, sm, tmpCache, cb, root, height, ts) if err != nil { return cid.Undef, err } + // Yes, we update the cache, even for the final upgrade epoch. Why? Reverts. This + // can save us a _lot_ of time because very few actors will have changed if we + // do a small revert then need to re-run the migration. + u.cache.Update(tmpCache) } return retCid, nil @@ -279,11 +286,20 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { wg.Add(1) go func() { defer wg.Done() + + // Clone the cache so we don't actually _update_ it + // till we're done. Otherwise, if we fail, the next + // migration to use the cache may assume that + // certain blocks exist, even if they don't. + tmpCache := cache.Clone() err := migrationFunc(preCtx, sm, cache, ts.ParentState(), ts.Height(), ts) if err != nil { log.Errorw("failed to run pre-migration", "error", err) + return } + // Finally, if everything worked, update the cache. + cache.Update(tmpCache) }() }, }) diff --git a/chain/stmgr/stmgr.go b/chain/stmgr/stmgr.go index b0018ddc01f..8598ac23d29 100644 --- a/chain/stmgr/stmgr.go +++ b/chain/stmgr/stmgr.go @@ -66,7 +66,7 @@ type versionSpec struct { type migration struct { upgrade MigrationFunc preMigrations []PreMigration - cache MigrationCache + cache *nv10.MemMigrationCache } type StateManager struct { From 05026a23b94d9832e0c3b6442e595b7a4da3dfee Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Tue, 26 Jan 2021 18:44:47 -0800 Subject: [PATCH 09/21] actually use the temp cache for pre-migrations --- chain/stmgr/forks.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index 5018653c76e..66d30390e30 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -292,7 +292,7 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { // migration to use the cache may assume that // certain blocks exist, even if they don't. 
tmpCache := cache.Clone() - err := migrationFunc(preCtx, sm, cache, ts.ParentState(), ts.Height(), ts) + err := migrationFunc(preCtx, sm, tmpCache, ts.ParentState(), ts.Height(), ts) if err != nil { log.Errorw("failed to run pre-migration", "error", err) From ed3e0869c36e5b10417450e309cd9a4fb34ef73e Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Tue, 26 Jan 2021 18:45:02 -0800 Subject: [PATCH 10/21] register tipsets with the chainstore when testing --- chain/gen/gen.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/chain/gen/gen.go b/chain/gen/gen.go index 8e749095cf3..0b7f6eda4b0 100644 --- a/chain/gen/gen.go +++ b/chain/gen/gen.go @@ -465,7 +465,12 @@ func (cg *ChainGen) NextTipSetFromMinersWithMessages(base *types.TipSet, miners } } - return store.NewFullTipSet(blks), nil + fts := store.NewFullTipSet(blks) + if err := cg.cs.PutTipSet(context.TODO(), fts.TipSet()); err != nil { + return nil, err + } + + return fts, nil } func (cg *ChainGen) makeBlock(parents *types.TipSet, m address.Address, vrfticket *types.Ticket, From 4072f24bf27e40166c153f279c9704426aaa848a Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Tue, 26 Jan 2021 18:45:50 -0800 Subject: [PATCH 11/21] test pre-migrations --- chain/stmgr/forks_test.go | 147 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/chain/stmgr/forks_test.go b/chain/stmgr/forks_test.go index 80ebb491bd1..92cc333498d 100644 --- a/chain/stmgr/forks_test.go +++ b/chain/stmgr/forks_test.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "io" + "sync" "testing" "github.com/ipfs/go-cid" @@ -317,3 +318,149 @@ func TestForkRefuseCall(t *testing.T) { } } } + +func TestForkPreMigration(t *testing.T) { + logging.SetAllLoggers(logging.LevelInfo) + + cg, err := gen.NewGenerator() + if err != nil { + t.Fatal(err) + } + + fooCid, err := abi.CidBuilder.Sum([]byte("foo")) + require.NoError(t, err) + + barCid, err := abi.CidBuilder.Sum([]byte("bar")) + require.NoError(t, err) + + failCid, err := abi.CidBuilder.Sum([]byte("fail")) + require.NoError(t, err) + + var wait20 sync.WaitGroup + wait20.Add(3) + + wasCanceled := make(chan struct{}) + + checkCache := func(t *testing.T, cache MigrationCache) { + found, value, err := cache.Read("foo") + require.NoError(t, err) + require.True(t, found) + require.Equal(t, fooCid, value) + + found, value, err = cache.Read("bar") + require.NoError(t, err) + require.True(t, found) + require.Equal(t, barCid, value) + + found, _, err = cache.Read("fail") + require.NoError(t, err) + require.False(t, found) + } + + sm, err := NewStateManagerWithUpgradeSchedule( + cg.ChainStore(), UpgradeSchedule{{ + Network: 1, + Height: testForkHeight, + Migration: func(ctx context.Context, sm *StateManager, cache MigrationCache, cb ExecCallback, + root cid.Cid, height abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { + + // Make sure the test that should be canceled, is canceled. + select { + case <-wasCanceled: + case <-ctx.Done(): + return cid.Undef, ctx.Err() + } + + // the cache should be setup correctly. 
+ checkCache(t, cache) + + return root, nil + }, + PreMigrations: []PreMigration{{ + When: 20, + PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache, + _ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error { + wait20.Done() + wait20.Wait() + + err := cache.Write("foo", fooCid) + require.NoError(t, err) + + return nil + }, + }, { + When: 20, + PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache, + _ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error { + wait20.Done() + wait20.Wait() + + err := cache.Write("bar", barCid) + require.NoError(t, err) + + return nil + }, + }, { + When: 20, + PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache, + _ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error { + wait20.Done() + wait20.Wait() + + err := cache.Write("fail", failCid) + require.NoError(t, err) + + // Fail this migration. The cached entry should not be persisted. + return fmt.Errorf("failed") + }, + }, { + When: 10, + PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache, + _ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error { + + checkCache(t, cache) + + return nil + }, + }, { + When: 15, + NotAfter: 5, + PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache, + _ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error { + + <-ctx.Done() + close(wasCanceled) + + return nil + }, + }}}, + }) + if err != nil { + t.Fatal(err) + } + require.NoError(t, sm.Start(context.Background())) + defer func() { + require.NoError(t, sm.Stop(context.Background())) + }() + + inv := vm.NewActorRegistry() + inv.Register(nil, testActor{}) + + sm.SetVMConstructor(func(ctx context.Context, vmopt *vm.VMOpts) (*vm.VM, error) { + nvm, err := vm.NewVM(ctx, vmopt) + if err != nil { + return nil, err + } + nvm.SetInvoker(inv) + return nvm, nil + }) + + cg.SetStateManager(sm) + + for i := 0; i < 50; i++ { + _, err := cg.NextTipSet() + if err != nil { + t.Fatal(err) + } + } +} From 35d6a400715f830ce624f4a955d04635c41bf17c Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Tue, 26 Jan 2021 18:50:25 -0800 Subject: [PATCH 12/21] cleanup pre-migration code a bit --- chain/stmgr/forks.go | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index 66d30390e30..d7f138c2832 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -250,6 +250,22 @@ func (sm *StateManager) hasExpensiveFork(ctx context.Context, height abi.ChainEp return ok } +func runPreMigration(ctx context.Context, sm *StateManager, fn PreMigrationFunc, cache *nv10.MemMigrationCache, ts *types.TipSet) { + // Clone the cache so we don't actually _update_ it + // till we're done. Otherwise, if we fail, the next + // migration to use the cache may assume that + // certain blocks exist, even if they don't. + tmpCache := cache.Clone() + err := fn(ctx, sm, tmpCache, ts.ParentState(), ts.Height(), ts) + if err != nil { + log.Errorw("failed to run pre-migration", + "error", err) + return + } + // Finally, if everything worked, update the cache. + cache.Update(tmpCache) +} + func (sm *StateManager) preMigrationWorker(ctx context.Context) { defer close(sm.shutdown) @@ -286,20 +302,7 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { wg.Add(1) go func() { defer wg.Done() - - // Clone the cache so we don't actually _update_ it - // till we're done. 
Otherwise, if we fail, the next - // migration to use the cache may assume that - // certain blocks exist, even if they don't. - tmpCache := cache.Clone() - err := migrationFunc(preCtx, sm, tmpCache, ts.ParentState(), ts.Height(), ts) - if err != nil { - log.Errorw("failed to run pre-migration", - "error", err) - return - } - // Finally, if everything worked, update the cache. - cache.Update(tmpCache) + runPreMigration(preCtx, sm, migrationFunc, cache, ts) }() }, }) From 716f10e570f12717caf0d3073dc0e0f83290b48c Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Wed, 27 Jan 2021 09:38:09 -0800 Subject: [PATCH 13/21] remove unnecessary wrapper store in tests This was causing a lot of warnings about not implementing View. --- chain/gen/gen.go | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/chain/gen/gen.go b/chain/gen/gen.go index 0b7f6eda4b0..1ad8dad6d54 100644 --- a/chain/gen/gen.go +++ b/chain/gen/gen.go @@ -14,7 +14,6 @@ import ( "github.com/filecoin-project/go-state-types/big" "github.com/filecoin-project/go-state-types/crypto" "github.com/google/uuid" - block "github.com/ipfs/go-block-format" "github.com/ipfs/go-blockservice" "github.com/ipfs/go-cid" offline "github.com/ipfs/go-ipfs-exchange-offline" @@ -85,19 +84,6 @@ type ChainGen struct { lr repo.LockedRepo } -type mybs struct { - blockstore.Blockstore -} - -func (m mybs) Get(c cid.Cid) (block.Block, error) { - b, err := m.Blockstore.Get(c) - if err != nil { - return nil, err - } - - return b, nil -} - var rootkeyMultisig = genesis.MultisigMeta{ Signers: []address.Address{remAccTestKey}, Threshold: 1, @@ -152,8 +138,6 @@ func NewGeneratorWithSectors(numSectors int) (*ChainGen, error) { } }() - bs = mybs{bs} - ks, err := lr.KeyStore() if err != nil { return nil, xerrors.Errorf("getting repo keystore failed: %w", err) From 854385168dfc4fdc184a0229e5de2fa2271e782d Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Wed, 27 Jan 2021 12:24:48 -0800 Subject: [PATCH 14/21] add additional pre-migration validations --- chain/stmgr/forks.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index d7f138c2832..7a8473fc26a 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -199,10 +199,18 @@ func (us UpgradeSchedule) Validate() error { } for _, m := range u.PreMigrations { + if m.When < 0 || m.NotAfter < 0 { + return xerrors.Errorf("pre-migration must specify non-negative epochs") + } if m.When <= m.NotAfter { return xerrors.Errorf("pre-migration cannot end before it starts: %d <= %d", m.When, m.NotAfter) } } + if !sort.SliceIsSorted(u.PreMigrations, func(i, j int) bool { + return u.PreMigrations[i].When < u.PreMigrations[j].When + }) { + return xerrors.Errorf("pre-migrations must be sorted by start epoch") + } } // Make sure the upgrade order makes sense. 
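For orientation, a minimal sketch (not part of the series) of a schedule entry that satisfies the checks added above. When and NotAfter are both counted in epochs before the upgrade height, so a window must be non-negative and must open strictly before it closes (When > NotAfter). PreUpgradeActorsV3, UpgradeActorsV3 and the build/network constants come from the surrounding patches; everything else is illustrative.

package example

import (
    "github.com/filecoin-project/go-state-types/network"

    "github.com/filecoin-project/lotus/build"
    "github.com/filecoin-project/lotus/chain/stmgr"
)

// validSchedule returns a single-entry schedule that passes the new checks.
func validSchedule() (stmgr.UpgradeSchedule, error) {
    sched := stmgr.UpgradeSchedule{{
        Height:    build.UpgradeActorsV3Height,
        Network:   network.Version10,
        Migration: stmgr.UpgradeActorsV3,
        PreMigrations: []stmgr.PreMigration{{
            PreMigration: stmgr.PreUpgradeActorsV3,
            When:         120, // may start once the head is within 120 epochs of the upgrade
            NotAfter:     60,  // but must not be started within the last 60 epochs
        }},
        Expensive: true,
    }}
    // Swapping the two values (When: 60, NotAfter: 120), or using a negative
    // epoch, is rejected by Validate with "pre-migration cannot end before it starts".
    return sched, sched.Validate()
}

Later patches in the series re-express these windows as StartWithin, DontStartWithin and StopWithin.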
From 56054b0ad3903c7750d65e9f8a7ec0c3a4c51619 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Wed, 27 Jan 2021 12:54:56 -0800 Subject: [PATCH 15/21] sort pre-migrations --- chain/stmgr/forks_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/chain/stmgr/forks_test.go b/chain/stmgr/forks_test.go index 92cc333498d..e259db3b00e 100644 --- a/chain/stmgr/forks_test.go +++ b/chain/stmgr/forks_test.go @@ -414,22 +414,22 @@ func TestForkPreMigration(t *testing.T) { return fmt.Errorf("failed") }, }, { - When: 10, + When: 15, + NotAfter: 5, PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache, _ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error { - checkCache(t, cache) + <-ctx.Done() + close(wasCanceled) return nil }, }, { - When: 15, - NotAfter: 5, + When: 10, PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache, _ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error { - <-ctx.Done() - close(wasCanceled) + checkCache(t, cache) return nil }, From 9a03c003fe9ffa9b1c44e0055b25b063f3c4ef38 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Wed, 27 Jan 2021 13:05:14 -0800 Subject: [PATCH 16/21] fix sorted test --- chain/stmgr/forks.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index 7a8473fc26a..b143563c151 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -207,7 +207,7 @@ func (us UpgradeSchedule) Validate() error { } } if !sort.SliceIsSorted(u.PreMigrations, func(i, j int) bool { - return u.PreMigrations[i].When < u.PreMigrations[j].When + return u.PreMigrations[i].When > u.PreMigrations[j].When //nolint:scopelint,gosec }) { return xerrors.Errorf("pre-migrations must be sorted by start epoch") } From 9574db6d0a63b63eefdf7b784c1982c25790e5a4 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Wed, 27 Jan 2021 15:05:00 -0800 Subject: [PATCH 17/21] log in migration --- chain/stmgr/forks.go | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index b143563c151..06559a378a7 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -6,6 +6,7 @@ import ( "encoding/binary" "sort" "sync" + "time" "github.com/filecoin-project/go-state-types/rt" @@ -176,7 +177,7 @@ func DefaultUpgradeSchedule() UpgradeSchedule { }, { PreMigration: PreUpgradeActorsV3, When: 30, - NotAfter: 20, + NotAfter: 10, }}, Expensive: true, }} @@ -237,17 +238,24 @@ func (sm *StateManager) handleStateForks(ctx context.Context, root cid.Cid, heig var err error u := sm.stateMigrations[height] if u != nil && u.upgrade != nil { + startTime := time.Now() + log.Warnw("STARTING migration", "height", height) // Yes, we clone the cache, even for the final upgrade epoch. Why? Reverts. We may // have to migrate multiple times. tmpCache := u.cache.Clone() retCid, err = u.upgrade(ctx, sm, tmpCache, cb, root, height, ts) if err != nil { + log.Errorw("FAILED migration", "height", height, "error", err) return cid.Undef, err } // Yes, we update the cache, even for the final upgrade epoch. Why? Reverts. This // can save us a _lot_ of time because very few actors will have changed if we // do a small revert then need to re-run the migration. 
u.cache.Update(tmpCache) + log.Warnw("COMPLETED migration", + "height", height, + "duration", time.Since(startTime), + ) } return retCid, nil @@ -259,19 +267,25 @@ func (sm *StateManager) hasExpensiveFork(ctx context.Context, height abi.ChainEp } func runPreMigration(ctx context.Context, sm *StateManager, fn PreMigrationFunc, cache *nv10.MemMigrationCache, ts *types.TipSet) { + height := ts.Height() + parent := ts.ParentState() + + startTime := time.Now() + + log.Warn("STARTING pre-migration") // Clone the cache so we don't actually _update_ it // till we're done. Otherwise, if we fail, the next // migration to use the cache may assume that // certain blocks exist, even if they don't. tmpCache := cache.Clone() - err := fn(ctx, sm, tmpCache, ts.ParentState(), ts.Height(), ts) + err := fn(ctx, sm, tmpCache, parent, height, ts) if err != nil { - log.Errorw("failed to run pre-migration", - "error", err) + log.Errorw("FAILED pre-migration", "error", err) return } // Finally, if everything worked, update the cache. cache.Update(tmpCache) + log.Warnw("COMPLETED pre-migration", "duration", time.Since(startTime)) } func (sm *StateManager) preMigrationWorker(ctx context.Context) { From cffeb1a59092279cfcb9d43d2f400796cdb824ec Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Wed, 27 Jan 2021 15:41:26 -0800 Subject: [PATCH 18/21] fix doc comments Co-authored-by: Aayush Rajasekaran --- chain/stmgr/stmgr.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chain/stmgr/stmgr.go b/chain/stmgr/stmgr.go index 8598ac23d29..d58d7302800 100644 --- a/chain/stmgr/stmgr.go +++ b/chain/stmgr/stmgr.go @@ -166,7 +166,7 @@ func cidsToKey(cids []cid.Cid) string { // Start starts the state manager's optional background processes. At the moment, this schedules // pre-migration functions to run ahead of network upgrades. // -// This is method is not safe to invoke from multiple threads or concurrently with Stop. +// This method is not safe to invoke from multiple threads or concurrently with Stop. func (sm *StateManager) Start(context.Context) error { var ctx context.Context ctx, sm.cancel = context.WithCancel(context.Background()) @@ -177,7 +177,7 @@ func (sm *StateManager) Start(context.Context) error { // Stop starts the state manager's background processes. // -// This is method is not safe to invoke concurrently with Start. +// This method is not safe to invoke concurrently with Start. func (sm *StateManager) Stop(ctx context.Context) error { if sm.cancel != nil { sm.cancel() From bceb246080c0de0b81371f507d408d97c49365fd Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Wed, 27 Jan 2021 15:47:34 -0800 Subject: [PATCH 19/21] set worker counts for pre-migrations --- chain/stmgr/forks.go | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index 06559a378a7..a16ba2474cb 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/binary" + "runtime" "sort" "sync" "time" @@ -885,8 +886,13 @@ func UpgradeCalico(ctx context.Context, sm *StateManager, _ MigrationCache, cb E } func UpgradeActorsV3(ctx context.Context, sm *StateManager, cache MigrationCache, cb ExecCallback, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) { - // TODO: tune this. - config := nv10.Config{MaxWorkers: 1} + // Use all the CPUs except 3. 
+ workerCount := runtime.NumCPU() - 3 + if workerCount <= 0 { + workerCount = 1 + } + + config := nv10.Config{MaxWorkers: uint(workerCount)} newRoot, err := upgradeActorsV3Common(ctx, sm, cache, root, epoch, ts, config) if err != nil { return cid.Undef, xerrors.Errorf("migrating actors v3 state: %w", err) @@ -908,8 +914,14 @@ func UpgradeActorsV3(ctx context.Context, sm *StateManager, cache MigrationCache } func PreUpgradeActorsV3(ctx context.Context, sm *StateManager, cache MigrationCache, root cid.Cid, epoch abi.ChainEpoch, ts *types.TipSet) error { - // TODO: tune this. - config := nv10.Config{MaxWorkers: 1} + // Use half the CPUs for pre-migration, but leave at least 3. + workerCount := runtime.NumCPU() + if workerCount <= 4 { + workerCount = 1 + } else { + workerCount /= 2 + } + config := nv10.Config{MaxWorkers: uint(workerCount)} _, err := upgradeActorsV3Common(ctx, sm, cache, root, epoch, ts, config) return err } From 39d4f6780d14e888258a0504f90c17ee822ceddb Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Wed, 27 Jan 2021 15:58:51 -0800 Subject: [PATCH 20/21] pre-migration: refactor timing specification There are now three times: 1. StartWithin: start within X epochs of the upgrade. 2. DontStartWithin: don't start within X epochs of the upgrade. 3. StopWithin: stop within X epochs of the upgrade. --- chain/stmgr/forks.go | 70 +++++++++++++++++++++++++++------------ chain/stmgr/forks_test.go | 12 +++---- 2 files changed, 55 insertions(+), 27 deletions(-) diff --git a/chain/stmgr/forks.go b/chain/stmgr/forks.go index a16ba2474cb..b36f2c0bd5c 100644 --- a/chain/stmgr/forks.go +++ b/chain/stmgr/forks.go @@ -79,15 +79,19 @@ type PreMigration struct { // run asynchronously and must abort promptly when canceled. PreMigration PreMigrationFunc - // When specifies that this pre-migration should be started at most When epochs before the upgrade. - When abi.ChainEpoch + // StartWithin specifies that this pre-migration should be started at most StartWithin + // epochs before the upgrade. + StartWithin abi.ChainEpoch - // NotAfter specifies that this pre-migration should not be started NotAfter epochs before - // the final upgrade epoch. + // DontStartWithin specifies that this pre-migration should not be started DontStartWithin + // epochs before the final upgrade epoch. // - // This should be set such that the pre-migration is likely to complete at least 5 epochs - // before the next pre-migration and/or upgrade epoch hits. - NotAfter abi.ChainEpoch + // This should be set such that the pre-migration is likely to complete before StopWithin. + DontStartWithin abi.ChainEpoch + + // StopWithin specifies that this pre-migration should be stopped StopWithin epochs of the + // final upgrade epoch. 
+ StopWithin abi.ChainEpoch } type Upgrade struct { @@ -172,13 +176,15 @@ func DefaultUpgradeSchedule() UpgradeSchedule { Network: network.Version10, Migration: UpgradeActorsV3, PreMigrations: []PreMigration{{ - PreMigration: PreUpgradeActorsV3, - When: 120, - NotAfter: 60, + PreMigration: PreUpgradeActorsV3, + StartWithin: 120, + DontStartWithin: 60, + StopWithin: 35, }, { - PreMigration: PreUpgradeActorsV3, - When: 30, - NotAfter: 10, + PreMigration: PreUpgradeActorsV3, + StartWithin: 30, + DontStartWithin: 15, + StopWithin: 5, }}, Expensive: true, }} @@ -201,15 +207,30 @@ func (us UpgradeSchedule) Validate() error { } for _, m := range u.PreMigrations { - if m.When < 0 || m.NotAfter < 0 { + if m.StartWithin <= 0 { + return xerrors.Errorf("pre-migration must specify a positive start-within epoch") + } + + if m.DontStartWithin < 0 || m.StopWithin < 0 { return xerrors.Errorf("pre-migration must specify non-negative epochs") } - if m.When <= m.NotAfter { - return xerrors.Errorf("pre-migration cannot end before it starts: %d <= %d", m.When, m.NotAfter) + + if m.StartWithin <= m.StopWithin { + return xerrors.Errorf("pre-migration start-within must come before stop-within") + } + + // If we have a dont-start-within. + if m.DontStartWithin != 0 { + if m.DontStartWithin < m.StopWithin { + return xerrors.Errorf("pre-migration dont-start-within must come before stop-within") + } + if m.StartWithin <= m.DontStartWithin { + return xerrors.Errorf("pre-migration start-within must come after dont-start-within") + } } } if !sort.SliceIsSorted(u.PreMigrations, func(i, j int) bool { - return u.PreMigrations[i].When > u.PreMigrations[j].When //nolint:scopelint,gosec + return u.PreMigrations[i].StartWithin > u.PreMigrations[j].StartWithin //nolint:scopelint,gosec }) { return xerrors.Errorf("pre-migrations must be sorted by start epoch") } @@ -312,8 +333,13 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { preCtx, preCancel := context.WithCancel(ctx) migrationFunc := prem.PreMigration - afterEpoch := upgradeEpoch - prem.When - notAfterEpoch := upgradeEpoch - prem.NotAfter + afterEpoch := upgradeEpoch - prem.StartWithin + notAfterEpoch := upgradeEpoch - prem.DontStartWithin + stopEpoch := upgradeEpoch - prem.StopWithin + // We can't start after we stop. + if notAfterEpoch > stopEpoch { + notAfterEpoch = stopEpoch - 1 + } // Add an op to start a pre-migration. schedule = append(schedule, op{ @@ -332,7 +358,7 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { // Add an op to cancel the pre-migration if it's still running. schedule = append(schedule, op{ - after: notAfterEpoch, + after: stopEpoch, notAfter: -1, run: func(ts *types.TipSet) { preCancel() }, }) @@ -345,6 +371,8 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { }) // Finally, when the head changes, see if there's anything we need to do. + // + // We're intentionally ignoring reorgs as they don't matter for our purposes. for change := range sm.cs.SubHeadChanges(ctx) { for _, head := range change { for len(schedule) > 0 { @@ -354,7 +382,7 @@ func (sm *StateManager) preMigrationWorker(ctx context.Context) { } // If we haven't passed the pre-migration height... 
- if op.notAfter < 0 || head.Val.Height() <= op.notAfter { + if op.notAfter < 0 || head.Val.Height() < op.notAfter { op.run(head.Val) } schedule = schedule[1:] diff --git a/chain/stmgr/forks_test.go b/chain/stmgr/forks_test.go index e259db3b00e..17b6542b85f 100644 --- a/chain/stmgr/forks_test.go +++ b/chain/stmgr/forks_test.go @@ -377,7 +377,7 @@ func TestForkPreMigration(t *testing.T) { return root, nil }, PreMigrations: []PreMigration{{ - When: 20, + StartWithin: 20, PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache, _ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error { wait20.Done() @@ -389,7 +389,7 @@ func TestForkPreMigration(t *testing.T) { return nil }, }, { - When: 20, + StartWithin: 20, PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache, _ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error { wait20.Done() @@ -401,7 +401,7 @@ func TestForkPreMigration(t *testing.T) { return nil }, }, { - When: 20, + StartWithin: 20, PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache, _ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error { wait20.Done() @@ -414,8 +414,8 @@ func TestForkPreMigration(t *testing.T) { return fmt.Errorf("failed") }, }, { - When: 15, - NotAfter: 5, + StartWithin: 15, + StopWithin: 5, PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache, _ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error { @@ -425,7 +425,7 @@ func TestForkPreMigration(t *testing.T) { return nil }, }, { - When: 10, + StartWithin: 10, PreMigration: func(ctx context.Context, _ *StateManager, cache MigrationCache, _ cid.Cid, _ abi.ChainEpoch, _ *types.TipSet) error { From 4890d83acb92fbf69115f7c4c37f2c78687db2c2 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Wed, 27 Jan 2021 16:12:08 -0800 Subject: [PATCH 21/21] test: assert that pre-migrations are actually run --- chain/stmgr/forks_test.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/chain/stmgr/forks_test.go b/chain/stmgr/forks_test.go index 17b6542b85f..95e7ef69900 100644 --- a/chain/stmgr/forks_test.go +++ b/chain/stmgr/forks_test.go @@ -357,6 +357,8 @@ func TestForkPreMigration(t *testing.T) { require.False(t, found) } + counter := make(chan struct{}, 10) + sm, err := NewStateManagerWithUpgradeSchedule( cg.ChainStore(), UpgradeSchedule{{ Network: 1, @@ -374,6 +376,8 @@ func TestForkPreMigration(t *testing.T) { // the cache should be setup correctly. checkCache(t, cache) + counter <- struct{}{} + return root, nil }, PreMigrations: []PreMigration{{ @@ -386,6 +390,8 @@ func TestForkPreMigration(t *testing.T) { err := cache.Write("foo", fooCid) require.NoError(t, err) + counter <- struct{}{} + return nil }, }, { @@ -398,6 +404,8 @@ func TestForkPreMigration(t *testing.T) { err := cache.Write("bar", barCid) require.NoError(t, err) + counter <- struct{}{} + return nil }, }, { @@ -410,6 +418,8 @@ func TestForkPreMigration(t *testing.T) { err := cache.Write("fail", failCid) require.NoError(t, err) + counter <- struct{}{} + // Fail this migration. The cached entry should not be persisted. return fmt.Errorf("failed") }, @@ -422,6 +432,8 @@ func TestForkPreMigration(t *testing.T) { <-ctx.Done() close(wasCanceled) + counter <- struct{}{} + return nil }, }, { @@ -431,6 +443,8 @@ func TestForkPreMigration(t *testing.T) { checkCache(t, cache) + counter <- struct{}{} + return nil }, }}}, @@ -463,4 +477,7 @@ func TestForkPreMigration(t *testing.T) { t.Fatal(err) } } + // We have 5 pre-migration steps, and the migration. 
They should all have written something + // to this channel. + require.Equal(t, 6, len(counter)) }
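To tie the series together, here is a hedged, illustrative sketch (not taken from the patches) of how an upgrade entry plugs into the final API: the MigrationFunc and PreMigrationFunc signatures, the per-upgrade MigrationCache, and the StartWithin/DontStartWithin/StopWithin windows all come from the patches above, while computeExpensivePart and the "intermediate-root" cache key are hypothetical stand-ins.

package example

import (
    "context"

    "github.com/filecoin-project/go-state-types/abi"
    "github.com/filecoin-project/go-state-types/network"
    "github.com/ipfs/go-cid"

    "github.com/filecoin-project/lotus/chain/stmgr"
    "github.com/filecoin-project/lotus/chain/types"
)

// computeExpensivePart stands in for whatever heavy, cacheable work a real
// migration would do; it is hypothetical and not part of the series.
func computeExpensivePart(ctx context.Context, sm *stmgr.StateManager, state cid.Cid) (cid.Cid, error) {
    return state, nil
}

// preMigrateExample warms the per-upgrade cache ahead of time. It must return
// promptly once ctx is canceled (StopWithin reached, node shutting down, etc.).
func preMigrateExample(ctx context.Context, sm *stmgr.StateManager, cache stmgr.MigrationCache,
    oldState cid.Cid, height abi.ChainEpoch, ts *types.TipSet) error {
    _, err := cache.Load("intermediate-root", func() (cid.Cid, error) {
        return computeExpensivePart(ctx, sm, oldState)
    })
    return err
}

// migrateExample is the migration proper. If the pre-migration ran, the Load
// below is a cheap cache hit at the upgrade epoch; if not, it falls back to
// doing the work inline.
func migrateExample(ctx context.Context, sm *stmgr.StateManager, cache stmgr.MigrationCache,
    cb stmgr.ExecCallback, oldState cid.Cid, height abi.ChainEpoch, ts *types.TipSet) (cid.Cid, error) {
    return cache.Load("intermediate-root", func() (cid.Cid, error) {
        return computeExpensivePart(ctx, sm, oldState)
    })
}

// exampleSchedule wires the two functions into an upgrade entry using the
// final field names from this series.
func exampleSchedule(upgradeHeight abi.ChainEpoch) stmgr.UpgradeSchedule {
    return stmgr.UpgradeSchedule{{
        Height:    upgradeHeight,
        Network:   network.Version10, // the nv10 / actors-v3 version targeted by this series
        Migration: migrateExample,
        PreMigrations: []stmgr.PreMigration{{
            PreMigration:    preMigrateExample,
            StartWithin:     120, // may start 120 epochs before the upgrade
            DontStartWithin: 60,  // but not once the head is within 60 epochs of it
            StopWithin:      35,  // and is canceled 35 epochs before the upgrade
        }},
        Expensive: true,
    }}
}

In the node itself this wiring goes through fx (node/modules/stmgr.go), so StateManager.Start launches the background worker that watches head changes and runs the pre-migrations inside their windows, and Stop cancels them and waits for them to exit.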