From 4f29e2ecaa651646d656898d6da31d89d8a5a1a2 Mon Sep 17 00:00:00 2001 From: Viraj Bhartiya Date: Thu, 12 Dec 2024 16:41:15 +0530 Subject: [PATCH] chore: more frequent migration progress logs (#12732) --- CHANGELOG.md | 1 + chain/consensus/filcns/upgrades.go | 31 +++++++++++++++++-- .../misc/Building_a_network_skeleton.md | 15 +++++++-- 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e26d46f4a2b..b969aecf155 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - Add json output of tipsets to `louts chain list`. ([filecoin-project/lotus#12691](https://github.com/filecoin-project/lotus/pull/12691)) - Remove IPNI advertisement relay over pubsub via Lotus node as it now has been deprecated. ([filecoin-project/lotus#12768](https://github.com/filecoin-project/lotus/pull/12768) +- During a network upgrade, log migration progress every 2 seconds so the logs are more helpful and informative. The `LOTUS_MIGRATE_PROGRESS_LOG_SECONDS` environment variable can be used to change this interval if needed. 
([filecoin-project/lotus#12732](https://github.com/filecoin-project/lotus/pull/12732)) # UNRELEASED v.1.32.0 diff --git a/chain/consensus/filcns/upgrades.go b/chain/consensus/filcns/upgrades.go index 3f68960861f..a93af1f4420 100644 --- a/chain/consensus/filcns/upgrades.go +++ b/chain/consensus/filcns/upgrades.go @@ -2754,9 +2754,14 @@ func PreUpgradeActorsV16(ctx context.Context, sm *stmgr.StateManager, cache stmg return xerrors.Errorf("error getting lookback ts for premigration: %w", err) } + logPeriod, err := getMigrationProgressLogPeriod() + if err != nil { + return xerrors.Errorf("error getting progress log period: %w", err) + } + config := migration.Config{ MaxWorkers: uint(workerCount), - ProgressLogPeriod: time.Minute * 5, + ProgressLogPeriod: logPeriod, } _, err = upgradeActorsV16Common(ctx, sm, cache, lbRoot, epoch, lbts, config) @@ -2770,11 +2775,17 @@ func UpgradeActorsV16(ctx context.Context, sm *stmgr.StateManager, cache stmgr.M if workerCount <= 0 { workerCount = 1 } + + logPeriod, err := getMigrationProgressLogPeriod() + if err != nil { + return cid.Undef, xerrors.Errorf("error getting progress log period: %w", err) + } + config := migration.Config{ MaxWorkers: uint(workerCount), JobQueueSize: 1000, ResultQueueSize: 100, - ProgressLogPeriod: 10 * time.Second, + ProgressLogPeriod: logPeriod, } newRoot, err := upgradeActorsV16Common(ctx, sm, cache, root, epoch, ts, config) if err != nil { @@ -3005,3 +3016,19 @@ func (ml migrationLogger) Log(level rt.LogLevel, msg string, args ...interface{} log.Errorf(msg, args...) 
} } + +func getMigrationProgressLogPeriod() (time.Duration, error) { + logPeriod := time.Second * 2 // default period + period := os.Getenv("LOTUS_MIGRATE_PROGRESS_LOG_SECONDS") + if period != "" { + seconds, err := strconv.Atoi(period) + if err != nil { + return 0, xerrors.Errorf("LOTUS_MIGRATE_PROGRESS_LOG_SECONDS must be an integer: %w", err) + } + if seconds <= 0 { + return 0, xerrors.Errorf("LOTUS_MIGRATE_PROGRESS_LOG_SECONDS must be positive") + } + logPeriod = time.Duration(seconds) * time.Second + } + return logPeriod, nil +} diff --git a/documentation/misc/Building_a_network_skeleton.md b/documentation/misc/Building_a_network_skeleton.md index d7d1fdbc9e0..1fae76b923b 100644 --- a/documentation/misc/Building_a_network_skeleton.md +++ b/documentation/misc/Building_a_network_skeleton.md @@ -456,9 +456,14 @@ Typically it's safe to not upgrade filecoin-ffi's version of go-state-types. Th return xerrors.Errorf("error getting lookback ts for premigration: %w", err) } + logPeriod, err := getMigrationProgressLogPeriod() + if err != nil { + return xerrors.Errorf("error getting progress log period: %w", err) + } + config := migration.Config{ MaxWorkers: uint(workerCount), - ProgressLogPeriod: time.Minute * 5, + ProgressLogPeriod: logPeriod, } _, err = upgradeActorsV(XX+1)Common(ctx, sm, cache, lbRoot, epoch, lbts, config) @@ -472,11 +477,17 @@ Typically it's safe to not upgrade filecoin-ffi's version of go-state-types. Th if workerCount <= 0 { workerCount = 1 } + + logPeriod, err := getMigrationProgressLogPeriod() + if err != nil { + return cid.Undef, xerrors.Errorf("error getting progress log period: %w", err) + } + config := migration.Config{ MaxWorkers: uint(workerCount), JobQueueSize: 1000, ResultQueueSize: 100, - ProgressLogPeriod: 10 * time.Second, + ProgressLogPeriod: logPeriod, } newRoot, err := upgradeActorsV(XX+1)Common(ctx, sm, cache, root, epoch, ts, config) if err != nil {