Skip to content

Commit

Permalink
Merge pull request #338 from agoric-labs/mfig-signal-then-halt
Browse files Browse the repository at this point in the history
fix(baseapp): signal then panic at halt-height
  • Loading branch information
michaelfig committed Aug 3, 2024
2 parents 685da65 + 2a09d9f commit 3768f9c
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 44 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG-Agoric.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ Ref: https://keepachangelog.com/en/1.0.0/

* (auth, bank) Agoric/agoric-sdk#8989 Remove deprecated lien support

### Bug Fixes

* (baseapp) [#338](https://github.com/agoric-labs/cosmos-sdk/pull/338) Make sure we don't execute blocks beyond the halt height. Restored from [#305](https://github.com/agoric-labs/cosmos-sdk/pull/305), but kept compatible with the older `SIGINT` and `SIGTERM` logic.

## `v0.46.16-alpha.agoric.2.4` - 2024-04-19

### Improvements
Expand Down Expand Up @@ -99,7 +103,7 @@ Ref: https://keepachangelog.com/en/1.0.0/

### Bug Fixes

* (baseapp) [#337](https://github.com/agoric-labs/cosmos-sdk/pull/337) revert #305 which causes test failures in agoric-sdk
* (baseapp) [#337](https://github.com/agoric-labs/cosmos-sdk/pull/337) revert [#305](https://github.com/agoric-labs/cosmos-sdk/pull/305) which causes test failures in agoric-sdk

## `v0.45.16-alpha.agoric.1` - 2023-09-22

Expand Down
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ test-cover:

test-rosetta:
docker build -t rosetta-ci:latest -f contrib/rosetta/rosetta-ci/Dockerfile .
docker-compose -f contrib/rosetta/docker-compose.yaml up --abort-on-container-exit --exit-code-from test_rosetta --build
docker compose -f contrib/rosetta/docker-compose.yaml up --abort-on-container-exit --exit-code-from test_rosetta --build
.PHONY: test-rosetta

benchmark:
Expand Down Expand Up @@ -467,10 +467,10 @@ localnet-build-dlv:
localnet-build-nodes:
$(DOCKER) run --rm -v $(CURDIR)/.testnets:/data cosmossdk/simd \
testnet init-files --v 4 -o /data --starting-ip-address 192.168.10.2 --keyring-backend=test
docker-compose up -d
docker compose up -d

localnet-stop:
docker-compose down
docker compose down

# localnet-start will run a 4-node testnet locally. The nodes are
# based off the docker images in: ./contrib/images/simd-env
Expand Down
83 changes: 43 additions & 40 deletions baseapp/abci.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ func (app *BaseApp) BeginBlock(req abci.RequestBeginBlock) (res abci.ResponseBeg
))
}

app.checkHalt(req.Header.Height, req.Header.Time)

if err := app.validateHeight(req); err != nil {
panic(err)
}
Expand Down Expand Up @@ -312,6 +314,47 @@ func (app *BaseApp) deliverTxWithoutEventHistory(req abci.RequestDeliverTx) (res
}
}

// checkHalt stops the state machine once a configured halt threshold has been
// passed: the block height is strictly greater than a non-zero halt-height, or
// the block time (in Unix seconds) is strictly greater than a non-zero
// halt-time. When neither holds it returns without doing anything. Otherwise
// it logs the halt, makes a best-effort attempt to signal the current process
// with SIGINT and then SIGTERM, and always panics with a "halt application"
// error.
func (app *BaseApp) checkHalt(blockHeight int64, blockTime time.Time) {
	pastHaltHeight := app.haltHeight > 0 && uint64(blockHeight) > app.haltHeight
	pastHaltTime := app.haltTime > 0 && blockTime.Unix() > int64(app.haltTime)
	if !pastHaltHeight && !pastHaltTime {
		// No threshold is configured, or none has been passed yet.
		return
	}

	app.logger.Info(
		"halt per configuration",
		"haltHeight", app.haltHeight,
		"haltTime", app.haltTime,
		"blockHeight", blockHeight,
		"blockTime", blockTime,
	)

	// [AGORIC] Best-effort attempt to kill our own process. Both signals are
	// always sent, in case SIGINT fails (OS dependent). Their errors are
	// dropped on purpose: we cannot tell whether or how a signal handler
	// responds, and the panic below halts the state machine regardless.
	if self, err := os.FindProcess(os.Getpid()); err == nil {
		_ = self.Signal(syscall.SIGINT)
		_ = self.Signal(syscall.SIGTERM)
	}

	// However the signals were handled, never advance to the next block.
	panic(errors.New("halt application"))
}

// Commit implements the ABCI interface. It will commit all state that exists in
// the deliver state's multi-store and includes the resulting commit ID in the
// returned abci.ResponseCommit. Commit will set the check state based on the
Expand Down Expand Up @@ -368,53 +411,13 @@ func (app *BaseApp) CommitWithoutSnapshot() (_res abci.ResponseCommit, snapshotH
// empty/reset the deliver state
app.deliverState = nil

var halt bool

switch {
case app.haltHeight > 0 && uint64(header.Height) >= app.haltHeight:
halt = true

case app.haltTime > 0 && header.Time.Unix() >= int64(app.haltTime):
halt = true
}

if halt {
// Halt the binary and allow Tendermint to receive the ResponseCommit
// response with the commit ID hash. This will allow the node to successfully
// restart and process blocks assuming the halt configuration has been
// reset or moved to a more distant value.
app.halt()
}

if app.snapshotManager.ShouldTakeSnapshot(header.Height) {
snapshotHeight = header.Height
}

return res, snapshotHeight
}

// halt asks the node to shut down by sending SIGINT followed by SIGTERM to the
// current process. It returns as soon as at least one of the two signals was
// sent successfully. If the process cannot be looked up, or both signals fail
// to send, it logs the failure and terminates via os.Exit(0).
func (app *BaseApp) halt() {
	app.logger.Info("halting node per configuration", "height", app.haltHeight, "time", app.haltTime)

	if self, lookupErr := os.FindProcess(os.Getpid()); lookupErr == nil {
		// Both signals are sent unconditionally, in case SIGINT fails (OS
		// dependent).
		interruptErr := self.Signal(syscall.SIGINT)
		terminateErr := self.Signal(syscall.SIGTERM)

		delivered := interruptErr == nil || terminateErr == nil
		if delivered {
			return
		}
	}

	// The process could not be found or signalled: exit immediately instead.
	app.logger.Info("failed to send SIGINT/SIGTERM; exiting...")
	os.Exit(0)
}

// Snapshot takes a snapshot of the current state and prunes any old snapshots.
// It should be started as a goroutine.
func (app *BaseApp) Snapshot(height int64) {
Expand Down
73 changes: 73 additions & 0 deletions baseapp/abci_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@ package baseapp
import (
"encoding/json"
"fmt"
"os"
"os/signal"
"strings"
"syscall"
"testing"
"time"

"github.com/stretchr/testify/require"
abci "github.com/tendermint/tendermint/abci/types"
Expand Down Expand Up @@ -224,3 +229,71 @@ func (ps *paramStore) Get(_ sdk.Context, key []byte, ptr interface{}) {
panic(err)
}
}

// TestABCI_HaltChain verifies that BeginBlock halts the chain only once the
// block height or block time is strictly past the configured halt-height or
// halt-time. A halt must deliver SIGINT followed by SIGTERM to the current
// process and then panic with an error whose message starts with
// "halt application"; the edge cases (height/time equal to the threshold) and
// the unconfigured default must not panic at all.
func TestABCI_HaltChain(t *testing.T) {
	logger := defaultLogger()
	db := dbm.NewMemDB()
	name := t.Name()

	testCases := []struct {
		name        string
		haltHeight  uint64
		haltTime    uint64
		blockHeight int64
		blockTime   int64
		expHalt     bool
	}{
		{"default", 0, 0, 10, 0, false},
		{"halt-height-edge", 10, 0, 10, 0, false},
		{"halt-height", 10, 0, 11, 0, true},
		{"halt-time-edge", 0, 10, 1, 10, false},
		{"halt-time", 0, 10, 1, 11, true},
	}

	sigs := make(chan os.Signal, 5)
	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Only intercept the signals when a halt is expected, so that an
			// unexpected halt is not silently absorbed by this test.
			if tc.expHalt {
				signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
			}

			// expectSignal waits a bounded time for the next relayed signal so
			// that a missing signal fails the test instead of hanging it.
			expectSignal := func(want os.Signal) {
				select {
				case got := <-sigs:
					require.Equal(t, want, got)
				case <-time.After(5 * time.Second):
					t.Fatalf("timed out waiting for signal %v", want)
				}
			}

			defer func() {
				rec := recover()
				signal.Stop(sigs)

				var err error
				if rec != nil {
					// Use a checked assertion: a non-error panic value should
					// fail the test with a clear message rather than re-panic
					// inside this deferred function.
					var ok bool
					err, ok = rec.(error)
					require.True(t, ok, "unexpected non-error panic value: %v", rec)
				}

				if !tc.expHalt {
					require.NoError(t, err)
					return
				}

				// ensure that we received the correct signals, in order
				expectSignal(syscall.SIGINT)
				expectSignal(syscall.SIGTERM)
				require.Equal(t, 0, len(sigs))

				// Check our error message.
				require.Error(t, err)
				require.True(t, strings.HasPrefix(err.Error(), "halt application"))
			}()

			app := NewBaseApp(
				name, logger, db, nil,
				SetHaltHeight(tc.haltHeight),
				SetHaltTime(tc.haltTime),
			)

			app.InitChain(abci.RequestInitChain{
				InitialHeight: tc.blockHeight,
			})

			app.BeginBlock(abci.RequestBeginBlock{
				Header: tmproto.Header{
					Height: tc.blockHeight,
					Time:   time.Unix(tc.blockTime, 0),
				},
			})
		})
	}
}

0 comments on commit 3768f9c

Please sign in to comment.