Skip to content

Commit

Permalink
fix: catch panics in network operator runs
Browse files Browse the repository at this point in the history
Fixes #4567

This is not a complete fix, but rather a workaround: if the DHCP4 operator
panics, Talos shouldn't crash on `machined` unhandled panic.

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
  • Loading branch information
smira committed Nov 24, 2021
1 parent d1f55f9 commit 9427e78
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 7 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ require (
github.com/AlekSi/pointer v1.2.0
github.com/BurntSushi/toml v0.4.1
github.com/beevik/ntp v0.3.0
github.com/cenkalti/backoff/v4 v4.1.1
github.com/containerd/cgroups v1.0.2
github.com/containerd/containerd v1.5.7
github.com/containerd/cri v1.19.0
Expand Down Expand Up @@ -149,7 +150,6 @@ require (
github.com/bits-and-blooms/bitset v1.2.0 // indirect
github.com/blang/semver v3.5.1+incompatible // indirect
github.com/briandowns/spinner v1.6.1 // indirect
github.com/cenkalti/backoff/v4 v4.1.1 // indirect
github.com/cespare/xxhash/v2 v2.1.1 // indirect
github.com/chai2010/gettext-go v0.0.0-20160711120539-c6fed771bfd5 // indirect
github.com/cilium/ebpf v0.7.0 // indirect
Expand Down
46 changes: 43 additions & 3 deletions internal/app/machined/pkg/controllers/network/operator_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ import (
"context"
"fmt"
"sync"
"time"

"github.com/cenkalti/backoff/v4"
"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/resource"
"github.com/cosi-project/runtime/pkg/state"
Expand Down Expand Up @@ -91,18 +93,56 @@ type operatorRunState struct {
wg sync.WaitGroup
}

func (state *operatorRunState) Start(ctx context.Context, notifyCh chan<- struct{}) {
func (state *operatorRunState) Start(ctx context.Context, notifyCh chan<- struct{}, logger *zap.Logger, id string) {
state.wg.Add(1)

ctx, state.cancel = context.WithCancel(ctx)

go func() {
defer state.wg.Done()

state.Operator.Run(ctx, notifyCh)
state.runWithRestarts(ctx, notifyCh, logger, id)
}()
}

func (state *operatorRunState) runWithRestarts(ctx context.Context, notifyCh chan<- struct{}, logger *zap.Logger, id string) {
backoff := backoff.NewExponentialBackOff()

// disable number of retries limit
backoff.MaxElapsedTime = 0

for ctx.Err() == nil {
if err := state.runWithPanicHandler(ctx, notifyCh, logger, id); err == nil {
// operator finished without an error
return
}

interval := backoff.NextBackOff()

logger.Debug("restarting operator", zap.Duration("interval", interval), zap.String("operator", id))

select {
case <-ctx.Done():
return
case <-time.After(interval):
}
}
}

func (state *operatorRunState) runWithPanicHandler(ctx context.Context, notifyCh chan<- struct{}, logger *zap.Logger, id string) (err error) {
defer func() {
if p := recover(); p != nil {
err = fmt.Errorf("panic: %v", p)

logger.Error("operator panicked", zap.Stack("stack"), zap.Error(err), zap.String("operator", id))
}
}()

state.Operator.Run(ctx, notifyCh)

return nil
}

func (state *operatorRunState) Stop() {
state.cancel()

Expand Down Expand Up @@ -210,7 +250,7 @@ func (ctrl *OperatorSpecController) reconcileOperators(ctx context.Context, r co
}

logger.Debug("starting operator", zap.String("operator", id))
ctrl.operators[id].Start(ctx, notifyCh)
ctrl.operators[id].Start(ctx, notifyCh, logger, id)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ type OperatorSpecSuite struct {
type mockOperator struct {
spec network.OperatorSpecSpec
notifyCh chan<- struct{}
panicked bool

mu sync.Mutex
addresses []network.AddressSpecSpec
Expand Down Expand Up @@ -72,13 +73,22 @@ func (mock *mockOperator) Run(ctx context.Context, notifyCh chan<- struct{}) {
runningOperatorsMu.Unlock()
}

<-ctx.Done()

{
defer func() {
runningOperatorsMu.Lock()
delete(runningOperators, mock.Prefix())
runningOperatorsMu.Unlock()
}()

if mock.spec.Operator == network.OperatorDHCP6 {
// DHCP6 operator panics on odd run
if !mock.panicked {
mock.panicked = true

panic("oh no, IPv6!!!")
}
}

<-ctx.Done()
}

func (mock *mockOperator) notify() {
Expand Down Expand Up @@ -335,6 +345,48 @@ func (suite *OperatorSpecSuite) TestScheduling() {
}))
}

func (suite *OperatorSpecSuite) TestPanic() {
specPanic := network.NewOperatorSpec(network.NamespaceName, "dhcp6/eth0")
*specPanic.TypedSpec() = network.OperatorSpecSpec{
Operator: network.OperatorDHCP6,
LinkName: "eth0",
RequireUp: true,
DHCP6: network.DHCP6OperatorSpec{
RouteMetric: 1024,
},
}

suite.Require().NoError(suite.state.Create(suite.ctx, specPanic))

linkState := network.NewLinkStatus(network.NamespaceName, "eth0")
*linkState.TypedSpec() = network.LinkStatusSpec{
OperationalState: nethelpers.OperStateUp,
}

suite.Require().NoError(suite.state.Create(suite.ctx, linkState))

// DHCP6 operator should panic and then restart
suite.Assert().NoError(retry.Constant(3*time.Second, retry.WithUnits(100*time.Millisecond)).Retry(
func() error {
return suite.assertRunning([]string{"dhcp6/eth0"}, func(op *mockOperator) error { return nil })
}))

// bring down the interface, operator should be stopped
_, err := suite.state.UpdateWithConflicts(suite.ctx, linkState.Metadata(), func(r resource.Resource) error {
r.(*network.LinkStatus).TypedSpec().OperationalState = nethelpers.OperStateDown

return nil
})
suite.Require().NoError(err)

suite.Assert().NoError(retry.Constant(3*time.Second, retry.WithUnits(100*time.Millisecond)).Retry(
func() error {
return suite.assertRunning(nil, func(op *mockOperator) error {
return nil
})
}))
}

func (suite *OperatorSpecSuite) TestOperatorOutputs() {
specDHCP := network.NewOperatorSpec(network.NamespaceName, "dhcp4/eth0")
*specDHCP.TypedSpec() = network.OperatorSpecSpec{
Expand Down

0 comments on commit 9427e78

Please sign in to comment.