From 66713f69af97e33de854a07bc0dafcf7eaf8eaf8 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Fri, 7 Jul 2023 14:39:58 +0800 Subject: [PATCH 1/2] embed: fix nil pointer dereference when stopServer Since v3.4.25, ETCD server introduces http-only urls flag to expose gRPC-only endpoints. When user enables this feature, the stopServer will panic during terminating. If the server is leader, it won't have chance to transfer the leadership. ``` Jul 07 14:43:04 etcd[11502]: received terminated signal, shutting down... Jul 07 14:43:04 etcd[11502]: WARNING: 2023/07/07 14:43:04 grpc: addrConn.createTransport failed to connect to {0.0.0.0:2379 0 }. Err :connection error: desc = "transport: Error while dialing dial tcp 0.0.0.0:2379: connect: connection refused". Reconnecting...Jul 07 14:43:04 etcd[11502]: WARNING: 2023/07/07 14:43:04 grpc: addrConn.createTransport failed to connect to {0.0.0.0:2379 0 }. Err :connection error: desc = "transport: Error while dialing dial tcp 0.0.0.0:2379: connect: connection refused". Reconnecting... Jul 07 14:43:04 etcd[11502]: panic: runtime error: invalid memory address or nil pointer dereference Jul 07 14:43:04 etcd[11502]: [signal SIGSEGV: segmentation violation code=0x1 addr=0x130 pc=0x9ccd45] Jul 07 14:43:04 etcd[11502]: goroutine 225 [running]: Jul 07 14:43:04 etcd[11502]: google.golang.org/grpc.(*Server).Stop(0x0) Jul 07 14:43:04 etcd[11502]: /home/fuwei/go/pkg/mod/google.golang.org/grpc@v1.26.0/server.go:1390 +0x45 Jul 07 14:43:04 etcd[11502]: go.etcd.io/etcd/embed.stopServers.func1() Jul 07 14:43:04 etcd[11502]: /home/fuwei/go/src/go.etcd.io/etcd/embed/etcd.go:431 +0x3c Jul 07 14:43:04 etcd[11502]: go.etcd.io/etcd/embed.stopServers({0x115a558, 0xc000278b70}, 0xc00024f248) Jul 07 14:43:04 etcd[11502]: /home/fuwei/go/src/go.etcd.io/etcd/embed/etcd.go:438 +0x7d Jul 07 14:43:04 etcd[11502]: go.etcd.io/etcd/embed.(*Etcd).Close(0xc0004d6600) Jul 07 14:43:04 etcd[11502]: /home/fuwei/go/src/go.etcd.io/etcd/embed/etcd.go:392 +0x835 Jul 07 14:43:04 etcd[11502]: go.etcd.io/etcd/pkg/osutil.HandleInterrupts.func1() Jul 07 14:43:04 etcd[11502]: /home/fuwei/go/src/go.etcd.io/etcd/pkg/osutil/interrupt_unix.go:70 +0x284 Jul 07 14:43:04 etcd[11502]: created by go.etcd.io/etcd/pkg/osutil.HandleInterrupts Jul 07 14:43:04 etcd[11502]: /home/fuwei/go/src/go.etcd.io/etcd/pkg/osutil/interrupt_unix.go:53 +0xce Jul 07 14:43:04 systemd[1]: etcd.service: Main process exited, code=exited, status=2/INVALIDARGUMENT ``` Signed-off-by: Wei Fu --- embed/etcd.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/embed/etcd.go b/embed/etcd.go index a44ba5e2805..223a8aaeaa8 100644 --- a/embed/etcd.go +++ b/embed/etcd.go @@ -424,18 +424,20 @@ func (e *Etcd) Close() { } func stopServers(ctx context.Context, ss *servers) { - shutdownNow := func() { - // first, close the http.Server + // first, close the http.Server + if ss.http != nil { ss.http.Shutdown(ctx) - // then close grpc.Server; cancels all active RPCs - ss.grpc.Stop() + } + + if ss.grpc == nil { + return } // do not grpc.Server.GracefulStop with TLS enabled etcd server // See https://github.com/grpc/grpc-go/issues/1384#issuecomment-317124531 // and https://github.com/etcd-io/etcd/issues/8916 if ss.secure && ss.http != nil { - shutdownNow() + ss.grpc.Stop() return } @@ -453,7 +455,7 @@ func stopServers(ctx context.Context, ss *servers) { case <-ctx.Done(): // took too long, manually close open transports // e.g. watch streams - shutdownNow() + ss.grpc.Stop() // concurrent GracefulStop should be interrupted <-ch From 15efc559051a44073a55ca661c28d414dfb26ad0 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Fri, 7 Jul 2023 21:28:51 +0800 Subject: [PATCH 2/2] tests/e2e: allow to use SIGTERM to verify graceful-stop Signed-off-by: Wei Fu --- tests/e2e/cluster_test.go | 10 ++++++++++ tests/e2e/cmux_test.go | 14 ++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go index 9186accd181..f13fc88c590 100644 --- a/tests/e2e/cluster_test.go +++ b/tests/e2e/cluster_test.go @@ -133,6 +133,8 @@ type etcdProcessClusterConfig struct { WatchProcessNotifyInterval time.Duration debug bool + + stopSignal os.Signal } // newEtcdProcessCluster launches a new cluster from etcd processes, returning @@ -151,12 +153,20 @@ func newEtcdProcessCluster(cfg *etcdProcessClusterConfig) (*etcdProcessCluster, epc.Close() return nil, err } + epc.procs[i] = proc } if err := epc.Start(); err != nil { return nil, err } + + // overwrite the default signal + if cfg.stopSignal != nil { + for _, proc := range epc.procs { + proc.WithStopSignal(cfg.stopSignal) + } + } return epc, nil } diff --git a/tests/e2e/cmux_test.go b/tests/e2e/cmux_test.go index df9ee3ac114..8d394b31c66 100644 --- a/tests/e2e/cmux_test.go +++ b/tests/e2e/cmux_test.go @@ -21,6 +21,7 @@ import ( "context" "encoding/json" "fmt" + "syscall" "testing" "github.com/stretchr/testify/assert" @@ -63,10 +64,19 @@ func TestConnectionMultiplexing(t *testing.T) { } { t.Run(tc.name, func(t *testing.T) { ctx := context.Background() - cfg := etcdProcessClusterConfig{clusterSize: 1, clientTLS: tc.serverTLS, enableV2: true, clientHttpSeparate: tc.separateHttpPort} + cfg := etcdProcessClusterConfig{ + clusterSize: 1, + clientTLS: tc.serverTLS, + enableV2: true, + clientHttpSeparate: tc.separateHttpPort, + stopSignal: syscall.SIGTERM, // check graceful stop + } clus, err := newEtcdProcessCluster(&cfg) require.NoError(t, err) - defer clus.Close() + + defer func() { + require.NoError(t, clus.Close()) + }() var clientScenarios []clientConnType switch tc.serverTLS {