Skip to content

Commit

Permalink
etcdmain: Honour ExperimentalWaitClusterReadyTimeout in startEtcd
Browse files Browse the repository at this point in the history
When quorum could not be reached, startEtcd waited forever and never sent
the systemd notify message. As a result, systemd would eventually time out
and restart the etcd process, which would likely leave the unhealthy cluster
in an even worse state.

Improves etcd-io#13785

Signed-off-by: Nicolai Moore <niconorsk@gmail.com>
  • Loading branch information
niconorsk committed Aug 26, 2022
1 parent 96a2669 commit 23c9349
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 15 deletions.
3 changes: 3 additions & 0 deletions server/etcdmain/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"os"
"runtime"
"strings"
"time"

"go.etcd.io/etcd/client/pkg/v3/fileutil"
"go.etcd.io/etcd/client/pkg/v3/logutil"
Expand Down Expand Up @@ -207,6 +208,8 @@ func startEtcd(cfg *embed.Config) (<-chan struct{}, <-chan error, error) {
select {
case <-e.Server.ReadyNotify(): // wait for e.Server to join the cluster
case <-e.Server.StopNotify(): // publish aborted from 'ErrStopped'
case <-time.After(cfg.ExperimentalWaitClusterReadyTimeout):
e.GetLogger().Warn("startEtcd: timed out waiting for the ready notification")
}
return e.Server.StopNotify(), e.Err(), nil
}
Expand Down
15 changes: 2 additions & 13 deletions tests/e2e/cluster_downgrade_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ func testDowngradeUpgrade(t *testing.T, clusterSize int) {

t.Log("Downgrade enabled, validating if cluster is ready for downgrade")
for i := 0; i < len(epc.Procs); i++ {
expectLog(t, epc.Procs[i], "The server is ready to downgrade")
e2e.ExpectLog(t, epc.Procs[i], "The server is ready to downgrade")
validateVersion(t, epc.Cfg, epc.Procs[i], version.Versions{Cluster: lastVersionStr, Server: currentVersionStr})
}
t.Log("Cluster is ready for downgrade")
Expand All @@ -73,7 +73,7 @@ func testDowngradeUpgrade(t *testing.T, clusterSize int) {
startEtcd(t, epc.Procs[i], lastReleaseBinary)
}
t.Log("All members downgraded, validating downgrade")
expectLog(t, leader(t, epc), "the cluster has been downgraded")
e2e.ExpectLog(t, leader(t, epc), "the cluster has been downgraded")
for i := 0; i < len(epc.Procs); i++ {
validateVersion(t, epc.Cfg, epc.Procs[i], version.Versions{Cluster: lastVersionStr, Server: lastVersionStr})
}
Expand Down Expand Up @@ -164,17 +164,6 @@ func validateVersion(t *testing.T, cfg *e2e.EtcdProcessClusterConfig, member e2e
}
}

// expectLog passes the expectLog string to ep.Logs().Expect and fails the
// test with t.Fatal if that call returns an error. The call is run through
// testutils.ExecuteWithTimeout with a 30 second limit.
func expectLog(t *testing.T, ep e2e.EtcdProcess, expectLog string) {
t.Helper()
var err error
// The closure assigns to the outer err so the result can be checked after the timed call.
testutils.ExecuteWithTimeout(t, 30*time.Second, func() {
_, err = ep.Logs().Expect(expectLog)
})
if err != nil {
t.Fatal(err)
}
}

func leader(t *testing.T, epc *e2e.EtcdProcessCluster) e2e.EtcdProcess {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*10)
defer cancel()
Expand Down
3 changes: 2 additions & 1 deletion tests/e2e/ctl_v3_grpc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package e2e

import (
"context"
"fmt"
"strings"
"testing"
Expand Down Expand Up @@ -148,7 +149,7 @@ func firstMatch(t *testing.T, expectLine string, logs ...e2e.LogsExpect) string
match := make(chan string, len(logs))
for i := range logs {
go func(l e2e.LogsExpect) {
line, _ := l.Expect(expectLine)
line, _ := l.ExpectWithContext(context.Background(), expectLine)
match <- line
}(logs[i])
}
Expand Down
43 changes: 43 additions & 0 deletions tests/e2e/no_quorum_notify_daemon_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Copyright 2021 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package e2e

import (
"testing"

"go.etcd.io/etcd/tests/v3/framework/e2e"
)

// TestInitDaemonNotifyWithoutQuorum initializes a cluster, keeps only its
// first process, starts it, and then expects that process's log to report
// both the init daemon notification and the startEtcd ready-wait timeout.
func TestInitDaemonNotifyWithoutQuorum(t *testing.T) {
	// Initialize a cluster with 3 members
	epc, err := e2e.InitEtcdProcessCluster(t, e2e.NewConfigAutoTLS())
	if err != nil {
		t.Fatalf("Failed to initialize the etcd cluster: %v", err)
	}

	// Remove two members, so that only one etcd will get started
	epc.Procs = epc.Procs[:1]

	// Start the etcd cluster with only one member
	if err := epc.Start(); err != nil {
		t.Fatalf("Failed to start the etcd cluster: %v", err)
	}
	// Deferred rather than called at the end, so the started process is also
	// closed when one of the expectations below aborts the test via t.Fatal.
	defer epc.Close()

	// Expect log message indicating systemd notify message has been sent
	e2e.ExpectLog(t, epc.Procs[0], "notifying init daemon")
	// Expect log message indicating time out waiting for quorum hit
	e2e.ExpectLog(t, epc.Procs[0], "startEtcd: timed out waiting for the ready notification")
}
17 changes: 16 additions & 1 deletion tests/framework/e2e/etcd_process.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@
package e2e

import (
"context"
"fmt"
"net/url"
"os"
"testing"
"time"

"go.uber.org/zap"

"go.etcd.io/etcd/client/pkg/v3/fileutil"
"go.etcd.io/etcd/pkg/v3/expect"
"go.etcd.io/etcd/tests/v3/framework/testutils"
)

var (
Expand All @@ -48,7 +52,7 @@ type EtcdProcess interface {
}

// LogsExpect is the type returned by EtcdServerProcess.Logs.
type LogsExpect interface {
// NOTE(review): presumably waits for a log line matching the argument and returns it — confirm against the implementation.
Expect(string) (string, error)
// NOTE(review): looks like the context-taking variant of Expect — confirm how cancellation is reported.
ExpectWithContext(context.Context, string) (string, error)
// NOTE(review): presumably the log lines collected so far — confirm.
Lines() []string
// NOTE(review): presumably the number of collected log lines — confirm.
LineCount() int
}
Expand Down Expand Up @@ -179,3 +183,14 @@ func (ep *EtcdServerProcess) Logs() LogsExpect {
}
return ep.proc
}

// ExpectLog passes expectLog to ep.Logs().ExpectWithContext and fails the
// test with t.Fatal if that call returns an error. The call is run through
// testutils.ExecuteWithTimeout with a 30 second limit.
func ExpectLog(t *testing.T, ep EtcdProcess, expectLog string) {
	t.Helper()

	// A cancellable context instead of a bare context.Background(): it is
	// cancelled on every exit from this function, so a log wait that is still
	// pending when the helper gives up can be released rather than left
	// blocked for the rest of the test run.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	var err error
	// The closure assigns to the outer err so the result can be checked after the timed call.
	testutils.ExecuteWithTimeout(t, 30*time.Second, func() {
		_, err = ep.Logs().ExpectWithContext(ctx, expectLog)
	})
	if err != nil {
		t.Fatal(err)
	}
}

0 comments on commit 23c9349

Please sign in to comment.