Skip to content

Commit

Permalink
Restore e2e test failures on cluster health retrieval (#1805)
Browse files Browse the repository at this point in the history
* Restore e2e test failures on cluster health retrieval

We disabled error reporting (leading to e2e test failure) when we cannot
retrieve the health.
A "valid case" for not being able to retrieve the cluster health is when
we're restarting/removing the master node of a v6 cluster. During leader
election, the cluster is temporarily unable to serve such requests.
This is much better with v7 clusters and zen2, where the unavailability
window is significantly smaller.

Let's restore that check, but make sure we ignore any errors resulting
from a v6 cluster upgrade.

* Allow http errors during mutations

Only if they don't happen continuously for more than 60 seconds, which
allows for unavailability during leader election, but not if it lasts
too long.

* Add missing license header

* Make linter happy
  • Loading branch information
sebgl committed Oct 1, 2019
1 parent e77f218 commit 5275cf8
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 3 deletions.
42 changes: 39 additions & 3 deletions test/e2e/test/elasticsearch/steps_mutation.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@ import (
"github.com/stretchr/testify/require"
)

const continuousHealthCheckTimeout = 25 * time.Second
const (
	// continuousHealthCheckTimeout bounds each health-check attempt
	// (presumably the per-request context timeout — confirm against the
	// context creation in ContinuousHealthCheck.Start).
	continuousHealthCheckTimeout = 5 * time.Second
	// clusterUnavailabilityThreshold is the accepted duration for the cluster to temporarily not respond to requests
	// (eg. during leader elections in the middle of a rolling upgrade)
	clusterUnavailabilityThreshold = 60 * time.Second
)

func (b Builder) UpgradeTestSteps(k *test.K8sClient) test.StepList {
return test.StepList{
Expand Down Expand Up @@ -130,6 +135,7 @@ type ContinuousHealthCheckFailure struct {
// ContinuousHealthCheck continuously runs health checks against Elasticsearch
// during the whole mutation process
type ContinuousHealthCheck struct {
b Builder
SuccessCount int
FailureCount int
Failures []ContinuousHealthCheckFailure
Expand All @@ -144,6 +150,7 @@ func NewContinuousHealthCheck(b Builder, k *test.K8sClient) (*ContinuousHealthCh
return nil, err
}
return &ContinuousHealthCheck{
b: b,
stopChan: make(chan struct{}),
esClient: esClient,
}, nil
Expand All @@ -160,6 +167,7 @@ func (hc *ContinuousHealthCheck) AppendErr(err error) {

// Start runs health checks in a goroutine, until stopped
func (hc *ContinuousHealthCheck) Start() {
clusterUnavailability := clusterUnavailability{threshold: clusterUnavailabilityThreshold}
go func() {
ticker := time.NewTicker(test.DefaultRetryDelay)
for {
Expand All @@ -171,10 +179,16 @@ func (hc *ContinuousHealthCheck) Start() {
defer cancel()
health, err := hc.esClient.GetClusterHealth(ctx)
if err != nil {
// TODO: Temporarily account only red clusters, see https://github.com/elastic/cloud-on-k8s/issues/614
// hc.AppendErr(err)
// Could not retrieve cluster health, can happen when the master node is killed
// during a rolling upgrade. We allow it, unless it lasts for too long.
clusterUnavailability.markUnavailable()
if clusterUnavailability.hasExceededThreshold() {
// cluster has been unavailable for too long
hc.AppendErr(err)
}
continue
}
clusterUnavailability.markAvailable()
if estype.ElasticsearchHealth(health.Status) == estype.ElasticsearchRedHealth {
hc.AppendErr(errors.New("cluster health red"))
continue
Expand All @@ -189,3 +203,25 @@ func (hc *ContinuousHealthCheck) Start() {
// Stop signals the health-check goroutine started by Start to terminate.
// The send blocks until the goroutine reads from stopChan.
func (hc *ContinuousHealthCheck) Stop() {
	hc.stopChan <- struct{}{}
}

type clusterUnavailability struct {
start time.Time
threshold time.Duration
}

func (cu *clusterUnavailability) markUnavailable() {
if cu.start.IsZero() {
cu.start = time.Now()
}
}

func (cu *clusterUnavailability) markAvailable() {
cu.start = time.Time{}
}

func (cu *clusterUnavailability) hasExceededThreshold() bool {
if cu.start.IsZero() {
return false
}
return time.Since(cu.start) >= cu.threshold
}
44 changes: 44 additions & 0 deletions test/e2e/test/elasticsearch/steps_mutation_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
// or more contributor license agreements. Licensed under the Elastic License;
// you may not use this file except in compliance with the Elastic License.

package elasticsearch

import (
"testing"
"time"

"github.com/stretchr/testify/require"
)

func Test_clusterUnavailability(t *testing.T) {
	// set an arbitrarily large threshold we'll not reach
	cu := clusterUnavailability{threshold: 1 * time.Hour}

	// no threshold should be exceeded while the cluster is available
	require.False(t, cu.hasExceededThreshold())
	cu.markAvailable()
	require.True(t, cu.start.IsZero())
	require.False(t, cu.hasExceededThreshold())

	// mark the cluster as unavailable: we're still below the threshold
	cu.markUnavailable()
	require.False(t, cu.start.IsZero())
	require.False(t, cu.hasExceededThreshold())

	// marking as unavailable again should not change the start time
	initialStartTime := cu.start
	cu.markUnavailable()
	require.Equal(t, initialStartTime, cu.start)
	require.False(t, cu.hasExceededThreshold())

	// marking as available again should reset the start time
	cu.markAvailable()
	require.True(t, cu.start.IsZero())
	require.False(t, cu.hasExceededThreshold())

	// simulate a lower threshold we should have exceeded
	cu.markUnavailable()
	cu.threshold = time.Duration(0)
	require.True(t, cu.hasExceededThreshold())
}

0 comments on commit 5275cf8

Please sign in to comment.