Skip to content

Commit

Permalink
Restore e2e test failures on cluster health retrieval (#1805)
Browse files Browse the repository at this point in the history
* Restore e2e test failures on cluster health retrieval

We disabled error reporting (leading to e2e test failure) when we cannot
retrieve the health.
A "valid case" for not being able to retrieve the cluster health is when
we're restarting/removing the master node of a v6 cluster. During leader
election, the cluster is temporarily unable to serve such requests.
This is much better with v7 clusters and zen2, where the unavailability
window is significantly smaller.

Let's restore that check, but make sure we ignore any errors resulting
from a v6 cluster upgrade.

* Allow http errors during mutations

Only if they don't happen continuously for more than 60 seconds, which
allows for unavailability during leader election, but not if it lasts
too long.

* Add missing license header

* Make linter happy
  • Loading branch information
sebgl committed Oct 1, 2019
1 parent e77f218 commit 5275cf8
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 3 deletions.
42 changes: 39 additions & 3 deletions test/e2e/test/elasticsearch/steps_mutation.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@ import (
"github.com/stretchr/testify/require"
)

const continuousHealthCheckTimeout = 25 * time.Second
const (
	// continuousHealthCheckTimeout bounds each health-check attempt
	// (presumably the per-request context timeout — confirm against the
	// context creation in ContinuousHealthCheck.Start).
	continuousHealthCheckTimeout = 5 * time.Second
	// clusterUnavailabilityThreshold is the accepted duration for the cluster to temporarily not respond to requests
	// (eg. during leader elections in the middle of a rolling upgrade)
	clusterUnavailabilityThreshold = 60 * time.Second
)

func (b Builder) UpgradeTestSteps(k *test.K8sClient) test.StepList {
return test.StepList{
Expand Down Expand Up @@ -130,6 +135,7 @@ type ContinuousHealthCheckFailure struct {
// ContinuousHealthCheck continuously runs health checks against Elasticsearch
// during the whole mutation process
type ContinuousHealthCheck struct {
b Builder
SuccessCount int
FailureCount int
Failures []ContinuousHealthCheckFailure
Expand All @@ -144,6 +150,7 @@ func NewContinuousHealthCheck(b Builder, k *test.K8sClient) (*ContinuousHealthCh
return nil, err
}
return &ContinuousHealthCheck{
b: b,
stopChan: make(chan struct{}),
esClient: esClient,
}, nil
Expand All @@ -160,6 +167,7 @@ func (hc *ContinuousHealthCheck) AppendErr(err error) {

// Start runs health checks in a goroutine, until stopped
func (hc *ContinuousHealthCheck) Start() {
clusterUnavailability := clusterUnavailability{threshold: clusterUnavailabilityThreshold}
go func() {
ticker := time.NewTicker(test.DefaultRetryDelay)
for {
Expand All @@ -171,10 +179,16 @@ func (hc *ContinuousHealthCheck) Start() {
defer cancel()
health, err := hc.esClient.GetClusterHealth(ctx)
if err != nil {
// TODO: Temporarily account only red clusters, see https://github.com/elastic/cloud-on-k8s/issues/614
// hc.AppendErr(err)
// Could not retrieve cluster health, can happen when the master node is killed
// during a rolling upgrade. We allow it, unless it lasts for too long.
clusterUnavailability.markUnavailable()
if clusterUnavailability.hasExceededThreshold() {
// cluster has been unavailable for too long
hc.AppendErr(err)
}
continue
}
clusterUnavailability.markAvailable()
if estype.ElasticsearchHealth(health.Status) == estype.ElasticsearchRedHealth {
hc.AppendErr(errors.New("cluster health red"))
continue
Expand All @@ -189,3 +203,25 @@ func (hc *ContinuousHealthCheck) Start() {
// Stop signals the health-check goroutine started by Start to terminate.
// The send blocks until the goroutine reads from stopChan.
func (hc *ContinuousHealthCheck) Stop() {
	hc.stopChan <- struct{}{}
}

type clusterUnavailability struct {
start time.Time
threshold time.Duration
}

func (cu *clusterUnavailability) markUnavailable() {
if cu.start.IsZero() {
cu.start = time.Now()
}
}

func (cu *clusterUnavailability) markAvailable() {
cu.start = time.Time{}
}

func (cu *clusterUnavailability) hasExceededThreshold() bool {
if cu.start.IsZero() {
return false
}
return time.Since(cu.start) >= cu.threshold
}
44 changes: 44 additions & 0 deletions test/e2e/test/elasticsearch/steps_mutation_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
// or more contributor license agreements. Licensed under the Elastic License;
// you may not use this file except in compliance with the Elastic License.

package elasticsearch

import (
"testing"
"time"

"github.com/stretchr/testify/require"
)

func Test_clusterUnavailability(t *testing.T) {
	// set an arbitrarily large threshold we'll not reach
	cu := clusterUnavailability{threshold: 1 * time.Hour}

	// no threshold should be exceeded while the cluster is available
	require.False(t, cu.hasExceededThreshold())
	cu.markAvailable()
	require.True(t, cu.start.IsZero())
	require.False(t, cu.hasExceededThreshold())

	// mark the cluster as unavailable: we're still below the threshold
	cu.markUnavailable()
	require.False(t, cu.start.IsZero())
	require.False(t, cu.hasExceededThreshold())

	// marking as unavailable again should not change the start time
	initialStartTime := cu.start
	cu.markUnavailable()
	require.Equal(t, initialStartTime, cu.start)
	require.False(t, cu.hasExceededThreshold())

	// marking as available again should reset the start time
	cu.markAvailable()
	require.True(t, cu.start.IsZero())
	require.False(t, cu.hasExceededThreshold())

	// simulate a lower threshold we should have exceeded
	cu.markUnavailable()
	cu.threshold = time.Duration(0)
	require.True(t, cu.hasExceededThreshold())
}

0 comments on commit 5275cf8

Please sign in to comment.