Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: ETCD-612: Introduced a dedicated quorumz handler to ensure PDB is violated if quorum is not safe #1278

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion pkg/cmd/readyz/readyz.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@ import (
"errors"
goflag "flag"
"fmt"
"go.etcd.io/etcd/client/pkg/v3/tlsutil"
"net"
"net/http"
"syscall"
"time"

"go.etcd.io/etcd/client/pkg/v3/tlsutil"

"github.com/openshift/cluster-etcd-operator/pkg/etcdcli"

"github.com/spf13/cobra"
Expand Down Expand Up @@ -167,6 +168,7 @@ func (r *readyzOpts) Run() error {
// Handle the /healthz endpoint as well since the static pod controller's guard pods check the /healthz endpoint
// https://github.com/openshift/library-go/blob/edab248e63516c65a93467eaa8224c86d69f5de9/pkg/operator/staticpod/controller/guard/manifests/guard-pod.yaml#L44
mux.HandleFunc("/healthz", r.getReadyzHandlerFunc(shutdownCtx))
mux.HandleFunc("/quorumz", r.getQuorumzHandlerFunc(shutdownCtx))

addr := fmt.Sprintf("0.0.0.0:%d", r.listenPort)
klog.Infof("Listening on %s", addr)
Expand Down Expand Up @@ -233,6 +235,37 @@ func (r *readyzOpts) getReadyzHandlerFunc(ctx context.Context) http.HandlerFunc
}
}

func (r *readyzOpts) getQuorumzHandlerFunc(ctx context.Context) http.HandlerFunc {
return func(w http.ResponseWriter, _ *http.Request) {
etcdClient, err := r.clientPool.Get()
if err != nil {
klog.V(2).Infof("failed to establish etcd client: %v", err)
http.Error(w, fmt.Sprintf("failed to establish etcd client: %v", err), http.StatusServiceUnavailable)
return
}
defer r.clientPool.Return(etcdClient)

timeout, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()

etcdCluster, err := etcdClient.MemberList(timeout)
if err != nil {
klog.V(2).Infof("failed to get the current cluster membership: %v", err)
http.Error(w, fmt.Sprintf("failed to get the current cluster membership: %v", err), http.StatusServiceUnavailable)
return
}

memberHealth := etcdcli.GetMemberHealth(ctx, etcdClient, etcdCluster.Members)
err = etcdcli.IsQuorumFaultTolerantErr(memberHealth)
if err != nil {
klog.V(2).Infof("quorum is not fault tolerant: %v", err)
http.Error(w, fmt.Sprintf("quorum is not fault tolerant: %v", err), http.StatusServiceUnavailable)
return
}
w.WriteHeader(http.StatusOK)
}
}

func permitAddressReuse(network, addr string, conn syscall.RawConn) error {
return conn.Control(func(fd uintptr) {
if err := syscall.SetsockoptInt(int(fd), syscall.SOL_SOCKET, unix.SO_REUSEADDR, 1); err != nil {
Expand Down
23 changes: 0 additions & 23 deletions pkg/operator/etcdcertsigner/etcdcertsignercontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -392,18 +392,6 @@ func (c *EtcdCertSignerController) syncLeafCertificates(
Type: corev1.SecretTypeOpaque,
Data: allCerts,
}

// check the quorum in case the cluster is healthy or not after generating certs, unless we're in force mode
if !forceSkipRollout {
safe, err := c.quorumChecker.IsSafeToUpdateRevision()
if err != nil {
return fmt.Errorf("EtcdCertSignerController can't evaluate whether quorum is safe: %w", err)
}

if !safe {
return fmt.Errorf("skipping EtcdCertSignerController reconciliation due to insufficient quorum")
}
}
_, _, err = resourceapply.ApplySecret(ctx, c.secretClient, recorder, secret)
return err
}
Expand Down Expand Up @@ -474,17 +462,6 @@ func (c *EtcdCertSignerController) ensureBundles(ctx context.Context,
// the leaf certificates are not regenerated too early.
configMap.Annotations[BundleRolloutRevisionAnnotation] = fmt.Sprintf("%d", currentRevision)

// The rollout may be stuck due to a missing etcd-all-bundles configmap, so we override the quorum check here to regenerate the configmap and let the next revision install
if !forceSkipRollout {
safe, err := c.quorumChecker.IsSafeToUpdateRevision()
if err != nil {
return nil, nil, false, fmt.Errorf("EtcdCertSignerController.ensureBundles can't evaluate whether quorum is safe: %w", err)
}
if !safe {
return nil, nil, false, fmt.Errorf("skipping EtcdCertSignerController.ensureBundles reconciliation due to insufficient quorum")
}
}

_, rolloutTriggered, err = resourceapply.ApplyConfigMap(ctx, c.configMapClient, recorder, configMap)
if err != nil {
return nil, nil, rolloutTriggered, fmt.Errorf("could not apply bundle configmap: %w", err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,6 @@ func (c *EtcdEndpointsController) syncConfigMap(ctx context.Context, recorder ev

required.Data = endpointAddresses

safe, err := c.quorumChecker.IsSafeToUpdateRevision()
if err != nil {
return fmt.Errorf("EtcdEndpointsController can't evaluate whether quorum is safe: %w", err)
}
if !safe {
return fmt.Errorf("skipping EtcdEndpointsController reconciliation due to insufficient quorum")
}

// Apply endpoint updates
if _, _, err := resourceapply.ApplyConfigMap(ctx, c.configmapClient, recorder, required); err != nil {
return fmt.Errorf("applying configmap update failed :%w", err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package etcdendpointscontroller

import (
"context"
"fmt"
"testing"

configv1 "github.com/openshift/api/config/v1"
Expand Down Expand Up @@ -269,29 +268,6 @@ func TestBootstrapAnnotationRemoval(t *testing.T) {
}
},
},
{
// The configmap should not update when quorum is critical
name: "ClusterNotUpdateWithMemberChangeViolatingQuorum",
objects: []runtime.Object{
u.BootstrapConfigMap(u.WithBootstrapStatus("complete")),
u.EndpointsConfigMap(
u.WithEndpoint(etcdMembers[0].ID, etcdMembers[0].PeerURLs[0]),
u.WithEndpoint(etcdMembers[1].ID, etcdMembers[1].PeerURLs[0]),
u.WithEndpoint(etcdMembers[2].ID, etcdMembers[2].PeerURLs[0]),
),
},
staticPodStatus: u.StaticPodOperatorStatus(
u.WithLatestRevision(3),
u.WithNodeStatusAtCurrentRevision(3),
u.WithNodeStatusAtCurrentRevision(3),
u.WithNodeStatusAtCurrentRevision(3),
),
expectBootstrap: false,
etcdMembers: []*etcdserverpb.Member{
u.FakeEtcdMemberWithoutServer(0),
},
expectedErr: fmt.Errorf("EtcdEndpointsController can't evaluate whether quorum is safe: %w", fmt.Errorf("etcd cluster has quorum of 1 which is not fault tolerant: [{Member:name:\"etcd-0\" peerURLs:\"https://10.0.0.1:2380\" clientURLs:\"https://10.0.0.1:2907\" Healthy:true Took: Error:<nil>}]")),
},
{
// The configmap should be created without a bootstrap IP because the
// only time the configmap won't already exist is when we've upgraded
Expand Down
2 changes: 1 addition & 1 deletion pkg/operator/starter.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ func RunOperator(ctx context.Context, controllerContext *controllercmd.Controlle
"openshift-etcd-operator",
"etcd-operator",
"9980",
"readyz",
"quorumz",
// etcd should use a default UnhealthyPodEvictionPolicy behavior corresponding to the
// IfHealthyBudget policy. This policy achieves the least amount of disruption, as it
// does not allow eviction when multiple etcd pods do not report readiness.
Expand Down
10 changes: 0 additions & 10 deletions pkg/operator/targetconfigcontroller/targetconfigcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,16 +140,6 @@ func (c *TargetConfigController) createTargetConfig(
operatorSpec *operatorv1.StaticPodOperatorSpec,
envVars map[string]string) error {

// check the cluster is healthy or not after get env var, to ensure it is safe to rollout
safe, err := c.quorumChecker.IsSafeToUpdateRevision()
if err != nil {
return fmt.Errorf("TargetConfigController can't evaluate whether quorum is safe: %w", err)
}

if !safe {
return fmt.Errorf("skipping TargetConfigController reconciliation due to insufficient quorum")
}

var errs error
contentReplacer, err := c.getSubstitutionReplacer(operatorSpec, c.targetImagePullSpec, c.operatorImagePullSpec, envVars)
if err != nil {
Expand Down
26 changes: 4 additions & 22 deletions pkg/operator/targetconfigcontroller/targetconfigcontroller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ package targetconfigcontroller

import (
"context"
"fmt"
"testing"

configv1 "github.com/openshift/api/config/v1"
operatorv1 "github.com/openshift/api/operator/v1"
configv1listers "github.com/openshift/client-go/config/listers/config/v1"
Expand All @@ -18,7 +19,6 @@ import (
"k8s.io/client-go/kubernetes/fake"
corev1listers "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"testing"

"github.com/openshift/cluster-etcd-operator/pkg/etcdcli"
"github.com/openshift/cluster-etcd-operator/pkg/etcdenvvar"
Expand Down Expand Up @@ -55,24 +55,6 @@ func TestTargetConfigController(t *testing.T) {
},
etcdMembersEnvVar: "1,2,3",
},
{
name: "Quorum not fault tolerant",
objects: []runtime.Object{
u.BootstrapConfigMap(u.WithBootstrapStatus("complete")),
},
staticPodStatus: u.StaticPodOperatorStatus(
u.WithLatestRevision(3),
u.WithNodeStatusAtCurrentRevision(3),
u.WithNodeStatusAtCurrentRevision(3),
u.WithNodeStatusAtCurrentRevision(3),
),
etcdMembers: []*etcdserverpb.Member{
u.FakeEtcdMemberWithoutServer(0),
u.FakeEtcdMemberWithoutServer(2),
},
etcdMembersEnvVar: "1,3",
expectedErr: fmt.Errorf("TargetConfigController can't evaluate whether quorum is safe: %w", fmt.Errorf("etcd cluster has quorum of 2 which is not fault tolerant: [{Member:name:\"etcd-0\" peerURLs:\"https://10.0.0.1:2380\" clientURLs:\"https://10.0.0.1:2907\" Healthy:true Took: Error:<nil>} {Member:ID:2 name:\"etcd-2\" peerURLs:\"https://10.0.0.3:2380\" clientURLs:\"https://10.0.0.3:2907\" Healthy:true Took: Error:<nil>}]")),
},
{
name: "Quorum not fault tolerant but bootstrapping",
objects: []runtime.Object{
Expand Down Expand Up @@ -100,7 +82,7 @@ func TestTargetConfigController(t *testing.T) {
}
}

func TestControllerDegradesOnQuorumLoss(t *testing.T) {
/* func TestControllerDegradesOnQuorumLoss(t *testing.T) {
objects := []runtime.Object{
u.BootstrapConfigMap(u.WithBootstrapStatus("complete")),
}
Expand Down Expand Up @@ -137,7 +119,7 @@ func TestControllerDegradesOnQuorumLoss(t *testing.T) {
}

require.Truef(t, foundDegraded, "could not find degraded status in operator client")
}
} */

func getController(t *testing.T, staticPodStatus *operatorv1.StaticPodOperatorStatus, objects []runtime.Object, etcdMembers []*etcdserverpb.Member) (events.Recorder, v1helpers.StaticPodOperatorClient, *TargetConfigController) {
fakeOperatorClient := v1helpers.NewFakeStaticPodOperatorClient(
Expand Down