Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Add Health Check logic to MachineHealthCheck Reconciler #2250

Merged
merged 26 commits into from
Feb 28, 2020
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
3f130f0
Fetch health checking targets from cluster
JoelSpeed Jan 20, 2020
eb80ce1
Add test for health checking targets from cluster
JoelSpeed Jan 22, 2020
352001b
Check targets and determine which need remediation
JoelSpeed Jan 22, 2020
2d276e1
Add tests for health checking of nodes
JoelSpeed Jan 23, 2020
618eea1
Watch Machine objects in the MHC controller
JoelSpeed Jan 27, 2020
2f8fba0
Add test for MHC hasMatchingLabels
JoelSpeed Jan 27, 2020
5960a82
Add watch for nodes in remote clusters
JoelSpeed Jan 27, 2020
5f1300d
Add tests for watching of Machines and Nodes
JoelSpeed Jan 28, 2020
5f22b42
Return errors for requeueing
JoelSpeed Jan 30, 2020
0b58895
Remove redundant namespaced name logging
JoelSpeed Feb 5, 2020
bf817c7
Add NodeStartupTimeout to MachineHealthCheck
JoelSpeed Feb 5, 2020
0cafd05
Use NodeStartupTimeout instead of constant
JoelSpeed Feb 5, 2020
7a3919c
Use FailureReason to determine whether a Machine has failed
JoelSpeed Feb 5, 2020
cfd167c
Use IsZero for Machine deletion timestamp
JoelSpeed Feb 5, 2020
bcc2570
Drop namespacedName
JoelSpeed Feb 7, 2020
e8a4b0f
Add context to remote clients
JoelSpeed Feb 7, 2020
843e020
Use runtime objects for tests
JoelSpeed Feb 14, 2020
5ac9cc8
Address feedback
JoelSpeed Feb 21, 2020
ea2cd28
More feedback
JoelSpeed Feb 24, 2020
fbab1a2
Use sync.Map for clusterNodeInformers
JoelSpeed Feb 24, 2020
2ccd5c2
Allow empty status fields
JoelSpeed Feb 26, 2020
f4409bf
Make hasMatchingLabels generic
JoelSpeed Feb 26, 2020
451c720
Feedback #3
JoelSpeed Feb 27, 2020
81f6c32
Go back to map for informers
JoelSpeed Feb 27, 2020
973f205
Don't use pointer for mutex
JoelSpeed Feb 27, 2020
067f1e0
Add nodeMissing field to HealthCheckTarget
JoelSpeed Feb 27, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions api/v1alpha3/machinehealthcheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ type MachineHealthCheckSpec struct {
// "selector" are not healthy.
// +optional
MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"`

// Machines older than this duration without a node will be considered to have
// failed and will be remediated.
// +optional
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
}

// ANCHOR_END: MachineHealthCHeckSpec
Expand Down Expand Up @@ -73,11 +78,11 @@ type UnhealthyCondition struct {
type MachineHealthCheckStatus struct {
// total number of machines counted by this machine health check
// +kubebuilder:validation:Minimum=0
ExpectedMachines int32 `json:"expectedMachines"`
ExpectedMachines int32 `json:"expectedMachines,omitempty"`

// total number of healthy machines counted by this machine health check
// +kubebuilder:validation:Minimum=0
CurrentHealthy int32 `json:"currentHealthy"`
CurrentHealthy int32 `json:"currentHealthy,omitempty"`
}

// ANCHOR_END: MachineHealthCheckStatus
Expand Down
20 changes: 20 additions & 0 deletions api/v1alpha3/machinehealthcheck_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package v1alpha3

import (
"fmt"
"time"

apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -28,6 +29,14 @@ import (
"sigs.k8s.io/controller-runtime/pkg/webhook"
)

var (
// Default time allowed for a node to start up. Can be made longer as part of
// spec if required for particular provider.
// 10 minutes should allow the instance to start and the node to join the
// cluster on most providers.
defaultNodeStartupTimeout = metav1.Duration{Duration: 10 * time.Minute}
)

func (m *MachineHealthCheck) SetupWebhookWithManager(mgr ctrl.Manager) error {
return ctrl.NewWebhookManagedBy(mgr).
For(m).
Expand All @@ -46,6 +55,10 @@ func (m *MachineHealthCheck) Default() {
defaultMaxUnhealthy := intstr.FromString("100%")
m.Spec.MaxUnhealthy = &defaultMaxUnhealthy
}

if m.Spec.NodeStartupTimeout == nil {
m.Spec.NodeStartupTimeout = &defaultNodeStartupTimeout
}
}

// ValidateCreate implements webhook.Validator so a webhook will be registered for the type
Expand Down Expand Up @@ -86,6 +99,13 @@ func (m *MachineHealthCheck) validate(old *MachineHealthCheck) error {
)
}

if m.Spec.NodeStartupTimeout != nil && m.Spec.NodeStartupTimeout.Seconds() < 30 {
allErrs = append(
allErrs,
field.Invalid(field.NewPath("spec", "nodeStartupTimeout"), m.Spec.NodeStartupTimeout, "must be at least 30s"),
)
}

if len(allErrs) == 0 {
return nil
}
Expand Down
66 changes: 66 additions & 0 deletions api/v1alpha3/machinehealthcheck_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package v1alpha3

import (
"testing"
"time"

. "github.com/onsi/gomega"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -30,6 +31,8 @@ func TestMachineHealthCheckDefault(t *testing.T) {
mhc.Default()

g.Expect(mhc.Spec.MaxUnhealthy.String()).To(Equal("100%"))
g.Expect(mhc.Spec.NodeStartupTimeout).ToNot(BeNil())
g.Expect(*mhc.Spec.NodeStartupTimeout).To(Equal(metav1.Duration{Duration: 10 * time.Minute}))
}

func TestMachineHealthCheckLabelSelectorAsSelectorValidation(t *testing.T) {
Expand Down Expand Up @@ -115,3 +118,66 @@ func TestMachineHealthCheckClusterNameImmutable(t *testing.T) {
})
}
}

func TestMachineHealthCheckNodeStartupTimeout(t *testing.T) {
zero := metav1.Duration{Duration: 0}
twentyNineSeconds := metav1.Duration{Duration: 29 * time.Second}
thirtySeconds := metav1.Duration{Duration: 30 * time.Second}
oneMinute := metav1.Duration{Duration: 1 * time.Minute}
minusOneMinute := metav1.Duration{Duration: -1 * time.Minute}

tests := []struct {
name string
timeout *metav1.Duration
expectErr bool
}{
{
name: "when the nodeStartupTimeout is not given",
timeout: nil,
expectErr: false,
},
{
name: "when the nodeStartupTimeout is greater than 30s",
timeout: &oneMinute,
expectErr: false,
},
{
name: "when the nodeStartupTimeout is 30s",
timeout: &thirtySeconds,
expectErr: false,
},
{
name: "when the nodeStartupTimeout is 29s",
timeout: &twentyNineSeconds,
expectErr: true,
},
{
name: "when the nodeStartupTimeout is less than 0",
timeout: &minusOneMinute,
expectErr: true,
},
{
name: "when the nodeStartupTimeout is 0",
timeout: &zero,
expectErr: true,
},
}

for _, tt := range tests {
g := NewWithT(t)

mhc := &MachineHealthCheck{
Spec: MachineHealthCheckSpec{
NodeStartupTimeout: tt.timeout,
},
}

if tt.expectErr {
g.Expect(mhc.ValidateCreate()).NotTo(Succeed())
g.Expect(mhc.ValidateUpdate(mhc)).NotTo(Succeed())
} else {
g.Expect(mhc.ValidateCreate()).To(Succeed())
g.Expect(mhc.ValidateUpdate(mhc)).To(Succeed())
}
}
}
5 changes: 5 additions & 0 deletions api/v1alpha3/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ spec:
description: Any further remediation is only allowed if at most "MaxUnhealthy"
machines selected by "selector" are not healthy.
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: Machines older than this duration without a node will
be considered to have failed and will be remediated.
type: string
selector:
description: Label selector to match machines whose health will be
exercised
Expand Down Expand Up @@ -158,9 +162,6 @@ spec:
format: int32
minimum: 0
type: integer
required:
- currentHealthy
- expectedMachines
type: object
type: object
served: true
Expand Down
19 changes: 19 additions & 0 deletions controllers/machine_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import (
"context"

"github.com/pkg/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
"sigs.k8s.io/controller-runtime/pkg/client"
)
Expand Down Expand Up @@ -47,3 +49,20 @@ func getActiveMachinesInCluster(ctx context.Context, c client.Client, namespace,
}
return machines, nil
}

// hasMatchingLabels verifies that the Label Selector matches the given Labels
func hasMatchingLabels(matchSelector metav1.LabelSelector, matchLabels map[string]string) bool {
// This should never fail, validating webhook should catch this first
selector, err := metav1.LabelSelectorAsSelector(&matchSelector)
if err != nil {
return false
}
// If a nil or empty selector creeps in, it should match nothing, not everything.
if selector.Empty() {
return false
}
if !selector.Matches(labels.Set(matchLabels)) {
return false
}
return true
}
71 changes: 71 additions & 0 deletions controllers/machine_helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,74 @@ func Test_getActiveMachinesInCluster(t *testing.T) {
})
}
}

func TestMachineHealthCheckHasMatchingLabels(t *testing.T) {
testCases := []struct {
name string
selector metav1.LabelSelector
labels map[string]string
expected bool
}{
{
name: "selector matches labels",

selector: metav1.LabelSelector{
MatchLabels: map[string]string{
"foo": "bar",
},
},

labels: map[string]string{
"foo": "bar",
},

expected: true,
},
{
name: "selector does not match labels",

selector: metav1.LabelSelector{
MatchLabels: map[string]string{
"foo": "bar",
},
},

labels: map[string]string{
"no": "match",
},
expected: false,
},
{
name: "selector is empty",
selector: metav1.LabelSelector{},
labels: map[string]string{},
expected: false,
},
{
name: "seelctor is invalid",
selector: metav1.LabelSelector{
MatchLabels: map[string]string{
"foo": "bar",
},
MatchExpressions: []metav1.LabelSelectorRequirement{
{
Operator: "bad-operator",
},
},
},
labels: map[string]string{
"foo": "bar",
},
expected: false,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
g := NewWithT(t)

got := hasMatchingLabels(tc.selector, tc.labels)
g.Expect(got).To(Equal(tc.expected))
})
}
}
Loading