diff --git a/Makefile b/Makefile index c6e82852b..207925eba 100644 --- a/Makefile +++ b/Makefile @@ -113,6 +113,14 @@ endif -tags "$(BUILD_TAGS)" \ ./test/e2e/problemmaker/problem_maker.go +./bin/health-checker: $(PKG_SOURCES) + CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \ + -mod vendor \ + -o bin/health-checker \ + -ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \ + -tags "$(BUILD_TAGS)" \ + cmd/healthchecker/health_checker.go + Dockerfile: Dockerfile.in sed -e 's|@BASEIMAGE@|$(BASEIMAGE)|g' $< >$@ ifneq ($(ENABLE_JOURNALD), 1) @@ -134,12 +142,12 @@ e2e-test: vet fmt build-tar -boskos-project-type=$(BOSKOS_PROJECT_TYPE) -job-name=$(JOB_NAME) \ -artifacts-dir=$(ARTIFACTS) -build-binaries: ./bin/node-problem-detector ./bin/log-counter +build-binaries: ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker build-container: build-binaries Dockerfile docker build -t $(IMAGE) . -build-tar: ./bin/node-problem-detector ./bin/log-counter ./test/bin/problem-maker +build-tar: ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker ./test/bin/problem-maker tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh test/bin/problem-maker sha1sum $(TARBALL) md5sum $(TARBALL) @@ -164,6 +172,7 @@ push-tar: build-tar push: push-container push-tar clean: + rm -f bin/health-checker rm -f bin/log-counter rm -f bin/node-problem-detector rm -f test/bin/problem-maker diff --git a/cmd/healthchecker/health_checker.go b/cmd/healthchecker/health_checker.go new file mode 100644 index 000000000..772bc0264 --- /dev/null +++ b/cmd/healthchecker/health_checker.go @@ -0,0 +1,53 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "flag" + "fmt" + "os" + + "github.com/spf13/pflag" + + "k8s.io/node-problem-detector/cmd/healthchecker/options" + "k8s.io/node-problem-detector/pkg/custompluginmonitor/types" + "k8s.io/node-problem-detector/pkg/healthchecker" +) + +func main() { + // Set glog flag so that it does not log to files. + if err := flag.Set("logtostderr", "true"); err != nil { + fmt.Printf("Failed to set logtostderr=true: %v", err) + os.Exit(int(types.Unknown)) + } + + hco := options.NewHealthCheckerOptions() + hco.AddFlags(pflag.CommandLine) + pflag.Parse() + hco.ValidOrDie() + + hc, err := healthchecker.NewHealthChecker(hco) + if err != nil { + fmt.Println(err) + os.Exit(int(types.Unknown)) + } + if !hc.CheckHealth() { + fmt.Printf("%v was found unhealthy; repair flag : %v\n", hco.Component, hco.EnableRepair) + os.Exit(int(types.NonOK)) + } + os.Exit(int(types.OK)) +} diff --git a/cmd/healthchecker/options/options.go b/cmd/healthchecker/options/options.go new file mode 100644 index 000000000..66b704ec6 --- /dev/null +++ b/cmd/healthchecker/options/options.go @@ -0,0 +1,76 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "time" + + "github.com/spf13/pflag" + + "k8s.io/node-problem-detector/pkg/healthchecker/types" +) + +// NewHealthCheckerOptions returns an empty health check options struct. +func NewHealthCheckerOptions() *HealthCheckerOptions { + return &HealthCheckerOptions{} +} + +// HealthCheckerOptions are the options used to configure the health checker. +type HealthCheckerOptions struct { + Component string + ContainerRuntime string + EnableRepair bool + CriCtlPath string + CoolDownTime time.Duration + HealthCheckTimeout time.Duration +} + +// AddFlags adds health checker command line options to pflag. +func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) { + fs.StringVar(&hco.Component, "component", types.KubeletComponent, + "The component to check health for. Supports kubelet and container-runtime. Defaults to kubelet.") + fs.StringVar(&hco.ContainerRuntime, "container-runtime", types.DockerRuntime, + "The underlying container runtime name on the node. This is ignored if component is not container-runtime. Supports docker and containerd. Defaults to docker.") + fs.BoolVar(&hco.EnableRepair, "enable-repair", true, "Flag to enable/disable repair attempt for the component. Defaults to true.") + fs.StringVar(&hco.CriCtlPath, "crictl-path", types.DefaultCriCtl, + "The path to the crictl binary. This is used to check health of container-runtimes other than docker. Defaults to /usr/bin/crictl") + fs.DurationVar(&hco.CoolDownTime, "cooldown-time", types.DefaultCooldownTime, + "The time to wait after repairing the component. 
This helps in maintaining a buffer between repair and next invocation. Defaults to 2m.") + fs.DurationVar(&hco.HealthCheckTimeout, "health-check-timeout", types.DefaultHealthCheckTimeout, + "The time to wait before marking the component as unhealthy. Defaults to 10s.") +} + +// ValidOrDie validates health checker command line options. +func (hco *HealthCheckerOptions) ValidOrDie() { + // Make sure the component specified is valid. + if hco.Component != types.KubeletComponent && hco.Component != types.ContainerRuntimeComponent { + panic("The component specified is not supported. Supported components are : ") + } + // Skip checking for container-runtime if the component is not container-runtime + if hco.Component != types.ContainerRuntimeComponent { + return + } + // Make sure the container-runtime is valid. + if hco.ContainerRuntime != types.DockerRuntime && hco.ContainerRuntime != types.ContainerdRuntime { + panic("The container-runtime specified is not supported. Supported runtimes are : ") + } +} + +func init() { + pflag.CommandLine.AddGoFlagSet(flag.CommandLine) +} diff --git a/cmd/healthchecker/options/options_test.go b/cmd/healthchecker/options/options_test.go new file mode 100644 index 000000000..2afc2baec --- /dev/null +++ b/cmd/healthchecker/options/options_test.go @@ -0,0 +1,83 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package options + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "k8s.io/node-problem-detector/pkg/healthchecker/types" +) + +func TestValidOrDie(t *testing.T) { + testCases := []struct { + name string + hco HealthCheckerOptions + expectPanic bool + }{ + { + name: "valid kubelet component", + hco: HealthCheckerOptions{ + Component: types.KubeletComponent, + }, + expectPanic: false, + }, + { + name: "invalid component", + hco: HealthCheckerOptions{ + Component: "wrongComponent", + }, + expectPanic: true, + }, + { + name: "valid container-runtime", + hco: HealthCheckerOptions{ + Component: types.ContainerRuntimeComponent, + ContainerRuntime: types.DockerRuntime, + }, + expectPanic: false, + }, + { + name: "invalid container-runtime", + hco: HealthCheckerOptions{ + Component: types.ContainerRuntimeComponent, + ContainerRuntime: "wrongContainerRuntimeName", + }, + expectPanic: true, + }, + // The --container-runtime should be ignored if component is not container-runtime. + { + name: "valid kubelet component and invalid container-runtime", + hco: HealthCheckerOptions{ + Component: types.KubeletComponent, + ContainerRuntime: "wrongContainerRuntimeName", + }, + expectPanic: false, + }, + } + + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + if test.expectPanic { + assert.Panics(t, test.hco.ValidOrDie, "HealthChecker option %+v is invalid. Expected ValidOrDie to panic.", test.hco) + } else { + assert.NotPanics(t, test.hco.ValidOrDie, "HealthChecker option %+v is valid. 
Expected ValidOrDie to not panic.", test.hco) + } + }) + } +} diff --git a/config/health-checker-container-runtime.json b/config/health-checker-container-runtime.json new file mode 100644 index 000000000..69cf56128 --- /dev/null +++ b/config/health-checker-container-runtime.json @@ -0,0 +1,33 @@ +{ + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "10s", + "timeout": "3m", + "max_output_length": 80, + "concurrency": 1 + }, + "source": "health-checker", + "metricsReporting": true, + "conditions": [ + { + "type": "ContainerRuntimeUnhealthy", + "reason": "ContainerRuntimeIsHealthy", + "message": "Container runtime on the node is functioning properly" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "ContainerRuntimeUnhealthy", + "reason": "DockerUnhealthy", + "path": "/home/kubernetes/bin/health-checker", + "args": [ + "--component=container-runtime", + "--enable-repair=false", + "--container-runtime=docker", + "--cooldown-time=2m" + ], + "timeout": "3m" + } + ] +} diff --git a/config/health-checker-kubelet.json b/config/health-checker-kubelet.json new file mode 100644 index 000000000..abb9aa2f3 --- /dev/null +++ b/config/health-checker-kubelet.json @@ -0,0 +1,32 @@ +{ + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "10s", + "timeout": "3m", + "max_output_length": 80, + "concurrency": 1 + }, + "source": "health-checker", + "metricsReporting": true, + "conditions": [ + { + "type": "KubeletUnhealthy", + "reason": "KubeletIsHealthy", + "message": "kubelet on the node is functioning properly" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "KubeletUnhealthy", + "reason": "KubeletUnhealthy", + "path": "/home/kubernetes/bin/health-checker", + "args": [ + "--component=kubelet", + "--enable-repair=false", + "--cooldown-time=2m" + ], + "timeout": "3m" + } + ] +} diff --git a/pkg/healthchecker/health_checker.go b/pkg/healthchecker/health_checker.go new file mode 100644 index 000000000..da5bd3ded --- /dev/null +++ 
// healthChecker bundles the check and repair behaviour for one component
// together with the timing knobs used by CheckHealth.
type healthChecker struct {
	// enableRepair gates the call to repairFunc when the component is unhealthy.
	enableRepair bool
	// healthCheckFunc reports whether the component is currently healthy.
	healthCheckFunc func() bool
	// repairFunc is "best-effort": errors from the underlying actions are
	// ignored, because the commands that kill the process fail when the
	// service is already down.
	repairFunc func()
	// crictlPath is the path to the crictl binary taken from the options.
	crictlPath string
	// healthCheckTimeout bounds the health polling; coolDownTime is how long
	// to stall after a repair.
	healthCheckTimeout, coolDownTime time.Duration
}

// NewHealthChecker returns a new health checker configured with the given options.
+func NewHealthChecker(hco *options.HealthCheckerOptions) (types.HealthChecker, error) { + hc := &healthChecker{ + enableRepair: hco.EnableRepair, + crictlPath: hco.CriCtlPath, + healthCheckTimeout: hco.HealthCheckTimeout, + coolDownTime: hco.CoolDownTime, + } + switch hco.Component { + case types.KubeletComponent: + hc.healthCheckFunc = kubeletHealthCheck + hc.repairFunc = kubeletRepair + case types.ContainerRuntimeComponent: + hc.healthCheckFunc = getContainerRuntimeHealthCheck(hco.ContainerRuntime, hco.CriCtlPath) + hc.repairFunc = getContainerRuntimeRepair(hco.ContainerRuntime) + default: + return nil, fmt.Errorf("invalid component : %+v specified in health check options", hco.Component) + } + return hc, nil +} + +// CheckHealth checks for the health of the component and tries to repair if enabled. +// Returns true if healthy, false otherwise. +func (hc *healthChecker) CheckHealth() bool { + // Poll till the health check timeout for the component to be up. + if err := wait.PollImmediate(types.PollInterval, hc.healthCheckTimeout, func() (bool, error) { + healthy := hc.healthCheckFunc() + if healthy { + return true, nil + } + return false, nil + }); err != nil { + // The service is unhealthy. + // Attempt repair based on flag. + if hc.enableRepair { + glog.Infof("health-checker: component is unhealthy, proceeding to repair") + hc.repairFunc() + // stall for cool down period after repairing + time.Sleep(hc.coolDownTime) + } + return false + } + return true +} + +// kubeletHealthCheck checks the health of kubelet by hitting to the health endpoint. +// Returns true if healthy, false otherwise. +func kubeletHealthCheck() bool { + httpClient := http.Client{Timeout: types.HttpTimeout} + response, err := httpClient.Get(types.KubeletHealthCheckEndpoint) + if err != nil || response.StatusCode != http.StatusOK { + return false + } + return true +} + +// kubeletRepair tries to kill the kubelet service if a problem is detected. 
+func kubeletRepair() { + execCommand("systemctl", "kill", "kubelet") +} + +// getContainerRuntimeHealthCheck returns the health check function depending on the runtime. +func getContainerRuntimeHealthCheck(runtime, crictlPath string) func() bool { + if runtime == types.DockerRuntime { + // Use "docker ps" for docker health check. Not using crictl for docker to remove + // dependency on the kubelet. + return func() bool { + if err := execCommand("docker", "ps"); err != nil { + return false + } + return true + } + } + // Use "crictl pods" for containerd health check. + return func() bool { + if err := execCommand(crictlPath, "pods"); err != nil { + return false + } + return true + } +} + +// getContainerRuntimeHealthCheck returns the repair function depending on the runtime. +func getContainerRuntimeRepair(runtime string) func() { + if runtime == types.DockerRuntime { + return func() { + execCommand("pkill", "-SIGUSR1", "dockerd") + execCommand("systemctl", "kill", "--kill-who=main", types.DockerRuntime) + } + } + return func() { + execCommand("systemctl", "kill", "--kill-who=main", types.ContainerdRuntime) + } +} + +// execCommand executes the bash command and returns the error. +func execCommand(command string, args ...string) error { + cmd := exec.Command(command, args...) + glog.Infof("health-checker: executing command : %v\n", cmd) + if out, err := cmd.Output(); err != nil { + glog.Infof("health-checker: command failed : %v, %v\n", err.Error(), out) + return err + } + return nil +} diff --git a/pkg/healthchecker/health_checker_test.go b/pkg/healthchecker/health_checker_test.go new file mode 100644 index 000000000..61c33ad5c --- /dev/null +++ b/pkg/healthchecker/health_checker_test.go @@ -0,0 +1,96 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package healthchecker + +import ( + "testing" + "time" + + "k8s.io/node-problem-detector/pkg/healthchecker/types" +) + +var repairCalled bool + +func NewTestHealthChecker(repairFunc func(), healthCheckFunc func() bool, enableRepair bool) types.HealthChecker { + repairCalled = false + return &healthChecker{ + enableRepair: enableRepair, + healthCheckFunc: healthCheckFunc, + repairFunc: repairFunc, + healthCheckTimeout: time.Second, + coolDownTime: time.Second, + } +} + +func healthyFunc() bool { + return true +} + +func unhealthyFunc() bool { + return false +} + +func repairFunc() { + repairCalled = true +} + +func TestHealthCheck(t *testing.T) { + for _, tc := range []struct { + description string + enableRepair bool + healthy bool + healthCheckFunc func() bool + repairFunc func() + repairCalled bool + }{ + { + description: "healthy component", + enableRepair: true, + healthy: true, + healthCheckFunc: healthyFunc, + repairFunc: repairFunc, + repairCalled: false, + }, + { + description: "unhealthy component and disabled repair", + enableRepair: false, + healthy: false, + healthCheckFunc: unhealthyFunc, + repairFunc: repairFunc, + repairCalled: false, + }, + { + description: "unhealthy component and enabled repair", + enableRepair: true, + healthy: false, + healthCheckFunc: unhealthyFunc, + repairFunc: repairFunc, + repairCalled: true, + }, + } { + t.Run(tc.description, func(t *testing.T) { + hc := NewTestHealthChecker(tc.repairFunc, tc.healthCheckFunc, tc.enableRepair) + healthy := hc.CheckHealth() + if healthy != tc.healthy { + t.Errorf("incorrect health returned 
got %t; expected %t", healthy, tc.healthy) + } + if repairCalled != tc.repairCalled { + t.Errorf("incorrect repairCalled got %t; expected %t", repairCalled, tc.repairCalled) + } + }) + } +} diff --git a/pkg/healthchecker/types/types.go b/pkg/healthchecker/types/types.go new file mode 100644 index 000000000..ab0fc8c3b --- /dev/null +++ b/pkg/healthchecker/types/types.go @@ -0,0 +1,36 @@ +/* +Copyright 2020 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package types + +import "time" + +const ( + HttpTimeout = 10 * time.Second + DefaultCooldownTime = 2 * time.Minute + DefaultHealthCheckTimeout = 10 * time.Second + PollInterval = 10 * time.Second + DefaultCriCtl = "/usr/bin/crictl" + DockerRuntime = "docker" + ContainerdRuntime = "containerd" + KubeletComponent = "kubelet" + ContainerRuntimeComponent = "container-runtime" + KubeletHealthCheckEndpoint = "http://127.0.0.1:10248/healthz" +) + +type HealthChecker interface { + CheckHealth() bool +}