Skip to content

Commit

Permalink
Add health-check-monitor
Browse files Browse the repository at this point in the history
  • Loading branch information
abansal4032 committed May 26, 2020
1 parent 1d03b66 commit f09501f
Show file tree
Hide file tree
Showing 9 changed files with 567 additions and 2 deletions.
13 changes: 11 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,14 @@ endif
-tags "$(BUILD_TAGS)" \
./test/e2e/problemmaker/problem_maker.go

# Build the health-checker binary for Linux from cmd/healthchecker using the
# vendored modules. The -X linker flag writes $(VERSION) into
# $(PKG)/pkg/version.version, and $(BUILD_TAGS) selects the build tags.
./bin/health-checker: $(PKG_SOURCES)
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
-mod vendor \
-o bin/health-checker \
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
-tags "$(BUILD_TAGS)" \
cmd/healthchecker/health_checker.go

Dockerfile: Dockerfile.in
sed -e 's|@BASEIMAGE@|$(BASEIMAGE)|g' $< >$@
ifneq ($(ENABLE_JOURNALD), 1)
Expand All @@ -134,12 +142,12 @@ e2e-test: vet fmt build-tar
-boskos-project-type=$(BOSKOS_PROJECT_TYPE) -job-name=$(JOB_NAME) \
-artifacts-dir=$(ARTIFACTS)

build-binaries: ./bin/node-problem-detector ./bin/log-counter
build-binaries: ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker

build-container: build-binaries Dockerfile
docker build -t $(IMAGE) .

build-tar: ./bin/node-problem-detector ./bin/log-counter ./test/bin/problem-maker
build-tar: ./bin/node-problem-detector ./bin/log-counter ./bin/health-checker ./test/bin/problem-maker
tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh test/bin/problem-maker
sha1sum $(TARBALL)
md5sum $(TARBALL)
Expand All @@ -164,6 +172,7 @@ push-tar: build-tar
push: push-container push-tar

clean:
rm -f bin/health-checker
rm -f bin/log-counter
rm -f bin/node-problem-detector
rm -f test/bin/problem-maker
Expand Down
53 changes: 53 additions & 0 deletions cmd/healthchecker/health_checker.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
Copyright 2020 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
"flag"
"fmt"
"os"

"github.com/spf13/pflag"

"k8s.io/node-problem-detector/cmd/healthchecker/options"
"k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
"k8s.io/node-problem-detector/pkg/healthchecker"
)

// main runs one health check for the component selected on the command line
// and reports the outcome through the process exit status: types.OK when the
// component is healthy, types.NonOK when it is unhealthy, and types.Unknown
// when the checker could not be configured or constructed.
func main() {
	// Set glog flag so that it does not log to files.
	if err := flag.Set("logtostderr", "true"); err != nil {
		// Terminate the line, matching the other message printed below.
		fmt.Printf("Failed to set logtostderr=true: %v\n", err)
		os.Exit(int(types.Unknown))
	}

	hco := options.NewHealthCheckerOptions()
	hco.AddFlags(pflag.CommandLine)
	pflag.Parse()
	// Panics when the component or container runtime is not supported.
	hco.ValidOrDie()

	hc, err := healthchecker.NewHealthChecker(hco)
	if err != nil {
		fmt.Println(err)
		os.Exit(int(types.Unknown))
	}
	if !hc.CheckHealth() {
		fmt.Printf("%v was found unhealthy; repair flag : %v\n", hco.Component, hco.EnableRepair)
		os.Exit(int(types.NonOK))
	}
	os.Exit(int(types.OK))
}
76 changes: 76 additions & 0 deletions cmd/healthchecker/options/options.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
Copyright 2020 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package options

import (
"flag"
"time"

"github.com/spf13/pflag"

"k8s.io/node-problem-detector/pkg/healthchecker/types"
)

// NewHealthCheckerOptions allocates a HealthCheckerOptions whose fields all
// hold their zero values.
func NewHealthCheckerOptions() *HealthCheckerOptions {
	return new(HealthCheckerOptions)
}

// HealthCheckerOptions are the options used to configure the health checker.
// AddFlags binds each field to a command line flag.
type HealthCheckerOptions struct {
// Component is the component to check health for (--component).
Component string
// ContainerRuntime is the underlying container runtime name on the node
// (--container-runtime). It is ignored unless Component is container-runtime.
ContainerRuntime string
// EnableRepair enables or disables the repair attempt for the component
// (--enable-repair).
EnableRepair bool
// CriCtlPath is the path to the crictl binary (--crictl-path), used to check
// the health of container runtimes other than docker.
CriCtlPath string
// CoolDownTime is the time to wait after repairing the component
// (--cooldown-time).
CoolDownTime time.Duration
// HealthCheckTimeout is the time to wait before marking the component as
// unhealthy (--health-check-timeout).
HealthCheckTimeout time.Duration
}

// AddFlags registers every health checker option on fs, binding each flag to
// the matching field of hco together with its default value and help text.
func (hco *HealthCheckerOptions) AddFlags(fs *pflag.FlagSet) {
	// What to check.
	fs.StringVar(
		&hco.Component,
		"component",
		types.KubeletComponent,
		"The component to check health for. Supports kubelet and container-runtime. Defaults to kubelet.",
	)
	fs.StringVar(
		&hco.ContainerRuntime,
		"container-runtime",
		types.DockerRuntime,
		"The underlying container runtime name on the node. This is ignored if component is not container-runtime. Supports docker and containerd. Defaults to docker.",
	)

	// Whether a repair may be attempted.
	fs.BoolVar(
		&hco.EnableRepair,
		"enable-repair",
		true,
		"Flag to enable/disable repair attempt for the component. Defaults to true.",
	)

	// Tooling used for runtimes other than docker.
	fs.StringVar(
		&hco.CriCtlPath,
		"crictl-path",
		types.DefaultCriCtl,
		"The path to the crictl binary. This is used to check health of container-runtimes other than docker. Defaults to /usr/bin/crictl",
	)

	// Timing knobs.
	fs.DurationVar(
		&hco.CoolDownTime,
		"cooldown-time",
		types.DefaultCooldownTime,
		"The time to wait after repairing the component. This helps in maintaining a buffer between repair and next invocation. Defaults to 2m.",
	)
	fs.DurationVar(
		&hco.HealthCheckTimeout,
		"health-check-timeout",
		types.DefaultHealthCheckTimeout,
		"The time to wait before marking the component as unhealthy. Defaults to 10s.",
	)
}

// ValidOrDie validates health checker command line options. It panics when
// the component is not one of the supported components, or when the component
// is container-runtime and the container runtime is not a supported runtime.
// The container runtime is not inspected for any other component.
func (hco *HealthCheckerOptions) ValidOrDie() {
	// Make sure the component specified is valid.
	if hco.Component != types.KubeletComponent && hco.Component != types.ContainerRuntimeComponent {
		// The message names the components accepted by the check above.
		panic("The component specified is not supported. Supported components are : <kubelet/container-runtime>")
	}
	// Skip checking for container-runtime if the component is not container-runtime
	if hco.Component != types.ContainerRuntimeComponent {
		return
	}
	// Make sure the container-runtime is valid.
	if hco.ContainerRuntime != types.DockerRuntime && hco.ContainerRuntime != types.ContainerdRuntime {
		panic("The container-runtime specified is not supported. Supported runtimes are : <docker/containerd>")
	}
}

// init adds the flags registered on the standard library's default flag set
// to pflag's command line flag set.
func init() {
	goFlags := flag.CommandLine
	pflag.CommandLine.AddGoFlagSet(goFlags)
}
83 changes: 83 additions & 0 deletions cmd/healthchecker/options/options_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*
Copyright 2020 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package options

import (
"testing"

"github.com/stretchr/testify/assert"

"k8s.io/node-problem-detector/pkg/healthchecker/types"
)

// TestValidOrDie checks that ValidOrDie panics for unsupported components and
// container runtimes, and accepts every supported combination.
func TestValidOrDie(t *testing.T) {
	cases := []struct {
		name        string
		hco         HealthCheckerOptions
		expectPanic bool
	}{
		{
			name:        "valid kubelet component",
			hco:         HealthCheckerOptions{Component: types.KubeletComponent},
			expectPanic: false,
		},
		{
			name:        "invalid component",
			hco:         HealthCheckerOptions{Component: "wrongComponent"},
			expectPanic: true,
		},
		{
			name: "valid container-runtime",
			hco: HealthCheckerOptions{
				Component:        types.ContainerRuntimeComponent,
				ContainerRuntime: types.DockerRuntime,
			},
			expectPanic: false,
		},
		{
			name: "invalid container-runtime",
			hco: HealthCheckerOptions{
				Component:        types.ContainerRuntimeComponent,
				ContainerRuntime: "wrongContainerRuntimeName",
			},
			expectPanic: true,
		},
		// A bad --container-runtime must not matter when another component is checked.
		{
			name: "valid kubelet component and invalid container-runtime",
			hco: HealthCheckerOptions{
				Component:        types.KubeletComponent,
				ContainerRuntime: "wrongContainerRuntimeName",
			},
			expectPanic: false,
		},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			if !tc.expectPanic {
				assert.NotPanics(t, tc.hco.ValidOrDie, "HealthChecker option %+v is valid. Expected ValidOrDie to not panic.", tc.hco)
				return
			}
			assert.Panics(t, tc.hco.ValidOrDie, "HealthChecker option %+v is invalid. Expected ValidOrDie to panic.", tc.hco)
		})
	}
}
33 changes: 33 additions & 0 deletions config/health-checker-container-runtime.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"plugin": "custom",
"pluginConfig": {
"invoke_interval": "10s",
"timeout": "3m",
"max_output_length": 80,
"concurrency": 1
},
"source": "health-checker",
"metricsReporting": true,
"conditions": [
{
"type": "ContainerRuntimeUnhealthy",
"reason": "ContainerRuntimeIsHealthy",
"message": "Container runtime on the node is functioning properly"
}
],
"rules": [
{
"type": "permanent",
"condition": "ContainerRuntimeUnhealthy",
"reason": "DockerUnhealthy",
"path": "/home/kubernetes/bin/health-checker",
"args": [
"--component=container-runtime",
"--enable-repair=false",
"--container-runtime=docker",
"--cooldown-time=2m"
],
"timeout": "3m"
}
]
}
32 changes: 32 additions & 0 deletions config/health-checker-kubelet.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"plugin": "custom",
"pluginConfig": {
"invoke_interval": "10s",
"timeout": "3m",
"max_output_length": 80,
"concurrency": 1
},
"source": "health-checker",
"metricsReporting": true,
"conditions": [
{
"type": "KubeletUnhealthy",
"reason": "KubeletIsHealthy",
"message": "kubelet on the node is functioning properly"
}
],
"rules": [
{
"type": "permanent",
"condition": "KubeletUnhealthy",
"reason": "KubeletUnhealthy",
"path": "/home/kubernetes/bin/health-checker",
"args": [
"--component=kubelet",
"--enable-repair=false",
"--cooldown-time=2m"
],
"timeout": "3m"
}
]
}
Loading

0 comments on commit f09501f

Please sign in to comment.