Make sure multinode clusters can survive restarts #7973

Merged: 51 commits merged on May 28, 2020
Commits (51)
94a9c03
initial commit for restartWorker
sharifelgamal Apr 30, 2020
76349e1
more changes
sharifelgamal May 2, 2020
4cb2d0f
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 2, 2020
38ec707
only warn on fresh start
sharifelgamal May 2, 2020
daaafd5
lint
sharifelgamal May 4, 2020
258be2c
re-enable skipped multi node test
sharifelgamal May 4, 2020
5589f01
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 4, 2020
bd2c4d8
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 4, 2020
e3b7476
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 4, 2020
2ec2665
check kubelet status explicitly
sharifelgamal May 4, 2020
4947106
let's change a bunch of stuff
sharifelgamal May 5, 2020
65c5d57
Merge branch 'better-restart' of github.com:tstromberg/minikube into …
sharifelgamal May 7, 2020
358f167
keepContext
sharifelgamal May 7, 2020
5e98113
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 7, 2020
ad9b4c1
save config
sharifelgamal May 8, 2020
6bb8371
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 8, 2020
4886030
pass everything by reference since we actively change node IP
sharifelgamal May 11, 2020
d4639c5
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 11, 2020
335379d
it works
sharifelgamal May 12, 2020
562fd5b
improve node stop output
sharifelgamal May 12, 2020
6d6bad6
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 15, 2020
26a3d53
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 15, 2020
009e056
revert ip change
sharifelgamal May 15, 2020
faf11e2
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 18, 2020
9165e1a
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 18, 2020
c603467
Merge branch 'ip-save' of github.com:sharifelgamal/minikube into restart
sharifelgamal May 19, 2020
ee5696f
Merge branch 'volume-del' of github.com:sharifelgamal/minikube into r…
sharifelgamal May 19, 2020
6abb399
make sure to call the correct control plane address
sharifelgamal May 20, 2020
70198d8
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 20, 2020
49cc6ae
revert old unrelated changes:
sharifelgamal May 20, 2020
dd7e57e
Merge branch 'restart' of github.com:sharifelgamal/minikube into restart
sharifelgamal May 20, 2020
43d37ec
debugging
sharifelgamal May 20, 2020
3caad25
lint
sharifelgamal May 20, 2020
d5f9490
Merge branch 'node-delete-vol' of github.com:sharifelgamal/minikube i…
sharifelgamal May 20, 2020
db186ee
Merge branch 'node-delete-vol' of github.com:sharifelgamal/minikube i…
sharifelgamal May 20, 2020
13f0b87
better debugging
sharifelgamal May 20, 2020
83740e1
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 20, 2020
95584ea
skip start/stop on github actions
sharifelgamal May 20, 2020
023b2dc
fix test
sharifelgamal May 20, 2020
c412237
add postmortem logs
sharifelgamal May 21, 2020
bde6624
Merge branch 'postmortem' of github.com:sharifelgamal/minikube into r…
sharifelgamal May 21, 2020
c8723ce
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 21, 2020
e225968
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 21, 2020
b516783
it works! and simpler
sharifelgamal May 27, 2020
d075334
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 27, 2020
101f2da
revert changes to node_add
sharifelgamal May 27, 2020
a7e483a
let's simplify kubeadm again
sharifelgamal May 27, 2020
606a307
skip restart test on github actions
sharifelgamal May 27, 2020
1945b5f
swap test order to make it work in github actions
sharifelgamal May 27, 2020
cebba29
comments
sharifelgamal May 27, 2020
94ce379
Merge branch 'master' of github.com:kubernetes/minikube into restart
sharifelgamal May 27, 2020
45 changes: 31 additions & 14 deletions cmd/minikube/cmd/start.go
@@ -286,27 +286,44 @@ func startWithDriver(starter node.Starter, existing *config.ClusterConfig) (*kub
}

numNodes := viper.GetInt(nodes)
if numNodes == 1 && existing != nil {
if existing != nil {
if numNodes > 1 {
// We ignore the --nodes parameter if we're restarting an existing cluster
out.WarningT(`The cluster {{.cluster}} already exists which means the --nodes parameter will be ignored. Use "minikube node add" to add nodes to an existing cluster.`, out.V{"cluster": existing.Name})
}
numNodes = len(existing.Nodes)
}
if numNodes > 1 {
if driver.BareMetal(starter.Cfg.Driver) {
exit.WithCodeT(exit.Config, "The none driver is not compatible with multi-node clusters.")
} else {
out.Ln("")
warnAboutMultiNode()
for i := 1; i < numNodes; i++ {
nodeName := node.Name(i + 1)
n := config.Node{
Name: nodeName,
Worker: true,
ControlPlane: false,
KubernetesVersion: starter.Cfg.KubernetesConfig.KubernetesVersion,
// Only warn users on first start.
if existing == nil {
out.Ln("")
warnAboutMultiNode()

for i := 1; i < numNodes; i++ {
nodeName := node.Name(i + 1)
n := config.Node{
Name: nodeName,
Worker: true,
ControlPlane: false,
KubernetesVersion: starter.Cfg.KubernetesConfig.KubernetesVersion,
}
out.Ln("") // extra newline for clarity on the command line
err := node.Add(starter.Cfg, n)
if err != nil {
return nil, errors.Wrap(err, "adding node")
}
}
out.Ln("") // extra newline for clarity on the command line
err := node.Add(starter.Cfg, n)
if err != nil {
return nil, errors.Wrap(err, "adding node")
} else {
for _, n := range existing.Nodes {
if !n.ControlPlane {
err := node.Add(starter.Cfg, n)
if err != nil {
return nil, errors.Wrap(err, "adding node")
}
}
}
}
}
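
Because the extracted diff above lost its +/- markers, the old and new versions of this block are interleaved. As a reading aid, here is a best-effort reconstruction of the block after this change (a sketch inferred from the diff above, not a verbatim copy of the merged file):

numNodes := viper.GetInt(nodes)
if existing != nil {
	if numNodes > 1 {
		// We ignore the --nodes parameter if we're restarting an existing cluster
		out.WarningT(`The cluster {{.cluster}} already exists which means the --nodes parameter will be ignored. Use "minikube node add" to add nodes to an existing cluster.`, out.V{"cluster": existing.Name})
	}
	numNodes = len(existing.Nodes)
}
if numNodes > 1 {
	if driver.BareMetal(starter.Cfg.Driver) {
		exit.WithCodeT(exit.Config, "The none driver is not compatible with multi-node clusters.")
	} else {
		// Only warn users on first start.
		if existing == nil {
			out.Ln("")
			warnAboutMultiNode()

			for i := 1; i < numNodes; i++ {
				nodeName := node.Name(i + 1)
				n := config.Node{
					Name:              nodeName,
					Worker:            true,
					ControlPlane:      false,
					KubernetesVersion: starter.Cfg.KubernetesConfig.KubernetesVersion,
				}
				out.Ln("") // extra newline for clarity on the command line
				err := node.Add(starter.Cfg, n)
				if err != nil {
					return nil, errors.Wrap(err, "adding node")
				}
			}
		} else {
			// On restart, re-create the worker nodes recorded in the existing
			// cluster config instead of minting new ones.
			for _, n := range existing.Nodes {
				if !n.ControlPlane {
					err := node.Add(starter.Cfg, n)
					if err != nil {
						return nil, errors.Wrap(err, "adding node")
					}
				}
			}
		}
	}
}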
2 changes: 1 addition & 1 deletion cmd/minikube/cmd/status.go
@@ -232,7 +232,7 @@ func status(api libmachine.API, cc config.ClusterConfig, n config.Node) (*Status
glog.Infof("%s kubelet status = %s", name, stk)
st.Kubelet = stk.String()

// Early exit for regular nodes
// Early exit for worker nodes
if !controlPlane {
return st, nil
}
2 changes: 1 addition & 1 deletion pkg/minikube/bootstrapper/bsutil/kverify/api_server.go
@@ -105,7 +105,7 @@ func WaitForHealthyAPIServer(r cruntime.Manager, bs bootstrapper.Bootstrapper, c
}

if err := wait.PollImmediate(kconst.APICallRetryInterval, kconst.DefaultControlPlaneTimeout, healthz); err != nil {
return fmt.Errorf("apiserver healthz never reported healthy")
return fmt.Errorf("apiserver healthz never reported healthy: %v", err)
}

vcheck := func() (bool, error) {
42 changes: 32 additions & 10 deletions pkg/minikube/bootstrapper/kubeadm/kubeadm.go
@@ -302,7 +302,7 @@ func (k *Bootstrapper) StartCluster(cfg config.ClusterConfig) error {

if err := bsutil.ExistingConfig(k.c); err == nil {
glog.Infof("found existing configuration files, will attempt cluster restart")
rerr := k.restartCluster(cfg)
rerr := k.restartControlPlane(cfg)
if rerr == nil {
return nil
}
@@ -484,7 +484,7 @@ func (k *Bootstrapper) needsReconfigure(conf string, hostname string, port int,
}

// restartCluster restarts the Kubernetes cluster configured by kubeadm
func (k *Bootstrapper) restartCluster(cfg config.ClusterConfig) error {
func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
glog.Infof("restartCluster start")

start := time.Now()
@@ -605,10 +605,24 @@ func (k *Bootstrapper) JoinCluster(cc config.ClusterConfig, n config.Node, joinC
}()

// Join the master by specifying its token
joinCmd = fmt.Sprintf("%s --v=10 --node-name=%s", joinCmd, driver.MachineName(cc, n))
out, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", joinCmd))
if err != nil {
return errors.Wrapf(err, "cmd failed: %s\n%+v\n", joinCmd, out)
joinCmd = fmt.Sprintf("%s --node-name=%s", joinCmd, driver.MachineName(cc, n))

join := func() error {
// reset first to clear any possibly existing state
_, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", fmt.Sprintf("%s reset -f", bsutil.InvokeKubeadm(cc.KubernetesConfig.KubernetesVersion))))
if err != nil {
glog.Infof("kubeadm reset failed, continuing anyway: %v", err)
}

out, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", joinCmd))
if err != nil {
return errors.Wrapf(err, "cmd failed: %s\n%+v\n", joinCmd, out.Output())
}
return nil
}

if err := retry.Expo(join, 10*time.Second, 1*time.Minute); err != nil {
return errors.Wrap(err, "joining cp")
}

if _, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", "sudo systemctl daemon-reload && sudo systemctl enable kubelet && sudo systemctl start kubelet")); err != nil {
@@ -618,17 +632,21 @@ func (k *Bootstrapper) JoinCluster(cc config.ClusterConfig, n config.Node, joinC
return nil
}

// GenerateToken creates a token and returns the appropriate kubeadm join command to run
// GenerateToken creates a token and returns the appropriate kubeadm join command to run, or the already existing token
func (k *Bootstrapper) GenerateToken(cc config.ClusterConfig) (string, error) {
// Take that generated token and use it to get a kubeadm join command
tokenCmd := exec.Command("/bin/bash", "-c", fmt.Sprintf("%s token create --print-join-command --ttl=0", bsutil.InvokeKubeadm(cc.KubernetesConfig.KubernetesVersion)))
r, err := k.c.RunCmd(tokenCmd)
if err != nil {
return "", errors.Wrap(err, "generating bootstrap token")
return "", errors.Wrap(err, "generating join command")
}

joinCmd := r.Stdout.String()
joinCmd = strings.Replace(joinCmd, "kubeadm", bsutil.InvokeKubeadm(cc.KubernetesConfig.KubernetesVersion), 1)
joinCmd = fmt.Sprintf("%s --ignore-preflight-errors=all", strings.TrimSpace(joinCmd))
if cc.KubernetesConfig.CRISocket != "" {
joinCmd = fmt.Sprintf("%s --cri-socket %s", joinCmd, cc.KubernetesConfig.CRISocket)
}

return joinCmd, nil
}
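
Putting the worker-join flow together: GenerateToken asks kubeadm for a fresh join command ("kubeadm token create --print-join-command --ttl=0"), swaps in minikube's cached kubeadm binary, and appends --ignore-preflight-errors=all plus --cri-socket when one is configured; JoinCluster then adds --node-name and retries a "kubeadm reset -f, then join" closure with retry.Expo for up to a minute. Purely for illustration (the address, token, hash, socket path and node name below are invented), the command that ends up running on a worker looks roughly like:

<cached kubeadm> join 192.168.39.58:8443 --token abcdef.0123456789abcdef \
    --discovery-token-ca-cert-hash sha256:<hash> \
    --ignore-preflight-errors=all --cri-socket /var/run/dockershim.sock \
    --node-name=multinode-demo-m02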
@@ -743,14 +761,18 @@ func (k *Bootstrapper) UpdateNode(cfg config.ClusterConfig, n config.Node, r cru
}

files := []assets.CopyableFile{
assets.NewMemoryAssetTarget(kubeadmCfg, bsutil.KubeadmYamlPath+".new", "0640"),
assets.NewMemoryAssetTarget(kubeletCfg, bsutil.KubeletSystemdConfFile, "0644"),
assets.NewMemoryAssetTarget(kubeletService, bsutil.KubeletServiceFile, "0644"),
}

if n.ControlPlane {
files = append(files, assets.NewMemoryAssetTarget(kubeadmCfg, bsutil.KubeadmYamlPath+".new", "0640"))
}

// Copy the default CNI config (k8s.conf), so that kubelet can successfully
// start a Pod in the case a user hasn't manually installed any CNI plugin
// and minikube was started with "--extra-config=kubelet.network-plugin=cni".
if cfg.KubernetesConfig.EnableDefaultCNI {
if cfg.KubernetesConfig.EnableDefaultCNI && !config.MultiNode(cfg) {
files = append(files, assets.NewMemoryAssetTarget([]byte(defaultCNIConfig), bsutil.DefaultCNIConfigPath, "0644"))
}
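
Distilled from the UpdateNode hunk above (a sketch of the intent, not the verbatim merged code): only control-plane nodes receive a kubeadm YAML, and the bundled single-node CNI config is no longer pushed to multi-node clusters, which need a real CNI plugin:

files := []assets.CopyableFile{
	assets.NewMemoryAssetTarget(kubeletCfg, bsutil.KubeletSystemdConfFile, "0644"),
	assets.NewMemoryAssetTarget(kubeletService, bsutil.KubeletServiceFile, "0644"),
}

// Workers never run kubeadm init/restart, so only the control plane needs this.
if n.ControlPlane {
	files = append(files, assets.NewMemoryAssetTarget(kubeadmCfg, bsutil.KubeadmYamlPath+".new", "0640"))
}

// The default bridge CNI is a single-node convenience only.
if cfg.KubernetesConfig.EnableDefaultCNI && !config.MultiNode(cfg) {
	files = append(files, assets.NewMemoryAssetTarget([]byte(defaultCNIConfig), bsutil.DefaultCNIConfigPath, "0644"))
}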

6 changes: 5 additions & 1 deletion pkg/minikube/cruntime/cruntime.go
@@ -136,7 +136,11 @@ func New(c Config) (Manager, error) {

switch c.Type {
case "", "docker":
return &Docker{Socket: c.Socket, Runner: c.Runner, Init: sm}, nil
return &Docker{
Socket: c.Socket,
Runner: c.Runner,
Init: sm,
}, nil
case "crio", "cri-o":
return &CRIO{
Socket: c.Socket,
10 changes: 5 additions & 5 deletions pkg/minikube/machine/cache_images.go
@@ -196,16 +196,16 @@ func CacheAndLoadImages(images []string) error {

status, err := Status(api, m)
if err != nil {
glog.Warningf("error getting status for %s: %v", pName, err)
failed = append(failed, pName)
glog.Warningf("error getting status for %s: %v", m, err)
failed = append(failed, m)
continue
}

if status == state.Running.String() { // the not running hosts will load on next start
h, err := api.Load(m)
if err != nil {
glog.Warningf("Failed to load machine %q: %v", m, err)
failed = append(failed, pName)
failed = append(failed, m)
continue
}
cr, err := CommandRunner(h)
@@ -214,10 +214,10 @@ }
}
err = LoadImages(c, cr, images, constants.ImageCacheDir)
if err != nil {
failed = append(failed, pName)
failed = append(failed, m)
glog.Warningf("Failed to load cached images for profile %s. make sure the profile is running. %v", pName, err)
}
succeeded = append(succeeded, pName)
succeeded = append(succeeded, m)
}
}
}
7 changes: 4 additions & 3 deletions pkg/minikube/machine/fix.go
@@ -111,7 +111,7 @@ func recreateIfNeeded(api libmachine.API, cc *config.ClusterConfig, n *config.No
}

if !me || err == constants.ErrMachineMissing {
out.T(out.Shrug, `{{.driver_name}} "{{.cluster}}" {{.machine_type}} is missing, will recreate.`, out.V{"driver_name": cc.Driver, "cluster": cc.Name, "machine_type": machineType})
out.T(out.Shrug, `{{.driver_name}} "{{.cluster}}" {{.machine_type}} is missing, will recreate.`, out.V{"driver_name": cc.Driver, "cluster": machineName, "machine_type": machineType})
demolish(api, *cc, *n, h)

glog.Infof("Sleeping 1 second for extra luck!")
@@ -133,20 +133,21 @@

if s == state.Running {
if !recreated {
out.T(out.Running, `Updating the running {{.driver_name}} "{{.cluster}}" {{.machine_type}} ...`, out.V{"driver_name": cc.Driver, "cluster": cc.Name, "machine_type": machineType})
out.T(out.Running, `Updating the running {{.driver_name}} "{{.cluster}}" {{.machine_type}} ...`, out.V{"driver_name": cc.Driver, "cluster": machineName, "machine_type": machineType})
}
return h, nil
}

if !recreated {
out.T(out.Restarting, `Restarting existing {{.driver_name}} {{.machine_type}} for "{{.cluster}}" ...`, out.V{"driver_name": cc.Driver, "cluster": cc.Name, "machine_type": machineType})
out.T(out.Restarting, `Restarting existing {{.driver_name}} {{.machine_type}} for "{{.cluster}}" ...`, out.V{"driver_name": cc.Driver, "cluster": machineName, "machine_type": machineType})
}
if err := h.Driver.Start(); err != nil {
return h, errors.Wrap(err, "driver start")
}
if err := saveHost(api, h, cc, n); err != nil {
return h, err
}

return h, nil
}

1 change: 0 additions & 1 deletion pkg/minikube/machine/start.go
@@ -168,7 +168,6 @@ func createHost(api libmachine.API, cfg *config.ClusterConfig, n *config.Node) (
if err := saveHost(api, h, cfg, n); err != nil {
return h, err
}

return h, nil
}

4 changes: 2 additions & 2 deletions pkg/minikube/node/start.go
@@ -21,7 +21,6 @@ import (
"net"
"os"
"os/exec"
"runtime/debug"
"strconv"
"strings"
"sync"
@@ -128,6 +127,7 @@ func Start(starter Starter, apiServer bool) (*kubeconfig.Settings, error) {
if err = bs.SetupCerts(starter.Cfg.KubernetesConfig, *starter.Node); err != nil {
return nil, errors.Wrap(err, "setting up certs")
}

}

var wg sync.WaitGroup
@@ -156,6 +156,7 @@ func Start(starter Starter, apiServer bool) (*kubeconfig.Settings, error) {
if err := bs.WaitForNode(*starter.Cfg, *starter.Node, viper.GetDuration(waitTimeout)); err != nil {
return nil, errors.Wrap(err, "Wait failed")
}

} else {
if err := bs.UpdateNode(*starter.Cfg, *starter.Node, cr); err != nil {
return nil, errors.Wrap(err, "Updating node")
@@ -251,7 +252,6 @@ func configureRuntimes(runner cruntime.CommandRunner, cc config.ClusterConfig, k

err = cr.Enable(disableOthers, forceSystemd())
if err != nil {
debug.PrintStack()
exit.WithError("Failed to enable container runtime", err)
}

83 changes: 80 additions & 3 deletions test/integration/multinode_test.go
@@ -46,6 +46,8 @@ func TestMultiNode(t *testing.T) {
{"StopNode", validateStopRunningNode},
{"StartAfterStop", validateStartNodeAfterStop},
{"DeleteNode", validateDeleteNodeFromMultiNode},
{"StopMultiNode", validateStopMultiNodeCluster},
{"RestartMultiNode", validateRestartMultiNodeCluster},
}
for _, tc := range tests {
tc := tc
@@ -138,12 +140,20 @@ func validateStopRunningNode(ctx context.Context, t *testing.T, profile string)
}

func validateStartNodeAfterStop(ctx context.Context, t *testing.T, profile string) {
// TODO (#7496): remove skip once restarts work
t.Skip("Restarting nodes is broken :(")
if DockerDriver() {
rr, err := Run(t, exec.Command("docker", "version", "-f", "{{.Server.Version}}"))
if err != nil {
t.Fatalf("docker is broken: %v", err)
}
if strings.Contains(rr.Stdout.String(), "azure") {
t.Skip("kic containers are not supported on docker's azure")
}
}

// Start the node back up
rr, err := Run(t, exec.CommandContext(ctx, Target(), "-p", profile, "node", "start", ThirdNodeName))
rr, err := Run(t, exec.CommandContext(ctx, Target(), "-p", profile, "node", "start", ThirdNodeName, "--alsologtostderr"))
if err != nil {
t.Logf(rr.Stderr.String())
t.Errorf("node start returned an error. args %q: %v", rr.Command(), err)
}

@@ -160,6 +170,73 @@ func validateStartNodeAfterStop(ctx context.Context, t *testing.T, profile strin
if strings.Count(rr.Stdout.String(), "kubelet: Running") != 3 {
t.Errorf("status says both kubelets are not running: args %q: %v", rr.Command(), rr.Stdout.String())
}

// Make sure kubectl can connect correctly
rr, err = Run(t, exec.CommandContext(ctx, "kubectl", "get", "nodes"))
if err != nil {
t.Fatalf("failed to kubectl get nodes. args %q : %v", rr.Command(), err)
}
}

func validateStopMultiNodeCluster(ctx context.Context, t *testing.T, profile string) {
// Run minikube node stop on that node
rr, err := Run(t, exec.CommandContext(ctx, Target(), "-p", profile, "stop"))
if err != nil {
t.Errorf("node stop returned an error. args %q: %v", rr.Command(), err)
}

// Run status to see the stopped hosts
rr, err = Run(t, exec.CommandContext(ctx, Target(), "-p", profile, "status"))
// Exit code 7 means one host is stopped, which we are expecting
if err != nil && rr.ExitCode != 7 {
t.Fatalf("failed to run minikube status. args %q : %v", rr.Command(), err)
}

// Make sure minikube status shows 2 stopped nodes
rr, err = Run(t, exec.CommandContext(ctx, Target(), "-p", profile, "status", "--alsologtostderr"))
if err != nil && rr.ExitCode != 7 {
t.Fatalf("failed to run minikube status. args %q : %v", rr.Command(), err)
}

if strings.Count(rr.Stdout.String(), "host: Stopped") != 2 {
t.Errorf("incorrect number of stopped hosts: args %q: %v", rr.Command(), rr.Stdout.String())
}

if strings.Count(rr.Stdout.String(), "kubelet: Stopped") != 2 {
t.Errorf("incorrect number of stopped kubelets: args %q: %v", rr.Command(), rr.Stdout.String())
}
}

func validateRestartMultiNodeCluster(ctx context.Context, t *testing.T, profile string) {
if DockerDriver() {
rr, err := Run(t, exec.Command("docker", "version", "-f", "{{.Server.Version}}"))
if err != nil {
t.Fatalf("docker is broken: %v", err)
}
if strings.Contains(rr.Stdout.String(), "azure") {
t.Skip("kic containers are not supported on docker's azure")
}
}
// Restart a full cluster with minikube start
startArgs := append([]string{"start", "-p", profile}, StartArgs()...)
rr, err := Run(t, exec.CommandContext(ctx, Target(), startArgs...))
if err != nil {
t.Fatalf("failed to start cluster. args %q : %v", rr.Command(), err)
}

// Make sure minikube status shows 2 running nodes
rr, err = Run(t, exec.CommandContext(ctx, Target(), "-p", profile, "status", "--alsologtostderr"))
if err != nil {
t.Fatalf("failed to run minikube status. args %q : %v", rr.Command(), err)
}

if strings.Count(rr.Stdout.String(), "host: Running") != 2 {
t.Errorf("status says both hosts are not running: args %q: %v", rr.Command(), rr.Stdout.String())
}

if strings.Count(rr.Stdout.String(), "kubelet: Running") != 2 {
t.Errorf("status says both kubelets are not running: args %q: %v", rr.Command(), rr.Stdout.String())
}
}
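
Taken together, the two new subtests amount to running "minikube stop" on the two-node profile and then a full "minikube start" against the same profile, counting "host: Running" and "kubelet: Running" lines in the status output afterwards. For a healthy restarted two-node cluster, the parsed status looks roughly like the following (illustrative only; the profile name is invented and the exact fields depend on the minikube version):

multinode-demo
type: Control Plane
host: Running
kubelet: Running
apiserver: Running
kubeconfig: Configured

multinode-demo-m02
type: Worker
host: Running
kubelet: Running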

func validateDeleteNodeFromMultiNode(ctx context.Context, t *testing.T, profile string) {