Implement clusterctl delete cluster #406

Merged: 1 commit, Jul 17, 2018
174 changes: 131 additions & 43 deletions clusterctl/clusterdeployer/clusterdeployer.go
@@ -20,13 +20,15 @@ import (
"fmt"
"io/ioutil"
"os"
"strings"
"time"

"github.com/golang/glog"
"k8s.io/client-go/kubernetes"
clustercommon "sigs.k8s.io/cluster-api/pkg/apis/cluster/common"
clusterv1 "sigs.k8s.io/cluster-api/pkg/apis/cluster/v1alpha1"
"sigs.k8s.io/cluster-api/pkg/util"
"time"

"github.com/golang/glog"
)

// Provider specific logic. Logic here should eventually be optional & additive.
@@ -47,11 +49,20 @@ type ClusterProvisioner interface {
// Provides interaction with a cluster
type ClusterClient interface {
Apply(string) error
Delete(string) error
WaitForClusterV1alpha1Ready() error
GetClusterObjects() ([]*clusterv1.Cluster, error)
GetMachineDeploymentObjects() ([]*clusterv1.MachineDeployment, error)
GetMachineSetObjects() ([]*clusterv1.MachineSet, error)
GetMachineObjects() ([]*clusterv1.Machine, error)
CreateClusterObject(*clusterv1.Cluster) error
CreateMachineDeploymentObjects([]*clusterv1.MachineDeployment) error
CreateMachineSetObjects([]*clusterv1.MachineSet) error
CreateMachineObjects([]*clusterv1.Machine) error
DeleteClusterObjects() error
DeleteMachineDeploymentObjects() error
DeleteMachineSetObjects() error
DeleteMachineObjects() error
UpdateClusterObjectEndpoint(string) error
Close() error
}
@@ -74,28 +85,22 @@ type ProviderComponentsStoreFactory interface {
type ClusterDeployer struct {
externalProvisioner ClusterProvisioner
clientFactory ClientFactory
provider ProviderDeployer
providerComponents string
addonComponents string
kubeconfigOutput string
cleanupExternalCluster bool
}

func New(
externalProvisioner ClusterProvisioner,
clientFactory ClientFactory,
provider ProviderDeployer,
providerComponents string,
addonComponents string,
kubeconfigOutput string,
cleanupExternalCluster bool) *ClusterDeployer {
return &ClusterDeployer{
externalProvisioner: externalProvisioner,
clientFactory: clientFactory,
provider: provider,
providerComponents: providerComponents,
addonComponents: addonComponents,
kubeconfigOutput: kubeconfigOutput,
cleanupExternalCluster: cleanupExternalCluster,
}
}
@@ -106,7 +111,8 @@ const (
)

// Creates a cluster from the provided cluster definition and machine list.
func (d *ClusterDeployer) Create(cluster *clusterv1.Cluster, machines []*clusterv1.Machine, providerComponentsStoreFactory ProviderComponentsStoreFactory) error {
func (d *ClusterDeployer) Create(cluster *clusterv1.Cluster, machines []*clusterv1.Machine, provider ProviderDeployer, kubeconfigOutput string, providerComponentsStoreFactory ProviderComponentsStoreFactory) error {
master, nodes, err := splitMachineRoles(machines)
if err != nil {
return fmt.Errorf("unable to seperate master machines from node machines: %v", err)
@@ -118,12 +124,7 @@ func (d *ClusterDeployer) Create(cluster *clusterv1.Cluster, machines []*cluster
if err != nil {
return fmt.Errorf("could not create external client: %v", err)
}
defer func() {
err := externalClient.Close()
if err != nil {
glog.Errorf("Could not close external client: %v", err)
}
}()
defer closeClient(externalClient, "external")

glog.Info("Applying Cluster API stack to external cluster")
if err := d.applyClusterAPIStack(externalClient); err != nil {
@@ -143,37 +144,32 @@ func (d *ClusterDeployer) Create(cluster *clusterv1.Cluster, machines []*cluster
}

glog.Infof("Updating external cluster object with master (%s) endpoint", master.Name)
if err := d.updateClusterEndpoint(externalClient); err != nil {
if err := d.updateClusterEndpoint(externalClient, provider); err != nil {
return fmt.Errorf("unable to update external cluster endpoint: %v", err)
}

glog.Info("Creating internal cluster")
internalClient, err := d.createInternalCluster(externalClient)
internalClient, err := d.createInternalCluster(externalClient, provider, kubeconfigOutput)
if err != nil {
return fmt.Errorf("unable to create internal cluster: %v", err)
}
defer func() {
err := internalClient.Close()
if err != nil {
glog.Errorf("Could not close internal client: %v", err)
}
}()
defer closeClient(internalClient, "internal")

glog.Info("Applying Cluster API stack to internal cluster")
if err := d.applyClusterAPIStackWithPivoting(internalClient, externalClient); err != nil {
return fmt.Errorf("unable to apply cluster api stack to internal cluster: %v", err)
}

glog.Info("Saving provider components to the internal cluster")
err = d.saveProviderComponentsToCluster(providerComponentsStoreFactory, d.kubeconfigOutput)
err = d.saveProviderComponentsToCluster(providerComponentsStoreFactory, kubeconfigOutput)
if err != nil {
return fmt.Errorf("unable to save provider components to internal cluster: %v", err)
}

// For some reason, endpoint doesn't get updated in external cluster sometimes. So we
// update the internal cluster endpoint as well to be sure.
glog.Infof("Updating internal cluster object with master (%s) endpoint", master.Name)
if err := d.updateClusterEndpoint(internalClient); err != nil {
if err := d.updateClusterEndpoint(internalClient, provider); err != nil {
return fmt.Errorf("unable to update internal cluster endpoint: %v", err)
}

@@ -189,7 +185,41 @@ func (d *ClusterDeployer) Create(cluster *clusterv1.Cluster, machines []*cluster
}
}

glog.Infof("Done provisioning cluster. You can now access your cluster with kubectl --kubeconfig %v", d.kubeconfigOutput)
glog.Infof("Done provisioning cluster. You can now access your cluster with kubectl --kubeconfig %v", kubeconfigOutput)

return nil
}

func (d *ClusterDeployer) Delete(internalClient ClusterClient) error {
glog.Info("Creating external cluster")
externalClient, cleanupExternalCluster, err := d.createExternalCluster()
Reviewer comment (Contributor):
What happens if the external cluster already exists?

Author reply (@spew, Jun 27, 2018):
I believe this will fail -- much like the CreateCluster command. However, I'll test this manually to make sure we understand the behavior and report back in this comment.

Author reply (@spew, Jun 27, 2018):
I just tried this and this is the behavior:

$ minikube start
Starting local Kubernetes v1.10.0 cluster...
Starting VM...
Getting VM IP address...
Moving files into cluster...
Setting up certs...
Connecting to cluster...
Setting up kubeconfig...
Starting cluster components...
Kubectl is now configured to use the cluster.
Loading cached images from config file.

$ clusterctl delete cluster -p provider-components.yaml
I0627 14:17:22.206083   51664 clusterdeployer.go:194] Creating external cluster
F0627 14:27:56.001433   51664 delete_cluster.go:47] could not create external cluster: could not create external control plane: error running command 'minikube start --bootstrapper=kubeadm': exit status 1

Given that we do nothing special in clusterctl create cluster when there is an existing minikube cluster, I'd prefer to punt on this problem, as it is an existing issue and not a change in the architecture.

Reviewer comment (Contributor):
Can we skip the external cluster creation if the minikube cluster already exists (pretty much the minikube status result)? Not needed in this PR, but an issue + TODO would do.

Author reply (@spew):
I created a less prescriptive issue -- I think there are some other possible solutions. Here it is: #413
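
As a rough illustration of the idea tracked in #413 (not part of this PR; the helper name and the probe command are assumptions), the external provisioner could check for a running minikube cluster before calling 'minikube start' and reuse it when one is already up:

package main

import (
	"fmt"
	"os/exec"
)

// externalClusterExists is a hypothetical helper sketching the idea from
// issue #413: run "minikube status" before "minikube start". The command
// exits non-zero when no local cluster is running.
func externalClusterExists() bool {
	return exec.Command("minikube", "status").Run() == nil
}

func main() {
	if externalClusterExists() {
		fmt.Println("external cluster already running; skipping minikube start")
	} else {
		fmt.Println("no external cluster found; would run minikube start")
	}
}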

defer cleanupExternalCluster()
if err != nil {
return fmt.Errorf("could not create external cluster: %v", err)
}
defer closeClient(externalClient, "external")

glog.Info("Applying Cluster API stack to external cluster")
if err = d.applyClusterAPIStack(externalClient); err != nil {
return fmt.Errorf("unable to apply cluster api stack to external cluster: %v", err)
}

glog.Info("Deleting Cluster API Provider Components from internal cluster")
if err = internalClient.Delete(d.providerComponents); err != nil {
glog.Infof("error while removing provider components from internal cluster: %v", err)
glog.Infof("Continuing with a best effort delete")
}

glog.Info("Copying objects from internal cluster to external cluster")
if err = pivot(internalClient, externalClient); err != nil {
return fmt.Errorf("unable to copy objects from internal to external cluster: %v", err)
}

glog.Info("Deleting objects from external cluster")
if err = deleteObjects(externalClient); err != nil {
return fmt.Errorf("unable to finish deleting objects in external cluster, resources may have been leaked: %v", err)
Reviewer comment (Contributor):
this is a pretty scary log line. Can you provide some guidance here as to what resources might have leaked?

Author reply (@spew):
It's hard to build a generalized line of text here because the underlying resources are sort of provider specific. For example, in GCP when the cluster deletion fails it means that you may have 'leaked' a firewall rule. However, that would likely not be true for a vsphere implementation.

Do you have any ideas on how to improve the messaging?

Reviewer comment (Contributor):
I don't have a good solution right now, but perhaps we could make a generic statement ("VMs, firewall rules, LB rules etc") for now?

Author reply (@spew, Jun 29, 2018):
I'm not sure that is actually clearer, because the resources in question are extremely specific to a given provider. For example, "firewall rule" is a GCP concept; in AWS there are "security groups". "VMs" might be safe, but in AWS land those are called instances. And when talking about load balancers, AWS alone has at least 3 different types with different APIs (application, network, and classic).

I see two options:

  1. Add another method to the ProviderDeployer interface (the interface that is specific to the provider / cloud). This method could be something like GetDeleteMachineErrorMessage(...) string, with one each for Clusters, Machines, MachineSets, and MachineDeployments. It would return a provider-specific message describing the kinds of resources that can be leaked.

  2. Change the message to this, substituting 'google' or 'aws' or the like for the provider name:

fmt.Errorf("unable to finish deleting objects in external cluster, the associated resources specific to the %v provider may have been leaked: %v", providerName, err)

Do you think we should implement either of these or leave things how they are?
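
A minimal sketch of option 2, assuming a providerName string is threaded through to the delete path (the helper and its parameter are illustrative, not part of this PR):

package main

import (
	"errors"
	"fmt"
)

// leakWarning wraps a deletion error with the provider name so the operator
// knows which provider's resources (e.g. GCP firewall rules, AWS security
// groups) may have been left behind. Hypothetical helper, not in this PR.
func leakWarning(providerName string, err error) error {
	return fmt.Errorf("unable to finish deleting objects in external cluster, the associated resources specific to the %v provider may have been leaked: %v", providerName, err)
}

func main() {
	fmt.Println(leakWarning("google", errors.New("timed out deleting machines")))
}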

}
glog.Info("Deletion of cluster complete")

return nil
}
@@ -216,39 +246,39 @@ func (d *ClusterDeployer) createExternalCluster() (ClusterClient, func(), error)
return externalClient, cleanupFn, nil
}

func (d *ClusterDeployer) createInternalCluster(externalClient ClusterClient) (ClusterClient, error) {
func (d *ClusterDeployer) createInternalCluster(externalClient ClusterClient, provider ProviderDeployer, kubeconfigOutput string) (ClusterClient, error) {
cluster, master, _, err := getClusterAPIObjects(externalClient)
if err != nil {
return nil, err
}

glog.V(1).Info("Getting internal cluster kubeconfig.")
internalKubeconfig, err := waitForKubeconfigReady(d.provider, cluster, master)
internalKubeconfig, err := waitForKubeconfigReady(provider, cluster, master)
if err != nil {
return nil, fmt.Errorf("unable to get internal cluster kubeconfig: %v", err)
}

err = d.writeKubeconfig(internalKubeconfig)
if err != nil {
if err = d.writeKubeconfig(internalKubeconfig, kubeconfigOutput); err != nil {
return nil, err
}

internalClient, err := d.clientFactory.NewClusterClientFromKubeconfig(internalKubeconfig)
if err != nil {
return nil, fmt.Errorf("unable to create internal cluster client: %v", err)
}

return internalClient, nil
}

func (d *ClusterDeployer) updateClusterEndpoint(client ClusterClient) error {
func (d *ClusterDeployer) updateClusterEndpoint(client ClusterClient, provider ProviderDeployer) error {
// Update cluster endpoint. Needed till this logic moves into cluster controller.
// TODO: https://github.com/kubernetes-sigs/cluster-api/issues/158
// Fetch fresh objects.
cluster, master, _, err := getClusterAPIObjects(client)
if err != nil {
return err
}
masterIP, err := d.provider.GetIP(cluster, master)
masterIP, err := provider.GetIP(cluster, master)
if err != nil {
return fmt.Errorf("unable to get master IP: %v", err)
}
@@ -329,10 +359,10 @@ func (d *ClusterDeployer) applyClusterAPIControllers(client ClusterClient) error
return client.Apply(d.providerComponents)
}

func (d *ClusterDeployer) writeKubeconfig(kubeconfig string) error {
func (d *ClusterDeployer) writeKubeconfig(kubeconfig string, kubeconfigOutput string) error {
const fileMode = 0666
os.Remove(d.kubeconfigOutput)
return ioutil.WriteFile(d.kubeconfigOutput, []byte(kubeconfig), fileMode)
os.Remove(kubeconfigOutput)
return ioutil.WriteFile(kubeconfigOutput, []byte(kubeconfig), fileMode)
}

func waitForKubeconfigReady(provider ProviderDeployer, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (string, error) {
@@ -356,11 +386,11 @@ func waitForKubeconfigReady(provider ProviderDeployer, cluster *clusterv1.Cluste

func pivot(from, to ClusterClient) error {
if err := from.WaitForClusterV1alpha1Ready(); err != nil {
return fmt.Errorf("Cluster v1aplpha1 resource not ready on source cluster.")
return fmt.Errorf("cluster v1alpha1 resource not ready on source cluster")
}

if err := to.WaitForClusterV1alpha1Ready(); err != nil {
return fmt.Errorf("Cluster v1aplpha1 resource not ready on target cluster.")
return fmt.Errorf("cluster v1alpha1 resource not ready on target cluster")
}

clusters, err := from.GetClusterObjects()
@@ -371,13 +401,38 @@ func pivot(from, to ClusterClient) error {
for _, cluster := range clusters {
// New objects cannot have a specified resource version. Clear it out.
cluster.SetResourceVersion("")
err = to.CreateClusterObject(cluster)
if err != nil {
return err
if err = to.CreateClusterObject(cluster); err != nil {
return fmt.Errorf("error moving Cluster '%v': %v", cluster.GetName(), err)
}
glog.Infof("Moved Cluster '%s'", cluster.GetName())
}

fromDeployments, err := from.GetMachineDeploymentObjects()
if err != nil {
return err
}
for _, deployment := range fromDeployments {
// New objects cannot have a specified resource version. Clear it out.
deployment.SetResourceVersion("")
if err = to.CreateMachineDeploymentObjects([]*clusterv1.MachineDeployment{deployment}); err != nil {
return fmt.Errorf("error moving MachineDeployment '%v': %v", deployment.GetName(), err)
}
glog.Infof("Moved MachineDeployment %v", deployment.GetName())
}

fromMachineSets, err := from.GetMachineSetObjects()
if err != nil {
return err
}
for _, machineSet := range fromMachineSets {
// New objects cannot have a specified resource version. Clear it out.
machineSet.SetResourceVersion("")
if err := to.CreateMachineSetObjects([]*clusterv1.MachineSet{machineSet}); err != nil {
return fmt.Errorf("error moving MachineSet '%v': %v", machineSet.GetName(), err)
}
glog.Infof("Moved MachineSet %v", machineSet.GetName())
}

machines, err := from.GetMachineObjects()
if err != nil {
return err
@@ -386,15 +441,42 @@ func pivot(from, to ClusterClient) error {
for _, machine := range machines {
// New objects cannot have a specified resource version. Clear it out.
machine.SetResourceVersion("")
err = to.CreateMachineObjects([]*clusterv1.Machine{machine})
if err != nil {
return err
if err = to.CreateMachineObjects([]*clusterv1.Machine{machine}); err != nil {
return fmt.Errorf("error moving Machine '%v': %v", machine.GetName(), err)
}
glog.Infof("Moved Machine '%s'", machine.GetName())
}
return nil
}

func deleteObjects(client ClusterClient) error {
var errors []string
glog.Infof("Deleting machine deployments")
if err := client.DeleteMachineDeploymentObjects(); err != nil {
err = fmt.Errorf("error deleting machine deployments: %v", err)
errors = append(errors, err.Error())
}
glog.Infof("Deleting machine sets")
if err := client.DeleteMachineSetObjects(); err != nil {
err = fmt.Errorf("error deleting machine sets: %v", err)
errors = append(errors, err.Error())
}
glog.Infof("Deleting machines")
if err := client.DeleteMachineObjects(); err != nil {
Reviewer comment (Contributor):
Is this safe considering some machines will be deleted by deletion of MachineSets?

Author reply (@spew):
It's safe in the sense that we are simply calling the 'delete all' method in the cluster API. It is up to the cluster API to properly handle machines that already have a deletion in progress due to the fact that their parent object has been deleted.

Reviewer comment (Contributor):
It's still not great because attempting to delete a machine twice (or three times) could spit a couple of error logs which would be misleading unless the user knows exactly how this code works, and exactly how MachineSets work. I think here, we should delete all Machines that are not in a MachineSet.

Author reply (@spew, Jun 29, 2018):
There is no actual deletion going on here, just a call to the cluster-api "delete all" methods, i.e. delete all machines, delete all machine sets, delete all machine deployments. Those are asynchronous methods that don't perform the deletion themselves -- the deletion is eventually reconciled by the controllers. In this case, the underlying deletion for machines & machine sets is done by the machine controller.

I'm not convinced there actually is a problem or that the cluster API would do the wrong thing. Rather than put a lot of complicated deletion logic in clusterctl it seems better to have the controllers do the right thing.

Author reply (@spew, Jun 29, 2018):
I synced with @k4leung4 and he confirmed that there is not an issue here; these are the scenarios / points that we discussed:

  1. Deleting existing Machines? DeleteCollection(...) (the actual delete methods being used) does not return errors when objects are already in the process of being deleted (i.e. marked as being deleted). We are not passing in any machines; we are simply calling a "delete all" method.
  2. Concurrent deletes? We are calling the DeleteCollection(...) method and passing in the propagation policy DeletePropagationForeground. This means the delete call will block until all child objects (i.e. machines that belong to a machine set) are deleted. This comes from the comments for DeletePropagationForeground; I think what it actually means is that they are marked for deletion in etcd. I copied the doc below.
  3. Deleting Machines before MachineSets: Even if we did things in the wrong order and called DeleteCollection on Machines, and that method actually deleted machines associated with a MachineSet (I've not yet drilled into whether that would even happen), all that would occur is that the controller would attempt to recreate the machine, and then when DeleteCollection was called on the machine sets the machines would be deleted again.

Here is the DeletePropagationForeground doc for reference:

	// The object exists in the key-value store until the garbage collector
	// deletes all the dependents whose ownerReference.blockOwnerDeletion=true
	// from the key-value store.  API sever will put the "foregroundDeletion"
	// finalizer on the object, and sets its deletionTimestamp.  This policy is
	// cascading, i.e., the dependents will be deleted with Foreground.
	DeletePropagationForeground DeletionPropagation = "Foreground"
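
For reference, a sketch of the "delete all" call shape being described, using foreground cascading deletion; the machineDeleter interface below is an illustrative stand-in for the generated cluster-api Machine client, not its exact API:

package sketch

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// machineDeleter is an illustrative stand-in for the generated clientset's
// Machine interface; only the DeleteCollection method is sketched here.
type machineDeleter interface {
	DeleteCollection(options *metav1.DeleteOptions, listOptions metav1.ListOptions) error
}

// deleteAllMachines issues a single "delete all" request with foreground
// propagation: the API server marks each Machine for deletion and the
// machine controller reconciles the actual teardown asynchronously.
func deleteAllMachines(machines machineDeleter) error {
	propagation := metav1.DeletePropagationForeground
	return machines.DeleteCollection(
		&metav1.DeleteOptions{PropagationPolicy: &propagation},
		metav1.ListOptions{},
	)
}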

err = fmt.Errorf("error deleting machines: %v", err)
errors = append(errors, err.Error())
}
glog.Infof("Deleting clusters")
if err := client.DeleteClusterObjects(); err != nil {
err = fmt.Errorf("error deleting clusters: %v", err)
errors = append(errors, err.Error())
}
if len(errors) > 0 {
return fmt.Errorf("error(s) encountered deleting objects from external cluster: [%v]", strings.Join(errors, ", "))
}
return nil
}

func getClusterAPIObjects(client ClusterClient) (*clusterv1.Cluster, *clusterv1.Machine, []*clusterv1.Machine, error) {
machines, err := client.GetMachineObjects()
if err != nil {
@@ -440,3 +522,9 @@ func containsMasterRole(roles []clustercommon.MachineRole) bool {
}
return false
}

func closeClient(client ClusterClient, name string) {
if err := client.Close(); err != nil {
glog.Errorf("Could not close %v client: %v", name, err)
}
}