Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve machine/controller.Reconcile to intelligently block deleting … #350

Merged
merged 1 commit into from
Jun 15, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cloud/google/config/configtemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,10 @@ spec:
env:
- name: GOOGLE_APPLICATION_CREDENTIALS
value: /etc/credentials/service-account.json
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
command:
- "./gce-controller"
args:
Expand Down
4 changes: 4 additions & 0 deletions clusterctl/examples/google/provider-components.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ spec:
env:
- name: GOOGLE_APPLICATION_CREDENTIALS
value: /etc/credentials/service-account.json
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
command:
- "./gce-controller"
args:
Expand Down
5 changes: 5 additions & 0 deletions clusterctl/examples/vsphere/provider-components.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ spec:
subPath: vsphere_tmp.pub
- name: named-machines
mountPath: /etc/named-machines
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
command:
- "./vsphere-machine-controller"
args:
Expand Down
3 changes: 0 additions & 3 deletions pkg/controller/config/configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ type LeaderElectionConfiguration struct {

type Configuration struct {
Kubeconfig string
InCluster bool
WorkerCount int
leaderElectionConfig *LeaderElectionConfiguration
}
Expand All @@ -71,7 +70,6 @@ const (
)

var ControllerConfig = Configuration{
InCluster: true,
WorkerCount: 5, // Default 5 worker.
leaderElectionConfig: &LeaderElectionConfiguration{
LeaderElect: false,
Expand All @@ -88,7 +86,6 @@ func GetLeaderElectionConfig() *LeaderElectionConfiguration {

func (c *Configuration) AddFlags(fs *pflag.FlagSet) {
fs.StringVar(&c.Kubeconfig, "kubeconfig", c.Kubeconfig, "Path to kubeconfig file with authorization and master location information.")
fs.BoolVar(&c.InCluster, "incluster", c.InCluster, "Controller will be running inside the cluster.")
fs.IntVar(&c.WorkerCount, "workers", c.WorkerCount, "The number of workers for controller.")

AddLeaderElectionFlags(c.leaderElectionConfig, fs)
Expand Down
37 changes: 32 additions & 5 deletions pkg/controller/machine/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,22 @@ package machine

import (
"errors"
"os"

"github.com/golang/glog"
"github.com/kubernetes-incubator/apiserver-builder/pkg/builders"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"

clusterv1 "sigs.k8s.io/cluster-api/pkg/apis/cluster/v1alpha1"
"sigs.k8s.io/cluster-api/pkg/client/clientset_generated/clientset"
listers "sigs.k8s.io/cluster-api/pkg/client/listers_generated/cluster/v1alpha1"
cfg "sigs.k8s.io/cluster-api/pkg/controller/config"
"sigs.k8s.io/cluster-api/pkg/controller/sharedinformers"
"sigs.k8s.io/cluster-api/util"
)

const NodeNameEnvVar = "NODE_NAME"

// +controller:group=cluster,version=v1alpha1,kind=Machine,resource=machines
type MachineControllerImpl struct {
builders.DefaultControllerFns
Expand All @@ -46,6 +47,9 @@ type MachineControllerImpl struct {
clientSet clientset.Interface
linkedNodes map[string]bool
cachedReadiness map[string]bool

// nodeName is the name of the node on which the machine controller is running, if not present, it is loaded from NODE_NAME.
nodeName string
}

// Init initializes the controller and is called by the generated code
Expand All @@ -54,6 +58,12 @@ func (c *MachineControllerImpl) Init(arguments sharedinformers.ControllerInitArg
// Use the lister for indexing machines labels
c.lister = arguments.GetSharedInformers().Factory.Cluster().V1alpha1().Machines().Lister()

if c.nodeName == "" {
c.nodeName = os.Getenv(NodeNameEnvVar)
if c.nodeName == "" {
glog.Warningf("environment variable %v is not set, this controller will not protect against deleting its own machine", NodeNameEnvVar)
}
}
clientset, err := clientset.NewForConfig(arguments.GetRestConfig())
if err != nil {
glog.Fatalf("error creating machine client: %v", err)
Expand Down Expand Up @@ -84,9 +94,8 @@ func (c *MachineControllerImpl) Reconcile(machine *clusterv1.Machine) error {
glog.Infof("reconciling machine object %v causes a no-op as there is no finalizer.", name)
return nil
}
// Master should not be deleted as part of reconcilation.
if cfg.ControllerConfig.InCluster && util.IsMaster(machine) {
glog.Infof("skipping reconciling master machine object %v", name)
if !c.isDeleteAllowed(machine) {
glog.Infof("Skipping reconciling of machine object %v", name)
return nil
}
glog.Infof("reconciling machine object %v triggers delete.", name)
Expand Down Expand Up @@ -172,3 +181,21 @@ func (c *MachineControllerImpl) getCluster(machine *clusterv1.Machine) (*cluster
return nil, errors.New("multiple clusters defined")
}
}

func (c *MachineControllerImpl) isDeleteAllowed(machine *clusterv1.Machine) bool {
if c.nodeName == "" || machine.Status.NodeRef == nil {
return true
}
if machine.Status.NodeRef.Name != c.nodeName {
return true
}
node, err := c.kubernetesClientSet.CoreV1().Nodes().Get(c.nodeName, metav1.GetOptions{})
if err != nil {
glog.Infof("unable to determine if controller's node is associated with machine '%v', error getting node named '%v': %v", machine.Name, c.nodeName, err)
return true
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this return false? I'm imagining a case where temporarily lose connectivity (what if the apiserver pod is being restarted?) or a single http request failing causes the controller to delete itself. If we return false will the reconcile loop try again? Or do we need retry logic around this call?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I debated this one back and forth and finally decided that we could err on the side of deleting in the "api server not available" scenarios. I biased for optimizing the "delete cluster" experience. For example, if for whatever reason you are trying to delete a cluster that has the API server that doesn't work or is crash looping or the pod is deleted then you would be unable to delete the machine controller node if this check returned false.

You are correct -- that the API server can be temporarily unavailable, timing out, etc, can & will happen. However, I chose to not deal with it because the correct answer is retries.

We aren't currently doing retries in this file and before introducing it I think I'd prefer a more holistic approach (i.e. the client should support a retry policy that wraps all calls and makes it so no one needs to write retry code explicitly). The reason why I think we should wait is that this is not yet production code and therefore it is probably better to optimize for code readability and ease of making future changes rather than stability of the system (the system which doesn't even exist yet).

Happy to hear your thoughts. And maybe kubernetes clients already support retries and we just need to turn some stuff on or import it into the cluster API clients.

}
// When the UID of the machine's node reference and this controller's actual node match then then the request is to
// delete the machine this machine-controller is running on. Return false to not allow machine controller to delete its
// own machine.
return node.UID != machine.Status.NodeRef.UID
}
2 changes: 1 addition & 1 deletion pkg/controller/machine/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ func TestReconcileNode(t *testing.T) {

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
m := getMachine("foo", false, false, false)
m := getMachine("foo", nil, false, false)
if test.nodeRefName != "" {
m.Status.NodeRef = &corev1.ObjectReference{
Kind: "Node",
Expand Down
92 changes: 79 additions & 13 deletions pkg/controller/machine/reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@ import (
"testing"
"time"

"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
corefake "k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
clustercommon "sigs.k8s.io/cluster-api/pkg/apis/cluster/common"
"sigs.k8s.io/cluster-api/pkg/apis/cluster/v1alpha1"
"sigs.k8s.io/cluster-api/pkg/apis/cluster/v1alpha1/testutil"
"sigs.k8s.io/cluster-api/pkg/client/clientset_generated/clientset/fake"
Expand All @@ -37,13 +39,13 @@ func TestMachineSetControllerReconcileHandler(t *testing.T) {
instanceExists bool
isDeleting bool
withFinalizer bool
isMaster bool
ignoreDeleteCallCount bool
expectFinalizerRemoved bool
numExpectedCreateCalls int64
numExpectedDeleteCalls int64
numExpectedUpdateCalls int64
numExpectedExistsCalls int64
nodeRef *v1.ObjectReference
}{
{
name: "Create machine",
Expand Down Expand Up @@ -93,18 +95,29 @@ func TestMachineSetControllerReconcileHandler(t *testing.T) {
withFinalizer: false,
},
{
name: "Delete machine, skip master",
objExists: true,
instanceExists: true,
isDeleting: true,
withFinalizer: true,
isMaster: true,
name: "Delete machine, controller on different node with same name, but different UID",
objExists: true,
instanceExists: true,
isDeleting: true,
withFinalizer: true,
numExpectedDeleteCalls: 1,
expectFinalizerRemoved: true,
nodeRef: &v1.ObjectReference{Name: "controller-node-name", UID: "another-uid"},
},
{
name: "Delete machine, controller on the same node",
objExists: true,
instanceExists: true,
isDeleting: true,
withFinalizer: true,
expectFinalizerRemoved: false,
nodeRef: &v1.ObjectReference{Name: "controller-node-name", UID: "controller-node-uid"},
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
machineToTest := getMachine("bar", test.isDeleting, test.withFinalizer, test.isMaster)
machineToTest := getMachine("bar", test.nodeRef, test.isDeleting, test.withFinalizer)
knownObjects := []runtime.Object{}
if test.objExists {
knownObjects = append(knownObjects, machineToTest)
Expand All @@ -116,6 +129,14 @@ func TestMachineSetControllerReconcileHandler(t *testing.T) {
machineUpdated = true
return false, nil, nil
})
fakeKubernetesClient := corefake.NewSimpleClientset()
node := v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "controller-node-name",
UID: "controller-node-uid",
},
}
fakeKubernetesClient.CoreV1().Nodes().Create(&node)

// When creating a new object, it should invoke the reconcile method.
cluster := testutil.GetVanillaCluster()
Expand All @@ -130,6 +151,8 @@ func TestMachineSetControllerReconcileHandler(t *testing.T) {
target := &MachineControllerImpl{}
target.actuator = actuator
target.clientSet = fakeClient
target.kubernetesClientSet = fakeKubernetesClient
target.nodeName = "controller-node-name"

var err error
err = target.Reconcile(machineToTest)
Expand Down Expand Up @@ -159,7 +182,50 @@ func TestMachineSetControllerReconcileHandler(t *testing.T) {
}
}

func getMachine(name string, isDeleting, hasFinalizer, isMaster bool) *v1alpha1.Machine {
func TestIsDeleteAllowed(t *testing.T) {
testCases := []struct {
name string
setControllerNodeName bool
nodeRef *v1.ObjectReference
expectedResult bool
}{
{"empty controller node name should return true", false, nil, true},
{"nil machine.Status.NodeRef should return true", true, nil, true},
{"different node name should return true", true, &v1.ObjectReference{Name: "another-node", UID: "another-uid"}, true},
{"same node name and different UID should return true", true, &v1.ObjectReference{Name: "controller-node-name", UID: "another-uid"}, true},
{"same node name and same UID should return false", true, &v1.ObjectReference{Name: "controller-node-name", UID: "controller-node-uid"}, false},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
controller, machine := createIsDeleteAllowedTestFixtures(t, tc.setControllerNodeName, "controller-node-name", "controller-node-uid", tc.nodeRef)
result := controller.isDeleteAllowed(machine)
if result != tc.expectedResult {
t.Errorf("result mismatch: got '%v', want '%v'", result, tc.expectedResult)
}
})
}
}

func createIsDeleteAllowedTestFixtures(t *testing.T, setControllerNodeNameVariable bool, controllerNodeName string, controllerNodeUid string, nodeRef *v1.ObjectReference) (*MachineControllerImpl, *v1alpha1.Machine) {
controller := &MachineControllerImpl{}
fakeKubernetesClient := corefake.NewSimpleClientset()
node := v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: controllerNodeName,
UID: types.UID(controllerNodeUid),
},
}
fakeKubernetesClient.CoreV1().Nodes().Create(&node)
controller.kubernetesClientSet = fakeKubernetesClient
controller.kubernetesClientSet = fakeKubernetesClient
if setControllerNodeNameVariable {
controller.nodeName = controllerNodeName
}
machine := getMachine("bar", nodeRef, true, false)
return controller, machine
}

func getMachine(name string, nodeRef *v1.ObjectReference, isDeleting, hasFinalizer bool) *v1alpha1.Machine {
m := &v1alpha1.Machine{
TypeMeta: metav1.TypeMeta{
Kind: "Machine",
Expand All @@ -169,6 +235,9 @@ func getMachine(name string, isDeleting, hasFinalizer, isMaster bool) *v1alpha1.
Name: name,
Namespace: metav1.NamespaceDefault,
},
Status: v1alpha1.MachineStatus{
NodeRef: nodeRef,
},
}
if isDeleting {
now := metav1.NewTime(time.Now())
Expand All @@ -177,9 +246,6 @@ func getMachine(name string, isDeleting, hasFinalizer, isMaster bool) *v1alpha1.
if hasFinalizer {
m.ObjectMeta.SetFinalizers([]string{v1alpha1.MachineFinalizer})
}
if isMaster {
m.Spec.Roles = []clustercommon.MachineRole{clustercommon.MasterRole}
}

return m
}