Skip to content

Commit

Permalink
Fix vSphere anti-affinity
Browse files Browse the repository at this point in the history
Signed-off-by: Waleed Malik <ahmedwaleedmalik@gmail.com>
  • Loading branch information
ahmedwaleedmalik committed Apr 22, 2024
1 parent f339b66 commit 4a52633
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 52 deletions.
16 changes: 7 additions & 9 deletions pkg/cloudprovider/provider/vsphere/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -384,8 +384,7 @@ func (p *provider) create(ctx context.Context, log *zap.SugaredLogger, machine *
}

if config.VMAntiAffinity {
machineSetName := machine.Name[:strings.LastIndex(machine.Name, "-")]
if err := p.createOrUpdateVMAntiAffinityRule(ctx, session, machineSetName, config); err != nil {
if err := p.createOrUpdateVMAntiAffinityRule(ctx, session, machine, config); err != nil {
return nil, fmt.Errorf("failed to add VM to anti affinity rule: %w", err)
}
}
Expand Down Expand Up @@ -452,6 +451,12 @@ func (p *provider) Cleanup(ctx context.Context, log *zap.SugaredLogger, machine
return false, fmt.Errorf("failed to delete tags: %w", err)
}

if config.VMAntiAffinity {
if err := p.createOrUpdateVMAntiAffinityRule(ctx, session, machine, config); err != nil {
return false, fmt.Errorf("failed to update VMs in anti-affinity rule: %w", err)
}
}

powerState, err := virtualMachine.PowerState(ctx)
if err != nil {
return false, fmt.Errorf("failed to get virtual machine power state: %w", err)
Expand Down Expand Up @@ -507,13 +512,6 @@ func (p *provider) Cleanup(ctx context.Context, log *zap.SugaredLogger, machine
return false, fmt.Errorf("failed to destroy vm %s: %w", virtualMachine.Name(), err)
}

if config.VMAntiAffinity {
machineSetName := machine.Name[:strings.LastIndex(machine.Name, "-")]
if err := p.createOrUpdateVMAntiAffinityRule(ctx, session, machineSetName, config); err != nil {
return false, fmt.Errorf("failed to add VM to anti affinity rule: %w", err)
}
}

if pc.OperatingSystem != providerconfigtypes.OperatingSystemFlatcar {
filemanager := datastore.NewFileManager(session.Datacenter, false)

Expand Down
62 changes: 19 additions & 43 deletions pkg/cloudprovider/provider/vsphere/rule.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,21 @@ import (
"context"
"errors"
"fmt"
"reflect"
"strings"
"time"

"github.com/aws/smithy-go/ptr"
"github.com/vmware/govmomi/find"
"github.com/vmware/govmomi/object"
"github.com/vmware/govmomi/vim25/mo"
"github.com/vmware/govmomi/vim25/types"

clusterv1alpha1 "github.com/kubermatic/machine-controller/pkg/apis/cluster/v1alpha1"
)

// createOrUpdateVMAntiAffinityRule creates or updates an anti affinity rule with the name in the given cluster.
// VMs are attached to the rule based on their folder path and name prefix in vsphere.
// A minimum of two VMs is required.
func (p *provider) createOrUpdateVMAntiAffinityRule(ctx context.Context, session *Session, name string, config *Config) error {
func (p *provider) createOrUpdateVMAntiAffinityRule(ctx context.Context, session *Session, machine *clusterv1alpha1.Machine, config *Config) error {
p.mutex.Lock()
defer p.mutex.Unlock()

Expand All @@ -43,27 +43,31 @@ func (p *provider) createOrUpdateVMAntiAffinityRule(ctx context.Context, session
return err
}

machineSetName := machine.Name[:strings.LastIndex(machine.Name, "-")]
vmsInFolder, err := session.Finder.VirtualMachineList(ctx, strings.Join([]string{config.Folder, "*"}, "/"))
if err != nil {
if errors.Is(err, &find.NotFoundError{}) {
return removeVMAntiAffinityRule(ctx, session, config.Cluster, name)
return removeVMAntiAffinityRule(ctx, session, config.Cluster, machineSetName)
}
return err
}

var ruleVMRef []types.ManagedObjectReference
for _, vm := range vmsInFolder {
if strings.HasPrefix(vm.Name(), name) {
// Only add VMs with the same machineSetName to the rule and exclude the machine itself if it is being deleted
if strings.HasPrefix(vm.Name(), machineSetName) && !(vm.Name() == machine.Name && machine.DeletionTimestamp != nil) {
ruleVMRef = append(ruleVMRef, vm.Reference())
}
}

// minimum of two vms required
// DRS rule must have at least two virtual machine members.
if len(ruleVMRef) < 2 {
return removeVMAntiAffinityRule(ctx, session, config.Cluster, name)
return nil
} else if len(ruleVMRef) == 0 {
return removeVMAntiAffinityRule(ctx, session, config.Cluster, machineSetName)
}

info, err := findClusterAntiAffinityRuleByName(ctx, cluster, name)
info, err := findClusterAntiAffinityRuleByName(ctx, cluster, machineSetName)
if err != nil {
return err
}
Expand All @@ -76,7 +80,7 @@ func (p *provider) createOrUpdateVMAntiAffinityRule(ctx context.Context, session
ClusterRuleInfo: types.ClusterRuleInfo{
Enabled: ptr.Bool(true),
Mandatory: ptr.Bool(false),
Name: name,
Name: machineSetName,
UserCreated: ptr.Bool(true),
},
}
Expand All @@ -100,44 +104,16 @@ func (p *provider) createOrUpdateVMAntiAffinityRule(ctx context.Context, session
return err
}

err = task.WaitEx(ctx)
taskResult, err := task.WaitForResultEx(ctx)
if err != nil {
return err
return fmt.Errorf("error waiting for cluster %v reconfiguration to complete", cluster.Name())
}

return waitForRule(ctx, cluster, info)
}

// waitForRule checks periodically the vsphere api for the ClusterAntiAffinityRule and returns error if the rule was not found after a timeout.
func waitForRule(ctx context.Context, cluster *object.ClusterComputeResource, rule *types.ClusterAntiAffinityRuleSpec) error {
timeout := time.NewTimer(10 * time.Second)
ticker := time.NewTicker(500 * time.Millisecond)
defer timeout.Stop()
defer ticker.Stop()

for {
select {
case <-timeout.C:

info, err := findClusterAntiAffinityRuleByName(ctx, cluster, rule.Name)
if err != nil {
return err
}

if !reflect.DeepEqual(rule, info) {
return fmt.Errorf("expected anti affinity changes not found in vsphere")
}
case <-ticker.C:
info, err := findClusterAntiAffinityRuleByName(ctx, cluster, rule.Name)
if err != nil {
return err
}

if reflect.DeepEqual(rule, info) {
return nil
}
}
if taskResult.State != types.TaskInfoStateSuccess {
return fmt.Errorf("cluster %v reconfiguration task was not successful", cluster.Name())
}

return nil
}