diff --git a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go index f8d60aa99968..06b2d5ba3f5b 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go @@ -163,7 +163,7 @@ func (as *AgentPool) GetVMIndexes() ([]int, map[int]string, error) { } indexes = append(indexes, index) - resourceID, err := convertResourceGroupNameToLower("azure://" + *instance.ID) + resourceID, err := convertResourceGroupNameToLower(azurePrefix + *instance.ID) if err != nil { return nil, nil, err } @@ -484,7 +484,7 @@ func (as *AgentPool) Nodes() ([]cloudprovider.Instance, error) { // To keep consistent with providerID from kubernetes cloud provider, convert // resourceGroupName in the ID to lower case. - resourceID, err := convertResourceGroupNameToLower("azure://" + *instance.ID) + resourceID, err := convertResourceGroupNameToLower(azurePrefix + *instance.ID) if err != nil { return nil, err } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go index 7890cdde72ef..444f2f9f235c 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go @@ -35,8 +35,8 @@ import ( "github.com/Azure/azure-sdk-for-go/services/storage/mgmt/2021-09-01/storage" "github.com/Azure/go-autorest/autorest/date" "github.com/Azure/go-autorest/autorest/to" - "github.com/golang/mock/gomock" "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" ) var ( @@ -185,7 +185,8 @@ func TestGetVMsFromCache(t *testing.T) { mockVMClient := mockvmclient.NewMockInterface(ctrl) testAS.manager.azClient.virtualMachinesClient = mockVMClient mockVMClient.EXPECT().List(gomock.Any(), testAS.manager.config.ResourceGroup).Return(expectedVMs, nil) - ac, err := newAzureCache(testAS.manager.azClient, refreshInterval, testAS.manager.config.ResourceGroup, vmTypeStandard, false, "") + testAS.manager.config.VMType = vmTypeStandard + ac, err := newAzureCache(testAS.manager.azClient, refreshInterval, *testAS.manager.config) assert.NoError(t, err) testAS.manager.azureCache = ac @@ -203,7 +204,8 @@ func TestGetVMIndexes(t *testing.T) { mockVMClient := mockvmclient.NewMockInterface(ctrl) as.manager.azClient.virtualMachinesClient = mockVMClient mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") + as.manager.config.VMType = vmTypeStandard + ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config) assert.NoError(t, err) as.manager.azureCache = ac @@ -242,7 +244,8 @@ func TestGetCurSize(t *testing.T) { mockVMClient := mockvmclient.NewMockInterface(ctrl) as.manager.azClient.virtualMachinesClient = mockVMClient mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") + as.manager.config.VMType = vmTypeStandard + ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config) assert.NoError(t, err) as.manager.azureCache = ac @@ -266,7 +269,8 @@ func TestAgentPoolTargetSize(t *testing.T) { as.manager.azClient.virtualMachinesClient = mockVMClient expectedVMs := getExpectedVMs() 
mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") + as.manager.config.VMType = vmTypeStandard + ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config) assert.NoError(t, err) as.manager.azureCache = ac @@ -285,7 +289,8 @@ func TestAgentPoolIncreaseSize(t *testing.T) { as.manager.azClient.virtualMachinesClient = mockVMClient expectedVMs := getExpectedVMs() mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil).MaxTimes(2) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") + as.manager.config.VMType = vmTypeStandard + ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config) assert.NoError(t, err) as.manager.azureCache = ac @@ -313,7 +318,8 @@ func TestDecreaseTargetSize(t *testing.T) { as.manager.azClient.virtualMachinesClient = mockVMClient expectedVMs := getExpectedVMs() mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil).MaxTimes(3) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") + as.manager.config.VMType = vmTypeStandard + ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config) assert.NoError(t, err) as.manager.azureCache = ac @@ -431,7 +437,9 @@ func TestAgentPoolDeleteNodes(t *testing.T) { mockSAClient := mockstorageaccountclient.NewMockInterface(ctrl) as.manager.azClient.storageAccountsClient = mockSAClient mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") + as.manager.config.VMType = vmTypeStandard + ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config) + as.manager.config.VMType = vmTypeVMSS assert.NoError(t, err) as.manager.azureCache = ac @@ -497,7 +505,8 @@ func TestAgentPoolNodes(t *testing.T) { mockVMClient := mockvmclient.NewMockInterface(ctrl) as.manager.azClient.virtualMachinesClient = mockVMClient mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil) - ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "") + as.manager.config.VMType = vmTypeStandard + ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config) assert.NoError(t, err) as.manager.azureCache = ac diff --git a/cluster-autoscaler/cloudprovider/azure/azure_autodiscovery.go b/cluster-autoscaler/cloudprovider/azure/azure_autodiscovery.go index 51112ace97cc..7e98164a27a0 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_autodiscovery.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_autodiscovery.go @@ -18,8 +18,9 @@ package azure import ( "fmt" - "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "strings" + + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" ) const ( @@ -91,7 +92,7 @@ func matchDiscoveryConfig(labels map[string]*string, configs []labelAutoDiscover return false } - if len(v) > 0 { + if v != "" { if value == nil || *value != v { return false } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_autodiscovery_test.go 
b/cluster-autoscaler/cloudprovider/azure/azure_autodiscovery_test.go index f119ed917243..edfec39b6e6a 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_autodiscovery_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_autodiscovery_test.go @@ -17,9 +17,10 @@ limitations under the License. package azure import ( + "testing" + "github.com/stretchr/testify/assert" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" - "testing" ) func TestParseLabelAutoDiscoverySpecs(t *testing.T) { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cache.go b/cluster-autoscaler/cloudprovider/azure/azure_cache.go index 1beb7b2b8feb..534d1ce2ba06 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cache.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cache.go @@ -39,35 +39,72 @@ var ( // azureCache is used for caching cluster resources state. // // It is needed to: -// - keep track of node groups (VM and VMSS types) in the cluster, -// - keep track of instances and which node group they belong to, -// - limit repetitive Azure API calls. +// - keep track of node groups (VM and VMSS types) in the cluster, +// - keep track of instances and which node group they belong to, +// (for VMSS it only keeps track of instanceid-to-nodegroup mapping) +// - limit repetitive Azure API calls. +// +// It enables efficient responses to +// - cloudprovider.NodeGroups() (= registeredNodeGroups) +// - cloudprovider.NodeGroupForNode (via azureManager.GetNodeGroupForInstance => FindForInstance, +// using instanceToNodeGroup and unownedInstances) +// +// CloudProvider.Refresh, called before every autoscaler loop (every 10s by default), +// is implemented by AzureManager.Refresh which makes the cache refresh decision, +// based on AzureManager.lastRefresh and azureCache.refreshInterval. type azureCache struct { - mutex sync.Mutex - interrupt chan struct{} - azClient *azClient + mutex sync.Mutex + interrupt chan struct{} + azClient *azClient + + // refreshInterval specifies how often azureCache needs to be refreshed. + // The value comes from AZURE_VMSS_CACHE_TTL env var (or 1min if not specified), + // and is also used by some other caches. Together with AzureManager.lastRefresh, + // it is used to decide whether a refresh is needed. refreshInterval time.Duration // Cache content. - resourceGroup string - vmType string - vmsPoolSet map[string]struct{} // track the nodepools that're vms pool - scaleSets map[string]compute.VirtualMachineScaleSet - virtualMachines map[string][]compute.VirtualMachine + + // resourceGroup specifies the name of the resource group that this cache tracks + resourceGroup string + + // vmType can be one of vmTypeVMSS (default), vmTypeStandard + vmType string + + vmsPoolSet map[string]struct{} // track the nodepools that're vms pool + + // scaleSets keeps the set of all known scalesets in the resource group, populated/refreshed via VMSS.List() call. + // It is only used/populated if vmType is vmTypeVMSS (default). + scaleSets map[string]compute.VirtualMachineScaleSet + // virtualMachines keeps the set of all VMs in the resource group. + // It is only used/populated if vmType is vmTypeStandard. + virtualMachines map[string][]compute.VirtualMachine + + // registeredNodeGroups represents all known NodeGroups. 
registeredNodeGroups []cloudprovider.NodeGroup - instanceToNodeGroup map[azureRef]cloudprovider.NodeGroup - unownedInstances map[azureRef]bool - autoscalingOptions map[azureRef]map[string]string - skus map[string]*skewer.Cache + + // instanceToNodeGroup maintains a mapping from instance Ids to nodegroups. + // It is populated from the results of calling Nodes() on each nodegroup. + // It is used (together with unownedInstances) when looking up the nodegroup + // for a given instance id (see FindForInstance). + instanceToNodeGroup map[azureRef]cloudprovider.NodeGroup + + // unownedInstance maintains a set of instance ids not belonging to any nodegroup. + // It is used (together with instanceToNodeGroup) when looking up the nodegroup for a given instance id. + // It is reset by invalidateUnownedInstanceCache(). + unownedInstances map[azureRef]bool + + autoscalingOptions map[azureRef]map[string]string + skus map[string]*skewer.Cache } -func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmType string, enableDynamicInstanceList bool, defaultLocation string) (*azureCache, error) { +func newAzureCache(client *azClient, cacheTTL time.Duration, config Config) (*azureCache, error) { cache := &azureCache{ interrupt: make(chan struct{}), azClient: client, refreshInterval: cacheTTL, - resourceGroup: resourceGroup, - vmType: vmType, + resourceGroup: config.ResourceGroup, + vmType: config.VMType, vmsPoolSet: make(map[string]struct{}), scaleSets: make(map[string]compute.VirtualMachineScaleSet), virtualMachines: make(map[string][]compute.VirtualMachine), @@ -78,8 +115,8 @@ func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmTy skus: make(map[string]*skewer.Cache), } - if enableDynamicInstanceList { - cache.skus[defaultLocation] = &skewer.Cache{} + if config.EnableDynamicInstanceList { + cache.skus[config.Location] = &skewer.Cache{} } if err := cache.regenerate(); err != nil { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_client.go b/cluster-autoscaler/cloudprovider/azure/azure_client.go index 4928670f913c..2e0522a2e45e 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_client.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_client.go @@ -302,7 +302,7 @@ func newServicePrincipalTokenFromCredentials(config *Config, env *azure.Environm if err != nil { return nil, fmt.Errorf("getting the managed service identity endpoint: %v", err) } - if len(config.UserAssignedIdentityID) > 0 { + if config.UserAssignedIdentityID != "" { klog.V(4).Info("azure: using User Assigned MSI ID to retrieve access token") return adal.NewServicePrincipalTokenFromMSIWithUserAssignedID(msiEndpoint, env.ServiceManagementEndpoint, @@ -314,7 +314,7 @@ func newServicePrincipalTokenFromCredentials(config *Config, env *azure.Environm env.ServiceManagementEndpoint) } - if len(config.AADClientSecret) > 0 { + if config.AADClientSecret != "" { klog.V(2).Infoln("azure: using client_id+client_secret to retrieve access token") return adal.NewServicePrincipalToken( *oauthConfig, @@ -323,13 +323,13 @@ func newServicePrincipalTokenFromCredentials(config *Config, env *azure.Environm env.ServiceManagementEndpoint) } - if len(config.AADClientCertPath) > 0 && len(config.AADClientCertPassword) > 0 { + if config.AADClientCertPath != "" { klog.V(2).Infoln("azure: using jwt client_assertion (client_cert+client_private_key) to retrieve access token") certData, err := ioutil.ReadFile(config.AADClientCertPath) if err != nil { return nil, fmt.Errorf("reading the client certificate from file 
%s: %v", config.AADClientCertPath, err) } - certificate, privateKey, err := decodePkcs12(certData, config.AADClientCertPassword) + certificate, privateKey, err := adal.DecodePfxCertificateData(certData, config.AADClientCertPassword) if err != nil { return nil, fmt.Errorf("decoding the client certificate: %v", err) } @@ -399,6 +399,7 @@ func newAzClient(cfg *Config, env *azure.Environment) (*azClient, error) { // https://github.com/Azure/go-autorest/blob/main/autorest/azure/environments.go skuClient := compute.NewResourceSkusClientWithBaseURI(azClientConfig.ResourceManagerEndpoint, cfg.SubscriptionID) skuClient.Authorizer = azClientConfig.Authorizer + skuClient.UserAgent = azClientConfig.UserAgent klog.V(5).Infof("Created sku client with authorizer: %v", skuClient) agentPoolClient, err := newAgentpoolClient(cfg) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_client_test.go b/cluster-autoscaler/cloudprovider/azure/azure_client_test.go new file mode 100644 index 000000000000..7ed0dd4c01f7 --- /dev/null +++ b/cluster-autoscaler/cloudprovider/azure/azure_client_test.go @@ -0,0 +1,71 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package azure + +import ( + "os" + "testing" + + "github.com/Azure/go-autorest/autorest/adal" + "github.com/Azure/go-autorest/autorest/azure" + "github.com/stretchr/testify/assert" +) + +func TestGetServicePrincipalTokenFromCertificate(t *testing.T) { + config := &Config{ + TenantID: "TenantID", + AADClientID: "AADClientID", + AADClientCertPath: "./testdata/test.pfx", + AADClientCertPassword: "id", + } + env := &azure.PublicCloud + token, err := newServicePrincipalTokenFromCredentials(config, env) + assert.NoError(t, err) + + oauthConfig, err := adal.NewOAuthConfig(env.ActiveDirectoryEndpoint, config.TenantID) + assert.NoError(t, err) + pfxContent, err := os.ReadFile("./testdata/test.pfx") + assert.NoError(t, err) + certificate, privateKey, err := adal.DecodePfxCertificateData(pfxContent, "id") + assert.NoError(t, err) + spt, err := adal.NewServicePrincipalTokenFromCertificate( + *oauthConfig, config.AADClientID, certificate, privateKey, env.ServiceManagementEndpoint) + assert.NoError(t, err) + assert.Equal(t, token, spt) +} + +func TestGetServicePrincipalTokenFromCertificateWithoutPassword(t *testing.T) { + config := &Config{ + TenantID: "TenantID", + AADClientID: "AADClientID", + AADClientCertPath: "./testdata/testnopassword.pfx", + } + env := &azure.PublicCloud + token, err := newServicePrincipalTokenFromCredentials(config, env) + assert.NoError(t, err) + + oauthConfig, err := adal.NewOAuthConfig(env.ActiveDirectoryEndpoint, config.TenantID) + assert.NoError(t, err) + pfxContent, err := os.ReadFile("./testdata/testnopassword.pfx") + assert.NoError(t, err) + certificate, privateKey, err := adal.DecodePfxCertificateData(pfxContent, "") + assert.NoError(t, err) + spt, err := adal.NewServicePrincipalTokenFromCertificate( + *oauthConfig, config.AADClientID, certificate, privateKey, env.ServiceManagementEndpoint) + 
assert.NoError(t, err) + assert.Equal(t, token, spt) +} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go index 98b0ed185da3..114cd90a4ee2 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go @@ -31,7 +31,8 @@ import ( const ( // GPULabel is the label added to nodes with GPU resource. - GPULabel = "accelerator" + GPULabel = AKSLabelKeyPrefixValue + "accelerator" + legacyGPULabel = "accelerator" ) var ( @@ -72,7 +73,7 @@ func (azure *AzureCloudProvider) Name() string { // GPULabel returns the label added to nodes with GPU resource. func (azure *AzureCloudProvider) GPULabel() string { - return GPULabel + return legacyGPULabel // Use legacy to avoid breaking, for now } // GetAvailableGPUTypes return all available GPU types cloud provider supports diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go index a7c52f56d11e..0b064477fdb6 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go @@ -30,20 +30,20 @@ import ( "sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssvmclient/mockvmssvmclient" "github.com/Azure/go-autorest/autorest/azure" - "github.com/golang/mock/gomock" "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" ) func newTestAzureManager(t *testing.T) *AzureManager { ctrl := gomock.NewController(t) defer ctrl.Finish() - expectedScaleSets := newTestVMSSList(3, "test-vmss", "eastus", compute.Uniform) + expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", compute.Uniform) expectedVMSSVMs := newTestVMSSVMList(3) mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) mockVMSSClient.EXPECT().List(gomock.Any(), "rg").Return(expectedScaleSets, nil).AnyTimes() mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) - mockVMSSVMClient.EXPECT().List(gomock.Any(), "rg", "test-vmss", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() + mockVMSSVMClient.EXPECT().List(gomock.Any(), "rg", "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() mockVMClient := mockvmclient.NewMockInterface(ctrl) expectedVMs := newTestVMList(3) mockVMClient.EXPECT().List(gomock.Any(), "rg").Return(expectedVMs, nil).AnyTimes() @@ -56,6 +56,7 @@ func newTestAzureManager(t *testing.T) *AzureManager { VMType: vmTypeVMSS, MaxDeploymentsCount: 2, Deployment: "deployment", + Location: "eastus", }, azClient: &azClient{ virtualMachineScaleSetsClient: mockVMSSClient, @@ -81,7 +82,7 @@ func newTestAzureManager(t *testing.T) *AzureManager { }, } - cache, error := newAzureCache(manager.azClient, refreshInterval, manager.config.ResourceGroup, vmTypeVMSS, false, "") + cache, error := newAzureCache(manager.azClient, refreshInterval, *manager.config) assert.NoError(t, error) manager.azureCache = cache @@ -192,7 +193,6 @@ func TestNodeGroupForNode(t *testing.T) { expectedVMs := newTestVMList(3) for _, orchMode := range orchestrationModes { - expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", orchMode) provider := newTestProvider(t) mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) @@ -203,7 +203,6 @@ func TestNodeGroupForNode(t *testing.T) { mockVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedVMs, nil).AnyTimes() if orchMode == compute.Uniform { - 
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient @@ -211,11 +210,12 @@ func TestNodeGroupForNode(t *testing.T) { provider.azureManager.config.EnableVmssFlex = true mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), "test-asg").Return(expectedVMs, nil).AnyTimes() + provider.azureManager.azClient.virtualMachinesClient = mockVMClient } registered := provider.azureManager.RegisterNodeGroup( - newTestScaleSet(provider.azureManager, "test-asg")) - provider.azureManager.explicitlyConfigured["test-asg"] = true + newTestScaleSet(provider.azureManager, testASG)) + provider.azureManager.explicitlyConfigured[testASG] = true assert.True(t, registered) assert.Equal(t, len(provider.NodeGroups()), 1) @@ -225,21 +225,20 @@ func TestNodeGroupForNode(t *testing.T) { group, err := provider.NodeGroupForNode(node) assert.NoError(t, err) assert.NotNil(t, group, "Group should not be nil") - assert.Equal(t, group.Id(), "test-asg") + assert.Equal(t, group.Id(), testASG) assert.Equal(t, group.MinSize(), 1) assert.Equal(t, group.MaxSize(), 5) // test node in cluster that is not in a group managed by cluster autoscaler nodeNotInGroup := &apiv1.Node{ Spec: apiv1.NodeSpec{ - ProviderID: "azure:///subscriptions/subscripion/resourceGroups/test-resource-group/providers/Microsoft.Compute/virtualMachines/test-instance-id-not-in-group", + ProviderID: azurePrefix + "/subscriptions/subscripion/resourceGroups/test-resource-group/providers/Microsoft.Compute/virtualMachines/test-instance-id-not-in-group", }, } group, err = provider.NodeGroupForNode(nodeNotInGroup) assert.NoError(t, err) assert.Nil(t, group) } - } func TestNodeGroupForNodeWithNoProviderId(t *testing.T) { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_config.go b/cluster-autoscaler/cloudprovider/azure/azure_config.go index aa366f3a6e5d..2f1e3b6023f6 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_config.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_config.go @@ -143,6 +143,15 @@ type Config struct { // EnableVmssFlex defines whether to enable Vmss Flex support or not EnableVmssFlex bool `json:"enableVmssFlex,omitempty" yaml:"enableVmssFlex,omitempty"` + + // (DEPRECATED, DO NOT USE) EnableForceDelete defines whether to enable force deletion on the APIs + EnableForceDelete bool `json:"enableForceDelete,omitempty" yaml:"enableForceDelete,omitempty"` + + // (DEPRECATED, DO NOT USE) EnableDetailedCSEMessage defines whether to emit error messages in the CSE error body info + EnableDetailedCSEMessage bool `json:"enableDetailedCSEMessage,omitempty" yaml:"enableDetailedCSEMessage,omitempty"` + + // (DEPRECATED, DO NOT USE) GetVmssSizeRefreshPeriod defines how frequently to call GET VMSS API to fetch VMSS info per nodegroup instance + GetVmssSizeRefreshPeriod time.Duration `json:"getVmssSizeRefreshPeriod,omitempty" yaml:"getVmssSizeRefreshPeriod,omitempty"` } // BuildAzureConfig returns a Config object for the Azure clients diff --git a/cluster-autoscaler/cloudprovider/azure/azure_config_test.go b/cluster-autoscaler/cloudprovider/azure/azure_config_test.go index 5adeba8c0687..65c74a5c90a1 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_config_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_config_test.go @@ -105,6 +105,7 @@ func 
TestInitializeCloudProviderRateLimitConfigWithReadAndWriteRateLimitAlreadyS assert.Equal(t, configWithRateLimits.CloudProviderRateLimitBucketWrite, rateLimitWriteBuckets) } +// nolint: goconst func TestInitializeCloudProviderRateLimitConfigWithInvalidReadAndWriteRateLimitSettingsFromEnv(t *testing.T) { emptyConfig := &CloudProviderRateLimitConfig{} var rateLimitReadQPS float32 = 3.0 diff --git a/cluster-autoscaler/cloudprovider/azure/azure_force_delete_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_force_delete_scale_set.go new file mode 100644 index 000000000000..ee83119084e1 --- /dev/null +++ b/cluster-autoscaler/cloudprovider/azure/azure_force_delete_scale_set.go @@ -0,0 +1,86 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package azure + +import ( + "context" + "strings" + + "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute" + "github.com/Azure/go-autorest/autorest/azure" + + "k8s.io/klog/v2" + "sigs.k8s.io/cloud-provider-azure/pkg/retry" +) + +// When Azure Dedicated Host is enabled or using isolated vm skus, force deleting a VMSS fails with the following error: + +// "predominantErrorDetail": { +// "innererror": { +// "internalErrorCode": "OperationNotAllowedOnResourceThatManagesUpdatesWithMaintenanceControl" +// }, +// "code": "OperationNotAllowed", +// "message": "Operation 'ForceDelete' is not allowed on resource 'aks-newnp-11436513-vmss' since it manages updates using maintenance control." +// }, +// +// A programmatic way to determine if a VM size is isolated or not has not been found. The isolated VM documentation: +// https://docs.microsoft.com/en-us/azure/virtual-machines/isolation +// has the current list of isolated VM sizes, but new isolated VM sizes could be introduced in the future. +// +// As a result of not being able to find out if a VM size is isolated or not, we'll do the following: +// - if scaleSet has isolated vm size or dedicated host, disable forceDelete +// - else use forceDelete +// - if a new isolated sku was added or dedicatedHost was not updated properly, this forceDelete call will fail with above error. 
+// In that case, call normal delete (fall-back) + +var isolatedVMSizes = map[string]bool{ + strings.ToLower("Standard_E80ids_v4"): true, + strings.ToLower("Standard_E80is_v4"): true, + strings.ToLower("Standard_E104i_v5"): true, + strings.ToLower("Standard_E104is_v5"): true, + strings.ToLower("Standard_E104id_v5"): true, + strings.ToLower("Standard_E104ids_v5"): true, + strings.ToLower("Standard_M192is_v2"): true, + strings.ToLower("Standard_M192ims_v2"): true, + strings.ToLower("Standard_M192ids_v2"): true, + strings.ToLower("Standard_M192idms_v2"): true, + strings.ToLower("Standard_F72s_v2"): true, + strings.ToLower("Standard_M128ms"): true, +} + +func (scaleSet *ScaleSet) deleteInstances(ctx context.Context, requiredIds *compute.VirtualMachineScaleSetVMInstanceRequiredIDs, commonAsgId string) (*azure.Future, *retry.Error) { + scaleSet.instanceMutex.Lock() + defer scaleSet.instanceMutex.Unlock() + + skuName := scaleSet.getSKU() + resourceGroup := scaleSet.manager.config.ResourceGroup + forceDelete := shouldForceDelete(skuName, scaleSet) + future, rerr := scaleSet.manager.azClient.virtualMachineScaleSetsClient.DeleteInstancesAsync(ctx, resourceGroup, commonAsgId, *requiredIds, forceDelete) + if forceDelete && isOperationNotAllowed(rerr) { + klog.Infof("falling back to normal delete for instances %v for %s", requiredIds.InstanceIds, scaleSet.Name) + return scaleSet.manager.azClient.virtualMachineScaleSetsClient.DeleteInstancesAsync(ctx, resourceGroup, commonAsgId, *requiredIds, false) + } + return future, rerr +} + +func shouldForceDelete(skuName string, scaleSet *ScaleSet) bool { + return scaleSet.enableForceDelete && !isolatedVMSizes[strings.ToLower(skuName)] && !scaleSet.dedicatedHost +} + +func isOperationNotAllowed(rerr *retry.Error) bool { + return rerr != nil && rerr.ServiceErrorCode() == retry.OperationNotAllowed +} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_force_delete_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_force_delete_scale_set_test.go new file mode 100644 index 000000000000..c491c54c1e23 --- /dev/null +++ b/cluster-autoscaler/cloudprovider/azure/azure_force_delete_scale_set_test.go @@ -0,0 +1,79 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package azure + +import ( + "net/http" + "testing" + + "github.com/Azure/go-autorest/autorest/azure" + "github.com/stretchr/testify/assert" + "sigs.k8s.io/cloud-provider-azure/pkg/retry" +) + +func TestShouldForceDelete(t *testing.T) { + skuName := "test-vmssSku" + + t.Run("should return true", func(t *testing.T) { + scaleSet := &ScaleSet{} + scaleSet.enableForceDelete = true + assert.Equal(t, shouldForceDelete(skuName, scaleSet), true) + }) + + t.Run("should return false because of dedicated hosts", func(t *testing.T) { + scaleSet := &ScaleSet{} + scaleSet.enableForceDelete = true + scaleSet.dedicatedHost = true + assert.Equal(t, shouldForceDelete(skuName, scaleSet), false) + }) + + t.Run("should return false because of isolated sku", func(t *testing.T) { + scaleSet := &ScaleSet{} + scaleSet.enableForceDelete = true + skuName = "Standard_F72s_v2" // belongs to the map isolatedVMSizes + assert.Equal(t, shouldForceDelete(skuName, scaleSet), false) + }) + +} + +func TestIsOperationNotAllowed(t *testing.T) { + t.Run("should return false because it's not OperationNotAllowed error", func(t *testing.T) { + error := &retry.Error{ + HTTPStatusCode: http.StatusBadRequest, + } + assert.Equal(t, isOperationNotAllowed(error), false) + }) + + t.Run("should return false because error is nil", func(t *testing.T) { + assert.Equal(t, isOperationNotAllowed(nil), false) + }) + + t.Run("should return true if error is OperationNotAllowed", func(t *testing.T) { + sre := &azure.ServiceError{ + Code: retry.OperationNotAllowed, + Message: "error-message", + } + error := &retry.Error{ + RawError: sre, + } + assert.Equal(t, isOperationNotAllowed(error), false) + }) + + // It is difficult to condition the case where return error matched expected error string for forceDelete and the + // function should return true. + +} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_instance_gpu_sku.go b/cluster-autoscaler/cloudprovider/azure/azure_instance_gpu_sku.go index aea520027248..577bf47845e0 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_instance_gpu_sku.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_instance_gpu_sku.go @@ -17,9 +17,10 @@ limitations under the License. package azure import ( + "strings" + "github.com/Azure/skewer" "github.com/pkg/errors" - "strings" ) var ( diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager.go b/cluster-autoscaler/cloudprovider/azure/azure_manager.go index 637b5a897805..1b8f704072b7 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager.go @@ -26,13 +26,18 @@ import ( "time" "github.com/Azure/go-autorest/autorest/azure" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/config" "k8s.io/autoscaler/cluster-autoscaler/config/dynamic" + kretry "k8s.io/client-go/util/retry" klog "k8s.io/klog/v2" + "sigs.k8s.io/cloud-provider-azure/pkg/retry" ) const ( + azurePrefix = "azure://" + vmTypeVMSS = "vmss" vmTypeStandard = "standard" @@ -47,8 +52,19 @@ type AzureManager struct { azClient *azClient env azure.Environment - azureCache *azureCache - lastRefresh time.Time + // azureCache is used for caching Azure resources. + // It keeps track of nodegroups and instances + // (and of which nodegroup instances belong to) + azureCache *azureCache + // lastRefresh is the time azureCache was last refreshed. 
+ // Together with azureCache.refreshInterval is it used to decide whether + // it is time to refresh the cache from Azure resources. + // + // Cache invalidation can also be requested via invalidateCache() + // (used by both AzureManager and ScaleSet), which manipulates + // lastRefresh to force refresh on the next check. + lastRefresh time.Time + autoDiscoverySpecs []labelAutoDiscoveryConfig explicitlyConfigured map[string]bool } @@ -90,7 +106,7 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi if cfg.VmssCacheTTL != 0 { cacheTTL = time.Duration(cfg.VmssCacheTTL) * time.Second } - cache, err := newAzureCache(azClient, cacheTTL, cfg.ResourceGroup, cfg.VMType, cfg.EnableDynamicInstanceList, cfg.Location) + cache, err := newAzureCache(azClient, cacheTTL, *cfg) if err != nil { return nil, err } @@ -106,8 +122,22 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi return nil, err } + retryBackoff := wait.Backoff{ + Duration: 2 * time.Minute, + Factor: 1.0, + Jitter: 0.1, + Steps: 6, + Cap: 10 * time.Minute, + } + if err := manager.forceRefresh(); err != nil { - return nil, err + err = kretry.OnError(retryBackoff, retry.IsErrorRetriable, func() (err error) { + return manager.forceRefresh() + }) + if err != nil { + return nil, err + } + return manager, nil } return manager, nil @@ -156,7 +186,7 @@ func (m *AzureManager) buildNodeGroupFromSpec(spec string) (cloudprovider.NodeGr case vmTypeStandard: return NewAgentPool(s, m) case vmTypeVMSS: - return NewScaleSet(s, m, -1) + return NewScaleSet(s, m, -1, false) default: return nil, fmt.Errorf("vmtype %s not supported", m.config.VMType) } @@ -184,6 +214,8 @@ func (m *AzureManager) forceRefresh() error { return nil } +// invalidateCache forces cache reload on the next check +// by manipulating lastRefresh timestamp func (m *AzureManager) invalidateCache() { m.lastRefresh = time.Now().Add(-1 * m.azureCache.refreshInterval) klog.V(2).Infof("Invalidated Azure cache") @@ -347,7 +379,9 @@ func (m *AzureManager) getFilteredScaleSets(filter []labelAutoDiscoveryConfig) ( curSize = *scaleSet.Sku.Capacity } - vmss, err := NewScaleSet(spec, m, curSize) + dedicatedHost := scaleSet.VirtualMachineScaleSetProperties != nil && scaleSet.VirtualMachineScaleSetProperties.HostGroup != nil + + vmss, err := NewScaleSet(spec, m, curSize, dedicatedHost) if err != nil { klog.Warningf("ignoring vmss %q %s", *scaleSet.Name, err) continue diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go index 95e76041690d..a17f619385f9 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go @@ -29,8 +29,8 @@ import ( "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2017-05-10/resources" "github.com/Azure/go-autorest/autorest/date" "github.com/Azure/go-autorest/autorest/to" - "github.com/golang/mock/gomock" "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/config" azclients "sigs.k8s.io/cloud-provider-azure/pkg/azureclients" @@ -136,7 +136,10 @@ const validAzureCfgForStandardVMTypeWithoutDeploymentParameters = `{ "deployment":"cluster-autoscaler-0001" }` -const invalidAzureCfg = `{{}"cloud": "AzurePublicCloud",}` +const ( + invalidAzureCfg = `{{}"cloud": "AzurePublicCloud",}` + testASG = "test-asg" +) func TestCreateAzureManagerValidConfig(t 
*testing.T) { ctrl := gomock.NewController(t) @@ -688,12 +691,13 @@ func TestGetFilteredAutoscalingGroupsVmss(t *testing.T) { azureRef: azureRef{ Name: vmssName, }, - minSize: minVal, - maxSize: maxVal, - manager: manager, - curSize: 3, - sizeRefreshPeriod: manager.azureCache.refreshInterval, - instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod, + minSize: minVal, + maxSize: maxVal, + manager: manager, + curSize: 3, + sizeRefreshPeriod: manager.azureCache.refreshInterval, + getVmssSizeRefreshPeriod: manager.azureCache.refreshInterval, + InstanceCache: InstanceCache{instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod}, }} assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs) } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index f0b2e83f07e8..73714e0ab668 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -33,11 +33,13 @@ import ( "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute" "github.com/Azure/go-autorest/autorest/azure" + "github.com/Azure/go-autorest/autorest/to" ) var ( defaultVmssInstancesRefreshPeriod = 5 * time.Minute vmssContextTimeout = 3 * time.Minute + asyncContextTimeout = 30 * time.Minute vmssSizeMutex sync.Mutex ) @@ -58,35 +60,56 @@ type ScaleSet struct { minSize int maxSize int - sizeMutex sync.Mutex - curSize int64 - + enableForceDelete bool enableDynamicInstanceList bool + enableDetailedCSEMessage bool - lastSizeRefresh time.Time + // Current Size (Number of VMs) + + // curSize tracks (and caches) the number of VMs in this ScaleSet. + // It is periodically updated from vmss.Sku.Capacity, with VMSS itself coming + // either from azure.Cache (which periodically does VMSS.List) + // or from direct VMSS.Get (always used for Spot). + curSize int64 + // sizeRefreshPeriod is how often curSize is refreshed from vmss.Sku.Capacity. + // (Set from azureCache.refreshInterval = VmssCacheTTL or [defaultMetadataCache]refreshInterval = 1min) sizeRefreshPeriod time.Duration + // lastSizeRefresh is the time curSize was last refreshed from vmss.Sku.Capacity. + // Together with sizeRefreshPeriod, it is used to determine if it is time to refresh curSize. + lastSizeRefresh time.Time + // getVmssSizeRefreshPeriod is how often curSize should be refreshed in case VMSS.Get call is used. + // (Set from GetVmssSizeRefreshPeriod, if specified = get-vmss-size-refresh-period = 30s + getVmssSizeRefreshPeriod time.Duration + // sizeMutex protects curSize (the number of VMs in the ScaleSet) from concurrent access + sizeMutex sync.Mutex - instancesRefreshPeriod time.Duration - instancesRefreshJitter int + InstanceCache - instanceMutex sync.Mutex - instanceCache []cloudprovider.Instance - lastInstanceRefresh time.Time + // uses Azure Dedicated Host + dedicatedHost bool } // NewScaleSet creates a new NewScaleSet. 
-func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) (*ScaleSet, error) { +func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64, dedicatedHost bool) (*ScaleSet, error) { scaleSet := &ScaleSet{ azureRef: azureRef{ Name: spec.Name, }, - minSize: spec.MinSize, - maxSize: spec.MaxSize, - manager: az, - curSize: curSize, - sizeRefreshPeriod: az.azureCache.refreshInterval, + + minSize: spec.MinSize, + maxSize: spec.MaxSize, + + manager: az, + curSize: curSize, + sizeRefreshPeriod: az.azureCache.refreshInterval, + InstanceCache: InstanceCache{ + instancesRefreshJitter: az.config.VmssVmsCacheJitter, + }, + + enableForceDelete: az.config.EnableForceDelete, enableDynamicInstanceList: az.config.EnableDynamicInstanceList, - instancesRefreshJitter: az.config.VmssVmsCacheJitter, + enableDetailedCSEMessage: az.config.EnableDetailedCSEMessage, + dedicatedHost: dedicatedHost, } if az.config.VmssVmsCacheTTL != 0 { @@ -95,6 +118,16 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) ( scaleSet.instancesRefreshPeriod = defaultVmssInstancesRefreshPeriod } + if az.config.GetVmssSizeRefreshPeriod != 0 { + scaleSet.getVmssSizeRefreshPeriod = az.config.GetVmssSizeRefreshPeriod + } else { + scaleSet.getVmssSizeRefreshPeriod = az.azureCache.refreshInterval + } + + if az.config.EnableDetailedCSEMessage { + klog.V(2).Infof("enableDetailedCSEMessage: %t", scaleSet.enableDetailedCSEMessage) + } + return scaleSet, nil } @@ -154,17 +187,50 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) { scaleSet.sizeMutex.Lock() defer scaleSet.sizeMutex.Unlock() - if scaleSet.lastSizeRefresh.Add(scaleSet.sizeRefreshPeriod).After(time.Now()) { - klog.V(3).Infof("VMSS: %s, returning in-memory size: %d", scaleSet.Name, scaleSet.curSize) - return scaleSet.curSize, nil - } - set, err := scaleSet.getVMSSFromCache() if err != nil { klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err) return -1, err } + // // Remove check for returning in-memory size when VMSS is in updating state + // // If VMSS state is updating, return the currentSize which would've been proactively incremented or decremented by CA + // // unless it's -1. In that case, its better to initialize it. + // if scaleSet.curSize != -1 && set.VirtualMachineScaleSetProperties != nil && + // strings.EqualFold(to.String(set.VirtualMachineScaleSetProperties.ProvisioningState), string(compute.GalleryProvisioningStateUpdating)) { + // klog.V(3).Infof("VMSS %q is in updating state, returning cached size: %d", scaleSet.Name, scaleSet.curSize) + // return scaleSet.curSize, nil + // } + + effectiveSizeRefreshPeriod := scaleSet.sizeRefreshPeriod + + // If the scale set is Spot, we want to have a more fresh view of the Sku.Capacity field. + // This is because evictions can happen + // at any given point in time, even before VMs are materialized as + // nodes. We should be able to react to those and have the autoscaler + // readjust the goal again to force restoration. 
+ if isSpot(&set) { + effectiveSizeRefreshPeriod = scaleSet.getVmssSizeRefreshPeriod + } + + if scaleSet.lastSizeRefresh.Add(effectiveSizeRefreshPeriod).After(time.Now()) { + klog.V(3).Infof("VMSS: %s, returning in-memory size: %d", scaleSet.Name, scaleSet.curSize) + return scaleSet.curSize, nil + } + + // If the scale set is on Spot, make a GET VMSS call to fetch more up-to-date info + if isSpot(&set) { + ctx, cancel := getContextWithCancel() + defer cancel() + + var rerr *retry.Error + set, rerr = scaleSet.manager.azClient.virtualMachineScaleSetsClient.Get(ctx, scaleSet.manager.config.ResourceGroup, scaleSet.Name) + if rerr != nil { + klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, rerr) + return -1, rerr.Error() + } + } + vmssSizeMutex.Lock() curSize := *set.Sku.Capacity vmssSizeMutex.Unlock() @@ -181,30 +247,25 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) { return scaleSet.curSize, nil } -// GetScaleSetSize gets Scale Set size. -func (scaleSet *ScaleSet) GetScaleSetSize() (int64, error) { - return scaleSet.getCurSize() -} - -func (scaleSet *ScaleSet) waitForDeleteInstances(future *azure.Future, requiredIds *compute.VirtualMachineScaleSetVMInstanceRequiredIDs) { - ctx, cancel := getContextWithCancel() - defer cancel() - - klog.V(3).Infof("Calling virtualMachineScaleSetsClient.WaitForDeleteInstancesResult(%v) for %s", requiredIds.InstanceIds, scaleSet.Name) - httpResponse, err := scaleSet.manager.azClient.virtualMachineScaleSetsClient.WaitForDeleteInstancesResult(ctx, future, scaleSet.manager.config.ResourceGroup) - isSuccess, err := isSuccessHTTPResponse(httpResponse, err) - if isSuccess { - klog.V(3).Infof("virtualMachineScaleSetsClient.WaitForDeleteInstancesResult(%v) for %s success", requiredIds.InstanceIds, scaleSet.Name) - return +// getScaleSetSize gets Scale Set size. +func (scaleSet *ScaleSet) getScaleSetSize() (int64, error) { + // First, get the size of the ScaleSet reported by API + // -1 indicates the ScaleSet hasn't been initialized + size, err := scaleSet.getCurSize() + if size == -1 || err != nil { + klog.V(3).Infof("getScaleSetSize: either size is -1 (actual: %d) or error exists (actual err:%v)", size, err) + return size, err } - klog.Errorf("virtualMachineScaleSetsClient.WaitForDeleteInstancesResult - DeleteInstances for instances %v for %s failed with error: %v", requiredIds.InstanceIds, scaleSet.Name, err) + return size, nil } -// updateVMSSCapacity invokes virtualMachineScaleSetsClient to update the capacity for VMSS. -func (scaleSet *ScaleSet) updateVMSSCapacity(future *azure.Future) { +// waitForCreateOrUpdateInstances waits for the outcome of VMSS capacity update initiated via CreateOrUpdateAsync. +func (scaleSet *ScaleSet) waitForCreateOrUpdateInstances(future *azure.Future) { var err error defer func() { + // Invalidate instanceCache on success and failure. Failure might have created a few instances, but it is very rare. + scaleSet.invalidateInstanceCache() if err != nil { klog.Errorf("Failed to update the capacity for vmss %s with error %v, invalidate the cache so as to get the real size from API", scaleSet.Name, err) // Invalidate the VMSS size cache in order to fetch the size from the API. 
@@ -213,7 +274,7 @@ func (scaleSet *ScaleSet) updateVMSSCapacity(future *azure.Future) { } }() - ctx, cancel := getContextWithCancel() + ctx, cancel := getContextWithTimeout(asyncContextTimeout) defer cancel() klog.V(3).Infof("Calling virtualMachineScaleSetsClient.WaitForCreateOrUpdateResult(%s)", scaleSet.Name) @@ -221,57 +282,41 @@ func (scaleSet *ScaleSet) updateVMSSCapacity(future *azure.Future) { isSuccess, err := isSuccessHTTPResponse(httpResponse, err) if isSuccess { - klog.V(3).Infof("virtualMachineScaleSetsClient.WaitForCreateOrUpdateResult(%s) success", scaleSet.Name) - scaleSet.invalidateInstanceCache() + klog.V(3).Infof("waitForCreateOrUpdateInstances(%s) success", scaleSet.Name) return } - klog.Errorf("virtualMachineScaleSetsClient.WaitForCreateOrUpdateResult - updateVMSSCapacity for scale set %q failed: %v", scaleSet.Name, err) + klog.Errorf("waitForCreateOrUpdateInstances(%s) failed, err: %v", scaleSet.Name, err) } -// SetScaleSetSize sets ScaleSet size. -func (scaleSet *ScaleSet) SetScaleSetSize(size int64) error { - scaleSet.sizeMutex.Lock() - defer scaleSet.sizeMutex.Unlock() - +// setScaleSetSize sets ScaleSet size. +func (scaleSet *ScaleSet) setScaleSetSize(size int64, delta int) error { vmssInfo, err := scaleSet.getVMSSFromCache() if err != nil { klog.Errorf("Failed to get information for VMSS (%q): %v", scaleSet.Name, err) return err } - // Update the new capacity to cache. - vmssSizeMutex.Lock() - vmssInfo.Sku.Capacity = &size - vmssSizeMutex.Unlock() + requiredInstances := delta - // Compose a new VMSS for updating. - op := compute.VirtualMachineScaleSet{ - Name: vmssInfo.Name, - Sku: vmssInfo.Sku, - Location: vmssInfo.Location, - } - ctx, cancel := getContextWithTimeout(vmssContextTimeout) - defer cancel() - klog.V(3).Infof("Waiting for virtualMachineScaleSetsClient.CreateOrUpdateAsync(%s)", scaleSet.Name) - future, rerr := scaleSet.manager.azClient.virtualMachineScaleSetsClient.CreateOrUpdateAsync(ctx, scaleSet.manager.config.ResourceGroup, scaleSet.Name, op) - if rerr != nil { - klog.Errorf("virtualMachineScaleSetsClient.CreateOrUpdate for scale set %q failed: %v", scaleSet.Name, rerr) - return rerr.Error() + // If after reallocating instances we still need more instances or we're just in Delete mode + // send a scale request + if requiredInstances > 0 { + klog.V(3).Infof("Remaining unsatisfied count is %d. Attempting to increase scale set %q "+ + "capacity", requiredInstances, scaleSet.Name) + err := scaleSet.createOrUpdateInstances(&vmssInfo, size) + if err != nil { + klog.Errorf("Failed to increase capacity for scale set %q to %d: %v", scaleSet.Name, requiredInstances, err) + return err + } } - - // Proactively set the VMSS size so autoscaler makes better decisions. - scaleSet.curSize = size - scaleSet.lastSizeRefresh = time.Now() - - go scaleSet.updateVMSSCapacity(future) return nil } // TargetSize returns the current TARGET size of the node group. It is possible that the // number is different from the number of nodes registered in Kubernetes. 
func (scaleSet *ScaleSet) TargetSize() (int, error) { - size, err := scaleSet.GetScaleSetSize() + size, err := scaleSet.getScaleSetSize() return int(size), err } @@ -281,7 +326,7 @@ func (scaleSet *ScaleSet) IncreaseSize(delta int) error { return fmt.Errorf("size increase must be positive") } - size, err := scaleSet.GetScaleSetSize() + size, err := scaleSet.getScaleSetSize() if err != nil { return err } @@ -294,18 +339,26 @@ func (scaleSet *ScaleSet) IncreaseSize(delta int) error { return fmt.Errorf("size increase too large - desired:%d max:%d", int(size)+delta, scaleSet.MaxSize()) } - return scaleSet.SetScaleSetSize(size + int64(delta)) + return scaleSet.setScaleSetSize(size+int64(delta), delta) } // GetScaleSetVms returns list of nodes for the given scale set. func (scaleSet *ScaleSet) GetScaleSetVms() ([]compute.VirtualMachineScaleSetVM, *retry.Error) { - klog.V(4).Infof("GetScaleSetVms: starts") ctx, cancel := getContextWithTimeout(vmssContextTimeout) defer cancel() - resourceGroup := scaleSet.manager.config.ResourceGroup - vmList, rerr := scaleSet.manager.azClient.virtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSet.Name, "instanceView") + var vmList []compute.VirtualMachineScaleSetVM + var rerr *retry.Error + if scaleSet.enableDetailedCSEMessage { + vmList, rerr = scaleSet.manager.azClient.virtualMachineScaleSetVMsClient.List(ctx, scaleSet.manager.config.ResourceGroup, + scaleSet.Name, string(compute.InstanceViewTypesInstanceView)) + } else { + vmList, rerr = scaleSet.manager.azClient.virtualMachineScaleSetVMsClient.List(ctx, scaleSet.manager.config.ResourceGroup, + scaleSet.Name, "") + } + klog.V(4).Infof("GetScaleSetVms: scaleSet.Name: %s, vmList: %v", scaleSet.Name, vmList) + if rerr != nil { klog.Errorf("VirtualMachineScaleSetVMsClient.List failed for %s: %v", scaleSet.Name, rerr) return nil, rerr @@ -348,7 +401,7 @@ func (scaleSet *ScaleSet) DecreaseTargetSize(delta int) error { // VMSS size should be changed automatically after the Node deletion, hence this operation is not required. // To prevent some unreproducible bugs, an extra refresh of cache is needed. scaleSet.invalidateInstanceCache() - _, err := scaleSet.GetScaleSetSize() + _, err := scaleSet.getScaleSetSize() if err != nil { klog.Warningf("DecreaseTargetSize: failed with error: %v", err) } @@ -376,7 +429,53 @@ func (scaleSet *ScaleSet) Belongs(node *apiv1.Node) (bool, error) { return true, nil } -// DeleteInstances deletes the given instances. All instances must be controlled by the same ASG. +func (scaleSet *ScaleSet) createOrUpdateInstances(vmssInfo *compute.VirtualMachineScaleSet, newSize int64) error { + if vmssInfo == nil { + return fmt.Errorf("vmssInfo cannot be nil while increating scaleSet capacity") + } + + scaleSet.sizeMutex.Lock() + defer scaleSet.sizeMutex.Unlock() + + // Update the new capacity to cache. + vmssSizeMutex.Lock() + vmssInfo.Sku.Capacity = &newSize + vmssSizeMutex.Unlock() + + // Compose a new VMSS for updating. 
+ op := compute.VirtualMachineScaleSet{ + Name: vmssInfo.Name, + Sku: vmssInfo.Sku, + Location: vmssInfo.Location, + } + + if vmssInfo.ExtendedLocation != nil { + op.ExtendedLocation = &compute.ExtendedLocation{ + Name: vmssInfo.ExtendedLocation.Name, + Type: vmssInfo.ExtendedLocation.Type, + } + + klog.V(3).Infof("Passing ExtendedLocation information if it is not nil, with Edge Zone name:(%s)", *op.ExtendedLocation.Name) + } + + ctx, cancel := getContextWithTimeout(vmssContextTimeout) + defer cancel() + klog.V(3).Infof("Waiting for virtualMachineScaleSetsClient.CreateOrUpdateAsync(%s)", scaleSet.Name) + future, rerr := scaleSet.manager.azClient.virtualMachineScaleSetsClient.CreateOrUpdateAsync(ctx, scaleSet.manager.config.ResourceGroup, scaleSet.Name, op) + if rerr != nil { + klog.Errorf("virtualMachineScaleSetsClient.CreateOrUpdate for scale set %q failed: %+v", scaleSet.Name, rerr) + return rerr.Error() + } + + // Proactively set the VMSS size so autoscaler makes better decisions. + scaleSet.curSize = newSize + scaleSet.lastSizeRefresh = time.Now() + + go scaleSet.waitForCreateOrUpdateInstances(future) + return nil +} + +// DeleteInstances deletes the given instances. All instances must be controlled by the same nodegroup. func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregisteredNodes bool) error { if len(instances) == 0 { return nil @@ -391,16 +490,13 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregistered instancesToDelete := []*azureRef{} for _, instance := range instances { - asg, err := scaleSet.manager.GetNodeGroupForInstance(instance) + err = scaleSet.verifyNodeGroup(instance, commonAsg.Id()) if err != nil { return err } - if !strings.EqualFold(asg.Id(), commonAsg.Id()) { - return fmt.Errorf("cannot delete instance (%s) which don't belong to the same Scale Set (%q)", instance.Name, commonAsg) - } - - if cpi, found := scaleSet.getInstanceByProviderID(instance.Name); found && cpi.Status != nil && cpi.Status.State == cloudprovider.InstanceDeleting { + if cpi, found, err := scaleSet.getInstanceByProviderID(instance.Name); found && err == nil && cpi.Status != nil && + cpi.Status.State == cloudprovider.InstanceDeleting { klog.V(3).Infof("Skipping deleting instance %s as its current state is deleting", instance.Name) continue } @@ -429,14 +525,10 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregistered ctx, cancel := getContextWithTimeout(vmssContextTimeout) defer cancel() - resourceGroup := scaleSet.manager.config.ResourceGroup - scaleSet.instanceMutex.Lock() - klog.V(3).Infof("Calling virtualMachineScaleSetsClient.DeleteInstancesAsync(%v)", requiredIds.InstanceIds) - future, rerr := scaleSet.manager.azClient.virtualMachineScaleSetsClient.DeleteInstancesAsync(ctx, resourceGroup, commonAsg.Id(), *requiredIds, false) - scaleSet.instanceMutex.Unlock() + future, rerr := scaleSet.deleteInstances(ctx, requiredIds, commonAsg.Id()) if rerr != nil { - klog.Errorf("virtualMachineScaleSetsClient.DeleteInstancesAsync for instances %v failed: %v", requiredIds.InstanceIds, rerr) + klog.Errorf("virtualMachineScaleSetsClient.DeleteInstancesAsync for instances %v for %s failed: %+v", requiredIds.InstanceIds, scaleSet.Name, rerr) return rerr.Error() } @@ -456,14 +548,30 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregistered } go scaleSet.waitForDeleteInstances(future, requiredIds) - return nil } +func (scaleSet *ScaleSet) waitForDeleteInstances(future *azure.Future, requiredIds 
*compute.VirtualMachineScaleSetVMInstanceRequiredIDs) { + ctx, cancel := getContextWithTimeout(asyncContextTimeout) + + defer cancel() + klog.V(3).Infof("Calling virtualMachineScaleSetsClient.WaitForDeleteInstancesResult(%v) for %s", requiredIds.InstanceIds, scaleSet.Name) + httpResponse, err := scaleSet.manager.azClient.virtualMachineScaleSetsClient.WaitForDeleteInstancesResult(ctx, future, scaleSet.manager.config.ResourceGroup) + isSuccess, err := isSuccessHTTPResponse(httpResponse, err) + if isSuccess { + klog.V(3).Infof(".WaitForDeleteInstancesResult(%v) for %s success", requiredIds.InstanceIds, scaleSet.Name) + // No need to invalidateInstanceCache because instanceStates were proactively set to "deleting" + return + } + // On failure, invalidate the instanceCache - cannot have instances in deletingState + scaleSet.invalidateInstanceCache() + klog.Errorf("WaitForDeleteInstancesResult(%v) for %s failed with error: %v", requiredIds.InstanceIds, scaleSet.Name, err) +} + // DeleteNodes deletes the nodes from the group. func (scaleSet *ScaleSet) DeleteNodes(nodes []*apiv1.Node) error { klog.V(8).Infof("Delete nodes requested: %q\n", nodes) - size, err := scaleSet.GetScaleSetSize() + size, err := scaleSet.getScaleSetSize() if err != nil { return err } @@ -472,8 +580,11 @@ func (scaleSet *ScaleSet) DeleteNodes(nodes []*apiv1.Node) error { return fmt.Errorf("min size reached, nodes will not be deleted") } + // Distinguish between unregistered node deletion and normal node deletion refs := make([]*azureRef, 0, len(nodes)) hasUnregisteredNodes := false + unregisteredRefs := make([]*azureRef, 0, len(nodes)) + for _, node := range nodes { belongs, err := scaleSet.Belongs(node) if err != nil { @@ -490,7 +601,18 @@ func (scaleSet *ScaleSet) DeleteNodes(nodes []*apiv1.Node) error { ref := &azureRef{ Name: node.Spec.ProviderID, } - refs = append(refs, ref) + + if node.Annotations[cloudprovider.FakeNodeReasonAnnotation] == cloudprovider.FakeNodeUnregistered { + klog.V(5).Infof("Node: %s type is unregistered..Appending to the unregistered list", node.Name) + unregisteredRefs = append(unregisteredRefs, ref) + } else { + refs = append(refs, ref) + } + } + + if len(unregisteredRefs) > 0 { + klog.V(3).Infof("Removing unregisteredNodes: %v", unregisteredRefs) + return scaleSet.DeleteInstances(unregisteredRefs, true) } return scaleSet.DeleteInstances(refs, hasUnregisteredNodes) @@ -513,7 +635,8 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, erro return nil, err } - node, err := buildNodeFromTemplate(scaleSet.Name, template, scaleSet.manager) + node, err := buildNodeFromTemplate(scaleSet.Name, template, scaleSet.manager, scaleSet.enableDynamicInstanceList) + if err != nil { return nil, err } @@ -525,7 +648,6 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, erro // Nodes returns a list of all nodes that belong to this node group. 
func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) { - klog.V(4).Infof("Nodes: starts, scaleSet.Name: %s", scaleSet.Name) curSize, err := scaleSet.getCurSize() if err != nil { klog.Errorf("Failed to get current size for vmss %q: %v", scaleSet.Name, err) @@ -541,73 +663,84 @@ func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) { return scaleSet.instanceCache, nil } - klog.V(4).Infof("Nodes: starts to get VMSS VMs") - splay := rand.New(rand.NewSource(time.Now().UnixNano())).Intn(scaleSet.instancesRefreshJitter + 1) - lastRefresh := time.Now().Add(-time.Second * time.Duration(splay)) - - orchestrationMode, err := scaleSet.getOrchestrationMode() + // Forcefully updating the instanceCache as the instanceCacheSize didn't match curSize or cache is invalid. + err = scaleSet.updateInstanceCache() if err != nil { - klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err) return nil, err } - klog.V(4).Infof("VMSS: orchestration Mode %s", orchestrationMode) - - if orchestrationMode == compute.Uniform { - err := scaleSet.buildScaleSetCache(lastRefresh) - if err != nil { - return nil, err - } - - } else if orchestrationMode == compute.Flexible { - if scaleSet.manager.config.EnableVmssFlex { - err := scaleSet.buildScaleSetCacheForFlex(lastRefresh) - if err != nil { - return nil, err - } - } else { - return nil, fmt.Errorf("vmss - %q with Flexible orchestration detected but 'enableVmssFlex' feature flag is turned off", scaleSet.Name) - } - - } else { - return nil, fmt.Errorf("Failed to determine orchestration mode for vmss %q", scaleSet.Name) - } - klog.V(4).Infof("Nodes: returns") return scaleSet.instanceCache, nil } -func (scaleSet *ScaleSet) buildScaleSetCache(lastRefresh time.Time) error { - vms, rerr := scaleSet.GetScaleSetVms() +// buildScaleSetCacheForFlex is used by orchestrationMode == compute.Flexible +func (scaleSet *ScaleSet) buildScaleSetCacheForFlex() error { + klog.V(3).Infof("buildScaleSetCacheForFlex: resetting instance Cache for scaleSet %s", + scaleSet.Name) + splay := rand.New(rand.NewSource(time.Now().UnixNano())).Intn(scaleSet.instancesRefreshJitter + 1) + lastRefresh := time.Now().Add(-time.Second * time.Duration(splay)) + + vms, rerr := scaleSet.GetFlexibleScaleSetVms() if rerr != nil { if isAzureRequestsThrottled(rerr) { // Log a warning and update the instance refresh time so that it would retry after cache expiration - klog.Warningf("GetScaleSetVms() is throttled with message %v, would return the cached instances", rerr) + klog.Warningf("GetFlexibleScaleSetVms() is throttled with message %v, would return the cached instances", rerr) scaleSet.lastInstanceRefresh = lastRefresh return nil } return rerr.Error() } - scaleSet.instanceCache = buildInstanceCache(vms) + scaleSet.instanceCache = buildInstanceCacheForFlex(vms) scaleSet.lastInstanceRefresh = lastRefresh return nil } -func (scaleSet *ScaleSet) buildScaleSetCacheForFlex(lastRefresh time.Time) error { - vms, rerr := scaleSet.GetFlexibleScaleSetVms() +func (scaleSet *ScaleSet) buildScaleSetCacheForUniform() error { + klog.V(3).Infof("updateInstanceCache: resetting instance Cache for scaleSet %s", + scaleSet.Name) + splay := rand.New(rand.NewSource(time.Now().UnixNano())).Intn(scaleSet.instancesRefreshJitter + 1) + lastRefresh := time.Now().Add(-time.Second * time.Duration(splay)) + vms, rerr := scaleSet.GetScaleSetVms() if rerr != nil { if isAzureRequestsThrottled(rerr) { - // Log a warning and update the instance refresh time so that it would retry after cache 
expiration - klog.Warningf("GetFlexibleScaleSetVms() is throttled with message %v, would return the cached instances", rerr) + // Log a warning and update the instance refresh time so that it would retry later. + // Ensure to retry no sooner than rerr.RetryAfter + klog.Warningf("updateInstanceCache: GetScaleSetVms() is throttled with message %v, would return the cached instances", rerr) + nextRefresh := lastRefresh.Add(scaleSet.instancesRefreshPeriod) + if nextRefresh.Before(rerr.RetryAfter) { + delay := rerr.RetryAfter.Sub(nextRefresh) + lastRefresh = lastRefresh.Add(delay) + } scaleSet.lastInstanceRefresh = lastRefresh return nil } return rerr.Error() } - scaleSet.instanceCache = buildInstanceCache(vms) + instances := []cloudprovider.Instance{} + // Note that the GetScaleSetVms() results is not used directly because for the List endpoint, + // their resource ID format is not consistent with Get endpoint + for i := range vms { + // The resource ID is empty string, which indicates the instance may be in deleting state. + if *vms[i].ID == "" { + continue + } + resourceID, err := convertResourceGroupNameToLower(*vms[i].ID) + if err != nil { + // This shouldn't happen. Log a warning message for tracking. + klog.Warningf("updateInstanceCache: buildInstanceCache.convertResourceGroupNameToLower failed with error: %v", err) + continue + } + + instances = append(instances, cloudprovider.Instance{ + Id: azurePrefix + resourceID, + Status: scaleSet.instanceStatusFromVM(&vms[i]), + }) + } + + scaleSet.instanceCache = instances scaleSet.lastInstanceRefresh = lastRefresh return nil @@ -615,32 +748,22 @@ func (scaleSet *ScaleSet) buildScaleSetCacheForFlex(lastRefresh time.Time) error // Note that the GetScaleSetVms() results is not used directly because for the List endpoint, // their resource ID format is not consistent with Get endpoint -func buildInstanceCache(vmList interface{}) []cloudprovider.Instance { - instances := []cloudprovider.Instance{} - - switch vms := vmList.(type) { - case []compute.VirtualMachineScaleSetVM: - for _, vm := range vms { - powerState := vmPowerStateRunning - if vm.InstanceView != nil && vm.InstanceView.Statuses != nil { - powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses) - } - addInstanceToCache(&instances, vm.ID, vm.ProvisioningState, powerState) - } - case []compute.VirtualMachine: - for _, vm := range vms { - powerState := vmPowerStateRunning - if vm.InstanceView != nil && vm.InstanceView.Statuses != nil { - powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses) - } - addInstanceToCache(&instances, vm.ID, vm.ProvisioningState, powerState) +// buildInstanceCacheForFlex used by orchestrationMode == compute.Flexible +func buildInstanceCacheForFlex(vms []compute.VirtualMachine) []cloudprovider.Instance { + var instances []cloudprovider.Instance + for _, vm := range vms { + powerState := vmPowerStateRunning + if vm.InstanceView != nil && vm.InstanceView.Statuses != nil { + powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses) } + addVMToCache(&instances, vm.ID, vm.ProvisioningState, powerState) } return instances } -func addInstanceToCache(instances *[]cloudprovider.Instance, id *string, provisioningState *string, powerState string) { +// addVMToCache used by orchestrationMode == compute.Flexible +func addVMToCache(instances *[]cloudprovider.Instance, id, provisioningState *string, powerState string) { // The resource ID is empty string, which indicates the instance may be in deleting state. 
if len(*id) == 0 { return @@ -654,41 +777,19 @@ func addInstanceToCache(instances *[]cloudprovider.Instance, id *string, provisi } *instances = append(*instances, cloudprovider.Instance{ - Id: "azure://" + resourceID, + Id: azurePrefix + resourceID, Status: instanceStatusFromProvisioningStateAndPowerState(resourceID, provisioningState, powerState), }) } -func (scaleSet *ScaleSet) getInstanceByProviderID(providerID string) (cloudprovider.Instance, bool) { - scaleSet.instanceMutex.Lock() - defer scaleSet.instanceMutex.Unlock() - for _, instance := range scaleSet.instanceCache { - if instance.Id == providerID { - return instance, true - } - } - return cloudprovider.Instance{}, false -} - -func (scaleSet *ScaleSet) setInstanceStatusByProviderID(providerID string, status cloudprovider.InstanceStatus) { - scaleSet.instanceMutex.Lock() - defer scaleSet.instanceMutex.Unlock() - for k, instance := range scaleSet.instanceCache { - if instance.Id == providerID { - klog.V(5).Infof("Setting instance %s status to %v", instance.Id, status) - scaleSet.instanceCache[k].Status = &status - } - } - scaleSet.lastInstanceRefresh = time.Now() -} - -// instanceStatusFromProvisioningStateAndPowerState converts the VM provisioning state and power state to cloudprovider.InstanceStatus -func instanceStatusFromProvisioningStateAndPowerState(resourceId string, provisioningState *string, powerState string) *cloudprovider.InstanceStatus { +// instanceStatusFromProvisioningStateAndPowerState converts the VM provisioning state to cloudprovider.InstanceStatus +// instanceStatusFromProvisioningStateAndPowerState used by orchestrationMode == compute.Flexible +func instanceStatusFromProvisioningStateAndPowerState(resourceID string, provisioningState *string, powerState string) *cloudprovider.InstanceStatus { if provisioningState == nil { return nil } - klog.V(5).Infof("Getting vm instance provisioning state %s for %s", *provisioningState, resourceId) + klog.V(5).Infof("Getting vm instance provisioning state %s for %s", *provisioningState, resourceID) status := &cloudprovider.InstanceStatus{} switch *provisioningState { @@ -702,7 +803,7 @@ func instanceStatusFromProvisioningStateAndPowerState(resourceId string, provisi // ProvisioningState represents the most recent provisioning state, therefore only report // InstanceCreating errors when the power state indicates the instance has not yet started running if !isRunningVmPowerState(powerState) { - klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", resourceId, powerState) + klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", resourceID, powerState) status.State = cloudprovider.InstanceCreating status.ErrorInfo = &cloudprovider.InstanceErrorInfo{ ErrorClass: cloudprovider.OutOfResourcesErrorClass, @@ -710,7 +811,7 @@ func instanceStatusFromProvisioningStateAndPowerState(resourceId string, provisi ErrorMessage: "Azure failed to provision a node for this node group", } } else { - klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", resourceId, powerState) + klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", resourceID, powerState) status.State = cloudprovider.InstanceRunning } default: @@ -720,11 +821,10 @@ func instanceStatusFromProvisioningStateAndPowerState(resourceId string, provisi return status } -func (scaleSet *ScaleSet) invalidateInstanceCache() { - scaleSet.instanceMutex.Lock() - // Set the instanceCache as outdated. 
- scaleSet.lastInstanceRefresh = time.Now().Add(-1 * scaleSet.instancesRefreshPeriod) - scaleSet.instanceMutex.Unlock() +func isSpot(vmss *compute.VirtualMachineScaleSet) bool { + return vmss != nil && vmss.VirtualMachineScaleSetProperties != nil && + vmss.VirtualMachineScaleSetProperties.VirtualMachineProfile != nil && + vmss.VirtualMachineScaleSetProperties.VirtualMachineProfile.Priority == compute.Spot } func (scaleSet *ScaleSet) invalidateLastSizeRefreshWithLock() { @@ -741,3 +841,43 @@ func (scaleSet *ScaleSet) getOrchestrationMode() (compute.OrchestrationMode, err } return vmss.OrchestrationMode, nil } + +func (scaleSet *ScaleSet) cseErrors(extensions *[]compute.VirtualMachineExtensionInstanceView) ([]string, bool) { + var errs []string + failed := false + if extensions != nil { + for _, extension := range *extensions { + if strings.EqualFold(to.String(extension.Name), vmssCSEExtensionName) && extension.Statuses != nil { + for _, status := range *extension.Statuses { + if status.Level == "Error" { + errs = append(errs, to.String(status.Message)) + failed = true + } + } + } + } + } + return errs, failed +} + +func (scaleSet *ScaleSet) getSKU() string { + vmssInfo, err := scaleSet.getVMSSFromCache() + if err != nil { + klog.Errorf("Failed to get information for VMSS (%q): %v", scaleSet.Name, err) + return "" + } + return to.String(vmssInfo.Sku.Name) +} + +func (scaleSet *ScaleSet) verifyNodeGroup(instance *azureRef, commonNgID string) error { + ng, err := scaleSet.manager.GetNodeGroupForInstance(instance) + if err != nil { + return err + } + + if !strings.EqualFold(ng.Id(), commonNgID) { + return fmt.Errorf("cannot delete instance (%s) which doesn't belong to the same Scale Set (%q)", + instance.Name, commonNgID) + } + return nil +} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache.go new file mode 100644 index 000000000000..5b6843caf412 --- /dev/null +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache.go @@ -0,0 +1,260 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package azure + +import ( + "fmt" + "sync" + "time" + + "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute" + "github.com/Azure/go-autorest/autorest/to" + + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" + "k8s.io/klog/v2" +) + +/* +- "instanceCache" is included in the scaleSet data structures and holds +status information of the instances / vms. This data is used by the CAS +to make scaleUp / scaleDown decisions based on the current state of +the cluster without making an api call. +- The time for this cache is represented by "instancesRefreshPeriod" which +by default is defaultVmssInstancesRefreshPeriod ~ 5 mins. +- "lastInstanceRefresh" represents the time when the cache was validated +the last time.
+- The following methods are defined for the instanceCache: + - invalidateInstanceCache() + - validateInstanceCache() + - validateInstanceCacheWithoutLock() + - updateInstanceCache() + - getInstanceByProviderID() + - getInstancesByState() + - getInstanceCacheSize() + - setInstanceStatusByProviderID() +*/ + +// InstanceCache tracks the VMs in the ScaleSet, in the form of corresponding cloudprovider.Instances. +// This struct also contains related locks and cache interval variables. +type InstanceCache struct { + // instanceCache tracks the VMs in the ScaleSet, in the form of corresponding cloudprovider.Instances. + // instanceCache directly backs the efficient response to NodeGroup.Nodes(), implemented by ScaleSet.Nodes(). + // It is periodically updated from VMSS using virtualMachineScaleSetVMsClient.List(). + instanceCache []cloudprovider.Instance + // instancesRefreshPeriod is how often instance cache is refreshed from VMSS. + // (Set from VmssVmsCacheTTL or defaultVmssInstancesRefreshPeriod = 5min) + instancesRefreshPeriod time.Duration + // lastInstanceRefresh is the time instanceCache was last refreshed from VMSS. + // Together with instancesRefreshPeriod, it is used to determine if it is time to refresh instanceCache. + lastInstanceRefresh time.Time + // instancesRefreshJitter (in seconds) is used to ensure refreshes (which involve expensive List call) + // don't happen at exactly the same time on all ScaleSets + instancesRefreshJitter int + // instanceMutex is used for protecting instance cache from concurrent access + instanceMutex sync.Mutex +} + +// invalidateInstanceCache invalidates the instanceCache by modifying the lastInstanceRefresh. +func (scaleSet *ScaleSet) invalidateInstanceCache() { + scaleSet.instanceMutex.Lock() + defer scaleSet.instanceMutex.Unlock() + // Set the instanceCache as outdated. + klog.V(3).Infof("invalidating instanceCache for %s", scaleSet.Name) + scaleSet.lastInstanceRefresh = time.Now().Add(-1 * scaleSet.instancesRefreshPeriod) +} + +// validateInstanceCache updates the instanceCache if it has expired. It acquires lock. +func (scaleSet *ScaleSet) validateInstanceCache() error { + scaleSet.instanceMutex.Lock() + defer scaleSet.instanceMutex.Unlock() + return scaleSet.validateInstanceCacheWithoutLock() +} + +// validateInstanceCacheWithoutLock is used as a helper function for validateInstanceCache, get and set methods. +func (scaleSet *ScaleSet) validateInstanceCacheWithoutLock() error { + if scaleSet.lastInstanceRefresh.Add(scaleSet.instancesRefreshPeriod).After(time.Now()) { + klog.V(3).Infof("validateInstanceCacheWithoutLock: no need to reset instance Cache for scaleSet %s", + scaleSet.Name) + return nil + } + + return scaleSet.updateInstanceCache() +} + +// updateInstanceCache forcefully updates the cache without checking the timer - lastInstanceRefresh. +// Caller is responsible for acquiring lock on the instanceCache.
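For readers skimming the cache helpers above: the freshness test that validateInstanceCacheWithoutLock applies (and that invalidateInstanceCache defeats by rewinding lastInstanceRefresh by one period) reduces to a single time comparison. A minimal standalone sketch under those field names, not part of the patch itself:

    // cacheIsFresh mirrors the TTL gate used by validateInstanceCacheWithoutLock.
    // invalidateInstanceCache forces the next call to miss by moving lastRefresh
    // back by exactly one refreshPeriod.
    func cacheIsFresh(lastRefresh time.Time, refreshPeriod time.Duration) bool {
        return lastRefresh.Add(refreshPeriod).After(time.Now())
    }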
+func (scaleSet *ScaleSet) updateInstanceCache() error { + orchestrationMode, err := scaleSet.getOrchestrationMode() + if err != nil { + klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err) + return err + } + + if orchestrationMode == compute.Flexible { + if scaleSet.manager.config.EnableVmssFlex { + return scaleSet.buildScaleSetCacheForFlex() + } + return fmt.Errorf("vmss - %q with Flexible orchestration detected but 'enableVmssFlex' feature flag is turned off", scaleSet.Name) + } else if orchestrationMode == compute.Uniform { + return scaleSet.buildScaleSetCacheForUniform() + } + + return fmt.Errorf("failed to determine orchestration mode for vmss %q", scaleSet.Name) +} + +// getInstanceByProviderID returns the instance from instanceCache if the given providerID exists. +func (scaleSet *ScaleSet) getInstanceByProviderID(providerID string) (cloudprovider.Instance, bool, error) { + scaleSet.instanceMutex.Lock() + defer scaleSet.instanceMutex.Unlock() + + err := scaleSet.validateInstanceCacheWithoutLock() + if err != nil { + klog.Errorf("getInstanceByProviderID: error validating instanceCache for providerID %s for scaleSet %s, err: %v", + providerID, scaleSet.Name, err) + return cloudprovider.Instance{}, false, err + } + + for _, instance := range scaleSet.instanceCache { + if instance.Id == providerID { + return instance, true, nil + } + } + return cloudprovider.Instance{}, false, nil +} + +// getInstancesByState returns the list of instances with the given state. +func (scaleSet *ScaleSet) getInstancesByState(state cloudprovider.InstanceState) ([]cloudprovider.Instance, error) { + scaleSet.instanceMutex.Lock() + defer scaleSet.instanceMutex.Unlock() + + err := scaleSet.validateInstanceCacheWithoutLock() + if err != nil { + klog.Errorf("getInstancesByState: error validating instanceCache for state %d for scaleSet %s, "+ + "err: %v", state, scaleSet.Name, err) + return []cloudprovider.Instance{}, err + } + + instances := []cloudprovider.Instance{} + for _, instance := range scaleSet.instanceCache { + if instance.Status != nil && instance.Status.State == state { + instances = append(instances, instance) + } + } + return instances, nil +} + +// getInstanceCacheSize returns the size of the instanceCache. +func (scaleSet *ScaleSet) getInstanceCacheSize() (int64, error) { + scaleSet.instanceMutex.Lock() + defer scaleSet.instanceMutex.Unlock() + + err := scaleSet.validateInstanceCacheWithoutLock() + if err != nil { + klog.Errorf("getInstanceCacheSize: error validating instanceCache for scaleSet: %s, "+ + "err: %v", scaleSet.Name, err) + return -1, err + } + + return int64(len(scaleSet.instanceCache)), nil +} + +// setInstanceStatusByProviderID sets the status for an instance with the given providerID. +// It resets the cache if stale and sets the status by acquiring a lock.
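Since the cache getters above now return an error in addition to the lookup result, call sites have to handle all three values; a hypothetical caller (not part of this patch) following the same pattern DeleteInstances uses would look like:

    instance, found, err := scaleSet.getInstanceByProviderID(providerID)
    if err != nil {
        // Cache validation failed; surface the error instead of acting on stale data.
        return err
    }
    if found && instance.Status != nil && instance.Status.State == cloudprovider.InstanceDeleting {
        // Already being deleted; skip it, as DeleteInstances does.
    }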
+func (scaleSet *ScaleSet) setInstanceStatusByProviderID(providerID string, status cloudprovider.InstanceStatus) { + scaleSet.instanceMutex.Lock() + defer scaleSet.instanceMutex.Unlock() + + err := scaleSet.validateInstanceCacheWithoutLock() + if err != nil { + klog.Errorf("setInstanceStatusByProviderID: error validating instanceCache for providerID %s for "+ + "scaleSet: %s, err: %v", providerID, scaleSet.Name, err) + // return no error because CAS runs with the expectation that future runs will refresh instance Cache + } + + for k, instance := range scaleSet.instanceCache { + if instance.Id == providerID { + klog.V(3).Infof("setInstanceStatusByProviderID: setting instance state for %s for scaleSet "+ + "%s to %d", instance.Id, scaleSet.Name, status.State) + scaleSet.instanceCache[k].Status = &status + break + } + } +} + +// instanceStatusFromVM converts the VM provisioning state to cloudprovider.InstanceStatus. +func (scaleSet *ScaleSet) instanceStatusFromVM(vm *compute.VirtualMachineScaleSetVM) *cloudprovider.InstanceStatus { + // Prefer the proactive cache view of the instance state if we aren't in a terminal state + // This is because the power state may be taking longer to update and we don't want + // an unfortunate VM update (TTL 5 min) to reset that state to running. + if vm.ProvisioningState == nil || *vm.ProvisioningState == string(compute.GalleryProvisioningStateUpdating) { + resourceID, _ := convertResourceGroupNameToLower(*vm.ID) + providerID := azurePrefix + resourceID + for _, instance := range scaleSet.instanceCache { + if instance.Id == providerID { + return instance.Status + } + } + return nil + } + + status := &cloudprovider.InstanceStatus{} + switch *vm.ProvisioningState { + case string(compute.GalleryProvisioningStateDeleting): + status.State = cloudprovider.InstanceDeleting + case string(compute.GalleryProvisioningStateCreating): + status.State = cloudprovider.InstanceCreating + case string(compute.GalleryProvisioningStateFailed): + powerState := vmPowerStateRunning + if vm.InstanceView != nil && vm.InstanceView.Statuses != nil { + powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses) + } + + // Provisioning can fail both during instance creation or after the instance is running. 
+ // Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states, + // ProvisioningState represents the most recent provisioning state, therefore only report + // InstanceCreating errors when the power state indicates the instance has not yet started running + if !isRunningVmPowerState(powerState) { + klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", *vm.ID, powerState) + status.State = cloudprovider.InstanceCreating + status.ErrorInfo = &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OutOfResourcesErrorClass, + ErrorCode: "provisioning-state-failed", + ErrorMessage: "Azure failed to provision a node for this node group", + } + } else { + klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", *vm.ID, powerState) + status.State = cloudprovider.InstanceRunning + } + default: + status.State = cloudprovider.InstanceRunning + } + + // Add vmssCSE Provisioning Failed Message in error info body for vmssCSE Extensions if enableDetailedCSEMessage is true + if scaleSet.enableDetailedCSEMessage && vm.InstanceView != nil { + if err, failed := scaleSet.cseErrors(vm.InstanceView.Extensions); failed { + errorInfo := &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OtherErrorClass, + ErrorCode: vmssExtensionProvisioningFailed, + ErrorMessage: fmt.Sprintf("%s: %v", to.String(vm.Name), err), + } + status.ErrorInfo = errorInfo + } + } + + return status +} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache_test.go new file mode 100644 index 000000000000..f491897a8041 --- /dev/null +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_instance_cache_test.go @@ -0,0 +1,55 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package azure + +import ( + "fmt" + "testing" + "time" + + "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute" + "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" + + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" + + "sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssvmclient/mockvmssvmclient" +) + +var ( + ctrl *gomock.Controller + currentTime, expiredTime time.Time + provider *AzureCloudProvider + scaleSet *ScaleSet + mockVMSSVMClient *mockvmssvmclient.MockInterface + expectedVMSSVMs []compute.VirtualMachineScaleSetVM + expectedStates []cloudprovider.InstanceState + instanceCache, expectedInstanceCache []cloudprovider.Instance +) + +func testGetInstanceCacheWithStates(t *testing.T, vms []compute.VirtualMachineScaleSetVM, + states []cloudprovider.InstanceState) []cloudprovider.Instance { + assert.Equal(t, len(vms), len(states)) + var instanceCacheTest []cloudprovider.Instance + for i := 0; i < len(vms); i++ { + instanceCacheTest = append(instanceCacheTest, cloudprovider.Instance{ + Id: azurePrefix + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, i), + Status: &cloudprovider.InstanceStatus{State: states[i]}, + }) + } + return instanceCacheTest +} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index 85ee14c4c483..853054cbb354 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -20,12 +20,14 @@ import ( "fmt" "net/http" "testing" + "time" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute" "github.com/Azure/go-autorest/autorest/to" - "github.com/golang/mock/gomock" "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" apiv1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmclient/mockvmclient" "sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssclient/mockvmssclient" @@ -37,9 +39,22 @@ func newTestScaleSet(manager *AzureManager, name string) *ScaleSet { azureRef: azureRef{ Name: name, }, - manager: manager, - minSize: 1, - maxSize: 5, + manager: manager, + minSize: 1, + maxSize: 5, + enableForceDelete: manager.config.EnableForceDelete, + } +} + +func newTestScaleSetMinSizeZero(manager *AzureManager, name string) *ScaleSet { + return &ScaleSet{ + azureRef: azureRef{ + Name: name, + }, + manager: manager, + minSize: 0, + maxSize: 5, + enableForceDelete: manager.config.EnableForceDelete, } } @@ -60,6 +75,22 @@ func newTestVMSSList(cap int64, name, loc string, orchmode compute.Orchestration } } +func newTestVMSSListForEdgeZones(capacity int64, name string) *compute.VirtualMachineScaleSet { + return &compute.VirtualMachineScaleSet{ + Name: to.StringPtr(name), + Sku: &compute.Sku{ + Capacity: to.Int64Ptr(capacity), + Name: to.StringPtr("Standard_D4_v2"), + }, + VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{}, + Location: to.StringPtr("eastus"), + ExtendedLocation: &compute.ExtendedLocation{ + Name: to.StringPtr("losangeles"), + Type: compute.ExtendedLocationTypes("EdgeZone"), + }, + } +} + func newTestVMSSVMList(count int) []compute.VirtualMachineScaleSetVM { var vmssVMList []compute.VirtualMachineScaleSetVM for i := 0; i < count; i++ { @@ -98,7 +129,7 @@ func newApiNode(orchmode compute.OrchestrationMode, vmID int64) *apiv1.Node { node := &apiv1.Node{ Spec: apiv1.NodeSpec{ 
- ProviderID: "azure://" + fmt.Sprintf(providerId, vmID), + ProviderID: azurePrefix + fmt.Sprintf(providerId, vmID), }, } return node @@ -121,13 +152,25 @@ func TestMinSize(t *testing.T) { assert.Equal(t, provider.NodeGroups()[0].MinSize(), 1) } +func TestMinSizeZero(t *testing.T) { + provider := newTestProvider(t) + registered := provider.azureManager.RegisterNodeGroup( + newTestScaleSetMinSizeZero(provider.azureManager, testASG)) + assert.True(t, registered) + assert.Equal(t, len(provider.NodeGroups()), 1) + assert.Equal(t, provider.NodeGroups()[0].MinSize(), 0) +} + func TestTargetSize(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() orchestrationModes := [2]compute.OrchestrationMode{compute.Uniform, compute.Flexible} - expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", compute.Uniform) + spotScaleSet := newTestVMSSList(5, "spot-vmss", "eastus", compute.Uniform)[0] + spotScaleSet.VirtualMachineProfile = &compute.VirtualMachineScaleSetVMProfile{Priority: compute.Spot} + expectedScaleSets = append(expectedScaleSets, spotScaleSet) + expectedVMSSVMs := newTestVMSSVMList(3) expectedVMs := newTestVMList(3) @@ -140,28 +183,51 @@ func TestTargetSize(t *testing.T) { mockVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return([]compute.VirtualMachine{}, nil).AnyTimes() provider.azureManager.azClient.virtualMachinesClient = mockVMClient - if orchMode == compute.Uniform { + // return a different capacity from GET API + spotScaleSet.Sku.Capacity = to.Int64Ptr(1) + mockVMSSClient.EXPECT().Get(gomock.Any(), provider.azureManager.config.ResourceGroup, "spot-vmss").Return(spotScaleSet, nil).Times(1) + provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient + mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) + + mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() + provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient + err := provider.azureManager.forceRefresh() + assert.NoError(t, err) + if orchMode == compute.Uniform { mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient - } else { provider.azureManager.config.EnableVmssFlex = true mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), "test-asg").Return(expectedVMs, nil).AnyTimes() } - err := provider.azureManager.forceRefresh() + err = provider.azureManager.forceRefresh() assert.NoError(t, err) registered := provider.azureManager.RegisterNodeGroup( - newTestScaleSet(provider.azureManager, "test-asg")) + newTestScaleSet(provider.azureManager, testASG)) assert.True(t, registered) assert.Equal(t, len(provider.NodeGroups()), 1) targetSize, err := provider.NodeGroups()[0].TargetSize() assert.NoError(t, err) assert.Equal(t, 3, targetSize) + + targetSize, err = provider.NodeGroups()[0].TargetSize() + assert.NoError(t, err) + assert.Equal(t, 3, targetSize) + + // With a spot nodegroup + spotNodeGroup := newTestScaleSet(provider.azureManager, "spot-vmss") + registered = provider.azureManager.RegisterNodeGroup(spotNodeGroup) + assert.True(t, registered) + assert.Equal(t, len(provider.NodeGroups()), 2) + + targetSize, err = provider.NodeGroups()[1].TargetSize() + assert.NoError(t, 
err) + assert.Equal(t, 1, targetSize) } } @@ -171,13 +237,18 @@ func TestIncreaseSize(t *testing.T) { orchestrationModes := [2]compute.OrchestrationMode{compute.Uniform, compute.Flexible} - expectedVMSSVMs := newTestVMSSVMList(3) - expectedVMs := newTestVMList(3) - for _, orchMode := range orchestrationModes { + expectedScaleSets := newTestVMSSList(3, testASG, "eastus", orchMode) + expectedVMSSVMs := newTestVMSSVMList(3) + expectedVMs := newTestVMList(3) + + // Include Edge Zone scenario here, testing scale from 3 to 5 and scale from zero cases. + expectedEdgeZoneScaleSets := newTestVMSSListForEdgeZones(3, "edgezone-vmss") + expectedEdgeZoneMinZeroScaleSets := newTestVMSSListForEdgeZones(0, "edgezone-minzero-vmss") + expectedScaleSets = append(expectedScaleSets, *expectedEdgeZoneScaleSets, *expectedEdgeZoneMinZeroScaleSets) + provider := newTestProvider(t) - expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", orchMode) mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes() @@ -189,12 +260,10 @@ func TestIncreaseSize(t *testing.T) { provider.azureManager.azClient.virtualMachinesClient = mockVMClient if orchMode == compute.Uniform { - mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient } else { - provider.azureManager.config.EnableVmssFlex = true mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), "test-asg").Return(expectedVMs, nil).AnyTimes() } @@ -207,26 +276,69 @@ func TestIncreaseSize(t *testing.T) { assert.Equal(t, expectedErr, err) registered := provider.azureManager.RegisterNodeGroup( - newTestScaleSet(provider.azureManager, "test-asg")) + newTestScaleSet(provider.azureManager, testASG)) assert.True(t, registered) assert.Equal(t, len(provider.NodeGroups()), 1) - // current target size is 2. + // Current target size is 3. targetSize, err := provider.NodeGroups()[0].TargetSize() assert.NoError(t, err) assert.Equal(t, 3, targetSize) - // increase 3 nodes. + // Increase 2 nodes. err = provider.NodeGroups()[0].IncreaseSize(2) assert.NoError(t, err) - // new target size should be 5. + // New target size should be 5. targetSize, err = provider.NodeGroups()[0].TargetSize() assert.NoError(t, err) assert.Equal(t, 5, targetSize) + + // Testing Edge Zone scenario. Scale from 3 to 5. + registeredForEdgeZone := provider.azureManager.RegisterNodeGroup( + newTestScaleSet(provider.azureManager, "edgezone-vmss")) + assert.True(t, registeredForEdgeZone) + assert.Equal(t, len(provider.NodeGroups()), 2) + + targetSizeForEdgeZone, err := provider.NodeGroups()[1].TargetSize() + assert.NoError(t, err) + assert.Equal(t, 3, targetSizeForEdgeZone) + + mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), provider.azureManager.config.ResourceGroup, + "edgezone-vmss", gomock.Any()).Return(nil, nil) + err = provider.NodeGroups()[1].IncreaseSize(2) + assert.NoError(t, err) + + targetSizeForEdgeZone, err = provider.NodeGroups()[1].TargetSize() + assert.NoError(t, err) + assert.Equal(t, 5, targetSizeForEdgeZone) + + // Testing Edge Zone scenario scaleFromZero case. Scale from 0 to 2. 
+ registeredForEdgeZoneMinZero := provider.azureManager.RegisterNodeGroup( + newTestScaleSetMinSizeZero(provider.azureManager, "edgezone-minzero-vmss")) + assert.True(t, registeredForEdgeZoneMinZero) + assert.Equal(t, len(provider.NodeGroups()), 3) + + // Current target size is 0. + targetSizeForEdgeZoneMinZero, err := provider.NodeGroups()[2].TargetSize() + assert.NoError(t, err) + assert.Equal(t, 0, targetSizeForEdgeZoneMinZero) + + mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), provider.azureManager.config.ResourceGroup, + "edgezone-minzero-vmss", gomock.Any()).Return(nil, nil) + err = provider.NodeGroups()[2].IncreaseSize(2) + assert.NoError(t, err) + + // New target size should be 2. + targetSizeForEdgeZoneMinZero, err = provider.NodeGroups()[2].TargetSize() + assert.NoError(t, err) + assert.Equal(t, 2, targetSizeForEdgeZoneMinZero) } } +// TestIncreaseSizeOnVMProvisioningFailed has been tweaked only for Uniform Orchestration mode. +// If ProvisioningState == failed, Status.State == InstanceFailed for all the cases. +// Expected results would be different for Flexible orchestration mode but that is not tested in AKS. func TestIncreaseSizeOnVMProvisioningFailed(t *testing.T) { testCases := map[string]struct { expectInstanceRunning bool @@ -259,6 +371,7 @@ func TestIncreaseSizeOnVMProvisioningFailed(t *testing.T) { expectedScaleSets := newTestVMSSList(3, "vmss-failed-upscale", "eastus", compute.Uniform) expectedVMSSVMs := newTestVMSSVMList(3) + // The failed provisioning state set on expectedVMSSVMs[2] below is the important part of this setup expectedVMs := newTestVMList(3) expectedVMSSVMs[2].ProvisioningState = to.StringPtr(provisioningStateFailed) if !testCase.isMissingInstanceView { @@ -273,9 +386,11 @@ func TestIncreaseSizeOnVMProvisioningFailed(t *testing.T) { mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "vmss-failed-upscale", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient + mockVMClient := mockvmclient.NewMockInterface(ctrl) mockVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedVMs, nil).AnyTimes() manager.azClient.virtualMachinesClient = mockVMClient + manager.explicitlyConfigured["vmss-failed-upscale"] = true registered := manager.RegisterNodeGroup(newTestScaleSet(manager, vmssName)) assert.True(t, registered) @@ -320,7 +435,7 @@ func TestIncreaseSizeOnVMSSUpdating(t *testing.T) { Capacity: &vmssCapacity, }, VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{ - ProvisioningState: to.StringPtr(provisioningStateUpdating), + ProvisioningState: to.StringPtr(string(compute.GalleryProvisioningStateUpdating)), OrchestrationMode: compute.Uniform, }, }, @@ -329,16 +444,21 @@ func TestIncreaseSizeOnVMSSUpdating(t *testing.T) { mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil) - mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), manager.config.ResourceGroup, vmssName, gomock.Any()).Return(nil, nil) - mockVMSSClient.EXPECT().WaitForCreateOrUpdateResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() + mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), manager.config.ResourceGroup, vmssName, gomock.Any()).Return( + nil, nil) + mockVMSSClient.EXPECT().WaitForCreateOrUpdateResult(gomock.Any(), gomock.Any(),
manager.config.ResourceGroup).Return( + &http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) - mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "vmss-updating", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() + mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "vmss-updating", + gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient manager.explicitlyConfigured["vmss-updating"] = true registered := manager.RegisterNodeGroup(newTestScaleSet(manager, vmssName)) assert.True(t, registered) - manager.Refresh() + + err := manager.Refresh() + assert.NoError(t, err) provider, err := BuildAzureCloudProvider(manager, nil) assert.NoError(t, err) @@ -359,7 +479,6 @@ func TestBelongs(t *testing.T) { expectedVMs := newTestVMList(3) for _, orchMode := range orchestrationModes { - expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", orchMode) provider := newTestProvider(t) mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) @@ -370,32 +489,30 @@ func TestBelongs(t *testing.T) { provider.azureManager.azClient.virtualMachinesClient = mockVMClient if orchMode == compute.Uniform { - mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient - } else { - provider.azureManager.config.EnableVmssFlex = true mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), "test-asg").Return(expectedVMs, nil).AnyTimes() } registered := provider.azureManager.RegisterNodeGroup( - newTestScaleSet(provider.azureManager, "test-asg")) + newTestScaleSet(provider.azureManager, testASG)) assert.True(t, registered) scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet) assert.True(t, ok) provider.azureManager.explicitlyConfigured["test-asg"] = true - provider.azureManager.Refresh() + err := provider.azureManager.Refresh() + assert.NoError(t, err) invalidNode := &apiv1.Node{ Spec: apiv1.NodeSpec{ - ProviderID: "azure:///subscriptions/test-subscrition-id/resourcegroups/invalid-asg/providers/microsoft.compute/virtualmachinescalesets/agents/virtualmachines/0", + ProviderID: azurePrefix + "/subscriptions/test-subscrition-id/resourcegroups/invalid-asg/providers/microsoft.compute/virtualmachinescalesets/agents/virtualmachines/0", }, } - _, err := scaleSet.Belongs(invalidNode) + _, err = scaleSet.Belongs(invalidNode) assert.Error(t, err) validNode := newApiNode(orchMode, 0) @@ -403,7 +520,6 @@ func TestBelongs(t *testing.T) { assert.Equal(t, true, belongs) assert.NoError(t, err) } - } func TestDeleteNodes(t *testing.T) { @@ -438,7 +554,7 @@ func TestDeleteNodes(t *testing.T) { } else { manager.config.EnableVmssFlex = true mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), "test-asg").Return(expectedVMs, nil).AnyTimes() - + manager.azClient.virtualMachinesClient = mockVMClient } err := manager.forceRefresh() @@ -452,8 +568,8 @@ func TestDeleteNodes(t *testing.T) { assert.NoError(t, err) registered := manager.RegisterNodeGroup( - newTestScaleSet(manager, "test-asg")) - manager.explicitlyConfigured["test-asg"] = true + newTestScaleSet(manager, testASG)) + manager.explicitlyConfigured[testASG] = true assert.True(t, 
registered) err = manager.forceRefresh() @@ -498,14 +614,19 @@ func TestDeleteNodes(t *testing.T) { assert.Equal(t, 1, targetSize) // Ensure that the status for the instances is Deleting - instance0, found := scaleSet.getInstanceByProviderID(nodesToDelete[0].Spec.ProviderID) + // lastInstanceRefresh is set to time.Now() to avoid resetting instanceCache. + scaleSet.lastInstanceRefresh = time.Now() + instance0, found, err := scaleSet.getInstanceByProviderID(nodesToDelete[0].Spec.ProviderID) assert.True(t, found, true) + assert.NoError(t, err) assert.Equal(t, instance0.Status.State, cloudprovider.InstanceDeleting) - instance2, found := scaleSet.getInstanceByProviderID(nodesToDelete[1].Spec.ProviderID) + // lastInstanceRefresh is set to time.Now() to avoid resetting instanceCache. + scaleSet.lastInstanceRefresh = time.Now() + instance2, found, err := scaleSet.getInstanceByProviderID(nodesToDelete[1].Spec.ProviderID) assert.True(t, found, true) + assert.NoError(t, err) assert.Equal(t, instance2.Status.State, cloudprovider.InstanceDeleting) - } } @@ -534,12 +655,10 @@ func TestDeleteNodeUnregistered(t *testing.T) { manager.azClient.virtualMachinesClient = mockVMClient if orchMode == compute.Uniform { - mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient } else { - manager.config.EnableVmssFlex = true mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), "test-asg").Return(expectedVMs, nil).AnyTimes() } @@ -549,18 +668,20 @@ func TestDeleteNodeUnregistered(t *testing.T) { resourceLimiter := cloudprovider.NewResourceLimiter( map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000}, map[string]int64{cloudprovider.ResourceNameCores: 10, cloudprovider.ResourceNameMemory: 100000000}) + provider, err := BuildAzureCloudProvider(manager, resourceLimiter) assert.NoError(t, err) registered := manager.RegisterNodeGroup( - newTestScaleSet(manager, "test-asg")) - manager.explicitlyConfigured["test-asg"] = true + newTestScaleSet(manager, testASG)) + manager.explicitlyConfigured[testASG] = true assert.True(t, registered) err = manager.forceRefresh() assert.NoError(t, err) scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet) assert.True(t, ok) + scaleSet.instancesRefreshPeriod = defaultVmssInstancesRefreshPeriod targetSize, err := scaleSet.TargetSize() assert.NoError(t, err) @@ -584,10 +705,126 @@ func TestDeleteNodeUnregistered(t *testing.T) { assert.Equal(t, 2, targetSize) // Ensure that the status for the instances is Deleting - instance0, found := scaleSet.getInstanceByProviderID(nodesToDelete[0].Spec.ProviderID) + // lastInstanceRefresh is set to time.Now() to avoid resetting instanceCache. 
+ scaleSet.lastInstanceRefresh = time.Now() + instance0, found, err := scaleSet.getInstanceByProviderID(nodesToDelete[0].Spec.ProviderID) assert.True(t, found, true) - assert.Equal(t, instance0.Status.State, cloudprovider.InstanceDeleting) + assert.NoError(t, err) + assert.Equal(t, cloudprovider.InstanceDeleting, instance0.Status.State) } +} + +func TestDeleteInstancesWithForceDeleteEnabled(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + manager := newTestAzureManager(t) + // enabling forceDelete + manager.config.EnableForceDelete = true + + vmssName := "test-asg" + var vmssCapacity int64 = 3 + //hostGroupId := "test-hostGroup" + //hostGroup := &compute.SubResource{ + // ID: &hostGroupId, + //} + + expectedScaleSets := []compute.VirtualMachineScaleSet{ + { + Name: &vmssName, + Sku: &compute.Sku{ + Capacity: &vmssCapacity, + }, + VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{ + OrchestrationMode: compute.Uniform, + }, + }, + } + expectedVMSSVMs := newTestVMSSVMList(3) + + mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) + mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).Times(2) + mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any(), true).Return(nil, nil) + mockVMSSClient.EXPECT().WaitForDeleteInstancesResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() + manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient + mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) + mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() + manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient + err := manager.forceRefresh() + assert.NoError(t, err) + + resourceLimiter := cloudprovider.NewResourceLimiter( + map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000}, + map[string]int64{cloudprovider.ResourceNameCores: 10, cloudprovider.ResourceNameMemory: 100000000}) + provider, err := BuildAzureCloudProvider(manager, resourceLimiter) + assert.NoError(t, err) + + registered := manager.RegisterNodeGroup( + newTestScaleSet(manager, "test-asg")) + manager.explicitlyConfigured["test-asg"] = true + assert.True(t, registered) + err = manager.forceRefresh() + assert.NoError(t, err) + + scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet) + assert.True(t, ok) + + targetSize, err := scaleSet.TargetSize() + assert.NoError(t, err) + assert.Equal(t, 3, targetSize) + + // Perform the delete operation + nodesToDelete := []*apiv1.Node{ + { + Spec: apiv1.NodeSpec{ + ProviderID: azurePrefix + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0), + }, + }, + { + Spec: apiv1.NodeSpec{ + ProviderID: azurePrefix + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 2), + }, + }, + } + err = scaleSet.DeleteNodes(nodesToDelete) + assert.NoError(t, err) + vmssCapacity = 1 + expectedScaleSets = []compute.VirtualMachineScaleSet{ + { + Name: &vmssName, + Sku: &compute.Sku{ + Capacity: &vmssCapacity, + }, + VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{ + OrchestrationMode: compute.Uniform, + }, + }, + } + mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes() + expectedVMSSVMs[0].ProvisioningState = 
to.StringPtr(string(compute.GalleryProvisioningStateDeleting)) + expectedVMSSVMs[2].ProvisioningState = to.StringPtr(string(compute.GalleryProvisioningStateDeleting)) + mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() + err = manager.forceRefresh() + assert.NoError(t, err) + + // Ensure the the cached size has been proactively decremented by 2 + targetSize, err = scaleSet.TargetSize() + assert.NoError(t, err) + assert.Equal(t, 1, targetSize) + + // Ensure that the status for the instances is Deleting + // lastInstanceRefresh is set to time.Now() to avoid resetting instanceCache. + scaleSet.lastInstanceRefresh = time.Now() + instance0, found, err := scaleSet.getInstanceByProviderID(azurePrefix + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0)) + assert.True(t, found, true) + assert.NoError(t, err) + assert.Equal(t, instance0.Status.State, cloudprovider.InstanceDeleting) + + // lastInstanceRefresh is set to time.Now() to avoid resetting instanceCache. + scaleSet.lastInstanceRefresh = time.Now() + instance2, found, err := scaleSet.getInstanceByProviderID(azurePrefix + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 2)) + assert.True(t, found, true) + assert.NoError(t, err) + assert.Equal(t, instance2.Status.State, cloudprovider.InstanceDeleting) } @@ -642,7 +879,7 @@ func TestDeleteNoConflictRequest(t *testing.T) { node := &apiv1.Node{ Spec: apiv1.NodeSpec{ - ProviderID: "azure://" + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0), + ProviderID: azurePrefix + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0), }, } @@ -680,7 +917,6 @@ func TestScaleSetNodes(t *testing.T) { expectedVMs := newTestVMList(3) for _, orchMode := range orchestrationModes { - expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", orchMode) provider := newTestProvider(t) @@ -692,7 +928,6 @@ func TestScaleSetNodes(t *testing.T) { provider.azureManager.azClient.virtualMachinesClient = mockVMClient if orchMode == compute.Uniform { - mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient @@ -705,7 +940,9 @@ func TestScaleSetNodes(t *testing.T) { registered := provider.azureManager.RegisterNodeGroup( newTestScaleSet(provider.azureManager, "test-asg")) provider.azureManager.explicitlyConfigured["test-asg"] = true - provider.azureManager.Refresh() + err := provider.azureManager.Refresh() + assert.NoError(t, err) + assert.True(t, registered) assert.Equal(t, len(provider.NodeGroups()), 1) @@ -713,11 +950,12 @@ func TestScaleSetNodes(t *testing.T) { group, err := provider.NodeGroupForNode(node) assert.NoError(t, err) assert.NotNil(t, group, "Group should not be nil") - assert.Equal(t, group.Id(), "test-asg") + assert.Equal(t, group.Id(), testASG) assert.Equal(t, group.MinSize(), 1) assert.Equal(t, group.MaxSize(), 5) ss, ok := group.(*ScaleSet) + ss.lastInstanceRefresh = time.Now() assert.True(t, ok) assert.NotNil(t, ss) instances, err := group.Nodes() @@ -726,14 +964,14 @@ func TestScaleSetNodes(t *testing.T) { if orchMode == compute.Uniform { - assert.Equal(t, instances[0], cloudprovider.Instance{Id: "azure://" + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0)}) - assert.Equal(t, instances[1], cloudprovider.Instance{Id: "azure://" + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 1)}) - assert.Equal(t, 
instances[2], cloudprovider.Instance{Id: "azure://" + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 2)}) + assert.Equal(t, instances[0], cloudprovider.Instance{Id: azurePrefix + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 0)}) + assert.Equal(t, instances[1], cloudprovider.Instance{Id: azurePrefix + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 1)}) + assert.Equal(t, instances[2], cloudprovider.Instance{Id: azurePrefix + fmt.Sprintf(fakeVirtualMachineScaleSetVMID, 2)}) } else { - assert.Equal(t, instances[0], cloudprovider.Instance{Id: "azure://" + fmt.Sprintf(fakeVirtualMachineVMID, 0)}) - assert.Equal(t, instances[1], cloudprovider.Instance{Id: "azure://" + fmt.Sprintf(fakeVirtualMachineVMID, 1)}) - assert.Equal(t, instances[2], cloudprovider.Instance{Id: "azure://" + fmt.Sprintf(fakeVirtualMachineVMID, 2)}) + assert.Equal(t, instances[0], cloudprovider.Instance{Id: azurePrefix + fmt.Sprintf(fakeVirtualMachineVMID, 0)}) + assert.Equal(t, instances[1], cloudprovider.Instance{Id: azurePrefix + fmt.Sprintf(fakeVirtualMachineVMID, 1)}) + assert.Equal(t, instances[2], cloudprovider.Instance{Id: azurePrefix + fmt.Sprintf(fakeVirtualMachineVMID, 2)}) } } @@ -754,13 +992,14 @@ func TestEnableVmssFlexFlag(t *testing.T) { provider.azureManager.config.EnableVmssFlex = false provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient mockVMClient := mockvmclient.NewMockInterface(ctrl) - mockVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return([]compute.VirtualMachine{}, nil).AnyTimes() + + mockVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedVMs, nil).AnyTimes() mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), "test-asg").Return(expectedVMs, nil).AnyTimes() provider.azureManager.azClient.virtualMachinesClient = mockVMClient provider.azureManager.RegisterNodeGroup( - newTestScaleSet(provider.azureManager, "test-asg")) - provider.azureManager.explicitlyConfigured["test-asg"] = true + newTestScaleSet(provider.azureManager, testASG)) + provider.azureManager.explicitlyConfigured[testASG] = true err := provider.azureManager.Refresh() assert.Error(t, err, "vmss - \"test-asg\" with Flexible orchestration detected but 'enbaleVmssFlex' feature flag is turned off") @@ -810,6 +1049,8 @@ func TestTemplateNodeInfo(t *testing.T) { return vmssType, nil } nodeInfo, err := asg.TemplateNodeInfo() + assert.Equal(t, *nodeInfo.Node().Status.Capacity.Cpu(), *resource.NewQuantity(1, resource.DecimalSI)) + assert.Equal(t, *nodeInfo.Node().Status.Capacity.Memory(), *resource.NewQuantity(3*1024*1024, resource.DecimalSI)) assert.NoError(t, err) assert.NotNil(t, nodeInfo) assert.NotEmpty(t, nodeInfo.Pods) @@ -827,6 +1068,8 @@ func TestTemplateNodeInfo(t *testing.T) { return &vmssType, nil } nodeInfo, err := asg.TemplateNodeInfo() + assert.Equal(t, *nodeInfo.Node().Status.Capacity.Cpu(), *resource.NewQuantity(1, resource.DecimalSI)) + assert.Equal(t, *nodeInfo.Node().Status.Capacity.Memory(), *resource.NewQuantity(3*1024*1024, resource.DecimalSI)) assert.NoError(t, err) assert.NotNil(t, nodeInfo) assert.NotEmpty(t, nodeInfo.Pods) @@ -856,8 +1099,58 @@ func TestTemplateNodeInfo(t *testing.T) { return &vmssType, nil } nodeInfo, err := asg.TemplateNodeInfo() + assert.Equal(t, *nodeInfo.Node().Status.Capacity.Cpu(), *resource.NewQuantity(1, resource.DecimalSI)) + assert.Equal(t, *nodeInfo.Node().Status.Capacity.Memory(), *resource.NewQuantity(3*1024*1024, resource.DecimalSI)) assert.NoError(t, err) assert.NotNil(t, 
nodeInfo) assert.NotEmpty(t, nodeInfo.Pods) }) } +func TestCseErrors(t *testing.T) { + errorMessage := to.StringPtr("Error Message Test") + vmssVMs := compute.VirtualMachineScaleSetVM{ + Name: to.StringPtr("vmTest"), + ID: to.StringPtr(fakeVirtualMachineScaleSetVMID), + InstanceID: to.StringPtr("0"), + VirtualMachineScaleSetVMProperties: &compute.VirtualMachineScaleSetVMProperties{ + VMID: to.StringPtr("123E4567-E89B-12D3-A456-426655440000"), + ProvisioningState: to.StringPtr("Succeeded"), + InstanceView: &compute.VirtualMachineScaleSetVMInstanceView{ + Extensions: &[]compute.VirtualMachineExtensionInstanceView{ + { + Statuses: &[]compute.InstanceViewStatus{ + { + Level: "Error", + Message: errorMessage, + }, + }, + }, + }, + }, + }, + } + + manager := newTestAzureManager(t) + resourceLimiter := cloudprovider.NewResourceLimiter( + map[string]int64{cloudprovider.ResourceNameCores: 1, cloudprovider.ResourceNameMemory: 10000000}, + map[string]int64{cloudprovider.ResourceNameCores: 10, cloudprovider.ResourceNameMemory: 100000000}) + provider, _ := BuildAzureCloudProvider(manager, resourceLimiter) + manager.RegisterNodeGroup( + newTestScaleSet(manager, "test-asg")) + manager.explicitlyConfigured["test-asg"] = true + scaleSet, _ := provider.NodeGroups()[0].(*ScaleSet) + + t.Run("getCSEErrorMessages test with CSE error in VM extensions", func(t *testing.T) { + expectedCSEWErrorMessage := "Error Message Test" + (*vmssVMs.InstanceView.Extensions)[0].Name = to.StringPtr(vmssCSEExtensionName) + actualCSEErrorMessage, actualCSEFailureBool := scaleSet.cseErrors(vmssVMs.InstanceView.Extensions) + assert.True(t, actualCSEFailureBool) + assert.Equal(t, []string{expectedCSEWErrorMessage}, actualCSEErrorMessage) + }) + t.Run("getCSEErrorMessages test with no CSE error in VM extensions", func(t *testing.T) { + (*vmssVMs.InstanceView.Extensions)[0].Name = to.StringPtr("notCSEExtension") + actualCSEErrorMessage, actualCSEFailureBool := scaleSet.cseErrors(vmssVMs.InstanceView.Extensions) + assert.False(t, actualCSEFailureBool) + assert.Equal(t, []string(nil), actualCSEErrorMessage) + }) +} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_template.go b/cluster-autoscaler/cloudprovider/azure/azure_template.go index 719762078fed..e548f343318d 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_template.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_template.go @@ -24,58 +24,74 @@ import ( "strings" "time" + "sigs.k8s.io/cloud-provider-azure/pkg/consts" + "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute" apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" - cloudvolume "k8s.io/cloud-provider/volume" "k8s.io/klog/v2" + kubeletapis "k8s.io/kubelet/pkg/apis" ) const ( - azureDiskTopologyKey string = "topology.disk.csi.azure.com/zone" + // AKSLabelPrefixValue represents the constant prefix for AKSLabelKeyPrefixValue + AKSLabelPrefixValue = "kubernetes.azure.com" + // AKSLabelKeyPrefixValue represents prefix for AKS Labels + AKSLabelKeyPrefixValue = AKSLabelPrefixValue + "/" + + azureDiskTopologyKey = "topology.disk.csi.azure.com/zone" + // For NP-series SKU, the xilinx device plugin uses that resource name + // https://github.com/Xilinx/FPGA_as_a_Service/tree/master/k8s-fpga-device-plugin + xilinxFpgaResourceName = "xilinx.com/fpga-xilinx_u250_gen3x16_xdma_shell_2_1-0" + + // legacyPoolNameTag is the legacy tag 
that AKS adds to the VMSS with its value + being the agentpool name + legacyPoolNameTag = "poolName" + // poolNameTag is the new tag that replaces the above one + // Newly created pools and clusters will have this one on the VMSS + // instead of the legacy one. We'll have to live with both tags for a while. + poolNameTag = "aks-managed-poolName" + + // This is the legacy label added by agentbaker, agentpool={poolName}, and we want to predict that + // a node added to this agentpool will have this as a node label. The value is fetched + // from the VMSS tag with key poolNameTag/legacyPoolNameTag + legacyAgentPoolNodeLabelKey = "agentpool" + // New label that replaces the above + agentPoolNodeLabelKey = AKSLabelKeyPrefixValue + "agentpool" + + // Storage profile node labels + legacyStorageProfileNodeLabelKey = "storageprofile" + storageProfileNodeLabelKey = AKSLabelKeyPrefixValue + "storageprofile" + + // Storage tier node labels + legacyStorageTierNodeLabelKey = "storagetier" + storageTierNodeLabelKey = AKSLabelKeyPrefixValue + "storagetier" + + // FIPS node label + fipsNodeLabelKey = AKSLabelKeyPrefixValue + "fips_enabled" + + // OS SKU node label + osSkuLabelKey = AKSLabelKeyPrefixValue + "os-sku" + + // Security node label + securityTypeLabelKey = AKSLabelKeyPrefixValue + "security-type" + + // Labels defined in RP + // Since Cluster Autoscaler cannot import RP, they are defined here. + customCATrustEnabledLabelKey = AKSLabelKeyPrefixValue + "custom-ca-trust-enabled" + kataMshvVMIsolationLabelKey = AKSLabelKeyPrefixValue + "kata-mshv-vm-isolation" + + // Cluster node label + clusterLabelKey = AKSLabelKeyPrefixValue + "cluster" ) -func buildInstanceOS(template compute.VirtualMachineScaleSet) string { - instanceOS := cloudprovider.DefaultOS - if template.VirtualMachineProfile != nil && template.VirtualMachineProfile.OsProfile != nil && template.VirtualMachineProfile.OsProfile.WindowsConfiguration != nil { - instanceOS = "windows" - } - - return instanceOS -} - -func buildGenericLabels(template compute.VirtualMachineScaleSet, nodeName string) map[string]string { - result := make(map[string]string) - - result[apiv1.LabelArchStable] = cloudprovider.DefaultArch - result[apiv1.LabelOSStable] = buildInstanceOS(template) - - result[apiv1.LabelInstanceTypeStable] = *template.Sku.Name - result[apiv1.LabelTopologyRegion] = strings.ToLower(*template.Location) - - if template.Zones != nil && len(*template.Zones) > 0 { - failureDomains := make([]string, len(*template.Zones)) - for k, v := range *template.Zones { - failureDomains[k] = strings.ToLower(*template.Location) + "-" + v - } - - result[apiv1.LabelTopologyZone] = strings.Join(failureDomains[:], cloudvolume.LabelMultiZoneDelimiter) - result[azureDiskTopologyKey] = strings.Join(failureDomains[:], cloudvolume.LabelMultiZoneDelimiter) - } else { - result[apiv1.LabelTopologyZone] = "0" - result[azureDiskTopologyKey] = "" - } - - result[apiv1.LabelHostname] = nodeName - return result -} +func buildNodeFromTemplate(nodeGroupName string, template compute.VirtualMachineScaleSet, manager *AzureManager, enableDynamicInstanceList bool) (*apiv1.Node, error) { -func buildNodeFromTemplate(scaleSetName string, template compute.VirtualMachineScaleSet, manager *AzureManager) (*apiv1.Node, error) { node := apiv1.Node{} - nodeName := fmt.Sprintf("%s-asg-%d", scaleSetName, rand.Int63()) + nodeName := fmt.Sprintf("%s-asg-%d", nodeGroupName, rand.Int63()) node.ObjectMeta = metav1.ObjectMeta{ Name: nodeName, @@ -91,7 +107,7 @@ func buildNodeFromTemplate(scaleSetName
string, template compute.VirtualMachineS // Fetching SKU information from SKU API if enableDynamicInstanceList is true. var dynamicErr error - if manager.config.EnableDynamicInstanceList { + if enableDynamicInstanceList { var vmssTypeDynamic InstanceType klog.V(1).Infof("Fetching instance information for SKU: %s from SKU API", *template.Sku.Name) vmssTypeDynamic, dynamicErr = GetVMSSTypeDynamically(template, manager.azureCache) @@ -103,7 +119,7 @@ func buildNodeFromTemplate(scaleSetName string, template compute.VirtualMachineS klog.Errorf("Dynamically fetching of instance information from SKU api failed with error: %v", dynamicErr) } } - if !manager.config.EnableDynamicInstanceList || dynamicErr != nil { + if !enableDynamicInstanceList || dynamicErr != nil { klog.V(1).Infof("Falling back to static SKU list for SKU: %s", *template.Sku.Name) // fall-back on static list of vmss if dynamic workflow fails. vmssTypeStatic, staticErr := GetVMSSTypeStatically(template) @@ -122,17 +138,14 @@ func buildNodeFromTemplate(scaleSetName string, template compute.VirtualMachineS node.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(vcpu, resource.DecimalSI) // isNPSeries returns if a SKU is an NP-series SKU // SKU API reports GPUs for NP-series but it's actually FPGAs - if !isNPSeries(*template.Sku.Name) { + if isNPSeries(*template.Sku.Name) { + node.Status.Capacity[xilinxFpgaResourceName] = *resource.NewQuantity(gpuCount, resource.DecimalSI) + } else { node.Status.Capacity[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(gpuCount, resource.DecimalSI) } node.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(memoryMb*1024*1024, resource.DecimalSI) - resourcesFromTags := extractAllocatableResourcesFromScaleSet(template.Tags) - for resourceName, val := range resourcesFromTags { - node.Status.Capacity[apiv1.ResourceName(resourceName)] = *val - } - // TODO: set real allocatable. 
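// A minimal side-sketch of the accelerator handling in the hunk above, assuming the same untyped
// resource-name constants and an already-initialized node.Status.Capacity; the helper name
// setAcceleratorCapacity is hypothetical and is not part of this patch:
//
//	func setAcceleratorCapacity(node *apiv1.Node, skuName string, gpuCount int64) {
//		quantity := *resource.NewQuantity(gpuCount, resource.DecimalSI)
//		if isNPSeries(skuName) {
//			// NP-series: the SKU API reports GPUs, but the devices are Xilinx FPGAs.
//			node.Status.Capacity[xilinxFpgaResourceName] = quantity
//			return
//		}
//		node.Status.Capacity[gpu.ResourceNvidiaGPU] = quantity
//	}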
node.Status.Allocatable = node.Status.Capacity @@ -150,16 +163,188 @@ func buildNodeFromTemplate(scaleSetName string, template compute.VirtualMachineS // GenericLabels node.Labels = cloudprovider.JoinStringMaps(node.Labels, buildGenericLabels(template, nodeName)) + // Labels from the Scale Set's Tags - node.Labels = cloudprovider.JoinStringMaps(node.Labels, extractLabelsFromScaleSet(template.Tags)) + labels := extractLabelsFromScaleSet(template.Tags) + + // Add the agentpool label, its value should come from the VMSS poolName tag + // NOTE: The plan is for agentpool label to be deprecated in favor of the aks-prefixed one + // We will have to live with both labels for a while + if node.Labels[legacyPoolNameTag] != "" { + labels[legacyAgentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag] + labels[agentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag] + } + if node.Labels[poolNameTag] != "" { + labels[legacyAgentPoolNodeLabelKey] = node.Labels[poolNameTag] + labels[agentPoolNodeLabelKey] = node.Labels[poolNameTag] + } + + // Add the storage profile and storage tier labels + if template.VirtualMachineProfile != nil && template.VirtualMachineProfile.StorageProfile != nil && template.VirtualMachineProfile.StorageProfile.OsDisk != nil { + // ephemeral + if template.VirtualMachineProfile.StorageProfile.OsDisk.DiffDiskSettings != nil && template.VirtualMachineProfile.StorageProfile.OsDisk.DiffDiskSettings.Option == compute.Local { + labels[legacyStorageProfileNodeLabelKey] = "ephemeral" + labels[storageProfileNodeLabelKey] = "ephemeral" + } else { + labels[legacyStorageProfileNodeLabelKey] = "managed" + labels[storageProfileNodeLabelKey] = "managed" + } + if template.VirtualMachineProfile.StorageProfile.OsDisk.ManagedDisk != nil { + labels[legacyStorageTierNodeLabelKey] = string(template.VirtualMachineProfile.StorageProfile.OsDisk.ManagedDisk.StorageAccountType) + labels[storageTierNodeLabelKey] = string(template.VirtualMachineProfile.StorageProfile.OsDisk.ManagedDisk.StorageAccountType) + } + // Add ephemeral-storage value + if template.VirtualMachineProfile.StorageProfile.OsDisk.DiskSizeGB != nil { + node.Status.Capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(int(*template.VirtualMachineProfile.StorageProfile.OsDisk.DiskSizeGB)*1024*1024*1024), resource.DecimalSI) + klog.V(4).Infof("OS Disk Size from template is: %d", *template.VirtualMachineProfile.StorageProfile.OsDisk.DiskSizeGB) + klog.V(4).Infof("Setting ephemeral storage to: %v", node.Status.Capacity[apiv1.ResourceEphemeralStorage]) + } + } + + // If we are on GPU-enabled SKUs, append the accelerator + // label so that CA makes better decision when scaling from zero for GPU pools + if isNvidiaEnabledSKU(*template.Sku.Name) { + labels[GPULabel] = "nvidia" + labels[legacyGPULabel] = "nvidia" + } + + // Extract allocatables from tags + resourcesFromTags := extractAllocatableResourcesFromScaleSet(template.Tags) + for resourceName, val := range resourcesFromTags { + node.Status.Capacity[apiv1.ResourceName(resourceName)] = *val + } + + node.Labels = cloudprovider.JoinStringMaps(node.Labels, labels) + klog.V(4).Infof("Setting node %s labels to: %s", nodeName, node.Labels) // Taints from the Scale Set's Tags node.Spec.Taints = extractTaintsFromScaleSet(template.Tags) + klog.V(4).Infof("Setting node %s taints to: %s", nodeName, node.Spec.Taints) node.Status.Conditions = cloudprovider.BuildReadyConditions() return &node, nil } +func buildInstanceOS(template compute.VirtualMachineScaleSet) string { + instanceOS := 
cloudprovider.DefaultOS + if template.VirtualMachineProfile != nil && template.VirtualMachineProfile.OsProfile != nil && template.VirtualMachineProfile.OsProfile.WindowsConfiguration != nil { + instanceOS = "windows" + } + + return instanceOS +} + +func buildGenericLabels(template compute.VirtualMachineScaleSet, nodeName string) map[string]string { + result := make(map[string]string) + + result[kubeletapis.LabelArch] = cloudprovider.DefaultArch + result[apiv1.LabelArchStable] = cloudprovider.DefaultArch + + result[kubeletapis.LabelOS] = buildInstanceOS(template) + result[apiv1.LabelOSStable] = buildInstanceOS(template) + + result[apiv1.LabelInstanceType] = *template.Sku.Name + result[apiv1.LabelInstanceTypeStable] = *template.Sku.Name + result[apiv1.LabelZoneRegion] = strings.ToLower(*template.Location) + result[apiv1.LabelTopologyRegion] = strings.ToLower(*template.Location) + + if template.Zones != nil && len(*template.Zones) > 0 { + failureDomains := make([]string, len(*template.Zones)) + for k, v := range *template.Zones { + failureDomains[k] = strings.ToLower(*template.Location) + "-" + v + } + // Pick a random zone for the multi-zone nodepool when scaling from zero. + // This random zone will not necessarily match the zone of the VMSS being created; the purpose of creating + // the node template with a random zone is to initiate scaling from zero on the multi-zone nodepool. + // Note that if the customer relies on pod affinity to pick an exact zone, this logic won't work. + // For now, discourage customers from using podAffinity to pick availability zones. + randomZone := failureDomains[rand.Intn(len(failureDomains))] + result[apiv1.LabelZoneFailureDomain] = randomZone + result[apiv1.LabelTopologyZone] = randomZone + result[azureDiskTopologyKey] = randomZone + } else { + result[apiv1.LabelZoneFailureDomain] = "0" + result[apiv1.LabelTopologyZone] = "0" + result[azureDiskTopologyKey] = "" + } + + result[apiv1.LabelHostname] = nodeName + return result +} + +func fetchLabel(template *compute.VirtualMachineScaleSet, nodeLabels map[string]string) map[string]string { + // Labels from the Scale Set's Tags + var labels = extractLabelsFromScaleSet(template.Tags) + + // Add the agentpool label, its value should come from the VMSS poolName tag + // NOTE: The plan is for agentpool label to be deprecated in favor of the aks-prefixed one + // We will have to live with both labels for a while + if nodeLabels[legacyPoolNameTag] != "" { + labels[legacyAgentPoolNodeLabelKey] = nodeLabels[legacyPoolNameTag] + labels[agentPoolNodeLabelKey] = nodeLabels[legacyPoolNameTag] + } + if nodeLabels[poolNameTag] != "" { + labels[legacyAgentPoolNodeLabelKey] = nodeLabels[poolNameTag] + labels[agentPoolNodeLabelKey] = nodeLabels[poolNameTag] + } + + // Add node-role label + if nodeLabels[consts.NodeLabelRole] != "" { + labels[consts.NodeLabelRole] = nodeLabels[consts.NodeLabelRole] + } + + if nodeLabels[fipsNodeLabelKey] != "" { + labels[fipsNodeLabelKey] = nodeLabels[fipsNodeLabelKey] + } + + if nodeLabels[osSkuLabelKey] != "" { + labels[osSkuLabelKey] = nodeLabels[osSkuLabelKey] + } + + if nodeLabels[securityTypeLabelKey] != "" { + labels[securityTypeLabelKey] = nodeLabels[securityTypeLabelKey] + } + + if nodeLabels[customCATrustEnabledLabelKey] != "" { + labels[customCATrustEnabledLabelKey] = nodeLabels[customCATrustEnabledLabelKey] + } + + if nodeLabels[kataMshvVMIsolationLabelKey] != "" { + labels[kataMshvVMIsolationLabelKey] = nodeLabels[kataMshvVMIsolationLabelKey] + } + + if
nodeLabels[clusterLabelKey] != "" { + labels[clusterLabelKey] = nodeLabels[clusterLabelKey] + } + + // Add the storage tier labels + if template.VirtualMachineProfile != nil && template.VirtualMachineProfile.StorageProfile != nil && + template.VirtualMachineProfile.StorageProfile.OsDisk != nil { + // ephemeral + if template.VirtualMachineProfile.StorageProfile.OsDisk.DiffDiskSettings != nil && + template.VirtualMachineProfile.StorageProfile.OsDisk.DiffDiskSettings.Option == compute.Local { + labels[legacyStorageProfileNodeLabelKey] = "ephemeral" + labels[storageProfileNodeLabelKey] = "ephemeral" + } else { + labels[legacyStorageProfileNodeLabelKey] = "managed" + labels[storageProfileNodeLabelKey] = "managed" + } + if template.VirtualMachineProfile.StorageProfile.OsDisk.ManagedDisk != nil { + labels[legacyStorageTierNodeLabelKey] = string(template.VirtualMachineProfile.StorageProfile.OsDisk.ManagedDisk.StorageAccountType) + labels[storageTierNodeLabelKey] = string(template.VirtualMachineProfile.StorageProfile.OsDisk.ManagedDisk.StorageAccountType) + } + } + + // If we are on GPU-enabled SKUs, append the accelerator + // label so that CA makes better decision when scaling from zero for GPU pools + if isNvidiaEnabledSKU(*template.Sku.Name) { + labels[GPULabel] = "nvidia" + labels[legacyGPULabel] = "nvidia" + } + + return labels +} + func extractLabelsFromScaleSet(tags map[string]*string) map[string]string { result := make(map[string]string) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_template_test.go b/cluster-autoscaler/cloudprovider/azure/azure_template_test.go index 3eb8295f662b..09dbbae0318d 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_template_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_template_test.go @@ -18,11 +18,14 @@ package azure import ( "fmt" + "testing" + + "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute" + "github.com/Azure/go-autorest/autorest" "github.com/Azure/go-autorest/autorest/to" "github.com/stretchr/testify/assert" apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" - "testing" ) func TestExtractLabelsFromScaleSet(t *testing.T) { @@ -119,6 +122,63 @@ func TestExtractAllocatableResourcesFromScaleSet(t *testing.T) { assert.Equal(t, (&exepectedCustomAllocatable).String(), labels["nvidia.com/Tesla-P100-PCIE"].String()) } +func TestTopologyFromScaleSet(t *testing.T) { + testNodeName := "test-node" + testSkuName := "test-sku" + testVmss := compute.VirtualMachineScaleSet{ + Response: autorest.Response{}, + Sku: &compute.Sku{Name: &testSkuName}, + Plan: nil, + VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{ + VirtualMachineProfile: &compute.VirtualMachineScaleSetVMProfile{OsProfile: nil}}, + Zones: &[]string{"1", "2", "3"}, + Location: to.StringPtr("westus"), + } + expectedZoneValues := []string{"westus-1", "westus-2", "westus-3"} + + labels := buildGenericLabels(testVmss, testNodeName) + failureDomain, ok := labels[apiv1.LabelZoneFailureDomain] + assert.True(t, ok) + topologyZone, ok := labels[apiv1.LabelTopologyZone] + assert.True(t, ok) + azureDiskTopology, ok := labels[azureDiskTopologyKey] + assert.True(t, ok) + + assert.Contains(t, expectedZoneValues, failureDomain) + assert.Contains(t, expectedZoneValues, topologyZone) + assert.Contains(t, expectedZoneValues, azureDiskTopology) +} + +func TestEmptyTopologyFromScaleSet(t *testing.T) { + testNodeName := "test-node" + testSkuName := "test-sku" + testVmss := compute.VirtualMachineScaleSet{ + Response: 
autorest.Response{}, + Sku: &compute.Sku{Name: &testSkuName}, + Plan: nil, + VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{ + VirtualMachineProfile: &compute.VirtualMachineScaleSetVMProfile{OsProfile: nil}}, + Location: to.StringPtr("westus"), + } + + expectedFailureDomain := "0" + expectedTopologyZone := "0" + expectedAzureDiskTopology := "" + labels := buildGenericLabels(testVmss, testNodeName) + + failureDomain, ok := labels[apiv1.LabelZoneFailureDomain] + assert.True(t, ok) + assert.Equal(t, expectedFailureDomain, failureDomain) + + topologyZone, ok := labels[apiv1.LabelTopologyZone] + assert.True(t, ok) + assert.Equal(t, expectedTopologyZone, topologyZone) + + azureDiskTopology, ok := labels[azureDiskTopologyKey] + assert.True(t, ok) + assert.Equal(t, expectedAzureDiskTopology, azureDiskTopology) +} + func makeTaintSet(taints []apiv1.Taint) map[apiv1.Taint]bool { set := make(map[apiv1.Taint]bool) for _, taint := range taints { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_util.go b/cluster-autoscaler/cloudprovider/azure/azure_util.go index 86eb8ba87bef..83cf6848bf57 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_util.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_util.go @@ -18,8 +18,6 @@ package azure import ( "context" - "crypto/rsa" - "crypto/x509" "encoding/json" "fmt" "io/ioutil" @@ -36,8 +34,7 @@ import ( "github.com/Azure/go-autorest/autorest" "github.com/Azure/go-autorest/autorest/to" - "golang.org/x/crypto/pkcs12" - + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/version" klog "k8s.io/klog/v2" "sigs.k8s.io/cloud-provider-azure/pkg/retry" @@ -63,6 +60,12 @@ const ( vmResourceType = "Microsoft.Compute/virtualMachines" vmExtensionType = "Microsoft.Compute/virtualMachines/extensions" + // CSE Extension checks + vmssCSEExtensionName = "vmssCSE" + vmssExtensionProvisioningFailed = "VMExtensionProvisioningFailed" + // vmExtensionProvisioningErrorClass represents a Vm extension provisioning error + vmExtensionProvisioningErrorClass cloudprovider.InstanceErrorClass = 103 + // resource ids nsgID = "nsgID" rtID = "routeTableID" @@ -180,7 +183,7 @@ func (util *AzUtil) DeleteVirtualMachine(rg string, name string) error { } klog.V(2).Infof("VirtualMachine %s/%s removed", rg, name) - if len(nicName) > 0 { + if nicName != "" { klog.Infof("deleting nic: %s/%s", rg, nicName) interfaceCtx, interfaceCancel := getContextWithCancel() defer interfaceCancel() @@ -224,25 +227,9 @@ func (util *AzUtil) DeleteVirtualMachine(rg string, name string) error { klog.V(2).Infof("disk %s/%s removed", rg, *osDiskName) } } - return nil } -// decodePkcs12 decodes a PKCS#12 client certificate by extracting the public certificate and -// the private RSA key -func decodePkcs12(pkcs []byte, password string) (*x509.Certificate, *rsa.PrivateKey, error) { - privateKey, certificate, err := pkcs12.Decode(pkcs, password) - if err != nil { - return nil, nil, fmt.Errorf("decoding the PKCS#12 client certificate: %v", err) - } - rsaPrivateKey, isRsaKey := privateKey.(*rsa.PrivateKey) - if !isRsaKey { - return nil, nil, fmt.Errorf("PKCS#12 certificate must contain a RSA private key") - } - - return certificate, rsaPrivateKey, nil -} - func getUserAgentExtension() string { return fmt.Sprintf("cluster-autoscaler/v%s", version.ClusterAutoscalerVersion) } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_util_test.go b/cluster-autoscaler/cloudprovider/azure/azure_util_test.go index 983693d49b9f..a1d5313c4797 100644 --- 
a/cluster-autoscaler/cloudprovider/azure/azure_util_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_util_test.go @@ -215,13 +215,15 @@ func TestConvertResourceGroupNameToLower(t *testing.T) { }, { desc: "providerID not in Azure format should report error", - resourceID: "azure://invalid-id", + resourceID: azurePrefix + "invalid-id", expectError: true, }, { - desc: "resource group name in VM providerID should be converted", - resourceID: "azure:///subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/myResourceGroupName/providers/Microsoft.Compute/virtualMachines/k8s-agent-AAAAAAAA-0", - expected: "azure:///subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/myresourcegroupname/providers/Microsoft.Compute/virtualMachines/k8s-agent-AAAAAAAA-0", + desc: "resource group name in VM providerID should be converted", + resourceID: azurePrefix + "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/myResourceGroupName" + + "/providers/Microsoft.Compute/virtualMachines/k8s-agent-AAAAAAAA-0", + expected: azurePrefix + "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/myresourcegroupname" + + "/providers/Microsoft.Compute/virtualMachines/k8s-agent-AAAAAAAA-0", }, { desc: "resource group name in VM resourceID should be converted", @@ -229,9 +231,11 @@ func TestConvertResourceGroupNameToLower(t *testing.T) { expected: "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/myresourcegroupname/providers/Microsoft.Compute/virtualMachines/k8s-agent-AAAAAAAA-0", }, { - desc: "resource group name in VMSS providerID should be converted", - resourceID: "azure:///subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/myResourceGroupName/providers/Microsoft.Compute/virtualMachineScaleSets/myScaleSetName/virtualMachines/156", - expected: "azure:///subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/myresourcegroupname/providers/Microsoft.Compute/virtualMachineScaleSets/myScaleSetName/virtualMachines/156", + desc: "resource group name in VMSS providerID should be converted", + resourceID: azurePrefix + "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/myResourceGroupName" + + "/providers/Microsoft.Compute/virtualMachineScaleSets/myScaleSetName/virtualMachines/156", + expected: azurePrefix + "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/myresourcegroupname" + + "/providers/Microsoft.Compute/virtualMachineScaleSets/myScaleSetName/virtualMachines/156", }, { desc: "resource group name in VMSS resourceID should be converted", diff --git a/cluster-autoscaler/cloudprovider/azure/testdata/test.pfx b/cluster-autoscaler/cloudprovider/azure/testdata/test.pfx new file mode 100755 index 000000000000..693133363fbd Binary files /dev/null and b/cluster-autoscaler/cloudprovider/azure/testdata/test.pfx differ diff --git a/cluster-autoscaler/cloudprovider/azure/testdata/testnopassword.pfx b/cluster-autoscaler/cloudprovider/azure/testdata/testnopassword.pfx new file mode 100755 index 000000000000..0b32730ed8c7 Binary files /dev/null and b/cluster-autoscaler/cloudprovider/azure/testdata/testnopassword.pfx differ diff --git a/cluster-autoscaler/go.mod b/cluster-autoscaler/go.mod index 00b2f3adf6de..34d81adab794 100644 --- a/cluster-autoscaler/go.mod +++ b/cluster-autoscaler/go.mod @@ -22,18 +22,17 @@ require ( github.com/golang/mock v1.6.0 github.com/google/go-cmp v0.6.0 github.com/google/go-querystring v1.0.0 - github.com/google/uuid v1.3.1 + github.com/google/uuid v1.5.0 
github.com/jmespath/go-jmespath v0.4.0 github.com/json-iterator/go v1.1.12 - github.com/onsi/ginkgo/v2 v2.13.0 - github.com/onsi/gomega v1.29.0 + github.com/onsi/ginkgo/v2 v2.13.2 + github.com/onsi/gomega v1.30.0 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.16.0 github.com/satori/go.uuid v1.2.0 github.com/spf13/pflag v1.0.5 github.com/stretchr/testify v1.8.4 go.uber.org/mock v0.4.0 - golang.org/x/crypto v0.21.0 golang.org/x/net v0.23.0 golang.org/x/oauth2 v0.11.0 golang.org/x/sys v0.18.0 @@ -55,15 +54,22 @@ require ( k8s.io/kubelet v0.29.5 k8s.io/kubernetes v1.29.5 k8s.io/legacy-cloud-providers v0.0.0 - k8s.io/utils v0.0.0-20230726121419-3b25d923346b - sigs.k8s.io/cloud-provider-azure v1.28.0 + k8s.io/utils v0.0.0-20231127182322-b307cd553661 + sigs.k8s.io/cloud-provider-azure v1.29.0 sigs.k8s.io/structured-merge-diff/v4 v4.4.1 - sigs.k8s.io/yaml v1.3.0 + sigs.k8s.io/yaml v1.4.0 ) require ( cloud.google.com/go/compute v1.23.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.5.2 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5 v5.4.0 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerregistry/armcontainerregistry v1.2.0 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/keyvault/armkeyvault v1.4.0 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v4 v4.3.0 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/privatedns/armprivatedns v1.2.0 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.2.0 // indirect + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.5.0 // indirect github.com/Azure/go-armbalancer v0.0.2 // indirect github.com/Azure/go-autorest v14.2.0+incompatible // indirect github.com/Azure/go-autorest/autorest/azure/cli v0.4.2 // indirect @@ -71,7 +77,7 @@ require ( github.com/Azure/go-autorest/autorest/validation v0.3.1 // indirect github.com/Azure/go-autorest/logger v0.2.1 // indirect github.com/Azure/go-autorest/tracing v0.6.0 // indirect - github.com/AzureAD/microsoft-authentication-library-for-go v1.1.1 // indirect + github.com/AzureAD/microsoft-authentication-library-for-go v1.2.0 // indirect github.com/GoogleCloudPlatform/k8s-cloud-provider v1.18.1-0.20220218231025-f11817397a1b // indirect github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab // indirect github.com/Microsoft/go-winio v0.6.0 // indirect @@ -98,15 +104,15 @@ require ( github.com/docker/go-units v0.5.0 // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect github.com/euank/go-kmsg-parser v2.0.0+incompatible // indirect - github.com/evanphx/json-patch v5.6.0+incompatible // indirect + github.com/evanphx/json-patch v5.7.0+incompatible // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect - github.com/go-logr/logr v1.3.0 // indirect + github.com/go-logr/logr v1.4.1 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.2.3 // indirect github.com/go-openapi/jsonpointer v0.19.6 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect - github.com/go-openapi/swag v0.22.3 // indirect + github.com/go-openapi/swag v0.22.4 // indirect github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect github.com/godbus/dbus/v5 v5.1.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect @@ -118,7 +124,7 @@ require ( github.com/google/cel-go v0.17.7 // indirect 
github.com/google/gnostic-models v0.6.8 // indirect github.com/google/gofuzz v1.2.0 // indirect - github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect + github.com/google/pprof v0.0.0-20230602010524-ada837c32108 // indirect github.com/google/s2a-go v0.1.7 // indirect github.com/googleapis/enterprise-certificate-proxy v0.2.3 // indirect github.com/googleapis/gax-go/v2 v2.11.0 // indirect @@ -155,7 +161,7 @@ require ( github.com/rubiojr/go-vhd v0.0.0-20200706105327-02e210299021 // indirect github.com/seccomp/libseccomp-golang v0.10.0 // indirect github.com/sirupsen/logrus v1.9.0 // indirect - github.com/spf13/cobra v1.7.0 // indirect + github.com/spf13/cobra v1.8.0 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect github.com/stretchr/objx v0.5.0 // indirect github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect @@ -167,7 +173,7 @@ require ( go.etcd.io/etcd/client/v3 v3.5.10 // indirect go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/contrib/instrumentation/github.com/emicklei/go-restful/otelrestful v0.42.0 // indirect - go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.42.0 // indirect + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1 // indirect go.opentelemetry.io/otel v1.21.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.21.0 // indirect @@ -179,12 +185,13 @@ require ( go.uber.org/atomic v1.10.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.24.0 // indirect - golang.org/x/exp v0.0.0-20230321023759-10a507213a29 // indirect + golang.org/x/crypto v0.21.0 // indirect + golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect golang.org/x/mod v0.14.0 // indirect - golang.org/x/sync v0.5.0 // indirect + golang.org/x/sync v0.6.0 // indirect golang.org/x/term v0.18.0 // indirect golang.org/x/text v0.14.0 // indirect - golang.org/x/time v0.3.0 // indirect + golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.16.1 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto v0.0.0-20230822172742-b8732ec3820d // indirect @@ -206,6 +213,8 @@ require ( k8s.io/kubectl v0.28.0 // indirect k8s.io/mount-utils v0.26.0-alpha.0 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.28.0 // indirect + sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.0.0-20240105075710-c4d4895a970b // indirect + sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.0.0-20231205023417-1ba5a224ab0e // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect ) diff --git a/cluster-autoscaler/go.sum b/cluster-autoscaler/go.sum index 87df26b3eb18..f4b977b8632c 100644 --- a/cluster-autoscaler/go.sum +++ b/cluster-autoscaler/go.sum @@ -59,14 +59,28 @@ github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.4.0 h1:BMAjVKJM0U/CYF27gA0ZM github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.4.0/go.mod h1:1fXstnBMas5kzG+S3q8UoJcmyU6nUeunJcMDHcRYHhs= github.com/Azure/azure-sdk-for-go/sdk/internal v1.5.2 h1:LqbJ/WzJUwBf8UiaSzgX7aMclParm9/5Vgp+TY51uBQ= github.com/Azure/azure-sdk-for-go/sdk/internal v1.5.2/go.mod h1:yInRyqWXAuaPrgI7p70+lDDgh3mlBohis29jGMISnmc= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5 v5.4.0 h1:QfV5XZt6iNa2aWMAt96CZEbfJ7kgG/qYIpq465Shr5E= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5 v5.4.0/go.mod 
h1:uYt4CfhkJA9o0FN7jfE5minm/i4nUE4MjGUJkzB6Zs8= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerregistry/armcontainerregistry v1.2.0 h1:DWlwvVV5r/Wy1561nZ3wrpI1/vDIBRY/Wd1HWaRBZWA= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerregistry/armcontainerregistry v1.2.0/go.mod h1:E7ltexgRDmeJ0fJWv0D/HLwY2xbDdN+uv+X2uZtOx3w= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v2 v2.4.0 h1:1u/K2BFv0MwkG6he8RYuUcbbeK22rkoZbg4lKa/msZU= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v2 v2.4.0/go.mod h1:U5gpsREQZE6SLk1t/cFfc1eMhYAlYpEzvaYXuDfefy8= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4 v4.8.0-beta.1 h1:6RFNcR7iE8Ka8j76gE0a/b28eAX6AZF4zqSw0XnFWbg= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4 v4.8.0-beta.1/go.mod h1:gYq8wyDgv6JLhGbAU6gg8amCPgQWRE+aCvrV2gyzdfs= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0 h1:PTFGRSlMKCQelWwxUyYVEUqseBJVemLyqWJjvMyt0do= github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal/v2 v2.0.0/go.mod h1:LRr2FzBTQlONPPa5HREE5+RjSCTXl7BwOvYOaWTqCaI= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.1.1 h1:7CBQ+Ei8SP2c6ydQTGCCrS35bDxgTMfoP2miAwK++OU= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.1.1/go.mod h1:c/wcGeGx5FUPbM/JltUYHZcKmigwyVLJlDq+4HdtXaw= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/keyvault/armkeyvault v1.4.0 h1:HlZMUZW8S4P9oob1nCHxCCKrytxyLc+24nUJGssoEto= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/keyvault/armkeyvault v1.4.0/go.mod h1:StGsLbuJh06Bd8IBfnAlIFV3fLb+gkczONWf15hpX2E= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/managementgroups/armmanagementgroups v1.0.0 h1:pPvTJ1dY0sA35JOeFq6TsY2xj6Z85Yo23Pj4wCCvu4o= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/managementgroups/armmanagementgroups v1.0.0/go.mod h1:mLfWfj8v3jfWKsL9G4eoBoXVcsqcIUTapmdKy7uGOp0= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v4 v4.3.0 h1:bXwSugBiSbgtz7rOtbfGf+woewp4f06orW9OP5BjHLA= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v4 v4.3.0/go.mod h1:Y/HgrePTmGy9HjdSGTqZNa+apUpTVIEVKXJyARP2lrk= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/privatedns/armprivatedns v1.2.0 h1:9Eih8XcEeQnFD0ntMlUDleKMzfeCeUfa+VbnDCI4AZs= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/privatedns/armprivatedns v1.2.0/go.mod h1:wGPyTi+aURdqPAGMZDQqnNs9IrShADF8w2WZb6bKeq0= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.2.0 h1:Dd+RhdJn0OTtVGaeDLZpcumkIVCtA/3/Fo42+eoYvVM= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.2.0/go.mod h1:5kakwfW5CjC9KK+Q4wjXAg+ShuIm2mBMua0ZFj2C8PE= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.5.0 h1:AifHbc4mg0x9zW52WOpKbsHaDKuRhlI7TVl47thgQ70= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.5.0/go.mod h1:T5RfihdXtBDxt1Ch2wobif3TvzTdumDy29kahv6AV9A= github.com/Azure/go-armbalancer v0.0.2 h1:NVnxsTWHI5/fEzL6k6TjxPUfcB/3Si3+HFOZXOu0QtA= github.com/Azure/go-armbalancer v0.0.2/go.mod h1:yTg7MA/8YnfKQc9o97tzAJ7fbdVkod1xGsIvKmhYPRE= github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs= @@ -104,8 +118,8 @@ github.com/Azure/go-autorest/tracing v0.6.0 h1:TYi4+3m5t6K48TGI9AUdb+IzbnSxvnvUM 
github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU= github.com/Azure/skewer v0.0.14 h1:0mzUJhspECkajYyynYsOCp//E2PSnYXrgP45bcskqfQ= github.com/Azure/skewer v0.0.14/go.mod h1:6WTecuPyfGtuvS8Mh4JYWuHhO4kcWycGfsUBB+XTFG4= -github.com/AzureAD/microsoft-authentication-library-for-go v1.1.1 h1:WpB/QDNLpMw72xHJc34BNNykqSOeEJDAWkhf0u12/Jk= -github.com/AzureAD/microsoft-authentication-library-for-go v1.1.1/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI= +github.com/AzureAD/microsoft-authentication-library-for-go v1.2.0 h1:hVeq+yCyUi+MsoO/CU95yqCIcdzra5ovzk8Q2BBpV2M= +github.com/AzureAD/microsoft-authentication-library-for-go v1.2.0/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/GoogleCloudPlatform/k8s-cloud-provider v1.18.1-0.20220218231025-f11817397a1b h1:Heo1J/ttaQFgGJSVnCZquy3e5eH5j1nqxBuomztB3P0= @@ -194,7 +208,7 @@ github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSV github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= -github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg= github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= @@ -238,8 +252,8 @@ github.com/envoyproxy/protoc-gen-validate v1.0.2 h1:QkIBuU5k+x7/QXPvPPnWXWlCdaBF github.com/envoyproxy/protoc-gen-validate v1.0.2/go.mod h1:GpiZQP3dDbg4JouG/NNS7QWXpgx6x8QiMKdmN72jogE= github.com/euank/go-kmsg-parser v2.0.0+incompatible h1:cHD53+PLQuuQyLZeriD1V/esuG4MuU0Pjs5y6iknohY= github.com/euank/go-kmsg-parser v2.0.0+incompatible/go.mod h1:MhmAMZ8V4CYH4ybgdRwPr2TU5ThnS43puaKEMpja1uw= -github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= -github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= +github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= @@ -259,8 +273,9 @@ github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= github.com/go-logr/logr v0.2.0/go.mod h1:z6/tIYblkpsD+a4lm/fGIIU9mZ+XfAiaFtq7xTgseGU= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY= github.com/go-logr/logr 
v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= +github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.2.3 h1:a9vnzlIBPQBBkeaR9IuMUfmVOrQlkoC4YfPoFkX3T7A= @@ -269,8 +284,9 @@ github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= -github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/swag v0.22.4 h1:QLMzNJnMGPRNDCbySlcj1x01tzU8/9LTTL9hZZZogBU= +github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= @@ -376,14 +392,15 @@ github.com/google/pprof v0.0.0-20210122040257-d980be63207e/go.mod h1:kpwsk12EmLe github.com/google/pprof v0.0.0-20210226084205-cbba55b83ad5/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20210601050228-01bbb1931b22/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20210609004039-a478d1d731e9/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20230602010524-ada837c32108 h1:y+JfwMOPwQwIrnh3TUPwwtOAhONoppkHiSa4sQBoK2k= +github.com/google/pprof v0.0.0-20230602010524-ada837c32108/go.mod h1:Jh3hGz2jkYak8qXPD19ryItVnUgpgeqzdkY/D0EaeuA= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/s2a-go v0.1.7 h1:60BLSyTrOV4/haCDW4zb1guZItoSq8foHCXrAnjBo/o= github.com/google/s2a-go v0.1.7/go.mod h1:50CgR4k1jNlWBu4UfS4AcfhVe1r6pdZPygJ3R8F0Qdw= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4= -github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= +github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy v0.2.3 h1:yk9/cqRKtT9wXZSsRH9aurXEpJX+U6FLtpYTdC3R06k= github.com/googleapis/enterprise-certificate-proxy v0.2.3/go.mod h1:AwSRAtLfXpU5Nm3pW+v7rGDHp09LsPtGY9MduiEsR9k= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= @@ -482,10 +499,10 @@ github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRW github.com/mxk/go-flowrate 
v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= -github.com/onsi/ginkgo/v2 v2.13.0 h1:0jY9lJquiL8fcf3M4LAXN5aMlS/b2BV86HFFPCPMgE4= -github.com/onsi/ginkgo/v2 v2.13.0/go.mod h1:TE309ZR8s5FsKKpuB1YAQYBzCaAfUgatB/xlT/ETL/o= -github.com/onsi/gomega v1.29.0 h1:KIA/t2t5UBzoirT4H9tsML45GEbo3ouUnBHsCfD2tVg= -github.com/onsi/gomega v1.29.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= +github.com/onsi/ginkgo/v2 v2.13.2 h1:Bi2gGVkfn6gQcjNjZJVO8Gf0FHzMPf2phUei9tejVMs= +github.com/onsi/ginkgo/v2 v2.13.2/go.mod h1:XStQ8QcGwLyF4HdfcZB8SFOS/MWCgDuXMSBe6zrvLgM= +github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8= +github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.0.2 h1:9yCKha/T5XdGtO0q9Q9a6T5NUCsTn/DrBg0D7ufOcFM= @@ -552,8 +569,8 @@ github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasO github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= github.com/spf13/cobra v1.0.0/go.mod h1:/6GTrnGXV9HjY+aR4k0oJ5tcvakLuG6EuKReYlHNrgE= -github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I= -github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0= +github.com/spf13/cobra v1.8.0 h1:7aJaZx1B85qltLMc546zn58BxxfZdR/W22ej9CFoEf0= +github.com/spf13/cobra v1.8.0/go.mod h1:WXLWApfZ71AjXPya3WOlMsY9yMs7YeiHhFVlvLyhcho= github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= @@ -629,8 +646,8 @@ go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= go.opentelemetry.io/contrib/instrumentation/github.com/emicklei/go-restful/otelrestful v0.42.0 h1:Z6SbqeRZAl2OczfkFOqLx1BeYBDYehNjEnqluD7581Y= go.opentelemetry.io/contrib/instrumentation/github.com/emicklei/go-restful/otelrestful v0.42.0/go.mod h1:XiglO+8SPMqM3Mqh5/rtxR1VHc63o8tb38QrU6tm4mU= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.42.0 h1:ZOLJc06r4CB42laIXg/7udr0pbZyuAihN10A/XuiQRY= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.42.0/go.mod h1:5z+/ZWJQKXa9YT34fQNx5K8Hd1EoIhvtUygUQPqEOgQ= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.0 h1:PzIubN4/sjByhDRHLviCjJuweBXWFZWhghjg7cS28+M= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.0/go.mod h1:Ct6zzQEuGK3WpJs2n4dn+wfJYzd/+hNnxMRTWjGn30M= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1 h1:aFJWCqJMNjENlcleuuOkGAPH82y0yULBScfXcIEdS24= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1/go.mod h1:sEGXWArGqc3tVa+ekntsN65DmVbVeW+7lTKTjZF3/Fo= go.opentelemetry.io/contrib/propagators/b3 v1.17.0 h1:ImOVvHnku8jijXqkwCSyYKRDt2YrnGXD4BbhcpfbfJo= @@ -691,8 
+708,8 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug= -golang.org/x/exp v0.0.0-20230321023759-10a507213a29/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= +golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= +golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -796,8 +813,8 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= -golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -883,8 +900,8 @@ golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= -golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= +golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -1102,6 +1119,8 @@ gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c 
h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/dnaeon/go-vcr.v3 v3.1.2 h1:F1smfXBqQqwpVifDfUBQG6zzaGjzT+EnVZakrOdr5wA= +gopkg.in/dnaeon/go-vcr.v3 v3.1.2/go.mod h1:2IMOnnlx9I6u9x+YBsM3tAMx6AlOxnJ0pWxQAzZ79Ag= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/gcfg.v1 v1.2.3 h1:m8OOJ4ccYHnx2f4gQwpno8nAX5OGOh7RLaaz0pj3Ogs= gopkg.in/gcfg.v1 v1.2.3/go.mod h1:yesOnuUOFQAhST5vPY4nbZsb/huCgGGXlipJsBn0b3o= @@ -1181,19 +1200,23 @@ k8s.io/legacy-cloud-providers v0.29.5 h1:X4nHog973iRQO8ITxZ75kZtz9dhj0dqgEhCApSa k8s.io/legacy-cloud-providers v0.29.5/go.mod h1:YdZBxeySnjCkLjgDOxBsXlKvqfjNvNpSCIf5fJWH2ic= k8s.io/mount-utils v0.29.5 h1:sY11J+CgXTzC2yWBjv7h+qOPykhMPRgichPCNFThMwk= k8s.io/mount-utils v0.29.5/go.mod h1:SHUMR9n3b6tLgEmlyT36cL6fV6Sjwa5CJhc0guCXvb0= -k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI= -k8s.io/utils v0.0.0-20230726121419-3b25d923346b/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20231127182322-b307cd553661 h1:FepOBzJ0GXm8t0su67ln2wAZjbQ6RxQGZDnzuLcrUTI= +k8s.io/utils v0.0.0-20231127182322-b307cd553661/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.28.0 h1:TgtAeesdhpm2SGwkQasmbeqDo8th5wOBA5h/AjTKA4I= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.28.0/go.mod h1:VHVDI/KrK4fjnV61bE2g3sA7tiETLn8sooImelsCx3Y= -sigs.k8s.io/cloud-provider-azure v1.28.0 h1:LkvvDQ2u0rCr1lhFBoyjvKhYazhpYnAohOqQKN060H8= -sigs.k8s.io/cloud-provider-azure v1.28.0/go.mod h1:ubvg4F58jePO4Z7C4XfgJkFFGpqhVeogpzOdc1X4dyk= +sigs.k8s.io/cloud-provider-azure v1.29.0 h1:lHk6AB+3XfURM7bbR+uABKeRcMC1TYreWA6GM5wUT6g= +sigs.k8s.io/cloud-provider-azure v1.29.0/go.mod h1:0WCrYlWxqk3/AptztkqPk1r9Gr3IULSHat7LipAA1sI= +sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.0.0-20240105075710-c4d4895a970b h1:onCsa2FoC9HGIgW+eQYJI8/IZnefwCcU9rF7ZKtD7f0= +sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.0.0-20240105075710-c4d4895a970b/go.mod h1:seH99Elt7KgWOOonCmzRcB1yLouqK7B7+l8RoSbqaYE= +sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.0.0-20231205023417-1ba5a224ab0e h1:WjkP0sFCicdlRoTUwJZ0Nm72fZApS/vnAePc+Y7R364= +sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.0.0-20231205023417-1ba5a224ab0e/go.mod h1:dDc0Ixf5VI01TkTj83ENW1hH5jImGJsdKhQgFRyQsyA= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc= -sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= -sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= +sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= +sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=
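
For reference on the CSE handling earlier in this diff: the new TestCseErrors test drives a cseErrors helper on the scale set, which inspects a VM's extension instance views for failures reported by the vmssCSE extension. The snippet below is only a rough, standalone sketch of that kind of scan using the same SDK types and the vmssCSEExtensionName constant added in azure_util.go; the function name cseErrorMessages and the exact matching rules are assumptions, not the PR's implementation.

package azure

import (
	"strings"

	"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute"
)

// cseErrorMessages collects the messages of Error-level statuses reported by the
// vmssCSE extension in a VM's instance view and reports whether any were found,
// matching the shape asserted in TestCseErrors (messages plus a failure flag).
// NOTE: sketch only; the scale set's actual cseErrors helper may differ.
func cseErrorMessages(extensions *[]compute.VirtualMachineExtensionInstanceView) ([]string, bool) {
	if extensions == nil {
		return nil, false
	}
	var messages []string
	for _, ext := range *extensions {
		// Only the CSE extension is of interest here.
		if ext.Name == nil || *ext.Name != vmssCSEExtensionName || ext.Statuses == nil {
			continue
		}
		for _, status := range *ext.Statuses {
			if strings.EqualFold(string(status.Level), "Error") && status.Message != nil {
				messages = append(messages, *status.Message)
			}
		}
	}
	return messages, len(messages) > 0
}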