diff --git a/deploy/klusterlet/olm-catalog/0.13.1/manifests/operator.open-cluster-management.io_klusterlets.yaml b/deploy/klusterlet/olm-catalog/0.13.1/manifests/operator.open-cluster-management.io_klusterlets.yaml index 40d81f073..8701897dd 100644 --- a/deploy/klusterlet/olm-catalog/0.13.1/manifests/operator.open-cluster-management.io_klusterlets.yaml +++ b/deploy/klusterlet/olm-catalog/0.13.1/manifests/operator.open-cluster-management.io_klusterlets.yaml @@ -181,6 +181,51 @@ spec: description: RegistrationConfiguration contains the configuration of registration properties: + bootstrapKubeConfigs: + description: "BootstrapKubeConfigs defines the ordered list of + bootstrap kubeconfigs. The order decides which bootstrap kubeconfig + to use first when rebootstrapping. \n When the agent loses the connection + to the current hub for longer than HubConnectionTimeoutSeconds, or the + managedcluster CR is set `hubAcceptsClient=false` on the hub, + the controller marks the related bootstrap kubeconfig as \"failed\". + \n A failed bootstrap kubeconfig won't be used for the duration + specified by SkipFailedBootstrapKubeConfigSeconds. But if the + user updates the content of a failed bootstrap kubeconfig, the + \"failed\" mark will be cleared." + properties: + localSecretsConfig: + description: LocalSecretsConfig includes a list of secrets + that contain the kubeconfigs for the ordered bootstrap kubeconfigs. + The secrets must be in the same namespace where the agent + controller runs. + properties: + hubConnectionTimeoutSeconds: + default: 600 + description: HubConnectionTimeoutSeconds is used to set + the timeout for connecting to the hub cluster. When the agent + loses the connection to the hub for longer than the timeout, + the agent does a rebootstrap. By default it is 10 mins. + format: int32 + minimum: 180 + type: integer + secretNames: + description: SecretNames is a list of secret names. The + secrets are in the same namespace where the agent controller + runs. + items: + type: string + type: array + type: object + type: + default: None + description: Type specifies the type of priority bootstrap + kubeconfigs. By default, it is set to None, meaning that + no priority bootstrap kubeconfigs are set. + enum: + - None + - LocalSecrets + type: string + type: object clientCertExpirationSeconds: description: clientCertExpirationSeconds represents the seconds of a client certificate to expire. If it is not set or 0, the diff --git a/deploy/klusterlet/olm-catalog/latest/manifests/klusterlet.clusterserviceversion.yaml b/deploy/klusterlet/olm-catalog/latest/manifests/klusterlet.clusterserviceversion.yaml index dc7715467..884097e57 100644 --- a/deploy/klusterlet/olm-catalog/latest/manifests/klusterlet.clusterserviceversion.yaml +++ b/deploy/klusterlet/olm-catalog/latest/manifests/klusterlet.clusterserviceversion.yaml @@ -31,7 +31,7 @@ metadata: categories: Integration & Delivery,OpenShift Optional certified: "false" containerImage: quay.io/open-cluster-management/registration-operator:latest - createdAt: "2024-04-10T15:46:14Z" + createdAt: "2024-04-25T09:39:17Z" description: Manages the installation and upgrade of the Klusterlet. 
operators.operatorframework.io/builder: operator-sdk-v1.32.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 diff --git a/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfig.go b/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfig.go new file mode 100644 index 000000000..ef0c6e86b --- /dev/null +++ b/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfig.go @@ -0,0 +1,106 @@ +package bootstrapkubeconfigsmanager + +import ( + "context" + "fmt" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +type boostrapKubeConfigStatus string + +const ( + boostrapKubeConfigStatusInValid boostrapKubeConfigStatus = "InValid" + boostrapKubeConfigStatusValid boostrapKubeConfigStatus = "Valid" ) + +// bootstrapKubeConfig represents a bootstrap kubeconfig that the agent can use to bootstrap a managed cluster. +type boostrapKubeConfig interface { + // Name returns the name of the bootstrap kubeconfig. It helps to identify the bootstrap kubeconfig. + Name() string + + // KubeConfigData returns the kubeconfig data of the bootstrap kubeconfig. + KubeConfigData() ([]byte, error) + + // Status returns the status of the bootstrap kubeconfig. + // A bootstrap kubeconfig has two statuses: Valid and InValid. + Status() (boostrapKubeConfigStatus, error) + + // Fail means that at time t, the bootstrap kubeconfig failed to connect to the hub cluster. + Fail(t time.Time) error +} + +var _ boostrapKubeConfig = &boostrapKubeConfigSecretImpl{} + +const ( + // BootstrapKubeconfigFailedTimeAnnotationKey represents the time when the bootstrap kubeconfig failed + BootstrapKubeconfigFailedTimeAnnotationKey = "agent.open-cluster-management.io/bootstrap-kubeconfig-failed-time" ) + +type boostrapKubeConfigSecretImpl struct { + secretName string + secretNamespace string + skipFailedBootstrapKubeconfigSeconds int32 // once a bootstrap kubeconfig fails, it can't be reused for rebootstrapping until this many seconds have passed (3 mins by default). 
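+	// kubeClient is used to read the bootstrap kubeconfig secret and to record failures on it via the failed-time annotation.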
+ kubeClient kubernetes.Interface +} + +func (b *boostrapKubeConfigSecretImpl) Name() string { + return b.secretName +} + +func (b *boostrapKubeConfigSecretImpl) KubeConfigData() ([]byte, error) { + secret, err := b.kubeClient.CoreV1().Secrets(b.secretNamespace).Get(context.Background(), b.secretName, metav1.GetOptions{}) + if err != nil { + return nil, fmt.Errorf("get the bootstrap kubeconfig secret failed: %v", err) + } + if secret.Data == nil { + return nil, fmt.Errorf("the bootstrap kubeconfig secret %s has no data", b.secretName) + } + kubeconfigData, ok := secret.Data["kubeconfig"] + if !ok { + return nil, fmt.Errorf("the bootstrap kubeconfig secret %s has no kubeconfig data", b.secretName) + } + return kubeconfigData, nil +} + +func (b *boostrapKubeConfigSecretImpl) Status() (boostrapKubeConfigStatus, error) { + secret, err := b.kubeClient.CoreV1().Secrets(b.secretNamespace).Get(context.Background(), b.secretName, metav1.GetOptions{}) + if err != nil { + return boostrapKubeConfigStatusInValid, fmt.Errorf("get the bootstrap kubeconfig secret failed: %v", err) + } + + if secret.Annotations == nil { + return boostrapKubeConfigStatusValid, nil + } + + now := time.Now() + if failedTime, ok := secret.Annotations[BootstrapKubeconfigFailedTimeAnnotationKey]; ok { + failedTimeParsed, err := time.Parse(time.RFC3339, failedTime) + if err != nil { + return boostrapKubeConfigStatusInValid, fmt.Errorf("failed to parse the failed time %s of the secret %s: %v", failedTime, secret.Name, err) + } + if now.Sub(failedTimeParsed).Seconds() < float64(b.skipFailedBootstrapKubeconfigSeconds) { + return boostrapKubeConfigStatusInValid, nil + } + } + return boostrapKubeConfigStatusValid, nil +} + +func (b *boostrapKubeConfigSecretImpl) Fail(t time.Time) error { + secret, err := b.kubeClient.CoreV1().Secrets(b.secretNamespace).Get(context.Background(), b.secretName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("get the bootstrap kubeconfig secret failed: %v", err) + } + secretCopy := secret.DeepCopy() + if secretCopy.Annotations == nil { + secretCopy.Annotations = make(map[string]string) + } + secretCopy.Annotations[BootstrapKubeconfigFailedTimeAnnotationKey] = t.Format(time.RFC3339) + _, err = b.kubeClient.CoreV1().Secrets(b.secretNamespace).Update(context.Background(), secretCopy, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("update the secret %s failed: %v", b.secretName, err) + } + return nil +} diff --git a/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfig_test.go b/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfig_test.go new file mode 100644 index 000000000..77c8efc65 --- /dev/null +++ b/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfig_test.go @@ -0,0 +1,202 @@ +package bootstrapkubeconfigsmanager + +import ( + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + k8stesting "k8s.io/client-go/testing" +) + +func TestBootstrapKubeConfigSecretImpl(t *testing.T) { + testcases := []struct { + name string + mockClient func(failedTime time.Time) *fake.Clientset + validate func(c boostrapKubeConfig) + }{ + { + name: "Valid BootstrapKubeConfig", + mockClient: func(_ time.Time) *fake.Clientset { + // Create a mock secret + mockSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mock-secret", + Namespace: "test-namespace", + }, + Data: map[string][]byte{ + "kubeconfig": 
[]byte("mock-kubeconfig"), + }, + } + + // Create a mock kubeClient + mockKubeClient := &fake.Clientset{} + mockKubeClient.AddReactor("get", "secrets", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + return true, mockSecret, nil + }) + return mockKubeClient + }, + validate: func(c boostrapKubeConfig) { + if c.Name() != "mock-secret" { + t.Errorf("Expected name %v, but got %v", "mock-secret", c.Name()) + } + + kubeConfigData, err := c.KubeConfigData() + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + + if string(kubeConfigData) != "mock-kubeconfig" { + t.Errorf("Expected kubeconfig data %v, but got %v", "mock-kubeconfig", string(kubeConfigData)) + } + + status, err := c.Status() + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + + if status != boostrapKubeConfigStatusValid { + t.Errorf("Expected status %v, but got %v", boostrapKubeConfigStatusValid, status) + } + }, + }, + { + name: "Once failed but now valid BootstrapKubeConfig", + mockClient: func(_ time.Time) *fake.Clientset { + // Create a mock secret + mockSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mock-secret", + Namespace: "test-namespace", + Annotations: map[string]string{ + BootstrapKubeconfigFailedTimeAnnotationKey: "2021-08-01T00:00:00Z", + }, + }, + Data: map[string][]byte{ + "kubeconfig": []byte("mock-kubeconfig"), + }, + } + + // Create a mock kubeClient + mockKubeClient := &fake.Clientset{} + mockKubeClient.AddReactor("get", "secrets", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + return true, mockSecret, nil + }) + return mockKubeClient + }, + validate: func(c boostrapKubeConfig) { + status, err := c.Status() + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + + if status != boostrapKubeConfigStatusValid { + t.Errorf("Expected status %v, but got %v", boostrapKubeConfigStatusValid, status) + } + }, + }, + { + name: "Recently failed and invalid BootstrapKubeConfig", + mockClient: func(failedTime time.Time) *fake.Clientset { + // Create a mock secret + mockSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mock-secret", + Namespace: "test-namespace", + Annotations: map[string]string{ + BootstrapKubeconfigFailedTimeAnnotationKey: failedTime.Format(time.RFC3339), + }, + }, + Data: map[string][]byte{ + "kubeconfig": []byte("mock-kubeconfig"), + }, + } + + // Create a mock kubeClient + mockKubeClient := &fake.Clientset{} + mockKubeClient.AddReactor("get", "secrets", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + return true, mockSecret, nil + }) + return mockKubeClient + }, + validate: func(c boostrapKubeConfig) { + status, err := c.Status() + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + + if status != boostrapKubeConfigStatusInValid { + t.Errorf("Expected status %v, but got %v", boostrapKubeConfigStatusInValid, status) + } + }, + }, + { + name: "Fail a BootstrapKubeConfig", + mockClient: func(_ time.Time) *fake.Clientset { + // Create a mock secret + mockSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mock-secret", + Namespace: "test-namespace", + }, + Data: map[string][]byte{ + "kubeconfig": []byte("mock-kubeconfig"), + }, + } + + // Create a mock kubeClient + mockKubeClient := &fake.Clientset{} + mockKubeClient.AddReactor("get", "secrets", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + return true, mockSecret, nil + }) + mockKubeClient.AddReactor("update", "secrets", func(action 
k8stesting.Action) (handled bool, ret runtime.Object, err error) { + // Get annotation from action + annotations := action.(k8stesting.UpdateAction).GetObject().(*corev1.Secret).Annotations + mockSecret.Annotations = annotations + return true, nil, nil + }) + return mockKubeClient + }, + validate: func(c boostrapKubeConfig) { + status, err := c.Status() + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + + if status != boostrapKubeConfigStatusValid { + t.Errorf("Expected status %v, but got %v", boostrapKubeConfigStatusValid, status) + } + + // Fail the BootstrapKubeConfig + err = c.Fail(time.Now()) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + + status, err = c.Status() + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + + if status != boostrapKubeConfigStatusInValid { + t.Errorf("Expected status %v, but got %v", boostrapKubeConfigStatusInValid, status) + } + }, + }, + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + mockClient := tc.mockClient(time.Now()) + bootstrapKubeConfig := &boostrapKubeConfigSecretImpl{ + kubeClient: mockClient, + secretNamespace: "test-namespace", + secretName: "mock-secret", + skipFailedBootstrapKubeconfigSeconds: 60, + } + tc.validate(bootstrapKubeConfig) + }) + } +} diff --git a/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfiginuse.go b/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfiginuse.go new file mode 100644 index 000000000..a99d52eb6 --- /dev/null +++ b/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfiginuse.go @@ -0,0 +1,60 @@ +package bootstrapkubeconfigsmanager + +import ( + "context" + "fmt" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +// bootstrapKubeConfigInUse is the registration spoke's currently in-use bootstrap kubeconfig. +type bootstrapKubeConfigInUse interface { + // KubeConfigData returns the kubeconfig data of the bootstrap kubeconfig in use. + KubeConfigData() ([]byte, error) + + // Update updates the kubeconfig data of the bootstrap kubeconfig in use. 
+ Update(ctx context.Context, kubeconfigData []byte) error +} + +var _ bootstrapKubeConfigInUse = &bootstrapKubeConfigInUseImpl{} + +type bootstrapKubeConfigInUseImpl struct { + secretName string + secretNamespace string + kubeClient kubernetes.Interface +} + +func (b *bootstrapKubeConfigInUseImpl) KubeConfigData() ([]byte, error) { + secret, err := b.kubeClient.CoreV1().Secrets(b.secretNamespace).Get(context.Background(), b.secretName, metav1.GetOptions{}) + if err != nil { + return nil, fmt.Errorf("get the bootstrap kubeconfig secret failed: %v", err) + } + if secret.Data == nil { + return nil, fmt.Errorf("the bootstrap kubeconfig secret %s has no data", b.secretName) + } + kubeconfigData, ok := secret.Data["kubeconfig"] + if !ok { + return nil, fmt.Errorf("the bootstrap kubeconfig secret %s has no kubeconfig data", b.secretName) + } + return kubeconfigData, nil +} + +func (b *bootstrapKubeConfigInUseImpl) Update(ctx context.Context, kubeconfigData []byte) error { + // Get the in-use bootstrapkubeconfig secret + inUse, err := b.kubeClient.CoreV1().Secrets(b.secretNamespace).Get(ctx, b.secretName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("get the current bootstrap kubeconfig secret failed: %v", err) + } + + // Update the in-use bootstrapkubeconfig secret; guard against a nil data map before writing the key. + inUseCopy := inUse.DeepCopy() + if inUseCopy.Data == nil { + inUseCopy.Data = map[string][]byte{} + } + inUseCopy.Data["kubeconfig"] = kubeconfigData + + _, err = b.kubeClient.CoreV1().Secrets(b.secretNamespace).Update(ctx, inUseCopy, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("update the current bootstrap kubeconfig secret failed: %v", err) + } + return nil +} diff --git a/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfiginuse_test.go b/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfiginuse_test.go new file mode 100644 index 000000000..f1ba370ee --- /dev/null +++ b/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfiginuse_test.go @@ -0,0 +1,83 @@ +package bootstrapkubeconfigsmanager + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + k8stesting "k8s.io/client-go/testing" +) + +func TestBootstrapKubeConfigInUseImpl(t *testing.T) { + testcases := []struct { + name string + mockClient func() *fake.Clientset + validate func(c bootstrapKubeConfigInUse) + }{ + { + name: "Update KubeConfig Data", + mockClient: func() *fake.Clientset { + // Create a mock secret + mockSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mock-secret", + Namespace: "test-namespace", + }, + Data: map[string][]byte{ + "kubeconfig": []byte("mock-kubeconfig"), + }, + } + + // Create a mock kubeClient + mockKubeClient := &fake.Clientset{} + mockKubeClient.AddReactor("get", "secrets", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + return true, mockSecret, nil + }) + mockKubeClient.AddReactor("update", "secrets", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + // Get data from action + data := action.(k8stesting.UpdateAction).GetObject().(*corev1.Secret).Data + mockSecret.Data = data + return true, nil, nil + }) + return mockKubeClient + }, + validate: func(c bootstrapKubeConfigInUse) { + kubeConfigData, err := c.KubeConfigData() + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if string(kubeConfigData) != "mock-kubeconfig" { + t.Errorf("Expected kubeconfig data %v, but got %v", "mock-kubeconfig", 
string(kubeConfigData)) + } + + // Update kubeconfig data + err = c.Update(context.TODO(), []byte("new-kubeconfig")) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + + kubeConfigData, err = c.KubeConfigData() + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if string(kubeConfigData) != "new-kubeconfig" { + t.Errorf("Expected kubeconfig data %v, but got %v", "new-kubeconfig", string(kubeConfigData)) + } + }, + }, + } + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + mockClient := tc.mockClient() + bootstrapKubeConfig := &bootstrapKubeConfigInUseImpl{ + kubeClient: mockClient, + secretNamespace: "test-namespace", + secretName: "mock-secret", + } + tc.validate(bootstrapKubeConfig) + }) + } +} diff --git a/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfigsmanager.go b/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfigsmanager.go new file mode 100644 index 000000000..3fb187dec --- /dev/null +++ b/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfigsmanager.go @@ -0,0 +1,102 @@ +package bootstrapkubeconfigsmanager + +import ( + "bytes" + "context" + "fmt" + "time" + + "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" +) + +// bootstrapkubeconfigsManager manages multiple bootstrapkubeconfigs. +// The only public method is ReSelect. +// ReSelect does 2 things: +// 1. Calls the Fail method of the bootstrapkubeconfig that has the same kubeconfig data as the current bootstrapkubeconfig in use. +// 2. Selects the first valid bootstrapkubeconfig from the list and copies it into the current bootstrapkubeconfig in use. +type bootstrapkubeconfigsManager struct { + bootstrapKubeConfigInUse bootstrapKubeConfigInUse + bootstrapKubeConfigs []boostrapKubeConfig +} + +func NewBoostrapkubeconfigsManager(spokeNamespace string, + bootstrapKubeconfigSecretInUse string, + bootstrapKubeconfigSecrets []string, + kubeClient kubernetes.Interface) *bootstrapkubeconfigsManager { + var bootstrapKubeConfigs []boostrapKubeConfig + for _, secretName := range bootstrapKubeconfigSecrets { + bootstrapKubeConfigs = append(bootstrapKubeConfigs, &boostrapKubeConfigSecretImpl{ + secretName: secretName, + secretNamespace: spokeNamespace, + skipFailedBootstrapKubeconfigSeconds: 180, // 3 mins + kubeClient: kubeClient, + }) + } + return &bootstrapkubeconfigsManager{ + bootstrapKubeConfigInUse: &bootstrapKubeConfigInUseImpl{ + secretName: bootstrapKubeconfigSecretInUse, + secretNamespace: spokeNamespace, + kubeClient: kubeClient, + }, + bootstrapKubeConfigs: bootstrapKubeConfigs, + } +} + +// ReSelect is the only public method of a bootstrapkubeconfigsManager +func (m *bootstrapkubeconfigsManager) ReSelect(ctx context.Context) error { + now := time.Now() + klog.Info("ReSelect Start") + + bootstrapKubeConfigInUse := m.bootstrapKubeConfigInUse + kubeconfigDataInUse, err := bootstrapKubeConfigInUse.KubeConfigData() + if err != nil { + return fmt.Errorf("get the current bootstrap kubeconfig failed: %v", err) + } + + // If a candidate's kubeconfig data equals the kubeconfig data in use, mark that candidate as failed. 
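+	// Note: candidates are matched against the in-use kubeconfig by content rather than by secret name,
+	// because the in-use secret holds a copy of one of the candidates' data.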
+ bootstrapKubeConfigs := m.bootstrapKubeConfigs + for _, bootstrapKubeConfig := range bootstrapKubeConfigs { + kubeconfigData, err := bootstrapKubeConfig.KubeConfigData() + if err != nil { + klog.Errorf("get the kubeconfig data of the secret %s failed: %v", bootstrapKubeConfig.Name(), err) + continue + } + + if bytes.Equal(kubeconfigData, kubeconfigDataInUse) { + err := bootstrapKubeConfig.Fail(now) + if err != nil { + klog.Errorf("fail the bootstrapKubeConfig %s failed: %v", bootstrapKubeConfig.Name(), err) + } + } + } + + // Select a new valid bootstrapkubeconfig and update it to the current bootstrapkubeconfig in use. + for _, bootstrapKubeConfig := range bootstrapKubeConfigs { + status, err := bootstrapKubeConfig.Status() + if err != nil { + klog.Errorf("get the status of the secret %s failed: %v", bootstrapKubeConfig.Name(), err) + continue + } + if status == boostrapKubeConfigStatusInValid { + klog.Errorf("bootstrap kubeconfig %s is invalid", bootstrapKubeConfig.Name()) + continue + } + if status == boostrapKubeConfigStatusValid { + kubeconfigData, err := bootstrapKubeConfig.KubeConfigData() + if err != nil { + klog.Errorf("get the kubeconfig data of the bootstrapKubeConfig %s failed: %v", bootstrapKubeConfig.Name(), err) + continue + } + err = bootstrapKubeConfigInUse.Update(ctx, kubeconfigData) + if err != nil { + klog.Errorf("update the current bootstrap kubeconfig failed: %v", err) + continue + } + break + } + } + + klog.Info("ReSelect End") + return nil +} diff --git a/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfigsmanager_test.go b/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfigsmanager_test.go new file mode 100644 index 000000000..90bc3326f --- /dev/null +++ b/pkg/registration/spoke/bootstrapkubeconfigsmanager/bootstrapkubeconfigsmanager_test.go @@ -0,0 +1,85 @@ +package bootstrapkubeconfigsmanager + +import ( + "context" + "testing" + "time" +) + +func TestBootstrapKubeConfigManager(t *testing.T) { + var err error + manager := &bootstrapkubeconfigsManager{ + bootstrapKubeConfigInUse: &fakeBootstrapKubeConfigInUse{ + kubeconfig: []byte("hub1"), + }, + bootstrapKubeConfigs: []boostrapKubeConfig{ + &fakeBootstrapKubeConfig{ + name: "hub1", + kubeconfigData: []byte("hub1"), + status: boostrapKubeConfigStatusValid, + }, + &fakeBootstrapKubeConfig{ + name: "hub2", + kubeconfigData: []byte("hub2"), + status: boostrapKubeConfigStatusInValid, + }, + &fakeBootstrapKubeConfig{ + name: "hub3", + kubeconfigData: []byte("hub3"), + status: boostrapKubeConfigStatusValid, + }, + }, + } + + err = manager.ReSelect(context.TODO()) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + + // the status of hub1 should change to invalid + if status, _ := manager.bootstrapKubeConfigs[0].Status(); status != boostrapKubeConfigStatusInValid { + t.Errorf("Expected status %v, but got %v", boostrapKubeConfigStatusInValid, status) + } + + // the kubeconfigData of bootstrapKubeConfigInUse should change to hub3 + kubeconfigData, _ := manager.bootstrapKubeConfigInUse.KubeConfigData() + if string(kubeconfigData) != "hub3" { + t.Errorf("Expected kubeconfig data %v, but got %v", "hub3", string(kubeconfigData)) + } +} + +type fakeBootstrapKubeConfigInUse struct { + kubeconfig []byte +} + +func (f *fakeBootstrapKubeConfigInUse) KubeConfigData() ([]byte, error) { + return f.kubeconfig, nil +} + +func (f *fakeBootstrapKubeConfigInUse) Update(ctx context.Context, kubeconfigData []byte) error { + f.kubeconfig = kubeconfigData + return nil +} + +type 
fakeBootstrapKubeConfig struct { + name string + kubeconfigData []byte + status boostrapKubeConfigStatus +} + +func (f *fakeBootstrapKubeConfig) Name() string { + return f.name +} + +func (f *fakeBootstrapKubeConfig) KubeConfigData() ([]byte, error) { + return f.kubeconfigData, nil +} + +func (f *fakeBootstrapKubeConfig) Status() (boostrapKubeConfigStatus, error) { + return f.status, nil +} + +func (f *fakeBootstrapKubeConfig) Fail(failedTime time.Time) error { + f.status = boostrapKubeConfigStatusInValid + return nil +} diff --git a/pkg/registration/spoke/lease/lease_controller.go b/pkg/registration/spoke/lease/lease_controller.go index ef7ab9f29..20c3306b4 100644 --- a/pkg/registration/spoke/lease/lease_controller.go +++ b/pkg/registration/spoke/lease/lease_controller.go @@ -26,7 +26,7 @@ type managedClusterLeaseController struct { clusterName string hubClusterLister clusterv1listers.ManagedClusterLister lastLeaseDurationSeconds int32 - leaseUpdater leaseUpdaterInterface + leaseUpdater leaseUpdater } // NewManagedClusterLeaseController creates a new managed cluster lease controller on the managed cluster. @@ -34,15 +34,19 @@ func NewManagedClusterLeaseController( clusterName string, hubClient clientset.Interface, hubClusterInformer clusterv1informer.ManagedClusterInformer, + timeoutSeconds int32, + handleTimeout func(ctx context.Context) error, recorder events.Recorder) factory.Controller { c := &managedClusterLeaseController{ clusterName: clusterName, hubClusterLister: hubClusterInformer.Lister(), - leaseUpdater: &leaseUpdater{ - hubClient: hubClient, - clusterName: clusterName, - leaseName: "managed-cluster-lease", - recorder: recorder, + leaseUpdater: &leaseUpdaterImpl{ + hubClient: hubClient, + clusterName: clusterName, + leaseName: "managed-cluster-lease", + recorder: recorder, + timeoutSeconds: timeoutSeconds, + handleTimeout: handleTimeout, }, } @@ -85,23 +89,27 @@ func (c *managedClusterLeaseController) sync(ctx context.Context, syncCtx factor return nil } -type leaseUpdaterInterface interface { +type leaseUpdater interface { start(ctx context.Context, leaseDuration time.Duration) stop() } -// leaseUpdater periodically updates the lease of a managed cluster -type leaseUpdater struct { +// leaseUpdaterImpl periodically updates the lease of a managed cluster +type leaseUpdaterImpl struct { hubClient clientset.Interface clusterName string leaseName string lock sync.Mutex cancel context.CancelFunc recorder events.Recorder + + lastSuccuessUpdateTime time.Time + timeoutSeconds int32 + handleTimeout func(ctx context.Context) error } // start a lease update routine to update the lease of a managed cluster periodically. -func (u *leaseUpdater) start(ctx context.Context, leaseDuration time.Duration) { +func (u *leaseUpdaterImpl) start(ctx context.Context, leaseDuration time.Duration) { u.lock.Lock() defer u.lock.Unlock() @@ -116,7 +124,7 @@ func (u *leaseUpdater) start(ctx context.Context, leaseDuration time.Duration) { } // stop the lease update routine. -func (u *leaseUpdater) stop() { +func (u *leaseUpdaterImpl) stop() { u.lock.Lock() defer u.lock.Unlock() @@ -129,15 +137,48 @@ func (u *leaseUpdater) stop() { } // update the lease of a given managed cluster. -func (u *leaseUpdater) update(ctx context.Context) { +func (u *leaseUpdaterImpl) update(ctx context.Context) { + u.lock.Lock() + defer u.lock.Unlock() + + now := time.Now() + + handleTimeoutIfExpired := func(now time.Time) { + // if handleTimeout is nil, do nothing. 
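+		// handleTimeout is optional; callers that do not need the rebootstrap behavior can pass nil.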
+ if u.handleTimeout == nil { + return + } + // if now is after lastSuccuessUpdateTime + timeoutSeconds, log a timeout event and call the handleTimeout function. + if !u.lastSuccuessUpdateTime.IsZero() && now.After(u.lastSuccuessUpdateTime.Add(time.Duration(u.timeoutSeconds)*time.Second)) { + u.recorder.Eventf("ManagedClusterLeaseUpdateTimeout", "Timed out updating lease %q on cluster %q", u.leaseName, u.clusterName) + utilruntime.HandleError(fmt.Errorf("timeout updating cluster lease %q on hub cluster", u.leaseName)) + if err := u.handleTimeout(ctx); err != nil { + utilruntime.HandleError(fmt.Errorf("unable to handle timeout of cluster lease %q on hub cluster: %w", u.leaseName, err)) + } + return + } + } + + handleErrorAndTimeout := func(err error, now time.Time) { + utilruntime.HandleError(fmt.Errorf("unable to operate cluster lease %q on hub cluster: %w", u.leaseName, err)) + handleTimeoutIfExpired(now) + } + + // Get lease on the hub. lease, err := u.hubClient.CoordinationV1().Leases(u.clusterName).Get(ctx, u.leaseName, metav1.GetOptions{}) if err != nil { - utilruntime.HandleError(fmt.Errorf("unable to get cluster lease %q on hub cluster: %w", u.leaseName, err)) + handleErrorAndTimeout(fmt.Errorf("get lease err: %w", err), now) return } - lease.Spec.RenewTime = &metav1.MicroTime{Time: time.Now()} + // Update the lease RenewTime. + lease.Spec.RenewTime = &metav1.MicroTime{Time: now} if _, err = u.hubClient.CoordinationV1().Leases(u.clusterName).Update(ctx, lease, metav1.UpdateOptions{}); err != nil { - utilruntime.HandleError(fmt.Errorf("unable to update cluster lease %q on hub cluster: %w", u.leaseName, err)) + handleErrorAndTimeout(fmt.Errorf("update lease err: %w", err), now) + return } + + // Update the lastSuccuessUpdateTime. + u.lastSuccuessUpdateTime = now } diff --git a/pkg/registration/spoke/lease/lease_controller_test.go b/pkg/registration/spoke/lease/lease_controller_test.go index 0c555de64..69265bb62 100644 --- a/pkg/registration/spoke/lease/lease_controller_test.go +++ b/pkg/registration/spoke/lease/lease_controller_test.go @@ -130,14 +130,16 @@ func (f *fakeLeaseUpdater) stop() { f.stopCalled = true } -func TestLeaseUpdater(t *testing.T) { +func TestLeaseUpdater_UpdateSuccess(t *testing.T) { initRenewTime := time.Now() hubClient := kubefake.NewSimpleClientset(testinghelpers.NewManagedClusterLease("managed-cluster-lease", initRenewTime)) - leaseUpdater := &leaseUpdater{ - hubClient: hubClient, - clusterName: testinghelpers.TestManagedClusterName, - leaseName: "managed-cluster-lease", - recorder: eventstesting.NewTestingEventRecorder(t), + leaseUpdater := &leaseUpdaterImpl{ + hubClient: hubClient, + clusterName: testinghelpers.TestManagedClusterName, + leaseName: "managed-cluster-lease", + recorder: eventstesting.NewTestingEventRecorder(t), + timeoutSeconds: 3, + lastSuccuessUpdateTime: initRenewTime, } // start the updater @@ -145,6 +147,7 @@ func TestLeaseUpdater(t *testing.T) { leaseUpdater.start(ctx, time.Second*1) // wait for 3 second, the all actions should be in get,update pairs + // and the lastSuccuessUpdateTime should be updated time.Sleep(time.Second * 3) actions := hubClient.Actions() if len(actions) == 0 { @@ -159,6 +162,9 @@ func TestLeaseUpdater(t *testing.T) { t.Errorf("expect update action, but got %s", actions[i+1].GetVerb()) } } + if !leaseUpdater.lastSuccuessUpdateTime.After(initRenewTime) { + t.Errorf("expect lastSuccuessUpdateTime after %v, but got %v", initRenewTime, leaseUpdater.lastSuccuessUpdateTime) + } // stop the updater leaseUpdater.stop() @@ 
-171,3 +177,80 @@ t.Errorf("expect %d actions, but got %d", actionLen, len(actions)) } } + +func TestLeaseUpdater_TimeoutWithoutHandle(t *testing.T) { + initRenewTime := time.Now() + hubClient := kubefake.NewSimpleClientset() + leaseUpdater := &leaseUpdaterImpl{ + hubClient: hubClient, + clusterName: testinghelpers.TestManagedClusterName, + leaseName: "managed-cluster-lease", + recorder: eventstesting.NewTestingEventRecorder(t), + timeoutSeconds: 3, + lastSuccuessUpdateTime: initRenewTime, + } + + // start the updater + ctx := context.Background() + leaseUpdater.start(ctx, time.Second*1) + + // wait for 4 seconds, there should only be get actions + time.Sleep(time.Second * 4) + actions := hubClient.Actions() + if len(actions) == 0 { + t.Error("expect at least 1 action, but got 0") + return + } + for _, action := range actions { + if action.GetVerb() != "get" { + t.Errorf("expect get action, but got %s", action.GetVerb()) + } + } + + if !leaseUpdater.lastSuccuessUpdateTime.Equal(initRenewTime) { + t.Errorf("expect lastSuccuessUpdateTime equal to %v, but got %v", initRenewTime, leaseUpdater.lastSuccuessUpdateTime) + } +} + +func TestLeaseUpdater_TimeoutWithHandle(t *testing.T) { + initRenewTime := time.Now() + hubClient := kubefake.NewSimpleClientset() + timeout := false + leaseUpdater := &leaseUpdaterImpl{ + hubClient: hubClient, + clusterName: testinghelpers.TestManagedClusterName, + leaseName: "managed-cluster-lease", + recorder: eventstesting.NewTestingEventRecorder(t), + timeoutSeconds: 3, + lastSuccuessUpdateTime: initRenewTime, + handleTimeout: func(ctx context.Context) error { + timeout = true + return nil + }, + } + + // start the updater + ctx := context.Background() + leaseUpdater.start(ctx, time.Second*1) + + // wait for 4 seconds, there should only be get actions + time.Sleep(time.Second * 4) + actions := hubClient.Actions() + if len(actions) == 0 { + t.Error("expect at least 1 action, but got 0") + return + } + for _, action := range actions { + if action.GetVerb() != "get" { + t.Errorf("expect get action, but got %s", action.GetVerb()) + } + } + + if !leaseUpdater.lastSuccuessUpdateTime.Equal(initRenewTime) { + t.Errorf("expect lastSuccuessUpdateTime equal to %v, but got %v", initRenewTime, leaseUpdater.lastSuccuessUpdateTime) + } + + if !timeout { + t.Error("expect timeout to be true, but got false") + } +} diff --git a/pkg/registration/spoke/options.go b/pkg/registration/spoke/options.go index 4b65a4af2..90620d9a0 100644 --- a/pkg/registration/spoke/options.go +++ b/pkg/registration/spoke/options.go @@ -15,8 +15,16 @@ var ClientCertHealthCheckInterval = 30 * time.Second // SpokeAgentOptions holds configuration for spoke cluster agent type SpokeAgentOptions struct { - BootstrapKubeconfig string - BootstrapKubeconfigSecret string + // The differences among BootstrapKubeconfig, BootstrapKubeconfigSecret, and BootstrapKubeconfigSecrets are: + // 1. BootstrapKubeconfig is a file path; the controller uses it to build the client. + // 2. BootstrapKubeconfigSecret is the secret storing that kubeconfig; an event handler watches it, and if the secret changes, the agent rebootstraps. + // 3. BootstrapKubeconfigSecrets is a list of secrets; when rebootstrapping, the controller chooses one from the list. 
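+	// During a rebootstrap, the manager marks the failed entry in BootstrapKubeconfigSecrets and copies the next valid one into BootstrapKubeconfigSecret.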
+ BootstrapKubeconfig string + BootstrapKubeconfigSecret string + BootstrapKubeconfigSecrets []string + + HubConnectionTimeoutSeconds int32 + HubKubeconfigSecret string SpokeExternalServerURLs []string ClusterHealthCheckPeriod time.Duration @@ -53,6 +61,10 @@ func (o *SpokeAgentOptions) AddFlags(fs *pflag.FlagSet) { "The path of the kubeconfig file for agent bootstrap.") fs.StringVar(&o.BootstrapKubeconfigSecret, "bootstrap-kubeconfig-secret", o.BootstrapKubeconfigSecret, "The name of secret in component namespace storing kubeconfig for agent bootstrap.") + fs.StringArrayVar(&o.BootstrapKubeconfigSecrets, "bootstrap-kubeconfig-secrets", o.BootstrapKubeconfigSecrets, + "The names of secrets in the component namespace storing bootstrap kubeconfigs for agent bootstrap.") + fs.Int32Var(&o.HubConnectionTimeoutSeconds, "hub-connection-timeout-seconds", o.HubConnectionTimeoutSeconds, + "The timeout in seconds for connecting to the hub cluster.") fs.StringVar(&o.HubKubeconfigSecret, "hub-kubeconfig-secret", o.HubKubeconfigSecret, "The name of secret in component namespace storing kubeconfig for hub.") fs.StringArrayVar(&o.SpokeExternalServerURLs, "spoke-external-server-urls", o.SpokeExternalServerURLs, diff --git a/pkg/registration/spoke/registration/hub_accept_controller.go b/pkg/registration/spoke/registration/hub_accept_controller.go new file mode 100644 index 000000000..d75803643 --- /dev/null +++ b/pkg/registration/spoke/registration/hub_accept_controller.go @@ -0,0 +1,46 @@ +package registration + +import ( + "context" + + "github.com/openshift/library-go/pkg/controller/factory" + "github.com/openshift/library-go/pkg/operator/events" + + clusterv1informer "open-cluster-management.io/api/client/cluster/informers/externalversions/cluster/v1" + clusterv1listers "open-cluster-management.io/api/client/cluster/listers/cluster/v1" +) + +// hubAcceptController watches the ManagedCluster CR on the hub after the spoke bootstrap is done. +// If managedCluster.Spec.HubAcceptsClient is false, it marks the connection as failed. +type hubAcceptController struct { + clusterName string + hubClusterLister clusterv1listers.ManagedClusterLister + handleAcceptFalse func(ctx context.Context) error +} + +func NewHubAcceptController(clusterName string, hubClusterInformer clusterv1informer.ManagedClusterInformer, + handleAcceptFalse func(ctx context.Context) error, recorder events.Recorder) factory.Controller { + c := &hubAcceptController{ + clusterName: clusterName, + hubClusterLister: hubClusterInformer.Lister(), + handleAcceptFalse: handleAcceptFalse, + } + return factory.New(). + WithInformers(hubClusterInformer.Informer()). + WithSync(c.sync). 
+ ToController("HubAcceptController", recorder) +} + +func (c *hubAcceptController) sync(ctx context.Context, _ factory.SyncContext) error { + cluster, err := c.hubClusterLister.Get(c.clusterName) + if err != nil { + return err + } + if cluster.Spec.HubAcceptsClient { + return nil + } + if c.handleAcceptFalse == nil { + return nil + } + return c.handleAcceptFalse(ctx) +} diff --git a/pkg/registration/spoke/registration/hub_accept_controller_test.go b/pkg/registration/spoke/registration/hub_accept_controller_test.go new file mode 100644 index 000000000..53533d1ee --- /dev/null +++ b/pkg/registration/spoke/registration/hub_accept_controller_test.go @@ -0,0 +1,71 @@ +package registration + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + clusterv1lister "open-cluster-management.io/api/client/cluster/listers/cluster/v1" + clusterv1 "open-cluster-management.io/api/cluster/v1" +) + +func TestHubAcceptController_sync(t *testing.T) { + var err error + clusterName := "testCluster" + handled := false + mockHubClusterLister := &MockManagedClusterLister{} + hacontroller := &hubAcceptController{ + clusterName: clusterName, + hubClusterLister: mockHubClusterLister, + handleAcceptFalse: func(ctx context.Context) error { + handled = true + return nil + }, + } + + // Create a mock hub cluster with HubAcceptsClient set to true + mockHubClusterLister.mockHubCluster = &clusterv1.ManagedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + }, + Spec: clusterv1.ManagedClusterSpec{ + HubAcceptsClient: true, + }, + } + + // Call the sync method + err = hacontroller.sync(context.TODO(), nil) + assert.NoError(t, err, "Expected no error") + + // Expect handled to be false + assert.False(t, handled, "Expected handled to be false") + + // Create a mock hub cluster with HubAcceptsClient set to false + mockHubClusterLister.mockHubCluster = &clusterv1.ManagedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + }, + Spec: clusterv1.ManagedClusterSpec{ + HubAcceptsClient: false, + }, + } + + // Call the sync method again + err = hacontroller.sync(context.TODO(), nil) + assert.NoError(t, err, "Expected no error") + + // Expect handled to be true + assert.True(t, handled, "Expected handled to be true") +} + +type MockManagedClusterLister struct { + clusterv1lister.ManagedClusterLister + mockHubCluster *clusterv1.ManagedCluster +} + +func (m *MockManagedClusterLister) Get(name string) (*clusterv1.ManagedCluster, error) { + // Return the mock ManagedCluster configured by the test case. + return m.mockHubCluster, nil +} diff --git a/pkg/registration/spoke/spokeagent.go b/pkg/registration/spoke/spokeagent.go index 6aad644c2..6478878cf 100644 --- a/pkg/registration/spoke/spokeagent.go +++ b/pkg/registration/spoke/spokeagent.go @@ -35,6 +35,8 @@ import ( "open-cluster-management.io/ocm/pkg/registration/spoke/lease" "open-cluster-management.io/ocm/pkg/registration/spoke/managedcluster" "open-cluster-management.io/ocm/pkg/registration/spoke/registration" + + bootstrapkubeconfigsmanager "open-cluster-management.io/ocm/pkg/registration/spoke/bootstrapkubeconfigsmanager" ) // AddOnLeaseControllerSyncInterval is exposed so that integration tests can crank up the controller sync speed. 
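Taken together, the new options give the agent an ordered set of bootstrap secrets plus a connection timeout. A hypothetical agent invocation wiring them up (binary and secret names here are illustrative, not taken from this PR) might look like:

    registration-agent \
      --bootstrap-kubeconfig-secret=bootstrap-hub-kubeconfig \
      --bootstrap-kubeconfig-secrets=bootstrap-hub1 \
      --bootstrap-kubeconfig-secrets=bootstrap-hub2 \
      --hub-connection-timeout-seconds=600

Each occurrence of --bootstrap-kubeconfig-secrets appends one candidate, since the flag is registered with StringArrayVar; the order given on the command line is the reselection order.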
@@ -135,6 +137,13 @@ func (o *SpokeAgentConfig) RunSpokeAgentWithSpokeInformers(ctx context.Context, return err } + bootstrapKubeconfigManager := bootstrapkubeconfigsmanager.NewBoostrapkubeconfigsManager( + o.agentOptions.ComponentNamespace, + o.registrationOption.BootstrapKubeconfigSecret, // the in-use bootstrap kubeconfig secret + o.registrationOption.BootstrapKubeconfigSecrets, + managementKubeClient, + ) + // dump data in hub kubeconfig secret into file system if it exists err = registration.DumpSecret( managementKubeClient.CoreV1(), o.agentOptions.ComponentNamespace, o.registrationOption.HubKubeconfigSecret, @@ -368,6 +377,11 @@ func (o *SpokeAgentConfig) RunSpokeAgentWithSpokeInformers(ctx context.Context, o.agentOptions.SpokeClusterName, hubKubeClient, hubClusterInformerFactory.Cluster().V1().ManagedClusters(), + o.registrationOption.HubConnectionTimeoutSeconds, + func(ctx context.Context) error { + klog.Info("Failed to connect to the hub because the lease is out of date, reselecting a new bootstrap kubeconfig") + return bootstrapKubeconfigManager.ReSelect(ctx) + }, recorder, ) @@ -411,6 +425,16 @@ func (o *SpokeAgentConfig) RunSpokeAgentWithSpokeInformers(ctx context.Context, ) } + hubAcceptController := registration.NewHubAcceptController( + o.agentOptions.SpokeClusterName, + hubClusterInformerFactory.Cluster().V1().ManagedClusters(), + func(ctx context.Context) error { + klog.Info("Failed to connect to the hub because hubAcceptsClient is set to false, reselecting a new bootstrap kubeconfig") + return bootstrapKubeconfigManager.ReSelect(ctx) + }, + recorder, + ) + go hubKubeInformerFactory.Start(ctx.Done()) go hubClusterInformerFactory.Start(ctx.Done()) go namespacedManagementKubeInformerFactory.Start(ctx.Done()) @@ -429,6 +453,8 @@ func (o *SpokeAgentConfig) RunSpokeAgentWithSpokeInformers(ctx context.Context, go addOnRegistrationController.Run(ctx, 1) } + go hubAcceptController.Run(ctx, 1) + // start health checking of hub client certificate if o.registrationOption.clientCertHealthChecker != nil { tlsCertFile := path.Join(o.agentOptions.HubKubeconfigDir, clientcert.TLSCertFile)
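For reference, a minimal sketch of how the new registration configuration might be set on a Klusterlet CR, following the CRD schema added above (cluster and secret names are illustrative):

    apiVersion: operator.open-cluster-management.io/v1
    kind: Klusterlet
    metadata:
      name: klusterlet
    spec:
      registrationConfiguration:
        bootstrapKubeConfigs:
          type: LocalSecrets
          localSecretsConfig:
            hubConnectionTimeoutSeconds: 600
            # Ordered candidates; each secret must live in the agent's namespace
            # and carry the kubeconfig under the "kubeconfig" key.
            secretNames:
              - bootstrap-hub1-kubeconfig
              - bootstrap-hub2-kubeconfig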