diff --git a/controllers/hostdevicenetwork_controller.go b/controllers/hostdevicenetwork_controller.go index ecc56eaf..04485408 100644 --- a/controllers/hostdevicenetwork_controller.go +++ b/controllers/hostdevicenetwork_controller.go @@ -19,6 +19,7 @@ package controllers //nolint:dupl import ( "context" + "fmt" "time" "github.com/go-logr/logr" @@ -43,7 +44,8 @@ import ( // HostDeviceNetworkReconciler reconciles a HostDeviceNetwork object type HostDeviceNetworkReconciler struct { client.Client - Scheme *runtime.Scheme + Scheme *runtime.Scheme + MigrationCh chan struct{} stateManager state.Manager } @@ -59,6 +61,12 @@ type HostDeviceNetworkReconciler struct { // //nolint:dupl func (r *HostDeviceNetworkReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // Wait for migration flow to finish + select { + case <-r.MigrationCh: + case <-ctx.Done(): + return ctrl.Result{}, fmt.Errorf("canceled") + } reqLogger := log.FromContext(ctx) reqLogger.Info("Reconciling HostDeviceNetwork") diff --git a/controllers/ipoibnetwork_controller.go b/controllers/ipoibnetwork_controller.go index 7e98c2d8..7c64226d 100644 --- a/controllers/ipoibnetwork_controller.go +++ b/controllers/ipoibnetwork_controller.go @@ -18,6 +18,7 @@ package controllers //nolint:dupl import ( "context" + "fmt" "time" "github.com/go-logr/logr" @@ -41,7 +42,8 @@ import ( // IPoIBNetworkReconciler reconciles a IPoIBNetwork object type IPoIBNetworkReconciler struct { client.Client - Scheme *runtime.Scheme + Scheme *runtime.Scheme + MigrationCh chan struct{} stateManager state.Manager } @@ -57,6 +59,12 @@ type IPoIBNetworkReconciler struct { // //nolint:dupl func (r *IPoIBNetworkReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // Wait for migration flow to finish + select { + case <-r.MigrationCh: + case <-ctx.Done(): + return ctrl.Result{}, fmt.Errorf("canceled") + } reqLogger := log.FromContext(ctx) reqLogger.Info("Reconciling IPoIBNetwork") diff --git a/controllers/macvlannetwork_controller.go b/controllers/macvlannetwork_controller.go index e5c7c767..391e86c7 100644 --- a/controllers/macvlannetwork_controller.go +++ b/controllers/macvlannetwork_controller.go @@ -18,6 +18,7 @@ package controllers //nolint:dupl import ( "context" + "fmt" "time" "github.com/go-logr/logr" @@ -41,8 +42,9 @@ import ( // MacvlanNetworkReconciler reconciles a MacvlanNetwork object type MacvlanNetworkReconciler struct { client.Client - Log logr.Logger - Scheme *runtime.Scheme + Log logr.Logger + Scheme *runtime.Scheme + MigrationCh chan struct{} stateManager state.Manager } @@ -58,6 +60,12 @@ type MacvlanNetworkReconciler struct { // //nolint:dupl func (r *MacvlanNetworkReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // Wait for migration flow to finish + select { + case <-r.MigrationCh: + case <-ctx.Done(): + return ctrl.Result{}, fmt.Errorf("canceled") + } reqLogger := log.FromContext(ctx) reqLogger.Info("Reconciling MacvlanNetwork") diff --git a/controllers/nicclusterpolicy_controller.go b/controllers/nicclusterpolicy_controller.go index f148aa9c..74a36d73 100644 --- a/controllers/nicclusterpolicy_controller.go +++ b/controllers/nicclusterpolicy_controller.go @@ -52,6 +52,7 @@ type NicClusterPolicyReconciler struct { Scheme *runtime.Scheme ClusterTypeProvider clustertype.Provider StaticConfigProvider staticconfig.Provider + MigrationCh chan struct{} stateManager state.Manager } @@ -87,6 +88,12 @@ type NicClusterPolicyReconciler struct { // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. func (r *NicClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // Wait for migration flow to finish + select { + case <-r.MigrationCh: + case <-ctx.Done(): + return ctrl.Result{}, fmt.Errorf("canceled") + } reqLogger := log.FromContext(ctx) reqLogger.V(consts.LogLevelInfo).Info("Reconciling NicClusterPolicy") @@ -179,6 +186,10 @@ func (r *NicClusterPolicyReconciler) handleMOFEDWaitLabels( _ = r.Client.List(ctx, pods, client.MatchingLabels{"nvidia.com/ofed-driver": ""}) for i := range pods.Items { pod := pods.Items[i] + if pod.Spec.NodeName == "" { + // In case that Pod is in Pending state + continue + } labelValue := "true" // We assume that OFED pod contains only one container to simplify the logic. // We can revisit this logic in the future if needed diff --git a/controllers/suite_test.go b/controllers/suite_test.go index 0f7775bf..b16ea5f7 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -106,21 +106,27 @@ var _ = BeforeSuite(func() { }) Expect(err).ToNot(HaveOccurred()) + migrationCompletionChan := make(chan struct{}) + close(migrationCompletionChan) + err = (&HostDeviceNetworkReconciler{ - Client: k8sManager.GetClient(), - Scheme: k8sManager.GetScheme(), + Client: k8sManager.GetClient(), + Scheme: k8sManager.GetScheme(), + MigrationCh: migrationCompletionChan, }).SetupWithManager(k8sManager, testSetupLog) Expect(err).ToNot(HaveOccurred()) err = (&IPoIBNetworkReconciler{ - Client: k8sManager.GetClient(), - Scheme: k8sManager.GetScheme(), + Client: k8sManager.GetClient(), + Scheme: k8sManager.GetScheme(), + MigrationCh: migrationCompletionChan, }).SetupWithManager(k8sManager, testSetupLog) Expect(err).ToNot(HaveOccurred()) err = (&MacvlanNetworkReconciler{ - Client: k8sManager.GetClient(), - Scheme: k8sManager.GetScheme(), + Client: k8sManager.GetClient(), + Scheme: k8sManager.GetScheme(), + MigrationCh: migrationCompletionChan, }).SetupWithManager(k8sManager, testSetupLog) Expect(err).ToNot(HaveOccurred()) @@ -133,6 +139,7 @@ var _ = BeforeSuite(func() { Scheme: k8sManager.GetScheme(), ClusterTypeProvider: clusterTypeProvider, StaticConfigProvider: staticConfigProvider, + MigrationCh: migrationCompletionChan, }).SetupWithManager(k8sManager, testSetupLog) Expect(err).ToNot(HaveOccurred()) diff --git a/controllers/upgrade_controller.go b/controllers/upgrade_controller.go index 01f4599e..26b78433 100644 --- a/controllers/upgrade_controller.go +++ b/controllers/upgrade_controller.go @@ -18,6 +18,7 @@ package controllers import ( "context" + "fmt" "time" "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" @@ -47,6 +48,7 @@ type UpgradeReconciler struct { client.Client Scheme *runtime.Scheme StateManager upgrade.ClusterUpgradeStateManager + MigrationCh chan struct{} } const plannedRequeueInterval = time.Minute * 2 @@ -64,6 +66,12 @@ const UpgradeStateAnnotation = "nvidia.com/ofed-upgrade-state" // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. func (r *UpgradeReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { + // Wait for migration flow to finish + select { + case <-r.MigrationCh: + case <-ctx.Done(): + return ctrl.Result{}, fmt.Errorf("canceled") + } reqLogger := log.FromContext(ctx) reqLogger.V(consts.LogLevelInfo).Info("Reconciling Upgrade") diff --git a/controllers/upgrade_controller_test.go b/controllers/upgrade_controller_test.go index 3ecbb6ca..c59d1063 100644 --- a/controllers/upgrade_controller_test.go +++ b/controllers/upgrade_controller_test.go @@ -55,9 +55,12 @@ var _ = Describe("Upgrade Controller", func() { }) Context("When NicClusterPolicy CR is created", func() { It("Upgrade policy is disabled", func() { + migrationCompletionChan := make(chan struct{}) + close(migrationCompletionChan) upgradeReconciler := &UpgradeReconciler{ - Client: k8sClient, - Scheme: k8sClient.Scheme(), + Client: k8sClient, + Scheme: k8sClient.Scheme(), + MigrationCh: migrationCompletionChan, } req := ctrl.Request{NamespacedName: types.NamespacedName{Name: consts.NicClusterPolicyResourceName}} @@ -76,10 +79,13 @@ var _ = Describe("Upgrade Controller", func() { err := k8sClient.Create(goctx.TODO(), node) Expect(err).NotTo(HaveOccurred()) } + migrationCompletionChan := make(chan struct{}) + close(migrationCompletionChan) upgradeReconciler := &UpgradeReconciler{ - Client: k8sClient, - Scheme: k8sClient.Scheme(), + Client: k8sClient, + Scheme: k8sClient.Scheme(), + MigrationCh: migrationCompletionChan, } // Call removeNodeUpgradeStateLabels function err := upgradeReconciler.removeNodeUpgradeStateLabels(goctx.TODO()) diff --git a/main.go b/main.go index 0ee0061a..77854d34 100644 --- a/main.go +++ b/main.go @@ -82,7 +82,7 @@ func setupWebhookControllers(mgr ctrl.Manager) error { return nil } -func setupCRDControllers(ctx context.Context, c client.Client, mgr ctrl.Manager) error { +func setupCRDControllers(ctx context.Context, c client.Client, mgr ctrl.Manager, migrationChan chan struct{}) error { ctrLog := setupLog.WithName("controller") clusterTypeProvider, err := clustertype.NewProvider(ctx, c) @@ -98,27 +98,31 @@ func setupCRDControllers(ctx context.Context, c client.Client, mgr ctrl.Manager) Scheme: mgr.GetScheme(), ClusterTypeProvider: clusterTypeProvider, // we want to cache information about the cluster type StaticConfigProvider: staticInfoProvider, + MigrationCh: migrationChan, }).SetupWithManager(mgr, ctrLog.WithName("NicClusterPolicy")); err != nil { setupLog.Error(err, "unable to create controller", "controller", "NicClusterPolicy") return err } if err := (&controllers.MacvlanNetworkReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + MigrationCh: migrationChan, }).SetupWithManager(mgr, ctrLog.WithName("MacVlanNetwork")); err != nil { setupLog.Error(err, "unable to create controller", "controller", "MacvlanNetwork") return err } if err := (&controllers.HostDeviceNetworkReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + MigrationCh: migrationChan, }).SetupWithManager(mgr, ctrLog.WithName("HostDeviceNetwork")); err != nil { setupLog.Error(err, "unable to create controller", "controller", "HostDeviceNetwork") return err } if err := (&controllers.IPoIBNetworkReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + MigrationCh: migrationChan, }).SetupWithManager(mgr, ctrLog.WithName("IPoIBNetwork")); err != nil { setupLog.Error(err, "unable to create controller", "controller", "IPoIBNetwork") return err @@ -166,35 +170,26 @@ func main() { os.Exit(1) } - // run migration logic before controllers start - if err := migrate.Migrate(stopCtx, setupLog.WithName("migrate"), directClient); err != nil { - setupLog.Error(err, "failed to run migration logic") - os.Exit(1) + migrationCompletionChan := make(chan struct{}) + m := migrate.Migrator{ + K8sClient: directClient, + MigrationCh: migrationCompletionChan, + LeaderElection: enableLeaderElection, + Logger: ctrl.Log.WithName("Migrator"), } - - err = setupCRDControllers(stopCtx, directClient, mgr) + err = mgr.Add(&m) if err != nil { + setupLog.Error(err, "failed to add Migrator to the Manager") os.Exit(1) } - upgrade.SetDriverName("ofed") - - upgradeLogger := ctrl.Log.WithName("controllers").WithName("Upgrade") - - clusterUpdateStateManager, err := upgrade.NewClusterUpgradeStateManager( - upgradeLogger.WithName("clusterUpgradeManager"), config.GetConfigOrDie(), nil) - + err = setupCRDControllers(stopCtx, directClient, mgr, migrationCompletionChan) if err != nil { - setupLog.Error(err, "unable to create new ClusterUpdateStateManager", "controller", "Upgrade") os.Exit(1) } - if err = (&controllers.UpgradeReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - StateManager: clusterUpdateStateManager, - }).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Upgrade") + err = setupUpgradeController(mgr, migrationCompletionChan) + if err != nil { os.Exit(1) } @@ -221,3 +216,28 @@ func main() { os.Exit(1) } } + +func setupUpgradeController(mgr ctrl.Manager, migrationChan chan struct{}) error { + upgrade.SetDriverName("ofed") + + upgradeLogger := ctrl.Log.WithName("controllers").WithName("Upgrade") + + clusterUpdateStateManager, err := upgrade.NewClusterUpgradeStateManager( + upgradeLogger.WithName("clusterUpgradeManager"), config.GetConfigOrDie(), nil) + + if err != nil { + setupLog.Error(err, "unable to create new ClusterUpdateStateManager", "controller", "Upgrade") + return err + } + + if err = (&controllers.UpgradeReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + StateManager: clusterUpdateStateManager, + MigrationCh: migrationChan, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Upgrade") + return err + } + return nil +} diff --git a/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml b/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml index 992a625d..b6ebe638 100644 --- a/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml +++ b/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml @@ -15,20 +15,22 @@ apiVersion: apps/v1 kind: DaemonSet metadata: labels: - app: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }} + app: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }}-{{ .RuntimeSpec.KernelHash }} nvidia.com/ofed-driver: "" - name: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }}-ds + mofed-ds-format-version: "1" + name: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }}-{{ .RuntimeSpec.KernelHash }}-ds namespace: {{ .RuntimeSpec.Namespace }} spec: updateStrategy: type: OnDelete selector: matchLabels: - app: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }} + app: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }}-{{ .RuntimeSpec.KernelHash }} template: metadata: labels: - app: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }} + app: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }}-{{ .RuntimeSpec.KernelHash }} + kernel: {{ .RuntimeSpec.Kernel }} nvidia.com/ofed-driver: "" spec: priorityClassName: system-node-critical @@ -250,6 +252,10 @@ spec: feature.node.kubernetes.io/pci-15b3.present: "true" feature.node.kubernetes.io/system-os_release.ID: {{ .RuntimeSpec.OSName }} feature.node.kubernetes.io/system-os_release.VERSION_ID: "{{ .RuntimeSpec.OSVer }}" + feature.node.kubernetes.io/kernel-version.full: "{{ .RuntimeSpec.Kernel }}" + {{- if .RuntimeSpec.UseDtk }} + feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: "{{ .RuntimeSpec.RhcosVersion }}" + {{- end }} {{- if .NodeAffinity }} affinity: nodeAffinity: diff --git a/pkg/migrate/migrate.go b/pkg/migrate/migrate.go index aa13d245..365789c6 100644 --- a/pkg/migrate/migrate.go +++ b/pkg/migrate/migrate.go @@ -22,6 +22,7 @@ import ( "fmt" "github.com/go-logr/logr" + appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" apiErrors "k8s.io/apimachinery/pkg/api/errors" @@ -31,8 +32,36 @@ import ( "github.com/Mellanox/network-operator/pkg/config" "github.com/Mellanox/network-operator/pkg/consts" + + mellanoxv1alpha1 "github.com/Mellanox/network-operator/api/v1alpha1" + + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" ) +// Migrator migrates from previous versions +type Migrator struct { + K8sClient client.Client + MigrationCh chan struct{} + LeaderElection bool + Logger logr.Logger +} + +// NeedLeaderElection implements manager.NeedLeaderElection +func (m *Migrator) NeedLeaderElection() bool { + return m.LeaderElection +} + +// Start implements manager.Runnable +func (m *Migrator) Start(ctx context.Context) error { + err := Migrate(ctx, m.Logger, m.K8sClient) + if err != nil { + m.Logger.Error(err, "failed to migrate Network Operator") + return err + } + close(m.MigrationCh) + return nil +} + // Migrate contains logic which should run once during controller start. // The main use case for this handler is network-operator upgrade // for example, the handler can contain logic to change old data format to a new one or @@ -55,6 +84,11 @@ func migrate(ctx context.Context, log logr.Logger, c client.Client) error { log.V(consts.LogLevelError).Error(err, "error trying to remove state label on NV IPAM configmap") return err } + if err := handleSingleMofedDS(ctx, log, c); err != nil { + // critical for the operator operation, will fail Mofed migration + log.V(consts.LogLevelError).Error(err, "error trying to handle single MOFED DS") + return err + } return nil } @@ -115,3 +149,96 @@ func removeStateLabelFromNVIpamConfigMap(ctx context.Context, log logr.Logger, c } return nil } + +func handleSingleMofedDS(ctx context.Context, log logr.Logger, c client.Client) error { + ncp := &mellanoxv1alpha1.NicClusterPolicy{} + key := types.NamespacedName{ + Name: consts.NicClusterPolicyResourceName, + } + err := c.Get(ctx, key, ncp) + if apiErrors.IsNotFound(err) { + log.V(consts.LogLevelDebug).Info("NIC ClusterPolicy not found, skip handling single MOFED DS") + return nil + } else if err != nil { + return err + } + if ncp.Spec.OFEDDriver == nil { + return nil + } + log.V(consts.LogLevelDebug).Info("Searching for single MOFED DS") + dsList := &appsv1.DaemonSetList{} + err = c.List(ctx, dsList, client.MatchingLabels{"nvidia.com/ofed-driver": ""}) + if err != nil { + log.V(consts.LogLevelError).Error(err, "fail to list MOFED DS") + return err + } + var ds *appsv1.DaemonSet + for i := range dsList.Items { + mofedDs := &dsList.Items[i] + // The single MOFED DS does not contain the label "mofed-ds-format-version" + _, ok := mofedDs.Labels["mofed-ds-format-version"] + if ok { + continue + } + log.V(consts.LogLevelDebug).Info("Found single MOFED DS", "name", mofedDs.Name) + ds = mofedDs + break + } + if ds != nil { + err = markNodesAsUpgradeRequested(ctx, log, c, ds) + if err != nil { + return err + } + policy := metav1.DeletePropagationOrphan + err = c.Delete(ctx, ds, &client.DeleteOptions{PropagationPolicy: &policy}) + if err != nil { + return err + } + log.V(consts.LogLevelDebug).Info("Deleted single MOFED DS with orphaned", "name", ds.Name) + return nil + } + log.V(consts.LogLevelDebug).Info("Single MOFED DS not found") + return nil +} + +func markNodesAsUpgradeRequested(ctx context.Context, log logr.Logger, c client.Client, ds *appsv1.DaemonSet) error { + nodes, err := getDaemonSetNodes(ctx, c, ds) + if err != nil { + return err + } + for _, nodeName := range nodes { + node := &corev1.Node{} + nodeKey := client.ObjectKey{ + Name: nodeName, + } + if err := c.Get(context.Background(), nodeKey, node); err != nil { + return err + } + patchString := []byte(fmt.Sprintf(`{"metadata":{"annotations":{%q: "true"}}}`, + upgrade.GetUpgradeRequestedAnnotationKey())) + patch := client.RawPatch(types.MergePatchType, patchString) + err = c.Patch(ctx, node, patch) + if err != nil { + return err + } + log.V(consts.LogLevelDebug).Info("Node annotated with upgrade-requested", "name", nodeName) + } + + return nil +} + +func getDaemonSetNodes(ctx context.Context, c client.Client, ds *appsv1.DaemonSet) ([]string, error) { + nodeNames := make([]string, 0) + pods := &corev1.PodList{} + err := c.List(ctx, pods, client.MatchingLabels(ds.Spec.Selector.MatchLabels)) + if err != nil { + return nil, err + } + for i := range pods.Items { + nodeName := pods.Items[i].Spec.NodeName + if nodeName != "" { + nodeNames = append(nodeNames, nodeName) + } + } + return nodeNames, nil +} diff --git a/pkg/migrate/migrate_test.go b/pkg/migrate/migrate_test.go index e974fd12..a9be5828 100644 --- a/pkg/migrate/migrate_test.go +++ b/pkg/migrate/migrate_test.go @@ -19,7 +19,9 @@ package migrate import ( goctx "context" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -27,6 +29,10 @@ import ( . "github.com/onsi/gomega" "github.com/Mellanox/network-operator/pkg/consts" + + mellanoxv1alpha1 "github.com/Mellanox/network-operator/api/v1alpha1" + + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" ) //nolint:dupl @@ -34,6 +40,8 @@ var _ = Describe("Migrate", func() { AfterEach(func() { cm := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Namespace: namespaceName, Name: nvIPAMcmName}} _ = k8sClient.Delete(goctx.Background(), cm) + _ = k8sClient.DeleteAllOf(goctx.Background(), &corev1.Node{}) + _ = k8sClient.DeleteAllOf(goctx.Background(), &corev1.Pod{}) }) It("should remove annotation on NVIPAM CM", func() { createConfigMap(true) @@ -55,6 +63,32 @@ var _ = Describe("Migrate", func() { err := Migrate(goctx.Background(), testLog, k8sClient) Expect(err).NotTo(HaveOccurred()) }) + It("should delete MOFED DS", func() { + upgrade.SetDriverName("ofed") + createNCP() + createMofedDS() + createNodes() + createPods() + By("Verify Single DS is deleted") + err := Migrate(goctx.Background(), testLog, k8sClient) + Expect(err).NotTo(HaveOccurred()) + Eventually(func() bool { + ds := &appsv1.DaemonSet{} + err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: namespaceName, Name: "test-ds"}, ds) + return errors.IsNotFound(err) + }) + By("Verify Nodes have upgrade-requested annotation") + Eventually(func() bool { + node1 := &corev1.Node{} + err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: namespaceName, Name: "test-node1"}, node1) + Expect(err).NotTo(HaveOccurred()) + node2 := &corev1.Node{} + err = k8sClient.Get(goctx.TODO(), types.NamespacedName{Namespace: namespaceName, Name: "test-node2"}, node2) + Expect(err).NotTo(HaveOccurred()) + return node1.Annotations[upgrade.GetUpgradeRequestedAnnotationKey()] == "true" && + node2.Annotations[upgrade.GetUpgradeRequestedAnnotationKey()] == "true" + }) + }) }) func createConfigMap(addLabel bool) { @@ -66,3 +100,110 @@ func createConfigMap(addLabel bool) { err := k8sClient.Create(goctx.Background(), cm) Expect(err).NotTo(HaveOccurred()) } + +func createMofedDS() { + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespaceName, + Name: "test-ds", + Labels: map[string]string{"nvidia.com/ofed-driver": ""}, + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "mofed-ubuntu22.04"}}, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "mofed-ubuntu22.04"}, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "mofed-container", + Image: "github/mofed", + }, + }, + }, + }, + }, + } + err := k8sClient.Create(goctx.Background(), ds) + Expect(err).NotTo(HaveOccurred()) +} + +func createNodes() { + By("Create Nodes") + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node1", + Labels: make(map[string]string), + Annotations: make(map[string]string), + }, + } + err := k8sClient.Create(goctx.TODO(), node) + Expect(err).NotTo(HaveOccurred()) + node2 := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node2", + Labels: make(map[string]string), + Annotations: make(map[string]string), + }, + } + err = k8sClient.Create(goctx.TODO(), node2) + Expect(err).NotTo(HaveOccurred()) +} + +func createPods() { + By("Create Pods") + gracePeriodSeconds := int64(0) + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod1", + Namespace: namespaceName, + Labels: map[string]string{"app": "mofed-ubuntu22.04"}, + }, + Spec: corev1.PodSpec{ + NodeName: "test-node1", + TerminationGracePeriodSeconds: &gracePeriodSeconds, + Containers: []corev1.Container{ + { + Name: "test-container", + Image: "test-image", + }, + }, + }, + } + err := k8sClient.Create(goctx.TODO(), pod) + Expect(err).NotTo(HaveOccurred()) + pod2 := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod2", + Namespace: namespaceName, + Labels: map[string]string{"app": "mofed-ubuntu22.04"}, + }, + Spec: corev1.PodSpec{ + NodeName: "test-node2", + TerminationGracePeriodSeconds: &gracePeriodSeconds, + Containers: []corev1.Container{ + { + Name: "test-container", + Image: "test-image", + }, + }, + }, + } + err = k8sClient.Create(goctx.TODO(), pod2) + Expect(err).NotTo(HaveOccurred()) +} + +func createNCP() { + ncp := &mellanoxv1alpha1.NicClusterPolicy{ObjectMeta: metav1.ObjectMeta{Name: consts.NicClusterPolicyResourceName}} + ncp.Spec.OFEDDriver = &mellanoxv1alpha1.OFEDDriverSpec{ + ImageSpec: mellanoxv1alpha1.ImageSpec{ + Image: "mofed", + Repository: "nvcr.io/nvidia/mellanox", + Version: "5.9-0.5.6.0", + ImagePullSecrets: []string{}, + }, + } + err := k8sClient.Create(goctx.Background(), ncp) + Expect(err).NotTo(HaveOccurred()) +} diff --git a/pkg/migrate/suite_test.go b/pkg/migrate/suite_test.go index ccbf139a..2bbc795c 100644 --- a/pkg/migrate/suite_test.go +++ b/pkg/migrate/suite_test.go @@ -18,6 +18,7 @@ package migrate import ( "context" + "os" "testing" "time" @@ -30,6 +31,8 @@ import ( logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" + mellanoxcomv1alpha1 "github.com/Mellanox/network-operator/api/v1alpha1" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -58,9 +61,21 @@ var _ = BeforeSuite(func() { logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) testLog = logf.Log.WithName("test-log").WithName("setup") + err := mellanoxcomv1alpha1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + // +kubebuilder:scaffold:scheme + By("bootstrapping test environment") + // Go to project root directory + err = os.Chdir("../..") + Expect(err).NotTo(HaveOccurred()) - testEnv = &envtest.Environment{} + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{"config/crd/bases"}, + CRDInstallOptions: envtest.CRDInstallOptions{ErrorIfPathMissing: true}, + ErrorIfCRDPathMissing: true, + } cfg, err := testEnv.Start() Expect(err).NotTo(HaveOccurred()) diff --git a/pkg/nodeinfo/node_info.go b/pkg/nodeinfo/node_info.go index a1eecb79..f1d7473a 100644 --- a/pkg/nodeinfo/node_info.go +++ b/pkg/nodeinfo/node_info.go @@ -21,6 +21,8 @@ of specific attributes (mainly labels) for easier use. package nodeinfo import ( + "fmt" + corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -35,6 +37,8 @@ var MellanoxNICListOptions = []client.ListOption{ type Provider interface { // GetNodesAttributes retrieves node attributes for nodes matching the filter criteria GetNodesAttributes(filters ...Filter) []NodeAttributes + // GetNodePools partitions nodes into one or more node pools for nodes matching the filter criteria + GetNodePools(filters ...Filter) []NodePool } // NewProvider creates a new Provider object @@ -58,3 +62,83 @@ func (p *provider) GetNodesAttributes(filters ...Filter) (attrs []NodeAttributes } return attrs } + +// NodePool represent a set of Nodes grouped by common attributes +type NodePool struct { + Name string + OsName string + OsVersion string + RhcosVersion string + Kernel string + Arch string +} + +// GetNodePools partitions nodes into one or more node pools. The list of nodes to partition +// is defined by the filters provided as input. +// +// Nodes are partitioned by osVersion-kernelVersion pair. +func (p *provider) GetNodePools(filters ...Filter) []NodePool { + filtered := p.nodes + for _, filter := range filters { + filtered = filter.Apply(filtered) + } + + nodePoolMap := make(map[string]NodePool) + + for _, node := range filtered { + node := node + nodeLabels := node.GetLabels() + + nodePool := NodePool{} + osName, ok := nodeLabels[NodeLabelOSName] + if !ok { + log.Info("WARNING: Could not find NFD labels for node. Is NFD installed?", + "Node", node.Name, "Label", NodeLabelOSName) + continue + } + nodePool.OsName = osName + + osVersion, ok := nodeLabels[NodeLabelOSVer] + if !ok { + log.Info("WARNING: Could not find NFD labels for node. Is NFD installed?", + "Node", node.Name, "Label", NodeLabelOSVer) + continue + } + nodePool.OsVersion = osVersion + + arch, ok := nodeLabels[NodeLabelCPUArch] + if !ok { + log.Info("WARNING: Could not find NFD labels for node. Is NFD installed?", + "Node", node.Name, "Label", NodeLabelCPUArch) + continue + } + nodePool.Arch = arch + + rhcos, ok := nodeLabels[NodeLabelOSTreeVersion] + if ok { + nodePool.RhcosVersion = rhcos + } + + kernel, ok := nodeLabels[NodeLabelKernelVerFull] + if !ok { + log.Info("WARNING: Could not find NFD labels for node. Is NFD installed?", + "Node", node.Name, "Label", NodeLabelKernelVerFull) + continue + } + nodePool.Kernel = kernel + + nodePool.Name = fmt.Sprintf("%s%s-%s", nodePool.OsName, nodePool.OsVersion, nodePool.Kernel) + + if _, exists := nodePoolMap[nodePool.Name]; !exists { + nodePoolMap[nodePool.Name] = nodePool + log.Info("NodePool found", "name", nodePool.Name) + } + } + + nodePools := make([]NodePool, 0) + for _, np := range nodePoolMap { + nodePools = append(nodePools, np) + } + + return nodePools +} diff --git a/pkg/nodeinfo/node_info_test.go b/pkg/nodeinfo/node_info_test.go index 2f22ea73..9668b650 100644 --- a/pkg/nodeinfo/node_info_test.go +++ b/pkg/nodeinfo/node_info_test.go @@ -23,6 +23,14 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +const ( + testArch = "amd64" + testOsUbuntu = "ubuntu" + testOsRhcos = "rhcos" + testOsVer = "22.04" + testKernelFull = "5.15.0-78-generic" +) + // A Filter applies a filter on a list of Nodes type dummyFilter struct { called bool @@ -103,4 +111,135 @@ var _ = Describe("nodeinfo Provider tests", func() { Expect(len(attrs)).To(Equal(0)) }) }) + + Context("GetNodePools with filter", func() { + It("Should return an empty list of pools", func() { + filter := &dummyFilter{filtered: []*corev1.Node{}} + provider := NewProvider([]*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelFull, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, testOsVer, testKernelFull, testArch), + }) + pools := provider.GetNodePools(filter) + Expect(len(pools)).To(Equal(0)) + }) + It("Should return an empty list of pools, filter by label", func() { + node := getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelFull, testArch) + delete(node.Labels, NodeLabelMlnxNIC) + provider := NewProvider([]*corev1.Node{node}) + pools := provider.GetNodePools(NewNodeLabelFilterBuilder().WithLabel(NodeLabelMlnxNIC, "true").Build()) + Expect(len(pools)).To(Equal(0)) + }) + }) + + Context("GetNodePools without filter", func() { + It("Should return pool", func() { + provider := NewProvider([]*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelFull, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, testOsVer, testKernelFull, testArch), + }) + pools := provider.GetNodePools() + Expect(len(pools)).To(Equal(1)) + Expect(pools[0].Arch).To(Equal(testArch)) + Expect(pools[0].OsName).To(Equal(testOsUbuntu)) + Expect(pools[0].OsVersion).To(Equal(testOsVer)) + Expect(pools[0].Kernel).To(Equal(testKernelFull)) + }) + DescribeTable("GetNodePools", + func(nodeList []*corev1.Node, expectedPools int) { + provider := NewProvider(nodeList) + pools := provider.GetNodePools() + Expect(len(pools)).To(Equal(expectedPools)) + }, + Entry("single pool, multiple nodes same NFD labels", []*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelFull, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, testOsVer, testKernelFull, testArch), + getNodeWithNfdLabels("Node-3", testOsUbuntu, testOsVer, testKernelFull, testArch), + }, 1), + Entry("2 pools, multiple nodes different OS NFD labels", []*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelFull, testArch), + getNodeWithNfdLabels("Node-2", testOsRhcos, testOsVer, testKernelFull, testArch), + getNodeWithNfdLabels("Node-3", testOsRhcos, testOsVer, testKernelFull, testArch), + }, 2), + Entry("2 pools, multiple nodes different OSVer NFD labels", []*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelFull, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, "20.04", testKernelFull, testArch), + }, 2), + Entry("2 pools, multiple nodes different KernelFull NFD labels", []*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelFull, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, testOsVer, "6", testArch), + }, 2), + Entry("1 pool, multiple nodes different arch NFD labels", []*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelFull, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, testOsVer, testKernelFull, "arm"), + }, 1), + Entry("no pool, node without NFD labels", []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{Name: "Node-1"}, + }, + }, 0), + Entry("no pool, node with missing osName NFD label", []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "Node-1", + Labels: map[string]string{ + NodeLabelMlnxNIC: "true", + NodeLabelOSVer: testOsVer, + NodeLabelKernelVerFull: testKernelFull, + NodeLabelCPUArch: testArch, + }}, + }, + }, 0), + Entry("no pool, node with missing osVer NFD label", []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "Node-1", + Labels: map[string]string{ + NodeLabelMlnxNIC: "true", + NodeLabelOSName: testOsUbuntu, + NodeLabelKernelVerFull: testKernelFull, + NodeLabelCPUArch: testArch, + }}, + }, + }, 0), + Entry("no pool, node with missing arch NFD label", []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "Node-1", + Labels: map[string]string{ + NodeLabelMlnxNIC: "true", + NodeLabelOSName: testOsUbuntu, + NodeLabelOSVer: testOsVer, + NodeLabelKernelVerFull: testKernelFull, + }}, + }, + }, 0), + Entry("no pool, node with missing Kernel full NFD label", []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "Node-1", + Labels: map[string]string{ + NodeLabelMlnxNIC: "true", + NodeLabelOSName: testOsUbuntu, + NodeLabelOSVer: testOsVer, + NodeLabelCPUArch: testArch, + }}, + }, + }, 0), + ) + }) }) + +func getNodeWithNfdLabels(name, osName, osVer, kernelFull, arch string) *corev1.Node { + return &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + NodeLabelMlnxNIC: "true", + NodeLabelOSName: osName, + NodeLabelOSVer: osVer, + NodeLabelKernelVerFull: kernelFull, + NodeLabelCPUArch: arch, + }, + }, + } +} diff --git a/pkg/state/dummy_provider.go b/pkg/state/dummy_provider.go index 931e2a77..35c629c8 100644 --- a/pkg/state/dummy_provider.go +++ b/pkg/state/dummy_provider.go @@ -50,6 +50,17 @@ func (d *dummyProvider) GetNodesAttributes(...nodeinfo.Filter) []nodeinfo.NodeAt return []nodeinfo.NodeAttributes{{Attributes: nodeAttr}} } +func (d *dummyProvider) GetNodePools(...nodeinfo.Filter) []nodeinfo.NodePool { + return []nodeinfo.NodePool{ + { + Name: "ubuntu20.04-5.15", + OsName: "ubuntu", + OsVersion: "20.04", + Kernel: "5.15.0-78-generic", + }, + } +} + func getDummyCatalog() InfoCatalog { catalog := NewInfoCatalog() catalog.Add(InfoTypeNodeInfo, &dummyProvider{}) diff --git a/pkg/state/state_ofed.go b/pkg/state/state_ofed.go index 5fd4c551..36e82bfd 100644 --- a/pkg/state/state_ofed.go +++ b/pkg/state/state_ofed.go @@ -19,6 +19,7 @@ package state import ( "context" "fmt" + "hash/fnv" "path/filepath" "sort" "strings" @@ -37,12 +38,14 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/rand" "k8s.io/apimachinery/pkg/util/wait" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" mellanoxv1alpha1 "github.com/Mellanox/network-operator/api/v1alpha1" + "github.com/Mellanox/network-operator/pkg/clustertype" "github.com/Mellanox/network-operator/pkg/config" "github.com/Mellanox/network-operator/pkg/consts" "github.com/Mellanox/network-operator/pkg/nodeinfo" @@ -153,6 +156,8 @@ type ofedRuntimeSpec struct { CPUArch string OSName string OSVer string + Kernel string + KernelHash string MOFEDImageName string InitContainerConfig initContainerConfig // is true if cluster type is Openshift @@ -160,6 +165,7 @@ type ofedRuntimeSpec struct { ContainerResources ContainerResourcesMap UseDtk bool DtkImageName string + RhcosVersion string } type ofedManifestRenderData struct { @@ -398,44 +404,61 @@ func (s *stateOFED) GetManifestObjects( if clusterInfo == nil { return nil, errors.New("clusterInfo provider required") } - - attrs := nodeInfo.GetNodesAttributes( + nodePools := nodeInfo.GetNodePools( nodeinfo.NewNodeLabelFilterBuilder().WithLabel(nodeinfo.NodeLabelMlnxNIC, "true").Build()) - if len(attrs) == 0 { + if len(nodePools) == 0 { reqLogger.V(consts.LogLevelInfo).Info("No nodes with Mellanox NICs where found in the cluster.") return []*unstructured.Unstructured{}, nil } - // TODO: Render daemonset multiple times according to CPUXOS matrix (ATM assume all nodes are the same) - // Note: it is assumed MOFED driver container is able to handle multiple kernel version e.g by triggering DKMS - // if driver was compiled against a missmatching kernel to begin with. - if err := s.checkAttributesExist(attrs[0], - nodeinfo.AttrTypeCPUArch, nodeinfo.AttrTypeOSName, nodeinfo.AttrTypeOSVer); err != nil { - return nil, err - } - - nodeAttr := attrs[0].Attributes + setProbesDefaults(cr) + // Update MOFED Env variables with defaults for the cluster + cr.Spec.OFEDDriver.Env = s.mergeWithDefaultEnvs(cr.Spec.OFEDDriver.Env) + objs := make([]*unstructured.Unstructured, 0) + renderedObjsMap := stateObjects{} useDtk := clusterInfo.IsOpenshift() && config.FromEnv().State.OFEDState.UseDTK + + for _, np := range nodePools { + nodePool := np + + // render objects + renderedObjs, err := renderObjects(ctx, &nodePool, useDtk, s, cr, reqLogger, clusterInfo) + if err != nil { + return nil, errors.Wrap(err, "failed to render objects") + } + for _, o := range renderedObjs { + if !renderedObjsMap.Exist(o.GroupVersionKind(), types.NamespacedName{ + Name: o.GetName(), + Namespace: o.GetNamespace()}) { + renderedObjsMap.Add(o.GroupVersionKind(), types.NamespacedName{Name: o.GetName(), Namespace: o.GetNamespace()}) + objs = append(objs, o) + } + } + } + reqLogger.V(consts.LogLevelDebug).Info("Rendered", "objects:", objs) + return objs, nil +} + +func renderObjects(ctx context.Context, nodePool *nodeinfo.NodePool, useDtk bool, s *stateOFED, + cr *mellanoxv1alpha1.NicClusterPolicy, reqLogger logr.Logger, + clusterInfo clustertype.Provider) ([]*unstructured.Unstructured, error) { var dtkImageName string + rhcosVersion := nodePool.RhcosVersion if useDtk { - if err := s.checkAttributesExist(attrs[0], nodeinfo.AttrTypeOSTreeVersion); err != nil { - return nil, err + if rhcosVersion == "" { + return nil, fmt.Errorf("required NFD Label missing: %s", nodeinfo.NodeLabelOSTreeVersion) } - dtk, err := s.getOCPDriverToolkitImage(ctx, nodeAttr[nodeinfo.AttrTypeOSTreeVersion]) + dtk, err := s.getOCPDriverToolkitImage(ctx, rhcosVersion) if err != nil { return nil, fmt.Errorf("failed to get OpenShift DTK image : %v", err) } dtkImageName = dtk } - setProbesDefaults(cr) - - // Update MOFED Env variables with defaults - cr.Spec.OFEDDriver.Env = s.mergeWithDefaultEnvs(cr.Spec.OFEDDriver.Env) - additionalVolMounts := additionalVolumeMounts{} - osname := nodeAttr[nodeinfo.AttrTypeOSName] + osname := nodePool.OsName + // set any custom ssl key/certificate configuration provided err := s.handleCertConfig(ctx, cr, osname, &additionalVolMounts) if err != nil { @@ -452,29 +475,28 @@ func (s *stateOFED) GetManifestObjects( CrSpec: cr.Spec.OFEDDriver, RuntimeSpec: &ofedRuntimeSpec{ runtimeSpec: runtimeSpec{config.FromEnv().State.NetworkOperatorResourceNamespace}, - CPUArch: nodeAttr[nodeinfo.AttrTypeCPUArch], - OSName: nodeAttr[nodeinfo.AttrTypeOSName], - OSVer: nodeAttr[nodeinfo.AttrTypeOSVer], - MOFEDImageName: s.getMofedDriverImageName(cr, nodeAttr, reqLogger), + CPUArch: nodePool.Arch, + OSName: nodePool.OsName, + OSVer: nodePool.OsVersion, + Kernel: nodePool.Kernel, + KernelHash: getStringHash(nodePool.Kernel), + MOFEDImageName: s.getMofedDriverImageName(cr, nodePool, reqLogger), InitContainerConfig: s.getInitContainerConfig(cr, reqLogger, config.FromEnv().State.OFEDState.InitContainerImage), IsOpenshift: clusterInfo.IsOpenshift(), ContainerResources: createContainerResourcesMap(cr.Spec.OFEDDriver.ContainerResources), UseDtk: useDtk, DtkImageName: dtkImageName, + RhcosVersion: rhcosVersion, }, Tolerations: cr.Spec.Tolerations, NodeAffinity: cr.Spec.NodeAffinity, AdditionalVolumeMounts: additionalVolMounts, } - // render objects + reqLogger.V(consts.LogLevelDebug).Info("Rendering objects", "data:", renderData) - objs, err := s.renderer.RenderObjects(&render.TemplatingData{Data: renderData}) - if err != nil { - return nil, errors.Wrap(err, "failed to render objects") - } - reqLogger.V(consts.LogLevelDebug).Info("Rendered", "objects:", objs) - return objs, nil + renderedObjs, err := s.renderer.RenderObjects(&render.TemplatingData{Data: renderData}) + return renderedObjs, err } // prepare configuration for the init container, @@ -502,9 +524,8 @@ func (s *stateOFED) getInitContainerConfig( } // getMofedDriverImageName generates MOFED driver image name based on the driver version specified in CR -// TODO(adrianc): in Network-Operator v1.5.0, we should just use the new naming scheme func (s *stateOFED) getMofedDriverImageName(cr *mellanoxv1alpha1.NicClusterPolicy, - nodeAttr map[nodeinfo.AttributeType]string, reqLogger logr.Logger) string { + pool *nodeinfo.NodePool, reqLogger logr.Logger) string { curDriverVer, err := semver.NewVersion(cr.Spec.OFEDDriver.Version) if err != nil { reqLogger.V(consts.LogLevelDebug).Info("failed to parse ofed driver version as semver") @@ -515,9 +536,9 @@ func (s *stateOFED) getMofedDriverImageName(cr *mellanoxv1alpha1.NicClusterPolic cr.Spec.OFEDDriver.Repository, cr.Spec.OFEDDriver.Image, cr.Spec.OFEDDriver.Version, - nodeAttr[nodeinfo.AttrTypeOSName], - nodeAttr[nodeinfo.AttrTypeOSVer], - nodeAttr[nodeinfo.AttrTypeCPUArch]) + pool.OsName, + pool.OsVersion, + pool.Arch) } // readOpenshiftProxyConfig reads ClusterWide Proxy configuration for Openshift @@ -760,3 +781,12 @@ func (s *stateOFED) getOCPDriverToolkitImage(ctx context.Context, ostreeVersion } return image, nil } + +// getStringHash returns a short deterministic hash +func getStringHash(s string) string { + hasher := fnv.New32a() + if _, err := hasher.Write([]byte(s)); err != nil { + panic(err) + } + return rand.SafeEncodeString(fmt.Sprint(hasher.Sum32())) +} diff --git a/pkg/state/state_ofed_test.go b/pkg/state/state_ofed_test.go index b71b5af8..3df184a3 100644 --- a/pkg/state/state_ofed_test.go +++ b/pkg/state/state_ofed_test.go @@ -18,7 +18,9 @@ package state import ( "context" + "fmt" "slices" + "strings" "k8s.io/apimachinery/pkg/runtime" @@ -50,6 +52,8 @@ const ( osName = "ubuntu" osVer = "22.04" rhcosOsTree = "414.92.202311061957-0" + kernelFull1 = "5.15.0-78-generic" + kernelFull2 = "5.15.0-91-generic" ) type openShiftClusterProvider struct { @@ -77,10 +81,11 @@ var _ = Describe("MOFED state test", func() { }) Context("getMofedDriverImageName", func() { - nodeAttr := make(map[nodeinfo.AttributeType]string) - nodeAttr[nodeinfo.AttrTypeCPUArch] = "amd64" - nodeAttr[nodeinfo.AttrTypeOSName] = "ubuntu" - nodeAttr[nodeinfo.AttrTypeOSVer] = "20.04" + nodePool := &nodeinfo.NodePool{ + OsName: "ubuntu", + OsVersion: "20.04", + Arch: "amd64", + } cr := &v1alpha1.NicClusterPolicy{ Spec: v1alpha1.NicClusterPolicySpec{ @@ -95,17 +100,17 @@ var _ = Describe("MOFED state test", func() { It("generates new image format", func() { cr.Spec.OFEDDriver.Version = "5.7-1.0.0.0" - imageName := stateOfed.getMofedDriverImageName(cr, nodeAttr, testLogger) + imageName := stateOfed.getMofedDriverImageName(cr, nodePool, testLogger) Expect(imageName).To(Equal("nvcr.io/mellanox/mofed:5.7-1.0.0.0-ubuntu20.04-amd64")) }) It("generates new image format double digit minor", func() { cr.Spec.OFEDDriver.Version = "5.10-0.0.0.1" - imageName := stateOfed.getMofedDriverImageName(cr, nodeAttr, testLogger) + imageName := stateOfed.getMofedDriverImageName(cr, nodePool, testLogger) Expect(imageName).To(Equal("nvcr.io/mellanox/mofed:5.10-0.0.0.1-ubuntu20.04-amd64")) }) It("return new image format in case of a bad version", func() { cr.Spec.OFEDDriver.Version = "1.1.1.1.1" - imageName := stateOfed.getMofedDriverImageName(cr, nodeAttr, testLogger) + imageName := stateOfed.getMofedDriverImageName(cr, nodePool, testLogger) Expect(imageName).To(Equal("nvcr.io/mellanox/mofed:1.1.1.1.1-ubuntu20.04-amd64")) }) }) @@ -242,8 +247,18 @@ var _ = Describe("MOFED state test", func() { {Name: envVarDriversInventoryPath, Value: ""}}), ) + DescribeTable("GetStringHash", + func(input, hash string) { + computedHash := getStringHash(input) + Expect(computedHash).To(BeEquivalentTo(hash)) + }, + Entry("kernel rhcos", "5.14.0-284.43.1.el9_2.x86_64", "687cd9dc94"), + Entry("kernel ubuntu", "5.15.0-78-generic", "54669c9886"), + Entry("kernel ubuntu - patch 91", "5.15.0-91-generic", "6d568d699f"), + ) + Context("Render Manifests", func() { - It("Should Render Mofed DaemonSet", func() { + It("Should Render multiple DaemonSet", func() { client := mocks.ControllerRuntimeClient{} manifestBaseDir := "../../manifests/state-ofed-driver" @@ -268,17 +283,21 @@ var _ = Describe("MOFED state test", func() { Version: "23.10-0.5.5.0", }, } - By("Creating NodeProvider with 1 Node") + + By("Creating NodeProvider with 3 Nodes, that form 2 Node pools") + infoProvider := nodeinfo.NewProvider([]*v1.Node{ + getNode("node1", kernelFull1), + getNode("node2", kernelFull2), + getNode("node3", kernelFull2), + }) catalog := NewInfoCatalog() catalog.Add(InfoTypeClusterType, &dummyProvider{}) - catalog.Add(InfoTypeNodeInfo, nodeinfo.NewProvider([]*v1.Node{ - getNode("node1"), - })) + catalog.Add(InfoTypeNodeInfo, infoProvider) objs, err := ofedState.GetManifestObjects(ctx, cr, catalog, testLogger) Expect(err).NotTo(HaveOccurred()) - // Expect 4 objects: DaemonSet, Service Account, ClusterRole, ClusterRoleBinding - Expect(len(objs)).To(Equal(4)) - By("Verify DaemonSet") + // Expect 5 objects: 1 DS per pool, Service Account, Role, RoleBinding + Expect(len(objs)).To(Equal(5)) + By("Verify DaemonSets NodeSelector") for _, obj := range objs { if obj.GetKind() != "DaemonSet" { continue @@ -286,8 +305,12 @@ var _ = Describe("MOFED state test", func() { ds := appsv1.DaemonSet{} err = runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, &ds) Expect(err).NotTo(HaveOccurred()) - Expect(ds.Name).To(Equal("mofed-ubuntu22.04-ds")) - verifyDSNodeSelector(ds.Spec.Template.Spec.NodeSelector) + if ds.Name == fmt.Sprintf("mofed-%s%s-%s-ds", osName, osVer, "54669c9886") { + verifyDSNodeSelector(ds.Spec.Template.Spec.NodeSelector, kernelFull1) + } + if ds.Name == fmt.Sprintf("mofed-%s%s-%s-ds", osName, osVer, "6d568d699f") { + verifyDSNodeSelector(ds.Spec.Template.Spec.NodeSelector, kernelFull2) + } verifyPodAntiInfinity(ds.Spec.Template.Spec.Affinity) } }) @@ -370,7 +393,7 @@ var _ = Describe("MOFED state test", func() { }, } By("Creating NodeProvider with 1 Node with RHCOS OS TREE label") - node := getNode("node1") + node := getNode("node1", kernelFull1) node.Labels[nodeinfo.NodeLabelOSTreeVersion] = rhcosOsTree infoProvider := nodeinfo.NewProvider([]*v1.Node{ node, @@ -392,7 +415,7 @@ var _ = Describe("MOFED state test", func() { err = runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, &ds) Expect(err).NotTo(HaveOccurred()) By("Verify DaemonSet NodeSelector") - verifyDSNodeSelector(ds.Spec.Template.Spec.NodeSelector) + verifyDSNodeSelector(ds.Spec.Template.Spec.NodeSelector, kernelFull1) By("Verify DTK container image") Expect(len(ds.Spec.Template.Spec.Containers)).To(Equal(2)) dtkContainer := ds.Spec.Template.Spec.Containers[1] @@ -500,7 +523,7 @@ func verifyAdditionalVolumes(volumes []v1.Volume) { Expect(foundRepo).To(BeTrue()) } -func verifyDSNodeSelector(selector map[string]string) { +func verifyDSNodeSelector(selector map[string]string, kernelFull string) { By("Verify NodeSelector") nsMellanox, ok := selector["feature.node.kubernetes.io/pci-15b3.present"] Expect(ok).To(BeTrue()) @@ -511,17 +534,21 @@ func verifyDSNodeSelector(selector map[string]string) { nsOsVer, ok := selector["feature.node.kubernetes.io/system-os_release.VERSION_ID"] Expect(ok).To(BeTrue()) Expect(nsOsVer).To(Equal(osVer)) + nsKernelMinor, ok := selector["feature.node.kubernetes.io/kernel-version.full"] + Expect(ok).To(BeTrue()) + Expect(nsKernelMinor).To(Equal(kernelFull)) } -func getNode(name string) *v1.Node { +func getNode(name, kernelFull string) *v1.Node { return &v1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: name, Labels: map[string]string{ - nodeinfo.NodeLabelMlnxNIC: "true", - nodeinfo.NodeLabelOSName: osName, - nodeinfo.NodeLabelOSVer: osVer, - nodeinfo.NodeLabelCPUArch: "amd64", + nodeinfo.NodeLabelMlnxNIC: "true", + nodeinfo.NodeLabelOSName: osName, + nodeinfo.NodeLabelOSVer: osVer, + nodeinfo.NodeLabelKernelVerFull: kernelFull, + nodeinfo.NodeLabelCPUArch: "amd64", }, }, } diff --git a/pkg/state/state_skel.go b/pkg/state/state_skel.go index 7da9167c..88dce211 100644 --- a/pkg/state/state_skel.go +++ b/pkg/state/state_skel.go @@ -19,7 +19,6 @@ package state import ( "context" "encoding/json" - "fmt" "github.com/go-logr/logr" "github.com/pkg/errors" @@ -33,7 +32,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "github.com/Mellanox/network-operator/pkg/consts" - "github.com/Mellanox/network-operator/pkg/nodeinfo" "github.com/Mellanox/network-operator/pkg/render" "github.com/Mellanox/network-operator/pkg/revision" ) @@ -464,16 +462,6 @@ func (s *stateSkel) isDaemonSetReady(uds *unstructured.Unstructured, reqLogger l return false, nil } -// Check if provided attrTypes are present in NodeAttributes.Attributes -func (s *stateSkel) checkAttributesExist(attrs nodeinfo.NodeAttributes, attrTypes ...nodeinfo.AttributeType) error { - for _, t := range attrTypes { - if _, ok := attrs.Attributes[t]; !ok { - return fmt.Errorf("mandatory node attribute does not exist for node %s", attrs.Name) - } - } - return nil -} - func (s *stateSkel) SetRenderer(renderer render.Renderer) { s.renderer = renderer }