Skip to content

Commit

Permalink
feat: Support errors on NCP status
Browse files Browse the repository at this point in the history
Fixes Mellanox#533

Signed-off-by: Fred Rolland <frolland@nvidia.com>
  • Loading branch information
rollandf committed May 23, 2024
1 parent cb5b6c3 commit 0a7d102
Show file tree
Hide file tree
Showing 8 changed files with 117 additions and 2 deletions.
3 changes: 3 additions & 0 deletions api/v1alpha1/nicclusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,9 @@ type AppliedState struct {
Name string `json:"name"`
// +kubebuilder:validation:Enum={"ready", "notReady", "ignore", "error"}
State State `json:"state"`
// Message is a human readable message indicating details about why
// the state is in this condition
Message string `json:"message,omitempty"`
}

// NicClusterPolicyStatus defines the observed state of NicClusterPolicy
Expand Down
5 changes: 5 additions & 0 deletions config/crd/bases/mellanox.com_hostdevicenetworks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ spec:
description: AppliedState defines a finer-grained view of the observed
state of NicClusterPolicy
properties:
message:
description: |-
Message is a human readable message indicating details about why
the state is in this condition
type: string
name:
type: string
state:
Expand Down
5 changes: 5 additions & 0 deletions config/crd/bases/mellanox.com_nicclusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1238,6 +1238,11 @@ spec:
description: AppliedState defines a finer-grained view of the observed
state of NicClusterPolicy
properties:
message:
description: |-
Message is a human readable message indicating details about why
the state is in this condition
type: string
name:
type: string
state:
Expand Down
5 changes: 5 additions & 0 deletions controllers/nicclusterpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,11 @@ NextResult:
for i := range cr.Status.AppliedStates {
if cr.Status.AppliedStates[i].Name == stateStatus.StateName {
cr.Status.AppliedStates[i].State = mellanoxv1alpha1.State(stateStatus.Status)
if stateStatus.ErrInfo != nil {
cr.Status.AppliedStates[i].Message = stateStatus.ErrInfo.Error()
} else {
cr.Status.AppliedStates[i].Message = ""
}
continue NextResult
}
}
Expand Down
87 changes: 87 additions & 0 deletions controllers/nicclusterpolicy_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,84 @@ var _ = Describe("NicClusterPolicyReconciler Controller", func() {
Expect(err).NotTo(HaveOccurred())
})
})
// Checks that a failure of the OFED state is reported through the Message
// field of the "state-OFED" entry in NicClusterPolicy.Status.AppliedStates,
// and that the message is emptied again once ForcePrecompiled is switched off.
Context("When MOFED precompiled tag does not exists", func() {
It("should set error message in status", func() {
By("Create Node")
// The label values below (OS name/version, CPU arch, full kernel version)
// reappear in the tag at the end of the expected error message further down.
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
Labels: map[string]string{
nodeinfo.NodeLabelMlnxNIC: "true",
nodeinfo.NodeLabelOSName: "ubuntu",
nodeinfo.NodeLabelCPUArch: "amd64",
nodeinfo.NodeLabelKernelVerFull: "generic-9.0.1",
nodeinfo.NodeLabelOSVer: "20.0.4"},
Annotations: make(map[string]string),
},
}
err := k8sClient.Create(context.TODO(), node)
Expect(err).NotTo(HaveOccurred())
By("Create NicClusterPolicy with MOFED ForcePrecompiled")
// ForcePrecompiled is set to true here; the test expects this to end in the
// "precompiled tag was not found" error asserted below.
cr := mellanoxv1alpha1.NicClusterPolicy{
ObjectMeta: metav1.ObjectMeta{
Name: "nic-cluster-policy",
Namespace: "",
},
Spec: mellanoxv1alpha1.NicClusterPolicySpec{
OFEDDriver: &mellanoxv1alpha1.OFEDDriverSpec{
ForcePrecompiled: true,
ImageSpec: mellanoxv1alpha1.ImageSpec{
Image: "mofed",
Repository: "nvcr.io/nvidia/mellanox",
Version: "5.9-0.5.6.0",
ImagePullSecrets: []string{},
},
},
},
}

err = k8sClient.Create(context.TODO(), &cr)
Expect(err).NotTo(HaveOccurred())

// Read the object back once to confirm it can be fetched; ncp itself is not
// used by the remainder of this test.
ncp := &mellanoxv1alpha1.NicClusterPolicy{}
err = k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, ncp)
Expect(err).NotTo(HaveOccurred())

By("Wait for NicClusterPolicy OFED state error message to be populated")
// Full expected message, compared for equality (not as a substring). The
// trailing tag is <driver version>-<kernel>-<os name><os version>-<arch>,
// matching the spec version and node labels created above.
msg := "failed to create k8s objects from manifest: " +
"failed to render objects: ForcePrecompiled is enabled " +
"and precompiled tag was not found: " +
"5.9-0.5.6.0-generic-9.0.1-ubuntu20.0.4-amd64"

// Poll the CR until the "state-OFED" message equals msg. timeout and interval
// are not defined in this block (presumably shared test-suite values).
// NOTE(review): the closure writes to the outer err and calls Expect itself.
Eventually(func() string {
found := &mellanoxv1alpha1.NicClusterPolicy{}
err = k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, found)
Expect(err).NotTo(HaveOccurred())
return getAppliedStateMessage(found.Status.AppliedStates, "state-OFED")
}, timeout*10, interval).Should(BeEquivalentTo(msg))

By("Set MOFED ForcePrecompiled to false")
// Sent as a JSON merge patch that only names spec.ofedDriver.forcePrecompiled.
patch := []byte(`{"spec": {"ofedDriver":{"forcePrecompiled": false}}}`)
Expect(k8sClient.Patch(context.TODO(), &cr, client.RawPatch(types.MergePatchType, patch))).To(Succeed())

By("Wait for NicClusterPolicy OFED state error message to be cleared")
// An empty string is also what getAppliedStateMessage returns when no
// "state-OFED" entry exists, so this wait accepts either situation.
msg = ""
Eventually(func() string {
found := &mellanoxv1alpha1.NicClusterPolicy{}
err = k8sClient.Get(context.TODO(), types.NamespacedName{Namespace: cr.GetNamespace(), Name: cr.GetName()}, found)
Expect(err).NotTo(HaveOccurred())
return getAppliedStateMessage(found.Status.AppliedStates, "state-OFED")
}, timeout*10, interval).Should(BeEquivalentTo(msg))

// Remove the objects created by this test (presumably so that other Contexts,
// which create their own Node and policy, do not collide with them).
By("Delete NicClusterPolicy")
err = k8sClient.Delete(context.TODO(), &cr)
Expect(err).NotTo(HaveOccurred())

By("Delete Node")
err = k8sClient.Delete(context.TODO(), node)
Expect(err).NotTo(HaveOccurred())
})
})
Context("When NicClusterPolicy CR is deleted", func() {
It("should set mofed.wait to false", func() {
By("Create Node")
Expand Down Expand Up @@ -318,3 +396,12 @@ var _ = Describe("NicClusterPolicyReconciler Controller", func() {
})
})
})

// getAppliedStateMessage looks up the entry in states whose Name equals
// stateName and returns that entry's Message. The first match wins. An empty
// string is returned when states has no entry with that name.
func getAppliedStateMessage(states []mellanoxv1alpha1.AppliedState, stateName string) string {
	for idx := range states {
		if states[idx].Name != stateName {
			continue
		}
		return states[idx].Message
	}
	return ""
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ spec:
description: AppliedState defines a finer-grained view of the observed
state of NicClusterPolicy
properties:
message:
description: |-
Message is a human readable message indicating details about why
the state is in this condition
type: string
name:
type: string
state:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1238,6 +1238,11 @@ spec:
description: AppliedState defines a finer-grained view of the observed
state of NicClusterPolicy
properties:
message:
description: |-
Message is a human readable message indicating details about why
the state is in this condition
type: string
name:
type: string
state:
Expand Down
4 changes: 2 additions & 2 deletions pkg/state/state_ofed.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ func (s *stateOFED) Sync(ctx context.Context, customResource interface{}, infoCa
objs, err := s.GetManifestObjects(ctx, cr, infoCatalog, log.FromContext(ctx))

if err != nil {
return SyncStateNotReady, errors.Wrap(err, "failed to create k8s objects from manifest")
return SyncStateError, errors.Wrap(err, "failed to create k8s objects from manifest")
}
if len(objs) == 0 {
// GetManifestObjects returned no objects, this means that no objects need to be applied to the cluster
Expand Down Expand Up @@ -455,7 +455,7 @@ func renderObjects(ctx context.Context, nodePool *nodeinfo.NodePool, useDtk bool
precompiledExists := docaProvider.TagExists(precompiledTag)
reqLogger.V(consts.LogLevelDebug).Info("Precompiled tag", "tag:", precompiledTag, "found:", precompiledExists)
if !precompiledExists && cr.Spec.OFEDDriver.ForcePrecompiled {
return nil, fmt.Errorf("ForcePrecompiled is enabled and precompiled image was not found")
return nil, fmt.Errorf("ForcePrecompiled is enabled and precompiled tag was not found: %s", precompiledTag)
}

if precompiledExists {
Expand Down

0 comments on commit 0a7d102

Please sign in to comment.