Skip to content

Commit

Permalink
Merge pull request #734 from SchSeba/fix_mlx_plugin
Browse files Browse the repository at this point in the history
Skip firmware reset on devices the operator doesn't control
  • Loading branch information
zeeke committed Jul 15, 2024
2 parents c3ba0f3 + 411a215 commit 64d69b6
Show file tree
Hide file tree
Showing 7 changed files with 120 additions and 6 deletions.
14 changes: 14 additions & 0 deletions pkg/helper/mock/mock_helper.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions pkg/host/internal/sriov/sriov.go
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,13 @@ func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt,
log.Log.V(2).Info("checkForConfigAndReset(): PF name with pci address was externally created skipping the device reset",
"pf-name", ifaceStatus.Name,
"address", ifaceStatus.PciAddress)

// remove pf status from host
err = storeManager.RemovePfAppliedStatus(ifaceStatus.PciAddress)
if err != nil {
return err
}

return nil
}
err = s.removeUdevRules(ifaceStatus.PciAddress)
Expand All @@ -828,6 +835,12 @@ func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt,
return err
}

// remove pf status from host
err = storeManager.RemovePfAppliedStatus(ifaceStatus.PciAddress)
if err != nil {
return err
}

return nil
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/host/internal/sriov/sriov_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,7 @@ var _ = Describe("SRIOV", func() {
PciAddress: "0000:d8:00.0",
NumVfs: 2,
}, true, nil)
storeManagerMode.EXPECT().RemovePfAppliedStatus("0000:d8:00.0").Return(nil)
netlinkLibMock.EXPECT().DevLinkGetDeviceByName("pci", "0000:d8:00.0").Return(
&netlink.DevlinkDevice{Attrs: netlink.DevlinkDevAttrs{Eswitch: netlink.DevlinkDevEswitchAttr{Mode: "legacy"}}},
nil)
Expand Down Expand Up @@ -479,6 +480,7 @@ var _ = Describe("SRIOV", func() {
NumVfs: 2,
ExternallyManaged: true,
}, true, nil)
storeManagerMode.EXPECT().RemovePfAppliedStatus("0000:d8:00.0").Return(nil)
Expect(s.ConfigSriovInterfaces(storeManagerMode,
[]sriovnetworkv1.Interface{},
[]sriovnetworkv1.InterfaceExt{
Expand Down
14 changes: 14 additions & 0 deletions pkg/host/store/mock/mock_store.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions pkg/host/store/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
type ManagerInterface interface {
ClearPCIAddressFolder() error
SaveLastPfAppliedStatus(PfInfo *sriovnetworkv1.Interface) error
RemovePfAppliedStatus(pciAddress string) error
LoadPfsStatus(pciAddress string) (*sriovnetworkv1.Interface, bool, error)

GetCheckPointNodeState() (*sriovnetworkv1.SriovNetworkNodeState, error)
Expand Down Expand Up @@ -111,6 +112,17 @@ func (s *manager) SaveLastPfAppliedStatus(PfInfo *sriovnetworkv1.Interface) erro
return err
}

func (s *manager) RemovePfAppliedStatus(pciAddress string) error {
hostExtension := utils.GetHostExtension()
pathFile := filepath.Join(hostExtension, consts.PfAppliedConfig, pciAddress)
err := os.RemoveAll(pathFile)
if err != nil {
log.Log.Error(err, "failed to remove PF status", "pathFile", pathFile)
return err
}
return nil
}

// LoadPfsStatus convert the /etc/sriov-operator/pci/<pci-address> json to pfstatus
// returns false if the file doesn't exist.
func (s *manager) LoadPfsStatus(pciAddress string) (*sriovnetworkv1.Interface, bool, error) {
Expand Down
21 changes: 21 additions & 0 deletions pkg/plugins/mellanox/mellanox_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,11 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS
processedNics[pciPrefix] = true
pciAddress := pciPrefix + "0"

// Skip devices not configured by the operator
if p.nicNotConfiguredByOperator(portsMap) {
continue
}

// Skip externally managed NICs
if p.nicHasExternallyManagedPFs(portsMap) {
continue
Expand Down Expand Up @@ -206,3 +211,19 @@ func (p *MellanoxPlugin) nicHasExternallyManagedPFs(nicPortsMap map[string]sriov
}
return false
}

// nicNotConfiguredByOperator returns true if one of the ports(interface) of the NIC is not configured by operator
func (p *MellanoxPlugin) nicNotConfiguredByOperator(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) bool {
for _, iface := range nicPortsMap {
_, exist, err := p.helpers.LoadPfsStatus(iface.PciAddress)
if err != nil {
log.Log.Error(err, "failed to load PF status from disk", "address", iface.PciAddress)
continue
}
if exist {
log.Log.V(2).Info("PF configured by the operator", "interface", iface)
return true
}
}
return false
}
50 changes: 44 additions & 6 deletions test/conformance/tests/test_sriov_operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1222,7 +1222,7 @@ var _ = Describe("[sriov] operator", func() {
It("Should be possible to create a vfio-pci resource and allocate to a pod", func() {
By("creating a vfio-pci node policy")
resourceName := "testvfio"
_, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, vfioNic.Name, vfioNode, 5, resourceName, "vfio-pci")
vfioPolicy, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, vfioNic.Name, vfioNode, 5, resourceName, "vfio-pci")
Expect(err).ToNot(HaveOccurred())

By("waiting for the node state to be updated")
Expand All @@ -1249,6 +1249,11 @@ var _ = Describe("[sriov] operator", func() {
return allocatable
}, 10*time.Minute, time.Second).Should(Equal(int64(5)))

By("validate the pf info exist on host")
output, _, err := runCommandOnConfigDaemon(vfioNode, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l")
Expect(err).ToNot(HaveOccurred())
Expect(output).ToNot(Equal("1"))

By("Creating sriov network to use the vfio device")
sriovNetwork := &sriovv1.SriovNetwork{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -1278,6 +1283,25 @@ var _ = Describe("[sriov] operator", func() {
networkStatusJSON, exist := firstPod.Annotations["k8s.v1.cni.cncf.io/network-status"]
Expect(exist).To(BeTrue())
Expect(networkStatusJSON).To(ContainSubstring(fmt.Sprintf("\"mtu\": %d", vfioNic.Mtu)))

By("deleting the policy")
err = clients.Delete(context.Background(), vfioPolicy, &runtimeclient.DeleteOptions{})
Expect(err).ToNot(HaveOccurred())
WaitForSRIOVStable()

Eventually(func() int64 {
testedNode, err := clients.CoreV1Interface.Nodes().Get(context.Background(), vfioNode, metav1.GetOptions{})
Expect(err).ToNot(HaveOccurred())
resNum := testedNode.Status.Allocatable[corev1.ResourceName("openshift.io/"+resourceName)]
allocatable, _ := resNum.AsInt64()
return allocatable
}, 2*time.Minute, time.Second).Should(Equal(int64(0)))

By("validate the pf info doesn't exist on the host anymore")
output, _, err = runCommandOnConfigDaemon(vfioNode, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l")
Expect(err).ToNot(HaveOccurred())
Expect(output).ToNot(Equal("0"))

})
})

Expand Down Expand Up @@ -2196,15 +2220,19 @@ var _ = Describe("[sriov] operator", func() {
Context("ExternallyManaged Validation", func() {
numVfs := 5
var node string
var nic *sriovv1.InterfaceExt
var nic sriovv1.InterfaceExt
externallyManage := func(policy *sriovv1.SriovNetworkNodePolicy) {
policy.Spec.ExternallyManaged = true
}

execute.BeforeAll(func() {
var err error
node, nic, err = sriovInfos.FindOneSriovNodeAndDevice()
Expect(err).ToNot(HaveOccurred())
node, nic = sriovInfos.FindOneVfioSriovDevice()
})

BeforeEach(func() {
if node == "" {
Skip("not suitable device found for the test")
}

By("Using device " + nic.Name + " on node " + node)
})
Expand Down Expand Up @@ -2265,6 +2293,11 @@ var _ = Describe("[sriov] operator", func() {
return allocatable
}, 3*time.Minute, time.Second).Should(Equal(int64(numVfs)))

By("validate the pf info exist on host")
output, _, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l")
Expect(err).ToNot(HaveOccurred())
Expect(output).ToNot(Equal("1"))

By("deleting the policy")
err = clients.Delete(context.Background(), sriovPolicy, &runtimeclient.DeleteOptions{})
Expect(err).ToNot(HaveOccurred())
Expand All @@ -2279,11 +2312,16 @@ var _ = Describe("[sriov] operator", func() {
}, 2*time.Minute, time.Second).Should(Equal(int64(0)))

By("checking the virtual functions are still on the host")
output, errOutput, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("cat /host/sys/class/net/%s/device/sriov_numvfs", nic.Name))
output, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("cat /host/sys/class/net/%s/device/sriov_numvfs", nic.Name))
Expect(err).ToNot(HaveOccurred())
Expect(errOutput).To(Equal(""))
Expect(output).To(ContainSubstring("5"))

By("validate the pf info doesn't exist on the host anymore")
output, _, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l")
Expect(err).ToNot(HaveOccurred())
Expect(output).ToNot(Equal("0"))

By("cleaning the manual sriov created")
_, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo 0 > /host/sys/class/net/%s/device/sriov_numvfs", nic.Name))
Expect(err).ToNot(HaveOccurred())
Expand Down

0 comments on commit 64d69b6

Please sign in to comment.