Skip to content

Commit

Permalink
Avoid guid pool exhaustion from guid leaks by syncing it with SM (#92)
Browse files Browse the repository at this point in the history
If the node running pods that use ib network is restarted, those pods'
 GUID are deleted from UFM but are persisted in ib-kubernetes.
They become unusable and might exhaust the GUID pool
if it's configured to be small enough.
The solution is to sync guid pool with UFM because some GUIDs
might have become free to use.

Signed-off-by: amaslennikov <amaslennikov@nvidia.com>

Signed-off-by: amaslennikov <amaslennikov@nvidia.com>
  • Loading branch information
almaslennikov committed Oct 25, 2022
1 parent 52b597f commit bff11fe
Show file tree
Hide file tree
Showing 7 changed files with 222 additions and 10 deletions.
40 changes: 33 additions & 7 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,6 @@ func NewDaemon() (Daemon, error) {
return nil, err
}

guidPool, err := guid.NewPool(&daemonConfig.GUIDPool)
if err != nil {
return nil, err
}

pluginLoader := sm.NewPluginLoader()
getSmClientFunc, err := pluginLoader.LoadPlugin(path.Join(
daemonConfig.PluginPath, daemonConfig.Plugin+".so"), sm.InitializePluginFunc)
Expand All @@ -125,6 +120,14 @@ func NewDaemon() (Daemon, error) {
return nil, validateErr
}

guidPool, err := guid.NewPool(&daemonConfig.GUIDPool)
if err != nil {
return nil, err
}

// Reset guid pool with already allocated guids to avoid collisions
err = syncGuidPool(smClient, guidPool)

podWatcher := watcher.NewWatcher(podEventHandler, client)
return &daemon{
config: daemonConfig,
Expand Down Expand Up @@ -256,7 +259,16 @@ func (d *daemon) processNetworkGUID(networkID string, spec *utils.IbSriovCniSpec
} else {
guidAddr, err = d.guidPool.GenerateGUID()
if err != nil {
return fmt.Errorf("failed to generate GUID for pod ID %s, with error: %v", pi.pod.UID, err)
switch err {
// If the guid pool is exhausted, need to sync with SM in case there are unsynced changes
case guid.GuidPoolExhaustedError:
err = syncGuidPool(d.smClient, d.guidPool)
if err != nil {
return err
}
default:
return fmt.Errorf("failed to generate GUID for pod ID %s, with error: %v", pi.pod.UID, err)
}
}

allocatedGUID = guidAddr.String()
Expand Down Expand Up @@ -284,6 +296,20 @@ func (d *daemon) processNetworkGUID(networkID string, spec *utils.IbSriovCniSpec
return nil
}

func syncGuidPool(smClient plugins.SubnetManagerClient, guidPool guid.Pool) error {
usedGuids, err := smClient.ListGuidsInUse()
if err != nil {
return err
}

// Reset guid pool with already allocated guids to avoid collisions
err = guidPool.Reset(usedGuids)
if err != nil {
return err
}
return nil
}

// Update and set Pod's network annotation.
// If failed to update annotation, pod's GUID added into the list to be removed from Pkey.
func (d *daemon) updatePodNetworkAnnotation(pi *podNetworkInfo, removedList *[]net.HardwareAddr) error {
Expand Down Expand Up @@ -537,7 +563,7 @@ func (d *daemon) DeletePeriodicUpdate() {
log.Info().Msg("delete periodic update finished")
}

// initPool check the guids that are already allocated by the running pods
// initPool check the guids that are already allocated by the running pods
func (d *daemon) initPool() error {
log.Info().Msg("Initializing GUID pool.")

Expand Down
51 changes: 48 additions & 3 deletions pkg/guid/guid_pool.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package guid

import (
"errors"
"fmt"

"github.com/rs/zerolog/log"

"github.com/Mellanox/ib-kubernetes/pkg/config"
Expand All @@ -19,8 +19,13 @@ type Pool interface {
// ReleaseGUID release the reservation of the guid.
// It returns error if the guid is not in the range.
ReleaseGUID(string) error

// Reset clears the current pool and resets it with given values (may be empty)
Reset(guids []string) error
}

var GuidPoolExhaustedError = errors.New("GUID pool is exhausted")

type guidPool struct {
rangeStart GUID // first guid in range
rangeEnd GUID // last guid in range
Expand Down Expand Up @@ -50,6 +55,34 @@ func NewPool(conf *config.GUIDPoolConfig) (Pool, error) {
}, nil
}

// Reset clears the current pool and resets it with given values (may be empty)
func (p *guidPool) Reset(guids []string) error {
log.Debug().Msg("resetting guid pool")

p.guidPoolMap = map[GUID]bool{}
if guids == nil {
return nil
}

for _, guid := range guids {
guidInRange, err := p.isGuidStringInRange(guid)
if err != nil {
log.Debug().Msgf("error validating GUID: %s: %v", guid, err)
return err
}
if !guidInRange {
// Out of range GUID may be expected and shouldn't be allocated in the pool
continue
}
err = p.AllocateGUID(guid)
if err != nil {
log.Debug().Msgf("error resetting the pool with value: %s: %v", guid, err)
return err
}
}
return nil
}

// GenerateGUID generates a guid from the range
func (p *guidPool) GenerateGUID() (GUID, error) {
// this look will ensure that we check all the range
Expand All @@ -62,7 +95,7 @@ func (p *guidPool) GenerateGUID() (GUID, error) {
if guid := p.getFreeGUID(p.rangeStart, p.rangeEnd); guid != 0 {
return guid, nil
}
return 0, fmt.Errorf("guid pool range is full")
return 0, GuidPoolExhaustedError
}

// ReleaseGUID release allocated guid
Expand All @@ -88,7 +121,7 @@ func (p *guidPool) AllocateGUID(guid string) error {
return err
}

if guidAddr < p.rangeStart || guidAddr > p.rangeEnd {
if !p.isGuidInRange(guidAddr) {
return fmt.Errorf("out of range guid %s, pool range %v - %v", guid, p.rangeStart, p.rangeEnd)
}

Expand All @@ -104,6 +137,18 @@ func isValidRange(rangeStart, rangeEnd GUID) bool {
return rangeStart <= rangeEnd && rangeStart != 0 && rangeEnd != 0xFFFFFFFFFFFFFFFF
}

func (p *guidPool) isGuidInRange(guid GUID) bool {
return guid >= p.rangeStart && guid <= p.rangeEnd
}

func (p *guidPool) isGuidStringInRange(guid string) (bool, error) {
guidAddr, err := ParseGUID(guid)
if err != nil {
return false, err
}
return p.isGuidInRange(guidAddr), nil
}

// getFreeGUID return free guid in given range
func (p *guidPool) getFreeGUID(start, end GUID) GUID {
for guid := start; guid <= end; guid++ {
Expand Down
51 changes: 51 additions & 0 deletions pkg/guid/guid_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,57 @@ import (

var _ = Describe("GUID Pool", func() {
conf := &config.GUIDPoolConfig{RangeStart: "02:00:00:00:00:00:00:00", RangeEnd: "02:FF:FF:FF:FF:FF:FF:FF"}
Context("ResetPool", func() {
It("Reset pool clears previous values", func() {
pool, err := NewPool(conf)
Expect(err).ToNot(HaveOccurred())
Expect(pool).ToNot(BeNil())

err = pool.AllocateGUID("02:00:00:00:00:00:00:00")
Expect(err).ToNot(HaveOccurred())
err = pool.AllocateGUID("02:00:00:00:FF:00:00:00")
Expect(err).ToNot(HaveOccurred())

pool.Reset(nil)

err = pool.ReleaseGUID("02:00:00:00:00:00:00:00")
Expect(err).To(HaveOccurred())
err = pool.ReleaseGUID("02:00:00:00:FF:00:00:00")
Expect(err).To(HaveOccurred())
})
It("Reset pool stores new values", func() {
pool, err := NewPool(conf)
Expect(err).ToNot(HaveOccurred())
Expect(pool).ToNot(BeNil())

expectedGuids := []string{"02:00:00:00:00:00:00:3e", "02:00:0F:F0:00:FF:00:09", "02:00:00:00:00:00:00:00"}

pool.Reset(expectedGuids)

for _, expectedGuid := range expectedGuids {
err = pool.ReleaseGUID(expectedGuid)
Expect(err).ToNot(HaveOccurred())
}
})
It("Exhausted pool throws error and doesn't after reset", func() {
conf := &config.GUIDPoolConfig{RangeStart: "02:00:00:00:00:00:00:00", RangeEnd: "02:00:00:00:00:00:00:00"}
pool, err := NewPool(conf)
Expect(err).ToNot(HaveOccurred())
Expect(pool).ToNot(BeNil())
guid, err := pool.GenerateGUID()
Expect(err).ToNot(HaveOccurred())
err = pool.AllocateGUID(guid.String())
Expect(err).ToNot(HaveOccurred())
guid, err = pool.GenerateGUID()
Expect(err).To(Equal(GuidPoolExhaustedError))

err = pool.Reset(nil)
Expect(err).ToNot(HaveOccurred())

guid, err = pool.GenerateGUID()
Expect(err).ToNot(HaveOccurred())
})
})
Context("NewPool", func() {
It("Create guid pool with valid parameters", func() {
pool, err := NewPool(conf)
Expand Down
5 changes: 5 additions & 0 deletions pkg/sm/plugins/noop/noop.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ func (p *plugin) RemoveGuidsFromPKey(pkey int, guids []net.HardwareAddr) error {
return nil
}

func (p *plugin) ListGuidsInUse() ([]string, error) {
log.Info().Msg("noop Plugin ListGuidsInUse()")
return nil, nil
}

// Initialize applies configs to plugin and return a subnet manager client
func Initialize() (plugins.SubnetManagerClient, error) {
log.Info().Msg("Initializing noop plugin")
Expand Down
3 changes: 3 additions & 0 deletions pkg/sm/plugins/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,7 @@ type SubnetManagerClient interface {
// RemoveGuidsFromPKey remove guids for given pkey.
// It return error if failed.
RemoveGuidsFromPKey(pkey int, guids []net.HardwareAddr) error

// ListGuidsInUse returns a list of all GUIDS associated with PKeys
ListGuidsInUse() ([]string, error)
}
43 changes: 43 additions & 0 deletions pkg/sm/plugins/ufm/ufm.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package main

import (
"encoding/json"
"fmt"
"net"
"net/http"
Expand Down Expand Up @@ -133,6 +134,48 @@ func (u *ufmPlugin) RemoveGuidsFromPKey(pKey int, guids []net.HardwareAddr) erro
return nil
}

// convertToMacAddr adds semicolons each 2 characters to convert to MAC format
// UFM returns GUIDS without any delimiters, so expected format is as follows:
// FF00FF00FF00FF00
func convertToMacAddr(guid string) string {
for i := 2; i < len(guid); i += 3 {
guid = guid[:i] + ":" + guid[i:]
}
return guid
}

type Guid struct {
GuidValue string `json:"guid"`
}

type PKey struct {
Guids []Guid `json:"guids"`
}

// ListGuidsInUse returns all guids currently in use by pKeys
func (u *ufmPlugin) ListGuidsInUse() ([]string, error) {
response, err := u.client.Get(u.buildURL("/ufmRest/resources/pkeys/?guids_data=true"), http.StatusOK)
if err != nil {
return nil, fmt.Errorf("failed to get the list of guids: %v", err)
}

var pKeys map[string]PKey

if err := json.Unmarshal(response, &pKeys); err != nil {
return nil, fmt.Errorf("failed to get the list of guids: %v", err)
}

var guids []string

for pkey, _ := range pKeys {
pkeyData := pKeys[pkey]
for _, guidData := range pkeyData.Guids {
guids = append(guids, convertToMacAddr(guidData.GuidValue))
}
}
return guids, nil
}

func (u *ufmPlugin) buildURL(path string) string {
return fmt.Sprintf("%s://%s:%d%s", u.conf.HTTPSchema, u.conf.Address, u.conf.Port, path)
}
Expand Down
39 changes: 39 additions & 0 deletions pkg/sm/plugins/ufm/ufm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,4 +152,43 @@ var _ = Describe("Ufm Subnet Manager Client plugin", func() {
Expect(&errMsg).To(Equal(&errMessage))
})
})
Context("ListGuidsInUse", func() {
It("Remove guid from valid pkey", func() {
testResponse := `{
"0x7fff": {
"guids": []
},
"0x7aff": {
"test": "val"
},
"0x5": {
"guids": [
{
"guid": "020000000000003e"
},
{
"guid": "02000FF000FF0009"
}
]
},
"0x6": {
"guids": [
{
"guid": "0200000000000000"
}
]
}
}`

client := &mocks.Client{}
client.On("Get", mock.Anything, mock.Anything).Return([]byte(testResponse), nil)

plugin := &ufmPlugin{client: client, conf: UFMConfig{}}
guids, err := plugin.ListGuidsInUse()
Expect(err).ToNot(HaveOccurred())

expectedGuids := []string{"02:00:00:00:00:00:00:3e", "02:00:0F:F0:00:FF:00:09", "02:00:00:00:00:00:00:00"}
Expect(guids).To(ConsistOf(expectedGuids))
})
})
})

0 comments on commit bff11fe

Please sign in to comment.