Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retry GPU devices check during env vars load if instance supports GPU #4387

Merged
merged 1 commit into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 49 additions & 19 deletions ecs-init/docker/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,21 +54,21 @@ const (
networkMode = "host"
// usernsMode specifies the userns mode to create the agent container
usernsMode = "host"
// minBackoffDuration specifies the minimum backoff duration for ping to
// pingDockerSocketMinBackoffDuration specifies the minimum backoff duration for ping to
// return a success response from the docker socket
minBackoffDuration = time.Second
// maxBackoffDuration specifies the maximum backoff duration for ping to
pingDockerSocketMinBackoffDuration = time.Second
// pingDockerSocketMaxBackoffDuration specifies the maximum backoff duration for ping to
// return a success response from docker socket
maxBackoffDuration = 3 * time.Second
// backoffJitterMultiple specifies the backoff jitter multiplier
pingDockerSocketMaxBackoffDuration = 3 * time.Second
// pingDockerSocketBackoffJitterMultiple specifies the backoff jitter multiplier
// coefficient when pinging the docker socket
backoffJitterMultiple = 0.2
// backoffMultiple specifies the backoff multiplier coefficient when
pingDockerSocketBackoffJitterMultiple = 0.2
// pingDockerSocketBackoffMultiple specifies the backoff multiplier coefficient when
// pinging the docker socket
backoffMultiple = 2
// maxRetries specifies the maximum number of retries for ping to return
pingDockerSocketBackoffMultiple = 2
// pingDockerSocketMaxRetries specifies the maximum number of retries for ping to return
// a successful response from the docker socket
maxRetries = 5
pingDockerSocketMaxRetries = 5
// DefaultCgroupMountpoint is the default mount point for the cgroup subsystem
DefaultCgroupMountpoint = "/sys/fs/cgroup"
// pluginSocketFilesDir specifies the location of UNIX domain socket files of
Expand Down Expand Up @@ -110,6 +110,13 @@ const (
execConfigRelativePath = "config"

execAgentLogRelativePath = "/exec"

// nvidiaGPUDevicesPresentRetryTime specifies the duration of time to wait before retrying to check if NVIDIA
// GPU devices are present.
nvidiaGPUDevicesPresentRetryTime = 3 * time.Second
// nvidiaGPUDevicesPresentMaxRetries specifies the maximum number of retries to attempt for checking if NVIDIA
// GPU devices are present.
nvidiaGPUDevicesPresentMaxRetries = 10
)

// Do NOT include "CAP_" in capability string
Expand All @@ -133,12 +140,13 @@ var pluginDirs = []string{
}

var (
dockerOnce sync.Once
dockerClient *client
dockerClientErr error
isPathValid = defaultIsPathValid
execCommand = exec.Command
execLookPath = exec.LookPath
dockerOnce sync.Once
dockerClient *client
dockerClientErr error
isPathValid = defaultIsPathValid
execCommand = exec.Command
execLookPath = exec.LookPath
checkNvidiaGPUDevicesPresence = nvidiaGPUDevicesPresent
)

// client enables business logic for running the Agent inside Docker
Expand All @@ -153,8 +161,8 @@ func Client() (*client, error) {
// Create a backoff for pinging the docker socket. This should result in 17-19
// seconds of delay in the worst-case between different actions that depend on
// docker
pingBackoff := backoff.NewBackoff(minBackoffDuration, maxBackoffDuration, backoffJitterMultiple,
backoffMultiple, maxRetries)
pingBackoff := backoff.NewBackoff(pingDockerSocketMinBackoffDuration, pingDockerSocketMaxBackoffDuration, pingDockerSocketBackoffJitterMultiple,
pingDockerSocketBackoffMultiple, pingDockerSocketMaxRetries)
cl, err := newDockerClient(godockerClientFactory{}, pingBackoff)
if err != nil {
dockerClientErr = err
Expand Down Expand Up @@ -372,7 +380,12 @@ func (c *client) LoadEnvVars() map[string]string {
// merge in instance-specific environment variables
for envKey, envValue := range c.loadCustomInstanceEnvVars() {
if envKey == config.GPUSupportEnvVar && envValue == "true" {
if !nvidiaGPUDevicesPresent() {
// If environment variable with key `config.GPUSupportEnvVar` has value `true`, it is expected that
// NVIDIA GPU devices should eventually be present on the instance. Thus, do NOT give up and continue right
// away in the event that NVIDIA GPU devices are not yet present. Call
// `nvidiaGPUDevicesPresentWithRetries()` (instead of `nvidiaGPUDevicesPresent`) to retry and wait for a
// reasonable amount of time for NVIDIA GPU devices to be present before continuing.
if !nvidiaGPUDevicesPresentWithRetries() {
log.Warn("No GPU devices found, ignoring the GPU support config")
continue
}
Expand Down Expand Up @@ -571,6 +584,23 @@ func nvidiaGPUDevicesPresent() bool {
return true
}

// nvidiaGPUDevicesPresentWithRetries checks if NVIDIA GPU devices are present in the instance. It retries if NVIDIA
// GPU devices are not yet present every `nvidiaGPUDevicesPresentRetryTime` interval of time up to a maximum of
// `nvidiaGPUDevicesPresentMaxRetries` retries.
func nvidiaGPUDevicesPresentWithRetries() bool {
singholt marked this conversation as resolved.
Show resolved Hide resolved
devicesPresent := checkNvidiaGPUDevicesPresence()
for i := 0; i < nvidiaGPUDevicesPresentMaxRetries; i++ {
if devicesPresent {
break
}
log.Warnf("NVIDIA GPU devices are not yet present, retrying (attempt %d/%d) in %d nanoseconds",
i+1, nvidiaGPUDevicesPresentMaxRetries, nvidiaGPUDevicesPresentRetryTime)
time.Sleep(nvidiaGPUDevicesPresentRetryTime)
devicesPresent = checkNvidiaGPUDevicesPresence()
}
return devicesPresent
}

var MatchFilePatternForGPU = FilePatternMatchForGPU

func FilePatternMatchForGPU(pattern string) ([]string, error) {
Expand Down
52 changes: 52 additions & 0 deletions ecs-init/docker/docker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,58 @@ func TestStartAgentWithGPUConfigNoDevices(t *testing.T) {
assert.NoError(t, err)
}

func TestNvidiaGPUDevicesPresentWithRetries(t *testing.T) {
testCases := []struct {
name string
nvidiaGPUDevicesWillBePresent bool
numRetriesForNvidiaGPUDevicesToBePresent int
}{
{
name: "NVIDIA GPU devices present without retrying",
nvidiaGPUDevicesWillBePresent: true,
numRetriesForNvidiaGPUDevicesToBePresent: 0,
},
{
name: "NVIDIA GPU devices present after retrying",
nvidiaGPUDevicesWillBePresent: true,
numRetriesForNvidiaGPUDevicesToBePresent: 3,
},
{
name: "NVIDIA GPU devices not present after retrying",
nvidiaGPUDevicesWillBePresent: false,
},
}

defer func() {
checkNvidiaGPUDevicesPresence = nvidiaGPUDevicesPresent
}()

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
numTimesNvidiaGPUDevicesPresenceChecked := 0
checkNvidiaGPUDevicesPresence = func() bool {
if !tc.nvidiaGPUDevicesWillBePresent {
return false
}
numTimesNvidiaGPUDevicesPresenceChecked++
if numTimesNvidiaGPUDevicesPresenceChecked > tc.numRetriesForNvidiaGPUDevicesToBePresent {
return true
}
return false
}

devicesPresent := nvidiaGPUDevicesPresentWithRetries()
if !tc.nvidiaGPUDevicesWillBePresent {
assert.False(t, devicesPresent)
} else {
assert.Equal(t, tc.numRetriesForNvidiaGPUDevicesToBePresent, numTimesNvidiaGPUDevicesPresenceChecked-1)
assert.True(t, devicesPresent)
}
})
}

}

func TestGetContainerConfigWithFileOverrides(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
Expand Down
Loading