Skip to content

Commit

Permalink
Retry GPU devices check during env vars load if instance supports GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
danehlim committed Oct 8, 2024
1 parent 9eb274a commit a5062b9
Showing 1 changed file with 25 additions and 1 deletion.
26 changes: 25 additions & 1 deletion ecs-init/docker/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,12 @@ func (c *client) LoadEnvVars() map[string]string {
// merge in instance-specific environment variables
for envKey, envValue := range c.loadCustomInstanceEnvVars() {
if envKey == config.GPUSupportEnvVar && envValue == "true" {
if !nvidiaGPUDevicesPresent() {
// If environment variable with key `config.GPUSupportEnvVar` has value `true`, it is expected that
// NVIDIA GPU devices should eventually be present on the instance. Thus, do NOT give up and continue right
// away in the event that NVIDIA GPU devices are not yet present. Call
// `nvidiaGPUDevicesPresentWithRetries()` (instead of `nvidiaGPUDevicesPresent`) to retry and wait for a
// reasonable amount of time for NVIDIA GPU devices to be present before continuing.
if !nvidiaGPUDevicesPresentWithRetries() {
log.Warn("No GPU devices found, ignoring the GPU support config")
continue
}
Expand Down Expand Up @@ -571,6 +576,25 @@ func nvidiaGPUDevicesPresent() bool {
return true
}

// nvidiaGPUDevicesPresentWithRetries reports whether NVIDIA GPU devices are present on the instance, polling
// nvidiaGPUDevicesPresent until it succeeds or the retry budget is exhausted.
//
// The first check happens immediately. If it fails, the check is repeated up to maxRetries more times, sleeping
// retryInterval before each retry, so the caller is blocked for at most maxRetries * retryInterval of sleep
// (30 seconds with the current values) plus the time spent in the checks themselves. The returned value is the
// result of the last check performed.
func nvidiaGPUDevicesPresentWithRetries() bool {
	const (
		// retryInterval is how long to wait before re-checking for GPU devices.
		retryInterval = 3 * time.Second
		// maxRetries is the number of re-checks performed after the initial check fails.
		maxRetries = 10
	)

	if nvidiaGPUDevicesPresent() {
		return true
	}
	for attempt := 1; attempt <= maxRetries; attempt++ {
		// %s formats the time.Duration via its String method (e.g. "3s") instead of dumping the raw
		// nanosecond count, which keeps the log line readable.
		log.Warnf("NVIDIA GPU devices are not yet present, retrying (attempt %d/%d) in %s",
			attempt, maxRetries, retryInterval)
		time.Sleep(retryInterval)
		if nvidiaGPUDevicesPresent() {
			return true
		}
	}
	return false
}

var MatchFilePatternForGPU = FilePatternMatchForGPU

func FilePatternMatchForGPU(pattern string) ([]string, error) {
Expand Down

0 comments on commit a5062b9

Please sign in to comment.