diff --git a/cmd/training-operator.v1/main.go b/cmd/training-operator.v1/main.go index 4b09586cdb..97344b32ee 100644 --- a/cmd/training-operator.v1/main.go +++ b/cmd/training-operator.v1/main.go @@ -85,6 +85,8 @@ func main() { config.PyTorchInitContainerImageDefault, "The image for pytorch init container") flag.StringVar(&config.Config.PyTorchInitContainerTemplateFile, "pytorch-init-container-template-file", config.PyTorchInitContainerTemplateFileDefault, "The template file for pytorch init container") + flag.IntVar(&config.Config.PyTorchInitContainerMaxTries, "pytorch-init-container-retry-number", + config.PyTorchInitContainerMaxTriesDefault, "Default number of retries for the pytorch init container") // MPI related flags flag.StringVar(&config.Config.MPIKubectlDeliveryImage, "mpi-kubectl-delivery-image", diff --git a/pkg/config/config.go b/pkg/config/config.go index 2a8ed5e813..a8e3b4fbfc 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -19,6 +19,7 @@ var Config struct { PyTorchInitContainerTemplateFile string PyTorchInitContainerImage string MPIKubectlDeliveryImage string + PyTorchInitContainerMaxTries int } const ( @@ -28,6 +29,8 @@ const ( // PyTorchInitContainerTemplateFileDefault is the default template file for // the pytorch init container. PyTorchInitContainerTemplateFileDefault = "/etc/config/initContainer.yaml" + // PyTorchInitContainerMaxTriesDefault is the default number of tries for the pytorch init container. + PyTorchInitContainerMaxTriesDefault = 100 // MPIKubectlDeliveryImageDefault is the default image for launcher pod in MPIJob init container. MPIKubectlDeliveryImageDefault = "mpioperator/kubectl-delivery:latest" ) diff --git a/pkg/controller.v1/pytorch/initcontainer.go b/pkg/controller.v1/pytorch/initcontainer.go index 39dc7d4725..8c36e06a31 100644 --- a/pkg/controller.v1/pytorch/initcontainer.go +++ b/pkg/controller.v1/pytorch/initcontainer.go @@ -43,7 +43,7 @@ var ( requests: cpu: 50m memory: 10Mi - command: ['sh', '-c', 'until nslookup {{.MasterAddr}}; do echo waiting for master; sleep 2; done;']` + command: ['sh', '-c', 'err=1;for i in $(seq {{.MaxTries}}); do if nslookup {{.MasterAddr}}; then err=0 && break; fi;echo waiting for master; sleep 2; done; exit $err']` onceInitContainer sync.Once icGenerator *initContainerGenerator ) @@ -51,6 +51,7 @@ var ( type initContainerGenerator struct { template string image string + maxTries int } func getInitContainerGenerator() *initContainerGenerator { @@ -58,6 +59,7 @@ func getInitContainerGenerator() *initContainerGenerator { icGenerator = &initContainerGenerator{ template: getInitContainerTemplateOrDefault(config.Config.PyTorchInitContainerTemplateFile), image: config.Config.PyTorchInitContainerImage, + maxTries: config.Config.PyTorchInitContainerMaxTries, } }) return icGenerator @@ -72,9 +74,11 @@ func (i *initContainerGenerator) GetInitContainer(masterAddr string) ([]corev1.C if err := tpl.Execute(&buf, struct { MasterAddr string InitContainerImage string + MaxTries int }{ MasterAddr: masterAddr, InitContainerImage: i.image, + MaxTries: i.maxTries, }); err != nil { return nil, err } diff --git a/pkg/controller.v1/pytorch/initcontainer_test.go b/pkg/controller.v1/pytorch/initcontainer_test.go index 3ead5855a9..e2f06cae1a 100644 --- a/pkg/controller.v1/pytorch/initcontainer_test.go +++ b/pkg/controller.v1/pytorch/initcontainer_test.go @@ -34,6 +34,7 @@ func TestInitContainer(t *testing.T) { config.Config.PyTorchInitContainerImage = config.PyTorchInitContainerImageDefault config.Config.PyTorchInitContainerTemplateFile = config.PyTorchInitContainerTemplateFileDefault + config.Config.PyTorchInitContainerMaxTries = config.PyTorchInitContainerMaxTriesDefault testCases := []struct { job *kubeflowv1.PyTorchJob