Skip to content

Commit

Permalink
fix infinite loop in init-pytorch container (#1756)
Browse files Browse the repository at this point in the history
* fix infinite loop

* fix flag name
  • Loading branch information
AxeZhan authored Feb 17, 2023
1 parent ea97855 commit 7067269
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 1 deletion.
2 changes: 2 additions & 0 deletions cmd/training-operator.v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ func main() {
config.PyTorchInitContainerImageDefault, "The image for pytorch init container")
flag.StringVar(&config.Config.PyTorchInitContainerTemplateFile, "pytorch-init-container-template-file",
config.PyTorchInitContainerTemplateFileDefault, "The template file for pytorch init container")
flag.IntVar(&config.Config.PyTorchInitContainerMaxTries, "pytorch-init-container-max-tries",
config.PyTorchInitContainerMaxTriesDefault, "The number of tries for the pytorch init container")

// MPI related flags
flag.StringVar(&config.Config.MPIKubectlDeliveryImage, "mpi-kubectl-delivery-image",
Expand Down
3 changes: 3 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ var Config struct {
PyTorchInitContainerTemplateFile string
PyTorchInitContainerImage string
MPIKubectlDeliveryImage string
PyTorchInitContainerMaxTries int
}

const (
Expand All @@ -28,6 +29,8 @@ const (
// PyTorchInitContainerTemplateFileDefault is the default template file for
// the pytorch init container.
PyTorchInitContainerTemplateFileDefault = "/etc/config/initContainer.yaml"
// PyTorchInitContainerMaxTriesDefault is the default number of tries for the pytorch init container.
PyTorchInitContainerMaxTriesDefault = 100
// MPIKubectlDeliveryImageDefault is the default image for launcher pod in MPIJob init container.
MPIKubectlDeliveryImageDefault = "mpioperator/kubectl-delivery:latest"
)
6 changes: 5 additions & 1 deletion pkg/controller.v1/pytorch/initcontainer.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,23 @@ var (
requests:
cpu: 50m
memory: 10Mi
command: ['sh', '-c', 'until nslookup {{.MasterAddr}}; do echo waiting for master; sleep 2; done;']`
command: ['sh', '-c', 'err=1;for i in $(seq {{.MaxTries}}); do if nslookup {{.MasterAddr}}; then err=0 && break; fi;echo waiting for master; sleep 2; done; exit $err']`
onceInitContainer sync.Once
icGenerator *initContainerGenerator
)

type initContainerGenerator struct {
template string
image string
maxTries int
}

func getInitContainerGenerator() *initContainerGenerator {
onceInitContainer.Do(func() {
icGenerator = &initContainerGenerator{
template: getInitContainerTemplateOrDefault(config.Config.PyTorchInitContainerTemplateFile),
image: config.Config.PyTorchInitContainerImage,
maxTries: config.Config.PyTorchInitContainerMaxTries,
}
})
return icGenerator
Expand All @@ -72,9 +74,11 @@ func (i *initContainerGenerator) GetInitContainer(masterAddr string) ([]corev1.C
if err := tpl.Execute(&buf, struct {
MasterAddr string
InitContainerImage string
MaxTries int
}{
MasterAddr: masterAddr,
InitContainerImage: i.image,
MaxTries: i.maxTries,
}); err != nil {
return nil, err
}
Expand Down
1 change: 1 addition & 0 deletions pkg/controller.v1/pytorch/initcontainer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ func TestInitContainer(t *testing.T) {

config.Config.PyTorchInitContainerImage = config.PyTorchInitContainerImageDefault
config.Config.PyTorchInitContainerTemplateFile = config.PyTorchInitContainerTemplateFileDefault
config.Config.PyTorchInitContainerMaxTries = config.PyTorchInitContainerMaxTriesDefault

testCases := []struct {
job *kubeflowv1.PyTorchJob
Expand Down

0 comments on commit 7067269

Please sign in to comment.