Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix infinite loop in init-pytorch container #1756

Merged
merged 2 commits into from
Feb 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmd/training-operator.v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ func main() {
config.PyTorchInitContainerImageDefault, "The image for pytorch init container")
flag.StringVar(&config.Config.PyTorchInitContainerTemplateFile, "pytorch-init-container-template-file",
config.PyTorchInitContainerTemplateFileDefault, "The template file for pytorch init container")
flag.IntVar(&config.Config.PyTorchInitContainerMaxTries, "pytorch-init-container-max-tries",
config.PyTorchInitContainerMaxTriesDefault, "The number of tries for the pytorch init container")

// MPI related flags
flag.StringVar(&config.Config.MPIKubectlDeliveryImage, "mpi-kubectl-delivery-image",
Expand Down
3 changes: 3 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ var Config struct {
PyTorchInitContainerTemplateFile string
PyTorchInitContainerImage string
MPIKubectlDeliveryImage string
PyTorchInitContainerMaxTries int
}

const (
Expand All @@ -28,6 +29,8 @@ const (
// PyTorchInitContainerTemplateFileDefault is the default template file for
// the pytorch init container.
PyTorchInitContainerTemplateFileDefault = "/etc/config/initContainer.yaml"
// PyTorchInitContainerMaxTriesDefault is the default number of tries for the pytorch init container.
PyTorchInitContainerMaxTriesDefault = 100
// MPIKubectlDeliveryImageDefault is the default image for launcher pod in MPIJob init container.
MPIKubectlDeliveryImageDefault = "mpioperator/kubectl-delivery:latest"
)
6 changes: 5 additions & 1 deletion pkg/controller.v1/pytorch/initcontainer.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,23 @@ var (
requests:
cpu: 50m
memory: 10Mi
command: ['sh', '-c', 'until nslookup {{.MasterAddr}}; do echo waiting for master; sleep 2; done;']`
command: ['sh', '-c', 'err=1;for i in $(seq {{.MaxTries}}); do if nslookup {{.MasterAddr}}; then err=0 && break; fi;echo waiting for master; sleep 2; done; exit $err']`
onceInitContainer sync.Once
icGenerator *initContainerGenerator
)

type initContainerGenerator struct {
template string
image string
maxTries int
}

func getInitContainerGenerator() *initContainerGenerator {
onceInitContainer.Do(func() {
icGenerator = &initContainerGenerator{
template: getInitContainerTemplateOrDefault(config.Config.PyTorchInitContainerTemplateFile),
image: config.Config.PyTorchInitContainerImage,
maxTries: config.Config.PyTorchInitContainerMaxTries,
}
})
return icGenerator
Expand All @@ -72,9 +74,11 @@ func (i *initContainerGenerator) GetInitContainer(masterAddr string) ([]corev1.C
if err := tpl.Execute(&buf, struct {
MasterAddr string
InitContainerImage string
MaxTries int
}{
MasterAddr: masterAddr,
InitContainerImage: i.image,
MaxTries: i.maxTries,
}); err != nil {
return nil, err
}
Expand Down
1 change: 1 addition & 0 deletions pkg/controller.v1/pytorch/initcontainer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ func TestInitContainer(t *testing.T) {

Copy link
Member

@tenzen-y tenzen-y Feb 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may want to test the whole of initContainer. However, that is out of the scope of this PR.
So we can follow up with another PR.

config.Config.PyTorchInitContainerImage = config.PyTorchInitContainerImageDefault
config.Config.PyTorchInitContainerTemplateFile = config.PyTorchInitContainerTemplateFileDefault
config.Config.PyTorchInitContainerMaxTries = config.PyTorchInitContainerMaxTriesDefault

testCases := []struct {
job *kubeflowv1.PyTorchJob
Expand Down