-
Notifications
You must be signed in to change notification settings - Fork 705
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
KEP-2170: Implement JobSet, PlainML, and Torch Plugins (#2308)
* KEP-2170: Implement JobSet and PlainML Plugins
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Fix nil pointer exception for Trainer
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Fix unit tests in runtime package
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Fix unit tests
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Fix integration tests
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Fix lint
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Implement Torch Plugin
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Use list for the Info envs
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Fix golang ci
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Fix Torch plugin
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Use K8s sets
  Update error return
  Use ptr.Deref() for nil values
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Use client.Object for Build() call
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Remove DeepCopy
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Remove MLPolicy and PodGroupPolicy from the Info object
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Inline error
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Remove SDK jar file
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Add integration test for Torch plugin
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Add TODO to calculate PodGroup values in unit tests
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Revert the change to add original Runtime Policies to Info
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Create const for the DefaultJobReplicas
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
* Check if PodLabels is empty
  Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---------
Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
- Loading branch information
1 parent
3f7ec16
commit 7c5ea70
Showing
21 changed files
with
1,036 additions
and
574 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
package constants | ||
|
||
import ( | ||
"fmt" | ||
|
||
batchv1 "k8s.io/api/batch/v1" | ||
) | ||
|
||
const ( | ||
|
||
// DefaultJobReplicas is the default value for the ReplicatedJob replicas. | ||
// It is an untyped constant, unlike the explicitly typed constants that follow. | ||
DefaultJobReplicas = 1 | ||
|
||
// JobSetKind is the Kind name for the JobSet. | ||
JobSetKind string = "JobSet" | ||
|
||
// JobTrainerNode is the Job name for the trainer node. | ||
JobTrainerNode string = "trainer-node" | ||
|
||
// ContainerTrainer is the container name for the trainer. | ||
ContainerTrainer string = "trainer" | ||
|
||
// ContainerTrainerPort is the default port for communication between the trainer nodes. | ||
ContainerTrainerPort int32 = 29500 | ||
|
||
// JobInitializer is the Job name for the initializer. | ||
JobInitializer string = "initializer" | ||
|
||
// ContainerModelInitializer is the container name for the model initializer. | ||
ContainerModelInitializer string = "model-initializer" | ||
|
||
// ContainerDatasetInitializer is the container name for the dataset initializer. | ||
ContainerDatasetInitializer string = "dataset-initializer" | ||
|
||
// PodGroupKind is the Kind name for the PodGroup. | ||
PodGroupKind string = "PodGroup" | ||
|
||
// TorchEnvNumNodes is the env name for the number of training nodes. | ||
// It is the first of the distributed envs for torchrun declared in this block. | ||
// Ref: https://github.com/pytorch/pytorch/blob/3a0d0885171376ed610c8175a19ba40411fc6f3f/torch/distributed/argparse_util.py#L45 | ||
TorchEnvNumNodes string = "PET_NNODES" | ||
|
||
// TorchEnvNumProcPerNode is the env name for the number of procs per node (e.g. number of GPUs per Pod). | ||
TorchEnvNumProcPerNode string = "PET_NPROC_PER_NODE" | ||
|
||
// TorchEnvNodeRank is the env name for the node rank. | ||
TorchEnvNodeRank string = "PET_NODE_RANK" | ||
|
||
// TorchEnvMasterAddr is the env name for the master node address. | ||
TorchEnvMasterAddr string = "PET_MASTER_ADDR" | ||
|
||
// TorchEnvMasterPort is the env name for the master node port. | ||
TorchEnvMasterPort string = "PET_MASTER_PORT" | ||
) | ||
|
||
var ( | ||
// JobCompletionIndexFieldPath is the field path for the Job completion index annotation. | ||
// It is built by substituting batchv1.JobCompletionIndexAnnotation into "metadata.annotations['%s']". | ||
// It is a var rather than a const because its value comes from a fmt.Sprintf call. | ||
JobCompletionIndexFieldPath string = fmt.Sprintf("metadata.annotations['%s']", batchv1.JobCompletionIndexAnnotation) | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.