Skip to content

Commit

Permalink
KEP-2170: Decouple JobSet from TrainJob (kubeflow#2296)
Browse files Browse the repository at this point in the history
Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
Signed-off-by: sailesh duddupudi <saileshradar@gmail.com>
  • Loading branch information
tenzen-y authored and saileshd1402 committed Dec 2, 2024
1 parent 13dcb6b commit b4c0d40
Show file tree
Hide file tree
Showing 6 changed files with 196 additions and 51 deletions.
24 changes: 14 additions & 10 deletions docs/proposals/2170-kubeflow-training-v2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -337,17 +337,16 @@ type TrainJobStatus struct {
// Conditions for the TrainJob.
Conditions []metav1.Condition `json:"conditions,omitempty"`

// ReplicatedJobsStatus tracks the number of Jobs for each replicatedJob in TrainJob.
ReplicatedJobsStatus []jobsetv1alpha2.ReplicatedJobStatus `json:"replicatedJobsStatus,omitempty"`
// JobsStatus tracks the child Jobs in TrainJob.
JobsStatus []JobStatus `json:"jobsStatus,omitempty"`
}

type ReplicatedJobStatus struct {
// Name of the ReplicatedJob.
type JobStatus struct {
// Name of the child Job.
Name string `json:"name"`

// Ready is the number of child Jobs where the number of ready pods and completed pods
// is greater than or equal to the total expected pod count for the Job (i.e., the minimum
// of job.spec.parallelism and job.spec.completions).
// is greater than or equal to the total expected pod count for the child Job.
Ready int32 `json:"ready"`

// Succeeded is the number of successfully completed child Jobs.
Expand Down Expand Up @@ -831,8 +830,8 @@ In the future, we can add more parameters if we find use-cases when it is requir

```golang
type PodSpecOverride struct {
// Names of the training job replicas in the training runtime template to apply the overrides.
TargetReplicatedJobs []string `json:"targetReplicatedJobs"`
// TargetJobs lists the training job replicas in the training runtime template to which the overrides apply.
TargetJobs []PodSpecOverrideTargetJob `json:"targetJobs"`

// Overrides for the containers in the desired job templates.
Containers []ContainerOverride `json:"containers,omitempty"`
Expand All @@ -853,6 +852,11 @@ type PodSpecOverride struct {
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
}

// PodSpecOverrideTargetJob identifies, by name, one training job that a
// PodSpecOverride targets. It is the element type of PodSpecOverride.TargetJobs.
type PodSpecOverrideTargetJob struct {
// Name is the target training job name for which the PodSpec is overridden.
// It is serialized under the "name" key without omitempty, so it is always emitted.
Name string `json:"name"`
}

// ContainerOverride represents parameters that can be overridden using PodSpecOverride.
// Parameters from the Trainer, DatasetConfig, and ModelConfig will take precedence.
type ContainerOverride struct {
Expand Down Expand Up @@ -895,8 +899,8 @@ spec:
trainer:
image: docker.io/custom-training
podSpecOverrides:
- targetReplicatedJobs:
- node
- targetJobs:
- name: node
containers:
- name: user-identity
value: 123
Expand Down
4 changes: 2 additions & 2 deletions hack/violation_exception_v2alpha1.list
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,OutputModel,Env
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,PodSpecOverride,Containers
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,PodSpecOverride,InitContainers
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,PodSpecOverride,TargetReplicatedJobs
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,PodSpecOverride,TargetJobs
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,PodSpecOverride,Tolerations
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,PodSpecOverride,Volumes
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,TorchElasticPolicy,Metrics
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,TrainJobSpec,PodSpecOverrides
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,TrainJobStatus,Conditions
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,TrainJobStatus,ReplicatedJobsStatus
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,TrainJobStatus,JobsStatus
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,Trainer,Args
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,Trainer,Command
API rule violation: list_type_missing,github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1,Trainer,Env
Expand Down
31 changes: 17 additions & 14 deletions manifests/v2/base/crds/kubeflow.org_trainjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ spec:
templates.
items:
description: |-
ContainerOverrides represents parameters that can be overridden using PodSpecOverrides.
ContainerOverride represents parameters that can be overridden using PodSpecOverrides.
Parameters from the Trainer, DatasetConfig, and ModelConfig will take precedence.
properties:
args:
Expand Down Expand Up @@ -740,7 +740,7 @@ spec:
job templates.
items:
description: |-
ContainerOverrides represents parameters that can be overridden using PodSpecOverrides.
ContainerOverride represents parameters that can be overridden using PodSpecOverrides.
Parameters from the Trainer, DatasetConfig, and ModelConfig will take precedence.
properties:
args:
Expand Down Expand Up @@ -976,11 +976,18 @@ spec:
serviceAccountName:
description: Override for the service account.
type: string
targetReplicatedJobs:
description: Names of the training job replicas in the training
targetJobs:
description: TargetJobs lists the training job replicas in the training
runtime template to which the overrides apply.
items:
type: string
properties:
name:
description: Name is the target training job name for
which the PodSpec is overridden.
type: string
required:
- name
type: object
type: array
tolerations:
description: Override for the Pod's tolerations.
Expand Down Expand Up @@ -2736,7 +2743,7 @@ spec:
type: object
type: array
required:
- targetReplicatedJobs
- targetJobs
type: object
type: array
runtimeRef:
Expand Down Expand Up @@ -3045,12 +3052,9 @@ spec:
- type
type: object
type: array
replicatedJobsStatus:
description: ReplicatedJobsStatus tracks the number of Jobs for each
replicatedJob in TrainJob.
jobsStatus:
description: JobsStatus tracks the child Jobs in TrainJob.
items:
description: ReplicatedJobStatus defines the observed ReplicatedJobs
Readiness.
properties:
active:
description: |-
Expand All @@ -3063,13 +3067,12 @@ spec:
format: int32
type: integer
name:
description: Name of the ReplicatedJob.
description: Name of the child Job.
type: string
ready:
description: |-
Ready is the number of child Jobs where the number of ready pods and completed pods
is greater than or equal to the total expected pod count for the Job (i.e., the minimum
of job.spec.parallelism and job.spec.completions).
is greater than or equal to the total expected pod count for the child Job.
format: int32
type: integer
succeeded:
Expand Down
107 changes: 95 additions & 12 deletions pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

38 changes: 32 additions & 6 deletions pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package v2alpha1
import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
)

const (
Expand Down Expand Up @@ -210,8 +209,8 @@ type OutputModel struct {

// PodSpecOverride represents the custom overrides that will be applied for the TrainJob's resources.
type PodSpecOverride struct {
// Names of the training job replicas in the training runtime template to apply the overrides.
TargetReplicatedJobs []string `json:"targetReplicatedJobs"`
// TargetJobs lists the training job replicas in the training runtime template to which the overrides apply.
TargetJobs []PodSpecOverrideTargetJob `json:"targetJobs"`

// Overrides for the containers in the desired job templates.
Containers []ContainerOverride `json:"containers,omitempty"`
Expand All @@ -232,7 +231,12 @@ type PodSpecOverride struct {
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
}

// ContainerOverrides represents parameters that can be overridden using PodSpecOverrides.
// PodSpecOverrideTargetJob identifies, by name, one training job that a
// PodSpecOverride targets. It is the element type of PodSpecOverride.TargetJobs.
type PodSpecOverrideTargetJob struct {
// Name is the target training job name for which the PodSpec is overridden.
// It is serialized under the "name" key without omitempty, so it is always emitted.
Name string `json:"name"`
}

// ContainerOverride represents parameters that can be overridden using PodSpecOverrides.
// Parameters from the Trainer, DatasetConfig, and ModelConfig will take precedence.
type ContainerOverride struct {
// Name for the container. TrainingRuntime must have this container.
Expand Down Expand Up @@ -261,8 +265,30 @@ type TrainJobStatus struct {
// Conditions for the TrainJob.
Conditions []metav1.Condition `json:"conditions,omitempty"`

// ReplicatedJobsStatus tracks the number of Jobs for each replicatedJob in TrainJob.
ReplicatedJobsStatus []jobsetv1alpha2.ReplicatedJobStatus `json:"replicatedJobsStatus,omitempty"`
// JobsStatus tracks the child Jobs in TrainJob.
JobsStatus []JobStatus `json:"jobsStatus,omitempty"`
}

// JobStatus holds the status counters that a TrainJob reports for its child
// Jobs. It is the element type of TrainJobStatus.JobsStatus and replaces the
// previously embedded jobsetv1alpha2.ReplicatedJobStatus, so that the TrainJob
// API no longer depends on the JobSet API types.
//
// NOTE(review): Name identifies a single child Job while every counter is
// described as a "number of child Jobs" — presumably one entry aggregates the
// replicas created for the named Job; confirm against the status producer.
type JobStatus struct {
// Name of the child Job.
Name string `json:"name"`

// Ready is the number of child Jobs where the number of ready pods and completed pods
// is greater than or equal to the total expected pod count for the child Job.
Ready int32 `json:"ready"`

// Succeeded is the number of successfully completed child Jobs.
Succeeded int32 `json:"succeeded"`

// Failed is the number of failed child Jobs.
Failed int32 `json:"failed"`

// Active is the number of child Jobs with at least 1 pod in a running or pending state
// which are not marked for deletion.
Active int32 `json:"active"`

// Suspended is the number of child Jobs which are in a suspended state.
Suspended int32 `json:"suspended"`
}

func init() {
Expand Down
Loading

0 comments on commit b4c0d40

Please sign in to comment.