Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add workload services #732

Merged
merged 21 commits into from
Nov 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e41d006
initial commit for opnicluster
AmartC Oct 26, 2022
f0fb059
Add workload and pretrained DRAIN services as well as training contro…
AmartC Oct 27, 2022
7bd4364
Update code in ai apis directory
AmartC Oct 28, 2022
6c472cc
Update services spec to include training controller
AmartC Nov 1, 2022
48216cb
Update services.go file to have custom deployment for workload DRAIN …
AmartC Nov 1, 2022
295c290
Add workload drain service to Opni AI services configuration
AmartC Nov 2, 2022
7ba6fc8
Remove setting of NODE_TLS_REJECT_UNAUTHORIZED environment variable w…
AmartC Nov 2, 2022
e02d07c
Update suite tests to include training controller service
AmartC Nov 3, 2022
3b13fd6
Update opnicluster.go and util.go to support training controller module
AmartC Nov 3, 2022
f36d1dd
Update crds for training controller service
AmartC Nov 14, 2022
fa0b649
Minify crd yaml in mage crdgen
kralicky Nov 17, 2022
12903c4
Update crds created
AmartC Nov 18, 2022
a0f103c
Prepend document separators to minified crd yaml files
kralicky Nov 18, 2022
505c161
Update crds
AmartC Nov 18, 2022
4bec8f3
Update training controller service spec bug
AmartC Nov 21, 2022
56b73f7
Allow gpu controller runtimeclass to be nil
dbason Nov 21, 2022
4fbef8a
Update GPU controller to only deploy inferencing service with GPU con…
AmartC Nov 21, 2022
9c53f74
Update GPU controller tests
AmartC Nov 22, 2022
0636512
Fix up test case
AmartC Nov 22, 2022
8660527
Update workload services to be disabled by default
AmartC Nov 24, 2022
20d11f5
Update services.go to address nil pointer bug and also do not enable …
AmartC Nov 28, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions apis/ai/v1beta1/opnicluster_meta.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ const (
PayloadReceiverService
GPUControllerService
MetricsService
TrainingControllerService
OpensearchUpdateService
)

Expand Down Expand Up @@ -42,6 +43,8 @@ func (s ServiceKind) String() string {
return "metrics"
case OpensearchUpdateService:
return "opensearch-update"
case TrainingControllerService:
return "training-controller"
default:
return ""
}
Expand Down Expand Up @@ -76,6 +79,8 @@ func (s ServiceKind) GetImageSpec(opniCluster *OpniCluster) *opnimeta.ImageSpec
return &opniCluster.Spec.Services.Metrics.ImageSpec
case OpensearchUpdateService:
return &opniCluster.Spec.Services.OpensearchUpdate.ImageSpec
case TrainingControllerService:
return &opniCluster.Spec.Services.TrainingController.ImageSpec
default:
return nil
}
Expand All @@ -97,6 +102,8 @@ func (s ServiceKind) GetNodeSelector(opniCluster *OpniCluster) map[string]string
return opniCluster.Spec.Services.Metrics.NodeSelector
case OpensearchUpdateService:
return opniCluster.Spec.Services.OpensearchUpdate.NodeSelector
case TrainingControllerService:
return opniCluster.Spec.Services.TrainingController.NodeSelector
default:
return map[string]string{}
}
Expand All @@ -118,6 +125,8 @@ func (s ServiceKind) GetTolerations(opniCluster *OpniCluster) []corev1.Toleratio
return opniCluster.Spec.Services.Metrics.Tolerations
case OpensearchUpdateService:
return opniCluster.Spec.Services.OpensearchUpdate.Tolerations
case TrainingControllerService:
return opniCluster.Spec.Services.TrainingController.Tolerations
default:
return []corev1.Toleration{}
}
Expand Down
36 changes: 25 additions & 11 deletions apis/ai/v1beta1/opnicluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,21 +87,28 @@ type OpniCluster struct {
}

// ServicesSpec groups the per-service configuration blocks of an OpniCluster.
// Each field holds the spec for one service and is serialized under its own
// optional JSON key.
//
// NOTE(review): the field list appears twice in this view because the old and
// new sides of the diff are interleaved; the second list (the one that ends
// with TrainingController) looks like the post-change version — confirm
// against the real file.
type ServicesSpec struct {
Drain DrainServiceSpec `json:"drain,omitempty"`
Inference InferenceServiceSpec `json:"inference,omitempty"`
Preprocessing PreprocessingServiceSpec `json:"preprocessing,omitempty"`
PayloadReceiver PayloadReceiverServiceSpec `json:"payloadReceiver,omitempty"`
GPUController GPUControllerServiceSpec `json:"gpuController,omitempty"`
Metrics MetricsServiceSpec `json:"metrics,omitempty"`
OpensearchUpdate OpensearchUpdateServiceSpec `json:"opensearchUpdate,omitempty"`
Drain DrainServiceSpec `json:"drain,omitempty"`
Inference InferenceServiceSpec `json:"inference,omitempty"`
Preprocessing PreprocessingServiceSpec `json:"preprocessing,omitempty"`
PayloadReceiver PayloadReceiverServiceSpec `json:"payloadReceiver,omitempty"`
GPUController GPUControllerServiceSpec `json:"gpuController,omitempty"`
Metrics MetricsServiceSpec `json:"metrics,omitempty"`
OpensearchUpdate OpensearchUpdateServiceSpec `json:"opensearchUpdate,omitempty"`
// TrainingController configures the training controller service.
TrainingController TrainingControllerServiceSpec `json:"trainingController,omitempty"`
}

// DrainServiceSpec configures the drain service: its image (inlined from
// opnimeta.ImageSpec), an optional enable flag, pod scheduling constraints,
// an optional replica count, and the nested workload drain settings.
//
// NOTE(review): Enabled, NodeSelector, Tolerations and Replicas appear twice
// in this view because the old and new sides of the diff are interleaved;
// only one copy of each exists in the real struct — confirm against the file.
type DrainServiceSpec struct {
opnimeta.ImageSpec `json:",inline,omitempty"`
Enabled *bool `json:"enabled,omitempty"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
Replicas *int32 `json:"replicas,omitempty"`
Enabled *bool `json:"enabled,omitempty"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
Replicas *int32 `json:"replicas,omitempty"`
// Workload holds the settings for the workload drain service.
Workload WorkloadDrainServiceSpec `json:"workload,omitempty"`
}

// WorkloadDrainServiceSpec configures the workload drain service. It is
// embedded in DrainServiceSpec under the "workload" JSON key.
type WorkloadDrainServiceSpec struct {
// Enabled is a pointer so that an unset value (nil) can be told apart from
// an explicit false.
// NOTE(review): the default applied when nil is not visible here — confirm
// in the reconciler.
Enabled *bool `json:"enabled,omitempty"`
// Replicas is the requested replica count; nil means it was not specified.
Replicas *int32 `json:"replicas,omitempty"`
}

type InferenceServiceSpec struct {
Expand Down Expand Up @@ -135,6 +142,13 @@ type GPUControllerServiceSpec struct {
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
}

// TrainingControllerServiceSpec configures the training controller service.
// ServiceKind's GetImageSpec, GetNodeSelector and GetTolerations read the
// corresponding fields of this struct for TrainingControllerService.
type TrainingControllerServiceSpec struct {
// ImageSpec is inlined, so its fields appear directly in this object's JSON.
opnimeta.ImageSpec `json:",inline,omitempty"`
// Enabled is a pointer so that an unset value (nil) can be told apart from
// an explicit false.
Enabled *bool `json:"enabled,omitempty"`
// NodeSelector is returned as-is by GetNodeSelector.
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
// Tolerations is returned as-is by GetTolerations.
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
}

type MetricsServiceSpec struct {
opnimeta.ImageSpec `json:",inline,omitempty"`
Enabled *bool `json:"enabled,omitempty"`
Expand Down
62 changes: 62 additions & 0 deletions apis/ai/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions apis/v1beta2/opnicluster_meta.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ const (
GPUControllerService
MetricsService
OpensearchUpdateService
TrainingControllerService
)

type OpensearchRole string
Expand Down Expand Up @@ -42,6 +43,8 @@ func (s ServiceKind) String() string {
return "metrics"
case OpensearchUpdateService:
return "opensearch-update"
case TrainingControllerService:
return "training-controller"
default:
return ""
}
Expand Down Expand Up @@ -76,6 +79,8 @@ func (s ServiceKind) GetImageSpec(spec OpniClusterSpec) *opnimeta.ImageSpec {
return &spec.Services.Metrics.ImageSpec
case OpensearchUpdateService:
return &spec.Services.OpensearchUpdate.ImageSpec
case TrainingControllerService:
return &spec.Services.TrainingController.ImageSpec
default:
return nil
}
Expand All @@ -97,6 +102,8 @@ func (s ServiceKind) GetNodeSelector(spec OpniClusterSpec) map[string]string {
return spec.Services.Metrics.NodeSelector
case OpensearchUpdateService:
return spec.Services.OpensearchUpdate.NodeSelector
case TrainingControllerService:
return spec.Services.TrainingController.NodeSelector
default:
return map[string]string{}
}
Expand All @@ -118,6 +125,8 @@ func (s ServiceKind) GetTolerations(spec OpniClusterSpec) []corev1.Toleration {
return spec.Services.Metrics.Tolerations
case OpensearchUpdateService:
return spec.Services.OpensearchUpdate.Tolerations
case TrainingControllerService:
return spec.Services.TrainingController.Tolerations
default:
return []corev1.Toleration{}
}
Expand Down
22 changes: 15 additions & 7 deletions apis/v1beta2/opnicluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,14 @@ type OpniCluster struct {
}

// ServicesSpec groups the per-service configuration blocks of an OpniCluster.
// Each field holds the spec for one service and is serialized under its own
// optional JSON key.
//
// NOTE(review): the field list appears twice in this view because the old and
// new sides of the diff are interleaved; the second list (the one that ends
// with TrainingController) looks like the post-change version — confirm
// against the real file.
type ServicesSpec struct {
Drain DrainServiceSpec `json:"drain,omitempty"`
Inference InferenceServiceSpec `json:"inference,omitempty"`
Preprocessing PreprocessingServiceSpec `json:"preprocessing,omitempty"`
PayloadReceiver PayloadReceiverServiceSpec `json:"payloadReceiver,omitempty"`
GPUController GPUControllerServiceSpec `json:"gpuController,omitempty"`
Metrics MetricsServiceSpec `json:"metrics,omitempty"`
OpensearchUpdate OpensearchUpdateServiceSpec `json:"opensearchUpdate,omitempty"`
Drain DrainServiceSpec `json:"drain,omitempty"`
Inference InferenceServiceSpec `json:"inference,omitempty"`
Preprocessing PreprocessingServiceSpec `json:"preprocessing,omitempty"`
PayloadReceiver PayloadReceiverServiceSpec `json:"payloadReceiver,omitempty"`
GPUController GPUControllerServiceSpec `json:"gpuController,omitempty"`
Metrics MetricsServiceSpec `json:"metrics,omitempty"`
OpensearchUpdate OpensearchUpdateServiceSpec `json:"opensearchUpdate,omitempty"`
// TrainingController configures the training controller service.
TrainingController TrainingControllerServiceSpec `json:"trainingController,omitempty"`
}

type DrainServiceSpec struct {
Expand Down Expand Up @@ -140,6 +141,13 @@ type PayloadReceiverServiceSpec struct {
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
}

// TrainingControllerServiceSpec configures the training controller service.
// ServiceKind's GetImageSpec, GetNodeSelector and GetTolerations read the
// corresponding fields of this struct for TrainingControllerService.
type TrainingControllerServiceSpec struct {
// ImageSpec is inlined, so its fields appear directly in this object's JSON.
opnimeta.ImageSpec `json:",inline,omitempty"`
// Enabled is a pointer so that an unset value (nil) can be told apart from
// an explicit false.
Enabled *bool `json:"enabled,omitempty"`
// NodeSelector is returned as-is by GetNodeSelector.
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
// Tolerations is returned as-is by GetTolerations.
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
}

type GPUControllerServiceSpec struct {
opnimeta.ImageSpec `json:",inline,omitempty"`
Enabled *bool `json:"enabled,omitempty"`
Expand Down
36 changes: 36 additions & 0 deletions apis/v1beta2/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 44 additions & 0 deletions config/crd/bases/ai.opni.io_opniclusters.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading