Skip to content

Commit

Permalink
⚠️ Breaking Changes: Migrate model subcommand to model analyze (#1060)
Browse files Browse the repository at this point in the history
Signed-off-by: Yi Chen <github@chenyicn.net>
  • Loading branch information
ChenYi015 authored Mar 27, 2024
1 parent 5ac396c commit 12f205e
Show file tree
Hide file tree
Showing 35 changed files with 242 additions and 224 deletions.
10 changes: 5 additions & 5 deletions charts/modeljob/templates/job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,12 @@ spec:
{{- if gt (int $gpuCount) 0}}
nvidia.com/gpu: {{ .Values.gpuCount }}
{{- end }}
{{ - if gt (int $gpuMemory) 0 }}
aliyun.com/gpu-mem: { { .Values.gpuMemory }}
{{ - end } }
{{ - if gt (int $gpuCore) 0 }}
{{- if gt (int $gpuMemory) 0 }}
aliyun.com/gpu-mem: {{ .Values.gpuMemory }}
{{- end }}
{{- if gt (int $gpuCore) 0 }}
aliyun.com/gpu-core.percentage: {{ .Values.gpuCore }}
{{ - end }}
{{- end }}
volumeMounts:
{{- if .Values.dataset }}
{{- range $pvcName, $mntPath := .Values.dataset}}
Expand Down
4 changes: 2 additions & 2 deletions charts/pytorchjob/templates/ingress.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ metadata:
createdBy: "PyTorchJob"
controller-name: pytorch-operator
group-name: kubeflow.org
job-name: { { .Release.Name } }
pytorch-job-name: { { .Release.Name } }
job-name: {{ .Release.Name }}
pytorch-job-name: {{ .Release.Name }}
spec:
rules:
- http:
Expand Down
8 changes: 4 additions & 4 deletions charts/seldon-core/templates/seldondeployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ spec:
{{- if gt (int $gpuCount) 0 }}
nvidia.com/gpu: {{ .Values.gpuCount }}
{{- end }}
{{ - if gt (int $gpuMemory) 0 }}
{{- if gt (int $gpuMemory) 0 }}
aliyun.com/gpu-mem: {{ .Values.gpuMemory }}
{{ - end } }
{{ - if gt (int $gpuCore) 0 }}
{{- end }}
{{- if gt (int $gpuCore) 0 }}
aliyun.com/gpu-core.percentage: {{ .Values.gpuCore }}
{{ - end }}
{{- end }}
graph:
implementation: {{ .Values.implementation }}
modelUri: {{ .Values.modelUri }}
Expand Down
2 changes: 1 addition & 1 deletion charts/tfjob/templates/ingress.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ metadata:
role: tensorboard
createdBy: "TFJob"
group-name: kubeflow.org
tf-job-name: { { .Release.Name } }
tf-job-name: {{ .Release.Name }}
spec:
rules:
- http:
Expand Down
4 changes: 2 additions & 2 deletions charts/trtserving/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ spec:
{{- if gt (int $gpuMemory) 0}}
aliyun.com/gpu-mem: {{ .Values.gpuMemory }}
{{- end }}
{{ - if gt (int $gpuCore) 0 }}
{{- if gt (int $gpuCore) 0 }}
aliyun.com/gpu-core.percentage: {{ .Values.gpuCore }}
{{ - end }}
{{- end }}
volumeMounts:
{{- if .Values.shareMemory }}
- mountPath: /dev/shm
Expand Down
12 changes: 4 additions & 8 deletions docs/model/benchmark/benchmark_torchscript.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ Then give a profile configuration file named config.json like below.
3\. Submit a model benchmark job.

```shell
$ arena model benchmark \
$ arena model analyze benchmark \
--name=resnet18-benchmark \
--namespace=default \
--image=registry.cn-beijing.aliyuncs.com/kube-ai/easy-inference:1.0.0 \
Expand All @@ -80,13 +80,13 @@ $ arena model benchmark \

job.batch/resnet18-benchmark created
INFO[0000] The model benchmark job resnet18-benchmark has been submitted successfully
INFO[0000] You can run `arena model get resnet18-benchmark` to check the job status
INFO[0000] You can run `arena model analyze get resnet18-benchmark` to check the job status
```

4\. List all the model benchmark jobs.

```shell
$ arena model list
$ arena model analyze list

NAMESPACE NAME STATUS TYPE DURATION AGE GPU(Requested)
default resnet18-benchmark RUNNING Benchmark 23s 23s 1
Expand All @@ -95,7 +95,7 @@ default resnet18-benchmark RUNNING Benchmark 23s 23s 1
5\. Get model benchmark job detail info.

```shell
$ arena model get resnet18-benchmark
$ arena model analyze get resnet18-benchmark
Name: resnet18-benchmark
Namespace: default
Type: Benchmark
Expand Down Expand Up @@ -125,7 +125,3 @@ Benchmark finished, cost 60.00157570838928 s
Benchmark result:
{"p90_latency": 3.806, "p95_latency": 3.924, "p99_latency": 4.781, "min_latency": 3.665, "max_latency": 1555.418, "mean_latency": 3.88, "median_latency": 3.731, "throughput": 257, "gpu_mem_used": 1.47, "gpu_utilization": 38.39514839785918}
```




5 changes: 1 addition & 4 deletions docs/model/index.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Model Analyze Guide

Welcome to the Arena Model Guide! This guide covers how to use the ``arena cli`` to profile the model to find performance bottleneck, and how to use tensorrt to optimize the inference performance, you can also benchmark the model to get inference metrics like qps, latency, gpu usage and so on. This page outlines the most common situations and questions that bring readers to this section.

Welcome to the Arena Model Analyze Guide! This guide covers how to use the `arena cli` to profile a model to find performance bottlenecks, how to use TensorRT to optimize inference performance, and how to benchmark a model to get inference metrics such as QPS, latency, and GPU usage. This page outlines the most common situations and questions that bring readers to this section.

## Who should use this guide?

Expand All @@ -15,8 +14,6 @@ After training you may get some models. If you want to know the model performanc

* I want to [optimize the torchscript module with tensorrt](optimize/optimize_torchscript.md).


## Benchmark the model inference

* I want to [benchmark the torchscript inference performance](benchmark/benchmark_torchscript.md).

10 changes: 5 additions & 5 deletions docs/model/optimize/optimize_torchscript.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ Then give a profile configuration file named config.json like below.
3\. Submit a model optimize job.

```shell
$ arena model optimize \
$ arena model analyze optimize \
--name=resnet18-optimize \
--namespace=default \
--image=registry.cn-beijing.aliyuncs.com/kube-ai/easy-inference:1.0.0 \
Expand All @@ -78,13 +78,13 @@ $ arena model optimize \

job.batch/resnet18-optimize created
INFO[0002] The model optimize job resnet18-optimize has been submitted successfully
INFO[0002] You can run `arena model get resnet18-optimize` to check the job status
INFO[0002] You can run `arena model analyze get resnet18-optimize` to check the job status
```

4\. List all the model optimize jobs.

```shell
$ arena model list
$ arena model analyze list

NAMESPACE NAME STATUS TYPE DURATION AGE GPU(Requested)
default-group resnet18-optimize RUNNING Optimize 0s 1m 1
Expand All @@ -93,7 +93,7 @@ default-group resnet18-optimize RUNNING Optimize 0s 1m 1
5\. Get model optimize job detail info.

```shell
$ arena model get resnet18-profile
$ arena model analyze get resnet18-profile
Name: resnet18-optimize
Namespace: default-group
Type: Optimize
Expand All @@ -111,4 +111,4 @@ Instances:
resnet18-optimize-xrd6w ContainerCreating 1m 0/1 0 cn-shenzhen.192.168.1.209
```

6\. After the optimize job has finished, you can see a new TorchScript module named opt_resnet18.pt in --export-path.
6\. After the optimize job has finished, you can see a new TorchScript module named opt_resnet18.pt in --export-path.
12 changes: 5 additions & 7 deletions docs/model/profile/profile_torchscript.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ Then give a profile configuration file named config.json like below.
3\. Submit a model profile job.

```shell
$ arena model profile \
$ arena model analyze profile \
--name=resnet18-profile \
--namespace=default \
--image=registry.cn-beijing.aliyuncs.com/kube-ai/easy-inference:1.0.0 \
Expand All @@ -82,13 +82,13 @@ service/resnet18-profile-tensorboard created
deployment.apps/resnet18-profile-tensorboard created
job.batch/resnet18-profile created
INFO[0001] The model profile job resnet18-profile has been submitted successfully
INFO[0001] You can run `arena model get resnet18-profile` to check the job status
INFO[0001] You can run `arena model analyze get resnet18-profile` to check the job status
```

4\. List all the profile jobs.

```shell
$ arena model list
$ arena model analyze list

NAMESPACE NAME STATUS TYPE DURATION AGE GPU(Requested)
default resnet18-profile RUNNING Profile 34s 34s 1
Expand All @@ -97,7 +97,7 @@ default resnet18-profile RUNNING Profile 34s 34s 1
5\. Get model profile job detail info.

```shell
$ arena model get resnet18-profile
$ arena model analyze get resnet18-profile
Name: resnet18-profile
Namespace: default
Type: Profile
Expand Down Expand Up @@ -126,6 +126,4 @@ Forwarding from 127.0.0.1:6006 -> 6006
Forwarding from [::1]:6006 -> 6006
```



![tensorboard](./1-torchscript-profile-result.jpg)
![tensorboard](./1-torchscript-profile-result.jpg)
102 changes: 102 additions & 0 deletions pkg/apis/arenaclient/analyze_client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package arenaclient

import (
	"fmt"

	"github.com/kubeflow/arena/pkg/apis/config"
	apisanalyze "github.com/kubeflow/arena/pkg/apis/model/analyze"
	"github.com/kubeflow/arena/pkg/apis/types"
	"github.com/kubeflow/arena/pkg/apis/utils"
	"github.com/kubeflow/arena/pkg/model/analyze"
)

// AnalyzeClient manages model analyze jobs (profile, optimize, benchmark,
// evaluate): submitting, getting, listing, printing, and deleting them.
type AnalyzeClient struct {
	// namespace is the default namespace used for job operations.
	namespace string
	// configer provides access to the arena configuration.
	configer *config.ArenaConfiger
}

// NewAnalyzeClient builds an AnalyzeClient that operates on the given
// namespace using the supplied arena configuration.
func NewAnalyzeClient(namespace string, configer *config.ArenaConfiger) *AnalyzeClient {
	client := new(AnalyzeClient)
	client.namespace = namespace
	client.configer = configer
	return client
}

// Namespace returns a copy of the client that targets the given namespace;
// the receiver's own default namespace is left unchanged.
func (m *AnalyzeClient) Namespace(namespace string) *AnalyzeClient {
	return &AnalyzeClient{
		namespace: namespace,
		configer:  m.configer,
	}
}

// Submit dispatches the given model analyze job to the submit handler that
// matches its type. It returns an error for job types it does not know how
// to submit.
func (m *AnalyzeClient) Submit(job *apisanalyze.Job) error {
	switch job.Type() {
	case types.ModelProfileJob:
		args := job.Args().(*types.ModelProfileArgs)
		return analyze.SubmitModelProfileJob(args.Namespace, args)
	case types.ModelOptimizeJob:
		args := job.Args().(*types.ModelOptimizeArgs)
		return analyze.SubmitModelOptimizeJob(args.Namespace, args)
	case types.ModelBenchmarkJob:
		args := job.Args().(*types.ModelBenchmarkArgs)
		return analyze.SubmitModelBenchmarkJob(args.Namespace, args)
	case types.ModelEvaluateJob:
		args := job.Args().(*types.ModelEvaluateArgs)
		return analyze.SubmitModelEvaluateJob(args.Namespace, args)
	default:
		// Previously an unrecognized job type fell through and returned nil,
		// silently reporting success; surface it so callers notice.
		return fmt.Errorf("unknown model job type: %v", job.Type())
	}
}

// Get searches the client's namespace for a model job with the given type
// and name and returns its converted job info.
func (m *AnalyzeClient) Get(jobType types.ModelJobType, name string) (*types.ModelJobInfo, error) {
	job, err := analyze.SearchModelJob(m.namespace, name, jobType)
	if err != nil {
		return nil, err
	}
	info := job.Convert2JobInfo()
	return &info, nil
}

// GetAndPrint looks up a model job by type and name in the client's
// namespace and prints it in the requested output format.
func (m *AnalyzeClient) GetAndPrint(jobType types.ModelJobType, name string, format string) error {
	job, err := analyze.SearchModelJob(m.namespace, name, jobType)
	if err != nil {
		return err
	}
	outputFormat := utils.TransferPrintFormat(format)
	analyze.PrintModelJob(job, outputFormat)
	return nil
}

// List returns info for all model jobs of the given type, either in the
// client's namespace or, when allNamespaces is true, across all namespaces.
func (m *AnalyzeClient) List(allNamespaces bool, jobType types.ModelJobType) ([]*types.ModelJobInfo, error) {
	jobs, err := analyze.ListModelJobs(m.namespace, allNamespaces, jobType)
	if err != nil {
		return nil, err
	}
	// A nil slice is returned when there are no jobs.
	var infos []*types.ModelJobInfo
	for i := range jobs {
		info := jobs[i].Convert2JobInfo()
		infos = append(infos, &info)
	}
	return infos, nil
}

// ListAndPrint lists model jobs of the given type (optionally across all
// namespaces) and prints them in the requested output format.
func (m *AnalyzeClient) ListAndPrint(allNamespaces bool, jobType types.ModelJobType, format string) error {
	jobs, err := analyze.ListModelJobs(m.namespace, allNamespaces, jobType)
	if err != nil {
		return err
	}
	outputFormat := utils.TransferPrintFormat(format)
	analyze.PrintAllModelJobs(jobs, allNamespaces, outputFormat)
	return nil
}

// Delete removes the named model jobs of the given type from the client's
// namespace, stopping at the first deletion that fails (any remaining
// names are left untouched).
func (m *AnalyzeClient) Delete(jobType types.ModelJobType, jobNames ...string) error {
	for _, name := range jobNames {
		if err := analyze.DeleteModelJob(m.namespace, name, jobType); err != nil {
			return err
		}
	}
	return nil
}
4 changes: 2 additions & 2 deletions pkg/apis/arenaclient/arenaclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,6 @@ func (a *ArenaClient) Evaluate() *EvaluateClient {
return NewEvaluateClient(a.namespace, a.arenaConfiger)
}

func (a *ArenaClient) Model() *ModelClient {
return NewModelClient(a.namespace, a.arenaConfiger)
// Analyze returns a client for managing model analyze jobs in the arena
// client's default namespace.
func (a *ArenaClient) Analyze() *AnalyzeClient {
	return NewAnalyzeClient(a.namespace, a.arenaConfiger)
}
Loading

0 comments on commit 12f205e

Please sign in to comment.