Skip to content

Commit

Permalink
⚠️ Breaking Changes: Migrate model subcommand to model analyze (#1060)
Browse files Browse the repository at this point in the history
Signed-off-by: Yi Chen <github@chenyicn.net>
  • Loading branch information
ChenYi015 authored Mar 27, 2024
1 parent 5ac396c commit 12f205e
Show file tree
Hide file tree
Showing 35 changed files with 242 additions and 224 deletions.
10 changes: 5 additions & 5 deletions charts/modeljob/templates/job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,12 @@ spec:
{{- if gt (int $gpuCount) 0}}
nvidia.com/gpu: {{ .Values.gpuCount }}
{{- end }}
{{ - if gt (int $gpuMemory) 0 }}
aliyun.com/gpu-mem: { { .Values.gpuMemory }}
{{ - end } }
{{ - if gt (int $gpuCore) 0 }}
{{- if gt (int $gpuMemory) 0 }}
aliyun.com/gpu-mem: {{ .Values.gpuMemory }}
{{- end }}
{{- if gt (int $gpuCore) 0 }}
aliyun.com/gpu-core.percentage: {{ .Values.gpuCore }}
{{ - end }}
{{- end }}
volumeMounts:
{{- if .Values.dataset }}
{{- range $pvcName, $mntPath := .Values.dataset}}
Expand Down
4 changes: 2 additions & 2 deletions charts/pytorchjob/templates/ingress.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ metadata:
createdBy: "PyTorchJob"
controller-name: pytorch-operator
group-name: kubeflow.org
job-name: { { .Release.Name } }
pytorch-job-name: { { .Release.Name } }
job-name: {{ .Release.Name }}
pytorch-job-name: {{ .Release.Name }}
spec:
rules:
- http:
Expand Down
8 changes: 4 additions & 4 deletions charts/seldon-core/templates/seldondeployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ spec:
{{- if gt (int $gpuCount) 0 }}
nvidia.com/gpu: {{ .Values.gpuCount }}
{{- end }}
{{ - if gt (int $gpuMemory) 0 }}
{{- if gt (int $gpuMemory) 0 }}
aliyun.com/gpu-mem: {{ .Values.gpuMemory }}
{{ - end } }
{{ - if gt (int $gpuCore) 0 }}
{{- end }}
{{- if gt (int $gpuCore) 0 }}
aliyun.com/gpu-core.percentage: {{ .Values.gpuCore }}
{{ - end }}
{{- end }}
graph:
implementation: {{ .Values.implementation }}
modelUri: {{ .Values.modelUri }}
Expand Down
2 changes: 1 addition & 1 deletion charts/tfjob/templates/ingress.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ metadata:
role: tensorboard
createdBy: "TFJob"
group-name: kubeflow.org
tf-job-name: { { .Release.Name } }
tf-job-name: {{ .Release.Name }}
spec:
rules:
- http:
Expand Down
4 changes: 2 additions & 2 deletions charts/trtserving/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ spec:
{{- if gt (int $gpuMemory) 0}}
aliyun.com/gpu-mem: {{ .Values.gpuMemory }}
{{- end }}
{{ - if gt (int $gpuCore) 0 }}
{{- if gt (int $gpuCore) 0 }}
aliyun.com/gpu-core.percentage: {{ .Values.gpuCore }}
{{ - end }}
{{- end }}
volumeMounts:
{{- if .Values.shareMemory }}
- mountPath: /dev/shm
Expand Down
12 changes: 4 additions & 8 deletions docs/model/benchmark/benchmark_torchscript.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ Then give a profile configuration file named config.json like below.
3\. Submit a model benchmark job.

```shell
$ arena model benchmark \
$ arena model analyze benchmark \
--name=resnet18-benchmark \
--namespace=default \
--image=registry.cn-beijing.aliyuncs.com/kube-ai/easy-inference:1.0.0 \
Expand All @@ -80,13 +80,13 @@ $ arena model benchmark \

job.batch/resnet18-benchmark created
INFO[0000] The model benchmark job resnet18-benchmark has been submitted successfully
INFO[0000] You can run `arena model get resnet18-benchmark` to check the job status
INFO[0000] You can run `arena model analyze get resnet18-benchmark` to check the job status
```

4\. List all the model benchmark jobs.

```shell
$ arena model list
$ arena model analyze list

NAMESPACE NAME STATUS TYPE DURATION AGE GPU(Requested)
default resnet18-benchmark RUNNING Benchmark 23s 23s 1
Expand All @@ -95,7 +95,7 @@ default resnet18-benchmark RUNNING Benchmark 23s 23s 1
5\. Get model benchmark job detail info.

```shell
$ arena model get resnet18-benchmark
$ arena model analyze get resnet18-benchmark
Name: resnet18-benchmark
Namespace: default
Type: Benchmark
Expand Down Expand Up @@ -125,7 +125,3 @@ Benchmark finished, cost 60.00157570838928 s
Benchmark result:
{"p90_latency": 3.806, "p95_latency": 3.924, "p99_latency": 4.781, "min_latency": 3.665, "max_latency": 1555.418, "mean_latency": 3.88, "median_latency": 3.731, "throughput": 257, "gpu_mem_used": 1.47, "gpu_utilization": 38.39514839785918}
```




5 changes: 1 addition & 4 deletions docs/model/index.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Model Analyze Guide

Welcome to the Arena Model Guide! This guide covers how to use the ``arena cli`` to profile the model to find performance bottleneck, and how to use tensorrt to optimize the inference performance, you can also benchmark the model to get inference metrics like qps, latency, gpu usage and so on. This page outlines the most common situations and questions that bring readers to this section.

Welcome to the Arena Model Analyze Guide! This guide covers how to use the `arena cli` to profile a model to find performance bottlenecks, how to use TensorRT to optimize inference performance, and how to benchmark a model to get inference metrics such as QPS, latency, and GPU usage. This page outlines the most common situations and questions that bring readers to this section.

## Who should use this guide?

Expand All @@ -15,8 +14,6 @@ After training you may get some models. If you want to know the model performanc

* I want to [optimize the torchscript module with tensorrt](optimize/optimize_torchscript.md).


## Benchmark the model inference

* I want to [benchmark the torchscript inference performance](benchmark/benchmark_torchscript.md).

10 changes: 5 additions & 5 deletions docs/model/optimize/optimize_torchscript.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ Then give a profile configuration file named config.json like below.
3\. Submit a model optimize job.

```shell
$ arena model optimize \
$ arena model analyze optimize \
--name=resnet18-optimize \
--namespace=default \
--image=registry.cn-beijing.aliyuncs.com/kube-ai/easy-inference:1.0.0 \
Expand All @@ -78,13 +78,13 @@ $ arena model optimize \

job.batch/resnet18-optimize created
INFO[0002] The model optimize job resnet18-optimize has been submitted successfully
INFO[0002] You can run `arena model get resnet18-optimize` to check the job status
INFO[0002] You can run `arena model analyze get resnet18-optimize` to check the job status
```

4\. List all the model optimize jobs.

```shell
$ arena model list
$ arena model analyze list

NAMESPACE NAME STATUS TYPE DURATION AGE GPU(Requested)
default-group resnet18-optimize RUNNING Optimize 0s 1m 1
Expand All @@ -93,7 +93,7 @@ default-group resnet18-optimize RUNNING Optimize 0s 1m 1
5\. Get model optimize job detail info.

```shell
$ arena model get resnet18-profile
$ arena model analyze get resnet18-profile
Name: resnet18-optimize
Namespace: default-group
Type: Optimize
Expand All @@ -111,4 +111,4 @@ Instances:
resnet18-optimize-xrd6w ContainerCreating 1m 0/1 0 cn-shenzhen.192.168.1.209
```

6\. After the optimize job has finished, you can see a new TorchScript module named opt_resnet18.pt in --export-path.
6\. After the optimize job has finished, you can see a new TorchScript module named opt_resnet18.pt in --export-path.
12 changes: 5 additions & 7 deletions docs/model/profile/profile_torchscript.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ Then give a profile configuration file named config.json like below.
3\. Submit a model profile job.

```shell
$ arena model profile \
$ arena model analyze profile \
--name=resnet18-profile \
--namespace=default \
--image=registry.cn-beijing.aliyuncs.com/kube-ai/easy-inference:1.0.0 \
Expand All @@ -82,13 +82,13 @@ service/resnet18-profile-tensorboard created
deployment.apps/resnet18-profile-tensorboard created
job.batch/resnet18-profile created
INFO[0001] The model profile job resnet18-profile has been submitted successfully
INFO[0001] You can run `arena model get resnet18-profile` to check the job status
INFO[0001] You can run `arena model analyze get resnet18-profile` to check the job status
```

4\. List all the profile jobs.

```shell
$ arena model list
$ arena model analyze list

NAMESPACE NAME STATUS TYPE DURATION AGE GPU(Requested)
default resnet18-profile RUNNING Profile 34s 34s 1
Expand All @@ -97,7 +97,7 @@ default resnet18-profile RUNNING Profile 34s 34s 1
5\. Get model profile job detail info.

```shell
$ arena model get resnet18-profile
$ arena model analyze get resnet18-profile
Name: resnet18-profile
Namespace: default
Type: Profile
Expand Down Expand Up @@ -126,6 +126,4 @@ Forwarding from 127.0.0.1:6006 -> 6006
Forwarding from [::1]:6006 -> 6006
```



![tensorboard](./1-torchscript-profile-result.jpg)
![tensorboard](./1-torchscript-profile-result.jpg)
102 changes: 102 additions & 0 deletions pkg/apis/arenaclient/analyze_client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package arenaclient

import (
	"fmt"

	"github.com/kubeflow/arena/pkg/apis/config"
	apisanalyze "github.com/kubeflow/arena/pkg/apis/model/analyze"
	"github.com/kubeflow/arena/pkg/apis/types"
	"github.com/kubeflow/arena/pkg/apis/utils"
	"github.com/kubeflow/arena/pkg/model/analyze"
)

// AnalyzeClient manages model analyze jobs (profile, optimize, benchmark,
// evaluate): submitting, getting, listing, printing, and deleting them.
type AnalyzeClient struct {
	// namespace is the default namespace used for job operations.
	namespace string
	// configer provides access to the arena configuration.
	configer *config.ArenaConfiger
}

// NewAnalyzeClient builds an AnalyzeClient that operates on the given
// namespace using the supplied arena configuration.
func NewAnalyzeClient(namespace string, configer *config.ArenaConfiger) *AnalyzeClient {
	client := new(AnalyzeClient)
	client.namespace = namespace
	client.configer = configer
	return client
}

// Namespace returns a copy of the client that targets the given namespace;
// the receiver's own default namespace is left unchanged.
func (m *AnalyzeClient) Namespace(namespace string) *AnalyzeClient {
	return &AnalyzeClient{
		namespace: namespace,
		configer:  m.configer,
	}
}

// Submit dispatches the given model analyze job to the submit handler that
// matches its type. It returns an error for job types it does not know how
// to submit.
func (m *AnalyzeClient) Submit(job *apisanalyze.Job) error {
	switch job.Type() {
	case types.ModelProfileJob:
		args := job.Args().(*types.ModelProfileArgs)
		return analyze.SubmitModelProfileJob(args.Namespace, args)
	case types.ModelOptimizeJob:
		args := job.Args().(*types.ModelOptimizeArgs)
		return analyze.SubmitModelOptimizeJob(args.Namespace, args)
	case types.ModelBenchmarkJob:
		args := job.Args().(*types.ModelBenchmarkArgs)
		return analyze.SubmitModelBenchmarkJob(args.Namespace, args)
	case types.ModelEvaluateJob:
		args := job.Args().(*types.ModelEvaluateArgs)
		return analyze.SubmitModelEvaluateJob(args.Namespace, args)
	default:
		// Previously an unrecognized job type fell through and returned nil,
		// silently reporting success; surface it so callers notice.
		return fmt.Errorf("unknown model job type: %v", job.Type())
	}
}

// Get searches the client's namespace for a model job with the given type
// and name and returns its converted job info.
func (m *AnalyzeClient) Get(jobType types.ModelJobType, name string) (*types.ModelJobInfo, error) {
	job, err := analyze.SearchModelJob(m.namespace, name, jobType)
	if err != nil {
		return nil, err
	}
	info := job.Convert2JobInfo()
	return &info, nil
}

// GetAndPrint looks up a model job by type and name in the client's
// namespace and prints it in the requested output format.
func (m *AnalyzeClient) GetAndPrint(jobType types.ModelJobType, name string, format string) error {
	job, err := analyze.SearchModelJob(m.namespace, name, jobType)
	if err != nil {
		return err
	}
	outputFormat := utils.TransferPrintFormat(format)
	analyze.PrintModelJob(job, outputFormat)
	return nil
}

// List returns info for all model jobs of the given type, either in the
// client's namespace or, when allNamespaces is true, across all namespaces.
func (m *AnalyzeClient) List(allNamespaces bool, jobType types.ModelJobType) ([]*types.ModelJobInfo, error) {
	jobs, err := analyze.ListModelJobs(m.namespace, allNamespaces, jobType)
	if err != nil {
		return nil, err
	}
	// A nil slice is returned when there are no jobs.
	var infos []*types.ModelJobInfo
	for i := range jobs {
		info := jobs[i].Convert2JobInfo()
		infos = append(infos, &info)
	}
	return infos, nil
}

// ListAndPrint lists model jobs of the given type (optionally across all
// namespaces) and prints them in the requested output format.
func (m *AnalyzeClient) ListAndPrint(allNamespaces bool, jobType types.ModelJobType, format string) error {
	jobs, err := analyze.ListModelJobs(m.namespace, allNamespaces, jobType)
	if err != nil {
		return err
	}
	outputFormat := utils.TransferPrintFormat(format)
	analyze.PrintAllModelJobs(jobs, allNamespaces, outputFormat)
	return nil
}

// Delete removes the named model jobs of the given type from the client's
// namespace, stopping at the first deletion that fails (any remaining
// names are left untouched).
func (m *AnalyzeClient) Delete(jobType types.ModelJobType, jobNames ...string) error {
	for _, name := range jobNames {
		if err := analyze.DeleteModelJob(m.namespace, name, jobType); err != nil {
			return err
		}
	}
	return nil
}
4 changes: 2 additions & 2 deletions pkg/apis/arenaclient/arenaclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,6 @@ func (a *ArenaClient) Evaluate() *EvaluateClient {
return NewEvaluateClient(a.namespace, a.arenaConfiger)
}

func (a *ArenaClient) Model() *ModelClient {
return NewModelClient(a.namespace, a.arenaConfiger)
// Analyze returns a client for managing model analyze jobs in the arena
// client's default namespace.
func (a *ArenaClient) Analyze() *AnalyzeClient {
	return NewAnalyzeClient(a.namespace, a.arenaConfiger)
}
Loading

0 comments on commit 12f205e

Please sign in to comment.