-
Notifications
You must be signed in to change notification settings - Fork 863
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into issue_1120
- Loading branch information
Showing
5 changed files
with
392 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
# Default values for k8s-prometheus-adapter. | ||
affinity: {} | ||
|
||
image: | ||
repository: k8s.gcr.io/prometheus-adapter/prometheus-adapter | ||
tag: v0.9.0 | ||
pullPolicy: IfNotPresent | ||
|
||
logLevel: 4 | ||
|
||
metricsRelistInterval: 1m | ||
|
||
listenPort: 6443 | ||
|
||
nodeSelector: {} | ||
|
||
priorityClassName: "" | ||
|
||
# Url to access prometheus | ||
prometheus: | ||
# Value is templated | ||
url: http://prometheus-server.default.svc.cluster.local | ||
port: 80 | ||
path: "" | ||
|
||
replicas: 1 | ||
|
||
# k8s 1.21 needs fsGroup to be set for non-root deployments | ||
# ref: https://github.com/kubernetes/kubernetes/issues/70679 | ||
podSecurityContext: | ||
fsGroup: 10001 | ||
|
||
rbac: | ||
# Specifies whether RBAC resources should be created | ||
create: true | ||
|
||
psp: | ||
# Specifies whether PSP resources should be created | ||
create: false | ||
|
||
serviceAccount: | ||
# Specifies whether a service account should be created | ||
create: true | ||
# The name of the service account to use. | ||
# If not set and create is true, a name is generated using the fullname template | ||
name: | ||
# ServiceAccount annotations. | ||
# Use case: AWS EKS IAM roles for service accounts | ||
# ref: https://docs.aws.amazon.com/eks/latest/userguide/specify-service-account-role.html | ||
annotations: {} | ||
|
||
# Custom DNS configuration to be added to prometheus-adapter pods | ||
dnsConfig: {} | ||
# nameservers: | ||
# - 1.2.3.4 | ||
# searches: | ||
# - ns1.svc.cluster-domain.example | ||
# - my.dns.search.suffix | ||
# options: | ||
# - name: ndots | ||
# value: "2" | ||
# - name: edns0 | ||
resources: {} | ||
# requests: | ||
# cpu: 100m | ||
# memory: 128Mi | ||
# limits: | ||
# cpu: 100m | ||
# memory: 128Mi | ||
|
||
rules: | ||
default: true | ||
custom: [] | ||
# - seriesQuery: '{__name__=~"^some_metric_count$"}' | ||
# resources: | ||
# template: <<.Resource>> | ||
# name: | ||
# matches: "" | ||
# as: "my_custom_metric" | ||
# metricsQuery: sum(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>) | ||
# Mounts a configMap with pre-generated rules for use. Overrides the | ||
# default, custom, external and resource entries | ||
existing: | ||
external: | ||
- seriesQuery: '{__name__=~"^ts_queue_latency_microseconds"}' | ||
resources: | ||
overrides: | ||
namespace: | ||
resource: namespace | ||
service: | ||
resource: service | ||
pod: | ||
resource: pod | ||
name: | ||
matches: "^(.*)_microseconds" | ||
as: "ts_queue_latency_microseconds" | ||
metricsQuery: ts_queue_latency_microseconds | ||
resource: {} | ||
# cpu: | ||
# containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>, container!=""}[3m])) by (<<.GroupBy>>) | ||
# nodeQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>, id='/'}[3m])) by (<<.GroupBy>>) | ||
# resources: | ||
# overrides: | ||
# node: | ||
# resource: node | ||
# namespace: | ||
# resource: namespace | ||
# pod: | ||
# resource: pod | ||
# containerLabel: container | ||
# memory: | ||
# containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>, container!=""}) by (<<.GroupBy>>) | ||
# nodeQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,id='/'}) by (<<.GroupBy>>) | ||
# resources: | ||
# overrides: | ||
# node: | ||
# resource: node | ||
# namespace: | ||
# resource: namespace | ||
# pod: | ||
# resource: pod | ||
# containerLabel: container | ||
# window: 3m | ||
|
||
service: | ||
annotations: {} | ||
port: 443 | ||
type: ClusterIP | ||
# clusterIP: 1.2.3.4 | ||
|
||
tls: | ||
enable: false | ||
ca: |- | ||
# Public CA file that signed the APIService | ||
key: |- | ||
# Private key of the APIService | ||
certificate: |- | ||
# Public key of the APIService | ||
# Any extra arguments | ||
extraArguments: [] | ||
# - --tls-private-key-file=/etc/tls/tls.key | ||
# - --tls-cert-file=/etc/tls/tls.crt | ||
|
||
# Any extra volumes | ||
extraVolumes: [] | ||
# - name: example-name | ||
# hostPath: | ||
# path: /path/on/host | ||
# type: DirectoryOrCreate | ||
# - name: ssl-certs | ||
# hostPath: | ||
# path: /etc/ssl/certs/ca-bundle.crt | ||
# type: File | ||
|
||
# Any extra volume mounts | ||
extraVolumeMounts: [] | ||
# - name: example-name | ||
# mountPath: /path/in/container | ||
# - name: ssl-certs | ||
# mountPath: /etc/ssl/certs/ca-certificates.crt | ||
# readOnly: true | ||
|
||
tolerations: [] | ||
|
||
# Labels added to the pod | ||
podLabels: {} | ||
|
||
# Annotations added to the pod | ||
podAnnotations: {} | ||
|
||
hostNetwork: | ||
# Specifies if prometheus-adapter should be started in hostNetwork mode. | ||
# | ||
# You need this enabled if you use alternate overlay networking for pods and the | ||
# API server is unable to communicate with metrics-server. As an example, this is required | ||
# if you use Weave network on EKS. See also dnsPolicy | ||
enabled: false | ||
|
||
# When hostNetwork is enabled, you probably want to set this to ClusterFirstWithHostNet | ||
# dnsPolicy: ClusterFirstWithHostNet | ||
|
||
# Deployment strategy type | ||
strategy: | ||
type: RollingUpdate | ||
rollingUpdate: | ||
maxUnavailable: 25% | ||
maxSurge: 25% | ||
|
||
podDisruptionBudget: | ||
# Specifies if PodDisruptionBudget should be enabled | ||
# When enabled, minAvailable or maxUnavailable should also be defined. | ||
enabled: false | ||
minAvailable: | ||
maxUnavailable: 1 | ||
|
||
certManager: | ||
enabled: false | ||
caCertDuration: 43800h | ||
certDuration: 8760h |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
# Autoscaler | ||
|
||
Set up a Kubernetes HPA (Horizontal Pod Autoscaler) for TorchServe, tuned for TorchServe metrics. This uses Prometheus as the metrics collector and Prometheus Adapter as the metrics server, serving TorchServe metrics to the HPA. | ||
|
||
## Steps | ||
|
||
### 1. Install Torchserve with metrics enabled for prometheus format | ||
|
||
[Install TorchServe using Helm Charts](README.md#deploy-torchserve-using-helm-charts) | ||
### 2. Install Prometheus | ||
|
||
```bash | ||
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts | ||
helm repo update | ||
helm install prometheus prometheus-community/prometheus | ||
``` | ||
|
||
The command above prints the Prometheus server URL: | ||
|
||
```bash | ||
NAME: prometheus | ||
LAST DEPLOYED: Wed Sep 8 19:10:49 2021 | ||
NAMESPACE: default | ||
STATUS: deployed | ||
REVISION: 1 | ||
TEST SUITE: None | ||
NOTES: | ||
The Prometheus server can be accessed via port 80 on the following DNS name from within your cluster: | ||
prometheus-server.default.svc.cluster.local | ||
... | ||
... | ||
``` | ||
|
||
### 3. Install Prometheus Adapter | ||
|
||
- Update the Prometheus URL and port in adapter.yaml. Use the URL given in the Prometheus installation output. | ||
|
||
```yaml | ||
# Url to access prometheus | ||
prometheus: | ||
# Value is templated | ||
url: http://prometheus-server.default.svc.cluster.local | ||
port: 80 | ||
path: "" | ||
``` | ||
- Update the external metrics rules in adapter.yaml. Here we are enabling external metrics in the Prometheus adapter and serving the `ts_queue_latency_microseconds` metric. | ||
|
||
```yaml | ||
external: | ||
- seriesQuery: '{__name__=~"^ts_queue_latency_microseconds"}' | ||
resources: | ||
overrides: | ||
namespace: | ||
resource: namespace | ||
service: | ||
resource: service | ||
pod: | ||
resource: pod | ||
name: | ||
matches: "^(.*)_microseconds" | ||
as: "ts_queue_latency_microseconds" | ||
metricsQuery: ts_queue_latency_microseconds | ||
``` | ||
|
||
Refer: [Prometheus Adapter External Metrics](https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/docs/externalmetrics.md) | ||
|
||
- Install Prometheus adapter | ||
|
||
```bash | ||
helm install -f adapter.yaml prometheus-adapter prometheus-community/prometheus-adapter | ||
``` | ||
|
||
The output of the above command is: | ||
|
||
``` | ||
NAME: adapter | ||
LAST DEPLOYED: Wed Sep 8 19:49:28 2021 | ||
NAMESPACE: default | ||
STATUS: deployed | ||
REVISION: 1 | ||
TEST SUITE: None | ||
NOTES: | ||
adapter-prometheus-adapter has been deployed. | ||
In a few minutes you should be able to list metrics using the following command(s): | ||
|
||
kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | ||
|
||
kubectl get --raw /apis/external.metrics.k8s.io/v1beta1 | ||
``` | ||
#### Check External metrics list | ||
```bash | ||
kubectl get --raw /apis/external.metrics.k8s.io/v1beta1 | jq | ||
``` | ||
|
||
```json | ||
{ | ||
"kind": "APIResourceList", | ||
"apiVersion": "v1", | ||
"groupVersion": "external.metrics.k8s.io/v1beta1", | ||
"resources": [ | ||
{ | ||
"name": "ts_queue_latency_microseconds", | ||
"singularName": "", | ||
"namespaced": true, | ||
"kind": "ExternalMetricValueList", | ||
"verbs": [ | ||
"get" | ||
] | ||
} | ||
] | ||
} | ||
``` | ||
|
||
### 4. Deploy Horizontal Pod Autoscaler for Torchserve | ||
|
||
- Save the following manifest as `hpa.yaml`, changing `targetValue` as per your requirements. | ||
|
||
```yaml | ||
kind: HorizontalPodAutoscaler | ||
apiVersion: autoscaling/v2beta1 | ||
metadata: | ||
name: torchserve | ||
spec: | ||
scaleTargetRef: | ||
apiVersion: apps/v1 | ||
kind: Deployment | ||
name: torchserve | ||
# autoscale between 1 and 5 replicas | ||
minReplicas: 1 | ||
maxReplicas: 5 | ||
metrics: | ||
- type: External | ||
external: | ||
metricName: ts_queue_latency_microseconds | ||
targetValue: "7000000m" | ||
``` | ||
```bash | ||
kubectl apply -f hpa.yaml | ||
``` | ||
|
||
### 5. Check status of HPA | ||
|
||
```bash | ||
kubectl describe hpa torchserve | ||
``` | ||
|
||
```bash | ||
Name: torchserve | ||
Namespace: default | ||
Labels: <none> | ||
Annotations: <none> | ||
CreationTimestamp: Wed, 08 Sep 2021 20:09:48 +0530 | ||
Reference: Deployment/torchserve | ||
Metrics: ( current / target ) | ||
"ts_queue_latency_microseconds" (target value): 5257630m / 7k | ||
Min replicas: 1 | ||
Max replicas: 5 | ||
Deployment pods: 3 current / 3 desired | ||
Conditions: | ||
Type Status Reason Message | ||
---- ------ ------ ------- | ||
AbleToScale True ReadyForNewScale recommended size matches current size | ||
ScalingActive True ValidMetricFound the HPA was able to successfully calculate a replica count from external metric ts_queue_latency_microseconds(nil) | ||
ScalingLimited False DesiredWithinRange the desired count is within the acceptable range | ||
Events: <none> | ||
``` |
Oops, something went wrong.