-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement Kubernetes deployment autoscaler (#2685)
* Add basic deployment autoscaler * Build deployment autoscaler docker image * Add deployment autoscaler to kubernetes cluster * Repair deployment autoscaler kubernetes configuration * Repair deployment autoscaler docker image * Cleanup source files * Use common kubernetes client configuration
- Loading branch information
Showing
24 changed files
with
2,577 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,215 @@ | ||
# Deployment autoscaler | ||
|
||
> [Issue #2639](https://github.com/epam/cloud-pipeline/issues/2639) | ||
Deployment autoscaler horizontally autoscales both kubernetes deployments and kubernetes nodes in order to achieve some predefined target utilization. | ||
|
||
## Deployment | ||
|
||
In order to deploy deployment autoscaler for some kubernetes deployment use the following code snippet. | ||
|
||
```bash | ||
# Download deployment autoscaler resources | ||
CP_WORKDIR="$(mktemp -d)" | ||
CP_URL="https://raw.githubusercontent.com/epam/cloud-pipeline/develop/deploy/contents/k8s/cp-deployment-autoscaler/" | ||
wget "${CP_URL}/cp-deployment-autoscaler-dpl.yaml" -O "${CP_WORKDIR}/cp-deployment-autoscaler-dpl.yaml" | ||
wget "${CP_URL}/config.json" -O "${CP_WORKDIR}/config.json" | ||
|
||
# Specify autoscaler settings | ||
export CP_DEPLOYMENT_AUTOSCALER_DEPLOYMENT_NAME="cp-api-srv-autoscaler" | ||
export CP_DEPLOYMENT_AUTOSCALER_CONFIGMAP_NAME="${CP_DEPLOYMENT_AUTOSCALER_DEPLOYMENT_NAME}-config" | ||
export CP_DEPLOYMENT_AUTOSCALER_DEPLOYMENT_FILE="${CP_WORKDIR}/cp-deployment-autoscaler-dpl.yaml" | ||
export CP_DEPLOYMENT_AUTOSCALER_CONFIG_FILE="${CP_WORKDIR}/config.json" | ||
export CP_DOCKER_DIST_SRV="${CP_DOCKER_DIST_SRV:-"quay.io/"}" | ||
export CP_VERSION="${CP_VERSION:-"0.17"}" | ||
|
||
# Prepare autoscaler configuration | ||
nano "${CP_DEPLOYMENT_AUTOSCALER_CONFIG_FILE}" | ||
|
||
# Create or replace autoscaler deployment configuration | ||
if kubectl get cm "${CP_DEPLOYMENT_AUTOSCALER_CONFIGMAP_NAME}"; then | ||
kubectl delete cm "${CP_DEPLOYMENT_AUTOSCALER_CONFIGMAP_NAME}" | ||
fi | ||
kubectl create cm "${CP_DEPLOYMENT_AUTOSCALER_CONFIGMAP_NAME}" --from-file="${CP_DEPLOYMENT_AUTOSCALER_CONFIG_FILE}" | ||
|
||
# Prepare autoscaler deployment configuration | ||
envsubst < "${CP_DEPLOYMENT_AUTOSCALER_DEPLOYMENT_FILE}" > "${CP_WORKDIR}/_tmp" | ||
cp "${CP_WORKDIR}/_tmp" "${CP_DEPLOYMENT_AUTOSCALER_DEPLOYMENT_FILE}" | ||
|
||
# Create or replace autoscaler deployment | ||
if kubectl get deploy "${CP_DEPLOYMENT_AUTOSCALER_DEPLOYMENT_NAME}"; then | ||
kubectl delete deploy "${CP_DEPLOYMENT_AUTOSCALER_DEPLOYMENT_NAME}" | ||
fi | ||
kubectl apply -f "${CP_DEPLOYMENT_AUTOSCALER_DEPLOYMENT_FILE}" | ||
|
||
# Cleanup resources | ||
rm "${CP_WORKDIR}" | ||
``` | ||
|
||
## Configuration | ||
|
||
Deployment autoscaler parameter descriptions can be found in the following code snippet. | ||
|
||
```json | ||
{ | ||
"target": { | ||
// Specifies kubernetes deployments to autoscale | ||
"deployments": [ | ||
"cp-api-srv" | ||
], | ||
|
||
// Specifies kubernetes labels of deployment nodes | ||
"labels": { | ||
"cloud-pipeline/cp-api-srv": "true" | ||
}, | ||
|
||
// Specifies kubernetes labels of deployment transient nodes | ||
"transient_labels": { | ||
"cloud-pipeline/persistence": "transient" | ||
}, | ||
|
||
// Specifies tags of deployment instances | ||
"tags": { | ||
"cloud-pipeline/environment": "dev", | ||
"cloud-pipeline/deployment": "cp-api-srv" | ||
}, | ||
|
||
// Specifies tags of deployment transient instances | ||
"transient_tags": { | ||
"cloud-pipeline/persistence": "transient" | ||
}, | ||
|
||
// Specifies kubernetes deployment labels to check for reserved pods | ||
"reserved_labels": [ | ||
"cp-api-srv/service-leader" | ||
], | ||
|
||
// Specifies instances which cannot be scaled down | ||
"forbidden_instances": [ | ||
"i-12345678901234567" | ||
], | ||
|
||
// Specifies kubernetes nodes which cannot be scaled down | ||
"forbidden_nodes": [ | ||
"ip-123-45-6-789.eu-central-1.compute.internal" | ||
] | ||
}, | ||
"trigger": { | ||
// Specifies cluster nodes per target replicas coefficient. | ||
// The autoscaler tries to minimize a difference between | ||
// the actual and target coefficient value by scaling replicas. | ||
"cluster_nodes_per_target_replicas": 100, | ||
|
||
// Specifies target replicas per target nodes coefficient. | ||
// The autoscaler tries to minimize a difference between | ||
// the actual and target coefficient value by scaling nodes. | ||
"target_replicas_per_target_nodes": 1, | ||
|
||
// Specifies number of memory pressured target nodes | ||
// which triggers nodes/replicas scaling. | ||
"memory_pressured_nodes": 1, | ||
|
||
// Specifies number of disk pressured target nodes | ||
// which triggers nodes/replicas scaling. | ||
"disk_pressured_nodes": 1, | ||
|
||
// Specifies number of pid pressured target nodes | ||
// which triggers nodes/replicas scaling. | ||
"pid_pressured_nodes": 1, | ||
|
||
"cpu_utilization": { | ||
// Specifies percent of target nodes cpu utilization | ||
// which triggers nodes/replicas scaling ↑. | ||
"max": 90, | ||
|
||
// Specifies cpu utilization monitoring period in seconds. | ||
"monitoring_period": 600 | ||
}, | ||
|
||
"memory_utilization": { | ||
// Specifies percent of target nodes memory utilization | ||
// which triggers nodes/replicas scaling ↑. | ||
"max": 90, | ||
|
||
// Specifies memory utilization monitoring period in seconds. | ||
"monitoring_period": 600 | ||
} | ||
}, | ||
"rules": { | ||
// Specifies how to handle instances which doesn't have corresponding nodes: | ||
// SKIP - skips such instances explicit processing. | ||
// STOP - terminates such instances. | ||
"on_lost_instances" : "SKIP|STOP", | ||
|
||
// Specifies how to handle nodes which doesn't have corresponding instances: | ||
// SKIP - skips such nodes explicit processing. | ||
// STOP - deletes such nodes. | ||
"on_lost_nodes": "SKIP|STOP", | ||
|
||
"on_threshold_trigger": { | ||
// Specifies how many extra replicas can be scaled ↑ if some threshold triggers. | ||
"extra_replicas": 2, | ||
|
||
// Specifies how many extra nodes can be scaled ↑ if some threshold triggers. | ||
"extra_nodes": 2 | ||
} | ||
}, | ||
"limit": { | ||
// Specifies minimum number of deployment nodes. | ||
"min_nodes_number": 1, | ||
|
||
// Specifies maximum number of deployment nodes. | ||
"max_nodes_number": 10, | ||
|
||
// Specifies minimum number of deployment replicas. | ||
"min_replicas_number": 1, | ||
|
||
// Specifies maximum number of deployment replicas. | ||
"max_replicas_number": 10, | ||
|
||
// Specifies minimum interval between two consequent scalings in seconds. | ||
"min_scale_interval": 300, | ||
|
||
// Specifies minimum trigger duration before scaling in seconds. | ||
"min_triggers_duration": 60 | ||
}, | ||
"instance": { | ||
"instance_cloud": "aws", | ||
"instance_region": "eu-central-1", | ||
"instance_image": "ami-12345678901234567", | ||
"instance_type": "m5.xlarge", | ||
"instance_disk": 500, | ||
"instance_sshkey": "deploykey", | ||
"instance_subnet": "subnet-12345678", | ||
"instance_security_groups": [ | ||
"sg-12345678" | ||
], | ||
"instance_role": "arn:aws:iam::123456789012:instance-profile/Cloud-Pipeline-Service", | ||
"instance_name": "cp-deployment-autoscaler-instance", | ||
"instance_init_script": "/opt/deployment-autoscaler/init_multicloud.sh" | ||
}, | ||
"node": { | ||
"kube_token": "12345678901234567890123", | ||
"kube_ip": "123.45.6.789", | ||
"kube_port": "6443", | ||
"kube_dns_ip": "10.96.0.10", | ||
"aws_fs_url": "fs-12345678901234567.fsx.eu-central-1.amazonaws.com@tcp:/12345678" | ||
}, | ||
"timeout": { | ||
// Specifies node scaling ↑ polling timeout. | ||
"scale_up_node_timeout": 900, | ||
|
||
// Specifies node scaling ↑ polling delay. | ||
"scale_up_node_delay": 10, | ||
|
||
// Specifies instances scaling ↑ polling timeout. | ||
"scale_up_instance_timeout": 60, | ||
|
||
// Specifies instances scaling ↑ polling delay. | ||
"scale_up_instance_delay": 10 | ||
}, | ||
"misc": { | ||
"boto3_retry_count": 10 | ||
} | ||
} | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
{ | ||
"target": { | ||
"deployments": [ | ||
"cp-api-srv" | ||
], | ||
"labels": { | ||
"cloud-pipeline/cp-api-srv": "true" | ||
}, | ||
"transient_labels": { | ||
"cloud-pipeline/persistence": "transient" | ||
}, | ||
"tags": { | ||
"cloud-pipeline/environment": "dev", | ||
"cloud-pipeline/deployment": "cp-api-srv" | ||
}, | ||
"transient_tags": { | ||
"cloud-pipeline/persistence": "transient" | ||
}, | ||
"reserved_labels": [ | ||
"cp-api-srv/service-leader" | ||
], | ||
"forbidden_instances": [ | ||
"i-12345678901234567" | ||
], | ||
"forbidden_nodes": [ | ||
"ip-123-45-6-789.eu-central-1.compute.internal" | ||
] | ||
}, | ||
"trigger": { | ||
"cluster_nodes_per_target_replicas": 100, | ||
"target_replicas_per_target_nodes": 1, | ||
"memory_pressured_nodes": 1, | ||
"disk_pressured_nodes": 1, | ||
"pid_pressured_nodes": 1, | ||
"cpu_utilization": { | ||
"max": 90, | ||
"monitoring_period": 600 | ||
}, | ||
"memory_utilization": { | ||
"max": 90, | ||
"monitoring_period": 600 | ||
} | ||
}, | ||
"rules": { | ||
"on_lost_instances" : "SKIP", | ||
"on_lost_nodes": "SKIP", | ||
"on_threshold_trigger": { | ||
"extra_replicas": 2, | ||
"extra_nodes": 2 | ||
} | ||
}, | ||
"limit": { | ||
"min_nodes_number": 1, | ||
"max_nodes_number": 10, | ||
"min_replicas_number": 1, | ||
"max_replicas_number": 10, | ||
"min_scale_interval": 300, | ||
"min_triggers_duration": 60 | ||
}, | ||
"instance": { | ||
"instance_cloud": "aws", | ||
"instance_region": "eu-central-1", | ||
"instance_image": "ami-12345678901234567", | ||
"instance_type": "r5.xlarge", | ||
"instance_disk": 500, | ||
"instance_sshkey": "deploykey", | ||
"instance_subnet": "subnet-12345678", | ||
"instance_security_groups": [ | ||
"sg-12345678" | ||
], | ||
"instance_role": "arn:aws:iam::123456789012:instance-profile/Cloud-Pipeline-Service", | ||
"instance_name": "cp-deployment-autoscaler-instance", | ||
"instance_init_script": "/opt/deployment-autoscaler/init_multicloud.sh" | ||
}, | ||
"node": { | ||
"kube_token": "12345678901234567890123", | ||
"kube_ip": "123.45.6.789", | ||
"kube_port": "6443", | ||
"kube_dns_ip": "10.96.0.10", | ||
"aws_fs_url": "fs-12345678901234567.fsx.eu-central-1.amazonaws.com@tcp:/12345678" | ||
}, | ||
"timeout": { | ||
"scale_up_node_timeout": 900, | ||
"scale_up_node_delay": 10, | ||
"scale_up_instance_timeout": 60, | ||
"scale_up_instance_delay": 10 | ||
}, | ||
"misc": { | ||
"boto3_retry_count": 10 | ||
} | ||
} |
64 changes: 64 additions & 0 deletions
64
deploy/contents/k8s/cp-deployment-autoscaler/cp-deployment-autoscaler-dpl.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
apiVersion: extensions/v1beta1 | ||
kind: Deployment | ||
metadata: | ||
name: ${CP_DEPLOYMENT_AUTOSCALER_DEPLOYMENT_NAME} | ||
namespace: default | ||
spec: | ||
replicas: 1 | ||
template: | ||
metadata: | ||
name: ${CP_DEPLOYMENT_AUTOSCALER_DEPLOYMENT_NAME} | ||
namespace: default | ||
labels: | ||
cloud-pipeline/cp-deployment-autoscaler: "true" | ||
spec: | ||
nodeSelector: | ||
cloud-pipeline/cp-deployment-autoscaler: "true" | ||
tolerations: | ||
- key: node-role.kubernetes.io/master | ||
effect: NoSchedule | ||
containers: | ||
- name: cp-deployment-autoscaler | ||
image: ${CP_DOCKER_DIST_SRV}lifescience/cloud-pipeline:deployment-autoscaler-${CP_VERSION} | ||
imagePullPolicy: IfNotPresent | ||
envFrom: | ||
- configMapRef: | ||
name: cp-config-global | ||
env: | ||
- name: CP_LOGGING_LEVEL | ||
value: "DEBUG" | ||
- name: CP_LOGGING_FILE | ||
value: "/opt/deployment-autoscaler/logs/${CP_DEPLOYMENT_AUTOSCALER_DEPLOYMENT_NAME}.log" | ||
- name: CP_LOGGING_HISTORY | ||
value: "10" | ||
- name: CP_DEPLOYMENT_AUTOSCALE_CONFIGURATION_FILE | ||
value: "/etc/config/config.json" | ||
- name: CP_DEPLOYMENT_AUTOSCALE_POLLING_TIMEOUT | ||
value: "10" | ||
volumeMounts: | ||
- name: cp-deployment-autoscaler-config | ||
mountPath: /etc/config | ||
readOnly: true | ||
- name: cp-deployment-autoscaler-logs | ||
mountPath: /opt/deployment-autoscaler/logs | ||
- mountPath: /root/.kube | ||
name: kube-config | ||
readOnly: true | ||
- name: cp-cloud-credentials | ||
mountPath: /root/.cloud | ||
readOnly: true | ||
volumes: | ||
- name: cp-deployment-autoscaler-config | ||
configMap: | ||
name: ${CP_DEPLOYMENT_AUTOSCALER_CONFIGMAP_NAME} | ||
- name: cp-deployment-autoscaler-logs | ||
hostPath: | ||
path: /opt/deployment-autoscaler/logs | ||
- name: kube-config | ||
hostPath: | ||
path: /root/.kube | ||
- name: cp-cloud-credentials | ||
secret: | ||
secretName: cp-cloud-credentials | ||
imagePullSecrets: | ||
- name: cp-distr-docker-registry-secret |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.