This repository has been archived by the owner on Jul 18, 2018. It is now read-only.

Use deterministic pauses rather than fixed-length ones. #186

Merged
6 commits merged on Feb 15, 2017

Changes from all commits

@@ -1 +1,2 @@
---
api_servers: "{{ lookup('file', kubeconfig)|from_yaml|json_query('clusters[*].cluster.server') }}"
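For context, this lookup parses the kubeconfig on disk and pulls out every cluster's API server URL; the wait tasks below iterate over that list. A minimal sketch of input and output, with a made-up cluster name and address:

# Hypothetical clusters section of admin.kubeconfig:
#
#   clusters:
#     - name: example-cluster
#       cluster:
#         server: https://api.example-cluster.example.com
#
# from_yaml parses the file, and json_query('clusters[*].cluster.server') returns:
#
#   ['https://api.example-cluster.example.com']
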
@@ -12,6 +12,13 @@
template: src=config.yaml.part.jinja2
dest="{{ config_base | expanduser }}/{{kraken_config.cluster}}/fabric/config.yaml"

- name: Wait for api server to become available in case it's not
wait_for:
host: "{{ item|regex_replace('https://','') }}"
port: 443
timeout: 600
with_items: "{{ api_servers }}"
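
This is the heart of the change: instead of pausing for a fixed interval, wait_for returns as soon as each API server accepts connections on port 443, or fails after the timeout. A sketch of how one loop item renders, with a hypothetical address:

# With api_servers == ['https://api.example-cluster.example.com'], the task renders roughly as:
- name: Wait for api server to become available in case it's not
  wait_for:
    host: api.example-cluster.example.com   # scheme stripped by regex_replace
    port: 443
    timeout: 600                            # seconds; returns as soon as the port answers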

- name: Ensure the kube-networking namespace exists
command: >
kubectl --kubeconfig={{ kubeconfig | expanduser }} create namespace kube-networking
@@ -26,7 +26,7 @@
- name: Set the kraken end point fact
set_fact:
kraken_endpoint: "{{ endpoint_result.stdout }}"
when: kraken_action == 'up'
when: endpoint_result|succeeded and not endpoint_result|skipped

- name: Get kraken aws_route53_zone.private_zone.zone_id
shell: >
@@ -74,4 +74,4 @@
- route53_zone
- aws_prefix
- terraform.tfstate
- terraform.tfstate.backup
- terraform.tfstate.backup
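
On the conditional change above: guarding the set_fact on the registered result rather than on kraken_action protects against both skipped and failed endpoint lookups. A rough sketch of the two shapes the registered variable can take (values hypothetical):

# If the endpoint task ran and succeeded:
#   endpoint_result: { rc: 0, stdout: "internal-api-123456.us-west-2.elb.amazonaws.com", changed: true }
#   succeeded -> true, skipped -> false   => kraken_endpoint is set from stdout
#
# If the endpoint task was skipped:
#   endpoint_result: { skipped: true, changed: false }
#   succeeded -> true (a skipped task is not failed), skipped -> true   => set_fact is skipped,
#   avoiding a reference to the missing endpoint_result.stdout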
12 changes: 10 additions & 2 deletions ansible/roles/kraken.readiness/tasks/do-wait.yaml
@@ -7,8 +7,16 @@
set_fact:
wait_api_start_timestamp: "{{ lookup('pipe','date +%Y%m%d%H%M%S') }}"

- name: Wait for api server to become available
wait_for: host={{ kraken_endpoint }} port=443 timeout={{ readiness_wait }}
- name: Fetch k8s api server address
set_fact:
api_servers: "{{ lookup('file', kubeconfig)|from_yaml|json_query('clusters[*].cluster.server') }}"

- name: Wait for api server to become available in case it's not
wait_for:
host: "{{ item|regex_replace('https://','') }}"
port: 443
timeout: "{{ readiness_wait }}"
with_items: "{{ api_servers }}"

- name: Get timestamp after api server wait
set_fact:
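The timestamps recorded before and after the wait can be turned into an elapsed duration for reporting. This is a sketch only, assuming a matching end timestamp is captured the same way (wait_api_end_timestamp is a hypothetical name, and the to_datetime filter requires a reasonably recent Ansible):

- name: Report how long the api server wait took
  debug:
    msg: >-
      api server wait took
      {{ ((wait_api_end_timestamp | to_datetime('%Y%m%d%H%M%S')) -
          (wait_api_start_timestamp | to_datetime('%Y%m%d%H%M%S'))).seconds }}
      seconds
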
3 changes: 2 additions & 1 deletion ansible/roles/kraken.services/defaults/main.yml
@@ -2,4 +2,5 @@
kubeconfig: "{{ config_base | expanduser }}/{{kraken_config.cluster}}/admin.kubeconfig"
helm_home: "{{ config_base | expanduser }}/{{kraken_config.cluster}}/.helm"
tiller: tiller-deploy
tiller_image:
tiller_image:
api_servers: "{{ lookup('file', kubeconfig)|from_yaml|json_query('clusters[*].cluster.server') }}"
50 changes: 36 additions & 14 deletions ansible/roles/kraken.services/tasks/kill-services.yaml
@@ -3,7 +3,7 @@
command: >
kubectl --kubeconfig={{ kubeconfig }} get services --all-namespaces -o json
register: added_services
when: kraken_action == 'down'
when: kraken_action == 'down' and kubeconfig|is_file
ignore_errors: yes

- name: Register services fact
@@ -18,17 +18,6 @@
when: kraken_action == 'down'
ignore_errors: yes

- name: Clean up services
command: >
kubectl --kubeconfig={{ kubeconfig }} delete --namespace {{ item.metadata.namespace }} svc {{ item.metadata.name }}
with_items: "{{ the_services }}"
when: item.status.loadBalancer.ingress[0].hostname is defined and kraken_action == 'down'
ignore_errors: yes

- name: Pause to let services come down
pause: minutes=5
when: kraken_action == 'down'

- name: Clean up releases
command: >
helm delete --purge {{ item.name }}
@@ -37,16 +26,49 @@
HELM_HOME: "{{ helm_home }}"
with_items: "{{cluster_services}}"
ignore_errors: yes
when: tiller_present|success
when: tiller_present|succeeded and not tiller_present|skipped

- name: Clean up tiller if present
command: >
kubectl --kubeconfig={{ kubeconfig }} delete deployment {{ tiller }} --namespace=kube-system
when: tiller_present|success
when: tiller_present|succeeded and not tiller_present|skipped

- name: Clean up services
command: >
kubectl --kubeconfig={{ kubeconfig }} delete --namespace {{ item.metadata.namespace }} svc {{ item.metadata.name }}
with_items: "{{ the_services }}"
when: item.status.loadBalancer.ingress[0].hostname is defined and kraken_action == 'down'
ignore_errors: yes

- name: Delete all service namespaces
command: >
kubectl --kubeconfig={{ kubeconfig }} delete namespace {{ item }}
with_items: "{{ cluster_namespaces }}"
when: cluster_namespaces is defined
ignore_errors: yes

- name: Get vpc id
shell: "terraform state show -state={{ config_base | expanduser }}/{{kraken_config.cluster}}/terraform.tfstate module.vpc.aws_vpc.vpc | awk '/^id/{print $3}'"
register: terraform_state_show
when: kraken_action == 'down'
changed_when: false

- name: Set vpc_id fact
set_fact:
vpcid: "{{ terraform_state_show.stdout }}"
when: kraken_action == 'down'

- name: Wait for ELBs to be deleted
action:
module: ec2_elb_facts
region: "{{ kraken_config.providerConfig.region }}"
aws_access_key: "{{ kraken_config.providerConfig.authentication.accessKey or omit}}"
aws_secret_key: "{{ kraken_config.providerConfig.authentication.accessSecret or omit }}"
profile: "{{ kraken_config.providerConfig.authentication.credentialsProfile or omit }}"
register: elb_facts
vars:
vpc_lookup: "elbs[?vpc_id=='{{ vpcid }}']"
when: kraken_action == 'down' and kraken_config.provider == 'aws'
until: (elb_facts is none) or (elb_facts|json_query(vpc_lookup) is none) or (elb_facts|json_query(vpc_lookup)|length <= 1)
retries: 600
delay: 1
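
To make the until condition concrete: ec2_elb_facts registers the load balancers visible to the account, and the vpc_lookup JMESPath expression keeps only those in this cluster's VPC. An abridged sketch of the registered data (names and IDs made up):

# elb_facts (abridged, hypothetical values):
#   elbs:
#     - name: a1b2c3d4e5f6
#       dns_name: a1b2c3d4e5f6-1234567890.us-west-2.elb.amazonaws.com
#       vpc_id: vpc-0abc1234
#     - name: other-elb
#       vpc_id: vpc-0def5678
#
# elb_facts|json_query("elbs[?vpc_id=='vpc-0abc1234']") keeps only the first entry.
# The task re-polls up to 600 times, one second apart, until at most one ELB is left
# in the VPC, i.e. until the service load balancers have actually been deleted.
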
Contributor:
I think this will cause us to exceed API rate limits. Maybe we should do a quick benchmark of the average ELB deletion time and then make that the delay?

Contributor (author):
1000 requests per second (rps) with a burst limit of 2000 rps.

Unless there are 1000 k2s doing this from one account, we should be ok.

Contributor:
It's probably more acrobatics than we need, but a geometrically decreasing delay might be one way to minimize the total time we wait. :) I am more concerned with making this robust for users than making it fast for developers.

Contributor:
I believe you are quoting the rate limit for the AWS API Gateway Service, which is different from the rate limit for an AWS account.

http://docs.aws.amazon.com/AWSEC2/latest/APIReference/query-api-troubleshooting.html#api-request-rate

I am having trouble finding documentation on the rate limits for Describe, Modify, and Create actions. You are doing a describe here, so that is safer. Note that we have run into rate limits on our team in the past though, and the documentation specifically recommends an exponential back-off...

Contributor (author):
And Ansible doesn't have the facility to do that.

Contributor (author):
I suggest we go ahead with the 1-second loop and revisit this if there actually is a problem, or if a backoff feature is added to Ansible.

Contributor (@davidewatson, Feb 15, 2017):
So here we are using the ec2_elb_facts Ansible module:

https://github.com/wimnat/ansible-modules/blob/master/ec2_elb_facts/ec2_elb_facts.py

This uses boto.ec2.elb which I think implements a binary exponential backoff:

https://github.com/boto/boto/blob/abb38474ee5124bb571da0c42be67cd27c47094f/boto/connection.py

So maybe your change is safe. I know I am being pedantic here; it's just that 1 second really seems excessive when we know ELB operations are much slower. I would prefer we verify that boto is implementing a backoff, and then optionally change the delay to something more reasonable, or not. If a backoff is in place then I am not too concerned about CPU utilization here.

Contributor (author):
Unfortunately the boto retries are for opening the connection and not the results of the query.

I'm just suggesting we not pre-optimize. I haven't seen any rate limit errors in all the times I've upped and downed - and that's on us-west which is (apparently) notorious for rate limiting. If it turns out that it's a problem, we can easily change the delay later.

Btw... I've recently been pretty consistently getting 5 retries before it moves on and that's with central-logging added.

Contributor:
This looks fine to me. API limits (as described in kubernetes/kubernetes#39526) seem to be a problem only in the thousand-plus-per-hour range. If we want this to have an exponential backoff (seems fine), let's open a fresh ticket and do the work there.

Contributor:
LGTM.
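
Since exponential backoff came up several times in this thread, here is one rough sketch of what it could look like if it were ever needed. It is not what the PR implements; it assumes the AWS CLI is installed and credentialed on the host running Ansible, and the attempt count and delays are arbitrary:

- name: Wait for ELBs to be deleted (exponential backoff sketch)
  shell: |
    delay=1
    for attempt in $(seq 1 12); do
      remaining=$(aws elb describe-load-balancers \
        --region {{ kraken_config.providerConfig.region }} \
        --query "length(LoadBalancerDescriptions[?VPCId=='{{ vpcid }}'])" \
        --output text)
      # Stop as soon as at most one ELB is left in the cluster VPC.
      [ "$remaining" -le 1 ] && exit 0
      sleep "$delay"
      delay=$(( delay * 2 ))    # 1s, 2s, 4s, ... roughly 68 minutes in total
    done
    exit 1
  when: kraken_action == 'down' and kraken_config.provider == 'aws'
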

1 change: 1 addition & 0 deletions ansible/roles/kraken.services/tasks/main.yml
@@ -21,6 +21,7 @@
shell: >
kubectl --kubeconfig={{ kubeconfig }} get deployment {{ tiller }} --namespace=kube-system
register: tiller_present
when: kubeconfig|is_file
ignore_errors: yes

- include: kill-services.yaml
16 changes: 14 additions & 2 deletions ansible/roles/kraken.services/tasks/run-services.yaml
@@ -25,6 +25,13 @@
helm_init_command: "{{helm_command}} init --tiller-image {{ tiller_image }}"
when: (tiller_image is defined) and (tiller_image is not none) and (tiller_image|trim != '')

- name: Wait for api server to become available in case it's not
wait_for:
host: "{{ item|regex_replace('https://','') }}"
port: 443
timeout: 600
with_items: "{{ api_servers }}"

- name: Init helm dry-run
command: >
{{helm_init_command}} --dry-run
@@ -45,8 +52,13 @@
retries: 60
delay: 1

- name: Give tiller rc a chance to fully init
pause: seconds=60
- name: Wait for tiller to be ready
command: "kubectl --kubeconfig={{ kubeconfig | expanduser }} --namespace=kube-system get deploy tiller-deploy -o json"
register: output
until: ((output.stdout|from_json).status.availableReplicas|default(0)) > 0
retries: 600
delay: 1
changed_when: false

- name: Remove helm repositories
command: >
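On the tiller readiness change in run-services.yaml: the until expression parses the deployment JSON that kubectl prints and loops until at least one replica is available, replacing the old blind 60-second pause. A trimmed, illustrative example of what the loop evaluates:

# kubectl ... get deploy tiller-deploy -o json   (trimmed; values illustrative)
#   { "status": { "replicas": 1, "updatedReplicas": 1, "availableReplicas": 1 } }
#
# (output.stdout|from_json).status.availableReplicas|default(0) evaluates to 1 here,
# so the loop exits; while the deployment is still rolling out, availableReplicas is
# absent or 0 and the task retries (up to 600 times, one second apart).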