Skip to content

Commit

Permalink
Each time we fail, wait longer for the next retry (#175)
Browse files Browse the repository at this point in the history
* Each time we fail, wait longer for the next retry

* Limit the max retry delay

* Increase the job ttl to 10 hours, to help backoff logic

* Fix broken test

* Attempt to wait for job to error

* Fix wait call

* Just wait for error more simply
  • Loading branch information
JohnGarbutt authored Sep 11, 2024
1 parent bc28791 commit 4a71b56
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 11 deletions.
20 changes: 16 additions & 4 deletions azimuth_caas_operator/operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,9 +403,21 @@ async def cluster_delete(body, name, namespace, labels, **kwargs):
error=error,
)
# unlabel the job, so we trigger a retry next time
await ansible_runner.unlabel_job(K8S_CLIENT, delete_job)
await ansible_runner.unlabel_delete_job(K8S_CLIENT, delete_job)

# Wait 60 seconds before retrying the delete
msg = f"Delete job failed for {name} in {namespace} because: {reason}"
# wait longer each time we fail
failed_delete_jobs = await ansible_runner.get_failed_delete_jobs_for_cluster(
K8S_CLIENT, name, namespace
)
delay_multiplier = len(failed_delete_jobs) - 1
# limit max delay to 128 (2**7) minutes between retries
# although auto delete jobs should age out after 10 hours anyway
delay_multiplier = min(delay_multiplier, 7)
delay = 60 * (2**delay_multiplier)

msg = (
f"Delete job failed for {name} in {namespace} "
f"retrying in {delay} seconds because: {reason}"
)
LOG.error(msg)
raise kopf.TemporaryError(msg, delay=60)
raise kopf.TemporaryError(msg, delay=delay)
13 changes: 9 additions & 4 deletions azimuth_caas_operator/tests/test_operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,8 @@ async def test_cluster_delete_waits_on_running_jobs(
)
mock_ensure_cluster_id.assert_awaited_once_with(operator.K8S_CLIENT, cluster)

@mock.patch.object(ansible_runner, "unlabel_job")
@mock.patch.object(ansible_runner, "get_failed_delete_jobs_for_cluster")
@mock.patch.object(ansible_runner, "unlabel_delete_job")
@mock.patch.object(ansible_runner, "get_job_error_message")
@mock.patch.object(ansible_runner, "get_job_completed_state")
@mock.patch.object(cluster_utils, "ensure_cluster_id")
Expand All @@ -622,18 +623,21 @@ async def test_cluster_delete_trigger_retry_on_error(
mock_ensure_cluster_id,
mock_get_job_state,
mock_get_error,
mock_unlabel,
mock_unlabel_delete_job,
mock_failed_jobs,
):
mock_get_jobs.return_value = "fakejob"
mock_get_job_state.return_value = False
mock_get_error.return_value = "Problem with job."
fake_body = cluster_crd.get_fake_dict()
mock_failed_jobs.return_value = ["failedjob1", "failedjob2"]

with self.assertRaises(kopf.TemporaryError) as ctx:
await operator.cluster_delete(fake_body, "cluster1", "ns", {})

self.assertEqual(
"Delete job failed for cluster1 in ns because: Problem with job.",
"Delete job failed for cluster1 in ns retrying in 120 seconds "
"because: Problem with job.",
str(ctx.exception),
)
mock_create_finsish.assert_awaited_once_with(
Expand All @@ -651,7 +655,8 @@ async def test_cluster_delete_trigger_retry_on_error(
"Possible reason for the failure was: Problem with job.",
)
mock_ensure_cluster_id.assert_awaited_once_with(operator.K8S_CLIENT, cluster)
mock_unlabel.assert_awaited_once_with(operator.K8S_CLIENT, "fakejob")
mock_unlabel_delete_job.assert_awaited_once_with(operator.K8S_CLIENT, "fakejob")
mock_failed_jobs.assert_awaited_once_with(operator.K8S_CLIENT, "cluster1", "ns")

@mock.patch("aiohttp.ClientSession.get")
async def test_fetch_ui_meta_from_url_success(self, mock_get):
Expand Down
2 changes: 1 addition & 1 deletion azimuth_caas_operator/tests/utils/test_ansible_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def test_get_job_remove(self):
runAsGroup: 1000
runAsUser: 1000
serviceAccountName: test1-tfstate
ttlSecondsAfterFinished: 3600
ttlSecondsAfterFinished: 36000
volumes:
- emptyDir: {}
name: runner-data
Expand Down
27 changes: 25 additions & 2 deletions azimuth_caas_operator/utils/ansible_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,8 @@ def get_job(
template:
spec:
{f'''
# auto-remove delete jobs after one hour
ttlSecondsAfterFinished: 3600
# auto-remove delete jobs after 10 hours
ttlSecondsAfterFinished: 36000
''' if remove else ''}
serviceAccountName: {service_account_name}
securityContext:
Expand Down Expand Up @@ -415,6 +415,20 @@ async def get_delete_job_for_cluster(client, cluster_name, namespace):
return await get_job_for_cluster(client, cluster_name, namespace, remove=True)


async def get_failed_delete_jobs_for_cluster(client, cluster_name, namespace):
    """Return all jobs recorded as failed delete attempts for a cluster.

    Jobs are matched within the namespace by the cluster-name label plus
    the "failed-delete-job" action label. The length of the returned list
    is used by the caller to compute an exponential retry backoff.
    """
    selector = {
        "azimuth-caas-cluster": cluster_name,
        "azimuth-caas-action": "failed-delete-job",
    }
    job_resource = await get_job_resource(client)
    failed_jobs = []
    async for failed_job in job_resource.list(labels=selector, namespace=namespace):
        failed_jobs.append(failed_job)
    return failed_jobs


async def get_job_for_cluster(
client, cluster_name, namespace, remove=False, update=False
):
Expand Down Expand Up @@ -631,6 +645,15 @@ async def unlabel_job(client, job):
)


async def unlabel_delete_job(client, job):
    """Re-label a failed delete job so a fresh retry can be triggered.

    Patches the job's "azimuth-caas-action" label to "failed-delete-job",
    which both takes it out of the active-delete-job selection and keeps
    a record of the failure (counted when computing the retry backoff).
    """
    patch_body = {
        "metadata": {"labels": {"azimuth-caas-action": "failed-delete-job"}}
    }
    jobs = await client.api("batch/v1").resource("jobs")
    await jobs.patch(
        job.metadata.name,
        patch_body,
        namespace=job.metadata.namespace,
    )


async def start_job(
client, cluster: cluster_crd.Cluster, namespace, remove=False, update=False
):
Expand Down
7 changes: 7 additions & 0 deletions tools/functional_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,16 @@ kubectl delete -f $SCRIPT_DIR/test_quick_delete_failure.yaml --wait=false
# wait for delete we expect to work
kubectl delete -f $SCRIPT_DIR/test_quick.yaml

# make sure the failed delete is still deleting
until kubectl wait --for=jsonpath='{.status.phase}'=Deleting cluster quick-test-fail-delete; do echo "wait for deleting"; sleep 2; done
# make sure the failed delete has an error set
until kubectl wait --for=jsonpath='{.status.error}' cluster quick-test-fail-delete; do echo "wait for delete failure"; sleep 2; done
# make sure the failed delete is still deleting
until kubectl wait --for=jsonpath='{.status.phase}'=Deleting cluster quick-test-fail-delete; do echo "wait for deleting"; sleep 2; done

kubectl get cluster
kubectl get cluster -o yaml
kubectl get jobs

# output logs from operator
kubectl logs -n azimuth-caas-operator deployment/azimuth-caas-operator operator

0 comments on commit 4a71b56

Please sign in to comment.