Skip to content

Commit

Permalink
Improve error reporting
Browse files (browse the repository at this point in the history)
  • Loading branch information
mkjpryor committed Aug 13, 2024
1 parent f44b9ca commit 1eaf3da
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 19 deletions.
15 changes: 7 additions & 8 deletions azimuth_caas_operator/operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,21 +141,20 @@ async def cluster_create(body, name, namespace, labels, **kwargs):
# Wait for Blazar lease to be active
try:
flavor_map = await lease_utils.ensure_lease_active(K8S_CLIENT, cluster)
except lease_utils.LeaseInError:
except lease_utils.LeaseInError as exc:
# TODO(johngarbutt) we need to tell between these two cases!
message = str(exc)
if "external service enforcement filter denied the request" in message.lower():
message = "Not enough credits to create platform"
await cluster_utils.update_cluster(
K8S_CLIENT,
name,
namespace,
cluster_crd.ClusterPhase.FAILED,
error="Cloud is full, or you are out of credits.",
K8S_CLIENT, name, namespace, cluster_crd.ClusterPhase.FAILED, error=message
)
msg = f"Lease is in Error state for {name} in {namespace}"
LOG.error(msg)
# keep polling until the lease is active, but report the error
raise kopf.TemporaryError(msg, delay=20)

await cluster_utils.update_cluster_flavors(K8S_CLIENT, cluster, flavor_map)
else:
await cluster_utils.update_cluster_flavors(K8S_CLIENT, cluster, flavor_map)

# Check for an existing create job
create_job = await ansible_runner.get_create_job_for_cluster(
Expand Down
1 change: 0 additions & 1 deletion azimuth_caas_operator/utils/ansible_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from cryptography.hazmat.primitives import serialization

from easykube import ApiError
import kopf

from azimuth_caas_operator.models.v1alpha1 import cluster as cluster_crd
from azimuth_caas_operator.models.v1alpha1 import cluster_type as cluster_type_crd
Expand Down
30 changes: 20 additions & 10 deletions azimuth_caas_operator/utils/lease.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
from azimuth_caas_operator.models.v1alpha1 import cluster as cluster_crd

SCHEDULE_API_VERSION = "scheduling.azimuth.stackhpc.com/v1alpha1"
FINALIZER = "caas.stackhpc.com"
FINALIZER = "caas.azimuth.stackhpc.com"
LOG = logging.getLogger(__name__)


class LeaseInError(Exception):
    """Raised when a lease reports the ``Error`` phase.

    The exception message carries the human-readable reason for the
    failure, so a handler can surface ``str(exc)`` to the user.
    """


async def _patch_finalizers(resource, name, namespace, finalizers):
"""
Patches the finalizers of a resource. If the resource does not exist any
Expand All @@ -33,7 +35,7 @@ async def ensure_lease_active(client, cluster: cluster_crd.Cluster):
"""
if not cluster.spec.leaseName:
LOG.info("No leaseName set, skipping lease check.")
return
return {}

lease_resource = await client.api(SCHEDULE_API_VERSION).resource("leases")
lease = await lease_resource.fetch(
Expand All @@ -54,13 +56,15 @@ async def ensure_lease_active(client, cluster: cluster_crd.Cluster):
)
LOG.info("Added finalizer to the lease.")

if lease and "status" in lease and lease["status"]["phase"] == "Active":
lease_status = lease.get("status", {})

if lease_status.get("phase", "Unknown") == "Active":
LOG.info("Lease is active!")
# return mapping of requested flavor to reservation flavor
return lease["status"]["flavorMap"]
return lease["status"]["sizeMap"]

if lease and "status" in lease and lease["status"]["phase"] == "Error":
raise LeaseInError("Lease is in Error state.")
if lease_status.get("phase", "Unknown") == "Error":
raise LeaseInError(lease_status.get("errorMessage", "Error creating lease"))

LOG.info(f"Lease {cluster.spec.leaseName} is not active, wait till active.")
delay = 60
Expand All @@ -81,10 +85,16 @@ async def ensure_lease_active(client, cluster: cluster_crd.Cluster):

async def drop_lease_finalizer(client, cluster: cluster_crd.Cluster):
lease_resource = await client.api(SCHEDULE_API_VERSION).resource("leases")
lease = await lease_resource.fetch(
cluster.spec.leaseName,
namespace=cluster.metadata.namespace,
)
try:
lease = await lease_resource.fetch(
cluster.spec.leaseName,
namespace=cluster.metadata.namespace,
)
except easykube.ApiError as exc:
if exc.status_code == 404:
return
else:
raise
finalizers = lease.get("metadata", {}).get("finalizers", [])
await _patch_finalizers(
lease_resource,
Expand Down

0 comments on commit 1eaf3da

Please sign in to comment.