From 93e502758843322e6da588282373069df7bad9af Mon Sep 17 00:00:00 2001 From: Daniela Plascencia Date: Fri, 23 Feb 2024 10:40:09 +0100 Subject: [PATCH 1/2] tests: add a retry when asserting the up metric Adding a retry when checking the state of an alert gives prometheus-k8s time to scrape the necessary metrics for a unit. Without it, we may run into a race condition where the metric assertion runs before Prometheus has been able to scrape. This commit adds retry logic to avoid this. --- tests/integration/test_charm.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index d30ffdc..a6095f2 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -163,6 +163,18 @@ async def check_alert_propagation(url, alert_name): alert_rule = next((rule for rule in alert_rules if rule["name"] == alert_name)) assert alert_rule is not None and alert_rule["state"] == "firing" +@tenacity.retry(wait=tenacity.wait_fixed(30), stop=tenacity.stop_after_attempt(6), reraise=True) +async def assert_seldon_unit_is_available(prometheus_url): + """Assert the unit is available. + + This will be tried multiple times to avoid raising errors when prometheus-k8s + is not in an Active status. 
+ """ + # query for the up metric and assert the unit is available + up_query_response = await fetch_url( + f'http://{prometheus_url}:9090/api/v1/query?query=up{{juju_application="{APP_NAME}"}}' + ) + assert up_query_response["data"]["result"][0]["value"][1] == "1" @pytest.mark.abort_on_fail @pytest.mark.asyncio @@ -197,12 +209,6 @@ async def test_seldon_alert_rules(ops_test: OpsTest): discovered_labels = targets_result["data"]["activeTargets"][0]["discoveredLabels"] assert discovered_labels["juju_application"] == "seldon-controller-manager" - # query for the up metric and assert the unit is available - up_query_response = await fetch_url( - f'http://{prometheus_url}:9090/api/v1/query?query=up{{juju_application="{APP_NAME}"}}' - ) - assert up_query_response["data"]["result"][0]["value"][1] == "1" - # obtain alert rules from Prometheus rules_url = f"http://{prometheus_url}:9090/api/v1/rules" alert_rules_result = await fetch_url(rules_url) @@ -228,6 +234,9 @@ async def test_seldon_alert_rules(ops_test: OpsTest): for rule in rules: assert rule["name"] in rules_file_alert_names + # verify SeldonUnitIsUnavailable alert is not firing + await assert_seldon_unit_is_available(prometheus_url) + # The following integration test is optional (experimental) and might not be functioning # correctly under some conditions due to its reliance on timing of K8S deployments, timing of # Prometheus scraping, and rate calculations for alerts. 
From 1373963a585e7b6817428cc015f1eb9645cecd89 Mon Sep 17 00:00:00 2001 From: Daniela Plascencia Date: Fri, 23 Feb 2024 10:45:49 +0100 Subject: [PATCH 2/2] skip: fix lint --- tests/integration/test_charm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index a6095f2..14e1036 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -163,6 +163,7 @@ async def check_alert_propagation(url, alert_name): alert_rule = next((rule for rule in alert_rules if rule["name"] == alert_name)) assert alert_rule is not None and alert_rule["state"] == "firing" + @tenacity.retry(wait=tenacity.wait_fixed(30), stop=tenacity.stop_after_attempt(6), reraise=True) async def assert_seldon_unit_is_available(prometheus_url): """Assert the unit is available. @@ -176,6 +177,7 @@ async def assert_seldon_unit_is_available(prometheus_url): ) assert up_query_response["data"]["result"][0]["value"][1] == "1" + @pytest.mark.abort_on_fail @pytest.mark.asyncio async def test_seldon_alert_rules(ops_test: OpsTest):