From 3d1b50863742138bdc2b62ad8e91d68c927fb3af Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Date: Mon, 23 Sep 2024 09:21:31 -0300 Subject: [PATCH] [MISC] Fixes for PITR backup test instabilities (#690) * try CI fixes * reformat waits * fix test charm + nits * try fix wrong blocked messages * add wildcard * try another command * small nits * reposition pgdata storage check * try fixing PITR checks * check PITR fails even if service is active * adapt on peer relation changed hook * retry pitr fail check * dont wait for remove relations in test_backups * try fixing restore fail catching * check all pg logs on pitr fail check * revert previous error message * final nits * use walrus check Signed-off-by: Lucas Gameiro Borges * add coverage --------- Signed-off-by: Lucas Gameiro Borges --- src/charm.py | 143 ++++++++++++------ src/upgrade.py | 3 + .../ha_tests/test_upgrade_from_stable.py | 2 +- tests/integration/helpers.py | 2 +- .../new_relations/test_new_relations.py | 6 +- tests/integration/test_backups.py | 22 ++- tests/integration/test_backups_pitr.py | 10 +- tests/integration/test_charm.py | 6 +- tests/unit/test_charm.py | 28 +++- 9 files changed, 164 insertions(+), 58 deletions(-) diff --git a/src/charm.py b/src/charm.py index a815e9ba91..dadaa4b34b 100755 --- a/src/charm.py +++ b/src/charm.py @@ -11,6 +11,7 @@ import re import shutil import sys +import time from pathlib import Path from typing import Dict, List, Literal, Optional, Tuple, get_args @@ -338,8 +339,7 @@ def get_secret(self, scope: Scopes, key: str) -> Optional[str]: if scope not in get_args(Scopes): raise RuntimeError("Unknown secret scope.") - peers = self.model.get_relation(PEER) - if not peers: + if not (peers := self.model.get_relation(PEER)): return None secret_key = self._translate_field_to_secret_key(key) @@ -357,7 +357,9 @@ def set_secret(self, scope: Scopes, key: str, value: Optional[str]) -> Optional[ if not value: return self.remove_secret(scope, key) - peers = self.model.get_relation(PEER) + if not (peers := self.model.get_relation(PEER)): + return None + secret_key = self._translate_field_to_secret_key(key) # Old translation in databag is to be deleted self.peer_relation_data(scope).delete_relation_data(peers.id, [key]) @@ -368,7 +370,9 @@ def remove_secret(self, scope: Scopes, key: str) -> None: if scope not in get_args(Scopes): raise RuntimeError("Unknown secret scope.") - peers = self.model.get_relation(PEER) + if not (peers := self.model.get_relation(PEER)): + return None + secret_key = self._translate_field_to_secret_key(key) if scope == APP_SCOPE: self.peer_relation_app.delete_relation_data(peers.id, [secret_key]) @@ -551,11 +555,18 @@ def _on_peer_relation_changed(self, event: HookEvent) -> None: # noqa: C901 logger.error("Invalid configuration: %s", str(e)) return - # If PITR restore failed, then wait it for resolve. + # Should not override a blocked status + if isinstance(self.unit.status, BlockedStatus): + logger.debug("on_peer_relation_changed early exit: Unit in blocked status") + return + + services = container.pebble.get_services(names=[self._postgresql_service]) if ( - "restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data - ) and isinstance(self.unit.status, BlockedStatus): - event.defer() + ("restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data) + and len(services) > 0 + and not self._was_restore_successful(container, services[0]) + ): + logger.debug("on_peer_relation_changed early exit: Backup restore check failed") return # Validate the status of the member before setting an ActiveStatus. @@ -1344,6 +1355,23 @@ def _on_update_status_early_exit_checks(self, container) -> bool: if not container.can_connect(): logger.debug("on_update_status early exit: Cannot connect to container") return False + + self._check_pgdata_storage_size() + + if self._has_blocked_status or self._has_non_restore_waiting_status: + # If charm was failing to disable plugin, try again and continue (user may have removed the objects) + if self.unit.status.message == EXTENSION_OBJECT_MESSAGE: + self.enable_disable_extensions() + return True + + if ( + self.unit.status.message == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET + and "require-change-bucket-after-restore" not in self.app_peer_data + ): + return True + + logger.debug("on_update_status early exit: Unit is in Blocked/Waiting status") + return False return True def _check_pgdata_storage_size(self) -> None: @@ -1372,15 +1400,6 @@ def _on_update_status(self, _) -> None: if not self._on_update_status_early_exit_checks(container): return - self._check_pgdata_storage_size() - - if self._has_blocked_status or self._has_non_restore_waiting_status: - # If charm was failing to disable plugin, try again (user may have removed the objects) - if self.unit.status.message == EXTENSION_OBJECT_MESSAGE: - self.enable_disable_extensions() - logger.debug("on_update_status early exit: Unit is in Blocked/Waiting status") - return - services = container.pebble.get_services(names=[self._postgresql_service]) if len(services) == 0: # Service has not been added nor started yet, so don't try to check Patroni API. @@ -1419,23 +1438,25 @@ def _on_update_status(self, _) -> None: def _was_restore_successful(self, container: Container, service: ServiceInfo) -> bool: """Checks if restore operation succeeded and S3 is properly configured.""" - if service.current != ServiceStatus.ACTIVE: - if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed(container)): - logger.error( - "Restore failed: database service failed to reach point-in-time-recovery target. " - "You can launch another restore with different parameters" - ) - self.log_pitr_last_transaction_time() - self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR) - return False + if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed(container)): + logger.error( + "Restore failed: database service failed to reach point-in-time-recovery target. " + "You can launch another restore with different parameters" + ) + self.log_pitr_last_transaction_time() + self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR) + return False - if self.unit.status.message != CANNOT_RESTORE_PITR: - logger.error("Restore failed: database service failed to start") - self.unit.status = BlockedStatus("Failed to restore backup") + if ( + service.current != ServiceStatus.ACTIVE + and self.unit.status.message != CANNOT_RESTORE_PITR + ): + logger.error("Restore failed: database service failed to start") + self.unit.status = BlockedStatus("Failed to restore backup") return False if not self._patroni.member_started: - logger.debug("on_update_status early exit: Patroni has not started yet") + logger.debug("Restore check early exit: Patroni has not started yet") return False # Remove the restoring backup flag and the restore stanza name. @@ -2097,26 +2118,56 @@ def is_pitr_failed(self, container: Container) -> Tuple[bool, bool]: - Is patroni service failed to bootstrap cluster. - Is it new fail, that wasn't observed previously. """ - try: - log_exec = container.pebble.exec(["pebble", "logs", "postgresql"], combine_stderr=True) - patroni_logs = log_exec.wait_output()[0] - patroni_exceptions = re.findall( - r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$", - patroni_logs, - re.MULTILINE, - ) - except ExecError: # For Juju 2. - log_exec = container.pebble.exec(["cat", "/var/log/postgresql/patroni.log"]) - patroni_logs = log_exec.wait_output()[0] - patroni_exceptions = re.findall( - r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster", - patroni_logs, - re.MULTILINE, - ) + patroni_exceptions = [] + count = 0 + while len(patroni_exceptions) == 0 and count < 10: + try: + log_exec = container.pebble.exec( + ["pebble", "logs", "postgresql", "-n", "all"], combine_stderr=True + ) + patroni_logs = log_exec.wait_output()[0] + patroni_exceptions = re.findall( + r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$", + patroni_logs, + re.MULTILINE, + ) + except ExecError: # For Juju 2. + log_exec = container.pebble.exec(["cat", "/var/log/postgresql/patroni.log"]) + patroni_logs = log_exec.wait_output()[0] + patroni_exceptions = re.findall( + r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster", + patroni_logs, + re.MULTILINE, + ) + if len(patroni_exceptions) != 0: + break + # If no match, look at older logs + log_exec = container.pebble.exec([ + "find", + "/var/log/postgresql/", + "-name", + "'patroni.log.*'", + "-exec", + "cat", + "{}", + "+", + ]) + patroni_logs = log_exec.wait_output()[0] + patroni_exceptions = re.findall( + r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster", + patroni_logs, + re.MULTILINE, + ) + count += 1 + time.sleep(3) + if len(patroni_exceptions) > 0: + logger.debug("Failures to bootstrap cluster detected on Patroni service logs") old_pitr_fail_id = self.unit_peer_data.get("last_pitr_fail_id", None) self.unit_peer_data["last_pitr_fail_id"] = patroni_exceptions[-1] return True, patroni_exceptions[-1] != old_pitr_fail_id + + logger.debug("No failures detected on Patroni service logs") return False, False def log_pitr_last_transaction_time(self) -> None: diff --git a/src/upgrade.py b/src/upgrade.py index 14ecb4f875..18c7e10fd1 100644 --- a/src/upgrade.py +++ b/src/upgrade.py @@ -75,6 +75,9 @@ def _handle_label_change(self) -> None: @property def is_no_sync_member(self) -> bool: """Whether this member shouldn't be a synchronous standby (when it's a replica).""" + if not self.peer_relation: + return False + sync_standbys = self.peer_relation.data[self.charm.app].get("sync-standbys") if sync_standbys is None: return False diff --git a/tests/integration/ha_tests/test_upgrade_from_stable.py b/tests/integration/ha_tests/test_upgrade_from_stable.py index e47453a187..b702ab2790 100644 --- a/tests/integration/ha_tests/test_upgrade_from_stable.py +++ b/tests/integration/ha_tests/test_upgrade_from_stable.py @@ -28,7 +28,7 @@ logger = logging.getLogger(__name__) -TIMEOUT = 5 * 60 +TIMEOUT = 10 * 60 @pytest.mark.group(1) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 4997b125d9..823837cd48 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -747,7 +747,7 @@ async def switchover( candidate: The unit that should be elected the new primary. """ primary_ip = await get_unit_address(ops_test, current_primary) - for attempt in Retrying(stop=stop_after_attempt(4), wait=wait_fixed(5), reraise=True): + for attempt in Retrying(stop=stop_after_attempt(60), wait=wait_fixed(3), reraise=True): with attempt: response = requests.post( f"http://{primary_ip}:8008/switchover", diff --git a/tests/integration/new_relations/test_new_relations.py b/tests/integration/new_relations/test_new_relations.py index beb785bc49..e4e881f9db 100644 --- a/tests/integration/new_relations/test_new_relations.py +++ b/tests/integration/new_relations/test_new_relations.py @@ -91,7 +91,11 @@ async def test_database_relation_with_charm_libraries(ops_test: OpsTest, databas f"{APPLICATION_APP_NAME}:{FIRST_DATABASE_RELATION_NAME}", DATABASE_APP_NAME ) await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", raise_on_blocked=True, timeout=1000 + apps=[DATABASE_APP_NAME], + status="active", + raise_on_blocked=True, + raise_on_error=False, + timeout=1000, ) # Check that on juju 3 we have secrets and no username and password in the rel databag diff --git a/tests/integration/test_backups.py b/tests/integration/test_backups.py index 5ce1e5b4a4..a896212c98 100644 --- a/tests/integration/test_backups.py +++ b/tests/integration/test_backups.py @@ -194,9 +194,15 @@ async def test_backup_aws(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) - await ops_test.model.wait_for_idle(status="active", timeout=1000) # Remove the database app. - await ops_test.model.remove_application(database_app_name, block_until_done=True) + await ops_test.model.remove_application(database_app_name) + await ops_test.model.block_until( + lambda: database_app_name not in ops_test.model.applications, timeout=1000 + ) # Remove the TLS operator. - await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True) + await ops_test.model.remove_application(tls_certificates_app_name) + await ops_test.model.block_until( + lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000 + ) @pytest.mark.group(2) @@ -219,9 +225,15 @@ async def test_backup_gcp(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) - database_app_name = f"{DATABASE_APP_NAME}-gcp" # Remove the database app. - await ops_test.model.remove_application(database_app_name, block_until_done=True) + await ops_test.model.remove_application(database_app_name) + await ops_test.model.block_until( + lambda: database_app_name not in ops_test.model.applications, timeout=1000 + ) # Remove the TLS operator. - await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True) + await ops_test.model.remove_application(tls_certificates_app_name) + await ops_test.model.block_until( + lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000 + ) @pytest.mark.group(2) @@ -297,7 +309,7 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets) -> None database_app_name, 0, S3_INTEGRATOR_APP_NAME, - MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET, + ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE, ) # Check that the backup was correctly restored by having only the first created table. diff --git a/tests/integration/test_backups_pitr.py b/tests/integration/test_backups_pitr.py index e531ec6d02..df98545302 100644 --- a/tests/integration/test_backups_pitr.py +++ b/tests/integration/test_backups_pitr.py @@ -282,9 +282,15 @@ async def pitr_backup_operations( await ops_test.model.wait_for_idle(status="active", timeout=1000) # Remove the database app. - await ops_test.model.remove_application(database_app_name, block_until_done=True) + await ops_test.model.remove_application(database_app_name) + await ops_test.model.block_until( + lambda: database_app_name not in ops_test.model.applications, timeout=1000 + ) # Remove the TLS operator. - await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True) + await ops_test.model.remove_application(tls_certificates_app_name) + await ops_test.model.block_until( + lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000 + ) @pytest.mark.group(1) diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index 15ade275ee..b957e583db 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -440,7 +440,11 @@ async def test_redeploy_charm_same_model_after_forcing_removal(ops_test: OpsTest # This check is enough to ensure that the charm/workload is working for this specific test. await ops_test.model.wait_for_idle( - apps=[APP_NAME], status="active", timeout=1000, wait_for_exact_units=len(UNIT_IDS) + apps=[APP_NAME], + status="active", + timeout=1000, + wait_for_exact_units=len(UNIT_IDS), + raise_on_error=False, ) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index e3b4ee514b..e5b12813c7 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -24,7 +24,8 @@ from requests import ConnectionError from tenacity import RetryError, wait_fixed -from charm import PostgresqlOperatorCharm +from backups import MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET +from charm import EXTENSION_OBJECT_MESSAGE, PostgresqlOperatorCharm from constants import PEER, SECRET_INTERNAL_LABEL from patroni import NotReadyError from tests.unit.helpers import _FakeApiError @@ -448,6 +449,9 @@ def test_on_update_status(harness): patch( "charm.PostgresqlOperatorCharm._handle_processes_failures" ) as _handle_processes_failures, + patch( + "charm.PostgresqlOperatorCharm.enable_disable_extensions" + ) as _enable_disable_extensions, patch("charm.Patroni.member_started") as _member_started, patch("charm.Patroni.get_primary") as _get_primary, patch("ops.model.Container.pebble") as _pebble, @@ -468,6 +472,28 @@ def test_on_update_status(harness): harness.charm.on.update_status.emit() _get_primary.assert_not_called() + # Test unit in blocked status, without message. + _pebble.get_services.reset_mock() + harness.model.unit.status = BlockedStatus() + harness.charm.on.update_status.emit() + _enable_disable_extensions.assert_not_called() + _pebble.get_services.assert_not_called() + + # Test unit in blocked status due to blocking extensions. + harness.model.unit.status = BlockedStatus(EXTENSION_OBJECT_MESSAGE) + harness.charm.on.update_status.emit() + _enable_disable_extensions.assert_called_once() + _pebble.get_services.assert_called_once() + + # Test unit in blocked status due to restored cluster. + _pebble.get_services.reset_mock() + _enable_disable_extensions.reset_mock() + harness.model.unit.status = BlockedStatus(MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET) + harness.charm.on.update_status.emit() + _enable_disable_extensions.assert_not_called() + _pebble.get_services.assert_called_once() + harness.model.unit.status = ActiveStatus() + # Test when a failure need to be handled. _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)] _handle_processes_failures.return_value = True