[MISC] Fixes for PITR backup test instabilities #690

Merged · 21 commits · Sep 23, 2024
143 changes: 97 additions & 46 deletions src/charm.py
@@ -11,6 +11,7 @@
import re
import shutil
import sys
import time
from pathlib import Path
from typing import Dict, List, Literal, Optional, Tuple, get_args

@@ -336,8 +337,7 @@ def get_secret(self, scope: Scopes, key: str) -> Optional[str]:
if scope not in get_args(Scopes):
raise RuntimeError("Unknown secret scope.")

peers = self.model.get_relation(PEER)
if not peers:
if not (peers := self.model.get_relation(PEER)):
return None

secret_key = self._translate_field_to_secret_key(key)
@@ -355,7 +355,9 @@ def set_secret(self, scope: Scopes, key: str, value: Optional[str]) -> Optional[
if not value:
return self.remove_secret(scope, key)

peers = self.model.get_relation(PEER)
if not (peers := self.model.get_relation(PEER)):
return None

secret_key = self._translate_field_to_secret_key(key)
# Old translation in databag is to be deleted
self.peer_relation_data(scope).delete_relation_data(peers.id, [key])
@@ -366,7 +368,9 @@ def remove_secret(self, scope: Scopes, key: str) -> None:
if scope not in get_args(Scopes):
raise RuntimeError("Unknown secret scope.")

peers = self.model.get_relation(PEER)
if not (peers := self.model.get_relation(PEER)):
return None

secret_key = self._translate_field_to_secret_key(key)
if scope == APP_SCOPE:
self.peer_relation_app.delete_relation_data(peers.id, [secret_key])
@@ -549,11 +553,18 @@ def _on_peer_relation_changed(self, event: HookEvent) -> None: # noqa: C901
logger.error("Invalid configuration: %s", str(e))
return

# If PITR restore failed, then wait it for resolve.
# Should not override a blocked status
if isinstance(self.unit.status, BlockedStatus):
logger.debug("on_peer_relation_changed early exit: Unit in blocked status")
return

services = container.pebble.get_services(names=[self._postgresql_service])
if (
"restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data
) and isinstance(self.unit.status, BlockedStatus):
event.defer()
("restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data)
and len(services) > 0
and not self._was_restore_successful(container, services[0])
):
logger.debug("on_peer_relation_changed early exit: Backup restore check failed")
@lucasgameiroborges (Member, Author) commented on Sep 17, 2024:
It was observed in some cases that, after a PITR restore failed, the first event to run following the failure would be on_peer_relation_changed instead of update_status (where the backup failure checks were located). When that happened, the charm would try to ping Patroni, fail, enter the awaiting-for-member-to-start status, and defer endlessly, never knowing it was a backup failure. This condition is meant to avoid that.

return

# Validate the status of the member before setting an ActiveStatus.
@@ -1338,6 +1349,23 @@ def _on_update_status_early_exit_checks(self, container) -> bool:
if not container.can_connect():
logger.debug("on_update_status early exit: Cannot connect to container")
return False

self._check_pgdata_storage_size()

if self._has_blocked_status or self._has_non_restore_waiting_status:
# If charm was failing to disable plugin, try again and continue (user may have removed the objects)
if self.unit.status.message == EXTENSION_OBJECT_MESSAGE:
self.enable_disable_extensions()
return True

if (
self.unit.status.message == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET
and "require-change-bucket-after-restore" not in self.app_peer_data
):
@lucasgameiroborges (Member, Author) commented on Sep 17, 2024:
This extra condition is meant to allow the MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET blocked status to resolve inside the update_status event when the flag is no longer there, avoiding a potential endless block. The whole check was moved to _on_update_status_early_exit_checks to avoid complexity inside _on_update_status.

return True

logger.debug("on_update_status early exit: Unit is in Blocked/Waiting status")
return False
return True

def _check_pgdata_storage_size(self) -> None:
@@ -1366,15 +1394,6 @@ def _on_update_status(self, _) -> None:
if not self._on_update_status_early_exit_checks(container):
return

self._check_pgdata_storage_size()

if self._has_blocked_status or self._has_non_restore_waiting_status:
# If charm was failing to disable plugin, try again (user may have removed the objects)
if self.unit.status.message == EXTENSION_OBJECT_MESSAGE:
self.enable_disable_extensions()
logger.debug("on_update_status early exit: Unit is in Blocked/Waiting status")
return

services = container.pebble.get_services(names=[self._postgresql_service])
if len(services) == 0:
# Service has not been added nor started yet, so don't try to check Patroni API.
@@ -1413,23 +1432,25 @@

def _was_restore_successful(self, container: Container, service: ServiceInfo) -> bool:
"""Checks if restore operation succeeded and S3 is properly configured."""
if service.current != ServiceStatus.ACTIVE:
if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed(container)):
logger.error(
"Restore failed: database service failed to reach point-in-time-recovery target. "
"You can launch another restore with different parameters"
)
self.log_pitr_last_transaction_time()
self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR)
return False
if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed(container)):
PR author:
Oftentimes, on Juju 3 specifically, the restore would fail (making Patroni unresponsive) but the service status itself would stay active for a while. This caused the charm to think Patroni was simply not ready yet on the unit, never catching the underlying failure. Depending on the ordering of the events that followed, this could leave the charm in waiting status and never in blocked.

Reviewer (Member):
Interesting findings!

logger.error(
"Restore failed: database service failed to reach point-in-time-recovery target. "
"You can launch another restore with different parameters"
)
self.log_pitr_last_transaction_time()
self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR)
return False

if self.unit.status.message != CANNOT_RESTORE_PITR:
logger.error("Restore failed: database service failed to start")
self.unit.status = BlockedStatus("Failed to restore backup")
if (
service.current != ServiceStatus.ACTIVE
and self.unit.status.message != CANNOT_RESTORE_PITR
):
Comment on lines +1444 to +1447
Reviewer (Member):
I suspect that this block is being reached earlier than the one above on Juju 2.9 because of the status message (Failed to restore backup) of the unit in the failed tests:

PR author:
That status is set here after the PITR error check, so I think the reason this block runs before the one above is that this check failed when it should've succeeded.

Anyway, I agree that this should be investigated in another PR; it probably won't be a quick fix, unfortunately.

logger.error("Restore failed: database service failed to start")
self.unit.status = BlockedStatus("Failed to restore backup")
return False

if not self._patroni.member_started:
logger.debug("on_update_status early exit: Patroni has not started yet")
logger.debug("Restore check early exit: Patroni has not started yet")
return False

# Remove the restoring backup flag and the restore stanza name.
@@ -2091,26 +2112,56 @@ def is_pitr_failed(self, container: Container) -> Tuple[bool, bool]:
- Is patroni service failed to bootstrap cluster.
- Is it new fail, that wasn't observed previously.
"""
try:
log_exec = container.pebble.exec(["pebble", "logs", "postgresql"], combine_stderr=True)
patroni_logs = log_exec.wait_output()[0]
patroni_exceptions = re.findall(
r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$",
patroni_logs,
re.MULTILINE,
)
except ExecError: # For Juju 2.
log_exec = container.pebble.exec(["cat", "/var/log/postgresql/patroni.log"])
patroni_logs = log_exec.wait_output()[0]
patroni_exceptions = re.findall(
r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster",
patroni_logs,
re.MULTILINE,
)
patroni_exceptions = []
count = 0
while len(patroni_exceptions) == 0 and count < 10:
PR author:
This way of checking the logs for failures is not very stable and often doesn't catch the errors. My changes here only aim to decrease the likelihood of false negatives, but integration tests can still fail on the first try, especially on Juju 2.9. We probably need to redo this check using another approach.
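
For illustration only, a minimal sketch (not part of this PR) of how the same polling could be written with tenacity, which the project already uses elsewhere. It assumes the same pebble log command and regex as in the hunk above, omits the Juju 2 fallback for brevity, and whether it would actually be more reliable is an open question.

import re

from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed

BOOTSTRAP_FAILURE_PATTERN = (
    r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: "
    r"Failed to bootstrap cluster$"
)

def find_bootstrap_failures(container) -> list:
    # Same pebble log command as in the hunk above (Juju 3 path only).
    log_exec = container.pebble.exec(
        ["pebble", "logs", "postgresql", "-n", "all"], combine_stderr=True
    )
    patroni_logs = log_exec.wait_output()[0]
    return re.findall(BOOTSTRAP_FAILURE_PATTERN, patroni_logs, re.MULTILINE)

def poll_for_bootstrap_failures(container) -> list:
    # Retry the log check up to 10 times, 3 seconds apart, like the while loop above.
    try:
        for attempt in Retrying(stop=stop_after_attempt(10), wait=wait_fixed(3)):
            with attempt:
                failures = find_bootstrap_failures(container)
                if not failures:
                    raise RuntimeError("no bootstrap failure found yet")  # triggers a retry
                return failures
    except RetryError:
        return []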

try:
log_exec = container.pebble.exec(
["pebble", "logs", "postgresql", "-n", "all"], combine_stderr=True
)
patroni_logs = log_exec.wait_output()[0]
patroni_exceptions = re.findall(
r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$",
patroni_logs,
re.MULTILINE,
)
except ExecError: # For Juju 2.
log_exec = container.pebble.exec(["cat", "/var/log/postgresql/patroni.log"])
patroni_logs = log_exec.wait_output()[0]
patroni_exceptions = re.findall(
r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster",
patroni_logs,
re.MULTILINE,
)
if len(patroni_exceptions) != 0:
break
# If no match, look at older logs
log_exec = container.pebble.exec([
"find",
"/var/log/postgresql/",
"-name",
"'patroni.log.*'",
"-exec",
"cat",
"{}",
"+",
])
patroni_logs = log_exec.wait_output()[0]
patroni_exceptions = re.findall(
r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster",
patroni_logs,
re.MULTILINE,
)
count += 1
time.sleep(3)

if len(patroni_exceptions) > 0:
logger.debug("Failures to bootstrap cluster detected on Patroni service logs")
old_pitr_fail_id = self.unit_peer_data.get("last_pitr_fail_id", None)
self.unit_peer_data["last_pitr_fail_id"] = patroni_exceptions[-1]
return True, patroni_exceptions[-1] != old_pitr_fail_id

logger.debug("No failures detected on Patroni service logs")
return False, False

def log_pitr_last_transaction_time(self) -> None:
3 changes: 3 additions & 0 deletions src/upgrade.py
@@ -75,6 +75,9 @@ def _handle_label_change(self) -> None:
@property
def is_no_sync_member(self) -> bool:
"""Whether this member shouldn't be a synchronous standby (when it's a replica)."""
if not self.peer_relation:
return False

sync_standbys = self.peer_relation.data[self.charm.app].get("sync-standbys")
if sync_standbys is None:
return False
2 changes: 1 addition & 1 deletion tests/integration/ha_tests/test_upgrade_from_stable.py
@@ -28,7 +28,7 @@

logger = logging.getLogger(__name__)

TIMEOUT = 5 * 60
TIMEOUT = 10 * 60


@pytest.mark.group(1)
2 changes: 1 addition & 1 deletion tests/integration/helpers.py
@@ -744,7 +744,7 @@ async def switchover(
candidate: The unit that should be elected the new primary.
"""
primary_ip = await get_unit_address(ops_test, current_primary)
for attempt in Retrying(stop=stop_after_attempt(4), wait=wait_fixed(5), reraise=True):
for attempt in Retrying(stop=stop_after_attempt(60), wait=wait_fixed(3), reraise=True):
PR author:
Probably a bit overkill, but this was affecting test_backups, which is an annoying test to retry, and these values helped.

Reviewer (Member):
Out of curiosity, do you have some logs from a failure related to the previous values?

@lucasgameiroborges (Member, Author) replied on Sep 19, 2024:
There are two examples in the commits of this PR:
https://github.com/canonical/postgresql-k8s-operator/actions/runs/10894200338/job/30260414050#step:31:395

https://github.com/canonical/postgresql-k8s-operator/actions/runs/10893225406/job/30227925207#step:31:491

Another case, which I didn't find a quick example of but have seen, is when the assert for status 200 fails during retries because Patroni returns 412: no valid candidates for switchover.

with attempt:
response = requests.post(
f"http://{primary_ip}:8008/switchover",
6 changes: 5 additions & 1 deletion tests/integration/new_relations/test_new_relations.py
@@ -91,7 +91,11 @@ async def test_database_relation_with_charm_libraries(ops_test: OpsTest, databas
f"{APPLICATION_APP_NAME}:{FIRST_DATABASE_RELATION_NAME}", DATABASE_APP_NAME
)
await ops_test.model.wait_for_idle(
apps=[DATABASE_APP_NAME], status="active", raise_on_blocked=True, timeout=1000
apps=[DATABASE_APP_NAME],
status="active",
raise_on_blocked=True,
raise_on_error=False,
PR author:
raise_on_error=False was missing here; another one is added below.

timeout=1000,
)

# Check that on juju 3 we have secrets and no username and password in the rel databag
22 changes: 17 additions & 5 deletions tests/integration/test_backups.py
@@ -194,9 +194,15 @@ async def test_backup_aws(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -
await ops_test.model.wait_for_idle(status="active", timeout=1000)

# Remove the database app.
await ops_test.model.remove_application(database_app_name, block_until_done=True)
await ops_test.model.remove_application(database_app_name)
PR author:
This change (and the other similar ones) aims to avoid the case where, for some reason, the application removal gets stuck and never returns, causing the test to hang for two hours (until the CI job itself times out). Because remove_application() does not have a timeout parameter on Juju 2.9, I used a regular block_until with a timeout.
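
A minimal sketch of a hypothetical helper wrapping the remove-then-block pattern used below; the helper name and signature are illustrative only, not part of this PR:

async def remove_application_with_timeout(ops_test, app_name: str, timeout: int = 1000) -> None:
    # remove_application() has no timeout parameter on Juju 2.9, so block separately
    # with a bounded wait instead of block_until_done=True.
    await ops_test.model.remove_application(app_name)
    await ops_test.model.block_until(
        lambda: app_name not in ops_test.model.applications, timeout=timeout
    )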

await ops_test.model.block_until(
lambda: database_app_name not in ops_test.model.applications, timeout=1000
)
# Remove the TLS operator.
await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True)
await ops_test.model.remove_application(tls_certificates_app_name)
await ops_test.model.block_until(
lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000
)


@pytest.mark.group(2)
@@ -219,9 +225,15 @@ async def test_backup_gcp(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -
database_app_name = f"{DATABASE_APP_NAME}-gcp"

# Remove the database app.
await ops_test.model.remove_application(database_app_name, block_until_done=True)
await ops_test.model.remove_application(database_app_name)
await ops_test.model.block_until(
lambda: database_app_name not in ops_test.model.applications, timeout=1000
)
# Remove the TLS operator.
await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True)
await ops_test.model.remove_application(tls_certificates_app_name)
await ops_test.model.block_until(
lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000
)


@pytest.mark.group(2)
@@ -297,7 +309,7 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets) -> None
database_app_name,
0,
S3_INTEGRATOR_APP_NAME,
MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
PR author:
Reverting a change made in the PITR PR that was not fully appropriate (possibly motivated by the issue observed in https://github.com/canonical/postgresql-k8s-operator/pull/690/files#r1764218285). Double-checked with @marceloneppel here.

)

# Check that the backup was correctly restored by having only the first created table.
10 changes: 8 additions & 2 deletions tests/integration/test_backups_pitr.py
@@ -282,9 +282,15 @@ async def pitr_backup_operations(
await ops_test.model.wait_for_idle(status="active", timeout=1000)

# Remove the database app.
await ops_test.model.remove_application(database_app_name, block_until_done=True)
await ops_test.model.remove_application(database_app_name)
await ops_test.model.block_until(
lambda: database_app_name not in ops_test.model.applications, timeout=1000
)
# Remove the TLS operator.
await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True)
await ops_test.model.remove_application(tls_certificates_app_name)
await ops_test.model.block_until(
lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000
)


@pytest.mark.group(1)
Expand Down
6 changes: 5 additions & 1 deletion tests/integration/test_charm.py
@@ -440,7 +440,11 @@ async def test_redeploy_charm_same_model_after_forcing_removal(ops_test: OpsTest

# This check is enough to ensure that the charm/workload is working for this specific test.
await ops_test.model.wait_for_idle(
apps=[APP_NAME], status="active", timeout=1000, wait_for_exact_units=len(UNIT_IDS)
apps=[APP_NAME],
status="active",
timeout=1000,
wait_for_exact_units=len(UNIT_IDS),
raise_on_error=False,
)


28 changes: 27 additions & 1 deletion tests/unit/test_charm.py
@@ -24,7 +24,8 @@
from requests import ConnectionError
from tenacity import RetryError, wait_fixed

from charm import PostgresqlOperatorCharm
from backups import MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET
from charm import EXTENSION_OBJECT_MESSAGE, PostgresqlOperatorCharm
from constants import PEER, SECRET_INTERNAL_LABEL
from patroni import NotReadyError
from tests.unit.helpers import _FakeApiError
@@ -448,6 +449,9 @@ def test_on_update_status(harness):
patch(
"charm.PostgresqlOperatorCharm._handle_processes_failures"
) as _handle_processes_failures,
patch(
"charm.PostgresqlOperatorCharm.enable_disable_extensions"
) as _enable_disable_extensions,
patch("charm.Patroni.member_started") as _member_started,
patch("charm.Patroni.get_primary") as _get_primary,
patch("ops.model.Container.pebble") as _pebble,
@@ -468,6 +472,28 @@
harness.charm.on.update_status.emit()
_get_primary.assert_not_called()

# Test unit in blocked status, without message.
_pebble.get_services.reset_mock()
harness.model.unit.status = BlockedStatus()
harness.charm.on.update_status.emit()
_enable_disable_extensions.assert_not_called()
_pebble.get_services.assert_not_called()

# Test unit in blocked status due to blocking extensions.
harness.model.unit.status = BlockedStatus(EXTENSION_OBJECT_MESSAGE)
harness.charm.on.update_status.emit()
_enable_disable_extensions.assert_called_once()
_pebble.get_services.assert_called_once()

# Test unit in blocked status due to restored cluster.
_pebble.get_services.reset_mock()
_enable_disable_extensions.reset_mock()
harness.model.unit.status = BlockedStatus(MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET)
harness.charm.on.update_status.emit()
_enable_disable_extensions.assert_not_called()
_pebble.get_services.assert_called_once()
harness.model.unit.status = ActiveStatus()

# Test when a failure need to be handled.
_pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
_handle_processes_failures.return_value = True