[MISC] Fixes for PITR backup test instabilities (#690)
* try CI fixes

* reformat waits

* fix test charm + nits

* try fix wrong blocked messages

* add wildcard

* try another command

* small nits

* reposition pgdata storage check

* try fixing PITR checks

* check PITR fails even if service is active

* adapt on peer relation changed hook

* retry pitr fail check

* don't wait for remove relations in test_backups

* try fixing restore fail catching

* check all pg logs on pitr fail check

* revert previous error message

* final nits

* use walrus check

Signed-off-by: Lucas Gameiro Borges <lucas.borges@canonical.com>

* add coverage

---------

Signed-off-by: Lucas Gameiro Borges <lucas.borges@canonical.com>
lucasgameiroborges committed Sep 23, 2024
1 parent 61c68ce commit 3d1b508
Showing 9 changed files with 164 additions and 58 deletions.
143 changes: 97 additions & 46 deletions src/charm.py
@@ -11,6 +11,7 @@
 import re
 import shutil
 import sys
+import time
 from pathlib import Path
 from typing import Dict, List, Literal, Optional, Tuple, get_args

@@ -338,8 +339,7 @@ def get_secret(self, scope: Scopes, key: str) -> Optional[str]:
         if scope not in get_args(Scopes):
             raise RuntimeError("Unknown secret scope.")
 
-        peers = self.model.get_relation(PEER)
-        if not peers:
+        if not (peers := self.model.get_relation(PEER)):
             return None
 
         secret_key = self._translate_field_to_secret_key(key)

@@ -357,7 +357,9 @@ def set_secret(self, scope: Scopes, key: str, value: Optional[str]) -> Optional[
         if not value:
             return self.remove_secret(scope, key)
 
-        peers = self.model.get_relation(PEER)
+        if not (peers := self.model.get_relation(PEER)):
+            return None
+
         secret_key = self._translate_field_to_secret_key(key)
         # Old translation in databag is to be deleted
         self.peer_relation_data(scope).delete_relation_data(peers.id, [key])

@@ -368,7 +370,9 @@ def remove_secret(self, scope: Scopes, key: str) -> None:
         if scope not in get_args(Scopes):
             raise RuntimeError("Unknown secret scope.")
 
-        peers = self.model.get_relation(PEER)
+        if not (peers := self.model.get_relation(PEER)):
+            return None
+
         secret_key = self._translate_field_to_secret_key(key)
         if scope == APP_SCOPE:
             self.peer_relation_app.delete_relation_data(peers.id, [secret_key])

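All three secret helpers above apply the same refactor: bind the peer relation and test it in a single assignment expression. A minimal standalone sketch of the before/after pattern (illustrative names, not charm code):

    from typing import Optional

    def get_value(registry: dict, key: str) -> Optional[str]:
        # Before: bind, then test in a separate statement.
        rel = registry.get(key)
        if not rel:
            return None
        return rel.upper()

    def get_value_walrus(registry: dict, key: str) -> Optional[str]:
        # After: the walrus operator (Python 3.8+) binds and tests in one
        # expression, mirroring `if not (peers := self.model.get_relation(PEER)):`.
        if not (rel := registry.get(key)):
            return None
        return rel.upper()

Both variants behave identically; the walrus form drops one line per guard and keeps each binding next to its check.
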
@@ -551,11 +555,18 @@ def _on_peer_relation_changed(self, event: HookEvent) -> None:  # noqa: C901
             logger.error("Invalid configuration: %s", str(e))
             return
 
-        # If PITR restore failed, then wait it for resolve.
+        # Should not override a blocked status
+        if isinstance(self.unit.status, BlockedStatus):
+            logger.debug("on_peer_relation_changed early exit: Unit in blocked status")
+            return
+
+        services = container.pebble.get_services(names=[self._postgresql_service])
         if (
-            "restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data
-        ) and isinstance(self.unit.status, BlockedStatus):
-            event.defer()
+            ("restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data)
+            and len(services) > 0
+            and not self._was_restore_successful(container, services[0])
+        ):
+            logger.debug("on_peer_relation_changed early exit: Backup restore check failed")
             return
 
         # Validate the status of the member before setting an ActiveStatus.

@@ -1344,6 +1355,23 @@ def _on_update_status_early_exit_checks(self, container) -> bool:
         if not container.can_connect():
             logger.debug("on_update_status early exit: Cannot connect to container")
             return False
+
+        self._check_pgdata_storage_size()
+
+        if self._has_blocked_status or self._has_non_restore_waiting_status:
+            # If charm was failing to disable plugin, try again and continue (user may have removed the objects)
+            if self.unit.status.message == EXTENSION_OBJECT_MESSAGE:
+                self.enable_disable_extensions()
+                return True
+
+            if (
+                self.unit.status.message == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET
+                and "require-change-bucket-after-restore" not in self.app_peer_data
+            ):
+                return True
+
+            logger.debug("on_update_status early exit: Unit is in Blocked/Waiting status")
+            return False
         return True
 
     def _check_pgdata_storage_size(self) -> None:
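The helper above now owns the whole early-exit policy: the storage check always runs, and a Blocked/Waiting unit only proceeds in two recoverable cases. A condensed sketch of that gating, with plain arguments standing in for charm state (the message constants come from src/charm.py and src/backups.py):

    def should_proceed(
        blocked_or_waiting: bool,
        status_message: str,
        app_peer_data: dict,
        extension_message: str,
        restored_cluster_message: str,
    ) -> bool:
        # Mirrors _on_update_status_early_exit_checks: continue when the blocked
        # state is recoverable, otherwise stop while Blocked/Waiting.
        if not blocked_or_waiting:
            return True
        if status_message == extension_message:
            # Retry disabling extensions; the user may have removed the objects.
            return True
        if (
            status_message == restored_cluster_message
            and "require-change-bucket-after-restore" not in app_peer_data
        ):
            return True
        return False
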
@@ -1372,15 +1400,6 @@ def _on_update_status(self, _) -> None:
         if not self._on_update_status_early_exit_checks(container):
             return
 
-        self._check_pgdata_storage_size()
-
-        if self._has_blocked_status or self._has_non_restore_waiting_status:
-            # If charm was failing to disable plugin, try again (user may have removed the objects)
-            if self.unit.status.message == EXTENSION_OBJECT_MESSAGE:
-                self.enable_disable_extensions()
-            logger.debug("on_update_status early exit: Unit is in Blocked/Waiting status")
-            return
-
         services = container.pebble.get_services(names=[self._postgresql_service])
         if len(services) == 0:
             # Service has not been added nor started yet, so don't try to check Patroni API.

@@ -1419,23 +1438,25 @@
 
     def _was_restore_successful(self, container: Container, service: ServiceInfo) -> bool:
         """Checks if restore operation succeeded and S3 is properly configured."""
-        if service.current != ServiceStatus.ACTIVE:
-            if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed(container)):
-                logger.error(
-                    "Restore failed: database service failed to reach point-in-time-recovery target. "
-                    "You can launch another restore with different parameters"
-                )
-                self.log_pitr_last_transaction_time()
-                self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR)
-                return False
+        if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed(container)):
+            logger.error(
+                "Restore failed: database service failed to reach point-in-time-recovery target. "
+                "You can launch another restore with different parameters"
+            )
+            self.log_pitr_last_transaction_time()
+            self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR)
+            return False
 
-            if self.unit.status.message != CANNOT_RESTORE_PITR:
-                logger.error("Restore failed: database service failed to start")
-                self.unit.status = BlockedStatus("Failed to restore backup")
-            return False
+        if (
+            service.current != ServiceStatus.ACTIVE
+            and self.unit.status.message != CANNOT_RESTORE_PITR
+        ):
+            logger.error("Restore failed: database service failed to start")
+            self.unit.status = BlockedStatus("Failed to restore backup")
+            return False
 
         if not self._patroni.member_started:
-            logger.debug("on_update_status early exit: Patroni has not started yet")
+            logger.debug("Restore check early exit: Patroni has not started yet")
             return False
 
         # Remove the restoring backup flag and the restore stanza name.

@@ -2097,26 +2118,56 @@ def is_pitr_failed(self, container: Container) -> Tuple[bool, bool]:
         - Is patroni service failed to bootstrap cluster.
         - Is it new fail, that wasn't observed previously.
         """
-        try:
-            log_exec = container.pebble.exec(["pebble", "logs", "postgresql"], combine_stderr=True)
-            patroni_logs = log_exec.wait_output()[0]
-            patroni_exceptions = re.findall(
-                r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$",
-                patroni_logs,
-                re.MULTILINE,
-            )
-        except ExecError:  # For Juju 2.
-            log_exec = container.pebble.exec(["cat", "/var/log/postgresql/patroni.log"])
-            patroni_logs = log_exec.wait_output()[0]
-            patroni_exceptions = re.findall(
-                r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster",
-                patroni_logs,
-                re.MULTILINE,
-            )
+        patroni_exceptions = []
+        count = 0
+        while len(patroni_exceptions) == 0 and count < 10:
+            try:
+                log_exec = container.pebble.exec(
+                    ["pebble", "logs", "postgresql", "-n", "all"], combine_stderr=True
+                )
+                patroni_logs = log_exec.wait_output()[0]
+                patroni_exceptions = re.findall(
+                    r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$",
+                    patroni_logs,
+                    re.MULTILINE,
+                )
+            except ExecError:  # For Juju 2.
+                log_exec = container.pebble.exec(["cat", "/var/log/postgresql/patroni.log"])
+                patroni_logs = log_exec.wait_output()[0]
+                patroni_exceptions = re.findall(
+                    r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster",
+                    patroni_logs,
+                    re.MULTILINE,
+                )
+                if len(patroni_exceptions) != 0:
+                    break
+                # If no match, look at older logs
+                log_exec = container.pebble.exec([
+                    "find",
+                    "/var/log/postgresql/",
+                    "-name",
+                    "'patroni.log.*'",
+                    "-exec",
+                    "cat",
+                    "{}",
+                    "+",
+                ])
+                patroni_logs = log_exec.wait_output()[0]
+                patroni_exceptions = re.findall(
+                    r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster",
+                    patroni_logs,
+                    re.MULTILINE,
+                )
+            count += 1
+            time.sleep(3)
 
         if len(patroni_exceptions) > 0:
             logger.debug("Failures to bootstrap cluster detected on Patroni service logs")
             old_pitr_fail_id = self.unit_peer_data.get("last_pitr_fail_id", None)
             self.unit_peer_data["last_pitr_fail_id"] = patroni_exceptions[-1]
             return True, patroni_exceptions[-1] != old_pitr_fail_id
 
         logger.debug("No failures detected on Patroni service logs")
         return False, False
 
     def log_pitr_last_transaction_time(self) -> None:

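The is_pitr_failed rework above replaces a one-shot log read with a bounded poll: up to ten passes, three seconds apart, falling back to rotated Patroni logs when the live log has no match yet. A condensed, self-contained sketch of that retry shape (generic reader callables stand in for the Pebble exec calls):

    import re
    import time
    from typing import Callable, List

    def poll_logs_for_failure(
        read_live_logs: Callable[[], str],
        read_rotated_logs: Callable[[], str],
        pattern: str,
        attempts: int = 10,
        delay: float = 3.0,
    ) -> List[str]:
        # The failure line may not be flushed when the hook fires, so retry,
        # and consult rotated logs before sleeping and trying again.
        matches: List[str] = []
        count = 0
        while not matches and count < attempts:
            matches = re.findall(pattern, read_live_logs(), re.MULTILINE)
            if not matches:
                matches = re.findall(pattern, read_rotated_logs(), re.MULTILINE)
            if matches:
                break
            count += 1
            time.sleep(delay)
        return matches
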
3 changes: 3 additions & 0 deletions src/upgrade.py
@@ -75,6 +75,9 @@ def _handle_label_change(self) -> None:
     @property
     def is_no_sync_member(self) -> bool:
         """Whether this member shouldn't be a synchronous standby (when it's a replica)."""
+        if not self.peer_relation:
+            return False
+
         sync_standbys = self.peer_relation.data[self.charm.app].get("sync-standbys")
         if sync_standbys is None:
             return False

2 changes: 1 addition & 1 deletion tests/integration/ha_tests/test_upgrade_from_stable.py
@@ -28,7 +28,7 @@
 
 logger = logging.getLogger(__name__)
 
-TIMEOUT = 5 * 60
+TIMEOUT = 10 * 60
 
 
 @pytest.mark.group(1)

2 changes: 1 addition & 1 deletion tests/integration/helpers.py
@@ -747,7 +747,7 @@ async def switchover(
         candidate: The unit that should be elected the new primary.
     """
     primary_ip = await get_unit_address(ops_test, current_primary)
-    for attempt in Retrying(stop=stop_after_attempt(4), wait=wait_fixed(5), reraise=True):
+    for attempt in Retrying(stop=stop_after_attempt(60), wait=wait_fixed(3), reraise=True):
         with attempt:
             response = requests.post(
                 f"http://{primary_ip}:8008/switchover",

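This change grows the switchover retry budget from roughly 20 s (4 attempts × 5 s waits) to roughly 180 s (60 attempts × 3 s waits) without altering the loop's shape. The same tenacity pattern in isolation (hypothetical probe in place of the HTTP switchover call):

    from tenacity import Retrying, stop_after_attempt, wait_fixed

    def wait_until_switched(probe) -> None:
        # reraise=True surfaces the last exception if all 60 attempts fail.
        for attempt in Retrying(stop=stop_after_attempt(60), wait=wait_fixed(3), reraise=True):
            with attempt:
                probe()  # expected to raise until the switchover takes effect
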
6 changes: 5 additions & 1 deletion tests/integration/new_relations/test_new_relations.py
@@ -91,7 +91,11 @@ async def test_database_relation_with_charm_libraries(ops_test: OpsTest, databas
         f"{APPLICATION_APP_NAME}:{FIRST_DATABASE_RELATION_NAME}", DATABASE_APP_NAME
     )
     await ops_test.model.wait_for_idle(
-        apps=[DATABASE_APP_NAME], status="active", raise_on_blocked=True, timeout=1000
+        apps=[DATABASE_APP_NAME],
+        status="active",
+        raise_on_blocked=True,
+        raise_on_error=False,
+        timeout=1000,
     )
 
     # Check that on juju 3 we have secrets and no username and password in the rel databag

22 changes: 17 additions & 5 deletions tests/integration/test_backups.py
@@ -194,9 +194,15 @@ async def test_backup_aws(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -
         await ops_test.model.wait_for_idle(status="active", timeout=1000)
 
     # Remove the database app.
-    await ops_test.model.remove_application(database_app_name, block_until_done=True)
+    await ops_test.model.remove_application(database_app_name)
+    await ops_test.model.block_until(
+        lambda: database_app_name not in ops_test.model.applications, timeout=1000
+    )
     # Remove the TLS operator.
-    await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True)
+    await ops_test.model.remove_application(tls_certificates_app_name)
+    await ops_test.model.block_until(
+        lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000
+    )
 
 
 @pytest.mark.group(2)

@@ -219,9 +225,15 @@ async def test_backup_gcp(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -
     database_app_name = f"{DATABASE_APP_NAME}-gcp"
 
     # Remove the database app.
-    await ops_test.model.remove_application(database_app_name, block_until_done=True)
+    await ops_test.model.remove_application(database_app_name)
+    await ops_test.model.block_until(
+        lambda: database_app_name not in ops_test.model.applications, timeout=1000
+    )
     # Remove the TLS operator.
-    await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True)
+    await ops_test.model.remove_application(tls_certificates_app_name)
+    await ops_test.model.block_until(
+        lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000
+    )
 
 
 @pytest.mark.group(2)

@@ -297,7 +309,7 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets) -> None
         database_app_name,
         0,
         S3_INTEGRATOR_APP_NAME,
-        MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
+        ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
     )
 
     # Check that the backup was correctly restored by having only the first created table.

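The teardown edits in this file (mirrored in tests/integration/test_backups_pitr.py below) drop remove_application(..., block_until_done=True) in favor of an explicit block_until wait, which puts the 1000 s timeout under the test's control. The same python-libjuju pattern as a small helper (a sketch, assuming a connected Model instance):

    async def remove_and_wait(model, app_name: str, timeout: int = 1000) -> None:
        # Request removal, then poll the model until the application is gone.
        await model.remove_application(app_name)
        await model.block_until(
            lambda: app_name not in model.applications, timeout=timeout
        )
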
10 changes: 8 additions & 2 deletions tests/integration/test_backups_pitr.py
@@ -282,9 +282,15 @@ async def pitr_backup_operations(
     await ops_test.model.wait_for_idle(status="active", timeout=1000)
 
     # Remove the database app.
-    await ops_test.model.remove_application(database_app_name, block_until_done=True)
+    await ops_test.model.remove_application(database_app_name)
+    await ops_test.model.block_until(
+        lambda: database_app_name not in ops_test.model.applications, timeout=1000
+    )
     # Remove the TLS operator.
-    await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True)
+    await ops_test.model.remove_application(tls_certificates_app_name)
+    await ops_test.model.block_until(
+        lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000
+    )
 
 
 @pytest.mark.group(1)

6 changes: 5 additions & 1 deletion tests/integration/test_charm.py
@@ -440,7 +440,11 @@ async def test_redeploy_charm_same_model_after_forcing_removal(ops_test: OpsTest
 
     # This check is enough to ensure that the charm/workload is working for this specific test.
     await ops_test.model.wait_for_idle(
-        apps=[APP_NAME], status="active", timeout=1000, wait_for_exact_units=len(UNIT_IDS)
+        apps=[APP_NAME],
+        status="active",
+        timeout=1000,
+        wait_for_exact_units=len(UNIT_IDS),
+        raise_on_error=False,
     )

28 changes: 27 additions & 1 deletion tests/unit/test_charm.py
@@ -24,7 +24,8 @@
 from requests import ConnectionError
 from tenacity import RetryError, wait_fixed
 
-from charm import PostgresqlOperatorCharm
+from backups import MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET
+from charm import EXTENSION_OBJECT_MESSAGE, PostgresqlOperatorCharm
 from constants import PEER, SECRET_INTERNAL_LABEL
 from patroni import NotReadyError
 from tests.unit.helpers import _FakeApiError

@@ -448,6 +449,9 @@ def test_on_update_status(harness):
         patch(
             "charm.PostgresqlOperatorCharm._handle_processes_failures"
         ) as _handle_processes_failures,
+        patch(
+            "charm.PostgresqlOperatorCharm.enable_disable_extensions"
+        ) as _enable_disable_extensions,
         patch("charm.Patroni.member_started") as _member_started,
         patch("charm.Patroni.get_primary") as _get_primary,
         patch("ops.model.Container.pebble") as _pebble,

@@ -468,6 +472,28 @@ def test_on_update_status(harness):
         harness.charm.on.update_status.emit()
         _get_primary.assert_not_called()
 
+        # Test unit in blocked status, without message.
+        _pebble.get_services.reset_mock()
+        harness.model.unit.status = BlockedStatus()
+        harness.charm.on.update_status.emit()
+        _enable_disable_extensions.assert_not_called()
+        _pebble.get_services.assert_not_called()
+
+        # Test unit in blocked status due to blocking extensions.
+        harness.model.unit.status = BlockedStatus(EXTENSION_OBJECT_MESSAGE)
+        harness.charm.on.update_status.emit()
+        _enable_disable_extensions.assert_called_once()
+        _pebble.get_services.assert_called_once()
+
+        # Test unit in blocked status due to restored cluster.
+        _pebble.get_services.reset_mock()
+        _enable_disable_extensions.reset_mock()
+        harness.model.unit.status = BlockedStatus(MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET)
+        harness.charm.on.update_status.emit()
+        _enable_disable_extensions.assert_not_called()
+        _pebble.get_services.assert_called_once()
+        harness.model.unit.status = ActiveStatus()
+
         # Test when a failure need to be handled.
         _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
         _handle_processes_failures.return_value = True
