[MISC] Fixes for PITR backup test instabilities #690

Merged · 21 commits · Sep 23, 2024
143 changes: 97 additions & 46 deletions src/charm.py
@@ -11,6 +11,7 @@
import re
import shutil
import sys
import time
from pathlib import Path
from typing import Dict, List, Literal, Optional, Tuple, get_args

@@ -336,8 +337,7 @@ def get_secret(self, scope: Scopes, key: str) -> Optional[str]:
if scope not in get_args(Scopes):
raise RuntimeError("Unknown secret scope.")

peers = self.model.get_relation(PEER)
if not peers:
if not (peers := self.model.get_relation(PEER)):
return None

secret_key = self._translate_field_to_secret_key(key)
@@ -355,7 +355,9 @@ def set_secret(self, scope: Scopes, key: str, value: Optional[str]) -> Optional[
if not value:
return self.remove_secret(scope, key)

peers = self.model.get_relation(PEER)
if not (peers := self.model.get_relation(PEER)):
return None

secret_key = self._translate_field_to_secret_key(key)
# Old translation in databag is to be deleted
self.peer_relation_data(scope).delete_relation_data(peers.id, [key])
@@ -366,7 +368,9 @@ def remove_secret(self, scope: Scopes, key: str) -> None:
if scope not in get_args(Scopes):
raise RuntimeError("Unknown secret scope.")

peers = self.model.get_relation(PEER)
if not (peers := self.model.get_relation(PEER)):
return None

secret_key = self._translate_field_to_secret_key(key)
if scope == APP_SCOPE:
self.peer_relation_app.delete_relation_data(peers.id, [secret_key])
@@ -549,11 +553,18 @@ def _on_peer_relation_changed(self, event: HookEvent) -> None: # noqa: C901
logger.error("Invalid configuration: %s", str(e))
return

# If PITR restore failed, then wait it for resolve.
# Should not override a blocked status
if isinstance(self.unit.status, BlockedStatus):
logger.debug("on_peer_relation_changed early exit: Unit in blocked status")
return

services = container.pebble.get_services(names=[self._postgresql_service])
if (
"restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data
) and isinstance(self.unit.status, BlockedStatus):
event.defer()
("restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data)
and len(services) > 0
and not self._was_restore_successful(container, services[0])
):
logger.debug("on_peer_relation_changed early exit: Backup restore check failed")
@lucasgameiroborges (Member, Author) commented on Sep 17, 2024:
It was observed in some cases that, after a PITR restore failed, the first event to run following the failure would be on_peer_relation_changed instead of update_status (where the backup failure checks were located). When that happened, the charm would try to ping Patroni, fail, enter the awaiting-for-member-to-start status, and defer endlessly, never knowing it was a backup failure. This condition is meant to avoid that.

return

# Validate the status of the member before setting an ActiveStatus.
@@ -1338,6 +1349,23 @@ def _on_update_status_early_exit_checks(self, container) -> bool:
if not container.can_connect():
logger.debug("on_update_status early exit: Cannot connect to container")
return False

self._check_pgdata_storage_size()

if self._has_blocked_status or self._has_non_restore_waiting_status:
# If charm was failing to disable plugin, try again and continue (user may have removed the objects)
if self.unit.status.message == EXTENSION_OBJECT_MESSAGE:
self.enable_disable_extensions()
return True

if (
self.unit.status.message == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET
and "require-change-bucket-after-restore" not in self.app_peer_data
):
@lucasgameiroborges (Member, Author) commented on Sep 17, 2024:
This extra condition is meant to allow the MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET blocked status to resolve inside the update_status event when the flag is no longer there, avoiding a potential endless block. The whole check was moved to _on_update_status_early_exit_checks to avoid complexity inside _on_update_status.

return True

logger.debug("on_update_status early exit: Unit is in Blocked/Waiting status")
return False
return True

def _check_pgdata_storage_size(self) -> None:
@@ -1366,15 +1394,6 @@ def _on_update_status(self, _) -> None:
if not self._on_update_status_early_exit_checks(container):
return

self._check_pgdata_storage_size()

if self._has_blocked_status or self._has_non_restore_waiting_status:
# If charm was failing to disable plugin, try again (user may have removed the objects)
if self.unit.status.message == EXTENSION_OBJECT_MESSAGE:
self.enable_disable_extensions()
logger.debug("on_update_status early exit: Unit is in Blocked/Waiting status")
return

services = container.pebble.get_services(names=[self._postgresql_service])
if len(services) == 0:
# Service has not been added nor started yet, so don't try to check Patroni API.
@@ -1413,23 +1432,25 @@

def _was_restore_successful(self, container: Container, service: ServiceInfo) -> bool:
"""Checks if restore operation succeeded and S3 is properly configured."""
if service.current != ServiceStatus.ACTIVE:
if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed(container)):
logger.error(
"Restore failed: database service failed to reach point-in-time-recovery target. "
"You can launch another restore with different parameters"
)
self.log_pitr_last_transaction_time()
self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR)
return False
if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed(container)):
PR author:
Oftentimes, on Juju 3 specifically, the restore would fail (making Patroni unresponsive) but the service status itself would stay active for a while. This caused the charm to think Patroni was simply not ready yet on the unit, never catching the underlying failure. Depending on the ordering of the events that followed, this could leave the charm in waiting status and never in blocked.

Reviewer (Member):
Interesting findings!

logger.error(
"Restore failed: database service failed to reach point-in-time-recovery target. "
"You can launch another restore with different parameters"
)
self.log_pitr_last_transaction_time()
self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR)
return False

if self.unit.status.message != CANNOT_RESTORE_PITR:
logger.error("Restore failed: database service failed to start")
self.unit.status = BlockedStatus("Failed to restore backup")
if (
service.current != ServiceStatus.ACTIVE
and self.unit.status.message != CANNOT_RESTORE_PITR
):
Comment on lines +1444 to +1447
Reviewer (Member):
I suspect that this block is being reached earlier than the one above on Juju 2.9 because of the status message (Failed to restore backup) of the unit in the failed tests:

PR author:
That status is set here after the PITR error check, so I think the reason this block runs before the one above is that this check failed when it should've succeeded.

Anyway, I agree that this should be investigated in another PR; it probably won't be a quick fix, unfortunately.

logger.error("Restore failed: database service failed to start")
self.unit.status = BlockedStatus("Failed to restore backup")
return False

if not self._patroni.member_started:
logger.debug("on_update_status early exit: Patroni has not started yet")
logger.debug("Restore check early exit: Patroni has not started yet")
return False

# Remove the restoring backup flag and the restore stanza name.
@@ -2091,26 +2112,56 @@ def is_pitr_failed(self, container: Container) -> Tuple[bool, bool]:
- Is patroni service failed to bootstrap cluster.
- Is it new fail, that wasn't observed previously.
"""
try:
log_exec = container.pebble.exec(["pebble", "logs", "postgresql"], combine_stderr=True)
patroni_logs = log_exec.wait_output()[0]
patroni_exceptions = re.findall(
r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$",
patroni_logs,
re.MULTILINE,
)
except ExecError: # For Juju 2.
log_exec = container.pebble.exec(["cat", "/var/log/postgresql/patroni.log"])
patroni_logs = log_exec.wait_output()[0]
patroni_exceptions = re.findall(
r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster",
patroni_logs,
re.MULTILINE,
)
patroni_exceptions = []
count = 0
while len(patroni_exceptions) == 0 and count < 10:
PR author:
This way of checking the logs for failures is not very stable and often doesn't catch the errors. My changes here only aim to decrease the likelihood of false negatives, but integration tests can still fail on the first try, especially on Juju 2.9. We probably need to redo this check using another approach.
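
For illustration only, a minimal sketch (not part of this PR) of how the same polling could be written with tenacity, which the project already uses elsewhere. It assumes the same pebble log command and regex as in the hunk above, omits the Juju 2 fallback for brevity, and whether it would actually be more reliable is an open question.

import re

from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed

BOOTSTRAP_FAILURE_PATTERN = (
    r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: "
    r"Failed to bootstrap cluster$"
)

def find_bootstrap_failures(container) -> list:
    # Same pebble log command as in the hunk above (Juju 3 path only).
    log_exec = container.pebble.exec(
        ["pebble", "logs", "postgresql", "-n", "all"], combine_stderr=True
    )
    patroni_logs = log_exec.wait_output()[0]
    return re.findall(BOOTSTRAP_FAILURE_PATTERN, patroni_logs, re.MULTILINE)

def poll_for_bootstrap_failures(container) -> list:
    # Retry the log check up to 10 times, 3 seconds apart, like the while loop above.
    try:
        for attempt in Retrying(stop=stop_after_attempt(10), wait=wait_fixed(3)):
            with attempt:
                failures = find_bootstrap_failures(container)
                if not failures:
                    raise RuntimeError("no bootstrap failure found yet")  # triggers a retry
                return failures
    except RetryError:
        return []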

try:
log_exec = container.pebble.exec(
["pebble", "logs", "postgresql", "-n", "all"], combine_stderr=True
)
patroni_logs = log_exec.wait_output()[0]
patroni_exceptions = re.findall(
r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$",
patroni_logs,
re.MULTILINE,
)
except ExecError: # For Juju 2.
log_exec = container.pebble.exec(["cat", "/var/log/postgresql/patroni.log"])
patroni_logs = log_exec.wait_output()[0]
patroni_exceptions = re.findall(
r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster",
patroni_logs,
re.MULTILINE,
)
if len(patroni_exceptions) != 0:
break
# If no match, look at older logs
log_exec = container.pebble.exec([
"find",
"/var/log/postgresql/",
"-name",
"'patroni.log.*'",
"-exec",
"cat",
"{}",
"+",
])
patroni_logs = log_exec.wait_output()[0]
patroni_exceptions = re.findall(
r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster",
patroni_logs,
re.MULTILINE,
)
count += 1
time.sleep(3)

if len(patroni_exceptions) > 0:
logger.debug("Failures to bootstrap cluster detected on Patroni service logs")
old_pitr_fail_id = self.unit_peer_data.get("last_pitr_fail_id", None)
self.unit_peer_data["last_pitr_fail_id"] = patroni_exceptions[-1]
return True, patroni_exceptions[-1] != old_pitr_fail_id

logger.debug("No failures detected on Patroni service logs")
return False, False

def log_pitr_last_transaction_time(self) -> None:
3 changes: 3 additions & 0 deletions src/upgrade.py
@@ -75,6 +75,9 @@ def _handle_label_change(self) -> None:
@property
def is_no_sync_member(self) -> bool:
"""Whether this member shouldn't be a synchronous standby (when it's a replica)."""
if not self.peer_relation:
return False

sync_standbys = self.peer_relation.data[self.charm.app].get("sync-standbys")
if sync_standbys is None:
return False
2 changes: 1 addition & 1 deletion tests/integration/ha_tests/test_upgrade_from_stable.py
@@ -28,7 +28,7 @@

logger = logging.getLogger(__name__)

TIMEOUT = 5 * 60
TIMEOUT = 10 * 60


@pytest.mark.group(1)
2 changes: 1 addition & 1 deletion tests/integration/helpers.py
@@ -744,7 +744,7 @@ async def switchover(
candidate: The unit that should be elected the new primary.
"""
primary_ip = await get_unit_address(ops_test, current_primary)
for attempt in Retrying(stop=stop_after_attempt(4), wait=wait_fixed(5), reraise=True):
for attempt in Retrying(stop=stop_after_attempt(60), wait=wait_fixed(3), reraise=True):
PR author:
Probably a bit overkill, but this was affecting test_backups, which is an annoying test to retry, and these values helped.

Reviewer (Member):
Out of curiosity, do you have some logs from a failure related to the previous values?

@lucasgameiroborges (Member, Author) replied on Sep 19, 2024:
There are two examples in the commits of this PR:
https://github.com/canonical/postgresql-k8s-operator/actions/runs/10894200338/job/30260414050#step:31:395

https://github.com/canonical/postgresql-k8s-operator/actions/runs/10893225406/job/30227925207#step:31:491

Another case, which I didn't find a quick example of but have seen, is when the assert for status 200 fails during retries because Patroni returns 412: no valid candidates for switchover.

with attempt:
response = requests.post(
f"http://{primary_ip}:8008/switchover",
6 changes: 5 additions & 1 deletion tests/integration/new_relations/test_new_relations.py
@@ -91,7 +91,11 @@ async def test_database_relation_with_charm_libraries(ops_test: OpsTest, databas
f"{APPLICATION_APP_NAME}:{FIRST_DATABASE_RELATION_NAME}", DATABASE_APP_NAME
)
await ops_test.model.wait_for_idle(
apps=[DATABASE_APP_NAME], status="active", raise_on_blocked=True, timeout=1000
apps=[DATABASE_APP_NAME],
status="active",
raise_on_blocked=True,
raise_on_error=False,
PR author:
raise_on_error=False was missing here; another one is added below.

timeout=1000,
)

# Check that on juju 3 we have secrets and no username and password in the rel databag
22 changes: 17 additions & 5 deletions tests/integration/test_backups.py
@@ -194,9 +194,15 @@ async def test_backup_aws(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -
await ops_test.model.wait_for_idle(status="active", timeout=1000)

# Remove the database app.
await ops_test.model.remove_application(database_app_name, block_until_done=True)
await ops_test.model.remove_application(database_app_name)
PR author:
This change (and the other similar ones) aims to avoid the case where, for some reason, the application removal gets stuck and never returns, causing the test to hang for two hours (until the CI job itself times out). Because remove_application() does not have a timeout parameter on Juju 2.9, I used a regular block_until with a timeout.
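
A minimal sketch of a hypothetical helper wrapping the remove-then-block pattern used below; the helper name and signature are illustrative only, not part of this PR:

async def remove_application_with_timeout(ops_test, app_name: str, timeout: int = 1000) -> None:
    # remove_application() has no timeout parameter on Juju 2.9, so block separately
    # with a bounded wait instead of block_until_done=True.
    await ops_test.model.remove_application(app_name)
    await ops_test.model.block_until(
        lambda: app_name not in ops_test.model.applications, timeout=timeout
    )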

await ops_test.model.block_until(
lambda: database_app_name not in ops_test.model.applications, timeout=1000
)
# Remove the TLS operator.
await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True)
await ops_test.model.remove_application(tls_certificates_app_name)
await ops_test.model.block_until(
lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000
)


@pytest.mark.group(2)
@@ -219,9 +225,15 @@ async def test_backup_gcp(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -
database_app_name = f"{DATABASE_APP_NAME}-gcp"

# Remove the database app.
await ops_test.model.remove_application(database_app_name, block_until_done=True)
await ops_test.model.remove_application(database_app_name)
await ops_test.model.block_until(
lambda: database_app_name not in ops_test.model.applications, timeout=1000
)
# Remove the TLS operator.
await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True)
await ops_test.model.remove_application(tls_certificates_app_name)
await ops_test.model.block_until(
lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000
)


@pytest.mark.group(2)
@@ -297,7 +309,7 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets) -> None
database_app_name,
0,
S3_INTEGRATOR_APP_NAME,
MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
PR author:
Reverting a change made in the PITR PR that was not fully appropriate (possibly motivated by the issue observed in https://github.com/canonical/postgresql-k8s-operator/pull/690/files#r1764218285). Double-checked with @marceloneppel here.

)

# Check that the backup was correctly restored by having only the first created table.
10 changes: 8 additions & 2 deletions tests/integration/test_backups_pitr.py
@@ -282,9 +282,15 @@ async def pitr_backup_operations(
await ops_test.model.wait_for_idle(status="active", timeout=1000)

# Remove the database app.
await ops_test.model.remove_application(database_app_name, block_until_done=True)
await ops_test.model.remove_application(database_app_name)
await ops_test.model.block_until(
lambda: database_app_name not in ops_test.model.applications, timeout=1000
)
# Remove the TLS operator.
await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True)
await ops_test.model.remove_application(tls_certificates_app_name)
await ops_test.model.block_until(
lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000
)


@pytest.mark.group(1)
Expand Down
6 changes: 5 additions & 1 deletion tests/integration/test_charm.py
@@ -440,7 +440,11 @@ async def test_redeploy_charm_same_model_after_forcing_removal(ops_test: OpsTest

# This check is enough to ensure that the charm/workload is working for this specific test.
await ops_test.model.wait_for_idle(
apps=[APP_NAME], status="active", timeout=1000, wait_for_exact_units=len(UNIT_IDS)
apps=[APP_NAME],
status="active",
timeout=1000,
wait_for_exact_units=len(UNIT_IDS),
raise_on_error=False,
)


28 changes: 27 additions & 1 deletion tests/unit/test_charm.py
@@ -24,7 +24,8 @@
from requests import ConnectionError
from tenacity import RetryError, wait_fixed

from charm import PostgresqlOperatorCharm
from backups import MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET
from charm import EXTENSION_OBJECT_MESSAGE, PostgresqlOperatorCharm
from constants import PEER, SECRET_INTERNAL_LABEL
from patroni import NotReadyError
from tests.unit.helpers import _FakeApiError
@@ -448,6 +449,9 @@ def test_on_update_status(harness):
patch(
"charm.PostgresqlOperatorCharm._handle_processes_failures"
) as _handle_processes_failures,
patch(
"charm.PostgresqlOperatorCharm.enable_disable_extensions"
) as _enable_disable_extensions,
patch("charm.Patroni.member_started") as _member_started,
patch("charm.Patroni.get_primary") as _get_primary,
patch("ops.model.Container.pebble") as _pebble,
@@ -468,6 +472,28 @@
harness.charm.on.update_status.emit()
_get_primary.assert_not_called()

# Test unit in blocked status, without message.
_pebble.get_services.reset_mock()
harness.model.unit.status = BlockedStatus()
harness.charm.on.update_status.emit()
_enable_disable_extensions.assert_not_called()
_pebble.get_services.assert_not_called()

# Test unit in blocked status due to blocking extensions.
harness.model.unit.status = BlockedStatus(EXTENSION_OBJECT_MESSAGE)
harness.charm.on.update_status.emit()
_enable_disable_extensions.assert_called_once()
_pebble.get_services.assert_called_once()

# Test unit in blocked status due to restored cluster.
_pebble.get_services.reset_mock()
_enable_disable_extensions.reset_mock()
harness.model.unit.status = BlockedStatus(MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET)
harness.charm.on.update_status.emit()
_enable_disable_extensions.assert_not_called()
_pebble.get_services.assert_called_once()
harness.model.unit.status = ActiveStatus()

# Test when a failure need to be handled.
_pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
_handle_processes_failures.return_value = True