[MISC] Fixes for PITR backup test instabilities #690
@@ -11,6 +11,7 @@
 import re
 import shutil
 import sys
+import time
 from pathlib import Path
 from typing import Dict, List, Literal, Optional, Tuple, get_args
@@ -336,8 +337,7 @@ def get_secret(self, scope: Scopes, key: str) -> Optional[str]:
         if scope not in get_args(Scopes):
             raise RuntimeError("Unknown secret scope.")

-        peers = self.model.get_relation(PEER)
-        if not peers:
+        if not (peers := self.model.get_relation(PEER)):
             return None

         secret_key = self._translate_field_to_secret_key(key)
@@ -355,7 +355,9 @@ def set_secret(self, scope: Scopes, key: str, value: Optional[str]) -> Optional[
         if not value:
             return self.remove_secret(scope, key)

-        peers = self.model.get_relation(PEER)
+        if not (peers := self.model.get_relation(PEER)):
+            return None
+
         secret_key = self._translate_field_to_secret_key(key)
         # Old translation in databag is to be deleted
         self.peer_relation_data(scope).delete_relation_data(peers.id, [key])
@@ -366,7 +368,9 @@ def remove_secret(self, scope: Scopes, key: str) -> None:
         if scope not in get_args(Scopes):
             raise RuntimeError("Unknown secret scope.")

-        peers = self.model.get_relation(PEER)
+        if not (peers := self.model.get_relation(PEER)):
+            return None
+
         secret_key = self._translate_field_to_secret_key(key)
         if scope == APP_SCOPE:
             self.peer_relation_app.delete_relation_data(peers.id, [secret_key])
@@ -549,11 +553,18 @@ def _on_peer_relation_changed(self, event: HookEvent) -> None:  # noqa: C901
             logger.error("Invalid configuration: %s", str(e))
             return

-        # If PITR restore failed, then wait it for resolve.
+        # Should not override a blocked status
+        if isinstance(self.unit.status, BlockedStatus):
+            logger.debug("on_peer_relation_changed early exit: Unit in blocked status")
+            return
+
+        services = container.pebble.get_services(names=[self._postgresql_service])
         if (
-            "restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data
-        ) and isinstance(self.unit.status, BlockedStatus):
-            event.defer()
+            ("restoring-backup" in self.app_peer_data or "restore-to-time" in self.app_peer_data)
+            and len(services) > 0
+            and not self._was_restore_successful(container, services[0])
+        ):
+            logger.debug("on_peer_relation_changed early exit: Backup restore check failed")
             return

         # Validate the status of the member before setting an ActiveStatus.
@@ -1338,6 +1349,23 @@ def _on_update_status_early_exit_checks(self, container) -> bool:
         if not container.can_connect():
             logger.debug("on_update_status early exit: Cannot connect to container")
             return False
+
+        self._check_pgdata_storage_size()
+
+        if self._has_blocked_status or self._has_non_restore_waiting_status:
+            # If charm was failing to disable plugin, try again and continue (user may have removed the objects)
+            if self.unit.status.message == EXTENSION_OBJECT_MESSAGE:
+                self.enable_disable_extensions()
+                return True
+
+            if (
+                self.unit.status.message == MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET
+                and "require-change-bucket-after-restore" not in self.app_peer_data
+            ):
+                return True
+
+            logger.debug("on_update_status early exit: Unit is in Blocked/Waiting status")
+            return False
         return True

     def _check_pgdata_storage_size(self) -> None:

Review comment (on the MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET condition): this extra condition is meant to allow […]
@@ -1366,15 +1394,6 @@ def _on_update_status(self, _) -> None:
         if not self._on_update_status_early_exit_checks(container):
             return

-        self._check_pgdata_storage_size()
-
-        if self._has_blocked_status or self._has_non_restore_waiting_status:
-            # If charm was failing to disable plugin, try again (user may have removed the objects)
-            if self.unit.status.message == EXTENSION_OBJECT_MESSAGE:
-                self.enable_disable_extensions()
-            logger.debug("on_update_status early exit: Unit is in Blocked/Waiting status")
-            return
-
         services = container.pebble.get_services(names=[self._postgresql_service])
         if len(services) == 0:
             # Service has not been added nor started yet, so don't try to check Patroni API.
@@ -1413,23 +1432,25 @@ def _on_update_status(self, _) -> None:

     def _was_restore_successful(self, container: Container, service: ServiceInfo) -> bool:
         """Checks if restore operation succeeded and S3 is properly configured."""
-        if service.current != ServiceStatus.ACTIVE:
-            if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed(container)):
-                logger.error(
-                    "Restore failed: database service failed to reach point-in-time-recovery target. "
-                    "You can launch another restore with different parameters"
-                )
-                self.log_pitr_last_transaction_time()
-                self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR)
-                return False
+        if "restore-to-time" in self.app_peer_data and all(self.is_pitr_failed(container)):
+            logger.error(
+                "Restore failed: database service failed to reach point-in-time-recovery target. "
+                "You can launch another restore with different parameters"
+            )
+            self.log_pitr_last_transaction_time()
+            self.unit.status = BlockedStatus(CANNOT_RESTORE_PITR)
+            return False

-            if self.unit.status.message != CANNOT_RESTORE_PITR:
-                logger.error("Restore failed: database service failed to start")
-                self.unit.status = BlockedStatus("Failed to restore backup")
+        if (
+            service.current != ServiceStatus.ACTIVE
+            and self.unit.status.message != CANNOT_RESTORE_PITR
+        ):
+            logger.error("Restore failed: database service failed to start")
+            self.unit.status = BlockedStatus("Failed to restore backup")
             return False

         if not self._patroni.member_started:
-            logger.debug("on_update_status early exit: Patroni has not started yet")
+            logger.debug("Restore check early exit: Patroni has not started yet")
             return False

         # Remove the restoring backup flag and the restore stanza name.

Review comment (on moving the PITR check ahead of the service-status check): Oftentimes, on Juju 3 specifically, the restore would fail (making Patroni unresponsive) but the service status itself would stay active for a while. This was causing the charm to think it was just Patroni not being ready on the unit, never catching the underlying failure. Depending on the ordering of the events that followed, this could cause the charm to go into […]

Reply: Interesting findings!

Review comment on lines +1444 to +1447: I suspect that this block is being reached earlier than the one above on Juju 2.9 because of the status message […]

Reply: That status is set here after the PITR error check, so I think the reason for this block running before the one above is that this check failed when it should've succeeded. Anyway, I agree that this should be investigated in another PR; it probably won't be a quick fix, unfortunately.
@@ -2091,26 +2112,56 @@ def is_pitr_failed(self, container: Container) -> Tuple[bool, bool]:
         - Is patroni service failed to bootstrap cluster.
         - Is it new fail, that wasn't observed previously.
         """
-        try:
-            log_exec = container.pebble.exec(["pebble", "logs", "postgresql"], combine_stderr=True)
-            patroni_logs = log_exec.wait_output()[0]
-            patroni_exceptions = re.findall(
-                r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$",
-                patroni_logs,
-                re.MULTILINE,
-            )
-        except ExecError:  # For Juju 2.
-            log_exec = container.pebble.exec(["cat", "/var/log/postgresql/patroni.log"])
-            patroni_logs = log_exec.wait_output()[0]
-            patroni_exceptions = re.findall(
-                r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster",
-                patroni_logs,
-                re.MULTILINE,
-            )
+        patroni_exceptions = []
+        count = 0
+        while len(patroni_exceptions) == 0 and count < 10:
+            try:
+                log_exec = container.pebble.exec(
+                    ["pebble", "logs", "postgresql", "-n", "all"], combine_stderr=True
+                )
+                patroni_logs = log_exec.wait_output()[0]
+                patroni_exceptions = re.findall(
+                    r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: Failed to bootstrap cluster$",
+                    patroni_logs,
+                    re.MULTILINE,
+                )
+            except ExecError:  # For Juju 2.
+                log_exec = container.pebble.exec(["cat", "/var/log/postgresql/patroni.log"])
+                patroni_logs = log_exec.wait_output()[0]
+                patroni_exceptions = re.findall(
+                    r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster",
+                    patroni_logs,
+                    re.MULTILINE,
+                )
+                if len(patroni_exceptions) != 0:
+                    break
+                # If no match, look at older logs
+                log_exec = container.pebble.exec([
+                    "find",
+                    "/var/log/postgresql/",
+                    "-name",
+                    "'patroni.log.*'",
+                    "-exec",
+                    "cat",
+                    "{}",
+                    "+",
+                ])
+                patroni_logs = log_exec.wait_output()[0]
+                patroni_exceptions = re.findall(
+                    r"^([0-9- :]+) UTC \[[0-9]+\]: INFO: removing initialize key after failed attempt to bootstrap the cluster",
+                    patroni_logs,
+                    re.MULTILINE,
+                )
+            count += 1
+            time.sleep(3)

         if len(patroni_exceptions) > 0:
             logger.debug("Failures to bootstrap cluster detected on Patroni service logs")
             old_pitr_fail_id = self.unit_peer_data.get("last_pitr_fail_id", None)
             self.unit_peer_data["last_pitr_fail_id"] = patroni_exceptions[-1]
             return True, patroni_exceptions[-1] != old_pitr_fail_id

         logger.debug("No failures detected on Patroni service logs")
         return False, False

     def log_pitr_last_transaction_time(self) -> None:

Review comment (on the new polling loop): This way of checking the logs for failures is not very stable and often doesn't catch the errors. My changes here aim to just decrease the likelihood of false negatives, but integration tests can still fail on the first try, especially on Juju 2.9. We probably need to redo this check using another approach.
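Not part of this PR: the review note above suggests this log polling may eventually be reworked. One possible direction is to express the same retry-until-matched idea with tenacity, which these tests already use elsewhere. The sketch below covers only the Juju 3 pebble-logs branch; the helper name, the decorator wiring, and the constant are assumptions, not repository code.

import re

from tenacity import retry, retry_if_result, stop_after_attempt, wait_fixed

BOOTSTRAP_FAILURE_PATTERN = (
    r"^([0-9-:TZ.]+) \[postgresql] patroni\.exceptions\.PatroniFatalException: "
    r"Failed to bootstrap cluster$"
)


@retry(  # retry while nothing has matched yet: up to 10 attempts, 3 seconds apart
    retry=retry_if_result(lambda matches: len(matches) == 0),
    stop=stop_after_attempt(10),
    wait=wait_fixed(3),
    retry_error_callback=lambda retry_state: retry_state.outcome.result(),  # return [] when exhausted
)
def find_bootstrap_failures(container) -> list:
    """Scan the current Patroni service logs once for bootstrap failures (Juju 3 path only)."""
    log_exec = container.pebble.exec(
        ["pebble", "logs", "postgresql", "-n", "all"], combine_stderr=True
    )
    patroni_logs = log_exec.wait_output()[0]
    return re.findall(BOOTSTRAP_FAILURE_PATTERN, patroni_logs, re.MULTILINE)

The retry condition is result-based, so a single call either returns the matches found or, after the last attempt, the empty list, keeping the "no failure detected" outcome explicit instead of being driven by a manual counter and time.sleep.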
@@ -28,7 +28,7 @@

 logger = logging.getLogger(__name__)

-TIMEOUT = 5 * 60
+TIMEOUT = 10 * 60


 @pytest.mark.group(1)
@@ -744,7 +744,7 @@ async def switchover(
         candidate: The unit that should be elected the new primary.
     """
     primary_ip = await get_unit_address(ops_test, current_primary)
-    for attempt in Retrying(stop=stop_after_attempt(4), wait=wait_fixed(5), reraise=True):
+    for attempt in Retrying(stop=stop_after_attempt(60), wait=wait_fixed(3), reraise=True):
         with attempt:
             response = requests.post(
                 f"http://{primary_ip}:8008/switchover",

Review comment: probably a bit overkill, but this was affecting […]

Reply: Out of curiosity, do you have some logs from a failure related to the previous values?

Reply: In this PR's commits there are two examples. Another case, which I didn't find a quick example for but have seen, is when the assert for status 200 failed during retries because Patroni returns […]
@@ -91,7 +91,11 @@ async def test_database_relation_with_charm_libraries(ops_test: OpsTest, databas
         f"{APPLICATION_APP_NAME}:{FIRST_DATABASE_RELATION_NAME}", DATABASE_APP_NAME
     )
     await ops_test.model.wait_for_idle(
-        apps=[DATABASE_APP_NAME], status="active", raise_on_blocked=True, timeout=1000
+        apps=[DATABASE_APP_NAME],
+        status="active",
+        raise_on_blocked=True,
+        raise_on_error=False,
+        timeout=1000,
     )

     # Check that on juju 3 we have secrets and no username and password in the rel databag
@@ -194,9 +194,15 @@ async def test_backup_aws(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -
     await ops_test.model.wait_for_idle(status="active", timeout=1000)

     # Remove the database app.
-    await ops_test.model.remove_application(database_app_name, block_until_done=True)
+    await ops_test.model.remove_application(database_app_name)
+    await ops_test.model.block_until(
+        lambda: database_app_name not in ops_test.model.applications, timeout=1000
+    )
     # Remove the TLS operator.
-    await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True)
+    await ops_test.model.remove_application(tls_certificates_app_name)
+    await ops_test.model.block_until(
+        lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000
+    )


 @pytest.mark.group(2)

Review comment: this change (and the other similar ones) aims to avoid the case where, for some reason, the application removal gets stuck and never returns, causing the test to hang for 2 h (until the CI job itself times out). Because […]
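Not part of the diff: since the same remove-then-wait pattern now appears several times, it could be factored into a small helper. The sketch below assumes the pytest-operator ops_test fixture used throughout these tests; the helper itself is hypothetical.

async def remove_application_with_timeout(ops_test, app_name: str, timeout: int = 1000) -> None:
    """Request application removal and wait for it to disappear, without hanging indefinitely."""
    await ops_test.model.remove_application(app_name)  # returns without waiting for completion
    # Bounded wait: fails with a timeout error instead of letting the test hang until the CI job dies.
    await ops_test.model.block_until(
        lambda: app_name not in ops_test.model.applications, timeout=timeout
    )

Usage would then be a single call, e.g. "await remove_application_with_timeout(ops_test, database_app_name)", in place of the two-call pattern repeated above.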
@@ -219,9 +225,15 @@ async def test_backup_gcp(ops_test: OpsTest, cloud_configs: Tuple[Dict, Dict]) -
     database_app_name = f"{DATABASE_APP_NAME}-gcp"

     # Remove the database app.
-    await ops_test.model.remove_application(database_app_name, block_until_done=True)
+    await ops_test.model.remove_application(database_app_name)
+    await ops_test.model.block_until(
+        lambda: database_app_name not in ops_test.model.applications, timeout=1000
+    )
     # Remove the TLS operator.
-    await ops_test.model.remove_application(tls_certificates_app_name, block_until_done=True)
+    await ops_test.model.remove_application(tls_certificates_app_name)
+    await ops_test.model.block_until(
+        lambda: tls_certificates_app_name not in ops_test.model.applications, timeout=1000
+    )


 @pytest.mark.group(2)
@@ -297,7 +309,7 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets) -> None
         database_app_name,
         0,
         S3_INTEGRATOR_APP_NAME,
-        MOVE_RESTORED_CLUSTER_TO_ANOTHER_BUCKET,
+        ANOTHER_CLUSTER_REPOSITORY_ERROR_MESSAGE,
     )

     # Check that the backup was correctly restored by having only the first created table.

Review comment: Reverting the change made in the PITR PR, which was not fully appropriate (possibly motivated by the issue observed in https://github.com/canonical/postgresql-k8s-operator/pull/690/files#r1764218285). Double-checked with @marceloneppel here.
Review comment (on the new restore check in _on_peer_relation_changed): It was observed in some cases that, after a PITR restore failed, the first event to run following the failure would be on_peer_relation_changed instead of update_status (where the backup failure checks were located). When that happened, the charm would try to ping Patroni, fail, enter the "awaiting for member to start" status, and endlessly defer, never knowing it was a backup failure. This condition is meant to avoid that.
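To make that ordering concrete, here is a simplified, hypothetical rendering of the new flow in the relation-changed hook (names shortened; not verbatim charm code): the restore check runs and returns early before any Patroni health check can set "awaiting for member to start" and defer the event forever.

from ops.model import BlockedStatus


def on_peer_relation_changed(charm, container) -> None:
    # Never override an existing Blocked status (e.g. a failed PITR restore).
    if isinstance(charm.unit.status, BlockedStatus):
        return

    # Run the restore-outcome check here as well, so a failed restore is caught even when
    # this hook fires before update_status.
    services = container.pebble.get_services(names=[charm.postgresql_service])
    restoring = (
        "restoring-backup" in charm.app_peer_data or "restore-to-time" in charm.app_peer_data
    )
    if restoring and services and not charm.was_restore_successful(container, services[0]):
        # was_restore_successful has already set a Blocked status on failure; returning here
        # avoids the old behaviour of pinging Patroni, failing, and deferring endlessly.
        return

    # ... continue with the usual member-status validation ...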