[DPE-3609] Fix rotate cluster + replicaset password (#378)
## Issue
Rotating the operator password on a MongoDB cluster or replica set results
in “flickering” errors during update_status.

## Solution
Fix this by adding extra processing that checks whether passwords and TLS
settings are consistent across the cluster / replica set.
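
For illustration, here is a minimal sketch of the consistency-check idea (not the charm's exact code): auth-related MongoDB error codes (13 `Unauthorized`, 18 `AuthenticationFailed`) and the cannot-find-primary code 133 are treated as transient "still syncing" signals rather than fatal errors. The connection URI, helper name, and timeout below are assumptions; the real implementation lives in `shards_interface.py` and `charm.py` in the diff that follows.

```python
# Illustrative sketch only -- not the charm's exact code. The error codes match the
# constants added in this commit; the URI, helper name, and timeout are assumptions.
from pymongo import MongoClient
from pymongo.errors import OperationFailure, ServerSelectionTimeoutError

UNAUTHORISED_CODE = 13         # Unauthorized
AUTH_FAILED_CODE = 18          # AuthenticationFailed
TLS_CANNOT_FIND_PRIMARY = 133  # cannot find a primary, e.g. mid TLS/password rotation


def cluster_password_synced(uri: str) -> bool:
    """Return False while credentials/TLS are still propagating across the deployment."""
    try:
        client = MongoClient(uri, serverSelectionTimeoutMS=5000)
        client.admin.command("replSetGetStatus")
    except OperationFailure as e:
        if e.code in (UNAUTHORISED_CODE, AUTH_FAILED_CODE, TLS_CANNOT_FIND_PRIMARY):
            return False  # transient: report a waiting status instead of raising
        raise
    except ServerSelectionTimeoutError:
        # no reachable/electable primary yet -- also treated as "still syncing"
        return False
    return True
```

In the charm itself the same classification feeds `process_statuses()`, so during a password rotation the unit reports a `WaitingStatus` on `update_status` instead of flickering into an error state.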
MiaAltieri authored Mar 18, 2024
1 parent 0305e63 commit e1223e9
Showing 4 changed files with 94 additions and 39 deletions.
18 changes: 9 additions & 9 deletions lib/charms/mongodb/v1/shards_interface.py
@@ -56,14 +56,16 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 10
LIBPATCH = 11
KEYFILE_KEY = "key-file"
HOSTS_KEY = "host"
OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())
BACKUP_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(BackupUser.get_username())
INT_TLS_CA_KEY = f"int-{Config.TLS.SECRET_CA_LABEL}"
FORBIDDEN_REMOVAL_ERR_CODE = 20
AUTH_FAILED_CODE = 18
UNAUTHORISED_CODE = 13
TLS_CANNOT_FIND_PRIMARY = 133


class ShardAuthError(Exception):
@@ -471,7 +473,7 @@ def cluster_password_synced(self) -> bool:
with MongoDBConnection(self.charm.mongodb_config) as mongod:
mongod.get_replset_status()
except OperationFailure as e:
if e.code == 18: # Unauthorized Error - i.e. password is not in sync
if e.code in [UNAUTHORISED_CODE, AUTH_FAILED_CODE]:
return False
raise
except ServerSelectionTimeoutError:
@@ -999,9 +1001,10 @@ def _is_added_to_cluster(self) -> bool:
return self.charm.app.name in cluster_shards
except OperationFailure as e:
if e.code in [
13,
18,
]: # [Unauthorized, AuthenticationFailed ]we are not yet connected to mongos
UNAUTHORISED_CODE,
AUTH_FAILED_CODE,
TLS_CANNOT_FIND_PRIMARY,
]:
return False

raise
@@ -1026,10 +1029,7 @@ def cluster_password_synced(self) -> bool:
with MongoDBConnection(self.charm.mongodb_config) as mongo:
mongod_reachable = mongo.is_ready
except OperationFailure as e:
if e.code in [
13,
18,
]: # [Unauthorized, AuthenticationFailed ]we are not yet connected to mongos
if e.code in [UNAUTHORISED_CODE, AUTH_FAILED_CODE, TLS_CANNOT_FIND_PRIMARY]:
return False
raise
except ServerSelectionTimeoutError:
61 changes: 41 additions & 20 deletions src/charm.py
@@ -9,7 +9,7 @@
import subprocess
import time
from pathlib import Path
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional, Set, Tuple

from charms.grafana_agent.v0.cos_agent import COSAgentProvider
from charms.mongodb.v0.config_server_interface import ClusterProvider
@@ -72,12 +72,17 @@
Unit,
WaitingStatus,
)
from pymongo.errors import OperationFailure, ServerSelectionTimeoutError
from tenacity import Retrying, before_log, retry, stop_after_attempt, wait_fixed

from config import Config
from exceptions import AdminUserCreationError, ApplicationHostNotFoundError
from machine_helpers import MONGO_USER, ROOT_USER_GID, update_mongod_service

AUTH_FAILED_CODE = 18
UNAUTHORISED_CODE = 13
TLS_CANNOT_FIND_PRIMARY = 133

logger = logging.getLogger(__name__)

APP_SCOPE = Config.Relations.APP_SCOPE
@@ -556,24 +561,13 @@ def _on_update_status(self, event: UpdateStatusEvent):
self.unit.status = WaitingStatus("Waiting for MongoDB to start")
return

# Cannot check more advanced MongoDB statuses if the cluster doesn't have passwords synced
# this can occur in two cases:
# 1. password rotation
# 2. race conditions when a new shard is addeded.
if (
not self.shard.cluster_password_synced()
or not self.config_server.cluster_password_synced()
):
self.unit.status = WaitingStatus("Waiting to sync passwords across the cluster")
return

# leader should periodically handle configuring the replica set. Incidents such as network
# cuts can lead to new IP addresses and therefore will require a reconfigure. Especially
# in the case that the leader has a change in IP address, it will not receive a relation event.
if self.unit.is_leader():
self._handle_reconfigure(event)

self.unit.status = self.get_status()
self.unit.status = self.process_statuses()

def _on_get_primary_action(self, event: ActionEvent):
event.set_results({"replica-set-primary": self.primary})
@@ -1406,18 +1400,17 @@ def get_invalid_integration_status(self) -> Optional[StatusBase]:
"Relation to s3-integrator is not supported, config role must be config-server"
)

def get_status(self) -> StatusBase:
"""Returns the status with the highest priority from backups, sharding, and mongod.
Note: it will never be the case that shard_status and config_server_status are both present
since the mongodb app can either be a shard or a config server, but not both.
"""
# retrieve statuses of different services running on Charmed MongoDB
def get_statuses(self) -> Tuple:
"""Retrieves statuses for the different processes running inside the unit."""
mongodb_status = build_unit_status(self.mongodb_config, self._unit_ip(self.unit))
shard_status = self.shard.get_shard_status()
config_server_status = self.config_server.get_config_server_status()
pbm_status = self.backups.get_pbm_status()
return (mongodb_status, shard_status, config_server_status, pbm_status)

def prioritize_statuses(self, statuses: Tuple) -> StatusBase:
"""Returns the status with the highest priority from backups, sharding, and mongod."""
mongodb_status, shard_status, config_server_status, pbm_status = statuses
# failure in mongodb takes precedence over sharding and config server
if not isinstance(mongodb_status, ActiveStatus):
return mongodb_status
@@ -1434,6 +1427,34 @@ def get_status(self) -> StatusBase:
# if all statuses are active report mongodb status over sharding status
return mongodb_status

def process_statuses(self) -> StatusBase:
"""Retrieves statuses from processes inside charm and returns the highest priority status.
When a non-fatal error occurs while processing statuses, the error is processed and
returned as a statuses.
"""
# retrieve statuses of different services running on Charmed MongoDB
deployment_mode = "replica set" if self.is_role(Config.Role.REPLICATION) else "cluster"
waiting_status = None
try:
statuses = self.get_statuses()
except OperationFailure as e:
if e.code in [UNAUTHORISED_CODE, AUTH_FAILED_CODE]:
waiting_status = f"Waiting to sync passwords across the {deployment_mode}"
elif e.code == TLS_CANNOT_FIND_PRIMARY:
waiting_status = (
f"Waiting to sync internal membership across the {deployment_mode}"
)
else:
raise
except ServerSelectionTimeoutError:
waiting_status = f"Waiting to sync internal membership across the {deployment_mode}"

if waiting_status:
return WaitingStatus(waiting_status)

return self.prioritize_statuses(statuses)

def is_relation_feasible(self, rel_interface) -> bool:
"""Returns true if the proposed relation is feasible."""
if self.is_sharding_component() and rel_interface in Config.Relations.DB_RELATIONS:
42 changes: 41 additions & 1 deletion tests/integration/sharding_tests/test_sharding.py
@@ -6,6 +6,7 @@
import pytest
from pytest_operator.plugin import OpsTest

from ..helpers import get_leader_id, get_password, set_password
from .helpers import (
generate_mongodb_client,
has_correct_shards,
@@ -17,10 +18,18 @@
SHARD_ONE_APP_NAME = "shard-one"
SHARD_TWO_APP_NAME = "shard-two"
SHARD_THREE_APP_NAME = "shard-three"
SHARD_APPS = [SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME]
SHARD_APPS = [SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME, SHARD_THREE_APP_NAME]
CONFIG_SERVER_APP_NAME = "config-server-one"
CLUSTER_APPS = [
CONFIG_SERVER_APP_NAME,
SHARD_ONE_APP_NAME,
SHARD_TWO_APP_NAME,
SHARD_THREE_APP_NAME,
]
SHARD_REL_NAME = "sharding"
CONFIG_SERVER_REL_NAME = "config-server"
OPERATOR_USERNAME = "operator"
OPERATOR_PASSWORD = "operator-password"
MONGODB_KEYFILE_PATH = "/var/snap/charmed-mongodb/current/etc/mongod/keyFile"
# for now we have a large timeout due to the slow drainage of the `config.system.sessions`
# collection. More info here:
@@ -122,6 +131,37 @@ async def test_cluster_active(ops_test: OpsTest) -> None:
), "Config server did not process config properly"


@pytest.mark.group(1)
async def test_set_operator_password(ops_test: OpsTest):
"""Tests that the cluster can safely set the operator password."""
for cluster_app_name in CLUSTER_APPS:
operator_password = await get_password(
ops_test, username=OPERATOR_USERNAME, app_name=cluster_app_name
)
assert (
operator_password != OPERATOR_PASSWORD
), f"{cluster_app_name} is incorrectly already set to the new password."

# rotate password and verify that no unit goes into error as a result of password rotation
config_leader_id = await get_leader_id(ops_test, app_name=CONFIG_SERVER_APP_NAME)
await set_password(
ops_test, unit_id=config_leader_id, username=OPERATOR_USERNAME, password=OPERATOR_PASSWORD
)
await ops_test.model.wait_for_idle(
apps=CLUSTER_APPS,
status="active",
idle_period=20,
)

for cluster_app_name in CLUSTER_APPS:
operator_password = await get_password(
ops_test, username=OPERATOR_USERNAME, app_name=cluster_app_name
)
assert (
operator_password == OPERATOR_PASSWORD
), f"{cluster_app_name} did not rotate to new password."


@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_sharding(ops_test: OpsTest) -> None:
12 changes: 3 additions & 9 deletions tests/integration/sharding_tests/test_sharding_backups.py
@@ -274,14 +274,10 @@ async def test_restore_backup(ops_test: OpsTest, add_writes_to_shards) -> None:
async def test_migrate_restore_backup(ops_test: OpsTest, add_writes_to_shards) -> None:
"""Tests that sharded Charmed MongoDB cluster supports restores."""
config_leader_id = await get_leader_id(ops_test, app_name=CONFIG_SERVER_APP_NAME)
# TODO Future Work - determine the source of the temporary error state that occurs during the
# rotation of the operator user.
await set_password(
ops_test, unit_id=config_leader_id, username="operator", password=OPERATOR_PASSWORD
)
await ops_test.model.wait_for_idle(
apps=CLUSTER_APPS, status="active", idle_period=20, raise_on_error=False
),
await ops_test.model.wait_for_idle(apps=CLUSTER_APPS, status="active", idle_period=20)

# count total writes
cluster_writes = await writes_helpers.get_cluster_writes_count(
@@ -326,14 +322,12 @@ async def test_migrate_restore_backup(ops_test: OpsTest, add_writes_to_shards) -> None:
await deploy_cluster_backup_test(ops_test, deploy_s3_integrator=False)
await setup_cluster_and_s3(ops_test)
config_leader_id = await get_leader_id(ops_test, app_name=CONFIG_SERVER_APP_NAME)
# TODO Future Work - determine the source of the temporary error state that occurs during the
# rotation of the operator user.
await set_password(
ops_test, unit_id=config_leader_id, username="operator", password=OPERATOR_PASSWORD
)
await ops_test.model.wait_for_idle(
apps=CLUSTER_APPS, status="active", idle_period=20, timeout=TIMEOUT, raise_on_error=False
),
apps=CLUSTER_APPS, status="active", idle_period=20, timeout=TIMEOUT
)

# find most recent backup id and restore
leader_unit = await backup_helpers.get_leader_unit(
