From 905ffcac801867165704a77a2c2c1c4f68fce23a Mon Sep 17 00:00:00 2001 From: Mia Altieri <32723809+MiaAltieri@users.noreply.github.com> Date: Tue, 12 Mar 2024 11:44:40 +0100 Subject: [PATCH] [DPE-3650] TLS now supported on sharded deployments (#370) Future PR: Add Useful TLS error messages i.e. 1. Shard has TLS but config-server does not 2. Config-server has TLS but shard does not 3. CA mismatch --------- Co-authored-by: Mehdi Bendriss --- lib/charms/mongodb/v0/mongodb_tls.py | 47 +++-- lib/charms/mongodb/v1/helpers.py | 36 +++- lib/charms/mongodb/v1/shards_interface.py | 198 ++++++++++++----- .../sharding_tests/test_sharding_relations.py | 8 + .../sharding_tests/test_sharding_tls.py | 199 ++++++++++++++++++ tests/integration/tls_tests/helpers.py | 20 +- 6 files changed, 429 insertions(+), 79 deletions(-) create mode 100644 tests/integration/sharding_tests/test_sharding_tls.py diff --git a/lib/charms/mongodb/v0/mongodb_tls.py b/lib/charms/mongodb/v0/mongodb_tls.py index 41b4aeca6..73d66686b 100644 --- a/lib/charms/mongodb/v0/mongodb_tls.py +++ b/lib/charms/mongodb/v0/mongodb_tls.py @@ -39,7 +39,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 8 +LIBPATCH = 9 logger = logging.getLogger(__name__) @@ -76,17 +76,18 @@ def _on_set_tls_private_key(self, event: ActionEvent) -> None: """Set the TLS private key, which will be used for requesting the certificate.""" logger.debug("Request to set TLS private key received.") try: - self._request_certificate(event.params.get("external-key", None), internal=False) - self._request_certificate(event.params.get("internal-key", None), internal=True) + self.request_certificate(event.params.get("external-key", None), internal=False) + self.request_certificate(event.params.get("internal-key", None), internal=True) logger.debug("Successfully set TLS private key.") except ValueError as e: event.fail(str(e)) - def _request_certificate( + def 
request_certificate( self, param: Optional[str], internal: bool, ): + """Request TLS certificate.""" if param is None: key = generate_private_key() else: @@ -94,16 +95,19 @@ def _request_certificate( csr = generate_csr( private_key=key, - subject=self.get_host(self.charm.unit), - organization=self.charm.app.name, + subject=self._get_subject_name(), + organization=self._get_subject_name(), sans=self._get_sans(), sans_ip=[str(self.charm.model.get_binding(self.peer_relation).network.bind_address)], ) - self.set_tls_secret(internal, Config.TLS.SECRET_KEY_LABEL, key.decode("utf-8")) self.set_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL, csr.decode("utf-8")) self.set_tls_secret(internal, Config.TLS.SECRET_CERT_LABEL, None) + label = "int" if internal else "ext" + self.charm.unit_peer_data[f"{label}_certs_subject"] = self._get_subject_name() + self.charm.unit_peer_data[f"{label}_certs_subject"] = self._get_subject_name() + if self.charm.model.get_relation(Config.TLS.TLS_PEER_RELATION): self.certs.request_certificate_creation(certificate_signing_request=csr) @@ -124,8 +128,8 @@ def _parse_tls_file(raw_content: str) -> bytes: def _on_tls_relation_joined(self, _: RelationJoinedEvent) -> None: """Request certificate when TLS relation joined.""" - self._request_certificate(None, internal=True) - self._request_certificate(None, internal=False) + self.request_certificate(None, internal=True) + self.request_certificate(None, internal=False) def _on_tls_relation_broken(self, event: RelationBrokenEvent) -> None: """Disable TLS when TLS relation broken.""" @@ -149,7 +153,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: if ext_csr and event.certificate_signing_request.rstrip() == ext_csr.rstrip(): logger.debug("The external TLS certificate available.") - internal = False # external crs + internal = False elif int_csr and event.certificate_signing_request.rstrip() == int_csr.rstrip(): logger.debug("The internal TLS certificate available.") 
internal = True @@ -165,7 +169,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: self.set_tls_secret(internal, Config.TLS.SECRET_CERT_LABEL, event.certificate) self.set_tls_secret(internal, Config.TLS.SECRET_CA_LABEL, event.ca) - if self._waiting_for_certs(): + if self.waiting_for_certs(): logger.debug( "Defer till both internal and external TLS certificates available to avoid second restart." ) @@ -185,13 +189,13 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: else: self.charm.unit.status = ActiveStatus() - def _waiting_for_certs(self): + def waiting_for_certs(self): """Returns a boolean indicating whether additional certs are needed.""" if not self.get_tls_secret(internal=True, label_name=Config.TLS.SECRET_CERT_LABEL): - logger.debug("Waiting for application certificate.") + logger.debug("Waiting for internal certificate.") return True if not self.get_tls_secret(internal=False, label_name=Config.TLS.SECRET_CERT_LABEL): - logger.debug("Waiting for application certificate.") + logger.debug("Waiting for external certificate.") return True return False @@ -222,8 +226,8 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None: old_csr = self.get_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL).encode("utf-8") new_csr = generate_csr( private_key=key, - subject=self.get_host(self.charm.unit), - organization=self.charm.app.name, + subject=self._get_subject_name(), + organization=self._get_subject_name(), sans=self._get_sans(), sans_ip=[str(self.charm.model.get_binding(self.peer_relation).network.bind_address)], ) @@ -293,3 +297,14 @@ def get_tls_secret(self, internal: bool, label_name: str) -> str: scope = "int" if internal else "ext" label_name = f"{scope}-{label_name}" return self.charm.get_secret(UNIT_SCOPE, label_name) + + def _get_subject_name(self) -> str: + """Generate the subject name for CSR.""" + # In sharded MongoDB deployments it is a requirement that all subject names 
match across + # all cluster components + if self.charm.is_role(Config.Role.SHARD): + # until integrated with config-server use current app name as + # subject name + return self.charm.shard.get_config_server_name() or self.charm.app.name + + return self.charm.app.name diff --git a/lib/charms/mongodb/v1/helpers.py b/lib/charms/mongodb/v1/helpers.py index 9038198d1..16ce2e538 100644 --- a/lib/charms/mongodb/v1/helpers.py +++ b/lib/charms/mongodb/v1/helpers.py @@ -30,7 +30,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 4 +LIBPATCH = 5 # path to store mongodb ketFile KEY_FILE = "keyFile" @@ -121,12 +121,40 @@ def get_mongos_args( f"--configdb {config_server_db}", # config server is already using 27017 f"--port {Config.MONGOS_PORT}", - f"--keyFile={full_conf_dir}/{KEY_FILE}", - "\n", ] - # TODO Future PR: support TLS on mongos + # TODO : generalise these into functions to be re-used + if config.tls_external: + cmd.extend( + [ + f"--tlsCAFile={full_conf_dir}/{TLS_EXT_CA_FILE}", + f"--tlsCertificateKeyFile={full_conf_dir}/{TLS_EXT_PEM_FILE}", + # allow non-TLS connections + "--tlsMode=preferTLS", + "--tlsDisabledProtocols=TLS1_0,TLS1_1", + ] + ) + # internal TLS can be enabled only if external is enabled + if config.tls_internal and config.tls_external: + cmd.extend( + [ + "--clusterAuthMode=x509", + "--tlsAllowInvalidCertificates", + f"--tlsClusterCAFile={full_conf_dir}/{TLS_INT_CA_FILE}", + f"--tlsClusterFile={full_conf_dir}/{TLS_INT_PEM_FILE}", + ] + ) + else: + # keyFile used for authentication replica set peers if no internal tls configured. 
+ cmd.extend( + [ + "--clusterAuthMode=keyFile", + f"--keyFile={full_conf_dir}/{KEY_FILE}", + ] + ) + + cmd.append("\n") return " ".join(cmd) diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py index 824f56a18..f07e5aede 100644 --- a/lib/charms/mongodb/v1/shards_interface.py +++ b/lib/charms/mongodb/v1/shards_interface.py @@ -9,7 +9,7 @@ import json import logging import time -from typing import List, Optional, Set +from typing import List, Optional, Set, Tuple from charms.data_platform_libs.v0.data_interfaces import ( DatabaseProvides, @@ -31,7 +31,7 @@ ShardNotPlannedForRemovalError, ) from charms.mongodb.v1.users import BackupUser, MongoDBUser, OperatorUser -from ops.charm import CharmBase, EventBase, RelationBrokenEvent +from ops.charm import CharmBase, EventBase, RelationBrokenEvent, RelationChangedEvent from ops.framework import Object from ops.model import ( ActiveStatus, @@ -40,6 +40,7 @@ StatusBase, WaitingStatus, ) +from pymongo.errors import ServerSelectionTimeoutError from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed from config import Config @@ -55,11 +56,12 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 8 +LIBPATCH = 9 KEYFILE_KEY = "key-file" HOSTS_KEY = "host" OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username()) BACKUP_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(BackupUser.get_username()) +INT_TLS_CA_KEY = f"int-{Config.TLS.SECRET_CA_LABEL}" FORBIDDEN_REMOVAL_ERR_CODE = 20 AUTH_FAILED_CODE = 18 @@ -110,25 +112,29 @@ def _on_relation_joined(self, event): logger.info("Skipping relation joined event: hook checks did not pass") return - # TODO Future PR, sync tls secrets and PBM password - - self.database_provides.update_relation_data( - event.relation.id, - { - OPERATOR_PASSWORD_KEY: self.charm.get_secret( - Config.Relations.APP_SCOPE, 
- OPERATOR_PASSWORD_KEY, - ), - BACKUP_PASSWORD_KEY: self.charm.get_secret( - Config.Relations.APP_SCOPE, - BACKUP_PASSWORD_KEY, - ), - KEYFILE_KEY: self.charm.get_secret( - Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME - ), - HOSTS_KEY: json.dumps(self.charm._unit_ips), - }, + relation_data = { + OPERATOR_PASSWORD_KEY: self.charm.get_secret( + Config.Relations.APP_SCOPE, + OPERATOR_PASSWORD_KEY, + ), + BACKUP_PASSWORD_KEY: self.charm.get_secret( + Config.Relations.APP_SCOPE, + BACKUP_PASSWORD_KEY, + ), + KEYFILE_KEY: self.charm.get_secret( + Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME + ), + HOSTS_KEY: json.dumps(self.charm._unit_ips), + } + + # if tls enabled + int_tls_ca = self.charm.tls.get_tls_secret( + internal=True, label_name=Config.TLS.SECRET_CA_LABEL ) + if int_tls_ca: + relation_data[INT_TLS_CA_KEY] = int_tls_ca + + self.database_provides.update_relation_data(event.relation.id, relation_data) def pass_hook_checks(self, event: EventBase) -> bool: """Runs the pre-hooks checks for ShardingProvider, returns True if all pass.""" @@ -457,6 +463,10 @@ def cluster_password_synced(self) -> bool: if e.code == 18: # Unauthorized Error - i.e. password is not in sync return False raise + except ServerSelectionTimeoutError: + # Connection refused, - this occurs when internal membership is not in sync across the + # cluster (i.e. TLS + KeyFile). 
+ return False return True @@ -470,16 +480,23 @@ def __init__( """Constructor for ShardingProvider object.""" self.relation_name = relation_name self.charm = charm + self.database_requires = DatabaseRequires( self.charm, relation_name=self.relation_name, - additional_secret_fields=[KEYFILE_KEY, OPERATOR_PASSWORD_KEY, BACKUP_PASSWORD_KEY], + additional_secret_fields=[ + KEYFILE_KEY, + OPERATOR_PASSWORD_KEY, + BACKUP_PASSWORD_KEY, + INT_TLS_CA_KEY, + ], # a database isn't required for the relation between shards + config servers, but is a # requirement for using `DatabaseRequires` database_name="", ) super().__init__(charm, self.relation_name) + self.framework.observe( charm.on[self.relation_name].relation_changed, self._on_relation_changed ) @@ -535,14 +552,83 @@ def _handle_changed_secrets(self, event) -> None: username=OperatorUser.get_username(), new_password=operator_password ) self.update_password(BackupUser.get_username(), new_password=backup_password) - except RetryError: + except (NotReadyError, PyMongoError): self.charm.unit.status = BlockedStatus("Failed to rotate cluster secrets") logger.error("Shard failed to rotate cluster secrets.") event.defer() return + # FUTURE PR: if config-server does not have TLS enabled log a useful message and go into + # blocked in relation_changed and other status checks + + def get_membership_auth_modes(self, event: RelationChangedEvent) -> Tuple[bool, bool]: + """Returns the available authentication membership forms.""" + key_file_contents = self.database_requires.fetch_relation_field( + event.relation.id, KEYFILE_KEY + ) + tls_ca = self.database_requires.fetch_relation_field(event.relation.id, INT_TLS_CA_KEY) + return (key_file_contents is not None, tls_ca is not None) + + def update_member_auth( + self, event: RelationChangedEvent, membership_auth: Tuple[bool, bool] + ) -> None: + """Updates the shard to have the same membership auth as the config-server.""" + cluster_auth_keyfile, cluster_auth_tls = membership_auth + 
tls_integrated = self.charm.model.get_relation(Config.TLS.TLS_PEER_RELATION) + + # Edge case: shard has TLS enabled before having connected to the config-server. For TLS in + # sharded MongoDB clusters it is necessary that the subject and organisation name are the + # same in their CSRs. Re-requesting a cert after being integrated with the config-server + # regenerates the cert with the appropriate configurations needed for sharding. + if cluster_auth_tls and tls_integrated and self._should_request_new_certs(): + logger.info("Cluster implements internal membership auth via certificates") + self.charm.tls.request_certificate(param=None, internal=True) + self.charm.tls.request_certificate(param=None, internal=False) + elif cluster_auth_keyfile and not cluster_auth_tls and not tls_integrated: + logger.info("Cluster implements internal membership auth via keyFile") + + # Copy over keyfile regardless of whether the cluster uses TLS or KeyFile for internal + # membership authentication. If TLS is disabled on the cluster this enables the cluster to + # have the correct cluster KeyFile readily available. + key_file_contents = self.database_requires.fetch_relation_field( + event.relation.id, KEYFILE_KEY + ) + self.update_keyfile(key_file_contents=key_file_contents) + + # Future PR - status updating for inconsistencies with TLS (i.e. 
shard has TLS but + # config-server does not and vice versa or CA-mismatch) + + def get_cluster_passwords( + self, event: RelationChangedEvent + ) -> Tuple[Optional[str], Optional[str]]: + """Retrieves shared cluster passwords.""" + operator_password = self.database_requires.fetch_relation_field( + event.relation.id, OPERATOR_PASSWORD_KEY + ) + backup_password = self.database_requires.fetch_relation_field( + event.relation.id, BACKUP_PASSWORD_KEY + ) + return (operator_password, backup_password) + + def sync_cluster_passwords( + self, event: RelationChangedEvent, operator_password: str, backup_password: str + ) -> None: + """Updates shared cluster passwords.""" + try: + self.update_password( + username=OperatorUser.get_username(), new_password=operator_password + ) + self.update_password(BackupUser.get_username(), new_password=backup_password) + except RetryError: + self.charm.unit.status = BlockedStatus("Shard not added to config-server") + logger.error( + "Failed to sync cluster passwords from config-server to shard. Shard cannot be added to config-server, deferring event and retrying." + ) + event.defer() + def _on_relation_changed(self, event): """Retrieves secrets from config-server and updates them within the shard.""" + # TODO Future PR include TLS sanity check in pass_hook_checks if not self.pass_hook_checks(event): logger.info("Skipping relation joined event: hook checks re not passed") return @@ -550,19 +636,25 @@ def _on_relation_changed(self, event): # if re-using an old shard, re-set drained flag. self.charm.unit_peer_data["drained"] = json.dumps(False) - # TODO: Future PR better status message behavior - self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") + # relation-changed events can be used for other purposes (not only adding the shard), i.e. 
+ # password rotation, secret rotation, mongos hosts rotation + if self._is_mongos_reachable() and not self._is_added_to_cluster(): + self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") - # shards rely on the config server for secrets - key_file_contents = self.database_requires.fetch_relation_field( - event.relation.id, KEYFILE_KEY - ) - if not key_file_contents: + # shards rely on the config server for shared cluster secrets + key_file_enabled, tls_enabled = self.get_membership_auth_modes(event) + if not key_file_enabled and not tls_enabled: + logger.info("Waiting for secrets for config-server.") event.defer() self.charm.unit.status = WaitingStatus("Waiting for secrets from config-server") return - self.update_keyfile(key_file_contents=key_file_contents) + self.update_member_auth(event, (key_file_enabled, tls_enabled)) + + if tls_enabled and self.charm.tls.waiting_for_certs(): + logger.info("Waiting for requested certs, before restarting and adding to cluster.") + event.defer() + return # restart on high loaded databases can be very slow (e.g. up to 10-20 minutes). 
with MongoDBConnection(self.charm.mongodb_config) as mongo: @@ -576,29 +668,13 @@ def _on_relation_changed(self, event): return # TODO Future work, see if needed to check for all units restarted / primary elected - operator_password = self.database_requires.fetch_relation_field( - event.relation.id, OPERATOR_PASSWORD_KEY - ) - backup_password = self.database_requires.fetch_relation_field( - event.relation.id, BACKUP_PASSWORD_KEY - ) + (operator_password, backup_password) = self.get_cluster_passwords(event) if not operator_password or not backup_password: event.defer() self.charm.unit.status = WaitingStatus("Waiting for secrets from config-server") return - try: - self.update_password( - username=OperatorUser.get_username(), new_password=operator_password - ) - self.update_password(BackupUser.get_username(), new_password=backup_password) - except RetryError: - self.charm.unit.status = BlockedStatus("Shard not added to config-server") - logger.error( - "Shard could not be added to config server, failed to set operator password." 
- ) - event.defer() - return + self.sync_cluster_passwords(event, operator_password, backup_password) # after updating the password of the backup user, restart pbm with correct password self.charm._connect_pbm_agent() @@ -656,7 +732,6 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None: self.charm.unit.status = MaintenanceStatus("Draining shard from cluster") mongos_hosts = json.loads(self.charm.app_peer_data["mongos_hosts"]) self.wait_for_draining(mongos_hosts) - self.charm.unit.status = ActiveStatus("Shard drained from cluster, ready for removal") def wait_for_draining(self, mongos_hosts: List[str]): @@ -869,10 +944,17 @@ def _is_added_to_cluster(self) -> bool: cluster_shards = mongo.get_shard_members() return self.charm.app.name in cluster_shards except OperationFailure as e: - if e.code == 13: # Unauthorized, we are not yet connected to mongos + if e.code in [ + 13, + 18, + ]: # [Unauthorized, AuthenticationFailed]: we are not yet connected to mongos return False raise + except ServerSelectionTimeoutError: + # Connection refused, - this occurs when internal membership is not in sync across the + # cluster (i.e. TLS + KeyFile). + return False def cluster_password_synced(self) -> bool: """Returns True if the cluster password is synced for the shard.""" @@ -893,6 +975,10 @@ def cluster_password_synced(self) -> bool: if e.code == 18: # Unauthorized Error - i.e. password is not in sync return False raise + except ServerSelectionTimeoutError: + # Connection refused, - this occurs when internal membership is not in sync across the + # cluster (i.e. TLS + KeyFile). 
+ return False return mongos_reachable and mongod_reachable @@ -921,9 +1007,9 @@ def has_config_server(self) -> bool: """Returns True if currently related to config server.""" return len(self.charm.model.relations[self.relation_name]) > 0 - def get_related_config_server(self) -> str: + def get_config_server_name(self) -> str: """Returns the related config server.""" - if self.relation_name not in self.charm.model.relations: + if not self.model.get_relation(self.relation_name): return None # metadata.yaml prevents having multiple config servers @@ -937,3 +1023,9 @@ def get_mongos_hosts(self) -> List[str]: return return json.loads(config_server_relation.data[config_server_relation.app].get(HOSTS_KEY)) + + def _should_request_new_certs(self) -> bool: + """Returns if the shard has already requested the certificates for internal-membership.""" + int_subject = self.charm.unit_peer_data.get("int_certs_subject", None) + ext_subject = self.charm.unit_peer_data.get("ext_certs_subject", None) + return {int_subject, ext_subject} != {self.get_config_server_name()} diff --git a/tests/integration/sharding_tests/test_sharding_relations.py b/tests/integration/sharding_tests/test_sharding_relations.py index c1c24d381..0338623fb 100644 --- a/tests/integration/sharding_tests/test_sharding_relations.py +++ b/tests/integration/sharding_tests/test_sharding_relations.py @@ -95,6 +95,7 @@ async def test_build_and_deploy( @pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_only_one_config_server_relation(ops_test: OpsTest) -> None: """Verify that a shard can only be related to one config server.""" await ops_test.model.integrate( @@ -127,6 +128,7 @@ async def test_only_one_config_server_relation(ops_test: OpsTest) -> None: @pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_cannot_use_db_relation(ops_test: OpsTest) -> None: """Verify that sharding components cannot use the DB relation.""" for sharded_component in SHARDING_COMPONENTS: @@ -162,6 +164,7 @@ async def 
test_cannot_use_db_relation(ops_test: OpsTest) -> None: @pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_cannot_use_legacy_db_relation(ops_test: OpsTest) -> None: """Verify that sharding components cannot use the legacy DB relation.""" for sharded_component in SHARDING_COMPONENTS: @@ -197,6 +200,7 @@ async def test_cannot_use_legacy_db_relation(ops_test: OpsTest) -> None: @pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_replication_config_server_relation(ops_test: OpsTest): """Verifies that using a replica as a shard fails.""" # attempt to add a replication deployment as a shard to the config server. @@ -225,6 +229,7 @@ async def test_replication_config_server_relation(ops_test: OpsTest): @pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_replication_shard_relation(ops_test: OpsTest): """Verifies that using a replica as a config-server fails.""" # attempt to add a shard to a replication deployment as a config server. @@ -260,6 +265,7 @@ async def test_replication_shard_relation(ops_test: OpsTest): @pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_replication_mongos_relation(ops_test: OpsTest) -> None: """Verifies connecting a replica to a mongos router fails.""" # attempt to add a replication deployment as a shard to the config server. @@ -296,6 +302,7 @@ async def test_replication_mongos_relation(ops_test: OpsTest) -> None: @pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_shard_mongos_relation(ops_test: OpsTest) -> None: """Verifies connecting a shard to a mongos router fails.""" # attempt to add a replication deployment as a shard to the config server. @@ -332,6 +339,7 @@ async def test_shard_mongos_relation(ops_test: OpsTest) -> None: @pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_shard_s3_relation(ops_test: OpsTest) -> None: """Verifies integrating a shard to s3-integrator fails.""" # attempt to add a replication deployment as a shard to the config server. 
diff --git a/tests/integration/sharding_tests/test_sharding_tls.py b/tests/integration/sharding_tests/test_sharding_tls.py new file mode 100644 index 000000000..8929778f4 --- /dev/null +++ b/tests/integration/sharding_tests/test_sharding_tls.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + + +import pytest +from pytest_operator.plugin import OpsTest +from tenacity import Retrying, stop_after_attempt, wait_fixed + +from ..tls_tests import helpers as tls_helpers + +CERTS_APP_NAME = "self-signed-certificates" +SHARD_ONE_APP_NAME = "shard-one" +SHARD_TWO_APP_NAME = "shard-two" +CONFIG_SERVER_APP_NAME = "config-server" +CLUSTER_COMPONENTS = [SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME, CONFIG_SERVER_APP_NAME] +SHARD_REL_NAME = "sharding" +CONFIG_SERVER_REL_NAME = "config-server" +CERT_REL_NAME = "certificates" +TIMEOUT = 15 * 60 + + +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest) -> None: + """Build and deploy a sharded cluster.""" + await deploy_cluster_components(ops_test) + + # deploy the self-signed certificates charm + await ops_test.model.deploy(CERTS_APP_NAME, channel="stable") + + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle( + apps=[CERTS_APP_NAME, CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + raise_on_error=False, + ) + + +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_built_cluster_with_tls(ops_test: OpsTest) -> None: + """Tests that the cluster can be integrated with TLS.""" + await integrate_cluster(ops_test) + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle( + apps=CLUSTER_COMPONENTS, + idle_period=20, + timeout=TIMEOUT, + ) + + await integrate_with_tls(ops_test) + + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle( + apps=CLUSTER_COMPONENTS, + idle_period=20, + 
timeout=TIMEOUT, + ) + + await check_cluster_tls_enabled(ops_test) + + +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_disable_cluster_with_tls(ops_test: OpsTest) -> None: + """Tests that the cluster can disable TLS.""" + await remove_tls_integrations(ops_test) + await check_cluster_tls_disabled(ops_test) + + +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_tls_then_build_cluster(ops_test: OpsTest) -> None: + """Tests that a cluster can be built after its components are integrated with TLS.""" + await destroy_cluster(ops_test) + await deploy_cluster_components(ops_test) + + await integrate_with_tls(ops_test) + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle( + apps=CLUSTER_COMPONENTS, + idle_period=20, + timeout=TIMEOUT, + ) + + await integrate_cluster(ops_test) + + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle( + apps=CLUSTER_COMPONENTS, + idle_period=20, + timeout=TIMEOUT, + ) + + await check_cluster_tls_enabled(ops_test) + + +# FUTURE PR - test inconsistencies in TLS settings across cluster + + +async def check_cluster_tls_disabled(ops_test: OpsTest) -> None: + # check each replica set is running with TLS disabled + for cluster_component in CLUSTER_COMPONENTS: + for unit in ops_test.model.applications[cluster_component].units: + await tls_helpers.check_tls( + ops_test, unit, enabled=False, app_name=cluster_component, mongos=False + ) + + # check mongos is running with TLS disabled + for unit in ops_test.model.applications[CONFIG_SERVER_APP_NAME].units: + await tls_helpers.check_tls( + ops_test, unit, enabled=False, app_name=CONFIG_SERVER_APP_NAME, mongos=True + ) + + +async def check_cluster_tls_enabled(ops_test: OpsTest) -> None: + # check each replica set is running with TLS enabled + for cluster_component in CLUSTER_COMPONENTS: + for unit in ops_test.model.applications[cluster_component].units: + await tls_helpers.check_tls( + ops_test, unit, enabled=True, app_name=cluster_component, mongos=False + ) + 
+ # check mongos is running with TLS enabled + for unit in ops_test.model.applications[CONFIG_SERVER_APP_NAME].units: + await tls_helpers.check_tls( + ops_test, unit, enabled=True, app_name=CONFIG_SERVER_APP_NAME, mongos=True + ) + + +async def deploy_cluster_components(ops_test: OpsTest) -> None: + my_charm = await ops_test.build_charm(".") + await ops_test.model.deploy( + my_charm, + num_units=2, + config={"role": "config-server"}, + application_name=CONFIG_SERVER_APP_NAME, + ) + await ops_test.model.deploy( + my_charm, num_units=2, config={"role": "shard"}, application_name=SHARD_ONE_APP_NAME + ) + await ops_test.model.deploy( + my_charm, num_units=1, config={"role": "shard"}, application_name=SHARD_TWO_APP_NAME + ) + + await ops_test.model.wait_for_idle( + apps=CLUSTER_COMPONENTS, + idle_period=20, + timeout=TIMEOUT, + ) + + +async def destroy_cluster(ops_test): + """Destroy cluster in a forceful way.""" + for app in CLUSTER_COMPONENTS: + await ops_test.model.applications[app].destroy(force=True, no_wait=False) + + # destroy does not wait for applications to be removed, perform this check manually + for attempt in Retrying(stop=stop_after_attempt(100), wait=wait_fixed(10), reraise=True): + with attempt: + # pytest_operator has a bug where the number of applications does not get correctly + # updated. Wrapping the call with `fast_forward` resolves this + async with ops_test.fast_forward(): + assert ( + len(ops_test.model.applications) == 1 + ), "old cluster not destroyed successfully." 
+ + +async def remove_tls_integrations(ops_test: OpsTest) -> None: + """Removes the TLS integration from all cluster components.""" + for app in CLUSTER_COMPONENTS: + await ops_test.model.applications[app].remove_relation( + f"{app}:{CERT_REL_NAME}", + f"{CERTS_APP_NAME}:{CERT_REL_NAME}", + ) + + +async def integrate_cluster(ops_test: OpsTest) -> None: + """Integrates the cluster components with each other.""" + await ops_test.model.integrate( + f"{SHARD_ONE_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + await ops_test.model.integrate( + f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + +async def integrate_with_tls(ops_test: OpsTest) -> None: + """Integrates cluster components with self-signed certs operator.""" + for app in CLUSTER_COMPONENTS: + await ops_test.model.integrate( + f"{CERTS_APP_NAME}:{CERT_REL_NAME}", + f"{app}:{CERT_REL_NAME}", + ) diff --git a/tests/integration/tls_tests/helpers.py b/tests/integration/tls_tests/helpers.py index 3523f1ee2..09c8ebf49 100644 --- a/tests/integration/tls_tests/helpers.py +++ b/tests/integration/tls_tests/helpers.py @@ -37,24 +37,29 @@ class ProcessError(Exception): """Raised when a process fails.""" -async def mongo_tls_command(ops_test: OpsTest, app_name=None) -> str: +async def mongo_tls_command(ops_test: OpsTest, app_name=None, mongos=False) -> str: """Generates a command which verifies TLS status.""" app_name = app_name or await get_app_name(ops_test) + port = "27017" if not mongos else "27018" replica_set_hosts = [ - unit.public_address for unit in ops_test.model.applications[app_name].units + f"{unit.public_address}:{port}" for unit in ops_test.model.applications[app_name].units ] password = await get_password(ops_test, app_name=app_name) hosts = ",".join(replica_set_hosts) - replica_set_uri = f"mongodb://operator:" f"{password}@" f"{hosts}/admin?replicaSet={app_name}" + extra_args = f"?replicaSet={app_name}" if 
not mongos else "" + replica_set_uri = f"mongodb://operator:{password}@{hosts}/admin{extra_args}" + status_comand = "rs.status()" if not mongos else "sh.status()" return ( - f"{MONGO_SHELL} '{replica_set_uri}' --eval 'rs.status()'" + f"{MONGO_SHELL} '{replica_set_uri}' --eval '{status_comand}'" f" --tls --tlsCAFile {EXTERNAL_CERT_PATH}" f" --tlsCertificateKeyFile {EXTERNAL_PEM_PATH}" ) -async def check_tls(ops_test: OpsTest, unit: ops.model.Unit, enabled: bool, app_name=None) -> bool: +async def check_tls( + ops_test: OpsTest, unit: ops.model.Unit, enabled: bool, app_name=None, mongos=False +) -> bool: """Returns whether TLS is enabled on the specific PostgreSQL instance. Args: @@ -70,9 +75,12 @@ async def check_tls(ops_test: OpsTest, unit: ops.model.Unit, enabled: bool, app_ stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) ): with attempt: - mongod_tls_check = await mongo_tls_command(ops_test, app_name=app_name) + mongod_tls_check = await mongo_tls_command( + ops_test, app_name=app_name, mongos=mongos + ) check_tls_cmd = f"exec --unit {unit.name} -- {mongod_tls_check}" return_code, _, _ = await ops_test.juju(*check_tls_cmd.split()) + tls_enabled = return_code == 0 if enabled != tls_enabled: raise ValueError(