diff --git a/src/v/features/CMakeLists.txt b/src/v/features/CMakeLists.txt index 449e78492abf9..b1653972f3015 100644 --- a/src/v/features/CMakeLists.txt +++ b/src/v/features/CMakeLists.txt @@ -10,6 +10,8 @@ v_cc_library( v::model v::config v::version + v::security + v::metrics ) add_dependencies(v_features kafka_codegen_headers) diff --git a/src/v/features/feature_table.cc b/src/v/features/feature_table.cc index 97599d66adc75..3d61eb974cc5c 100644 --- a/src/v/features/feature_table.cc +++ b/src/v/features/feature_table.cc @@ -14,12 +14,18 @@ #include "cluster/types.h" #include "config/node_config.h" #include "features/logger.h" +#include "metrics/metrics.h" +#include "prometheus/prometheus_sanitize.h" #include "version.h" #include +#include +#include + // The feature table is closely related to cluster and uses many types from it using namespace cluster; +using namespace std::chrono_literals; namespace features { @@ -189,6 +195,60 @@ static std::array test_extra_schema{ feature_spec::prepare_policy::always}, }; +class feature_table::probe { +public: + explicit probe(const feature_table& parent) + : _parent(parent) {} + + probe(const probe&) = delete; + probe& operator=(const probe&) = delete; + probe(probe&&) = delete; + probe& operator=(probe&&) = delete; + ~probe() noexcept = default; + + void setup_metrics() { + if (ss::this_shard_id() != 0) { + return; + } + + if (!config::shard_local_cfg().disable_metrics()) { + setup_metrics_for(_metrics); + } + + if (!config::shard_local_cfg().disable_public_metrics()) { + setup_metrics_for(_public_metrics); + } + } + + void setup_metrics_for(metrics::metric_groups_base& metrics) { + namespace sm = ss::metrics; + + static_assert( + !std::is_move_constructible_v + && !std::is_move_assignable_v + && !std::is_copy_constructible_v + && !std::is_copy_assignable_v, + "The probe captures a reference to this"); + + metrics.add_group( + prometheus_sanitize::metrics_name("cluster:features"), + { + sm::make_gauge( + "enterprise_license_expiry_sec", + [&ft = _parent]() { + return calculate_expiry_metric(ft.get_license()); + }, + sm::description("Number of seconds remaining until the " + "Enterprise license expires")) + .aggregate({sm::shard_label}), + }); + } + + const feature_table& _parent; + metrics::internal_metric_groups _metrics; + metrics::public_metric_groups _public_metrics; +}; + feature_table::feature_table() { // Intentionally undocumented environment variable, only for use // in integration tests. @@ -221,9 +281,15 @@ feature_table::feature_table() { } } } + + _probe = std::make_unique(*this); + _probe->setup_metrics(); } +feature_table::~feature_table() noexcept = default; + ss::future<> feature_table::stop() { + _probe.reset(); _as.request_abort(); // Don't trust callers to have fired their abort source in the right @@ -686,6 +752,18 @@ void feature_table::assert_compatible_version(bool override) { } } +long long feature_table::calculate_expiry_metric( + const std::optional& license, + security::license::clock::time_point now) { + if (!license) { + return -1; + } + + auto rem = license->expiration() - now; + auto rem_capped = std::max(rem.zero(), rem); + return rem_capped / 1s; +} + } // namespace features namespace cluster { diff --git a/src/v/features/feature_table.h b/src/v/features/feature_table.h index 6ebc791bd001b..e0818bd8c849a 100644 --- a/src/v/features/feature_table.h +++ b/src/v/features/feature_table.h @@ -18,6 +18,7 @@ #include "utils/waiter_queue.h" #include +#include #include #include @@ -444,6 +445,11 @@ class feature_table { static cluster::cluster_version get_earliest_logical_version(); feature_table(); + feature_table(const feature_table&) = delete; + feature_table& operator=(const feature_table&) = delete; + feature_table(feature_table&&) = delete; + feature_table& operator=(feature_table&&) = delete; + ~feature_table() noexcept; feature_state& get_state(feature f_id); const feature_state& get_state(feature f_id) const { @@ -545,7 +551,15 @@ class feature_table { // Assert out on startup if we appear to have upgraded too far void assert_compatible_version(bool); + // Visible for testing + static long long calculate_expiry_metric( + const std::optional& license, + security::license::clock::time_point now + = security::license::clock::now()); + private: + class probe; + // Only for use by our friends feature backend & manager void set_active_version( cluster::cluster_version, @@ -601,6 +615,7 @@ class feature_table { ss::gate _gate; ss::abort_source _as; + std::unique_ptr _probe; }; } // namespace features diff --git a/src/v/features/tests/feature_table_test.cc b/src/v/features/tests/feature_table_test.cc index 225f0e1dc596a..d90e97314226f 100644 --- a/src/v/features/tests/feature_table_test.cc +++ b/src/v/features/tests/feature_table_test.cc @@ -11,6 +11,7 @@ #include "features/feature_table.h" #include "features/feature_table_snapshot.h" +#include "security/license.h" #include "test_utils/fixture.h" #include "vlog.h" @@ -334,3 +335,25 @@ FIXTURE_TEST(feature_table_old_snapshot, feature_table_fixture) { ft.get_state(feature::test_alpha).get_state() == feature_state::state::active); } + +SEASTAR_THREAD_TEST_CASE(feature_table_probe_expiry_metric_test) { + using ft = features::feature_table; + const char* sample_valid_license = std::getenv("REDPANDA_SAMPLE_LICENSE"); + if (sample_valid_license == nullptr) { + const char* is_on_ci = std::getenv("CI"); + BOOST_TEST_REQUIRE( + !is_on_ci, + "Expecting the REDPANDA_SAMPLE_LICENSE env var in the CI " + "enviornment"); + return; + } + const ss::sstring license_str{sample_valid_license}; + const auto license = security::make_license(license_str); + + auto expiry = security::license::clock::time_point{4813252273s}; + + BOOST_CHECK_EQUAL(ft::calculate_expiry_metric(license, expiry - 1s), 1); + BOOST_CHECK_EQUAL(ft::calculate_expiry_metric(license, expiry), 0); + BOOST_CHECK_EQUAL(ft::calculate_expiry_metric(license, expiry + 1s), 0); + BOOST_CHECK_EQUAL(ft::calculate_expiry_metric(std::nullopt), -1); +} diff --git a/src/v/security/license.cc b/src/v/security/license.cc index 2b507c035e0ba..7a208d90fe51b 100644 --- a/src/v/security/license.cc +++ b/src/v/security/license.cc @@ -20,6 +20,11 @@ #include #include +#include +#include + +using namespace std::chrono_literals; + namespace security { namespace crypto { @@ -223,9 +228,11 @@ license make_license(const ss::sstring& raw_license) { } bool license::is_expired() const noexcept { - const auto now = std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()); - return now > expiry; + return clock::now() > expiration(); +} + +license::clock::time_point license::expiration() const noexcept { + return clock::time_point{expiry}; } } // namespace security diff --git a/src/v/security/license.h b/src/v/security/license.h index ff3af703893f1..898834ed2bd75 100644 --- a/src/v/security/license.h +++ b/src/v/security/license.h @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -64,6 +65,8 @@ inline std::ostream& operator<<(std::ostream& os, license_type lt) { struct license : serde::envelope, serde::compat_version<0>> { + using clock = std::chrono::system_clock; + /// Expected encoded contents uint8_t format_version; license_type type; @@ -81,6 +84,9 @@ struct license /// Seconds since epoch until license expiration std::chrono::seconds expires() const noexcept; + /// Expiration timepoint + clock::time_point expiration() const noexcept; + auto operator<=>(const license&) const = delete; private: diff --git a/src/v/security/tests/license_test.cc b/src/v/security/tests/license_test.cc index 2e726ba7dd8af..2a9dc64511cc3 100644 --- a/src/v/security/tests/license_test.cc +++ b/src/v/security/tests/license_test.cc @@ -13,6 +13,11 @@ #include #include #include + +#include + +using namespace std::chrono_literals; + namespace security { BOOST_AUTO_TEST_CASE(test_license_invalid_signature) { @@ -76,7 +81,10 @@ BOOST_AUTO_TEST_CASE(test_license_valid_content) { BOOST_CHECK_EQUAL(license.format_version, 0); BOOST_CHECK_EQUAL(license.type, license_type::enterprise); BOOST_CHECK_EQUAL(license.organization, "redpanda-testing"); + BOOST_CHECK(!license.is_expired()); BOOST_CHECK_EQUAL(license.expiry.count(), 4813252273); + BOOST_CHECK( + license.expiration() == license::clock::time_point{4813252273s}); BOOST_CHECK_EQUAL( license.checksum, "2730125070a934ca1067ed073d7159acc9975dc61015892308aae186f7455daf"); diff --git a/src/v/storage/tests/segment_deduplication_test.cc b/src/v/storage/tests/segment_deduplication_test.cc index 58a1e06e9c507..31671b3182252 100644 --- a/src/v/storage/tests/segment_deduplication_test.cc +++ b/src/v/storage/tests/segment_deduplication_test.cc @@ -7,6 +7,7 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0 +#include "config/configuration.h" #include "gmock/gmock.h" #include "random/generators.h" #include "storage/chunk_cache.h" @@ -158,6 +159,17 @@ TEST(FindSlidingRangeTest, TestCollectOneRecordSegments) { } TEST(BuildOffsetMap, TestBuildSimpleMap) { + ss::smp::invoke_on_all([] { + config::shard_local_cfg().disable_metrics.set_value(true); + config::shard_local_cfg().disable_public_metrics.set_value(true); + }).get(); + auto defer_config_reset = ss::defer([] { + ss::smp::invoke_on_all([] { + config::shard_local_cfg().disable_metrics.reset(); + config::shard_local_cfg().disable_public_metrics.reset(); + }).get(); + }); + storage::disk_log_builder b; build_segments(b, 3); auto cleanup = ss::defer([&] { b.stop().get(); }); diff --git a/src/v/storage/tests/storage_test_fixture.h b/src/v/storage/tests/storage_test_fixture.h index 968596e54a5ef..7b92651c23404 100644 --- a/src/v/storage/tests/storage_test_fixture.h +++ b/src/v/storage/tests/storage_test_fixture.h @@ -202,9 +202,14 @@ class storage_test_fixture { resources, feature_table) { configure_unit_test_logging(); - // avoid double metric registrations + // avoid double metric registrations - disk_log_builder and other + // helpers also start a feature_table and other structs that register + // metrics ss::smp::invoke_on_all([] { config::shard_local_cfg().get("disable_metrics").set_value(true); + config::shard_local_cfg() + .get("disable_public_metrics") + .set_value(true); config::shard_local_cfg() .get("log_segment_size_min") .set_value(std::optional{}); @@ -223,6 +228,7 @@ class storage_test_fixture { feature_table.stop().get(); ss::smp::invoke_on_all([] { config::shard_local_cfg().get("disable_metrics").reset(); + config::shard_local_cfg().get("disable_public_metrics").reset(); config::shard_local_cfg().get("log_segment_size_min").reset(); }).get(); } diff --git a/tests/rptest/tests/rpk_cluster_test.py b/tests/rptest/tests/rpk_cluster_test.py index 7d222d41fb908..a333a9866cc32 100644 --- a/tests/rptest/tests/rpk_cluster_test.py +++ b/tests/rptest/tests/rpk_cluster_test.py @@ -15,7 +15,7 @@ import json from rptest.services.cluster import cluster -from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST +from rptest.services.redpanda import RESTART_LOG_ALLOW_LIST, MetricSamples, MetricsEndpoint from rptest.util import expect_exception, get_cluster_license, get_second_cluster_license from ducktape.utils.util import wait_until from rptest.util import wait_until_result @@ -208,6 +208,25 @@ def test_cluster_down(self): else: assert False, f"Unexpected success: '{r}'" + def _get_license_expiry(self) -> int: + METRICS_NAME = "cluster_features_enterprise_license_expiry_sec" + + def get_metrics_value(metrics_endpoint: MetricsEndpoint) -> int: + metrics = self.redpanda.metrics_sample( + sample_pattern=METRICS_NAME, metrics_endpoint=metrics_endpoint) + assert isinstance(metrics, MetricSamples), \ + f'Failed to get metrics for {METRICS_NAME}' + + samples = [sample for sample in metrics.samples] + assert len(samples) == len(self.redpanda.nodes), \ + f'Invalid number of samples: {len(samples)}' + return int(samples[0].value) + + internal_val = get_metrics_value(MetricsEndpoint.METRICS) + public_val = get_metrics_value(MetricsEndpoint.PUBLIC_METRICS) + assert internal_val == public_val, f"Mismatch: {internal_val} != {public_val}" + return internal_val + @cluster(num_nodes=3) def test_upload_and_query_cluster_license_rpk(self): """ @@ -220,6 +239,12 @@ def test_upload_and_query_cluster_license_rpk(self): "Skipping test, REDPANDA_SAMPLE_LICENSE env var not found") return + wait_until(lambda: self._get_license_expiry() == -1, + timeout_sec=10, + backoff_sec=1, + retry_on_exc=True, + err_msg="Unset license should return a -1 expiry") + with tempfile.NamedTemporaryFile() as tf: tf.write(bytes(license, 'UTF-8')) tf.seek(0) @@ -237,6 +262,14 @@ def obtain_license(): retry_on_exc=True, err_msg="unable to retrieve license information") + wait_until( + lambda: self._get_license_expiry() > 0, + timeout_sec=10, + backoff_sec=1, + retry_on_exc=True, + err_msg="The expiry metric should be positive with a valid license" + ) + expected_license = { 'expires': "Jul 11 2122",