From 39c48f42b1db70a6b0843d5d1f0c52e2c547d26a Mon Sep 17 00:00:00 2001 From: Willem Kaufmann Date: Tue, 7 May 2024 14:48:05 -0400 Subject: [PATCH] cloud_storage: respect `max_keys` in `s3_client::list_objects()` Previously, the `max_keys` value was erroneously added to the header of a `list_objects` request in the `s3_client`. This is, in fact, a URI request parameter, not a request header. See: * https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjects.html * https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html If `max_keys` is specified in a call to `remote::list_objects()`, the onus is now on the user to deal with a possibly truncated value at the call site, likely in a while loop. The idea is that the user will be able to check for this case using `list_result.is_truncated`, and then pass `list_result.next_continuation_token` to `remote::list_objects()` in future requests. --- src/v/cloud_storage/remote.cc | 14 ++++++++++++++ src/v/cloud_storage/remote.h | 13 +++++++++++-- src/v/cloud_storage_clients/s3_client.cc | 8 ++++---- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/v/cloud_storage/remote.cc b/src/v/cloud_storage/remote.cc index 9cfe1a782b53c..da284f89a7de8 100644 --- a/src/v/cloud_storage/remote.cc +++ b/src/v/cloud_storage/remote.cc @@ -1364,6 +1364,12 @@ ss::future remote::list_objects( // Gathers the items from a series of successful ListObjectsV2 calls cloud_storage_clients::client::list_bucket_result list_bucket_result; + const auto caller_handle_truncation = max_keys.has_value(); + + if (caller_handle_truncation) { + vassert(max_keys.value() > 0, "Max keys must be greater than 0."); + } + // Keep iterating while the ListObjectsV2 calls has more items to return while (!_gate.is_closed() && permit.is_allowed && !result) { auto res = co_await lease.client->list_objects( @@ -1403,6 +1409,14 @@ ss::future remote::list_objects( // Continue to list the remaining items if (items_remaining) { + // But, return early if max_keys was specified (caller will + // handle truncation) + if (caller_handle_truncation) { + list_bucket_result.is_truncated = true; + list_bucket_result.next_continuation_token + = continuation_token.value(); + co_return list_bucket_result; + } continue; } diff --git a/src/v/cloud_storage/remote.h b/src/v/cloud_storage/remote.h index dfa5b2afd10c1..38e299f3c82a5 100644 --- a/src/v/cloud_storage/remote.h +++ b/src/v/cloud_storage/remote.h @@ -448,8 +448,17 @@ class remote /// \param prefix Optional prefix to restrict listing of objects /// \param delimiter A character to use as a delimiter when grouping list /// results - /// \param item_filter Optional filter to apply to items before - /// collecting + /// \param max_keys The maximum number of keys to return. If left + /// unspecified, all object keys that fulfill the request will be collected, + /// and the result will not be truncated (truncation not allowed). If + /// specified, it will be up to the user to deal with a possibly-truncated + /// result (using list_result.is_truncated) at the call site, most likely in + /// a while loop. The continuation-token generated by that request will be + /// available through list_result.next_continuation_token for future + /// requests. It is also important to note that the value for max_keys will + /// be capped by the cloud provider default (which may vary between + /// providers, e.g AWS has a limit of 1000 keys per ListObjects request). + /// \param item_filter Optional filter to apply to items before collecting ss::future list_objects( const cloud_storage_clients::bucket_name& name, retry_chain_node& parent, diff --git a/src/v/cloud_storage_clients/s3_client.cc b/src/v/cloud_storage_clients/s3_client.cc index b1d581e1e6c3c..8356dbd619a08 100644 --- a/src/v/cloud_storage_clients/s3_client.cc +++ b/src/v/cloud_storage_clients/s3_client.cc @@ -206,6 +206,9 @@ request_creator::make_list_objects_v2_request( if (delimiter.has_value()) { key = fmt::format("{}&delimiter={}", key, *delimiter); } + if (max_keys.has_value()) { + key = fmt::format("{}&max-keys={}", key, *max_keys); + } auto target = make_target(name, object_key{key}); header.method(boost::beast::http::verb::get); header.target(target); @@ -220,9 +223,6 @@ request_creator::make_list_objects_v2_request( if (start_after) { header.insert(aws_header_names::start_after, (*start_after)().string()); } - if (max_keys) { - header.insert(aws_header_names::max_keys, std::to_string(*max_keys)); - } if (continuation_token) { header.insert( aws_header_names::continuation_token, @@ -829,7 +829,7 @@ ss::future s3_client::do_list_objects_v2( name, std::move(prefix), std::move(start_after), - max_keys, + std::move(max_keys), std::move(continuation_token), delimiter); if (!header) {