cloud_storage: enable prefetching chunks #10950
The first file in the diff (the `remote_segment` implementation):

@@ -51,8 +51,63 @@
 #include <exception>
 
+namespace {
+class bounded_stream final : public ss::data_source_impl {
+public:
+    bounded_stream(ss::input_stream<char>& stream, size_t upto)
+      : _stream{stream}
+      , _upto{upto} {}
+
+    ss::future<ss::temporary_buffer<char>> get() override {
+        auto buf = co_await _stream.read_up_to(_upto);
+        _upto -= buf.size();
+        co_return buf;
+    }
+
+private:
+    ss::input_stream<char>& _stream;
+    size_t _upto;
+};
+
+} // namespace
+
 namespace cloud_storage {
 
+class split_segment_into_chunk_range_consumer {
+public:
+    split_segment_into_chunk_range_consumer(
+      cloud_storage::remote_segment& remote_segment,
+      cloud_storage::segment_chunk_range range)
+      : _segment{remote_segment}
+      , _range{std::move(range)} {}
+
+    ss::future<uint64_t>
+    operator()(uint64_t size, ss::input_stream<char> stream) {
+        for (const auto [start, end] : _range) {
+            const auto bytes_to_read = end.value_or(_segment._size - 1) - start
+                                       + 1;
+            auto reservation = co_await _segment._cache.reserve_space(
+              bytes_to_read, 1);
+            vlog(
+              cst_log.trace,
+              "making stream from byte offset {} for {} bytes",
+              start,
+              bytes_to_read);
+            auto dsi = std::make_unique<bounded_stream>(stream, bytes_to_read);
+            auto stream_upto = ss::input_stream<char>{
+              ss::data_source{std::move(dsi)}};
+            co_await _segment.put_chunk_in_cache(
+              reservation, std::move(stream_upto), start);
+        }
+        co_await stream.close();
+        co_return size;
+    }
+
+private:
+    cloud_storage::remote_segment& _segment;
+    cloud_storage::segment_chunk_range _range;
+};
+
 std::filesystem::path
 generate_index_path(const cloud_storage::remote_segment_path& p) {
     return fmt::format("{}.index", p().native());
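The `bounded_stream` wrapper above is what lets one HTTP response body be split into several cache entries: each consumer iteration wraps the same underlying stream with a fresh byte budget, so consecutive chunks read consecutive byte ranges. A minimal plain-C++ analogue of that windowing idea, outside Seastar (the `bounded_reader` name and the `std::istream` stand-in are illustrative, not project code):

// Plain-C++ sketch of the bounded_stream idea: each reader may consume at
// most `upto` bytes from a shared stream, so readers applied in sequence
// see consecutive byte ranges.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>

class bounded_reader {
public:
    bounded_reader(std::istream& stream, std::size_t upto)
      : _stream{stream}
      , _upto{upto} {}

    // Read up to n bytes, never exceeding the remaining budget.
    std::string read(std::size_t n) {
        std::string buf(std::min(n, _upto), '\0');
        _stream.read(buf.data(), static_cast<std::streamsize>(buf.size()));
        buf.resize(static_cast<std::size_t>(_stream.gcount()));
        _upto -= buf.size();
        return buf;
    }

private:
    std::istream& _stream;
    std::size_t _upto;
};

int main() {
    std::istringstream segment{"chunk-Achunk-B"};
    bounded_reader a{segment, 7};
    bounded_reader b{segment, 7};
    std::cout << a.read(100) << '\n'; // prints "chunk-A"
    std::cout << b.read(100) << '\n'; // prints "chunk-B"
}

In the PR, the equivalents of `a` and `b` are the per-chunk `bounded_stream` instances handed to `put_chunk_in_cache`, one per `[start, end]` pair in the range.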
@@ -258,6 +313,7 @@ remote_segment::offset_data_stream(
     ss::gate::holder g(_gate);
     co_await hydrate();
     offset_index::find_result pos;
+    std::optional<uint16_t> prefetch_override = std::nullopt;
     if (first_timestamp) {
         // Time queries are linear search from front of the segment. The
         // dominant cost of a time query on a remote partition is promoting

@@ -270,6 +326,7 @@ remote_segment::offset_data_stream(
           .kaf_offset = _base_rp_offset - _base_offset_delta,
           .file_pos = 0,
         };
+        prefetch_override = 0;
     } else {
         pos = maybe_get_offsets(start).value_or(offset_index::find_result{
           .rp_offset = _base_rp_offset,

@@ -300,7 +357,8 @@ remote_segment::offset_data_stream(
           pos.kaf_offset,
           end,
           pos.file_pos,
-          std::move(options));
+          std::move(options),
+          prefetch_override);
         data_stream = ss::input_stream<char>{
           ss::data_source{std::move(chunk_ds)}};
     }
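The `prefetch_override` introduced here exists so that time queries can turn prefetching off: a timequery scans linearly from the front of the segment, so chunks prefetched past the first match would usually be wasted downloads. A hedged sketch of how an optional override can take precedence over a configured default (the function and parameter names are hypothetical, not the actual `chunk_data_source` API):

// Hypothetical helper: a per-stream override, when present, wins over the
// cluster-wide default prefetch count. A timequery passes 0 to disable
// prefetching entirely.
#include <cstdint>
#include <optional>

std::uint16_t effective_prefetch(
  std::optional<std::uint16_t> prefetch_override,
  std::uint16_t config_default) {
    return prefetch_override.value_or(config_default);
}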
@@ -407,8 +465,7 @@ ss::future<uint64_t> remote_segment::put_segment_in_cache(
     co_return size_bytes;
 }
 
-ss::future<uint64_t> remote_segment::put_chunk_in_cache(
-  uint64_t size,
+ss::future<> remote_segment::put_chunk_in_cache(
   space_reservation_guard& reservation,
   ss::input_stream<char> stream,
   chunk_start_offset_t chunk_start) {

@@ -424,8 +481,6 @@ ss::future<uint64_t> remote_segment::put_chunk_in_cache(
           put_exception);
         std::rethrow_exception(put_exception);
     }
-
-    co_return size;
 }
 
 ss::future<> remote_segment::do_hydrate_segment() {
@@ -440,13 +495,11 @@ ss::future<> remote_segment::do_hydrate_segment() {
       _bucket,
       _path,
       [this, &reservation](uint64_t size_bytes, ss::input_stream<char> s) {
-          if (is_legacy_mode_engaged()) {
-              return put_segment_in_cache_and_create_index(
-                size_bytes, reservation, std::move(s));
-          } else {
-              return put_segment_in_cache(
-                size_bytes, reservation, std::move(s));
-          }
+          // Always create the index because we are in legacy mode if we ended
+          // up hydrating the segment. Legacy mode indicates a missing index, so
+          // we create it here on the fly using the downloaded segment.
+          return put_segment_in_cache_and_create_index(
+            size_bytes, reservation, std::move(s));
       },
       local_rtc);
@@ -882,27 +935,37 @@ ss::future<> remote_segment::hydrate() {
       .discard_result();
 }
 
-ss::future<> remote_segment::hydrate_chunk(
-  chunk_start_offset_t start, std::optional<chunk_start_offset_t> end) {
-    retry_chain_node rtc{
-      cache_hydration_timeout, cache_hydration_backoff, &_rtc};
-
-    auto cache_status = co_await _cache.is_cached(get_path_to_chunk(start));
-    if (cache_status == cache_element_status::available) {
+ss::future<> remote_segment::hydrate_chunk(segment_chunk_range range) {
+    const auto start = range.first_offset();
+    const auto path_to_start = get_path_to_chunk(start);
+
+    // It is possible that the chunk has already been downloaded during a
+    // prefetch operation. In this case we skip hydration and try to materialize
+    // the chunk. This also skips the prefetch of the successive chunks. So
+    // given a series of chunks A, B, C, D, E and a prefetch of 2, when A is
+    // fetched B,C are also fetched. Then hydration of B,C are no-ops and no
+    // prefetch is done during those no-ops. When D is fetched, hydration
+    // makes an HTTP GET call and E is also prefetched. So a total of two calls
+    // are made for the five chunks (ignoring any cache evictions during the
+    // process).
+    if (const auto status = co_await _cache.is_cached(path_to_start);
+        status == cache_element_status::available) {
+        vlog(
+          _ctxlog.debug,
+          "skipping chunk hydration for chunk path {}, it is already in "
+          "cache",
+          path_to_start);
         co_return;
     }
 
-    const auto space_required = end.value_or(_size - 1) - start + 1;
-    auto reserved = co_await _cache.reserve_space(space_required, 1);
+    retry_chain_node rtc{
+      cache_hydration_timeout, cache_hydration_backoff, &_rtc};
+
+    const auto end = range.last_offset().value_or(_size - 1);
+    auto consumer = split_segment_into_chunk_range_consumer{
+      *this, std::move(range)};
     auto res = co_await _api.download_segment(
-      _bucket,
-      _path,
-      [this, start, &reserved](auto size, auto stream) {
-          return put_chunk_in_cache(size, reserved, std::move(stream), start);
-      },
-      rtc,
-      std::make_pair(start, end.value_or(_size - 1)));
+      _bucket, _path, std::move(consumer), rtc, std::make_pair(start, end));
     if (res != download_result::success) {
         throw download_exception{res, _path};
    }
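The two-GET arithmetic in the comment above (five chunks A through E, prefetch of 2) can be sanity-checked with a toy simulation. This models only the rule "skip hydration when cached, otherwise issue one GET covering the chunk plus up to `prefetch` successors", and deliberately ignores eviction and concurrency:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const std::size_t num_chunks = 5; // chunks A..E
    const std::size_t prefetch = 2;
    std::vector<bool> cached(num_chunks, false);
    std::size_t http_gets = 0;

    // A reader consumes the chunks in order.
    for (std::size_t i = 0; i < num_chunks; ++i) {
        if (cached[i]) {
            continue; // hydration is a no-op and triggers no prefetch
        }
        ++http_gets; // one ranged GET for this chunk and its successors
        for (std::size_t j = i; j < num_chunks && j <= i + prefetch; ++j) {
            cached[j] = true;
        }
    }
    std::cout << http_gets << " GET calls\n"; // prints "2 GET calls"
}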
The second file in the diff (the `segment_chunks` header):
@@ -45,8 +45,9 @@ class segment_chunks {
     // hydration. The waiters are managed per chunk in `segment_chunk::waiters`.
     // The first reader to request hydration queues the download. The next
     // readers are added to wait list.
-    ss::future<segment_chunk::handle_t>
-    hydrate_chunk(chunk_start_offset_t chunk_start);
+    ss::future<segment_chunk::handle_t> hydrate_chunk(
+      chunk_start_offset_t chunk_start,
+      std::optional<uint16_t> prefetch_override = std::nullopt);
 
     // For all chunks between first and last, increment the
     // required_by_readers_in_future value by one, and increment the
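The comment above describes a single-flight pattern: the first reader to ask for a chunk queues the download and every later reader joins a wait list for the same result. A hedged sketch of that shape, using `std::shared_future` as a stand-in for the per-chunk waiter list (`chunk_hydrator`, `handle_t`, and `download_chunk` are hypothetical; the real `segment_chunks` uses Seastar primitives and `segment_chunk::waiters`, and also cleans up and retries, which this sketch omits):

#include <cstdint>
#include <future>
#include <map>
#include <mutex>

using chunk_start_offset_t = std::uint64_t;
using handle_t = int; // stand-in for segment_chunk::handle_t

class chunk_hydrator {
public:
    std::shared_future<handle_t> hydrate(chunk_start_offset_t start) {
        std::lock_guard<std::mutex> lock{_mutex};
        auto [it, inserted] = _inflight.try_emplace(start);
        if (inserted) {
            // First reader for this chunk: kick off the download.
            it->second = std::async(std::launch::async, [start] {
                             return download_chunk(start);
                         }).share();
        }
        // Later readers share the same pending result.
        return it->second;
    }

private:
    static handle_t download_chunk(chunk_start_offset_t) { return 42; }

    std::mutex _mutex;
    std::map<chunk_start_offset_t, std::shared_future<handle_t>> _inflight;
};

int main() {
    chunk_hydrator h;
    auto f1 = h.hydrate(0);
    auto f2 = h.hydrate(0); // joins the wait list, no second download
    return f1.get() == f2.get() ? 0 : 1;
}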
@@ -76,8 +77,9 @@ class segment_chunks {
     // Attempts to download chunk into cache and return the file handle for
     // segment_chunk. Should be retried if there is a failure due to cache
     // eviction between download and opening the file handle.
-    ss::future<ss::file>
-    do_hydrate_and_materialize(chunk_start_offset_t chunk_start);
+    ss::future<ss::file> do_hydrate_and_materialize(
+      chunk_start_offset_t chunk_start,
+      std::optional<uint16_t> prefetch_override = std::nullopt);
 
     // Periodically closes chunk file handles for the space to be reclaimable by
     // cache eviction. The chunks are evicted when they are no longer opened for
@@ -167,4 +169,24 @@ std::unique_ptr<chunk_eviction_strategy> make_eviction_strategy(
   uint64_t max_chunks,
   uint64_t hydrated_chunks);
 
+class segment_chunk_range {
+public:
+    using map_t = absl::
+      btree_map<chunk_start_offset_t, std::optional<chunk_start_offset_t>>;
+
+    segment_chunk_range(
+      const segment_chunks::chunk_map_t& chunks,
+      size_t prefetch,
+      chunk_start_offset_t start);
+
+    std::optional<chunk_start_offset_t> last_offset() const;
+    chunk_start_offset_t first_offset() const;
+
+    map_t::iterator begin();
+    map_t::iterator end();
+
+private:
+    map_t _chunks;
+};
+
 } // namespace cloud_storage

Review comment on lines +172 to +173 (the `segment_chunk_range` declaration):

I might be missing something, but given the public API of this, could this be a deque of pairs?

Author reply:

Do you mean for storing the data inside this class (instead of a map), or for replacing this class entirely? If the former, then yes, it can probably be a deque of pairs. Since we only ever iterate over it and never look up keys, that should be marginally faster, but I don't see it as enough of a speedup to justify the change (traversal through a tree should still be pretty fast). If the latter, this class provides some convenience methods for calculating the bounds of the range, to decide how much space to reserve in the cache, and so on. It could be done with free functions accepting a deque of pairs, but I prefer a class.
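To make the deque-of-pairs idea from the review thread concrete, here is a hedged sketch of how such a range could be assembled from a chunk map plus a prefetch count. `make_range` and the simplified `std::map` are hypothetical stand-ins; the actual constructor takes a `segment_chunks::chunk_map_t` and stores an `absl::btree_map`:

#include <cstddef>
#include <cstdint>
#include <deque>
#include <iterator>
#include <map>
#include <optional>
#include <utility>

using chunk_start_offset_t = std::uint64_t;
using range_t = std::deque<
  std::pair<chunk_start_offset_t, std::optional<chunk_start_offset_t>>>;

// Take the requested chunk plus up to `prefetch` successors. Each entry
// ends one byte before the next chunk starts; the final chunk of the
// segment has no known end (std::nullopt), i.e. it runs to segment end.
range_t make_range(
  const std::map<chunk_start_offset_t, int>& chunks, // value type elided
  std::size_t prefetch,
  chunk_start_offset_t start) {
    range_t range;
    auto it = chunks.find(start);
    // If `start` is not a chunk boundary, the result is empty.
    for (std::size_t i = 0; i <= prefetch && it != chunks.end(); ++i) {
        auto next = std::next(it);
        std::optional<chunk_start_offset_t> end;
        if (next != chunks.end()) {
            end = next->first - 1;
        }
        range.emplace_back(it->first, end);
        it = next;
    }
    return range;
}

int main() {
    // Chunks start at byte offsets 0, 100, 200, 300.
    const std::map<chunk_start_offset_t, int> chunks{
      {0, 0}, {100, 0}, {200, 0}, {300, 0}};
    const auto range = make_range(chunks, 2, 100);
    // range holds {100, 199}, {200, 299}, {300, nullopt}.
    return range.size() == 3 ? 0 : 1;
}

Either representation supports the iteration done in `split_segment_into_chunk_range_consumer`; the class mainly buys the bundled `first_offset`/`last_offset` helpers used to size the cache reservation.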
Review comment:

the stream is not closed

Author reply:

fixed