# [Consensus] improve compression and reduce compaction on blocks column family (#19770)

## Description 

- Switch the `blocks` column family from blob storage to SST storage with a large block size, to reduce disk footprint.
- Switch to universal compaction to reduce write amplification from compaction (a minimal sketch of both knobs follows this list).
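
A minimal sketch of the two knobs in isolation, using the rust-rocksdb crate directly. This is illustrative only, not the PR's code path; the cache and block sizes match the call site in the diff below:

```rust
use rocksdb::{BlockBasedOptions, Cache, DBCompactionStyle, Options};

fn blocks_cf_options_sketch() -> Options {
    let mut opts = Options::default();

    // Universal compaction merges whole sorted runs, trading temporary space
    // amplification for much lower write amplification than leveled compaction.
    opts.set_compaction_style(DBCompactionStyle::Universal);

    // Larger data blocks compress better and shrink the block index, at the
    // cost of slower point reads, which this column family rarely sees.
    let mut block_opts = BlockBasedOptions::default();
    block_opts.set_block_size(128 << 10); // 128 KiB blocks
    block_opts.set_block_cache(&Cache::new_lru_cache(512 << 20)); // 512 MiB cache
    opts.set_block_based_table_factory(&block_opts);

    opts
}
```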

## Test plan 

PT: the changes show no visible effect on network performance, with a significant (~60%) reduction in the `blocks` column family footprint.
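
A hypothetical way to verify the footprint claim, not part of the PR: sum RocksDB's file-size properties for the column family before and after the change. The cf name `"blocks"` and the blob-file property name are assumptions here.

```rust
use rocksdb::DB;

// Sums on-disk SST and blob file sizes for the `blocks` column family.
fn blocks_cf_footprint(db: &DB) -> u64 {
    let cf = db.cf_handle("blocks").expect("blocks cf exists");
    let sst = db
        .property_int_value_cf(cf, "rocksdb.total-sst-files-size")
        .ok()
        .flatten()
        .unwrap_or(0);
    let blobs = db
        .property_int_value_cf(cf, "rocksdb.total-blob-file-size")
        .ok()
        .flatten()
        .unwrap_or(0);
    sst + blobs
}
```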

---

## Release notes

Check each box that your changes affect. If none of the boxes relate to
your changes, release notes aren't required.

For each box you select, include information after the relevant heading
that describes the impact of your changes that a user might notice and
any actions they must take to implement updates.

- [ ] Protocol: 
- [ ] Nodes (Validators and Full nodes): 
- [ ] Indexer: 
- [ ] JSON-RPC: 
- [ ] GraphQL: 
- [ ] CLI: 
- [ ] Rust SDK:
- [ ] REST API:
mwtian committed Oct 14, 2024
1 parent 048381d commit 0d07e17
Showing 2 changed files with 81 additions and 11 deletions.
consensus/core/src/storage/rocksdb_store.rs (7 changes: 3 additions & 4 deletions)

@@ -54,10 +54,9 @@ impl RocksDBStore {
 (
     Self::BLOCKS_CF,
     default_db_options()
-        .optimize_for_write_throughput()
-        // Blocks can get large and they don't need to be compacted.
-        // So keep them in rocksdb blobstore.
-        .optimize_for_large_values_no_scan(1 << 10)
+        .optimize_for_write_throughput_no_deletion()
+        // Using a larger block size is ok since there are few point reads on this cf.
+        .set_block_options(512, 128 << 10)
         .options,
 ),
 (Self::DIGESTS_BY_AUTHORITIES_CF, cf_options.clone()),
crates/typed-store/src/rocks/mod.rs (85 changes: 78 additions & 7 deletions)

@@ -62,6 +62,7 @@ const DEFAULT_DB_WAL_SIZE: usize = 1024;
 // Environment variable to control behavior of write throughput optimized tables.
 const ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER: &str = "L0_NUM_FILES_COMPACTION_TRIGGER";
 const DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER: usize = 4;
+const DEFAULT_UNIVERSAL_COMPACTION_L0_NUM_FILES_COMPACTION_TRIGGER: usize = 80;
 const ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB: &str = "MAX_WRITE_BUFFER_SIZE_MB";
 const DEFAULT_MAX_WRITE_BUFFER_SIZE_MB: usize = 256;
 const ENV_VAR_MAX_WRITE_BUFFER_NUMBER: &str = "MAX_WRITE_BUFFER_NUMBER";
@@ -915,7 +916,7 @@ impl<K, V> DBMap<K, V> {
         property_name: &std::ffi::CStr,
     ) -> Result<i64, TypedStoreError> {
         match rocksdb.property_int_value_cf(cf, property_name) {
-            Ok(Some(value)) => Ok(value.try_into().unwrap()),
+            Ok(Some(value)) => Ok(value.min(i64::MAX as u64).try_into().unwrap_or_default()),
             Ok(None) => Ok(0),
             Err(e) => Err(TypedStoreError::RocksDBError(e.into_string())),
         }
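
This hunk also hardens the property read: RocksDB reports properties as `u64`, so a value above `i64::MAX` previously panicked in `unwrap()`. A standalone sketch of the old failure and the new clamping behavior:

```rust
fn main() {
    let value: u64 = u64::MAX; // e.g. a pathological property value
    // Old path: the u64 -> i64 conversion fails, so `.unwrap()` would panic.
    assert!(i64::try_from(value).is_err());
    // New path: clamp to i64::MAX first, so the conversion always succeeds.
    let clamped: i64 = value.min(i64::MAX as u64).try_into().unwrap_or_default();
    assert_eq!(clamped, i64::MAX);
}
```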
@@ -2443,7 +2444,7 @@ impl DBOptions {
     // Optimize tables with a mix of lookup and scan workloads.
     pub fn optimize_for_read(mut self, block_cache_size_mb: usize) -> DBOptions {
         self.options
-            .set_block_based_table_factory(&get_block_options(block_cache_size_mb));
+            .set_block_based_table_factory(&get_block_options(block_cache_size_mb, 16 << 10));
         self
     }

@@ -2500,6 +2501,75 @@ impl DBOptions {
         self
     }
 
+    // Optimize tables receiving significant insertions, without any deletions.
+    // TODO: merge this function with optimize_for_write_throughput(), and use a flag to
+    // indicate whether deletions are expected.
+    pub fn optimize_for_write_throughput_no_deletion(mut self) -> DBOptions {
+        // Increase the write buffer size to 256MiB.
+        let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB)
+            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB)
+            * 1024
+            * 1024;
+        self.options.set_write_buffer_size(write_buffer_size);
+        // Allow up to 6 write buffers before slowing down writes.
+        let max_write_buffer_number = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_NUMBER)
+            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_NUMBER);
+        self.options
+            .set_max_write_buffer_number(max_write_buffer_number.try_into().unwrap());
+        // Keep 1 write buffer's worth of data so recent writes can be read from memory.
+        self.options
+            .set_max_write_buffer_size_to_maintain((write_buffer_size).try_into().unwrap());
+
+        // Switch to universal compaction.
+        self.options
+            .set_compaction_style(rocksdb::DBCompactionStyle::Universal);
+        self.options.set_num_levels(1);
+        let mut compaction_options = rocksdb::UniversalCompactOptions::default();
+        compaction_options.set_max_size_amplification_percent(10000);
+        compaction_options.set_stop_style(rocksdb::UniversalCompactionStopStyle::Similar);
+        self.options
+            .set_universal_compaction_options(&compaction_options);
+
+        let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER)
+            .unwrap_or(DEFAULT_UNIVERSAL_COMPACTION_L0_NUM_FILES_COMPACTION_TRIGGER);
+        self.options.set_level_zero_file_num_compaction_trigger(
+            max_level_zero_file_num.try_into().unwrap(),
+        );
+        self.options.set_level_zero_slowdown_writes_trigger(
+            (max_level_zero_file_num * 12).try_into().unwrap(),
+        );
+        self.options
+            .set_level_zero_stop_writes_trigger((max_level_zero_file_num * 16).try_into().unwrap());
+
+        // Increase the sst file size to 128MiB.
+        self.options.set_target_file_size_base(
+            read_size_from_env(ENV_VAR_TARGET_FILE_SIZE_BASE_MB)
+                .unwrap_or(DEFAULT_TARGET_FILE_SIZE_BASE_MB) as u64
+                * 1024
+                * 1024,
+        );
+
+        // This should be a no-op for universal compaction, but increase it to be safe.
+        self.options
+            .set_max_bytes_for_level_base((write_buffer_size * max_level_zero_file_num) as u64);
+
+        self
+    }
+
+    // Overrides the block options with the given block cache size and block size.
+    pub fn set_block_options(
+        mut self,
+        block_cache_size_mb: usize,
+        block_size_bytes: usize,
+    ) -> DBOptions {
+        self.options
+            .set_block_based_table_factory(&get_block_options(
+                block_cache_size_mb,
+                block_size_bytes,
+            ));
+        self
+    }
+
     // Disables write stalling and stopping based on pending compaction bytes.
     pub fn disable_write_throttling(mut self) -> DBOptions {
         self.options.set_soft_pending_compaction_bytes_limit(0);
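
With the defaults above, compaction triggers at 80 L0 files, writes slow down at 80 × 12 = 960 files, and stop at 80 × 16 = 1280 files, so stalls should be rare in steady state. The triggers remain tunable via the environment variables the function reads; an illustrative fragment (variable names are from the diff, values are hypothetical, and `default_db_options()` is assumed in scope within the typed-store crate):

```rust
// Must be set before the options are built, since read_size_from_env()
// consults the process environment at that point.
std::env::set_var("L0_NUM_FILES_COMPACTION_TRIGGER", "40");
std::env::set_var("MAX_WRITE_BUFFER_SIZE_MB", "128");
let _opts = default_db_options().optimize_for_write_throughput_no_deletion();
```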
@@ -2551,7 +2621,9 @@ pub fn default_db_options() -> DBOptions {

     opt.set_enable_pipelined_write(true);
 
-    opt.set_block_based_table_factory(&get_block_options(128));
+    // Increase the block size to 16KiB.
+    // https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
+    opt.set_block_based_table_factory(&get_block_options(128, 16 << 10));
 
     // Set memtable bloomfilter.
     opt.set_memtable_prefix_bloom_ratio(0.02);
@@ -2562,15 +2634,14 @@
     }
 }
 
-fn get_block_options(block_cache_size_mb: usize) -> BlockBasedOptions {
+fn get_block_options(block_cache_size_mb: usize, block_size_bytes: usize) -> BlockBasedOptions {
     // Set options mostly similar to those used in optimize_for_point_lookup(),
     // except non-default binary and hash index, to hopefully reduce lookup latencies
     // without causing any regression for scanning, with slightly more memory usage.
     // https://github.com/facebook/rocksdb/blob/11cb6af6e5009c51794641905ca40ce5beec7fee/options/options.cc#L611-L621
     let mut block_options = BlockBasedOptions::default();
-    // Increase block size to 16KiB.
-    // https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
-    block_options.set_block_size(16 * 1024);
+    // Override the block size.
+    block_options.set_block_size(block_size_bytes);
     // Configure a block cache.
     block_options.set_block_cache(&Cache::new_lru_cache(block_cache_size_mb << 20));
     // Set a bloomfilter with 1% false positive rate.
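
Taken together, the call site in the first file composes the two new helpers; condensed from the diff above:

```rust
let blocks_cf_options = default_db_options()
    .optimize_for_write_throughput_no_deletion()
    // 512 MiB block cache, 128 KiB blocks.
    .set_block_options(512, 128 << 10)
    .options;
```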
