diff --git a/consensus/core/src/storage/rocksdb_store.rs b/consensus/core/src/storage/rocksdb_store.rs index edee2b3bb9779..956383b5002a9 100644 --- a/consensus/core/src/storage/rocksdb_store.rs +++ b/consensus/core/src/storage/rocksdb_store.rs @@ -54,10 +54,9 @@ impl RocksDBStore { ( Self::BLOCKS_CF, default_db_options() - .optimize_for_write_throughput() - // Blocks can get large and they don't need to be compacted. - // So keep them in rocksdb blobstore. - .optimize_for_large_values_no_scan(1 << 10) + .optimize_for_write_throughput_no_deletion() + // Using larger block is ok since there is not much point reads on the cf. + .set_block_options(512, 128 << 10) .options, ), (Self::DIGESTS_BY_AUTHORITIES_CF, cf_options.clone()), diff --git a/crates/typed-store/src/rocks/mod.rs b/crates/typed-store/src/rocks/mod.rs index b2164c42a2b65..825f375d1059b 100644 --- a/crates/typed-store/src/rocks/mod.rs +++ b/crates/typed-store/src/rocks/mod.rs @@ -62,6 +62,7 @@ const DEFAULT_DB_WAL_SIZE: usize = 1024; // Environment variable to control behavior of write throughput optimized tables. const ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER: &str = "L0_NUM_FILES_COMPACTION_TRIGGER"; const DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER: usize = 4; +const DEFAULT_UNIVERSAL_COMPACTION_L0_NUM_FILES_COMPACTION_TRIGGER: usize = 80; const ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB: &str = "MAX_WRITE_BUFFER_SIZE_MB"; const DEFAULT_MAX_WRITE_BUFFER_SIZE_MB: usize = 256; const ENV_VAR_MAX_WRITE_BUFFER_NUMBER: &str = "MAX_WRITE_BUFFER_NUMBER"; @@ -915,7 +916,7 @@ impl DBMap { property_name: &std::ffi::CStr, ) -> Result { match rocksdb.property_int_value_cf(cf, property_name) { - Ok(Some(value)) => Ok(value.try_into().unwrap()), + Ok(Some(value)) => Ok(value.min(i64::MAX as u64).try_into().unwrap_or_default()), Ok(None) => Ok(0), Err(e) => Err(TypedStoreError::RocksDBError(e.into_string())), } @@ -2443,7 +2444,7 @@ impl DBOptions { // Optimize tables with a mix of lookup and scan workloads. pub fn optimize_for_read(mut self, block_cache_size_mb: usize) -> DBOptions { self.options - .set_block_based_table_factory(&get_block_options(block_cache_size_mb)); + .set_block_based_table_factory(&get_block_options(block_cache_size_mb, 16 << 10)); self } @@ -2500,6 +2501,75 @@ impl DBOptions { self } + // Optimize tables receiving significant insertions, without any deletions. + // TODO: merge this function with optimize_for_write_throughput(), and use a flag to + // indicate if deletion is received. + pub fn optimize_for_write_throughput_no_deletion(mut self) -> DBOptions { + // Increase write buffer size to 256MiB. + let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB) + .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB) + * 1024 + * 1024; + self.options.set_write_buffer_size(write_buffer_size); + // Increase write buffers to keep to 6 before slowing down writes. + let max_write_buffer_number = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_NUMBER) + .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_NUMBER); + self.options + .set_max_write_buffer_number(max_write_buffer_number.try_into().unwrap()); + // Keep 1 write buffer so recent writes can be read from memory. + self.options + .set_max_write_buffer_size_to_maintain((write_buffer_size).try_into().unwrap()); + + // Switch to universal compactions. + self.options + .set_compaction_style(rocksdb::DBCompactionStyle::Universal); + self.options.set_num_levels(1); + let mut compaction_options = rocksdb::UniversalCompactOptions::default(); + compaction_options.set_max_size_amplification_percent(10000); + compaction_options.set_stop_style(rocksdb::UniversalCompactionStopStyle::Similar); + self.options + .set_universal_compaction_options(&compaction_options); + + let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER) + .unwrap_or(DEFAULT_UNIVERSAL_COMPACTION_L0_NUM_FILES_COMPACTION_TRIGGER); + self.options.set_level_zero_file_num_compaction_trigger( + max_level_zero_file_num.try_into().unwrap(), + ); + self.options.set_level_zero_slowdown_writes_trigger( + (max_level_zero_file_num * 12).try_into().unwrap(), + ); + self.options + .set_level_zero_stop_writes_trigger((max_level_zero_file_num * 16).try_into().unwrap()); + + // Increase sst file size to 128MiB. + self.options.set_target_file_size_base( + read_size_from_env(ENV_VAR_TARGET_FILE_SIZE_BASE_MB) + .unwrap_or(DEFAULT_TARGET_FILE_SIZE_BASE_MB) as u64 + * 1024 + * 1024, + ); + + // This should be a no-op for universal compaction but increasing it to be safe. + self.options + .set_max_bytes_for_level_base((write_buffer_size * max_level_zero_file_num) as u64); + + self + } + + // Overrides the block options with different block cache size and block size. + pub fn set_block_options( + mut self, + block_cache_size_mb: usize, + block_size_bytes: usize, + ) -> DBOptions { + self.options + .set_block_based_table_factory(&get_block_options( + block_cache_size_mb, + block_size_bytes, + )); + self + } + // Disables write stalling and stopping based on pending compaction bytes. pub fn disable_write_throttling(mut self) -> DBOptions { self.options.set_soft_pending_compaction_bytes_limit(0); @@ -2551,7 +2621,9 @@ pub fn default_db_options() -> DBOptions { opt.set_enable_pipelined_write(true); - opt.set_block_based_table_factory(&get_block_options(128)); + // Increase block size to 16KiB. + // https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks + opt.set_block_based_table_factory(&get_block_options(128, 16 << 10)); // Set memtable bloomfilter. opt.set_memtable_prefix_bloom_ratio(0.02); @@ -2562,15 +2634,14 @@ pub fn default_db_options() -> DBOptions { } } -fn get_block_options(block_cache_size_mb: usize) -> BlockBasedOptions { +fn get_block_options(block_cache_size_mb: usize, block_size_bytes: usize) -> BlockBasedOptions { // Set options mostly similar to those used in optimize_for_point_lookup(), // except non-default binary and hash index, to hopefully reduce lookup latencies // without causing any regression for scanning, with slightly more memory usages. // https://github.com/facebook/rocksdb/blob/11cb6af6e5009c51794641905ca40ce5beec7fee/options/options.cc#L611-L621 let mut block_options = BlockBasedOptions::default(); - // Increase block size to 16KiB. - // https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks - block_options.set_block_size(16 * 1024); + // Overrides block size. + block_options.set_block_size(block_size_bytes); // Configure a block cache. block_options.set_block_cache(&Cache::new_lru_cache(block_cache_size_mb << 20)); // Set a bloomfilter with 1% false positive rate.