# [Consensus] improve compression and reduce compaction on blocks column family (#19770)

## Description 

- Switch the `blocks` column family from blob storage to SST storage with a large block size, to reduce disk footprint.
- Switch to universal compaction to reduce write amplification from compaction (a minimal sketch of both knobs follows this list).
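
A minimal sketch of the two knobs in isolation, using the rust-rocksdb crate directly. This is illustrative only, not the PR's code path; the cache and block sizes match the call site in the diff below:

```rust
use rocksdb::{BlockBasedOptions, Cache, DBCompactionStyle, Options};

fn blocks_cf_options_sketch() -> Options {
    let mut opts = Options::default();

    // Universal compaction merges whole sorted runs, trading temporary space
    // amplification for much lower write amplification than leveled compaction.
    opts.set_compaction_style(DBCompactionStyle::Universal);

    // Larger data blocks compress better and shrink the block index, at the
    // cost of slower point reads, which this column family rarely sees.
    let mut block_opts = BlockBasedOptions::default();
    block_opts.set_block_size(128 << 10); // 128 KiB blocks
    block_opts.set_block_cache(&Cache::new_lru_cache(512 << 20)); // 512 MiB cache
    opts.set_block_based_table_factory(&block_opts);

    opts
}
```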

## Test plan 

PT: the changes show no visible effect on network performance, with a significant (~60%) reduction in the `blocks` column family footprint.
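
A hypothetical way to verify the footprint claim, not part of the PR: sum RocksDB's file-size properties for the column family before and after the change. The cf name `"blocks"` and the blob-file property name are assumptions here.

```rust
use rocksdb::DB;

// Sums on-disk SST and blob file sizes for the `blocks` column family.
fn blocks_cf_footprint(db: &DB) -> u64 {
    let cf = db.cf_handle("blocks").expect("blocks cf exists");
    let sst = db
        .property_int_value_cf(cf, "rocksdb.total-sst-files-size")
        .ok()
        .flatten()
        .unwrap_or(0);
    let blobs = db
        .property_int_value_cf(cf, "rocksdb.total-blob-file-size")
        .ok()
        .flatten()
        .unwrap_or(0);
    sst + blobs
}
```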

---

## Release notes

Check each box that your changes affect. If none of the boxes relate to
your changes, release notes aren't required.

For each box you select, include information after the relevant heading
that describes the impact of your changes that a user might notice and
any actions they must take to implement updates.

- [ ] Protocol: 
- [ ] Nodes (Validators and Full nodes): 
- [ ] Indexer: 
- [ ] JSON-RPC: 
- [ ] GraphQL: 
- [ ] CLI: 
- [ ] Rust SDK:
- [ ] REST API:
mwtian committed Oct 14, 2024
1 parent 048381d commit 0d07e17
Showing 2 changed files with 81 additions and 11 deletions.
consensus/core/src/storage/rocksdb_store.rs (7 changes: 3 additions & 4 deletions)

@@ -54,10 +54,9 @@ impl RocksDBStore {
 (
     Self::BLOCKS_CF,
     default_db_options()
-        .optimize_for_write_throughput()
-        // Blocks can get large and they don't need to be compacted.
-        // So keep them in rocksdb blobstore.
-        .optimize_for_large_values_no_scan(1 << 10)
+        .optimize_for_write_throughput_no_deletion()
+        // Using a larger block size is ok since there are few point reads on this cf.
+        .set_block_options(512, 128 << 10)
         .options,
 ),
 (Self::DIGESTS_BY_AUTHORITIES_CF, cf_options.clone()),
crates/typed-store/src/rocks/mod.rs (85 changes: 78 additions & 7 deletions)

@@ -62,6 +62,7 @@ const DEFAULT_DB_WAL_SIZE: usize = 1024;
 // Environment variable to control behavior of write throughput optimized tables.
 const ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER: &str = "L0_NUM_FILES_COMPACTION_TRIGGER";
 const DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER: usize = 4;
+const DEFAULT_UNIVERSAL_COMPACTION_L0_NUM_FILES_COMPACTION_TRIGGER: usize = 80;
 const ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB: &str = "MAX_WRITE_BUFFER_SIZE_MB";
 const DEFAULT_MAX_WRITE_BUFFER_SIZE_MB: usize = 256;
 const ENV_VAR_MAX_WRITE_BUFFER_NUMBER: &str = "MAX_WRITE_BUFFER_NUMBER";
@@ -915,7 +916,7 @@ impl<K, V> DBMap<K, V> {
         property_name: &std::ffi::CStr,
     ) -> Result<i64, TypedStoreError> {
         match rocksdb.property_int_value_cf(cf, property_name) {
-            Ok(Some(value)) => Ok(value.try_into().unwrap()),
+            Ok(Some(value)) => Ok(value.min(i64::MAX as u64).try_into().unwrap_or_default()),
             Ok(None) => Ok(0),
             Err(e) => Err(TypedStoreError::RocksDBError(e.into_string())),
         }
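
This hunk also hardens the property read: RocksDB reports properties as `u64`, so a value above `i64::MAX` previously panicked in `unwrap()`. A standalone sketch of the old failure and the new clamping behavior:

```rust
fn main() {
    let value: u64 = u64::MAX; // e.g. a pathological property value
    // Old path: the u64 -> i64 conversion fails, so `.unwrap()` would panic.
    assert!(i64::try_from(value).is_err());
    // New path: clamp to i64::MAX first, so the conversion always succeeds.
    let clamped: i64 = value.min(i64::MAX as u64).try_into().unwrap_or_default();
    assert_eq!(clamped, i64::MAX);
}
```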
@@ -2443,7 +2444,7 @@ impl DBOptions {
     // Optimize tables with a mix of lookup and scan workloads.
     pub fn optimize_for_read(mut self, block_cache_size_mb: usize) -> DBOptions {
         self.options
-            .set_block_based_table_factory(&get_block_options(block_cache_size_mb));
+            .set_block_based_table_factory(&get_block_options(block_cache_size_mb, 16 << 10));
         self
     }

@@ -2500,6 +2501,75 @@ impl DBOptions {
         self
     }
 
+    // Optimize tables receiving significant insertions, without any deletions.
+    // TODO: merge this function with optimize_for_write_throughput(), and use a flag to
+    // indicate whether deletions are expected.
+    pub fn optimize_for_write_throughput_no_deletion(mut self) -> DBOptions {
+        // Increase the write buffer size to 256MiB.
+        let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB)
+            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB)
+            * 1024
+            * 1024;
+        self.options.set_write_buffer_size(write_buffer_size);
+        // Allow up to 6 write buffers before slowing down writes.
+        let max_write_buffer_number = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_NUMBER)
+            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_NUMBER);
+        self.options
+            .set_max_write_buffer_number(max_write_buffer_number.try_into().unwrap());
+        // Keep 1 write buffer's worth of data so recent writes can be read from memory.
+        self.options
+            .set_max_write_buffer_size_to_maintain((write_buffer_size).try_into().unwrap());
+
+        // Switch to universal compaction.
+        self.options
+            .set_compaction_style(rocksdb::DBCompactionStyle::Universal);
+        self.options.set_num_levels(1);
+        let mut compaction_options = rocksdb::UniversalCompactOptions::default();
+        compaction_options.set_max_size_amplification_percent(10000);
+        compaction_options.set_stop_style(rocksdb::UniversalCompactionStopStyle::Similar);
+        self.options
+            .set_universal_compaction_options(&compaction_options);
+
+        let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER)
+            .unwrap_or(DEFAULT_UNIVERSAL_COMPACTION_L0_NUM_FILES_COMPACTION_TRIGGER);
+        self.options.set_level_zero_file_num_compaction_trigger(
+            max_level_zero_file_num.try_into().unwrap(),
+        );
+        self.options.set_level_zero_slowdown_writes_trigger(
+            (max_level_zero_file_num * 12).try_into().unwrap(),
+        );
+        self.options
+            .set_level_zero_stop_writes_trigger((max_level_zero_file_num * 16).try_into().unwrap());
+
+        // Increase the sst file size to 128MiB.
+        self.options.set_target_file_size_base(
+            read_size_from_env(ENV_VAR_TARGET_FILE_SIZE_BASE_MB)
+                .unwrap_or(DEFAULT_TARGET_FILE_SIZE_BASE_MB) as u64
+                * 1024
+                * 1024,
+        );
+
+        // This should be a no-op for universal compaction, but increase it to be safe.
+        self.options
+            .set_max_bytes_for_level_base((write_buffer_size * max_level_zero_file_num) as u64);
+
+        self
+    }
+
+    // Overrides the block options with the given block cache size and block size.
+    pub fn set_block_options(
+        mut self,
+        block_cache_size_mb: usize,
+        block_size_bytes: usize,
+    ) -> DBOptions {
+        self.options
+            .set_block_based_table_factory(&get_block_options(
+                block_cache_size_mb,
+                block_size_bytes,
+            ));
+        self
+    }
+
     // Disables write stalling and stopping based on pending compaction bytes.
     pub fn disable_write_throttling(mut self) -> DBOptions {
         self.options.set_soft_pending_compaction_bytes_limit(0);
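
With the defaults above, compaction triggers at 80 L0 files, writes slow down at 80 × 12 = 960 files, and stop at 80 × 16 = 1280 files, so stalls should be rare in steady state. The triggers remain tunable via the environment variables the function reads; an illustrative fragment (variable names are from the diff, values are hypothetical, and `default_db_options()` is assumed in scope within the typed-store crate):

```rust
// Must be set before the options are built, since read_size_from_env()
// consults the process environment at that point.
std::env::set_var("L0_NUM_FILES_COMPACTION_TRIGGER", "40");
std::env::set_var("MAX_WRITE_BUFFER_SIZE_MB", "128");
let _opts = default_db_options().optimize_for_write_throughput_no_deletion();
```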
@@ -2551,7 +2621,9 @@ pub fn default_db_options() -> DBOptions {

     opt.set_enable_pipelined_write(true);
 
-    opt.set_block_based_table_factory(&get_block_options(128));
+    // Increase the block size to 16KiB.
+    // https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
+    opt.set_block_based_table_factory(&get_block_options(128, 16 << 10));
 
     // Set memtable bloomfilter.
     opt.set_memtable_prefix_bloom_ratio(0.02);
@@ -2562,15 +2634,14 @@
     }
 }
 
-fn get_block_options(block_cache_size_mb: usize) -> BlockBasedOptions {
+fn get_block_options(block_cache_size_mb: usize, block_size_bytes: usize) -> BlockBasedOptions {
     // Set options mostly similar to those used in optimize_for_point_lookup(),
     // except non-default binary and hash index, to hopefully reduce lookup latencies
     // without causing any regression for scanning, with slightly more memory usage.
     // https://github.com/facebook/rocksdb/blob/11cb6af6e5009c51794641905ca40ce5beec7fee/options/options.cc#L611-L621
     let mut block_options = BlockBasedOptions::default();
-    // Increase block size to 16KiB.
-    // https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
-    block_options.set_block_size(16 * 1024);
+    // Override the block size.
+    block_options.set_block_size(block_size_bytes);
     // Configure a block cache.
     block_options.set_block_cache(&Cache::new_lru_cache(block_cache_size_mb << 20));
     // Set a bloomfilter with 1% false positive rate.
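
Taken together, the call site in the first file composes the two new helpers; condensed from the diff above:

```rust
let blocks_cf_options = default_db_options()
    .optimize_for_write_throughput_no_deletion()
    // 512 MiB block cache, 128 KiB blocks.
    .set_block_options(512, 128 << 10)
    .options;
```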
