Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patch 8.6.6 #11886

Merged
merged 4 commits into from
Sep 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# Rocksdb Change Log
> NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`

## 8.6.6 (09/25/2023)
### Bug Fixes
* Fix a bug with atomic_flush=true that can cause the DB to become stuck after a flush fails (#11872).
* Fix a bug where RocksDB (with atomic_flush=false) can delete output SST files of pending flushes when a previous concurrent flush fails (#11865). This can result in DB entering read-only state with error message like `IO error: No such file or directory: While open a file for random read: /tmp/rocksdbtest-501/db_flush_test_87732_4230653031040984171/000013.sst`.
* When the compressed secondary cache capacity is reduced to 0, it should be completely disabled. Before this fix, inserts and lookups would still go to the backing `LRUCache` before returning, thus incurring locking overhead. With this fix, inserts and lookups are no-ops and do not add any overhead.

## 8.6.5 (09/15/2023)
### Bug Fixes
* Fixed a bug where `rocksdb.file.read.verify.file.checksums.micros` is not populated.
Expand Down
273 changes: 273 additions & 0 deletions db/db_flush_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3193,6 +3193,279 @@ INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest,

INSTANTIATE_TEST_CASE_P(DBAtomicFlushTest, DBAtomicFlushTest, testing::Bool());

TEST_F(DBFlushTest, NonAtomicFlushRollbackPendingFlushes) {
  // Regression test for a bug when atomic_flush=false.
  // The bug can happen as follows:
  // Start Flush0 for memtable M0 to SST0
  // Start Flush1 for memtable M1 to SST1
  // Flush1 returns OK, but don't install to MANIFEST and let whoever flushes
  // M0 to take care of it
  // Flush0 finishes with a retryable IOError
  //   - It rollbacks M0, (incorrectly) not M1
  //   - Deletes SST1 and SST2
  //
  // Auto-recovery will start Flush2 for M0, it does not pick up M1 since it
  // thinks that M1 is flushed
  // Flush2 writes SST3 and finishes OK, tries to install SST3 and SST2
  // Error opening SST2 since it's already deleted
  //
  // The fix is to let Flush0 also rollback M1.
  Options opts = CurrentOptions();
  opts.atomic_flush = false;
  // One entry per memtable so every Put seals a memtable and can trigger a
  // background flush.
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
  opts.max_write_buffer_number = 64;
  opts.max_background_flushes = 4;
  env_->SetBackgroundThreads(4, Env::HIGH);
  DestroyAndReopen(opts);
  std::atomic_int flush_count{0};
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->SetCallBack(
      "FlushJob::WriteLevel0Table:s", [&](void* s_ptr) {
        // Inject a retryable IO error into the first flush (Flush0) only,
        // then hold it until the second flush has finished.
        int c = flush_count.fetch_add(1);
        if (c == 0) {
          Status* s = static_cast<Status*>(s_ptr);
          IOStatus io_error = IOStatus::IOError("injected foobar");
          io_error.SetRetryable(true);
          *s = io_error;
          TEST_SYNC_POINT("Let mem1 flush start");
          TEST_SYNC_POINT("Wait for mem1 flush to finish");
        }
      });
  SyncPoint::GetInstance()->LoadDependency(
      {{"Let mem1 flush start", "Mem1 flush starts"},
       {"DBImpl::BGWorkFlush:done", "Wait for mem1 flush to finish"},
       {"RecoverFromRetryableBGIOError:RecoverSuccess",
        "Wait for error recover"}});
  // Need first flush to wait for the second flush to finish
  SyncPoint::GetInstance()->EnableProcessing();
  ASSERT_OK(Put(Key(1), "val1"));
  // trigger bg flush mem0
  ASSERT_OK(Put(Key(2), "val2"));
  TEST_SYNC_POINT("Mem1 flush starts");
  // trigger bg flush mem1
  ASSERT_OK(Put(Key(3), "val3"));

  TEST_SYNC_POINT("Wait for error recover");
  // The recovery flush writes both rolled-back memtables into a single L0
  // file; any leftover/deleted SST would have failed the recovery instead.
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBFlushTest, AbortNonAtomicFlushWhenBGError) {
  // Regression test for a bug when atomic_flush=false.
  // The bug can happen as follows:
  // Start Flush0 for memtable M0 to SST0
  // Start Flush1 for memtable M1 to SST1
  // Flush1 returns OK, but doesn't install output MANIFEST and let whoever
  // flushes M0 to take care of it
  // Start Flush2 for memtable M2 to SST2
  // Flush0 finishes with a retryable IOError
  //   - It rollbacks M0 AND M1
  //   - Deletes SST1 and SST2
  // Flush2 finishes, does not rollback M2,
  //   - releases the pending file number that keeps SST2 alive
  //   - deletes SST2
  //
  // Then auto-recovery starts, error opening SST2 when try to install
  // flush result
  //
  // The fix is to let Flush2 rollback M2 if it finds that
  // there is a background error.
  Options opts = CurrentOptions();
  opts.atomic_flush = false;
  // One entry per memtable so every Put seals a memtable and can trigger a
  // background flush.
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
  opts.max_write_buffer_number = 64;
  opts.max_background_flushes = 4;
  env_->SetBackgroundThreads(4, Env::HIGH);
  DestroyAndReopen(opts);
  std::atomic_int flush_count{0};
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->SetCallBack(
      "FlushJob::WriteLevel0Table:s", [&](void* s_ptr) {
        // Inject a retryable IO error into the first flush (Flush0) only,
        // then sequence the other two flushes around the failure.
        int c = flush_count.fetch_add(1);
        if (c == 0) {
          Status* s = static_cast<Status*>(s_ptr);
          IOStatus io_error = IOStatus::IOError("injected foobar");
          io_error.SetRetryable(true);
          *s = io_error;
          TEST_SYNC_POINT("Let mem1 flush start");
          TEST_SYNC_POINT("Wait for mem1 flush to finish");

          TEST_SYNC_POINT("Let mem2 flush start");
          TEST_SYNC_POINT("Wait for mem2 to start writing table");
        }
      });

  SyncPoint::GetInstance()->SetCallBack(
      "FlushJob::WriteLevel0Table", [&](void* mems) {
        // Memtable IDs start at 1; ID 3 is mem2 (the third memtable), i.e.
        // Flush2. Park it until Flush0's rollback has run.
        autovector<MemTable*>* mems_ptr = static_cast<autovector<MemTable*>*>(mems);
        if ((*mems_ptr)[0]->GetID() == 3) {
          TEST_SYNC_POINT("Mem2 flush starts writing table");
          TEST_SYNC_POINT("Mem2 flush waits until rollback");
        }
      });
  SyncPoint::GetInstance()->LoadDependency(
      {{"Let mem1 flush start", "Mem1 flush starts"},
       {"DBImpl::BGWorkFlush:done", "Wait for mem1 flush to finish"},
       {"Let mem2 flush start", "Mem2 flush starts"},
       {"Mem2 flush starts writing table",
        "Wait for mem2 to start writing table"},
       {"RollbackMemtableFlush", "Mem2 flush waits until rollback"},
       {"RecoverFromRetryableBGIOError:RecoverSuccess",
        "Wait for error recover"}});
  SyncPoint::GetInstance()->EnableProcessing();

  ASSERT_OK(Put(Key(1), "val1"));
  // trigger bg flush mem0
  ASSERT_OK(Put(Key(2), "val2"));
  TEST_SYNC_POINT("Mem1 flush starts");
  // trigger bg flush mem1
  ASSERT_OK(Put(Key(3), "val3"));

  TEST_SYNC_POINT("Mem2 flush starts");
  ASSERT_OK(Put(Key(4), "val4"));

  TEST_SYNC_POINT("Wait for error recover");
  // Recovery flush writes 3 memtables together into 1 file.
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBFlushTest, NonAtomicNormalFlushAbortWhenBGError) {
  // Verifies that with atomic_flush=false, a flush scheduled while a
  // background error is set aborts before picking a memtable, and that the
  // post-recovery catch-up flush later writes all pending memtables into a
  // single L0 file.
  Options opts = CurrentOptions();
  opts.atomic_flush = false;
  // One entry per memtable so every Put seals a memtable and can trigger a
  // background flush.
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
  opts.max_write_buffer_number = 64;
  // Single flush thread to serialize flush0, flush1 and the recovery flush.
  opts.max_background_flushes = 1;
  env_->SetBackgroundThreads(2, Env::HIGH);
  DestroyAndReopen(opts);
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
  std::atomic_int flush_write_table_count{0};
  SyncPoint::GetInstance()->SetCallBack(
      "FlushJob::WriteLevel0Table:s", [&](void* s_ptr) {
        // Fail only the first table write with a retryable IO error so a
        // background error is installed and auto-recovery kicks in.
        int c = flush_write_table_count.fetch_add(1);
        if (c == 0) {
          Status* s = static_cast<Status*>(s_ptr);
          IOStatus io_error = IOStatus::IOError("injected foobar");
          io_error.SetRetryable(true);
          *s = io_error;
        }
      });

  SyncPoint::GetInstance()->EnableProcessing();
  SyncPoint::GetInstance()->LoadDependency(
      {{"Let error recovery start",
        "RecoverFromRetryableBGIOError:BeforeStart"},
       {"RecoverFromRetryableBGIOError:RecoverSuccess",
        "Wait for error recover"}});

  ASSERT_OK(Put(Key(1), "val1"));
  // trigger bg flush0 for mem0
  ASSERT_OK(Put(Key(2), "val2"));
  // Not checking status since this wait can finish before flush starts.
  dbfull()->TEST_WaitForFlushMemTable().PermitUncheckedError();

  // trigger bg flush1 for mem1, should see bg error and abort
  // before picking a memtable to flush
  ASSERT_OK(Put(Key(3), "val3"));
  ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
  ASSERT_EQ(0, NumTableFilesAtLevel(0));

  TEST_SYNC_POINT("Let error recovery start");
  TEST_SYNC_POINT("Wait for error recover");
  // Recovery flush writes 2 memtables together into 1 file.
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
  // 1 for flush 0 and 1 for recovery flush
  ASSERT_EQ(2, flush_write_table_count);
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBFlushTest, DBStuckAfterAtomicFlushError) {
  // Test for a bug with atomic flush where DB can become stuck
  // after a flush error. A repro timeline:
  //
  // Start Flush0 for mem0
  // Start Flush1 for mem1
  // Now Flush1 will wait for Flush0 to install mem0
  // Flush0 finishes with retryable IOError, rollbacks mem0
  // Resume starts and waits for background job to finish, i.e., Flush1
  // Fill memtable again, trigger Flush2 for mem0
  // Flush2 will get error status, and not rollback mem0, see code in
  // https://github.com/facebook/rocksdb/blob/b927ba5936216861c2c35ab68f50ba4a78e65747/db/db_impl/db_impl_compaction_flush.cc#L725
  //
  // DB is stuck since mem0 can never be picked now
  //
  // The fix is to rollback mem0 in Flush2, and let Flush1 also abort upon
  // background error besides waiting for older memtables to be installed.
  // The recovery flush in this case should pick up all memtables
  // and write them to a single L0 file.
  Options opts = CurrentOptions();
  opts.atomic_flush = true;
  // One entry per memtable so every Put seals a memtable and can trigger a
  // background flush.
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
  opts.max_write_buffer_number = 64;
  opts.max_background_flushes = 4;
  env_->SetBackgroundThreads(4, Env::HIGH);
  DestroyAndReopen(opts);

  std::atomic_int flush_count{0};
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->SetCallBack(
      "FlushJob::WriteLevel0Table:s", [&](void* s_ptr) {
        // Inject a retryable IO error into Flush0 only, and hold it until
        // Flush1 has reached its wait-to-install point.
        int c = flush_count.fetch_add(1);
        if (c == 0) {
          Status* s = static_cast<Status*>(s_ptr);
          IOStatus io_error = IOStatus::IOError("injected foobar");
          io_error.SetRetryable(true);
          *s = io_error;
          TEST_SYNC_POINT("Let flush for mem1 start");
          // Wait for Flush1 to start waiting to install flush result
          TEST_SYNC_POINT("Wait for flush for mem1");
        }
      });
  SyncPoint::GetInstance()->LoadDependency(
      {{"Let flush for mem1 start", "Flush for mem1"},
       {"DBImpl::AtomicFlushMemTablesToOutputFiles:WaitCV",
        "Wait for flush for mem1"},
       {"RecoverFromRetryableBGIOError:BeforeStart",
        "Wait for resume to start"},
       {"Recovery should continue here",
        "RecoverFromRetryableBGIOError:BeforeStart2"},
       {"RecoverFromRetryableBGIOError:RecoverSuccess",
        "Wait for error recover"}});
  SyncPoint::GetInstance()->EnableProcessing();
  ASSERT_OK(Put(Key(1), "val1"));
  // trigger Flush0 for mem0
  ASSERT_OK(Put(Key(2), "val2"));

  // trigger Flush1 for mem1
  TEST_SYNC_POINT("Flush for mem1");
  ASSERT_OK(Put(Key(3), "val3"));

  // Wait until resume started to schedule another flush
  TEST_SYNC_POINT("Wait for resume to start");
  // This flush should not be scheduled due to bg error
  ASSERT_OK(Put(Key(4), "val4"));

  // TEST_WaitForBackgroundWork() returns background error
  // after all background work is done.
  ASSERT_NOK(dbfull()->TEST_WaitForBackgroundWork());
  // Flush should abort and not writing any table
  ASSERT_EQ(0, NumTableFilesAtLevel(0));

  // Wait until this flush is done.
  TEST_SYNC_POINT("Recovery should continue here");
  TEST_SYNC_POINT("Wait for error recover");
  // error recovery can schedule new flushes, but should not
  // encounter error
  ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
}
} // namespace ROCKSDB_NAMESPACE

int main(int argc, char** argv) {
Expand Down
22 changes: 22 additions & 0 deletions db/db_impl/db_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,28 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
if (shutdown_initiated_) {
s = Status::ShutdownInProgress();
}
if (s.ok() && context.flush_after_recovery) {
  // Since we drop all non-recovery flush requests during recovery,
  // and new memtable may fill up during recovery,
  // schedule one more round of flush.
  FlushOptions flush_opts;
  flush_opts.allow_write_stall = false;
  flush_opts.wait = false;
  Status status = FlushAllColumnFamilies(
      flush_opts, FlushReason::kCatchUpAfterErrorRecovery);
  if (!status.ok()) {
    // FlushAllColumnFamilies internally should take care of setting
    // background error if needed.
    // BUGFIX: log the flush failure (`status`), not `s` -- `s` is known to
    // be OK inside this branch, so logging it would always print "OK".
    ROCKS_LOG_INFO(immutable_db_options_.info_log,
                   "The catch up flush after successful recovery failed [%s]",
                   status.ToString().c_str());
  }
  // FlushAllColumnFamilies releases and re-acquires mutex.
  if (shutdown_initiated_) {
    s = Status::ShutdownInProgress();
  }
}

if (s.ok()) {
for (auto cfd : *versions_->GetColumnFamilySet()) {
SchedulePendingCompaction(cfd);
Expand Down
Loading
Loading