Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patch 8.6.6 #11886

Merged
merged 4 commits into from
Sep 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# Rocksdb Change Log
> NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`

## 8.6.6 (09/25/2023)
### Bug Fixes
* Fix a bug with atomic_flush=true that can cause the DB to become stuck after a flush fails (#11872).
* Fix a bug where RocksDB (with atomic_flush=false) can delete output SST files of pending flushes when a previous concurrent flush fails (#11865). This can result in DB entering read-only state with error message like `IO error: No such file or directory: While open a file for random read: /tmp/rocksdbtest-501/db_flush_test_87732_4230653031040984171/000013.sst`.
* When the compressed secondary cache capacity is reduced to 0, it should be completely disabled. Before this fix, inserts and lookups would still go to the backing `LRUCache` before returning, thus incurring locking overhead. With this fix, inserts and lookups are no-ops and do not add any overhead.

## 8.6.5 (09/15/2023)
### Bug Fixes
* Fixed a bug where `rocksdb.file.read.verify.file.checksums.micros` is not populated.
Expand Down
273 changes: 273 additions & 0 deletions db/db_flush_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3193,6 +3193,279 @@ INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest,

INSTANTIATE_TEST_CASE_P(DBAtomicFlushTest, DBAtomicFlushTest, testing::Bool());

TEST_F(DBFlushTest, NonAtomicFlushRollbackPendingFlushes) {
  // Regression test for a bug when atomic_flush=false.
  // The bug can happen as follows:
  // Start Flush0 for memtable M0 to SST0
  // Start Flush1 for memtable M1 to SST1
  // Flush1 returns OK, but don't install to MANIFEST and let whoever flushes
  // M0 to take care of it
  // Flush0 finishes with a retryable IOError
  //   - It rollbacks M0, (incorrectly) not M1
  //   - Deletes SST1 and SST2
  //
  // Auto-recovery will start Flush2 for M0, it does not pick up M1 since it
  // thinks that M1 is flushed
  // Flush2 writes SST3 and finishes OK, tries to install SST3 and SST2
  // Error opening SST2 since it's already deleted
  //
  // The fix is to let Flush0 also rollback M1.
  Options opts = CurrentOptions();
  opts.atomic_flush = false;
  // One entry per memtable so every Put seals a memtable and can trigger a
  // background flush.
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
  opts.max_write_buffer_number = 64;
  opts.max_background_flushes = 4;
  env_->SetBackgroundThreads(4, Env::HIGH);
  DestroyAndReopen(opts);
  std::atomic_int flush_count{0};
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->SetCallBack(
      "FlushJob::WriteLevel0Table:s", [&](void* s_ptr) {
        // Inject a retryable IO error into the first flush (Flush0) only,
        // then hold it until the second flush has finished.
        int c = flush_count.fetch_add(1);
        if (c == 0) {
          Status* s = static_cast<Status*>(s_ptr);
          IOStatus io_error = IOStatus::IOError("injected foobar");
          io_error.SetRetryable(true);
          *s = io_error;
          TEST_SYNC_POINT("Let mem1 flush start");
          TEST_SYNC_POINT("Wait for mem1 flush to finish");
        }
      });
  SyncPoint::GetInstance()->LoadDependency(
      {{"Let mem1 flush start", "Mem1 flush starts"},
       {"DBImpl::BGWorkFlush:done", "Wait for mem1 flush to finish"},
       {"RecoverFromRetryableBGIOError:RecoverSuccess",
        "Wait for error recover"}});
  // Need first flush to wait for the second flush to finish
  SyncPoint::GetInstance()->EnableProcessing();
  ASSERT_OK(Put(Key(1), "val1"));
  // trigger bg flush mem0
  ASSERT_OK(Put(Key(2), "val2"));
  TEST_SYNC_POINT("Mem1 flush starts");
  // trigger bg flush mem1
  ASSERT_OK(Put(Key(3), "val3"));

  TEST_SYNC_POINT("Wait for error recover");
  // The recovery flush writes both rolled-back memtables into a single L0
  // file; any leftover/deleted SST would have failed the recovery instead.
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBFlushTest, AbortNonAtomicFlushWhenBGError) {
  // Regression test for a bug when atomic_flush=false.
  // The bug can happen as follows:
  // Start Flush0 for memtable M0 to SST0
  // Start Flush1 for memtable M1 to SST1
  // Flush1 returns OK, but doesn't install output MANIFEST and let whoever
  // flushes M0 to take care of it
  // Start Flush2 for memtable M2 to SST2
  // Flush0 finishes with a retryable IOError
  //   - It rollbacks M0 AND M1
  //   - Deletes SST1 and SST2
  // Flush2 finishes, does not rollback M2,
  //   - releases the pending file number that keeps SST2 alive
  //   - deletes SST2
  //
  // Then auto-recovery starts, error opening SST2 when try to install
  // flush result
  //
  // The fix is to let Flush2 rollback M2 if it finds that
  // there is a background error.
  Options opts = CurrentOptions();
  opts.atomic_flush = false;
  // One entry per memtable so every Put seals a memtable and can trigger a
  // background flush.
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
  opts.max_write_buffer_number = 64;
  opts.max_background_flushes = 4;
  env_->SetBackgroundThreads(4, Env::HIGH);
  DestroyAndReopen(opts);
  std::atomic_int flush_count{0};
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->SetCallBack(
      "FlushJob::WriteLevel0Table:s", [&](void* s_ptr) {
        // Inject a retryable IO error into the first flush (Flush0) only,
        // then sequence the other two flushes around the failure.
        int c = flush_count.fetch_add(1);
        if (c == 0) {
          Status* s = static_cast<Status*>(s_ptr);
          IOStatus io_error = IOStatus::IOError("injected foobar");
          io_error.SetRetryable(true);
          *s = io_error;
          TEST_SYNC_POINT("Let mem1 flush start");
          TEST_SYNC_POINT("Wait for mem1 flush to finish");

          TEST_SYNC_POINT("Let mem2 flush start");
          TEST_SYNC_POINT("Wait for mem2 to start writing table");
        }
      });

  SyncPoint::GetInstance()->SetCallBack(
      "FlushJob::WriteLevel0Table", [&](void* mems) {
        // Memtable IDs start at 1; ID 3 is mem2 (the third memtable), i.e.
        // Flush2. Park it until Flush0's rollback has run.
        autovector<MemTable*>* mems_ptr = static_cast<autovector<MemTable*>*>(mems);
        if ((*mems_ptr)[0]->GetID() == 3) {
          TEST_SYNC_POINT("Mem2 flush starts writing table");
          TEST_SYNC_POINT("Mem2 flush waits until rollback");
        }
      });
  SyncPoint::GetInstance()->LoadDependency(
      {{"Let mem1 flush start", "Mem1 flush starts"},
       {"DBImpl::BGWorkFlush:done", "Wait for mem1 flush to finish"},
       {"Let mem2 flush start", "Mem2 flush starts"},
       {"Mem2 flush starts writing table",
        "Wait for mem2 to start writing table"},
       {"RollbackMemtableFlush", "Mem2 flush waits until rollback"},
       {"RecoverFromRetryableBGIOError:RecoverSuccess",
        "Wait for error recover"}});
  SyncPoint::GetInstance()->EnableProcessing();

  ASSERT_OK(Put(Key(1), "val1"));
  // trigger bg flush mem0
  ASSERT_OK(Put(Key(2), "val2"));
  TEST_SYNC_POINT("Mem1 flush starts");
  // trigger bg flush mem1
  ASSERT_OK(Put(Key(3), "val3"));

  TEST_SYNC_POINT("Mem2 flush starts");
  ASSERT_OK(Put(Key(4), "val4"));

  TEST_SYNC_POINT("Wait for error recover");
  // Recovery flush writes 3 memtables together into 1 file.
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBFlushTest, NonAtomicNormalFlushAbortWhenBGError) {
  // Verifies that with atomic_flush=false, a flush scheduled while a
  // background error is set aborts before picking a memtable, and that the
  // post-recovery catch-up flush later writes all pending memtables into a
  // single L0 file.
  Options opts = CurrentOptions();
  opts.atomic_flush = false;
  // One entry per memtable so every Put seals a memtable and can trigger a
  // background flush.
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
  opts.max_write_buffer_number = 64;
  // Single flush thread to serialize flush0, flush1 and the recovery flush.
  opts.max_background_flushes = 1;
  env_->SetBackgroundThreads(2, Env::HIGH);
  DestroyAndReopen(opts);
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
  std::atomic_int flush_write_table_count{0};
  SyncPoint::GetInstance()->SetCallBack(
      "FlushJob::WriteLevel0Table:s", [&](void* s_ptr) {
        // Fail only the first table write with a retryable IO error so a
        // background error is installed and auto-recovery kicks in.
        int c = flush_write_table_count.fetch_add(1);
        if (c == 0) {
          Status* s = static_cast<Status*>(s_ptr);
          IOStatus io_error = IOStatus::IOError("injected foobar");
          io_error.SetRetryable(true);
          *s = io_error;
        }
      });

  SyncPoint::GetInstance()->EnableProcessing();
  SyncPoint::GetInstance()->LoadDependency(
      {{"Let error recovery start",
        "RecoverFromRetryableBGIOError:BeforeStart"},
       {"RecoverFromRetryableBGIOError:RecoverSuccess",
        "Wait for error recover"}});

  ASSERT_OK(Put(Key(1), "val1"));
  // trigger bg flush0 for mem0
  ASSERT_OK(Put(Key(2), "val2"));
  // Not checking status since this wait can finish before flush starts.
  dbfull()->TEST_WaitForFlushMemTable().PermitUncheckedError();

  // trigger bg flush1 for mem1, should see bg error and abort
  // before picking a memtable to flush
  ASSERT_OK(Put(Key(3), "val3"));
  ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
  ASSERT_EQ(0, NumTableFilesAtLevel(0));

  TEST_SYNC_POINT("Let error recovery start");
  TEST_SYNC_POINT("Wait for error recover");
  // Recovery flush writes 2 memtables together into 1 file.
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
  // 1 for flush 0 and 1 for recovery flush
  ASSERT_EQ(2, flush_write_table_count);
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBFlushTest, DBStuckAfterAtomicFlushError) {
  // Test for a bug with atomic flush where DB can become stuck
  // after a flush error. A repro timeline:
  //
  // Start Flush0 for mem0
  // Start Flush1 for mem1
  // Now Flush1 will wait for Flush0 to install mem0
  // Flush0 finishes with retryable IOError, rollbacks mem0
  // Resume starts and waits for background job to finish, i.e., Flush1
  // Fill memtable again, trigger Flush2 for mem0
  // Flush2 will get error status, and not rollback mem0, see code in
  // https://github.com/facebook/rocksdb/blob/b927ba5936216861c2c35ab68f50ba4a78e65747/db/db_impl/db_impl_compaction_flush.cc#L725
  //
  // DB is stuck since mem0 can never be picked now
  //
  // The fix is to rollback mem0 in Flush2, and let Flush1 also abort upon
  // background error besides waiting for older memtables to be installed.
  // The recovery flush in this case should pick up all memtables
  // and write them to a single L0 file.
  Options opts = CurrentOptions();
  opts.atomic_flush = true;
  // One entry per memtable so every Put seals a memtable and can trigger a
  // background flush.
  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
  opts.max_write_buffer_number = 64;
  opts.max_background_flushes = 4;
  env_->SetBackgroundThreads(4, Env::HIGH);
  DestroyAndReopen(opts);

  std::atomic_int flush_count{0};
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->SetCallBack(
      "FlushJob::WriteLevel0Table:s", [&](void* s_ptr) {
        // Inject a retryable IO error into Flush0 only, and hold it until
        // Flush1 has reached its wait-to-install point.
        int c = flush_count.fetch_add(1);
        if (c == 0) {
          Status* s = static_cast<Status*>(s_ptr);
          IOStatus io_error = IOStatus::IOError("injected foobar");
          io_error.SetRetryable(true);
          *s = io_error;
          TEST_SYNC_POINT("Let flush for mem1 start");
          // Wait for Flush1 to start waiting to install flush result
          TEST_SYNC_POINT("Wait for flush for mem1");
        }
      });
  SyncPoint::GetInstance()->LoadDependency(
      {{"Let flush for mem1 start", "Flush for mem1"},
       {"DBImpl::AtomicFlushMemTablesToOutputFiles:WaitCV",
        "Wait for flush for mem1"},
       {"RecoverFromRetryableBGIOError:BeforeStart",
        "Wait for resume to start"},
       {"Recovery should continue here",
        "RecoverFromRetryableBGIOError:BeforeStart2"},
       {"RecoverFromRetryableBGIOError:RecoverSuccess",
        "Wait for error recover"}});
  SyncPoint::GetInstance()->EnableProcessing();
  ASSERT_OK(Put(Key(1), "val1"));
  // trigger Flush0 for mem0
  ASSERT_OK(Put(Key(2), "val2"));

  // trigger Flush1 for mem1
  TEST_SYNC_POINT("Flush for mem1");
  ASSERT_OK(Put(Key(3), "val3"));

  // Wait until resume started to schedule another flush
  TEST_SYNC_POINT("Wait for resume to start");
  // This flush should not be scheduled due to bg error
  ASSERT_OK(Put(Key(4), "val4"));

  // TEST_WaitForBackgroundWork() returns background error
  // after all background work is done.
  ASSERT_NOK(dbfull()->TEST_WaitForBackgroundWork());
  // Flush should abort and not writing any table
  ASSERT_EQ(0, NumTableFilesAtLevel(0));

  // Wait until this flush is done.
  TEST_SYNC_POINT("Recovery should continue here");
  TEST_SYNC_POINT("Wait for error recover");
  // error recovery can schedule new flushes, but should not
  // encounter error
  ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
}
} // namespace ROCKSDB_NAMESPACE

int main(int argc, char** argv) {
Expand Down
22 changes: 22 additions & 0 deletions db/db_impl/db_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,28 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
if (shutdown_initiated_) {
s = Status::ShutdownInProgress();
}
if (s.ok() && context.flush_after_recovery) {
  // Since we drop all non-recovery flush requests during recovery,
  // and new memtable may fill up during recovery,
  // schedule one more round of flush.
  FlushOptions flush_opts;
  flush_opts.allow_write_stall = false;
  flush_opts.wait = false;
  Status status = FlushAllColumnFamilies(
      flush_opts, FlushReason::kCatchUpAfterErrorRecovery);
  if (!status.ok()) {
    // FlushAllColumnFamilies internally should take care of setting
    // background error if needed.
    // BUGFIX: log the flush failure (`status`), not `s` -- `s` is known to
    // be OK inside this branch, so logging it would always print "OK".
    ROCKS_LOG_INFO(immutable_db_options_.info_log,
                   "The catch up flush after successful recovery failed [%s]",
                   status.ToString().c_str());
  }
  // FlushAllColumnFamilies releases and re-acquires mutex.
  if (shutdown_initiated_) {
    s = Status::ShutdownInProgress();
  }
}

if (s.ok()) {
for (auto cfd : *versions_->GetColumnFamilySet()) {
SchedulePendingCompaction(cfd);
Expand Down
Loading
Loading