From f01a1b22466ff7fa3aa03046dcd831e0ff742b44 Mon Sep 17 00:00:00 2001
From: zhangjinpeng1987
Date: Tue, 12 Dec 2017 16:44:27 +0800
Subject: [PATCH] delete range bug fix

---
 librocksdb_sys/rocksdb/db/compaction_job.cc  |  4 +-
 .../rocksdb/db/db_compaction_test.cc         | 54 +++++++++++++++++++
 librocksdb_sys/rocksdb/db/db_impl.cc         | 24 ++++-----
 librocksdb_sys/rocksdb/db/version_set.cc     | 32 ++++++-----
 .../rocksdb/include/rocksdb/convenience.h    |  3 +-
 5 files changed, 87 insertions(+), 30 deletions(-)

diff --git a/librocksdb_sys/rocksdb/db/compaction_job.cc b/librocksdb_sys/rocksdb/db/compaction_job.cc
index 1d023ca45..7419e0a54 100644
--- a/librocksdb_sys/rocksdb/db/compaction_job.cc
+++ b/librocksdb_sys/rocksdb/db/compaction_job.cc
@@ -1040,6 +1040,7 @@ Status CompactionJob::FinishCompactionOutputFile(
   auto meta = &sub_compact->current_output()->meta;
   if (s.ok()) {
     Slice lower_bound_guard, upper_bound_guard;
+    std::string smallest_user_key;
     const Slice *lower_bound, *upper_bound;
     if (sub_compact->outputs.size() == 1) {
       // For the first output table, include range tombstones before the min key
@@ -1049,7 +1050,8 @@ Status CompactionJob::FinishCompactionOutputFile(
       // For subsequent output tables, only include range tombstones from min
       // key onwards since the previous file was extended to contain range
       // tombstones falling before min key.
-      lower_bound_guard = meta->smallest.user_key();
+      smallest_user_key = meta->smallest.user_key().ToString(false /*hex*/);
+      lower_bound_guard = Slice(smallest_user_key);
       lower_bound = &lower_bound_guard;
     } else {
       lower_bound = nullptr;
diff --git a/librocksdb_sys/rocksdb/db/db_compaction_test.cc b/librocksdb_sys/rocksdb/db/db_compaction_test.cc
index ca77d5b93..93fbc0d37 100644
--- a/librocksdb_sys/rocksdb/db/db_compaction_test.cc
+++ b/librocksdb_sys/rocksdb/db/db_compaction_test.cc
@@ -1439,6 +1439,60 @@ TEST_F(DBCompactionTest, DeleteFileRange) {
   ASSERT_GT(old_num_files, new_num_files);
 }
 
+TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) {
+  // regression test for #2833: groups of files whose user-keys overlap at the
+  // endpoints could be split by `DeleteFilesInRange`. This caused old data to
+  // reappear, either because a new version of the key was removed, or a range
+  // deletion was partially dropped. It could also cause the non-overlapping
+  // invariant to be violated if the files dropped by DeleteFilesInRange were
+  // a subset of files that a range deletion spans.
+  const int kNumL0Files = 2;
+  const int kValSize = 8 << 10;  // 8KB
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.target_file_size_base = 1 << 10;  // 1KB
+  DestroyAndReopen(options);
+
+  // The snapshot prevents key 1 from having its old version dropped. The low
+  // `target_file_size_base` ensures two keys will be in each output file.
+  const Snapshot* snapshot = nullptr;
+  Random rnd(301);
+  // The value indicates which flush the key belonged to, which is enough
+  // for us to determine the keys' relative ages. After L0 flushes finish,
+  // files look like:
+  //
+  // File 0: 0 -> vals[0], 1 -> vals[0]
+  // File 1:               1 -> vals[1], 2 -> vals[1]
+  //
+  // Then L0->L1 compaction happens, which outputs keys as follows:
+  //
+  // File 0: 0 -> vals[0], 1 -> vals[1]
+  // File 1:               1 -> vals[0], 2 -> vals[1]
+  //
+  // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that
+  // would cause `1 -> vals[0]` (an older key) to reappear.
+  std::string vals[kNumL0Files];
+  for (int i = 0; i < kNumL0Files; ++i) {
+    vals[i] = RandomString(&rnd, kValSize);
+    Put(Key(i), vals[i]);
+    Put(Key(i + 1), vals[i]);
+    Flush();
+    if (i == 0) {
+      snapshot = db_->GetSnapshot();
+    }
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  // Verify `DeleteFilesInRange` can't drop only file 0 which would cause
+  // "1 -> vals[0]" to reappear.
+  std::string begin_str = Key(0), end_str = Key(1);
+  Slice begin = begin_str, end = end_str;
+  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+  ASSERT_EQ(vals[1], Get(Key(1)));
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
 TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
   int32_t trivial_move = 0;
   int32_t non_trivial_move = 0;
diff --git a/librocksdb_sys/rocksdb/db/db_impl.cc b/librocksdb_sys/rocksdb/db/db_impl.cc
index d1bfe41e8..c9f4702c3 100644
--- a/librocksdb_sys/rocksdb/db/db_impl.cc
+++ b/librocksdb_sys/rocksdb/db/db_impl.cc
@@ -2062,25 +2062,19 @@ Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family,
         end_key = &end_storage;
       }
 
-      vstorage->GetOverlappingInputs(i, begin_key, end_key, &level_files, -1,
-                                     nullptr, false);
+      vstorage->GetCleanInputsWithinInterval(i, begin_key, end_key,
+                                             &level_files, -1 /* hint_index */,
+                                             nullptr /* file_index */);
       FileMetaData* level_file;
       for (uint32_t j = 0; j < level_files.size(); j++) {
         level_file = level_files[j];
-        if (((begin == nullptr) ||
-             (cfd->internal_comparator().user_comparator()->Compare(
-                  level_file->smallest.user_key(), *begin) >= 0)) &&
-            ((end == nullptr) ||
-             (cfd->internal_comparator().user_comparator()->Compare(
-                  level_file->largest.user_key(), *end) <= 0))) {
-          if (level_file->being_compacted) {
-            continue;
-          }
-          edit.SetColumnFamily(cfd->GetID());
-          edit.DeleteFile(i, level_file->fd.GetNumber());
-          deleted_files.push_back(level_file);
-          level_file->being_compacted = true;
+        if (level_file->being_compacted) {
+          continue;
         }
+        edit.SetColumnFamily(cfd->GetID());
+        edit.DeleteFile(i, level_file->fd.GetNumber());
+        deleted_files.push_back(level_file);
+        level_file->being_compacted = true;
       }
     }
     if (edit.GetDeletedFiles().empty()) {
diff --git a/librocksdb_sys/rocksdb/db/version_set.cc b/librocksdb_sys/rocksdb/db/version_set.cc
index 628ef9ff8..da55ad57a 100644
--- a/librocksdb_sys/rocksdb/db/version_set.cc
+++ b/librocksdb_sys/rocksdb/db/version_set.cc
@@ -1777,27 +1777,33 @@ void VersionStorageInfo::GetOverlappingInputs(
 void VersionStorageInfo::GetCleanInputsWithinInterval(
     int level, const InternalKey* begin, const InternalKey* end,
     std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) const {
-  if (level >= num_non_empty_levels_) {
+  inputs->clear();
+  if (file_index) {
+    *file_index = -1;
+  }
+  if (level >= num_non_empty_levels_ || level == 0 ||
+      level_files_brief_[level].num_files == 0) {
     // this level is empty, no inputs within range
+    // also don't support clean input interval within L0
     return;
   }
 
-  inputs->clear();
   Slice user_begin, user_end;
-  if (begin != nullptr) {
+  const auto& level_files = level_files_brief_[level];
+  if (begin == nullptr) {
+    user_begin = ExtractUserKey(level_files.files[0].smallest_key);
+  } else {
     user_begin = begin->user_key();
   }
-  if (end != nullptr) {
+  if (end == nullptr) {
+    user_end = ExtractUserKey(
+        level_files.files[level_files.num_files - 1].largest_key);
+  } else {
     user_end = end->user_key();
   }
-  if (file_index) {
-    *file_index = -1;
-  }
-  if (begin != nullptr && end != nullptr && level > 0) {
-    GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs,
-                                          hint_index, file_index,
-                                          true /* within_interval */);
-  }
+  GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs,
+                                        hint_index, file_index,
+                                        true /* within_interval */);
 }
 
 // Store in "*inputs" all files in "level" that overlap [begin,end]
@@ -1860,8 +1866,8 @@
   } else {
     ExtendFileRangeOverlappingInterval(level, user_begin, user_end, mid,
                                        &start_index, &end_index);
+    assert(end_index >= start_index);
   }
-  assert(end_index >= start_index);
   // insert overlapping files into vector
   for (int i = start_index; i <= end_index; i++) {
     inputs->push_back(files_[level][i]);
diff --git a/librocksdb_sys/rocksdb/include/rocksdb/convenience.h b/librocksdb_sys/rocksdb/include/rocksdb/convenience.h
index 4a60afb11..b09ac4816 100644
--- a/librocksdb_sys/rocksdb/include/rocksdb/convenience.h
+++ b/librocksdb_sys/rocksdb/include/rocksdb/convenience.h
@@ -325,7 +325,8 @@ void CancelAllBackgroundWork(DB* db, bool wait = false);
 
 // Delete files which are entirely in the given range
 // Could leave some keys in the range which are in files which are not
-// entirely in the range.
+// entirely in the range. Also leaves L0 files regardless of whether they're
+// in the range.
 // Snapshots before the delete might not see the data in the given range.
 Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
                           const Slice* begin, const Slice* end);
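
The following is a minimal usage sketch, not part of the patch, showing what the updated convenience.h comment means in practice: DeleteFilesInRange only drops whole SST files on levels >= 1 that lie entirely inside the range, so keys held in partially overlapping files and in L0 files survive until a later compaction or DeleteRange. The DB path and key names below are arbitrary placeholders and error handling is reduced to asserts.

#include <cassert>
#include <string>

#include "rocksdb/convenience.h"
#include "rocksdb/db.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  // Placeholder path for the sketch.
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/delete_range_example", &db);
  assert(s.ok());

  // Write and flush some data so it reaches the SST files that
  // DeleteFilesInRange operates on (it never touches the memtable).
  for (int i = 0; i < 1000; ++i) {
    db->Put(rocksdb::WriteOptions(), "key" + std::to_string(i), "value");
  }
  db->Flush(rocksdb::FlushOptions());
  // Push the data out of L0; DeleteFilesInRange leaves L0 files alone.
  db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);

  // Drop whole files inside the range (keys are not zero-padded here, so the
  // exact range is illustrative only). Files that only partially overlap the
  // range, e.g. the files containing the range's endpoints, are kept, so some
  // keys inside the range may still be readable afterwards.
  rocksdb::Slice begin("key100"), end("key900");
  s = rocksdb::DeleteFilesInRange(db, db->DefaultColumnFamily(), &begin, &end);
  assert(s.ok());

  // A follow-up DeleteRange (or CompactRange) is the usual way to remove the
  // leftover keys at the range boundaries.
  s = db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(),
                      begin, end);
  assert(s.ok());

  delete db;
  return 0;
}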
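
To make the file-selection rule behind the fix concrete, here is a standalone sketch with hypothetical names; plain std::string comparison stands in for the column family's user comparator, and it folds the conditions the patched code enforces through GetCleanInputsWithinInterval into one per-file check, ignoring L0 (which the real code skips entirely) and files already being compacted.

#include <cstddef>
#include <string>
#include <vector>

// Simplified stand-in for an SST file's user-key range on a sorted level.
struct FileRange {
  std::string smallest_user_key;
  std::string largest_user_key;
};

// True if files[i] is safe for DeleteFilesInRange to drop: every user key it
// holds lies inside [begin, end], and it does not share a boundary user key
// with an adjacent file that stays behind, since that neighbor may hold other
// versions of the shared key.
bool SafeToDrop(const std::vector<FileRange>& files, std::size_t i,
                const std::string& begin, const std::string& end) {
  const FileRange& f = files[i];
  if (f.smallest_user_key < begin || f.largest_user_key > end) {
    return false;  // only partially inside the range
  }
  if (i > 0 && files[i - 1].largest_user_key == f.smallest_user_key) {
    return false;  // boundary key shared with the previous (kept) file
  }
  if (i + 1 < files.size() &&
      files[i + 1].smallest_user_key == f.largest_user_key) {
    return false;  // boundary key shared with the next (kept) file
  }
  return true;
}

Applied to the regression test's post-compaction layout, file 0 spans user keys [0, 1] and file 1 spans [1, 2]; for the range [Key(0), Key(1)], file 0 is rejected because file 1 also begins at user key 1 and still holds the older version 1 -> vals[0], which is exactly what the new test asserts.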