diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index e4e84107efb..f971bfee8ea 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -50,6 +50,7 @@ class KVIter : public Iterator { } virtual Slice key() const override { return iter_->first; } + virtual bool key_is_deleted() const override { return false; } virtual Slice value() const override { return iter_->second; } virtual Status status() const override { return Status::OK(); } diff --git a/db/db_impl.cc b/db/db_impl.cc index 49a123a1d74..eb3f8939886 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -4021,7 +4021,8 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, sv->version_number, read_options.iterate_upper_bound, - read_options.prefix_same_as_start, read_options.pin_data); + read_options.prefix_same_as_start, read_options.pin_data, + read_options.skip_deleted_keys); #endif } else { SequenceNumber latest_snapshot = versions_->LastSequence(); @@ -4133,7 +4134,8 @@ Status DBImpl::NewIterators( env_, *cfd->ioptions(), cfd->user_comparator(), iter, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, nullptr, false, read_options.pin_data)); + sv->version_number, nullptr, false, read_options.pin_data, + read_options.skip_deleted_keys)); } #endif } else { diff --git a/db/db_iter.cc b/db/db_iter.cc index 4afe610f768..19157d59eb4 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -103,7 +103,8 @@ class DBIter: public Iterator { InternalIterator* iter, SequenceNumber s, bool arena_mode, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, const Slice* iterate_upper_bound = nullptr, - bool prefix_same_as_start = false) + bool prefix_same_as_start = false, + bool skip_deleted_keys = true) : arena_mode_(arena_mode), env_(env), logger_(ioptions.info_log), @@ -118,7 +119,8 @@ class DBIter: public Iterator { version_number_(version_number), iterate_upper_bound_(iterate_upper_bound), prefix_same_as_start_(prefix_same_as_start), - iter_pinned_(false) { + iter_pinned_(false), + skip_deleted_keys_(skip_deleted_keys) { RecordTick(statistics_, NO_ITERATORS); prefix_extractor_ = ioptions.prefix_extractor; max_skip_ = max_sequential_skip_in_iterations; @@ -140,10 +142,17 @@ class DBIter: public Iterator { } } virtual bool Valid() const override { return valid_; } + virtual Slice key() const override { assert(valid_); return saved_key_.GetKey(); } + + virtual bool key_is_deleted() const override { + assert(valid_); + return saved_key_.IsDeleted(); + } + virtual Slice value() const override { assert(valid_); return (direction_ == kForward && !current_entry_is_merged_) ? @@ -207,6 +216,46 @@ class DBIter: public Iterator { virtual void SeekToFirst() override; virtual void SeekToLast() override; + private: + + // Class that wraps the IterKey and keeps track of delete status. This is + // implemented as a separate class to force users to consider the deletion + // status as a first class citizen when iterating the db. + class IterKeyWithDeletionStatus { + public: + IterKeyWithDeletionStatus() + : is_deleted_(false) {} + + Slice GetKey() const { return key_.GetKey(); } + bool IsDeleted() const { return is_deleted_; } + bool IsKeyPinned() const { return key_.IsKeyPinned(); } + + void Clear() { is_deleted_ = false; key_.Clear(); } + + void SetInternalKey(const Slice& user_key, SequenceNumber s) { + is_deleted_ = false; + key_.SetInternalKey(user_key, s); + } + + void SetInternalKey(const ParsedInternalKey& parsed_key) { + is_deleted_ = false; + key_.SetInternalKey(parsed_key); + } + + Slice SetKey(const Slice& key, bool copy, bool is_deleted) { + is_deleted_ = is_deleted; + return key_.SetKey(key, copy); + } + + void SetDeleted(bool is_deleted) { + is_deleted_ = is_deleted; + } + + private: + IterKey key_; + bool is_deleted_; + }; + private: void ReverseToBackward(); void PrevInternal(); @@ -239,7 +288,7 @@ class DBIter: public Iterator { SequenceNumber const sequence_; Status status_; - IterKey saved_key_; + IterKeyWithDeletionStatus saved_key_; std::string saved_value_; Direction direction_; bool valid_; @@ -251,6 +300,7 @@ class DBIter: public Iterator { IterKey prefix_start_; bool prefix_same_as_start_; bool iter_pinned_; + const bool skip_deleted_keys_; // List of operands for merge operator. MergeContext merge_context_; LocalStatistics local_stats_; @@ -353,7 +403,13 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { // Arrange to skip all upcoming entries for this key since // they are hidden by this deletion. saved_key_.SetKey(ikey.user_key, - !iter_->IsKeyPinned() /* copy */); + !iter_->IsKeyPinned() /* copy */, + true /* is_deleted */); + if (!skip_deleted_keys_) { + valid_ = true; + return; + } + skipping = true; num_skipped = 0; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); @@ -361,12 +417,14 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { case kTypeValue: valid_ = true; saved_key_.SetKey(ikey.user_key, - !iter_->IsKeyPinned() /* copy */); + !iter_->IsKeyPinned() /* copy */, + false /* is_deleted */); return; case kTypeMerge: // By now, we are sure the current ikey is going to yield a value saved_key_.SetKey(ikey.user_key, - !iter_->IsKeyPinned() /* copy */); + !iter_->IsKeyPinned() /* copy */, + false /* is_deleted */); current_entry_is_merged_ = true; valid_ = true; MergeValuesNewToOld(); // Go to a different state machine @@ -529,7 +587,8 @@ void DBIter::PrevInternal() { while (iter_->Valid()) { saved_key_.SetKey(ExtractUserKey(iter_->key()), - !iter_->IsKeyPinned() /* copy */); + !iter_->IsKeyPinned() /* copy */, + false /* is_deleted */); if (FindValueForCurrentKey()) { valid_ = true; if (!iter_->Valid()) { @@ -556,7 +615,9 @@ void DBIter::PrevInternal() { // This function checks, if the entry with biggest sequence_number <= sequence_ // is non kTypeDeletion or kTypeSingleDeletion. If it's not, we save value in -// saved_value_ +// saved_value_. If skip_deleted_keys_ is not set, this function will treat +// deleted keys as valid keys, except for the fact that saved_value_ will not +// be set. bool DBIter::FindValueForCurrentKey() { assert(iter_->Valid()); merge_context_.Clear(); @@ -587,6 +648,9 @@ bool DBIter::FindValueForCurrentKey() { case kTypeSingleDeletion: merge_context_.Clear(); last_not_merge_type = last_key_entry_type; + // TODO: Think about the accounting for internal_deleted_skipped_count + // if we are not skipping deleted keys. Should we update it only the + // first time we encounter a deleted key in this loop? PERF_COUNTER_ADD(internal_delete_skipped_count, 1); break; case kTypeMerge: @@ -605,10 +669,6 @@ bool DBIter::FindValueForCurrentKey() { } switch (last_key_entry_type) { - case kTypeDeletion: - case kTypeSingleDeletion: - valid_ = false; - return false; case kTypeMerge: if (last_not_merge_type == kTypeDeletion) { StopWatchNano timer(env_, statistics_ != nullptr); @@ -636,6 +696,15 @@ bool DBIter::FindValueForCurrentKey() { case kTypeValue: // do nothing - we've already has value in saved_value_ break; + case kTypeDeletion: + case kTypeSingleDeletion: + saved_key_.SetDeleted(true); + if (skip_deleted_keys_) { + valid_ = false; + return false; + } + // If we don't intend to skip deleted keys, do nothing. + break; default: assert(false); break; @@ -664,8 +733,18 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { valid_ = true; return true; } - valid_ = false; - return false; + + saved_key_.SetDeleted(true); + + if (skip_deleted_keys_) { + valid_ = false; + return false; + } + + // Control reaches here if we are dealing with a deleted key, and we do not + // want to ignore them. + valid_ = true; + return true; } // kTypeMerge. We need to collect all kTypeMerge values and save them @@ -682,6 +761,9 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { if (!iter_->Valid() || !user_comparator_->Equal(ikey.user_key, saved_key_.GetKey()) || ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion) { + // Handling for the case when we encounter a delete after processing a + // bunch of merges. In that case, we should merge all the value we saw so + // far. { StopWatchNano timer(env_, statistics_ != nullptr); PERF_TIMER_GUARD(merge_operator_time_nanos); @@ -700,6 +782,9 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return true; } + // Control reaches here if we encounter a kTypeValue after a bunch of merges. + // In that case, apply all the merges over this value, and keep the value + // in saved_value_. const Slice& val = iter_->value(); { StopWatchNano timer(env_, statistics_ != nullptr); @@ -773,7 +858,7 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) { void DBIter::Seek(const Slice& target) { StopWatch sw(env_, statistics_, DB_SEEK); saved_key_.Clear(); - // now savved_key is used to store internal key. + // now saved_key is used to store internal key. saved_key_.SetInternalKey(target, sequence_); { @@ -848,7 +933,8 @@ void DBIter::SeekToLast() { // it will seek to the last key before the // ReadOptions.iterate_upper_bound if (iter_->Valid() && iterate_upper_bound_ != nullptr) { - saved_key_.SetKey(*iterate_upper_bound_, false /* copy */); + saved_key_.SetKey(*iterate_upper_bound_, false /* copy */, + false /* is_deleted */); std::string last_key; AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), kMaxSequenceNumber, @@ -886,11 +972,12 @@ Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, const Slice* iterate_upper_bound, - bool prefix_same_as_start, bool pin_data) { + bool prefix_same_as_start, bool pin_data, + bool skip_deleted_keys) { DBIter* db_iter = new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence, false, max_sequential_skip_in_iterations, version_number, - iterate_upper_bound, prefix_same_as_start); + iterate_upper_bound, prefix_same_as_start, skip_deleted_keys); if (pin_data) { db_iter->PinData(); } @@ -914,6 +1001,9 @@ inline void ArenaWrappedDBIter::Seek(const Slice& target) { inline void ArenaWrappedDBIter::Next() { db_iter_->Next(); } inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); } inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); } +inline bool ArenaWrappedDBIter::key_is_deleted() const { + return db_iter_->key_is_deleted(); +} inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); } inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); } inline Status ArenaWrappedDBIter::PinData() { return db_iter_->PinData(); } @@ -934,14 +1024,15 @@ ArenaWrappedDBIter* NewArenaWrappedDbIterator( const Comparator* user_key_comparator, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, const Slice* iterate_upper_bound, bool prefix_same_as_start, - bool pin_data) { + bool pin_data, bool skip_deleted_keys) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); Arena* arena = iter->GetArena(); auto mem = arena->AllocateAligned(sizeof(DBIter)); DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator, nullptr, sequence, true, max_sequential_skip_in_iterations, version_number, - iterate_upper_bound, prefix_same_as_start); + iterate_upper_bound, prefix_same_as_start, + skip_deleted_keys); iter->SetDBIter(db_iter); if (pin_data) { diff --git a/db/db_iter.h b/db/db_iter.h index f239d2984b3..54e5908895e 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -30,7 +30,8 @@ extern Iterator* NewDBIterator( const Comparator* user_key_comparator, InternalIterator* internal_iter, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, const Slice* iterate_upper_bound = nullptr, - bool prefix_same_as_start = false, bool pin_data = false); + bool prefix_same_as_start = false, bool pin_data = false, + bool skip_deleted_keys = true); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed be allocated. This class is used as an entry point of @@ -59,6 +60,7 @@ class ArenaWrappedDBIter : public Iterator { virtual void Next() override; virtual void Prev() override; virtual Slice key() const override; + virtual bool key_is_deleted() const override; virtual Slice value() const override; virtual Status status() const override; @@ -78,6 +80,7 @@ extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( const Comparator* user_key_comparator, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, const Slice* iterate_upper_bound = nullptr, - bool prefix_same_as_start = false, bool pin_data = false); + bool prefix_same_as_start = false, bool pin_data = false, + bool skip_deleted_keys = true); } // namespace rocksdb diff --git a/db/db_test.cc b/db/db_test.cc index 310142de1c3..5d62286333a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3236,6 +3236,7 @@ class ModelDB : public DB { virtual Slice key() const override { return iter_->first; } virtual Slice value() const override { return iter_->second; } virtual Status status() const override { return Status::OK(); } + virtual bool key_is_deleted() const override { return false; } private: const KVMap* const map_; diff --git a/db/managed_iterator.cc b/db/managed_iterator.cc index 1d47f933df7..d466b2a1583 100644 --- a/db/managed_iterator.cc +++ b/db/managed_iterator.cc @@ -200,6 +200,11 @@ Slice ManagedIterator::value() const { return cached_value_.GetKey(); } +bool ManagedIterator::key_is_deleted() const { + assert(valid_); + return cached_key_is_deleted_; +} + Status ManagedIterator::status() const { return status_; } void ManagedIterator::RebuildIterator() { @@ -219,6 +224,7 @@ void ManagedIterator::UpdateCurrent() { status_ = Status::OK(); cached_key_.SetKey(mutable_iter_->key()); cached_value_.SetKey(mutable_iter_->value()); + cached_key_is_deleted_ = mutable_iter_->key_is_deleted(); } void ManagedIterator::ReleaseIter(bool only_old) { diff --git a/db/managed_iterator.h b/db/managed_iterator.h index d9a87596ea4..3c6084dae03 100644 --- a/db/managed_iterator.h +++ b/db/managed_iterator.h @@ -46,6 +46,7 @@ class ManagedIterator : public Iterator { virtual void Next() override; virtual Slice key() const override; virtual Slice value() const override; + virtual bool key_is_deleted() const override; virtual Status status() const override; void ReleaseIter(bool only_old); void SetDropOld(bool only_old) { @@ -73,6 +74,7 @@ class ManagedIterator : public Iterator { IterKey cached_key_; IterKey cached_value_; + bool cached_key_is_deleted_ = false; bool only_drop_old_ = true; bool snapshot_created_; diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h index 7da37ec3383..e4c7c1e1303 100644 --- a/include/rocksdb/iterator.h +++ b/include/rocksdb/iterator.h @@ -85,6 +85,11 @@ class Iterator : public Cleanable { // REQUIRES: Valid() virtual Slice key() const = 0; + // Return if the iterator points to a deleted key. Deleted keys are exposed + // only when skip_deleted_keys is set to false in ReadOptions. + // REQUIRES: Valid() + virtual bool key_is_deleted() const = 0; + // Return the value for the current entry. The underlying storage for // the returned slice is valid only until the next modification of // the iterator. diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 10984b297cd..c346bc7f885 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1447,6 +1447,12 @@ struct ReadOptions { // Default: false bool pin_data; + // Expose deleted keys through the iterator. If set to false, rocksdb will + // internally skip over deleted keys. This can be used by users who want a + // predictable upper bound for the time taken by Seek()/Next()/Prev(). + // Default: true + bool skip_deleted_keys; + ReadOptions(); ReadOptions(bool cksum, bool cache); }; diff --git a/table/iterator.cc b/table/iterator.cc index 09f7f8e6879..1578faeb4be 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -71,6 +71,7 @@ class EmptyIterator : public Iterator { assert(false); return Slice(); } + bool key_is_deleted() const override { assert(false); return false; } Slice value() const override { assert(false); return Slice(); diff --git a/util/options.cc b/util/options.cc index 1c4b63a5317..13901bc819d 100644 --- a/util/options.cc +++ b/util/options.cc @@ -778,7 +778,8 @@ ReadOptions::ReadOptions() managed(false), total_order_seek(false), prefix_same_as_start(false), - pin_data(false) { + pin_data(false), + skip_deleted_keys(true) { XFUNC_TEST("", "managed_options", managed_options, xf_manage_options, reinterpret_cast(this)); } @@ -793,7 +794,8 @@ ReadOptions::ReadOptions(bool cksum, bool cache) managed(false), total_order_seek(false), prefix_same_as_start(false), - pin_data(false) { + pin_data(false), + skip_deleted_keys(true) { XFUNC_TEST("", "managed_options", managed_options, xf_manage_options, reinterpret_cast(this)); } diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index a96123d81ce..d37fa6990f1 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -115,6 +115,8 @@ class TtlIterator : public Iterator { Slice key() const override { return iter_->key(); } + bool key_is_deleted() const override { return iter_->key_is_deleted(); } + int32_t timestamp() const { return DecodeFixed32(iter_->value().data() + iter_->value().size() - DBWithTTLImpl::kTSLength); diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 83b07f4db75..7fc1978d697 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -147,6 +147,11 @@ class BaseDeltaIterator : public Iterator { : delta_iterator_->Entry().value; } + bool key_is_deleted() const override { + return current_at_base_ ? base_iterator_->key_is_deleted() + : false /* delta_iterator */; + } + Status status() const override { if (!status_.ok()) { return status_; diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index d91482db482..22a89848d81 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -527,6 +527,7 @@ class KVIter : public Iterator { virtual Slice key() const { return iter_->first; } virtual Slice value() const { return iter_->second; } virtual Status status() const { return Status::OK(); } + virtual bool key_is_deleted() const { return false; } private: const KVMap* const map_;