diff --git a/c-deps/libroach/db.cc b/c-deps/libroach/db.cc index d839656a9cda..2fa191e72ce3 100644 --- a/c-deps/libroach/db.cc +++ b/c-deps/libroach/db.cc @@ -15,6 +15,7 @@ #include "db.h" #include #include +#include #include #include #include @@ -57,6 +58,23 @@ DBKey ToDBKey(const rocksdb::Slice& s) { return key; } +ScopedStats::ScopedStats(DBIterator* iter) + : iter_(iter), + internal_delete_skipped_count_base_( + rocksdb::get_perf_context()->internal_delete_skipped_count) { + if (iter_->stats != nullptr) { + rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex); + } +} +ScopedStats::~ScopedStats() { + if (iter_->stats != nullptr) { + iter_->stats->internal_delete_skipped_count += + (rocksdb::get_perf_context()->internal_delete_skipped_count - + internal_delete_skipped_count_base_); + rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable); + } +} + } // namespace cockroach namespace { @@ -74,6 +92,7 @@ DBIterState DBIterGetState(DBIterator* iter) { state.value = ToDBSlice(iter->rep->value()); } } + return state; } @@ -370,53 +389,94 @@ DBStatus DBEnvWriteFile(DBEngine* db, DBSlice path, DBSlice contents) { return db->EnvWriteFile(path, contents); } -DBIterator* DBNewIter(DBEngine* db, bool prefix) { +DBIterator* DBNewIter(DBEngine* db, bool prefix, bool stats) { rocksdb::ReadOptions opts; opts.prefix_same_as_start = prefix; opts.total_order_seek = !prefix; - return db->NewIter(&opts); + auto db_iter = db->NewIter(&opts); + + if (stats) { + db_iter->stats.reset(new IteratorStats); + *db_iter->stats = {}; + } + + return db_iter; } -DBIterator* DBNewTimeBoundIter(DBEngine* db, DBTimestamp min_ts, DBTimestamp max_ts) { +DBIterator* DBNewTimeBoundIter(DBEngine* db, DBTimestamp min_ts, DBTimestamp max_ts, + bool with_stats) { + IteratorStats* stats = nullptr; + if (with_stats) { + stats = new IteratorStats; + *stats = {}; + } + const std::string min = EncodeTimestamp(min_ts); const std::string max = EncodeTimestamp(max_ts); rocksdb::ReadOptions opts; opts.total_order_seek = true; - opts.table_filter = [min, max](const rocksdb::TableProperties& props) { + opts.table_filter = [min, max, stats](const rocksdb::TableProperties& props) { auto userprops = props.user_collected_properties; auto tbl_min = userprops.find("crdb.ts.min"); if (tbl_min == userprops.end() || tbl_min->second.empty()) { + if (stats != nullptr) { + ++stats->timebound_num_ssts; + } return true; } auto tbl_max = userprops.find("crdb.ts.max"); if (tbl_max == userprops.end() || tbl_max->second.empty()) { + if (stats != nullptr) { + ++stats->timebound_num_ssts; + } return true; } // If the timestamp range of the table overlaps with the timestamp range we // want to iterate, the table might contain timestamps we care about. - return max.compare(tbl_min->second) >= 0 && min.compare(tbl_max->second) <= 0; + bool used = max.compare(tbl_min->second) >= 0 && min.compare(tbl_max->second) <= 0; + if (used && stats != nullptr) { + ++stats->timebound_num_ssts; + } + return used; }; - return db->NewIter(&opts); + + auto db_iter = db->NewIter(&opts); + if (stats != nullptr) { + db_iter->stats.reset(stats); + } + return db_iter; } void DBIterDestroy(DBIterator* iter) { delete iter; } +IteratorStats DBIterStats(DBIterator* iter) { + IteratorStats stats = {}; + if (iter->stats != nullptr) { + stats = *iter->stats; + } + return stats; +} + DBIterState DBIterSeek(DBIterator* iter, DBKey key) { + ScopedStats stats(iter); iter->rep->Seek(EncodeKey(key)); return DBIterGetState(iter); } DBIterState DBIterSeekToFirst(DBIterator* iter) { + ScopedStats stats(iter); iter->rep->SeekToFirst(); return DBIterGetState(iter); } DBIterState DBIterSeekToLast(DBIterator* iter) { + ScopedStats stats(iter); iter->rep->SeekToLast(); return DBIterGetState(iter); } DBIterState DBIterNext(DBIterator* iter, bool skip_current_key_versions) { + ScopedStats stats(iter); // If we're skipping the current key versions, remember the key the // iterator was pointing out. std::string old_key; @@ -459,6 +519,7 @@ DBIterState DBIterNext(DBIterator* iter, bool skip_current_key_versions) { } DBIterState DBIterPrev(DBIterator* iter, bool skip_current_key_versions) { + ScopedStats stats(iter); // If we're skipping the current key versions, remember the key the // iterator was pointed out. std::string old_key; diff --git a/c-deps/libroach/db.h b/c-deps/libroach/db.h index 2b2e343b93ef..452e1b3e2803 100644 --- a/c-deps/libroach/db.h +++ b/c-deps/libroach/db.h @@ -71,4 +71,17 @@ std::string EncodeKey(DBKey k); MVCCStatsResult MVCCComputeStatsInternal(::rocksdb::Iterator* const iter_rep, DBKey start, DBKey end, int64_t now_nanos); +// ScopedStats wraps an iterator and, if that iterator has the stats +// member populated, aggregates a subset of the RocksDB perf counters +// into it (while the ScopedStats is live). +class ScopedStats { + public: + ScopedStats(DBIterator*); + ~ScopedStats(); + + private: + DBIterator* const iter_; + uint64_t internal_delete_skipped_count_base_; +}; + } // namespace cockroach diff --git a/c-deps/libroach/include/libroach.h b/c-deps/libroach/include/libroach.h index 11cf27f9f92c..cb4fb265eda4 100644 --- a/c-deps/libroach/include/libroach.h +++ b/c-deps/libroach/include/libroach.h @@ -175,11 +175,14 @@ DBEngine* DBNewBatch(DBEngine* db, bool writeOnly); // the user-key prefix of the key supplied to DBIterSeek() to restrict // which sstables are searched, but iteration (using Next) over keys // without the same user-key prefix will not work correctly (keys may -// be skipped). It is the callers responsibility to call -// DBIterDestroy(). -DBIterator* DBNewIter(DBEngine* db, bool prefix); +// be skipped). When stats is true, the iterator will collect RocksDB +// performance counters which can be retrieved via `DBIterStats`. +// +// It is the caller's responsibility to call DBIterDestroy(). +DBIterator* DBNewIter(DBEngine* db, bool prefix, bool stats); -DBIterator* DBNewTimeBoundIter(DBEngine* db, DBTimestamp min_ts, DBTimestamp max_ts); +DBIterator* DBNewTimeBoundIter(DBEngine* db, DBTimestamp min_ts, DBTimestamp max_ts, + bool with_stats); // Destroys an iterator, freeing up any associated memory. void DBIterDestroy(DBIterator* iter); @@ -187,6 +190,20 @@ void DBIterDestroy(DBIterator* iter); // Positions the iterator at the first key that is >= "key". DBIterState DBIterSeek(DBIterator* iter, DBKey key); +typedef struct { + uint64_t internal_delete_skipped_count; + // the number of SSTables touched (only for time bound iterators). + // This field is populated from the table filter, not from the + // RocksDB perf counters. + // + // TODO(tschottdorf): populate this field for all iterators. + uint64_t timebound_num_ssts; + // New fields added here must also be added in various other places; + // just grep the repo for internal_delete_skipped_count. Sorry. +} IteratorStats; + +IteratorStats DBIterStats(DBIterator* iter); + // Positions the iterator at the first key in the database. DBIterState DBIterSeekToFirst(DBIterator* iter); diff --git a/c-deps/libroach/iterator.h b/c-deps/libroach/iterator.h index b6322c96aeb1..8080b2fe5e84 100644 --- a/c-deps/libroach/iterator.h +++ b/c-deps/libroach/iterator.h @@ -29,4 +29,5 @@ struct DBIterator { std::unique_ptr rep; std::unique_ptr kvs; std::unique_ptr intents; + std::unique_ptr stats; }; diff --git a/c-deps/libroach/mvcc.cc b/c-deps/libroach/mvcc.cc index 734063fd08d3..8f33dfc02932 100644 --- a/c-deps/libroach/mvcc.cc +++ b/c-deps/libroach/mvcc.cc @@ -292,6 +292,7 @@ DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTx // don't retrieve a key different than the start key. This is a bit // of a hack. const DBSlice end = {0, 0}; + ScopedStats scoped_iter(iter); mvccForwardScanner scanner(iter, key, end, timestamp, 0 /* max_keys */, txn, consistent, tombstones); return scanner.get(); @@ -300,6 +301,7 @@ DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTx DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp, int64_t max_keys, DBTxn txn, bool consistent, bool reverse, bool tombstones) { + ScopedStats scoped_iter(iter); if (reverse) { mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, txn, consistent, tombstones); return scanner.scan(); diff --git a/pkg/ccl/storageccl/engineccl/bench_test.go b/pkg/ccl/storageccl/engineccl/bench_test.go index c0812f834a17..48a7edf9a769 100644 --- a/pkg/ccl/storageccl/engineccl/bench_test.go +++ b/pkg/ccl/storageccl/engineccl/bench_test.go @@ -173,7 +173,7 @@ func BenchmarkTimeBoundIterate(b *testing.B) { }) b.Run("TimeBoundIterator", func(b *testing.B) { runIterate(b, loadFactor, func(e engine.Engine, startTime, endTime hlc.Timestamp) engine.Iterator { - return e.NewTimeBoundIterator(startTime, endTime) + return e.NewTimeBoundIterator(startTime, endTime, false) }) }) }) diff --git a/pkg/ccl/storageccl/engineccl/mvcc.go b/pkg/ccl/storageccl/engineccl/mvcc.go index 28ef2dad59c1..cc9eb8b5cd14 100644 --- a/pkg/ccl/storageccl/engineccl/mvcc.go +++ b/pkg/ccl/storageccl/engineccl/mvcc.go @@ -66,7 +66,9 @@ func NewMVCCIncrementalIterator( // interval into the fully-inclusive [start, end] interval that // NewTimeBoundIterator expects. This is strictly a performance // optimization; omitting the call would still return correct results. - iter: e.NewTimeBoundIterator(startTime.Next(), endTime), + // + // TODO(tschottdorf): plumb withStats in when needed. + iter: e.NewTimeBoundIterator(startTime.Next(), endTime, false /* withStats */), startTime: startTime, endTime: endTime, } diff --git a/pkg/sql/logictest/testdata/logic_test/show_trace b/pkg/sql/logictest/testdata/logic_test/show_trace index 1ca9ba203af3..c5a448952d56 100644 --- a/pkg/sql/logictest/testdata/logic_test/show_trace +++ b/pkg/sql/logictest/testdata/logic_test/show_trace @@ -825,3 +825,11 @@ SELECT message FROM [SHOW TRACE FOR SESSION] WHERE message LIKE e'%1 CPut, 1 Beg ---- r1: sending batch 1 CPut, 1 BeginTxn, 1 EndTxn to (n1,s1):1 1 CPut, 1 BeginTxn, 1 EndTxn + +statement ok +CREATE TABLE t.enginestats(k INT PRIMARY KEY, v INT) + +query T +SELECT message FROM [ SHOW TRACE FOR SELECT * FROM t.enginestats ] WHERE message LIKE '%InternalDelete%' +---- +engine stats: {InternalDeleteSkippedCount:0 TimeBoundNumSSTs:0} diff --git a/pkg/storage/batcheval/cmd_refresh_range.go b/pkg/storage/batcheval/cmd_refresh_range.go index 2b3f97e8c854..f58cbe948e89 100644 --- a/pkg/storage/batcheval/cmd_refresh_range.go +++ b/pkg/storage/batcheval/cmd_refresh_range.go @@ -44,7 +44,7 @@ func RefreshRange( // Use a time-bounded iterator to avoid unnecessarily iterating over // older data. - iter := batch.NewTimeBoundIterator(h.Txn.OrigTimestamp, h.Txn.Timestamp) + iter := batch.NewTimeBoundIterator(h.Txn.OrigTimestamp, h.Txn.Timestamp, false) defer iter.Close() // Iterate over values until we discover any value written at or // after the original timestamp, but before or at the current diff --git a/pkg/storage/batcheval/cmd_resolve_intent_range.go b/pkg/storage/batcheval/cmd_resolve_intent_range.go index 3222272a3fad..4c94769f94b5 100644 --- a/pkg/storage/batcheval/cmd_resolve_intent_range.go +++ b/pkg/storage/batcheval/cmd_resolve_intent_range.go @@ -56,7 +56,7 @@ func ResolveIntentRange( // Use a time-bounded iterator as an optimization if indicated. var iterAndBuf engine.IterAndBuf if args.MinTimestamp != (hlc.Timestamp{}) { - iter := batch.NewTimeBoundIterator(args.MinTimestamp, args.IntentTxn.Timestamp) + iter := batch.NewTimeBoundIterator(args.MinTimestamp, args.IntentTxn.Timestamp, false) iterAndBuf = engine.GetBufUsingIter(iter) } else { iterAndBuf = engine.GetIterAndBuf(batch) diff --git a/pkg/storage/engine/batch_test.go b/pkg/storage/engine/batch_test.go index 96a2f01c65c1..57c5ce8cd5dc 100644 --- a/pkg/storage/engine/batch_test.go +++ b/pkg/storage/engine/batch_test.go @@ -175,7 +175,7 @@ func TestReadOnlyBasics(t *testing.T) { func() { _, _, _, _ = b.GetProto(a, getVal) }, func() { _ = b.Iterate(a, a, func(MVCCKeyValue) (bool, error) { return true, nil }) }, func() { b.NewIterator(IterOptions{}).Close() }, - func() { b.NewTimeBoundIterator(hlc.Timestamp{}, hlc.Timestamp{}).Close() }, + func() { b.NewTimeBoundIterator(hlc.Timestamp{}, hlc.Timestamp{}, false).Close() }, } defer func() { b.Close() diff --git a/pkg/storage/engine/engine.go b/pkg/storage/engine/engine.go index 65579fbfbe24..08e5a047f525 100644 --- a/pkg/storage/engine/engine.go +++ b/pkg/storage/engine/engine.go @@ -57,6 +57,12 @@ type SimpleIterator interface { UnsafeValue() []byte } +// IteratorStats is returned from (Iterator).Stats. +type IteratorStats struct { + InternalDeleteSkippedCount int + TimeBoundNumSSTs int +} + // Iterator is an interface for iterating over key/value pairs in an // engine. Iterator implementations are thread safe unless otherwise // noted. @@ -116,6 +122,8 @@ type Iterator interface { MVCCScan(start, end roachpb.Key, max int64, timestamp hlc.Timestamp, txn *roachpb.Transaction, consistent, reverse, tombstone bool, ) (kvs []byte, numKvs int64, intents []byte, err error) + + Stats() IteratorStats } // IterOptions contains options used to create an Iterator. @@ -125,6 +133,9 @@ type IterOptions struct { // but iteration (using Next) over keys without the same user-key // prefix will not work correctly (keys may be skipped) Prefix bool + // If WithStats is true, the iterator accumulates RocksDB performance + // counters over its lifetime which can be queried via `Stats()`. + WithStats bool } // Reader is the read interface to an engine's data. @@ -164,7 +175,7 @@ type Reader interface { // will frequently return keys outside of the [start, end] time range. If you // must guarantee that you never see a key outside of the time bounds, perform // your own filtering. - NewTimeBoundIterator(start, end hlc.Timestamp) Iterator + NewTimeBoundIterator(start, end hlc.Timestamp, withStats bool) Iterator } // Writer is the write interface to an engine's data. diff --git a/pkg/storage/engine/mvcc.go b/pkg/storage/engine/mvcc.go index e0adc266cdbb..3429901c0148 100644 --- a/pkg/storage/engine/mvcc.go +++ b/pkg/storage/engine/mvcc.go @@ -22,6 +22,7 @@ import ( "sync" "time" + "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/cockroachdb/cockroach/pkg/keys" @@ -31,6 +32,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/cockroach/pkg/util/tracing" ) const ( @@ -1661,13 +1663,20 @@ func mvccScanInternal( } var ownIter bool + var withStats bool if iter == nil { - iter = engine.NewIterator(IterOptions{}) + if sp := opentracing.SpanFromContext(ctx); sp != nil && !tracing.IsBlackHoleSpan(sp) { + withStats = true + } + iter = engine.NewIterator(IterOptions{WithStats: withStats}) ownIter = true } kvData, numKvs, intentData, err := iter.MVCCScan( key, endKey, max, timestamp, txn, consistent, reverse, tombstones) + if withStats { + log.Eventf(ctx, "engine stats: %+v", iter.Stats()) + } if ownIter { iter.Close() } diff --git a/pkg/storage/engine/rocksdb.go b/pkg/storage/engine/rocksdb.go index c38ad747e8c2..baf7e01b567c 100644 --- a/pkg/storage/engine/rocksdb.go +++ b/pkg/storage/engine/rocksdb.go @@ -913,9 +913,9 @@ func (r *RocksDB) NewIterator(opts IterOptions) Iterator { } // NewTimeBoundIterator is like NewIterator, but returns a time-bound iterator. -func (r *RocksDB) NewTimeBoundIterator(start, end hlc.Timestamp) Iterator { +func (r *RocksDB) NewTimeBoundIterator(start, end hlc.Timestamp, withStats bool) Iterator { it := &rocksDBIterator{} - it.initTimeBound(r.rdb, start, end, r) + it.initTimeBound(r.rdb, start, end, withStats, r) return it } @@ -1009,12 +1009,12 @@ func (r *rocksDBReadOnly) NewIterator(opts IterOptions) Iterator { return iter } -func (r *rocksDBReadOnly) NewTimeBoundIterator(start, end hlc.Timestamp) Iterator { +func (r *rocksDBReadOnly) NewTimeBoundIterator(start, end hlc.Timestamp, withStats bool) Iterator { if r.isClosed { panic("using a closed rocksDBReadOnly") } it := &rocksDBIterator{} - it.initTimeBound(r.parent.rdb, start, end, r) + it.initTimeBound(r.parent.rdb, start, end, withStats, r) return it } @@ -1179,7 +1179,7 @@ func (r *rocksDBSnapshot) NewIterator(opts IterOptions) Iterator { } // NewTimeBoundIterator is like NewIterator, but returns a time-bound iterator. -func (r *rocksDBSnapshot) NewTimeBoundIterator(start, end hlc.Timestamp) Iterator { +func (r *rocksDBSnapshot) NewTimeBoundIterator(start, end hlc.Timestamp, withStats bool) Iterator { panic("not implemented") } @@ -1313,6 +1313,10 @@ type batchIterator struct { batch *rocksDBBatch } +func (r *batchIterator) Stats() IteratorStats { + return r.iter.Stats() +} + func (r *batchIterator) Close() { if r.batch == nil { panic("closing idle iterator") @@ -1616,7 +1620,7 @@ func (r *rocksDBBatch) NewIterator(opts IterOptions) Iterator { } // NewTimeBoundIterator is like NewIterator, but returns a time-bound iterator. -func (r *rocksDBBatch) NewTimeBoundIterator(start, end hlc.Timestamp) Iterator { +func (r *rocksDBBatch) NewTimeBoundIterator(start, end hlc.Timestamp, withStats bool) Iterator { if r.writeOnly { panic("write-only batch") } @@ -1630,7 +1634,7 @@ func (r *rocksDBBatch) NewTimeBoundIterator(start, end hlc.Timestamp) Iterator { iter := &batchIterator{ batch: r, } - iter.iter.initTimeBound(r.batch, start, end, r) + iter.iter.initTimeBound(r.batch, start, end, withStats, r) return iter } @@ -1871,15 +1875,17 @@ func (r *rocksDBIterator) init(rdb *C.DBEngine, opts IterOptions, engine Reader, r.parent.iters.Unlock() } - r.iter = C.DBNewIter(rdb, C.bool(opts.Prefix)) + r.iter = C.DBNewIter(rdb, C.bool(opts.Prefix), C.bool(opts.WithStats)) if r.iter == nil { panic("unable to create iterator") } r.engine = engine } -func (r *rocksDBIterator) initTimeBound(rdb *C.DBEngine, start, end hlc.Timestamp, engine Reader) { - r.iter = C.DBNewTimeBoundIter(rdb, goToCTimestamp(start), goToCTimestamp(end)) +func (r *rocksDBIterator) initTimeBound( + rdb *C.DBEngine, start, end hlc.Timestamp, withStats bool, engine Reader, +) { + r.iter = C.DBNewTimeBoundIter(rdb, goToCTimestamp(start), goToCTimestamp(end), C.bool(withStats)) if r.iter == nil { panic("unable to create iterator") } @@ -1904,6 +1910,14 @@ func (r *rocksDBIterator) destroy() { // The following methods implement the Iterator interface. +func (r *rocksDBIterator) Stats() IteratorStats { + stats := C.DBIterStats(r.iter) + return IteratorStats{ + TimeBoundNumSSTs: int(C.ulonglong(stats.timebound_num_ssts)), + InternalDeleteSkippedCount: int(C.ulonglong(stats.internal_delete_skipped_count)), + } +} + func (r *rocksDBIterator) Close() { r.destroy() iterPool.Put(r) diff --git a/pkg/storage/engine/rocksdb_iter_stats_test.go b/pkg/storage/engine/rocksdb_iter_stats_test.go new file mode 100644 index 000000000000..2184367d21c4 --- /dev/null +++ b/pkg/storage/engine/rocksdb_iter_stats_test.go @@ -0,0 +1,107 @@ +// Copyright 2018 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package engine + +import ( + "testing" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" +) + +func TestIterStats(t *testing.T) { + defer leaktest.AfterTest(t)() + + db := setupMVCCInMemRocksDB(t, "test_iter_stats") + defer db.Close() + + k := MakeMVCCMetadataKey(roachpb.Key("foo")) + if err := db.Put(k, []byte("abc")); err != nil { + t.Fatal(err) + } + + if err := db.Clear(k); err != nil { + t.Fatal(err) + } + + batch := db.NewBatch() + defer batch.Close() + + testCases := []Iterator{ + db.NewIterator(IterOptions{WithStats: true}), + db.NewTimeBoundIterator(hlc.Timestamp{}, hlc.Timestamp{WallTime: 999}, true /* withStats */), + batch.NewIterator(IterOptions{WithStats: true}), + batch.NewTimeBoundIterator(hlc.Timestamp{}, hlc.Timestamp{WallTime: 999}, true /* withStats */), + } + + defer func() { + for _, iter := range testCases { + iter.Close() + } + }() + + for _, iter := range testCases { + t.Run("", func(t *testing.T) { + // Seeking past the tombstone manually counts it. + for i := 0; i < 10; i++ { + iter.Seek(NilKey) + iter.Seek(MVCCKeyMax) + stats := iter.Stats() + if e, a := i+1, stats.InternalDeleteSkippedCount; a != e { + t.Errorf("expected internal delete skipped count of %d, not %d", e, a) + } + } + // Scanning a key range containing the tombstone sees it. + for i := 0; i < 10; i++ { + if _, _, _, err := iter.MVCCScan( + roachpb.KeyMin, roachpb.KeyMax, 0, hlc.Timestamp{}, nil, true, false, false, + ); err != nil { + t.Fatal(err) + } + stats := iter.Stats() + if e, a := i+11, stats.InternalDeleteSkippedCount; a != e { + t.Errorf("expected internal delete skipped count of %d, not %d", e, a) + } + } + + // Getting the key with the tombstone sees it. + for i := 0; i < 10; i++ { + if _, _, err := iter.MVCCGet( + k.Key, hlc.Timestamp{}, nil, true, false, + ); err != nil { + t.Fatal(err) + } + stats := iter.Stats() + if e, a := i+21, stats.InternalDeleteSkippedCount; a != e { + t.Errorf("expected internal delete skipped count of %d, not %d", e, a) + } + } + // Getting KeyMax doesn't see it. + for i := 0; i < 10; i++ { + if _, _, err := iter.MVCCGet( + roachpb.KeyMax, hlc.Timestamp{}, nil, true, false, + ); err != nil { + t.Fatal(err) + } + stats := iter.Stats() + if e, a := 30, stats.InternalDeleteSkippedCount; a != e { + t.Errorf("expected internal delete skipped count of %d, not %d", e, a) + } + } + + }) + } +} diff --git a/pkg/storage/engine/rocksdb_test.go b/pkg/storage/engine/rocksdb_test.go index e7e837ccaaec..73495b94cdc9 100644 --- a/pkg/storage/engine/rocksdb_test.go +++ b/pkg/storage/engine/rocksdb_test.go @@ -761,9 +761,7 @@ func TestRocksDBTimeBound(t *testing.T) { batch := rocksdb.NewBatch() defer batch.Close() - // Make a time bounded iterator that skips the SSTable containing our writes. - func() { - tbi := batch.NewTimeBoundIterator(maxTimestamp.Next(), maxTimestamp.Next().Next()) + check := func(t *testing.T, tbi Iterator, keys, ssts int) { defer tbi.Close() tbi.Seek(NilKey) @@ -780,10 +778,35 @@ func TestRocksDBTimeBound(t *testing.T) { } // Make sure the iterator sees no writes. - if expCount := 0; expCount != count { - t.Fatalf("saw %d values in time bounded iterator, but expected %d", count, expCount) + if keys != count { + t.Fatalf("saw %d values in time bounded iterator, but expected %d", count, keys) } - }() + stats := tbi.Stats() + if a := stats.TimeBoundNumSSTs; a != ssts { + t.Fatalf("touched %d SSTs, expected %d", a, ssts) + } + } + + testCases := []struct { + iter Iterator + keys, ssts int + }{ + // Completely to the right, not touching. + {iter: batch.NewTimeBoundIterator(maxTimestamp.Next(), maxTimestamp.Next().Next(), true /* withStats */), keys: 0, ssts: 0}, + // Completely to the left, not touching. + {iter: batch.NewTimeBoundIterator(minTimestamp.Prev().Prev(), minTimestamp.Prev(), true /* withStats */), keys: 0, ssts: 0}, + // Touching on the right. + {iter: batch.NewTimeBoundIterator(maxTimestamp, maxTimestamp, true /* withStats */), keys: len(times), ssts: 1}, + // Touching on the left. + {iter: batch.NewTimeBoundIterator(minTimestamp, minTimestamp, true /* withStats */), keys: len(times), ssts: 1}, + // Copy of last case, but confirm that we don't get SST stats if we don't ask for them. + {iter: batch.NewTimeBoundIterator(minTimestamp, minTimestamp, false /* withStats */), keys: len(times), ssts: 0}} + + for _, test := range testCases { + t.Run("", func(t *testing.T) { + check(t, test.iter, test.keys, test.ssts) + }) + } // Make a regular iterator. Before #21721, this would accidentally pick up the // time bounded iterator instead. diff --git a/pkg/storage/replica_command.go b/pkg/storage/replica_command.go index 538e2095f04b..95533af1f33e 100644 --- a/pkg/storage/replica_command.go +++ b/pkg/storage/replica_command.go @@ -495,7 +495,7 @@ func resolveLocalIntents( } min, max := txn.InclusiveTimeBounds() - iter := batch.NewTimeBoundIterator(min, max) + iter := batch.NewTimeBoundIterator(min, max, false) iterAndBuf := engine.GetBufUsingIter(iter) defer iterAndBuf.Cleanup() diff --git a/pkg/storage/spanset/batch.go b/pkg/storage/spanset/batch.go index 16537fb8b517..ac569b8050cd 100644 --- a/pkg/storage/spanset/batch.go +++ b/pkg/storage/spanset/batch.go @@ -47,6 +47,11 @@ func NewIterator(iter engine.Iterator, spans *SpanSet) *Iterator { } } +// Stats is part of the engine.Iterator interface. +func (s *Iterator) Stats() engine.IteratorStats { + return s.i.Stats() +} + // Close is part of the engine.Iterator interface. func (s *Iterator) Close() { s.i.Close() @@ -232,8 +237,10 @@ func (s spanSetReader) NewIterator(opts engine.IterOptions) engine.Iterator { return &Iterator{s.r.NewIterator(opts), s.spans, nil, false} } -func (s spanSetReader) NewTimeBoundIterator(start, end hlc.Timestamp) engine.Iterator { - return &Iterator{s.r.NewTimeBoundIterator(start, end), s.spans, nil, false} +func (s spanSetReader) NewTimeBoundIterator( + start, end hlc.Timestamp, withStats bool, +) engine.Iterator { + return &Iterator{s.r.NewTimeBoundIterator(start, end, withStats), s.spans, nil, false} } type spanSetWriter struct {