Merge #24543

24543: engine: pass rocksdb perf stats back to go r=petermattis a=tschottdorf

Being able to see RocksDB-level perf counters for slow read operations
can help diagnose issues such as #17229, and seeing I/O slowness at the
RocksDB level should help diagnose disk bottlenecks.

This commit optionally enables collection of a (currently singleton)
subset of the RocksDB performance counters on a per-iterator basis, and
makes it straightforward to add anything else from RocksDB's
IOStatsContext or PerfContext.

Note that the RocksDB perf counters use TLS (thread-local storage),
while iterators are usually created and used from goroutines and thus
are not tied to a single thread. To work around this, the iterator
holds a struct into which the collected counters are accumulated for
each operation that takes place in C++. Compared to the overhead of
crossing the cgo barrier in the first place, the two added conditionals
seem negligible (and the branch predictor should do a good job on
them).

As a first cut, it prints the number of skipped RocksDB tombstones
into the trace for scan operations.
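
For illustration (not part of the change itself), here is a minimal
sketch of how a caller can opt in from Go, mirroring the
`mvccScanInternal` change further down; the helper name `scanWithStats`
and its argument list are invented for this example:

```go
package example

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage/engine"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
)

// scanWithStats runs a single MVCC scan with per-iterator stats enabled
// and emits the collected RocksDB counters into the current trace span.
func scanWithStats(
	ctx context.Context, eng engine.Reader, key, endKey roachpb.Key, ts hlc.Timestamp,
) error {
	iter := eng.NewIterator(engine.IterOptions{WithStats: true})
	defer iter.Close()

	_, _, _, err := iter.MVCCScan(
		key, endKey, 0 /* max */, ts, nil /* txn */,
		true /* consistent */, false /* reverse */, false /* tombstones */)
	// With WithStats set, Stats() returns the counters accumulated while
	// the iterator was used in C++; in SHOW TRACE output this appears as
	// "engine stats: {InternalDeleteSkippedCount:... TimeBoundNumSSTs:...}".
	log.Eventf(ctx, "engine stats: %+v", iter.Stats())
	return err
}
```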

I was tempted to make this event log automatic upon closing the
iterator, but that would require passing a `context.Context` into
`(Iterator).Close` as well as `NewIterator`, which seemed potentially
controversial.

Release note: None

Co-authored-by: Tobias Schottdorf <tobias.schottdorf@gmail.com>
craig[bot] and tbg committed Apr 16, 2018
2 parents 33005f1 + 6c99271 commit b49e034
Showing 18 changed files with 311 additions and 36 deletions.
73 changes: 67 additions & 6 deletions c-deps/libroach/db.cc
@@ -15,6 +15,7 @@
#include "db.h"
#include <algorithm>
#include <rocksdb/convenience.h>
#include <rocksdb/perf_context.h>
#include <rocksdb/sst_file_writer.h>
#include <rocksdb/table.h>
#include <stdarg.h>
@@ -57,6 +58,23 @@ DBKey ToDBKey(const rocksdb::Slice& s) {
return key;
}

ScopedStats::ScopedStats(DBIterator* iter)
: iter_(iter),
internal_delete_skipped_count_base_(
rocksdb::get_perf_context()->internal_delete_skipped_count) {
if (iter_->stats != nullptr) {
rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
}
}
ScopedStats::~ScopedStats() {
if (iter_->stats != nullptr) {
iter_->stats->internal_delete_skipped_count +=
(rocksdb::get_perf_context()->internal_delete_skipped_count -
internal_delete_skipped_count_base_);
rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable);
}
}

} // namespace cockroach

namespace {
@@ -74,6 +92,7 @@ DBIterState DBIterGetState(DBIterator* iter) {
state.value = ToDBSlice(iter->rep->value());
}
}

return state;
}

@@ -370,53 +389,94 @@ DBStatus DBEnvWriteFile(DBEngine* db, DBSlice path, DBSlice contents) {
return db->EnvWriteFile(path, contents);
}

DBIterator* DBNewIter(DBEngine* db, bool prefix) {
DBIterator* DBNewIter(DBEngine* db, bool prefix, bool stats) {
rocksdb::ReadOptions opts;
opts.prefix_same_as_start = prefix;
opts.total_order_seek = !prefix;
return db->NewIter(&opts);
auto db_iter = db->NewIter(&opts);

if (stats) {
db_iter->stats.reset(new IteratorStats);
*db_iter->stats = {};
}

return db_iter;
}

DBIterator* DBNewTimeBoundIter(DBEngine* db, DBTimestamp min_ts, DBTimestamp max_ts) {
DBIterator* DBNewTimeBoundIter(DBEngine* db, DBTimestamp min_ts, DBTimestamp max_ts,
bool with_stats) {
IteratorStats* stats = nullptr;
if (with_stats) {
stats = new IteratorStats;
*stats = {};
}

const std::string min = EncodeTimestamp(min_ts);
const std::string max = EncodeTimestamp(max_ts);
rocksdb::ReadOptions opts;
opts.total_order_seek = true;
opts.table_filter = [min, max](const rocksdb::TableProperties& props) {
opts.table_filter = [min, max, stats](const rocksdb::TableProperties& props) {
auto userprops = props.user_collected_properties;
auto tbl_min = userprops.find("crdb.ts.min");
if (tbl_min == userprops.end() || tbl_min->second.empty()) {
if (stats != nullptr) {
++stats->timebound_num_ssts;
}
return true;
}
auto tbl_max = userprops.find("crdb.ts.max");
if (tbl_max == userprops.end() || tbl_max->second.empty()) {
if (stats != nullptr) {
++stats->timebound_num_ssts;
}
return true;
}
// If the timestamp range of the table overlaps with the timestamp range we
// want to iterate, the table might contain timestamps we care about.
return max.compare(tbl_min->second) >= 0 && min.compare(tbl_max->second) <= 0;
bool used = max.compare(tbl_min->second) >= 0 && min.compare(tbl_max->second) <= 0;
if (used && stats != nullptr) {
++stats->timebound_num_ssts;
}
return used;
};
return db->NewIter(&opts);

auto db_iter = db->NewIter(&opts);
if (stats != nullptr) {
db_iter->stats.reset(stats);
}
return db_iter;
}

void DBIterDestroy(DBIterator* iter) { delete iter; }

IteratorStats DBIterStats(DBIterator* iter) {
IteratorStats stats = {};
if (iter->stats != nullptr) {
stats = *iter->stats;
}
return stats;
}

DBIterState DBIterSeek(DBIterator* iter, DBKey key) {
ScopedStats stats(iter);
iter->rep->Seek(EncodeKey(key));
return DBIterGetState(iter);
}

DBIterState DBIterSeekToFirst(DBIterator* iter) {
ScopedStats stats(iter);
iter->rep->SeekToFirst();
return DBIterGetState(iter);
}

DBIterState DBIterSeekToLast(DBIterator* iter) {
ScopedStats stats(iter);
iter->rep->SeekToLast();
return DBIterGetState(iter);
}

DBIterState DBIterNext(DBIterator* iter, bool skip_current_key_versions) {
ScopedStats stats(iter);
// If we're skipping the current key versions, remember the key the
// iterator was pointing at.
std::string old_key;
Expand Down Expand Up @@ -459,6 +519,7 @@ DBIterState DBIterNext(DBIterator* iter, bool skip_current_key_versions) {
}

DBIterState DBIterPrev(DBIterator* iter, bool skip_current_key_versions) {
ScopedStats stats(iter);
// If we're skipping the current key versions, remember the key the
// iterator was pointed at.
std::string old_key;
13 changes: 13 additions & 0 deletions c-deps/libroach/db.h
@@ -71,4 +71,17 @@ std::string EncodeKey(DBKey k);
MVCCStatsResult MVCCComputeStatsInternal(::rocksdb::Iterator* const iter_rep, DBKey start,
DBKey end, int64_t now_nanos);

// ScopedStats wraps an iterator and, if that iterator has the stats
// member populated, aggregates a subset of the RocksDB perf counters
// into it (while the ScopedStats is live).
class ScopedStats {
public:
ScopedStats(DBIterator*);
~ScopedStats();

private:
DBIterator* const iter_;
uint64_t internal_delete_skipped_count_base_;
};

} // namespace cockroach
25 changes: 21 additions & 4 deletions c-deps/libroach/include/libroach.h
@@ -175,18 +175,35 @@ DBEngine* DBNewBatch(DBEngine* db, bool writeOnly);
// the user-key prefix of the key supplied to DBIterSeek() to restrict
// which sstables are searched, but iteration (using Next) over keys
// without the same user-key prefix will not work correctly (keys may
// be skipped). It is the callers responsibility to call
// DBIterDestroy().
DBIterator* DBNewIter(DBEngine* db, bool prefix);
// be skipped). When stats is true, the iterator will collect RocksDB
// performance counters which can be retrieved via `DBIterStats`.
//
// It is the caller's responsibility to call DBIterDestroy().
DBIterator* DBNewIter(DBEngine* db, bool prefix, bool stats);

DBIterator* DBNewTimeBoundIter(DBEngine* db, DBTimestamp min_ts, DBTimestamp max_ts);
DBIterator* DBNewTimeBoundIter(DBEngine* db, DBTimestamp min_ts, DBTimestamp max_ts,
bool with_stats);

// Destroys an iterator, freeing up any associated memory.
void DBIterDestroy(DBIterator* iter);

// Positions the iterator at the first key that is >= "key".
DBIterState DBIterSeek(DBIterator* iter, DBKey key);

typedef struct {
uint64_t internal_delete_skipped_count;
// The number of SSTables touched (only for time-bound iterators).
// This field is populated from the table filter, not from the
// RocksDB perf counters.
//
// TODO(tschottdorf): populate this field for all iterators.
uint64_t timebound_num_ssts;
// New fields added here must also be added in various other places;
// just grep the repo for internal_delete_skipped_count. Sorry.
} IteratorStats;

IteratorStats DBIterStats(DBIterator* iter);

// Positions the iterator at the first key in the database.
DBIterState DBIterSeekToFirst(DBIterator* iter);

1 change: 1 addition & 0 deletions c-deps/libroach/iterator.h
@@ -29,4 +29,5 @@ struct DBIterator {
std::unique_ptr<rocksdb::Iterator> rep;
std::unique_ptr<cockroach::chunkedBuffer> kvs;
std::unique_ptr<rocksdb::WriteBatch> intents;
std::unique_ptr<IteratorStats> stats;
};
2 changes: 2 additions & 0 deletions c-deps/libroach/mvcc.cc
@@ -292,6 +292,7 @@ DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTx
// don't retrieve a key different than the start key. This is a bit
// of a hack.
const DBSlice end = {0, 0};
ScopedStats scoped_iter(iter);
mvccForwardScanner scanner(iter, key, end, timestamp, 0 /* max_keys */, txn, consistent,
tombstones);
return scanner.get();
@@ -300,6 +301,7 @@ DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp
DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp,
int64_t max_keys, DBTxn txn, bool consistent, bool reverse,
bool tombstones) {
ScopedStats scoped_iter(iter);
if (reverse) {
mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, txn, consistent, tombstones);
return scanner.scan();
2 changes: 1 addition & 1 deletion pkg/ccl/storageccl/engineccl/bench_test.go
@@ -173,7 +173,7 @@ func BenchmarkTimeBoundIterate(b *testing.B) {
})
b.Run("TimeBoundIterator", func(b *testing.B) {
runIterate(b, loadFactor, func(e engine.Engine, startTime, endTime hlc.Timestamp) engine.Iterator {
return e.NewTimeBoundIterator(startTime, endTime)
return e.NewTimeBoundIterator(startTime, endTime, false)
})
})
})
4 changes: 3 additions & 1 deletion pkg/ccl/storageccl/engineccl/mvcc.go
@@ -66,7 +66,9 @@ func NewMVCCIncrementalIterator(
// interval into the fully-inclusive [start, end] interval that
// NewTimeBoundIterator expects. This is strictly a performance
// optimization; omitting the call would still return correct results.
iter: e.NewTimeBoundIterator(startTime.Next(), endTime),
//
// TODO(tschottdorf): plumb withStats in when needed.
iter: e.NewTimeBoundIterator(startTime.Next(), endTime, false /* withStats */),
startTime: startTime,
endTime: endTime,
}
8 changes: 8 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/show_trace
@@ -825,3 +825,11 @@ SELECT message FROM [SHOW TRACE FOR SESSION] WHERE message LIKE e'%1 CPut, 1 Beg
----
r1: sending batch 1 CPut, 1 BeginTxn, 1 EndTxn to (n1,s1):1
1 CPut, 1 BeginTxn, 1 EndTxn

statement ok
CREATE TABLE t.enginestats(k INT PRIMARY KEY, v INT)

query T
SELECT message FROM [ SHOW TRACE FOR SELECT * FROM t.enginestats ] WHERE message LIKE '%InternalDelete%'
----
engine stats: {InternalDeleteSkippedCount:0 TimeBoundNumSSTs:0}
2 changes: 1 addition & 1 deletion pkg/storage/batcheval/cmd_refresh_range.go
@@ -44,7 +44,7 @@ func RefreshRange(

// Use a time-bounded iterator to avoid unnecessarily iterating over
// older data.
iter := batch.NewTimeBoundIterator(h.Txn.OrigTimestamp, h.Txn.Timestamp)
iter := batch.NewTimeBoundIterator(h.Txn.OrigTimestamp, h.Txn.Timestamp, false)
defer iter.Close()
// Iterate over values until we discover any value written at or
// after the original timestamp, but before or at the current
2 changes: 1 addition & 1 deletion pkg/storage/batcheval/cmd_resolve_intent_range.go
@@ -56,7 +56,7 @@ func ResolveIntentRange(
// Use a time-bounded iterator as an optimization if indicated.
var iterAndBuf engine.IterAndBuf
if args.MinTimestamp != (hlc.Timestamp{}) {
iter := batch.NewTimeBoundIterator(args.MinTimestamp, args.IntentTxn.Timestamp)
iter := batch.NewTimeBoundIterator(args.MinTimestamp, args.IntentTxn.Timestamp, false)
iterAndBuf = engine.GetBufUsingIter(iter)
} else {
iterAndBuf = engine.GetIterAndBuf(batch)
2 changes: 1 addition & 1 deletion pkg/storage/engine/batch_test.go
@@ -175,7 +175,7 @@ func TestReadOnlyBasics(t *testing.T) {
func() { _, _, _, _ = b.GetProto(a, getVal) },
func() { _ = b.Iterate(a, a, func(MVCCKeyValue) (bool, error) { return true, nil }) },
func() { b.NewIterator(IterOptions{}).Close() },
func() { b.NewTimeBoundIterator(hlc.Timestamp{}, hlc.Timestamp{}).Close() },
func() { b.NewTimeBoundIterator(hlc.Timestamp{}, hlc.Timestamp{}, false).Close() },
}
defer func() {
b.Close()
13 changes: 12 additions & 1 deletion pkg/storage/engine/engine.go
@@ -57,6 +57,12 @@ type SimpleIterator interface {
UnsafeValue() []byte
}

// IteratorStats is returned from (Iterator).Stats.
type IteratorStats struct {
InternalDeleteSkippedCount int
TimeBoundNumSSTs int
}

// Iterator is an interface for iterating over key/value pairs in an
// engine. Iterator implementations are thread safe unless otherwise
// noted.
@@ -116,6 +122,8 @@ type Iterator interface {
MVCCScan(start, end roachpb.Key, max int64, timestamp hlc.Timestamp,
txn *roachpb.Transaction, consistent, reverse, tombstone bool,
) (kvs []byte, numKvs int64, intents []byte, err error)

Stats() IteratorStats
}

// IterOptions contains options used to create an Iterator.
@@ -125,6 +133,9 @@ type IterOptions struct {
// but iteration (using Next) over keys without the same user-key
// prefix will not work correctly (keys may be skipped)
Prefix bool
// If WithStats is true, the iterator accumulates RocksDB performance
// counters over its lifetime which can be queried via `Stats()`.
WithStats bool
}

// Reader is the read interface to an engine's data.
@@ -164,7 +175,7 @@ type Reader interface {
// will frequently return keys outside of the [start, end] time range. If you
// must guarantee that you never see a key outside of the time bounds, perform
// your own filtering.
NewTimeBoundIterator(start, end hlc.Timestamp) Iterator
NewTimeBoundIterator(start, end hlc.Timestamp, withStats bool) Iterator
}

// Writer is the write interface to an engine's data.
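
The RocksDB-backed implementation of the new `Stats()` method lives in
`pkg/storage/engine/rocksdb.go`, which is not among the files excerpted
above. A minimal sketch of what the cgo conversion could look like,
assuming the usual wrapper pattern in that file (the receiver type
`rocksDBIterator` and its `iter` field are guesses, and this would sit
in the cgo file that imports "C"):

```go
// Sketch only: convert the C-level IteratorStats (filled in by
// DBIterStats) into the Go-level engine.IteratorStats from engine.go.
func (r *rocksDBIterator) Stats() IteratorStats {
	stats := C.DBIterStats(r.iter)
	return IteratorStats{
		InternalDeleteSkippedCount: int(stats.internal_delete_skipped_count),
		TimeBoundNumSSTs:           int(stats.timebound_num_ssts),
	}
}
```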
11 changes: 10 additions & 1 deletion pkg/storage/engine/mvcc.go
@@ -22,6 +22,7 @@ import (
"sync"
"time"

"github.com/opentracing/opentracing-go"
"github.com/pkg/errors"

"github.com/cockroachdb/cockroach/pkg/keys"
Expand All @@ -31,6 +32,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
)

const (
@@ -1661,13 +1663,20 @@ func mvccScanInternal(
}

var ownIter bool
var withStats bool
if iter == nil {
iter = engine.NewIterator(IterOptions{})
if sp := opentracing.SpanFromContext(ctx); sp != nil && !tracing.IsBlackHoleSpan(sp) {
withStats = true
}
iter = engine.NewIterator(IterOptions{WithStats: withStats})
ownIter = true
}
kvData, numKvs, intentData, err := iter.MVCCScan(
key, endKey, max, timestamp, txn, consistent, reverse, tombstones)

if withStats {
log.Eventf(ctx, "engine stats: %+v", iter.Stats())
}
if ownIter {
iter.Close()
}