Skip to content

Commit

Permalink
storage: clean up MVCC key encoding functions
Browse files Browse the repository at this point in the history
In particular, this separates the timestamp encoding from the overall
key encoding. This is necessary for the MVCC range tombstone work, where
Pebble key suffixes (timestamps) are processed in isolation.

Unfortunately, this adds ~9% overhead for `EncodeMVCCKey()`. This was
found to be due to additional function call overhead, and the Go
compiler's unwillingness to inline these. This was considered acceptable
for the encode path, while with the hotter `DecodeMVCCKey()` path the
timestamp decoding logic was instead duplicated to avoid this overhead.

```
name                                            old time/op    new time/op    delta
EncodeMVCCKey/key=empty/ts=empty-24               15.8ns ± 0%    15.7ns ± 0%   -0.46%  (p=0.000 n=10+10)
EncodeMVCCKey/key=empty/ts=walltime-24            17.9ns ± 0%    19.5ns ± 0%   +8.88%  (p=0.000 n=10+10)
EncodeMVCCKey/key=empty/ts=walltime+logical-24    18.5ns ± 0%    20.1ns ± 0%   +8.99%  (p=0.000 n=10+10)
EncodeMVCCKey/key=empty/ts=all-24                 18.8ns ± 0%    20.4ns ± 0%   +8.66%  (p=0.000 n=10+10)
EncodeMVCCKey/key=short/ts=walltime+logical-24    19.1ns ± 0%    20.7ns ± 0%   +8.38%  (p=0.000 n=10+9)
EncodeMVCCKey/key=short/ts=all-24                 19.5ns ± 0%    20.7ns ± 0%   +6.18%  (p=0.000 n=10+9)
EncodeMVCCKey/key=short/ts=empty-24               16.3ns ± 0%    16.0ns ± 0%   -1.86%  (p=0.000 n=10+10)
EncodeMVCCKey/key=short/ts=walltime-24            18.1ns ± 0%    20.6ns ± 0%  +13.41%  (p=0.000 n=8+8)
EncodeMVCCKey/key=long/ts=empty-24                58.7ns ± 0%    58.8ns ± 0%   +0.15%  (p=0.000 n=10+10)
EncodeMVCCKey/key=long/ts=walltime-24             59.8ns ± 0%    60.8ns ± 0%   +1.78%  (p=0.000 n=10+9)
EncodeMVCCKey/key=long/ts=walltime+logical-24     60.7ns ± 0%    61.7ns ± 0%   +1.54%  (p=0.000 n=10+10)
EncodeMVCCKey/key=long/ts=all-24                  60.9ns ± 0%    61.9ns ± 0%   +1.60%  (p=0.000 n=10+9)
DecodeMVCCKey/key=empty/ts=empty-24               12.4ns ± 0%    12.4ns ± 0%     ~     (p=0.912 n=10+6)
DecodeMVCCKey/key=empty/ts=walltime-24            13.3ns ± 0%    13.3ns ± 0%     ~     (p=0.054 n=10+10)
DecodeMVCCKey/key=empty/ts=walltime+logical-24    13.3ns ± 0%    13.3ns ± 0%   -0.06%  (p=0.034 n=10+10)
DecodeMVCCKey/key=empty/ts=all-24                 13.6ns ± 0%    13.6ns ± 0%     ~     (p=0.509 n=10+10)
DecodeMVCCKey/key=short/ts=walltime+logical-24    13.3ns ± 0%    13.3ns ± 0%     ~     (all equal)
DecodeMVCCKey/key=short/ts=all-24                 13.6ns ± 0%    13.6ns ± 0%     ~     (p=0.151 n=10+10)
DecodeMVCCKey/key=short/ts=empty-24               12.5ns ± 0%    12.4ns ± 0%   -0.21%  (p=0.000 n=10+10)
DecodeMVCCKey/key=short/ts=walltime-24            13.3ns ± 0%    13.3ns ± 0%     ~     (p=0.577 n=8+10)
DecodeMVCCKey/key=long/ts=walltime+logical-24     13.3ns ± 0%    13.3ns ± 0%     ~     (all equal)
DecodeMVCCKey/key=long/ts=all-24                  13.6ns ± 0%    13.6ns ± 0%     ~     (p=0.650 n=10+10)
DecodeMVCCKey/key=long/ts=empty-24                12.4ns ± 0%    12.4ns ± 0%   +0.15%  (p=0.004 n=10+10)
DecodeMVCCKey/key=long/ts=walltime-24             13.3ns ± 0%    13.3ns ± 0%   +0.10%  (p=0.012 n=10+9)
```

Release note: None
  • Loading branch information
erikgrinaker committed Jan 31, 2022
1 parent 25849d1 commit 9035872
Show file tree
Hide file tree
Showing 3 changed files with 341 additions and 65 deletions.
28 changes: 15 additions & 13 deletions pkg/storage/enginepb/decode.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,28 +43,30 @@ func SplitMVCCKey(mvccKey []byte) (key []byte, ts []byte, ok bool) {
}

// DecodeKey decodes a key/timestamp from its serialized representation.
func DecodeKey(encodedKey []byte) (key []byte, timestamp hlc.Timestamp, _ error) {
key, ts, ok := SplitMVCCKey(encodedKey)
func DecodeKey(encodedKey []byte) ([]byte, hlc.Timestamp, error) {
key, encodedTS, ok := SplitMVCCKey(encodedKey)
if !ok {
return nil, timestamp, errors.Errorf("invalid encoded mvcc key: %x", encodedKey)
return nil, hlc.Timestamp{}, errors.Errorf("invalid encoded mvcc key: %x", encodedKey)
}
switch len(ts) {
// NB: This logic is duplicated with storage.decodeMVCCTimestamp() to avoid the
// overhead of an additional function call (~13%).
var timestamp hlc.Timestamp
switch len(encodedTS) {
case 0:
// No-op.
case 8:
timestamp.WallTime = int64(binary.BigEndian.Uint64(ts[0:8]))
timestamp.WallTime = int64(binary.BigEndian.Uint64(encodedTS[0:8]))
case 12:
timestamp.WallTime = int64(binary.BigEndian.Uint64(ts[0:8]))
timestamp.Logical = int32(binary.BigEndian.Uint32(ts[8:12]))
timestamp.WallTime = int64(binary.BigEndian.Uint64(encodedTS[0:8]))
timestamp.Logical = int32(binary.BigEndian.Uint32(encodedTS[8:12]))
case 13:
timestamp.WallTime = int64(binary.BigEndian.Uint64(ts[0:8]))
timestamp.Logical = int32(binary.BigEndian.Uint32(ts[8:12]))
timestamp.Synthetic = ts[12] != 0
timestamp.WallTime = int64(binary.BigEndian.Uint64(encodedTS[0:8]))
timestamp.Logical = int32(binary.BigEndian.Uint32(encodedTS[8:12]))
timestamp.Synthetic = encodedTS[12] != 0
default:
return nil, timestamp, errors.Errorf(
"invalid encoded mvcc key: %x bad timestamp %x", encodedKey, ts)
return nil, hlc.Timestamp{}, errors.Errorf(
"invalid encoded mvcc key: %x bad timestamp %x", encodedKey, encodedTS)
}

return key, timestamp, nil
}

Expand Down
198 changes: 146 additions & 52 deletions pkg/storage/mvcc_key.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,24 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/errors"
)

var (
// MVCCKeyMax is a maximum mvcc-encoded key value which sorts after
// all other keys.
// MVCCKeyMax sorts after all other MVCC keys.
MVCCKeyMax = MakeMVCCMetadataKey(roachpb.KeyMax)
// NilKey is the nil MVCCKey.
NilKey = MVCCKey{}
)

// Byte lengths of the individual components that follow the user key in an
// encoded MVCC key.
const (
// mvccEncodedTimeSentinelLen is the length of the sentinel byte that
// separates the key from the timestamp.
mvccEncodedTimeSentinelLen = 1
// mvccEncodedTimeWallLen is the encoded length of Timestamp.WallTime.
mvccEncodedTimeWallLen = 8
// mvccEncodedTimeLogicalLen is the encoded length of Timestamp.Logical.
mvccEncodedTimeLogicalLen = 4
// mvccEncodedTimeSyntheticLen is the encoded length of Timestamp.Synthetic.
mvccEncodedTimeSyntheticLen = 1
// mvccEncodedTimeLengthLen is the length of the trailing byte that records
// the encoded timestamp length (including itself).
mvccEncodedTimeLengthLen = 1
)

// MVCCKey is a versioned key, distinguished from roachpb.Key with the addition
// of a timestamp.
type MVCCKey struct {
Expand Down Expand Up @@ -110,17 +118,35 @@ func (k MVCCKey) Len() int {
return encodedMVCCKeyLength(k)
}

// EncodeMVCCKey encodes an engine.MVCC key into the RocksDB representation.
// EncodeMVCCKey encodes an MVCCKey into its Pebble representation. The encoding
// takes the following forms, where trailing time components are omitted when
// zero-valued:
//
// [key] [sentinel] [timeWall] [timeLogical] [timeSynthetic] [timeLength]
// [key] [sentinel] [timeWall] [timeLogical] [timeLength]
// [key] [sentinel] [timeWall] [timeLength]
// [key] [sentinel]
//
// key: the unmodified binary key (variable length)
// sentinel: separates key and timestamp (1 byte: 0x00)
// timeWall: Timestamp.WallTime (8 bytes: big-endian uint64)
// timeLogical: Timestamp.Logical (4 bytes: big-endian uint32)
// timeSynthetic: Timestamp.Synthetic (1 byte: 0x01 when set)
// timeLength: encoded timestamp length inc. itself (1 byte: uint8)
//
// The sentinel byte can be used to detect a key without a timestamp, since
// timeLength will never be 0 (it includes itself in the length).
func EncodeMVCCKey(key MVCCKey) []byte {
keyLen := key.Len()
keyLen := encodedMVCCKeyLength(key)
buf := make([]byte, keyLen)
encodeMVCCKeyToBuf(buf, key, keyLen)
return buf
}

// EncodeMVCCKeyToBuf encodes an engine.MVCC key into the RocksDB representation.
// EncodeMVCCKeyToBuf encodes an MVCCKey into its Pebble representation, reusing
// the given byte buffer if it has sufficient capacity.
func EncodeMVCCKeyToBuf(buf []byte, key MVCCKey) []byte {
keyLen := key.Len()
keyLen := encodedMVCCKeyLength(key)
if cap(buf) < keyLen {
buf = make([]byte, keyLen)
} else {
Expand All @@ -130,66 +156,134 @@ func EncodeMVCCKeyToBuf(buf []byte, key MVCCKey) []byte {
return buf
}

// encodeMVCCKeyToBuf encodes an MVCCKey into its Pebble representation to the
// target buffer, which must have the correct size.
func encodeMVCCKeyToBuf(buf []byte, key MVCCKey, keyLen int) {
const (
timestampSentinelLen = 1
walltimeEncodedLen = 8
logicalEncodedLen = 4
syntheticEncodedLen = 1
)

copy(buf, key.Key)

pos := len(key.Key)
timestampLength := keyLen - pos - 1
if timestampLength > 0 {
buf[pos] = 0
pos += timestampSentinelLen
binary.BigEndian.PutUint64(buf[pos:], uint64(key.Timestamp.WallTime))
pos += walltimeEncodedLen
if key.Timestamp.Logical != 0 || key.Timestamp.Synthetic {
binary.BigEndian.PutUint32(buf[pos:], uint32(key.Timestamp.Logical))
pos += logicalEncodedLen
}
if key.Timestamp.Synthetic {
buf[pos] = 1
pos += syntheticEncodedLen
}

buf[pos] = 0 // sentinel byte
pos += mvccEncodedTimeSentinelLen

tsLen := keyLen - pos - mvccEncodedTimeLengthLen
if tsLen > 0 {
encodeMVCCTimestampToBuf(buf[pos:], key.Timestamp)
pos += tsLen
buf[pos] = byte(tsLen + mvccEncodedTimeLengthLen)
}
buf[len(buf)-1] = byte(timestampLength)
}

// encodeMVCCTimestamp encodes an MVCC timestamp into its Pebble
// representation, excluding length suffix and sentinel byte.
func encodeMVCCTimestamp(ts hlc.Timestamp) []byte {
_, encodedTS, _ := enginepb.SplitMVCCKey(EncodeMVCCKey(MVCCKey{Timestamp: ts}))
return encodedTS
tsLen := encodedMVCCTimestampLength(ts)
if tsLen == 0 {
return nil
}
buf := make([]byte, tsLen)
encodeMVCCTimestampToBuf(buf, ts)
return buf
}

// DecodeMVCCKey decodes an engine.MVCCKey from its serialized representation.
func DecodeMVCCKey(encodedKey []byte) (MVCCKey, error) {
// TODO(erikgrinaker): merge in the enginepb decoding functions when it no
// longer involves a problematic GCO dependency (via Pebble).
k, ts, err := enginepb.DecodeKey(encodedKey)
return MVCCKey{k, ts}, err
// encodeMVCCTimestampSuffix returns the Pebble suffix for an MVCC timestamp:
// the encoded timestamp followed by its one-byte length, without the leading
// sentinel byte. It returns nil when the timestamp has no encoded bytes.
func encodeMVCCTimestampSuffix(ts hlc.Timestamp) []byte {
	n := encodedMVCCTimestampLength(ts)
	if n == 0 {
		return nil
	}
	suffixLen := n + mvccEncodedTimeLengthLen
	suffix := make([]byte, suffixLen)
	encodeMVCCTimestampToBuf(suffix, ts)
	// The trailing byte records the total suffix length, counting itself.
	suffix[n] = byte(suffixLen)
	return suffix
}

// encodeMVCCTimestampToBuf writes the Pebble representation of an MVCC
// timestamp into buf, without the sentinel byte or the length suffix. The
// caller must supply a buffer of the correct size and a non-empty timestamp.
func encodeMVCCTimestampToBuf(buf []byte, ts hlc.Timestamp) {
	binary.BigEndian.PutUint64(buf, uint64(ts.WallTime))
	// Wall-time-only timestamps stop here; the logical component is written
	// only when it is non-zero or the synthetic flag needs to follow it.
	if ts.Logical == 0 && !ts.Synthetic {
		return
	}
	binary.BigEndian.PutUint32(buf[mvccEncodedTimeWallLen:], uint32(ts.Logical))
	if ts.Synthetic {
		buf[mvccEncodedTimeWallLen+mvccEncodedTimeLogicalLen] = 1
	}
}

// encodedMVCCKeyLength returns the encoded length of the given MVCCKey.
func encodedMVCCKeyLength(key MVCCKey) int {
const (
timestampSentinelLen = 1
walltimeEncodedLen = 8
logicalEncodedLen = 4
syntheticEncodedLen = 1
timestampEncodedLengthLen = 1
)

n := len(key.Key) + timestampEncodedLengthLen
keyLen := len(key.Key) + mvccEncodedTimeSentinelLen
if !key.Timestamp.IsEmpty() {
n += timestampSentinelLen + walltimeEncodedLen
keyLen += mvccEncodedTimeWallLen + mvccEncodedTimeLengthLen
if key.Timestamp.Logical != 0 || key.Timestamp.Synthetic {
n += logicalEncodedLen
}
if key.Timestamp.Synthetic {
n += syntheticEncodedLen
keyLen += mvccEncodedTimeLogicalLen
if key.Timestamp.Synthetic {
keyLen += mvccEncodedTimeSyntheticLen
}
}
}
return n
return keyLen
}

// encodedMVCCTimestampLength reports how many bytes the given MVCC timestamp
// occupies once encoded, not counting the sentinel byte or the length suffix.
func encodedMVCCTimestampLength(ts hlc.Timestamp) int {
	// The dependency is deliberately inverted: encodedMVCCKeyLength() sits on
	// the EncodeMVCCKey() hot path, and having it call into this function
	// costs ~6% in benchmarks. The timestamp length is therefore computed
	// inline over there, and the non-timestamp bytes are stripped off here.
	n := encodedMVCCKeyLength(MVCCKey{Timestamp: ts}) - mvccEncodedTimeSentinelLen
	if n <= 0 {
		return n
	}
	return n - mvccEncodedTimeLengthLen
}

// TODO(erikgrinaker): merge in the enginepb decoding functions once it can
// avoid the storage package's problematic CGo dependency (via Pebble).

// DecodeMVCCKey parses a Pebble-encoded MVCC key into an MVCCKey by delegating
// to enginepb.DecodeKey. The decoding error, if any, is returned unchanged.
func DecodeMVCCKey(encodedKey []byte) (MVCCKey, error) {
	rawKey, timestamp, err := enginepb.DecodeKey(encodedKey)
	return MVCCKey{Key: rawKey, Timestamp: timestamp}, err
}

// decodeMVCCTimestamp decodes an MVCC timestamp from its Pebble representation,
// excluding the length suffix.
//
// The input length selects the layout: 0 bytes is the empty timestamp, 8 bytes
// holds the wall time only, 12 bytes adds the logical component, and 13 bytes
// adds the synthetic flag. Any other length returns an error along with a
// zero-valued timestamp.
func decodeMVCCTimestamp(encodedTS []byte) (hlc.Timestamp, error) {
// NB: This logic is duplicated in enginepb.DecodeKey() to avoid the
// overhead of an additional function call there (~13%).
var ts hlc.Timestamp
switch len(encodedTS) {
case 0:
// No-op.
case 8:
// Wall time only.
ts.WallTime = int64(binary.BigEndian.Uint64(encodedTS[0:8]))
case 12:
// Wall time followed by the logical component.
ts.WallTime = int64(binary.BigEndian.Uint64(encodedTS[0:8]))
ts.Logical = int32(binary.BigEndian.Uint32(encodedTS[8:12]))
case 13:
// Wall time, logical component, and synthetic flag. Any non-zero flag
// byte is treated as set.
ts.WallTime = int64(binary.BigEndian.Uint64(encodedTS[0:8]))
ts.Logical = int32(binary.BigEndian.Uint32(encodedTS[8:12]))
ts.Synthetic = encodedTS[12] != 0
default:
return hlc.Timestamp{}, errors.Errorf("bad timestamp %x", encodedTS)
}
return ts, nil
}

// decodeMVCCTimestampSuffix decodes an MVCC timestamp from a Pebble suffix,
// i.e. the encoded timestamp followed by its one-byte length. An empty input
// decodes to the zero timestamp. The trailing length byte must equal the total
// input length, otherwise an error is returned.
func decodeMVCCTimestampSuffix(encodedTS []byte) (hlc.Timestamp, error) {
	n := len(encodedTS)
	if n == 0 {
		return hlc.Timestamp{}, nil
	}
	last := n - 1
	suffixLen := int(encodedTS[last])
	if suffixLen != n {
		return hlc.Timestamp{}, errors.Errorf(
			"bad timestamp: found length suffix %d, actual length %d", suffixLen, n)
	}
	// Strip the length byte and decode the remaining timestamp bytes.
	return decodeMVCCTimestamp(encodedTS[:last])
}
Loading

0 comments on commit 9035872

Please sign in to comment.