Skip to content

Commit

Permalink
storage/engine: expose FailOnMoreRecent MVCC{Get/Scan}Option
Browse files Browse the repository at this point in the history
Relates to cockroachdb#40205.

This change introduces a new option to `MVCCGetOptions` and `MVCCScanOptions`
that causes reads to throw an error if they observe an MVCC version at a
timestamp above their read timestamp. Specifically, when the option is enabled,
a `WriteTooOldError` will be returned if a scan observes an MVCC version with a
timestamp above the read. Similarly, a `WriteIntentError` will be returned if a
scan observes another transaction's intent, even if that intent has a timestamp
above the scan's read timestamp.

This option will be used in the future by `ScanRequests` and `ReverseScanRequests`
that intend to acquire unreplicated key-level locks, which will power `SELECT FOR
UPDATE`. These scans do not want to ignore existing MVCC versions at higher timestamps
like traditional scans do. Instead, they want to throw errors and force their
transaction to increase its timestamp through either a refresh or a retry. This was
previously prototyped in 4a8e8dc.

Interestingly, this is not new logic to the MVCC layer. This behavior is exactly
the same as that of the initial key-value lookup performed during MVCC writes
(see `mvccPutInternal`). It's fitting that behavior needed for `SELECT FOR UPDATE`
would mirror that already exhibited by the read portion of read-write operations.
This also hints at an opportunity to potentially use this option to merge the two
implementations and get rid of custom logic like `mvccGetInternal` that only exists
on the write path. We'd need to be careful about doing so though, as this code is
heavily tuned.

Release note: None
  • Loading branch information
nvanbenschoten committed Jan 31, 2020
1 parent 6e7fa8f commit a25a9c9
Show file tree
Hide file tree
Showing 9 changed files with 376 additions and 85 deletions.
5 changes: 3 additions & 2 deletions c-deps/libroach/include/libroach.h
Original file line number Diff line number Diff line change
Expand Up @@ -358,15 +358,16 @@ typedef struct {
DBStatus status;
DBChunkedBuffer data;
DBSlice intents;
DBTimestamp write_too_old_timestamp;
DBTimestamp uncertainty_timestamp;
DBSlice resume_key;
} DBScanResults;

DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTxn txn,
bool inconsistent, bool tombstones);
bool inconsistent, bool tombstones, bool fail_on_more_recent);
DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp,
int64_t max_keys, DBTxn txn, bool inconsistent, bool reverse,
bool tombstones);
bool tombstones, bool fail_on_more_recent);

// DBStatsResult contains various runtime stats for RocksDB.
typedef struct {
Expand Down
14 changes: 7 additions & 7 deletions c-deps/libroach/mvcc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -269,28 +269,28 @@ DBStatus MVCCFindSplitKey(DBIterator* iter, DBKey start, DBKey min_split, int64_
}

DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTxn txn,
bool inconsistent, bool tombstones) {
bool inconsistent, bool tombstones, bool fail_on_more_recent) {
// Get is implemented as a scan where we retrieve a single key. We specify an
// empty key for the end key which will ensure we don't retrieve a key
// different than the start key. This is a bit of a hack.
const DBSlice end = {0, 0};
ScopedStats scoped_iter(iter);
mvccForwardScanner scanner(iter, key, end, timestamp, 1 /* max_keys */, txn, inconsistent,
tombstones);
tombstones, fail_on_more_recent);
return scanner.get();
}

DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp,
int64_t max_keys, DBTxn txn, bool inconsistent, bool reverse,
bool tombstones) {
bool tombstones, bool fail_on_more_recent) {
ScopedStats scoped_iter(iter);
if (reverse) {
mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, txn, inconsistent,
tombstones);
mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, txn, inconsistent, tombstones,
fail_on_more_recent);
return scanner.scan();
} else {
mvccForwardScanner scanner(iter, start, end, timestamp, max_keys, txn, inconsistent,
tombstones);
mvccForwardScanner scanner(iter, start, end, timestamp, max_keys, txn, inconsistent, tombstones,
fail_on_more_recent);
return scanner.scan();
}
}
60 changes: 43 additions & 17 deletions c-deps/libroach/mvcc.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ static const int kMaxItersBeforeSeek = 10;
template <bool reverse> class mvccScanner {
public:
mvccScanner(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp, int64_t max_keys,
DBTxn txn, bool inconsistent, bool tombstones)
DBTxn txn, bool inconsistent, bool tombstones, bool fail_on_more_recent)
: iter_(iter),
iter_rep_(iter->rep.get()),
start_key_(ToSlice(start)),
Expand All @@ -62,6 +62,7 @@ template <bool reverse> class mvccScanner {
txn_ignored_seqnums_(txn.ignored_seqnums),
inconsistent_(inconsistent),
tombstones_(tombstones),
fail_on_more_recent_(fail_on_more_recent),
check_uncertainty_(timestamp < txn.max_timestamp),
kvs_(new chunkedBuffer),
intents_(new rocksdb::WriteBatch),
Expand Down Expand Up @@ -254,6 +255,13 @@ template <bool reverse> class mvccScanner {
return true;
}

bool writeTooOldError(DBTimestamp ts) {
results_.write_too_old_timestamp = ts;
kvs_->Clear();
intents_->Clear();
return false;
}

bool uncertaintyError(DBTimestamp ts) {
results_.uncertainty_timestamp = ts;
kvs_->Clear();
Expand All @@ -276,8 +284,15 @@ template <bool reverse> class mvccScanner {
return addAndAdvance(cur_value_);
}

if (fail_on_more_recent_) {
// 2. Our txn's read timestamp is less than the most recent
// version's timestamp and the scanner has been configured
// to throw a write too old error on more recent versions.
return writeTooOldError(cur_timestamp_);
}

if (check_uncertainty_) {
// 2. Our txn's read timestamp is less than the max timestamp
// 3. Our txn's read timestamp is less than the max timestamp
// seen by the txn. We need to check for clock uncertainty
// errors.
if (txn_max_timestamp_ >= cur_timestamp_) {
Expand All @@ -288,7 +303,7 @@ template <bool reverse> class mvccScanner {
return seekVersion(txn_max_timestamp_, true);
}

// 3. Our txn's read timestamp is greater than or equal to the
// 4. Our txn's read timestamp is greater than or equal to the
// max timestamp seen by the txn so clock uncertainty checks are
// unnecessary. We need to seek to the desired version of the
// value (i.e. one with a timestamp earlier than our read
Expand All @@ -305,7 +320,7 @@ template <bool reverse> class mvccScanner {
}

if (meta_.has_raw_bytes()) {
// 4. Emit immediately if the value is inline.
// 5. Emit immediately if the value is inline.
return addAndAdvance(meta_.raw_bytes());
}

Expand All @@ -326,8 +341,12 @@ template <bool reverse> class mvccScanner {
// Intents for other transactions are visible at or below:
// max(txn.max_timestamp, read_timestamp)
const DBTimestamp max_visible_timestamp = check_uncertainty_ ? txn_max_timestamp_ : timestamp_;
if (max_visible_timestamp < meta_timestamp && !own_intent) {
// 5. The key contains an intent, but we're reading before the
// ... unless we're intending on failing on more recent writes,
// in which case other transaction's intents are always visible.
const bool other_intent_visible =
max_visible_timestamp >= meta_timestamp || fail_on_more_recent_;
if (!own_intent && !other_intent_visible) {
// 6. The key contains an intent, but we're reading before the
// intent. Seek to the desired version. Note that if we own the
// intent (i.e. we're reading transactionally) we want to read
// the intent regardless of our read timestamp and fall into
Expand All @@ -336,7 +355,7 @@ template <bool reverse> class mvccScanner {
}

if (inconsistent_) {
// 6. The key contains an intent and we're doing an inconsistent
// 7. The key contains an intent and we're doing an inconsistent
// read at a timestamp newer than the intent. We ignore the
// intent by insisting that the timestamp we're reading at is a
// historical timestamp < the intent timestamp. However, we
Expand All @@ -354,24 +373,30 @@ template <bool reverse> class mvccScanner {
}

if (!own_intent) {
// 7. The key contains an intent which was not written by our
// transaction and our read timestamp is newer than that of the
// intent. Note that this will trigger an error on the Go
// side. We continue scanning so that we can return all of the
// intents in the scan range.
// 8. The key contains an intent which was not written by our
// transaction and either:
// - our read timestamp is equal to or newer than that of the
// intent
// - our read timestamp is older than that of the intent but
// the intent is in our transaction's uncertainty interval
// - our read timestamp is older than that of the intent but
// we want to fail on more recent writes
// Note that this will trigger an error on the Go side. We
// continue scanning so that we can return all of the intents
// in the scan range.
intents_->Put(cur_raw_key_, cur_value_);
return advanceKey();
}

if (txn_epoch_ == meta_.txn().epoch()) {
if (txn_sequence_ >= meta_.txn().sequence() && !seqNumIsIgnored(meta_.txn().sequence())) {
// 8. We're reading our own txn's intent at an equal or higher sequence.
// 9. We're reading our own txn's intent at an equal or higher sequence.
// Note that we read at the intent timestamp, not at our read timestamp
// as the intent timestamp may have been pushed forward by another
// transaction. Txn's always need to read their own writes.
return seekVersion(meta_timestamp, false);
} else {
// 9. We're reading our own txn's intent at a lower sequence than is
// 10. We're reading our own txn's intent at a lower sequence than is
// currently present in the intent. This means the intent we're seeing
// was written at a higher sequence than the read and that there may or
// may not be earlier versions of the intent (with lower sequence
Expand All @@ -382,7 +407,7 @@ template <bool reverse> class mvccScanner {
if (found) {
return advanceKey();
}
// 10. If no value in the intent history has a sequence number equal to
// 11. If no value in the intent history has a sequence number equal to
// or less than the read, we must ignore the intents laid down by the
// transaction all together. We ignore the intent by insisting that the
// timestamp we're reading at is a historical timestamp < the intent
Expand All @@ -392,15 +417,15 @@ template <bool reverse> class mvccScanner {
}

if (txn_epoch_ < meta_.txn().epoch()) {
// 11. We're reading our own txn's intent but the current txn has
// 12. We're reading our own txn's intent but the current txn has
// an earlier epoch than the intent. Return an error so that the
// earlier incarnation of our transaction aborts (presumably
// this is some operation that was retried).
return setStatus(FmtStatus("failed to read with epoch %u due to a write intent with epoch %u",
txn_epoch_, meta_.txn().epoch()));
}

// 12. We're reading our own txn's intent but the current txn has a
// 13. We're reading our own txn's intent but the current txn has a
// later epoch than the intent. This can happen if the txn was
// restarted and an earlier iteration wrote the value we're now
// reading. In this case, we ignore the intent and read the
Expand Down Expand Up @@ -729,6 +754,7 @@ template <bool reverse> class mvccScanner {
const DBIgnoredSeqNums txn_ignored_seqnums_;
const bool inconsistent_;
const bool tombstones_;
const bool fail_on_more_recent_;
const bool check_uncertainty_;
DBScanResults results_;
std::unique_ptr<chunkedBuffer> kvs_;
Expand Down
11 changes: 11 additions & 0 deletions pkg/roachpb/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,17 @@ func (e *WriteIntentError) message(_ *Error) string {

var _ ErrorDetailInterface = &WriteIntentError{}

// NewWriteTooOldError creates a new write too old error. The function accepts
// the timestamp of the operation that hit the error, along with the timestamp
// immediately after the existing write which had a higher timestamp and which
// caused the error.
func NewWriteTooOldError(operationTS, actualTS hlc.Timestamp) *WriteTooOldError {
return &WriteTooOldError{
Timestamp: operationTS,
ActualTimestamp: actualTS,
}
}

func (e *WriteTooOldError) Error() string {
return e.message(nil)
}
Expand Down
Loading

0 comments on commit a25a9c9

Please sign in to comment.