From a25a9c92b4aa4dab7067077e894776bcdd3470da Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Wed, 29 Jan 2020 00:40:09 -0500 Subject: [PATCH] storage/engine: expose FailOnMoreRecent MVCC{Get/Scan}Option Relates to #40205. This change introduces a new option to `MVCCGetOptions` and `MVCCScanOptions` that causes reads to throw an error if they observe an MVCC version at a timestamp above their read timestamp. Specifically, when the option is enabled, a `WriteTooOldError` will be returned if a scan observes an MVCC version with a timestamp above the read. Similarly, a `WriteIntentError` will be returned if a scan observes another transaction's intent, even if that intent has a timestamp above the scan's read timestamp. This option will be used in the future by `ScanRequests` and `ReverseScanRequests` that intend to acquire unreplicated key-level locks, which will power `SELECT FOR UPDATE`. These scans do not want to ignore existing MVCC versions at higher timestamps like traditional scans do. Instead, they want to throw errors and force their transaction to increase its timestamp through either a refresh or a retry. This was previously prototyped in 4a8e8dc. Interestingly, this is not new logic to the MVCC layer. This behavior is exactly the same as that of the initial key-value lookup performed during MVCC writes (see `mvccPutInternal`). It's fitting that behavior needed for `SELECT FOR UPDATE` would mirror that already exhibited by the read portion of read-write operations. This also hints at an opportunity to potentially use this option to merge the two implementations and get rid of custom logic like `mvccGetInternal` that only exists on the write path. We'd need to be careful about doing so though, as this code is heavily tuned. Release note: None --- c-deps/libroach/include/libroach.h | 5 +- c-deps/libroach/mvcc.cc | 14 +- c-deps/libroach/mvcc.h | 60 +++++-- pkg/roachpb/errors.go | 11 ++ pkg/storage/engine/mvcc.go | 98 ++++++---- pkg/storage/engine/mvcc_history_test.go | 10 +- pkg/storage/engine/pebble_mvcc_scanner.go | 57 ++++-- pkg/storage/engine/rocksdb.go | 38 +++- .../mvcc_histories/read_fail_on_more_recent | 168 ++++++++++++++++++ 9 files changed, 376 insertions(+), 85 deletions(-) create mode 100644 pkg/storage/engine/testdata/mvcc_histories/read_fail_on_more_recent diff --git a/c-deps/libroach/include/libroach.h b/c-deps/libroach/include/libroach.h index ecb7ac3ad3f5..18bb4ec8719c 100644 --- a/c-deps/libroach/include/libroach.h +++ b/c-deps/libroach/include/libroach.h @@ -358,15 +358,16 @@ typedef struct { DBStatus status; DBChunkedBuffer data; DBSlice intents; + DBTimestamp write_too_old_timestamp; DBTimestamp uncertainty_timestamp; DBSlice resume_key; } DBScanResults; DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTxn txn, - bool inconsistent, bool tombstones); + bool inconsistent, bool tombstones, bool fail_on_more_recent); DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp, int64_t max_keys, DBTxn txn, bool inconsistent, bool reverse, - bool tombstones); + bool tombstones, bool fail_on_more_recent); // DBStatsResult contains various runtime stats for RocksDB. typedef struct { diff --git a/c-deps/libroach/mvcc.cc b/c-deps/libroach/mvcc.cc index c838387ff9f6..4518f250ea07 100644 --- a/c-deps/libroach/mvcc.cc +++ b/c-deps/libroach/mvcc.cc @@ -269,28 +269,28 @@ DBStatus MVCCFindSplitKey(DBIterator* iter, DBKey start, DBKey min_split, int64_ } DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTxn txn, - bool inconsistent, bool tombstones) { + bool inconsistent, bool tombstones, bool fail_on_more_recent) { // Get is implemented as a scan where we retrieve a single key. We specify an // empty key for the end key which will ensure we don't retrieve a key // different than the start key. This is a bit of a hack. const DBSlice end = {0, 0}; ScopedStats scoped_iter(iter); mvccForwardScanner scanner(iter, key, end, timestamp, 1 /* max_keys */, txn, inconsistent, - tombstones); + tombstones, fail_on_more_recent); return scanner.get(); } DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp, int64_t max_keys, DBTxn txn, bool inconsistent, bool reverse, - bool tombstones) { + bool tombstones, bool fail_on_more_recent) { ScopedStats scoped_iter(iter); if (reverse) { - mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, txn, inconsistent, - tombstones); + mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, txn, inconsistent, tombstones, + fail_on_more_recent); return scanner.scan(); } else { - mvccForwardScanner scanner(iter, start, end, timestamp, max_keys, txn, inconsistent, - tombstones); + mvccForwardScanner scanner(iter, start, end, timestamp, max_keys, txn, inconsistent, tombstones, + fail_on_more_recent); return scanner.scan(); } } diff --git a/c-deps/libroach/mvcc.h b/c-deps/libroach/mvcc.h index 3ba99cfa7e55..870393f47bea 100644 --- a/c-deps/libroach/mvcc.h +++ b/c-deps/libroach/mvcc.h @@ -48,7 +48,7 @@ static const int kMaxItersBeforeSeek = 10; template class mvccScanner { public: mvccScanner(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp, int64_t max_keys, - DBTxn txn, bool inconsistent, bool tombstones) + DBTxn txn, bool inconsistent, bool tombstones, bool fail_on_more_recent) : iter_(iter), iter_rep_(iter->rep.get()), start_key_(ToSlice(start)), @@ -62,6 +62,7 @@ template class mvccScanner { txn_ignored_seqnums_(txn.ignored_seqnums), inconsistent_(inconsistent), tombstones_(tombstones), + fail_on_more_recent_(fail_on_more_recent), check_uncertainty_(timestamp < txn.max_timestamp), kvs_(new chunkedBuffer), intents_(new rocksdb::WriteBatch), @@ -254,6 +255,13 @@ template class mvccScanner { return true; } + bool writeTooOldError(DBTimestamp ts) { + results_.write_too_old_timestamp = ts; + kvs_->Clear(); + intents_->Clear(); + return false; + } + bool uncertaintyError(DBTimestamp ts) { results_.uncertainty_timestamp = ts; kvs_->Clear(); @@ -276,8 +284,15 @@ template class mvccScanner { return addAndAdvance(cur_value_); } + if (fail_on_more_recent_) { + // 2. Our txn's read timestamp is less than the most recent + // version's timestamp and the scanner has been configured + // to throw a write too old error on more recent versions. + return writeTooOldError(cur_timestamp_); + } + if (check_uncertainty_) { - // 2. Our txn's read timestamp is less than the max timestamp + // 3. Our txn's read timestamp is less than the max timestamp // seen by the txn. We need to check for clock uncertainty // errors. if (txn_max_timestamp_ >= cur_timestamp_) { @@ -288,7 +303,7 @@ template class mvccScanner { return seekVersion(txn_max_timestamp_, true); } - // 3. Our txn's read timestamp is greater than or equal to the + // 4. Our txn's read timestamp is greater than or equal to the // max timestamp seen by the txn so clock uncertainty checks are // unnecessary. We need to seek to the desired version of the // value (i.e. one with a timestamp earlier than our read @@ -305,7 +320,7 @@ template class mvccScanner { } if (meta_.has_raw_bytes()) { - // 4. Emit immediately if the value is inline. + // 5. Emit immediately if the value is inline. return addAndAdvance(meta_.raw_bytes()); } @@ -326,8 +341,12 @@ template class mvccScanner { // Intents for other transactions are visible at or below: // max(txn.max_timestamp, read_timestamp) const DBTimestamp max_visible_timestamp = check_uncertainty_ ? txn_max_timestamp_ : timestamp_; - if (max_visible_timestamp < meta_timestamp && !own_intent) { - // 5. The key contains an intent, but we're reading before the + // ... unless we're intending on failing on more recent writes, + // in which case other transaction's intents are always visible. + const bool other_intent_visible = + max_visible_timestamp >= meta_timestamp || fail_on_more_recent_; + if (!own_intent && !other_intent_visible) { + // 6. The key contains an intent, but we're reading before the // intent. Seek to the desired version. Note that if we own the // intent (i.e. we're reading transactionally) we want to read // the intent regardless of our read timestamp and fall into @@ -336,7 +355,7 @@ template class mvccScanner { } if (inconsistent_) { - // 6. The key contains an intent and we're doing an inconsistent + // 7. The key contains an intent and we're doing an inconsistent // read at a timestamp newer than the intent. We ignore the // intent by insisting that the timestamp we're reading at is a // historical timestamp < the intent timestamp. However, we @@ -354,24 +373,30 @@ template class mvccScanner { } if (!own_intent) { - // 7. The key contains an intent which was not written by our - // transaction and our read timestamp is newer than that of the - // intent. Note that this will trigger an error on the Go - // side. We continue scanning so that we can return all of the - // intents in the scan range. + // 8. The key contains an intent which was not written by our + // transaction and either: + // - our read timestamp is equal to or newer than that of the + // intent + // - our read timestamp is older than that of the intent but + // the intent is in our transaction's uncertainty interval + // - our read timestamp is older than that of the intent but + // we want to fail on more recent writes + // Note that this will trigger an error on the Go side. We + // continue scanning so that we can return all of the intents + // in the scan range. intents_->Put(cur_raw_key_, cur_value_); return advanceKey(); } if (txn_epoch_ == meta_.txn().epoch()) { if (txn_sequence_ >= meta_.txn().sequence() && !seqNumIsIgnored(meta_.txn().sequence())) { - // 8. We're reading our own txn's intent at an equal or higher sequence. + // 9. We're reading our own txn's intent at an equal or higher sequence. // Note that we read at the intent timestamp, not at our read timestamp // as the intent timestamp may have been pushed forward by another // transaction. Txn's always need to read their own writes. return seekVersion(meta_timestamp, false); } else { - // 9. We're reading our own txn's intent at a lower sequence than is + // 10. We're reading our own txn's intent at a lower sequence than is // currently present in the intent. This means the intent we're seeing // was written at a higher sequence than the read and that there may or // may not be earlier versions of the intent (with lower sequence @@ -382,7 +407,7 @@ template class mvccScanner { if (found) { return advanceKey(); } - // 10. If no value in the intent history has a sequence number equal to + // 11. If no value in the intent history has a sequence number equal to // or less than the read, we must ignore the intents laid down by the // transaction all together. We ignore the intent by insisting that the // timestamp we're reading at is a historical timestamp < the intent @@ -392,7 +417,7 @@ template class mvccScanner { } if (txn_epoch_ < meta_.txn().epoch()) { - // 11. We're reading our own txn's intent but the current txn has + // 12. We're reading our own txn's intent but the current txn has // an earlier epoch than the intent. Return an error so that the // earlier incarnation of our transaction aborts (presumably // this is some operation that was retried). @@ -400,7 +425,7 @@ template class mvccScanner { txn_epoch_, meta_.txn().epoch())); } - // 12. We're reading our own txn's intent but the current txn has a + // 13. We're reading our own txn's intent but the current txn has a // later epoch than the intent. This can happen if the txn was // restarted and an earlier iteration wrote the value we're now // reading. In this case, we ignore the intent and read the @@ -729,6 +754,7 @@ template class mvccScanner { const DBIgnoredSeqNums txn_ignored_seqnums_; const bool inconsistent_; const bool tombstones_; + const bool fail_on_more_recent_; const bool check_uncertainty_; DBScanResults results_; std::unique_ptr kvs_; diff --git a/pkg/roachpb/errors.go b/pkg/roachpb/errors.go index eb43fd94f4ac..22b6fd3e0163 100644 --- a/pkg/roachpb/errors.go +++ b/pkg/roachpb/errors.go @@ -588,6 +588,17 @@ func (e *WriteIntentError) message(_ *Error) string { var _ ErrorDetailInterface = &WriteIntentError{} +// NewWriteTooOldError creates a new write too old error. The function accepts +// the timestamp of the operation that hit the error, along with the timestamp +// immediately after the existing write which had a higher timestamp and which +// caused the error. +func NewWriteTooOldError(operationTS, actualTS hlc.Timestamp) *WriteTooOldError { + return &WriteTooOldError{ + Timestamp: operationTS, + ActualTimestamp: actualTS, + } +} + func (e *WriteTooOldError) Error() string { return e.message(nil) } diff --git a/pkg/storage/engine/mvcc.go b/pkg/storage/engine/mvcc.go index f419cd7cda3c..971fab2056a8 100644 --- a/pkg/storage/engine/mvcc.go +++ b/pkg/storage/engine/mvcc.go @@ -707,9 +707,20 @@ func (b *getBuffer) release() { // MVCCGetOptions bundles options for the MVCCGet family of functions. type MVCCGetOptions struct { // See the documentation for MVCCGet for information on these parameters. - Inconsistent bool - Tombstones bool - Txn *roachpb.Transaction + Inconsistent bool + Tombstones bool + FailOnMoreRecent bool + Txn *roachpb.Transaction +} + +func (opts *MVCCGetOptions) validate() error { + if opts.Inconsistent && opts.Txn != nil { + return errors.Errorf("cannot allow inconsistent reads within a transaction") + } + if opts.Inconsistent && opts.FailOnMoreRecent { + return errors.Errorf("cannot allow inconsistent reads with fail on more recent option") + } + return nil } // MVCCGet returns the most recent value for the specified key whose timestamp @@ -727,6 +738,12 @@ type MVCCGetOptions struct { // // Note that transactional gets must be consistent. Put another way, only // non-transactional gets may be inconsistent. +// +// When reading in "fail on more recent" mode, a WriteTooOldError will be +// returned if the read observes a version with a timestamp above the read +// timestamp. Similarly, a WriteIntentError will be returned if the read +// observes another transaction's intent, even if it has a timestamp above +// the read timestamp. func MVCCGet( ctx context.Context, reader Reader, key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions, ) (*roachpb.Value, *roachpb.Intent, error) { @@ -738,14 +755,14 @@ func MVCCGet( func mvccGet( ctx context.Context, iter Iterator, key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions, ) (value *roachpb.Value, intent *roachpb.Intent, err error) { + if len(key) == 0 { + return nil, nil, emptyKeyError() + } if timestamp.WallTime < 0 { return nil, nil, errors.Errorf("cannot write to %q at timestamp %s", key, timestamp) } - if opts.Inconsistent && opts.Txn != nil { - return nil, nil, errors.Errorf("cannot allow inconsistent reads within a transaction") - } - if len(key) == 0 { - return nil, nil, emptyKeyError() + if err := opts.validate(); err != nil { + return nil, nil, err } // If the iterator has a specialized implementation, defer to that. @@ -760,12 +777,13 @@ func mvccGet( // specify an empty key for the end key which will ensure we don't retrieve a // key different than the start key. This is a bit of a hack. *mvccScanner = pebbleMVCCScanner{ - parent: iter, - start: key, - ts: timestamp, - maxKeys: 1, - inconsistent: opts.Inconsistent, - tombstones: opts.Tombstones, + parent: iter, + start: key, + ts: timestamp, + maxKeys: 1, + inconsistent: opts.Inconsistent, + tombstones: opts.Tombstones, + failOnMoreRecent: opts.FailOnMoreRecent, } mvccScanner.init(opts.Txn) @@ -1662,9 +1680,7 @@ func mvccPutInternal( // instead of allowing their transactions to continue and be retried // before committing. writeTimestamp.Forward(metaTimestamp.Next()) - maybeTooOldErr = &roachpb.WriteTooOldError{ - Timestamp: readTimestamp, ActualTimestamp: writeTimestamp, - } + maybeTooOldErr = roachpb.NewWriteTooOldError(readTimestamp, writeTimestamp) // If we're in a transaction, always get the value at the orig // timestamp. if txn != nil { @@ -2290,12 +2306,12 @@ func mvccScanToBytes( timestamp hlc.Timestamp, opts MVCCScanOptions, ) (kvData [][]byte, numKVs int64, resumeSpan *roachpb.Span, intents []roachpb.Intent, err error) { - if opts.Inconsistent && opts.Txn != nil { - return nil, 0, nil, nil, errors.Errorf("cannot allow inconsistent reads within a transaction") - } if len(endKey) == 0 { return nil, 0, nil, nil, emptyKeyError() } + if err := opts.validate(); err != nil { + return nil, 0, nil, nil, err + } if max == 0 { resumeSpan = &roachpb.Span{Key: key, EndKey: endKey} return nil, 0, resumeSpan, nil, nil @@ -2310,14 +2326,15 @@ func mvccScanToBytes( defer pebbleMVCCScannerPool.Put(mvccScanner) *mvccScanner = pebbleMVCCScanner{ - parent: iter, - reverse: opts.Reverse, - start: key, - end: endKey, - ts: timestamp, - maxKeys: max, - inconsistent: opts.Inconsistent, - tombstones: opts.Tombstones, + parent: iter, + reverse: opts.Reverse, + start: key, + end: endKey, + ts: timestamp, + maxKeys: max, + inconsistent: opts.Inconsistent, + tombstones: opts.Tombstones, + failOnMoreRecent: opts.FailOnMoreRecent, } mvccScanner.init(opts.Txn) @@ -2411,10 +2428,21 @@ type MVCCScanOptions struct { // to return no results. // See the documentation for MVCCScan for information on these parameters. - Inconsistent bool - Tombstones bool - Reverse bool - Txn *roachpb.Transaction + Inconsistent bool + Tombstones bool + Reverse bool + FailOnMoreRecent bool + Txn *roachpb.Transaction +} + +func (opts *MVCCScanOptions) validate() error { + if opts.Inconsistent && opts.Txn != nil { + return errors.Errorf("cannot allow inconsistent reads within a transaction") + } + if opts.Inconsistent && opts.FailOnMoreRecent { + return errors.Errorf("cannot allow inconsistent reads with fail on more recent option") + } + return nil } // MVCCScan scans the key range [key, endKey) in the provided reader up to some @@ -2448,6 +2476,12 @@ type MVCCScanOptions struct { // // Note that transactional scans must be consistent. Put another way, only // non-transactional scans may be inconsistent. +// +// When scanning in "fail on more recent" mode, a WriteTooOldError will be +// returned if the scan observes a version with a timestamp above the read +// timestamp. Similarly, a WriteIntentError will be returned if the scan +// observes another transaction's intent, even if it has a timestamp above +// the read timestamp. func MVCCScan( ctx context.Context, reader Reader, diff --git a/pkg/storage/engine/mvcc_history_test.go b/pkg/storage/engine/mvcc_history_test.go index fd4dee347eb0..3570d1c986ab 100644 --- a/pkg/storage/engine/mvcc_history_test.go +++ b/pkg/storage/engine/mvcc_history_test.go @@ -48,10 +48,10 @@ import ( // // cput [t=] [ts=[,]] [resolve [status=]] k= v= [raw] [cond=] // del [t=] [ts=[,]] [resolve [status=]] k= -// get [t=] [ts=[,]] [resolve [status=]] k= [inconsistent] [tombstones] +// get [t=] [ts=[,]] [resolve [status=]] k= [inconsistent] [tombstones] [failOnMoreRecent] // increment [t=] [ts=[,]] [resolve [status=]] k= [inc=] // put [t=] [ts=[,]] [resolve [status=]] k= v= [raw] -// scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [tombstones] [reverse] +// scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [tombstones] [reverse] [failOnMoreRecent] // // merge [ts=[,]] k= v= [raw] // @@ -598,6 +598,9 @@ func cmdGet(e *evalCtx) error { if e.hasArg("tombstones") { opts.Tombstones = true } + if e.hasArg("failOnMoreRecent") { + opts.FailOnMoreRecent = true + } val, intent, err := MVCCGet(e.ctx, e.engine, key, ts, opts) if err != nil { return err @@ -691,6 +694,9 @@ func cmdScan(e *evalCtx) error { if e.hasArg("reverse") { opts.Reverse = true } + if e.hasArg("failOnMoreRecent") { + opts.FailOnMoreRecent = true + } max := int64(-1) if e.hasArg("max") { var imax int diff --git a/pkg/storage/engine/pebble_mvcc_scanner.go b/pkg/storage/engine/pebble_mvcc_scanner.go index 0c4cdb207fb4..98678270b065 100644 --- a/pkg/storage/engine/pebble_mvcc_scanner.go +++ b/pkg/storage/engine/pebble_mvcc_scanner.go @@ -108,6 +108,7 @@ type pebbleMVCCScanner struct { // Bools copied over from MVCC{Scan,Get}Options. See the comment on the // package level MVCCScan for what these mean. inconsistent, tombstones bool + failOnMoreRecent bool checkUncertainty bool keyBuf []byte savedBuf []byte @@ -239,6 +240,16 @@ func (p *pebbleMVCCScanner) getFromIntentHistory() bool { return true } +// Returns a write too old error with the specified timestamp. +func (p *pebbleMVCCScanner) writeTooOldError(ts hlc.Timestamp) bool { + // The txn can't write at the existing timestamp, so we provide the error + // with the timestamp immediately after it. + p.err = roachpb.NewWriteTooOldError(p.ts, ts.Next()) + p.results.clear() + p.intents.Reset() + return false +} + // Returns an uncertainty error with the specified timestamp and p.txn. func (p *pebbleMVCCScanner) uncertaintyError(ts hlc.Timestamp) bool { p.err = roachpb.NewReadWithinUncertaintyIntervalError(p.ts, ts, p.txn) @@ -258,8 +269,15 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { return p.addAndAdvance(p.curValue) } + if p.failOnMoreRecent { + // 2. Our txn's read timestamp is less than the most recent + // version's timestamp and the scanner has been configured + // to throw a write too old error on more recent versions. + return p.writeTooOldError(p.curTS) + } + if p.checkUncertainty { - // 2. Our txn's read timestamp is less than the max timestamp + // 3. Our txn's read timestamp is less than the max timestamp // seen by the txn. We need to check for clock uncertainty // errors. if p.curTS.LessEq(p.txn.MaxTimestamp) { @@ -269,7 +287,7 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { return p.seekVersion(p.txn.MaxTimestamp, true) } - // 3. Our txn's read timestamp is greater than or equal to the + // 4. Our txn's read timestamp is greater than or equal to the // max timestamp seen by the txn so clock uncertainty checks are // unnecessary. We need to seek to the desired version of the // value (i.e. one with a timestamp earlier than our read @@ -287,7 +305,7 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { return false } if len(p.meta.RawBytes) != 0 { - // 4. Emit immediately if the value is inline. + // 5. Emit immediately if the value is inline. return p.addAndAdvance(p.meta.RawBytes) } @@ -313,9 +331,10 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { if p.checkUncertainty { maxVisibleTS = p.txn.MaxTimestamp } + otherIntentVisible := metaTS.LessEq(maxVisibleTS) || p.failOnMoreRecent - if maxVisibleTS.Less(metaTS) && !ownIntent { - // 5. The key contains an intent, but we're reading before the + if !ownIntent && !otherIntentVisible { + // 6. The key contains an intent, but we're reading before the // intent. Seek to the desired version. Note that if we own the // intent (i.e. we're reading transactionally) we want to read // the intent regardless of our read timestamp and fall into @@ -324,7 +343,7 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { } if p.inconsistent { - // 6. The key contains an intent and we're doing an inconsistent + // 7. The key contains an intent and we're doing an inconsistent // read at a timestamp newer than the intent. We ignore the // intent by insisting that the timestamp we're reading at is a // historical timestamp < the intent timestamp. However, we @@ -347,11 +366,17 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { } if !ownIntent { - // 7. The key contains an intent which was not written by our - // transaction and our read timestamp is newer than that of the - // intent. Note that this will trigger an error on the Go - // side. We continue scanning so that we can return all of the - // intents in the scan range. + // 8. The key contains an intent which was not written by our + // transaction and either: + // - our read timestamp is equal to or newer than that of the + // intent + // - our read timestamp is older than that of the intent but + // the intent is in our transaction's uncertainty interval + // - our read timestamp is older than that of the intent but + // we want to fail on more recent writes + // Note that this will trigger an error higher up the stack. We + // continue scanning so that we can return all of the intents + // in the scan range. p.keyBuf = EncodeKeyToBuf(p.keyBuf[:0], p.curMVCCKey()) p.err = p.intents.Set(p.keyBuf, p.curValue, nil) if p.err != nil { @@ -362,14 +387,14 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { if p.txnEpoch == p.meta.Txn.Epoch { if p.txnSequence >= p.meta.Txn.Sequence && !enginepb.TxnSeqIsIgnored(p.meta.Txn.Sequence, p.txnIgnoredSeqNums) { - // 8. We're reading our own txn's intent at an equal or higher sequence. + // 9. We're reading our own txn's intent at an equal or higher sequence. // Note that we read at the intent timestamp, not at our read timestamp // as the intent timestamp may have been pushed forward by another // transaction. Txn's always need to read their own writes. return p.seekVersion(metaTS, false) } - // 9. We're reading our own txn's intent at a lower sequence than is + // 10. We're reading our own txn's intent at a lower sequence than is // currently present in the intent. This means the intent we're seeing // was written at a higher sequence than the read and that there may or // may not be earlier versions of the intent (with lower sequence @@ -382,7 +407,7 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { } return p.advanceKey() } - // 10. If no value in the intent history has a sequence number equal to + // 11. If no value in the intent history has a sequence number equal to // or less than the read, we must ignore the intents laid down by the // transaction all together. We ignore the intent by insisting that the // timestamp we're reading at is a historical timestamp < the intent @@ -391,7 +416,7 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { } if p.txnEpoch < p.meta.Txn.Epoch { - // 11. We're reading our own txn's intent but the current txn has + // 12. We're reading our own txn's intent but the current txn has // an earlier epoch than the intent. Return an error so that the // earlier incarnation of our transaction aborts (presumably // this is some operation that was retried). @@ -400,7 +425,7 @@ func (p *pebbleMVCCScanner) getAndAdvance() bool { return false } - // 12. We're reading our own txn's intent but the current txn has a + // 13. We're reading our own txn's intent but the current txn has a // later epoch than the intent. This can happen if the txn was // restarted and an earlier iteration wrote the value we're now // reading. In this case, we ignore the intent and read the diff --git a/pkg/storage/engine/rocksdb.go b/pkg/storage/engine/rocksdb.go index 87266e0f1093..c4bf78ad71af 100644 --- a/pkg/storage/engine/rocksdb.go +++ b/pkg/storage/engine/rocksdb.go @@ -2415,12 +2415,15 @@ func (r *rocksDBIterator) MVCCGet( r.clearState() state := C.MVCCGet( r.iter, goToCSlice(key), goToCTimestamp(timestamp), goToCTxn(opts.Txn), - C.bool(opts.Inconsistent), C.bool(opts.Tombstones), + C.bool(opts.Inconsistent), C.bool(opts.Tombstones), C.bool(opts.FailOnMoreRecent), ) if err := statusToError(state.status); err != nil { return nil, nil, err } + if err := writeTooOldToError(timestamp, state.write_too_old_timestamp); err != nil { + return nil, nil, err + } if err := uncertaintyToError(timestamp, state.uncertainty_timestamp, opts.Txn); err != nil { return nil, nil, err } @@ -2484,11 +2487,15 @@ func (r *rocksDBIterator) MVCCScan( goToCTimestamp(timestamp), C.int64_t(max), goToCTxn(opts.Txn), C.bool(opts.Inconsistent), C.bool(opts.Reverse), C.bool(opts.Tombstones), + C.bool(opts.FailOnMoreRecent), ) if err := statusToError(state.status); err != nil { return nil, 0, nil, nil, err } + if err := writeTooOldToError(timestamp, state.write_too_old_timestamp); err != nil { + return nil, 0, nil, nil, err + } if err := uncertaintyToError(timestamp, state.uncertainty_timestamp, opts.Txn); err != nil { return nil, 0, nil, nil, err } @@ -2715,6 +2722,13 @@ func goToCTimestamp(ts hlc.Timestamp) C.DBTimestamp { } } +func cToGoTimestamp(ts C.DBTimestamp) hlc.Timestamp { + return hlc.Timestamp{ + WallTime: int64(ts.wall_time), + Logical: int32(ts.logical), + } +} + func goToCTxn(txn *roachpb.Transaction) C.DBTxn { var r C.DBTxn if txn != nil { @@ -2745,16 +2759,22 @@ func statusToError(s C.DBStatus) error { return &Error{msg: cStringToGoString(s)} } +func writeTooOldToError(readTS hlc.Timestamp, existingCTS C.DBTimestamp) error { + existingTS := cToGoTimestamp(existingCTS) + if !existingTS.IsEmpty() { + // The txn can't write at the existing timestamp, so we provide the + // error with the timestamp immediately after it. + return roachpb.NewWriteTooOldError(readTS, existingTS.Next()) + } + return nil +} + func uncertaintyToError( - readTS hlc.Timestamp, existingTS C.DBTimestamp, txn *roachpb.Transaction, + readTS hlc.Timestamp, existingCTS C.DBTimestamp, txn *roachpb.Transaction, ) error { - if existingTS.wall_time != 0 || existingTS.logical != 0 { - return roachpb.NewReadWithinUncertaintyIntervalError( - readTS, hlc.Timestamp{ - WallTime: int64(existingTS.wall_time), - Logical: int32(existingTS.logical), - }, - txn) + existingTS := cToGoTimestamp(existingCTS) + if !existingTS.IsEmpty() { + return roachpb.NewReadWithinUncertaintyIntervalError(readTS, existingTS, txn) } return nil } diff --git a/pkg/storage/engine/testdata/mvcc_histories/read_fail_on_more_recent b/pkg/storage/engine/testdata/mvcc_histories/read_fail_on_more_recent new file mode 100644 index 000000000000..3bbeec50dd5e --- /dev/null +++ b/pkg/storage/engine/testdata/mvcc_histories/read_fail_on_more_recent @@ -0,0 +1,168 @@ +# Setup: +# k1: value @ ts 10 +# k2: intent @ ts 10 + +run ok +put k=k1 v=v ts=10,0 +---- +>> at end: +data: "k1"/0.000000010,0 -> /BYTES/v + +run ok +with t=A + txn_begin ts=10,0 + put k=k2 v=v +---- +>> at end: +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=0.000000010,0 min=0,0 seq=0} rw=true stat=PENDING rts=0.000000010,0 wto=false max=0,0 +data: "k1"/0.000000010,0 -> /BYTES/v +meta: "k2"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=0.000000010,0 min=0,0 seq=0} ts=0.000000010,0 del=false klen=12 vlen=6 +data: "k2"/0.000000010,0 -> /BYTES/v + +# Test cases: +# +# for k in (k1, k2): +# for op in (get, scan): +# for ts in (9, 10, 11): +# for failOnMoreRecent in (false, true): +# testCase() +# + +run ok +get k=k1 ts=9,0 +---- +get: "k1" -> + +run error +get k=k1 ts=9,0 failOnMoreRecent +---- +error: (*roachpb.WriteTooOldError:) WriteTooOldError: write at timestamp 0.000000009,0 too old; wrote at 0.000000010,1 + +run ok +get k=k1 ts=10,0 +---- +get: "k1" -> /BYTES/v @0.000000010,0 + +run ok +get k=k1 ts=10,0 failOnMoreRecent +---- +get: "k1" -> /BYTES/v @0.000000010,0 + +run ok +get k=k1 ts=11,0 +---- +get: "k1" -> /BYTES/v @0.000000010,0 + +run ok +get k=k1 ts=11,0 failOnMoreRecent +---- +get: "k1" -> /BYTES/v @0.000000010,0 + +run ok +scan k=k1 end=k2 ts=9,0 +---- +scan: "k1"-"k2" -> + +run error +scan k=k1 end=k2 ts=9,0 failOnMoreRecent +---- +scan: "k1"-"k2" -> +error: (*roachpb.WriteTooOldError:) WriteTooOldError: write at timestamp 0.000000009,0 too old; wrote at 0.000000010,1 + +run ok +scan k=k1 end=k2 ts=10,0 +---- +scan: "k1" -> /BYTES/v @0.000000010,0 + +run ok +scan k=k1 end=k2 ts=10,0 failOnMoreRecent +---- +scan: "k1" -> /BYTES/v @0.000000010,0 + +run ok +scan k=k1 end=k2 ts=11,0 +---- +scan: "k1" -> /BYTES/v @0.000000010,0 + +run ok +scan k=k1 end=k2 ts=11,0 failOnMoreRecent +---- +scan: "k1" -> /BYTES/v @0.000000010,0 + +run ok +get k=k2 ts=9,0 +---- +get: "k2" -> + +run error +get k=k2 ts=9,0 failOnMoreRecent +---- +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +get k=k2 ts=10,0 +---- +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +get k=k2 ts=10,0 failOnMoreRecent +---- +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +get k=k2 ts=11,0 +---- +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +get k=k2 ts=11,0 failOnMoreRecent +---- +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run ok +scan k=k2 end=k3 ts=9,0 +---- +scan: "k2"-"k3" -> + +run error +scan k=k2 end=k3 ts=9,0 failOnMoreRecent +---- +scan: "k2"-"k3" -> +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +scan k=k2 end=k3 ts=10,0 +---- +scan: "k2"-"k3" -> +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +scan k=k2 end=k3 ts=10,0 failOnMoreRecent +---- +scan: "k2"-"k3" -> +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +scan k=k2 end=k3 ts=11,0 +---- +scan: "k2"-"k3" -> +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +run error +scan k=k2 end=k3 ts=11,0 failOnMoreRecent +---- +scan: "k2"-"k3" -> +error: (*roachpb.WriteIntentError:) conflicting intents on "k2" + +# The failOnMoreRecent and inconsistent options cannot be used together. + +run error +get k=k1 ts=9,0 inconsistent failOnMoreRecent +---- +error: (*withstack.withStack:) cannot allow inconsistent reads with fail on more recent option + +run error +scan k=k1 end=k2 ts=9,0 inconsistent failOnMoreRecent +---- +scan: "k1"-"k2" -> +error: (*withstack.withStack:) cannot allow inconsistent reads with fail on more recent option