Skip to content

Commit

Permalink
engine: correctly handle intents from previous epochs above read time…
Browse files Browse the repository at this point in the history
…stamp

Fixes "Bug 1" from cockroachdb#36089 (comment).

This commit fixes the bug described in the referenced issue where MVCCScan can
read committed MVCC values at timestamps larger than a scan's read timestamp if
it finds an intent for the same transaction but from a previous epoch at a
timestamp larger than the scan's read timestamp.

Fixing this bug resolves the current issue in cockroachdb#36089.

Release note (bug fix): Fix bug where MVCC value at future timestamp is
returned after a transaction restart.
  • Loading branch information
nvanbenschoten committed Sep 9, 2019
1 parent f16eaad commit b57d372
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 7 deletions.
14 changes: 11 additions & 3 deletions c-deps/libroach/mvcc.h
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,14 @@ template <bool reverse> class mvccScanner {

const bool own_intent = (meta_.txn().id() == txn_id_);
const DBTimestamp meta_timestamp = ToDBTimestamp(meta_.timestamp());
// meta_timestamp is the timestamp of an intent value, which we may or may
// not end up ignoring, depending on factors codified below. If we do ignore
// the intent then we want to read at a lower timestamp that's strictly
// below the intent timestamp (to skip the intent), but also does not exceed
// our read timestamp (to avoid erroneously picking up future committed
// values); this timestamp is prev_timestamp.
const DBTimestamp prev_timestamp =
timestamp_ < meta_timestamp ? timestamp_ : PrevTimestamp(meta_timestamp);
if (timestamp_ < meta_timestamp && !own_intent) {
// 5. The key contains an intent, but we're reading before the
// intent. Seek to the desired version. Note that if we own the
Expand All @@ -281,7 +289,7 @@ template <bool reverse> class mvccScanner {
return false;
}
intents_->Put(cur_raw_key_, cur_value_);
return seekVersion(PrevTimestamp(ToDBTimestamp(meta_.timestamp())), false);
return seekVersion(prev_timestamp, false);
}

if (!own_intent) {
Expand Down Expand Up @@ -318,7 +326,7 @@ template <bool reverse> class mvccScanner {
// transaction all together. We ignore the intent by insisting that the
// timestamp we're reading at is a historical timestamp < the intent
// timestamp.
return seekVersion(PrevTimestamp(ToDBTimestamp(meta_.timestamp())), false);
return seekVersion(prev_timestamp, false);
}
}

Expand All @@ -336,7 +344,7 @@ template <bool reverse> class mvccScanner {
// restarted and an earlier iteration wrote the value we're now
// reading. In this case, we ignore the intent and read the
// previous value as if the transaction were starting fresh.
return seekVersion(PrevTimestamp(ToDBTimestamp(meta_.timestamp())), false);
return seekVersion(prev_timestamp, false);
}

// nextKey advances the iterator to point to the next MVCC key
Expand Down
16 changes: 12 additions & 4 deletions pkg/storage/engine/mvcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -866,7 +866,15 @@ func mvccGetInternal(
"failed to read with epoch %d due to a write intent with epoch %d",
txn.Epoch, meta.Txn.Epoch)
}
seekKey = seekKey.Next()
// Seek past the intent's timestamp and at least as far
// back as the read timestamp. This is necessary if the
// intent is at a higher timestamp than we're trying to
// read at.
if timestamp.Less(metaTimestamp) {
seekKey.Timestamp = timestamp
} else {
seekKey.Timestamp = metaTimestamp.Prev()
}
}
} else if txn != nil && timestamp.Less(txn.MaxTimestamp) {
// In this branch, the latest timestamp is ahead, and so the read of an
Expand All @@ -892,9 +900,9 @@ func mvccGetInternal(
// transaction, or in the absence of future versions that clock uncertainty
// would apply to.
seekKey.Timestamp = timestamp
if seekKey.Timestamp == (hlc.Timestamp{}) {
return nil, ignoredIntent, safeValue, nil
}
}
if seekKey.Timestamp == (hlc.Timestamp{}) {
return nil, ignoredIntent, safeValue, nil
}

iter.Seek(seekKey)
Expand Down
76 changes: 76 additions & 0 deletions pkg/storage/engine/mvcc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3527,6 +3527,82 @@ func TestMVCCGetWithDiffEpochs(t *testing.T) {
}
}

// TestMVCCGetWithDiffEpochsAndTimestamps writes a value first using
// epoch 1, then reads using epoch 2 with different timestamps to verify
// that values written during different transaction epochs are not visible.
//
// The test includes the case where the read at epoch 2 is at a *lower*
// timestamp than the intent write at epoch 1. This is not expected to
// happen commonly, but caused issues in #36089.
func TestMVCCGetWithDiffEpochsAndTimestamps(t *testing.T) {
defer leaktest.AfterTest(t)()

for _, impl := range mvccGetImpls {
t.Run(impl.name, func(t *testing.T) {
mvccGet := impl.fn

ctx := context.Background()
engine := createTestEngine()
defer engine.Close()

// Write initial value without a txn at timestamp 1.
if err := MVCCPut(ctx, engine, nil, testKey1, hlc.Timestamp{WallTime: 1}, value1, nil); err != nil {
t.Fatal(err)
}
// Write another value without a txn at timestamp 3.
if err := MVCCPut(ctx, engine, nil, testKey1, hlc.Timestamp{WallTime: 3}, value2, nil); err != nil {
t.Fatal(err)
}
// Now write using txn1, epoch 1.
txn1ts := makeTxn(*txn1, hlc.Timestamp{WallTime: 1})
// Bump epoch 1's write timestamp to timestamp 4.
txn1ts.Timestamp = hlc.Timestamp{WallTime: 4}
// Expected to hit WriteTooOld error but to still lay down intent.
err := MVCCPut(ctx, engine, nil, testKey1, txn1ts.OrigTimestamp, value3, txn1ts)
if wtoErr, ok := err.(*roachpb.WriteTooOldError); !ok {
t.Fatalf("unexpectedly not WriteTooOld: %s", err)
} else if expTS, actTS := txn1ts.Timestamp, wtoErr.ActualTimestamp; expTS != actTS {
t.Fatalf("expected write too old error with actual ts %s; got %s", expTS, actTS)
}
// Try reading using different epochs & timestamps.
testCases := []struct {
txn *roachpb.Transaction
readTS hlc.Timestamp
expValue *roachpb.Value
}{
// Epoch 1, read 1; should see new value3.
{txn1, hlc.Timestamp{WallTime: 1}, &value3},
// Epoch 1, read 2; should see new value3.
{txn1, hlc.Timestamp{WallTime: 2}, &value3},
// Epoch 1, read 3; should see new value3.
{txn1, hlc.Timestamp{WallTime: 3}, &value3},
// Epoch 1, read 4; should see new value3.
{txn1, hlc.Timestamp{WallTime: 4}, &value3},
// Epoch 1, read 5; should see new value3.
{txn1, hlc.Timestamp{WallTime: 5}, &value3},
// Epoch 2, read 1; should see committed value1.
{txn1e2, hlc.Timestamp{WallTime: 1}, &value1},
// Epoch 2, read 2; should see committed value1.
{txn1e2, hlc.Timestamp{WallTime: 2}, &value1},
// Epoch 2, read 3; should see committed value2.
{txn1e2, hlc.Timestamp{WallTime: 3}, &value2},
// Epoch 2, read 4; should see committed value2.
{txn1e2, hlc.Timestamp{WallTime: 4}, &value2},
// Epoch 2, read 5; should see committed value2.
{txn1e2, hlc.Timestamp{WallTime: 5}, &value2},
}
for i, test := range testCases {
t.Run(strconv.Itoa(i), func(t *testing.T) {
value, _, err := mvccGet(ctx, engine, testKey1, test.readTS, MVCCGetOptions{Txn: test.txn})
if err != nil || value == nil || !bytes.Equal(test.expValue.RawBytes, value.RawBytes) {
t.Errorf("test %d: expected value %q, err nil; got %+v, %v", i, test.expValue.RawBytes, value, err)
}
})
}
})
}
}

// TestMVCCGetWithOldEpoch writes a value first using epoch 2, then
// reads using epoch 1 to verify that the read will fail.
func TestMVCCGetWithOldEpoch(t *testing.T) {
Expand Down

0 comments on commit b57d372

Please sign in to comment.