From b57d372b896909d14199f9ab6d72cb382917e850 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Thu, 6 Jun 2019 19:01:49 -0400 Subject: [PATCH] engine: correctly handle intents from previous epochs above read timestamp Fixes "Bug 1" from https://github.com/cockroachdb/cockroach/issues/36089#issuecomment-499349410. This commit fixes the bug described in the referenced issue where MVCCScan can read committed MVCC values at timestamps larger than a scan's read timestamp if it finds an intent for the same transaction but from a previous epoch at a timestamp larger than the scan's read timestamp. Fixing this bug resolves the current issue in #36089. Release note (bug fix): Fix bug where MVCC value at future timestamp is returned after a transaction restart. --- c-deps/libroach/mvcc.h | 14 ++++-- pkg/storage/engine/mvcc.go | 16 +++++-- pkg/storage/engine/mvcc_test.go | 76 +++++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 7 deletions(-) diff --git a/c-deps/libroach/mvcc.h b/c-deps/libroach/mvcc.h index f44bb668cf68..3b0799cff75e 100644 --- a/c-deps/libroach/mvcc.h +++ b/c-deps/libroach/mvcc.h @@ -257,6 +257,14 @@ template class mvccScanner { const bool own_intent = (meta_.txn().id() == txn_id_); const DBTimestamp meta_timestamp = ToDBTimestamp(meta_.timestamp()); + // meta_timestamp is the timestamp of an intent value, which we may or may + // not end up ignoring, depending on factors codified below. If we do ignore + // the intent then we want to read at a lower timestamp that's strictly + // below the intent timestamp (to skip the intent), but also does not exceed + // our read timestamp (to avoid erroneously picking up future committed + // values); this timestamp is prev_timestamp. + const DBTimestamp prev_timestamp = + timestamp_ < meta_timestamp ? timestamp_ : PrevTimestamp(meta_timestamp); if (timestamp_ < meta_timestamp && !own_intent) { // 5. The key contains an intent, but we're reading before the // intent. Seek to the desired version. Note that if we own the @@ -281,7 +289,7 @@ template class mvccScanner { return false; } intents_->Put(cur_raw_key_, cur_value_); - return seekVersion(PrevTimestamp(ToDBTimestamp(meta_.timestamp())), false); + return seekVersion(prev_timestamp, false); } if (!own_intent) { @@ -318,7 +326,7 @@ template class mvccScanner { // transaction all together. We ignore the intent by insisting that the // timestamp we're reading at is a historical timestamp < the intent // timestamp. - return seekVersion(PrevTimestamp(ToDBTimestamp(meta_.timestamp())), false); + return seekVersion(prev_timestamp, false); } } @@ -336,7 +344,7 @@ template class mvccScanner { // restarted and an earlier iteration wrote the value we're now // reading. In this case, we ignore the intent and read the // previous value as if the transaction were starting fresh. - return seekVersion(PrevTimestamp(ToDBTimestamp(meta_.timestamp())), false); + return seekVersion(prev_timestamp, false); } // nextKey advances the iterator to point to the next MVCC key diff --git a/pkg/storage/engine/mvcc.go b/pkg/storage/engine/mvcc.go index b61432df4355..204156d55395 100644 --- a/pkg/storage/engine/mvcc.go +++ b/pkg/storage/engine/mvcc.go @@ -866,7 +866,15 @@ func mvccGetInternal( "failed to read with epoch %d due to a write intent with epoch %d", txn.Epoch, meta.Txn.Epoch) } - seekKey = seekKey.Next() + // Seek past the intent's timestamp and at least as far + // back as the read timestamp. This is necessary if the + // intent is at a higher timestamp than we're trying to + // read at. + if timestamp.Less(metaTimestamp) { + seekKey.Timestamp = timestamp + } else { + seekKey.Timestamp = metaTimestamp.Prev() + } } } else if txn != nil && timestamp.Less(txn.MaxTimestamp) { // In this branch, the latest timestamp is ahead, and so the read of an @@ -892,9 +900,9 @@ func mvccGetInternal( // transaction, or in the absence of future versions that clock uncertainty // would apply to. seekKey.Timestamp = timestamp - if seekKey.Timestamp == (hlc.Timestamp{}) { - return nil, ignoredIntent, safeValue, nil - } + } + if seekKey.Timestamp == (hlc.Timestamp{}) { + return nil, ignoredIntent, safeValue, nil } iter.Seek(seekKey) diff --git a/pkg/storage/engine/mvcc_test.go b/pkg/storage/engine/mvcc_test.go index 86fb3acb134f..7ee6a4d13a68 100644 --- a/pkg/storage/engine/mvcc_test.go +++ b/pkg/storage/engine/mvcc_test.go @@ -3527,6 +3527,82 @@ func TestMVCCGetWithDiffEpochs(t *testing.T) { } } +// TestMVCCGetWithDiffEpochsAndTimestamps writes a value first using +// epoch 1, then reads using epoch 2 with different timestamps to verify +// that values written during different transaction epochs are not visible. +// +// The test includes the case where the read at epoch 2 is at a *lower* +// timestamp than the intent write at epoch 1. This is not expected to +// happen commonly, but caused issues in #36089. +func TestMVCCGetWithDiffEpochsAndTimestamps(t *testing.T) { + defer leaktest.AfterTest(t)() + + for _, impl := range mvccGetImpls { + t.Run(impl.name, func(t *testing.T) { + mvccGet := impl.fn + + ctx := context.Background() + engine := createTestEngine() + defer engine.Close() + + // Write initial value without a txn at timestamp 1. + if err := MVCCPut(ctx, engine, nil, testKey1, hlc.Timestamp{WallTime: 1}, value1, nil); err != nil { + t.Fatal(err) + } + // Write another value without a txn at timestamp 3. + if err := MVCCPut(ctx, engine, nil, testKey1, hlc.Timestamp{WallTime: 3}, value2, nil); err != nil { + t.Fatal(err) + } + // Now write using txn1, epoch 1. + txn1ts := makeTxn(*txn1, hlc.Timestamp{WallTime: 1}) + // Bump epoch 1's write timestamp to timestamp 4. + txn1ts.Timestamp = hlc.Timestamp{WallTime: 4} + // Expected to hit WriteTooOld error but to still lay down intent. + err := MVCCPut(ctx, engine, nil, testKey1, txn1ts.OrigTimestamp, value3, txn1ts) + if wtoErr, ok := err.(*roachpb.WriteTooOldError); !ok { + t.Fatalf("unexpectedly not WriteTooOld: %s", err) + } else if expTS, actTS := txn1ts.Timestamp, wtoErr.ActualTimestamp; expTS != actTS { + t.Fatalf("expected write too old error with actual ts %s; got %s", expTS, actTS) + } + // Try reading using different epochs & timestamps. + testCases := []struct { + txn *roachpb.Transaction + readTS hlc.Timestamp + expValue *roachpb.Value + }{ + // Epoch 1, read 1; should see new value3. + {txn1, hlc.Timestamp{WallTime: 1}, &value3}, + // Epoch 1, read 2; should see new value3. + {txn1, hlc.Timestamp{WallTime: 2}, &value3}, + // Epoch 1, read 3; should see new value3. + {txn1, hlc.Timestamp{WallTime: 3}, &value3}, + // Epoch 1, read 4; should see new value3. + {txn1, hlc.Timestamp{WallTime: 4}, &value3}, + // Epoch 1, read 5; should see new value3. + {txn1, hlc.Timestamp{WallTime: 5}, &value3}, + // Epoch 2, read 1; should see committed value1. + {txn1e2, hlc.Timestamp{WallTime: 1}, &value1}, + // Epoch 2, read 2; should see committed value1. + {txn1e2, hlc.Timestamp{WallTime: 2}, &value1}, + // Epoch 2, read 3; should see committed value2. + {txn1e2, hlc.Timestamp{WallTime: 3}, &value2}, + // Epoch 2, read 4; should see committed value2. + {txn1e2, hlc.Timestamp{WallTime: 4}, &value2}, + // Epoch 2, read 5; should see committed value2. + {txn1e2, hlc.Timestamp{WallTime: 5}, &value2}, + } + for i, test := range testCases { + t.Run(strconv.Itoa(i), func(t *testing.T) { + value, _, err := mvccGet(ctx, engine, testKey1, test.readTS, MVCCGetOptions{Txn: test.txn}) + if err != nil || value == nil || !bytes.Equal(test.expValue.RawBytes, value.RawBytes) { + t.Errorf("test %d: expected value %q, err nil; got %+v, %v", i, test.expValue.RawBytes, value, err) + } + }) + } + }) + } +} + // TestMVCCGetWithOldEpoch writes a value first using epoch 2, then // reads using epoch 1 to verify that the read will fail. func TestMVCCGetWithOldEpoch(t *testing.T) {