diff --git a/compaction_iter.go b/compaction_iter.go index cadb9b03a2..522557bfeb 100644 --- a/compaction_iter.go +++ b/compaction_iter.go @@ -7,6 +7,7 @@ package pebble import ( "io" "sort" + "strconv" "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble/internal/base" @@ -140,6 +141,10 @@ type compactionIter struct { // determine when iteration has advanced to a new user key and thus a new // snapshot stripe. keyBuf []byte + // Temporary buffer used for storing the previous value, which may be an + // unsafe, i.iter-owned slice that could be altered when the iterator is + // advanced. + valueBuf []byte // Is the current entry valid? valid bool iterKey *InternalKey @@ -303,12 +308,8 @@ func (i *compactionIter) Next() (*InternalKey, []byte) { continue } - case InternalKeyKindSet: - i.saveKey() - i.value = i.iterValue - i.valid = true - i.skip = true - i.maybeZeroSeqnum(i.curSnapshotIdx) + case InternalKeyKindSet, InternalKeyKindSetWithDelete: + i.setNext() return &i.key, i.value case InternalKeyKindMerge: @@ -444,6 +445,52 @@ func (i *compactionIter) nextInStripe() stripeChangeType { return newStripe } +func (i *compactionIter) setNext() { + // Save the current key. + i.saveKey() + // iterValue is owned by i.iter and could change after the Next() call, so + // use valueBuf instead. Note that valueBuf is only used in this one + // instance; everywhere else, we just point i.value to the unsafe + // i.iter-owned value buffer. + i.valueBuf = append(i.valueBuf[:0], i.iterValue...) + i.value = i.valueBuf + i.valid = true + i.maybeZeroSeqnum(i.curSnapshotIdx) + + // If this key is a SETWITHDEL, we can emit it immediately. Records with the + // same key are safe to skip. + if i.iterKey.Kind() == InternalKeyKindSetWithDelete { + i.skip = true + return + } + + // Else, we continue to loop through records in the stripe. + for { + switch t := i.nextInStripe(); t { + case newStripe, sameStripeNonSkippable: + i.pos = iterPosNext + if t == sameStripeNonSkippable { + // This is a key we cannot skip. We can safely emit a SETWITHDEL + // here, though it is suboptimal due to ... FIXME + // TODO(travers): Optimize this after undertaking a larger + // refactoring to make this check simpler. + i.key.SetKind(InternalKeyKindSetWithDelete) + } + return + case sameStripeSkippable: + // We're still in the same stripe. If this is a DEL, we stop looking + // and emit a SETWITHDEL. Subsequent keys are eligible for skipping. + if i.iterKey.Kind() == InternalKeyKindDelete { + i.key.SetKind(InternalKeyKindSetWithDelete) + i.skip = true + return + } + default: + panic("pebble: unexpected stripeChangeType: " + strconv.Itoa(int(t))) + } + } +} + func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType { // Save the current key. i.saveKey() @@ -479,7 +526,7 @@ func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType { i.skip = true return sameStripeSkippable - case InternalKeyKindSet: + case InternalKeyKindSet, InternalKeyKindSetWithDelete: if i.rangeDelFrag.Deleted(*key, i.curSnapshotSeqNum) { // We change the kind of the result key to a Set so that it shadows // keys in lower levels. That is, MERGE+RANGEDEL -> SET. This isn't @@ -490,9 +537,10 @@ func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType { return sameStripeSkippable } - // We've hit a Set value. Merge with the existing value and return. We - // change the kind of the resulting key to a Set so that it shadows keys - // in lower levels. That is, MERGE+SET -> SET. + // We've hit a Set or SetWithDel value. Merge with the existing + // value and return. We change the kind of the resulting key to a + // Set so that it shadows keys in lower levels. That is: + // MERGE + (SET*) -> SET. i.err = valueMerger.MergeOlder(i.iterValue) if i.err != nil { i.valid = false @@ -544,8 +592,9 @@ func (i *compactionIter) singleDeleteNext() bool { key := i.iterKey switch key.Kind() { - case InternalKeyKindDelete, InternalKeyKindMerge: - // We've hit a Delete or Merge, transform the SingleDelete into a full Delete. + case InternalKeyKindDelete, InternalKeyKindMerge, InternalKeyKindSetWithDelete: + // We've hit a Delete, Merge or SetWithDelete, transform the + // SingleDelete into a full Delete. i.key.SetKind(InternalKeyKindDelete) i.skip = true return true diff --git a/error_test.go b/error_test.go index 165f5a83ca..15bf755468 100644 --- a/error_test.go +++ b/error_test.go @@ -194,7 +194,7 @@ func TestRequireReadError(t *testing.T) { require.NoError(t, d.Flush()) expectLSM(` 0.0: - 000007:[a1#4,SET-a2#72057594037927935,RANGEDEL] + 000007:[a1#4,SETWITHDEL-a2#72057594037927935,RANGEDEL] 6: 000005:[a1#1,SET-a2#2,SET] `, d, t) @@ -281,7 +281,7 @@ func TestCorruptReadError(t *testing.T) { require.NoError(t, d.Flush()) expectLSM(` 0.0: - 000007:[a1#4,SET-a2#72057594037927935,RANGEDEL] + 000007:[a1#4,SETWITHDEL-a2#72057594037927935,RANGEDEL] 6: 000005:[a1#1,SET-a2#2,SET] `, d, t) diff --git a/internal.go b/internal.go index 00e38b33b8..563785ee67 100644 --- a/internal.go +++ b/internal.go @@ -18,6 +18,7 @@ const ( InternalKeyKindSingleDelete = base.InternalKeyKindSingleDelete InternalKeyKindRangeDelete = base.InternalKeyKindRangeDelete InternalKeyKindMax = base.InternalKeyKindMax + InternalKeyKindSetWithDelete = base.InternalKeyKindSetWithDelete InternalKeyKindInvalid = base.InternalKeyKindInvalid InternalKeySeqNumBatch = base.InternalKeySeqNumBatch InternalKeySeqNumMax = base.InternalKeySeqNumMax diff --git a/internal/base/internal.go b/internal/base/internal.go index 3afd6f38b5..448e85d605 100644 --- a/internal/base/internal.go +++ b/internal/base/internal.go @@ -47,6 +47,11 @@ const ( // seqNum. InternalKeyKindMax InternalKeyKind = 17 + // InternalKeyKindSetWithDelete keys are SET keys that have met with a + // DELETE key in a prior compaction. This key kind is specific to Pebble. + // See https://github.com/cockroachdb/pebble/issues/1255. + InternalKeyKindSetWithDelete InternalKeyKind = 18 + // A marker for an invalid key. InternalKeyKindInvalid InternalKeyKind = 255 @@ -66,14 +71,15 @@ const ( ) var internalKeyKindNames = []string{ - InternalKeyKindDelete: "DEL", - InternalKeyKindSet: "SET", - InternalKeyKindMerge: "MERGE", - InternalKeyKindLogData: "LOGDATA", - InternalKeyKindSingleDelete: "SINGLEDEL", - InternalKeyKindRangeDelete: "RANGEDEL", - InternalKeyKindMax: "MAX", - InternalKeyKindInvalid: "INVALID", + InternalKeyKindDelete: "DEL", + InternalKeyKindSet: "SET", + InternalKeyKindMerge: "MERGE", + InternalKeyKindLogData: "LOGDATA", + InternalKeyKindSingleDelete: "SINGLEDEL", + InternalKeyKindRangeDelete: "RANGEDEL", + InternalKeyKindMax: "MAX", + InternalKeyKindSetWithDelete: "SETWITHDEL", + InternalKeyKindInvalid: "INVALID", } func (k InternalKeyKind) String() string { @@ -130,13 +136,14 @@ func MakeRangeDeleteSentinelKey(userKey []byte) InternalKey { } var kindsMap = map[string]InternalKeyKind{ - "DEL": InternalKeyKindDelete, - "SINGLEDEL": InternalKeyKindSingleDelete, - "RANGEDEL": InternalKeyKindRangeDelete, - "SET": InternalKeyKindSet, - "MERGE": InternalKeyKindMerge, - "INVALID": InternalKeyKindInvalid, - "MAX": InternalKeyKindMax, + "DEL": InternalKeyKindDelete, + "SINGLEDEL": InternalKeyKindSingleDelete, + "RANGEDEL": InternalKeyKindRangeDelete, + "SET": InternalKeyKindSet, + "MERGE": InternalKeyKindMerge, + "INVALID": InternalKeyKindInvalid, + "MAX": InternalKeyKindMax, + "SETWITHDEL": InternalKeyKindSetWithDelete, } // ParseInternalKey parses the string representation of an internal key. The diff --git a/iterator.go b/iterator.go index 6bfa85eb9c..4d7ebfb9b8 100644 --- a/iterator.go +++ b/iterator.go @@ -245,7 +245,7 @@ func (i *Iterator) findNextEntry(limit []byte) { i.nextUserKey() continue - case InternalKeyKindSet: + case InternalKeyKindSet, InternalKeyKindSetWithDelete: i.keyBuf = append(i.keyBuf[:0], key.UserKey...) i.key = i.keyBuf i.value = i.iterValue @@ -459,7 +459,7 @@ func (i *Iterator) findPrevEntry(limit []byte) { } continue - case InternalKeyKindSet: + case InternalKeyKindSet, InternalKeyKindSetWithDelete: i.keyBuf = append(i.keyBuf[:0], key.UserKey...) i.key = i.keyBuf // iterValue is owned by i.iter and could change after the Prev() @@ -570,7 +570,7 @@ func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) { // point. return - case InternalKeyKindSet: + case InternalKeyKindSet, InternalKeyKindSetWithDelete: // We've hit a Set value. Merge with the existing value and return. i.err = valueMerger.MergeOlder(i.iterValue) return diff --git a/level_checker.go b/level_checker.go index d779546e5e..dcf6010995 100644 --- a/level_checker.go +++ b/level_checker.go @@ -167,7 +167,7 @@ func (m *simpleMergingIter) step() bool { m.err = closer.Close() } m.valueMerger = nil - case InternalKeyKindSet: + case InternalKeyKindSet, InternalKeyKindSetWithDelete: m.err = m.valueMerger.MergeOlder(item.value) if m.err == nil { var closer io.Closer diff --git a/range_del_test.go b/range_del_test.go index dd406e9ffa..1afdcd0c56 100644 --- a/range_del_test.go +++ b/range_del_test.go @@ -264,9 +264,9 @@ func TestRangeDelCompactionTruncation(t *testing.T) { 1: 000012:[a#3,RANGEDEL-b#72057594037927935,RANGEDEL] 3: - 000017:[b#4,SET-b#4,SET] + 000017:[b#4,SETWITHDEL-b#4,SETWITHDEL] 000018:[b#3,RANGEDEL-c#72057594037927935,RANGEDEL] - 000019:[c#5,SET-d#72057594037927935,RANGEDEL] + 000019:[c#5,SETWITHDEL-d#72057594037927935,RANGEDEL] `) // The L1 table still contains a tombstone from [a,d) which will improperly diff --git a/testdata/compaction_iter b/testdata/compaction_iter index e7038e8cd1..3ecbc85927 100644 --- a/testdata/compaction_iter +++ b/testdata/compaction_iter @@ -169,7 +169,7 @@ iter first next ---- -a#9,1:b +a#9,18:b . iter snapshots=6 @@ -177,7 +177,7 @@ first next next ---- -a#9,1:b +a#9,18:b a#5,1:f . @@ -186,7 +186,7 @@ first next next ---- -a#9,1:b +a#9,18:b a#6,0: . @@ -195,8 +195,8 @@ first next next ---- -a#9,1:b -a#7,1:d +a#9,18:b +a#7,18:d . iter snapshots=9 @@ -212,7 +212,7 @@ iter snapshots=10 first next ---- -a#9,1:b +a#9,18:b . iter snapshots=(5,6,7,8,9) @@ -249,7 +249,7 @@ iter first next ---- -a#2,1:b +a#2,18:b err=invalid internal key kind: 255 define @@ -308,7 +308,7 @@ next next tombstones ---- -a#2,1:b +a#2,18:b a#1,15:c b#4,15:d . @@ -342,7 +342,7 @@ next next tombstones ---- -a#2,1:b +a#2,18:b a#1,15:c b#4,15:d b#2,1:e @@ -362,7 +362,7 @@ next next tombstones ---- -a#2,1:b +a#2,18:b a#1,15:c b#4,15:d b#2,1:e @@ -1164,3 +1164,173 @@ next a#3,15:c b#5,1:5[base] b#1,2:1 + +# SET that meets a DEL is transformed into a SETWITHDEL. + +define +a.SET.2:b +a.DEL.1: +---- + +iter +first +next +---- +a#2,18:b +. + +iter snapshots=2 +first +next +next +---- +a#2,1:b +a#1,0: +. + +define +a.SET.3:c +a.DEL.2: +a.SET.1:b +---- + +iter +first +next +---- +a#3,18:c +. + +iter snapshots=2 +first +next +next +---- +a#3,18:c +a#1,1:b +. + +define +a.SET.3:c +a.SET.2:b +a.DEL.1: +---- + +iter +first +next +---- +a#3,18:c +. + +iter snapshots=3 +first +next +next +---- +a#3,1:c +a#2,18:b +. + +iter snapshots=2 +first +next +next +---- +a#3,1:c +a#1,0: +. + +define +a.DEL.3: +a.SET.2:b +a.DEL.1: +---- + +iter +first +next +---- +a#3,0: +. + +iter snapshots=3 +first +next +next +---- +a#3,0: +a#2,18:b +. + +iter snapshots=2 +first +next +next +---- +a#3,0: +a#1,0: +. + +# SINGLEDEL that meets a SETWITHDEL is transformed into a DEL. + +define +a.SINGLEDEL.3: +a.SETWITHDEL.2:d +b.SET.1:c +---- + +iter +first +next +next +---- +a#3,0: +b#1,1:c +. + +iter snapshots=2 +first +next +next +---- +a#3,0: +b#1,1:c +. + +iter snapshots=3 +first +next +next +next +---- +a#3,7: +a#2,18:d +b#1,1:c +. + +define +a.SETWITHDEL.3:3 +a.SET.2:d +b.SET.1:c +---- + +iter +first +next +next +---- +a#3,18:3 +b#1,1:c +. + +iter snapshots=3 +first +next +next +next +---- +a#3,18:3 +a#2,1:d +b#1,1:c +. diff --git a/testdata/manual_compaction b/testdata/manual_compaction index 594c742d1a..0e75649cc3 100644 --- a/testdata/manual_compaction +++ b/testdata/manual_compaction @@ -92,7 +92,7 @@ range-deletions-bytes-estimate: 1552 compact a-e L1 ---- 2: - 000008:[a#3,SET-b#72057594037927935,RANGEDEL] + 000008:[a#3,SETWITHDEL-b#72057594037927935,RANGEDEL] 000009:[b#2,RANGEDEL-d#72057594037927935,RANGEDEL] 000010:[d#2,RANGEDEL-e#72057594037927935,RANGEDEL] 3: @@ -140,7 +140,7 @@ L3 compact a-e L1 ---- 2: - 000010:[a#3,SET-b#72057594037927935,RANGEDEL] + 000010:[a#3,SETWITHDEL-b#72057594037927935,RANGEDEL] 000011:[b#2,RANGEDEL-d#72057594037927935,RANGEDEL] 000012:[d#2,RANGEDEL-f#72057594037927935,RANGEDEL] 000013:[f#2,RANGEDEL-g#72057594037927935,RANGEDEL] @@ -181,7 +181,7 @@ L3 compact a-e L1 ---- 2: - 000009:[a#3,SET-g#72057594037927935,RANGEDEL] + 000009:[a#3,SETWITHDEL-g#72057594037927935,RANGEDEL] 000010:[h#3,SET-h#3,SET] 3: 000006:[a#0,SET-b#0,SET] @@ -390,7 +390,7 @@ compact a-e L1 0.0: 000004:[c#4,SET-c#4,SET] 2: - 000008:[a#3,SET-b#72057594037927935,RANGEDEL] + 000008:[a#3,SETWITHDEL-b#72057594037927935,RANGEDEL] 000009:[b#2,RANGEDEL-e#72057594037927935,RANGEDEL] 3: 000007:[b#1,SET-b#1,SET] diff --git a/testdata/singledel_manual_compaction b/testdata/singledel_manual_compaction index ff83e37648..067958023a 100644 --- a/testdata/singledel_manual_compaction +++ b/testdata/singledel_manual_compaction @@ -1,7 +1,6 @@ # This is not actually a manual compaction test, and simply uses manual -# compaction to demonstrate single delete semantics. Specifically, it -# demonstrates that the behavior can be non-deterministic if not used -# correctly. +# compaction to demonstrate single delete semantics with used with +# set-with-delete. # Define a sequence of SET=>SET=>DEL=>SET=>SINGLEDEL. define target-file-sizes=(1, 1, 1, 1, 1) @@ -39,7 +38,7 @@ compact a-b L2 1: 000004:[a#10,SINGLEDEL-a#10,SINGLEDEL] 3: - 000009:[a#9,SET-a#9,SET] + 000009:[a#9,SETWITHDEL-a#9,SETWITHDEL] 4: 000007:[a#7,SET-a#7,SET] 5: @@ -57,7 +56,7 @@ compact a-b L1 2: 000010:[a#10,SINGLEDEL-a#10,SINGLEDEL] 3: - 000009:[a#9,SET-a#9,SET] + 000009:[a#9,SETWITHDEL-a#9,SETWITHDEL] 4: 000007:[a#7,SET-a#7,SET] 5: @@ -65,13 +64,15 @@ compact a-b L1 compact a-b L2 ---- +3: + 000011:[a#10,DEL-a#10,DEL] 4: 000007:[a#7,SET-a#7,SET] 5: 000008:[a#6,SET-a#6,SET] -# Deleted data reappears. +# Deleted data is not resurrected. iter first ---- -a:v2 +. diff --git a/tool/find.go b/tool/find.go index 214f9a256f..8714a5b757 100644 --- a/tool/find.go +++ b/tool/find.go @@ -345,7 +345,8 @@ func (f *findT) searchLogs(searchKey []byte, refs []findRef) []findRef { case base.InternalKeyKindDelete, base.InternalKeyKindSet, base.InternalKeyKindMerge, - base.InternalKeyKindSingleDelete: + base.InternalKeyKindSingleDelete, + base.InternalKeyKindSetWithDelete: if cmp(searchKey, ikey.UserKey) != 0 { continue } diff --git a/tool/wal.go b/tool/wal.go index 28092eef42..d701174bba 100644 --- a/tool/wal.go +++ b/tool/wal.go @@ -138,6 +138,8 @@ func (w *walT) runDump(cmd *cobra.Command, args []string) { fmt.Fprintf(stdout, "<%d>", len(value)) case base.InternalKeyKindSingleDelete: fmt.Fprintf(stdout, "%s", w.fmtKey.fn(ukey)) + case base.InternalKeyKindSetWithDelete: + fmt.Fprintf(stdout, "%s", w.fmtKey.fn(ukey)) case base.InternalKeyKindRangeDelete: fmt.Fprintf(stdout, "%s,%s", w.fmtKey.fn(ukey), w.fmtKey.fn(value)) }