Merge #59028 #59127

59028: coldata: fix Bytes invariant in some cases r=yuzefovich a=yuzefovich **execgen: remove SLICE method** `execgen.SLICE` directive is used only in one place, and we can use `execgen.WINDOW` there instead (which will have the same effect). Release note: None **coldata: fix updating offsets of bytes in Batch.SetLength** In `SetLength` method we are maintaining the invariant of `Bytes` vectors that the offsets are non-decreasing sequences. Previously, this was done incorrectly when a selection vector is present on the batch which could lead to out of bounds errors (caught by our panic-catcher) some time later. This is now fixed by correctly paying attention to the selection vector. I neither can easily come up with an example query that would trigger this condition nor can I prove that it won't occur, but I think we have seen a single sentry report that could be explained by this bug, so I think it's worth backporting. Additionally, this commit uses the assumption that the selection vectors are increasing sequences in order to calculate the largest index accessed by the batch. Fixes: #57297. Release note (bug fix): Previously, CockroachDB could encounter an internal error when executing queries with BYTES or STRING types via the vectorized engine in rare circumstances, and now this is fixed. 59127: importccl: skip BenchmarkUserFileImport r=RaduBerinde a=RaduBerinde This benchmark causes repeated CI failures. Informs #59126. Release note: None Co-authored-by: Yahor Yuzefovich <yahor@cockroachlabs.com> Co-authored-by: Radu Berinde <radu@cockroachlabs.com>
cockroachdb · Jan 19, 2021 · 731d083 · 731d083
3 parents 6a51818 + 3589b52 + 1ecbc2f
commit 731d083
Show file tree

Hide file tree

Showing 11 changed files with 64 additions and 33 deletions.
diff --git a/pkg/ccl/importccl/import_stmt_test.go b/pkg/ccl/importccl/import_stmt_test.go
@@ -3511,6 +3511,7 @@ func BenchmarkNodelocalImport(b *testing.B) {
 // BenchmarkUserfileImport-16    	       1	4060204527 ns/op	   6.68 MB/s
 // BenchmarkUserfileImport-16    	       1	4627419761 ns/op	   5.86 MB/s
 func BenchmarkUserfileImport(b *testing.B) {
+	skip.WithIssue(b, 59126)
 	benchUserUpload(b, "userfile://defaultdb.public.root")
 }
 

diff --git a/pkg/col/coldata/batch.go b/pkg/col/coldata/batch.go
@@ -245,8 +245,17 @@ func (m *MemBatch) SetSelection(b bool) {
 func (m *MemBatch) SetLength(length int) {
 	m.length = length
 	if length > 0 {
+		// In order to maintain the invariant of Bytes vectors we need to update
+		// offsets up to the element with the largest index that can be accessed
+		// by the batch.
+		maxIdx := length - 1
+		if m.useSel {
+			// Note that here we rely on the fact that selection vectors are
+			// increasing sequences.
+			maxIdx = m.sel[length-1]
+		}
 		for i, ok := m.bytesVecIdxs.Next(0); ok; i, ok = m.bytesVecIdxs.Next(i + 1) {
-			m.b[i].Bytes().UpdateOffsetsToBeNonDecreasing(length)
+			m.b[i].Bytes().UpdateOffsetsToBeNonDecreasing(maxIdx + 1)
 		}
 	}
 }

diff --git a/pkg/col/coldata/batch_test.go b/pkg/col/coldata/batch_test.go
@@ -108,3 +108,37 @@ func TestBatchReset(t *testing.T) {
 	b = coldata.NewMemBatch(typsBytes, coldata.StandardColumnFactory)
 	resetAndCheck(b, typsBytes, 1, true)
 }
+
+// TestBatchWithBytesAndNulls verifies that the invariant of Bytes vectors is
+// maintained when NULL values are present in the vector and there is a
+// selection vector on the batch.
+func TestBatchWithBytesAndNulls(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+
+	b := coldata.NewMemBatch([]*types.T{types.Bytes}, coldata.StandardColumnFactory)
+	// We will insert some garbage data into the Bytes vector in positions that
+	// are not mentioned in the selection vector. All the values that are
+	// selected are actually NULL values, so we don't set anything on the Bytes
+	// vector there.
+	if coldata.BatchSize() < 6 {
+		return
+	}
+	sel := []int{1, 3, 5}
+	vec := b.ColVec(0).Bytes()
+	vec.Set(0, []byte("zero"))
+	vec.Set(2, []byte("two"))
+	b.SetSelection(true)
+	copy(b.Selection(), sel)
+
+	// This is where the invariant of non-decreasing offsets in the Bytes vector
+	// should be updated.
+	b.SetLength(len(sel))
+
+	// In many cases in the vectorized execution engine, for performance
+	// reasons, we will attempt to get something from the Bytes vector at
+	// positions on which we have NULLs, so all of the Gets below should be
+	// safe.
+	for _, idx := range sel {
+		assert.True(t, len(vec.Get(idx)) == 0)
+	}
+}
diff --git a/pkg/col/coldata/vec.eg.go b/pkg/col/coldata/vec.eg.go
diff --git a/pkg/col/coldata/vec.go b/pkg/col/coldata/vec.go
@@ -96,6 +96,10 @@ type Vec interface {
 	// An optional Sel slice can also be provided to apply a filter on the source
 	// Vec.
 	// Refer to the SliceArgs comment for specifics and TestAppend for examples.
+	//
+	// NOTE: Append does *not* support the case of appending 0 values (i.e.
+	// the behavior of Append when args.SrcStartIdx == args.SrcEndIdx is
+	// undefined).
 	Append(SliceArgs)
 
 	// Copy uses CopySliceArgs to copy elements of a source Vec into this Vec. It is

diff --git a/pkg/col/coldata/vec_tmpl.go b/pkg/col/coldata/vec_tmpl.go
@@ -75,15 +75,13 @@ func (m *memColumn) Append(args SliceArgs) {
 				// whether the value is NULL. It is possible that Bytes' invariant of
 				// non-decreasing offsets on the source is currently not maintained, so
 				// we explicitly enforce it.
-				maxIdx := 0
-				for _, selIdx := range sel {
-					if selIdx > maxIdx {
-						maxIdx = selIdx
-					}
-				}
-				fromCol.UpdateOffsetsToBeNonDecreasing(maxIdx + 1)
+				//
+				// Note that here we rely on the fact that selection vectors are
+				// increasing sequences.
+				fromCol.UpdateOffsetsToBeNonDecreasing(sel[len(sel)-1] + 1)
 				// {{else}}
-				toCol = execgen.SLICE(toCol, 0, args.DestIdx)
+				// {{/* Here WINDOW means slicing which allows us to use APPENDVAL below. */}}
+				toCol = execgen.WINDOW(toCol, 0, args.DestIdx)
 				// {{end}}
 				for _, selIdx := range sel {
 					val := fromCol.Get(selIdx)

diff --git a/pkg/sql/colexec/execgen/cmd/execgen/data_manipulation_gen.go b/pkg/sql/colexec/execgen/cmd/execgen/data_manipulation_gen.go
@@ -34,11 +34,6 @@ var dataManipulationReplacementInfos = []dataManipulationReplacementInfo{
 		numArgs:             3,
 		replaceWith:         "Set",
 	},
-	{
-		templatePlaceholder: "execgen.SLICE",
-		numArgs:             3,
-		replaceWith:         "Slice",
-	},
 	{
 		templatePlaceholder: "execgen.COPYSLICE",
 		numArgs:             5,

diff --git a/pkg/sql/colexec/execgen/cmd/execgen/overloads_base.go b/pkg/sql/colexec/execgen/cmd/execgen/overloads_base.go
@@ -481,12 +481,12 @@ func (b *argWidthOverloadBase) Set(target, i, new string) string {
 	return set(b.CanonicalTypeFamily, target, i, new)
 }
 
-// Slice is a function that should only be used in templates.
-func (b *argWidthOverloadBase) Slice(target, start, end string) string {
+// slice is a function that should only be used in templates.
+func (b *argWidthOverloadBase) slice(target, start, end string) string {
 	switch b.CanonicalTypeFamily {
 	case types.BytesFamily:
 		// Bytes vector doesn't support slicing.
-		colexecerror.InternalError(errors.AssertionFailedf("Slice method is attempted to be generated on Bytes vector"))
+		colexecerror.InternalError(errors.AssertionFailedf("slice method is attempted to be generated on Bytes vector"))
 	case typeconv.DatumVecCanonicalTypeFamily:
 		return fmt.Sprintf(`%s.Slice(%s, %s)`, target, start, end)
 	}
@@ -608,7 +608,7 @@ func (b *argWidthOverloadBase) Window(target, start, end string) string {
 	case types.BytesFamily:
 		return fmt.Sprintf(`%s.Window(%s, %s)`, target, start, end)
 	}
-	return b.Slice(target, start, end)
+	return b.slice(target, start, end)
 }
 
 // setVariableSize is a function that should only be used in templates. It
@@ -651,7 +651,6 @@ var (
 	_    = awob.GoTypeSliceName
 	_    = awob.CopyVal
 	_    = awob.Set
-	_    = awob.Slice
 	_    = awob.Sliceable
 	_    = awob.CopySlice
 	_    = awob.AppendSlice

diff --git a/pkg/sql/colexec/execgen/placeholders.go b/pkg/sql/colexec/execgen/placeholders.go
@@ -21,7 +21,6 @@ const nonTemplatePanic = "do not call from non-template code"
 var (
 	_ = COPYVAL
 	_ = SET
-	_ = SLICE
 	_ = COPYSLICE
 	_ = APPENDSLICE
 	_ = APPENDVAL
@@ -44,12 +43,6 @@ func SET(target, i, new interface{}) {
 	colexecerror.InternalError(errors.AssertionFailedf(nonTemplatePanic))
 }
 
-// SLICE is a template function.
-func SLICE(target, start, end interface{}) interface{} {
-	colexecerror.InternalError(errors.AssertionFailedf(nonTemplatePanic))
-	return nil
-}
-
 // COPYSLICE is a template function.
 func COPYSLICE(target, src, destIdx, srcStartIdx, srcEndIdx interface{}) {
 	colexecerror.InternalError(errors.AssertionFailedf(nonTemplatePanic))

diff --git a/pkg/sql/colexec/hashtable.go b/pkg/sql/colexec/hashtable.go
@@ -477,6 +477,7 @@ func (ht *hashTable) removeDuplicates(
 // appendAllDistinct appends all tuples from batch to the hash table. It
 // assumes that all tuples are distinct and that ht.probeScratch.hashBuffer
 // contains the hash codes for all of them.
+// NOTE: batch must be of non-zero length.
 func (ht *hashTable) appendAllDistinct(ctx context.Context, batch coldata.Batch) {
 	numBuffered := uint64(ht.vals.Length())
 	ht.allocator.PerformOperation(ht.vals.ColVecs(), func() {

diff --git a/pkg/sql/colexec/utils.go b/pkg/sql/colexec/utils.go
@@ -206,6 +206,7 @@ func (b *appendOnlyBufferedBatch) ReplaceCol(coldata.Vec, int) {
 // [startIdx, endIdx) from batch (paying attention to the selection vector)
 // into b.
 // NOTE: this does *not* perform memory accounting.
+// NOTE: batch must be of non-zero length.
 func (b *appendOnlyBufferedBatch) append(batch coldata.Batch, startIdx, endIdx int) {
 	for _, colIdx := range b.colsToStore {
 		b.colVecs[colIdx].Append(