From 9726a61670671bc70fa917202168acd892159f8d Mon Sep 17 00:00:00 2001
From: Yahor Yuzefovich <yahor@cockroachlabs.com>
Date: Thu, 14 Jan 2021 18:03:39 -0800
Subject: [PATCH 1/2] execgen: remove SLICE method

`execgen.SLICE` directive is used only in one place, and we can use
`execgen.WINDOW` there instead (which will have the same effect).

Release note: None
---
 pkg/col/coldata/vec_tmpl.go                              | 3 ++-
 .../colexec/execgen/cmd/execgen/data_manipulation_gen.go | 5 -----
 pkg/sql/colexec/execgen/cmd/execgen/overloads_base.go    | 9 ++++-----
 pkg/sql/colexec/execgen/placeholders.go                  | 7 -------
 4 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/pkg/col/coldata/vec_tmpl.go b/pkg/col/coldata/vec_tmpl.go
index 1005760b6f4c..1d282b3ac27d 100644
--- a/pkg/col/coldata/vec_tmpl.go
+++ b/pkg/col/coldata/vec_tmpl.go
@@ -83,7 +83,8 @@ func (m *memColumn) Append(args SliceArgs) {
 				}
 				fromCol.UpdateOffsetsToBeNonDecreasing(maxIdx + 1)
 				// {{else}}
-				toCol = execgen.SLICE(toCol, 0, args.DestIdx)
+				// {{/* Here WINDOW means slicing which allows us to use APPENDVAL below. */}}
+				toCol = execgen.WINDOW(toCol, 0, args.DestIdx)
 				// {{end}}
 				for _, selIdx := range sel {
 					val := fromCol.Get(selIdx)
diff --git a/pkg/sql/colexec/execgen/cmd/execgen/data_manipulation_gen.go b/pkg/sql/colexec/execgen/cmd/execgen/data_manipulation_gen.go
index 85972971cd18..9de777d4481f 100644
--- a/pkg/sql/colexec/execgen/cmd/execgen/data_manipulation_gen.go
+++ b/pkg/sql/colexec/execgen/cmd/execgen/data_manipulation_gen.go
@@ -34,11 +34,6 @@ var dataManipulationReplacementInfos = []dataManipulationReplacementInfo{
 		numArgs:             3,
 		replaceWith:         "Set",
 	},
-	{
-		templatePlaceholder: "execgen.SLICE",
-		numArgs:             3,
-		replaceWith:         "Slice",
-	},
 	{
 		templatePlaceholder: "execgen.COPYSLICE",
 		numArgs:             5,
diff --git a/pkg/sql/colexec/execgen/cmd/execgen/overloads_base.go b/pkg/sql/colexec/execgen/cmd/execgen/overloads_base.go
index a0e48abda7d8..13660df36012 100644
--- a/pkg/sql/colexec/execgen/cmd/execgen/overloads_base.go
+++ b/pkg/sql/colexec/execgen/cmd/execgen/overloads_base.go
@@ -481,12 +481,12 @@ func (b *argWidthOverloadBase) Set(target, i, new string) string {
 	return set(b.CanonicalTypeFamily, target, i, new)
 }
 
-// Slice is a function that should only be used in templates.
-func (b *argWidthOverloadBase) Slice(target, start, end string) string {
+// slice is a function that should only be used in templates.
+func (b *argWidthOverloadBase) slice(target, start, end string) string {
 	switch b.CanonicalTypeFamily {
 	case types.BytesFamily:
 		// Bytes vector doesn't support slicing.
-		colexecerror.InternalError(errors.AssertionFailedf("Slice method is attempted to be generated on Bytes vector"))
+		colexecerror.InternalError(errors.AssertionFailedf("slice method is attempted to be generated on Bytes vector"))
 	case typeconv.DatumVecCanonicalTypeFamily:
 		return fmt.Sprintf(`%s.Slice(%s, %s)`, target, start, end)
 	}
@@ -608,7 +608,7 @@ func (b *argWidthOverloadBase) Window(target, start, end string) string {
 	case types.BytesFamily:
 		return fmt.Sprintf(`%s.Window(%s, %s)`, target, start, end)
 	}
-	return b.Slice(target, start, end)
+	return b.slice(target, start, end)
 }
 
 // setVariableSize is a function that should only be used in templates. It
@@ -651,7 +651,6 @@ var (
 	_    = awob.GoTypeSliceName
 	_    = awob.CopyVal
 	_    = awob.Set
-	_    = awob.Slice
 	_    = awob.Sliceable
 	_    = awob.CopySlice
 	_    = awob.AppendSlice
diff --git a/pkg/sql/colexec/execgen/placeholders.go b/pkg/sql/colexec/execgen/placeholders.go
index af8602947d45..b475e30f6970 100644
--- a/pkg/sql/colexec/execgen/placeholders.go
+++ b/pkg/sql/colexec/execgen/placeholders.go
@@ -21,7 +21,6 @@ const nonTemplatePanic = "do not call from non-template code"
 var (
 	_ = COPYVAL
 	_ = SET
-	_ = SLICE
 	_ = COPYSLICE
 	_ = APPENDSLICE
 	_ = APPENDVAL
@@ -44,12 +43,6 @@ func SET(target, i, new interface{}) {
 	colexecerror.InternalError(errors.AssertionFailedf(nonTemplatePanic))
 }
 
-// SLICE is a template function.
-func SLICE(target, start, end interface{}) interface{} {
-	colexecerror.InternalError(errors.AssertionFailedf(nonTemplatePanic))
-	return nil
-}
-
 // COPYSLICE is a template function.
 func COPYSLICE(target, src, destIdx, srcStartIdx, srcEndIdx interface{}) {
 	colexecerror.InternalError(errors.AssertionFailedf(nonTemplatePanic))

From 3589b52ff7c282cf19bbb069aba24099491e0cb1 Mon Sep 17 00:00:00 2001
From: Yahor Yuzefovich <yahor@cockroachlabs.com>
Date: Thu, 14 Jan 2021 18:34:46 -0800
Subject: [PATCH 2/2] coldata: fix updating offsets of bytes in Batch.SetLength

In `SetLength` method we are maintaining the invariant of `Bytes`
vectors that the offsets are non-decreasing sequences. Previously, this
was done incorrectly when a selection vector is present on the batch
which could lead to out of bounds errors (caught by our panic-catcher)
some time later. This is now fixed by correctly paying attention to the
selection vector.

I neither can easily come up with an example query that would trigger
this condition nor can I prove that it won't occur, but I think we have
seen a single sentry report that could be explained by this bug, so I
think it's worth backporting.

Additionally, this commit uses the assumption that the selection vectors
are increasing sequences in order to calculate the largest index
accessed by the batch.

Release note (bug fix): Previously, CockroachDB could encounter an
internal error when executing queries with BYTES or STRING types via the
vectorized engine in rare circumstances, and now this is fixed.
---
 pkg/col/coldata/batch.go      | 11 ++++++++++-
 pkg/col/coldata/batch_test.go | 34 ++++++++++++++++++++++++++++++++++
 pkg/col/coldata/vec.eg.go     | 10 +++-------
 pkg/col/coldata/vec.go        |  4 ++++
 pkg/col/coldata/vec_tmpl.go   | 11 ++++-------
 pkg/sql/colexec/hashtable.go  |  1 +
 pkg/sql/colexec/utils.go      |  1 +
 7 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/pkg/col/coldata/batch.go b/pkg/col/coldata/batch.go
index 0a8d44408c8a..0ad0fc9ff2c6 100644
--- a/pkg/col/coldata/batch.go
+++ b/pkg/col/coldata/batch.go
@@ -245,8 +245,17 @@ func (m *MemBatch) SetSelection(b bool) {
 func (m *MemBatch) SetLength(length int) {
 	m.length = length
 	if length > 0 {
+		// In order to maintain the invariant of Bytes vectors we need to update
+		// offsets up to the element with the largest index that can be accessed
+		// by the batch.
+		maxIdx := length - 1
+		if m.useSel {
+			// Note that here we rely on the fact that selection vectors are
+			// increasing sequences.
+			maxIdx = m.sel[length-1]
+		}
 		for i, ok := m.bytesVecIdxs.Next(0); ok; i, ok = m.bytesVecIdxs.Next(i + 1) {
-			m.b[i].Bytes().UpdateOffsetsToBeNonDecreasing(length)
+			m.b[i].Bytes().UpdateOffsetsToBeNonDecreasing(maxIdx + 1)
 		}
 	}
 }
diff --git a/pkg/col/coldata/batch_test.go b/pkg/col/coldata/batch_test.go
index 131b8fdaab04..c5dddcbe3e7a 100644
--- a/pkg/col/coldata/batch_test.go
+++ b/pkg/col/coldata/batch_test.go
@@ -108,3 +108,37 @@ func TestBatchReset(t *testing.T) {
 	b = coldata.NewMemBatch(typsBytes, coldata.StandardColumnFactory)
 	resetAndCheck(b, typsBytes, 1, true)
 }
+
+// TestBatchWithBytesAndNulls verifies that the invariant of Bytes vectors is
+// maintained when NULL values are present in the vector and there is a
+// selection vector on the batch.
+func TestBatchWithBytesAndNulls(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+
+	b := coldata.NewMemBatch([]*types.T{types.Bytes}, coldata.StandardColumnFactory)
+	// We will insert some garbage data into the Bytes vector in positions that
+	// are not mentioned in the selection vector. All the values that are
+	// selected are actually NULL values, so we don't set anything on the Bytes
+	// vector there.
+	if coldata.BatchSize() < 6 {
+		return
+	}
+	sel := []int{1, 3, 5}
+	vec := b.ColVec(0).Bytes()
+	vec.Set(0, []byte("zero"))
+	vec.Set(2, []byte("two"))
+	b.SetSelection(true)
+	copy(b.Selection(), sel)
+
+	// This is where the invariant of non-decreasing offsets in the Bytes vector
+	// should be updated.
+	b.SetLength(len(sel))
+
+	// In many cases in the vectorized execution engine, for performance
+	// reasons, we will attempt to get something from the Bytes vector at
+	// positions on which we have NULLs, so all of the Gets below should be
+	// safe.
+	for _, idx := range sel {
+		assert.True(t, len(vec.Get(idx)) == 0)
+	}
+}
diff --git a/pkg/col/coldata/vec.eg.go b/pkg/col/coldata/vec.eg.go
index 748f616e4996..3e054e4fde26 100644
--- a/pkg/col/coldata/vec.eg.go
+++ b/pkg/col/coldata/vec.eg.go
@@ -73,13 +73,9 @@ func (m *memColumn) Append(args SliceArgs) {
 				// whether the value is NULL. It is possible that Bytes' invariant of
 				// non-decreasing offsets on the source is currently not maintained, so
 				// we explicitly enforce it.
-				maxIdx := 0
-				for _, selIdx := range sel {
-					if selIdx > maxIdx {
-						maxIdx = selIdx
-					}
-				}
-				fromCol.UpdateOffsetsToBeNonDecreasing(maxIdx + 1)
+				// Note that here we rely on the fact that selection vectors are
+				// increasing sequences.
+				fromCol.UpdateOffsetsToBeNonDecreasing(sel[len(sel)-1] + 1)
 				for _, selIdx := range sel {
 					val := fromCol.Get(selIdx)
 					toCol.AppendVal(val)
diff --git a/pkg/col/coldata/vec.go b/pkg/col/coldata/vec.go
index 46695efd1f2a..db6752946542 100644
--- a/pkg/col/coldata/vec.go
+++ b/pkg/col/coldata/vec.go
@@ -96,6 +96,10 @@ type Vec interface {
 	// An optional Sel slice can also be provided to apply a filter on the source
 	// Vec.
 	// Refer to the SliceArgs comment for specifics and TestAppend for examples.
+	//
+	// NOTE: Append does *not* support the case of appending 0 values (i.e.
+	// the behavior of Append when args.SrcStartIdx == args.SrcEndIdx is
+	// undefined).
 	Append(SliceArgs)
 
 	// Copy uses CopySliceArgs to copy elements of a source Vec into this Vec. It is
diff --git a/pkg/col/coldata/vec_tmpl.go b/pkg/col/coldata/vec_tmpl.go
index 1d282b3ac27d..8cd6685943b6 100644
--- a/pkg/col/coldata/vec_tmpl.go
+++ b/pkg/col/coldata/vec_tmpl.go
@@ -75,13 +75,10 @@ func (m *memColumn) Append(args SliceArgs) {
 				// whether the value is NULL. It is possible that Bytes' invariant of
 				// non-decreasing offsets on the source is currently not maintained, so
 				// we explicitly enforce it.
-				maxIdx := 0
-				for _, selIdx := range sel {
-					if selIdx > maxIdx {
-						maxIdx = selIdx
-					}
-				}
-				fromCol.UpdateOffsetsToBeNonDecreasing(maxIdx + 1)
+				//
+				// Note that here we rely on the fact that selection vectors are
+				// increasing sequences.
+				fromCol.UpdateOffsetsToBeNonDecreasing(sel[len(sel)-1] + 1)
 				// {{else}}
 				// {{/* Here WINDOW means slicing which allows us to use APPENDVAL below. */}}
 				toCol = execgen.WINDOW(toCol, 0, args.DestIdx)
diff --git a/pkg/sql/colexec/hashtable.go b/pkg/sql/colexec/hashtable.go
index 3a8b410712e8..3da12e667a95 100644
--- a/pkg/sql/colexec/hashtable.go
+++ b/pkg/sql/colexec/hashtable.go
@@ -477,6 +477,7 @@ func (ht *hashTable) removeDuplicates(
 // appendAllDistinct appends all tuples from batch to the hash table. It
 // assumes that all tuples are distinct and that ht.probeScratch.hashBuffer
 // contains the hash codes for all of them.
+// NOTE: batch must be of non-zero length.
 func (ht *hashTable) appendAllDistinct(ctx context.Context, batch coldata.Batch) {
 	numBuffered := uint64(ht.vals.Length())
 	ht.allocator.PerformOperation(ht.vals.ColVecs(), func() {
diff --git a/pkg/sql/colexec/utils.go b/pkg/sql/colexec/utils.go
index ed5379b31d56..2118a3d2f4cf 100644
--- a/pkg/sql/colexec/utils.go
+++ b/pkg/sql/colexec/utils.go
@@ -206,6 +206,7 @@ func (b *appendOnlyBufferedBatch) ReplaceCol(coldata.Vec, int) {
 // [startIdx, endIdx) from batch (paying attention to the selection vector)
 // into b.
 // NOTE: this does *not* perform memory accounting.
+// NOTE: batch must be of non-zero length.
 func (b *appendOnlyBufferedBatch) append(batch coldata.Batch, startIdx, endIdx int) {
 	for _, colIdx := range b.colsToStore {
 		b.colVecs[colIdx].Append(