From 2ee173932d3c3f1164a7607f5b36cc50f027328d Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Sat, 25 Apr 2020 17:46:39 -0400 Subject: [PATCH] sql: enable indexing and ordering on arrays of orderable and indexable types Fixes #17154. Fixes #35707. This PR enables arrays to be ordered and indexed by introducing an ordered key encoding for arrays. Once this exists, the rest of the SQL infrastructure is ready to handle indexing and ordering on arrays. To encode an array of elements `ARRAY[a, b]`, we create the following encoding. Let `AM` = a marker byte for arrays and let `AT` be a terminator byte. `enc(ARRAY[a, b]) = [AM, enc(a), enc(b), AT]` The key is that the terminator is less than the element marker. This allows for the "prefix matching" style comparison that arrays support. Release note (sql change): This PR adds support for indexing and ordering of arrays of indexable and orderable inner types. --- pkg/sql/flowinfra/stream_encoder.go | 2 +- .../logictest/testdata/logic_test/alter_table | 19 -- pkg/sql/logictest/testdata/logic_test/array | 273 +++++++++++++++--- .../logictest/testdata/logic_test/order_by | 12 - .../exec/execbuilder/testdata/inverted_index | 14 + pkg/sql/opt/optbuilder/orderby.go | 6 +- pkg/sql/opt/optbuilder/testdata/orderby | 66 ++++- pkg/sql/rowcontainer/disk_row_container.go | 4 +- pkg/sql/sem/tree/datum.go | 10 + pkg/sql/sqlbase/column_type_encoding.go | 66 +++++ pkg/sql/sqlbase/column_type_encoding_test.go | 144 ++++++--- pkg/sql/sqlbase/encoded_datum_test.go | 4 +- pkg/sql/sqlbase/index_encoding.go | 9 +- pkg/sql/sqlbase/structured.go | 32 +- pkg/sql/sqlbase/testutils.go | 43 ++- pkg/util/encoding/encoding.go | 169 ++++++++++- pkg/util/encoding/type_string.go | 6 +- 17 files changed, 726 insertions(+), 153 deletions(-) diff --git a/pkg/sql/flowinfra/stream_encoder.go b/pkg/sql/flowinfra/stream_encoder.go index 31c764793a06..b2a113f13780 100644 --- a/pkg/sql/flowinfra/stream_encoder.go +++ b/pkg/sql/flowinfra/stream_encoder.go @@ -105,7 
+105,7 @@ func (se *StreamEncoder) AddRow(row sqlbase.EncDatumRow) error { if !ok { enc = PreferredEncoding } - sType := se.infos[i].Type.Family() + sType := &se.infos[i].Type if enc != sqlbase.DatumEncoding_VALUE && (sqlbase.HasCompositeKeyEncoding(sType) || sqlbase.MustBeValueEncoded(sType)) { // Force VALUE encoding for composite types (key encodings may lose data). diff --git a/pkg/sql/logictest/testdata/logic_test/alter_table b/pkg/sql/logictest/testdata/logic_test/alter_table index ea58e856812f..0b6ad7ee777b 100644 --- a/pkg/sql/logictest/testdata/logic_test/alter_table +++ b/pkg/sql/logictest/testdata/logic_test/alter_table @@ -801,25 +801,6 @@ decomputed_column CREATE TABLE decomputed_column ( statement ok CREATE TABLE b26483() -statement error unimplemented: column c is of type int\[\] and thus is not indexable -ALTER TABLE b26483 ADD COLUMN c INT[] UNIQUE - -# As above, but performed in a transaction -statement ok -BEGIN - -statement ok -CREATE TABLE b26483_tx() - -statement ok -ALTER TABLE b26483_tx ADD COLUMN c INT[] - -statement error unimplemented: column c is of type int\[\] and thus is not indexable -CREATE INDEX on b26483_tx (c) - -statement ok -ROLLBACK - # Verify that auditing can be enabled by root, and cannot be disabled by non-root. statement ok diff --git a/pkg/sql/logictest/testdata/logic_test/array b/pkg/sql/logictest/testdata/logic_test/array index faa361a25188..3c7b3b3e60b7 100644 --- a/pkg/sql/logictest/testdata/logic_test/array +++ b/pkg/sql/logictest/testdata/logic_test/array @@ -429,17 +429,6 @@ SELECT ARRAY[ARRAY[1,2,3]] query error VECTOR column types are unsupported CREATE TABLE badtable (b INT2VECTOR) -# Using an array as a primary key should be disallowed. #17154 - -statement error column b is of type int\[\] and thus is not indexable -CREATE TABLE badtable (b INT[] PRIMARY KEY) - -# Indexing an array column should be disallowed. 
#17154 - -statement error column b is of type int\[\] and thus is not indexable -CREATE TABLE a (b INT[] UNIQUE) - - # Regression test for #18745 statement ok @@ -449,18 +438,6 @@ query T SELECT ARRAY[ROW()] FROM ident ---- -statement error column b is of type int\[\] and thus is not indexable -CREATE TABLE a ( - b INT[], - CONSTRAINT c UNIQUE (b) -) - -statement error column b is of type int\[\] and thus is not indexable -CREATE TABLE a ( - b INT[], - INDEX c (b) -) - statement ok CREATE TABLE a (b INT ARRAY) @@ -475,18 +452,6 @@ a CREATE TABLE a ( statement ok DROP TABLE a -statement ok -CREATE TABLE a (b INT[], c INT[]) - -statement error column b is of type int\[\] and thus is not indexable -CREATE INDEX idx ON a (b) - -statement error the following columns are not indexable due to their type: b \(type int\[\]\), c \(type int\[\]\) -CREATE INDEX idx ON a (b, c) - -statement ok -DROP TABLE a - # Int array columns. statement ok @@ -1343,3 +1308,241 @@ SELECT x, y FROM t WHERE x < y query TT SELECT x, y FROM t WHERE x > y ---- + +query TT +SELECT x, y FROM t ORDER BY (x, y) +---- +{1} {1,2} +{1,1,1,1} {2} + +subtest array_indexes + +# Create indexes on arrays. +statement ok +DROP TABLE IF EXISTS t; +CREATE TABLE t (x INT[] PRIMARY KEY) + +statement ok +INSERT INTO t VALUES + (ARRAY[1]), + (ARRAY[5]), + (ARRAY[4]), + (ARRAY[1,4,5]), + (ARRAY[1,4,6]), + (ARRAY[1,NULL,10]), + (ARRAY[NULL]), + (ARRAY[NULL, NULL, NULL]) + +# Test that the unique index rejects bad inserts. +statement error pq: duplicate key value \(x\)=\(ARRAY\[1,NULL,10\]\) violates unique constraint "primary" +INSERT INTO t VALUES (ARRAY[1, NULL, 10]) + +query T +SELECT x FROM t ORDER BY x +---- +{NULL} +{NULL,NULL,NULL} +{1} +{1,NULL,10} +{1,4,5} +{1,4,6} +{4} +{5} + +# Use the index for point lookups. +query T +SELECT x FROM t WHERE x = ARRAY[1,4,6] +---- +{1,4,6} + +# Use the index for bounded scans. 
+# Note that nulls sort first in CockroachDB, so this ordering is different +# than what postgres will output. In postgres, NULLs in arrays are treated +# as larger than other elements, while we treat them as less. +# TODO (rohany): We have always done this for array comparisons, so I think +# it would be a breaking change + opposite with our other null behavior to +# change it suddenly... +query T +SELECT x FROM t WHERE x < ARRAY[1, 4, 3] ORDER BY x +---- +{NULL} +{NULL,NULL,NULL} +{1} +{1,NULL,10} + +query T +SELECT x FROM t WHERE x > ARRAY [1, NULL] ORDER BY x DESC +---- +{5} +{4} +{1,4,6} +{1,4,5} +{1,NULL,10} + +query T +SELECT x FROM t WHERE x > ARRAY[1, 3] AND x < ARRAY[1, 4, 10] ORDER BY x +---- +{1,4,5} +{1,4,6} + +query T +SELECT x FROM t WHERE x > ARRAY[NULL, NULL]:::INT[] ORDER BY x +---- +{NULL,NULL,NULL} +{1} +{1,NULL,10} +{1,4,5} +{1,4,6} +{4} +{5} + +# Test some operations on a descending index. +statement ok +CREATE INDEX i ON t(x DESC) + +query T +SELECT x FROM t@i WHERE x <= ARRAY[1] ORDER BY x DESC +---- +{1} +{NULL,NULL,NULL} +{NULL} + +query T +SELECT x FROM t@i WHERE x > ARRAY[1] ORDER BY x +---- +{1,NULL,10} +{1,4,5} +{1,4,6} +{4} +{5} + +# Ensure that we can order by the arrays without any indexes. +statement ok +DROP TABLE t; +CREATE TABLE t (x INT[]); +INSERT INTO t VALUES + (ARRAY[1]), + (ARRAY[5]), + (ARRAY[4]), + (ARRAY[1,4,5]), + (ARRAY[1,4,6]), + (ARRAY[1,NULL,10]), + (ARRAY[NULL]), + (ARRAY[NULL, NULL, NULL]) + +query T +SELECT x FROM t ORDER BY x +---- +{NULL} +{NULL,NULL,NULL} +{1} +{1,NULL,10} +{1,4,5} +{1,4,6} +{4} +{5} + +query T +SELECT x FROM t ORDER BY x DESC +---- +{5} +{4} +{1,4,6} +{1,4,5} +{1,NULL,10} +{1} +{NULL,NULL,NULL} +{NULL} + +statement ok +CREATE INDEX i ON t (x); +INSERT INTO t VALUES (NULL), (NULL) + +# Test that NULLs are differentiated from {NULL}.
+query T +SELECT x FROM t@i WHERE x IS NOT NULL ORDER BY x +---- +{NULL} +{NULL,NULL,NULL} +{1} +{1,NULL,10} +{1,4,5} +{1,4,6} +{4} +{5} + +# Create an index on a bad type. +statement error pq: unimplemented: column x is of type geography\[\] and thus is not indexable +CREATE TABLE tbad (x GEOGRAPHY[] PRIMARY KEY) + +# Test arrays of composite types. +statement ok +CREATE TABLE tarray(x DECIMAL[] PRIMARY KEY); +INSERT INTO tarray VALUES (ARRAY[1.00]), (ARRAY[1.501]) + +# Ensure these are round tripped correctly. +query T +SELECT x FROM tarray ORDER BY x +---- +{1.00} +{1.501} + +# Test indexes on multiple columns with arrays. +statement ok +DROP TABLE t; +CREATE TABLE t (x INT, y INT[], z INT, INDEX i (x, y, z)); +INSERT INTO t VALUES + (1, ARRAY[1, 2, 3], 3), + (NULL, ARRAY[1, NULL, 3], NULL), + (2, ARRAY[NULL, NULL, NULL], NULL), + (NULL, ARRAY[NULL, NULL], 3), + (2, ARRAY[4, 5], 7) + +query ITI +SELECT x, y, z FROM t WHERE x IS NOT NULL AND y > ARRAY[1] ORDER BY z +---- + 1 {1,2,3} 3 + 2 {4,5} 7 + +query ITI +SELECT x, y, z FROM t WHERE x = 2 AND y < ARRAY[10] ORDER BY y +---- +2 {NULL,NULL,NULL} NULL +2 {4,5} 7 + +# Test that interleaving an array index doesn't lead to problems. +statement ok +DROP TABLE IF EXISTS parent, child CASCADE; +CREATE TABLE parent (x INT, y INT[], PRIMARY KEY (x, y DESC)); +CREATE TABLE child (x INT, y INT[], z INT[], PRIMARY KEY (x, y DESC, z)) INTERLEAVE IN PARENT parent (x, y); +INSERT INTO parent VALUES + (1, ARRAY[1, 2, 3]), + (1, ARRAY[1, NULL]), + (2, ARRAY[NULL]), + (3, ARRAY[NULL, 1, NULL]); +INSERT INTO child VALUES + (1, ARRAY[1, 2, 3], ARRAY[4]), + (1, ARRAY[1, 2, 3, 4], ARRAY[5]), + (1, ARRAY[1, NULL], ARRAY[5]), + (1, ARRAY[1, NULL, NULL], ARRAY[10]), + (2, ARRAY[NULL], ARRAY[1]), + (3, ARRAY[NULL, 1, NULL], ARRAY[3]); + +# Ensure scans on the parent and child aren't affected.
+query IT +SELECT x, y FROM parent ORDER BY x, y DESC +---- +1 {1,2,3} +1 {1,NULL} +2 {NULL} +3 {NULL,1,NULL} + +query ITT +SELECT x, y, z FROM child ORDER BY x, y DESC, z +---- +1 {1,2,3,4} {5} +1 {1,2,3} {4} +1 {1,NULL,NULL} {10} +1 {1,NULL} {5} +2 {NULL} {1} +3 {NULL,1,NULL} {3} diff --git a/pkg/sql/logictest/testdata/logic_test/order_by b/pkg/sql/logictest/testdata/logic_test/order_by index 10dda90f2265..0a30a48cc47f 100644 --- a/pkg/sql/logictest/testdata/logic_test/order_by +++ b/pkg/sql/logictest/testdata/logic_test/order_by @@ -187,18 +187,6 @@ SELECT * FROM t ORDER BY foo query error no data source matches prefix: a SELECT a FROM t ORDER BY a.b -query error can't order by column type int\[\] -SELECT generate_series FROM generate_series(1, 100) ORDER BY ARRAY[generate_series] - -query error can't order by column type int\[\] -SELECT ARRAY[generate_series] FROM generate_series(1, 100) ORDER BY ARRAY[generate_series] - -query error can't order by column type int\[\] -SELECT ARRAY[generate_series] FROM generate_series(1, 100) ORDER BY 1 - -query error can't order by column type int\[\] -SELECT ARRAY[generate_series] AS a FROM generate_series(1, 100) ORDER BY a - query IT SELECT generate_series, ARRAY[generate_series] FROM generate_series(1, 1) ORDER BY 1 ---- diff --git a/pkg/sql/opt/exec/execbuilder/testdata/inverted_index b/pkg/sql/opt/exec/execbuilder/testdata/inverted_index index 93bbfc575d06..04088a049537 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/inverted_index +++ b/pkg/sql/opt/exec/execbuilder/testdata/inverted_index @@ -578,3 +578,17 @@ lookup-join · · └── scan · · () · · table e@e_b_idx · · · fixedvals 1 column · · + +# Ensure that an inverted index with a composite primary key still encodes +# the primary key data in the composite value. 
+statement ok +DROP TABLE IF EXISTS t; +CREATE TABLE t (x DECIMAL PRIMARY KEY, y int[], FAMILY (x, y)); +CREATE INVERTED INDEX ON t(y) + +query T kvtrace +INSERT INTO t VALUES (1.00, ARRAY[1,2]) +---- +CPut /Table/56/1/1/0 -> /TUPLE/1:1:Decimal/1.00/ +InitPut /Table/56/2/1/1/0 -> /BYTES/0x1503348964 +InitPut /Table/56/2/2/1/0 -> /BYTES/0x1503348964 diff --git a/pkg/sql/opt/optbuilder/orderby.go b/pkg/sql/opt/optbuilder/orderby.go index 2058110d0574..e2775cea4fc0 100644 --- a/pkg/sql/opt/optbuilder/orderby.go +++ b/pkg/sql/opt/optbuilder/orderby.go @@ -258,10 +258,8 @@ func (b *Builder) analyzeExtraArgument( func ensureColumnOrderable(e tree.TypedExpr) { typ := e.ResolvedType() - if typ.Family() == types.ArrayFamily { - panic(unimplementedWithIssueDetailf(35707, "", "can't order by column type %s", typ)) - } - if typ.Family() == types.JsonFamily { + if typ.Family() == types.JsonFamily || + (typ.Family() == types.ArrayFamily && typ.ArrayContents().Family() == types.JsonFamily) { panic(unimplementedWithIssueDetailf(35706, "", "can't order by column type jsonb")) } } diff --git a/pkg/sql/opt/optbuilder/testdata/orderby b/pkg/sql/opt/optbuilder/testdata/orderby index 235e0e24c958..5940a1dd51c2 100644 --- a/pkg/sql/opt/optbuilder/testdata/orderby +++ b/pkg/sql/opt/optbuilder/testdata/orderby @@ -391,22 +391,70 @@ error (42P01): no data source matches prefix: a build SELECT generate_series FROM generate_series(1, 100) ORDER BY ARRAY[generate_series] ---- -error (0A000): unimplemented: can't order by column type int[] +sort + ├── columns: generate_series:1 [hidden: column2:2] + ├── ordering: +2 + └── project + ├── columns: column2:2 generate_series:1 + ├── project-set + │ ├── columns: generate_series:1 + │ ├── values + │ │ └── () + │ └── zip + │ └── generate_series(1, 100) + └── projections + └── ARRAY[generate_series:1] [as=column2:2] build SELECT ARRAY[generate_series] FROM generate_series(1, 100) ORDER BY ARRAY[generate_series] ---- -error (0A000): unimplemented: can't 
order by column type int[] +sort + ├── columns: array:2 + ├── ordering: +2 + └── project + ├── columns: array:2 + ├── project-set + │ ├── columns: generate_series:1 + │ ├── values + │ │ └── () + │ └── zip + │ └── generate_series(1, 100) + └── projections + └── ARRAY[generate_series:1] [as=array:2] build SELECT ARRAY[generate_series] FROM generate_series(1, 100) ORDER BY 1 ---- -error (0A000): unimplemented: can't order by column type int[] +sort + ├── columns: array:2 + ├── ordering: +2 + └── project + ├── columns: array:2 + ├── project-set + │ ├── columns: generate_series:1 + │ ├── values + │ │ └── () + │ └── zip + │ └── generate_series(1, 100) + └── projections + └── ARRAY[generate_series:1] [as=array:2] build SELECT ARRAY[generate_series] AS a FROM generate_series(1, 100) ORDER BY a ---- -error (0A000): unimplemented: can't order by column type int[] +sort + ├── columns: a:2 + ├── ordering: +2 + └── project + ├── columns: a:2 + ├── project-set + │ ├── columns: generate_series:1 + │ ├── values + │ │ └── () + │ └── zip + │ └── generate_series(1, 100) + └── projections + └── ARRAY[generate_series:1] [as=a:2] build SELECT generate_series, ARRAY[generate_series] FROM generate_series(1, 1) ORDER BY 1 @@ -988,4 +1036,12 @@ project build SELECT ARRAY[a] FROM abcd ORDER BY 1 ---- -error (0A000): unimplemented: can't order by column type int[] +sort + ├── columns: array:5!null + ├── ordering: +5 + └── project + ├── columns: array:5!null + ├── scan abcd + │ └── columns: a:1!null b:2 c:3 d:4 + └── projections + └── ARRAY[a:1] [as=array:5] diff --git a/pkg/sql/rowcontainer/disk_row_container.go b/pkg/sql/rowcontainer/disk_row_container.go index e7d3a3e3d920..21b39641d9fc 100644 --- a/pkg/sql/rowcontainer/disk_row_container.go +++ b/pkg/sql/rowcontainer/disk_row_container.go @@ -114,7 +114,7 @@ func MakeDiskRowContainer( // returns true may not necessarily need to be encoded in the value, so // make this more fine-grained. 
See IsComposite() methods in // pkg/sql/parser/datum.go. - if _, ok := orderingIdxs[i]; !ok || sqlbase.HasCompositeKeyEncoding(d.types[i].Family()) { + if _, ok := orderingIdxs[i]; !ok || sqlbase.HasCompositeKeyEncoding(&d.types[i]) { d.valueIdxs = append(d.valueIdxs, i) } } @@ -244,7 +244,7 @@ func (d *DiskRowContainer) Close(ctx context.Context) { func (d *DiskRowContainer) keyValToRow(k []byte, v []byte) (sqlbase.EncDatumRow, error) { for i, orderInfo := range d.ordering { // Types with composite key encodings are decoded from the value. - if sqlbase.HasCompositeKeyEncoding(d.types[orderInfo.ColIdx].Family()) { + if sqlbase.HasCompositeKeyEncoding(&d.types[orderInfo.ColIdx]) { // Skip over the encoded key. encLen, err := encoding.PeekLength(k) if err != nil { diff --git a/pkg/sql/sem/tree/datum.go b/pkg/sql/sem/tree/datum.go index 7b60bbb76b39..6eb5827bf1d3 100644 --- a/pkg/sql/sem/tree/datum.go +++ b/pkg/sql/sem/tree/datum.go @@ -3517,6 +3517,16 @@ func (d *DArray) ResolvedType() *types.T { return types.MakeArray(d.ParamTyp) } +// IsComposite implements the CompositeDatum interface. +func (d *DArray) IsComposite() bool { + for _, elem := range d.Array { + if cdatum, ok := elem.(CompositeDatum); ok && cdatum.IsComposite() { + return true + } + } + return false +} + // FirstIndex returns the first index of the array. 1 for normal SQL arrays, // which are 1-indexed, and 0 for the special Postgers vector types which are // 0-indexed. 
diff --git a/pkg/sql/sqlbase/column_type_encoding.go b/pkg/sql/sqlbase/column_type_encoding.go index 65bde1c29a4c..3886667616f0 100644 --- a/pkg/sql/sqlbase/column_type_encoding.go +++ b/pkg/sql/sqlbase/column_type_encoding.go @@ -143,6 +143,8 @@ func EncodeTableKey(b []byte, val tree.Datum, dir encoding.Direction) ([]byte, e } } return b, nil + case *tree.DArray: + return encodeArrayKey(b, t, dir) case *tree.DCollatedString: if dir == encoding.Ascending { return encoding.EncodeBytesAscending(b, t.Key), nil @@ -187,6 +189,8 @@ func DecodeTableKey( var err error switch valType.Family() { + case types.ArrayFamily: + return decodeArrayKey(a, valType, key, dir) case types.BitFamily: var r bitarray.BitArray if dir == encoding.Ascending { @@ -918,6 +922,68 @@ func decodeTuple(a *DatumAlloc, tupTyp *types.T, b []byte) (tree.Datum, []byte, return a.NewDTuple(result), b, nil } +// encodeArrayKey generates an ordered key encoding of an array. +// The encoding format for an array [a, b] is as follows: +// [arrayMarker, enc(a), enc(b), terminator]. +// The terminator is guaranteed to be less than all encoded values, +// so two arrays with the same prefix but different lengths will sort +// correctly. The key difference is that NULL values need to be encoded +// differently, because the standard NULL encoding conflicts with the +// terminator byte. This NULL value is chosen to be larger than the +// terminator but less than all existing encoded values. +func encodeArrayKey(b []byte, array *tree.DArray, dir encoding.Direction) ([]byte, error) { + var err error + b = encoding.EncodeArrayKeyMarker(b, dir) + for _, elem := range array.Array { + if elem == tree.DNull { + b = encoding.EncodeNullWithinArrayKey(b, dir) + } else { + b, err = EncodeTableKey(b, elem, dir) + if err != nil { + return nil, err + } + } + } + return encoding.EncodeArrayKeyTerminator(b, dir), nil +} + +// decodeArrayKey decodes an array key generated by encodeArrayKey. 
+func decodeArrayKey( + a *DatumAlloc, t *types.T, buf []byte, dir encoding.Direction, +) (tree.Datum, []byte, error) { + var err error + buf, err = encoding.ValidateAndConsumeArrayKeyMarker(buf, dir) + if err != nil { + return nil, nil, err + } + + result := tree.NewDArray(t.ArrayContents()) + + for { + if len(buf) == 0 { + return nil, nil, errors.AssertionFailedf("invalid array encoding (unterminated)") + } + if encoding.IsArrayKeyDone(buf, dir) { + buf = buf[1:] + break + } + var d tree.Datum + if encoding.IsNextByteArrayEncodedNull(buf, dir) { + d = tree.DNull + buf = buf[1:] + } else { + d, buf, err = DecodeTableKey(a, t.ArrayContents(), buf, dir) + if err != nil { + return nil, nil, err + } + } + if err := result.Append(d); err != nil { + return nil, nil, err + } + } + return result, buf, nil +} + // encodeArray produces the value encoding for an array. func encodeArray(d *tree.DArray, scratch []byte) ([]byte, error) { if err := d.Validate(); err != nil { diff --git a/pkg/sql/sqlbase/column_type_encoding_test.go b/pkg/sql/sqlbase/column_type_encoding_test.go index ea11f8064ed7..63cba669795b 100644 --- a/pkg/sql/sqlbase/column_type_encoding_test.go +++ b/pkg/sql/sqlbase/column_type_encoding_test.go @@ -34,6 +34,13 @@ func genColumnType() gopter.Gen { } } +func genRandomArrayType() gopter.Gen { + return func(genParams *gopter.GenParameters) *gopter.GenResult { + arrType := RandArrayType(genParams.Rng) + return gopter.NewGenResult(arrType, gopter.NoShrinker) + } +} + func genDatum() gopter.Gen { return func(genParams *gopter.GenParameters) *gopter.GenResult { return gopter.NewGenResult(RandDatum(genParams.Rng, RandColumnType(genParams.Rng), @@ -48,6 +55,14 @@ func genDatumWithType(columnType interface{}) gopter.Gen { } } +func genArrayDatumWithType(arrTyp interface{}) gopter.Gen { + return func(genParams *gopter.GenParameters) *gopter.GenResult { + // Mark the array contents to have a 1 in 10 chance of being null. 
+ datum := RandArray(genParams.Rng, arrTyp.(*types.T), 10) + return gopter.NewGenResult(datum, gopter.NoShrinker) + } +} + func genEncodingDirection() gopter.Gen { return func(genParams *gopter.GenParameters) *gopter.GenResult { return gopter.NewGenResult( @@ -59,9 +74,11 @@ func genEncodingDirection() gopter.Gen { func hasKeyEncoding(typ *types.T) bool { // Only some types are round-trip key encodable. switch typ.Family() { - case types.JsonFamily, types.ArrayFamily, types.CollatedStringFamily, types.TupleFamily, types.DecimalFamily, + case types.JsonFamily, types.CollatedStringFamily, types.TupleFamily, types.DecimalFamily, types.GeographyFamily, types.GeometryFamily: return false + case types.ArrayFamily: + return hasKeyEncoding(typ.ArrayContents()) } return true } @@ -102,61 +119,75 @@ func TestEncodeTableKey(t *testing.T) { parameters := gopter.DefaultTestParameters() parameters.MinSuccessfulTests = 10000 properties := gopter.NewProperties(parameters) + roundtripDatum := func(d tree.Datum, dir encoding.Direction) string { + b, err := EncodeTableKey(nil, d, dir) + if err != nil { + return "error: " + err.Error() + } + newD, leftoverBytes, err := DecodeTableKey(a, d.ResolvedType(), b, dir) + if len(leftoverBytes) > 0 { + return "Leftover bytes" + } + if err != nil { + return "error: " + err.Error() + } + if newD.Compare(ctx, d) != 0 { + return "unequal" + } + return "" + } properties.Property("roundtrip", prop.ForAll( - func(d tree.Datum, dir encoding.Direction) string { - b, err := EncodeTableKey(nil, d, dir) - if err != nil { - return "error: " + err.Error() - } - newD, leftoverBytes, err := DecodeTableKey(a, d.ResolvedType(), b, dir) - if len(leftoverBytes) > 0 { - return "Leftover bytes" - } - if err != nil { - return "error: " + err.Error() - } - if newD.Compare(ctx, d) != 0 { - return "unequal" - } - return "" - }, + roundtripDatum, genColumnType(). SuchThat(hasKeyEncoding). 
FlatMap(genDatumWithType, reflect.TypeOf((*tree.Datum)(nil)).Elem()), genEncodingDirection(), )) - properties.Property("order-preserving", prop.ForAll( - func(datums []tree.Datum, dir encoding.Direction) string { - d1 := datums[0] - d2 := datums[1] - b1, err := EncodeTableKey(nil, d1, dir) - if err != nil { - return "error: " + err.Error() - } - b2, err := EncodeTableKey(nil, d2, dir) - if err != nil { - return "error: " + err.Error() - } - expectedCmp := d1.Compare(ctx, d2) - cmp := bytes.Compare(b1, b2) + // Also run the property on arrays possibly containing NULL values. + // The random generator in the property above does not generate NULLs. + properties.Property("roundtrip-arrays", prop.ForAll( + roundtripDatum, + genRandomArrayType(). + SuchThat(hasKeyEncoding). + FlatMap(genArrayDatumWithType, reflect.TypeOf((*tree.Datum)(nil)).Elem()), + genEncodingDirection(), + )) - if expectedCmp == 0 { - if cmp != 0 { - return fmt.Sprintf("equal inputs produced inequal outputs: \n%v\n%v", b1, b2) - } - // If the inputs are equal and so are the outputs, no more checking to do. - return "" - } + generateAndCompareDatums := func(datums []tree.Datum, dir encoding.Direction) string { + d1 := datums[0] + d2 := datums[1] + b1, err := EncodeTableKey(nil, d1, dir) + if err != nil { + return "error: " + err.Error() + } + b2, err := EncodeTableKey(nil, d2, dir) + if err != nil { + return "error: " + err.Error() + } - cmpsMatch := expectedCmp == cmp - dirIsAscending := dir == encoding.Ascending + expectedCmp := d1.Compare(ctx, d2) + cmp := bytes.Compare(b1, b2) - if cmpsMatch != dirIsAscending { - return fmt.Sprintf("non-order preserving encoding: \n%v\n%v", b1, b2) + if expectedCmp == 0 { + if cmp != 0 { + return fmt.Sprintf("equal inputs produced inequal outputs: \n%v\n%v", b1, b2) } + // If the inputs are equal and so are the outputs, no more checking to do. 
return "" - }, + } + + cmpsMatch := expectedCmp == cmp + dirIsAscending := dir == encoding.Ascending + + if cmpsMatch != dirIsAscending { + return fmt.Sprintf("non-order preserving encoding: \n%v\n%v", b1, b2) + } + return "" + } + + properties.Property("order-preserving", prop.ForAll( + generateAndCompareDatums, // For each column type, generate two datums of that type. genColumnType(). SuchThat(hasKeyEncoding). @@ -176,6 +207,31 @@ func TestEncodeTableKey(t *testing.T) { }), genEncodingDirection(), )) + + // Also run the property on arrays possibly containing NULL values. + // The random generator in the property above does not generate NULLs. + properties.Property("order-preserving-arrays", prop.ForAll( + generateAndCompareDatums, + // For each column type, generate two datums of that type. + genRandomArrayType(). + SuchThat(hasKeyEncoding). + FlatMap( + func(t interface{}) gopter.Gen { + colTyp := t.(*types.T) + return gopter.CombineGens( + genArrayDatumWithType(colTyp), + genArrayDatumWithType(colTyp)) + }, reflect.TypeOf([]interface{}{})). + Map(func(datums []interface{}) []tree.Datum { + ret := make([]tree.Datum, len(datums)) + for i, d := range datums { + ret[i] = d.(tree.Datum) + } + return ret + }), + genEncodingDirection(), + )) + properties.TestingRun(t) } diff --git a/pkg/sql/sqlbase/encoded_datum_test.go b/pkg/sql/sqlbase/encoded_datum_test.go index f8f1ee708cf4..743be19ae850 100644 --- a/pkg/sql/sqlbase/encoded_datum_test.go +++ b/pkg/sql/sqlbase/encoded_datum_test.go @@ -248,7 +248,7 @@ func TestEncDatumCompare(t *testing.T) { // These cases require decoding. Data with a composite key encoding cannot // be decoded from their key part alone. 
- if !HasCompositeKeyEncoding(typ.Family()) { + if !HasCompositeKeyEncoding(typ) { checkEncDatumCmp(t, a, typ, &v1, &v2, noncmp, noncmp, -1, true) checkEncDatumCmp(t, a, typ, &v2, &v1, desc, noncmp, +1, true) checkEncDatumCmp(t, a, typ, &v1, &v1, asc, desc, 0, true) @@ -277,7 +277,7 @@ func TestEncDatumFromBuffer(t *testing.T) { var buf []byte enc := make([]DatumEncoding, len(ed)) for i := range ed { - if HasCompositeKeyEncoding(typs[i].Family()) { + if HasCompositeKeyEncoding(&typs[i]) { // There's no way to reconstruct data from the key part of a composite // encoding. enc[i] = DatumEncoding_VALUE diff --git a/pkg/sql/sqlbase/index_encoding.go b/pkg/sql/sqlbase/index_encoding.go index e3fa02bbcb80..dfbc95d8c993 100644 --- a/pkg/sql/sqlbase/index_encoding.go +++ b/pkg/sql/sqlbase/index_encoding.go @@ -1042,11 +1042,11 @@ func EncodeSecondaryIndex( key = append(key, extraKey...) } - // We do all computation that affects indexes with families in a separate code path to avoid performance - // regression for tables without column families. if len(tableDesc.Families) == 1 || secondaryIndex.Type == IndexDescriptor_INVERTED || secondaryIndex.Version == BaseIndexFormatVersion { + // We do all computation that affects indexes with families in a separate code path to avoid performance + // regression for tables without column families. entry, err := encodeSecondaryIndexNoFamilies(secondaryIndex, colMap, key, values, extraKey) if err != nil { return []IndexEntry{}, err @@ -1207,6 +1207,11 @@ func encodeSecondaryIndexNoFamilies( cols = append(cols, valueEncodedColumn{id: id, isComposite: false}) } for _, id := range index.CompositeColumnIDs { + // Inverted indexes on a composite type (i.e. an array of composite types) + // should not add the indexed column to the value. 
+ if index.Type == IndexDescriptor_INVERTED && id == index.ColumnIDs[0] { + continue + } cols = append(cols, valueEncodedColumn{id: id, isComposite: true}) } sort.Sort(byID(cols)) diff --git a/pkg/sql/sqlbase/structured.go b/pkg/sql/sqlbase/structured.go index e792bd65e7bf..481e5ede76f0 100644 --- a/pkg/sql/sqlbase/structured.go +++ b/pkg/sql/sqlbase/structured.go @@ -35,6 +35,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/errors" + "github.com/lib/pq/oid" ) // ID, ColumnID, FamilyID, and IndexID are all uint32, but are each given a @@ -1281,12 +1282,14 @@ func (desc *MutableTableDescriptor) ensurePrimaryKey() error { // key, so that different strings that collate equal cannot both be used as // keys. The value part is the usual UTF-8 encoding of the string, stored so // that it can be recovered later for inspection/display. -func HasCompositeKeyEncoding(semanticType types.Family) bool { - switch semanticType { +func HasCompositeKeyEncoding(semanticType *types.T) bool { + switch semanticType.Family() { case types.CollatedStringFamily, types.FloatFamily, types.DecimalFamily: return true + case types.ArrayFamily: + return HasCompositeKeyEncoding(semanticType.ArrayContents()) } return false } @@ -1294,17 +1297,24 @@ func HasCompositeKeyEncoding(semanticType types.Family) bool { // DatumTypeHasCompositeKeyEncoding is a version of HasCompositeKeyEncoding // which works on datum types. func DatumTypeHasCompositeKeyEncoding(typ *types.T) bool { - return HasCompositeKeyEncoding(typ.Family()) + return HasCompositeKeyEncoding(typ) } // MustBeValueEncoded returns true if columns of the given kind can only be value // encoded. 
-func MustBeValueEncoded(semanticType types.Family) bool { - return semanticType == types.ArrayFamily || - semanticType == types.JsonFamily || - semanticType == types.TupleFamily || - semanticType == types.GeometryFamily || - semanticType == types.GeographyFamily +func MustBeValueEncoded(semanticType *types.T) bool { + switch semanticType.Family() { + case types.ArrayFamily: + switch semanticType.Oid() { + case oid.T_int2vector, oid.T_oidvector: + return true + default: + return MustBeValueEncoded(semanticType.ArrayContents()) + } + case types.JsonFamily, types.TupleFamily, types.GeographyFamily, types.GeometryFamily: + return true + } + return false } // HasOldStoredColumns returns whether the index has stored columns in the old @@ -1346,7 +1356,7 @@ func (desc *MutableTableDescriptor) allocateIndexIDs(columnNames map[string]Colu isCompositeColumn := make(map[ColumnID]struct{}) for i := range desc.Columns { col := &desc.Columns[i] - if HasCompositeKeyEncoding(col.Type.Family()) { + if HasCompositeKeyEncoding(&col.Type) { isCompositeColumn[col.ID] = struct{}{} } } @@ -2273,7 +2283,7 @@ func fitColumnToFamily(desc *MutableTableDescriptor, col ColumnDescriptor) (int, // ColumnTypeIsIndexable returns whether the type t is valid as an indexed column. func ColumnTypeIsIndexable(t *types.T) bool { - return !MustBeValueEncoded(t.Family()) + return !MustBeValueEncoded(t) } // ColumnTypeIsInvertedIndexable returns whether the type t is valid to be indexed diff --git a/pkg/sql/sqlbase/testutils.go b/pkg/sql/sqlbase/testutils.go index 5dbdc6ec0dea..8ec3c7eebdcc 100644 --- a/pkg/sql/sqlbase/testutils.go +++ b/pkg/sql/sqlbase/testutils.go @@ -265,17 +265,7 @@ func RandDatumWithNullChance(rng *rand.Rand, typ *types.T, nullChance int) tree. 
case types.UnknownFamily: return tree.DNull case types.ArrayFamily: - contents := typ.ArrayContents() - if contents.Family() == types.AnyFamily { - contents = RandArrayContentsType(rng) - } - arr := tree.NewDArray(contents) - for i := 0; i < rng.Intn(10); i++ { - if err := arr.Append(RandDatumWithNullChance(rng, contents, 0)); err != nil { - panic(err) - } - } - return arr + return RandArray(rng, typ, 0) case types.AnyFamily: return RandDatumWithNullChance(rng, RandType(rng), nullChance) default: @@ -283,6 +273,22 @@ func RandDatumWithNullChance(rng *rand.Rand, typ *types.T, nullChance int) tree. } } +// RandArray generates a random DArray where the contents have nullChance +// of being null. +func RandArray(rng *rand.Rand, typ *types.T, nullChance int) tree.Datum { + contents := typ.ArrayContents() + if contents.Family() == types.AnyFamily { + contents = RandArrayContentsType(rng) + } + arr := tree.NewDArray(contents) + for i := 0; i < rng.Intn(10); i++ { + if err := arr.Append(RandDatumWithNullChance(rng, contents, nullChance)); err != nil { + panic(err) + } + } + return arr +} + const simpleRange = 10 // RandDatumSimple generates a random Datum of the given type. The generated @@ -767,6 +773,17 @@ func RandColumnType(rng *rand.Rand) *types.T { } } +// RandArrayType generates a random array type. +func RandArrayType(rng *rand.Rand) *types.T { + for { + typ := RandColumnType(rng) + resTyp := types.MakeArray(typ) + if err := ValidateColumnDefType(resTyp); err == nil { + return resTyp + } + } +} + // RandColumnTypes returns a slice of numCols random types. These types must be // legal table column types. func RandColumnTypes(rng *rand.Rand, numCols int) []types.T { @@ -780,7 +797,7 @@ func RandColumnTypes(rng *rand.Rand, numCols int) []types.T { // RandSortingType returns a column type which can be key-encoded. 
func RandSortingType(rng *rand.Rand) *types.T { typ := RandType(rng) - for MustBeValueEncoded(typ.Family()) { + for MustBeValueEncoded(typ) { typ = RandType(rng) } return typ @@ -1325,7 +1342,7 @@ func randIndexTableDefFromCols( indexElemList := make(tree.IndexElemList, 0, len(cols)) for i := range cols { - semType := cols[i].Type.Family() + semType := cols[i].Type if MustBeValueEncoded(semType) { continue } diff --git a/pkg/util/encoding/encoding.go b/pkg/util/encoding/encoding.go index 232947d87792..1738d1fcae88 100644 --- a/pkg/util/encoding/encoding.go +++ b/pkg/util/encoding/encoding.go @@ -83,6 +83,20 @@ const ( timeTZMarker = bitArrayDescMarker + 1 geoMarker = timeTZMarker + 1 + // Markers and terminators for key encoding Datum arrays in sorted order. + arrayKeyMarker = geoMarker + 1 + arrayKeyDescendingMarker = arrayKeyMarker + 1 + arrayKeyTerminator byte = 0x00 + arrayKeyDescendingTerminator = 0xFF + // We use different null encodings for nulls within key arrays. + // Doing this allows for the terminator to be less/greater than + // the null value within arrays. These byte values overlap with + // encodedNotNull, encodedNotNullDesc, and interleavedSentinel, + // but they can only exist within an encoded array key. Because + // of the context, they cannot be ambiguous with these other bytes. + ascendingNullWithinArrayKey byte = 0x01 + descendingNullWithinArrayKey = 0xFE + // IntMin is chosen such that the range of int tags does not overlap the // ascii character set that is frequently used in testing. IntMin = 0x80 // 128 @@ -1277,6 +1291,8 @@ const ( BitArrayDesc Type = 18 // BitArray encoded descendingly TimeTZ Type = 19 Geo Type = 20 + ArrayKeyAsc Type = 21 // Array key encoding + ArrayKeyDesc Type = 22 // Array key encoded descendingly ) // typMap maps an encoded type byte to a decoded Type. 
It's got 256 slots, one @@ -1309,6 +1325,10 @@ func slowPeekType(b []byte) Type { return Null case m == encodedNotNull, m == encodedNotNullDesc: return NotNull + case m == arrayKeyMarker: + return ArrayKeyAsc + case m == arrayKeyDescendingMarker: + return ArrayKeyDesc case m == bytesMarker: return Bytes case m == bytesDescMarker: @@ -1369,6 +1389,30 @@ func getMultiNonsortingVarintLen(b []byte, num int) (int, error) { return p, nil } +// getArrayLength returns the length of a key encoded array. The input +// must have had the array type marker stripped from the front. +func getArrayLength(buf []byte, dir Direction) (int, error) { + result := 0 + for { + if len(buf) == 0 { + return 0, errors.AssertionFailedf("invalid array encoding (unterminated)") + } + if IsArrayKeyDone(buf, dir) { + // Increment to include the terminator byte. + result++ + break + } + next, err := PeekLength(buf) + if err != nil { + return 0, err + } + // Shift buf over by the encoded data amount. + buf = buf[next:] + result += next + } + return result, nil +} + // PeekLength returns the length of the encoded value at the start of b. Note: // if this function succeeds, it's not a guarantee that decoding the value will // succeed. PeekLength is meant to be used on key encoded data only. @@ -1383,6 +1427,9 @@ func PeekLength(b []byte) (int, error) { // interleavedSentinel also falls into this path. Since it // contains the same byte value as encodedNotNullDesc, it // cannot be included explicitly in the case statement. + // ascendingNullWithinArrayKey and descendingNullWithinArrayKey also + // contain the same byte values as encodedNotNull and encodedNotNullDesc + // respectively. 
return 1, nil case bitArrayMarker, bitArrayDescMarker: terminator := byte(bitArrayDataTerminator) @@ -1398,6 +1445,13 @@ func PeekLength(b []byte) (int, error) { return 1 + n + m + 1, err } return 1 + n + m + 1, nil + case arrayKeyMarker, arrayKeyDescendingMarker: + dir := Ascending + if m == arrayKeyDescendingMarker { + dir = Descending + } + length, err := getArrayLength(b[1:], dir) + return 1 + length, err case bytesMarker: return getBytesLength(b, ascendingEscapes) case jsonInvertedIndex: @@ -1510,7 +1564,7 @@ func prettyPrintValueImpl(valDirs []Direction, b []byte, sep string) (string, bo // even if we don't have directions for the child index's columns. func prettyPrintFirstValue(dir Direction, b []byte) ([]byte, string, error) { var err error - switch PeekType(b) { + switch typ := PeekType(b); typ { case Null: b, _ = DecodeIfNull(b) return b, "NULL", nil @@ -1520,6 +1574,46 @@ func prettyPrintFirstValue(dir Direction, b []byte) ([]byte, string, error) { return b[1:], "False", nil case Array: return b[1:], "Arr", nil + case ArrayKeyAsc, ArrayKeyDesc: + encDir := Ascending + if typ == ArrayKeyDesc { + encDir = Descending + } + var build strings.Builder + buf, err := ValidateAndConsumeArrayKeyMarker(b, encDir) + if err != nil { + return nil, "", err + } + build.WriteString("ARRAY[") + first := true + // Use the array key decoding logic, but instead of calling out + // to DecodeTableKey, just make a recursive call. 
+ for { + if len(buf) == 0 { + return nil, "", errors.AssertionFailedf("invalid array (unterminated)") + } + if IsArrayKeyDone(buf, encDir) { + buf = buf[1:] + break + } + var next string + if IsNextByteArrayEncodedNull(buf, dir) { + next = "NULL" + buf = buf[1:] + } else { + buf, next, err = prettyPrintFirstValue(dir, buf) + if err != nil { + return nil, "", err + } + } + if !first { + build.WriteString(",") + } + build.WriteString(next) + first = false + } + build.WriteString("]") + return buf, build.String(), nil case NotNull: // The tag can be either encodedNotNull or encodedNotNullDesc. The // latter can be an interleaved sentinel. @@ -2603,3 +2697,76 @@ func getJSONInvertedIndexKeyLength(buf []byte) (int, error) { return len + valLen, nil } } + +// EncodeArrayKeyMarker adds the array key encoding marker to buf and +// returns the new buffer. +func EncodeArrayKeyMarker(buf []byte, dir Direction) []byte { + switch dir { + case Ascending: + return append(buf, arrayKeyMarker) + case Descending: + return append(buf, arrayKeyDescendingMarker) + default: + panic("invalid direction") + } +} + +// EncodeArrayKeyTerminator adds the array key terminator to buf and +// returns the new buffer. +func EncodeArrayKeyTerminator(buf []byte, dir Direction) []byte { + switch dir { + case Ascending: + return append(buf, arrayKeyTerminator) + case Descending: + return append(buf, arrayKeyDescendingTerminator) + default: + panic("invalid direction") + } +} + +// EncodeNullWithinArrayKey encodes NULL within a key encoded array. +func EncodeNullWithinArrayKey(buf []byte, dir Direction) []byte { + switch dir { + case Ascending: + return append(buf, ascendingNullWithinArrayKey) + case Descending: + return append(buf, descendingNullWithinArrayKey) + default: + panic("invalid direction") + } +} + +// IsNextByteArrayEncodedNull returns if the first byte in the input +// is the NULL encoded byte within an array key. 
+func IsNextByteArrayEncodedNull(buf []byte, dir Direction) bool { + expected := ascendingNullWithinArrayKey + if dir == Descending { + expected = descendingNullWithinArrayKey + } + return buf[0] == expected +} + +// ValidateAndConsumeArrayKeyMarker checks that the marker at the front +// of buf is valid for an array of the given direction, and consumes it +// if so. It returns an error if the tag is invalid. +func ValidateAndConsumeArrayKeyMarker(buf []byte, dir Direction) ([]byte, error) { + typ := PeekType(buf) + expected := ArrayKeyAsc + if dir == Descending { + expected = ArrayKeyDesc + } + if typ != expected { + return nil, errors.Newf("invalid type found %s", typ) + } + return buf[1:], nil +} + +// IsArrayKeyDone returns if the first byte in the input is the array +// terminator for the input direction. +func IsArrayKeyDone(buf []byte, dir Direction) bool { + expected := arrayKeyTerminator + if dir == Descending { + expected = arrayKeyDescendingTerminator + } + return buf[0] == expected +} diff --git a/pkg/util/encoding/type_string.go b/pkg/util/encoding/type_string.go index 096817a610b9..51ff7aec93a1 100644 --- a/pkg/util/encoding/type_string.go +++ b/pkg/util/encoding/type_string.go @@ -29,11 +29,13 @@ func _() { _ = x[BitArrayDesc-18] _ = x[TimeTZ-19] _ = x[Geo-20] + _ = x[ArrayKeyAsc-21] + _ = x[ArrayKeyDesc-22] } -const _Type_name = "UnknownNullNotNullIntFloatDecimalBytesBytesDescTimeDurationTrueFalseUUIDArrayIPAddrJSONTupleBitArrayBitArrayDescTimeTZGeo" +const _Type_name = "UnknownNullNotNullIntFloatDecimalBytesBytesDescTimeDurationTrueFalseUUIDArrayIPAddrJSONTupleBitArrayBitArrayDescTimeTZGeoArrayKeyAscArrayKeyDesc" -var _Type_index = [...]uint8{0, 7, 11, 18, 21, 26, 33, 38, 47, 51, 59, 63, 68, 72, 77, 83, 87, 92, 100, 112, 118, 121} +var _Type_index = [...]uint8{0, 7, 11, 18, 21, 26, 33, 38, 47, 51, 59, 63, 68, 72, 77, 83, 87, 92, 100, 112, 118, 121, 132, 144} func (i Type) String() string { if i < 0 || i >= Type(len(_Type_index)-1) {