diff --git a/pkg/roachpb/data.go b/pkg/roachpb/data.go index 2390c6e2f7c9..2e6fb90d4a74 100644 --- a/pkg/roachpb/data.go +++ b/pkg/roachpb/data.go @@ -2029,6 +2029,17 @@ func (s Span) EqualValue(o Span) bool { return s.Key.Equal(o.Key) && s.EndKey.Equal(o.EndKey) } +// Compare returns an integer comparing two Spans lexicographically. +// The result will be 0 if s==o, -1 if s starts before o or if the starts +// are equal and s ends before o, and +1 otherwise. +func (s Span) Compare(o Span) int { + cmp := bytes.Compare(s.Key, o.Key) + if cmp == 0 { + return bytes.Compare(s.EndKey, o.EndKey) + } + return cmp +} + // Overlaps returns true WLOG for span A and B iff: // 1. Both spans contain one key (just the start key) and they are equal; or // 2. The span with only one key is contained inside the other span; or diff --git a/pkg/sql/lex/BUILD.bazel b/pkg/sql/lex/BUILD.bazel index 7fdc01cd8bc5..a5e0e27619e7 100644 --- a/pkg/sql/lex/BUILD.bazel +++ b/pkg/sql/lex/BUILD.bazel @@ -6,6 +6,7 @@ go_library( "encode.go", "experimental_keywords.go", "keywords.go", # keep + "reserved_keywords.go", "tokens.go", # keep ], importpath = "github.com/cockroachdb/cockroach/pkg/sql/lex", diff --git a/pkg/sql/rowenc/index_encoding.go b/pkg/sql/rowenc/index_encoding.go index 0b0b8a654829..ace32bc32574 100644 --- a/pkg/sql/rowenc/index_encoding.go +++ b/pkg/sql/rowenc/index_encoding.go @@ -806,6 +806,51 @@ func EncodeInvertedIndexTableKeys( return nil, errors.AssertionFailedf("trying to apply inverted index to unsupported type %s", datum.ResolvedType()) } +// EncodeContainingInvertedIndexSpans takes in a key prefix and returns the +// spans that must be scanned in the inverted index to evaluate a contains (@>) +// predicate with the given datum, which should be a container (either JSON +// or Array). These spans should be used to find the objects in the index that +// contain the given json or array. 
In other words, if we have a predicate +// x @> y, this function should use the value of y to find the spans to scan +// in an inverted index on x. +// +// The spans returned by EncodeContainingInvertedIndexSpans represent the +// intersection of unions. For example, if the returned results are: +// +// { {["a", "b"), ["c", "d")}, {["e", "f")} } +// +// the expression should be evaluated as: +// +// INTERSECTION +// / \ +// UNION ["e", "f") +// / \ +// ["a", "b") ["c", "d") +// +// The input inKey is prefixed to all returned keys. +func EncodeContainingInvertedIndexSpans( + evalCtx *tree.EvalContext, val tree.Datum, inKey []byte, version descpb.IndexDescriptorVersion, +) (spans []roachpb.Spans, tight bool, err error) { + if val == tree.DNull { + return nil, false, nil + } + datum := tree.UnwrapDatum(evalCtx, val) + switch val.ResolvedType().Family() { + case types.JsonFamily: + return json.EncodeContainingInvertedIndexSpans(inKey, val.(*tree.DJSON).JSON) + case types.ArrayFamily: + spans, err := encodeContainingArrayInvertedIndexSpans(val.(*tree.DArray), inKey, version) + if err != nil { + return nil, false, err + } + // Spans for array inverted indexes are always tight. + return spans, true, err + } + return nil, false, errors.AssertionFailedf( + "trying to apply inverted index to unsupported type %s", datum.ResolvedType(), + ) +} + // encodeArrayInvertedIndexTableKeys returns a list of inverted index keys for // the given input array, one per entry in the array. The input inKey is // prefixed to all returned keys. @@ -842,6 +887,29 @@ func encodeArrayInvertedIndexTableKeys( return outKeys, nil } +// encodeContainingArrayInvertedIndexSpans returns the spans that must be +// scanned in the inverted index to evaluate a contains (@>) predicate with +// the given array, one slice of spans per entry in the array. The input +// inKey is prefixed to all returned keys. 
+func encodeContainingArrayInvertedIndexSpans( + val *tree.DArray, inKey []byte, version descpb.IndexDescriptorVersion, +) (spans []roachpb.Spans, err error) { + if val.Array.Len() == 0 { + // All arrays contain the empty array. + return []roachpb.Spans{{roachpb.Span{Key: inKey}}}, nil + } + + keys, err := encodeArrayInvertedIndexTableKeys(val, inKey, version) + if err != nil { + return nil, err + } + spans = make([]roachpb.Spans, len(keys)) + for i, key := range keys { + spans[i] = roachpb.Spans{{Key: key}} + } + return spans, nil +} + // EncodeGeoInvertedIndexTableKeys is the equivalent of EncodeInvertedIndexTableKeys // for Geography and Geometry. func EncodeGeoInvertedIndexTableKeys( diff --git a/pkg/sql/rowenc/index_encoding_test.go b/pkg/sql/rowenc/index_encoding_test.go index 4df89ac51cdb..97a9bb381382 100644 --- a/pkg/sql/rowenc/index_encoding_test.go +++ b/pkg/sql/rowenc/index_encoding_test.go @@ -404,6 +404,103 @@ func TestInvertedIndexKey(t *testing.T) { } } +func TestEncodeContainingArrayInvertedIndexSpans(t *testing.T) { + testCases := []struct { + value string + contains string + expected bool + }{ + // This test uses EncodeInvertedIndexTableKeys and EncodeContainingInvertedIndexSpans + // to determine whether the first Array value contains the second. If the first + // value contains the second, expected is true. Otherwise it is false. 
+ {`{}`, `{}`, true}, + {`{}`, `{1}`, false}, + {`{1}`, `{}`, true}, + {`{1}`, `{1}`, true}, + {`{1}`, `{1, 2}`, false}, + {`{1, 2}`, `{1}`, true}, + {`{1, 2}`, `{2}`, true}, + {`{1, 2}`, `{1, 2}`, true}, + {`{1, 2}`, `{1, 2, 1}`, true}, + {`{1, 2, 3}`, `{1, 2, 4}`, false}, + {`{1, 2, 3}`, `{}`, true}, + } + + evalCtx := tree.MakeTestingEvalContext(cluster.MakeTestingClusterSettings()) + parseArray := func(s string) tree.Datum { + arr, _, err := tree.ParseDArrayFromString(&evalCtx, s, types.Int) + if err != nil { + t.Fatalf("Failed to parse array %s: %v", s, err) + } + return arr + } + + version := descpb.EmptyArraysInInvertedIndexesVersion + for _, c := range testCases { + value, contains := parseArray(c.value), parseArray(c.contains) + + // First check that evaluating `value @> contains` matches the expected + // result. + res, err := tree.ArrayContains(&evalCtx, value.(*tree.DArray), contains.(*tree.DArray)) + if err != nil { + t.Fatal(err) + } + if bool(*res) != c.expected { + t.Fatalf( + "expected value of %s @> %s did not match actual value. Expected: %v. Got: %s", + c.value, c.contains, c.expected, res.String(), + ) + } + + // Now check that we get the same result with the inverted index spans. + keys, err := EncodeInvertedIndexTableKeys(value, nil, version) + if err != nil { + t.Fatal(err) + } + + spansSlice, _, err := EncodeContainingInvertedIndexSpans(&evalCtx, contains, nil, version) + if err != nil { + t.Fatal(err) + } + + // The spans returned by EncodeContainingInvertedIndexSpans represent the + // intersection of unions. So the below logic is performing a union on the + // inner loop (any span in the slice can contain any of the keys), and an + // intersection on the outer loop (all of the span slices must contain at + // least one key). + actual := true + for _, spans := range spansSlice { + found := false + for _, span := range spans { + if span.EndKey == nil { + // ContainsKey expects that the EndKey is filled in. 
+ span.EndKey = span.Key.PrefixEnd() + } + for _, key := range keys { + if span.ContainsKey(key) { + found = true + break + } + } + if found == true { + break + } + } + actual = actual && found + } + + if actual != c.expected { + if c.expected { + t.Errorf("expected %s to contain %s but it did not", + c.value, c.contains) + } else { + t.Errorf("expected %s not to contain %s but it did", + c.value, c.contains) + } + } + } +} + type arrayEncodingTest struct { name string datum tree.DArray diff --git a/pkg/util/encoding/encoding.go b/pkg/util/encoding/encoding.go index 86e9bbc2f3d1..bb907c91b7a5 100644 --- a/pkg/util/encoding/encoding.go +++ b/pkg/util/encoding/encoding.go @@ -941,6 +941,14 @@ func EncodeNotNullAscending(b []byte) []byte { return append(b, encodedNotNull) } +// EncodeJSONObjectSpanStartAscending encodes the first possible value for JSON +// objects, which is \x00\xff. Non-objects (i.e., scalars and arrays) will +// start with \x00\x01 or \x00\x03 (see AddJSONPathTerminator and +// EncodeArrayAscending), so all objects will be ordered after them. +func EncodeJSONObjectSpanStartAscending(b []byte) []byte { + return append(b, escape, escaped00) +} + // EncodeArrayAscending encodes a value used to signify membership of an array for JSON objects. 
func EncodeArrayAscending(b []byte) []byte { return append(b, escape, escapedJSONArray) diff --git a/pkg/util/json/BUILD.bazel b/pkg/util/json/BUILD.bazel index c041090b69f0..da990af5db17 100644 --- a/pkg/util/json/BUILD.bazel +++ b/pkg/util/json/BUILD.bazel @@ -18,6 +18,7 @@ go_library( deps = [ "//pkg/geo", "//pkg/geo/geopb", + "//pkg/roachpb", "//pkg/sql/pgwire/pgcode", "//pkg/sql/pgwire/pgerror", "//pkg/util/encoding", @@ -39,8 +40,10 @@ go_test( deps = [ "//pkg/sql/pgwire/pgerror", "//pkg/util/encoding", + "//pkg/util/randutil", "//pkg/util/timeutil", "//pkg/util/unique", "//vendor/github.com/cockroachdb/apd/v2:apd", + "//vendor/github.com/stretchr/testify/require", ], ) diff --git a/pkg/util/json/encoded.go b/pkg/util/json/encoded.go index 5c70c651353f..6da955393fed 100644 --- a/pkg/util/json/encoded.go +++ b/pkg/util/json/encoded.go @@ -17,6 +17,7 @@ import ( "strconv" "unsafe" + "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/errors" ) @@ -714,6 +715,16 @@ func (j *jsonEncoded) encodeInvertedIndexKeys(b []byte) ([][]byte, error) { return decoded.encodeInvertedIndexKeys(b) } +func (j *jsonEncoded) encodeContainingInvertedIndexSpans( + b []byte, root bool, +) ([]roachpb.Spans, bool, error) { + decoded, err := j.decode() + if err != nil { + return nil, false, err + } + return decoded.encodeContainingInvertedIndexSpans(b, root) +} + // numInvertedIndexEntries implements the JSON interface. 
func (j *jsonEncoded) numInvertedIndexEntries() (int, error) { if j.isScalar() || j.containerLen == 0 { diff --git a/pkg/util/json/json.go b/pkg/util/json/json.go index 8296cdfeeb43..565e718ebd5b 100644 --- a/pkg/util/json/json.go +++ b/pkg/util/json/json.go @@ -25,6 +25,7 @@ import ( "github.com/cockroachdb/apd/v2" "github.com/cockroachdb/cockroach/pkg/geo" "github.com/cockroachdb/cockroach/pkg/geo/geopb" + "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/cockroach/pkg/util/encoding" @@ -72,10 +73,24 @@ type JSON interface { // Size returns the size of the JSON document in bytes. Size() uintptr - // EncodeInvertedIndexKeys takes in a key prefix and returns a slice of inverted index keys, - // one per path through the receiver. + // encodeInvertedIndexKeys takes in a key prefix and returns a slice of + // inverted index keys, one per path through the receiver. encodeInvertedIndexKeys(b []byte) ([][]byte, error) + // encodeContainingInvertedIndexSpans takes in a key prefix and returns + // slices of inverted index spans, one slice of spans per path through the + // receiver. If a path ends in an empty array or object, the corresponding + // slice includes one span for that path, as well as one span for all paths in + // which the empty array or object is replaced with a non-empty array or + // object. This matches the logic of the @> (contains) operator. + // + // If root is true, this function is being called at the root level of the + // JSON hierarchy. + // + // Returns tight=true if the returned spans are tight and cannot produce + // false positives. Otherwise, returns tight=false. + encodeContainingInvertedIndexSpans(b []byte, root bool) (_ []roachpb.Spans, tight bool, err error) + // numInvertedIndexEntries returns the number of entries that will be // produced if this JSON gets included in an inverted index. 
numInvertedIndexEntries() (int, error) @@ -725,27 +740,89 @@ func ParseJSON(s string) (JSON, error) { func EncodeInvertedIndexKeys(b []byte, json JSON) ([][]byte, error) { return json.encodeInvertedIndexKeys(encoding.EncodeJSONAscending(b)) } + +// EncodeContainingInvertedIndexSpans takes in a key prefix and returns the +// spans that must be scanned in the inverted index to evaluate a contains (@>) +// predicate with the given json (i.e., find the objects in the index that +// contain the given json). +// +// The spans returned by EncodeContainingInvertedIndexSpans represent the +// intersection of unions. For example, if the returned results are: +// +// { {["a", "b"), ["c", "d")}, {["e", "f")} } +// +// the expression should be evaluated as: +// +// INTERSECTION +// / \ +// UNION ["e", "f") +// / \ +// ["a", "b") ["c", "d") +// +// Returns tight=true if the returned spans are tight and cannot produce false +// positives. Otherwise, returns tight=false. +func EncodeContainingInvertedIndexSpans( + b []byte, json JSON, +) (spans []roachpb.Spans, tight bool, err error) { + return json.encodeContainingInvertedIndexSpans(encoding.EncodeJSONAscending(b), true /* root */) +} + func (j jsonNull) encodeInvertedIndexKeys(b []byte) ([][]byte, error) { b = encoding.AddJSONPathTerminator(b) return [][]byte{encoding.EncodeNullAscending(b)}, nil } + +func (j jsonNull) encodeContainingInvertedIndexSpans( + b []byte, root bool, +) ([]roachpb.Spans, bool, error) { + return encodeContainingInvertedIndexSpansFromLeaf(j, b, root) +} + func (jsonTrue) encodeInvertedIndexKeys(b []byte) ([][]byte, error) { b = encoding.AddJSONPathTerminator(b) return [][]byte{encoding.EncodeTrueAscending(b)}, nil } + +func (j jsonTrue) encodeContainingInvertedIndexSpans( + b []byte, root bool, +) ([]roachpb.Spans, bool, error) { + return encodeContainingInvertedIndexSpansFromLeaf(j, b, root) +} + func (jsonFalse) encodeInvertedIndexKeys(b []byte) ([][]byte, error) { b = encoding.AddJSONPathTerminator(b) 
return [][]byte{encoding.EncodeFalseAscending(b)}, nil } + +func (j jsonFalse) encodeContainingInvertedIndexSpans( + b []byte, root bool, +) ([]roachpb.Spans, bool, error) { + return encodeContainingInvertedIndexSpansFromLeaf(j, b, root) +} + func (j jsonString) encodeInvertedIndexKeys(b []byte) ([][]byte, error) { b = encoding.AddJSONPathTerminator(b) return [][]byte{encoding.EncodeStringAscending(b, string(j))}, nil } + +func (j jsonString) encodeContainingInvertedIndexSpans( + b []byte, root bool, +) ([]roachpb.Spans, bool, error) { + return encodeContainingInvertedIndexSpansFromLeaf(j, b, root) +} + func (j jsonNumber) encodeInvertedIndexKeys(b []byte) ([][]byte, error) { b = encoding.AddJSONPathTerminator(b) var dec = apd.Decimal(j) return [][]byte{encoding.EncodeDecimalAscending(b, &dec)}, nil } + +func (j jsonNumber) encodeContainingInvertedIndexSpans( + b []byte, root bool, +) ([]roachpb.Spans, bool, error) { + return encodeContainingInvertedIndexSpansFromLeaf(j, b, root) +} + func (j jsonArray) encodeInvertedIndexKeys(b []byte) ([][]byte, error) { // Checking for an empty array. if len(j) == 0 { @@ -764,12 +841,65 @@ func (j jsonArray) encodeInvertedIndexKeys(b []byte) ([][]byte, error) { // Deduplicate the entries, since arrays can have duplicates - we don't want // to emit duplicate keys from this method, as it's more expensive to - // deduplicate keys via KV (which will actually write the keys) than via SQL - // (just an in-memory sort and distinct). + // deduplicate keys via KV (which will actually write the keys) than to do + // it now (just an in-memory sort and distinct). outKeys = unique.UniquifyByteSlices(outKeys) return outKeys, nil } +func (j jsonArray) encodeContainingInvertedIndexSpans( + b []byte, root bool, +) (spans []roachpb.Spans, tight bool, _ error) { + // Checking for an empty array. 
+ if len(j) == 0 { + return encodeContainingInvertedIndexSpansFromLeaf(j, b, root) + } + + prefix := encoding.EncodeArrayAscending(b) + tight = true + for i := range j { + isRoot := true + if j[i].isScalar() { + // Nested arrays and objects should be treated as a root element if they + // are contained in an array, but scalars should not. This is because + // there is special logic for scalars that only applies when they are not + // contained in an object or array. As described inside + // encodeContainingInvertedIndexSpansFromLeaf, if we find a scalar on the + // right side of the @> operator it means that we need to find both + // matching scalars and arrays that contain that value. For example, + // '1' @> '1' and '[1]' @> '1' are both true, but '[[1]]' @> '[1]' is + // false. + isRoot = false + } + children, childTight, err := j[i].encodeContainingInvertedIndexSpans( + prefix[:len(prefix):len(prefix)], isRoot, + ) + if err != nil { + return nil, false, err + } + spans = append(spans, children...) + tight = tight && childTight + + // If the child is also a container with more than one element, we cannot + // produce tight spans. This is because we cannot rely on the keys alone + // to determine whether the top level JSON is contained in another JSON. + // For example, '[[1], [2]]' and '[[1, 2]]' have exactly the same keys, but + // '[[1, 2]]' @> '[[1], [2]]' is true, while '[[1], [2]]' @> '[[1, 2]]' is + // false. We will return tight=false for the second case, which will + // signal the need to filter out false positives. + if isContainerWithMoreThenOneElement(j[i]) { + tight = false + } + } + + // Deduplicate the entries, since arrays can have duplicates - we don't want + // to emit duplicate spans from this method, as it's more expensive to + // deduplicate spans via KV (which will actually write the keys) than to do + // it now (just an in-memory sort and distinct). 
+ spans = unique.SortAndUniquifySpanSets(spans) + return spans, tight, nil +} + func (j jsonObject) encodeInvertedIndexKeys(b []byte) ([][]byte, error) { // Checking for an empty object. if len(j) == 0 { @@ -785,13 +915,7 @@ func (j jsonObject) encodeInvertedIndexKeys(b []byte) ([][]byte, error) { // We're trying to see if this is the end of the JSON path. If it is, then we don't want to // add an extra separator. - end := true - switch j[i].v.(type) { - case jsonArray, jsonObject: - if j[i].v.Len() != 0 { - end = false - } - } + end := isEnd(j[i].v) for _, childBytes := range children { encodedKey := bytes.Join([][]byte{b, @@ -804,6 +928,181 @@ func (j jsonObject) encodeInvertedIndexKeys(b []byte) ([][]byte, error) { return outKeys, nil } +func (j jsonObject) encodeContainingInvertedIndexSpans( + b []byte, root bool, +) (spans []roachpb.Spans, tight bool, _ error) { + if len(j) == 0 { + return encodeContainingInvertedIndexSpansFromLeaf(j, b, root) + } + + tight = true + for i := range j { + // We're trying to see if this is the end of the JSON path. If it is, then + // we don't want to add an extra separator. + end := isEnd(j[i].v) + + prefix := encoding.EncodeJSONKeyStringAscending(b[:len(b):len(b)], string(j[i].k), end) + children, childTight, err := j[i].v.encodeContainingInvertedIndexSpans(prefix, false /* root */) + if err != nil { + return nil, false, err + } + + spans = append(spans, children...) + tight = tight && childTight + + // If the child is also a container with more than one element, we cannot + // produce tight spans. This is because we cannot rely on the keys alone + // to determine whether the top level JSON is contained in another JSON. + // For example, '[{"foo": [1]}, {"foo": [2]}]' and '[{"foo": [1, 2]}]' + // have exactly the same keys, but + // '[{"foo": [1, 2]}]' @> '[{"foo": [1]}, {"foo": [2]}]' is true, while + // '[{"foo": [1]}, {"foo": [2]}]' @> '[{"foo": [1, 2]}]' is false. 
+ // We will return tight=false for the second case, which will signal the + // need to filter out false positives. + if isContainerWithMoreThenOneElement(j[i].v) { + tight = false + } + } + return spans, tight, nil + +} + +// isEnd returns true if a JSON value is the end of the JSON path. +// If it is, then we don't want to add an extra separator when encoding +// the keys. +func isEnd(json JSON) bool { + end := true + switch t := json.(type) { + case jsonArray, jsonObject: + if t.Len() != 0 { + end = false + } + + case *jsonEncoded: + switch t.typ { + case ArrayJSONType, ObjectJSONType: + if t.containerLen != 0 { + end = false + } + } + } + return end +} + +// isContainerWithMoreThenOneElement returns true if the given JSON is a +// container (i.e., JSON object or array) with more than one element. +func isContainerWithMoreThenOneElement(json JSON) bool { + switch t := json.(type) { + case jsonArray, jsonObject: + return t.Len() > 1 + case *jsonEncoded: + switch t.typ { + case ArrayJSONType, ObjectJSONType: + return t.containerLen > 1 + } + } + return false +} + +// encodeContainingInvertedIndexSpansFromLeaf encodes the spans that must be +// scanned in an inverted index to find the JSON objects that contain the given +// leaf JSON value. A leaf is any scalar json such as '1', 'true', or 'null', +// or an empty object or array. +// +// If root is true, this function is being called at the root level of the +// JSON hierarchy. +// +// Returns tight=true if the returned spans are tight and cannot produce +// false positives. Otherwise, returns tight=false. 
+func encodeContainingInvertedIndexSpansFromLeaf( + j JSON, b []byte, root bool, +) (_ []roachpb.Spans, tight bool, _ error) { + keys, err := j.encodeInvertedIndexKeys(b) + if err != nil { + return nil, false, err + } + + var spans roachpb.Spans + prefix := b[:len(b):len(b)] + if !root { + // At this point prefix contains the encoded key but not the + // escape + escapedJSONObjectKeyTerm suffix which is used for non-empty + // objects before encoding the contents of the object. The call to + // EncodeJSONKeyStringAscending(..., false) adds that suffix, and since + // the key is already encoded, we pass the empty string as the key. + prefix = encoding.EncodeJSONKeyStringAscending(prefix, "", false /* end */) + } + + switch t := j.(type) { + case jsonArray: + if t.Len() != 0 { + return nil, false, errors.AssertionFailedf( + "encodeContainingInvertedIndexSpansFromLeaf called on a non-empty jsonArray", + ) + } + + // At this point, `keys` contains the empty array, which ensures that + // '{"a": []}' matches '{"a": []}' and '[]' matches '[]'. This is correct + // because a JSON object or array always contains itself. + + // Add a key to cover all non-empty arrays. It is needed for JSON arrays + // such as '[]' to match '[1]' and '{"a": []}' to match '{"a": [1]}' + // (i.e., '[1]' @> '[]' and '{"a": [1]}' @> '{"a": []}' are true). + // EncodeArrayAscending generates the prefix that is used for all non-empty + // arrays. + keys = append(keys, encoding.EncodeArrayAscending(prefix)) + + case jsonObject: + if t.Len() != 0 { + return nil, false, errors.AssertionFailedf( + "encodeContainingInvertedIndexSpansFromLeaf called on a non-empty jsonObject", + ) + } + + // At this point, `keys` contains the empty object, which ensures that + // '{"a": {}}' matches '{"a": {}}' and '{}' matches '{}'. This is correct + // because a JSON object always contains itself. This key will be converted + // into a span below. + + // Add a span to cover keys for non-empty objects. 
It is needed for + // JSON objects such as '{}' to match '{"a": "b"}', but not '[1]', + // and '{"a": {}}' to match '{"a": {"b": "c"}}', but not '{"a": [1]}' or + // ["a"]. (i.e., '{"a": "b"}' @> '{}' and '{"a": {"b": "c"}}' @> '{"a": {}}' + // are true, but '[1]' @> '{}', '{"a": [1]}' @> '{"a": {}}', and + // '["a"]' @> '{"a": {}}' are false) + spans = append(spans, roachpb.Span{ + // EncodeJSONObjectSpanStartAscending generates the first possible value + // for JSON objects. + Key: roachpb.Key(encoding.EncodeJSONObjectSpanStartAscending(prefix)), + // This end key is equal to jsonInvertedIndex + 1. + EndKey: roachpb.Key(prefix).PrefixEnd(), + }) + + default: + if root { + // If we find a scalar on the right side of the @> operator it means that + // we need to find both matching scalars and arrays that contain that value. + // In order to do this we generate two logical spans, one for the original + // scalar (which we have already done above) and one for arrays containing + // the scalar. + arr := NewArrayBuilder(1) + arr.Add(j) + jArr := arr.Build() + arrKeys, err := jArr.encodeInvertedIndexKeys(prefix) + if err != nil { + return nil, false, err + } + keys = append(keys, arrKeys...) + } + } + + for _, key := range keys { + spans = append(spans, roachpb.Span{Key: key}) + } + + return []roachpb.Spans{spans}, true, nil +} + // NumInvertedIndexEntries returns the number of inverted index entries that // would be created for the given JSON value. 
Since identical elements of an // array are encoded identically in the inverted index, the total number of diff --git a/pkg/util/json/json_test.go b/pkg/util/json/json_test.go index 089143ca49db..a55ab6dbc0d9 100644 --- a/pkg/util/json/json_test.go +++ b/pkg/util/json/json_test.go @@ -22,8 +22,10 @@ import ( "github.com/cockroachdb/apd/v2" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/cockroach/pkg/util/encoding" + "github.com/cockroachdb/cockroach/pkg/util/randutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/unique" + "github.com/stretchr/testify/require" ) func eachPair(a, b JSON, f func(a, b JSON)) { @@ -1343,6 +1345,166 @@ func TestEncodeJSONInvertedIndex(t *testing.T) { } } +func TestEncodeJSONInvertedIndexSpans(t *testing.T) { + testCases := []struct { + value string + contains string + expected bool + tight bool + }{ + // This test uses EncodeInvertedIndexKeys and + // EncodeContainingInvertedIndexSpans to determine whether the first JSON + // value contains the second. If the first value contains the second, + // expected is true. Otherwise expected is false. If the spans produced for + // contains are tight, tight is true. Otherwise tight is false. + {`{}`, `{}`, true, true}, + {`[]`, `[]`, true, true}, + {`[]`, `{}`, false, true}, + {`"a"`, `"a"`, true, true}, + {`null`, `{}`, false, true}, + {`{}`, `true`, false, true}, + {`[[], {}]`, `[]`, true, true}, + {`[[], {}]`, `{}`, false, true}, // Surprising, but matches Postgres' behavior. 
+ {`[{"a": "a"}, {"a": "a"}]`, `[]`, true, true}, + {`[[[["a"]]], [[["a"]]]]`, `[]`, true, true}, + {`{}`, `{"a": {}}`, false, true}, + {`{"a": 123.123}`, `{}`, true, true}, + {`{"a": [{}]}`, `{"a": []}`, true, true}, + {`{"a": [{}]}`, `{"a": {}}`, false, true}, + {`{"a": [1]}`, `{"a": []}`, true, true}, + {`{"a": {"b": "c"}}`, `{"a": {}}`, true, true}, + {`{"a": {}}`, `{"a": {"b": true}}`, false, true}, + {`[1, 2, 3, 4, "foo"]`, `[1, 2]`, true, true}, + {`[1, 2, 3, 4, "foo"]`, `[1, "bar"]`, false, true}, + {`{"a": {"b": [1]}}`, `{"a": {"b": [1]}}`, true, true}, + {`{"a": {"b": [1, [2]]}}`, `{"a": {"b": [1]}}`, true, true}, + {`{"a": "b", "c": "d"}`, `{"a": "b", "c": "d"}`, true, true}, + {`{"a": {"b": false}}`, `{"a": {"b": true}}`, false, true}, + {`[{"a": {"b": [1, [2]]}}, "d"]`, `[{"a": {"b": [[2]]}}, "d"]`, true, true}, + {`["a", "a"]`, `"a"`, true, true}, + {`[1, 2, 3, 1]`, `1`, true, true}, + {`[true, false, null, 1.23, "a"]`, `"b"`, false, true}, + {`{"a": {"b": "c", "d": "e"}, "f": "g"}`, `{"a": {"b": "c"}}`, true, true}, + {`{"\u0000\u0001": "b"}`, `{}`, true, true}, + {`{"\u0000\u0001": {"\u0000\u0001": "b"}}`, `{"\u0000\u0001": {}}`, true, true}, + {`[[1], false, null]`, `[null, []]`, true, true}, + {`[[[], {}], false, null]`, `[null, []]`, true, true}, + {`[false, null]`, `[null, []]`, false, true}, + {`[[], null]`, `[null, []]`, true, true}, + {`[{"a": []}, null]`, `[null, []]`, false, true}, + {`[{"a": [[]]}, null]`, `[null, []]`, false, true}, + {`[{"foo": {"bar": "foobar"}}, true]`, `[true, {}]`, true, true}, + {`[{"b": null}, {"bar": "c"}]`, `[{"b": {}}]`, false, true}, + {`[[[[{}], [], false], false], [{}]]`, `[[[[]]]]`, true, true}, + {`[[[[{}], [], false], false], [{}]]`, `[false]`, false, true}, + {`[[{"a": {}, "c": "foo"}, {}], [false]]`, `[[false, {}]]`, false, false}, + {`[[1], [2]]`, `[[1, 2]]`, false, false}, + {`[[1, 2]]`, `[[1], [2]]`, true, true}, + {`{"bar": [["c"]]}`, `{"bar": []}`, true, true}, + {`{"c": [{"a": "b"}, []]}`, `{"c": 
[{}]}`, true, true}, + {`[{"bar": {"foo": {}}}, {"a": []}]`, `[{}, {"a": [], "bar": {}}, {}]`, false, false}, + {`[{"bar": [1]},{"bar": [2]}]`, `[{"bar": [1, 2]}]`, false, false}, + } + + // runTest checks that evaluating `left @> right` using keys from + // EncodeInvertedIndexKeys and spans from EncodeContainingInvertedIndexSpans + // produces the expected result. + // returns tight=true if the spans from EncodeContainingInvertedIndexSpans + // were tight, and tight=false otherwise. + runTest := func(left, right JSON, expected bool) (tight bool) { + keys, err := EncodeInvertedIndexKeys(nil, left) + require.NoError(t, err) + + spansSlice, tight, err := EncodeContainingInvertedIndexSpans(nil, right) + require.NoError(t, err) + + // The spans returned by EncodeContainingInvertedIndexSpans represent the intersection + // of unions. So the below logic is performing a union on the inner loop + // (any span in the slice can contain any of the keys), and an intersection + // on the outer loop (all of the span slices must contain at least one key). + actual := true + for _, spans := range spansSlice { + found := false + for _, span := range spans { + if span.EndKey == nil { + // ContainsKey expects that the EndKey is filled in. + span.EndKey = span.Key.PrefixEnd() + } + for _, key := range keys { + if span.ContainsKey(key) { + found = true + break + } + } + if found == true { + break + } + } + actual = actual && found + } + + // There may be some false positives, so filter those out. + if actual && !tight { + actual, err = Contains(left, right) + require.NoError(t, err) + } + + if actual != expected { + if expected { + t.Errorf("expected %s to contain %s but it did not", left.String(), right.String()) + } else { + t.Errorf("expected %s not to contain %s but it did", left.String(), right.String()) + } + } + + return tight + } + + // Run pre-defined test cases from above. 
+ for _, c := range testCases { + value, contains := jsonTestShorthand(c.value), jsonTestShorthand(c.contains) + + // First check that evaluating `value @> contains` matches the expected + // result. + res, err := Contains(value, contains) + require.NoError(t, err) + if res != c.expected { + t.Fatalf( + "expected value of %s @> %s did not match actual value. Expected: %v. Got: %v", + c.value, c.contains, c.expected, res, + ) + } + + // Now check that we get the same result with the inverted index spans. + tight := runTest(value, contains, c.expected) + + // And check that the tightness matches the expected value. + if tight != c.tight { + if c.tight { + t.Errorf("expected spans for %s to be tight but they were not", c.contains) + } else { + t.Errorf("expected spans for %s not to be tight but they were", c.contains) + } + } + } + + // Run a set of randomly generated test cases. + rng, _ := randutil.NewPseudoRand() + for i := 0; i < 100; i++ { + // Generate two random JSONs and evaluate the result of `left @> right`. + left, err := Random(20, rng) + require.NoError(t, err) + right, err := Random(20, rng) + require.NoError(t, err) + + res, err := Contains(left, right) + require.NoError(t, err) + + // Now check that we get the same result with the inverted index spans. 
+ runTest(left, right, res) + } +} + func TestNumInvertedIndexEntries(t *testing.T) { testCases := []struct { value string diff --git a/pkg/util/unique/BUILD.bazel b/pkg/util/unique/BUILD.bazel index 54b4017a8cfe..a9b921427943 100644 --- a/pkg/util/unique/BUILD.bazel +++ b/pkg/util/unique/BUILD.bazel @@ -5,10 +5,12 @@ go_library( srcs = ["unique.go"], importpath = "github.com/cockroachdb/cockroach/pkg/util/unique", visibility = ["//visibility:public"], + deps = ["//pkg/roachpb"], ) go_test( name = "unique_test", srcs = ["unique_test.go"], embed = [":unique"], + deps = ["//pkg/roachpb"], ) diff --git a/pkg/util/unique/unique.go b/pkg/util/unique/unique.go index 589ac13f3fd5..3523f173be32 100644 --- a/pkg/util/unique/unique.go +++ b/pkg/util/unique/unique.go @@ -14,6 +14,8 @@ import ( "bytes" "reflect" "sort" + + "github.com/cockroachdb/cockroach/pkg/roachpb" ) // UniquifyByteSlices takes as input a slice of slices of bytes, and @@ -42,6 +44,47 @@ func UniquifyByteSlices(slices [][]byte) [][]byte { return slices } +// SortAndUniquifySpanSets takes as input a slice of Spans, and deduplicates +// them using a sort and unique. It modifies the input slice in place and +// returns it. The result will not contain any duplicates but it will be sorted +// according to the following logic: +// - Span set a (which has type roachpb.Spans) will be ordered before span set +// b if the first span in a that is not equal to the corresponding span in b +// (i.e., at the same position) is less than it (according to Span.Compare) +// or all corresponding spans are equal but there are fewer spans in a. +// - Each span set will itself be sorted using Span.Compare. +func SortAndUniquifySpanSets(slices []roachpb.Spans) []roachpb.Spans { + if len(slices) == 0 { + return slices + } + + // First sort each slice individually. + for _, slice := range slices { + sort.Slice(slice, func(i int, j int) bool { + return slice[i].Compare(slice[j]) < 0 + }) + } + + // Then sort all slices. 
+ sort.Slice(slices, func(i int, j int) bool { + return compare(slices[i], slices[j]) < 0 + }) + + // Then distinct. + lastUniqueIdx := 0 + for i := 1; i < len(slices); i++ { + if compare(slices[i], slices[lastUniqueIdx]) != 0 { + // We found a unique entry, at index i. The last unique entry in the array + // was at lastUniqueIdx, so set the entry after that one to our new unique + // entry, and bump lastUniqueIdx for the next loop iteration. + lastUniqueIdx++ + slices[lastUniqueIdx] = slices[i] + } + } + slices = slices[:lastUniqueIdx+1] + return slices +} + // UniquifyAcrossSlices removes elements from both slices that are duplicated // across both of the slices. For example, inputs [1,2,3], [2,3,4] would remove // 2 and 3 from both lists. @@ -108,3 +151,27 @@ func UniquifyAcrossSlices( } return lOut, rOut } + +// compare returns an integer comparing two Spans lexicographically. +// - The result will be 0 if a==b. +// - The result will be -1 if the first span in a that is not equal to the +// corresponding span in b (i.e., at the same position) is less than it +// (according to Span.Compare) or all corresponding spans are equal but +// there are fewer spans in a. +// - The result will be +1 otherwise. +// Assumes that each of the Spans are already sorted using Span.Compare. 
+func compare(a, b roachpb.Spans) int { + for i := 0; i < len(a) && i < len(b); i++ { + cmp := a[i].Compare(b[i]) + if cmp != 0 { + return cmp + } + } + if len(a) < len(b) { + return -1 + } + if len(b) < len(a) { + return 1 + } + return 0 +} diff --git a/pkg/util/unique/unique_test.go b/pkg/util/unique/unique_test.go index 85cababb0db6..905260119771 100644 --- a/pkg/util/unique/unique_test.go +++ b/pkg/util/unique/unique_test.go @@ -12,9 +12,12 @@ package unique import ( "fmt" + "math/rand" "reflect" "strconv" "testing" + + "github.com/cockroachdb/cockroach/pkg/roachpb" ) func TestUniquifyByteSlices(t *testing.T) { @@ -68,6 +71,77 @@ func TestUniquifyByteSlices(t *testing.T) { } } +func TestUniquifySpans(t *testing.T) { + tests := []struct { + input [][][]string + expected [][][]string + }{ + { + input: [][][]string{{{"a", "b"}}, {{"a", "b"}}}, + expected: [][][]string{{{"a", "b"}}}, + }, + { + input: [][][]string{}, + expected: [][][]string{}, + }, + { + input: [][][]string{{{"a", "b"}}}, + expected: [][][]string{{{"a", "b"}}}, + }, + { + input: [][][]string{{{"a", "b"}}, {{"a", "b"}, {"c", "d"}}, {{"a", "b"}}}, + expected: [][][]string{{{"a", "b"}}, {{"a", "b"}, {"c", "d"}}}, + }, + { + input: [][][]string{{{"a", "b"}, {"c", "d"}}, {{"a", "b"}}}, + expected: [][][]string{{{"a", "b"}}, {{"a", "b"}, {"c", "d"}}}, + }, + { + input: [][][]string{{{"bar", "foo"}}, {{"bar", "foo"}}, {{"foobar", "foobaz"}}}, + expected: [][][]string{{{"bar", "foo"}}, {{"foobar", "foobaz"}}}, + }, + } + for i, tt := range tests { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + // Add a random permutation within each span set. + for idx := range tt.input { + rand.Shuffle(len(tt.input[idx]), func(i, j int) { + tt.input[idx][i], tt.input[idx][j] = tt.input[idx][j], tt.input[idx][i] + }) + } + + // Add a random permutation at the top level. 
+ rand.Shuffle(len(tt.input), func(i, j int) { + tt.input[i], tt.input[j] = tt.input[j], tt.input[i] + }) + + input := make([]roachpb.Spans, len(tt.input)) + expected := make([]roachpb.Spans, len(tt.expected)) + for i, spans := range tt.input { + input[i] = make(roachpb.Spans, len(spans)) + for j := range spans { + input[i][j] = roachpb.Span{ + Key: roachpb.Key(spans[j][0]), + EndKey: roachpb.Key(spans[j][1]), + } + } + } + for i, spans := range tt.expected { + expected[i] = make(roachpb.Spans, len(spans)) + for j := range spans { + expected[i][j] = roachpb.Span{ + Key: roachpb.Key(spans[j][0]), + EndKey: roachpb.Key(spans[j][1]), + } + } + } + if got := SortAndUniquifySpanSets(input); !reflect.DeepEqual(got, expected) { + t.Errorf("SortAndUniquifySpanSets() = %v, expected %v", got, expected) + } + }) + } +} + type uasTestCase = struct { left []int right []int