From 3d438d0d72e9a8ae036ad7146d9981be17cec8eb Mon Sep 17 00:00:00 2001 From: Rebecca Taft Date: Fri, 14 Jan 2022 17:31:44 -0600 Subject: [PATCH] opt: fix bug in histogram calculation for TimeTZ This commit fixes a bug in the histogram estimation code for TimeTZ that made the faulty assumption that TimeTZ values are ordered by TimeOfDay. This is incorrect since it does not take the OffsetSecs into account. As a result, it was possible to estimate that the size of a histogram bucket was negative, which caused problems in the statistics estimation code. This commit fixes the problem by taking into account both TimeOfDay and OffsetSecs when estimating the size of a bucket in a TimeTZ histogram. Fixes #74667 Release note (bug fix): Fixed an internal error, "estimated row count must be non-zero", that could occur during planning for queries over a table with a TimeTZ column. This error was due to a faulty assumption in the statistics estimation code about ordering of TimeTZ values, which has now been fixed. The error could occur when TimeTZ values used in the query had a different time zone offset than the TimeTZ values stored in the table. --- pkg/sql/opt/memo/testdata/stats/groupby | 73 ++++++++++++++++++++++ pkg/sql/opt/props/BUILD.bazel | 1 + pkg/sql/opt/props/histogram.go | 17 ++++- pkg/sql/opt/props/histogram_test.go | 70 ++++++++++++++++++--- pkg/sql/sem/tree/datum_integration_test.go | 4 +- 5 files changed, 151 insertions(+), 14 deletions(-) diff --git a/pkg/sql/opt/memo/testdata/stats/groupby b/pkg/sql/opt/memo/testdata/stats/groupby index 64a7fc9c459a..e696d8303ebb 100644 --- a/pkg/sql/opt/memo/testdata/stats/groupby +++ b/pkg/sql/opt/memo/testdata/stats/groupby @@ -526,3 +526,76 @@ project │ └── bool_or:5 [type=bool, outer=(5), constraints=(/5: [/true - /true]; tight), fd=()-->(5)] └── projections └── 1 [as="?column?":6, type=int] + +# Regression test for #74667. +exec-ddl +CREATE TABLE t74667 (col TIMETZ PRIMARY KEY) +---- + +exec-ddl +ALTER TABLE t74667 INJECT STATISTICS '[ + { + "columns": [ + "col" + ], + "created_at": "2000-01-01 00:00:00+00:00", + "distinct_count": 950814763580487611, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 3873172219268179689, + "num_range": 0, + "upper_bound": "00:00:00+15:59:00" + }, + { + "distinct_range": 400000, + "num_eq": 3000000000, + "num_range": 400000, + "upper_bound": "04:40:23.558699+11:08:00" + }, + { + "distinct_range": 381143202295070850, + "num_eq": 6399369578112136136, + "num_range": 381143202295070816, + "upper_bound": "06:12:15.740051+06:40:00" + } + ], + "histo_col_type": "TIMETZ", + "name": "__auto__", + "null_count": 0, + "row_count": 1188522479222658429 + } +]' +---- + +norm +SELECT count(*) +FROM t74667 +WHERE col < '03:33:05.598931+07:11:00':::TIMETZ +GROUP BY col; +---- +project + ├── columns: count:4(int!null) + ├── stats: [rows=2.00509067e+16] + └── group-by + ├── columns: col:1(timetz!null) count_rows:4(int!null) + ├── grouping columns: col:1(timetz!null) + ├── stats: [rows=2.00509067e+16, distinct(1)=2.00509067e+16, null(1)=0] + ├── key: (1) + ├── fd: (1)-->(4) + ├── select + │ ├── columns: col:1(timetz!null) + │ ├── stats: [rows=4.52141047e+17, distinct(1)=2.00509067e+16, null(1)=0] + │ │ histogram(1)= 0 4.3209e+17 44624 3.3468e+08 2.0051e+16 0 + │ │ <--- '00:00:00+15:59:00' ------- '04:40:23.558699+11:08:00' ------------ '03:33:06.598931+07:11:01' + │ ├── key: (1) + │ ├── scan t74667 + │ │ ├── columns: col:1(timetz!null) + │ │ ├── stats: [rows=1.18852248e+18, distinct(1)=9.50814764e+17, null(1)=0] + │ │ │ histogram(1)= 0 4.3209e+17 44624 3.3468e+08 4.252e+16 7.1391e+17 + │ │ │ <--- '00:00:00+15:59:00' ------- '04:40:23.558699+11:08:00' ----------- '06:12:15.740051+06:40:00' + │ │ └── key: (1) + │ └── filters + │ └── col:1 < '03:33:05.598931+07:11:00' [type=bool, outer=(1), constraints=(/1: (/NULL - /'03:33:06.598931+07:11:01']; tight)] + └── aggregations + └── count-rows [as=count_rows:4, type=int] diff --git a/pkg/sql/opt/props/BUILD.bazel b/pkg/sql/opt/props/BUILD.bazel index f88850baa337..b84cad1ab205 100644 --- a/pkg/sql/opt/props/BUILD.bazel +++ b/pkg/sql/opt/props/BUILD.bazel @@ -27,6 +27,7 @@ go_library( "//pkg/sql/types", "//pkg/util/encoding", "//pkg/util/log", + "//pkg/util/timetz", "@com_github_cockroachdb_errors//:errors", "@com_github_olekukonko_tablewriter//:tablewriter", ], diff --git a/pkg/sql/opt/props/histogram.go b/pkg/sql/opt/props/histogram.go index 80a4704df2a8..3f3be3429198 100644 --- a/pkg/sql/opt/props/histogram.go +++ b/pkg/sql/opt/props/histogram.go @@ -17,6 +17,7 @@ import ( "io" "math" "sort" + "time" "github.com/cockroachdb/cockroach/pkg/sql/inverted" "github.com/cockroachdb/cockroach/pkg/sql/opt" @@ -26,6 +27,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/types" "github.com/cockroachdb/cockroach/pkg/util/encoding" + "github.com/cockroachdb/cockroach/pkg/util/timetz" "github.com/cockroachdb/errors" "github.com/olekukonko/tablewriter" ) @@ -785,9 +787,18 @@ func getRangesBeforeAndAfter( return rng, true case types.TimeTZFamily: - lower := lowerBound.(*tree.DTimeTZ).TimeOfDay - upper := upperBound.(*tree.DTimeTZ).TimeOfDay - rng = float64(upper) - float64(lower) + // timeTZOffsetSecsRange is the total number of possible values for offset. + timeTZOffsetSecsRange := timetz.MaxTimeTZOffsetSecs - timetz.MinTimeTZOffsetSecs + 1 + + // Find the range in microseconds based on the absolute times of the range + // boundaries. + lower := lowerBound.(*tree.DTimeTZ) + upper := upperBound.(*tree.DTimeTZ) + rng = float64(upper.ToTime().Sub(lower.ToTime()) / time.Microsecond) + + // Account for the offset. + rng *= float64(timeTZOffsetSecsRange) + rng += float64(upper.OffsetSecs - lower.OffsetSecs) return rng, true default: diff --git a/pkg/sql/opt/props/histogram_test.go b/pkg/sql/opt/props/histogram_test.go index eca72b201a3b..41fb081741b3 100644 --- a/pkg/sql/opt/props/histogram_test.go +++ b/pkg/sql/opt/props/histogram_test.go @@ -384,7 +384,7 @@ func TestFilterBucket(t *testing.T) { t.Fatalf("for span %s expected an error", testCase.span) } if !reflect.DeepEqual(testCase.expected, actual) { - t.Fatalf("for span %s exected %v but found %v", testCase.span, testCase.expected, actual) + t.Errorf("for span %s exected %v but found %v", testCase.span, testCase.expected, actual) } } } @@ -612,17 +612,17 @@ func TestFilterBucket(t *testing.T) { }) t.Run("timetz", func(t *testing.T) { - upperBound, _, err := tree.ParseDTimeTZ(&evalCtx, "05:00:00", time.Microsecond) + upperBound1, _, err := tree.ParseDTimeTZ(&evalCtx, "05:00:00", time.Microsecond) if err != nil { t.Fatal(err) } - lowerBound, _, err := tree.ParseDTimeTZ(&evalCtx, "04:00:00", time.Microsecond) + lowerBound1, _, err := tree.ParseDTimeTZ(&evalCtx, "04:00:00", time.Microsecond) if err != nil { t.Fatal(err) } - h := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{ - {NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: getPrevUpperBound(lowerBound)}, - {NumEq: 1, NumRange: 62, DistinctRange: 31, UpperBound: upperBound}, + h1 := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{ + {NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: getPrevUpperBound(lowerBound1)}, + {NumEq: 1, NumRange: 62, DistinctRange: 31, UpperBound: upperBound1}, }} ub1, _, err := tree.ParseDTimeTZ(&evalCtx, "04:15:00", time.Microsecond) @@ -630,18 +630,70 @@ func TestFilterBucket(t *testing.T) { t.Fatal(err) } - testData := []testCase{ + testData1 := []testCase{ { span: "[/04:00:00 - /04:15:00]", expected: &cat.HistogramBucket{NumEq: 0, NumRange: 15.5, DistinctRange: 7.75, UpperBound: ub1}, }, { span: "[/04:30:00 - /05:00:00]", - expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound}, + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound1}, + }, + { + span: "[/06:30:00+02:00:00 - /05:00:00]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound1}, + }, + { + span: "[/08:30:00+04:00:00 - /05:00:00]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound1}, + }, + } + + // This test distinguishes between values that have the same time but + // different offsets. + upperBound2, _, err := tree.ParseDTimeTZ(&evalCtx, "05:00:00.000001", time.Microsecond) + if err != nil { + t.Fatal(err) + } + h2 := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{ + {NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: upperBound1}, + {NumEq: 1, NumRange: 10000, DistinctRange: 1000, UpperBound: upperBound2}, + }} + + ub2, _, err := tree.ParseDTimeTZ(&evalCtx, "20:59:00.000001+15:59:00", time.Microsecond) + if err != nil { + t.Fatal(err) + } + + testData2 := []testCase{ + { + span: "[/05:00:00.000001 - /05:00:00.000001]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 0, DistinctRange: 0, UpperBound: upperBound2}, + }, + { + span: "[/07:00:00.000001+02:00:00 - /05:00:00.000001]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 625.65, DistinctRange: 62.57, UpperBound: upperBound2}, + }, + { + span: "[/09:00:00.000001+04:00:00 - /05:00:00.000001]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 1251.3, DistinctRange: 125.13, UpperBound: upperBound2}, + }, + { + span: "[/04:59:59-00:00:01 - /20:59:00.000001+15:59:00]", + expected: &cat.HistogramBucket{NumEq: 0, NumRange: 5000, DistinctRange: 500, UpperBound: ub2}, + }, + { + span: "[/20:59:00.000001+15:59:00 - /05:00:00.000001]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 5000, DistinctRange: 500, UpperBound: upperBound2}, + }, + { + span: "[/04:59:58-00:00:02 - /05:00:00.000001]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 9999.91, DistinctRange: 999.99, UpperBound: upperBound2}, }, } - runTest(h, testData, types.TimeTZFamily) + runTest(h1, testData1, types.TimeTZFamily) + runTest(h2, testData2, types.TimeTZFamily) }) t.Run("string", func(t *testing.T) { diff --git a/pkg/sql/sem/tree/datum_integration_test.go b/pkg/sql/sem/tree/datum_integration_test.go index 4a6fc4d2b0b8..12d9d5863063 100644 --- a/pkg/sql/sem/tree/datum_integration_test.go +++ b/pkg/sql/sem/tree/datum_integration_test.go @@ -1019,7 +1019,7 @@ func TestDTimeTZPrev(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - rng, _ := randutil.NewTestRand() + rng, _ := randutil.NewPseudoRand() evalCtx := &tree.EvalContext{ SessionDataStack: sessiondata.NewStack(&sessiondata.SessionData{ Location: time.UTC, @@ -1094,7 +1094,7 @@ func TestDTimeTZNext(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - rng, _ := randutil.NewTestRand() + rng, _ := randutil.NewPseudoRand() evalCtx := &tree.EvalContext{ SessionDataStack: sessiondata.NewStack(&sessiondata.SessionData{ Location: time.UTC,