Skip to content

Commit

Permalink
opt: fix bug in histogram calculation for TimeTZ
Browse files Browse the repository at this point in the history
This commit fixes a bug in the histogram estimation code for TimeTZ
that made the faulty assumption that TimeTZ values are ordered by
TimeOfDay. This is incorrect since it does not take the OffsetSecs
into account. As a result, it was possible to estimate that the size
of a histogram bucket was negative, which caused problems in the
statistics estimation code. This commit fixes the problem by taking
into account both TimeOfDay and OffsetSecs when estimating the size of
a bucket in a TimeTZ histogram.

Fixes #74667

Release note (bug fix): Fixed an internal error, "estimated row count must
be non-zero", that could occur during planning for queries over a table
with a TimeTZ column. This error was due to a faulty assumption in the
statistics estimation code about ordering of TimeTZ values, which has now
been fixed. The error could occur when TimeTZ values used in the query had
a different time zone offset than the TimeTZ values stored in the table.
  • Loading branch information
rytaft committed Jan 19, 2022
1 parent 81e83ae commit ad7bf04
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 15 deletions.
73 changes: 73 additions & 0 deletions pkg/sql/opt/memo/testdata/stats/groupby
Original file line number Diff line number Diff line change
Expand Up @@ -526,3 +526,76 @@ project
│ └── bool_or:5 [type=bool, outer=(5), constraints=(/5: [/true - /true]; tight), fd=()-->(5)]
└── projections
└── 1 [as="?column?":6, type=int]

# Regression test for #74667.
exec-ddl
CREATE TABLE t74667 (col TIMETZ PRIMARY KEY)
----

exec-ddl
ALTER TABLE t74667 INJECT STATISTICS '[
{
"columns": [
"col"
],
"created_at": "2000-01-01 00:00:00+00:00",
"distinct_count": 950814763580487611,
"histo_buckets": [
{
"distinct_range": 0,
"num_eq": 3873172219268179689,
"num_range": 0,
"upper_bound": "00:00:00+15:59:00"
},
{
"distinct_range": 400000,
"num_eq": 3000000000,
"num_range": 400000,
"upper_bound": "04:40:23.558699+11:08:00"
},
{
"distinct_range": 381143202295070850,
"num_eq": 6399369578112136136,
"num_range": 381143202295070816,
"upper_bound": "06:12:15.740051+06:40:00"
}
],
"histo_col_type": "TIMETZ",
"name": "__auto__",
"null_count": 0,
"row_count": 1188522479222658429
}
]'
----

norm
SELECT count(*)
FROM t74667
WHERE col < '03:33:05.598931+07:11:00':::TIMETZ
GROUP BY col;
----
project
├── columns: count:4(int!null)
├── stats: [rows=2.00509067e+16]
└── group-by (hash)
├── columns: col:1(timetz!null) count_rows:4(int!null)
├── grouping columns: col:1(timetz!null)
├── stats: [rows=2.00509067e+16, distinct(1)=2.00509067e+16, null(1)=0]
├── key: (1)
├── fd: (1)-->(4)
├── select
│ ├── columns: col:1(timetz!null)
│ ├── stats: [rows=4.52141047e+17, distinct(1)=2.00509067e+16, null(1)=0]
│ │ histogram(1)= 0 4.3209e+17 44624 3.3468e+08 2.0051e+16 0
│ │ <--- '00:00:00+15:59:00' ------- '04:40:23.558699+11:08:00' ------------ '03:33:06.598931+07:11:01'
│ ├── key: (1)
│ ├── scan t74667
│ │ ├── columns: col:1(timetz!null)
│ │ ├── stats: [rows=1.18852248e+18, distinct(1)=9.50814764e+17, null(1)=0]
│ │ │ histogram(1)= 0 4.3209e+17 44624 3.3468e+08 4.252e+16 7.1391e+17
│ │ │ <--- '00:00:00+15:59:00' ------- '04:40:23.558699+11:08:00' ----------- '06:12:15.740051+06:40:00'
│ │ └── key: (1)
│ └── filters
│ └── col:1 < '03:33:05.598931+07:11:00' [type=bool, outer=(1), constraints=(/1: (/NULL - /'03:33:06.598931+07:11:01']; tight)]
└── aggregations
└── count-rows [as=count_rows:4, type=int]
1 change: 1 addition & 0 deletions pkg/sql/opt/props/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ go_library(
"//pkg/util/buildutil",
"//pkg/util/encoding",
"//pkg/util/log",
"//pkg/util/timetz",
"@com_github_cockroachdb_errors//:errors",
"@com_github_olekukonko_tablewriter//:tablewriter",
],
Expand Down
25 changes: 19 additions & 6 deletions pkg/sql/opt/props/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"io"
"math"
"sort"
"time"

"github.com/cockroachdb/cockroach/pkg/sql/inverted"
"github.com/cockroachdb/cockroach/pkg/sql/opt"
Expand All @@ -26,6 +27,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/encoding"
"github.com/cockroachdb/cockroach/pkg/util/timetz"
"github.com/cockroachdb/errors"
"github.com/olekukonko/tablewriter"
)
Expand Down Expand Up @@ -805,12 +807,23 @@ func getRangesBeforeAndAfter(
return rangeBefore, rangeAfter, true

case types.TimeTZFamily:
lowerBefore := beforeLowerBound.(*tree.DTimeTZ).TimeOfDay
upperBefore := beforeUpperBound.(*tree.DTimeTZ).TimeOfDay
lowerAfter := afterLowerBound.(*tree.DTimeTZ).TimeOfDay
upperAfter := afterUpperBound.(*tree.DTimeTZ).TimeOfDay
rangeBefore = float64(upperBefore) - float64(lowerBefore)
rangeAfter = float64(upperAfter) - float64(lowerAfter)
// timeTZOffsetSecsRange is the total number of possible values for offset.
timeTZOffsetSecsRange := timetz.MaxTimeTZOffsetSecs - timetz.MinTimeTZOffsetSecs + 1

// Find the ranges in microseconds based on the absolute times of the range
// boundaries.
lowerBefore := beforeLowerBound.(*tree.DTimeTZ)
upperBefore := beforeUpperBound.(*tree.DTimeTZ)
lowerAfter := afterLowerBound.(*tree.DTimeTZ)
upperAfter := afterUpperBound.(*tree.DTimeTZ)
rangeBefore = float64(upperBefore.ToTime().Sub(lowerBefore.ToTime()) / time.Microsecond)
rangeAfter = float64(upperAfter.ToTime().Sub(lowerAfter.ToTime()) / time.Microsecond)

// Account for the offset.
rangeBefore *= float64(timeTZOffsetSecsRange)
rangeAfter *= float64(timeTZOffsetSecsRange)
rangeBefore += float64(upperBefore.OffsetSecs - lowerBefore.OffsetSecs)
rangeAfter += float64(upperAfter.OffsetSecs - lowerAfter.OffsetSecs)
return rangeBefore, rangeAfter, true

case types.StringFamily, types.BytesFamily, types.UuidFamily, types.INetFamily:
Expand Down
70 changes: 61 additions & 9 deletions pkg/sql/opt/props/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ func TestFilterBucket(t *testing.T) {
t.Fatalf("for span %s expected an error", testCase.span)
}
if !reflect.DeepEqual(testCase.expected, actual) {
t.Fatalf("for span %s exected %v but found %v", testCase.span, testCase.expected, actual)
t.Errorf("for span %s exected %v but found %v", testCase.span, testCase.expected, actual)
}
}
}
Expand Down Expand Up @@ -612,36 +612,88 @@ func TestFilterBucket(t *testing.T) {
})

t.Run("timetz", func(t *testing.T) {
upperBound, _, err := tree.ParseDTimeTZ(&evalCtx, "05:00:00", time.Microsecond)
upperBound1, _, err := tree.ParseDTimeTZ(&evalCtx, "05:00:00", time.Microsecond)
if err != nil {
t.Fatal(err)
}
lowerBound, _, err := tree.ParseDTimeTZ(&evalCtx, "04:00:00", time.Microsecond)
lowerBound1, _, err := tree.ParseDTimeTZ(&evalCtx, "04:00:00", time.Microsecond)
if err != nil {
t.Fatal(err)
}
h := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{
{NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: getPrevUpperBound(lowerBound)},
{NumEq: 1, NumRange: 62, DistinctRange: 31, UpperBound: upperBound},
h1 := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{
{NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: getPrevUpperBound(lowerBound1)},
{NumEq: 1, NumRange: 62, DistinctRange: 31, UpperBound: upperBound1},
}}

ub1, _, err := tree.ParseDTimeTZ(&evalCtx, "04:15:00", time.Microsecond)
if err != nil {
t.Fatal(err)
}

testData := []testCase{
testData1 := []testCase{
{
span: "[/04:00:00 - /04:15:00]",
expected: &cat.HistogramBucket{NumEq: 0, NumRange: 15.5, DistinctRange: 7.75, UpperBound: ub1},
},
{
span: "[/04:30:00 - /05:00:00]",
expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound},
expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound1},
},
{
span: "[/06:30:00+02:00:00 - /05:00:00]",
expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound1},
},
{
span: "[/08:30:00+04:00:00 - /05:00:00]",
expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound1},
},
}

// This test distinguishes between values that have the same time but
// different offsets.
upperBound2, _, err := tree.ParseDTimeTZ(&evalCtx, "05:00:00.000001", time.Microsecond)
if err != nil {
t.Fatal(err)
}
h2 := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{
{NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: upperBound1},
{NumEq: 1, NumRange: 10000, DistinctRange: 1000, UpperBound: upperBound2},
}}

ub2, _, err := tree.ParseDTimeTZ(&evalCtx, "20:59:00.000001+15:59:00", time.Microsecond)
if err != nil {
t.Fatal(err)
}

testData2 := []testCase{
{
span: "[/05:00:00.000001 - /05:00:00.000001]",
expected: &cat.HistogramBucket{NumEq: 1, NumRange: 0, DistinctRange: 0, UpperBound: upperBound2},
},
{
span: "[/07:00:00.000001+02:00:00 - /05:00:00.000001]",
expected: &cat.HistogramBucket{NumEq: 1, NumRange: 625.65, DistinctRange: 62.57, UpperBound: upperBound2},
},
{
span: "[/09:00:00.000001+04:00:00 - /05:00:00.000001]",
expected: &cat.HistogramBucket{NumEq: 1, NumRange: 1251.3, DistinctRange: 125.13, UpperBound: upperBound2},
},
{
span: "[/04:59:59-00:00:01 - /20:59:00.000001+15:59:00]",
expected: &cat.HistogramBucket{NumEq: 0, NumRange: 5000, DistinctRange: 500, UpperBound: ub2},
},
{
span: "[/20:59:00.000001+15:59:00 - /05:00:00.000001]",
expected: &cat.HistogramBucket{NumEq: 1, NumRange: 5000, DistinctRange: 500, UpperBound: upperBound2},
},
{
span: "[/04:59:58-00:00:02 - /05:00:00.000001]",
expected: &cat.HistogramBucket{NumEq: 1, NumRange: 9999.91, DistinctRange: 999.99, UpperBound: upperBound2},
},
}

runTest(h, testData, types.TimeTZFamily)
runTest(h1, testData1, types.TimeTZFamily)
runTest(h2, testData2, types.TimeTZFamily)
})

t.Run("string-bytes", func(t *testing.T) {
Expand Down

0 comments on commit ad7bf04

Please sign in to comment.