Skip to content

Commit

Permalink
Align gapfill bucket generation with time_bucket
Browse files Browse the repository at this point in the history
In certain scenarios, when generating buckets with
monthly buckets and different timezones, gapfill
would create timestamps which don't align with
time_bucket and thus potentially generating multiple
rows for an individual month. Instead of relying on
previous timestamp to generate the next one, now
we generate them always from the start point
which will make us align with time_bucket buckets.
  • Loading branch information
antekresic committed Oct 19, 2023
1 parent a409065 commit 65ecda9
Show file tree
Hide file tree
Showing 8 changed files with 200 additions and 57 deletions.
1 change: 1 addition & 0 deletions .unreleased/PR_6155
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixes: #6155 Align gapfill bucket generation with time_bucket
22 changes: 16 additions & 6 deletions tsl/src/nodes/gapfill/gapfill_exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -640,15 +640,15 @@ gapfill_advance_timestamp(GapFillState *state)
{
case DATEOID:
next = DirectFunctionCall2(date_pl_interval,
DateADTGetDatum(state->next_timestamp),
IntervalPGetDatum(state->gapfill_interval));
DateADTGetDatum(state->gapfill_start),
IntervalPGetDatum(state->next_offset));
next = DirectFunctionCall1(timestamp_date, next);
state->next_timestamp = DatumGetDateADT(next);
break;
case TIMESTAMPOID:
next = DirectFunctionCall2(timestamp_pl_interval,
TimestampGetDatum(state->next_timestamp),
IntervalPGetDatum(state->gapfill_interval));
TimestampGetDatum(state->gapfill_start),
IntervalPGetDatum(state->next_offset));
state->next_timestamp = DatumGetTimestamp(next);
break;
case TIMESTAMPTZOID:
Expand All @@ -658,14 +658,22 @@ gapfill_advance_timestamp(GapFillState *state)
*/
next = DirectFunctionCall2(state->have_timezone ? timestamptz_pl_interval :
timestamp_pl_interval,
TimestampTzGetDatum(state->next_timestamp),
IntervalPGetDatum(state->gapfill_interval));
TimestampTzGetDatum(state->gapfill_start),
IntervalPGetDatum(state->next_offset));
state->next_timestamp = DatumGetTimestampTz(next);
break;
default:
state->next_timestamp += state->gapfill_period;
break;
}
/* Advance the interval offset if necessary */
if (state->gapfill_interval)
{
Datum tspan = DirectFunctionCall2(interval_pl,
IntervalPGetDatum(state->gapfill_interval),
IntervalPGetDatum(state->next_offset));
state->next_offset = DatumGetIntervalP(tspan);
}
}

/*
Expand Down Expand Up @@ -742,6 +750,7 @@ gapfill_begin(CustomScanState *node, EState *estate, int eflags)
state->gapfill_start = align_with_time_bucket(state, get_start_arg(state));
}
state->next_timestamp = state->gapfill_start;
state->next_offset = state->gapfill_interval;

/* gap fill end */
if (is_const_null(get_finish_arg(state)))
Expand Down Expand Up @@ -938,6 +947,7 @@ gapfill_state_reset_group(GapFillState *state, TupleTableSlot *slot)
break;
}
}
state->next_offset = state->gapfill_interval;
}

/*
Expand Down
2 changes: 2 additions & 0 deletions tsl/src/nodes/gapfill/gapfill_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ typedef struct GapFillState
Interval *gapfill_interval;

int64 next_timestamp;
/* interval offset for next_timestamp from gapfill_start */
Interval *next_offset;
int64 subslot_time; /* time of tuple in subslot */

int time_index; /* position of time column */
Expand Down
52 changes: 40 additions & 12 deletions tsl/test/shared/expected/gapfill-13.out
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ QUERY PLAN
-> Seq Scan on _hyper_X_X_chunk
(8 rows)

DROP TABLE gapfill_plan_test;
\set METRICS metrics_int
-- All test against table :METRICS first
\set ON_ERROR_STOP 0
Expand Down Expand Up @@ -1579,6 +1580,7 @@ SELECT * FROM gapfill_insert_test;
4
(4 rows)

DROP TABLE gapfill_insert_test;
-- test join
SELECT t1.*,t2.m FROM
(
Expand Down Expand Up @@ -3292,11 +3294,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
time_bucket_gapfill
Fri Dec 31 15:00:00 1999 PST
Tue Feb 29 15:00:00 2000 PST
Sat Apr 29 15:00:00 2000 PDT
Thu Jun 29 15:00:00 2000 PDT
Tue Aug 29 15:00:00 2000 PDT
Sun Oct 29 15:00:00 2000 PST
Fri Dec 29 15:00:00 2000 PST
Sun Apr 30 15:00:00 2000 PDT
Fri Jun 30 15:00:00 2000 PDT
Thu Aug 31 15:00:00 2000 PDT
Tue Oct 31 15:00:00 2000 PST
Sun Dec 31 15:00:00 2000 PST
(7 rows)

SELECT time_bucket_gapfill('2 month'::interval, ts, current_setting('timezone'), '2000-01-01','2001-01-01') FROM (VALUES ('2000-03-01'::timestamptz)) v(ts) GROUP BY 1;
Expand All @@ -3313,11 +3315,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'UTC', '2000-01-01','2001-01
time_bucket_gapfill
Fri Dec 31 16:00:00 1999 PST
Tue Feb 29 16:00:00 2000 PST
Sat Apr 29 16:00:00 2000 PDT
Thu Jun 29 16:00:00 2000 PDT
Tue Aug 29 16:00:00 2000 PDT
Sun Oct 29 16:00:00 2000 PST
Fri Dec 29 16:00:00 2000 PST
Sun Apr 30 16:00:00 2000 PDT
Fri Jun 30 16:00:00 2000 PDT
Thu Aug 31 16:00:00 2000 PDT
Tue Oct 31 16:00:00 2000 PST
Sun Dec 31 16:00:00 2000 PST
(7 rows)

SET timezone TO 'Europe/Berlin';
Expand All @@ -3332,14 +3334,13 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
(6 rows)

RESET timezone;
DROP INDEX gapfill_plan_test_indx;
-- Test gapfill with arrays (#5981)
SELECT time_bucket_gapfill(5, ts, 1, 100) as ts, int_arr, locf(last(value, ts))
FROM (
SELECT ARRAY[1,2,3,4]::int[] as int_arr, x as ts, x+500000 as value
FROM generate_series(1, 10, 100) as x
) t
GROUP BY 1, 2
GROUP BY 1, 2;
ts | int_arr | locf
----+-----------+--------
0 | {1,2,3,4} | 500001
Expand All @@ -3364,3 +3365,30 @@ GROUP BY 1, 2
95 | {1,2,3,4} | 500001
(20 rows)

-- Test gapfill is aligned with non-gapfill time_bucket
-- when using different timezones and month bucketing
CREATE TABLE month_timezone(time timestamptz NOT NULL, value float);
SELECT table_name FROM create_hypertable('month_timezone','time');
table_name
month_timezone
(1 row)

INSERT INTO month_timezone VALUES ('2023-03-01 14:05:00+01', 3.123), ('2023-04-01 14:05:00+01',4.123), ('2023-05-01 14:05:00+01', 5.123);
SELECT
time_bucket_gapfill('1 month'::interval, time, 'Europe/Berlin', '2023-01-01', '2023-07-01') AS time,
sum(value)
FROM
month_timezone
GROUP BY 1;
time | sum
------------------------------+-------
Sat Dec 31 15:00:00 2022 PST |
Tue Jan 31 15:00:00 2023 PST |
Tue Feb 28 15:00:00 2023 PST | 3.123
Fri Mar 31 15:00:00 2023 PDT | 4.123
Sun Apr 30 15:00:00 2023 PDT | 5.123
Wed May 31 15:00:00 2023 PDT |
Fri Jun 30 15:00:00 2023 PDT |
(7 rows)

DROP TABLE month_timezone;
52 changes: 40 additions & 12 deletions tsl/test/shared/expected/gapfill-14.out
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ QUERY PLAN
-> Seq Scan on _hyper_X_X_chunk
(8 rows)

DROP TABLE gapfill_plan_test;
\set METRICS metrics_int
-- All test against table :METRICS first
\set ON_ERROR_STOP 0
Expand Down Expand Up @@ -1579,6 +1580,7 @@ SELECT * FROM gapfill_insert_test;
4
(4 rows)

DROP TABLE gapfill_insert_test;
-- test join
SELECT t1.*,t2.m FROM
(
Expand Down Expand Up @@ -3292,11 +3294,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
time_bucket_gapfill
Fri Dec 31 15:00:00 1999 PST
Tue Feb 29 15:00:00 2000 PST
Sat Apr 29 15:00:00 2000 PDT
Thu Jun 29 15:00:00 2000 PDT
Tue Aug 29 15:00:00 2000 PDT
Sun Oct 29 15:00:00 2000 PST
Fri Dec 29 15:00:00 2000 PST
Sun Apr 30 15:00:00 2000 PDT
Fri Jun 30 15:00:00 2000 PDT
Thu Aug 31 15:00:00 2000 PDT
Tue Oct 31 15:00:00 2000 PST
Sun Dec 31 15:00:00 2000 PST
(7 rows)

SELECT time_bucket_gapfill('2 month'::interval, ts, current_setting('timezone'), '2000-01-01','2001-01-01') FROM (VALUES ('2000-03-01'::timestamptz)) v(ts) GROUP BY 1;
Expand All @@ -3313,11 +3315,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'UTC', '2000-01-01','2001-01
time_bucket_gapfill
Fri Dec 31 16:00:00 1999 PST
Tue Feb 29 16:00:00 2000 PST
Sat Apr 29 16:00:00 2000 PDT
Thu Jun 29 16:00:00 2000 PDT
Tue Aug 29 16:00:00 2000 PDT
Sun Oct 29 16:00:00 2000 PST
Fri Dec 29 16:00:00 2000 PST
Sun Apr 30 16:00:00 2000 PDT
Fri Jun 30 16:00:00 2000 PDT
Thu Aug 31 16:00:00 2000 PDT
Tue Oct 31 16:00:00 2000 PST
Sun Dec 31 16:00:00 2000 PST
(7 rows)

SET timezone TO 'Europe/Berlin';
Expand All @@ -3332,14 +3334,13 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
(6 rows)

RESET timezone;
DROP INDEX gapfill_plan_test_indx;
-- Test gapfill with arrays (#5981)
SELECT time_bucket_gapfill(5, ts, 1, 100) as ts, int_arr, locf(last(value, ts))
FROM (
SELECT ARRAY[1,2,3,4]::int[] as int_arr, x as ts, x+500000 as value
FROM generate_series(1, 10, 100) as x
) t
GROUP BY 1, 2
GROUP BY 1, 2;
ts | int_arr | locf
----+-----------+--------
0 | {1,2,3,4} | 500001
Expand All @@ -3364,3 +3365,30 @@ GROUP BY 1, 2
95 | {1,2,3,4} | 500001
(20 rows)

-- Test gapfill is aligned with non-gapfill time_bucket
-- when using different timezones and month bucketing
CREATE TABLE month_timezone(time timestamptz NOT NULL, value float);
SELECT table_name FROM create_hypertable('month_timezone','time');
table_name
month_timezone
(1 row)

INSERT INTO month_timezone VALUES ('2023-03-01 14:05:00+01', 3.123), ('2023-04-01 14:05:00+01',4.123), ('2023-05-01 14:05:00+01', 5.123);
SELECT
time_bucket_gapfill('1 month'::interval, time, 'Europe/Berlin', '2023-01-01', '2023-07-01') AS time,
sum(value)
FROM
month_timezone
GROUP BY 1;
time | sum
------------------------------+-------
Sat Dec 31 15:00:00 2022 PST |
Tue Jan 31 15:00:00 2023 PST |
Tue Feb 28 15:00:00 2023 PST | 3.123
Fri Mar 31 15:00:00 2023 PDT | 4.123
Sun Apr 30 15:00:00 2023 PDT | 5.123
Wed May 31 15:00:00 2023 PDT |
Fri Jun 30 15:00:00 2023 PDT |
(7 rows)

DROP TABLE month_timezone;
52 changes: 40 additions & 12 deletions tsl/test/shared/expected/gapfill-15.out
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ QUERY PLAN
-> Seq Scan on _hyper_X_X_chunk
(8 rows)

DROP TABLE gapfill_plan_test;
\set METRICS metrics_int
-- All test against table :METRICS first
\set ON_ERROR_STOP 0
Expand Down Expand Up @@ -1579,6 +1580,7 @@ SELECT * FROM gapfill_insert_test;
4
(4 rows)

DROP TABLE gapfill_insert_test;
-- test join
SELECT t1.*,t2.m FROM
(
Expand Down Expand Up @@ -3292,11 +3294,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
time_bucket_gapfill
Fri Dec 31 15:00:00 1999 PST
Tue Feb 29 15:00:00 2000 PST
Sat Apr 29 15:00:00 2000 PDT
Thu Jun 29 15:00:00 2000 PDT
Tue Aug 29 15:00:00 2000 PDT
Sun Oct 29 15:00:00 2000 PST
Fri Dec 29 15:00:00 2000 PST
Sun Apr 30 15:00:00 2000 PDT
Fri Jun 30 15:00:00 2000 PDT
Thu Aug 31 15:00:00 2000 PDT
Tue Oct 31 15:00:00 2000 PST
Sun Dec 31 15:00:00 2000 PST
(7 rows)

SELECT time_bucket_gapfill('2 month'::interval, ts, current_setting('timezone'), '2000-01-01','2001-01-01') FROM (VALUES ('2000-03-01'::timestamptz)) v(ts) GROUP BY 1;
Expand All @@ -3313,11 +3315,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'UTC', '2000-01-01','2001-01
time_bucket_gapfill
Fri Dec 31 16:00:00 1999 PST
Tue Feb 29 16:00:00 2000 PST
Sat Apr 29 16:00:00 2000 PDT
Thu Jun 29 16:00:00 2000 PDT
Tue Aug 29 16:00:00 2000 PDT
Sun Oct 29 16:00:00 2000 PST
Fri Dec 29 16:00:00 2000 PST
Sun Apr 30 16:00:00 2000 PDT
Fri Jun 30 16:00:00 2000 PDT
Thu Aug 31 16:00:00 2000 PDT
Tue Oct 31 16:00:00 2000 PST
Sun Dec 31 16:00:00 2000 PST
(7 rows)

SET timezone TO 'Europe/Berlin';
Expand All @@ -3332,14 +3334,13 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
(6 rows)

RESET timezone;
DROP INDEX gapfill_plan_test_indx;
-- Test gapfill with arrays (#5981)
SELECT time_bucket_gapfill(5, ts, 1, 100) as ts, int_arr, locf(last(value, ts))
FROM (
SELECT ARRAY[1,2,3,4]::int[] as int_arr, x as ts, x+500000 as value
FROM generate_series(1, 10, 100) as x
) t
GROUP BY 1, 2
GROUP BY 1, 2;
ts | int_arr | locf
----+-----------+--------
0 | {1,2,3,4} | 500001
Expand All @@ -3364,3 +3365,30 @@ GROUP BY 1, 2
95 | {1,2,3,4} | 500001
(20 rows)

-- Test gapfill is aligned with non-gapfill time_bucket
-- when using different timezones and month bucketing
CREATE TABLE month_timezone(time timestamptz NOT NULL, value float);
SELECT table_name FROM create_hypertable('month_timezone','time');
table_name
month_timezone
(1 row)

INSERT INTO month_timezone VALUES ('2023-03-01 14:05:00+01', 3.123), ('2023-04-01 14:05:00+01',4.123), ('2023-05-01 14:05:00+01', 5.123);
SELECT
time_bucket_gapfill('1 month'::interval, time, 'Europe/Berlin', '2023-01-01', '2023-07-01') AS time,
sum(value)
FROM
month_timezone
GROUP BY 1;
time | sum
------------------------------+-------
Sat Dec 31 15:00:00 2022 PST |
Tue Jan 31 15:00:00 2023 PST |
Tue Feb 28 15:00:00 2023 PST | 3.123
Fri Mar 31 15:00:00 2023 PDT | 4.123
Sun Apr 30 15:00:00 2023 PDT | 5.123
Wed May 31 15:00:00 2023 PDT |
Fri Jun 30 15:00:00 2023 PDT |
(7 rows)

DROP TABLE month_timezone;
Loading

0 comments on commit 65ecda9

Please sign in to comment.