Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Align gapfill bucket generation with time_bucket #6155

Merged
merged 1 commit into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .unreleased/PR_6155
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixes: #6155 Align gapfill bucket generation with time_bucket
22 changes: 16 additions & 6 deletions tsl/src/nodes/gapfill/gapfill_exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -640,15 +640,15 @@ gapfill_advance_timestamp(GapFillState *state)
{
case DATEOID:
next = DirectFunctionCall2(date_pl_interval,
DateADTGetDatum(state->next_timestamp),
IntervalPGetDatum(state->gapfill_interval));
DateADTGetDatum(state->gapfill_start),
IntervalPGetDatum(state->next_offset));
Comment on lines -643 to +644
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am slightly unsure what was causing the previous issue, but it looks like this approach suffer from the same problem as the previous one, or I might be misunderstanding what difference it makes. Was the original problem due to a precision issue, or something else?

Don't you want to do a computation along the lines gapfill_start + n * gapfill_interval?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previous implementation depended on previous timestamp to calculate the next one. This becomes a problem in case of variable bucket sizes like months when the bucket lands on the last day of the month (which is a different day every time).

The computation you suggest is the same one I'm doing here but instead of maintaining the position (n), I maintain the offset by incrementing it for every generated bucket.

My implementation seemed simpler in terms of calculating the next timestamp so that's why I went with it.

next = DirectFunctionCall1(timestamp_date, next);
state->next_timestamp = DatumGetDateADT(next);
break;
case TIMESTAMPOID:
next = DirectFunctionCall2(timestamp_pl_interval,
TimestampGetDatum(state->next_timestamp),
IntervalPGetDatum(state->gapfill_interval));
TimestampGetDatum(state->gapfill_start),
IntervalPGetDatum(state->next_offset));
state->next_timestamp = DatumGetTimestamp(next);
break;
case TIMESTAMPTZOID:
Expand All @@ -658,14 +658,22 @@ gapfill_advance_timestamp(GapFillState *state)
*/
next = DirectFunctionCall2(state->have_timezone ? timestamptz_pl_interval :
timestamp_pl_interval,
TimestampTzGetDatum(state->next_timestamp),
IntervalPGetDatum(state->gapfill_interval));
TimestampTzGetDatum(state->gapfill_start),
IntervalPGetDatum(state->next_offset));
state->next_timestamp = DatumGetTimestampTz(next);
break;
default:
state->next_timestamp += state->gapfill_period;
break;
}
/* Advance the interval offset if necessary */
if (state->gapfill_interval)
{
Datum tspan = DirectFunctionCall2(interval_pl,
IntervalPGetDatum(state->gapfill_interval),
IntervalPGetDatum(state->next_offset));
state->next_offset = DatumGetIntervalP(tspan);
}
}

/*
Expand Down Expand Up @@ -742,6 +750,7 @@ gapfill_begin(CustomScanState *node, EState *estate, int eflags)
state->gapfill_start = align_with_time_bucket(state, get_start_arg(state));
}
state->next_timestamp = state->gapfill_start;
state->next_offset = state->gapfill_interval;

/* gap fill end */
if (is_const_null(get_finish_arg(state)))
Expand Down Expand Up @@ -938,6 +947,7 @@ gapfill_state_reset_group(GapFillState *state, TupleTableSlot *slot)
break;
}
}
state->next_offset = state->gapfill_interval;
}

/*
Expand Down
2 changes: 2 additions & 0 deletions tsl/src/nodes/gapfill/gapfill_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ typedef struct GapFillState
Interval *gapfill_interval;

int64 next_timestamp;
/* interval offset for next_timestamp from gapfill_start */
Interval *next_offset;
int64 subslot_time; /* time of tuple in subslot */

int time_index; /* position of time column */
Expand Down
52 changes: 40 additions & 12 deletions tsl/test/shared/expected/gapfill-13.out
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ QUERY PLAN
-> Seq Scan on _hyper_X_X_chunk
(8 rows)

DROP TABLE gapfill_plan_test;
\set METRICS metrics_int
-- All test against table :METRICS first
\set ON_ERROR_STOP 0
Expand Down Expand Up @@ -1579,6 +1580,7 @@ SELECT * FROM gapfill_insert_test;
4
(4 rows)

DROP TABLE gapfill_insert_test;
-- test join
SELECT t1.*,t2.m FROM
(
Expand Down Expand Up @@ -3292,11 +3294,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
time_bucket_gapfill
Fri Dec 31 15:00:00 1999 PST
Tue Feb 29 15:00:00 2000 PST
Sat Apr 29 15:00:00 2000 PDT
Thu Jun 29 15:00:00 2000 PDT
Tue Aug 29 15:00:00 2000 PDT
Sun Oct 29 15:00:00 2000 PST
Fri Dec 29 15:00:00 2000 PST
Sun Apr 30 15:00:00 2000 PDT
Fri Jun 30 15:00:00 2000 PDT
Thu Aug 31 15:00:00 2000 PDT
Tue Oct 31 15:00:00 2000 PST
Sun Dec 31 15:00:00 2000 PST
(7 rows)

SELECT time_bucket_gapfill('2 month'::interval, ts, current_setting('timezone'), '2000-01-01','2001-01-01') FROM (VALUES ('2000-03-01'::timestamptz)) v(ts) GROUP BY 1;
Expand All @@ -3313,11 +3315,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'UTC', '2000-01-01','2001-01
time_bucket_gapfill
Fri Dec 31 16:00:00 1999 PST
Tue Feb 29 16:00:00 2000 PST
Sat Apr 29 16:00:00 2000 PDT
Thu Jun 29 16:00:00 2000 PDT
Tue Aug 29 16:00:00 2000 PDT
Sun Oct 29 16:00:00 2000 PST
Fri Dec 29 16:00:00 2000 PST
Sun Apr 30 16:00:00 2000 PDT
Fri Jun 30 16:00:00 2000 PDT
Thu Aug 31 16:00:00 2000 PDT
Tue Oct 31 16:00:00 2000 PST
Sun Dec 31 16:00:00 2000 PST
(7 rows)

SET timezone TO 'Europe/Berlin';
Expand All @@ -3332,14 +3334,13 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
(6 rows)

RESET timezone;
DROP INDEX gapfill_plan_test_indx;
-- Test gapfill with arrays (#5981)
SELECT time_bucket_gapfill(5, ts, 1, 100) as ts, int_arr, locf(last(value, ts))
FROM (
SELECT ARRAY[1,2,3,4]::int[] as int_arr, x as ts, x+500000 as value
FROM generate_series(1, 10, 100) as x
) t
GROUP BY 1, 2
GROUP BY 1, 2;
ts | int_arr | locf
----+-----------+--------
0 | {1,2,3,4} | 500001
Expand All @@ -3364,3 +3365,30 @@ GROUP BY 1, 2
95 | {1,2,3,4} | 500001
(20 rows)

-- Test gapfill is aligned with non-gapfill time_bucket
-- when using different timezones and month bucketing
CREATE TABLE month_timezone(time timestamptz NOT NULL, value float);
SELECT table_name FROM create_hypertable('month_timezone','time');
table_name
month_timezone
(1 row)

INSERT INTO month_timezone VALUES ('2023-03-01 14:05:00+01', 3.123), ('2023-04-01 14:05:00+01',4.123), ('2023-05-01 14:05:00+01', 5.123);
SELECT
time_bucket_gapfill('1 month'::interval, time, 'Europe/Berlin', '2023-01-01', '2023-07-01') AS time,
sum(value)
FROM
month_timezone
GROUP BY 1;
time | sum
------------------------------+-------
Sat Dec 31 15:00:00 2022 PST |
Tue Jan 31 15:00:00 2023 PST |
Tue Feb 28 15:00:00 2023 PST | 3.123
Fri Mar 31 15:00:00 2023 PDT | 4.123
Sun Apr 30 15:00:00 2023 PDT | 5.123
Wed May 31 15:00:00 2023 PDT |
Fri Jun 30 15:00:00 2023 PDT |
(7 rows)

DROP TABLE month_timezone;
52 changes: 40 additions & 12 deletions tsl/test/shared/expected/gapfill-14.out
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ QUERY PLAN
-> Seq Scan on _hyper_X_X_chunk
(8 rows)

DROP TABLE gapfill_plan_test;
\set METRICS metrics_int
-- All test against table :METRICS first
\set ON_ERROR_STOP 0
Expand Down Expand Up @@ -1579,6 +1580,7 @@ SELECT * FROM gapfill_insert_test;
4
(4 rows)

DROP TABLE gapfill_insert_test;
-- test join
SELECT t1.*,t2.m FROM
(
Expand Down Expand Up @@ -3292,11 +3294,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
time_bucket_gapfill
Fri Dec 31 15:00:00 1999 PST
Tue Feb 29 15:00:00 2000 PST
Sat Apr 29 15:00:00 2000 PDT
Thu Jun 29 15:00:00 2000 PDT
Tue Aug 29 15:00:00 2000 PDT
Sun Oct 29 15:00:00 2000 PST
Fri Dec 29 15:00:00 2000 PST
Sun Apr 30 15:00:00 2000 PDT
Fri Jun 30 15:00:00 2000 PDT
Thu Aug 31 15:00:00 2000 PDT
Tue Oct 31 15:00:00 2000 PST
Sun Dec 31 15:00:00 2000 PST
(7 rows)

SELECT time_bucket_gapfill('2 month'::interval, ts, current_setting('timezone'), '2000-01-01','2001-01-01') FROM (VALUES ('2000-03-01'::timestamptz)) v(ts) GROUP BY 1;
Expand All @@ -3313,11 +3315,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'UTC', '2000-01-01','2001-01
time_bucket_gapfill
Fri Dec 31 16:00:00 1999 PST
Tue Feb 29 16:00:00 2000 PST
Sat Apr 29 16:00:00 2000 PDT
Thu Jun 29 16:00:00 2000 PDT
Tue Aug 29 16:00:00 2000 PDT
Sun Oct 29 16:00:00 2000 PST
Fri Dec 29 16:00:00 2000 PST
Sun Apr 30 16:00:00 2000 PDT
Fri Jun 30 16:00:00 2000 PDT
Thu Aug 31 16:00:00 2000 PDT
Tue Oct 31 16:00:00 2000 PST
Sun Dec 31 16:00:00 2000 PST
(7 rows)

SET timezone TO 'Europe/Berlin';
Expand All @@ -3332,14 +3334,13 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
(6 rows)

RESET timezone;
DROP INDEX gapfill_plan_test_indx;
-- Test gapfill with arrays (#5981)
SELECT time_bucket_gapfill(5, ts, 1, 100) as ts, int_arr, locf(last(value, ts))
FROM (
SELECT ARRAY[1,2,3,4]::int[] as int_arr, x as ts, x+500000 as value
FROM generate_series(1, 10, 100) as x
) t
GROUP BY 1, 2
GROUP BY 1, 2;
ts | int_arr | locf
----+-----------+--------
0 | {1,2,3,4} | 500001
Expand All @@ -3364,3 +3365,30 @@ GROUP BY 1, 2
95 | {1,2,3,4} | 500001
(20 rows)

-- Test gapfill is aligned with non-gapfill time_bucket
-- when using different timezones and month bucketing
CREATE TABLE month_timezone(time timestamptz NOT NULL, value float);
SELECT table_name FROM create_hypertable('month_timezone','time');
table_name
month_timezone
(1 row)

INSERT INTO month_timezone VALUES ('2023-03-01 14:05:00+01', 3.123), ('2023-04-01 14:05:00+01',4.123), ('2023-05-01 14:05:00+01', 5.123);
SELECT
time_bucket_gapfill('1 month'::interval, time, 'Europe/Berlin', '2023-01-01', '2023-07-01') AS time,
sum(value)
FROM
month_timezone
GROUP BY 1;
time | sum
------------------------------+-------
Sat Dec 31 15:00:00 2022 PST |
Tue Jan 31 15:00:00 2023 PST |
Tue Feb 28 15:00:00 2023 PST | 3.123
Fri Mar 31 15:00:00 2023 PDT | 4.123
Sun Apr 30 15:00:00 2023 PDT | 5.123
Wed May 31 15:00:00 2023 PDT |
Fri Jun 30 15:00:00 2023 PDT |
(7 rows)

DROP TABLE month_timezone;
52 changes: 40 additions & 12 deletions tsl/test/shared/expected/gapfill-15.out
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ QUERY PLAN
-> Seq Scan on _hyper_X_X_chunk
(8 rows)

DROP TABLE gapfill_plan_test;
\set METRICS metrics_int
-- All test against table :METRICS first
\set ON_ERROR_STOP 0
Expand Down Expand Up @@ -1579,6 +1580,7 @@ SELECT * FROM gapfill_insert_test;
4
(4 rows)

DROP TABLE gapfill_insert_test;
-- test join
SELECT t1.*,t2.m FROM
(
Expand Down Expand Up @@ -3292,11 +3294,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
time_bucket_gapfill
Fri Dec 31 15:00:00 1999 PST
Tue Feb 29 15:00:00 2000 PST
Sat Apr 29 15:00:00 2000 PDT
Thu Jun 29 15:00:00 2000 PDT
Tue Aug 29 15:00:00 2000 PDT
Sun Oct 29 15:00:00 2000 PST
Fri Dec 29 15:00:00 2000 PST
Sun Apr 30 15:00:00 2000 PDT
Fri Jun 30 15:00:00 2000 PDT
Thu Aug 31 15:00:00 2000 PDT
Tue Oct 31 15:00:00 2000 PST
Sun Dec 31 15:00:00 2000 PST
(7 rows)

SELECT time_bucket_gapfill('2 month'::interval, ts, current_setting('timezone'), '2000-01-01','2001-01-01') FROM (VALUES ('2000-03-01'::timestamptz)) v(ts) GROUP BY 1;
Expand All @@ -3313,11 +3315,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'UTC', '2000-01-01','2001-01
time_bucket_gapfill
Fri Dec 31 16:00:00 1999 PST
Tue Feb 29 16:00:00 2000 PST
Sat Apr 29 16:00:00 2000 PDT
Thu Jun 29 16:00:00 2000 PDT
Tue Aug 29 16:00:00 2000 PDT
Sun Oct 29 16:00:00 2000 PST
Fri Dec 29 16:00:00 2000 PST
Sun Apr 30 16:00:00 2000 PDT
Fri Jun 30 16:00:00 2000 PDT
Thu Aug 31 16:00:00 2000 PDT
Tue Oct 31 16:00:00 2000 PST
Sun Dec 31 16:00:00 2000 PST
(7 rows)

SET timezone TO 'Europe/Berlin';
Expand All @@ -3332,14 +3334,13 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
(6 rows)

RESET timezone;
DROP INDEX gapfill_plan_test_indx;
-- Test gapfill with arrays (#5981)
SELECT time_bucket_gapfill(5, ts, 1, 100) as ts, int_arr, locf(last(value, ts))
FROM (
SELECT ARRAY[1,2,3,4]::int[] as int_arr, x as ts, x+500000 as value
FROM generate_series(1, 10, 100) as x
) t
GROUP BY 1, 2
GROUP BY 1, 2;
ts | int_arr | locf
----+-----------+--------
0 | {1,2,3,4} | 500001
Expand All @@ -3364,3 +3365,30 @@ GROUP BY 1, 2
95 | {1,2,3,4} | 500001
(20 rows)

-- Test gapfill is aligned with non-gapfill time_bucket
-- when using different timezones and month bucketing
CREATE TABLE month_timezone(time timestamptz NOT NULL, value float);
SELECT table_name FROM create_hypertable('month_timezone','time');
table_name
month_timezone
(1 row)

INSERT INTO month_timezone VALUES ('2023-03-01 14:05:00+01', 3.123), ('2023-04-01 14:05:00+01',4.123), ('2023-05-01 14:05:00+01', 5.123);
SELECT
time_bucket_gapfill('1 month'::interval, time, 'Europe/Berlin', '2023-01-01', '2023-07-01') AS time,
sum(value)
FROM
month_timezone
GROUP BY 1;
time | sum
------------------------------+-------
Sat Dec 31 15:00:00 2022 PST |
Tue Jan 31 15:00:00 2023 PST |
Tue Feb 28 15:00:00 2023 PST | 3.123
Fri Mar 31 15:00:00 2023 PDT | 4.123
Sun Apr 30 15:00:00 2023 PDT | 5.123
Wed May 31 15:00:00 2023 PDT |
Fri Jun 30 15:00:00 2023 PDT |
(7 rows)

DROP TABLE month_timezone;
Loading
Loading