Add functions for determining compression defaults

Add functions to help determine defaults for segment_by and order_by.
fabriziomello · Jan 22, 2024 · e89bc24 · e89bc24
1 parent 2ab2a4f
commit e89bc24
Show file tree

Hide file tree

Showing 9 changed files with 802 additions and 5 deletions.
diff --git a/.unreleased/pr_6386 b/.unreleased/pr_6386
@@ -0,0 +1 @@
+Implements: #6386 Add functions for determining compression defaults
diff --git a/cmake/ScriptFiles.cmake b/cmake/ScriptFiles.cmake
@@ -54,7 +54,8 @@ set(SOURCE_FILES
     cagg_utils.sql
     cagg_migrate.sql
     job_error_log_retention.sql
-    osm_api.sql)
+    osm_api.sql
+    compression_defaults.sql)
 
 if(ENABLE_DEBUG_UTILS AND CMAKE_BUILD_TYPE MATCHES Debug)
   list(APPEND SOURCE_FILES debug_build_utils.sql)

diff --git a/sql/compression_defaults.sql b/sql/compression_defaults.sql
@@ -0,0 +1,300 @@
+-- This file and its contents are licensed under the Apache License 2.0.
+-- Please see the included NOTICE for copyright information and
+-- LICENSE-APACHE for a copy of the license.
+
+
+-- This function return a jsonb with the following keys:
+-- - columns: an array of column names that shold be used for segment by
+-- - confidence: a number between 0 and 10 (most confident) indicating how sure we are.
+-- - message: a message that should be displayed to the user to evaluate the result.
+CREATE OR REPLACE FUNCTION _timescaledb_functions.get_segmentby_defaults(
+    relation regclass
+)
+    RETURNS JSONB LANGUAGE PLPGSQL AS
+$BODY$
+DECLARE
+    _table_name NAME;
+    _schema_name NAME;
+    _hypertable_row _timescaledb_catalog.hypertable;
+    _segmentby NAME;
+    _cnt int;
+BEGIN
+    SELECT n.nspname, c.relname INTO STRICT _schema_name, _table_name
+    FROM pg_class c
+    INNER JOIN pg_namespace n ON (n.oid = c.relnamespace)
+    WHERE c.oid = relation;
+
+    SELECT * INTO STRICT _hypertable_row FROM _timescaledb_catalog.hypertable h WHERE h.table_name = _table_name AND h.schema_name = schema_name;
+
+    --STEP 1 if column stats exist use unique indexes. Pick the column that comes first in any such indexes. Ties are broken arbitrarily.
+    --Note: this will only pick a column that is NOT unique in a multi-column unique index.
+    with index_attr as (
+        SELECT
+        a.attnum, min(a.pos) as pos
+        FROM
+            (select indkey, indnkeyatts from pg_catalog.pg_index where indisunique and indrelid = relation) i
+        INNER JOIN LATERAL
+            (select * from unnest(i.indkey) with ordinality) a(attnum, pos) ON (TRUE)
+        WHERE a.pos <= i.indnkeyatts
+        GROUP BY 1
+    )
+    SELECT
+      a.attname INTO _segmentby
+    FROM
+      index_attr i
+    INNER JOIN
+      pg_attribute a on (a.attnum = i.attnum AND a.attrelid = relation)
+    --right now stats are from the hypertable itself. Use chunks in the future.
+    INNER JOIN pg_statistic s ON (s.staattnum = a.attnum and s.starelid = relation)
+    WHERE
+      a.attname NOT IN (SELECT column_name FROM _timescaledb_catalog.dimension d WHERE d.hypertable_id = _hypertable_row.id)
+      AND s.stadistinct > 1
+    ORDER BY i.pos
+    LIMIT 1;
+
+    IF FOUND THEN
+        return json_build_object('columns', json_build_array(_segmentby), 'confidence', 10);
+    END IF;
+
+
+    --STEP 2 if column stats exist and no unique indexes use non-unique indexes. Pick the column that comes first in any such indexes. Ties are broken arbitrarily.
+    with index_attr as (
+        SELECT
+        a.attnum, min(a.pos) as pos
+        FROM
+            (select indkey, indnkeyatts from pg_catalog.pg_index where NOT indisunique and indrelid = relation) i
+        INNER JOIN LATERAL
+            (select * from unnest(i.indkey) with ordinality) a(attnum, pos) ON (TRUE)
+        WHERE a.pos <= i.indnkeyatts
+        GROUP BY 1
+    )
+    SELECT
+      a.attname INTO _segmentby
+    FROM
+      index_attr i
+    INNER JOIN
+      pg_attribute a on (a.attnum = i.attnum AND a.attrelid = relation)
+    --right now stats are from the hypertable itself. Use chunks in the future.
+    INNER JOIN pg_statistic s ON (s.staattnum = a.attnum and s.starelid = relation)
+    WHERE
+      a.attname NOT IN (SELECT column_name FROM _timescaledb_catalog.dimension d WHERE d.hypertable_id = _hypertable_row.id)
+      AND s.stadistinct > 1
+    ORDER BY i.pos
+    LIMIT 1;
+
+    IF FOUND THEN
+        return json_build_object('columns', json_build_array(_segmentby), 'confidence', 8);
+    END IF;
+
+    --STEP 3 if column stats do not exist use non-unique indexes. Pick the column that comes first in any such indexes. Ties are broken arbitrarily.
+    with index_attr as (
+        SELECT
+        a.attnum, min(a.pos) as pos
+        FROM
+            (select indkey, indnkeyatts from pg_catalog.pg_index where NOT indisunique and indrelid = relation) i
+        INNER JOIN LATERAL
+            (select * from unnest(i.indkey) with ordinality) a(attnum, pos) ON (TRUE)
+        WHERE a.pos <= i.indnkeyatts
+        GROUP BY 1
+    )
+    SELECT
+      a.attname INTO _segmentby
+    FROM
+      index_attr i
+    INNER JOIN
+      pg_attribute a on (a.attnum = i.attnum AND a.attrelid = relation)
+    LEFT JOIN
+      pg_catalog.pg_attrdef ad ON (ad.adrelid = relation AND ad.adnum = a.attnum)
+    LEFT JOIN
+      pg_statistic s ON (s.staattnum = a.attnum and s.starelid = relation)
+    WHERE
+      a.attname NOT IN (SELECT column_name FROM _timescaledb_catalog.dimension d WHERE d.hypertable_id = _hypertable_row.id)
+      AND s.stadistinct is null
+      AND a.attidentity = '' AND (ad.adbin IS NULL OR pg_get_expr(adbin, adrelid) not like 'nextval%')
+    ORDER BY i.pos
+    LIMIT 1;
+
+    IF FOUND THEN
+        return json_build_object(
+            'columns', json_build_array(_segmentby),
+            'confidence', 5,
+            'message',  'Please make sure '|| _segmentby||' is not a unique column and appropriate for a segment by');
+    END IF;
+
+    --STEP 4 if column stats do not exist and no non-unique indexes, use unique indexes. Pick the column that comes first in any such indexes. Ties are broken arbitrarily.
+    with index_attr as (
+        SELECT
+        a.attnum, min(a.pos) as pos
+        FROM
+            (select indkey, indnkeyatts from pg_catalog.pg_index where indisunique and indrelid = relation) i
+        INNER JOIN LATERAL
+            (select * from unnest(i.indkey) with ordinality) a(attnum, pos) ON (TRUE)
+        WHERE a.pos <= i.indnkeyatts
+        GROUP BY 1
+    )
+    SELECT
+      a.attname INTO _segmentby
+    FROM
+      index_attr i
+    INNER JOIN
+      pg_attribute a on (a.attnum = i.attnum AND a.attrelid = relation)
+    LEFT JOIN
+      pg_catalog.pg_attrdef ad ON (ad.adrelid = relation AND ad.adnum = a.attnum)
+    LEFT JOIN
+      pg_statistic s ON (s.staattnum = a.attnum and s.starelid = relation)
+    WHERE
+      a.attname NOT IN (SELECT column_name FROM _timescaledb_catalog.dimension d WHERE d.hypertable_id = _hypertable_row.id)
+      AND s.stadistinct is null
+      AND a.attidentity = '' AND (ad.adbin IS NULL OR pg_get_expr(adbin, adrelid) not like 'nextval%')
+    ORDER BY i.pos
+    LIMIT 1;
+
+    IF FOUND THEN
+            return json_build_object(
+            'columns', json_build_array(_segmentby),
+            'confidence', 5,
+            'message',  'Please make sure '|| _segmentby||' is not a unique column and appropriate for a segment by');
+    END IF;
+
+
+    --are there any indexed columns that are not dimemsions and are not serial/identity?
+    with index_attr as (
+        SELECT
+        a.attnum, min(a.pos) as pos
+        FROM
+            (select indkey, indnkeyatts from pg_catalog.pg_index where indisunique and indrelid = relation) i
+        INNER JOIN LATERAL
+            (select * from unnest(i.indkey) with ordinality) a(attnum, pos) ON (TRUE)
+        WHERE a.pos <= i.indnkeyatts
+        GROUP BY 1
+    )
+    SELECT
+      count(*) INTO STRICT _cnt
+    FROM
+      index_attr i
+    INNER JOIN
+      pg_attribute a on (a.attnum = i.attnum AND a.attrelid = relation)
+    LEFT JOIN
+      pg_catalog.pg_attrdef ad ON (ad.adrelid = relation AND ad.adnum = a.attnum)
+    WHERE
+      a.attname NOT IN (SELECT column_name FROM _timescaledb_catalog.dimension d WHERE d.hypertable_id = _hypertable_row.id)
+      AND a.attidentity = '' AND (ad.adbin IS NULL OR pg_get_expr(adbin, adrelid) not like 'nextval%');
+
+    IF _cnt > 0 THEN
+        --there are many potential candidates. We do not have enough information to choose one.
+        return json_build_object(
+            'columns', json_build_array(),
+            'confidence', 0,
+            'message',  'Several columns are potential segment by candidates and we do not have enough information to choose one. Please use the segment_by option to explicitly specify the segment_by column');
+    ELSE
+        --there are no potential candidates. There is a good chance no segment by is the correct choice.
+        return json_build_object(
+            'columns', json_build_array(),
+            'confidence', 5,
+            'message',  'You do not have any indexes on columns that can be used for segment_by and thus we are not using segment_by for compression. Please make sure you are not missing any indexes');
+    END IF;
+END
+$BODY$ SET search_path TO pg_catalog, pg_temp;
+
+-- This function return a jsonb with the following keys:
+-- - clauses: an array of column names and sort order key words that shold be used for order by.
+-- - confidence: a number between 0 and 10 (most confident) indicating how sure we are.
+-- - message: a message that should be shown to the user to evaluate the result.
+CREATE OR REPLACE FUNCTION _timescaledb_functions.get_orderby_defaults(
+    relation regclass, segment_by_cols text[]
+)
+    RETURNS JSONB LANGUAGE PLPGSQL AS
+$BODY$
+DECLARE
+    _table_name NAME;
+    _schema_name NAME;
+    _hypertable_row _timescaledb_catalog.hypertable;
+    _orderby_names NAME[];
+    _dimension_names NAME[];
+    _first_index_attrs NAME[];
+    _orderby_clauses text[];
+    _confidence int;
+BEGIN
+    SELECT n.nspname, c.relname INTO STRICT _schema_name, _table_name
+    FROM pg_class c
+    INNER JOIN pg_namespace n ON (n.oid = c.relnamespace)
+    WHERE c.oid = relation;
+
+    SELECT * INTO STRICT _hypertable_row FROM _timescaledb_catalog.hypertable h WHERE h.table_name = _table_name AND h.schema_name = schema_name;
+
+    --start with the unique index columns minus the segment by columns
+    with index_attr as (
+        SELECT
+        a.attnum, min(a.pos) as pos
+        FROM
+             --is there a better way to pick the right unique index if there are multiple?
+            (select indkey, indnkeyatts from pg_catalog.pg_index where indisunique and indrelid = relation limit 1) i
+        INNER JOIN LATERAL
+            (select * from unnest(i.indkey) with ordinality) a(attnum, pos) ON (TRUE)
+        WHERE a.pos <= i.indnkeyatts
+        GROUP BY 1
+    )
+    SELECT
+      array_agg(a.attname ORDER BY i.pos) INTO _orderby_names
+    FROM
+      index_attr i
+    INNER JOIN
+      pg_attribute a on (a.attnum = i.attnum AND a.attrelid = relation)
+    WHERE
+      NOT(a.attname::text = ANY (segment_by_cols));
+
+    if _orderby_names is null then
+        _orderby_names := array[]::name[];
+        _confidence := 5;
+    else
+        _confidence := 8;
+    end if;
+
+    --add dimension colomns to the end. A dimension column like time should probably always be part of the order by.
+    SELECT
+      array_agg(d.column_name) INTO _dimension_names
+    FROM _timescaledb_catalog.dimension d
+    WHERE
+      d.hypertable_id = _hypertable_row.id
+      AND NOT(d.column_name::text = ANY (_orderby_names))
+      AND NOT(d.column_name::text = ANY (segment_by_cols));
+    _orderby_names := _orderby_names || _dimension_names;
+
+    --add the first attribute of any index
+    with index_attr as (
+        SELECT
+        a.attnum, min(a.pos) as pos
+        FROM
+            (select indkey, indnkeyatts from pg_catalog.pg_index where indrelid = relation) i
+        INNER JOIN LATERAL
+            (select * from unnest(i.indkey) with ordinality) a(attnum, pos) ON (TRUE)
+        WHERE a.pos = 1
+        GROUP BY 1
+    )
+    SELECT
+      array_agg(a.attname ORDER BY i.pos) INTO _first_index_attrs
+    FROM
+      index_attr i
+    INNER JOIN
+      pg_attribute a on (a.attnum = i.attnum AND a.attrelid = relation)
+    WHERE
+          NOT(a.attname::text = ANY (_orderby_names))
+      AND NOT(a.attname::text = ANY (segment_by_cols));
+
+    _orderby_names := _orderby_names || _first_index_attrs;
+
+    --add DESC to any dimensions
+    SELECT
+      array_agg(
+      CASE WHEN d.column_name IS NULL THEN
+        a.colname
+      ELSE
+        a.colname || ' DESC'
+      END ORDER BY pos) INTO STRICT _orderby_clauses
+    FROM unnest(_orderby_names) WITH ORDINALITY as a(colname, pos)
+    LEFT JOIN _timescaledb_catalog.dimension d ON (d.column_name = a.colname AND d.hypertable_id = _hypertable_row.id);
+
+
+    return json_build_object('clauses', _orderby_clauses, 'confidence', _confidence);
+END
+$BODY$ SET search_path TO pg_catalog, pg_temp;
diff --git a/sql/updates/reverse-dev.sql b/sql/updates/reverse-dev.sql
@@ -768,3 +768,5 @@ CREATE FUNCTION _timescaledb_functions.hypertable_constraint_add_table_fk_constr
 CREATE FUNCTION _timescaledb_functions.chunks_in(record RECORD, chunks INTEGER[]) RETURNS BOOL
 AS 'BEGIN END' LANGUAGE PLPGSQL SET search_path TO pg_catalog,pg_temp;
 
+DROP FUNCTION IF EXISTS _timescaledb_functions.get_orderby_defaults(regclass,text[]);
+DROP FUNCTION IF EXISTS _timescaledb_functions.get_segmentby_defaults(regclass);
diff --git a/tsl/src/compression/README.md b/tsl/src/compression/README.md
@@ -69,11 +69,11 @@ compression mechanism works. It can store any type of data.
 
 # Merging chunks while compressing #
 
-## Setup ## 
+## Setup ##
 
 Chunks will be merged during compression if we specify the `compress_chunk_time_interval` parameter.
 This value will be used to merge chunks adjacent on the time dimension if possible. This allows usage
-of smaller chunk intervals which are rolled into bigger compressed chunks. 
+of smaller chunk intervals which are rolled into bigger compressed chunks.
 
 ## Operation ##
 
@@ -84,9 +84,49 @@ and chunk interval can be increased not to go over compress chunk time interval)
 After compression completes, catalog is updated by dropping the compressed chunk and increasing the chunk
 interval of the adjacent chunk to include its time dimension slice. Chunk constraints are updated as necessary.
 
-## Compression setup where time dimension is not the first column on order by ## 
+## Compression setup where time dimension is not the first column on order by ##
 
 When merging such chunks, due to the nature of sequence number ordering, we will inherently be left with
-chunks where the sequence numbers are not correctly ordered. In order to mitigate this issue, chunks are 
+chunks where the sequence numbers are not correctly ordered. In order to mitigate this issue, chunks are
 recompressed immediately. This has obvious performance implications which might make merging chunks
 not optimal for certain setups.
+
+# Picking default for `segment_by` and `order_by`.
+
+We have two functions to determine the columns for `timescaledb.compress_segmentby` and `timescaledb.compress_orderby` . These functions can be called
+by the UI to give good defaults. They can also be called internally when a hypertable has compression enabled
+but no values are provided to specify these options.
+
+## `_timescaledb_functions.get_segmentby_defaults`
+
+This function determines a segment-by column to use. It returns a JSONB with the following top-level keys:
+- columns: an array of column names that should be used for segment by. Right now it always returns a single column.
+- confidence: a number between 0 and 10 (most confident) indicating how sure we are.
+- message: a message that should be shown to the user to evaluate the result.
+
+The intuition is as follows:
+
+we use 2 criterias:
+- We want to pick an "important" column for querying. We measure "importance", in terms of how early the column comes in an index (i.e. leading columns are very important, others less so).
+- The column has many rows for the same column value so that the segments will have many rows. We establish that a column will have many values if (i) it is not a dimension and (ii) either statistics tell us so (via `stadistinct` > 1) or, if statistics aren't populated, we check whether the column is a generated identity or serial column.
+
+Naturally, statistics give us more confidence that the column has enough rows per segment. In this case we break ties by preferring columns from unique indexes. Otherwise, we prefer columns from non-unique indexes (we are less likely to run into a unique column there).
+
+Thus, our preference is based on the whether the column is from a unique or regular index as well as the position of the column in the index. Given these preferences, we think ties happened rarely but will be resolved arbitrarily.
+
+One final point: a number of tables don't have any indexed columns that aren't dimensions or serial columns. In this case, we have medium confidence that an empty segment by is correct.
+
+## `_timescaledb_functions.get_orderby_defaults`
+
+This function determines which order by columns to use. It returns a JSONB with the following top-level keys:
+
+- clauses: an array of column names and sort order key words that shold be used for order by.
+- confidence: a number between 0 and 10 (most confident) indicating how sure we are.
+- message: a message that should be shown to the user to evaluate the result.
+
+The order by is built in three steps:
+1) Use the column order in a unique index (removing the segment_by columns).
+2) Add any dimension columns
+3) Add the first attribute of any other index (to establish min-max filters on those columns).
+
+All non-dimension columns are returned without a sort specifier (thus using `ASC` as default). The dimension columns use `DESC`.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Implements: #6386 Add functions for determining compression defaults