From c167a6b306dcfeaf3956ffa45517f4471f6e5750 Mon Sep 17 00:00:00 2001 From: Mats Kindahl Date: Thu, 28 Sep 2023 17:22:03 +0200 Subject: [PATCH] Restart scheduler on error If the scheduler receives an error, it will never restart again since `bgw_restart_time` is set to `BGW_NEVER_RESTART`, which will prevent all jobs from executing. This commit adds the GUC `timescaledb.bgw_scheduler_restart_time` that can be set to the restart time for the scheduler. It defaults to 60 seconds, which is the default restart interval for background workers defined by PostgreSQL. It also adds `timescaledb.debug_bgw_scheduler_exit_status` to be able to shut down the scheduler with a non-zero exit status, which allows the restart functionality to be tested. It also ensures that `backend_type` is explicitly set up rather than copied from `application_name` and adds some more information to `application_name`. It also updates the tests to use `backend_type` where applicable. To avoid exhausting slots when the launcher restarts, it will kill all existing schedulers and start new ones. 
--- .unreleased/pr_7527 | 1 + src/bgw/scheduler.c | 1 + src/compat/compat.h | 6 + src/guc.c | 27 ++++ src/guc.h | 8 ++ src/loader/bgw_launcher.c | 129 ++++++++++++++++++-- src/loader/bgw_launcher.h | 7 +- src/loader/loader.c | 2 +- test/expected/bgw_launcher.out | 42 ++++--- test/sql/bgw_launcher.sql | 20 +-- test/sql/include/bgw_launcher_utils.sql | 11 +- tsl/test/expected/bgw_scheduler_restart.out | 85 +++++++++++++ tsl/test/sql/CMakeLists.txt | 1 + tsl/test/sql/bgw_scheduler_restart.sql | 41 +++++++ 14 files changed, 334 insertions(+), 47 deletions(-) create mode 100644 .unreleased/pr_7527 create mode 100644 tsl/test/expected/bgw_scheduler_restart.out create mode 100644 tsl/test/sql/bgw_scheduler_restart.sql diff --git a/.unreleased/pr_7527 b/.unreleased/pr_7527 new file mode 100644 index 00000000000..534b8bec629 --- /dev/null +++ b/.unreleased/pr_7527 @@ -0,0 +1 @@ +Fixes: #7527 Restart scheduler on error diff --git a/src/bgw/scheduler.c b/src/bgw/scheduler.c index d44fc59d433..b11d11d43b7 100644 --- a/src/bgw/scheduler.c +++ b/src/bgw/scheduler.c @@ -879,6 +879,7 @@ ts_bgw_scheduler_process(int32 run_for_interval_ms, wait_for_all_jobs_to_shutdown(); check_for_stopped_and_timed_out_jobs(); scheduled_jobs = NIL; + proc_exit(ts_debug_bgw_scheduler_exit_status); } static void diff --git a/src/compat/compat.h b/src/compat/compat.h index 298f74fb716..58654de9f9e 100644 --- a/src/compat/compat.h +++ b/src/compat/compat.h @@ -655,6 +655,12 @@ RelationGetSmgr(Relation rel) GenerationContextCreate(parent, name, blockSize) #endif +#if PG16_GE +#define pgstat_get_local_beentry_by_index_compat(idx) pgstat_get_local_beentry_by_index(idx) +#else +#define pgstat_get_local_beentry_by_index_compat(idx) pgstat_fetch_stat_local_beentry(idx) +#endif + /* * PG16 adds a new parameter to DefineIndex, total_parts, that takes * in the total number of direct and indirect partitions of the relation. 
diff --git a/src/guc.c b/src/guc.c index 861760c17e2..1c17b83a7af 100644 --- a/src/guc.c +++ b/src/guc.c @@ -183,6 +183,20 @@ bool ts_guc_debug_require_batch_sorted_merge = false; bool ts_guc_debug_allow_cagg_with_deprecated_funcs = false; +/* + * Exit code for the scheduler. + * + * Normally it exits with a zero which means that it will not restart. If an + * error is raised, it exits with error code 1, which will trigger a + * restart. + * + * This variable exists to be able to trigger a restart for a normal exit, + * which is useful when debugging. + * + * See backend/postmaster/bgworker.c + */ +int ts_debug_bgw_scheduler_exit_status = 0; + #ifdef TS_DEBUG bool ts_shutdown_bgw = false; char *ts_current_timestamp_mock = NULL; @@ -1067,6 +1081,19 @@ _guc_init(void) /* assign_hook= */ NULL, /* show_hook= */ NULL); + DefineCustomIntVariable(/* name= */ MAKE_EXTOPTION("debug_bgw_scheduler_exit_status"), + /* short_desc= */ "exit status to use when shutting down the scheduler", + /* long_desc= */ "this is for debugging purposes", + /* valueAddr= */ &ts_debug_bgw_scheduler_exit_status, + /* bootValue= */ 0, + /* minValue= */ 0, + /* maxValue= */ 255, + /* context= */ PGC_SIGHUP, + /* flags= */ 0, + /* check_hook= */ NULL, + /* assign_hook= */ NULL, + /* show_hook= */ NULL); + DefineCustomStringVariable(/* name= */ MAKE_EXTOPTION("current_timestamp_mock"), /* short_desc= */ "set the current timestamp", /* long_desc= */ "this is for debugging purposes", diff --git a/src/guc.h b/src/guc.h index 34ebc0ef2d6..88d4ba18e39 100644 --- a/src/guc.h +++ b/src/guc.h @@ -92,6 +92,14 @@ extern TSDLLEXPORT bool ts_guc_auto_sparse_indexes; extern TSDLLEXPORT bool ts_guc_enable_columnarscan; extern TSDLLEXPORT int ts_guc_bgw_log_level; +/* + * Exit code to use when scheduler exits. + * + * Mostly used for debugging, but defined also for non-debug builds since that + * simplifies the code (and also simplifies debugging non-debug builds). 
+ */ +extern TSDLLEXPORT int ts_debug_bgw_scheduler_exit_status; + #ifdef TS_DEBUG extern bool ts_shutdown_bgw; extern char *ts_current_timestamp_mock; diff --git a/src/loader/bgw_launcher.c b/src/loader/bgw_launcher.c index af2a2cc3e13..9929dcfb31d 100644 --- a/src/loader/bgw_launcher.c +++ b/src/loader/bgw_launcher.c @@ -7,6 +7,7 @@ /* BGW includes below */ /* These are always necessary for a bgworker */ +#include #include #include #include @@ -23,6 +24,8 @@ #include #include #include +#include +#include #include /* and checking db list for whether we're in a template*/ @@ -40,11 +43,15 @@ /* for allocating the htab storage */ #include +#include +#include + /* for getting settings correct before loading the versioned scheduler */ #include "catalog/pg_db_role_setting.h" #include "../compat/compat.h" #include "../extension_constants.h" +#include "../utils.h" #include "bgw_counter.h" #include "bgw_launcher.h" #include "bgw_message_queue.h" @@ -84,6 +91,8 @@ typedef enum SchedulerState static volatile sig_atomic_t got_SIGHUP = false; +int ts_guc_bgw_scheduler_restart_time_sec = BGW_DEFAULT_RESTART_INTERVAL; + static void launcher_sighup(SIGNAL_ARGS) { @@ -124,6 +133,7 @@ typedef struct DbHashEntry } DbHashEntry; static void scheduler_state_trans_enabled_to_allocated(DbHashEntry *entry); +static void scheduler_modify_state(DbHashEntry *entry, SchedulerState new_state); static void bgw_on_postmaster_death(void) @@ -238,13 +248,27 @@ terminate_background_worker(BackgroundWorkerHandle *handle) } extern void -ts_bgw_cluster_launcher_register(void) +ts_bgw_cluster_launcher_init(void) { BackgroundWorker worker; + DefineCustomIntVariable(/* name= */ MAKE_EXTOPTION("bgw_scheduler_restart_time"), + /* short_desc= */ "Restart time for scheduler in seconds", + /* long_desc= */ + "The number of seconds until the scheduler restart on failure.", + /* valueAddr= */ &ts_guc_bgw_scheduler_restart_time_sec, + /* bootValue= */ BGW_DEFAULT_RESTART_INTERVAL, + /* minValue= */ 1, + /* 
maxValue= */ 3600, + /* context= */ PGC_SIGHUP, + /* flags= */ GUC_UNIT_S, + /* check_hook= */ NULL, + /* assign_hook= */ NULL, + /* show_hook= */ NULL); + memset(&worker, 0, sizeof(worker)); /* set up worker settings for our main worker */ - snprintf(worker.bgw_name, BGW_MAXLEN, "TimescaleDB Background Worker Launcher"); + snprintf(worker.bgw_name, BGW_MAXLEN, TS_BGW_TYPE_LAUNCHER); worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; worker.bgw_restart_time = BGW_LAUNCHER_RESTART_TIME_S; @@ -274,9 +298,10 @@ register_entrypoint_for_db(Oid db_id, VirtualTransactionId vxid, BackgroundWorke BackgroundWorker worker; memset(&worker, 0, sizeof(worker)); - snprintf(worker.bgw_name, BGW_MAXLEN, "TimescaleDB Background Worker Scheduler"); + snprintf(worker.bgw_type, BGW_MAXLEN, TS_BGW_TYPE_SCHEDULER); + snprintf(worker.bgw_name, BGW_MAXLEN, "%s for database %u", TS_BGW_TYPE_SCHEDULER, db_id); worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; - worker.bgw_restart_time = BGW_NEVER_RESTART; + worker.bgw_restart_time = ts_guc_bgw_scheduler_restart_time_sec; worker.bgw_start_time = BgWorkerStart_RecoveryFinished; snprintf(worker.bgw_library_name, BGW_MAXLEN, EXTENSION_NAME); snprintf(worker.bgw_function_name, BGW_MAXLEN, BGW_ENTRYPOINT_FUNCNAME); @@ -332,15 +357,89 @@ db_hash_entry_create_if_not_exists(HTAB *db_htab, Oid db_oid) return db_he; } +/* + * Result from signalling a backend. + * + * Error codes are non-zero, and success is zero. + */ +enum SignalBackendResult +{ + SIGNAL_BACKEND_SUCCESS = 0, + SIGNAL_BACKEND_ERROR, + SIGNAL_BACKEND_NOPERMISSION, + SIGNAL_BACKEND_NOSUPERUSER, +}; + +/* + * Terminate a background worker. + * + * This is copied from pg_signal_backend() in + * src/backend/storage/ipc/signalfuncs.c but tweaked to not require a database + * connection since the launcher does not have one. 
+ */ +static enum SignalBackendResult +ts_signal_backend(int pid, int sig) +{ + PGPROC *proc = BackendPidGetProc(pid); + + if (unlikely(proc == NULL)) + { + ereport(WARNING, (errmsg("PID %d is not a PostgreSQL backend process", pid))); + return SIGNAL_BACKEND_ERROR; + } + + if (unlikely(kill(pid, sig))) + { + /* Again, just a warning to allow loops */ + ereport(WARNING, (errmsg("could not send signal to process %d: %m", pid))); + return SIGNAL_BACKEND_ERROR; + } + + return SIGNAL_BACKEND_SUCCESS; +} + +/* + * Terminate backends by backend type. + * + * We iterate through all backends and mark those that match the given backend + * type as terminated. + * + * Note that there is potentially a delay between marking backends as + * terminated and their actual termination, so the backends have to be able to + * run even if there are multiple instances accessing the same data. + * + * Parts of this code are taken from pg_stat_get_activity() in + * src/backend/utils/adt/pgstatfuncs.c. + */ +static void +terminate_backends_by_backend_type(const char *backend_type) +{ + Assert(backend_type); + + const int num_backends = pgstat_fetch_stat_numbackends(); + for (int curr_backend = 1; curr_backend <= num_backends; ++curr_backend) + { + const LocalPgBackendStatus *local_beentry = + pgstat_get_local_beentry_by_index_compat(curr_backend); + const PgBackendStatus *beentry = &local_beentry->backendStatus; + const char *bgw_type = GetBackgroundWorkerTypeByPid(beentry->st_procpid); + if (bgw_type && strcmp(backend_type, bgw_type) == 0) + { + int error = ts_signal_backend(beentry->st_procpid, SIGTERM); + if (error) + elog(LOG, "failed to terminate backend with pid %d", beentry->st_procpid); + } + } +} + /* * Model this on autovacuum.c -> get_database_list. 
* - * Note that we are not doing - * all the things around memory context that they do, because the hashtable - * we're using to store db entries is automatically created in its own memory - * context (a child of TopMemoryContext) This can get called at two different - * times 1) when the cluster launcher starts and is looking for dbs and 2) if - * it restarts due to a postmaster signal. + * Note that we are not doing all the things around memory context that they + * do, because the hashtable we're using to store db entries is automatically + * created in its own memory context (a child of TopMemoryContext). This can + * get called at two different times: 1) when the cluster launcher starts and + * is looking for dbs and 2) if it restarts due to a postmaster signal. */ static void populate_database_htab(HTAB *db_htab) @@ -757,6 +856,16 @@ ts_bgw_cluster_launcher_main(PG_FUNCTION_ARGS) db_htab = init_database_htab(); *htab_storage = db_htab; + /* + * If the launcher was restarted and discovers old schedulers, these have + * to be terminated to avoid exhausting the worker slots. + * + * We cannot easily pick up the old schedulers since we do not have access + * to the slots array in PostgreSQL, so instead we scan for something that + * looks like schedulers for databases, and kill them. New ones will then + * be spawned below. 
+ */ + terminate_backends_by_backend_type(TS_BGW_TYPE_SCHEDULER); populate_database_htab(db_htab); while (true) diff --git a/src/loader/bgw_launcher.h b/src/loader/bgw_launcher.h index f90a65cb3a9..82c2ec1893b 100644 --- a/src/loader/bgw_launcher.h +++ b/src/loader/bgw_launcher.h @@ -8,7 +8,12 @@ #include #include -extern void ts_bgw_cluster_launcher_register(void); +#define TS_BGW_TYPE_LAUNCHER "TimescaleDB Background Worker Launcher" +#define TS_BGW_TYPE_SCHEDULER "TimescaleDB Background Worker Scheduler" + +extern int ts_guc_bgw_scheduler_restart_time_sec; + +extern void ts_bgw_cluster_launcher_init(void); /*called by postmaster at launcher bgw startup*/ TSDLLEXPORT extern Datum ts_bgw_cluster_launcher_main(PG_FUNCTION_ARGS); diff --git a/src/loader/loader.c b/src/loader/loader.c index 6537e49d8c5..fdff65236fa 100644 --- a/src/loader/loader.c +++ b/src/loader/loader.c @@ -591,7 +591,7 @@ _PG_init(void) timescaledb_shmem_request_hook(); #endif - ts_bgw_cluster_launcher_register(); + ts_bgw_cluster_launcher_init(); ts_bgw_counter_setup_gucs(); ts_bgw_interface_register_api_version(); diff --git a/test/expected/bgw_launcher.out b/test/expected/bgw_launcher.out index 84c2b30bf5a..ff8a61559b6 100644 --- a/test/expected/bgw_launcher.out +++ b/test/expected/bgw_launcher.out @@ -21,11 +21,12 @@ CREATE DATABASE :TEST_DBNAME_2; -- Further Note: PG 9.6 changed what appeared in pg_stat_activity, so the launcher doesn't actually show up. -- we can still test its interactions with its children, but can't test some of the things specific to the launcher. -- So we've added some bits about the version number as needed. 
-CREATE VIEW worker_counts as SELECT count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Launcher') as launcher, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler -FROM pg_stat_activity; +CREATE VIEW worker_counts as +SELECT count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Launcher') as launcher, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler + FROM pg_stat_activity; CREATE FUNCTION wait_worker_counts(launcher_ct INTEGER, scheduler1_ct INTEGER, scheduler2_ct INTEGER, template1_ct INTEGER) RETURNS BOOLEAN LANGUAGE PLPGSQL AS $BODY$ DECLARE @@ -103,7 +104,7 @@ SELECT wait_worker_counts(1,0,1,0); -- Now let's restart the scheduler in test db 2 and make sure our backend_start changed SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -- We'll do this in a txn so that we can see that the worker locks on our txn before continuing BEGIN; @@ -122,7 +123,7 @@ SELECT wait_worker_counts(1,0,1,0); SELECT (backend_start > :'orig_backend_start'::timestamptz) backend_start_changed, (wait_event = 'virtualxid') wait_event_changed FROM pg_stat_activity -WHERE 
application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2'; backend_start_changed | wait_event_changed -----------------------+-------------------- @@ -138,7 +139,7 @@ SELECT wait_worker_counts(1,0,1,0); SELECT (wait_event IS DISTINCT FROM 'virtualxid') wait_event_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2'; wait_event_changed -------------------- @@ -187,7 +188,7 @@ SELECT wait_worker_counts(1,0,1,0); -- make sure start is idempotent SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -- Since we're doing idempotency tests, we're also going to exercise our queue and start 20 times SELECT _timescaledb_functions.start_background_workers() as start_background_workers, * FROM generate_series(1,20); @@ -227,7 +228,7 @@ FOR i in 1..5 LOOP SELECT (backend_start = $1::timestamptz) backend_start_unchanged FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = $2 into r; if(r) THEN PERFORM pg_sleep(0.1); @@ -274,7 +275,7 @@ SELECT wait_worker_counts(1,0,1,0); -- Now let's restart the scheduler and make sure our backend_start changed SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset BEGIN; DROP EXTENSION timescaledb; @@ -294,7 +295,7 @@ FOR i in 1..10 LOOP SELECT (backend_start > $1::timestamptz) backend_start_changed FROM pg_stat_activity -WHERE application_name 
= 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = $2 into r; if(NOT r) THEN PERFORM pg_sleep(0.1); @@ -315,9 +316,9 @@ SELECT wait_greater(:'orig_backend_start',:'TEST_DBNAME_2'); -- Make sure canceling the launcher backend causes a restart of schedulers SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'; +SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE backend_type = 'TimescaleDB Background Worker Launcher'; pg_cancel_backend ------------------- t @@ -445,11 +446,12 @@ ALTER ROLE :ROLE_DEFAULT_PERM_USER WITH NOSUPERUSER; -- Further Note: PG 9.6 changed what appeared in pg_stat_activity, so the launcher doesn't actually show up. -- we can still test its interactions with its children, but can't test some of the things specific to the launcher. -- So we've added some bits about the version number as needed. 
-CREATE VIEW worker_counts as SELECT count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Launcher') as launcher, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler -FROM pg_stat_activity; +CREATE VIEW worker_counts as +SELECT count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Launcher') as launcher, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler + FROM pg_stat_activity; CREATE FUNCTION wait_worker_counts(launcher_ct INTEGER, scheduler1_ct INTEGER, scheduler2_ct INTEGER, template1_ct INTEGER) RETURNS BOOLEAN LANGUAGE PLPGSQL AS $BODY$ DECLARE @@ -602,7 +604,7 @@ SELECT _timescaledb_functions.stop_background_workers(); t (1 row) -SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'; +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE backend_type = 'TimescaleDB Background Worker Launcher'; pg_terminate_backend ---------------------- t diff --git a/test/sql/bgw_launcher.sql b/test/sql/bgw_launcher.sql index db3d240c6fb..19a01789667 100644 --- a/test/sql/bgw_launcher.sql +++ b/test/sql/bgw_launcher.sql @@ -33,7 +33,7 @@ SELECT wait_worker_counts(1,0,1,0); -- Now let's restart the scheduler in test db 2 and make sure our backend_start changed SELECT 
backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -- We'll do this in a txn so that we can see that the worker locks on our txn before continuing BEGIN; @@ -43,14 +43,14 @@ SELECT wait_worker_counts(1,0,1,0); SELECT (backend_start > :'orig_backend_start'::timestamptz) backend_start_changed, (wait_event = 'virtualxid') wait_event_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2'; COMMIT; SELECT wait_worker_counts(1,0,1,0); SELECT (wait_event IS DISTINCT FROM 'virtualxid') wait_event_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2'; -- Test stop @@ -68,7 +68,7 @@ SELECT wait_worker_counts(1,0,1,0); -- make sure start is idempotent SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -- Since we're doing idempotency tests, we're also going to exercise our queue and start 20 times @@ -85,7 +85,7 @@ FOR i in 1..5 LOOP SELECT (backend_start = $1::timestamptz) backend_start_unchanged FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = $2 into r; if(r) THEN PERFORM pg_sleep(0.1); @@ -109,7 +109,7 @@ SELECT wait_worker_counts(1,0,1,0); -- Now let's restart the scheduler and make sure our backend_start changed SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background 
Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset BEGIN; @@ -126,7 +126,7 @@ FOR i in 1..10 LOOP SELECT (backend_start > $1::timestamptz) backend_start_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = $2 into r; if(NOT r) THEN PERFORM pg_sleep(0.1); @@ -143,10 +143,10 @@ SELECT wait_greater(:'orig_backend_start',:'TEST_DBNAME_2'); -- Make sure canceling the launcher backend causes a restart of schedulers SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'; +SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE backend_type = 'TimescaleDB Background Worker Launcher'; SELECT wait_worker_counts(1,0,1,0); @@ -259,7 +259,7 @@ SELECT wait_for_bgw_scheduler(:'TEST_DBNAME'); -- Connect to TEST_DBNAME (_timescaledb_functions.stop_background_workers() is not available in TEST_DBNAME_2) \c :TEST_DBNAME :ROLE_SUPERUSER SELECT _timescaledb_functions.stop_background_workers(); -SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'; +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE backend_type = 'TimescaleDB Background Worker Launcher'; \c :TEST_DBNAME_2 :ROLE_SUPERUSER -- make sure nobody is using it diff --git a/test/sql/include/bgw_launcher_utils.sql b/test/sql/include/bgw_launcher_utils.sql index de06a853814..3e52778fc02 100644 --- a/test/sql/include/bgw_launcher_utils.sql +++ b/test/sql/include/bgw_launcher_utils.sql @@ -8,11 +8,12 @@ -- we can still test its interactions with its children, but can't test 
some of the things specific to the launcher. -- So we've added some bits about the version number as needed. -CREATE VIEW worker_counts as SELECT count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Launcher') as launcher, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler -FROM pg_stat_activity; +CREATE VIEW worker_counts as +SELECT count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Launcher') as launcher, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler + FROM pg_stat_activity; CREATE FUNCTION wait_worker_counts(launcher_ct INTEGER, scheduler1_ct INTEGER, scheduler2_ct INTEGER, template1_ct INTEGER) RETURNS BOOLEAN LANGUAGE PLPGSQL AS $BODY$ diff --git a/tsl/test/expected/bgw_scheduler_restart.out b/tsl/test/expected/bgw_scheduler_restart.out new file mode 100644 index 00000000000..87214d3146e --- /dev/null +++ b/tsl/test/expected/bgw_scheduler_restart.out @@ -0,0 +1,85 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. 
+\c :TEST_DBNAME :ROLE_SUPERUSER +CREATE VIEW tsdb_bgw AS + SELECT datname, pid, backend_type, application_name + FROM pg_stat_activity + WHERE application_name LIKE '%TimescaleDB%' + ORDER BY datname, backend_type, application_name; +-- Show the default scheduler restart time +SHOW timescaledb.bgw_scheduler_restart_time; + timescaledb.bgw_scheduler_restart_time +---------------------------------------- + 1min +(1 row) + +SELECT _timescaledb_functions.start_background_workers(); + start_background_workers +-------------------------- + t +(1 row) + +SELECT pg_sleep(10); -- Wait for scheduler to start. + pg_sleep +---------- + +(1 row) + +SELECT datname, application_name FROM tsdb_bgw; + datname | application_name +--------------------------+----------------------------------------- + db_bgw_scheduler_restart | TimescaleDB Background Worker Scheduler + | TimescaleDB Background Worker Launcher +(2 rows) + +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler TO 'on'; +ALTER SYSTEM SET timescaledb.debug_bgw_scheduler_exit_status TO 1; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(20); -- Wait for scheduler to exit. + pg_sleep +---------- + +(1 row) + +SELECT datname, application_name FROM tsdb_bgw; + datname | application_name +---------+---------------------------------------- + | TimescaleDB Background Worker Launcher +(1 row) + +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler; +ALTER SYSTEM RESET timescaledb.debug_bgw_scheduler_exit_status; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(60); -- Wait for scheduler to restart. 
+ pg_sleep +---------- + +(1 row) + +SELECT datname, application_name FROM tsdb_bgw; + datname | application_name +--------------------------+----------------------------------------- + db_bgw_scheduler_restart | TimescaleDB Background Worker Scheduler + | TimescaleDB Background Worker Launcher +(2 rows) + +SELECT pg_terminate_backend(pid) + FROM pg_stat_activity + WHERE datname = :'TEST_DBNAME' + AND application_name LIKE 'TimescaleDB%'; + pg_terminate_backend +---------------------- + t +(1 row) + diff --git a/tsl/test/sql/CMakeLists.txt b/tsl/test/sql/CMakeLists.txt index 2b3a007d7a2..c2074441528 100644 --- a/tsl/test/sql/CMakeLists.txt +++ b/tsl/test/sql/CMakeLists.txt @@ -73,6 +73,7 @@ if(CMAKE_BUILD_TYPE MATCHES Debug) bgw_job_stat_history_errors.sql bgw_job_stat_history_errors_permissions.sql bgw_db_scheduler_fixed.sql + bgw_scheduler_restart.sql bgw_reorder_drop_chunks.sql scheduler_fixed.sql compress_bgw_reorder_drop_chunks.sql diff --git a/tsl/test/sql/bgw_scheduler_restart.sql b/tsl/test/sql/bgw_scheduler_restart.sql new file mode 100644 index 00000000000..0f2f7cb27e9 --- /dev/null +++ b/tsl/test/sql/bgw_scheduler_restart.sql @@ -0,0 +1,41 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. + +\c :TEST_DBNAME :ROLE_SUPERUSER + +CREATE VIEW tsdb_bgw AS + SELECT datname, pid, backend_type, application_name + FROM pg_stat_activity + WHERE application_name LIKE '%TimescaleDB%' + ORDER BY datname, backend_type, application_name; + +-- Show the default scheduler restart time +SHOW timescaledb.bgw_scheduler_restart_time; + +SELECT _timescaledb_functions.start_background_workers(); + +SELECT pg_sleep(10); -- Wait for scheduler to start. 
+ +SELECT datname, application_name FROM tsdb_bgw; + +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler TO 'on'; +ALTER SYSTEM SET timescaledb.debug_bgw_scheduler_exit_status TO 1; +SELECT pg_reload_conf(); + +SELECT pg_sleep(20); -- Wait for scheduler to exit. + +SELECT datname, application_name FROM tsdb_bgw; + +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler; +ALTER SYSTEM RESET timescaledb.debug_bgw_scheduler_exit_status; +SELECT pg_reload_conf(); + +SELECT pg_sleep(60); -- Wait for scheduler to restart. + +SELECT datname, application_name FROM tsdb_bgw; + +SELECT pg_terminate_backend(pid) + FROM pg_stat_activity + WHERE datname = :'TEST_DBNAME' + AND application_name LIKE 'TimescaleDB%';