From bb093c295285af94e16ad6b003d0bcd61a04640b Mon Sep 17 00:00:00 2001 From: Mats Kindahl Date: Thu, 28 Sep 2023 17:22:03 +0200 Subject: [PATCH] Restart scheduler on error If the scheduler receives an error, it will never restart again since `bgw_restart_time` is set to `BGW_NEVER_RESTART`, which will prevent all jobs from executing. This commit adds the GUC `timescaledb.bgw_scheduler_restart_time` that can be set to the restart time for the scheduler. It defaults to 60 seconds, which is the default restart interval for background workers defined by PostgreSQL. It also adds `timescaledb.debug_bgw_scheduler_exit_status` to be able to shutdown the scheduler with a non-zero exit status, which allows the restart functionality to be tested. It also ensures that `backend_type` is explicitly set up rather than copied from `application_name` and add some more information to `application_name`. It also updates the tests to use `backend_type` where applicable. To avoid exhausting slots when the launcher restarts, it will kill all existing schedulers and start new ones. Since background worker slots are easily exhausted in the `bgw_launcher` test, we do not run it repeatedly in the flakes workflow. --- .github/gh_matrix_builder.py | 13 ++- .unreleased/pr_7527 | 1 + src/bgw/scheduler.c | 14 ++- src/compat/compat.h | 6 + src/guc.c | 27 +++++ src/guc.h | 8 ++ src/loader/bgw_launcher.c | 121 ++++++++++++++++++-- src/loader/bgw_launcher.h | 7 +- src/loader/loader.c | 2 +- test/expected/bgw_launcher.out | 42 +++---- test/pgtest/CMakeLists.txt | 3 + test/sql/bgw_launcher.sql | 20 ++-- test/sql/include/bgw_launcher_utils.sql | 11 +- tsl/test/expected/bgw_scheduler_control.out | 2 +- tsl/test/expected/bgw_scheduler_restart.out | 85 ++++++++++++++ tsl/test/sql/CMakeLists.txt | 2 + tsl/test/sql/bgw_scheduler_restart.sql | 41 +++++++ 17 files changed, 351 insertions(+), 54 deletions(-) create mode 100644 .unreleased/pr_7527 create mode 100644 tsl/test/expected/bgw_scheduler_restart.out create mode 100644 tsl/test/sql/bgw_scheduler_restart.sql diff --git a/.github/gh_matrix_builder.py b/.github/gh_matrix_builder.py index 6d9a8606a40..1e24ff9af77 100755 --- a/.github/gh_matrix_builder.py +++ b/.github/gh_matrix_builder.py @@ -53,6 +53,13 @@ "memoize", } +# Tests that we do not run as part of a Flake tests +flaky_exclude_tests = { + # Not executed as a flake test since it easily exhausts available + # background worker slots. + "bgw_launcher", +} + # helper functions to generate matrix entries # the release and apache config inherit from the @@ -309,11 +316,13 @@ def macos_config(overrides): sys.exit(1) if tests: + flake_tests = [t for t in tests if t not in flaky_exclude_tests] + installcheck_args = f'TESTS="{" ".join(flake_tests * 20)}"' m["include"].append( build_debug_config( { "coverage": False, - "installcheck_args": f'TESTS="{" ".join(list(tests) * 20)}"', + "installcheck_args": installcheck_args, "name": "Flaky Check Debug", "pg": PG16_LATEST, "pginstallcheck": False, @@ -324,7 +333,7 @@ def macos_config(overrides): build_debug_config( { "coverage": False, - "installcheck_args": f'TESTS="{" ".join(list(tests) * 20)}"', + "installcheck_args": installcheck_args, "name": "Flaky Check Debug", "pg": PG17_LATEST, "pginstallcheck": False, diff --git a/.unreleased/pr_7527 b/.unreleased/pr_7527 new file mode 100644 index 00000000000..534b8bec629 --- /dev/null +++ b/.unreleased/pr_7527 @@ -0,0 +1 @@ +Fixes: #7527 Restart scheduler on error diff --git a/src/bgw/scheduler.c b/src/bgw/scheduler.c index d44fc59d433..fefe4cf2602 100644 --- a/src/bgw/scheduler.c +++ b/src/bgw/scheduler.c @@ -795,9 +795,11 @@ ts_bgw_scheduler_process(int32 run_for_interval_ms, * exit. */ if (ts_guc_restoring || IsBinaryUpgrade) { - elog(LOG, - "scheduler for database %u exiting since the database is restoring or upgrading", - MyDatabaseId); + ereport(LOG, + errmsg("scheduler for database %u exiting with exit status %d", + MyDatabaseId, + ts_debug_bgw_scheduler_exit_status), + errdetail("the database is restoring or upgrading")); terminate_all_jobs_and_release_workers(); goto scheduler_exit; } @@ -866,7 +868,10 @@ ts_bgw_scheduler_process(int32 run_for_interval_ms, MemoryContextReset(scratch_mctx); } - elog(DEBUG1, "database scheduler for database %u exiting", MyDatabaseId); + elog(DEBUG1, + "scheduler for database %u exiting with exit status %d", + MyDatabaseId, + ts_debug_bgw_scheduler_exit_status); #ifdef TS_DEBUG if (ts_shutdown_bgw) @@ -879,6 +884,7 @@ ts_bgw_scheduler_process(int32 run_for_interval_ms, wait_for_all_jobs_to_shutdown(); check_for_stopped_and_timed_out_jobs(); scheduled_jobs = NIL; + proc_exit(ts_debug_bgw_scheduler_exit_status); } static void diff --git a/src/compat/compat.h b/src/compat/compat.h index 298f74fb716..58654de9f9e 100644 --- a/src/compat/compat.h +++ b/src/compat/compat.h @@ -655,6 +655,12 @@ RelationGetSmgr(Relation rel) GenerationContextCreate(parent, name, blockSize) #endif +#if PG16_GE +#define pgstat_get_local_beentry_by_index_compat(idx) pgstat_get_local_beentry_by_index(idx) +#else +#define pgstat_get_local_beentry_by_index_compat(idx) pgstat_fetch_stat_local_beentry(idx) +#endif + /* * PG16 adds a new parameter to DefineIndex, total_parts, that takes * in the total number of direct and indirect partitions of the relation. diff --git a/src/guc.c b/src/guc.c index 861760c17e2..1c17b83a7af 100644 --- a/src/guc.c +++ b/src/guc.c @@ -183,6 +183,20 @@ bool ts_guc_debug_require_batch_sorted_merge = false; bool ts_guc_debug_allow_cagg_with_deprecated_funcs = false; +/* + * Exit code for the scheduler. + * + * Normally it exits with a zero which means that it will not restart. If an + * error is raised, it exits with error code 1, which will trigger a + * restart. + * + * This variable exists to be able to trigger a restart for a normal exit, + * which is useful when debugging. + * + * See backend/postmaster/bgworker.c + */ +int ts_debug_bgw_scheduler_exit_status = 0; + #ifdef TS_DEBUG bool ts_shutdown_bgw = false; char *ts_current_timestamp_mock = NULL; @@ -1067,6 +1081,19 @@ _guc_init(void) /* assign_hook= */ NULL, /* show_hook= */ NULL); + DefineCustomIntVariable(/* name= */ MAKE_EXTOPTION("debug_bgw_scheduler_exit_status"), + /* short_desc= */ "exit status to use when shutting down the scheduler", + /* long_desc= */ "this is for debugging purposes", + /* valueAddr= */ &ts_debug_bgw_scheduler_exit_status, + /* bootValue= */ 0, + /* minValue= */ 0, + /* maxValue= */ 255, + /* context= */ PGC_SIGHUP, + /* flags= */ 0, + /* check_hook= */ NULL, + /* assign_hook= */ NULL, + /* show_hook= */ NULL); + DefineCustomStringVariable(/* name= */ MAKE_EXTOPTION("current_timestamp_mock"), /* short_desc= */ "set the current timestamp", /* long_desc= */ "this is for debugging purposes", diff --git a/src/guc.h b/src/guc.h index 34ebc0ef2d6..88d4ba18e39 100644 --- a/src/guc.h +++ b/src/guc.h @@ -92,6 +92,14 @@ extern TSDLLEXPORT bool ts_guc_auto_sparse_indexes; extern TSDLLEXPORT bool ts_guc_enable_columnarscan; extern TSDLLEXPORT int ts_guc_bgw_log_level; +/* + * Exit code to use when scheduler exits. + * + * Mostly used for debugging, but defined also for non-debug builds since that + * simplifies the code (and also simplifies debugging non-debug builds). + */ +extern TSDLLEXPORT int ts_debug_bgw_scheduler_exit_status; + #ifdef TS_DEBUG extern bool ts_shutdown_bgw; extern char *ts_current_timestamp_mock; diff --git a/src/loader/bgw_launcher.c b/src/loader/bgw_launcher.c index af2a2cc3e13..02daa180b64 100644 --- a/src/loader/bgw_launcher.c +++ b/src/loader/bgw_launcher.c @@ -84,6 +84,8 @@ typedef enum SchedulerState static volatile sig_atomic_t got_SIGHUP = false; +int ts_guc_bgw_scheduler_restart_time_sec = BGW_DEFAULT_RESTART_INTERVAL; + static void launcher_sighup(SIGNAL_ARGS) { @@ -238,13 +240,27 @@ terminate_background_worker(BackgroundWorkerHandle *handle) } extern void -ts_bgw_cluster_launcher_register(void) +ts_bgw_cluster_launcher_init(void) { BackgroundWorker worker; + DefineCustomIntVariable(/* name= */ MAKE_EXTOPTION("bgw_scheduler_restart_time"), + /* short_desc= */ "Restart time for scheduler in seconds", + /* long_desc= */ + "The number of seconds until the scheduler restart on failure.", + /* valueAddr= */ &ts_guc_bgw_scheduler_restart_time_sec, + /* bootValue= */ BGW_DEFAULT_RESTART_INTERVAL, + /* minValue= */ 1, + /* maxValue= */ 3600, + /* context= */ PGC_SIGHUP, + /* flags= */ GUC_UNIT_S, + /* check_hook= */ NULL, + /* assign_hook= */ NULL, + /* show_hook= */ NULL); + memset(&worker, 0, sizeof(worker)); /* set up worker settings for our main worker */ - snprintf(worker.bgw_name, BGW_MAXLEN, "TimescaleDB Background Worker Launcher"); + snprintf(worker.bgw_name, BGW_MAXLEN, TS_BGW_TYPE_LAUNCHER); worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; worker.bgw_restart_time = BGW_LAUNCHER_RESTART_TIME_S; @@ -274,9 +290,10 @@ register_entrypoint_for_db(Oid db_id, VirtualTransactionId vxid, BackgroundWorke BackgroundWorker worker; memset(&worker, 0, sizeof(worker)); - snprintf(worker.bgw_name, BGW_MAXLEN, "TimescaleDB Background Worker Scheduler"); + snprintf(worker.bgw_type, BGW_MAXLEN, TS_BGW_TYPE_SCHEDULER); + snprintf(worker.bgw_name, BGW_MAXLEN, "%s for database %d", TS_BGW_TYPE_SCHEDULER, db_id); worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; - worker.bgw_restart_time = BGW_NEVER_RESTART; + worker.bgw_restart_time = ts_guc_bgw_scheduler_restart_time_sec, worker.bgw_start_time = BgWorkerStart_RecoveryFinished; snprintf(worker.bgw_library_name, BGW_MAXLEN, EXTENSION_NAME); snprintf(worker.bgw_function_name, BGW_MAXLEN, BGW_ENTRYPOINT_FUNCNAME); @@ -332,15 +349,89 @@ db_hash_entry_create_if_not_exists(HTAB *db_htab, Oid db_oid) return db_he; } +/* + * Result from signalling a backend. + * + * Error codes are non-zero, and success is zero. + */ +enum SignalBackendResult +{ + SIGNAL_BACKEND_SUCCESS = 0, + SIGNAL_BACKEND_ERROR, + SIGNAL_BACKEND_NOPERMISSION, + SIGNAL_BACKEND_NOSUPERUSER, +}; + +/* + * Terminate a background worker. + * + * This is copied from pg_signal_backend() in + * src/backend/storage/ipc/signalfuncs.c but tweaked to not require a database + * connection since the launcher does not have one. + */ +static enum SignalBackendResult +ts_signal_backend(int pid, int sig) +{ + PGPROC *proc = BackendPidGetProc(pid); + + if (unlikely(proc == NULL)) + { + ereport(WARNING, (errmsg("PID %d is not a PostgreSQL backend process", pid))); + return SIGNAL_BACKEND_ERROR; + } + + if (unlikely(kill(pid, sig))) + { + /* Again, just a warning to allow loops */ + ereport(WARNING, (errmsg("could not send signal to process %d: %m", pid))); + return SIGNAL_BACKEND_ERROR; + } + + return SIGNAL_BACKEND_SUCCESS; +} + +/* + * Terminate backends by backend type. + * + * We iterate through all backends and mark those that match the given backend + * type as terminated. + * + * Note that there is potentially a delay between marking backends as + * terminated and their actual termination, so the backends have to be able to + * run even if there are multiple instances accessing the same data. + * + * Parts of this code is taken from pg_stat_get_activity() in + * src/backend/utils/adt/pgstatfuncs.c. + */ +static void +terminate_backends_by_backend_type(const char *backend_type) +{ + Assert(backend_type); + + const int num_backends = pgstat_fetch_stat_numbackends(); + for (int curr_backend = 1; curr_backend <= num_backends; ++curr_backend) + { + const LocalPgBackendStatus *local_beentry = + pgstat_get_local_beentry_by_index_compat(curr_backend); + const PgBackendStatus *beentry = &local_beentry->backendStatus; + const char *bgw_type = GetBackgroundWorkerTypeByPid(beentry->st_procpid); + if (bgw_type && strcmp(backend_type, bgw_type) == 0) + { + int error = ts_signal_backend(beentry->st_procpid, SIGTERM); + if (error) + elog(LOG, "failed to terminate backend with pid %d", beentry->st_procpid); + } + } +} + /* * Model this on autovacuum.c -> get_database_list. * - * Note that we are not doing - * all the things around memory context that they do, because the hashtable - * we're using to store db entries is automatically created in its own memory - * context (a child of TopMemoryContext) This can get called at two different - * times 1) when the cluster launcher starts and is looking for dbs and 2) if - * it restarts due to a postmaster signal. + * Note that we are not doing all the things around memory context that they + * do, because the hashtable we're using to store db entries is automatically + * created in its own memory context (a child of TopMemoryContext) This can + * get called at two different times 1) when the cluster launcher starts and + * is looking for dbs and 2) if it restarts due to a postmaster signal. */ static void populate_database_htab(HTAB *db_htab) @@ -757,6 +848,16 @@ ts_bgw_cluster_launcher_main(PG_FUNCTION_ARGS) db_htab = init_database_htab(); *htab_storage = db_htab; + /* + * If the launcher was restarted and discovers old schedulers, these has + * to be terminated to avoid exhausting the worker slots. + * + * We cannot easily pick up the old schedulers since we do not have access + * to the slots array in PostgreSQL, so instead we scan for something that + * looks like schedulers for databases, and kill them. New ones will then + * be spawned below. + */ + terminate_backends_by_backend_type(TS_BGW_TYPE_SCHEDULER); populate_database_htab(db_htab); while (true) diff --git a/src/loader/bgw_launcher.h b/src/loader/bgw_launcher.h index f90a65cb3a9..82c2ec1893b 100644 --- a/src/loader/bgw_launcher.h +++ b/src/loader/bgw_launcher.h @@ -8,7 +8,12 @@ #include #include -extern void ts_bgw_cluster_launcher_register(void); +#define TS_BGW_TYPE_LAUNCHER "TimescaleDB Background Worker Launcher" +#define TS_BGW_TYPE_SCHEDULER "TimescaleDB Background Worker Scheduler" + +extern int ts_guc_bgw_scheduler_restart_time_sec; + +extern void ts_bgw_cluster_launcher_init(void); /*called by postmaster at launcher bgw startup*/ TSDLLEXPORT extern Datum ts_bgw_cluster_launcher_main(PG_FUNCTION_ARGS); diff --git a/src/loader/loader.c b/src/loader/loader.c index 6537e49d8c5..fdff65236fa 100644 --- a/src/loader/loader.c +++ b/src/loader/loader.c @@ -591,7 +591,7 @@ _PG_init(void) timescaledb_shmem_request_hook(); #endif - ts_bgw_cluster_launcher_register(); + ts_bgw_cluster_launcher_init(); ts_bgw_counter_setup_gucs(); ts_bgw_interface_register_api_version(); diff --git a/test/expected/bgw_launcher.out b/test/expected/bgw_launcher.out index 84c2b30bf5a..ff8a61559b6 100644 --- a/test/expected/bgw_launcher.out +++ b/test/expected/bgw_launcher.out @@ -21,11 +21,12 @@ CREATE DATABASE :TEST_DBNAME_2; -- Further Note: PG 9.6 changed what appeared in pg_stat_activity, so the launcher doesn't actually show up. -- we can still test its interactions with its children, but can't test some of the things specific to the launcher. -- So we've added some bits about the version number as needed. -CREATE VIEW worker_counts as SELECT count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Launcher') as launcher, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler -FROM pg_stat_activity; +CREATE VIEW worker_counts as +SELECT count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Launcher') as launcher, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler + FROM pg_stat_activity; CREATE FUNCTION wait_worker_counts(launcher_ct INTEGER, scheduler1_ct INTEGER, scheduler2_ct INTEGER, template1_ct INTEGER) RETURNS BOOLEAN LANGUAGE PLPGSQL AS $BODY$ DECLARE @@ -103,7 +104,7 @@ SELECT wait_worker_counts(1,0,1,0); -- Now let's restart the scheduler in test db 2 and make sure our backend_start changed SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -- We'll do this in a txn so that we can see that the worker locks on our txn before continuing BEGIN; @@ -122,7 +123,7 @@ SELECT wait_worker_counts(1,0,1,0); SELECT (backend_start > :'orig_backend_start'::timestamptz) backend_start_changed, (wait_event = 'virtualxid') wait_event_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2'; backend_start_changed | wait_event_changed -----------------------+-------------------- @@ -138,7 +139,7 @@ SELECT wait_worker_counts(1,0,1,0); SELECT (wait_event IS DISTINCT FROM 'virtualxid') wait_event_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2'; wait_event_changed -------------------- @@ -187,7 +188,7 @@ SELECT wait_worker_counts(1,0,1,0); -- make sure start is idempotent SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -- Since we're doing idempotency tests, we're also going to exercise our queue and start 20 times SELECT _timescaledb_functions.start_background_workers() as start_background_workers, * FROM generate_series(1,20); @@ -227,7 +228,7 @@ FOR i in 1..5 LOOP SELECT (backend_start = $1::timestamptz) backend_start_unchanged FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = $2 into r; if(r) THEN PERFORM pg_sleep(0.1); @@ -274,7 +275,7 @@ SELECT wait_worker_counts(1,0,1,0); -- Now let's restart the scheduler and make sure our backend_start changed SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset BEGIN; DROP EXTENSION timescaledb; @@ -294,7 +295,7 @@ FOR i in 1..10 LOOP SELECT (backend_start > $1::timestamptz) backend_start_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = $2 into r; if(NOT r) THEN PERFORM pg_sleep(0.1); @@ -315,9 +316,9 @@ SELECT wait_greater(:'orig_backend_start',:'TEST_DBNAME_2'); -- Make sure canceling the launcher backend causes a restart of schedulers SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'; +SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE backend_type = 'TimescaleDB Background Worker Launcher'; pg_cancel_backend ------------------- t @@ -445,11 +446,12 @@ ALTER ROLE :ROLE_DEFAULT_PERM_USER WITH NOSUPERUSER; -- Further Note: PG 9.6 changed what appeared in pg_stat_activity, so the launcher doesn't actually show up. -- we can still test its interactions with its children, but can't test some of the things specific to the launcher. -- So we've added some bits about the version number as needed. -CREATE VIEW worker_counts as SELECT count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Launcher') as launcher, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler -FROM pg_stat_activity; +CREATE VIEW worker_counts as +SELECT count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Launcher') as launcher, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler + FROM pg_stat_activity; CREATE FUNCTION wait_worker_counts(launcher_ct INTEGER, scheduler1_ct INTEGER, scheduler2_ct INTEGER, template1_ct INTEGER) RETURNS BOOLEAN LANGUAGE PLPGSQL AS $BODY$ DECLARE @@ -602,7 +604,7 @@ SELECT _timescaledb_functions.stop_background_workers(); t (1 row) -SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'; +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE backend_type = 'TimescaleDB Background Worker Launcher'; pg_terminate_backend ---------------------- t diff --git a/test/pgtest/CMakeLists.txt b/test/pgtest/CMakeLists.txt index 2d65a493eb2..8d7dc3779c1 100644 --- a/test/pgtest/CMakeLists.txt +++ b/test/pgtest/CMakeLists.txt @@ -29,6 +29,9 @@ set(PG_IGNORE_TESTS sanity_check type_sanity create_am + # Ignoring because it spawns different number of workers in different + # versions. + select_parallel psql) # Modify the test schedule to ignore some tests diff --git a/test/sql/bgw_launcher.sql b/test/sql/bgw_launcher.sql index db3d240c6fb..19a01789667 100644 --- a/test/sql/bgw_launcher.sql +++ b/test/sql/bgw_launcher.sql @@ -33,7 +33,7 @@ SELECT wait_worker_counts(1,0,1,0); -- Now let's restart the scheduler in test db 2 and make sure our backend_start changed SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -- We'll do this in a txn so that we can see that the worker locks on our txn before continuing BEGIN; @@ -43,14 +43,14 @@ SELECT wait_worker_counts(1,0,1,0); SELECT (backend_start > :'orig_backend_start'::timestamptz) backend_start_changed, (wait_event = 'virtualxid') wait_event_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2'; COMMIT; SELECT wait_worker_counts(1,0,1,0); SELECT (wait_event IS DISTINCT FROM 'virtualxid') wait_event_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2'; -- Test stop @@ -68,7 +68,7 @@ SELECT wait_worker_counts(1,0,1,0); -- make sure start is idempotent SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -- Since we're doing idempotency tests, we're also going to exercise our queue and start 20 times @@ -85,7 +85,7 @@ FOR i in 1..5 LOOP SELECT (backend_start = $1::timestamptz) backend_start_unchanged FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = $2 into r; if(r) THEN PERFORM pg_sleep(0.1); @@ -109,7 +109,7 @@ SELECT wait_worker_counts(1,0,1,0); -- Now let's restart the scheduler and make sure our backend_start changed SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset BEGIN; @@ -126,7 +126,7 @@ FOR i in 1..10 LOOP SELECT (backend_start > $1::timestamptz) backend_start_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = $2 into r; if(NOT r) THEN PERFORM pg_sleep(0.1); @@ -143,10 +143,10 @@ SELECT wait_greater(:'orig_backend_start',:'TEST_DBNAME_2'); -- Make sure canceling the launcher backend causes a restart of schedulers SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'; +SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE backend_type = 'TimescaleDB Background Worker Launcher'; SELECT wait_worker_counts(1,0,1,0); @@ -259,7 +259,7 @@ SELECT wait_for_bgw_scheduler(:'TEST_DBNAME'); -- Connect to TEST_DBNAME (_timescaledb_functions.stop_background_workers() is not available in TEST_DBNAME_2) \c :TEST_DBNAME :ROLE_SUPERUSER SELECT _timescaledb_functions.stop_background_workers(); -SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'; +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE backend_type = 'TimescaleDB Background Worker Launcher'; \c :TEST_DBNAME_2 :ROLE_SUPERUSER -- make sure nobody is using it diff --git a/test/sql/include/bgw_launcher_utils.sql b/test/sql/include/bgw_launcher_utils.sql index de06a853814..3e52778fc02 100644 --- a/test/sql/include/bgw_launcher_utils.sql +++ b/test/sql/include/bgw_launcher_utils.sql @@ -8,11 +8,12 @@ -- we can still test its interactions with its children, but can't test some of the things specific to the launcher. -- So we've added some bits about the version number as needed. -CREATE VIEW worker_counts as SELECT count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Launcher') as launcher, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler -FROM pg_stat_activity; +CREATE VIEW worker_counts as +SELECT count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Launcher') as launcher, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler + FROM pg_stat_activity; CREATE FUNCTION wait_worker_counts(launcher_ct INTEGER, scheduler1_ct INTEGER, scheduler2_ct INTEGER, template1_ct INTEGER) RETURNS BOOLEAN LANGUAGE PLPGSQL AS $BODY$ diff --git a/tsl/test/expected/bgw_scheduler_control.out b/tsl/test/expected/bgw_scheduler_control.out index 8ed411971f4..bb55cb2771b 100644 --- a/tsl/test/expected/bgw_scheduler_control.out +++ b/tsl/test/expected/bgw_scheduler_control.out @@ -99,7 +99,7 @@ SELECT * FROM cleaned_bgw_log; 4 | DB Scheduler | [TESTING] Wait until (RANDOM), started at (RANDOM) 0 | test_job_1b | Execute job 1 1 | test_job_1b | job 1000 (test_job_1b) exiting with success: execution time (RANDOM) ms - 5 | DB Scheduler | database scheduler for database (RANDOM) exiting + 5 | DB Scheduler | scheduler for database (RANDOM) exiting with exit status 0 (8 rows) ALTER DATABASE :TEST_DBNAME RESET timescaledb.bgw_log_level; diff --git a/tsl/test/expected/bgw_scheduler_restart.out b/tsl/test/expected/bgw_scheduler_restart.out new file mode 100644 index 00000000000..87214d3146e --- /dev/null +++ b/tsl/test/expected/bgw_scheduler_restart.out @@ -0,0 +1,85 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. +\c :TEST_DBNAME :ROLE_SUPERUSER +CREATE VIEW tsdb_bgw AS + SELECT datname, pid, backend_type, application_name + FROM pg_stat_activity + WHERE application_name LIKE '%TimescaleDB%' + ORDER BY datname, backend_type, application_name; +-- Show the default scheduler restart time +SHOW timescaledb.bgw_scheduler_restart_time; + timescaledb.bgw_scheduler_restart_time +---------------------------------------- + 1min +(1 row) + +SELECT _timescaledb_functions.start_background_workers(); + start_background_workers +-------------------------- + t +(1 row) + +SELECT pg_sleep(10); -- Wait for scheduler to start. + pg_sleep +---------- + +(1 row) + +SELECT datname, application_name FROM tsdb_bgw; + datname | application_name +--------------------------+----------------------------------------- + db_bgw_scheduler_restart | TimescaleDB Background Worker Scheduler + | TimescaleDB Background Worker Launcher +(2 rows) + +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler TO 'on'; +ALTER SYSTEM SET timescaledb.debug_bgw_scheduler_exit_status TO 1; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(20); -- Wait for scheduler to exit. + pg_sleep +---------- + +(1 row) + +SELECT datname, application_name FROM tsdb_bgw; + datname | application_name +---------+---------------------------------------- + | TimescaleDB Background Worker Launcher +(1 row) + +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler; +ALTER SYSTEM RESET timescaledb.debug_bgw_scheduler_exit_status; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(60); -- Wait for scheduler to restart. + pg_sleep +---------- + +(1 row) + +SELECT datname, application_name FROM tsdb_bgw; + datname | application_name +--------------------------+----------------------------------------- + db_bgw_scheduler_restart | TimescaleDB Background Worker Scheduler + | TimescaleDB Background Worker Launcher +(2 rows) + +SELECT pg_terminate_backend(pid) + FROM pg_stat_activity + WHERE datname = :'TEST_DBNAME' + AND application_name LIKE 'TimescaleDB%'; + pg_terminate_backend +---------------------- + t +(1 row) + diff --git a/tsl/test/sql/CMakeLists.txt b/tsl/test/sql/CMakeLists.txt index 2b3a007d7a2..cc00f329d4a 100644 --- a/tsl/test/sql/CMakeLists.txt +++ b/tsl/test/sql/CMakeLists.txt @@ -73,6 +73,7 @@ if(CMAKE_BUILD_TYPE MATCHES Debug) bgw_job_stat_history_errors.sql bgw_job_stat_history_errors_permissions.sql bgw_db_scheduler_fixed.sql + bgw_scheduler_restart.sql bgw_reorder_drop_chunks.sql scheduler_fixed.sql compress_bgw_reorder_drop_chunks.sql @@ -185,6 +186,7 @@ set(SOLO_TESTS # log level. bgw_custom bgw_scheduler_control + bgw_scheduler_restart bgw_db_scheduler bgw_job_stat_history_errors_permissions bgw_job_stat_history_errors diff --git a/tsl/test/sql/bgw_scheduler_restart.sql b/tsl/test/sql/bgw_scheduler_restart.sql new file mode 100644 index 00000000000..0f2f7cb27e9 --- /dev/null +++ b/tsl/test/sql/bgw_scheduler_restart.sql @@ -0,0 +1,41 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. + +\c :TEST_DBNAME :ROLE_SUPERUSER + +CREATE VIEW tsdb_bgw AS + SELECT datname, pid, backend_type, application_name + FROM pg_stat_activity + WHERE application_name LIKE '%TimescaleDB%' + ORDER BY datname, backend_type, application_name; + +-- Show the default scheduler restart time +SHOW timescaledb.bgw_scheduler_restart_time; + +SELECT _timescaledb_functions.start_background_workers(); + +SELECT pg_sleep(10); -- Wait for scheduler to start. + +SELECT datname, application_name FROM tsdb_bgw; + +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler TO 'on'; +ALTER SYSTEM SET timescaledb.debug_bgw_scheduler_exit_status TO 1; +SELECT pg_reload_conf(); + +SELECT pg_sleep(20); -- Wait for scheduler to exit. + +SELECT datname, application_name FROM tsdb_bgw; + +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler; +ALTER SYSTEM RESET timescaledb.debug_bgw_scheduler_exit_status; +SELECT pg_reload_conf(); + +SELECT pg_sleep(60); -- Wait for scheduler to restart. + +SELECT datname, application_name FROM tsdb_bgw; + +SELECT pg_terminate_backend(pid) + FROM pg_stat_activity + WHERE datname = :'TEST_DBNAME' + AND application_name LIKE 'TimescaleDB%';