diff --git a/.unreleased/feature_6195 b/.unreleased/feature_6195 new file mode 100644 index 00000000000..a930c0a7bcc --- /dev/null +++ b/.unreleased/feature_6195 @@ -0,0 +1 @@ +Implements: #6195 Restart scheduler on error diff --git a/src/bgw/scheduler.c b/src/bgw/scheduler.c index 4c32418b946..d6d28373d5a 100644 --- a/src/bgw/scheduler.c +++ b/src/bgw/scheduler.c @@ -814,6 +814,7 @@ ts_bgw_scheduler_process(int32 run_for_interval_ms, wait_for_all_jobs_to_shutdown(); check_for_stopped_and_timed_out_jobs(); + proc_exit(ts_bgw_scheduler_exit_code); } static void diff --git a/src/guc.c b/src/guc.c index 9b789bce8f8..6f6ddd9a4b9 100644 --- a/src/guc.c +++ b/src/guc.c @@ -122,6 +122,20 @@ TSDLLEXPORT int ts_guc_hypertable_replication_factor_default = 1; bool ts_guc_debug_require_batch_sorted_merge = false; +/* + * Exit code for the scheduler. + * + * Normally it exits with a zero which means that it will not restart. If an + * error is raised, it exits with error code 1, which will trigger a + * restart. + * + * This variable exists to be able to trigger a restart for a normal exit, + * which is useful when debugging. + * + * See backend/postmaster/bgworker.c + */ +int ts_bgw_scheduler_exit_code = 0; + #ifdef TS_DEBUG bool ts_shutdown_bgw = false; char *ts_current_timestamp_mock = NULL; @@ -759,6 +773,19 @@ _guc_init(void) /* assign_hook= */ NULL, /* show_hook= */ NULL); + DefineCustomIntVariable(/* name= */ "timescaledb.shutdown_bgw_scheduler_exit_code", + /* short_desc= */ "exit code to use when shutting down the scheduler", + /* long_desc= */ "this is for debugging purposes", + /* valueAddr= */ &ts_bgw_scheduler_exit_code, + /* bootValue= */ 0, + /* minValue= */ 0, + /* maxValue= */ 255, + /* context= */ PGC_SIGHUP, + /* flags= */ 0, + /* check_hook= */ NULL, + /* assign_hook= */ NULL, + /* show_hook= */ NULL); + DefineCustomStringVariable(/* name= */ "timescaledb.current_timestamp_mock", /* short_desc= */ "set the current timestamp", /* long_desc= */ "this is for debugging purposes", diff --git a/src/guc.h b/src/guc.h index 176b0d86d85..8bdeb4cabf8 100644 --- a/src/guc.h +++ b/src/guc.h @@ -99,6 +99,14 @@ extern TSDLLEXPORT DistCopyTransferFormat ts_guc_dist_copy_transfer_format; typedef void (*set_ssl_options_hook_type)(const char *user_name); extern TSDLLEXPORT set_ssl_options_hook_type ts_set_ssl_options_hook; +/* + * Exit code to use when scheduler exits. + * + * Mostly used for debugging, but defined also for non-debug builds since that + * simplifies the code (and also simplifies debugging non-debug builds). + */ +extern TSDLLEXPORT int ts_bgw_scheduler_exit_code; + #ifdef TS_DEBUG extern bool ts_shutdown_bgw; extern char *ts_current_timestamp_mock; diff --git a/src/loader/bgw_launcher.c b/src/loader/bgw_launcher.c index 9e0adb1a979..bb6db58c5da 100644 --- a/src/loader/bgw_launcher.c +++ b/src/loader/bgw_launcher.c @@ -84,6 +84,8 @@ typedef enum SchedulerState static volatile sig_atomic_t got_SIGHUP = false; +int ts_guc_bgw_scheduler_restart_time_sec = 30; + static void launcher_sighup(SIGNAL_ARGS) { @@ -238,10 +240,24 @@ terminate_background_worker(BackgroundWorkerHandle *handle) } extern void -ts_bgw_cluster_launcher_register(void) +ts_bgw_cluster_launcher_init(void) { BackgroundWorker worker; + DefineCustomIntVariable(/* name= */ "timescaledb.bgw_scheduler_restart_time", + /* short_desc= */ "Restart time for scheduler in seconds", + /* long_desc= */ + "The number of seconds until the scheduler restart on failure.", + /* valueAddr= */ &ts_guc_bgw_scheduler_restart_time_sec, + /* bootValue= */ 30, + /* minValue= */ 1, + /* maxValue= */ 3600, + /* context= */ PGC_POSTMASTER, + /* flags= */ GUC_UNIT_S, + /* check_hook= */ NULL, + /* assign_hook= */ NULL, + /* show_hook= */ NULL); + memset(&worker, 0, sizeof(worker)); /* set up worker settings for our main worker */ snprintf(worker.bgw_name, BGW_MAXLEN, "TimescaleDB Background Worker Launcher"); @@ -276,7 +292,7 @@ register_entrypoint_for_db(Oid db_id, VirtualTransactionId vxid, BackgroundWorke memset(&worker, 0, sizeof(worker)); snprintf(worker.bgw_name, BGW_MAXLEN, "TimescaleDB Background Worker Scheduler"); worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; - worker.bgw_restart_time = BGW_NEVER_RESTART; + worker.bgw_restart_time = ts_guc_bgw_scheduler_restart_time_sec, worker.bgw_start_time = BgWorkerStart_RecoveryFinished; snprintf(worker.bgw_library_name, BGW_MAXLEN, EXTENSION_NAME); snprintf(worker.bgw_function_name, BGW_MAXLEN, BGW_ENTRYPOINT_FUNCNAME); diff --git a/src/loader/bgw_launcher.h b/src/loader/bgw_launcher.h index 69d9ca27b1a..0833145cc06 100644 --- a/src/loader/bgw_launcher.h +++ b/src/loader/bgw_launcher.h @@ -10,7 +10,9 @@ #include #include -extern void ts_bgw_cluster_launcher_register(void); +extern int ts_guc_bgw_scheduler_restart_time_sec; + +extern void ts_bgw_cluster_launcher_init(void); /*called by postmaster at launcher bgw startup*/ TSDLLEXPORT extern Datum ts_bgw_cluster_launcher_main(PG_FUNCTION_ARGS); diff --git a/src/loader/loader.c b/src/loader/loader.c index f96a7733b29..714ce24978a 100644 --- a/src/loader/loader.c +++ b/src/loader/loader.c @@ -711,7 +711,7 @@ _PG_init(void) timescaledb_shmem_request_hook(); #endif - ts_bgw_cluster_launcher_register(); + ts_bgw_cluster_launcher_init(); ts_bgw_counter_setup_gucs(); ts_bgw_interface_register_api_version(); ts_seclabel_init(); diff --git a/tsl/test/expected/bgw_scheduler_restart.out b/tsl/test/expected/bgw_scheduler_restart.out new file mode 100644 index 00000000000..4ca6926008a --- /dev/null +++ b/tsl/test/expected/bgw_scheduler_restart.out @@ -0,0 +1,83 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. +\c :TEST_DBNAME :ROLE_SUPERUSER +CREATE VIEW tsdb_bgw AS + SELECT datname, application_name FROM pg_stat_activity + WHERE application_name LIKE 'TimescaleDB%' + ORDER BY datname, application_name; +SHOW timescaledb.bgw_scheduler_restart_time; + timescaledb.bgw_scheduler_restart_time +---------------------------------------- + 30s +(1 row) + +SELECT _timescaledb_functions.start_background_workers(); + start_background_workers +-------------------------- + t +(1 row) + +SELECT pg_sleep(10); -- Wait for scheduler to start. + pg_sleep +---------- + +(1 row) + +SELECT * FROM tsdb_bgw; + datname | application_name +--------------------------+----------------------------------------- + db_bgw_scheduler_restart | TimescaleDB Background Worker Scheduler + | TimescaleDB Background Worker Launcher +(2 rows) + +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler TO 'on'; +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler_exit_code TO 1; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(20); -- Wait for scheduler to exit. + pg_sleep +---------- + +(1 row) + +SELECT * FROM tsdb_bgw; + datname | application_name +---------+---------------------------------------- + | TimescaleDB Background Worker Launcher +(1 row) + +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler; +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler_exit_code; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(30); -- Wait for scheduler to restart. + pg_sleep +---------- + +(1 row) + +SELECT * FROM tsdb_bgw; + datname | application_name +--------------------------+----------------------------------------- + db_bgw_scheduler_restart | TimescaleDB Background Worker Scheduler + | TimescaleDB Background Worker Launcher +(2 rows) + +SELECT pg_terminate_backend(pid) + FROM pg_stat_activity + WHERE datname = :'TEST_DBNAME' + AND application_name LIKE 'TimescaleDB%'; + pg_terminate_backend +---------------------- + t +(1 row) + diff --git a/tsl/test/sql/CMakeLists.txt b/tsl/test/sql/CMakeLists.txt index f7cc6c1726f..5f36ccd3a0a 100644 --- a/tsl/test/sql/CMakeLists.txt +++ b/tsl/test/sql/CMakeLists.txt @@ -44,6 +44,7 @@ if(CMAKE_BUILD_TYPE MATCHES Debug) TEST_FILES bgw_db_scheduler.sql bgw_scheduler_control.sql + bgw_scheduler_restart.sql job_errors_permissions.sql troubleshooting_job_errors.sql bgw_db_scheduler_fixed.sql @@ -139,6 +140,7 @@ set(SOLO_TESTS # This interferes with other tests since it reloads the config to increase # log level. bgw_scheduler_control + bgw_scheduler_restart bgw_db_scheduler job_errors_permissions troubleshooting_job_errors diff --git a/tsl/test/sql/bgw_scheduler_restart.sql b/tsl/test/sql/bgw_scheduler_restart.sql new file mode 100644 index 00000000000..2f7f78fca45 --- /dev/null +++ b/tsl/test/sql/bgw_scheduler_restart.sql @@ -0,0 +1,39 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. + +\c :TEST_DBNAME :ROLE_SUPERUSER + +CREATE VIEW tsdb_bgw AS + SELECT datname, application_name FROM pg_stat_activity + WHERE application_name LIKE 'TimescaleDB%' + ORDER BY datname, application_name; + +SHOW timescaledb.bgw_scheduler_restart_time; + +SELECT _timescaledb_functions.start_background_workers(); + +SELECT pg_sleep(10); -- Wait for scheduler to start. + +SELECT * FROM tsdb_bgw; + +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler TO 'on'; +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler_exit_code TO 1; +SELECT pg_reload_conf(); + +SELECT pg_sleep(20); -- Wait for scheduler to exit. + +SELECT * FROM tsdb_bgw; + +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler; +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler_exit_code; +SELECT pg_reload_conf(); + +SELECT pg_sleep(30); -- Wait for scheduler to restart. + +SELECT * FROM tsdb_bgw; + +SELECT pg_terminate_backend(pid) + FROM pg_stat_activity + WHERE datname = :'TEST_DBNAME' + AND application_name LIKE 'TimescaleDB%';