diff --git a/.unreleased/feature_6195 b/.unreleased/feature_6195 new file mode 100644 index 00000000000..a930c0a7bcc --- /dev/null +++ b/.unreleased/feature_6195 @@ -0,0 +1 @@ +Implements: #6195 Restart scheduler on error diff --git a/src/bgw/scheduler.c b/src/bgw/scheduler.c index 90755f6e43e..e8ee5f2fa14 100644 --- a/src/bgw/scheduler.c +++ b/src/bgw/scheduler.c @@ -807,6 +807,7 @@ ts_bgw_scheduler_process(int32 run_for_interval_ms, wait_for_all_jobs_to_shutdown(); check_for_stopped_and_timed_out_jobs(); + proc_exit(ts_bgw_scheduler_exit_code); } static void diff --git a/src/guc.c b/src/guc.c index 5027c930229..37841b6b2b6 100644 --- a/src/guc.c +++ b/src/guc.c @@ -113,6 +113,20 @@ TSDLLEXPORT int ts_guc_hypertable_replication_factor_default = 1; bool ts_guc_debug_require_batch_sorted_merge = false; +/* + * Exit code for the scheduler. + * + * Normally it exits with a zero which means that it will not restart. If an + * error is raised, it exits with error code 1, which will trigger a + * restart. + * + * This variable exists to be able to trigger a restart for a normal exit, + * which is useful when debugging. + * + * See backend/postmaster/bgworker.c + */ +int ts_bgw_scheduler_exit_code = 0; + #ifdef TS_DEBUG bool ts_shutdown_bgw = false; char *ts_current_timestamp_mock = NULL; @@ -738,6 +752,19 @@ _guc_init(void) /* assign_hook= */ NULL, /* show_hook= */ NULL); + DefineCustomIntVariable(/* name= */ "timescaledb.shutdown_bgw_scheduler_exit_code", + /* short_desc= */ "exit code to use when shutting down the scheduler", + /* long_desc= */ "this is for debugging purposes", + /* valueAddr= */ &ts_bgw_scheduler_exit_code, + /* bootValue= */ 0, + /* minValue= */ 0, + /* maxValue= */ 255, + /* context= */ PGC_SIGHUP, + /* flags= */ 0, + /* check_hook= */ NULL, + /* assign_hook= */ NULL, + /* show_hook= */ NULL); + DefineCustomStringVariable(/* name= */ "timescaledb.current_timestamp_mock", /* short_desc= */ "set the current timestamp", /* long_desc= */ "this is for debugging purposes", diff --git a/src/guc.h b/src/guc.h index 409ba38df7c..f0c40131e55 100644 --- a/src/guc.h +++ b/src/guc.h @@ -98,6 +98,14 @@ extern TSDLLEXPORT DistCopyTransferFormat ts_guc_dist_copy_transfer_format; typedef void (*set_ssl_options_hook_type)(const char *user_name); extern TSDLLEXPORT set_ssl_options_hook_type ts_set_ssl_options_hook; +/* + * Exit code to use when scheduler exits. + * + * Mostly used for debugging, but defined also for non-debug builds since that + * simplifies the code (and also simplifies debugging non-debug builds). + */ +extern TSDLLEXPORT int ts_bgw_scheduler_exit_code; + #ifdef TS_DEBUG extern bool ts_shutdown_bgw; extern char *ts_current_timestamp_mock; diff --git a/src/loader/bgw_launcher.c b/src/loader/bgw_launcher.c index 9e0adb1a979..bb6db58c5da 100644 --- a/src/loader/bgw_launcher.c +++ b/src/loader/bgw_launcher.c @@ -84,6 +84,8 @@ typedef enum SchedulerState static volatile sig_atomic_t got_SIGHUP = false; +int ts_guc_bgw_scheduler_restart_time_sec = 30; + static void launcher_sighup(SIGNAL_ARGS) { @@ -238,10 +240,24 @@ terminate_background_worker(BackgroundWorkerHandle *handle) } extern void -ts_bgw_cluster_launcher_register(void) +ts_bgw_cluster_launcher_init(void) { BackgroundWorker worker; + DefineCustomIntVariable(/* name= */ "timescaledb.bgw_scheduler_restart_time", + /* short_desc= */ "Restart time for scheduler in seconds", + /* long_desc= */ + "The number of seconds until the scheduler restart on failure.", + /* valueAddr= */ &ts_guc_bgw_scheduler_restart_time_sec, + /* bootValue= */ 30, + /* minValue= */ 1, + /* maxValue= */ 3600, + /* context= */ PGC_POSTMASTER, + /* flags= */ GUC_UNIT_S, + /* check_hook= */ NULL, + /* assign_hook= */ NULL, + /* show_hook= */ NULL); + memset(&worker, 0, sizeof(worker)); /* set up worker settings for our main worker */ snprintf(worker.bgw_name, BGW_MAXLEN, "TimescaleDB Background Worker Launcher"); @@ -276,7 +292,7 @@ register_entrypoint_for_db(Oid db_id, VirtualTransactionId vxid, BackgroundWorke memset(&worker, 0, sizeof(worker)); snprintf(worker.bgw_name, BGW_MAXLEN, "TimescaleDB Background Worker Scheduler"); worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; - worker.bgw_restart_time = BGW_NEVER_RESTART; + worker.bgw_restart_time = ts_guc_bgw_scheduler_restart_time_sec, worker.bgw_start_time = BgWorkerStart_RecoveryFinished; snprintf(worker.bgw_library_name, BGW_MAXLEN, EXTENSION_NAME); snprintf(worker.bgw_function_name, BGW_MAXLEN, BGW_ENTRYPOINT_FUNCNAME); diff --git a/src/loader/bgw_launcher.h b/src/loader/bgw_launcher.h index 69d9ca27b1a..0833145cc06 100644 --- a/src/loader/bgw_launcher.h +++ b/src/loader/bgw_launcher.h @@ -10,7 +10,9 @@ #include #include -extern void ts_bgw_cluster_launcher_register(void); +extern int ts_guc_bgw_scheduler_restart_time_sec; + +extern void ts_bgw_cluster_launcher_init(void); /*called by postmaster at launcher bgw startup*/ TSDLLEXPORT extern Datum ts_bgw_cluster_launcher_main(PG_FUNCTION_ARGS); diff --git a/src/loader/loader.c b/src/loader/loader.c index f96a7733b29..714ce24978a 100644 --- a/src/loader/loader.c +++ b/src/loader/loader.c @@ -711,7 +711,7 @@ _PG_init(void) timescaledb_shmem_request_hook(); #endif - ts_bgw_cluster_launcher_register(); + ts_bgw_cluster_launcher_init(); ts_bgw_counter_setup_gucs(); ts_bgw_interface_register_api_version(); ts_seclabel_init(); diff --git a/tsl/test/expected/bgw_scheduler_control.out b/tsl/test/expected/bgw_scheduler_control.out new file mode 100644 index 00000000000..84af2884332 --- /dev/null +++ b/tsl/test/expected/bgw_scheduler_control.out @@ -0,0 +1,89 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. +\c :TEST_DBNAME :ROLE_SUPERUSER +-- There is going to be one scheduler for each database, but we only +-- care if there is zero (all schedulers shut down) or at least one +-- (schedulers should be automatically restarted). +CREATE VIEW tsdb_bgw AS + SELECT DISTINCT application_name FROM pg_stat_activity + WHERE application_name LIKE 'TimescaleDB%' + ORDER BY application_name; +SHOW timescaledb.bgw_scheduler_restart_time; + timescaledb.bgw_scheduler_restart_time +---------------------------------------- + 30s +(1 row) + +SELECT _timescaledb_functions.start_background_workers(); + start_background_workers +-------------------------- + t +(1 row) + +SELECT pg_sleep(5); + pg_sleep +---------- + +(1 row) + +SELECT * FROM tsdb_bgw; + application_name +----------------------------------------- + TimescaleDB Background Worker Launcher + TimescaleDB Background Worker Scheduler +(2 rows) + +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler TO 'on'; +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler_exit_code TO 1; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(5); -- Wait for scheduler stop. + pg_sleep +---------- + +(1 row) + +SELECT * FROM tsdb_bgw; + application_name +---------------------------------------- + TimescaleDB Background Worker Launcher +(1 row) + +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler; +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler_exit_code; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(30); -- Wait for scheduler to restart. + pg_sleep +---------- + +(1 row) + +SELECT * FROM tsdb_bgw; + application_name +----------------------------------------- + TimescaleDB Background Worker Launcher + TimescaleDB Background Worker Scheduler +(2 rows) + +SELECT _timescaledb_functions.stop_background_workers(); + stop_background_workers +------------------------- + t +(1 row) + +SELECT pg_sleep(5); + pg_sleep +---------- + +(1 row) + diff --git a/tsl/test/sql/CMakeLists.txt b/tsl/test/sql/CMakeLists.txt index 9a623ffee2d..eb5d082b1ee 100644 --- a/tsl/test/sql/CMakeLists.txt +++ b/tsl/test/sql/CMakeLists.txt @@ -43,6 +43,7 @@ if(CMAKE_BUILD_TYPE MATCHES Debug) APPEND TEST_FILES bgw_db_scheduler.sql + bgw_scheduler_control.sql job_errors_permissions.sql troubleshooting_job_errors.sql bgw_db_scheduler_fixed.sql @@ -135,6 +136,7 @@ endif() set(SOLO_TESTS # dist_hypertable needs a lot of memory when the Sanitizer is active dist_hypertable-${PG_VERSION_MAJOR} + bgw_scheduler_control bgw_db_scheduler job_errors_permissions troubleshooting_job_errors diff --git a/tsl/test/sql/bgw_scheduler_control.sql b/tsl/test/sql/bgw_scheduler_control.sql new file mode 100644 index 00000000000..795e3e751fb --- /dev/null +++ b/tsl/test/sql/bgw_scheduler_control.sql @@ -0,0 +1,39 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. + +\c :TEST_DBNAME :ROLE_SUPERUSER + +-- There is going to be one scheduler for each database, but we only +-- care if there is zero (all schedulers shut down) or at least one +-- (schedulers should be automatically restarted). +CREATE VIEW tsdb_bgw AS + SELECT DISTINCT application_name FROM pg_stat_activity + WHERE application_name LIKE 'TimescaleDB%' + ORDER BY application_name; + +SHOW timescaledb.bgw_scheduler_restart_time; + +SELECT _timescaledb_functions.start_background_workers(); +SELECT pg_sleep(5); + +SELECT * FROM tsdb_bgw; + +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler TO 'on'; +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler_exit_code TO 1; +SELECT pg_reload_conf(); + +SELECT pg_sleep(5); -- Wait for scheduler stop. + +SELECT * FROM tsdb_bgw; + +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler; +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler_exit_code; +SELECT pg_reload_conf(); + +SELECT pg_sleep(30); -- Wait for scheduler to restart. + +SELECT * FROM tsdb_bgw; + +SELECT _timescaledb_functions.stop_background_workers(); +SELECT pg_sleep(5);