From efd6eb46ee04572dfae41b91ed5629ae66b9c1f8 Mon Sep 17 00:00:00 2001
From: Felix Kunde
Date: Thu, 26 Sep 2024 15:09:58 +0200
Subject: [PATCH] enable hourly log ship mode (#1025)

---------

Co-authored-by: Polina Bungina <27892524+hughcapet@users.noreply.github.com>
---
 ENVIRONMENT.rst                               |  3 +-
 .../bootstrap/maybe_pg_upgrade.py             |  2 +-
 postgres-appliance/launch.sh                  | 12 +++-
 postgres-appliance/scripts/configure_spilo.py | 26 ++++++-
 postgres-appliance/scripts/post_init.sh       | 70 +++++++++++++++----
 .../scripts/upload_pg_log_to_s3.py            | 22 ++++--
 postgres-appliance/tests/test_spilo.sh        | 35 +++++++++-
 7 files changed, 142 insertions(+), 28 deletions(-)

diff --git a/ENVIRONMENT.rst b/ENVIRONMENT.rst
index 158b9a868..f791a75de 100644
--- a/ENVIRONMENT.rst
+++ b/ENVIRONMENT.rst
@@ -91,7 +91,8 @@ Environment Configuration Settings
 - **CALLBACK_SCRIPT**: the callback script to run on various cluster actions (on start, on stop, on restart, on role change). The script will receive the cluster name, connection string and the current action. See `Patroni `__ documentation for details.
 - **LOG_S3_BUCKET**: path to the S3 bucket used for PostgreSQL daily log files (i.e. foobar, without `s3://` prefix). Spilo will add `/spilo/{LOG_BUCKET_SCOPE_PREFIX}{SCOPE}{LOG_BUCKET_SCOPE_SUFFIX}/log/` to that path. Logs are shipped if this variable is set.
 - **LOG_S3_TAGS**: map of key value pairs to be used for tagging files uploaded to S3. Values should be referencing existing environment variables e.g. ``{"ClusterName": "SCOPE", "Namespace": "POD_NAMESPACE"}``
-- **LOG_SHIP_SCHEDULE**: cron schedule for shipping compressed logs from ``pg_log`` (if this feature is enabled, '00 02 * * *' by default)
+- **LOG_SHIP_HOURLY**: if ``true``, Postgres log rotation is set to 1h, a foreign log table is created for every hour of the day, and logs are shipped on the cron schedule ``1 */1 * * *``
+- **LOG_SHIP_SCHEDULE**: cron schedule for shipping compressed logs from ``pg_log`` (``1 0 * * *`` by default)
 - **LOG_ENV_DIR**: directory to store environment variables necessary for log shipping
 - **LOG_TMPDIR**: directory to store temporary compressed daily log files. PGROOT/../tmp by default.
 - **LOG_S3_ENDPOINT**: (optional) S3 Endpoint to use with Boto3
diff --git a/postgres-appliance/bootstrap/maybe_pg_upgrade.py b/postgres-appliance/bootstrap/maybe_pg_upgrade.py
index b7ffae5c6..4f36e6953 100644
--- a/postgres-appliance/bootstrap/maybe_pg_upgrade.py
+++ b/postgres-appliance/bootstrap/maybe_pg_upgrade.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python
+import glob
 import logging
 import os
 import subprocess
 import sys
-import glob

 logger = logging.getLogger(__name__)

diff --git a/postgres-appliance/launch.sh b/postgres-appliance/launch.sh
index a5f7e60ae..1e3e54f30 100755
--- a/postgres-appliance/launch.sh
+++ b/postgres-appliance/launch.sh
@@ -34,8 +34,16 @@ fi
 ## Ensure all logfiles exist, most appliances will have
 ## a foreign data wrapper pointing to these files
 for i in $(seq 0 7); do
-  if [ ! -f "${PGLOG}/postgresql-$i.csv" ]; then
-    touch "${PGLOG}/postgresql-$i.csv"
+  if [ "$LOG_SHIP_HOURLY" != "true" ]; then
+    if [ ! -f "${PGLOG}/postgresql-${i}.csv" ]; then
+      touch "${PGLOG}/postgresql-${i}.csv"
+    fi
+  else
+    for h in $(seq -w 0 23); do
+      if [ ! -f "${PGLOG}/postgresql-${i}-${h}.csv" ]; then
+        touch "${PGLOG}/postgresql-${i}-${h}.csv"
+      fi
+    done
   fi
 done
 chown -R postgres: "$PGROOT" "$RW_DIR/certs"
diff --git a/postgres-appliance/scripts/configure_spilo.py b/postgres-appliance/scripts/configure_spilo.py
index eb4f5e1db..fa2f3ca9c 100755
--- a/postgres-appliance/scripts/configure_spilo.py
+++ b/postgres-appliance/scripts/configure_spilo.py
@@ -22,7 +22,7 @@
 import pystache
 import requests

-from spilo_commons import RW_DIR, PATRONI_CONFIG_FILE, append_extensions,\
+from spilo_commons import RW_DIR, PATRONI_CONFIG_FILE, append_extensions, \
     get_binary_version, get_bin_dir, is_valid_pg_version, write_file, write_patroni_config


@@ -292,9 +292,19 @@ def deep_update(a, b):
     logging_collector: 'on'
     log_destination: csvlog
     log_directory: ../pg_log
+    {{#LOG_SHIP_HOURLY}}
+    log_filename: 'postgresql-%u-%H.log'
+    {{/LOG_SHIP_HOURLY}}
+    {{^LOG_SHIP_HOURLY}}
     log_filename: 'postgresql-%u.log'
+    {{/LOG_SHIP_HOURLY}}
     log_file_mode: '0644'
+    {{#LOG_SHIP_HOURLY}}
+    log_rotation_age: '1h'
+    {{/LOG_SHIP_HOURLY}}
+    {{^LOG_SHIP_HOURLY}}
     log_rotation_age: '1d'
+    {{/LOG_SHIP_HOURLY}}
     log_truncate_on_rotation: 'on'
     ssl: 'on'
     {{#SSL_CA_FILE}}
@@ -580,6 +590,7 @@ def get_placeholders(provider):
     placeholders.setdefault('CLONE_TARGET_INCLUSIVE', True)

     placeholders.setdefault('LOG_GROUP_BY_DATE', False)
+    placeholders.setdefault('LOG_SHIP_HOURLY', '')
     placeholders.setdefault('LOG_SHIP_SCHEDULE', '1 0 * * *')
     placeholders.setdefault('LOG_S3_BUCKET', '')
     placeholders.setdefault('LOG_S3_ENDPOINT', '')
@@ -587,6 +598,10 @@ def get_placeholders(provider):
     placeholders.setdefault('LOG_TMPDIR', os.path.abspath(os.path.join(placeholders['PGROOT'], '../tmp')))
     placeholders.setdefault('LOG_BUCKET_SCOPE_SUFFIX', '')

+    # only accept 'true'/'TRUE' as value; anything else leaves the placeholder empty, i.e. hourly mode stays disabled
+    if placeholders['LOG_SHIP_HOURLY']:
+        placeholders['LOG_SHIP_HOURLY'] = os.environ.get('LOG_SHIP_HOURLY', '') in ['true', 'TRUE']
+
     # see comment for wal-e bucket prefix
     placeholders.setdefault('LOG_BUCKET_SCOPE_PREFIX', '{0}-'.format(placeholders['NAMESPACE'])
                             if placeholders['NAMESPACE'] not in ('default', '') else '')
@@ -773,6 +788,7 @@ def write_log_environment(placeholders):
         os.makedirs(log_env['LOG_ENV_DIR'])

     for var in ('LOG_TMPDIR',
+                'LOG_SHIP_HOURLY',
                 'LOG_AWS_REGION',
                 'LOG_S3_ENDPOINT',
                 'LOG_S3_KEY',
@@ -1008,8 +1024,12 @@ def write_crontab(placeholders, overwrite):
                ' "{PGDATA}"').format(**placeholders)]

     if bool(placeholders.get('LOG_S3_BUCKET')):
-        lines += [('{LOG_SHIP_SCHEDULE} nice -n 5 envdir "{LOG_ENV_DIR}"' +
-                   ' /scripts/upload_pg_log_to_s3.py').format(**placeholders)]
+        log_dir = placeholders.get('LOG_ENV_DIR')
+        schedule = placeholders.get('LOG_SHIP_SCHEDULE')
+        if placeholders.get('LOG_SHIP_HOURLY') == 'true':
+            schedule = '1 */1 * * *'
+        lines += [('{0} nice -n 5 envdir "{1}"' +
+                   ' /scripts/upload_pg_log_to_s3.py').format(schedule, log_dir)]

     lines += yaml.safe_load(placeholders['CRONTAB'])

diff --git a/postgres-appliance/scripts/post_init.sh b/postgres-appliance/scripts/post_init.sh
index 407d9b982..0950d04ef 100755
--- a/postgres-appliance/scripts/post_init.sh
+++ b/postgres-appliance/scripts/post_init.sh
@@ -147,19 +147,61 @@ if [ "$PGVER" -ge 14 ]; then
 fi

 # Sunday could be 0 or 7 depending on the format, we just create both
+LOG_SHIP_HOURLY=$(echo "SELECT text(current_setting('log_rotation_age') = '1h')" | psql -tAX -d postgres 2> /dev/null | tail -n 1)
+if [ "$LOG_SHIP_HOURLY" != "true" ]; then
+    tbl_regex='postgres_log_\d_\d{2}$'
+else
+    tbl_regex='postgres_log_\d$'
+fi
+echo "DO \$\$DECLARE tbl_name TEXT;
+    BEGIN
+        FOR tbl_name IN SELECT 'public' || '.' || quote_ident(relname) FROM pg_class
+            WHERE relname ~ '${tbl_regex}' AND relnamespace = 'public'::pg_catalog.regnamespace AND relkind = 'f'
+        LOOP
+            IF tbl_name IS NOT NULL THEN
+                EXECUTE format('DROP FOREIGN TABLE IF EXISTS %s CASCADE', tbl_name);
+            END IF;
+        END LOOP;
+END;\$\$;"
+
 for i in $(seq 0 7); do
-    echo "CREATE FOREIGN TABLE IF NOT EXISTS public.postgres_log_$i () INHERITS (public.postgres_log) SERVER pglog
-    OPTIONS (filename '../pg_log/postgresql-$i.csv', format 'csv', header 'false');
-GRANT SELECT ON public.postgres_log_$i TO admin;
-
-CREATE OR REPLACE VIEW public.failed_authentication_$i WITH (security_barrier) AS
-SELECT *
-  FROM public.postgres_log_$i
- WHERE command_tag = 'authentication'
-   AND error_severity = 'FATAL';
-ALTER VIEW public.failed_authentication_$i OWNER TO postgres;
-GRANT SELECT ON TABLE public.failed_authentication_$i TO robot_zmon;
-"
+    if [ "$LOG_SHIP_HOURLY" != "true" ]; then
+        echo "CREATE FOREIGN TABLE IF NOT EXISTS public.postgres_log_${i} () INHERITS (public.postgres_log) SERVER pglog
+        OPTIONS (filename '../pg_log/postgresql-${i}.csv', format 'csv', header 'false');
+        GRANT SELECT ON public.postgres_log_${i} TO admin;
+
+        CREATE OR REPLACE VIEW public.failed_authentication_${i} WITH (security_barrier) AS
+        SELECT *
+          FROM public.postgres_log_${i}
+         WHERE command_tag = 'authentication'
+           AND error_severity = 'FATAL';
+        ALTER VIEW public.failed_authentication_${i} OWNER TO postgres;
+        GRANT SELECT ON TABLE public.failed_authentication_${i} TO robot_zmon;"
+    else
+        daily_log="CREATE OR REPLACE VIEW public.postgres_log_${i} AS\n"
+        daily_auth="CREATE OR REPLACE VIEW public.failed_authentication_${i} WITH (security_barrier) AS\n"
+        daily_union=""
+
+        for h in $(seq -w 0 23); do
+            filter_logs="SELECT * FROM public.postgres_log_${i}_${h} WHERE command_tag = 'authentication' AND error_severity = 'FATAL'"
+
+            echo "CREATE FOREIGN TABLE IF NOT EXISTS public.postgres_log_${i}_${h} () INHERITS (public.postgres_log) SERVER pglog
+            OPTIONS (filename '../pg_log/postgresql-${i}-${h}.csv', format 'csv', header 'false');
+            GRANT SELECT ON public.postgres_log_${i}_${h} TO admin;
+
+            CREATE OR REPLACE VIEW public.failed_authentication_${i}_${h} WITH (security_barrier) AS
+            ${filter_logs};
+            ALTER VIEW public.failed_authentication_${i}_${h} OWNER TO postgres;
+            GRANT SELECT ON TABLE public.failed_authentication_${i}_${h} TO robot_zmon;"
+
+            daily_log="${daily_log}${daily_union}SELECT * FROM public.postgres_log_${i}_${h}\n"
+            daily_auth="${daily_auth}${daily_union}${filter_logs}\n"
+            daily_union="UNION ALL\n"
+        done
+
+        echo -e "${daily_log};"
+        echo -e "${daily_auth};"
+    fi
 done

 cat _zmon_schema.dump
@@ -176,10 +218,10 @@ while IFS= read -r db_name; do
     if [ "$UPGRADE_TIMESCALEDB_TOOLKIT" = "t" ]; then
         echo "ALTER EXTENSION timescaledb_toolkit UPDATE;"
     fi
-    UPGRADE_POSTGIS=$(echo -e "SELECT COUNT(*) FROM pg_catalog.pg_extension WHERE extname = 'postgis'" | psql -tAX -d "${db_name}" 2> /dev/null | tail -n 1)
+    UPGRADE_POSTGIS=$(echo "SELECT COUNT(*) FROM pg_catalog.pg_extension WHERE extname = 'postgis'" | psql -tAX -d "${db_name}" 2> /dev/null | tail -n 1)
     if [ "$UPGRADE_POSTGIS" = "1" ]; then
         # public.postgis_lib_version() is available only if postgis extension is created
-        UPGRADE_POSTGIS=$(echo -e "SELECT extversion != public.postgis_lib_version() FROM pg_catalog.pg_extension WHERE extname = 'postgis'" | psql -tAX -d "${db_name}" 2> /dev/null | tail -n 1)
+        UPGRADE_POSTGIS=$(echo "SELECT extversion != public.postgis_lib_version() FROM pg_catalog.pg_extension WHERE extname = 'postgis'" | psql -tAX -d "${db_name}" 2> /dev/null | tail -n 1)
         if [ "$UPGRADE_POSTGIS" = "t" ]; then
             echo "ALTER EXTENSION postgis UPDATE;"
             echo "SELECT public.postgis_extensions_upgrade();"
diff --git a/postgres-appliance/scripts/upload_pg_log_to_s3.py b/postgres-appliance/scripts/upload_pg_log_to_s3.py
index 3bf5cda75..39966c355 100755
--- a/postgres-appliance/scripts/upload_pg_log_to_s3.py
+++ b/postgres-appliance/scripts/upload_pg_log_to_s3.py
@@ -16,15 +16,25 @@
 logger = logging.getLogger(__name__)


-def compress_pg_log():
-    yesterday = datetime.now() - timedelta(days=1)
-    yesterday_day_number = yesterday.strftime('%u')
+def get_file_names():
+    prev_interval = datetime.now() - timedelta(days=1)
+    prev_interval_number = prev_interval.strftime('%u')
+
+    if os.getenv('LOG_SHIP_HOURLY') == 'true':
+        prev_interval = datetime.now() - timedelta(hours=1)
+        prev_interval_number = prev_interval.strftime('%u-%H')
+
+    log_file = os.path.join(os.getenv('PGLOG'), 'postgresql-' + prev_interval_number + '.csv')
+    archived_log_file = os.path.join(os.getenv('LOG_TMPDIR'), prev_interval.strftime('%F-%H') + '.csv.gz')

-    log_file = os.path.join(os.getenv('PGLOG'), 'postgresql-' + yesterday_day_number + '.csv')
-    archived_log_file = os.path.join(os.getenv('LOG_TMPDIR'), yesterday.strftime('%F') + '.csv.gz')
+    return log_file, archived_log_file
+
+
+def compress_pg_log():
+    log_file, archived_log_file = get_file_names()

     if os.path.getsize(log_file) == 0:
-        logger.warning("Postgres log from yesterday '%s' is empty.", log_file)
+        logger.warning("Postgres log '%s' is empty.", log_file)
         sys.exit(0)

     try:
diff --git a/postgres-appliance/tests/test_spilo.sh b/postgres-appliance/tests/test_spilo.sh
index 478c44640..d84bdc4a6 100755
--- a/postgres-appliance/tests/test_spilo.sh
+++ b/postgres-appliance/tests/test_spilo.sh
@@ -227,6 +227,19 @@ function start_clone_with_basebackup_upgrade_container() {
         -d spilo3
 }

+function start_clone_with_hourly_log_rotation() {
+    docker-compose run \
+        -e SCOPE=hourlylogs \
+        -e PGVERSION=16 \
+        -e LOG_SHIP_HOURLY="true" \
+        -e CLONE_SCOPE=upgrade2 \
+        -e CLONE_PGVERSION=14 \
+        -e CLONE_METHOD=CLONE_WITH_WALE \
+        -e CLONE_TARGET_TIME="$(next_minute)" \
+        --name "${PREFIX}hourlylogs" \
+        -d "spilo3"
+}
+
 function verify_clone_upgrade() {
     local type=$2
     local from_version=$3
@@ -241,6 +254,16 @@ function verify_archive_mode_is_on() {
     [ "$archive_mode" = "on" ]
 }

+function verify_hourly_log_rotation() {
+    log_rotation_age=$(docker_exec "$1" "psql -U postgres -tAc \"SHOW log_rotation_age\"")
+    log_filename=$(docker_exec "$1" "psql -U postgres -tAc \"SHOW log_filename\"")
+    # we expect 8x24 = 192 hourly foreign tables, 8 daily postgres_log views, and 192 + 8 = 200 failed_authentication views
+    postgres_log_ftables=$(docker_exec "$1" "psql -U postgres -tAc \"SELECT count(*) FROM pg_foreign_table WHERE ftrelid::regclass::text LIKE 'postgres_log_%'\"")
+    postgres_log_views=$(docker_exec "$1" "psql -U postgres -tAc \"SELECT count(*) FROM pg_views WHERE viewname LIKE 'postgres_log_%'\"")
+    postgres_failed_auth_views=$(docker_exec "$1" "psql -U postgres -tAc \"SELECT count(*) FROM pg_views WHERE viewname LIKE 'failed_authentication_%'\"")
+
+    [ "$log_rotation_age" = "1h" ] && [ "$log_filename" = "postgresql-%u-%H.log" ] && [ "$postgres_log_ftables" -eq 192 ] && [ "$postgres_log_views" -eq 8 ] && [ "$postgres_failed_auth_views" -eq 200 ]
+}

 # TEST SUITE 1 - In-place major upgrade 12->13->...->16
 # TEST SUITE 2 - Major upgrade 12->16 after wal-e clone (with CLONE_PGVERSION set)
@@ -248,6 +271,7 @@ function verify_archive_mode_is_on() {
 # TEST SUITE 4 - Major upgrade 12->13 after wal-e clone (no CLONE_PGVERSION)
 # TEST SUITE 5 - Replica bootstrap with wal-e
 # TEST SUITE 6 - Major upgrade 13->14 after clone with basebackup
+# TEST SUITE 7 - Hourly log rotation
 function test_spilo() {
     # TEST SUITE 1
     local container=$1
@@ -344,7 +368,7 @@ function test_spilo() {
     local basebackup_container
     basebackup_container=$(start_clone_with_basebackup_upgrade_container "$upgrade_container")  # SCOPE=upgrade2 PGVERSION=14 CLONE: _SCOPE=upgrade
     log_info "[TS6] Started $basebackup_container for testing major upgrade 13->14 after clone with basebackup"
-    
+
     wait_backup "$basebackup_container"
     # TEST SUITE 1
     # run_test test_pg_upgrade_to_16_check_failed "$container"  # pg_upgrade --check complains about timescaledb
@@ -362,11 +386,20 @@ function test_spilo() {
     log_info "[TS5] Waiting for postgres to start in the $upgrade_replica_container and stream from primary..."
     wait_all_streaming "$upgrade_container" 1

+    # TEST SUITE 7
+    local hourlylogs_container
+    hourlylogs_container=$(start_clone_with_hourly_log_rotation "$upgrade_container")
+    log_info "[TS7] Started $hourlylogs_container for testing hourly log rotation"
     # TEST SUITE 6
     log_info "[TS6] Testing in-place major upgrade 13->14 after clone with basebackup"
     run_test verify_clone_upgrade "$basebackup_container" "basebackup" 13 14
     run_test verify_archive_mode_is_on "$basebackup_container"
+
+    # TEST SUITE 7
+    find_leader "$hourlylogs_container"
+    log_info "[TS7] Testing correct setup with hourly log rotation"
+    run_test verify_hourly_log_rotation "$hourlylogs_container"

 }

 function main() {