enable hourly log ship mode (#1025)
---------

Co-authored-by: Polina Bungina <[email protected]>
FxKu and hughcapet authored Sep 26, 2024
1 parent fde34d4 commit efd6eb4
Showing 7 changed files with 142 additions and 28 deletions.
3 changes: 2 additions & 1 deletion ENVIRONMENT.rst
@@ -91,7 +91,8 @@ Environment Configuration Settings
- **CALLBACK_SCRIPT**: the callback script to run on various cluster actions (on start, on stop, on restart, on role change). The script will receive the cluster name, connection string and the current action. See `Patroni <http://patroni.readthedocs.io/en/latest/SETTINGS.html?highlight=callback#postgresql>`__ documentation for details.
- **LOG_S3_BUCKET**: path to the S3 bucket used for PostgreSQL daily log files (i.e. foobar, without `s3://` prefix). Spilo will add `/spilo/{LOG_BUCKET_SCOPE_PREFIX}{SCOPE}{LOG_BUCKET_SCOPE_SUFFIX}/log/` to that path. Logs are shipped if this variable is set.
- **LOG_S3_TAGS**: map of key value pairs to be used for tagging files uploaded to S3. Values should be referencing existing environment variables e.g. ``{"ClusterName": "SCOPE", "Namespace": "POD_NAMESPACE"}``
- **LOG_SHIP_SCHEDULE**: cron schedule for shipping compressed logs from ``pg_log`` (if this feature is enabled, '00 02 * * *' by default)
- **LOG_SHIP_HOURLY**: if ``true``, Postgres log rotation is set to one hour and a foreign table is created for every hour of the day; log shipping then runs on the schedule ``1 */1 * * *``
- **LOG_SHIP_SCHEDULE**: cron schedule for shipping compressed logs from ``pg_log`` (``1 0 * * *`` by default)
- **LOG_ENV_DIR**: directory to store environment variables necessary for log shipping
- **LOG_TMPDIR**: directory to store temporary compressed daily log files. PGROOT/../tmp by default.
- **LOG_S3_ENDPOINT**: (optional) S3 Endpoint to use with Boto3
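
For illustration, the upload prefix described under **LOG_S3_BUCKET** can be composed like this; a minimal sketch, assuming plain string values (the helper name is hypothetical, not part of Spilo):

import os

def log_upload_prefix():  # hypothetical helper, for illustration only
    bucket = os.environ['LOG_S3_BUCKET']                    # e.g. 'foobar', no 's3://'
    prefix = os.environ.get('LOG_BUCKET_SCOPE_PREFIX', '')  # e.g. 'mynamespace-'
    suffix = os.environ.get('LOG_BUCKET_SCOPE_SUFFIX', '')
    return 's3://{0}/spilo/{1}{2}{3}/log/'.format(bucket, prefix, os.environ['SCOPE'], suffix)
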
2 changes: 1 addition & 1 deletion postgres-appliance/bootstrap/maybe_pg_upgrade.py
@@ -1,9 +1,9 @@
#!/usr/bin/env python
import glob
import logging
import os
import subprocess
import sys
import glob

logger = logging.getLogger(__name__)

12 changes: 10 additions & 2 deletions postgres-appliance/launch.sh
@@ -34,8 +34,16 @@ fi
## Ensure all logfiles exist, most appliances will have
## a foreign data wrapper pointing to these files
for i in $(seq 0 7); do
if [ ! -f "${PGLOG}/postgresql-$i.csv" ]; then
touch "${PGLOG}/postgresql-$i.csv"
if [ "$LOG_SHIP_HOURLY" != "true" ]; then
if [ ! -f "${PGLOG}/postgresql-${i}.csv" ]; then
touch "${PGLOG}/postgresql-${i}.csv"
fi
else
for h in $(seq -w 0 23); do
if [ ! -f "${PGLOG}/postgresql-${i}-${h}.csv" ]; then
touch "${PGLOG}/postgresql-${i}-${h}.csv"
fi
done
fi
done
chown -R postgres: "$PGROOT" "$RW_DIR/certs"
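
A quick sketch of the file sets this loop pre-creates, mirroring its two branches: 8 daily files for weekday slots 0-7, or 8 x 24 hourly files when LOG_SHIP_HOURLY is true:

daily = ['postgresql-{0}.csv'.format(i) for i in range(8)]
hourly = ['postgresql-{0}-{1:02d}.csv'.format(i, h) for i in range(8) for h in range(24)]
assert len(daily) == 8 and len(hourly) == 192
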
26 changes: 23 additions & 3 deletions postgres-appliance/scripts/configure_spilo.py
@@ -22,7 +22,7 @@
import pystache
import requests

from spilo_commons import RW_DIR, PATRONI_CONFIG_FILE, append_extensions,\
from spilo_commons import RW_DIR, PATRONI_CONFIG_FILE, append_extensions, \
get_binary_version, get_bin_dir, is_valid_pg_version, write_file, write_patroni_config


@@ -292,9 +292,19 @@ def deep_update(a, b):
logging_collector: 'on'
log_destination: csvlog
log_directory: ../pg_log
{{#LOG_SHIP_HOURLY}}
log_filename: 'postgresql-%u-%H.log'
{{/LOG_SHIP_HOURLY}}
{{^LOG_SHIP_HOURLY}}
log_filename: 'postgresql-%u.log'
{{/LOG_SHIP_HOURLY}}
log_file_mode: '0644'
{{#LOG_SHIP_HOURLY}}
log_rotation_age: '1h'
{{/LOG_SHIP_HOURLY}}
{{^LOG_SHIP_HOURLY}}
log_rotation_age: '1d'
{{/LOG_SHIP_HOURLY}}
log_truncate_on_rotation: 'on'
ssl: 'on'
{{#SSL_CA_FILE}}
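
The paired {{#LOG_SHIP_HOURLY}}/{{^LOG_SHIP_HOURLY}} sections are mutually exclusive, so exactly one log_filename and one log_rotation_age survive rendering; a minimal pystache check (the values are illustrative):

import pystache

template = ("{{#LOG_SHIP_HOURLY}}log_filename: 'postgresql-%u-%H.log'{{/LOG_SHIP_HOURLY}}"
            "{{^LOG_SHIP_HOURLY}}log_filename: 'postgresql-%u.log'{{/LOG_SHIP_HOURLY}}")
print(pystache.render(template, {'LOG_SHIP_HOURLY': True}))  # hourly pattern
print(pystache.render(template, {'LOG_SHIP_HOURLY': ''}))    # daily pattern
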
@@ -580,13 +590,18 @@ def get_placeholders(provider):
placeholders.setdefault('CLONE_TARGET_INCLUSIVE', True)

placeholders.setdefault('LOG_GROUP_BY_DATE', False)
placeholders.setdefault('LOG_SHIP_HOURLY', '')
placeholders.setdefault('LOG_SHIP_SCHEDULE', '1 0 * * *')
placeholders.setdefault('LOG_S3_BUCKET', '')
placeholders.setdefault('LOG_S3_ENDPOINT', '')
placeholders.setdefault('LOG_S3_TAGS', '{}')
placeholders.setdefault('LOG_TMPDIR', os.path.abspath(os.path.join(placeholders['PGROOT'], '../tmp')))
placeholders.setdefault('LOG_BUCKET_SCOPE_SUFFIX', '')

# only accept true as value or else it will be empty = disabled
if placeholders['LOG_SHIP_HOURLY']:
placeholders['LOG_SHIP_HOURLY'] = os.environ.get('LOG_SHIP_HOURLY', '') in ['true', 'TRUE']

# see comment for wal-e bucket prefix
placeholders.setdefault('LOG_BUCKET_SCOPE_PREFIX', '{0}-'.format(placeholders['NAMESPACE'])
if placeholders['NAMESPACE'] not in ('default', '') else '')
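
The LOG_SHIP_HOURLY normalization above accepts only the exact strings true/TRUE; a quick sketch of that membership test in isolation:

values = ['true', 'TRUE', 'True', '1', 'yes', '']
print({v: v in ['true', 'TRUE'] for v in values})
# {'true': True, 'TRUE': True, 'True': False, '1': False, 'yes': False, '': False}
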
@@ -773,6 +788,7 @@ def write_log_environment(placeholders):
os.makedirs(log_env['LOG_ENV_DIR'])

for var in ('LOG_TMPDIR',
'LOG_SHIP_HOURLY',
'LOG_AWS_REGION',
'LOG_S3_ENDPOINT',
'LOG_S3_KEY',
@@ -1008,8 +1024,12 @@ def write_crontab(placeholders, overwrite):
' "{PGDATA}"').format(**placeholders)]

if bool(placeholders.get('LOG_S3_BUCKET')):
lines += [('{LOG_SHIP_SCHEDULE} nice -n 5 envdir "{LOG_ENV_DIR}"' +
' /scripts/upload_pg_log_to_s3.py').format(**placeholders)]
log_dir = placeholders.get('LOG_ENV_DIR')
schedule = placeholders.get('LOG_SHIP_SCHEDULE')
if placeholders.get('LOG_SHIP_HOURLY') == 'true':
schedule = '1 */1 * * *'
lines += [('{0} nice -n 5 envdir "{1}"' +
' /scripts/upload_pg_log_to_s3.py').format(schedule, log_dir)]

lines += yaml.safe_load(placeholders['CRONTAB'])
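
With LOG_SHIP_HOURLY enabled, the generated crontab entry would look roughly like this; a sketch with an illustrative LOG_ENV_DIR value:

schedule = '1 */1 * * *'        # forced when LOG_SHIP_HOURLY is 'true'
log_dir = '/run/etc/log.d/env'  # illustrative LOG_ENV_DIR, not the actual default
print('{0} nice -n 5 envdir "{1}" /scripts/upload_pg_log_to_s3.py'.format(schedule, log_dir))
# 1 */1 * * * nice -n 5 envdir "/run/etc/log.d/env" /scripts/upload_pg_log_to_s3.py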

70 changes: 56 additions & 14 deletions postgres-appliance/scripts/post_init.sh
@@ -147,19 +147,61 @@ if [ "$PGVER" -ge 14 ]; then
fi

# Sunday could be 0 or 7 depending on the format, we just create both
LOG_SHIP_HOURLY=$(echo "SELECT text(current_setting('log_rotation_age') = '1h')" | psql -tAX -d postgres 2> /dev/null | tail -n 1)
if [ "$LOG_SHIP_HOURLY" != "true" ]; then
tbl_regex='postgres_log_\d_\d{2}$'
else
tbl_regex='postgres_log_\d$'
fi
echo "DO \$\$DECLARE tbl_name TEXT;
BEGIN
FOR tbl_name IN SELECT 'public' || '.' || quote_ident(relname) FROM pg_class
WHERE relname ~ '${tbl_regex}' AND relnamespace = 'public'::pg_catalog.regnamespace AND relkind = 'f'
LOOP
IF tbl_name IS NOT NULL THEN
EXECUTE format('DROP FOREIGN TABLE IF EXISTS %s CASCADE', tbl_name);
END IF;
END LOOP;
END;\$\$;"

for i in $(seq 0 7); do
echo "CREATE FOREIGN TABLE IF NOT EXISTS public.postgres_log_$i () INHERITS (public.postgres_log) SERVER pglog
OPTIONS (filename '../pg_log/postgresql-$i.csv', format 'csv', header 'false');
GRANT SELECT ON public.postgres_log_$i TO admin;
CREATE OR REPLACE VIEW public.failed_authentication_$i WITH (security_barrier) AS
SELECT *
FROM public.postgres_log_$i
WHERE command_tag = 'authentication'
AND error_severity = 'FATAL';
ALTER VIEW public.failed_authentication_$i OWNER TO postgres;
GRANT SELECT ON TABLE public.failed_authentication_$i TO robot_zmon;
"
if [ "$LOG_SHIP_HOURLY" != "true" ]; then
echo "CREATE FOREIGN TABLE IF NOT EXISTS public.postgres_log_${i} () INHERITS (public.postgres_log) SERVER pglog
OPTIONS (filename '../pg_log/postgresql-${i}.csv', format 'csv', header 'false');
GRANT SELECT ON public.postgres_log_${i} TO admin;
CREATE OR REPLACE VIEW public.failed_authentication_${i} WITH (security_barrier) AS
SELECT *
FROM public.postgres_log_${i}
WHERE command_tag = 'authentication'
AND error_severity = 'FATAL';
ALTER VIEW public.failed_authentication_${i} OWNER TO postgres;
GRANT SELECT ON TABLE public.failed_authentication_${i} TO robot_zmon;"
else
daily_log="CREATE OR REPLACE VIEW public.postgres_log_${i} AS\n"
daily_auth="CREATE OR REPLACE VIEW public.failed_authentication_${i} WITH (security_barrier) AS\n"
daily_union=""

for h in $(seq -w 0 23); do
filter_logs="SELECT * FROM public.postgres_log_${i}_${h} WHERE command_tag = 'authentication' AND error_severity = 'FATAL'"

echo "CREATE FOREIGN TABLE IF NOT EXISTS public.postgres_log_${i}_${h} () INHERITS (public.postgres_log) SERVER pglog
OPTIONS (filename '../pg_log/postgresql-${i}-${h}.csv', format 'csv', header 'false');
GRANT SELECT ON public.postgres_log_${i}_${h} TO admin;
CREATE OR REPLACE VIEW public.failed_authentication_${i}_${h} WITH (security_barrier) AS
${filter_logs};
ALTER VIEW public.failed_authentication_${i}_${h} OWNER TO postgres;
GRANT SELECT ON TABLE public.failed_authentication_${i}_${h} TO robot_zmon;"

daily_log="${daily_log}${daily_union}SELECT * FROM public.postgres_log_${i}_${h}\n"
daily_auth="${daily_auth}${daily_union}${filter_logs}\n"
daily_union="UNION ALL\n"
done

echo -e "${daily_log};"
echo -e "${daily_auth};"
fi
done

cat _zmon_schema.dump
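
Two notes on the block above. The cleanup regexes target the opposite naming scheme: hourly-style names (postgres_log_<d>_<hh>) are dropped when hourly mode is off, daily-style names (postgres_log_<d>) when it is on, so flipping LOG_SHIP_HOURLY does not leave stale foreign tables behind. In hourly mode, each daily view then simply glues the 24 hourly tables together; a sketch of the SQL the loop emits for weekday 0 (mirroring the shell string building, not output captured from the script):

parts = ['SELECT * FROM public.postgres_log_0_{0:02d}'.format(h) for h in range(24)]
print('CREATE OR REPLACE VIEW public.postgres_log_0 AS\n'
      + '\nUNION ALL\n'.join(parts) + ';')
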
@@ -176,10 +218,10 @@ while IFS= read -r db_name; do
if [ "$UPGRADE_TIMESCALEDB_TOOLKIT" = "t" ]; then
echo "ALTER EXTENSION timescaledb_toolkit UPDATE;"
fi
UPGRADE_POSTGIS=$(echo -e "SELECT COUNT(*) FROM pg_catalog.pg_extension WHERE extname = 'postgis'" | psql -tAX -d "${db_name}" 2> /dev/null | tail -n 1)
UPGRADE_POSTGIS=$(echo "SELECT COUNT(*) FROM pg_catalog.pg_extension WHERE extname = 'postgis'" | psql -tAX -d "${db_name}" 2> /dev/null | tail -n 1)
if [ "$UPGRADE_POSTGIS" = "1" ]; then
# public.postgis_lib_version() is available only if postgis extension is created
UPGRADE_POSTGIS=$(echo -e "SELECT extversion != public.postgis_lib_version() FROM pg_catalog.pg_extension WHERE extname = 'postgis'" | psql -tAX -d "${db_name}" 2> /dev/null | tail -n 1)
UPGRADE_POSTGIS=$(echo "SELECT extversion != public.postgis_lib_version() FROM pg_catalog.pg_extension WHERE extname = 'postgis'" | psql -tAX -d "${db_name}" 2> /dev/null | tail -n 1)
if [ "$UPGRADE_POSTGIS" = "t" ]; then
echo "ALTER EXTENSION postgis UPDATE;"
echo "SELECT public.postgis_extensions_upgrade();"
22 changes: 16 additions & 6 deletions postgres-appliance/scripts/upload_pg_log_to_s3.py
@@ -16,15 +16,25 @@
logger = logging.getLogger(__name__)


def compress_pg_log():
yesterday = datetime.now() - timedelta(days=1)
yesterday_day_number = yesterday.strftime('%u')
def get_file_names():
prev_interval = datetime.now() - timedelta(days=1)
prev_interval_number = prev_interval.strftime('%u')

if os.getenv('LOG_SHIP_HOURLY') == 'true':
prev_interval = datetime.now() - timedelta(hours=1)
prev_interval_number = prev_interval.strftime('%u-%H')

log_file = os.path.join(os.getenv('PGLOG'), 'postgresql-' + prev_interval_number + '.csv')
archived_log_file = os.path.join(os.getenv('LOG_TMPDIR'), prev_interval.strftime('%F-%H') + '.csv.gz')

log_file = os.path.join(os.getenv('PGLOG'), 'postgresql-' + yesterday_day_number + '.csv')
archived_log_file = os.path.join(os.getenv('LOG_TMPDIR'), yesterday.strftime('%F') + '.csv.gz')
return log_file, archived_log_file


def compress_pg_log():
log_file, archived_log_file = get_file_names()

if os.path.getsize(log_file) == 0:
logger.warning("Postgres log from yesterday '%s' is empty.", log_file)
logger.warning("Postgres log '%s' is empty.", log_file)
sys.exit(0)

try:
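
To see the names get_file_names' date arithmetic produces, a small sketch under both modes (the timestamp is fixed for the example):

from datetime import datetime, timedelta

now = datetime(2024, 9, 26, 14, 30)                  # a Thursday
print((now - timedelta(days=1)).strftime('%u'))      # '3' -> postgresql-3.csv (daily mode)
print((now - timedelta(hours=1)).strftime('%u-%H'))  # '4-13' -> postgresql-4-13.csv (hourly mode)
print((now - timedelta(hours=1)).strftime('%F-%H'))  # '2024-09-26-13' -> archive name stamp
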
35 changes: 34 additions & 1 deletion postgres-appliance/tests/test_spilo.sh
@@ -227,6 +227,19 @@ function start_clone_with_basebackup_upgrade_container() {
-d spilo3
}

function start_clone_with_hourly_log_rotation() {
docker-compose run \
-e SCOPE=hourlylogs \
-e PGVERSION=16 \
-e LOG_SHIP_HOURLY="true" \
-e CLONE_SCOPE=upgrade2 \
-e CLONE_PGVERSION=14 \
-e CLONE_METHOD=CLONE_WITH_WALE \
-e CLONE_TARGET_TIME="$(next_minute)" \
--name "${PREFIX}hourlylogs" \
-d "spilo3"
}

function verify_clone_upgrade() {
local type=$2
local from_version=$3
@@ -241,13 +254,24 @@ function verify_archive_mode_is_on() {
[ "$archive_mode" = "on" ]
}

function verify_hourly_log_rotation() {
log_rotation_age=$(docker_exec "$1" "psql -U postgres -tAc \"SHOW log_rotation_age\"")
log_filename=$(docker_exec "$1" "psql -U postgres -tAc \"SHOW log_filename\"")
# we expect 8x24 foreign tables and views + 8 views for daily logs and failed authentications
postgres_log_ftables=$(docker_exec "$1" "psql -U postgres -tAc \"SELECT count(*) FROM pg_foreign_table WHERE ftrelid::regclass::text LIKE 'postgres_log_%'\"")
postgres_log_views=$(docker_exec "$1" "psql -U postgres -tAc \"SELECT count(*) FROM pg_views WHERE viewname LIKE 'postgres_log_%'\"")
postgres_failed_auth_views=$(docker_exec "$1" "psql -U postgres -tAc \"SELECT count(*) FROM pg_views WHERE viewname LIKE 'failed_authentication_%'\"")

[ "$log_rotation_age" = "1h" ] && [ "$log_filename" = "postgresql-%u-%H.log" ] && [ "$postgres_log_ftables" -eq 192 ] && [ "$postgres_log_views" -eq 8 ] && [ "$postgres_failed_auth_views" -eq 200 ]
}
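
The expected counts follow directly from the hourly layout: 8 weekday slots (0-7, since Sunday can be 0 or 7) times 24 hours of foreign tables, 8 daily union views, and one failed-authentication view per foreign table plus one per day:

ftables = 8 * 24                     # hourly foreign tables
daily_views = 8                      # postgres_log_<d> union views
failed_auth = ftables + daily_views  # failed_authentication views: hourly + daily
assert (ftables, daily_views, failed_auth) == (192, 8, 200)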

# TEST SUITE 1 - In-place major upgrade 12->13->...->16
# TEST SUITE 2 - Major upgrade 12->16 after wal-e clone (with CLONE_PGVERSION set)
# TEST SUITE 3 - PITR (clone with wal-e) with unreachable target (13+)
# TEST SUITE 4 - Major upgrade 12->13 after wal-e clone (no CLONE_PGVERSION)
# TEST SUITE 5 - Replica bootstrap with wal-e
# TEST SUITE 6 - Major upgrade 13->14 after clone with basebackup
# TEST SUITE 7 - Hourly log rotation
function test_spilo() {
# TEST SUITE 1
local container=$1
@@ -344,7 +368,7 @@ function test_spilo() {
local basebackup_container
basebackup_container=$(start_clone_with_basebackup_upgrade_container "$upgrade_container") # SCOPE=upgrade2 PGVERSION=14 CLONE: _SCOPE=upgrade
log_info "[TS6] Started $basebackup_container for testing major upgrade 13->14 after clone with basebackup"

wait_backup "$basebackup_container"

# TEST SUITE 1
# run_test test_pg_upgrade_to_16_check_failed "$container" # pg_upgrade --check complains about timescaledb
@@ -362,11 +386,20 @@
log_info "[TS5] Waiting for postgres to start in the $upgrade_replica_container and stream from primary..."
wait_all_streaming "$upgrade_container" 1

# TEST SUITE 7
local hourlylogs_container
hourlylogs_container=$(start_clone_with_hourly_log_rotation "$upgrade_container")
log_info "[TS7] Started $hourlylogs_container for testing hourly log rotation"

# TEST SUITE 6
log_info "[TS6] Testing in-place major upgrade 13->14 after clone with basebackup"
run_test verify_clone_upgrade "$basebackup_container" "basebackup" 13 14
run_test verify_archive_mode_is_on "$basebackup_container"

# TEST SUITE 7
find_leader "$hourlylogs_container"
log_info "[TS7] Testing correct setup with hourly log rotation"
run_test verify_hourly_log_rotation "$hourlylogs_container"
}

function main() {
