Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Point In Time Recovery #531

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
4c65f78
Add binlog_utils_udf plugin.
Zvirovyi Nov 24, 2024
b8287d7
Enable gtid_mode and enforce_gtid_consistency for the MySQL.
Zvirovyi Nov 24, 2024
d8aabf4
Add S3 compatibility check based on the group replication id.
Zvirovyi Nov 24, 2024
cc61944
Point-in-time-recovery.
Zvirovyi Dec 8, 2024
373fc04
Merge branch 'refs/heads/main' into pitr
Zvirovyi Dec 8, 2024
dd1a2ed
Fix constants.
Zvirovyi Dec 9, 2024
19e05ef
Integration tests.
Zvirovyi Dec 9, 2024
ff4f865
Merge branch 'main' into pitr
Zvirovyi Dec 28, 2024
9979001
Binlogs collector service improvement.
Zvirovyi Jan 4, 2025
170268b
Format restore function.
Zvirovyi Jan 4, 2025
160857e
Use context manager for ca_file in s3_helpers.
Zvirovyi Jan 7, 2025
6776fbf
Rename start_stop_binlogs_collecting to reconcile_binlogs_collection.
Zvirovyi Jan 8, 2025
7b8f6d8
Delete binlogs collector config when not needed.
Zvirovyi Jan 9, 2025
9d3abbf
Improve update_binlogs_collector_config.
Zvirovyi Jan 9, 2025
904ab13
Merge branch 'main' into pitr
Zvirovyi Jan 9, 2025
39aad21
Add restore-to-time validation and format notice.
Zvirovyi Jan 11, 2025
7469ec4
Merge branch 'main' into pitr
Zvirovyi Jan 17, 2025
ddd58e7
Merge branch 'main' into pitr
Zvirovyi Jan 22, 2025
2f8d898
Sync lib changes from VM PR.
Zvirovyi Jan 23, 2025
9c7b122
Merge branch 'main' into pitr
Zvirovyi Jan 23, 2025
e02519a
Improve binlogs collection service.
Zvirovyi Jan 23, 2025
c5e0965
Merge branch 'main' into pitr
Zvirovyi Jan 25, 2025
d8fd9cd
Increment LIBPATCH for libs.
Zvirovyi Jan 25, 2025
9466460
Fix errors after main merge.
Zvirovyi Jan 26, 2025
99ab5a0
Merge branch 'main' into pitr
Zvirovyi Jan 29, 2025
7d4733d
LIBPATCH
Zvirovyi Jan 29, 2025
c656eae
Move binlogs collector config to the env.
Zvirovyi Jan 31, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions actions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ restore:
backup-id:
type: string
description: A backup-id to identify the backup to restore (format = %Y-%m-%dT%H:%M:%SZ)
restore-to-time:
type: string
description: Point-in-time-recovery target (format = %Y-%m-%d %H:%M:%S).

pre-upgrade-check:
description: Run necessary pre-upgrade checks and preparations before executing a charm refresh.
Expand Down
234 changes: 219 additions & 15 deletions lib/charms/mysql/v0/backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,15 @@ def is_unit_blocked(self) -> bool:
import datetime
import logging
import pathlib
import re
import typing
from typing import Dict, List, Optional, Tuple

from charms.data_platform_libs.v0.s3 import S3Requirer
from charms.data_platform_libs.v0.s3 import (
CredentialsChangedEvent,
CredentialsGoneEvent,
S3Requirer,
)
from charms.mysql.v0.mysql import (
MySQLConfigureInstanceError,
MySQLCreateClusterError,
Expand All @@ -67,6 +72,7 @@ def is_unit_blocked(self) -> bool:
MySQLPrepareBackupForRestoreError,
MySQLRescanClusterError,
MySQLRestoreBackupError,
MySQLRestorePitrError,
MySQLRetrieveBackupWithXBCloudError,
MySQLServiceNotRunningError,
MySQLSetInstanceOfflineModeError,
Expand All @@ -76,6 +82,7 @@ def is_unit_blocked(self) -> bool:
MySQLUnableToGetMemberStateError,
)
from charms.mysql.v0.s3_helpers import (
ensure_s3_compatible_group_replication_id,
fetch_and_check_existence_of_s3_path,
list_backups_in_s3_path,
upload_content_to_s3,
Expand All @@ -85,7 +92,11 @@ def is_unit_blocked(self) -> bool:
from ops.jujuversion import JujuVersion
from ops.model import BlockedStatus, MaintenanceStatus

from constants import MYSQL_DATA_DIR
from constants import (
MYSQL_DATA_DIR,
SERVER_CONFIG_PASSWORD_KEY,
SERVER_CONFIG_USERNAME,
)

logger = logging.getLogger(__name__)

Expand All @@ -100,8 +111,12 @@ def is_unit_blocked(self) -> bool:

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 12
LIBPATCH = 13

ANOTHER_S3_CLUSTER_REPOSITORY_ERROR_MESSAGE = "S3 repository claimed by another cluster"
MOVE_RESTORED_CLUSTER_TO_ANOTHER_S3_REPOSITORY_ERROR = (
"Move restored cluster to another S3 repository"
)

if typing.TYPE_CHECKING:
from charm import MySQLOperatorCharm
Expand All @@ -119,6 +134,13 @@ def __init__(self, charm: "MySQLOperatorCharm", s3_integrator: S3Requirer) -> No
self.framework.observe(self.charm.on.create_backup_action, self._on_create_backup)
self.framework.observe(self.charm.on.list_backups_action, self._on_list_backups)
self.framework.observe(self.charm.on.restore_action, self._on_restore)
self.framework.observe(
self.s3_integrator.on.credentials_changed, self._on_s3_credentials_changed
)
self.framework.observe(self.charm.on.leader_elected, self._on_s3_credentials_changed)
self.framework.observe(
self.s3_integrator.on.credentials_gone, self._on_s3_credentials_gone
)

# ------------------ Helpers ------------------
@property
Expand Down Expand Up @@ -235,18 +257,33 @@ def _on_list_backups(self, event: ActionEvent) -> None:

# ------------------ Create Backup ------------------

def _on_create_backup(self, event: ActionEvent) -> None:
"""Handle the create backup action."""
logger.info("A backup has been requested on unit")
def _pre_create_backup_checks(self, event: ActionEvent) -> bool:
"""Run some checks before creating the backup.

Returns: a boolean indicating whether operation should be run.
"""
if not self._s3_integrator_relation_exists:
logger.error("Backup failed: missing relation with S3 integrator charm")
event.fail("Missing relation with S3 integrator charm")
return
return False

if "s3-block-message" in self.charm.app_peer_data:
logger.error("Backup failed: S3 relation is blocked for write")
event.fail("S3 relation is blocked for write")
return False

if not self.charm._mysql.is_mysqld_running():
logger.error(f"Backup failed: process mysqld is not running on {self.charm.unit.name}")
event.fail("Process mysqld not running")
return False

return True

def _on_create_backup(self, event: ActionEvent) -> None:
"""Handle the create backup action."""
logger.info("A backup has been requested on unit")

if not self._pre_create_backup_checks(event):
return

datetime_backup_requested = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
Expand Down Expand Up @@ -455,6 +492,18 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool:
event.fail(error_message)
return False

# Quick check for timestamp format
restore_to_time = event.params.get("restore-to-time")
if (
restore_to_time
and restore_to_time != "latest"
and not self._is_mysql_timestamp(restore_to_time)
):
error_message = "Bad restore-to-time format"
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
return False

if not self.charm._mysql.is_server_connectable():
error_message = "Server running mysqld is not connectable"
logger.error(f"Restore failed: {error_message}")
Expand All @@ -479,7 +528,7 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool:

return True

def _on_restore(self, event: ActionEvent) -> None:
def _on_restore(self, event: ActionEvent) -> None: # noqa: C901
"""Handle the restore backup action event.

Restore a backup from S3 (parameters for which can retrieved from the
Expand All @@ -489,7 +538,12 @@ def _on_restore(self, event: ActionEvent) -> None:
return

backup_id = event.params["backup-id"].strip().strip("/")
logger.info(f"A restore with backup-id {backup_id} has been requested on unit")
restore_to_time = event.params.get("restore-to-time")
logger.info(
f"A restore with backup-id {backup_id}"
f"{f' to time point {restore_to_time}' if restore_to_time else ''}"
f" has been requested on the unit"
)

# Retrieve and validate missing S3 parameters
s3_parameters, missing_parameters = self._retrieve_s3_parameters()
Expand Down Expand Up @@ -519,14 +573,28 @@ def _on_restore(self, event: ActionEvent) -> None:
if not success:
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)

if recoverable:
self._clean_data_dir_and_start_mysqld()
else:
self.charm.unit.status = BlockedStatus(error_message)

return

if restore_to_time is not None:
self.charm.unit.status = MaintenanceStatus("Running point-in-time-recovery operations")
success, error_message = self._pitr_restore(restore_to_time, s3_parameters)
if not success:
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
self.charm.unit.status = BlockedStatus(error_message)
return

self.charm.app_peer_data.update({
"s3-block-message": MOVE_RESTORED_CLUSTER_TO_ANOTHER_S3_REPOSITORY_ERROR,
"binlogs-collecting": "",
})
if not self.charm._mysql.reconcile_binlogs_collection():
logger.error("Failed to stop binlogs collecting prior to restore")

# Run post-restore operations
self.charm.unit.status = MaintenanceStatus("Running post-restore operations")
success, error_message = self._post_restore()
Expand Down Expand Up @@ -611,6 +679,10 @@ def _restore(self, backup_id: str, s3_parameters: Dict[str, str]) -> Tuple[bool,
except MySQLRestoreBackupError:
return False, False, f"Failed to restore backup {backup_id}"

success, error_message = self._clean_data_dir_and_start_mysqld()
if not success:
return False, False, error_message

return True, True, ""

def _clean_data_dir_and_start_mysqld(self) -> Tuple[bool, str]:
Expand All @@ -636,15 +708,29 @@ def _clean_data_dir_and_start_mysqld(self) -> Tuple[bool, str]:

return True, ""

def _pitr_restore(
    self, restore_to_time: str, s3_parameters: Dict[str, str]
) -> Tuple[bool, str]:
    """Run the point-in-time-recovery phase after the base backup restore.

    Args:
        restore_to_time: target timestamp in "%Y-%m-%d %H:%M:%S" form
            (or "latest", per the pre-restore validation).
        s3_parameters: S3 parameters used by the mysql-pitr-helper to fetch
            the stored binlogs.

    Returns: tuple of (success, error_message).
    """
    # NOTE(review): if the server-config user is always the one required
    # here, the user/password could come from the MySQL class instance
    # instead of being passed explicitly — see PR review comment.
    try:
        logger.info("Restoring point-in-time-recovery")
        stdout, stderr = self.charm._mysql.restore_pitr(
            host=self.charm.get_unit_address(self.charm.unit),
            mysql_user=SERVER_CONFIG_USERNAME,
            password=self.charm.get_secret("app", SERVER_CONFIG_PASSWORD_KEY),
            s3_parameters=s3_parameters,
            restore_to_time=restore_to_time,
        )
        logger.debug(f"Stdout of mysql-pitr-helper restore command: {stdout}")
        logger.debug(f"Stderr of mysql-pitr-helper restore command: {stderr}")
    except MySQLRestorePitrError:
        return False, f"Failed to restore point-in-time-recovery to the {restore_to_time}"
    return True, ""

def _post_restore(self) -> Tuple[bool, str]:
"""Run operations required after restoring a backup.

Returns: tuple of (success, error_message)
"""
success, error_message = self._clean_data_dir_and_start_mysqld()
if not success:
return success, error_message

try:
logger.info("Configuring instance to be part of an InnoDB cluster")
self.charm._mysql.configure_instance(create_cluster_admin=False)
Expand Down Expand Up @@ -674,3 +760,121 @@ def _post_restore(self) -> Tuple[bool, str]:
return False, "Failed to rescan the cluster"

return True, ""

def _on_s3_credentials_changed(self, event: CredentialsChangedEvent) -> None:
    """Validate the new S3 repository and reconcile binlogs collection.

    Leader-only: checks that the S3 repository belongs to this cluster
    (via the group replication id) and updates the peer databag flags
    before restarting the binlogs collector.
    """
    # Only the leader may write to the app peer databag.
    if not self.charm.unit.is_leader():
        logger.debug("Early exit on _on_s3_credentials_changed: unit is not a leader")
        return

    if not self._s3_integrator_relation_exists:
        logger.debug(
            "Early exit on _on_s3_credentials_changed: s3 integrator relation does not exist"
        )
        return

    # Defer until the cluster is up and no upgrade is in flight.
    cluster_ready = (
        self.charm._mysql.is_mysqld_running()
        and self.charm.unit_initialized
        and self.charm.upgrade.idle
    )
    if not cluster_ready:
        logger.debug(
            "Deferring _on_s3_credentials_changed: mysql cluster is not started yet or upgrade is occurring"
        )
        event.defer()
        return

    try:
        self.charm._mysql.wait_until_mysql_connection()
    except MySQLServiceNotRunningError:
        logger.debug(
            "Deferring _on_s3_credentials_changed: mysql cluster is not connectable yet"
        )
        event.defer()
        return

    logger.info("Retrieving s3 parameters from the s3-integrator relation")
    s3_parameters, missing_parameters = self._retrieve_s3_parameters()
    if missing_parameters:
        logger.error(f"Missing S3 parameters: {missing_parameters}")
        return

    logger.info("Ensuring compatibility with the provided S3 repository")
    repository_compatible = ensure_s3_compatible_group_replication_id(
        self.charm._mysql.get_current_group_replication_id(), s3_parameters
    )
    if repository_compatible:
        # Repository matches this cluster: unblock writes, enable collecting.
        peer_data_updates = {"s3-block-message": "", "binlogs-collecting": "true"}
    else:
        # Repository is claimed by another cluster: block writes, disable collecting.
        peer_data_updates = {
            "s3-block-message": ANOTHER_S3_CLUSTER_REPOSITORY_ERROR_MESSAGE,
            "binlogs-collecting": "",
        }
    self.charm.app_peer_data.update(peer_data_updates)

    if not self.charm._mysql.reconcile_binlogs_collection(
        force_restart=True, ignore_inactive_error=True
    ):
        logger.error("Failed to restart binlogs collecting after S3 relation update")

def _on_s3_credentials_gone(self, event: CredentialsGoneEvent) -> None:
    """Clear S3 repository state when the S3 relation credentials depart."""
    if self.charm.unit.is_leader():
        # Only the leader may write to the app peer databag; clearing both
        # flags unblocks writes and marks collecting as disabled.
        self.charm.app_peer_data.update({
            "s3-block-message": "",
            "binlogs-collecting": "",
        })
    # NOTE(review): assumed to run on every unit (outside the leader guard)
    # so each unit reconciles its own collector service — the scraped diff
    # lost indentation, confirm against the original source.
    if not self.charm._mysql.reconcile_binlogs_collection():
        logger.error("Failed to stop binlogs collecting after S3 relation depart")

def get_binlogs_collector_config(self) -> Dict[str, str]:
    """Build the environment config for the binlogs collector service.

    Returns: a dict of environment variables for the collector service,
    or an empty dict when the S3 relation is absent or S3 parameters
    are missing.
    """
    if not self._s3_integrator_relation_exists:
        logger.error(
            "Cannot get binlogs collector config: s3 integrator relation does not exist"
        )
        return {}

    logger.info("Retrieving s3 parameters from the s3-integrator relation")
    s3_parameters, missing_parameters = self._retrieve_s3_parameters()
    if missing_parameters:
        logger.error(
            f"Cannot get binlogs collector config: Missing S3 parameters: {missing_parameters}"
        )
        return {}

    # Binlogs live under "<bucket>/<path>/binlogs"; avoid a doubled slash
    # when the configured path is empty or already ends with "/".
    bucket_url = (
        f"{s3_parameters['bucket']}/{s3_parameters['path']}binlogs"
        if not s3_parameters["path"] or s3_parameters["path"][-1] == "/"
        else f"{s3_parameters['bucket']}/{s3_parameters['path']}/binlogs"
    )

    return {
        "ENDPOINT": s3_parameters["endpoint"],
        # All current cluster members, comma-separated, for the collector.
        "HOSTS": ",".join(self.charm._mysql.get_cluster_members()),
        "USER": SERVER_CONFIG_USERNAME,
        "PASS": self.charm.get_secret("app", SERVER_CONFIG_PASSWORD_KEY),
        "STORAGE_TYPE": "s3",
        "ACCESS_KEY_ID": s3_parameters["access-key"],
        "SECRET_ACCESS_KEY": s3_parameters["secret-key"],
        "S3_BUCKET_URL": bucket_url,
        "DEFAULT_REGION": s3_parameters["region"],
    }

def _is_mysql_timestamp(self, timestamp: str) -> bool:
# Format is the same as in the mysql-pitr-helper project.
if not re.match(
r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$",
timestamp,
):
return False
try:
self._parse_mysql_timestamp(timestamp)
return True
except ValueError:
return False

def _parse_mysql_timestamp(self, timestamp: str) -> datetime.datetime:
return datetime.datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
Loading