From e9d26bc709d38437a68e28372528a9cdbe68eab6 Mon Sep 17 00:00:00 2001 From: Markus Hentsch <129268441+markus-hentsch@users.noreply.github.com> Date: Tue, 17 Sep 2024 10:27:53 +0200 Subject: [PATCH] Add standard for volume backup functionality (#567) Signed-off-by: Markus Hentsch Signed-off-by: Markus Hentsch <129268441+markus-hentsch@users.noreply.github.com> Co-authored-by: josephineSei <128813814+josephineSei@users.noreply.github.com> Co-authored-by: anjastrunk <119566837+anjastrunk@users.noreply.github.com> --- .../scs-0117-v1-volume-backup-service.md | 97 ++++++ Tests/iaas/volume-backup/README.md | 70 +++++ .../volume-backup/volume-backup-tester.py | 282 ++++++++++++++++++ 3 files changed, 449 insertions(+) create mode 100644 Standards/scs-0117-v1-volume-backup-service.md create mode 100644 Tests/iaas/volume-backup/README.md create mode 100644 Tests/iaas/volume-backup/volume-backup-tester.py diff --git a/Standards/scs-0117-v1-volume-backup-service.md b/Standards/scs-0117-v1-volume-backup-service.md new file mode 100644 index 000000000..d272dfa05 --- /dev/null +++ b/Standards/scs-0117-v1-volume-backup-service.md @@ -0,0 +1,97 @@ +--- +title: Volume Backup Functionality +type: Standard +status: Draft +track: IaaS +--- + +## Introduction + +OpenStack offers a variety of resources where users are able to transfer and store data in the infrastructure. +A prime example of these resources are volumes which are attached to virtual machines as virtual block storage devices. +As such they carry potentially large amounts of user data which is constantly changing at runtime. +It is important for users to have the ability to create backups of this data in a reliable and efficient manner. 
+ +## Terminology + +| Term | Meaning | +|---|---| +| CSP | Cloud Service Provider, provider managing the OpenStack infrastructure | +| IaaS | Abbreviation for Infrastructure as a Service | +| Image | IaaS resource representing a snapshot of a block storage disk, can be used to create Volumes | +| Volume | IaaS resource representing a virtual block storage device that can be attached as a disk to virtual machines | + +## Motivation + +The [volume backup functionality of the Block Storage API](https://docs.openstack.org/cinder/latest/admin/volume-backups.html) is a feature that is not available in all clouds per default, e.g., in OpenStack. +The feature requires a backend to be prepared and configured correctly before it can be used. +In the Block Storage service, the backup storage backend is usually configured separately from the storage backend of the general volume service and may not be mandatory. +Thus, an arbitrary cloud may or may not offer the backup feature in the Block Storage API. + +This standard aims to make this functionality the default in SCS clouds so that customers can expect the feature to be usable. + +## Design Considerations + +The standard should make sure that the feature is available and usable but should not limit the exact implementation (e.g. choice of backend driver) any further than necessary. + +### Options considered + +#### Only recommend volume backup feature, use images as alternative + +As an alternative to the volume backup feature of the Block Storage API, images can also be created based on volumes and act as a backup under certain circumstances. +As an option, this standard could keep the actual integration of the volume backup feature optional and guide users how to use images as backup targets instead in case the feature is unavailable. + +However, it is not guaranteed that the image backend storage is separate from the volume storage. +For instance, both could be using the same Ceph cluster. 
+In such a case, the images would not count as genuine backups. + +Although users are able to download images and transfer them to a different storage location, this approach might also prove unfeasible depending on the image size and the existence (or lack) of appropriate target storage on the user side. + +Furthermore, incremental backups are not possible when creating images from volumes either. +This results in time-consuming backup operations of fully copying a volume every time a backup is created. + +#### Focus on feature availability, make feature mandatory + +This option is pretty straightforward. +It would make the volume backup feature mandatory for SCS clouds. +This way users can expect the feature to be available and usable. + +With this, users can leverage functionalities like incremental backups and benefit from optimized performance of the backup process due to the tight integration with the volume service. + +However, it does not seem feasible to also mandate having a separate storage backend for volume backups at the same time due to potential infrastructure limitations at CSP-side making it hard or even impossible to offer. +As such, the actual benefit of backups in terms of reliability and security aspects would be questionable if a separate storage backend is not mandated and therefore not guaranteed. + +This approach would focus on feature availability rather than backup reliability. + +#### Focus on backup reliability, make separate backend mandatory + +As an alternative, the volume backup feature availability could be made optional but in case a CSP chooses to offer it, the standard would mandate a separate storage backend to be used for volume backups. +This way, failures of the volume storage backend would not directly impact the availability and safety of volume backups, making them actually live up to their name. + +In contrast to the above, this approach would focus on backup reliability rather than feature availability. 
+ +## Standard + +This standard decides to go with the second option and makes the volume backup feature mandatory in the following way: + +In an SCS cloud, the volume backup functionality MUST be configured properly and its API as defined per `/v3/{project_id}/backups` MUST be offered to customers. +If using Cinder, a suitable [backup driver](https://docs.openstack.org/cinder/latest/configuration/block-storage/backup-drivers.html) MUST be set up. + +The volume backup target storage SHOULD be a separate storage system from the one used for volumes themselves. + +## Related Documents + +- [OpenStack Block Storage v3 Backup API reference](https://docs.openstack.org/api-ref/block-storage/v3/index.html#backups-backups) +- [OpenStack Volume Backup Drivers](https://docs.openstack.org/cinder/latest/configuration/block-storage/backup-drivers.html) + +## Conformance Tests + +Conformance tests include using the `/v3/{project_id}/backups` Block Storage API endpoint to create a volume and a backup of it as a non-admin user and subsequently restore the backup on a new volume while verifying the success of each operation. +These tests verify the mandatory part of the standard: providing the Volume Backup API. + +There is a test suite in [`volume-backup-tester.py`](https://github.com/SovereignCloudStack/standards/blob/main/Tests/iaas/volume-backup/volume-backup-tester.py). +The test suite connects to the OpenStack API and executes basic operations using the volume backup API to verify that the functionality requested by the standard is available. +Please consult the associated [README.md](https://github.com/SovereignCloudStack/standards/blob/main/Tests/iaas/volume-backup/README.md) for detailed setup and testing instructions. + +Note that these tests don't verify the optional part of the standard: providing a separate storage backend for Cinder volume backups. 
+This cannot be checked from outside of the infrastructure as it is an architectural property of the infrastructure itself and transparent to customers. diff --git a/Tests/iaas/volume-backup/README.md b/Tests/iaas/volume-backup/README.md new file mode 100644 index 000000000..2b6cd4716 --- /dev/null +++ b/Tests/iaas/volume-backup/README.md @@ -0,0 +1,70 @@ +# Volume Backup API Test Suite + +## Test Environment Setup + +### Test Execution Environment + +> **NOTE:** The test execution procedure does not require cloud admin rights. + +To execute the test suite a valid cloud configuration for the OpenStack SDK in the shape of "`clouds.yaml`" is mandatory[^1]. +**The file is expected to be located in the current working directory where the test script is executed unless configured otherwise.** + +[^1]: [OpenStack Documentation: Configuring OpenStack SDK Applications](https://docs.openstack.org/openstacksdk/latest/user/config/configuration.html) + +The test execution environment can be located on any system outside of the cloud infrastructure that has OpenStack API access. +Make sure that the API access is configured properly in "`clouds.yaml`". + +It is recommended to use a Python virtual environment[^2]. +Next, install the OpenStack SDK required by the test suite: + +```bash +pip3 install openstacksdk +``` + +Within this environment execute the test suite. + +[^2]: [Python 3 Documentation: Virtual Environments and Packages](https://docs.python.org/3/tutorial/venv.html) + +## Test Execution + +The test suite is executed as follows: + +```bash +python3 volume-backup-tester.py --os-cloud mycloud +``` + +As an alternative to "`--os-cloud`", the "`OS_CLOUD`" environment variable may be specified instead. +The parameter is used to look up the correct cloud configuration in "`clouds.yaml`". +For the example command above, this file should contain a `clouds.mycloud` section like this: + +```yaml +--- +clouds: + mycloud: + auth: + auth_url: ... + ... + ... 
+``` + +If the test suite fails and leaves test resources behind, the "`--cleanup-only`" flag may be used to delete those resources from the domains: + +```bash +python3 volume-backup-tester.py --os-cloud mycloud --cleanup-only +``` + +For any further options consult the output of "`python3 volume-backup-tester.py --help`". + +### Script Behavior & Test Results + +> **NOTE:** Before any execution of test batches, the script will automatically perform a cleanup of volumes and volume backups matching a special prefix (see the "`--prefix`" flag). +> This cleanup behavior is identical to "`--cleanup-only`". + +The script will print all cleanup actions and passed tests to `stdout`. + +If all tests pass, the script will return with an exit code of `0`. + +If any test fails, the script will halt, print the exact error to `stderr` and return with a non-zero exit code. + +In case of a failed test, cleanup is not performed automatically, allowing for manual inspection of the cloud state for debugging purposes. +Although unnecessary due to automatic cleanup upon next execution, you can manually trigger a cleanup using the "`--cleanup-only`" flag of this script. diff --git a/Tests/iaas/volume-backup/volume-backup-tester.py b/Tests/iaas/volume-backup/volume-backup-tester.py new file mode 100644 index 000000000..f4fa9522d --- /dev/null +++ b/Tests/iaas/volume-backup/volume-backup-tester.py @@ -0,0 +1,282 @@ +"""Volume Backup API tester for Block Storage API + +This test script executes basic operations on the Block Storage API centered +around volume backups. Its purpose is to verify that the Volume Backup API is +available and working as expected using simple operations such as creating and +restoring volume backups. + +It verifies that a properly configured backup driver is present to the extent +that aforementioned operations succeed on the API level. 
It does not by any +means verify that the backup and restore procedures actual handle the data +correctly (it only uses empty volumes and does not look at data for the sake +of simplicity). +""" + +import argparse +import getpass +import os +import time +import typing + +import openstack + +# prefix to be included in the names of any Keystone resources created +# used by the cleanup routine to identify resources that can be safely deleted +DEFAULT_PREFIX = "scs-test-" + +# timeout in seconds for resource availability checks +# (e.g. a volume becoming available) +WAIT_TIMEOUT = 60 + + +def connect(cloud_name: str, password: typing.Optional[str] = None + ) -> openstack.connection.Connection: + """Create a connection to an OpenStack cloud + + :param string cloud_name: + The name of the configuration to load from clouds.yaml. + + :param string password: + Optional password override for the connection. + + :returns: openstack.connnection.Connection + """ + + if password: + return openstack.connect( + cloud=cloud_name, + password=password + ) + else: + return openstack.connect( + cloud=cloud_name, + ) + + +def test_backup(conn: openstack.connection.Connection, + prefix=DEFAULT_PREFIX, timeout=WAIT_TIMEOUT) -> None: + """Execute volume backup tests on the connection + + This will create an empty volume, a backup of that empty volume and then + attempt to restore the backup onto a new volume. + Purpose of these tests is to verify that the volume backup API is working + correctly. + """ + + # CREATE VOLUME + print("Creating volume ...") + volume = conn.block_storage.create_volume( + name=f"{prefix}volume", + size=1 + ) + assert volume is not None, ( + "Initial volume creation failed" + ) + volume_id = volume.id + assert conn.block_storage.get_volume(volume_id) is not None, ( + "Retrieving initial volume by ID failed" + ) + + print( + f"↳ waiting for volume with ID '{volume_id}' to reach status " + f"'available' ..." 
+ ) + seconds_waited = 0 + while conn.block_storage.get_volume(volume_id).status != "available": + time.sleep(1.0) + seconds_waited += 1 + assert seconds_waited < timeout, ( + f"Timeout reached while waiting for volume to reach status " + f"'available' (volume id: {volume_id}) after {seconds_waited} " + f"seconds" + ) + print("Create empty volume: PASS") + + # CREATE BACKUP + print("Creating backup from volume ...") + backup = conn.block_storage.create_backup( + name=f"{prefix}volume-backup", + volume_id=volume_id + ) + assert backup is not None, ( + "Backup creation failed" + ) + backup_id = backup.id + assert conn.block_storage.get_backup(backup_id) is not None, ( + "Retrieving backup by ID failed" + ) + + print(f"↳ waiting for backup '{backup_id}' to become available ...") + seconds_waited = 0 + while conn.block_storage.get_backup(backup_id).status != "available": + time.sleep(1.0) + seconds_waited += 1 + assert seconds_waited < timeout, ( + f"Timeout reached while waiting for backup to reach status " + f"'available' (backup id: {backup_id}) after {seconds_waited} " + f"seconds" + ) + print("Create backup from volume: PASS") + + # RESTORE BACKUP + print("Restoring backup to volume ...") + restored_volume_name = f"{prefix}restored-backup" + conn.block_storage.restore_backup( + backup_id, + name=restored_volume_name + ) + + print( + f"↳ waiting for restoration target volume '{restored_volume_name}' " + f"to be created ..." + ) + seconds_waited = 0 + while conn.block_storage.find_volume(restored_volume_name) is None: + time.sleep(1.0) + seconds_waited += 1 + assert seconds_waited < timeout, ( + f"Timeout reached while waiting for restored volume to be created " + f"(volume name: {restored_volume_name}) after {seconds_waited} " + f"seconds" + ) + # wait for the volume restoration to finish + print( + f"↳ waiting for restoration target volume '{restored_volume_name}' " + f"to reach 'available' status ..." 
+ ) + volume_id = conn.block_storage.find_volume(restored_volume_name).id + while conn.block_storage.get_volume(volume_id).status != "available": + time.sleep(1.0) + seconds_waited += 1 + assert seconds_waited < timeout, ( + f"Timeout reached while waiting for restored volume reach status " + f"'available' (volume id: {volume_id}) after {seconds_waited} " + f"seconds" + ) + print("Restore volume from backup: PASS") + + +def cleanup(conn: openstack.connection.Connection, prefix=DEFAULT_PREFIX, + timeout=WAIT_TIMEOUT): + """ + Looks up volume and volume backup resources matching the given prefix and + deletes them. + """ + + def wait_for_resource(resource_type: str, resource_id: str, + expected_status="available") -> None: + seconds_waited = 0 + get_func = getattr(conn.block_storage, f"get_{resource_type}") + while get_func(resource_id).status != expected_status: + time.sleep(1.0) + seconds_waited += 1 + assert seconds_waited < timeout, ( + f"Timeout reached while waiting for {resource_type} during " + f"cleanup to be in status '{expected_status}' " + f"({resource_type} id: {resource_id}) after {seconds_waited} " + f"seconds" + ) + + print(f"\nPerforming cleanup for resources with the " + f"'{prefix}' prefix ...") + + backups = conn.block_storage.backups() + for backup in backups: + if backup.name.startswith(prefix): + try: + wait_for_resource("backup", backup.id) + except openstack.exceptions.ResourceNotFound: + # if the resource has vanished on + # its own in the meantime ignore it + continue + print(f"↳ deleting volume backup '{backup.id}' ...") + conn.block_storage.delete_backup(backup.id) + + # wait for all backups to be cleaned up before attempting to remove volumes + seconds_waited = 0 + while len( + # list of all backups whose name starts with the prefix + [b for b in conn.block_storage.backups() if b.name.startswith(prefix)] + ) > 0: + time.sleep(1.0) + seconds_waited += 1 + assert seconds_waited < timeout, ( + f"Timeout reached while waiting for all backups 
with prefix " + f"'{prefix}' to finish deletion" + ) + + volumes = conn.block_storage.volumes() + for volume in volumes: + if volume.name.startswith(prefix): + try: + wait_for_resource("volume", volume.id) + except openstack.exceptions.ResourceNotFound: + # if the resource has vanished on + # its own in the meantime ignore it + continue + print(f"↳ deleting volume '{volume.id}' ...") + conn.block_storage.delete_volume(volume.id) + + +def main(): + parser = argparse.ArgumentParser( + description="SCS Volume Backup API Conformance Checker") + parser.add_argument( + "--os-cloud", type=str, + help="Name of the cloud from clouds.yaml, alternative " + "to the OS_CLOUD environment variable" + ) + parser.add_argument( + "--ask", + help="Ask for password interactively instead of reading it from the " + "clouds.yaml", + action="store_true" + ) + parser.add_argument( + "--debug", action="store_true", + help="Enable OpenStack SDK debug logging" + ) + parser.add_argument( + "--prefix", type=str, + default=DEFAULT_PREFIX, + help=f"OpenStack resource name prefix for all resources to be created " + f"and/or cleaned up by this script within the configured domains " + f"(default: '{DEFAULT_PREFIX}')" + ) + parser.add_argument( + "--timeout", type=int, + default=WAIT_TIMEOUT, + help=f"Timeout in seconds for operations waiting for resources to " + f"become available such as creating volumes and volume backups " + f"(default: '{WAIT_TIMEOUT}')" + ) + parser.add_argument( + "--cleanup-only", action="store_true", + help="Instead of executing tests, cleanup all resources " + "with the prefix specified via '--prefix' (or its default)" + ) + args = parser.parse_args() + openstack.enable_logging(debug=args.debug) + + # parse cloud name for lookup in clouds.yaml + cloud = os.environ.get("OS_CLOUD", None) + if args.os_cloud: + cloud = args.os_cloud + assert cloud, ( + "You need to have the OS_CLOUD environment variable set to your " + "cloud name or pass it via --os-cloud" + ) + conn = 
connect( + cloud, + password=getpass.getpass("Enter password: ") if args.ask else None + ) + if args.cleanup_only: + cleanup(conn, prefix=args.prefix, timeout=args.timeout) + else: + cleanup(conn, prefix=args.prefix, timeout=args.timeout) + test_backup(conn, prefix=args.prefix, timeout=args.timeout) + cleanup(conn, prefix=args.prefix, timeout=args.timeout) + + +if __name__ == "__main__": + main()