diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 08249f3..177bbb6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -6,22 +6,19 @@ name: Continuous integration

 on: [push, pull_request]

 jobs:
-
   pre-commit:
-
     runs-on: ubuntu-latest
     timeout-minutes: 5
     steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python 3.8
-        uses: actions/setup-python@v2
-        with:
-          python-version: "3.8"
-      - uses: pre-commit/action@v2.0.0
+    - uses: actions/checkout@v2
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v2
+      with:
+        python-version: "3.8"
+    - uses: pre-commit/action@v2.0.0

   tests:
-
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
@@ -30,30 +27,37 @@ jobs:
         python-version: ["3.8", "3.9", "3.10", "3.11"]

     steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies (including dev dependencies at frozen version)
-        # I'm using pip install -e to make sure that the coverage properly traces the runs
-        # also of the concurrent tests (maybe we can achieve this differently)
-        run: |
-          python -m pip install --upgrade pip
-          pip install -e .[progressbar,optionaltests]
-          pip install -r requirements.lock
-      - name: Test with pytest
-        # No need to run the benchmarks, they will run in a different workflow
-        # Also, run in very verbose mode so if there is an error we get a complete diff
-        run: pytest -vv --cov=disk_objectstore --benchmark-skip
-        env:
-          SQLALCHEMY_WARN_20: 1
-      - name: Create xml coverage
-        run: coverage xml
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v3
-        with:
-          files: ./coverage.xml
-          name: disk-objectstore
-          ## Commenting the following lines - if often fails, and if at least one manages to push, it should be enough
-          # fail_ci_if_error: true
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v1
+      with:
+        python-version: ${{ matrix.python-version }}
+    # Set up 'ssh localhost' that is used in testing the backup command
+    # skipped for windows, as it doesn't support this setup or the backup command
+    - name: set up 'ssh localhost'
+      if: matrix.os != 'windows-latest'
+      run: |
+        .github/workflows/setup-ssh-localhost.sh
+        ssh -v localhost
+    - name: Install dependencies (including dev dependencies at frozen version)
+      # Use pip install -e to make sure that coverage properly traces the runs
+      # of the concurrent tests as well (maybe we can achieve this differently)
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e .[progressbar,optionaltests]
+        pip install -r requirements.lock
+    - name: Test with pytest
+      # No need to run the benchmarks, they will run in a different workflow
+      # Also, run in very verbose mode so if there is an error we get a complete diff
+      run: pytest -vv --cov=disk_objectstore --benchmark-skip
+      env:
+        SQLALCHEMY_WARN_20: 1
+    - name: Create xml coverage
+      run: coverage xml
+    - name: Upload coverage to Codecov
+      uses: codecov/codecov-action@v3
+      with:
+        files: ./coverage.xml
+        name: disk-objectstore
+        ## Commenting the following lines - it often fails, and if at least one manages to push, it should be enough
+        # fail_ci_if_error: true
diff --git a/.github/workflows/setup-ssh-localhost.sh b/.github/workflows/setup-ssh-localhost.sh
new file mode 100755
index 0000000..ab3347a
--- /dev/null
+++ b/.github/workflows/setup-ssh-localhost.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -ev
+
+ssh-keygen -q -t rsa -b 4096 -N "" -f "${HOME}/.ssh/id_rsa"
+ssh-keygen -y -f "${HOME}/.ssh/id_rsa" >> "${HOME}/.ssh/authorized_keys"
+ssh-keyscan -H localhost >> "${HOME}/.ssh/known_hosts"
+
+chmod 700 "${HOME}/.ssh"
+chmod 600 "${HOME}/.ssh"/*
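The tests below rely on this setup providing non-interactive, password-less `ssh localhost`. As a quick sanity check (not part of this patch, and with `BatchMode=yes` added here to force failure instead of a password prompt), a sketch of what `BackupManager.check_if_remote_accessible` effectively does:

```python
# Hypothetical sanity check, mirroring BackupManager.check_if_remote_accessible:
# 'ssh <remote> exit' returns 0 only if we can log in non-interactively.
import subprocess


def ssh_is_accessible(remote: str = "localhost") -> bool:
    res = subprocess.run(
        ["ssh", "-o", "BatchMode=yes", remote, "exit"],
        capture_output=True,
        text=True,
        check=False,
    )
    return res.returncode == 0


if __name__ == "__main__":
    print("ssh localhost OK:", ssh_is_accessible())
```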
diff --git a/disk_objectstore/backup_utils.py b/disk_objectstore/backup_utils.py
new file mode 100644
index 0000000..8e89425
--- /dev/null
+++ b/disk_objectstore/backup_utils.py
@@ -0,0 +1,371 @@
+"""
+Utilities to back up a container.
+"""
+
+import datetime
+import logging
+import random
+import shutil
+import sqlite3
+import string
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Callable, Optional
+
+from disk_objectstore.container import Container
+
+logging.basicConfig(format="%(levelname)s:%(message)s")
+backup_logger = logging.getLogger(__name__)
+
+
+class BackupError(Exception):
+    "Raised when backup fails."
+
+
+def split_remote_and_path(dest: str):
+    """extract remote and path from <remote>:<path>"""
+    split_dest = dest.split(":")
+    if len(split_dest) == 1:
+        return None, Path(dest)
+    if len(split_dest) == 2:
+        return split_dest[0], Path(split_dest[1])
+    # more than 1 colon:
+    raise ValueError("Invalid destination format: <remote>:<path>")
+
+
+def is_exe_found(exe: str) -> bool:
+    return shutil.which(exe) is not None
+
+
+class BackupManager:
+    """
+    Class that contains all configuration and utility functions to create
+    backups, except for the backup function itself, which is passed in according
+    to what is backed up (e.g. the disk-objectstore container for this repo,
+    or the whole AiiDA storage in aiida-core).
+    """
+
+    def __init__(
+        self,
+        dest: str,
+        logger: logging.Logger,
+        keep: int = 1,
+        exes: Optional[dict] = None,
+    ) -> None:
+        self.dest = dest
+        self.keep = keep
+        self.logger = logger
+        self.remote, self.path = split_remote_and_path(dest)
+
+        if exes is None:
+            self.exes = {}
+        else:
+            self.exes = exes
+
+        # make sure rsync gets added so it gets validated
+        if "rsync" not in self.exes:
+            self.exes["rsync"] = "rsync"
+
+        # Validate the backup config inputs
+
+        if self.keep < 0:
+            raise ValueError(
+                "Input validation failed: keep variable can't be negative!"
+            )
+
+        if self.remote:
+            self.check_if_remote_accessible()
+
+        if self.exes:
+            for _, path in self.exes.items():
+                if not is_exe_found(path):
+                    raise ValueError(f"Input validation failed: {path} not accessible.")
+
+        if not self.check_path_exists(self.path):
+            success = self.run_cmd(["mkdir", str(self.path)])[0]
+            if not success:
+                raise ValueError(
+                    f"Input validation failed: Couldn't access/create '{str(self.path)}'!"
+                )
+
+    def check_if_remote_accessible(self):
+        """Check if the remote host is accessible via ssh"""
+        self.logger.info(f"Checking if '{self.remote}' is accessible...")
+        success = self.run_cmd(["exit"])[0]
+        if not success:
+            raise BackupError(f"Remote '{self.remote}' is not accessible!")
+        self.logger.info("Success! '%s' is accessible!", self.remote)
+
+    def check_path_exists(self, path: Path) -> bool:
+        cmd = ["[", "-e", str(path), "]"]
+        return self.run_cmd(cmd)[0]
+
+    def run_cmd(
+        self,
+        args: list,
+    ):
+        """
+        Run a command locally or remotely.
+        """
+        all_args = args[:]
+        if self.remote:
+            all_args = ["ssh", self.remote] + all_args
+
+        res = subprocess.run(all_args, capture_output=True, text=True, check=False)
+
+        self.logger.debug(
+            f"Command: {all_args}\n"
+            f"  Exit Code: {res.returncode}\n"
+            f"  stdout/stderr: {res.stdout}\n{res.stderr}"
+        )
+
+        success = res.returncode == 0
+
+        return success, res.stdout
+
+    def call_rsync(  # pylint: disable=too-many-arguments
+        self,
+        src: Path,
+        dest: Path,
+        link_dest: Optional[Path] = None,
+        src_trailing_slash: bool = False,
+        dest_trailing_slash: bool = False,
+        extra_args: Optional[list] = None,
+    ):
+        """Call rsync with specified arguments and handle possible errors & stdout/stderr
+
+        :param link_dest:
+            Path to the hardlinked files location (previous backup).
+
+        :param src_trailing_slash:
+            Add a trailing slash to the source path. This makes rsync copy the contents
+            of the folder instead of the folder itself.
+
+        :param dest_trailing_slash:
+            Add a trailing slash to the destination path. This makes rsync interpret the
+            destination as a folder and create it if it doesn't exist.
+
+        """
+
+        assert "rsync" in self.exes
+
+        all_args = [self.exes["rsync"], "-azh", "-vv", "--no-whole-file"]
+        if extra_args:
+            all_args += extra_args
+        if link_dest:
+            if not self.remote:
+                # for local paths, use resolve() to get absolute path
+                link_dest = link_dest.resolve()
+            all_args += [f"--link-dest={link_dest}"]
+
+        if src_trailing_slash:
+            all_args += [str(src) + "/"]
+        else:
+            all_args += [str(src)]
+
+        dest_str = str(dest)
+        if dest_trailing_slash:
+            dest_str += "/"
+
+        if not self.remote:
+            all_args += [dest_str]
+        else:
+            all_args += [f"{self.remote}:{dest_str}"]
+
+        res = subprocess.run(all_args, capture_output=True, text=True, check=False)
+
+        self.logger.debug(
+            "Command: %s\n  Exit Code: %s\n  stdout/stderr: %s\n%s",
+            str(all_args),
+            res.returncode,
+            res.stdout,
+            res.stderr,
+        )
+
+        if res.returncode != 0:
+            raise BackupError(f"rsync failed for: {str(src)} to {str(dest)}")
+
+    # ----
+    # Utilities to manage multiple folders of backups, e.g. hard-linking to previous backup;
+    # deleting old backups.
+    # ----
+
+    def get_existing_backup_folders(self):
+        """Get all folders matching the backup folder name pattern."""
+        success, stdout = self.run_cmd(
+            [
+                "find",
+                str(self.path),
+                "-maxdepth",
+                "1",
+                "-type",
+                "d",
+                "-name",
+                "backup_*_*",
+                "-print",
+            ]
+        )
+
+        if not success:
+            raise BackupError("Existing backups determination failed.")
+
+        return stdout.splitlines()
+
+    def get_last_backup_folder(self):
+        """Get the latest backup folder, if it exists."""
+        existing_backups = self.get_existing_backup_folders()
+        return Path(sorted(existing_backups)[-1]) if existing_backups else None
+
+    def delete_old_backups(self):
+        """Get all folders matching the backup pattern, and delete the oldest ones."""
+        sorted_folders = sorted(self.get_existing_backup_folders())
+        to_delete = sorted_folders[: -(self.keep + 1)]
+        for folder in to_delete:
+            success = self.run_cmd(["rm", "-rf", folder])[0]
+            if success:
+                self.logger.info(f"Deleted old backup: {folder}")
+            else:
+                self.logger.warning("Warning: couldn't delete old backup: %s", folder)
+
+    def backup_auto_folders(self, backup_func: Callable) -> None:
+        """Create a backup, managing live and previous backup folders automatically
+
+        The running backup is done to `<path>/live-backup`. When it completes, it is moved to
+        the final path: `<path>/backup_<timestamp>_<randstr>`. If the filesystem supports it,
+        the symlink `<path>/last-backup` is added to point to the last backup.
+        Rsync `link-dest` is used to keep the backups incremental and performant.
+
+        :param backup_func:
+            Function that is used to make a single backup. Needs to have two arguments: path and
+            previous_backup location (which can be None).
+
+        """
+
+        live_folder = self.path / "live-backup"
+
+        last_folder = self.get_last_backup_folder()
+
+        if last_folder:
+            self.logger.info(
+                f"Last backup is '{str(last_folder)}', using it for rsync --link-dest."
+            )
+        else:
+            self.logger.info("Couldn't find a previous backup to increment from.")
+
+        backup_func(
+            live_folder,
+            last_folder,
+        )
+
+        # move live-backup -> backup_<timestamp>_<randstr>
+        timestamp = datetime.datetime.now(datetime.timezone.utc).strftime(
+            "%Y%m%d%H%M%S"
+        )
+        randstr = "".join(random.choices(string.ascii_lowercase + string.digits, k=4))
+        folder_name = f"backup_{timestamp}_{randstr}"
+
+        success = self.run_cmd(["mv", str(live_folder), str(self.path / folder_name)])[
+            0
+        ]
+        if not success:
+            raise BackupError(
+                f"Failed to move '{str(live_folder)}' to '{str(self.path / folder_name)}'"
+            )
+
+        self.logger.info(
+            f"Backup moved from '{str(live_folder)}' to '{str(self.path / folder_name)}'."
+        )
+
+        symlink_name = "last-backup"
+        success = self.run_cmd(
+            ["ln", "-sfn", str(folder_name), str(self.path / symlink_name)]
+        )[0]
+        if not success:
+            self.logger.warning(
+                f"Couldn't create symlink '{symlink_name}'. Perhaps the filesystem doesn't support it."
+            )
+        else:
+            self.logger.info(f"Added symlink '{symlink_name}' to '{folder_name}'.")
+
+        self.delete_old_backups()
+
+
+def _sqlite_backup(src: Path, dst: Path):
+    """
+    Safe way to make a backup of the sqlite db, while it might potentially be accessed
+    https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection.backup
+    """
+    src_connect = sqlite3.connect(str(src))
+    dst_connect = sqlite3.connect(str(dst))
+    with dst_connect:
+        src_connect.backup(dst_connect)
+    dst_connect.close()
+    src_connect.close()
+
+
+def backup_container(
+    manager: BackupManager,
+    container: Container,
+    path: Path,
+    prev_backup: Optional[Path] = None,
+) -> None:
+    """Create a backup of the disk-objectstore container
+
+    This is safe to perform when the container is being used.
+
+    It should be done in the following order:
+        1) loose files;
+        2) sqlite database;
+        3) packed files.
+
+    """
+
+    container_root_path = container.get_folder()
+    loose_path = container._get_loose_folder()  # pylint: disable=protected-access
+    packs_path = container._get_pack_folder()  # pylint: disable=protected-access
+    sqlite_path = container._get_pack_index_path()  # pylint: disable=protected-access
+
+    # step 1: back up loose files
+    loose_path_rel = loose_path.relative_to(container_root_path)
+    prev_backup_loose = prev_backup / loose_path_rel if prev_backup else None
+
+    manager.call_rsync(loose_path, path, link_dest=prev_backup_loose)
+    manager.logger.info(f"Transferred {str(loose_path)} to {str(path)}")
+
+    # step 2: back up sqlite db
+
+    # make a temporary directory to dump sqlite db locally
+    with tempfile.TemporaryDirectory() as temp_dir_name:
+        sqlite_temp_loc = Path(temp_dir_name) / "packs.idx"
+        _sqlite_backup(sqlite_path, sqlite_temp_loc)
+
+        if sqlite_temp_loc.is_file():
+            manager.logger.info(f"Dumped the SQLite database to {str(sqlite_temp_loc)}")
+        else:
+            raise BackupError(f"'{str(sqlite_temp_loc)}' failed to be created.")
+
+        # step 3: transfer the SQLITE database file
+        manager.call_rsync(sqlite_temp_loc, path, link_dest=prev_backup)
+        manager.logger.info(f"Transferred SQLite database to {str(path)}")
+
+    # step 4: transfer the packed files
+    packs_path_rel = packs_path.relative_to(container_root_path)
+    manager.call_rsync(packs_path, path, link_dest=prev_backup)
+    manager.logger.info(f"Transferred {str(packs_path)} to {str(path)}")
+
+    # step 5: transfer anything else in the container folder
+    manager.call_rsync(
+        container_root_path,
+        path,
+        link_dest=prev_backup,
+        src_trailing_slash=True,
+        extra_args=[
+            "--exclude",
+            str(loose_path_rel),
+            "--exclude",
+            "packs.idx",
+            "--exclude",
+            str(packs_path_rel),
+        ],
+    )
diff --git a/disk_objectstore/cli.py b/disk_objectstore/cli.py
index acd3332..fabb957 100644
--- a/disk_objectstore/cli.py
+++ b/disk_objectstore/cli.py
@@ -1,6 +1,7 @@
 """A small CLI tool for managing stores."""
 import dataclasses
 import json
+import logging
 import os
 import sys
 from pathlib import Path
@@ -8,7 +9,7 @@

 import click

-from disk_objectstore import __version__
+from disk_objectstore import __version__, backup_utils
 from disk_objectstore.container import Container

@@ -183,3 +184,74 @@ def optimize(
     container.clean_storage(vacuum=vacuum)
     size = sum(f.stat().st_size for f in dostore.path.glob("**/*") if f.is_file())
     click.echo(f"Final container size: {round(size/1000, 2)} Mb")
+
+
+@main.command("backup")
+@click.argument("dest", nargs=1, type=click.Path())
+@click.option(
+    "--keep",
+    default=1,
+    show_default=True,
+    help="Number of previous backups to keep in the destination.",
+)
+@click.option(
+    "--rsync-exe",
+    default="rsync",
+    help="Specify the 'rsync' executable, if not in PATH. Used for both local and remote destinations.",
+)
+@click.option(
+    "--verbosity",
+    default="info",
+    type=click.Choice(("silent", "info", "debug")),
+    help="Set verbosity of the logger.",
+)
+@pass_dostore
+def backup(
+    dostore: ContainerContext, dest: str, keep: int, rsync_exe: str, verbosity: str
+):
+    """Create a backup of the container.
+
+    The backup is created at the `DEST` destination, in a subfolder
+    backup_<timestamp>_<randstr>, and a symlink `last-backup` is created to it in the same folder.
+
+    NOTE: This is safe to run while the container is being used.
+
+    NOTE: the symlink `last-backup` is omitted if the filesystem doesn't support it.
+
+    Destination (DEST) can either be a local path, or a remote destination (reachable via ssh).
+    In the latter case, the remote destination needs to have the following syntax:
+
+        [<remote_user>@]<remote_host>:<path>
+
+    i.e., contain the remote host name and the remote path, separated by a colon (and optionally the
+    remote user separated by an @ symbol). You can tune SSH parameters using the standard options given
+    by OpenSSH, such as adding configuration options to ~/.ssh/config (e.g. to allow for passwordless
+    login - recommended, since this script might ask multiple times for the password).
+
+    NOTE: 'rsync' and other UNIX-specific commands are called, thus the command will not work on
+    non-UNIX environments.
+    """
+
+    if verbosity == "silent":
+        backup_utils.backup_logger.setLevel(logging.ERROR)
+    elif verbosity == "info":
+        backup_utils.backup_logger.setLevel(logging.INFO)
+    elif verbosity == "debug":
+        backup_utils.backup_logger.setLevel(logging.DEBUG)
+
+    with dostore.container as container:
+        try:
+            backup_manager = backup_utils.BackupManager(
+                dest,
+                backup_utils.backup_logger,
+                exes={"rsync": rsync_exe},
+                keep=keep,
+            )
+            backup_manager.backup_auto_folders(
+                lambda path, prev: backup_utils.backup_container(
+                    backup_manager, container, path, prev
+                )
+            )
+        except (ValueError, backup_utils.BackupError) as e:
+            click.echo(f"Error: {e}")
+            sys.exit(1)
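The `[<remote_user>@]<remote_host>:<path>` convention for `DEST` is implemented by `split_remote_and_path` from the module above; a few illustrative calls (the paths are hypothetical):

```python
# Illustrative behaviour of split_remote_and_path for the DEST argument;
# the paths used here are hypothetical.
from pathlib import Path

from disk_objectstore.backup_utils import split_remote_and_path

assert split_remote_and_path("/tmp/backup") == (None, Path("/tmp/backup"))
assert split_remote_and_path("user@host:/tmp/backup") == ("user@host", Path("/tmp/backup"))

# more than one colon raises "Invalid destination format: <remote>:<path>"
try:
    split_remote_and_path("host:/tmp:/backup")
except ValueError as exc:
    print(exc)
```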
diff --git a/docs/index.rst b/docs/index.rst
index 41dac50..c1b825d 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -64,6 +64,19 @@ and does not require a server running.

    ----------------------------------------------

+   :fa:`bookmark,mr-1` **Making backups**
+
+   Information on how to back up a container.
+
+   +++++++++++++++++++++++++++++++++++++++++++++
+
+   .. link-button:: pages/backup
+      :type: ref
+      :text: To the backup explanation
+      :classes: btn-outline-primary btn-block stretched-link
+
+   ----------------------------------------------
+
    :fa:`cogs,mr-1` **Design**

    Background information on the design of the package.
@@ -84,4 +97,5 @@ and does not require a server running.
    pages/advanced_usage
    pages/cli_usage
    pages/packing
+   pages/backup
    pages/design
diff --git a/docs/pages/backup.md b/docs/pages/backup.md
new file mode 100644
index 0000000..ec481c5
--- /dev/null
+++ b/docs/pages/backup.md
@@ -0,0 +1,119 @@
+# Making backups
+
+## User instructions
+
+A disk-objectstore container is fully contained in its root folder. If the container is not being modified, a backup can be made by simply copying this folder. The recommended tool is `rsync`, as the library was designed to be performant with it and to make use of its incremental copying capabilities.
+
+However, the preferred way to make a backup, which is also safe while the container is being used (except when repacking or deleting files), is the built-in CLI command:
+
+```console
+$ dostore backup --help
+Usage: dostore backup [OPTIONS] DEST
+
+  Create a backup of the container to destination location DEST, in a
+  subfolder backup_<timestamp>_<randstr> and point a symlink called `last-
+  backup` to it.
+
+  NOTE: This is safe to run while the container is being used.
+
+  NOTE: the symlink `last-backup` is omitted if the filesystem doesn't support
+  it.
+
+  Destination (DEST) can either be a local path, or a remote destination
+  (reachable via ssh). In the latter case, remote destination needs to have
+  the following syntax:
+
+     [<remote_user>@]<remote_host>:<path>
+
+  i.e., contain the remote host name and the remote path, separated by a colon
+  (and optionally the remote user separated by an @ symbol). You can tune SSH
+  parameters using the standard options given by OpenSSH, such as adding
+  configuration options to ~/.ssh/config (e.g. to allow for passwordless login
+  - recommended, since this script might ask multiple times for the password).
+
+  NOTE: 'rsync' and other UNIX-specific commands are called, thus the command
+  will not work on non-UNIX environments.
+
+Options:
+  --keep INTEGER    Number of previous backups to keep in the destination.
+                    (default: 1)
+  --rsync-exe TEXT  Specify the 'rsync' executable, if not in PATH. Used for
+                    both local and remote destinations.
+  --verbosity TEXT  Set verbosity [silent|info|debug], default is 'info'.
+  --help            Show this message and exit.
+
+```
+
+Example usage:
+
+```console
+$ dostore --path /path/to/container backup /path/to/backup
+INFO:Last backup is '/path/to/backup/backup_20231207142602_ymqf', using it for rsync --link-dest.
+INFO:Transferred /path/to/container/loose to /path/to/backup/live-backup
+INFO:Dumped the SQLite database to /tmp/tmpgewwse3f/packs.idx
+INFO:Transferred SQLite database to /path/to/backup/live-backup
+INFO:Transferred /path/to/container/packs to /path/to/backup/live-backup
+INFO:Backup moved from '/path/to/backup/live-backup' to '/path/to/backup/backup_20231207142913_pz7m'.
+INFO:Added symlink 'last-backup' to 'backup_20231207142913_pz7m'.
+INFO:Deleted old backup: /path/to/backup/backup_20231207131741_zar7
+```
+
+For more detailed information about how the backup is made, see the next section.
+
+## Detailed info/design
+
+The primary purpose of the backup functionality is to copy the content of the container in a specific order that prevents data corruption due to the container being updated during the copy. This order is the following:
+
+1. loose files;
+2. sqlite database that contains the packed file indexes;
+3. packed files.
+
+To understand why, let's consider the ways the backup could become corrupted:
+
+- When packing loose files (`optimize`) or adding directly-packed files, the library first adds data to a pack file and only then writes the metadata to the sqlite database. The backup becomes corrupted if the following happens:
+
+  1. data is being added to a pack file;
+  2. backup copies the pack file, containing the incomplete section;
+  3. the pack file is completed & the sqlite database is updated;
+  4. backup copies the sqlite database.
+
+  This results in a backup whose index references an incomplete section of a pack file. To prevent this, the sqlite database is always copied before the pack files. The backup can then still end up with an incomplete section at the end of a pack file, but since the backed-up index never references it, it effectively doesn't exist for the backup.
+
+- If loose files were instead copied last, the following might happen:
+
+  1. backup copies pack files & sqlite db;
+  2. user runs optimize & clean_storage, which adds the loose files to a pack & deletes the originals;
+  3. backup copies loose files.
+
+  This results in files missing from the backup. Therefore, loose files are copied first.
+
+Note: one should not run the backup while repacking or deleting files.
+
+Implementation details:
+
+- The backup command runs operating-system-level commands on the destination machine by using the Python `subprocess` library. These currently include:
+
+  1. running rsync;
+  2. for remote destinations, checking if the remote is accessible (`ssh <remote> exit`);
+  3. checking if the destination path exists (`[ -e <path> ]`);
+  4. creating the destination directory if it doesn't exist (`mkdir <path>`);
+  5. moving and removing folders.
+
+  For 3-5, remote destinations just get `ssh <remote>` prepended to the command, while rsync accesses a remote destination via its native interface. In both cases of remote access, the standard configuration options of OpenSSH apply (such as the configuration in `~/.ssh/config`).
+
+- Steps in order (the `--link-dest` mechanism is sketched after this list):
+  - Input validation:
+    - is the remote accessible?
+    - is `DEST` accessible?
+    - is the `rsync` executable found?
+  - Check if a backup already exists in `DEST`
+    - if yes, use the most recent one (based on the timestamp in its name) for the `rsync --link-dest` argument of all `rsync` calls
+  - Create the `DEST/live-backup` folder
+  - rsync the loose folder to `DEST/live-backup`
+  - dump the sqlite database in a safe manner to a `tempfile.TemporaryDirectory()`
+  - rsync the sqlite database to `DEST/live-backup`
+  - rsync the packed files to `DEST/live-backup`
+  - rsync everything else to `DEST/live-backup`
+  - rename `DEST/live-backup` to `DEST/backup_<timestamp>_<randstr>`
+  - update the `DEST/last-backup` symlink to point to `DEST/backup_<timestamp>_<randstr>`
+  - delete old backups, keeping only the number given by the `--keep` argument
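The incremental behaviour rests on `rsync --link-dest`: files unchanged since the previous backup are hard-linked rather than copied. A stripped-down sketch of the invocation that `BackupManager.call_rsync` assembles (flags taken from the module above; the paths here are hypothetical):

```python
# Stripped-down sketch of the rsync call assembled by BackupManager.call_rsync;
# the paths used here are hypothetical.
import subprocess
from pathlib import Path

src = Path("/path/to/container/loose")
dest = Path("/path/to/backup/live-backup")
prev = Path("/path/to/backup/backup_20231207142602_ymqf/loose")

cmd = [
    "rsync", "-azh", "-vv", "--no-whole-file",
    # unchanged files are hard-linked into the new backup instead of copied,
    # which keeps repeated backups fast and cheap on disk space
    f"--link-dest={prev.resolve()}",
    str(src),
    str(dest),
]
res = subprocess.run(cmd, capture_output=True, text=True, check=False)
if res.returncode != 0:
    raise RuntimeError(f"rsync failed for: {src} to {dest}")
```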
fails.""" + dest = "/tmp/test" + with pytest.raises(BackupError, match="rsync failed"): + manager = BackupManager(dest, backup_utils.backup_logger) + # pick a src that doesn't exists + manager.call_rsync(Path(f"/_{_random_string()}"), Path(dest)) + + +def test_rsync_dest_trailing_slash(temp_dir): + """Test case for dest_trailing_slash.""" + dest1 = Path(temp_dir) / "dest1" + dest2 = Path(temp_dir) / "dest2" + # manager will create dest1 folder + manager = BackupManager(str(dest1), backup_utils.backup_logger) + # dest_trailing_slash=True will create dest2 + manager.call_rsync(dest1, dest2, dest_trailing_slash=True) + assert dest2.exists() + + +def test_existing_backups_failure(): + """Test case where existing backups fail to be determined.""" + dest = "/tmp/test" + with pytest.raises(BackupError, match="Existing backups determination failed"): + manager = BackupManager(dest, backup_utils.backup_logger) + # override the path to something that will fail + manager.path = f"/_{_random_string()}" + manager.get_existing_backup_folders() + + +def test_sqlite_failure(monkeypatch, temp_container, temp_dir): + """Test case where sqlite fails to make a backup file.""" + + # monkeypatch sqlite backup to do nothing + def mock_sqlite_backup(src, dst): # pylint: disable=unused-argument + pass + + monkeypatch.setattr( + backup_utils, + "_sqlite_backup", + mock_sqlite_backup, + ) + + # make a container + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + dest = Path(temp_dir) / "backup" + with pytest.raises(BackupError, match="'.*' failed to be created."): + manager = BackupManager(str(dest), backup_utils.backup_logger) + manager.backup_auto_folders( + lambda path, prev: backup_utils.backup_container( + manager, temp_container, path, prev + ) + ) + + +def test_mv_failure(monkeypatch, temp_container, temp_dir): + """ + Test case where mv command fails by monkeypatching. + Make sure correct BackupError is raised. + """ + + # save a reference to the original run_cmd command + original_run_cmd = backup_utils.BackupManager.run_cmd + + # monkeypatch the run_cmd command to fail when "mv" is used + def mock_run_cmd(self, args): + if "mv" in args: + return False, "" + return original_run_cmd(self, args) + + monkeypatch.setattr( + backup_utils.BackupManager, + "run_cmd", + mock_run_cmd, + ) + + # make a container and back it up + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + dest = Path(temp_dir) / "backup" + with pytest.raises(BackupError, match="Failed to move"): + manager = BackupManager(str(dest), backup_utils.backup_logger) + manager.backup_auto_folders( + lambda path, prev: backup_utils.backup_container( + manager, temp_container, path, prev + ) + ) + + +def test_ln_failure(monkeypatch, temp_container, temp_dir, caplog): + """ + Test case where ln command fails by monkeypatching. + Make sure correct warning is logged. 
+ """ + + # save a reference to the original run_cmd command + original_run_cmd = backup_utils.BackupManager.run_cmd + + # monkeypatch the run_cmd command to fail when "mv" is used + def mock_run_cmd(self, args): + if "ln" in args: + return False, "" + return original_run_cmd(self, args) + + monkeypatch.setattr( + backup_utils.BackupManager, + "run_cmd", + mock_run_cmd, + ) + + # make a container and back it up + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + dest = Path(temp_dir) / "backup" + manager = BackupManager(str(dest), backup_utils.backup_logger) + manager.backup_auto_folders( + lambda path, prev: backup_utils.backup_container( + manager, temp_container, path, prev + ) + ) + assert "Couldn't create symlink" in caplog.text + + +def test_rm_failure(monkeypatch, temp_container, temp_dir, caplog): + """ + Test case where rm command fails by monkeypatching. + Make sure correct warning is logged. + Note, this is used for deleting old backups, so create two with keep=0. + """ + + # save a reference to the original run_cmd command + original_run_cmd = backup_utils.BackupManager.run_cmd + + # monkeypatch the run_cmd command to fail when "mv" is used + def mock_run_cmd(self, args): + if "rm" in args: + return False, "" + return original_run_cmd(self, args) + + monkeypatch.setattr( + backup_utils.BackupManager, + "run_cmd", + mock_run_cmd, + ) + + # make a container and back it up + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + dest = Path(temp_dir) / "backup" + manager = BackupManager(str(dest), backup_utils.backup_logger, keep=0) + for _ in range(2): + manager.backup_auto_folders( + lambda path, prev: backup_utils.backup_container( + manager, temp_container, path, prev + ) + ) + assert "Warning: couldn't delete old backup" in caplog.text diff --git a/tests/test_cli.py b/tests/test_cli.py index 9280471..5b7e2b9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,4 +1,5 @@ """Test the CLI commands""" +import platform from pathlib import Path import pytest @@ -185,3 +186,124 @@ def myimport( assert result.exit_code == 0 assert "INFO: no `tqdm` package found" in result.stdout assert "No errors found" in result.stdout + + +@pytest.mark.skipif( + platform.system() == "Windows", reason="Backup not supported on Windows" +) +@pytest.mark.parametrize( + "remote, verbosity", + [ + (False, None), + (False, "silent"), + (False, "info"), + (False, "debug"), + (True, None), + ], +) +def test_backup(temp_container, temp_dir, remote, verbosity): + """Test the backup command""" + + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + obj = cli.ContainerContext(temp_container.get_folder()) + + path = Path(temp_dir) / "backup" + + if remote: + destination = f"localhost:{str(path)}" + else: + destination = str(path) + + args = [destination] + + if verbosity: + args += [f"--verbosity={verbosity}"] + + result = CliRunner().invoke(cli.backup, args, obj=obj) + + assert result.exit_code == 0 + assert path.exists() + + path_contents = [entry.name for entry in path.iterdir()] + backup_dirs = [ + entry for entry in path.iterdir() if entry.name.startswith("backup_") + ] + + assert "last-backup" in path_contents + assert len(backup_dirs) == 1 + + backup_dir_contents = [entry.name for entry in backup_dirs[0].iterdir()] + + for 
item in ["config.json", "duplicates", "loose", "packs", "packs.idx", "sandbox"]: + assert item in backup_dir_contents + + # validate the backup + + obj = cli.ContainerContext(backup_dirs[0]) + result = CliRunner().invoke(cli.validate, obj=obj) + + assert result.exit_code == 0 + assert "No errors found" in result.stdout + + +@pytest.mark.skipif( + platform.system() == "Windows", reason="Backup not supported on Windows" +) +@pytest.mark.parametrize("remote", [False, True]) +def test_backup_repeated(temp_container, temp_dir, remote): + """Test the backup command repeated 3 times. + + Considering --keep 1 is default, the last one should get deleted. + """ + + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + obj = cli.ContainerContext(temp_container.get_folder()) + + path = Path(temp_dir) / "backup" + + if remote: + destination = f"localhost:{str(path)}" + else: + destination = str(path) + + for _ in range(3): + result = CliRunner().invoke(cli.backup, [destination], obj=obj) + assert result.exit_code == 0 + + assert path.exists() + path_contents = [entry.name for entry in path.iterdir()] + backup_dirs = [ + entry for entry in path.iterdir() if entry.name.startswith("backup_") + ] + + assert "last-backup" in path_contents + assert len(backup_dirs) == 2 + + +@pytest.mark.skipif( + platform.system() == "Windows", reason="Backup not supported on Windows" +) +def test_backup_failure(temp_container): + """Test failure when providing invalid destination""" + + temp_container.init_container(clear=True) + # Add a few objects + for idx in range(100): + temp_container.add_object(f"test-{idx}".encode()) + + obj = cli.ContainerContext(temp_container.get_folder()) + + dest = "abc:abc:" + + result = CliRunner().invoke(cli.backup, [dest], obj=obj) + + assert result.exit_code == 1 + assert "Error:" in result.stdout