diff --git a/ansible/cassandra-backup.yml b/ansible/cassandra-backup.yml index d8a054b580..73ef0fcbf1 100644 --- a/ansible/cassandra-backup.yml +++ b/ansible/cassandra-backup.yml @@ -1,7 +1,28 @@ +# This task will take snapshot serially for all cassandra nodes +# If we club both taking snapshot and upload the data, then the +# interval of snapshots b/w nodes will be very high; Which might lead +# to data discrepancies +- hosts: cassandra + become: yes + serial: true + tasks: + - name: taking cassandra snapshots + shell: | + nodetool clearsnapshot + nodetool snapshot -t "cassandra-backup-{{ lookup('pipe', 'date +%Y%m%d') }}-{{ ansible_hostname }}" + - name: sleeping 1 min b/w snapshots + pause: + echo: true + minutes: 1 # A positive number of minutes to pause for. + prompt: "Pausing after snapshot" # Optional text to use for the prompt message. + +# Once snaphot is done, +# We can take the snapshot and compress it and upload it +# This will take some cpu and memory in the nodes +# Because of that we're running it serially, so that it won't affect the perfomance - hosts: cassandra become: yes - serial: 1 vars_files: - ['{{ inventory_dir }}/secrets.yml'] roles: - - cassandra-backup + - {name: cassandra-backup, vars: [ additional_arguments: "--disablesnapshot"]} diff --git a/ansible/roles/azure-cli/tasks/main.yml b/ansible/roles/azure-cli/tasks/main.yml index cf29bcb106..798d80188c 100644 --- a/ansible/roles/azure-cli/tasks/main.yml +++ b/ansible/roles/azure-cli/tasks/main.yml @@ -1,2 +1,3 @@ - name: install azure cli - shell: curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + shell: + which az || curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash diff --git a/ansible/roles/cassandra-backup/tasks/main.yml b/ansible/roles/cassandra-backup/tasks/main.yml index 8d6da3b6d2..9d154af074 100755 --- a/ansible/roles/cassandra-backup/tasks/main.yml +++ b/ansible/roles/cassandra-backup/tasks/main.yml @@ -4,7 +4,10 @@ - name: copy the backup script become: yes - template: src=cassandra_backup.j2 dest=/data/cassandra/backup/cassandra_backup.py mode=0755 + template: + src: ../../../../deploy/cassandra_backup.py + dest: /data/cassandra/backup/cassandra_backup.py + mode: 0755 - set_fact: cassandra_backup_gzip_file_name: "cassandra-backup-{{ lookup('pipe', 'date +%Y%m%d') }}-{{ ansible_hostname }}" @@ -14,7 +17,7 @@ - name: run the backup script become: yes - shell: python cassandra_backup.py --snapshotname "{{ cassandra_backup_gzip_file_name }}" + shell: python3 cassandra_backup.py --snapshotname "{{ cassandra_backup_gzip_file_name }}" "{{additional_arguments|d('')}}" args: chdir: /data/cassandra/backup async: 7200 diff --git a/deploy/cassandra_backup.py b/deploy/cassandra_backup.py index 076379a9dd..bdabf66373 100755 --- a/deploy/cassandra_backup.py +++ b/deploy/cassandra_backup.py @@ -19,13 +19,14 @@ for help ./cassandra_backup.py -h ''' -from os import walk, sep, system, getcwd, makedirs +from os import walk, sep, system, getcwd, makedirs, cpu_count from argparse import ArgumentParser from shutil import rmtree, ignore_patterns, copytree from re import match, compile from sys import exit from tempfile import mkdtemp from time import strftime +import concurrent.futures parser = ArgumentParser(description="Create a snapshot and create tar ball inside tardirectory") parser.add_argument("-d", "--datadirectory", metavar="datadir", default='/var/lib/cassandra/data', @@ -35,12 +36,19 @@ help="Name with which snapshot to be taken. Default {}".format("cassandra_backup-"+strftime("%Y-%m-%d"))) parser.add_argument("-t", "--tardirectory", metavar="tardir", default=getcwd(), help="Path to create the tarball. Default {}".format(getcwd())) +parser.add_argument("-w", "--workers", metavar="workers", + default=cpu_count(), help="Number of workers to use. Default same as cpu cores {}".format(cpu_count())) +parser.add_argument("--disablesnapshot", action="store_true", + help="disable taking snapshot, snapshot name can be given via -s flag") args = parser.parse_args() # Create temporary directory to copy data tmpdir = mkdtemp() makedirs(tmpdir+sep+"cassandra_backup") +def customCopy(root, root_target_dir): + print("copying {} to {}".format(root, root_target_dir)) + copytree(src=root, dst=root_target_dir, ignore=ignore_patterns('.*')) def copy(): ''' @@ -48,17 +56,26 @@ def copy(): ''' root_levels = args.datadirectory.rstrip('/').count(sep) ignore_list = compile(tmpdir+sep+"cassandra_backup"+sep+'(system|system|systemtauth|system_traces|system_schema|system_distributed)') - + # List of the threds running in background + futures = [] try: - for root, dirs, files in walk(args.datadirectory): - root_target_dir = tmpdir+sep+"cassandra_backup"+sep+sep.join(root.split(sep)[root_levels+1:-2]) - if match(ignore_list, root_target_dir): - continue - if root.split(sep)[-1] == args.snapshotname: - copytree(src=root, dst=root_target_dir, ignore=ignore_patterns('.*')) + with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as executor: + for root, dirs, files in walk(args.datadirectory): + root_target_dir = tmpdir+sep+"cassandra_backup"+sep+sep.join(root.split(sep)[root_levels+1:-2]) + if match(ignore_list, root_target_dir): + continue + if root.split(sep)[-1] == args.snapshotname: + # Keeping copy operation in background with threads + tmp_arr = [root, root_target_dir] + futures.append( executor.submit( lambda p: customCopy(*p), tmp_arr)) except Exception as e: print(e) - + # Checking status of the copy operation + for future in concurrent.futures.as_completed(futures): + try: + print("Task completed. Result: {}".format(future.result())) + except Exception as e: + print(e) # Creating schema command = "cqlsh -e 'DESC SCHEMA' > {}/cassandra_backup/db_schema.cql".format(tmpdir) @@ -67,18 +84,30 @@ def copy(): print("Couldn't backup schema, exiting...") exit(1) print("Schema backup completed. saved in {}/cassandra_backup/db_schema.sql".format(tmpdir)) -# Cleaning all old snapshots -command = "nodetool clearsnapshot" -system(command) + # Creating snapshots -command = "nodetool snapshot -t {}".format(args.snapshotname) -rc = system(command) -if rc == 0: - print("Snapshot taken.") - copy() - print("Making a tarball: {}.tar.gz".format(args.snapshotname)) - command = "cd {} && tar -czvf {}/{}.tar.gz *".format(tmpdir, args.tardirectory, args.snapshotname) +if not args.disablesnapshot: + # Cleaning all old snapshots + command = "nodetool clearsnapshot" system(command) - # Cleaning up backup directory - rmtree(tmpdir) - print("Cassandra backup completed and stored in {}/{}.tar.gz".format(args.tardirectory, args.snapshotname)) + # Taking new snapshot + command = "nodetool snapshot -t {}".format(args.snapshotname) + rc = system(command) + if rc != 0: + print("Backup failed") + exit(1) + print("Snapshot taken.") + +# Copying the snapshot to proper folder structure +copy() +# Creating tarball +print("Making a tarball: {}.tar.gz".format(args.snapshotname)) +command = "cd {} && tar --remove-files -czvf {}/{}.tar.gz *".format(tmpdir, args.tardirectory, args.snapshotname) +rc = system(command) +if rc != 0: + print("Creation of tar failed") + exit(1) +# Cleaning up backup directory +rmtree(tmpdir) +print("Cassandra backup completed and stored in {}/{}.tar.gz".format(args.tardirectory, args.snapshotname)) +