From 3a7d627b2d4785e574d0e4e6747c029d982ebbcb Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 19:53:16 +0000
Subject: [PATCH 01/48] Edit run_hash func to allow reading in previous hashed
 files from json

---
 deduplify/hash_files.py | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index e4ab4b9..7b5b1f6 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -15,6 +15,7 @@
 import hashlib
 import logging
 from typing import Tuple
+from itertools import chain
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
@@ -80,7 +81,9 @@ def filter_dict(results: dict) -> Tuple[dict, dict]:
     return duplicated, unique
 
 
-def run_hash(dir: str, count: int, dupfile: str, unfile: str, **kwargs):
+def run_hash(
+    dir: str, count: int, dupfile: str, unfile: str, restart: bool = False, **kwargs
+):
     """Hash files within a directory structure
 
     Args:
@@ -88,21 +91,47 @@ def run_hash(dir: str, count: int, dupfile: str, unfile: str, **kwargs):
         count (int): Number of threads to parallelise over
         dupfile (str): JSON file location for duplicated hashes
         unfile (str): JSON file location for unique hashes
+        restart (bool): If true, will restart a hash run. dupfile and unfile
+            must exist since the filenames already hashed will be skipped.
+            Default: False.
     """
     # Check the directory path exists
     if not os.path.exists(dir):
         raise ValueError("Please provide a known filepath!")
 
+    if restart:
+        for input_file in [dupfile, unfile]:
+            if not os.path.isfile(input_file):
+                raise FileNotFoundError(
+                    f"{input_file} must exist to restart a hash run!"
+                )
+
+        with open(dupfile) as stream:
+            dup_dict = json.load(stream)
+
+        with open(unfile) as stream:
+            un_dict = json.load(stream)
+
+        pre_hashed_dict = {**dup_dict, **un_dict}
+        files_to_skip = list(chain(*pre_hashed_dict.values()))
+    else:
+        files_to_skip = []
+    print(files_to_skip[:10])
+
     logger.info("Walking structure of: %s" % dir)
     logger.info("Generating MD5 hashes for files...")
-    hashes = defaultdict(list)  # Empty dict to store hashes in
     counter = 0
+    if restart:
+        hashes = pre_hashed_dict.copy()
+    else:
+        hashes = defaultdict(list)  # Empty dict to store hashes in
 
-    for dirName, subdirs, fileList in os.walk(dir):
+    for dirName, _, fileList in os.walk(dir):
         with ThreadPoolExecutor(max_workers=count) as executor:
             futures = [
                 executor.submit(hashfile, os.path.join(dirName, filename))
                 for filename in fileList
+                if filename not in files_to_skip
             ]
             for future in as_completed(futures):
                 hash, filepath = future.result()

From 6fd6f4021c0902afe8cc331697dcf080f625de18 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 19:54:44 +0000
Subject: [PATCH 02/48] Add --restart flag to cli

---
 deduplify/cli.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/deduplify/cli.py b/deduplify/cli.py
index 3e8d93e..756c1c7 100644
--- a/deduplify/cli.py
+++ b/deduplify/cli.py
@@ -83,6 +83,11 @@ def parse_args(args):
         default="uniques.json",
         help="Destination file for unique hashes. Must be a JSON file. Default: uniques.json",
     )
+    parser_hash.add_argument(
+        "--restart",
+        action="store_true",
+        help="Restart a run of hashing files and skip over files that have already been hashed. Output files containing duplicated and unique filenames must already exist.",
+    )
 
     # Compare subcommand
     parser_compare = subparsers.add_parser(
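The new flag wires straight into the `hash` subcommand's namespace. As a quick sanity check of the behaviour (a sketch only, assuming `parse_args` finishes by returning `parser.parse_args(args)`, which is not shown in this hunk):

```python
from deduplify.cli import parse_args

# "store_true" means the flag defaults to False and flips to True when passed
args = parse_args(["hash", "path/to/data", "--restart"])
assert args.restart is True

args = parse_args(["hash", "path/to/data"])
assert args.restart is False
```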
From e061609b7fc64237c0fb1f01358d2774d431701f Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 19:58:44 +0000
Subject: [PATCH 03/48] Remove unnecessary print command

---
 deduplify/hash_files.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 7b5b1f6..5779ac1 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -116,7 +116,6 @@ def run_hash(
         files_to_skip = list(chain(*pre_hashed_dict.values()))
     else:
         files_to_skip = []
-    print(files_to_skip[:10])
 
     logger.info("Walking structure of: %s" % dir)
     logger.info("Generating MD5 hashes for files...")

From a10677b39e15fb64413bbfff53558a6396e0ed2a Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 20:28:31 +0000
Subject: [PATCH 04/48] Version bump

---
 deduplify/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduplify/_version.py b/deduplify/_version.py
index 007cab2..5adebd9 100644
--- a/deduplify/_version.py
+++ b/deduplify/_version.py
@@ -7,5 +7,5 @@
 
 from incremental import Version
 
-__version__ = Version("deduplify", 0, 1, 2)
+__version__ = Version("deduplify", 0, 2, 0)
 __all__ = ["__version__"]

From 50c038c25a0cd3d07625faa5ec62c3e3d191b2c5 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 09:24:00 +0000
Subject: [PATCH 05/48] Add logging statement when restarting the hashing
 process

---
 deduplify/hash_files.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 5779ac1..367b959 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -100,6 +100,8 @@ def run_hash(
         raise ValueError("Please provide a known filepath!")
 
     if restart:
+        logger.info("Restarting hashing process")
+
         for input_file in [dupfile, unfile]:
             if not os.path.isfile(input_file):
                 raise FileNotFoundError(

From c52499d105bd789c22f2e18fa35e40cea4436309 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 20:20:13 +0000
Subject: [PATCH 06/48] Define a function to count total number of xml files

---
 deduplify/hash_files.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 367b959..362445a 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -14,6 +14,7 @@
 import json
 import hashlib
 import logging
+import subprocess
 from typing import Tuple
 from itertools import chain
 from collections import defaultdict
@@ -22,6 +23,27 @@
 logger = logging.getLogger()
 
 
+def get_total_number_of_files(dir: str, file_ext: str = ".xml") -> int:
+    """Count the total number of files of a given extension in a directory.
+
+    Args:
+        dir (str): The target directory to search.
+        file_ext (str): The file extension to search for. Default: .xml
+
+    Returns:
+        int: The number of files with the matching extension within the tree
+            of the target directory
+    """
+    find_cmd = ["find", dir, "-type", "f", "-name", f'"*{file_ext}"']
+    wc_cmd = ["wc", "-l"]
+
+    find_proc = subprocess.Popen(find_cmd, stdout=subprocess.PIPE)
+    output = subprocess.check_output(wc_cmd, stdin=find_proc.stdout)
+    find_proc.wait()
+
+    return int(output.decode("utf-8").strip("\n"))
+
+
 def hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]:
     """Calculate the MD5 hash of a given file
 
@@ -99,6 +121,8 @@ def run_hash(
     if not os.path.exists(dir):
         raise ValueError("Please provide a known filepath!")
 
+    total_file_number = get_total_number_of_files(dir)
+
     if restart:
         logger.info("Restarting hashing process")
 
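One caveat in the `find` invocation above (an observation, not part of the original series): because `subprocess.Popen` runs the command without a shell, the pattern `f'"*{file_ext}"'` reaches `find` with literal double quotes in it, so it matches nothing useful — the quoting is only needed when a shell would otherwise expand the glob. A shell-free sketch of the same count:

```python
import subprocess

def count_with_find(target_dir: str, file_ext: str = ".xml") -> int:
    # No shell is involved, so pass the glob pattern unquoted; find gets it verbatim
    find_cmd = ["find", target_dir, "-type", "f", "-name", f"*{file_ext}"]
    find_proc = subprocess.run(find_cmd, capture_output=True, text=True, check=True)
    # Each matching path is printed on its own line; count the non-empty ones
    return len([line for line in find_proc.stdout.splitlines() if line])
```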
From d9d7adf62fc5457a90feef6b5fe349e84b69162e Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 20:20:43 +0000
Subject: [PATCH 07/48] Deprecate previous counter mechanism

---
 deduplify/hash_files.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 362445a..8a4f657 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -145,7 +145,7 @@ def run_hash(
     logger.info("Walking structure of: %s" % dir)
     logger.info("Generating MD5 hashes for files...")
-    counter = 0
+    # counter = 0
     if restart:
         hashes = pre_hashed_dict.copy()
     else:
@@ -162,9 +162,9 @@ def run_hash(
             for future in as_completed(futures):
                 hash, filepath = future.result()
                 hashes[hash].append(filepath)
-                counter += 1
-                print(f"Total files hashed: {counter}", end="\r")
-                sys.stdout.flush()
+                # counter += 1
+                # print(f"Total files hashed: {counter}", end="\r")
+                # sys.stdout.flush()
 
     dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
 

From b1eb8f423a9d0cac3df091c3c6b2d969186c1218 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 20:21:03 +0000
Subject: [PATCH 08/48] Apply manually controlled tqdm progress bar

---
 deduplify/hash_files.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 8a4f657..9384812 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -15,6 +15,7 @@
 import hashlib
 import logging
 import subprocess
+from tqdm import tqdm
 from typing import Tuple
 from itertools import chain
 from collections import defaultdict
@@ -151,6 +152,8 @@ def run_hash(
     else:
         hashes = defaultdict(list)  # Empty dict to store hashes in
 
+    pbar = tqdm(total=total_file_number - len(files_to_skip))
+
     for dirName, _, fileList in os.walk(dir):
         with ThreadPoolExecutor(max_workers=count) as executor:
             futures = [
@@ -166,6 +169,10 @@ def run_hash(
                 # counter += 1
                 # print(f"Total files hashed: {counter}", end="\r")
                 # sys.stdout.flush()
 
+                pbar.update(1)
+
+    pbar.close()
+
     dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
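The manually controlled progress-bar pattern the patch applies, shown in isolation (a minimal sketch; `total` must be known up front for the bar to render percentages):

```python
from tqdm import tqdm

work_items = range(120)

pbar = tqdm(total=len(work_items))  # fixed total, advanced by hand
for item in work_items:
    pass                            # ... hash one file here ...
    pbar.update(1)                  # move the bar forward one unit of work
pbar.close()                        # always close so the terminal is restored
```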
From 854fad5e93b0b10136c43ba8587a438bde3422ba Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 20:39:08 +0000
Subject: [PATCH 09/48] Comment out unused import

---
 deduplify/hash_files.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 9384812..17ce060 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -10,7 +10,7 @@
 """
 
 import os
-import sys
+# import sys
 import json
 import hashlib
 import logging

From 6a54c88a291cb7a8be34d553503bb8ad6f47db24 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 20:40:32 +0000
Subject: [PATCH 10/48] Run formatter

---
 deduplify/hash_files.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 17ce060..a5df73d 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -8,9 +8,9 @@
 Author: Sarah Gibson
 Python version: >=3.7 (developed with 3.8)
 """
+# import sys
 
 import os
-# import sys
 import json
 import hashlib
 import logging

From 419b79b616dffb51da734582979fc75886d0df0b Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 09:14:34 +0000
Subject: [PATCH 11/48] Add logging statements to get_total_numbers_of_files
 func

---
 deduplify/hash_files.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index a5df73d..9237d3b 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -35,6 +35,8 @@ def get_total_number_of_files(dir: str, file_ext: str = ".xml") -> int:
         int: The number of files with the matching extension within the tree
             of the target directory
     """
+    logger.info("Calculating number of files that will be hashed")
+
     find_cmd = ["find", dir, "-type", "f", "-name", f'"*{file_ext}"']
     wc_cmd = ["wc", "-l"]
 
@@ -42,7 +44,11 @@ def get_total_number_of_files(dir: str, file_ext: str = ".xml") -> int:
     find_proc = subprocess.Popen(find_cmd, stdout=subprocess.PIPE)
     output = subprocess.check_output(wc_cmd, stdin=find_proc.stdout)
     find_proc.wait()
 
-    return int(output.decode("utf-8").strip("\n"))
+    output = int(output.decode("utf-8").strip("\n"))
+
+    logger.info("%s files to be hashed" % output)
+
+    return output

From fc42a9a0b46abaae2926afd6c100437f5f87458e Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 15:20:08 +0000
Subject: [PATCH 12/48] Merging stash and resolving conflicts

---
 deduplify/hash_files.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 9237d3b..958b186 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -154,7 +154,7 @@ def run_hash(
     logger.info("Generating MD5 hashes for files...")
     # counter = 0
     if restart:
-        hashes = pre_hashed_dict.copy()
+        hashes = defaultdict(lambda: None, pre_hashed_dict)
     else:
         hashes = defaultdict(list)  # Empty dict to store hashes in
 
@@ -171,6 +171,13 @@ def run_hash(
                 hash, filepath = future.result()
                 hashes[hash].append(filepath)
 
+                dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
+
+                for filename, content in zip([dupfile, unfile], [dup_dict, unique_dict]):
+                    logger.info("Writing outputs to: %s" % filename)
+                    with open(filename, "w") as f:
+                        json.dump(content, f, indent=2, sort_keys=True)
+
                 # counter += 1
                 # print(f"Total files hashed: {counter}", end="\r")
                 # sys.stdout.flush()
@@ -178,10 +185,3 @@ def run_hash(
                 pbar.update(1)
 
     pbar.close()
-
-    dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
-
-    for filename, content in zip([dupfile, unfile], [dup_dict, unique_dict]):
-        logger.info("Writing outputs to: %s" % filename)
-        with open(filename, "w") as f:
-            json.dump(content, f, indent=2, sort_keys=True)
From 80588469cbd21dbc32c6af5a43e4d1e0dd28b14c Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 15:55:46 +0000
Subject: [PATCH 13/48] Rename dir arg in get_total_number_of_files func

---
 deduplify/hash_files.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 958b186..c84d039 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -24,11 +24,11 @@
 logger = logging.getLogger()
 
 
-def get_total_number_of_files(dir: str, file_ext: str = ".xml") -> int:
+def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
     """Count the total number of files of a given extension in a directory.
 
     Args:
-        dir (str): The target directory to search.
+        target_dir (str): The target directory to search.
         file_ext (str): The file extension to search for. Default: .xml
 
     Returns:
@@ -37,7 +37,7 @@ def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
     """
     logger.info("Calculating number of files that will be hashed")
 
-    find_cmd = ["find", dir, "-type", "f", "-name", f'"*{file_ext}"']
+    find_cmd = ["find", target_dir, "-type", "f", "-name", f'\"*{file_ext}\"']
     wc_cmd = ["wc", "-l"]
 
     find_proc = subprocess.Popen(find_cmd, stdout=subprocess.PIPE)

From a445febe08050bc48e19ec05b6f4922d1f5188a5 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 15:56:37 +0000
Subject: [PATCH 14/48] [HOTFIX] Fixing file number total

---
 deduplify/hash_files.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index c84d039..0a2b147 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -128,8 +128,6 @@ def run_hash(
     if not os.path.exists(dir):
         raise ValueError("Please provide a known filepath!")
 
-    total_file_number = get_total_number_of_files(dir)
-
     if restart:
         logger.info("Restarting hashing process")
 
@@ -158,7 +156,8 @@ def run_hash(
     else:
         hashes = defaultdict(list)  # Empty dict to store hashes in
 
-    pbar = tqdm(total=total_file_number - len(files_to_skip))
+    total = 10410200 - len(files_to_skip)
+    pbar = tqdm(total=total)
 
     for dirName, _, fileList in os.walk(dir):
         with ThreadPoolExecutor(max_workers=count) as executor:

From f4bfb9ccc18b6ad84e7b37b17fa891b6987a4578 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 15:57:05 +0000
Subject: [PATCH 15/48] setting a defaultdict from previous dict

---
 deduplify/hash_files.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 0a2b147..bb46ecb 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -143,6 +143,9 @@ def run_hash(
         with open(unfile) as stream:
             un_dict = json.load(stream)
 
+        for key, value in un_dict.items():
+            un_dict[key] = [value]
+
         pre_hashed_dict = {**dup_dict, **un_dict}
         files_to_skip = list(chain(*pre_hashed_dict.values()))
     else:

From 7c55422d7669be63627a71d124b53de23ce47070 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 15:57:16 +0000
Subject: [PATCH 16/48] Update log message

---
 deduplify/hash_files.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index bb46ecb..8c4f7b5 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -105,7 +105,7 @@ def filter_dict(results: dict) -> Tuple[dict, dict]:
     for value in duplicated.values():
         total += len(value)
 
-    logger.info("Number of identical files: %s" % total)
+    logger.info("Number of duplicated files: %s" % total)
 
     return duplicated, unique
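The loop added in PATCH 15 exists because the two output files store different shapes: `uniques.json` maps each hash to a single path string, while `duplicates.json` maps each hash to a list of paths. Normalising the unique values to one-element lists makes the two dictionaries mergeable and lets the values be flattened uniformly (a sketch of the idea):

```python
dup_dict = {"hash2": ["b.xml", "c.xml"]}        # duplicates.json shape
un_dict = {"hash1": "a.xml"}                    # uniques.json shape

un_dict = {k: [v] for k, v in un_dict.items()}  # normalise str -> [str]
pre_hashed = {**dup_dict, **un_dict}            # later keys win on collision
files_to_skip = [p for paths in pre_hashed.values() for p in paths]
assert files_to_skip == ["b.xml", "c.xml", "a.xml"]
```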
From 53946bc77e38729b8dd8aec625065fbfe5534e8f Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 16:42:01 +0000
Subject: [PATCH 17/48] Fix counting of files

---
 deduplify/hash_files.py | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 8c4f7b5..fb2a101 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -14,7 +14,7 @@
 import json
 import hashlib
 import logging
-import subprocess
+import fnmatch
 from tqdm import tqdm
 from typing import Tuple
 from itertools import chain
@@ -24,6 +24,18 @@
 logger = logging.getLogger()
 
 
+def resolvepath(path):
+    """Resolve and normalize a path
+
+    1. Handle tilde expansion; turn ~/.ssh into /home/user/.ssh and
+       ~otheruser/bin to /home/otheruser/bin
+    2. Normalize the path so that it doesn't contain relative segments, turning
+       e.g. /usr/local/../bin to /usr/bin
+    3. Get the real path of the actual file, resolving symbolic links
+    """
+    return os.path.realpath(os.path.normpath(os.path.expanduser(path)))
+
+
 def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
     """Count the total number of files of a given extension in a directory.
 
@@ -35,18 +47,12 @@ def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
     Args:
         target_dir (str): The target directory to search.
         file_ext (str): The file extension to search for. Default: .xml
 
     Returns:
         int: The number of files with the matching extension within the tree
             of the target directory
     """
-    logger.info("Calculating number of files that will be hashed")
+    logger.info("Calculating number of files that will be hashed in %s" % target_dir)
 
-    find_cmd = ["find", target_dir, "-type", "f", "-name", f'\"*{file_ext}\"']
-    wc_cmd = ["wc", "-l"]
+    dirpath = resolvepath(target_dir)
+    output = len(fnmatch.filter(os.listdir(dirpath), f"*{file_ext}"))
 
-    find_proc = subprocess.Popen(find_cmd, stdout=subprocess.PIPE)
-    output = subprocess.check_output(wc_cmd, stdin=find_proc.stdout)
-    find_proc.wait()
-
-    output = int(output.decode("utf-8").strip("\n"))
-
-    logger.info("%s files to be hashed" % output)
+    logger.info("%s files to be hashed in %s" % (output, target_dir))
 
     return output
@@ -128,6 +134,8 @@ def run_hash(
     if not os.path.exists(dir):
         raise ValueError("Please provide a known filepath!")
 
+    total_file_num = get_total_number_of_files(dir)
+
     if restart:
         logger.info("Restarting hashing process")
 
@@ -147,7 +155,7 @@ def run_hash(
             un_dict[key] = [value]
 
         pre_hashed_dict = {**dup_dict, **un_dict}
-        files_to_skip = list(chain(*pre_hashed_dict.values()))
+        files_to_skip = [item for values in pre_hashed_dict.values() for item in values]
     else:
         files_to_skip = []
 
@@ -167,7 +167,7 @@ def run_hash(
     else:
         hashes = defaultdict(list)  # Empty dict to store hashes in
 
-    total = 10410200 - len(files_to_skip)
+    total = total_file_num - len(files_to_skip)
     pbar = tqdm(total=total)
 
     for dirName, _, fileList in os.walk(dir):
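Note a behavioural change hiding in this fix (an observation, not part of the original series): `os.listdir` only sees the top level of `target_dir`, whereas the `find` command it replaces descended the whole tree. A recursive variant, sketched with `os.walk` to match the original semantics:

```python
import fnmatch
import os

def count_files_recursively(target_dir: str, file_ext: str = ".xml") -> int:
    # os.walk yields (dirpath, dirnames, filenames) for every directory in the tree
    return sum(
        len(fnmatch.filter(filenames, f"*{file_ext}"))
        for _, _, filenames in os.walk(target_dir)
    )
```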
From 5219de56377585d0d697cbe93a9618a9c0c721ec Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 10:28:47 +0000
Subject: [PATCH 18/48] Removing deprecated code

---
 deduplify/hash_files.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index fb2a101..a47e131 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -8,8 +8,6 @@
 Author: Sarah Gibson
 Python version: >=3.7 (developed with 3.8)
 """
-# import sys
-
 import os
 import json
 import hashlib
@@ -17,7 +15,6 @@
 import fnmatch
 from tqdm import tqdm
 from typing import Tuple
-from itertools import chain
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
@@ -161,7 +158,7 @@ def run_hash(
     logger.info("Walking structure of: %s" % dir)
     logger.info("Generating MD5 hashes for files...")
-    # counter = 0
+
     if restart:
         hashes = defaultdict(list, pre_hashed_dict)
     else:
@@ -188,10 +185,6 @@ def run_hash(
                     with open(filename, "w") as f:
                         json.dump(content, f, indent=2, sort_keys=True)
 
-                # counter += 1
-                # print(f"Total files hashed: {counter}", end="\r")
-                # sys.stdout.flush()
-
                 pbar.update(1)
 
     pbar.close()

From bb3db10806d2523a5143ebdb9d6fbaa8a600991b Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 10:33:29 +0000
Subject: [PATCH 19/48] Resolve filepaths in the CLI

---
 deduplify/cli.py        | 21 +++++++++++++++++----
 deduplify/hash_files.py | 15 +--------------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/deduplify/cli.py b/deduplify/cli.py
index 756c1c7..ff7830e 100644
--- a/deduplify/cli.py
+++ b/deduplify/cli.py
@@ -1,3 +1,4 @@
+import os
 import sys
 import logging
 import argparse
@@ -28,6 +29,18 @@ def setup_logging(verbose=False):
     )
 
 
+def resolvepath(path):
+    """Resolve and normalize a path
+
+    1. Handle tilde expansion; turn ~/.ssh into /home/user/.ssh and
+       ~otheruser/bin to /home/otheruser/bin
+    2. Normalize the path so that it doesn't contain relative segments, turning
+       e.g. /usr/local/../bin to /usr/bin
+    3. Get the real path of the actual file, resolving symbolic links
+    """
+    return os.path.realpath(os.path.normpath(os.path.expanduser(path)))
+
+
 def parse_args(args):
     parser = argparse.ArgumentParser(
         description="Find and delete duplicated files in messy datasets"
@@ -56,7 +69,7 @@ def parse_args(args):
     # Positional parser
     parser_pos = argparse.ArgumentParser(add_help=False)
     parser_pos.add_argument(
-        "dir", type=str, help="Path to directory to begin search from"
+        "dir", type=resolvepath, help="Path to directory to begin search from"
     )
 
     # Hash subcommand
@@ -70,7 +83,7 @@ def parse_args(args):
     parser_hash.add_argument(
         "-d",
         "--dupfile",
-        type=str,
+        type=resolvepath,
         dest="dupfile",
         default="duplicates.json",
         help="Destination file for duplicated hashes. Must be a JSON file. Default: duplicates.json",
@@ -78,7 +91,7 @@ def parse_args(args):
     parser_hash.add_argument(
         "-u",
         "--unfile",
-        type=str,
+        type=resolvepath,
         dest="unfile",
         default="uniques.json",
         help="Destination file for unique hashes. Must be a JSON file. Default: uniques.json",
@@ -100,7 +113,7 @@ def parse_args(args):
     parser_compare.add_argument(
         "-i",
         "--infile",
-        type=str,
+        type=resolvepath,
         default="duplicates.json",
         help="Filename to analyse. Must be a JSON file. Default: duplicates.json",
     )
diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index a47e131..d6b104d 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -21,18 +21,6 @@
 logger = logging.getLogger()
 
 
-def resolvepath(path):
-    """Resolve and normalize a path
-
-    1. Handle tilde expansion; turn ~/.ssh into /home/user/.ssh and
-       ~otheruser/bin to /home/otheruser/bin
-    2. Normalize the path so that it doesn't contain relative segments, turning
-       e.g. /usr/local/../bin to /usr/bin
-    3. Get the real path of the actual file, resolving symbolic links
-    """
-    return os.path.realpath(os.path.normpath(os.path.expanduser(path)))
-
-
 def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
     """Count the total number of files of a given extension in a directory.
 
@@ -46,8 +34,7 @@ def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
     """
     logger.info("Calculating number of files that will be hashed in %s" % target_dir)
 
-    dirpath = resolvepath(target_dir)
-    output = len(fnmatch.filter(os.listdir(dirpath), f"*{file_ext}"))
+    output = len(fnmatch.filter(os.listdir(target_dir), f"*{file_ext}"))
 
     logger.info("%s files to be hashed in %s" % (output, target_dir))
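With `type=resolvepath` on the arguments, every path is normalised before any other code sees it. For example (hypothetical paths, assuming a user whose home directory is `/home/user`):

```python
import os

def resolvepath(path):
    # expanduser: ~ -> /home/user; normpath: drop ../ segments; realpath: follow symlinks
    return os.path.realpath(os.path.normpath(os.path.expanduser(path)))

print(resolvepath("~/data/../archive"))  # -> /home/user/archive
```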
From 5e64212f5ac245137b45ee07784297302df57e66 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 10:51:56 +0000
Subject: [PATCH 20/48] Set dev/testing requirements

---
 .github/workflows/ci.yml | 2 +-
 dev-requirements.txt     | 2 ++
 setup.cfg                | 2 ++
 setup.py                 | 3 ++-
 4 files changed, 7 insertions(+), 2 deletions(-)
 create mode 100644 dev-requirements.txt
 create mode 100644 setup.cfg

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index baaedf6..567b7eb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,7 +28,7 @@ jobs:
         run: |
           python -m pip install -U pip
           pip install -r requirements.txt
-          pip install pytest coverage
+          pip install -r dev-requirements.txt
 
       - name: Run tests
         run: |
diff --git a/dev-requirements.txt b/dev-requirements.txt
new file mode 100644
index 0000000..20330a5
--- /dev/null
+++ b/dev-requirements.txt
@@ -0,0 +1,2 @@
+coverage==5.3
+pytest==6.2.2
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..b7e4789
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,2 @@
+[aliases]
+test=pytest
diff --git a/setup.py b/setup.py
index d5b487d..e589937 100644
--- a/setup.py
+++ b/setup.py
@@ -75,5 +75,6 @@
         "Programming Language :: Python :: 3.7",
     ],
     use_incremental=True,
-    setup_requires=["incremental"],
+    setup_requires=["incremental", "pytest-runner"],
+    tests_require=test_require,
 )

From b6507ede2f91b9edc94526afd186f2f02917ad08 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 10:53:36 +0000
Subject: [PATCH 21/48] Merge lint and format CI jobs into 1 file

---
 .github/workflows/ci.yml          | 42 +++++++++++++++++++++++++++++++
 .github/workflows/lint-format.yml | 35 --------------------------
 2 files changed, 42 insertions(+), 35 deletions(-)
 delete mode 100644 .github/workflows/lint-format.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 567b7eb..6061f81 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -37,3 +37,45 @@ jobs:
     - name: Print coverage report
       run: |
         coverage report
+
+  formatting:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+
+      - name: Setup Python 3.8
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.8
+
+      - name: Install dependencies
+        run: |
+          python -m pip install -U pip
+          pip install black
+
+      - name: Format Python files with black
+        run: |
+          black --check .
+
+  linting:
+    runs-on: ubuntu-16.04
+
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+
+      - name: Setup Python 3.8
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.8
+
+      - name: Install dependencies
+        run: |
+          python -m pip install -U pip
+          pip install flake8
+
+      - name: Lint Python files with flake8
+        run: |
+          flake8 --ignore=E501 .
diff --git a/.github/workflows/lint-format.yml b/.github/workflows/lint-format.yml
deleted file mode 100644
index 71ec3f6..0000000
--- a/.github/workflows/lint-format.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-name: Lint and Format Python Files
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    branches:
-      - main
-
-jobs:
-  lint-format:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v2
-
-      - name: Setup Python 3.8
-        uses: actions/setup-python@v1
-        with:
-          python-version: 3.8
-
-      - name: Install dependencies
-        run: |
-          python -m pip install -U pip
-          pip install black flake8
-
-      - name: Format Python files with black
-        run: |
-          black --check .
-
-      - name: Lint Python files with flake8
-        run: |
-          flake8 --ignore=E501 .

From 201dc5568f877a8e532385ea4eb127ff18633fd1 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:02:15 +0000
Subject: [PATCH 22/48] Test get_total_number_of_files func

---
 tests/test_hash.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tests/test_hash.py b/tests/test_hash.py
index 9e7a128..2a1eae0 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -1,4 +1,6 @@
-from deduplify.hash_files import filter_dict
+import os
+from deduplify.cli import resolvepath
+from deduplify.hash_files import filter_dict, get_total_number_of_files
 
 
 def test_filter_dict():
@@ -8,3 +10,14 @@ def test_filter_dict():
 
     assert dupdict == {"hash2": ["filepath2", "filepath3"]}
     assert undict == {"hash1": "filepath1"}
+
+
+def test_get_total_number_of_files():
+    dirpath = resolvepath(os.path.join("tests", "testdir"))
+    print(dirpath)
+
+    output1 = get_total_number_of_files(dirpath)
+    output2 = get_total_number_of_files(dirpath, file_ext=".txt")
+
+    assert output1 == 2
+    assert output2 == 1

From f01bcec05fd52a68de0bd2228349f89321336905 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:10:31 +0000
Subject: [PATCH 23/48] Add test for hashfile func

---
 tests/test_hash.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tests/test_hash.py b/tests/test_hash.py
index 2a1eae0..e4192cf 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -1,6 +1,6 @@
 import os
 from deduplify.cli import resolvepath
-from deduplify.hash_files import filter_dict, get_total_number_of_files
+from deduplify.hash_files import filter_dict, get_total_number_of_files, hashfile
 
 
 def test_filter_dict():
@@ -21,3 +21,12 @@ def test_get_total_number_of_files():
 
     assert output1 == 2
     assert output2 == 1
+
+
+def test_hashfile():
+    path = os.path.join("tests", "assets", "test_infile.json")
+
+    md5_hash, outpath = hashfile(path)
+
+    assert md5_hash == 'f3fb257d843b252bdc0442402552d840'
+    assert outpath == path
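The test pins down the contract of `hashfile`: it returns the MD5 hex digest alongside the path it was given. The function body never appears in this series, but a block-wise implementation consistent with the signature `hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]` would look like this (a sketch only; the real implementation in deduplify may differ):

```python
import hashlib
from typing import Tuple

def hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]:
    hasher = hashlib.md5()
    with open(path, "rb") as f:
        block = f.read(blocksize)
        while block:                 # read fixed-size chunks so large files
            hasher.update(block)     # are never held fully in memory
            block = f.read(blocksize)
    return hasher.hexdigest(), path
```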
From 5ba77517f1e2e9014e64958ce5b2dc9e2cf9f4bd Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:12:16 +0000
Subject: [PATCH 24/48] Fix broken test

---
 tests/test_delete.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_delete.py b/tests/test_delete.py
index f275167..130562b 100644
--- a/tests/test_delete.py
+++ b/tests/test_delete.py
@@ -5,7 +5,7 @@
 
 @patch("deduplify.del_empty_dirs.os.rmdir")
 def test_del_empty_dirs(mock):
-    test_dir = os.path.join("tests", "testdir")
+    test_dir = os.path.join("tests", "testdir_empty")
     test_call = [call(os.path.abspath(test_dir))]
 
     if not os.path.exists(test_dir):

From 428a3895c78a6a1cbb35803374b651dd8fff3e32 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:42:08 +0000
Subject: [PATCH 25/48] FIXUP: test_get_total_number_of_files func

---
 tests/test_hash.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_hash.py b/tests/test_hash.py
index e4192cf..96184ec 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -1,5 +1,4 @@
 import os
-from deduplify.cli import resolvepath
 from deduplify.hash_files import filter_dict, get_total_number_of_files, hashfile
 
 
@@ -13,8 +12,9 @@ def test_filter_dict():
 
 
 def test_get_total_number_of_files():
-    dirpath = resolvepath(os.path.join("tests", "testdir"))
-    print(dirpath)
+    ABSOLUTE_HERE = os.path.dirname(os.path.realpath(__file__))
+    HERE = "/".join(ABSOLUTE_HERE.split("/")[:-1])
+    dirpath = os.path.join(HERE, "tests", "testdir")
 
     output1 = get_total_number_of_files(dirpath)
     output2 = get_total_number_of_files(dirpath, file_ext=".txt")

From 1939f91edd61edd603ee1e9d9730881d0dc75430 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:47:12 +0000
Subject: [PATCH 26/48] Trying to fix tests in CI

---
 .github/workflows/ci.yml | 2 ++
 tests/test_hash.py       | 4 +---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6061f81..ef7b884 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,6 +30,8 @@ jobs:
           pip install -r requirements.txt
           pip install -r dev-requirements.txt
 
+      - run: pwd
+
       - name: Run tests
         run: |
           python -m coverage run -m pytest -vvv
diff --git a/tests/test_hash.py b/tests/test_hash.py
index 96184ec..59d224b 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -12,9 +12,7 @@ def test_filter_dict():
 
 
 def test_get_total_number_of_files():
-    ABSOLUTE_HERE = os.path.dirname(os.path.realpath(__file__))
-    HERE = "/".join(ABSOLUTE_HERE.split("/")[:-1])
-    dirpath = os.path.join(HERE, "tests", "testdir")
+    dirpath = os.path.join("tests", "testdir")
 
     output1 = get_total_number_of_files(dirpath)
     output2 = get_total_number_of_files(dirpath, file_ext=".txt")

From 583610ec329468f182e3525304b7452536fc0924 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:49:38 +0000
Subject: [PATCH 27/48] Adding debugging statements to CI

---
 .github/workflows/ci.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ef7b884..0744e62 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,7 +30,9 @@ jobs:
           pip install -r requirements.txt
           pip install -r dev-requirements.txt
 
-      - run: pwd
+      - run: |
+          pwd
+          ls -al
 
       - name: Run tests
         run: |

From 8a8b98e64afd0e090b6403852b1c9c665731ff6a Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:50:30 +0000
Subject: [PATCH 28/48] Edit dirpath

---
 tests/test_hash.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_hash.py b/tests/test_hash.py
index 59d224b..ca35fa7 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -12,7 +12,7 @@ def test_filter_dict():
 
 
 def test_get_total_number_of_files():
-    dirpath = os.path.join("tests", "testdir")
+    dirpath = os.path.join(os.getcwd(), "tests", "testdir")
 
     output1 = get_total_number_of_files(dirpath)
     output2 = get_total_number_of_files(dirpath, file_ext=".txt")
From 80212bfe578ff44c4ed4f1b66adbe170934696a1 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:53:45 +0000
Subject: [PATCH 29/48] Add dir list test

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0744e62..72bfc67 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -32,7 +32,7 @@ jobs:
 
       - run: |
           pwd
-          ls -al
+          find . -type f
 
       - name: Run tests
         run: |
From da0eafa5f9de133c51ed47332c592782da64eea3 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:57:06 +0000
Subject: [PATCH 30/48] Force add test files

---
 tests/testdir/test_file_1.xml | 0
 tests/testdir/test_file_2.xml | 0
 tests/testdir/test_file_3.txt | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/testdir/test_file_1.xml
 create mode 100644 tests/testdir/test_file_2.xml
 create mode 100644 tests/testdir/test_file_3.txt

diff --git a/tests/testdir/test_file_1.xml b/tests/testdir/test_file_1.xml
new file mode 100644
index 0000000..e69de29
diff --git a/tests/testdir/test_file_2.xml b/tests/testdir/test_file_2.xml
new file mode 100644
index 0000000..e69de29
diff --git a/tests/testdir/test_file_3.txt b/tests/testdir/test_file_3.txt
new file mode 100644
index 0000000..e69de29

From 6ab6a276c6102e603a39adc6346517793d0a57bd Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:57:12 +0000
Subject: [PATCH 31/48] Update gitignore

---
 .gitignore | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 03feae7..a35da5e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -128,9 +128,6 @@ dmypy.json
 # Pyre type checker
 .pyre/
 
-# Ignore dir created for tests
-**/testdir/**
-
 # Ignore JSON output files
 **/duplicates.json
 **/uniques.json

From de691fb99ce90cc183497bba80d4e64abf211116 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:57:59 +0000
Subject: [PATCH 32/48] Remove debugging commands from CI

---
 .github/workflows/ci.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 72bfc67..6061f81 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,10 +30,6 @@ jobs:
           pip install -r requirements.txt
           pip install -r dev-requirements.txt
 
-      - run: |
-          pwd
-          find . -type f
-
       - name: Run tests
         run: |
           python -m coverage run -m pytest -vvv

From 153b6b42786c8fd632d19aa1225c563000ea9b32 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:59:45 +0000
Subject: [PATCH 33/48] Tweak filepaths

---
 tests/test_hash.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_hash.py b/tests/test_hash.py
index ca35fa7..59d224b 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -12,7 +12,7 @@ def test_filter_dict():
 
 
 def test_get_total_number_of_files():
-    dirpath = os.path.join(os.getcwd(), "tests", "testdir")
+    dirpath = os.path.join("tests", "testdir")
 
     output1 = get_total_number_of_files(dirpath)
     output2 = get_total_number_of_files(dirpath, file_ext=".txt")

From 33b0667922bfc68c347c1ec00d8a0478c3a2619d Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 12:00:39 +0000
Subject: [PATCH 34/48] Run formatter

---
 deduplify/hash_files.py | 4 +++-
 tests/test_hash.py      | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index d6b104d..a2b37d3 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -167,7 +167,9 @@ def run_hash(
 
                 dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
 
-                for filename, content in zip([dupfile, unfile], [dup_dict, unique_dict]):
+                for filename, content in zip(
+                    [dupfile, unfile], [dup_dict, unique_dict]
+                ):
                     logger.info("Writing outputs to: %s" % filename)
                     with open(filename, "w") as f:
                         json.dump(content, f, indent=2, sort_keys=True)
diff --git a/tests/test_hash.py b/tests/test_hash.py
index 59d224b..b580321 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -26,5 +26,5 @@ def test_hashfile():
 
     md5_hash, outpath = hashfile(path)
 
-    assert md5_hash == 'f3fb257d843b252bdc0442402552d840'
+    assert md5_hash == "f3fb257d843b252bdc0442402552d840"
     assert outpath == path
From 84040fbf0a1e01986c4e948fdf97aec10270ce0a Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 17:08:58 +0000
Subject: [PATCH 35/48] Create transform_dict and restart_run funcs

---
 deduplify/hash_files.py | 70 +++++++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 24 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index a2b37d3..89b566f 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -100,6 +100,50 @@ def filter_dict(results: dict) -> Tuple[dict, dict]:
     return duplicated, unique
 
 
+def transform_dict(input_dict: dict) -> dict:
+    """Transforms a dictionary with str type values into one with list type
+    values
+
+    Args:
+        input_dict (dict): of type {key: str}
+
+    Returns:
+        dict: of type {key: [str]}
+    """
+    output_dict = {key: [value] for (key, value) in input_dict.items()}
+    return output_dict
+
+
+def restart_run(dupfile: os.path, unfile: os.path) -> Tuple[dict, list]:
+    """When restarting a hash run, read in and wrangle the previous output files
+    to reconstruct the dictionary and identify which files need to be skipped
+
+    Args:
+        dupfile (os.path): Path the the file containing duplicated hashes and filenames
+        unfile (os.path): Path to the file containing unique hashes and filenames
+    """
+    logger.info("Restarting hashing process")
+    logger.info("Checking required files exist")
+    for filename in [dupfile, unfile]:
+        if not os.path.exists(filename):
+            raise FileNotFoundError(f"{filename} must exist to restart a hash run!")
+
+    logger.info("Reading in files")
+    with open(dupfile) as stream:
+        dup_dict = json.load(stream)
+    with open(unfile) as stream:
+        un_dict = json.load(stream)
+
+    un_dict = transform_dict(un_dict)
+
+    pre_hashed_dict = {**dup_dict, **un_dict}
+    hashes = defaultdict(list, pre_hashed_dict)
+
+    files_to_skip = [item for values in pre_hashed_dict.values() for item in values]
+
+    return hashes, files_to_skip
+
+
 def run_hash(
     dir: str, count: int, dupfile: str, unfile: str, restart: bool = False, **kwargs
 ):
@@ -121,36 +165,14 @@ def run_hash(
     total_file_num = get_total_number_of_files(dir)
 
     if restart:
-        logger.info("Restarting hashing process")
-
-        for input_file in [dupfile, unfile]:
-            if not os.path.isfile(input_file):
-                raise FileNotFoundError(
-                    f"{input_file} must exist to restart a hash run!"
-                )
-
-        with open(dupfile) as stream:
-            dup_dict = json.load(stream)
-
-        with open(unfile) as stream:
-            un_dict = json.load(stream)
-
-        for key, value in un_dict.items():
-            un_dict[key] = [value]
-
-        pre_hashed_dict = {**dup_dict, **un_dict}
-        files_to_skip = [item for values in pre_hashed_dict.values() for item in values]
+        hashes, files_to_skip = restart_run(dupfile, unfile)
     else:
+        hashes = defaultdict(list)  # Empty dict to store hashes in
         files_to_skip = []
 
     logger.info("Walking structure of: %s" % dir)
     logger.info("Generating MD5 hashes for files...")
 
-    if restart:
-        hashes = defaultdict(list, pre_hashed_dict)
-    else:
-        hashes = defaultdict(list)  # Empty dict to store hashes in
-
     total = total_file_num - len(files_to_skip)
     pbar = tqdm(total=total)

From 59c38354b2f48f17e36477c722004430db78a16e Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 17:09:31 +0000
Subject: [PATCH 36/48] Add tests for transform_dict and restart_run funcs

---
 tests/test_hash.py | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/tests/test_hash.py b/tests/test_hash.py
index b580321..71ba817 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -1,5 +1,12 @@
 import os
-from deduplify.hash_files import filter_dict, get_total_number_of_files, hashfile
+from collections import defaultdict
+from deduplify.hash_files import (
+    filter_dict,
+    get_total_number_of_files,
+    hashfile,
+    transform_dict,
+    restart_run,
+)
 
 
 def test_filter_dict():
@@ -28,3 +35,33 @@ def test_hashfile():
 
     assert md5_hash == "f3fb257d843b252bdc0442402552d840"
     assert outpath == path
+
+
+def test_transform_dict():
+    test_dict = {"key1": "value1", "key2": "value2"}
+    expected = {"key1": ["value1"], "key2": ["value2"]}
+
+    output = transform_dict(test_dict)
+
+    assert output == expected
+
+
+def test_restart_run():
+    dup_file = os.path.join(os.getcwd(), "tests", "assets", "test_duplicates.json")
+    un_file = os.path.join(os.getcwd(), "tests", "assets", "test_uniques.json")
+
+    expected_dict = defaultdict(
+        list,
+        {
+            "key1": ["valueA", "valueB"],
+            "key2": ["valueC", "valueD"],
+            "key3": ["valueE"],
+            "key4": ["valueF"],
+        },
+    )
+    expected_list = ["valueA", "valueB", "valueC", "valueD", "valueE", "valueF"]
+
+    hashes, files_to_be_skipped = restart_run(dup_file, un_file)
+
+    assert hashes == expected_dict
+    assert files_to_be_skipped == expected_list
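Seeding the hash table with `defaultdict(list, pre_hashed_dict)` is what lets a resumed run keep appending without any key checks: looking up an unseen hash silently creates an empty list first (a small demonstration):

```python
from collections import defaultdict

hashes = defaultdict(list, {"abc123": ["old/file.xml"]})

hashes["abc123"].append("new/copy.xml")   # extends an existing entry
hashes["def456"].append("new/other.xml")  # unseen key -> fresh empty list first

assert hashes["abc123"] == ["old/file.xml", "new/copy.xml"]
assert hashes["def456"] == ["new/other.xml"]
```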
From 8d6921f635ca28ecb96769b26c3c751bcb275463 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 17:09:39 +0000
Subject: [PATCH 37/48] Provide test assets

---
 tests/assets/test_duplicates.json | 10 ++++++++++
 tests/assets/test_uniques.json    |  4 ++++
 2 files changed, 14 insertions(+)
 create mode 100644 tests/assets/test_duplicates.json
 create mode 100644 tests/assets/test_uniques.json

diff --git a/tests/assets/test_duplicates.json b/tests/assets/test_duplicates.json
new file mode 100644
index 0000000..e51f123
--- /dev/null
+++ b/tests/assets/test_duplicates.json
@@ -0,0 +1,10 @@
+{
+  "key1": [
+    "valueA",
+    "valueB"
+  ],
+  "key2": [
+    "valueC",
+    "valueD"
+  ]
+}
diff --git a/tests/assets/test_uniques.json b/tests/assets/test_uniques.json
new file mode 100644
index 0000000..bfcc6de
--- /dev/null
+++ b/tests/assets/test_uniques.json
@@ -0,0 +1,4 @@
+{
+  "key3": "valueE",
+  "key4": "valueF"
+}

From 8c8060e7b49e8b4331f8f332c8c668a30a6bff5f Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 17:11:13 +0000
Subject: [PATCH 38/48] Bump package version

---
 deduplify/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduplify/_version.py b/deduplify/_version.py
index 5adebd9..b606d60 100644
--- a/deduplify/_version.py
+++ b/deduplify/_version.py
@@ -7,5 +7,5 @@
 
 from incremental import Version
 
-__version__ = Version("deduplify", 0, 2, 0)
+__version__ = Version("deduplify", 0, 3, 0)
 __all__ = ["__version__"]

From 5e7abc693830b73bc6e53dbc1762fb4acad0b023 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 17:17:12 +0000
Subject: [PATCH 39/48] Document restart flag

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ea2e5a0..bde82b2 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ This file is organised such that the keys are the hashes and the values are a **
 **Command line usage:**
 
 ```bash
-usage: deduplify hash [-h] [-c COUNT] [-v] [-d DUPFILE] [-u UNFILE] dir
+usage: deduplify hash [-h] [-c COUNT] [-v] [-d DUPFILE] [-u UNFILE] [--restart] dir
 
 positional arguments:
   dir                   Path to directory to begin search from
@@ -91,6 +91,7 @@ optional arguments:
                         Destination file for duplicated hashes. Must be a JSON file. Default: duplicates.json
   -u UNFILE, --unfile UNFILE
                         Destination file for unique hashes. Must be a JSON file. Default: uniques.json
+  --restart             Restart a run of hashing files and skip over files that have already been hashed. Output files containing duplicated and unique filenames must already exist.
 ```
 
 ### Comparing files

From 9148bf0a920961da14c1804bc631ecb30b91aaf2 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 19 Apr 2021 13:39:58 +0100
Subject: [PATCH 40/48] Move writing to file to outside the ThreadPool context
 manager

---
 deduplify/hash_files.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 89b566f..1c6223a 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -187,15 +187,13 @@ def run_hash(
                 hash, filepath = future.result()
                 hashes[hash].append(filepath)
 
-                dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
+                pbar.update(1)
 
-                for filename, content in zip(
-                    [dupfile, unfile], [dup_dict, unique_dict]
-                ):
-                    logger.info("Writing outputs to: %s" % filename)
-                    with open(filename, "w") as f:
-                        json.dump(content, f, indent=2, sort_keys=True)
+    dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
 
-                pbar.update(1)
+    for filename, content in zip([dupfile, unfile], [dup_dict, unique_dict]):
+        logger.info("Writing outputs to: %s" % filename)
+        with open(filename, "w") as f:
+            json.dump(content, f, indent=2, sort_keys=True)
 
     pbar.close()

From 12105f75d43e6dc9813682e3b21d916fb3ff1556 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Fri, 23 Apr 2021 13:45:10 +0100
Subject: [PATCH 41/48] Only filter dictionaries and dump the JSON after the
 loop has completed

---
 deduplify/hash_files.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 1c6223a..639db18 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -189,11 +189,11 @@ def run_hash(
 
                 pbar.update(1)
 
-    dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
+    pbar.close()
 
-    for filename, content in zip([dupfile, unfile], [dup_dict, unique_dict]):
-        logger.info("Writing outputs to: %s" % filename)
-        with open(filename, "w") as f:
-            json.dump(content, f, indent=2, sort_keys=True)
+    dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
 
-    pbar.close()
+    for filename, content in zip([dupfile, unfile], [dup_dict, unique_dict]):
+        logger.info("Writing outputs to: %s" % filename)
+        with open(filename, "w") as f:
+            json.dump(content, f, indent=2, sort_keys=True)
From 007dcc935e736c53893c1287586c43f8d09e9776 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 26 May 2021 14:28:37 +0100
Subject: [PATCH 42/48] Add TODO item, open issue #20

---
 deduplify/compare_files.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deduplify/compare_files.py b/deduplify/compare_files.py
index 71dbbd9..f63bb86 100644
--- a/deduplify/compare_files.py
+++ b/deduplify/compare_files.py
@@ -62,6 +62,8 @@ def compare_filenames(file_list: list) -> str:
     ]  # Get the filenames
     name_freq = Counter(filenames)  # Count the frequency of the filenames
 
+    # TODO: #20 Update to handle cases where the length of the filenames are equivalent
+    # but they are different filenames.
     if len(name_freq) == 1:
         file_list.remove(min(file_list, key=len))
         return file_list

From 99f43c6e1e0552dd142e8eb6d2698a16c025c7a2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 26 Feb 2022 10:25:32 +0000
Subject: [PATCH 43/48] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deduplify/cli.py        |  2 +-
 deduplify/hash_files.py | 10 +++++-----
 tests/test_hash.py      |  3 ++-
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/deduplify/cli.py b/deduplify/cli.py
index cf189be..fb71e41 100644
--- a/deduplify/cli.py
+++ b/deduplify/cli.py
@@ -1,6 +1,6 @@
-import os
 import argparse
 import logging
+import os
 import sys
 from multiprocessing import cpu_count
 
diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 81eda53..cc3e043 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -8,18 +8,18 @@
 Author: Sarah Gibson
 Python version: >=3.7 (developed with 3.8)
 """
+import fnmatch
 import hashlib
 import json
 import logging
-import fnmatch
-from tqdm import tqdm
-from typing import Tuple
+import os
 import sys
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Tuple
 
+from tqdm import tqdm
+
 logger = logging.getLogger()
 
 EXPANDED_USER = os.path.expanduser("~")
diff --git a/tests/test_hash.py b/tests/test_hash.py
index 71ba817..a394dfd 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -1,11 +1,12 @@
 import os
 from collections import defaultdict
+
 from deduplify.hash_files import (
     filter_dict,
     get_total_number_of_files,
     hashfile,
-    transform_dict,
     restart_run,
+    transform_dict,
 )

From 799dd9ea9705fa4c23512d918d9497cc53918c1a Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Sat, 26 Feb 2022 10:29:45 +0000
Subject: [PATCH 44/48] Set fail-fast strategy to false

---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ba6f32c..d0ae8ad 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -12,6 +12,7 @@ jobs:
   tests:
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false  # Don't cancel all jobs if one fails
       matrix:
         python-version: [3.6, 3.7, 3.8, 3.9, "3.10"]
 
From 20a26101c58a4e279c0fceef9d2748bfe352e387 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Sat, 26 Feb 2022 10:30:45 +0000
Subject: [PATCH 45/48] Remove unnecessary jobs from ci workflow

---
 .github/workflows/ci.yml | 42 ----------------------------------------
 1 file changed, 42 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d0ae8ad..9ab36da 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -38,45 +38,3 @@ jobs:
     - name: Print coverage report
       run: |
         coverage report
-
-  formatting:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v2
-
-      - name: Setup Python 3.8
-        uses: actions/setup-python@v1
-        with:
-          python-version: 3.8
-
-      - name: Install dependencies
-        run: |
-          python -m pip install -U pip
-          pip install black
-
-      - name: Format Python files with black
-        run: |
-          black --check .
-
-  linting:
-    runs-on: ubuntu-16.04
-
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v2
-
-      - name: Setup Python 3.8
-        uses: actions/setup-python@v1
-        with:
-          python-version: 3.8
-
-      - name: Install dependencies
-        run: |
-          python -m pip install -U pip
-          pip install flake8
-
-      - name: Lint Python files with flake8
-        run: |
-          flake8 --ignore=E501 .

From 7fe0b92ccddd599624d94ba1be1439f313714208 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Sat, 26 Feb 2022 10:32:26 +0000
Subject: [PATCH 46/48] Bump pytest version to fix CI failure

---
 dev-requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev-requirements.txt b/dev-requirements.txt
index 20330a5..ce3106d 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,2 +1,2 @@
 coverage==5.3
-pytest==6.2.2
+pytest==6.2.5

From be961ede4af95386750c00990d3e3f11972bd60e Mon Sep 17 00:00:00 2001
From: Sarah Gibson <44771837+sgibson91@users.noreply.github.com>
Date: Sat, 26 Feb 2022 10:34:42 +0000
Subject: [PATCH 47/48] Update deduplify/compare_files.py

---
 deduplify/compare_files.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/deduplify/compare_files.py b/deduplify/compare_files.py
index 4a242ca..82cdf1b 100644
--- a/deduplify/compare_files.py
+++ b/deduplify/compare_files.py
@@ -63,8 +63,6 @@ def compare_filenames(file_list: list) -> str:
     ]  # Get the filenames
     name_freq = Counter(filenames)  # Count the frequency of the filenames
 
-    # TODO: #20 Update to handle cases where the length of the filenames are equivalent
-    # but they are different filenames.
     if len(name_freq) == 1:
         file_list.remove(min(file_list, key=len))
         return file_list

From ca87216995d8a0b5fdce743fd782183378dd02de Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Sat, 26 Feb 2022 10:36:29 +0000
Subject: [PATCH 48/48] Bump minor version

---
 deduplify/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduplify/_version.py b/deduplify/_version.py
index c1154e4..5adebd9 100644
--- a/deduplify/_version.py
+++ b/deduplify/_version.py
@@ -7,5 +7,5 @@
 
 from incremental import Version
 
-__version__ = Version("deduplify", 0, 1, 5)
+__version__ = Version("deduplify", 0, 2, 0)
 __all__ = ["__version__"]