From 3a7d627b2d4785e574d0e4e6747c029d982ebbcb Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 19:53:16 +0000
Subject: [PATCH 01/48] Edit run_hash func to allow reading in previous hashed
 files from json

---
 deduplify/hash_files.py | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index e4ab4b9..7b5b1f6 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -15,6 +15,7 @@
 import hashlib
 import logging
 from typing import Tuple
+from itertools import chain
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
@@ -80,7 +81,9 @@ def filter_dict(results: dict) -> Tuple[dict, dict]:
     return duplicated, unique
 
 
-def run_hash(dir: str, count: int, dupfile: str, unfile: str, **kwargs):
+def run_hash(
+    dir: str, count: int, dupfile: str, unfile: str, restart: bool = False, **kwargs
+):
     """Hash files within a directory structure
 
     Args:
@@ -88,21 +91,47 @@ def run_hash(dir: str, count: int, dupfile: str, unfile: str, **kwargs):
         count (int): Number of threads to parallelise over
         dupfile (str): JSON file location for duplicated hashes
         unfile (str): JSON file location for unique hashes
+        restart (bool): If true, will restart a hash run. dupfile and unfile
+            must exist since the filenames already hashed will be skipped.
+            Default: False.
     """
     # Check the directory path exists
     if not os.path.exists(dir):
         raise ValueError("Please provide a known filepath!")
 
+    if restart:
+        for input_file in [dupfile, unfile]:
+            if not os.path.isfile(input_file):
+                raise FileNotFoundError(
+                    f"{input_file} must exist to restart a hash run!"
+                )
+
+        with open(dupfile) as stream:
+            dup_dict = json.load(stream)
+
+        with open(unfile) as stream:
+            un_dict = json.load(stream)
+
+        pre_hashed_dict = {**dup_dict, **un_dict}
+        files_to_skip = list(chain(*pre_hashed_dict.values()))
+    else:
+        files_to_skip = []
+    print(files_to_skip[:10])
+
     logger.info("Walking structure of: %s" % dir)
     logger.info("Generating MD5 hashes for files...")
-    hashes = defaultdict(list)  # Empty dict to store hashes in
     counter = 0
+    if restart:
+        hashes = pre_hashed_dict.copy()
+    else:
+        hashes = defaultdict(list)  # Empty dict to store hashes in
 
-    for dirName, subdirs, fileList in os.walk(dir):
+    for dirName, _, fileList in os.walk(dir):
         with ThreadPoolExecutor(max_workers=count) as executor:
             futures = [
                 executor.submit(hashfile, os.path.join(dirName, filename))
                 for filename in fileList
+                if filename not in files_to_skip
             ]
             for future in as_completed(futures):
                 hash, filepath = future.result()

From 6fd6f4021c0902afe8cc331697dcf080f625de18 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 19:54:44 +0000
Subject: [PATCH 02/48] Add --restart flag to cli

---
 deduplify/cli.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/deduplify/cli.py b/deduplify/cli.py
index 3e8d93e..756c1c7 100644
--- a/deduplify/cli.py
+++ b/deduplify/cli.py
@@ -83,6 +83,11 @@ def parse_args(args):
         default="uniques.json",
         help="Destination file for unique hashes. Must be a JSON file. Default: uniques.json",
     )
+    parser_hash.add_argument(
+        "--restart",
+        action="store_true",
+        help="Restart a run of hashing files and skip over files that have already been hashed. Output files containing duplicated and unique filenames must already exist.",
+    )
 
     # Compare subcommand
     parser_compare = subparsers.add_parser(
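The new flag wires straight into the `hash` subcommand's namespace. As a quick sanity check of the behaviour (a sketch only, assuming `parse_args` finishes by returning `parser.parse_args(args)`, which is not shown in this hunk):

```python
from deduplify.cli import parse_args

# "store_true" means the flag defaults to False and flips to True when passed
args = parse_args(["hash", "path/to/data", "--restart"])
assert args.restart is True

args = parse_args(["hash", "path/to/data"])
assert args.restart is False
```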
From e061609b7fc64237c0fb1f01358d2774d431701f Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 19:58:44 +0000
Subject: [PATCH 03/48] Remove unnecessary print command

---
 deduplify/hash_files.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 7b5b1f6..5779ac1 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -116,7 +116,6 @@ def run_hash(
         files_to_skip = list(chain(*pre_hashed_dict.values()))
     else:
         files_to_skip = []
-    print(files_to_skip[:10])
 
     logger.info("Walking structure of: %s" % dir)
     logger.info("Generating MD5 hashes for files...")

From a10677b39e15fb64413bbfff53558a6396e0ed2a Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 20:28:31 +0000
Subject: [PATCH 04/48] Version bump

---
 deduplify/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduplify/_version.py b/deduplify/_version.py
index 007cab2..5adebd9 100644
--- a/deduplify/_version.py
+++ b/deduplify/_version.py
@@ -7,5 +7,5 @@
 
 from incremental import Version
 
-__version__ = Version("deduplify", 0, 1, 2)
+__version__ = Version("deduplify", 0, 2, 0)
 __all__ = ["__version__"]

From 50c038c25a0cd3d07625faa5ec62c3e3d191b2c5 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 09:24:00 +0000
Subject: [PATCH 05/48] Add logging statement when restarting the hashing
 process

---
 deduplify/hash_files.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 5779ac1..367b959 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -100,6 +100,8 @@ def run_hash(
         raise ValueError("Please provide a known filepath!")
 
     if restart:
+        logger.info("Restarting hashing process")
+
         for input_file in [dupfile, unfile]:
             if not os.path.isfile(input_file):
                 raise FileNotFoundError(

From c52499d105bd789c22f2e18fa35e40cea4436309 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 20:20:13 +0000
Subject: [PATCH 06/48] Define a function to count total number of xml files

---
 deduplify/hash_files.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 367b959..362445a 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -14,6 +14,7 @@
 import json
 import hashlib
 import logging
+import subprocess
 from typing import Tuple
 from itertools import chain
 from collections import defaultdict
@@ -22,6 +23,27 @@
 logger = logging.getLogger()
 
 
+def get_total_number_of_files(dir: str, file_ext: str = ".xml") -> int:
+    """Count the total number of files of a given extension in a directory.
+
+    Args:
+        dir (str): The target directory to search.
+        file_ext (str): The file extension to search for. Default: .xml
+
+    Returns:
+        int: The number of files with the matching extension within the tree
+            of the target directory
+    """
+    find_cmd = ["find", dir, "-type", "f", "-name", f'"*{file_ext}"']
+    wc_cmd = ["wc", "-l"]
+
+    find_proc = subprocess.Popen(find_cmd, stdout=subprocess.PIPE)
+    output = subprocess.check_output(wc_cmd, stdin=find_proc.stdout)
+    find_proc.wait()
+
+    return int(output.decode("utf-8").strip("\n"))
+
+
 def hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]:
     """Calculate the MD5 hash of a given file
 
@@ -99,6 +121,8 @@ def run_hash(
     if not os.path.exists(dir):
         raise ValueError("Please provide a known filepath!")
 
+    total_file_number = get_total_number_of_files(dir)
+
     if restart:
         logger.info("Restarting hashing process")
 
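One caveat in the `find` invocation above (an observation, not part of the original series): because `subprocess.Popen` runs the command without a shell, the pattern `f'"*{file_ext}"'` reaches `find` with literal double quotes in it, so it matches nothing useful — the quoting is only needed when a shell would otherwise expand the glob. A shell-free sketch of the same count:

```python
import subprocess

def count_with_find(target_dir: str, file_ext: str = ".xml") -> int:
    # No shell is involved, so pass the glob pattern unquoted; find gets it verbatim
    find_cmd = ["find", target_dir, "-type", "f", "-name", f"*{file_ext}"]
    find_proc = subprocess.run(find_cmd, capture_output=True, text=True, check=True)
    # Each matching path is printed on its own line; count the non-empty ones
    return len([line for line in find_proc.stdout.splitlines() if line])
```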
From d9d7adf62fc5457a90feef6b5fe349e84b69162e Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 20:20:43 +0000
Subject: [PATCH 07/48] Deprecate previous counter mechanism

---
 deduplify/hash_files.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 362445a..8a4f657 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -145,7 +145,7 @@ def run_hash(
     logger.info("Walking structure of: %s" % dir)
     logger.info("Generating MD5 hashes for files...")
-    counter = 0
+    # counter = 0
     if restart:
         hashes = pre_hashed_dict.copy()
     else:
@@ -162,9 +162,9 @@ def run_hash(
             for future in as_completed(futures):
                 hash, filepath = future.result()
                 hashes[hash].append(filepath)
-                counter += 1
-                print(f"Total files hashed: {counter}", end="\r")
-                sys.stdout.flush()
+                # counter += 1
+                # print(f"Total files hashed: {counter}", end="\r")
+                # sys.stdout.flush()
 
     dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
 

From b1eb8f423a9d0cac3df091c3c6b2d969186c1218 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 20:21:03 +0000
Subject: [PATCH 08/48] Apply manually controlled tqdm progress bar

---
 deduplify/hash_files.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 8a4f657..9384812 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -15,6 +15,7 @@
 import hashlib
 import logging
 import subprocess
+from tqdm import tqdm
 from typing import Tuple
 from itertools import chain
 from collections import defaultdict
@@ -151,6 +152,8 @@ def run_hash(
     else:
         hashes = defaultdict(list)  # Empty dict to store hashes in
 
+    pbar = tqdm(total=total_file_number - len(files_to_skip))
+
     for dirName, _, fileList in os.walk(dir):
         with ThreadPoolExecutor(max_workers=count) as executor:
             futures = [
@@ -166,6 +169,10 @@ def run_hash(
                 # counter += 1
                 # print(f"Total files hashed: {counter}", end="\r")
                 # sys.stdout.flush()
 
+                pbar.update(1)
+
+    pbar.close()
+
     dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
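The manually controlled progress-bar pattern the patch applies, shown in isolation (a minimal sketch; `total` must be known up front for the bar to render percentages):

```python
from tqdm import tqdm

work_items = range(120)

pbar = tqdm(total=len(work_items))  # fixed total, advanced by hand
for item in work_items:
    pass                            # ... hash one file here ...
    pbar.update(1)                  # move the bar forward one unit of work
pbar.close()                        # always close so the terminal is restored
```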
From 854fad5e93b0b10136c43ba8587a438bde3422ba Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 20:39:08 +0000
Subject: [PATCH 09/48] Comment out unused import

---
 deduplify/hash_files.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 9384812..17ce060 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -10,7 +10,7 @@
 """
 
 import os
-import sys
+# import sys
 import json
 import hashlib
 import logging

From 6a54c88a291cb7a8be34d553503bb8ad6f47db24 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 22 Mar 2021 20:40:32 +0000
Subject: [PATCH 10/48] Run formatter

---
 deduplify/hash_files.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 17ce060..a5df73d 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -8,9 +8,9 @@
 Author: Sarah Gibson
 Python version: >=3.7 (developed with 3.8)
 """
+# import sys
 
 import os
-# import sys
 import json
 import hashlib
 import logging

From 419b79b616dffb51da734582979fc75886d0df0b Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 09:14:34 +0000
Subject: [PATCH 11/48] Add logging statements to get_total_numbers_of_files
 func

---
 deduplify/hash_files.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index a5df73d..9237d3b 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -35,6 +35,8 @@ def get_total_number_of_files(dir: str, file_ext: str = ".xml") -> int:
         int: The number of files with the matching extension within the tree
             of the target directory
     """
+    logger.info("Calculating number of files that will be hashed")
+
     find_cmd = ["find", dir, "-type", "f", "-name", f'"*{file_ext}"']
     wc_cmd = ["wc", "-l"]
 
@@ -42,7 +44,11 @@ def get_total_number_of_files(dir: str, file_ext: str = ".xml") -> int:
     find_proc = subprocess.Popen(find_cmd, stdout=subprocess.PIPE)
     output = subprocess.check_output(wc_cmd, stdin=find_proc.stdout)
     find_proc.wait()
 
-    return int(output.decode("utf-8").strip("\n"))
+    output = int(output.decode("utf-8").strip("\n"))
+
+    logger.info("%s files to be hashed" % output)
+
+    return output

From fc42a9a0b46abaae2926afd6c100437f5f87458e Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 15:20:08 +0000
Subject: [PATCH 12/48] Merging stash and resolving conflicts

---
 deduplify/hash_files.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 9237d3b..958b186 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -154,7 +154,7 @@ def run_hash(
     logger.info("Generating MD5 hashes for files...")
     # counter = 0
     if restart:
-        hashes = pre_hashed_dict.copy()
+        hashes = defaultdict(lambda: None, pre_hashed_dict)
     else:
         hashes = defaultdict(list)  # Empty dict to store hashes in
 
@@ -171,6 +171,13 @@ def run_hash(
                 hash, filepath = future.result()
                 hashes[hash].append(filepath)
 
+                dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
+
+                for filename, content in zip([dupfile, unfile], [dup_dict, unique_dict]):
+                    logger.info("Writing outputs to: %s" % filename)
+                    with open(filename, "w") as f:
+                        json.dump(content, f, indent=2, sort_keys=True)
+
                 # counter += 1
                 # print(f"Total files hashed: {counter}", end="\r")
                 # sys.stdout.flush()
@@ -178,10 +185,3 @@ def run_hash(
                 pbar.update(1)
 
     pbar.close()
-
-    dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
-
-    for filename, content in zip([dupfile, unfile], [dup_dict, unique_dict]):
-        logger.info("Writing outputs to: %s" % filename)
-        with open(filename, "w") as f:
-            json.dump(content, f, indent=2, sort_keys=True)
From 80588469cbd21dbc32c6af5a43e4d1e0dd28b14c Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 15:55:46 +0000
Subject: [PATCH 13/48] Rename dir arg in get_total_number_of_files func

---
 deduplify/hash_files.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 958b186..c84d039 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -24,11 +24,11 @@
 logger = logging.getLogger()
 
 
-def get_total_number_of_files(dir: str, file_ext: str = ".xml") -> int:
+def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
     """Count the total number of files of a given extension in a directory.
 
     Args:
-        dir (str): The target directory to search.
+        target_dir (str): The target directory to search.
         file_ext (str): The file extension to search for. Default: .xml
 
     Returns:
@@ -37,7 +37,7 @@ def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
     """
     logger.info("Calculating number of files that will be hashed")
 
-    find_cmd = ["find", dir, "-type", "f", "-name", f'"*{file_ext}"']
+    find_cmd = ["find", target_dir, "-type", "f", "-name", f'\"*{file_ext}\"']
     wc_cmd = ["wc", "-l"]
 
     find_proc = subprocess.Popen(find_cmd, stdout=subprocess.PIPE)

From a445febe08050bc48e19ec05b6f4922d1f5188a5 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 15:56:37 +0000
Subject: [PATCH 14/48] [HOTFIX] Fixing file number total

---
 deduplify/hash_files.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index c84d039..0a2b147 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -128,8 +128,6 @@ def run_hash(
     if not os.path.exists(dir):
         raise ValueError("Please provide a known filepath!")
 
-    total_file_number = get_total_number_of_files(dir)
-
     if restart:
         logger.info("Restarting hashing process")
 
@@ -158,7 +156,8 @@ def run_hash(
     else:
         hashes = defaultdict(list)  # Empty dict to store hashes in
 
-    pbar = tqdm(total=total_file_number - len(files_to_skip))
+    total = 10410200 - len(files_to_skip)
+    pbar = tqdm(total=total)
 
     for dirName, _, fileList in os.walk(dir):
         with ThreadPoolExecutor(max_workers=count) as executor:

From f4bfb9ccc18b6ad84e7b37b17fa891b6987a4578 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 15:57:05 +0000
Subject: [PATCH 15/48] setting a defaultdict from previous dict

---
 deduplify/hash_files.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 0a2b147..bb46ecb 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -143,6 +143,9 @@ def run_hash(
         with open(unfile) as stream:
             un_dict = json.load(stream)
 
+        for key, value in un_dict.items():
+            un_dict[key] = [value]
+
         pre_hashed_dict = {**dup_dict, **un_dict}
         files_to_skip = list(chain(*pre_hashed_dict.values()))
     else:

From 7c55422d7669be63627a71d124b53de23ce47070 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 15:57:16 +0000
Subject: [PATCH 16/48] Update log message

---
 deduplify/hash_files.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index bb46ecb..8c4f7b5 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -105,7 +105,7 @@ def filter_dict(results: dict) -> Tuple[dict, dict]:
     for value in duplicated.values():
         total += len(value)
 
-    logger.info("Number of identical files: %s" % total)
+    logger.info("Number of duplicated files: %s" % total)
 
     return duplicated, unique
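The loop added in PATCH 15 exists because the two output files store different shapes: `uniques.json` maps each hash to a single path string, while `duplicates.json` maps each hash to a list of paths. Normalising the unique values to one-element lists makes the two dictionaries mergeable and lets the values be flattened uniformly (a sketch of the idea):

```python
dup_dict = {"hash2": ["b.xml", "c.xml"]}        # duplicates.json shape
un_dict = {"hash1": "a.xml"}                    # uniques.json shape

un_dict = {k: [v] for k, v in un_dict.items()}  # normalise str -> [str]
pre_hashed = {**dup_dict, **un_dict}            # later keys win on collision
files_to_skip = [p for paths in pre_hashed.values() for p in paths]
assert files_to_skip == ["b.xml", "c.xml", "a.xml"]
```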
From 53946bc77e38729b8dd8aec625065fbfe5534e8f Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Tue, 23 Mar 2021 16:42:01 +0000
Subject: [PATCH 17/48] Fix counting of files

---
 deduplify/hash_files.py | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 8c4f7b5..fb2a101 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -14,7 +14,7 @@
 import json
 import hashlib
 import logging
-import subprocess
+import fnmatch
 from tqdm import tqdm
 from typing import Tuple
 from itertools import chain
@@ -24,6 +24,18 @@
 logger = logging.getLogger()
 
 
+def resolvepath(path):
+    """Resolve and normalize a path
+
+    1. Handle tilde expansion; turn ~/.ssh into /home/user/.ssh and
+       ~otheruser/bin to /home/otheruser/bin
+    2. Normalize the path so that it doesn't contain relative segments, turning
+       e.g. /usr/local/../bin to /usr/bin
+    3. Get the real path of the actual file, resolving symbolic links
+    """
+    return os.path.realpath(os.path.normpath(os.path.expanduser(path)))
+
+
 def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
     """Count the total number of files of a given extension in a directory.
 
@@ -35,18 +47,12 @@ def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
     Args:
         target_dir (str): The target directory to search.
         file_ext (str): The file extension to search for. Default: .xml
 
     Returns:
         int: The number of files with the matching extension within the tree
             of the target directory
     """
-    logger.info("Calculating number of files that will be hashed")
+    logger.info("Calculating number of files that will be hashed in %s" % target_dir)
 
-    find_cmd = ["find", target_dir, "-type", "f", "-name", f'\"*{file_ext}\"']
-    wc_cmd = ["wc", "-l"]
+    dirpath = resolvepath(target_dir)
+    output = len(fnmatch.filter(os.listdir(dirpath), f"*{file_ext}"))
 
-    find_proc = subprocess.Popen(find_cmd, stdout=subprocess.PIPE)
-    output = subprocess.check_output(wc_cmd, stdin=find_proc.stdout)
-    find_proc.wait()
-
-    output = int(output.decode("utf-8").strip("\n"))
-
-    logger.info("%s files to be hashed" % output)
+    logger.info("%s files to be hashed in %s" % (output, target_dir))
 
     return output
@@ -128,6 +134,8 @@ def run_hash(
     if not os.path.exists(dir):
         raise ValueError("Please provide a known filepath!")
 
+    total_file_num = get_total_number_of_files(dir)
+
     if restart:
         logger.info("Restarting hashing process")
 
@@ -147,7 +155,7 @@ def run_hash(
             un_dict[key] = [value]
 
         pre_hashed_dict = {**dup_dict, **un_dict}
-        files_to_skip = list(chain(*pre_hashed_dict.values()))
+        files_to_skip = [item for values in pre_hashed_dict.values() for item in values]
     else:
         files_to_skip = []
 
@@ -167,7 +167,7 @@ def run_hash(
     else:
         hashes = defaultdict(list)  # Empty dict to store hashes in
 
-    total = 10410200 - len(files_to_skip)
+    total = total_file_num - len(files_to_skip)
     pbar = tqdm(total=total)
 
     for dirName, _, fileList in os.walk(dir):
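Note a behavioural change hiding in this fix (an observation, not part of the original series): `os.listdir` only sees the top level of `target_dir`, whereas the `find` command it replaces descended the whole tree. A recursive variant, sketched with `os.walk` to match the original semantics:

```python
import fnmatch
import os

def count_files_recursively(target_dir: str, file_ext: str = ".xml") -> int:
    # os.walk yields (dirpath, dirnames, filenames) for every directory in the tree
    return sum(
        len(fnmatch.filter(filenames, f"*{file_ext}"))
        for _, _, filenames in os.walk(target_dir)
    )
```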
From 5219de56377585d0d697cbe93a9618a9c0c721ec Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 10:28:47 +0000
Subject: [PATCH 18/48] Removing deprecated code

---
 deduplify/hash_files.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index fb2a101..a47e131 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -8,8 +8,6 @@
 Author: Sarah Gibson
 Python version: >=3.7 (developed with 3.8)
 """
-# import sys
-
 import os
 import json
 import hashlib
@@ -17,7 +15,6 @@
 import fnmatch
 from tqdm import tqdm
 from typing import Tuple
-from itertools import chain
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
@@ -161,7 +158,7 @@ def run_hash(
     logger.info("Walking structure of: %s" % dir)
     logger.info("Generating MD5 hashes for files...")
-    # counter = 0
+
     if restart:
         hashes = defaultdict(list, pre_hashed_dict)
     else:
@@ -188,10 +185,6 @@ def run_hash(
                     with open(filename, "w") as f:
                         json.dump(content, f, indent=2, sort_keys=True)
 
-                # counter += 1
-                # print(f"Total files hashed: {counter}", end="\r")
-                # sys.stdout.flush()
-
                 pbar.update(1)
 
     pbar.close()

From bb3db10806d2523a5143ebdb9d6fbaa8a600991b Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 10:33:29 +0000
Subject: [PATCH 19/48] Resolve filepaths in the CLI

---
 deduplify/cli.py        | 21 +++++++++++++++++----
 deduplify/hash_files.py | 15 +--------------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/deduplify/cli.py b/deduplify/cli.py
index 756c1c7..ff7830e 100644
--- a/deduplify/cli.py
+++ b/deduplify/cli.py
@@ -1,3 +1,4 @@
+import os
 import sys
 import logging
 import argparse
@@ -28,6 +29,18 @@ def setup_logging(verbose=False):
     )
 
 
+def resolvepath(path):
+    """Resolve and normalize a path
+
+    1. Handle tilde expansion; turn ~/.ssh into /home/user/.ssh and
+       ~otheruser/bin to /home/otheruser/bin
+    2. Normalize the path so that it doesn't contain relative segments, turning
+       e.g. /usr/local/../bin to /usr/bin
+    3. Get the real path of the actual file, resolving symbolic links
+    """
+    return os.path.realpath(os.path.normpath(os.path.expanduser(path)))
+
+
 def parse_args(args):
     parser = argparse.ArgumentParser(
         description="Find and delete duplicated files in messy datasets"
@@ -56,7 +69,7 @@ def parse_args(args):
     # Positional parser
     parser_pos = argparse.ArgumentParser(add_help=False)
     parser_pos.add_argument(
-        "dir", type=str, help="Path to directory to begin search from"
+        "dir", type=resolvepath, help="Path to directory to begin search from"
     )
 
     # Hash subcommand
@@ -70,7 +83,7 @@ def parse_args(args):
     parser_hash.add_argument(
         "-d",
         "--dupfile",
-        type=str,
+        type=resolvepath,
         dest="dupfile",
         default="duplicates.json",
         help="Destination file for duplicated hashes. Must be a JSON file. Default: duplicates.json",
@@ -78,7 +91,7 @@ def parse_args(args):
     parser_hash.add_argument(
         "-u",
         "--unfile",
-        type=str,
+        type=resolvepath,
         dest="unfile",
         default="uniques.json",
         help="Destination file for unique hashes. Must be a JSON file. Default: uniques.json",
@@ -100,7 +113,7 @@ def parse_args(args):
     parser_compare.add_argument(
         "-i",
         "--infile",
-        type=str,
+        type=resolvepath,
         default="duplicates.json",
         help="Filename to analyse. Must be a JSON file. Default: duplicates.json",
     )
diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index a47e131..d6b104d 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -21,18 +21,6 @@
 logger = logging.getLogger()
 
 
-def resolvepath(path):
-    """Resolve and normalize a path
-
-    1. Handle tilde expansion; turn ~/.ssh into /home/user/.ssh and
-       ~otheruser/bin to /home/otheruser/bin
-    2. Normalize the path so that it doesn't contain relative segments, turning
-       e.g. /usr/local/../bin to /usr/bin
-    3. Get the real path of the actual file, resolving symbolic links
-    """
-    return os.path.realpath(os.path.normpath(os.path.expanduser(path)))
-
-
 def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
     """Count the total number of files of a given extension in a directory.
 
@@ -46,8 +34,7 @@ def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
     """
     logger.info("Calculating number of files that will be hashed in %s" % target_dir)
 
-    dirpath = resolvepath(target_dir)
-    output = len(fnmatch.filter(os.listdir(dirpath), f"*{file_ext}"))
+    output = len(fnmatch.filter(os.listdir(target_dir), f"*{file_ext}"))
 
     logger.info("%s files to be hashed in %s" % (output, target_dir))
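With `type=resolvepath` on the arguments, every path is normalised before any other code sees it. For example (hypothetical paths, assuming a user whose home directory is `/home/user`):

```python
import os

def resolvepath(path):
    # expanduser: ~ -> /home/user; normpath: drop ../ segments; realpath: follow symlinks
    return os.path.realpath(os.path.normpath(os.path.expanduser(path)))

print(resolvepath("~/data/../archive"))  # -> /home/user/archive
```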
From 5e64212f5ac245137b45ee07784297302df57e66 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 10:51:56 +0000
Subject: [PATCH 20/48] Set dev/testing requirements

---
 .github/workflows/ci.yml | 2 +-
 dev-requirements.txt     | 2 ++
 setup.cfg                | 2 ++
 setup.py                 | 3 ++-
 4 files changed, 7 insertions(+), 2 deletions(-)
 create mode 100644 dev-requirements.txt
 create mode 100644 setup.cfg

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index baaedf6..567b7eb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,7 +28,7 @@ jobs:
         run: |
           python -m pip install -U pip
           pip install -r requirements.txt
-          pip install pytest coverage
+          pip install -r dev-requirements.txt
 
       - name: Run tests
         run: |
diff --git a/dev-requirements.txt b/dev-requirements.txt
new file mode 100644
index 0000000..20330a5
--- /dev/null
+++ b/dev-requirements.txt
@@ -0,0 +1,2 @@
+coverage==5.3
+pytest==6.2.2
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..b7e4789
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,2 @@
+[aliases]
+test=pytest
diff --git a/setup.py b/setup.py
index d5b487d..e589937 100644
--- a/setup.py
+++ b/setup.py
@@ -75,5 +75,6 @@
         "Programming Language :: Python :: 3.7",
     ],
     use_incremental=True,
-    setup_requires=["incremental"],
+    setup_requires=["incremental", "pytest-runner"],
+    tests_require=test_require,
 )

From b6507ede2f91b9edc94526afd186f2f02917ad08 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 10:53:36 +0000
Subject: [PATCH 21/48] Merge lint and format CI jobs into 1 file

---
 .github/workflows/ci.yml          | 42 +++++++++++++++++++++++++++++++
 .github/workflows/lint-format.yml | 35 --------------------------
 2 files changed, 42 insertions(+), 35 deletions(-)
 delete mode 100644 .github/workflows/lint-format.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 567b7eb..6061f81 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -37,3 +37,45 @@ jobs:
     - name: Print coverage report
       run: |
         coverage report
+
+  formatting:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+
+      - name: Setup Python 3.8
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.8
+
+      - name: Install dependencies
+        run: |
+          python -m pip install -U pip
+          pip install black
+
+      - name: Format Python files with black
+        run: |
+          black --check .
+
+  linting:
+    runs-on: ubuntu-16.04
+
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+
+      - name: Setup Python 3.8
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.8
+
+      - name: Install dependencies
+        run: |
+          python -m pip install -U pip
+          pip install flake8
+
+      - name: Lint Python files with flake8
+        run: |
+          flake8 --ignore=E501 .
diff --git a/.github/workflows/lint-format.yml b/.github/workflows/lint-format.yml
deleted file mode 100644
index 71ec3f6..0000000
--- a/.github/workflows/lint-format.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-name: Lint and Format Python Files
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    branches:
-      - main
-
-jobs:
-  lint-format:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v2
-
-      - name: Setup Python 3.8
-        uses: actions/setup-python@v1
-        with:
-          python-version: 3.8
-
-      - name: Install dependencies
-        run: |
-          python -m pip install -U pip
-          pip install black flake8
-
-      - name: Format Python files with black
-        run: |
-          black --check .
-
-      - name: Lint Python files with flake8
-        run: |
-          flake8 --ignore=E501 .

From 201dc5568f877a8e532385ea4eb127ff18633fd1 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:02:15 +0000
Subject: [PATCH 22/48] Test get_total_number_of_files func

---
 tests/test_hash.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tests/test_hash.py b/tests/test_hash.py
index 9e7a128..2a1eae0 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -1,4 +1,6 @@
-from deduplify.hash_files import filter_dict
+import os
+from deduplify.cli import resolvepath
+from deduplify.hash_files import filter_dict, get_total_number_of_files
 
 
 def test_filter_dict():
@@ -8,3 +10,14 @@ def test_filter_dict():
 
     assert dupdict == {"hash2": ["filepath2", "filepath3"]}
     assert undict == {"hash1": "filepath1"}
+
+
+def test_get_total_number_of_files():
+    dirpath = resolvepath(os.path.join("tests", "testdir"))
+    print(dirpath)
+
+    output1 = get_total_number_of_files(dirpath)
+    output2 = get_total_number_of_files(dirpath, file_ext=".txt")
+
+    assert output1 == 2
+    assert output2 == 1

From f01bcec05fd52a68de0bd2228349f89321336905 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:10:31 +0000
Subject: [PATCH 23/48] Add test for hashfile func

---
 tests/test_hash.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tests/test_hash.py b/tests/test_hash.py
index 2a1eae0..e4192cf 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -1,6 +1,6 @@
 import os
 from deduplify.cli import resolvepath
-from deduplify.hash_files import filter_dict, get_total_number_of_files
+from deduplify.hash_files import filter_dict, get_total_number_of_files, hashfile
 
 
 def test_filter_dict():
@@ -21,3 +21,12 @@ def test_get_total_number_of_files():
 
     assert output1 == 2
     assert output2 == 1
+
+
+def test_hashfile():
+    path = os.path.join("tests", "assets", "test_infile.json")
+
+    md5_hash, outpath = hashfile(path)
+
+    assert md5_hash == 'f3fb257d843b252bdc0442402552d840'
+    assert outpath == path
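The test pins down the contract of `hashfile`: it returns the MD5 hex digest alongside the path it was given. The function body never appears in this series, but a block-wise implementation consistent with the signature `hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]` would look like this (a sketch only; the real implementation in deduplify may differ):

```python
import hashlib
from typing import Tuple

def hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]:
    hasher = hashlib.md5()
    with open(path, "rb") as f:
        block = f.read(blocksize)
        while block:                 # read fixed-size chunks so large files
            hasher.update(block)     # are never held fully in memory
            block = f.read(blocksize)
    return hasher.hexdigest(), path
```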
From 5ba77517f1e2e9014e64958ce5b2dc9e2cf9f4bd Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:12:16 +0000
Subject: [PATCH 24/48] Fix broken test

---
 tests/test_delete.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_delete.py b/tests/test_delete.py
index f275167..130562b 100644
--- a/tests/test_delete.py
+++ b/tests/test_delete.py
@@ -5,7 +5,7 @@
 
 @patch("deduplify.del_empty_dirs.os.rmdir")
 def test_del_empty_dirs(mock):
-    test_dir = os.path.join("tests", "testdir")
+    test_dir = os.path.join("tests", "testdir_empty")
     test_call = [call(os.path.abspath(test_dir))]
 
     if not os.path.exists(test_dir):

From 428a3895c78a6a1cbb35803374b651dd8fff3e32 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:42:08 +0000
Subject: [PATCH 25/48] FIXUP: test_get_total_number_of_files func

---
 tests/test_hash.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_hash.py b/tests/test_hash.py
index e4192cf..96184ec 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -1,5 +1,4 @@
 import os
-from deduplify.cli import resolvepath
 from deduplify.hash_files import filter_dict, get_total_number_of_files, hashfile
 
 
@@ -13,8 +12,9 @@ def test_filter_dict():
 
 
 def test_get_total_number_of_files():
-    dirpath = resolvepath(os.path.join("tests", "testdir"))
-    print(dirpath)
+    ABSOLUTE_HERE = os.path.dirname(os.path.realpath(__file__))
+    HERE = "/".join(ABSOLUTE_HERE.split("/")[:-1])
+    dirpath = os.path.join(HERE, "tests", "testdir")
 
     output1 = get_total_number_of_files(dirpath)
     output2 = get_total_number_of_files(dirpath, file_ext=".txt")

From 1939f91edd61edd603ee1e9d9730881d0dc75430 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:47:12 +0000
Subject: [PATCH 26/48] Trying to fix tests in CI

---
 .github/workflows/ci.yml | 2 ++
 tests/test_hash.py       | 4 +---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6061f81..ef7b884 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,6 +30,8 @@ jobs:
           pip install -r requirements.txt
           pip install -r dev-requirements.txt
 
+      - run: pwd
+
       - name: Run tests
         run: |
           python -m coverage run -m pytest -vvv
diff --git a/tests/test_hash.py b/tests/test_hash.py
index 96184ec..59d224b 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -12,9 +12,7 @@ def test_filter_dict():
 
 
 def test_get_total_number_of_files():
-    ABSOLUTE_HERE = os.path.dirname(os.path.realpath(__file__))
-    HERE = "/".join(ABSOLUTE_HERE.split("/")[:-1])
-    dirpath = os.path.join(HERE, "tests", "testdir")
+    dirpath = os.path.join("tests", "testdir")
 
     output1 = get_total_number_of_files(dirpath)
     output2 = get_total_number_of_files(dirpath, file_ext=".txt")

From 583610ec329468f182e3525304b7452536fc0924 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:49:38 +0000
Subject: [PATCH 27/48] Adding debugging statements to CI

---
 .github/workflows/ci.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ef7b884..0744e62 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,7 +30,9 @@ jobs:
           pip install -r requirements.txt
           pip install -r dev-requirements.txt
 
-      - run: pwd
+      - run: |
+          pwd
+          ls -al
 
       - name: Run tests
         run: |

From 8a8b98e64afd0e090b6403852b1c9c665731ff6a Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:50:30 +0000
Subject: [PATCH 28/48] Edit dirpath

---
 tests/test_hash.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_hash.py b/tests/test_hash.py
index 59d224b..ca35fa7 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -12,7 +12,7 @@ def test_filter_dict():
 
 
 def test_get_total_number_of_files():
-    dirpath = os.path.join("tests", "testdir")
+    dirpath = os.path.join(os.getcwd(), "tests", "testdir")
 
     output1 = get_total_number_of_files(dirpath)
     output2 = get_total_number_of_files(dirpath, file_ext=".txt")
From 80212bfe578ff44c4ed4f1b66adbe170934696a1 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:53:45 +0000
Subject: [PATCH 29/48] Add dir list test

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0744e62..72bfc67 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -32,7 +32,7 @@ jobs:
 
       - run: |
           pwd
-          ls -al
+          find . -type f
 
       - name: Run tests
         run: |
From da0eafa5f9de133c51ed47332c592782da64eea3 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:57:06 +0000
Subject: [PATCH 30/48] Force add test files

---
 tests/testdir/test_file_1.xml | 0
 tests/testdir/test_file_2.xml | 0
 tests/testdir/test_file_3.txt | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/testdir/test_file_1.xml
 create mode 100644 tests/testdir/test_file_2.xml
 create mode 100644 tests/testdir/test_file_3.txt

diff --git a/tests/testdir/test_file_1.xml b/tests/testdir/test_file_1.xml
new file mode 100644
index 0000000..e69de29
diff --git a/tests/testdir/test_file_2.xml b/tests/testdir/test_file_2.xml
new file mode 100644
index 0000000..e69de29
diff --git a/tests/testdir/test_file_3.txt b/tests/testdir/test_file_3.txt
new file mode 100644
index 0000000..e69de29

From 6ab6a276c6102e603a39adc6346517793d0a57bd Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:57:12 +0000
Subject: [PATCH 31/48] Update gitignore

---
 .gitignore | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 03feae7..a35da5e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -128,9 +128,6 @@ dmypy.json
 # Pyre type checker
 .pyre/
 
-# Ignore dir created for tests
-**/testdir/**
-
 # Ignore JSON output files
 **/duplicates.json
 **/uniques.json

From de691fb99ce90cc183497bba80d4e64abf211116 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:57:59 +0000
Subject: [PATCH 32/48] Remove debugging commands from CI

---
 .github/workflows/ci.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 72bfc67..6061f81 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,10 +30,6 @@ jobs:
           pip install -r requirements.txt
           pip install -r dev-requirements.txt
 
-      - run: |
-          pwd
-          find . -type f
-
       - name: Run tests
         run: |
           python -m coverage run -m pytest -vvv

From 153b6b42786c8fd632d19aa1225c563000ea9b32 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 11:59:45 +0000
Subject: [PATCH 33/48] Tweak filepaths

---
 tests/test_hash.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_hash.py b/tests/test_hash.py
index ca35fa7..59d224b 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -12,7 +12,7 @@ def test_filter_dict():
 
 
 def test_get_total_number_of_files():
-    dirpath = os.path.join(os.getcwd(), "tests", "testdir")
+    dirpath = os.path.join("tests", "testdir")
 
     output1 = get_total_number_of_files(dirpath)
     output2 = get_total_number_of_files(dirpath, file_ext=".txt")

From 33b0667922bfc68c347c1ec00d8a0478c3a2619d Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 12:00:39 +0000
Subject: [PATCH 34/48] Run formatter

---
 deduplify/hash_files.py | 4 +++-
 tests/test_hash.py      | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index d6b104d..a2b37d3 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -167,7 +167,9 @@ def run_hash(
 
                 dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
 
-                for filename, content in zip([dupfile, unfile], [dup_dict, unique_dict]):
+                for filename, content in zip(
+                    [dupfile, unfile], [dup_dict, unique_dict]
+                ):
                     logger.info("Writing outputs to: %s" % filename)
                     with open(filename, "w") as f:
                         json.dump(content, f, indent=2, sort_keys=True)
diff --git a/tests/test_hash.py b/tests/test_hash.py
index 59d224b..b580321 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -26,5 +26,5 @@ def test_hashfile():
 
     md5_hash, outpath = hashfile(path)
 
-    assert md5_hash == 'f3fb257d843b252bdc0442402552d840'
+    assert md5_hash == "f3fb257d843b252bdc0442402552d840"
     assert outpath == path
From 84040fbf0a1e01986c4e948fdf97aec10270ce0a Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 17:08:58 +0000
Subject: [PATCH 35/48] Create transform_dict and restart_run funcs

---
 deduplify/hash_files.py | 70 +++++++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 24 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index a2b37d3..89b566f 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -100,6 +100,50 @@ def filter_dict(results: dict) -> Tuple[dict, dict]:
     return duplicated, unique
 
 
+def transform_dict(input_dict: dict) -> dict:
+    """Transforms a dictionary with str type values into one with list type
+    values
+
+    Args:
+        input_dict (dict): of type {key: str}
+
+    Returns:
+        dict: of type {key: [str]}
+    """
+    output_dict = {key: [value] for (key, value) in input_dict.items()}
+    return output_dict
+
+
+def restart_run(dupfile: os.path, unfile: os.path) -> Tuple[dict, list]:
+    """When restarting a hash run, read in and wrangle the previous output files
+    to reconstruct the dictionary and identify which files need to be skipped
+
+    Args:
+        dupfile (os.path): Path the the file containing duplicated hashes and filenames
+        unfile (os.path): Path to the file containing unique hashes and filenames
+    """
+    logger.info("Restarting hashing process")
+    logger.info("Checking required files exist")
+    for filename in [dupfile, unfile]:
+        if not os.path.exists(filename):
+            raise FileNotFoundError(f"{filename} must exist to restart a hash run!")
+
+    logger.info("Reading in files")
+    with open(dupfile) as stream:
+        dup_dict = json.load(stream)
+    with open(unfile) as stream:
+        un_dict = json.load(stream)
+
+    un_dict = transform_dict(un_dict)
+
+    pre_hashed_dict = {**dup_dict, **un_dict}
+    hashes = defaultdict(list, pre_hashed_dict)
+
+    files_to_skip = [item for values in pre_hashed_dict.values() for item in values]
+
+    return hashes, files_to_skip
+
+
 def run_hash(
     dir: str, count: int, dupfile: str, unfile: str, restart: bool = False, **kwargs
 ):
@@ -121,36 +165,14 @@ def run_hash(
     total_file_num = get_total_number_of_files(dir)
 
     if restart:
-        logger.info("Restarting hashing process")
-
-        for input_file in [dupfile, unfile]:
-            if not os.path.isfile(input_file):
-                raise FileNotFoundError(
-                    f"{input_file} must exist to restart a hash run!"
-                )
-
-        with open(dupfile) as stream:
-            dup_dict = json.load(stream)
-
-        with open(unfile) as stream:
-            un_dict = json.load(stream)
-
-        for key, value in un_dict.items():
-            un_dict[key] = [value]
-
-        pre_hashed_dict = {**dup_dict, **un_dict}
-        files_to_skip = [item for values in pre_hashed_dict.values() for item in values]
+        hashes, files_to_skip = restart_run(dupfile, unfile)
     else:
+        hashes = defaultdict(list)  # Empty dict to store hashes in
         files_to_skip = []
 
     logger.info("Walking structure of: %s" % dir)
     logger.info("Generating MD5 hashes for files...")
 
-    if restart:
-        hashes = defaultdict(list, pre_hashed_dict)
-    else:
-        hashes = defaultdict(list)  # Empty dict to store hashes in
-
     total = total_file_num - len(files_to_skip)
     pbar = tqdm(total=total)

From 59c38354b2f48f17e36477c722004430db78a16e Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 17:09:31 +0000
Subject: [PATCH 36/48] Add tests for transform_dict and restart_run funcs

---
 tests/test_hash.py | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/tests/test_hash.py b/tests/test_hash.py
index b580321..71ba817 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -1,5 +1,12 @@
 import os
-from deduplify.hash_files import filter_dict, get_total_number_of_files, hashfile
+from collections import defaultdict
+from deduplify.hash_files import (
+    filter_dict,
+    get_total_number_of_files,
+    hashfile,
+    transform_dict,
+    restart_run,
+)
 
 
 def test_filter_dict():
@@ -28,3 +35,33 @@ def test_hashfile():
 
     assert md5_hash == "f3fb257d843b252bdc0442402552d840"
     assert outpath == path
+
+
+def test_transform_dict():
+    test_dict = {"key1": "value1", "key2": "value2"}
+    expected = {"key1": ["value1"], "key2": ["value2"]}
+
+    output = transform_dict(test_dict)
+
+    assert output == expected
+
+
+def test_restart_run():
+    dup_file = os.path.join(os.getcwd(), "tests", "assets", "test_duplicates.json")
+    un_file = os.path.join(os.getcwd(), "tests", "assets", "test_uniques.json")
+
+    expected_dict = defaultdict(
+        list,
+        {
+            "key1": ["valueA", "valueB"],
+            "key2": ["valueC", "valueD"],
+            "key3": ["valueE"],
+            "key4": ["valueF"],
+        },
+    )
+    expected_list = ["valueA", "valueB", "valueC", "valueD", "valueE", "valueF"]
+
+    hashes, files_to_be_skipped = restart_run(dup_file, un_file)
+
+    assert hashes == expected_dict
+    assert files_to_be_skipped == expected_list
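Seeding the hash table with `defaultdict(list, pre_hashed_dict)` is what lets a resumed run keep appending without any key checks: looking up an unseen hash silently creates an empty list first (a small demonstration):

```python
from collections import defaultdict

hashes = defaultdict(list, {"abc123": ["old/file.xml"]})

hashes["abc123"].append("new/copy.xml")   # extends an existing entry
hashes["def456"].append("new/other.xml")  # unseen key -> fresh empty list first

assert hashes["abc123"] == ["old/file.xml", "new/copy.xml"]
assert hashes["def456"] == ["new/other.xml"]
```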
From 8d6921f635ca28ecb96769b26c3c751bcb275463 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 17:09:39 +0000
Subject: [PATCH 37/48] Provide test assets

---
 tests/assets/test_duplicates.json | 10 ++++++++++
 tests/assets/test_uniques.json    |  4 ++++
 2 files changed, 14 insertions(+)
 create mode 100644 tests/assets/test_duplicates.json
 create mode 100644 tests/assets/test_uniques.json

diff --git a/tests/assets/test_duplicates.json b/tests/assets/test_duplicates.json
new file mode 100644
index 0000000..e51f123
--- /dev/null
+++ b/tests/assets/test_duplicates.json
@@ -0,0 +1,10 @@
+{
+  "key1": [
+    "valueA",
+    "valueB"
+  ],
+  "key2": [
+    "valueC",
+    "valueD"
+  ]
+}
diff --git a/tests/assets/test_uniques.json b/tests/assets/test_uniques.json
new file mode 100644
index 0000000..bfcc6de
--- /dev/null
+++ b/tests/assets/test_uniques.json
@@ -0,0 +1,4 @@
+{
+  "key3": "valueE",
+  "key4": "valueF"
+}

From 8c8060e7b49e8b4331f8f332c8c668a30a6bff5f Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 17:11:13 +0000
Subject: [PATCH 38/48] Bump package version

---
 deduplify/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduplify/_version.py b/deduplify/_version.py
index 5adebd9..b606d60 100644
--- a/deduplify/_version.py
+++ b/deduplify/_version.py
@@ -7,5 +7,5 @@
 
 from incremental import Version
 
-__version__ = Version("deduplify", 0, 2, 0)
+__version__ = Version("deduplify", 0, 3, 0)
 __all__ = ["__version__"]

From 5e7abc693830b73bc6e53dbc1762fb4acad0b023 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 24 Mar 2021 17:17:12 +0000
Subject: [PATCH 39/48] Document restart flag

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ea2e5a0..bde82b2 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ This file is organised such that the keys are the hashes and the values are a **
 **Command line usage:**
 
 ```bash
-usage: deduplify hash [-h] [-c COUNT] [-v] [-d DUPFILE] [-u UNFILE] dir
+usage: deduplify hash [-h] [-c COUNT] [-v] [-d DUPFILE] [-u UNFILE] [--restart] dir
 
 positional arguments:
   dir                   Path to directory to begin search from
@@ -91,6 +91,7 @@ optional arguments:
                         Destination file for duplicated hashes. Must be a JSON file. Default: duplicates.json
   -u UNFILE, --unfile UNFILE
                         Destination file for unique hashes. Must be a JSON file. Default: uniques.json
+  --restart             Restart a run of hashing files and skip over files that have already been hashed. Output files containing duplicated and unique filenames must already exist.
 ```
 
 ### Comparing files

From 9148bf0a920961da14c1804bc631ecb30b91aaf2 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Mon, 19 Apr 2021 13:39:58 +0100
Subject: [PATCH 40/48] Move writing to file to outside the ThreadPool context
 manager

---
 deduplify/hash_files.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 89b566f..1c6223a 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -187,15 +187,13 @@ def run_hash(
                 hash, filepath = future.result()
                 hashes[hash].append(filepath)
 
-                dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
+                pbar.update(1)
 
-                for filename, content in zip(
-                    [dupfile, unfile], [dup_dict, unique_dict]
-                ):
-                    logger.info("Writing outputs to: %s" % filename)
-                    with open(filename, "w") as f:
-                        json.dump(content, f, indent=2, sort_keys=True)
+    dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
 
-                pbar.update(1)
+    for filename, content in zip([dupfile, unfile], [dup_dict, unique_dict]):
+        logger.info("Writing outputs to: %s" % filename)
+        with open(filename, "w") as f:
+            json.dump(content, f, indent=2, sort_keys=True)
 
     pbar.close()

From 12105f75d43e6dc9813682e3b21d916fb3ff1556 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Fri, 23 Apr 2021 13:45:10 +0100
Subject: [PATCH 41/48] Only filter dictionaries and dump the JSON after the
 loop has completed

---
 deduplify/hash_files.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 1c6223a..639db18 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -189,11 +189,11 @@ def run_hash(
 
                 pbar.update(1)
 
-    dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
+    pbar.close()
 
-    for filename, content in zip([dupfile, unfile], [dup_dict, unique_dict]):
-        logger.info("Writing outputs to: %s" % filename)
-        with open(filename, "w") as f:
-            json.dump(content, f, indent=2, sort_keys=True)
+    dup_dict, unique_dict = filter_dict(hashes)  # Filter the results
 
-    pbar.close()
+    for filename, content in zip([dupfile, unfile], [dup_dict, unique_dict]):
+        logger.info("Writing outputs to: %s" % filename)
+        with open(filename, "w") as f:
+            json.dump(content, f, indent=2, sort_keys=True)
From 007dcc935e736c53893c1287586c43f8d09e9776 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Wed, 26 May 2021 14:28:37 +0100
Subject: [PATCH 42/48] Add TODO item, open issue #20

---
 deduplify/compare_files.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deduplify/compare_files.py b/deduplify/compare_files.py
index 71dbbd9..f63bb86 100644
--- a/deduplify/compare_files.py
+++ b/deduplify/compare_files.py
@@ -62,6 +62,8 @@ def compare_filenames(file_list: list) -> str:
     ]  # Get the filenames
     name_freq = Counter(filenames)  # Count the frequency of the filenames
 
+    # TODO: #20 Update to handle cases where the length of the filenames are equivalent
+    # but they are different filenames.
     if len(name_freq) == 1:
         file_list.remove(min(file_list, key=len))
         return file_list

From 99f43c6e1e0552dd142e8eb6d2698a16c025c7a2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 26 Feb 2022 10:25:32 +0000
Subject: [PATCH 43/48] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deduplify/cli.py        |  2 +-
 deduplify/hash_files.py | 10 +++++-----
 tests/test_hash.py      |  3 ++-
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/deduplify/cli.py b/deduplify/cli.py
index cf189be..fb71e41 100644
--- a/deduplify/cli.py
+++ b/deduplify/cli.py
@@ -1,6 +1,6 @@
-import os
 import argparse
 import logging
+import os
 import sys
 from multiprocessing import cpu_count
 
diff --git a/deduplify/hash_files.py b/deduplify/hash_files.py
index 81eda53..cc3e043 100644
--- a/deduplify/hash_files.py
+++ b/deduplify/hash_files.py
@@ -8,18 +8,18 @@
 Author: Sarah Gibson
 Python version: >=3.7 (developed with 3.8)
 """
+import fnmatch
 import hashlib
 import json
 import logging
-import fnmatch
-from tqdm import tqdm
-from typing import Tuple
+import os
 import sys
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Tuple
 
+from tqdm import tqdm
+
 logger = logging.getLogger()
 
 EXPANDED_USER = os.path.expanduser("~")
diff --git a/tests/test_hash.py b/tests/test_hash.py
index 71ba817..a394dfd 100644
--- a/tests/test_hash.py
+++ b/tests/test_hash.py
@@ -1,11 +1,12 @@
 import os
 from collections import defaultdict
+
 from deduplify.hash_files import (
     filter_dict,
     get_total_number_of_files,
     hashfile,
-    transform_dict,
     restart_run,
+    transform_dict,
 )

From 799dd9ea9705fa4c23512d918d9497cc53918c1a Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Sat, 26 Feb 2022 10:29:45 +0000
Subject: [PATCH 44/48] Set fail-fast strategy to false

---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ba6f32c..d0ae8ad 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -12,6 +12,7 @@ jobs:
   tests:
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false  # Don't cancel all jobs if one fails
       matrix:
         python-version: [3.6, 3.7, 3.8, 3.9, "3.10"]
 
From 20a26101c58a4e279c0fceef9d2748bfe352e387 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Sat, 26 Feb 2022 10:30:45 +0000
Subject: [PATCH 45/48] Remove unnecessary jobs from ci workflow

---
 .github/workflows/ci.yml | 42 ----------------------------------------
 1 file changed, 42 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d0ae8ad..9ab36da 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -38,45 +38,3 @@ jobs:
     - name: Print coverage report
       run: |
         coverage report
-
-  formatting:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v2
-
-      - name: Setup Python 3.8
-        uses: actions/setup-python@v1
-        with:
-          python-version: 3.8
-
-      - name: Install dependencies
-        run: |
-          python -m pip install -U pip
-          pip install black
-
-      - name: Format Python files with black
-        run: |
-          black --check .
-
-  linting:
-    runs-on: ubuntu-16.04
-
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v2
-
-      - name: Setup Python 3.8
-        uses: actions/setup-python@v1
-        with:
-          python-version: 3.8
-
-      - name: Install dependencies
-        run: |
-          python -m pip install -U pip
-          pip install flake8
-
-      - name: Lint Python files with flake8
-        run: |
-          flake8 --ignore=E501 .

From 7fe0b92ccddd599624d94ba1be1439f313714208 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Sat, 26 Feb 2022 10:32:26 +0000
Subject: [PATCH 46/48] Bump pytest version to fix CI failure

---
 dev-requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev-requirements.txt b/dev-requirements.txt
index 20330a5..ce3106d 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,2 +1,2 @@
 coverage==5.3
-pytest==6.2.2
+pytest==6.2.5

From be961ede4af95386750c00990d3e3f11972bd60e Mon Sep 17 00:00:00 2001
From: Sarah Gibson <44771837+sgibson91@users.noreply.github.com>
Date: Sat, 26 Feb 2022 10:34:42 +0000
Subject: [PATCH 47/48] Update deduplify/compare_files.py

---
 deduplify/compare_files.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/deduplify/compare_files.py b/deduplify/compare_files.py
index 4a242ca..82cdf1b 100644
--- a/deduplify/compare_files.py
+++ b/deduplify/compare_files.py
@@ -63,8 +63,6 @@ def compare_filenames(file_list: list) -> str:
     ]  # Get the filenames
     name_freq = Counter(filenames)  # Count the frequency of the filenames
 
-    # TODO: #20 Update to handle cases where the length of the filenames are equivalent
-    # but they are different filenames.
     if len(name_freq) == 1:
         file_list.remove(min(file_list, key=len))
         return file_list

From ca87216995d8a0b5fdce743fd782183378dd02de Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Sat, 26 Feb 2022 10:36:29 +0000
Subject: [PATCH 48/48] Bump minor version

---
 deduplify/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduplify/_version.py b/deduplify/_version.py
index c1154e4..5adebd9 100644
--- a/deduplify/_version.py
+++ b/deduplify/_version.py
@@ -7,5 +7,5 @@
 
 from incremental import Version
 
-__version__ = Version("deduplify", 0, 1, 5)
+__version__ = Version("deduplify", 0, 2, 0)
 __all__ = ["__version__"]