Commit

Merge pull request #17 from Living-with-machines/feature/restart-run
sgibson91 authored Feb 26, 2022
2 parents 83ac512 + ca87216 commit ac4c30a
Showing 16 changed files with 196 additions and 23 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -12,6 +12,7 @@ jobs:
tests:
runs-on: ubuntu-latest
strategy:
fail-fast: false # Don't cancel all jobs if one fails
matrix:
python-version: [3.6, 3.7, 3.8, 3.9, "3.10"]

3 changes: 0 additions & 3 deletions .gitignore
@@ -128,9 +128,6 @@ dmypy.json
# Pyre type checker
.pyre/

# Ignore dir created for tests
**/testdir/**

# Ignore JSON output files
**/duplicates.json
**/uniques.json
3 changes: 2 additions & 1 deletion README.md
@@ -77,7 +77,7 @@ This file is organised such that the keys are the hashes and the values are a **
**Command line usage:**

```bash
usage: deduplify hash [-h] [-c COUNT] [-v] [-d DUPFILE] [-u UNFILE] dir
usage: deduplify hash [-h] [-c COUNT] [-v] [-d DUPFILE] [-u UNFILE] [--restart] dir

positional arguments:
dir Path to directory to begin search from
@@ -91,6 +91,7 @@ optional arguments:
Destination file for duplicated hashes. Must be a JSON file. Default: duplicates.json
-u UNFILE, --unfile UNFILE
Destination file for unique hashes. Must be a JSON file. Default: uniques.json
--restart Restart a run of hashing files and skip over files that have already been hashed. Output files containing duplicated and unique filenames must already exist.
```
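Both output files are plain JSON, so the results of a resumed or completed hash run can be inspected directly. A minimal sketch, assuming the default `duplicates.json` produced by `deduplify hash`, whose keys are MD5 hashes and whose values are lists of file paths:

```python
import json

# Inspect the duplicate groups written by `deduplify hash`
# (the default output filename is assumed here)
with open("duplicates.json") as f:
    duplicates = json.load(f)  # {md5_hash: [filepath, filepath, ...]}

for md5_hash, paths in duplicates.items():
    print(f"{md5_hash}: {len(paths)} copies")
    for path in paths:
        print(f"  {path}")
```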

### Comparing files
2 changes: 1 addition & 1 deletion deduplify/_version.py
@@ -7,5 +7,5 @@

from incremental import Version

__version__ = Version("deduplify", 0, 1, 5)
__version__ = Version("deduplify", 0, 2, 0)
__all__ = ["__version__"]
26 changes: 22 additions & 4 deletions deduplify/cli.py
@@ -1,5 +1,6 @@
import argparse
import logging
import os
import sys
from multiprocessing import cpu_count

@@ -28,6 +29,18 @@ def setup_logging(verbose=False):
)


def resolvepath(path):
"""Resolve and normalize a path
1. Handle tilde expansion; turn ~/.ssh into /home/user/.ssh and
~otheruser/bin to /home/otheruser/bin
2. Normalize the path so that it doesn't contain relative segments, turning
e.g. /usr/local/../bin to /usr/bin
3. Get the real path of the actual file, resolving symbolic links
"""
return os.path.realpath(os.path.normpath(os.path.expanduser(path)))
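
A short illustration of how the three steps compose (a sketch; the home directory and symlink below are hypothetical):

```python
from deduplify.cli import resolvepath

# Hypothetical setup: the user's home is /home/user and /home/user/data is a
# symlink to /mnt/storage/data. The three steps then give:
#   expanduser("~/data/../data") -> "/home/user/data/../data"
#   normpath(...)                -> "/home/user/data"
#   realpath(...)                -> "/mnt/storage/data"
print(resolvepath("~/data/../data"))
```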


def parse_args(args):
parser = argparse.ArgumentParser(
description="Find and delete duplicated files in messy datasets"
@@ -56,7 +69,7 @@ def parse_args(args):
# Positional parser
parser_pos = argparse.ArgumentParser(add_help=False)
parser_pos.add_argument(
"dir", type=str, help="Path to directory to begin search from"
"dir", type=resolvepath, help="Path to directory to begin search from"
)

# Hash subcommand
@@ -70,19 +83,24 @@
parser_hash.add_argument(
"-d",
"--dupfile",
type=str,
type=resolvepath,
dest="dupfile",
default="duplicates.json",
help="Destination file for duplicated hashes. Must be a JSON file. Default: duplicates.json",
)
parser_hash.add_argument(
"-u",
"--unfile",
type=str,
type=resolvepath,
dest="unfile",
default="uniques.json",
help="Destination file for unique hashes. Must be a JSON file. Default: uniques.json",
)
parser_hash.add_argument(
"--restart",
action="store_true",
help="Restart a run of hashing files and skip over files that have already been hashed. Output files containing duplicated and unique filenames must already exist.",
)

# Compare subcommand
parser_compare = subparsers.add_parser(
@@ -95,7 +113,7 @@
parser_compare.add_argument(
"-i",
"--infile",
type=str,
type=resolvepath,
default="duplicates.json",
help="Filename to analyse. Must be a JSON file. Default: duplicates.json",
)
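
A quick sanity check of the new flag and the `resolvepath` type (a sketch; it assumes `parse_args` returns the parsed `argparse.Namespace`, which is outside the lines shown here):

```python
from deduplify.cli import parse_args

# Hypothetical check of the new --restart flag
args = parse_args(["hash", "~/some_data"])
assert args.restart is False      # store_true flags default to False

args = parse_args(["hash", "~/some_data", "--restart"])
assert args.restart is True
assert "~" not in args.dir        # resolvepath expanded the tilde
```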
99 changes: 90 additions & 9 deletions deduplify/hash_files.py
@@ -8,7 +8,7 @@
Author: Sarah Gibson
Python version: >=3.7 (developed with 3.8)
"""

import fnmatch
import hashlib
import json
import logging
@@ -18,10 +18,32 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple

from tqdm import tqdm

logger = logging.getLogger()
EXPANDED_USER = os.path.expanduser("~")


def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
"""Count the total number of files of a given extension in a directory.
Args:
target_dir (str): The target directory to search.
file_ext (str): The file extension to search for. Default: .xml
Returns:
int: The number of files with the matching extension within the tree
of the target directory
"""
logger.info("Calculating number of files that will be hashed in %s" % target_dir)

output = len(fnmatch.filter(os.listdir(target_dir), f"*{file_ext}"))

logger.info(f"{output} files to be hashed in {target_dir}")

return output
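
Note that the count above uses `os.listdir`, which only sees files directly inside `target_dir`, whereas `run_hash` below walks the whole tree; a recursive variant (a sketch, not part of this commit) could look like:

```python
import fnmatch
import os

def count_files_recursively(target_dir: str, file_ext: str = ".xml") -> int:
    """Count matching files in target_dir and every subdirectory below it."""
    total = 0
    for _, _, filenames in os.walk(target_dir):
        total += len(fnmatch.filter(filenames, f"*{file_ext}"))
    return total
```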


def hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]:
"""Calculate the MD5 hash of a given file
@@ -76,42 +98,101 @@ def filter_dict(results: dict) -> Tuple[dict, dict]:
for value in duplicated.values():
total += len(value)

logger.info("Number of identical files: %s" % total)
logger.info("Number of duplicated files: %s" % total)

return duplicated, unique


def run_hash(dir: str, count: int, dupfile: str, unfile: str, **kwargs):
def transform_dict(input_dict: dict) -> dict:
"""Transforms a dictionary with str type values into one with list type
values
Args:
input_dict (dict): of type {key: str}
Returns:
dict: of type {key: [str]}
"""
output_dict = {key: [value] for (key, value) in input_dict.items()}
return output_dict


def restart_run(dupfile: os.path, unfile: os.path) -> Tuple[dict, list]:
"""When restarting a hash run, read in and wrangle the previous output files
to reconstruct the dictionary and identify which files need to be skipped
Args:
dupfile (os.path): Path to the file containing duplicated hashes and filenames
unfile (os.path): Path to the file containing unique hashes and filenames
"""
logger.info("Restarting hashing process")
logger.info("Checking required files exist")
for filename in [dupfile, unfile]:
if not os.path.exists(filename):
raise FileNotFoundError(f"{filename} must exist to restart a hash run!")

logger.info("Reading in files")
with open(dupfile) as stream:
dup_dict = json.load(stream)
with open(unfile) as stream:
un_dict = json.load(stream)

un_dict = transform_dict(un_dict)

pre_hashed_dict = {**dup_dict, **un_dict}
hashes = defaultdict(list, pre_hashed_dict)

files_to_skip = [item for values in pre_hashed_dict.values() for item in values]

return hashes, files_to_skip


def run_hash(
dir: str, count: int, dupfile: str, unfile: str, restart: bool = False, **kwargs
):
"""Hash files within a directory structure
Args:
dir (str): Root directory to search under
count (int): Number of threads to parallelise over
dupfile (str): JSON file location for duplicated hashes
unfile (str): JSON file location for unique hashes
restart (bool): If true, will restart a hash run. dupfile and unfile
must exist since the filenames already hashed will be skipped.
Default: False.
"""
# Check the directory path exists
if not os.path.exists(dir):
raise ValueError("Please provide a known filepath!")

total_file_num = get_total_number_of_files(dir)

if restart:
hashes, files_to_skip = restart_run(dupfile, unfile)
else:
hashes = defaultdict(list) # Empty dict to store hashes in
files_to_skip = []

logger.info("Walking structure of: %s" % dir)
logger.info("Generating MD5 hashes for files...")
hashes = defaultdict(list) # Empty dict to store hashes in
counter = 0

for dirName, subdirs, fileList in os.walk(dir):
total = total_file_num - len(files_to_skip)
pbar = tqdm(total=total)

for dirName, _, fileList in os.walk(dir):
with ThreadPoolExecutor(max_workers=count) as executor:
futures = [
executor.submit(hashfile, os.path.join(dirName, filename))
for filename in fileList
if filename not in files_to_skip
]
for future in as_completed(futures):
hash, filepath = future.result()
hashes[hash].append(filepath)

counter += 1
print(f"Total files hashed: {counter}", end="\r")
sys.stdout.flush()
pbar.update(1)

pbar.close()

dup_dict, unique_dict = filter_dict(hashes) # Filter the results
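
Putting the pieces together, a resumed run might be invoked like this (a sketch; the directory path and thread count are hypothetical, and `duplicates.json`/`uniques.json` must already exist from the interrupted run):

```python
from deduplify.hash_files import run_hash

# Hypothetical resumed run: files already listed in the two output files are
# skipped, and only the remaining files are hashed.
run_hash(
    dir="/data/archive",        # root directory to walk (hypothetical path)
    count=4,                    # number of hashing threads
    dupfile="duplicates.json",
    unfile="uniques.json",
    restart=True,
)
```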

4 changes: 2 additions & 2 deletions dev-requirements.txt
@@ -1,2 +1,2 @@
coverage
pytest
coverage==5.3
pytest==6.2.5
2 changes: 2 additions & 0 deletions setup.cfg
@@ -0,0 +1,2 @@
[aliases]
test=pytest
3 changes: 2 additions & 1 deletion setup.py
@@ -78,5 +78,6 @@
"Programming Language :: Python :: 3.7",
],
use_incremental=True,
setup_requires=["incremental"],
setup_requires=["incremental", "pytest-runner"],
tests_require=test_require,
)
10 changes: 10 additions & 0 deletions tests/assets/test_duplicates.json
@@ -0,0 +1,10 @@
{
"key1": [
"valueA",
"valueB"
],
"key2": [
"valueC",
"valueD"
]
}
4 changes: 4 additions & 0 deletions tests/assets/test_uniques.json
@@ -0,0 +1,4 @@
{
"key3": "valueE",
"key4": "valueF"
}
2 changes: 1 addition & 1 deletion tests/test_delete.py
@@ -6,7 +6,7 @@

@patch("deduplify.del_empty_dirs.os.rmdir")
def test_del_empty_dirs(mock):
test_dir = os.path.join("tests", "testdir")
test_dir = os.path.join("tests", "testdir_empty")
test_call = [call(os.path.abspath(test_dir))]

if not os.path.exists(test_dir):
60 changes: 59 additions & 1 deletion tests/test_hash.py
@@ -1,4 +1,13 @@
from deduplify.hash_files import filter_dict
import os
from collections import defaultdict

from deduplify.hash_files import (
filter_dict,
get_total_number_of_files,
hashfile,
restart_run,
transform_dict,
)


def test_filter_dict():
@@ -8,3 +17,52 @@ def test_filter_dict():

assert dupdict == {"hash2": ["filepath2", "filepath3"]}
assert undict == {"hash1": "filepath1"}


def test_get_total_number_of_files():
dirpath = os.path.join("tests", "testdir")

output1 = get_total_number_of_files(dirpath)
output2 = get_total_number_of_files(dirpath, file_ext=".txt")

assert output1 == 2
assert output2 == 1


def test_hashfile():
path = os.path.join("tests", "assets", "test_infile.json")

md5_hash, outpath = hashfile(path)

assert md5_hash == "f3fb257d843b252bdc0442402552d840"
assert outpath == path


def test_transform_dict():
test_dict = {"key1": "value1", "key2": "value2"}
expected = {"key1": ["value1"], "key2": ["value2"]}

output = transform_dict(test_dict)

assert output == expected


def test_restart_run():
dup_file = os.path.join(os.getcwd(), "tests", "assets", "test_duplicates.json")
un_file = os.path.join(os.getcwd(), "tests", "assets", "test_uniques.json")

expected_dict = defaultdict(
list,
{
"key1": ["valueA", "valueB"],
"key2": ["valueC", "valueD"],
"key3": ["valueE"],
"key4": ["valueF"],
},
)
expected_list = ["valueA", "valueB", "valueC", "valueD", "valueE", "valueF"]

hashes, files_to_be_skipped = restart_run(dup_file, un_file)

assert hashes == expected_dict
assert files_to_be_skipped == expected_list
Empty file added tests/testdir/test_file_1.xml
Empty file.
Empty file added tests/testdir/test_file_2.xml
Empty file.
Empty file added tests/testdir/test_file_3.txt
Empty file.
