Commit

Merge pull request #17 from Living-with-machines/feature/restart-run
sgibson91 authored Feb 26, 2022
2 parents 83ac512 + ca87216 commit ac4c30a
Showing 16 changed files with 196 additions and 23 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -12,6 +12,7 @@ jobs:
tests:
runs-on: ubuntu-latest
strategy:
fail-fast: false # Don't cancel all jobs if one fails
matrix:
python-version: [3.6, 3.7, 3.8, 3.9, "3.10"]

3 changes: 0 additions & 3 deletions .gitignore
@@ -128,9 +128,6 @@ dmypy.json
# Pyre type checker
.pyre/

# Ignore dir created for tests
**/testdir/**

# Ignore JSON output files
**/duplicates.json
**/uniques.json
3 changes: 2 additions & 1 deletion README.md
@@ -77,7 +77,7 @@ This file is organised such that the keys are the hashes and the values are a **
**Command line usage:**

```bash
usage: deduplify hash [-h] [-c COUNT] [-v] [-d DUPFILE] [-u UNFILE] dir
usage: deduplify hash [-h] [-c COUNT] [-v] [-d DUPFILE] [-u UNFILE] [--restart] dir

positional arguments:
dir Path to directory to begin search from
@@ -91,6 +91,7 @@ optional arguments:
Destination file for duplicated hashes. Must be a JSON file. Default: duplicates.json
-u UNFILE, --unfile UNFILE
Destination file for unique hashes. Must be a JSON file. Default: uniques.json
--restart Restart a run of hashing files and skip over files that have already been hashed. Output files containing duplicated and unique filenames must already exist.
```
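Both output files are plain JSON, so the results of a resumed or completed hash run can be inspected directly. A minimal sketch, assuming the default `duplicates.json` produced by `deduplify hash`, whose keys are MD5 hashes and whose values are lists of file paths:

```python
import json

# Inspect the duplicate groups written by `deduplify hash`
# (the default output filename is assumed here)
with open("duplicates.json") as f:
    duplicates = json.load(f)  # {md5_hash: [filepath, filepath, ...]}

for md5_hash, paths in duplicates.items():
    print(f"{md5_hash}: {len(paths)} copies")
    for path in paths:
        print(f"  {path}")
```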

### Comparing files
2 changes: 1 addition & 1 deletion deduplify/_version.py
@@ -7,5 +7,5 @@

from incremental import Version

__version__ = Version("deduplify", 0, 1, 5)
__version__ = Version("deduplify", 0, 2, 0)
__all__ = ["__version__"]
26 changes: 22 additions & 4 deletions deduplify/cli.py
@@ -1,5 +1,6 @@
import argparse
import logging
import os
import sys
from multiprocessing import cpu_count

@@ -28,6 +29,18 @@ def setup_logging(verbose=False):
)


def resolvepath(path):
"""Resolve and normalize a path
1. Handle tilde expansion; turn ~/.ssh into /home/user/.ssh and
~otheruser/bin to /home/otheruser/bin
2. Normalize the path so that it doesn't contain relative segments, turning
e.g. /usr/local/../bin to /usr/bin
3. Get the real path of the actual file, resolving symbolic links
"""
return os.path.realpath(os.path.normpath(os.path.expanduser(path)))
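
A short illustration of how the three steps compose (a sketch; the home directory and symlink below are hypothetical):

```python
from deduplify.cli import resolvepath

# Hypothetical setup: the user's home is /home/user and /home/user/data is a
# symlink to /mnt/storage/data. The three steps then give:
#   expanduser("~/data/../data") -> "/home/user/data/../data"
#   normpath(...)                -> "/home/user/data"
#   realpath(...)                -> "/mnt/storage/data"
print(resolvepath("~/data/../data"))
```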


def parse_args(args):
parser = argparse.ArgumentParser(
description="Find and delete duplicated files in messy datasets"
@@ -56,7 +69,7 @@ def parse_args(args):
# Positional parser
parser_pos = argparse.ArgumentParser(add_help=False)
parser_pos.add_argument(
"dir", type=str, help="Path to directory to begin search from"
"dir", type=resolvepath, help="Path to directory to begin search from"
)

# Hash subcommand
@@ -70,19 +83,24 @@
parser_hash.add_argument(
"-d",
"--dupfile",
type=str,
type=resolvepath,
dest="dupfile",
default="duplicates.json",
help="Destination file for duplicated hashes. Must be a JSON file. Default: duplicates.json",
)
parser_hash.add_argument(
"-u",
"--unfile",
type=str,
type=resolvepath,
dest="unfile",
default="uniques.json",
help="Destination file for unique hashes. Must be a JSON file. Default: uniques.json",
)
parser_hash.add_argument(
"--restart",
action="store_true",
help="Restart a run of hashing files and skip over files that have already been hashed. Output files containing duplicated and unique filenames must already exist.",
)

# Compare subcommand
parser_compare = subparsers.add_parser(
@@ -95,7 +113,7 @@
parser_compare.add_argument(
"-i",
"--infile",
type=str,
type=resolvepath,
default="duplicates.json",
help="Filename to analyse. Must be a JSON file. Default: duplicates.json",
)
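
A quick sanity check of the new flag and the `resolvepath` type (a sketch; it assumes `parse_args` returns the parsed `argparse.Namespace`, which is outside the lines shown here):

```python
from deduplify.cli import parse_args

# Hypothetical check of the new --restart flag
args = parse_args(["hash", "~/some_data"])
assert args.restart is False      # store_true flags default to False

args = parse_args(["hash", "~/some_data", "--restart"])
assert args.restart is True
assert "~" not in args.dir        # resolvepath expanded the tilde
```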
99 changes: 90 additions & 9 deletions deduplify/hash_files.py
@@ -8,7 +8,7 @@
Author: Sarah Gibson
Python version: >=3.7 (developed with 3.8)
"""

import fnmatch
import hashlib
import json
import logging
@@ -18,10 +18,32 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple

from tqdm import tqdm

logger = logging.getLogger()
EXPANDED_USER = os.path.expanduser("~")


def get_total_number_of_files(target_dir: str, file_ext: str = ".xml") -> int:
"""Count the total number of files of a given extension in a directory.
Args:
target_dir (str): The target directory to search.
file_ext (str): The file extension to search for. Default: .xml
Returns:
int: The number of files with the matching extension within the tree
of the target directory
"""
logger.info("Calculating number of files that will be hashed in %s" % target_dir)

output = len(fnmatch.filter(os.listdir(target_dir), f"*{file_ext}"))

logger.info(f"{output} files to be hashed in {target_dir}")

return output
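
Note that the count above uses `os.listdir`, which only sees files directly inside `target_dir`, whereas `run_hash` below walks the whole tree; a recursive variant (a sketch, not part of this commit) could look like:

```python
import fnmatch
import os

def count_files_recursively(target_dir: str, file_ext: str = ".xml") -> int:
    """Count matching files in target_dir and every subdirectory below it."""
    total = 0
    for _, _, filenames in os.walk(target_dir):
        total += len(fnmatch.filter(filenames, f"*{file_ext}"))
    return total
```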


def hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]:
"""Calculate the MD5 hash of a given file
@@ -76,42 +98,101 @@ def filter_dict(results: dict) -> Tuple[dict, dict]:
for value in duplicated.values():
total += len(value)

logger.info("Number of identical files: %s" % total)
logger.info("Number of duplicated files: %s" % total)

return duplicated, unique


def run_hash(dir: str, count: int, dupfile: str, unfile: str, **kwargs):
def transform_dict(input_dict: dict) -> dict:
"""Transforms a dictionary with str type values into one with list type
values
Args:
input_dict (dict): of type {key: str}
Returns:
dict: of type {key: [str]}
"""
output_dict = {key: [value] for (key, value) in input_dict.items()}
return output_dict


def restart_run(dupfile: os.path, unfile: os.path) -> Tuple[dict, list]:
"""When restarting a hash run, read in and wrangle the previous output files
to reconstruct the dictionary and identify which files need to be skipped
Args:
dupfile (os.path): Path to the file containing duplicated hashes and filenames
unfile (os.path): Path to the file containing unique hashes and filenames
"""
logger.info("Restarting hashing process")
logger.info("Checking required files exist")
for filename in [dupfile, unfile]:
if not os.path.exists(filename):
raise FileNotFoundError(f"{filename} must exist to restart a hash run!")

logger.info("Reading in files")
with open(dupfile) as stream:
dup_dict = json.load(stream)
with open(unfile) as stream:
un_dict = json.load(stream)

un_dict = transform_dict(un_dict)

pre_hashed_dict = {**dup_dict, **un_dict}
hashes = defaultdict(list, pre_hashed_dict)

files_to_skip = [item for values in pre_hashed_dict.values() for item in values]

return hashes, files_to_skip


def run_hash(
dir: str, count: int, dupfile: str, unfile: str, restart: bool = False, **kwargs
):
"""Hash files within a directory structure
Args:
dir (str): Root directory to search under
count (int): Number of threads to parallelise over
dupfile (str): JSON file location for duplicated hashes
unfile (str): JSON file location for unique hashes
restart (bool): If true, will restart a hash run. dupfile and unfile
must exist since the filenames already hashed will be skipped.
Default: False.
"""
# Check the directory path exists
if not os.path.exists(dir):
raise ValueError("Please provide a known filepath!")

total_file_num = get_total_number_of_files(dir)

if restart:
hashes, files_to_skip = restart_run(dupfile, unfile)
else:
hashes = defaultdict(list) # Empty dict to store hashes in
files_to_skip = []

logger.info("Walking structure of: %s" % dir)
logger.info("Generating MD5 hashes for files...")
hashes = defaultdict(list) # Empty dict to store hashes in
counter = 0

for dirName, subdirs, fileList in os.walk(dir):
total = total_file_num - len(files_to_skip)
pbar = tqdm(total=total)

for dirName, _, fileList in os.walk(dir):
with ThreadPoolExecutor(max_workers=count) as executor:
futures = [
executor.submit(hashfile, os.path.join(dirName, filename))
for filename in fileList
if filename not in files_to_skip
]
for future in as_completed(futures):
hash, filepath = future.result()
hashes[hash].append(filepath)

counter += 1
print(f"Total files hashed: {counter}", end="\r")
sys.stdout.flush()
pbar.update(1)

pbar.close()

dup_dict, unique_dict = filter_dict(hashes) # Filter the results
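
Putting the pieces together, a resumed run might be invoked like this (a sketch; the directory path and thread count are hypothetical, and `duplicates.json`/`uniques.json` must already exist from the interrupted run):

```python
from deduplify.hash_files import run_hash

# Hypothetical resumed run: files already listed in the two output files are
# skipped, and only the remaining files are hashed.
run_hash(
    dir="/data/archive",        # root directory to walk (hypothetical path)
    count=4,                    # number of hashing threads
    dupfile="duplicates.json",
    unfile="uniques.json",
    restart=True,
)
```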

4 changes: 2 additions & 2 deletions dev-requirements.txt
@@ -1,2 +1,2 @@
coverage
pytest
coverage==5.3
pytest==6.2.5
2 changes: 2 additions & 0 deletions setup.cfg
@@ -0,0 +1,2 @@
[aliases]
test=pytest
3 changes: 2 additions & 1 deletion setup.py
@@ -78,5 +78,6 @@
"Programming Language :: Python :: 3.7",
],
use_incremental=True,
setup_requires=["incremental"],
setup_requires=["incremental", "pytest-runner"],
tests_require=test_require,
)
10 changes: 10 additions & 0 deletions tests/assets/test_duplicates.json
@@ -0,0 +1,10 @@
{
"key1": [
"valueA",
"valueB"
],
"key2": [
"valueC",
"valueD"
]
}
4 changes: 4 additions & 0 deletions tests/assets/test_uniques.json
@@ -0,0 +1,4 @@
{
"key3": "valueE",
"key4": "valueF"
}
2 changes: 1 addition & 1 deletion tests/test_delete.py
@@ -6,7 +6,7 @@

@patch("deduplify.del_empty_dirs.os.rmdir")
def test_del_empty_dirs(mock):
test_dir = os.path.join("tests", "testdir")
test_dir = os.path.join("tests", "testdir_empty")
test_call = [call(os.path.abspath(test_dir))]

if not os.path.exists(test_dir):
60 changes: 59 additions & 1 deletion tests/test_hash.py
@@ -1,4 +1,13 @@
from deduplify.hash_files import filter_dict
import os
from collections import defaultdict

from deduplify.hash_files import (
filter_dict,
get_total_number_of_files,
hashfile,
restart_run,
transform_dict,
)


def test_filter_dict():
@@ -8,3 +17,52 @@ def test_filter_dict():

assert dupdict == {"hash2": ["filepath2", "filepath3"]}
assert undict == {"hash1": "filepath1"}


def test_get_total_number_of_files():
dirpath = os.path.join("tests", "testdir")

output1 = get_total_number_of_files(dirpath)
output2 = get_total_number_of_files(dirpath, file_ext=".txt")

assert output1 == 2
assert output2 == 1


def test_hashfile():
path = os.path.join("tests", "assets", "test_infile.json")

md5_hash, outpath = hashfile(path)

assert md5_hash == "f3fb257d843b252bdc0442402552d840"
assert outpath == path


def test_transform_dict():
test_dict = {"key1": "value1", "key2": "value2"}
expected = {"key1": ["value1"], "key2": ["value2"]}

output = transform_dict(test_dict)

assert output == expected


def test_restart_run():
dup_file = os.path.join(os.getcwd(), "tests", "assets", "test_duplicates.json")
un_file = os.path.join(os.getcwd(), "tests", "assets", "test_uniques.json")

expected_dict = defaultdict(
list,
{
"key1": ["valueA", "valueB"],
"key2": ["valueC", "valueD"],
"key3": ["valueE"],
"key4": ["valueF"],
},
)
expected_list = ["valueA", "valueB", "valueC", "valueD", "valueE", "valueF"]

hashes, files_to_be_skipped = restart_run(dup_file, un_file)

assert hashes == expected_dict
assert files_to_be_skipped == expected_list
Empty file added tests/testdir/test_file_1.xml
Empty file.
Empty file added tests/testdir/test_file_2.xml
Empty file.
Empty file added tests/testdir/test_file_3.txt
Empty file.
