From 976eade6fbd177bb77a961451b72b7a3cd9d9eb3 Mon Sep 17 00:00:00 2001 From: Nicol Lo Date: Tue, 30 Jun 2020 18:57:48 +0800 Subject: [PATCH 1/7] skip broken symlinks when hashing --- pydra/engine/helpers_file.py | 8 +++++++- pydra/engine/tests/test_helpers_file.py | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/pydra/engine/helpers_file.py b/pydra/engine/helpers_file.py index 0de413104c..68fba0812d 100644 --- a/pydra/engine/helpers_file.py +++ b/pydra/engine/helpers_file.py @@ -73,8 +73,11 @@ def hash_file(afile, chunk_len=8192, crypto=sha256, raise_notfound=True): if afile is None or isinstance(afile, LazyField) or isinstance(afile, list): return None if not Path(afile).is_file(): + if Path(afile).is_symlink(): + logger.debug(f"Skip broken symlink to file: {afile}") + return None if raise_notfound: - raise RuntimeError('File "%s" not found.' % afile) + raise RuntimeError(f"File {afile} not found.") return None crypto_obj = crypto() @@ -124,6 +127,9 @@ def hash_dir( if dirpath is None or isinstance(dirpath, LazyField) or isinstance(dirpath, list): return None if not Path(dirpath).is_dir(): + if Path(dirpath).is_symlink(): + logger.debug(f"Skip broken symlink to directory: {dirpath}") + return None if raise_notfound: raise FileNotFoundError(f"Directory {dirpath} not found.") return None diff --git a/pydra/engine/tests/test_helpers_file.py b/pydra/engine/tests/test_helpers_file.py index 56cc8000c6..b6a9abdbbf 100644 --- a/pydra/engine/tests/test_helpers_file.py +++ b/pydra/engine/tests/test_helpers_file.py @@ -6,6 +6,8 @@ from ..helpers_file import ( split_filename, + hash_file, + hash_dir, copyfile, copyfiles, on_cifs, @@ -57,6 +59,24 @@ def _temp_analyze_files_prime(tmpdir): return Path(orig_img.strpath), Path(orig_hdr.strpath) +def test_hash_broken_symlink(_temp_analyze_files): + """Skip broken symlinks in hash_file and hash_dir""" + orig_img, orig_hdr = _temp_analyze_files + pth, _ = os.path.split(orig_hdr) + + img_link = os.path.join(pth, "img_link") + os.symlink(orig_img, img_link) + os.remove(orig_img) # delete original file to create broken link + assert hash_file(img_link) == None + + new_dir = os.path.join(pth, "temp") + os.mkdir(new_dir) + new_dir_link = os.path.join(pth, "new_dir_link") + os.symlink(new_dir, new_dir_link, target_is_directory=True) + os.rmdir(new_dir) # delete directory to create broken link + assert hash_dir(new_dir_link) == None + + def test_copyfile(_temp_analyze_files): orig_img, orig_hdr = _temp_analyze_files pth, fname = os.path.split(orig_img) From 03070d65cc2f3d2b25a022c6f07ad335769bbe7f Mon Sep 17 00:00:00 2001 From: Nicol Lo Date: Wed, 1 Jul 2020 23:03:19 +0800 Subject: [PATCH 2/7] clean up hash_file and hash_dir --- pydra/engine/helpers_file.py | 40 ++++++++++--------------- pydra/engine/tests/test_helpers_file.py | 17 +++++++++-- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/pydra/engine/helpers_file.py b/pydra/engine/helpers_file.py index 68fba0812d..c62cfc2a8a 100644 --- a/pydra/engine/helpers_file.py +++ b/pydra/engine/helpers_file.py @@ -70,25 +70,20 @@ def hash_file(afile, chunk_len=8192, crypto=sha256, raise_notfound=True): """Compute hash of a file using 'crypto' module.""" from .specs import LazyField - if afile is None or isinstance(afile, LazyField) or isinstance(afile, list): - return None - if not Path(afile).is_file(): - if Path(afile).is_symlink(): - logger.debug(f"Skip broken symlink to file: {afile}") - return None - if raise_notfound: - raise RuntimeError(f"File {afile} not found.") + try: + crypto_obj = crypto() + with open(afile, "rb") as fp: + while True: + data = fp.read(chunk_len) + if not data: + break + crypto_obj.update(data) + return crypto_obj.hexdigest() + except: + if not Path(afile).is_symlink() and raise_notfound: + raise FileNotFoundError(f"File {afile} not found.") return None - crypto_obj = crypto() - with open(afile, "rb") as fp: - while True: - data = fp.read(chunk_len) - if not data: - break - crypto_obj.update(data) - return crypto_obj.hexdigest() - def hash_dir( dirpath, @@ -124,13 +119,8 @@ def hash_dir( """ from .specs import LazyField - if dirpath is None or isinstance(dirpath, LazyField) or isinstance(dirpath, list): - return None if not Path(dirpath).is_dir(): - if Path(dirpath).is_symlink(): - logger.debug(f"Skip broken symlink to directory: {dirpath}") - return None - if raise_notfound: + if not Path(dirpath).is_symlink() and raise_notfound: # ignore broken symlinks raise FileNotFoundError(f"Directory {dirpath} not found.") return None @@ -150,8 +140,8 @@ def hash_dir( crypto_obj = crypto() for h in file_hashes: - crypto_obj.update(h.encode()) - + if h: # ignore broken links and broken files + crypto_obj.update(h.encode()) return crypto_obj.hexdigest() diff --git a/pydra/engine/tests/test_helpers_file.py b/pydra/engine/tests/test_helpers_file.py index b6a9abdbbf..915247f72d 100644 --- a/pydra/engine/tests/test_helpers_file.py +++ b/pydra/engine/tests/test_helpers_file.py @@ -67,14 +67,25 @@ def test_hash_broken_symlink(_temp_analyze_files): img_link = os.path.join(pth, "img_link") os.symlink(orig_img, img_link) os.remove(orig_img) # delete original file to create broken link - assert hash_file(img_link) == None + assert not hash_file(img_link) new_dir = os.path.join(pth, "temp") os.mkdir(new_dir) + new_hdr = os.path.join(new_dir, "new_hdr") + os.rename(orig_hdr, new_hdr) # move hdr to new_dir + new_dir_link = os.path.join(pth, "new_dir_link") + new_hdr_link = os.path.join(new_dir, "new_hdr_link") os.symlink(new_dir, new_dir_link, target_is_directory=True) - os.rmdir(new_dir) # delete directory to create broken link - assert hash_dir(new_dir_link) == None + os.symlink(new_hdr, new_hdr_link) # link to a file in same directory + + os.remove(new_hdr) # break new_hdr_link + assert hash_dir(new_dir) # dir containing broken links should still hash + assert not hash_file(new_hdr_link) + + os.remove(new_hdr_link) + os.rmdir(new_dir) # delete directory to break new_dir_link + assert not hash_dir(new_dir_link) def test_copyfile(_temp_analyze_files): From c67d496026c13e1a6c9633c9c9d082007f84f31d Mon Sep 17 00:00:00 2001 From: Nicole Lo Date: Wed, 8 Jul 2020 21:49:24 +0800 Subject: [PATCH 3/7] Update pydra/engine/helpers_file.py Co-authored-by: Chris Markiewicz --- pydra/engine/helpers_file.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pydra/engine/helpers_file.py b/pydra/engine/helpers_file.py index c62cfc2a8a..279cd71fc5 100644 --- a/pydra/engine/helpers_file.py +++ b/pydra/engine/helpers_file.py @@ -71,18 +71,20 @@ def hash_file(afile, chunk_len=8192, crypto=sha256, raise_notfound=True): from .specs import LazyField try: - crypto_obj = crypto() - with open(afile, "rb") as fp: - while True: - data = fp.read(chunk_len) - if not data: - break - crypto_obj.update(data) - return crypto_obj.hexdigest() - except: + fp = open(afile, "rb") + except FileNotFoundError: if not Path(afile).is_symlink() and raise_notfound: - raise FileNotFoundError(f"File {afile} not found.") - return None + raise + return None + crypto_obj = crypto() + with fp as fp: + while True: + data = fp.read(chunk_len) + if not data: + break + crypto_obj.update(data) + return crypto_obj.hexdigest() + def hash_dir( From 836ef6569753b4d2697c069bcce151846675ce92 Mon Sep 17 00:00:00 2001 From: Nicol Lo Date: Wed, 22 Jul 2020 21:23:49 +0800 Subject: [PATCH 4/7] add option to raise error for broken paths and symlinks during hashing --- pydra/engine/helpers.py | 2 +- pydra/engine/helpers_file.py | 30 ++++++++++++++++--------- pydra/engine/tests/test_helpers_file.py | 29 ------------------------ 3 files changed, 21 insertions(+), 40 deletions(-) diff --git a/pydra/engine/helpers.py b/pydra/engine/helpers.py index b684656516..e2a6c4302b 100644 --- a/pydra/engine/helpers.py +++ b/pydra/engine/helpers.py @@ -457,7 +457,7 @@ def hash_value(value, tp=None, metadata=None): and is_existing_file(value) and "container_path" not in metadata ): - return hash_file(value) + return hash_file(value, raise_notfound=True) elif ( (tp is File or "pydra.engine.specs.Directory" in str(tp)) and is_existing_file(value) diff --git a/pydra/engine/helpers_file.py b/pydra/engine/helpers_file.py index 279cd71fc5..6b110400ad 100644 --- a/pydra/engine/helpers_file.py +++ b/pydra/engine/helpers_file.py @@ -66,16 +66,20 @@ def split_filename(fname): return pth, fname, ext -def hash_file(afile, chunk_len=8192, crypto=sha256, raise_notfound=True): +def hash_file(afile, raise_notfound=True, chunk_len=8192, crypto=sha256): """Compute hash of a file using 'crypto' module.""" from .specs import LazyField try: fp = open(afile, "rb") except FileNotFoundError: - if not Path(afile).is_symlink() and raise_notfound: - raise - return None + if raise_notfound: + if Path(afile).is_symlink(): + raise FileNotFoundError(f"Broken symlink to file: {afile}") + else: + raise FileNotFoundError(f"File doesn't exist: {afile} ") + return None + crypto_obj = crypto() with fp as fp: while True: @@ -84,7 +88,6 @@ def hash_file(afile, chunk_len=8192, crypto=sha256, raise_notfound=True): break crypto_obj.update(data) return crypto_obj.hexdigest() - def hash_dir( @@ -100,6 +103,10 @@ def hash_dir( computes the hash of that list of hashes to return a single hash value. The directory is traversed recursively. + Note + ------- + Directories that contain some broken links should still be hashed. + Parameters ---------- dirpath : :obj:`str` @@ -111,8 +118,8 @@ def hash_dir( ignore_hidden_dirs : :obj:`bool` If `True`, ignore files in directories that begin with `.`. raise_notfound : :obj:`bool` - If `True` and `dirpath` does not exist, raise `FileNotFound` exception. If - `False` and `dirpath` does not exist, return `None`. + If `True` and `dirpath` does not exist, raise `FileNotFound` exception. + If `False` and `dirpath` does not exist, return `None`. Returns ------- @@ -122,8 +129,11 @@ def hash_dir( from .specs import LazyField if not Path(dirpath).is_dir(): - if not Path(dirpath).is_symlink() and raise_notfound: # ignore broken symlinks - raise FileNotFoundError(f"Directory {dirpath} not found.") + if raise_notfound: + if Path(dirpath).is_symlink(): + raise FileNotFoundError(f"Broken symlink to directory: {dirpath}") + else: + raise FileNotFoundError(f"Directory doesn't exist: {dirpath}") return None file_hashes = [] @@ -137,7 +147,7 @@ def hash_dir( for filename in filenames: if ignore_hidden_files and filename.startswith("."): continue - this_hash = hash_file(dpath / filename) + this_hash = hash_file(dpath / filename, raise_notfound=False) file_hashes.append(this_hash) crypto_obj = crypto() diff --git a/pydra/engine/tests/test_helpers_file.py b/pydra/engine/tests/test_helpers_file.py index 915247f72d..5462d6274f 100644 --- a/pydra/engine/tests/test_helpers_file.py +++ b/pydra/engine/tests/test_helpers_file.py @@ -59,35 +59,6 @@ def _temp_analyze_files_prime(tmpdir): return Path(orig_img.strpath), Path(orig_hdr.strpath) -def test_hash_broken_symlink(_temp_analyze_files): - """Skip broken symlinks in hash_file and hash_dir""" - orig_img, orig_hdr = _temp_analyze_files - pth, _ = os.path.split(orig_hdr) - - img_link = os.path.join(pth, "img_link") - os.symlink(orig_img, img_link) - os.remove(orig_img) # delete original file to create broken link - assert not hash_file(img_link) - - new_dir = os.path.join(pth, "temp") - os.mkdir(new_dir) - new_hdr = os.path.join(new_dir, "new_hdr") - os.rename(orig_hdr, new_hdr) # move hdr to new_dir - - new_dir_link = os.path.join(pth, "new_dir_link") - new_hdr_link = os.path.join(new_dir, "new_hdr_link") - os.symlink(new_dir, new_dir_link, target_is_directory=True) - os.symlink(new_hdr, new_hdr_link) # link to a file in same directory - - os.remove(new_hdr) # break new_hdr_link - assert hash_dir(new_dir) # dir containing broken links should still hash - assert not hash_file(new_hdr_link) - - os.remove(new_hdr_link) - os.rmdir(new_dir) # delete directory to break new_dir_link - assert not hash_dir(new_dir_link) - - def test_copyfile(_temp_analyze_files): orig_img, orig_hdr = _temp_analyze_files pth, fname = os.path.split(orig_img) From 75cdbf6159074e248d459a8c0e67a74c4e901287 Mon Sep 17 00:00:00 2001 From: Nicol Lo Date: Wed, 22 Jul 2020 21:25:41 +0800 Subject: [PATCH 5/7] add tests for broken file/link handling in hash_dir and hash_file --- pydra/engine/tests/test_helpers_file.py | 73 +++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/pydra/engine/tests/test_helpers_file.py b/pydra/engine/tests/test_helpers_file.py index 5462d6274f..3fbbbcfb53 100644 --- a/pydra/engine/tests/test_helpers_file.py +++ b/pydra/engine/tests/test_helpers_file.py @@ -186,6 +186,79 @@ def test_get_related_files_noninclusive(_temp_analyze_files): assert orig_hdr not in related_files +def test_hash_file_broken(_temp_analyze_files): + """ + Test how broken file paths are handled during hashing + + When `raise_notfound` is True, raise error for broken file paths or + broken symlinks. Otherwise return None + """ + # broken file path + assert hash_file("broken_file.txt", raise_notfound=False) is None + with pytest.raises(FileNotFoundError) as e1: + hash_file("broken_file.txt", raise_notfound=True) + assert "File doesn't exist" in str(e1) + + # broken symlink to file + orig_img, _ = _temp_analyze_files + pth, _ = os.path.split(orig_img) + img_link = os.path.join(pth, "img_link") + os.symlink(orig_img, img_link) + os.remove(orig_img) + assert hash_file(img_link, raise_notfound=False) is None + with pytest.raises(FileNotFoundError) as e2: + hash_file(img_link, raise_notfound=True) + assert "Broken symlink to file" in str(e2) + + +def test_hash_dir_broken(tmpdir): + """ + Test how broken dir paths or broken files in dirs are handled during hashing + + When `raise_notfound` is True raise error for broken dir paths or + broken symlinks to directories, otherwise return None. + + Directories that contain some broken links should still be hashed. + """ + + assert hash_dir("/broken_dir_path/", raise_notfound=False) is None + with pytest.raises(FileNotFoundError) as e1: + hash_dir("/broken_dir_path/", raise_notfound=True) + assert "Directory doesn't exist" in str(e1) + + # broken symlink to dir path + dir1 = tmpdir.join("dir1") + os.mkdir(dir1) + dir1_link = tmpdir.join("dir1_link") + os.symlink(dir1, dir1_link) + os.rmdir(dir1) + assert hash_dir(dir1_link, raise_notfound=False) is None + with pytest.raises(FileNotFoundError) as e1: + hash_dir(dir1_link, raise_notfound=True) + assert "Broken symlink to directory" in str(e1) + + # dirs with broken symlink(s) are hashed + dir2 = tmpdir.join("dir2") + os.mkdir(dir2) + file1 = dir2.join("file1") + file2 = dir2.join("file2") + file1.open("w+").close() + file2.open("w+").close() + + file1_link = dir2.join("file1_link") + os.symlink(file1, file1_link) + os.remove(file1) # file1_link is broken + assert isinstance(hash_dir(dir2, raise_notfound=False), str) + assert isinstance(hash_dir(dir2, raise_notfound=True), str) + + # dir is still hashed when all files in dir are broken + file2_link = dir2.join("file2_link") + os.symlink(file2, file2_link) + os.remove(file2) # file2_link is broken + assert isinstance(hash_dir(dir2, raise_notfound=False), str) + assert isinstance(hash_dir(dir2, raise_notfound=True), str) + + @pytest.mark.parametrize( "filename, expected", [ From c2594e991dea903c2c6bb587d81dfefaaa18f8a6 Mon Sep 17 00:00:00 2001 From: Nicol Lo Date: Thu, 23 Jul 2020 18:27:46 +0800 Subject: [PATCH 6/7] fix exception message handling is hash_file and hash_dir tests for python3.8 --- pydra/engine/tests/test_helpers_file.py | 53 +++++++++++++++---------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/pydra/engine/tests/test_helpers_file.py b/pydra/engine/tests/test_helpers_file.py index 3fbbbcfb53..723294804b 100644 --- a/pydra/engine/tests/test_helpers_file.py +++ b/pydra/engine/tests/test_helpers_file.py @@ -186,46 +186,55 @@ def test_get_related_files_noninclusive(_temp_analyze_files): assert orig_hdr not in related_files -def test_hash_file_broken(_temp_analyze_files): +def test_hash_file_broken1(): """ - Test how broken file paths are handled during hashing + Test how broken paths are hashed in hash_file - When `raise_notfound` is True, raise error for broken file paths or - broken symlinks. Otherwise return None + When `raise_notfound` is False return None, otherwise raise error """ - # broken file path assert hash_file("broken_file.txt", raise_notfound=False) is None - with pytest.raises(FileNotFoundError) as e1: + with pytest.raises(FileNotFoundError) as e: hash_file("broken_file.txt", raise_notfound=True) - assert "File doesn't exist" in str(e1) + assert "File doesn't exist" in str(e.value) - # broken symlink to file + +def test_hash_file_broken2(_temp_analyze_files): + """ + Test how broken symlinks are hashed in hash_file + + When `raise_notfound` is False return None, otherwise raise error + """ orig_img, _ = _temp_analyze_files pth, _ = os.path.split(orig_img) img_link = os.path.join(pth, "img_link") os.symlink(orig_img, img_link) os.remove(orig_img) assert hash_file(img_link, raise_notfound=False) is None - with pytest.raises(FileNotFoundError) as e2: + with pytest.raises(FileNotFoundError) as e: hash_file(img_link, raise_notfound=True) - assert "Broken symlink to file" in str(e2) + assert "Broken symlink to file" in str(e.value) -def test_hash_dir_broken(tmpdir): +def test_hash_dir_broken1(tmpdir): """ - Test how broken dir paths or broken files in dirs are handled during hashing - - When `raise_notfound` is True raise error for broken dir paths or - broken symlinks to directories, otherwise return None. + Test how broken paths are hashed in hash_dir - Directories that contain some broken links should still be hashed. + When `raise_notfound` is False return None, otherwise raise error """ - assert hash_dir("/broken_dir_path/", raise_notfound=False) is None - with pytest.raises(FileNotFoundError) as e1: + with pytest.raises(FileNotFoundError) as e: hash_dir("/broken_dir_path/", raise_notfound=True) - assert "Directory doesn't exist" in str(e1) + assert "Directory doesn't exist" in str(e.value) + +def test_hash_dir_broken2(tmpdir): + """ + Test how broken symlinks are hashed in hash_dir + + When `raise_notfound` is True, broken symlink to a directory raises an error + when it's directly provided as an input. + However, directories that contain some broken links should always be hashed. + """ # broken symlink to dir path dir1 = tmpdir.join("dir1") os.mkdir(dir1) @@ -233,9 +242,9 @@ def test_hash_dir_broken(tmpdir): os.symlink(dir1, dir1_link) os.rmdir(dir1) assert hash_dir(dir1_link, raise_notfound=False) is None - with pytest.raises(FileNotFoundError) as e1: + with pytest.raises(FileNotFoundError) as e: hash_dir(dir1_link, raise_notfound=True) - assert "Broken symlink to directory" in str(e1) + assert "Broken symlink to directory" in str(e.value) # dirs with broken symlink(s) are hashed dir2 = tmpdir.join("dir2") @@ -251,7 +260,7 @@ def test_hash_dir_broken(tmpdir): assert isinstance(hash_dir(dir2, raise_notfound=False), str) assert isinstance(hash_dir(dir2, raise_notfound=True), str) - # dir is still hashed when all files in dir are broken + # dirs with all broken symlinks are still hashed file2_link = dir2.join("file2_link") os.symlink(file2, file2_link) os.remove(file2) # file2_link is broken From 4b93c0c247860d2ce2feb804c493c4806aa3cb7d Mon Sep 17 00:00:00 2001 From: Dorota Jarecka Date: Thu, 23 Jul 2020 15:34:39 -0700 Subject: [PATCH 7/7] adding tests that takes broken link as an input --- pydra/engine/tests/test_shelltask.py | 50 ++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/pydra/engine/tests/test_shelltask.py b/pydra/engine/tests/test_shelltask.py index 992af79049..e6ecd859f5 100644 --- a/pydra/engine/tests/test_shelltask.py +++ b/pydra/engine/tests/test_shelltask.py @@ -1153,6 +1153,56 @@ def test_shell_cmd_inputspec_9(plugin, results_function, tmpdir): assert res.output.stdout == "hello from boston" +def test_shell_cmd_inputspec_10_err(tmpdir): + """ checking if the proper error is raised when broken symlink is provided + as a input field with File as a type + """ + + file_1 = tmpdir.join("file_1.txt") + with open(file_1, "w") as f: + f.write("hello") + file_2 = tmpdir.join("file_2.txt") + + # creating symlink and removing the original file + os.symlink(file_1, file_2) + os.remove(file_1) + + cmd_exec = "cat" + + my_input_spec = SpecInfo( + name="Input", + fields=[ + ( + "files", + attr.ib( + type=File, + metadata={ + "position": 1, + "argstr": "", + "help_string": "a file", + "mandatory": True, + }, + ), + ) + ], + bases=(ShellSpec,), + ) + + shelly = ShellCommandTask( + name="shelly", executable=cmd_exec, files=file_2, input_spec=my_input_spec + ) + + # checking if the broken symlink error is raised when checksum is calculated + with pytest.raises(FileNotFoundError) as e: + checksum = shelly.checksum + assert "Broken symlink" in str(e.value) + + # checking if the broken symlink error is raised when the task is run + with pytest.raises(FileNotFoundError) as e: + res = shelly() + assert "Broken symlink" in str(e.value) + + @pytest.mark.parametrize("results_function", [result_no_submitter, result_submitter]) def test_shell_cmd_inputspec_copyfile_1(plugin, results_function, tmpdir): """ shelltask changes a file in place,