From ca36407a65cf81196dac37cae680d282aba07c42 Mon Sep 17 00:00:00 2001 From: Webb Scales <7795764+webbnh@users.noreply.github.com> Date: Wed, 2 Aug 2023 12:24:03 -0400 Subject: [PATCH] Attempt to improve upload performance (#3501) Use subprocess tar command to extract files from tarball PBENCH-1226 --- lib/pbench/server/__init__.py | 5 + lib/pbench/server/cache_manager.py | 240 +++++-- .../test/unit/server/test_cache_manager.py | 636 +++++++++++++----- 3 files changed, 631 insertions(+), 250 deletions(-) diff --git a/lib/pbench/server/__init__.py b/lib/pbench/server/__init__.py index 346eb81931..ee2f8cdf4c 100644 --- a/lib/pbench/server/__init__.py +++ b/lib/pbench/server/__init__.py @@ -5,6 +5,7 @@ from datetime import datetime, timedelta, tzinfo from enum import auto, Enum from logging import Logger +import os from pathlib import Path from time import time as _time from typing import Dict, List, Optional, Union @@ -21,6 +22,10 @@ JSONOBJECT = Dict[JSONSTRING, JSONVALUE] JSON = JSONVALUE +# Define a type hint for "Path-like" parameters, so the cumbersomeness of the +# type won't discourage us from using it where it is warranted. +PathLike = Union[str, bytes, os.PathLike] + class OperationCode(Enum): """Enumeration for CRUD operations. diff --git a/lib/pbench/server/cache_manager.py b/lib/pbench/server/cache_manager.py index 4715d48193..8de5ff964b 100644 --- a/lib/pbench/server/cache_manager.py +++ b/lib/pbench/server/cache_manager.py @@ -6,11 +6,11 @@ import shlex import shutil import subprocess -import tarfile +import time from typing import Any, IO, Optional, Union from pbench.common import MetadataLog, selinux -from pbench.server import JSONOBJECT, PbenchServerConfig +from pbench.server import JSONOBJECT, PathLike, PbenchServerConfig from pbench.server.database.models.datasets import Dataset from pbench.server.utils import get_tarball_md5 @@ -35,7 +35,7 @@ def __str__(self) -> str: class BadFilename(CacheManagerError): """A bad tarball path was given.""" - def __init__(self, path: Union[str, Path]): + def __init__(self, path: PathLike): self.path = str(path) def __str__(self) -> str: @@ -45,7 +45,7 @@ def __str__(self) -> str: class CacheExtractBadPath(CacheManagerError): """Request to extract a path that's bad or not a file""" - def __init__(self, tar_name: Path, path: Union[str, Path]): + def __init__(self, tar_name: Path, path: PathLike): self.name = tar_name.name self.path = str(path) @@ -85,7 +85,7 @@ def __str__(self) -> str: class TarballUnpackError(CacheManagerError): - """An error occured trying to unpack a tarball.""" + """An error occurred trying to unpack a tarball.""" def __init__(self, tarball: Path, error: str): self.tarball = tarball @@ -134,6 +134,14 @@ class CacheObject: type: CacheType +# Type hint definitions for the cache map. +# +# CacheMapEntry: { "details": CacheObject, "children": CacheMap } +# CacheMap: { "": CacheMapEntry, ... } +CacheMapEntry = dict[str, Union[CacheObject, "CacheMap"]] +CacheMap = dict[str, CacheMapEntry] + + def make_cache_object(dir_path: Path, path: Path) -> CacheObject: """Collects the file info @@ -185,14 +193,14 @@ def make_cache_object(dir_path: Path, path: Path) -> CacheObject: class Inventory: - """Encapsulate the tarfile TarFile object and file stream + """Encapsulate the file stream and subprocess.Popen object This encapsulation allows cleaner downstream handling, so that we can close - both the extractfile file stream and the tarball object itself when we're - done. This eliminates interference with later operations. 
+    both the extracted file stream and the Popen object itself when we're done.
+    This eliminates interference with later operations.
     """
 
-    def __init__(self, stream: IO[bytes], tarfile: Optional[tarfile.TarFile] = None):
+    def __init__(self, stream: IO[bytes], subproc: Optional[subprocess.Popen] = None):
         """Construct an instance to track extracted inventory
 
         This encapsulates many byte stream operations so that it can be used
@@ -200,16 +208,40 @@ def __init__(self, stream: IO[bytes], tarfile: Optional[tarfile.TarFile] = None)
 
         Args:
             stream: the data stream of a specific tarball member
-            tarfile: the TarFile object
+            subproc: the subprocess producing the stream, if any
         """
-        self.tarfile = tarfile
+        self.subproc = subproc
         self.stream = stream
 
     def close(self):
-        """Close both the byte stream and, if open, a tarfile object"""
+        """Close the byte stream and clean up the Popen object"""
+        if self.subproc:
+            try:
+                if self.subproc.poll() is None:
+                    # The subprocess is still running: kill it, drain the outputs,
+                    # and wait for its termination. (If the timeout on the wait()
+                    # is exceeded, it will raise subprocess.TimeoutExpired rather
+                    # than waiting forever...it's not clear what will happen after
+                    # that, but there's not a good alternative, so I hope this
+                    # never actually happens.)
+                    self.subproc.kill()
+                    if self.subproc.stdout:
+                        while self.subproc.stdout.read(4096):
+                            pass
+                    if self.subproc.stderr:
+                        while self.subproc.stderr.read(4096):
+                            pass
+                    self.subproc.wait(60.0)
+            finally:
+                # Release our reference to the subprocess.Popen object so that the
+                # object can be reclaimed.
+                self.subproc = None
 
+        # Explicitly close the stream, in case there never was an associated
+        # subprocess. (If there was an associated subprocess, the streams are
+        # now empty, and we'll assume that they are closed when the Popen
+        # object is reclaimed.)
         self.stream.close()
-        if self.tarfile:
-            self.tarfile.close()
 
     def getbuffer(self):
         """Return the underlying byte buffer (used by send_file)"""
@@ -229,7 +261,7 @@ def seek(self, *args, **kwargs) -> int:
 
     def __repr__(self) -> str:
         """Return a string representation"""
-        return f"<Stream {self.stream} from {self.tarfile}>"
+        return f"<Stream {self.stream} from {self.subproc}>"
 
     def __iter__(self):
         """Allow iterating through lines in the buffer"""
@@ -255,6 +287,11 @@ class Tarball:
     database representations of a dataset.
     """
 
+    # Wait no more than a minute for the tar(1) command to start producing
+    # output; perform the wait in 0.02s increments.
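+    #
+    # (An illustrative sketch of how these constants are applied; the real
+    # loop in extract(), below, also stops polling when the subprocess exits:
+    #
+    #     start = time.time()
+    #     while not tarproc.stdout.peek():
+    #         if time.time() - start > Tarball.TAR_EXEC_TIMEOUT:
+    #             tarproc.kill()
+    #             raise subprocess.TimeoutExpired(...)
+    #         time.sleep(Tarball.TAR_EXEC_WAIT)
+    #
+    # that is, at most 60.0s / 0.02s == 3000 polls before giving up.)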
+ TAR_EXEC_TIMEOUT = 60.0 + TAR_EXEC_WAIT = 0.02 + def __init__(self, path: Path, controller: "Controller"): """Construct a `Tarball` object instance @@ -282,7 +319,7 @@ def __init__(self, path: Path, controller: "Controller"): self.cache: Path = controller.cache / self.resource_id # Record hierarchy of a Tar ball - self.cachemap: Optional[JSONOBJECT] = None + self.cachemap: Optional[CacheMap] = None # Record the base of the unpacked files for cache management, which # is (self.cache / self.name) and will be None when the cache is @@ -362,11 +399,11 @@ def create(cls, tarball: Path, controller: "Controller") -> "Tarball": except Exception as e: try: md5_destination.unlink() - except Exception as e: + except Exception as md5_e: controller.logger.error( "Unable to recover by removing {} MD5 after tarball copy failure: {}", name, - e, + md5_e, ) controller.logger.error( "ERROR copying dataset {} tarball {}: {}", name, tarball, e @@ -399,13 +436,15 @@ def cache_map(self, dir_path: Path): dir_path: root directory """ root_dir_path = dir_path.parent - cmap = {dir_path.name: {"details": make_cache_object(root_dir_path, dir_path)}} - dir_queue = deque([(dir_path, cmap)]) + cmap: CacheMap = { + dir_path.name: {"details": make_cache_object(root_dir_path, dir_path)} + } + dir_queue = deque(((dir_path, cmap),)) while dir_queue: dir_path, parent_map = dir_queue.popleft() tar_n = dir_path.name - curr = {} + curr: CacheMapEntry = {} for l_path in dir_path.glob("*"): tar_info = make_cache_object(root_dir_path, l_path) curr[l_path.name] = {"details": tar_info} @@ -418,13 +457,13 @@ def cache_map(self, dir_path: Path): self.cachemap = cmap @staticmethod - def traverse_cmap(path: Path, cachemap: dict) -> dict[str, dict]: + def traverse_cmap(path: Path, cachemap: CacheMap) -> CacheMapEntry: """Sequentially traverses the cachemap to find the leaf of a relative path reference Args: - path: relative path of the sub-directory/file - cachemap: dictionary mapping of the root Dicrectory + path: relative path of the subdirectory/file + cachemap: dictionary mapping of the root directory Raises: BadDirpath if the directory/file path is not valid @@ -437,9 +476,9 @@ def traverse_cmap(path: Path, cachemap: dict) -> dict[str, dict]: try: for file_l in file_list: - info = f_entries[file_l] + info: CacheMapEntry = f_entries[file_l] if info["details"].type == CacheType.DIRECTORY: - f_entries = info["children"] + f_entries: CacheMap = info["children"] else: raise BadDirpath( f"Found a file {file_l!r} where a directory was expected in path {str(path)!r}" @@ -454,13 +493,13 @@ def get_info(self, path: Path) -> JSONOBJECT: """Returns the details of the given file/directory in dict format NOTE: This requires a call to the cache_map method to build a map that - can be traversed. Currently this is done only on unpack, and isn't + can be traversed. Currently, this is done only on unpack, and isn't useful except within the `pbench-index` process. This map needs to either be built dynamically (potentially expensive) or persisted in SQL or perhaps Redis. 
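 
         For example, an illustrative call (the exact key set is assumed here;
         it mirrors the CacheObject fields captured in the map):
 
             tb.get_info(Path("dir_name/subdir1/f11.txt"))
             # -> {"name": "f11.txt", "type": CacheType.FILE, "size": ..., ...}
 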
Args: - path: path of the file/sub-directory + path: path of the file/subdirectory Raises: BadDirpath on bad directory path @@ -506,7 +545,7 @@ def get_info(self, path: Path) -> JSONOBJECT: return fd_info @staticmethod - def extract(tarball_path: Path, path: Path) -> Inventory: + def extract(tarball_path: Path, path: str) -> Inventory: """Returns a file stream for a file within a tarball Args: @@ -515,25 +554,87 @@ def extract(tarball_path: Path, path: Path) -> Inventory: Returns: An inventory object that mimics an IO[bytes] object while also - maintaining a reference to the tarfile TarFile object to be - closed later. + maintaining a reference to the subprocess.Popen object to be + cleaned up later. Raise: - TarballNotFound on failure opening the tarball CacheExtractBadPath if the target cannot be extracted + TarballUnpackError on other tar-command failures + Any exception raised by subprocess.Popen() + subprocess.TimeoutExpired if the tar command hangs """ - try: - tar = tarfile.open(tarball_path, "r:*") - except Exception as exc: - raise TarballNotFound(str(tarball_path)) from exc - try: - stream = tar.extractfile(str(path)) - except Exception as exc: - raise CacheExtractBadPath(tarball_path, path) from exc - else: - if not stream: - raise CacheExtractBadPath(tarball_path, path) - return Inventory(stream, tar) + tar_path = shutil.which("tar") + if tar_path is None: + raise TarballUnpackError( + tarball_path, "External 'tar' executable not found" + ) + + # The external tar utility offers better capabilities than the + # Standard Library package, so run it in a subprocess: extract + # the target member from the specified tar archive and direct it to + # stdout; we expect only one occurrence of the target member, so stop + # processing as soon as we find it instead of looking for additional + # instances of it later in the archive -- this is a huge savings when + # the archive is very large. + tar_command = [ + str(tar_path), + "xf", + tarball_path, + "--to-stdout", + "--occurrence=1", + path, + ] + tarproc = subprocess.Popen( + tar_command, + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + # Wait for the tar(1) command to start producing output, but stop + # waiting if the subprocess exits or if it takes too long. + start = time.time() + while not tarproc.stdout.peek(): + if tarproc.poll() is not None: + break + + elapsed = time.time() - start + if elapsed > Tarball.TAR_EXEC_TIMEOUT: + # No signs of life from the subprocess. Kill it to ensure that + # the Python runtime can clean it up after we leave, and report + # the failure. + tarproc.kill() + raise subprocess.TimeoutExpired( + cmd=tar_command, + timeout=elapsed, + output=tarproc.stdout, + stderr=tarproc.stderr, + ) + + time.sleep(Tarball.TAR_EXEC_WAIT) + + # If the return code is None (meaning the command is still running) or + # is zero (meaning it completed successfully), then return the stream + # containing the extracted file to our caller, and return the Popen + # object so that we can clean it up when the Inventory object is closed. + if not tarproc.returncode: + return Inventory(tarproc.stdout, tarproc) + + # The tar command was invoked successfully (otherwise, the Popen() + # constructor would have raised an exception), but it exited with + # an error code. We have to glean what went wrong by looking at + # stderr, which is fragile but the only option. 
Rather than + # relying on looking for specific text, we assume that, if the error + # references the archive member, then it was a bad path; otherwise, it + # was some sort of error unpacking it. + error_text = tarproc.stderr.read().decode() + if path in error_text: + # "tar: missing_member.txt: Not found in archive" + raise CacheExtractBadPath(tarball_path, path) + # "tar: /path/to/bad_tarball.tar.xz: Cannot open: No such file or directory" + raise TarballUnpackError( + tarball_path, f"Unexpected error from {tar_path}: {error_text!r}" + ) def get_inventory(self, path: str) -> Optional[JSONOBJECT]: """Access the file stream of a tarball member file. @@ -551,9 +652,9 @@ def get_inventory(self, path: str) -> Optional[JSONOBJECT]: "stream": Inventory(self.tarball_path.open("rb"), None), } else: - file_path = Path(self.name) / path + file_path = f"{self.name}/{path}" stream = Tarball.extract(self.tarball_path, file_path) - info = {"name": file_path.name, "type": CacheType.FILE, "stream": stream} + info = {"name": file_path, "type": CacheType.FILE, "stream": stream} return info @@ -579,7 +680,10 @@ def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]: @staticmethod def subprocess_run( - command: str, working_dir: Path, exception: type[CacheManagerError], ctx: Path + command: str, + working_dir: PathLike, + exception: type[CacheManagerError], + ctx: Path, ): """Runs a command as a subprocess. @@ -629,14 +733,15 @@ def unpack(self): Rather than passing the indexer the root `/srv/pbench/.cache` or trying to update all of the indexer code (which still jumps back and forth between the tarball and the unpacked files), we maintain the "cache" - directory as two paths: self.cache which is the directory we manage + directory as two paths: self.cache which is the directory we manage here and pass to the indexer (/srv/pbench/.cache/) and the actual unpacked root (/srv/pbench/.cache//). """ self.cache.mkdir(parents=True) try: - tar_command = f"tar -x --no-same-owner --delay-directory-restore --force-local --file='{str(self.tarball_path)}'" + tar_command = "tar -x --no-same-owner --delay-directory-restore " + tar_command += f"--force-local --file='{str(self.tarball_path)}'" self.subprocess_run( tar_command, self.cache, TarballUnpackError, self.tarball_path ) @@ -770,18 +875,18 @@ def create( controller_dir.mkdir(exist_ok=True, mode=0o755) return cls(controller_dir, options.CACHE, logger) - def create_tarball(self, tarfile: Path) -> Tarball: + def create_tarball(self, tarfile_path: Path) -> Tarball: """Create a new dataset tarball object under the controller The new tarball object is linked to the controller so we can find it. Args: - tarfile: Path to source tarball file + tarfile_path: Path to source tarball file Returns: Tarball object """ - tarball = Tarball.create(tarfile, self) + tarball = Tarball.create(tarfile_path, self) self.datasets[tarball.resource_id] = tarball self.tarballs[tarball.name] = tarball return tarball @@ -974,7 +1079,7 @@ def _discover_controllers(self): """Build a representation of the ARCHIVE tree Record all controllers (top level directories), and the tarballs that - that represent datasets within them. + represent datasets within them. """ for file in self.archive_root.iterdir(): if file.is_dir() and file.name != CacheManager.TEMPORARY: @@ -1008,12 +1113,12 @@ def find_dataset(self, dataset_id: str) -> Tarball: # The dataset isn't already known; so search for it in the ARCHIVE tree # and (if found) discover the controller containing that dataset. 
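         # (A sketch of the invariant relied on here, per get_tarball_md5()
         # and the lifecycle unit test: a dataset's resource ID is the MD5
         # digest of its tarball, so
         #
         #     get_tarball_md5(file) == dataset_id
         #
         # picks out the tarball belonging to the requested dataset.)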
- for dir in self.archive_root.iterdir(): - if dir.is_dir() and dir.name != self.TEMPORARY: - for file in dir.glob(f"*{Dataset.TARBALL_SUFFIX}"): + for dir_entry in self.archive_root.iterdir(): + if dir_entry.is_dir() and dir_entry.name != self.TEMPORARY: + for file in dir_entry.glob(f"*{Dataset.TARBALL_SUFFIX}"): md5 = get_tarball_md5(file) if md5 == dataset_id: - self._add_controller(dir) + self._add_controller(dir_entry) return self.datasets[dataset_id] raise TarballNotFound(dataset_id) @@ -1037,7 +1142,7 @@ def find_dataset(self, dataset_id: str) -> Tarball: # Remove the tarball and MD5 file from ARCHIVE after uncaching the # unpacked directory tree. - def create(self, tarfile: Path) -> Tarball: + def create(self, tarfile_path: Path) -> Tarball: """Bring a new tarball under cache manager management. Move a dataset tarball and companion MD5 file into the specified @@ -1045,8 +1150,7 @@ def create(self, tarfile: Path) -> Tarball: necessary. Args: - controller: associated controller name - tarfile: dataset tarball path + tarfile_path: dataset tarball path Raises BadDirpath: Failure on extracting the file from tarball @@ -1059,19 +1163,19 @@ def create(self, tarfile: Path) -> Tarball: Tarball object """ try: - metadata = Tarball._get_metadata(tarfile) + metadata = Tarball._get_metadata(tarfile_path) if metadata: controller_name = metadata["run"]["controller"] else: controller_name = "unknown" except Exception as exc: - raise MetadataError(tarfile, exc) + raise MetadataError(tarfile_path, exc) if not controller_name: - raise MetadataError(tarfile, "no controller value") - if not tarfile.is_file(): - raise BadFilename(tarfile) - name = Dataset.stem(tarfile) + raise MetadataError(tarfile_path, ValueError("no controller value")) + if not tarfile_path.is_file(): + raise BadFilename(tarfile_path) + name = Dataset.stem(tarfile_path) if name in self.tarballs: raise DuplicateTarball(name) if controller_name in self.controllers: @@ -1079,7 +1183,7 @@ def create(self, tarfile: Path) -> Tarball: else: controller = Controller.create(controller_name, self.options, self.logger) self.controllers[controller_name] = controller - tarball = controller.create_tarball(tarfile) + tarball = controller.create_tarball(tarfile_path) tarball.metadata = metadata self.tarballs[tarball.name] = tarball self.datasets[tarball.resource_id] = tarball @@ -1122,7 +1226,7 @@ def get_inventory(self, dataset_id: str, target: str) -> Optional[JSONOBJECT]: } Args: - dataset: Dataset resource ID + dataset_id: Dataset resource ID target: relative file path within the tarball Returns: diff --git a/lib/pbench/test/unit/server/test_cache_manager.py b/lib/pbench/test/unit/server/test_cache_manager.py index 2bd59f4c2b..d483fa5153 100644 --- a/lib/pbench/test/unit/server/test_cache_manager.py +++ b/lib/pbench/test/unit/server/test_cache_manager.py @@ -6,8 +6,7 @@ import re import shutil import subprocess -import tarfile -from typing import IO +from typing import Optional import pytest @@ -71,10 +70,16 @@ def selinux_enabled(monkeypatch): monkeypatch.setattr("pbench.common.selinux.restorecon", lambda a: None) -def fake_get_metadata(tb_path): +def fake_get_metadata(_tb_path): return {"pbench": {"date": "2002-05-16T00:00:00"}, "run": {"controller": "ABC"}} +MEMBER_NOT_FOUND_MSG = b"mock-tar: metadata.log: Not found in mock-archive" +CANNOT_OPEN_MSG = ( + b"mock-tar: /mock/result.tar.xz: Cannot open: No such mock-file or mock-directory" +) + + class TestCacheManager: def test_create(self, server_config, make_logger): """ @@ -137,13 +142,13 @@ 
def test_metadata( ): """Test behavior with metadata.log access errors.""" - def fake_metadata(tar_path): + def fake_metadata(_tar_path): return {"pbench": {"date": "2002-05-16T00:00:00"}} - def fake_metadata_run(tar_path): + def fake_metadata_run(_tar_path): return {"pbench": {"date": "2002-05-16T00:00:00"}, "run": {}} - def fake_metadata_controller(tar_path): + def fake_metadata_controller(_tar_path): return { "pbench": {"date": "2002-05-16T00:00:00"}, "run": {"controller": ""}, @@ -168,7 +173,8 @@ def fake_metadata_controller(tar_path): cm.create(source_tarball) assert str(exc.value) == expected_metaerror - expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: 'no controller value'" + expected_metaerror = "A problem occurred processing metadata.log " + expected_metaerror += f"from {source_tarball!s}: 'no controller value'" with monkeypatch.context() as m: m.setattr(Tarball, "_get_metadata", fake_metadata_controller) with pytest.raises(MetadataError) as exc: @@ -210,7 +216,7 @@ def test_create_bad( cm.create(source_tarball) - # The create will remove the source files, so trying again should + # The create call will remove the source files, so trying again should # result in an error. with pytest.raises(BadFilename) as exc: cm.create(source_md5) @@ -221,10 +227,8 @@ def test_create_bad( tarball = cm.find_dataset(md5) with pytest.raises(DuplicateTarball) as exc: cm.create(tarball.tarball_path) - assert ( - str(exc.value) - == "A dataset tarball named 'pbench-user-benchmark_some + config_2021.05.01T12.42.42' is already present" - ) + msg = "A dataset tarball named 'pbench-user-benchmark_some + config_2021.05.01T12.42.42' is already present" + assert str(exc.value) == msg assert tarball.metadata == fake_get_metadata(tarball.tarball_path) assert exc.value.tarball == tarball.name @@ -253,10 +257,10 @@ def test_duplicate( def test_tarball_subprocess_run_with_exception(self, monkeypatch): """Test to check the subprocess_run functionality of the Tarball when - an Exception occured""" + an Exception occurred""" my_command = "mycommand" - def mock_run(args, **kwargs): + def mock_run(args, **_kwargs): assert args[0] == my_command raise subprocess.TimeoutExpired(my_command, 43) @@ -266,18 +270,18 @@ def mock_run(args, **kwargs): my_dir = "my_dir" with pytest.raises(TarballUnpackError) as exc: - Tarball.subprocess_run(command, my_dir, TarballUnpackError, my_dir) - assert ( - str(exc.value) - == f"An error occurred while unpacking {my_dir}: Command '{my_command}' timed out after 43 seconds" - ) + Tarball.subprocess_run( + command, my_dir, TarballUnpackError, Path(my_dir) + ) + msg = f"An error occurred while unpacking {my_dir}: Command '{my_command}' timed out after 43 seconds" + assert str(exc.value) == msg def test_tarball_subprocess_run_with_returncode(self, monkeypatch): """Test to check the subprocess_run functionality of the Tarball when returncode value is not zero""" my_command = "mycommand" - def mock_run(args, **kwargs): + def mock_run(args, **_kwargs): assert args[0] == my_command return subprocess.CompletedProcess( args, returncode=1, stdout=None, stderr="Some error unpacking tarball\n" @@ -290,10 +294,10 @@ def mock_run(args, **kwargs): with pytest.raises(TarballUnpackError) as exc: Tarball.subprocess_run(command, my_dir, TarballUnpackError, my_dir) - assert ( - str(exc.value) - == f"An error occurred while unpacking {my_dir.name}: {my_command} exited with status 1: 'Some error unpacking tarball'" - ) + + msg = f"An error occurred while unpacking 
{my_dir.name}: {my_command} " + msg += "exited with status 1: 'Some error unpacking tarball'" + assert str(exc.value) == msg def test_tarball_subprocess_run_success(self, monkeypatch): """Test to check the successful run of subprocess_run functionality of the Tarball.""" @@ -340,7 +344,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path: subdir11/ subdir12/ f121_sym -> ../../subdir1/subdir15 - f122_sym -> ./bad_subdir/nonexixtent_file.txt + f122_sym -> ./bad_subdir/nonexistent_file.txt subdir13/ f131_sym -> /etc/passwd subdir14/ @@ -437,7 +441,7 @@ def __init__(self, path: Path, controller: Controller): self.cache = controller.cache / "ABC" self.unpacked = None - def test_unpack_tar_subprocess_exception(self, monkeypatch): + def test_unpack_tar_subprocess_exception(self, make_logger, monkeypatch): """Show that, when unpacking of the Tarball fails and raises an Exception it is handled successfully.""" tar = Path("/mock/A.tar.xz") @@ -451,7 +455,7 @@ def mock_rmtree(path: Path, ignore_errors=False): assert ignore_errors assert path == cache / "ABC" - def mock_run(command, dir_path, exception, dir_p): + def mock_run(command, _dir_path, exception, dir_p): verb = "tar" assert command.startswith(verb) raise exception(dir_p, subprocess.TimeoutExpired(verb, 43)) @@ -462,17 +466,15 @@ def mock_run(command, dir_path, exception, dir_p): m.setattr(shutil, "rmtree", mock_rmtree) m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__) m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__) - tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None)) + tb = Tarball(tar, Controller(Path("/mock/archive"), cache, make_logger)) with pytest.raises(TarballUnpackError) as exc: tb.unpack() - assert ( - str(exc.value) - == f"An error occurred while unpacking {tar}: Command 'tar' timed out after 43 seconds" - ) + msg = f"An error occurred while unpacking {tar}: Command 'tar' timed out after 43 seconds" + assert str(exc.value) == msg assert exc.type == TarballUnpackError assert rmtree_called - def test_unpack_find_subprocess_exception(self, monkeypatch): + def test_unpack_find_subprocess_exception(self, make_logger, monkeypatch): """Show that, when permission change of the Tarball fails and raises an Exception it is handled successfully.""" tar = Path("/mock/A.tar.xz") @@ -486,7 +488,7 @@ def mock_rmtree(path: Path, ignore_errors=False): assert ignore_errors assert path == cache / "ABC" - def mock_run(command, dir_path, exception, dir_p): + def mock_run(command, _dir_path, exception, dir_p): verb = "find" if command.startswith(verb): raise exception(dir_p, subprocess.TimeoutExpired(verb, 43)) @@ -499,24 +501,23 @@ def mock_run(command, dir_path, exception, dir_p): m.setattr(shutil, "rmtree", mock_rmtree) m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__) m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__) - tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None)) + tb = Tarball(tar, Controller(Path("/mock/archive"), cache, make_logger)) with pytest.raises(TarballModeChangeError) as exc: tb.unpack() - assert ( - str(exc.value) - == f"An error occurred while changing file permissions of {cache / 'ABC'}: Command 'find' timed out after 43 seconds" - ) + msg = "An error occurred while changing file permissions of " + msg += f"{cache / 'ABC'}: Command 'find' timed out after 43 seconds" + assert str(exc.value) == msg assert exc.type == TarballModeChangeError assert rmtree_called - def 
test_unpack_success(self, monkeypatch): + def test_unpack_success(self, make_logger, monkeypatch): """Test to check the unpacking functionality of the CacheManager""" tar = Path("/mock/A.tar.xz") cache = Path("/mock/.cache") call = list() - def mock_run(args, **kwargs): + def mock_run(args, **_kwargs): call.append(args[0]) tar_target = "--file=" + str(tar) @@ -525,18 +526,23 @@ def mock_run(args, **kwargs): args, returncode=0, stdout="Successfully Unpacked!", stderr=None ) + def mock_resolve(_path, _strict=False): + """In this scenario, there are no symlinks, + so resolve() should never be called.""" + raise AssertionError("Unexpected call to Path.resolve()") + with monkeypatch.context() as m: m.setattr(Path, "mkdir", lambda path, parents: None) m.setattr(subprocess, "run", mock_run) - m.setattr(Path, "resolve", lambda path, strict: path) + m.setattr(Path, "resolve", mock_resolve) m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__) m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__) - tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None)) + tb = Tarball(tar, Controller(Path("/mock/archive"), cache, make_logger)) tb.unpack() assert call == ["tar", "find"] assert tb.unpacked == cache / "ABC" / tb.name - def test_cache_map_success(self, monkeypatch, tmp_path): + def test_cache_map_success(self, make_logger, monkeypatch, tmp_path): """Test to build the cache map of the root directory""" tar = Path("/mock/dir_name.tar.xz") cache = Path("/mock/.cache") @@ -544,28 +550,25 @@ def test_cache_map_success(self, monkeypatch, tmp_path): with monkeypatch.context() as m: m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__) m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__) - tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None)) + tb = Tarball(tar, Controller(Path("/mock/archive"), cache, make_logger)) tar_dir = TestCacheManager.MockController.generate_test_result_tree( tmp_path, "dir_name" ) tb.cache_map(tar_dir) - assert ( - tb.cachemap["dir_name"]["children"]["subdir1"]["details"].name - == "subdir1" - ) - assert ( - tb.cachemap["dir_name"]["children"]["subdir1"]["children"]["subdir14"][ - "children" - ]["subdir141"]["children"]["f1412_sym"]["details"].type - == CacheType.SYMLINK - ) + + sd1 = tb.cachemap["dir_name"]["children"]["subdir1"] + assert sd1["details"].name == "subdir1" + + sd141 = sd1["children"]["subdir14"]["children"]["subdir141"] + assert sd141["children"]["f1412_sym"]["details"].type == CacheType.SYMLINK @pytest.mark.parametrize( "file_path, expected_msg", [ ( "/dir_name/subdir1/f11.txt", - "The path '/dir_name/subdir1/f11.txt' is an absolute path, we expect relative path to the root directory.", + "The path '/dir_name/subdir1/f11.txt' is an absolute path, " + "we expect relative path to the root directory.", ), ( "dir_name/subdir1/subdir11/../f11.txt", @@ -593,12 +596,13 @@ def test_cache_map_success(self, monkeypatch, tmp_path): ), ( "dir_name/subdir1/subdir14/subdir141/f1412_sym/ne_file", - "Found a file 'f1412_sym' where a directory was expected in path 'dir_name/subdir1/subdir14/subdir141/f1412_sym/ne_file'", + "Found a file 'f1412_sym' where a directory was expected " + "in path 'dir_name/subdir1/subdir14/subdir141/f1412_sym/ne_file'", ), ], ) def test_cache_map_bad_dir_path( - self, monkeypatch, tmp_path, file_path, expected_msg + self, make_logger, monkeypatch, tmp_path, file_path, expected_msg ): """Test to check bad directory or file path""" tar = 
Path("/mock/dir_name.tar.xz") @@ -607,7 +611,7 @@ def test_cache_map_bad_dir_path( with monkeypatch.context() as m: m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__) m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__) - tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None)) + tb = Tarball(tar, Controller(Path("/mock/archive"), cache, make_logger)) tar_dir = TestCacheManager.MockController.generate_test_result_tree( tmp_path, "dir_name" ) @@ -768,6 +772,7 @@ def test_cache_map_bad_dir_path( ) def test_cache_map_traverse_cmap( self, + make_logger, monkeypatch, tmp_path, file_path, @@ -785,7 +790,7 @@ def test_cache_map_traverse_cmap( with monkeypatch.context() as m: m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__) m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__) - tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None)) + tb = Tarball(tar, Controller(Path("/mock/archive"), cache, make_logger)) tar_dir = TestCacheManager.MockController.generate_test_result_tree( tmp_path, "dir_name" ) @@ -862,7 +867,7 @@ def test_cache_map_traverse_cmap( ], ) def test_cache_map_get_info_cmap( - self, monkeypatch, tmp_path, file_path, expected_msg + self, make_logger, monkeypatch, tmp_path, file_path, expected_msg ): """Test to check if the info returned by the cachemap is correct""" tar = Path("/mock/dir_name.tar.xz") @@ -871,7 +876,7 @@ def test_cache_map_get_info_cmap( with monkeypatch.context() as m: m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__) m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__) - tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None)) + tb = Tarball(tar, Controller(Path("/mock/archive"), cache, make_logger)) tar_dir = TestCacheManager.MockController.generate_test_result_tree( tmp_path, "dir_name" ) @@ -891,7 +896,7 @@ def test_cache_map_get_info_cmap( ], ) def test_get_inventory( - self, monkeypatch, tmp_path, file_path, exp_file_type, exp_stream + self, make_logger, monkeypatch, tmp_path, file_path, exp_file_type, exp_stream ): """Test to extract file contents/stream from a file""" tar = Path("/mock/dir_name.tar.xz") @@ -902,7 +907,7 @@ def test_get_inventory( m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__) m.setattr(Tarball, "extract", lambda _t, _p: Inventory(exp_stream)) m.setattr(Path, "open", lambda _s, _m="rb": exp_stream) - tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None)) + tb = Tarball(tar, Controller(Path("/mock/archive"), cache, make_logger)) tar_dir = TestCacheManager.MockController.generate_test_result_tree( tmp_path, "dir_name" ) @@ -913,19 +918,19 @@ def test_get_inventory( def test_cm_inventory(self, monkeypatch, server_config, make_logger): """Verify the happy path of the high level get_inventory""" - id = None + dataset_id = None class MockTarball: def get_inventory(self, target: str) -> JSONOBJECT: return { - "name": target, + "name": target if self else None, # Quiet the linter "type": CacheType.FILE, "stream": Inventory(io.BytesIO(b"success")), } - def mock_find_dataset(self, dataset: str) -> MockTarball: - nonlocal id - id = dataset + def mock_find_dataset(_self, dataset: str) -> MockTarball: + nonlocal dataset_id + dataset_id = dataset return MockTarball() @@ -933,84 +938,140 @@ def mock_find_dataset(self, dataset: str) -> MockTarball: m.setattr(CacheManager, "find_dataset", mock_find_dataset) cm = CacheManager(server_config, make_logger) inventory = 
cm.get_inventory("dataset", "target") - assert id == "dataset" + assert dataset_id == "dataset" assert inventory["name"] == "target" assert inventory["stream"].read() == b"success" - def test_tarfile_extract(self, monkeypatch, tmp_path): - """Test to check Tarball.extract success""" - tar = Path("/mock/result.tar.xz") - contents = b"[test]\nfoo=bar\n" - - class MockTarFile: - def extractfile(self, path: str) -> IO[bytes]: - if path == "metadata.log": - return io.BytesIO(contents) - raise Exception("you can't handle exceptions") - - def fake_tarfile_open(tarfile: str, *args): - if str(tarfile) == str(tar): - return MockTarFile() - raise Exception("You didn't see this coming") - - with monkeypatch.context() as m: - m.setattr(tarfile, "open", fake_tarfile_open) - got = Tarball.extract(tar, Path("metadata.log")) - assert isinstance(got, Inventory) - assert got.read() == contents - - def test_tarfile_open_fails(self, monkeypatch, tmp_path): - """Test to check non-existent tarfile""" - tar = Path("/mock/result.tar.xz") - - def fake_tarfile_open(self, path): - raise tarfile.TarError("Invalid Tarfile") - - with monkeypatch.context() as m: - m.setattr(tarfile, "open", fake_tarfile_open) - - expected_error_msg = f"The dataset tarball named '{tar}' is not found" - with pytest.raises(TarballNotFound) as exc: - Tarball.extract(tar, Path("subdir1/f11.txt")) - assert str(exc.value) == expected_error_msg - - def test_tarfile_extractfile_fails(self, monkeypatch, tmp_path): - """Test to check non-existent path in tarfile""" - tar = Path("/mock/result.tar.xz") - path = Path("subdir/f11.txt") - - class MockTarFile: - def extractfile(self, path): - raise Exception("Mr Robot refuses trivial human command") - - def fake_tarfile_open(self, path): - return MockTarFile() - - with monkeypatch.context() as m: - m.setattr(tarfile, "open", fake_tarfile_open) - expected_error_msg = f"Unable to extract {path} from {tar.name}" - with pytest.raises(CacheExtractBadPath) as exc: - Tarball.extract(tar, path) - assert str(exc.value) == expected_error_msg - - def test_tarfile_extractfile_notfile(self, monkeypatch, tmp_path): - """Test to check target that's not a file""" + @pytest.mark.parametrize( + ( + "tar_path", + "popen_fail", + "wait_cnt", + "peek_return", + "poll_return", + "proc_return", + "stderr_contents", + ), + ( + (None, False, 0, b"", None, 2, b""), # No tar executable + ("/usr/bin/tar", True, 0, b"", None, 2, b""), # Popen failure + # Success, output in peek + ("/usr/bin/tar", False, 0, b"[test]", None, 0, b""), + ("/usr/bin/tar", False, 0, b"", 0, 0, b""), # Success, poll() show success + # Loop/sleep twice, then success + ("/usr/bin/tar", False, 2, b"[test]", None, 0, b""), + # Member path failure + ("/usr/bin/tar", False, 0, b"", 1, 1, MEMBER_NOT_FOUND_MSG), + # Archive access failure + ("/usr/bin/tar", False, 0, b"", 1, 1, CANNOT_OPEN_MSG), + # Unexpected failure + ("/usr/bin/tar", False, 0, b"", 1, 1, b"mock-tar: bolt out of the blue!"), + # Hang, never returning output nor an exit code + ("/usr/bin/tar", False, 0, b"", None, None, b""), + ), + ) + def test_tarfile_extract( + self, + monkeypatch, + tmp_path, + tar_path: str, + popen_fail: bool, + wait_cnt: int, + peek_return: Optional[bytes], + poll_return: Optional[int], + proc_return: int, + stderr_contents: Optional[bytes], + ): + """Test to check Tarball.extract behaviors""" tar = Path("/mock/result.tar.xz") - path = Path("subdir/f11.txt") - - class MockTarFile: - def extractfile(self, path): - return None - - def fake_tarfile_open(self, path): - return 
MockTarFile() + path = "metadata.log" + stdout_contents = b"[test]\nfoo=bar\n" + + class MockBufferedReader(io.BufferedReader): + def __init__(self, contents: bytes): + # No effect, other than to quiet the linter + None if True else super().__init__(io.RawIOBase()) + self.contents = contents + self.loop_count = wait_cnt + + def close(self) -> None: + raise AssertionError( + "This test doesn't expect the stream to be closed." + ) + + def peek(self, size=0) -> bytes: + if self.loop_count > 0: + self.loop_count -= 1 + return b"" + return peek_return + + def read(self, _size: int = -1) -> bytes: + return self.contents + + class MockPopen(subprocess.Popen): + def __init__(self, *_args, **_kwargs): + # No effect, other than to quiet the linter + None if True else super().__init__([]) + if popen_fail: + raise ValueError( + "MockPopen pretending it was called with invalid arguments" + ) + self.stdout = MockBufferedReader(stdout_contents) + self.stderr = MockBufferedReader(stderr_contents) + self.returncode = None + self.loop_count = wait_cnt + + def poll(self) -> Optional[int]: + if self.loop_count > 0: + self.loop_count -= 1 + return None + self.returncode = poll_return + return poll_return + + def kill(self) -> None: + pass + + def mock_shutil_which( + cmd: str, _mode: int = os.F_OK | os.X_OK, _path: Optional[str] = None + ): + assert cmd == "tar" + return tar_path with monkeypatch.context() as m: - m.setattr(tarfile, "open", fake_tarfile_open) - expected_error_msg = f"Unable to extract {path} from {tar.name}" - with pytest.raises(CacheExtractBadPath) as exc: - Tarball.extract(tar, path) - assert str(exc.value) == expected_error_msg + m.setattr(shutil, "which", mock_shutil_which) + m.setattr(subprocess, "Popen", MockPopen) + m.setattr(Inventory, "close", MockBufferedReader.close) + + try: + got = Tarball.extract(tar, path) + except CacheExtractBadPath as exc: + assert tar_path + assert not popen_fail + assert stderr_contents == MEMBER_NOT_FOUND_MSG + assert str(exc) == f"Unable to extract {path} from {tar.name}" + except subprocess.TimeoutExpired as exc: + assert tar_path + assert not popen_fail + assert ( + not peek_return and not poll_return + ), f"Unexpected test timeout: {exc}" + except TarballUnpackError as exc: + if tar_path is None: + msg = "External 'tar' executable not found" + else: + assert not popen_fail + msg = f"Unexpected error from {tar_path}: {stderr_contents.decode()!r}" + assert stderr_contents != MEMBER_NOT_FOUND_MSG + assert str(exc) == f"An error occurred while unpacking {tar}: {msg}" + except ValueError: + assert tar_path + assert popen_fail + else: + assert tar_path + assert not popen_fail + assert peek_return or poll_return is not None + assert isinstance(got, Inventory) + assert got.read() == stdout_contents @pytest.mark.parametrize( "tarball,stream", (("hasmetalog.tar.xz", True), ("nometalog.tar.xz", False)) @@ -1018,7 +1079,6 @@ def fake_tarfile_open(self, path): def test_get_metadata(self, monkeypatch, tarball, stream): """Verify access and processing of `metadata.log`""" - @staticmethod def fake_extract(t: Path, f: Path): if str(t) == tarball and str(f) == f"{Dataset.stem(t)}/metadata.log": if stream: @@ -1027,7 +1087,7 @@ def fake_extract(t: Path, f: Path): raise Exception(f"Unexpected mock exception with stream:{stream}: {t}, {f}") with monkeypatch.context() as m: - m.setattr(Tarball, "extract", fake_extract) + m.setattr(Tarball, "extract", staticmethod(fake_extract)) metadata = Tarball._get_metadata(Path(tarball)) if stream: @@ -1035,34 +1095,258 @@ def 
fake_extract(t: Path, f: Path):
         else:
             assert metadata is None
 
-    def test_inventory(self):
-        closed = False
+    def test_inventory_without_subprocess(self):
+        """Test the Inventory class when used without a subprocess
 
-        class MockTarFile:
-            def close(self):
-                nonlocal closed
-                closed = True
+        This tests the Inventory class functions other than close(), which are
+        unaffected by whether a subprocess is driving the stream, and it also
+        tests the behavior of close() when there is no subprocess.
+        """
+        calls = []
+        my_buffer = bytes()
 
-            def __repr__(self) -> str:
-                return "<MockTarFile>"
+        class MockBufferedReader(io.BufferedReader):
+            def __init__(self):
+                # No effect, other than to quiet the linter
+                None if True else super().__init__(io.RawIOBase())
 
-        raw = b"abcde\nfghij\n"
-        stream = Inventory(io.BytesIO(raw), MockTarFile())
-        assert re.match(
-            r"^<Stream <_io\.BytesIO object at 0x[a-z0-9]+> from <MockTarFile>>$",
-            str(stream),
-        )
+            def close(self) -> None:
+                calls.append("close")
+
+            def getbuffer(self):
+                calls.append("getbuffer") if self else None  # Quiet the linter
+                return my_buffer
+
+            def read(self, _size: int = -1) -> bytes:
+                calls.append("read")
+                return b"read"
+
+            def readable(self) -> bool:
+                calls.append("readable")
+                return True
+
+            def readline(self, _size: int = -1) -> bytes:
+                """Return a non-empty byte-string on the first call; return an
+                empty string on subsequent calls."""
+                calls.append("readline")
+                return b"readline" if len(calls) < 2 else b""
+
+            def seek(self, offset: int, _whence: int = io.SEEK_SET) -> int:
+                calls.append("seek")
+                return offset
+
+        # Invoke the CUT
+        stream = Inventory(MockBufferedReader(), None)
+
+        assert stream.subproc is None
+
+        # Test Inventory.getbuffer()
+        calls.clear()
+        assert stream.getbuffer() is my_buffer and calls == ["getbuffer"]
+
+        # Test Inventory.read()
+        calls.clear()
+        assert stream.read() == b"read" and calls == ["read"]
+
+        # Test Inventory.readable()
+        calls.clear()
+        assert stream.readable() and calls == ["readable"]
 
-        assert stream.getbuffer() == raw
-        assert stream.readable()
-        assert stream.read(5) == b"abcde"
-        assert stream.read() == b"\nfghij\n"
-        assert stream.seek(0) == 0
-        assert [b for b in stream] == [b"abcde\n", b"fghij\n"]
+        # Test Inventory.seek()
+        calls.clear()
+        assert stream.seek(12345) == 12345 and calls == ["seek"]
+
+        # Test Inventory.__iter__() and Inventory.__next__()
+        calls.clear()
+        contents = [b for b in stream]
+        assert contents == [b"readline"] and calls == ["readline", "readline"]
+
+        # Test Inventory.__repr__()
+        assert str(stream) == f"<Stream {stream.stream} from None>"
+
+        # Test Inventory.close()
+        calls.clear()
         stream.close()
-        assert closed
-        with pytest.raises(ValueError):
-            stream.read()
+        assert calls == ["close"]
+
+    @pytest.mark.parametrize(
+        ("poll_val", "stdout_size", "stderr_size", "wait_timeout", "exp_calls"),
+        (
+            # The subprocess completed before the close() call
+            (0, 0, None, None, ["poll", "close"]),
+            # The subprocess is still running when close() is called, the wait
+            # does not time out, stdout is empty and there is no stderr.
+            (None, 0, None, False, ["poll", "kill", "stdout", "wait", "close"]),
+            # The subprocess is still running when close() is called, the wait
+            # does not time out, stderr is empty and there is no stdout.
+            (None, None, 0, False, ["poll", "kill", "stderr", "wait", "close"]),
+            # The subprocess is still running when close() is called, the wait
+            # does not time out, both stdout and stderr are present and empty.
+ (None, 0, 0, False, ["poll", "kill", "stdout", "stderr", "wait", "close"]), + # The subprocess is still running when close() is called, the wait + # does not time out, stdout and stderr each require one read to + # drain them (and a second to see that they are empty). + ( + None, + 2000, + 2000, + False, + [ + "poll", + "kill", + "stdout", + "stdout", + "stderr", + "stderr", + "wait", + "close", + ], + ), + # The subprocess is still running when close() is called, the wait + # does not time out, stdout and stderr each require two reads to + # drain them (and a third to see that they are empty). + ( + None, + 6000, + 6000, + False, + [ + "poll", + "kill", + "stdout", + "stdout", + "stdout", + "stderr", + "stderr", + "stderr", + "wait", + "close", + ], + ), + # The subprocess is still running when close() is called, the wait + # does not time out, stdout and stderr each require three reads to + # drain them (and a fourth to see that they are empty). + ( + None, + 9000, + 9000, + False, + [ + "poll", + "kill", + "stdout", + "stdout", + "stdout", + "stdout", + "stderr", + "stderr", + "stderr", + "stderr", + "wait", + "close", + ], + ), + # The subprocess is still running when close() is called, stdout is + # empty, there is no stderr, and the wait times out. + (None, 0, None, True, ["poll", "kill", "stdout", "wait"]), + ), + ) + def test_inventory( + self, poll_val, stdout_size, stderr_size, wait_timeout, exp_calls + ): + """Test the Inventory class when used with a subprocess + + This test focuses on the behavior of the close() function, since the + behavior of the other functions are checked in the previous test. + """ + my_calls = [] + + class MockPopen(subprocess.Popen): + def __init__( + self, + stdout: Optional[io.BufferedReader], + stderr: Optional[io.BufferedReader], + ): + # No effect, other than to quiet the linter. + None if True else super().__init__([]) + self.stdout = stdout + self.stderr = stderr + self.returncode = None + + def kill(self): + my_calls.append("kill") + + def poll(self) -> Optional[int]: + my_calls.append("poll") + assert ( + self.returncode is None + ), "returncode is unexpectedly set...test bug?" + self.returncode = poll_val + return self.returncode + + def wait(self, timeout: Optional[float] = None) -> Optional[int]: + my_calls.append("wait") + assert ( + self.returncode is None + ), "returncode is unexpectedly set...test bug?" 
+                if wait_timeout:
+                    raise subprocess.TimeoutExpired(
+                        cmd="mock_subprocess",
+                        timeout=timeout,
+                        output=b"I'm dead!",
+                        stderr=b"No, really, I'm dead!",
+                    )
+                self.returncode = 0
+                return self.returncode
+
+            def __repr__(self):
+                return self.__class__.__name__
+
+        class MockBufferedReader(io.BufferedReader):
+            def __init__(self, size: int, name: str):
+                # No effect, other than to quiet the linter
+                None if True else super().__init__(io.RawIOBase())
+                self.size = size
+                self.stream_name = name
+
+            def close(self) -> None:
+                my_calls.append("close")
+                pass
+
+            def read(self, size: int = -1) -> bytes:
+                my_calls.append(self.stream_name)
+                if self.size <= 0:
+                    return b""
+                if size < 0 or size >= self.size:
+                    self.size = 0
+                else:
+                    self.size -= size
+                return b"read"
+
+        my_stdout = (
+            None if stdout_size is None else MockBufferedReader(stdout_size, "stdout")
+        )
+        my_stderr = (
+            None if stderr_size is None else MockBufferedReader(stderr_size, "stderr")
+        )
+        my_stream = my_stdout if my_stdout is not None else my_stderr
+        assert my_stream, "Test bug: we need at least one of stdout and stderr"
+
+        # Invoke the CUT
+        stream = Inventory(my_stream, MockPopen(my_stdout, my_stderr))
+
+        # Test Inventory.__repr__()
+        assert str(stream) == f"<Stream {my_stream} from MockPopen>"
+
+        try:
+            stream.close()
+        except subprocess.TimeoutExpired:
+            assert wait_timeout, "wait() timed out unexpectedly"
+        else:
+            assert not wait_timeout, "wait() failed to time out as expected"
+
+        assert stream.subproc is None
+        assert my_calls == exp_calls
 
     def test_find(
         self, selinux_enabled, server_config, make_logger, tarball, monkeypatch
     ):
         """Create a dataset, check the cache manager state, and test that we
         can find it through the various supported methods.
         """
 
-        def mock_run(args, **kwargs):
-            """Prevents the Tarball contents from actually being unpacked"""
-            return subprocess.CompletedProcess(
-                args, returncode=0, stdout="Success!", stderr=None
-            )
-
         monkeypatch.setattr(Tarball, "_get_metadata", fake_get_metadata)
         source_tarball, source_md5, md5 = tarball
         dataset_name = Dataset.stem(source_tarball)
@@ -1096,7 +1374,7 @@ def mock_run(args, **kwargs):
         # Test __getitem__
         assert tarball == cm[md5]
         with pytest.raises(TarballNotFound) as exc:
-            cm["foobar"]
+            _ = cm["foobar"]
         assert str(exc.value) == "The dataset tarball named 'foobar' is not found"
 
         # Test __contains__
@@ -1150,12 +1428,6 @@ def test_lifecycle(
         delete it.
         """
 
-        def mock_run(args, **kwargs):
-            """Prevents the Tarball contents from actually being unpacked"""
-            return subprocess.CompletedProcess(
-                args, returncode=0, stdout="Success!", stderr=None
-            )
-
         source_tarball, source_md5, md5 = tarball
         cm = CacheManager(server_config, make_logger)
         archive = cm.archive_root / "ABC"
@@ -1184,9 +1456,9 @@ def mock_run(args, **kwargs):
         assert md5file.exists()
         assert md5 == md5file.read_text()
 
-        hash = hashlib.md5()
-        hash.update(tarfile.read_bytes())
-        assert md5 == hash.hexdigest()
+        md5_hash = hashlib.md5()
+        md5_hash.update(tarfile.read_bytes())
+        assert md5 == md5_hash.hexdigest()
 
         assert list(cm.controllers.keys()) == ["ABC"]
         dataset_name = source_tarball.name[:-7]