diff --git a/doc/code/orchestrators/use_huggingface_chat_target.ipynb b/doc/code/orchestrators/use_huggingface_chat_target.ipynb index 6f4d94b10..14d61e48a 100644 --- a/doc/code/orchestrators/use_huggingface_chat_target.ipynb +++ b/doc/code/orchestrators/use_huggingface_chat_target.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "d623d73a", + "id": "066bb566", "metadata": { "lines_to_next_cell": 2 }, @@ -17,7 +17,7 @@ " - This notebook supports the following **instruct models** that follow a structured chat template. These are examples, and more instruct models are available on Hugging Face:\n", " - `HuggingFaceTB/SmolLM-360M-Instruct`\n", " - `microsoft/Phi-3-mini-4k-instruct`\n", - " \n", + "\n", " - `...`\n", "\n", "2. **Excluded Models**:\n", @@ -37,63 +37,116 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "a0d61a68", - "metadata": {}, - "outputs": [], + "execution_count": 1, + "id": "940f8d8a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-11T22:41:35.643730Z", + "iopub.status.busy": "2024-11-11T22:41:35.643730Z", + "iopub.status.idle": "2024-11-11T22:43:23.863745Z", + "shell.execute_reply": "2024-11-11T22:43:23.862727Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running model: HuggingFaceTB/SmolLM-135M-Instruct\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average response time for HuggingFaceTB/SmolLM-135M-Instruct: 37.12 seconds\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[22m\u001b[39mConversation ID: 5223e15e-f21c-4d15-88af-8c02d6558182\n", + "\u001b[1m\u001b[34muser: What is 4*4? Give me the solution.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[22m\u001b[33massistant: What a great question!\n", + "\n", + "The number 4*4 is a special number because it can be expressed as a product of two numbers,\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[22m\u001b[39mConversation ID: b0238d3e-ce2e-48c3-a5e1-eaebf2c58e6f\n", + "\u001b[1m\u001b[34muser: What is 3*3? Give me the solution.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[22m\u001b[33massistant: What a great question!\n", + "\n", + "The number 3*3 is a fascinating number that has been a subject of fascination for mathematicians and computer scientists for\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HuggingFaceTB/SmolLM-135M-Instruct: 37.12 seconds\n" + ] + } + ], "source": [ "import time\n", - "from pyrit.prompt_target import HuggingFaceChatTarget \n", + "from pyrit.prompt_target import HuggingFaceChatTarget\n", "from pyrit.orchestrator import PromptSendingOrchestrator\n", "\n", "# models to test\n", - "model_id = \"HuggingFaceTB/SmolLM-135M-Instruct\" \n", + "model_id = \"HuggingFaceTB/SmolLM-135M-Instruct\"\n", "\n", "# List of prompts to send\n", - "prompt_list = [\n", - " \"What is 3*3? Give me the solution.\",\n", - " \"What is 4*4? Give me the solution.\"\n", - " ]\n", + "prompt_list = [\"What is 3*3? Give me the solution.\", \"What is 4*4? 
Give me the solution.\"]\n", "\n", "# Dictionary to store average response times\n", "model_times = {}\n", - " \n", + "\n", "print(f\"Running model: {model_id}\")\n", - " \n", + "\n", "try:\n", " # Initialize HuggingFaceChatTarget with the current model\n", - " target = HuggingFaceChatTarget(\n", - " model_id=model_id, \n", - " use_cuda=False, \n", - " tensor_format=\"pt\",\n", - " max_new_tokens=30 \n", - " )\n", - " \n", + " target = HuggingFaceChatTarget(model_id=model_id, use_cuda=False, tensor_format=\"pt\", max_new_tokens=30)\n", + "\n", " # Initialize the orchestrator\n", - " orchestrator = PromptSendingOrchestrator(\n", - " prompt_target=target,\n", - " verbose=False\n", - " )\n", - " \n", + " orchestrator = PromptSendingOrchestrator(prompt_target=target, verbose=False)\n", + "\n", " # Record start time\n", " start_time = time.time()\n", - " \n", + "\n", " # Send prompts asynchronously\n", - " responses = await orchestrator.send_prompts_async(prompt_list=prompt_list) # type: ignore\n", - " \n", + " responses = await orchestrator.send_prompts_async(prompt_list=prompt_list) # type: ignore\n", + "\n", " # Record end time\n", " end_time = time.time()\n", - " \n", + "\n", " # Calculate total and average response time\n", " total_time = end_time - start_time\n", " avg_time = total_time / len(prompt_list)\n", " model_times[model_id] = avg_time\n", - " \n", + "\n", " print(f\"Average response time for {model_id}: {avg_time:.2f} seconds\\n\")\n", - " \n", + "\n", " # Print the conversations\n", - " await orchestrator.print_conversations() # type: ignore\n", - " \n", + " await orchestrator.print_conversations() # type: ignore\n", + "\n", "except Exception as e:\n", " print(f\"An error occurred with model {model_id}: {e}\\n\")\n", " model_times[model_id] = None\n", @@ -108,14 +161,12 @@ ], "metadata": { "jupytext": { - "cell_metadata_filter": "-all", - "main_language": "python", - "notebook_metadata_filter": "-all" + "cell_metadata_filter": "-all" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "pyrit-dev", "language": "python", - "name": "python3" + "name": "pyrit-dev" }, "language_info": { "codemirror_mode": { diff --git a/pyrit/common/download_hf_model.py b/pyrit/common/download_hf_model.py new file mode 100644 index 000000000..bcc7ecc74 --- /dev/null +++ b/pyrit/common/download_hf_model.py @@ -0,0 +1,112 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import asyncio +import logging +import os +import httpx +from pathlib import Path + +from huggingface_hub import HfApi + + +logger = logging.getLogger(__name__) + + +def get_available_files(model_id: str, token: str): + """Fetches available files for a model from the Hugging Face repository.""" + api = HfApi() + try: + model_info = api.model_info(model_id, token=token) + available_files = [file.rfilename for file in model_info.siblings] + + # Perform simple validation: raise a ValueError if no files are available + if not len(available_files): + raise ValueError(f"No available files found for the model: {model_id}") + + return available_files + except Exception as e: + logger.info(f"Error fetching model files for {model_id}: {e}") + return [] + + +async def download_specific_files(model_id: str, file_patterns: list, token: str, cache_dir: Path): + """ + Downloads specific files from a Hugging Face model repository. + If file_patterns is None, downloads all files. + + Returns: + List of URLs for the downloaded files. 
+ """ + os.makedirs(cache_dir, exist_ok=True) + + available_files = get_available_files(model_id, token) + # If no file patterns are provided, download all available files + if file_patterns is None: + files_to_download = available_files + logger.info(f"Downloading all files for model {model_id}.") + else: + # Filter files based on the patterns provided + files_to_download = [file for file in available_files if any(pattern in file for pattern in file_patterns)] + if not files_to_download: + logger.info(f"No files matched the patterns provided for model {model_id}.") + return + + # Generate download URLs directly + base_url = f"https://huggingface.co/{model_id}/resolve/main/" + urls = [base_url + file for file in files_to_download] + + # Download the files + await download_files(urls, token, cache_dir) + + +async def download_chunk(url, headers, start, end, client): + """Download a chunk of the file with a specified byte range.""" + range_header = {"Range": f"bytes={start}-{end}", **headers} + response = await client.get(url, headers=range_header) + response.raise_for_status() + return response.content + + +async def download_file(url, token, download_dir, num_splits): + """Download a file in multiple segments (splits) using byte-range requests.""" + headers = {"Authorization": f"Bearer {token}"} + async with httpx.AsyncClient(follow_redirects=True) as client: + # Get the file size to determine chunk size + response = await client.head(url, headers=headers) + response.raise_for_status() + file_size = int(response.headers["Content-Length"]) + chunk_size = file_size // num_splits + + # Prepare tasks for each chunk + tasks = [] + file_name = url.split("/")[-1] + file_path = Path(download_dir, file_name) + + for i in range(num_splits): + start = i * chunk_size + end = start + chunk_size - 1 if i < num_splits - 1 else file_size - 1 + tasks.append(download_chunk(url, headers, start, end, client)) + + # Download all chunks concurrently + chunks = await asyncio.gather(*tasks) + + # Write chunks to the file in order + with open(file_path, "wb") as f: + for chunk in chunks: + f.write(chunk) + logger.info(f"Downloaded {file_name} to {file_path}") + + +async def download_files(urls: list[str], token: str, download_dir: Path, num_splits=3, parallel_downloads=4): + """Download multiple files with parallel downloads and segmented downloading.""" + + # Limit the number of parallel downloads + semaphore = asyncio.Semaphore(parallel_downloads) + + async def download_with_limit(url): + async with semaphore: + await download_file(url, token, download_dir, num_splits) + + # Run downloads concurrently, but limit to parallel_downloads at a time + await asyncio.gather(*(download_with_limit(url) for url in urls)) diff --git a/pyrit/common/download_hf_model_with_aria2.py b/pyrit/common/download_hf_model_with_aria2.py deleted file mode 100644 index b5c7cb35f..000000000 --- a/pyrit/common/download_hf_model_with_aria2.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import logging -import os -from pathlib import Path -import subprocess -from typing import Optional - -from huggingface_hub import HfApi - - -logger = logging.getLogger(__name__) - - -def get_available_files(model_id: str, token: str): - """Fetches available files for a model from the Hugging Face repository.""" - api = HfApi() - try: - model_info = api.model_info(model_id, token=token) - available_files = [file.rfilename for file in model_info.siblings] - - # Perform simple validation: raise a ValueError if no files are available - if not len(available_files): - raise ValueError(f"No available files found for the model: {model_id}") - - return available_files - except Exception as e: - logger.info(f"Error fetching model files for {model_id}: {e}") - return [] - - -def download_files_with_aria2(urls: list, token: str, download_dir: Optional[Path] = None): - """Uses aria2 to download files from the given list of URLs.""" - - # Convert download_dir to string if it's a Path object - download_dir_str = str(download_dir) if isinstance(download_dir, Path) else download_dir - - aria2_command = [ - "aria2c", - "-d", - download_dir_str, - "-x", - "3", # Number of connections per server for each download. - "-s", - "5", # Number of splits for each file. - "-j", - "4", # Maximum number of parallel downloads. - "--continue=true", - "--enable-http-pipelining=true", - f"--header=Authorization: Bearer {token}", - "-i", - "-", # Use '-' to read input from stdin - ] - - try: - # Run aria2c with input from stdin - process = subprocess.Popen(aria2_command, stdin=subprocess.PIPE, text=True) - process.communicate("\n".join(urls)) # Pass URLs directly to stdin - if process.returncode == 0: - logger.info(f"\nFiles downloaded successfully to {download_dir}.") - else: - logger.info(f"Error downloading files with aria2, return code: {process.returncode}.") - raise subprocess.CalledProcessError(process.returncode, aria2_command) - except subprocess.CalledProcessError as e: - logger.info(f"Error downloading files with aria2: {e}") - raise - - -def download_specific_files_with_aria2(model_id: str, file_patterns: list, token: str, cache_dir: Optional[Path]): - """ - Downloads specific files from a Hugging Face model repository using aria2. - If file_patterns is None, downloads all files. - """ - os.makedirs(cache_dir, exist_ok=True) - - available_files = get_available_files(model_id, token) - # If no file patterns are provided, download all available files - if file_patterns is None: - files_to_download = available_files - logger.info(f"Downloading all files for model {model_id}.") - else: - # Filter files based on the patterns provided - files_to_download = [file for file in available_files if any(pattern in file for pattern in file_patterns)] - if not files_to_download: - logger.info(f"No files matched the patterns provided for model {model_id}.") - return - - # Generate download URLs directly - base_url = f"https://huggingface.co/{model_id}/resolve/main/" - urls = [base_url + file for file in files_to_download] - - # Use aria2c to download the files - download_files_with_aria2(urls, token, cache_dir) diff --git a/pyrit/prompt_target/hugging_face/hugging_face_chat_target.py b/pyrit/prompt_target/hugging_face/hugging_face_chat_target.py index e4a0af318..17fde4e58 100644 --- a/pyrit/prompt_target/hugging_face/hugging_face_chat_target.py +++ b/pyrit/prompt_target/hugging_face/hugging_face_chat_target.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+import asyncio import json import logging import os @@ -10,7 +11,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig from pyrit.prompt_target import PromptChatTarget -from pyrit.common.download_hf_model_with_aria2 import download_specific_files_with_aria2 +from pyrit.common.download_hf_model import download_specific_files from pyrit.models.prompt_request_response import PromptRequestResponse, construct_response_from_request from pyrit.exceptions import EmptyResponseException, pyrit_target_retry from pyrit.common import default_values @@ -30,7 +31,7 @@ class HuggingFaceChatTarget(PromptChatTarget): _cached_model_id = None # Class-level flag to enable or disable cache - _cache_enabled = False + _cache_enabled = True # Define the environment variable name for the Hugging Face token HUGGINGFACE_TOKEN_ENVIRONMENT_VARIABLE = "HUGGINGFACE_TOKEN" @@ -48,6 +49,8 @@ def __init__( top_p: float = 1.0, skip_special_tokens: bool = True, ) -> None: + super().__init__() + self.model_id = model_id self.use_cuda = use_cuda self.tensor_format = tensor_format @@ -73,8 +76,7 @@ def __init__( if self.use_cuda and not torch.cuda.is_available(): raise RuntimeError("CUDA requested but not available.") - # Load the model and tokenizer using the encapsulated method - self.load_model_and_tokenizer() + self.load_model_and_tokenizer_task = asyncio.create_task(self.load_model_and_tokenizer()) def is_model_id_valid(self) -> bool: """ @@ -89,7 +91,7 @@ def is_model_id_valid(self) -> bool: logger.error(f"Invalid HuggingFace model ID {self.model_id}: {e}") return False - def load_model_and_tokenizer(self): + async def load_model_and_tokenizer(self): """Loads the model and tokenizer, downloading if necessary. Downloads the model to the HF_MODELS_DIR folder if it does not exist, @@ -113,14 +115,12 @@ def load_model_and_tokenizer(self): if self.necessary_files is None: # Download all files if no specific files are provided - logger.info(f"Downloading all files for {self.model_id} using aria2...") - download_specific_files_with_aria2(self.model_id, None, self.huggingface_token, cache_dir) + logger.info(f"Downloading all files for {self.model_id}...") + await download_specific_files(self.model_id, None, self.huggingface_token, cache_dir) else: # Download only the necessary files - logger.info(f"Downloading specific files for {self.model_id} using aria2...") - download_specific_files_with_aria2( - self.model_id, self.necessary_files, self.huggingface_token, cache_dir - ) + logger.info(f"Downloading specific files for {self.model_id}...") + await download_specific_files(self.model_id, self.necessary_files, self.huggingface_token, cache_dir) # Load the tokenizer and model from the specified directory logger.info(f"Loading model {self.model_id} from cache path: {cache_dir}...") @@ -151,6 +151,9 @@ async def send_prompt_async(self, *, prompt_request: PromptRequestResponse) -> P """ Sends a normalized prompt asynchronously to the HuggingFace model. 
""" + # Load the model and tokenizer using the encapsulated method + await self.load_model_and_tokenizer_task + self._validate_request(prompt_request=prompt_request) request = prompt_request.request_pieces[0] prompt_template = request.converted_value @@ -162,20 +165,23 @@ async def send_prompt_async(self, *, prompt_request: PromptRequestResponse) -> P # Apply chat template via the _apply_chat_template method tokenized_chat = self._apply_chat_template(messages) + input_ids = tokenized_chat["input_ids"] + attention_mask = tokenized_chat["attention_mask"] - logger.info(f"Tokenized chat: {tokenized_chat}") + logger.info(f"Tokenized chat: {input_ids}") try: # Ensure model is on the correct device (should already be the case from `load_model_and_tokenizer`) self.model.to(self.device) # Record the length of the input tokens to later extract only the generated tokens - input_length = tokenized_chat.shape[-1] + input_length = input_ids.shape[-1] # Generate the response logger.info("Generating response from model...") generated_ids = self.model.generate( - input_ids=tokenized_chat, + input_ids=input_ids, + attention_mask=attention_mask, max_new_tokens=self.max_new_tokens, temperature=self.temperature, top_p=self.top_p, @@ -216,7 +222,11 @@ def _apply_chat_template(self, messages): # Apply the chat template to format and tokenize the messages tokenized_chat = self.tokenizer.apply_chat_template( - messages, tokenize=True, add_generation_prompt=True, return_tensors=self.tensor_format + messages, + tokenize=True, + add_generation_prompt=True, + return_tensors=self.tensor_format, + return_dict=True, ).to(self.device) return tokenized_chat else: diff --git a/tests/test_hf_model_downloads.py b/tests/test_hf_model_downloads.py index 4ee2b9c18..b18cf7f81 100644 --- a/tests/test_hf_model_downloads.py +++ b/tests/test_hf_model_downloads.py @@ -2,11 +2,12 @@ # Licensed under the MIT license. import os +from pathlib import Path import pytest from unittest.mock import patch # Import functions to test from local application files -from pyrit.common.download_hf_model_with_aria2 import download_specific_files_with_aria2 +from pyrit.common.download_hf_model import download_specific_files # Define constants for testing @@ -31,8 +32,10 @@ def setup_environment(): yield token -def test_download_specific_files_with_aria2(setup_environment): - """Test downloading specific files using aria2.""" +def test_download_specific_files(setup_environment): + """Test downloading specific files""" token = setup_environment # Get the token from the fixture - with pytest.raises(Exception): - download_specific_files_with_aria2(MODEL_ID, FILE_PATTERNS, token) + + with patch("os.makedirs"): + with patch("pyrit.common.download_hf_model.download_files"): + download_specific_files(MODEL_ID, FILE_PATTERNS, token, Path("")) diff --git a/tests/test_huggingface_chat_target.py b/tests/test_huggingface_chat_target.py index 87a80862a..ece9de2a5 100644 --- a/tests/test_huggingface_chat_target.py +++ b/tests/test_huggingface_chat_target.py @@ -1,8 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+from asyncio import Task import pytest -from unittest.mock import patch, MagicMock +from unittest.mock import patch, MagicMock, AsyncMock from pyrit.prompt_target import HuggingFaceChatTarget from pyrit.models.prompt_request_response import PromptRequestResponse, PromptRequestPiece @@ -22,12 +23,10 @@ def mock_get_required_value(request): yield -# Fixture to mock download_specific_files_with_aria2 globally for all tests +# Fixture to mock download_specific_files globally for all tests @pytest.fixture(autouse=True) -def mock_download_specific_files_with_aria2(): - with patch( - "pyrit.common.download_hf_model_with_aria2.download_specific_files_with_aria2", return_value=None - ) as mock_download: +def mock_download_specific_files(): + with patch("pyrit.common.download_hf_model.download_specific_files", return_value=None) as mock_download: yield mock_download @@ -73,6 +72,17 @@ def mock_pretrained_config(): yield +class AwaitableMock(AsyncMock): + def __await__(self): + return iter([]) + + +@pytest.fixture(autouse=True) +def mock_create_task(): + with patch("asyncio.create_task", return_value=AwaitableMock(spec=Task)): + yield + + def test_init_with_no_token_var_raises(monkeypatch): # Ensure the environment variable is unset monkeypatch.delenv("HUGGINGFACE_TOKEN", raising=False) @@ -83,12 +93,15 @@ def test_init_with_no_token_var_raises(monkeypatch): assert "Environment variable HUGGINGFACE_TOKEN is required" in str(excinfo.value) -def test_initialization(): +@pytest.mark.asyncio +async def test_initialization(): # Test the initialization without loading the actual models hf_chat = HuggingFaceChatTarget(model_id="test_model", use_cuda=False) assert hf_chat.model_id == "test_model" assert not hf_chat.use_cuda assert hf_chat.device == "cpu" + + await hf_chat.load_model_and_tokenizer() assert hf_chat.model is not None assert hf_chat.tokenizer is not None @@ -106,8 +119,10 @@ def test_is_model_id_valid_false(): assert not hf_chat.is_model_id_valid() -def test_load_model_and_tokenizer(): +@pytest.mark.asyncio +async def test_load_model_and_tokenizer(): hf_chat = HuggingFaceChatTarget(model_id="test_model", use_cuda=False) + await hf_chat.load_model_and_tokenizer() assert hf_chat.model is not None assert hf_chat.tokenizer is not None @@ -115,6 +130,7 @@ def test_load_model_and_tokenizer(): @pytest.mark.asyncio async def test_send_prompt_async(): hf_chat = HuggingFaceChatTarget(model_id="test_model", use_cuda=False) + await hf_chat.load_model_and_tokenizer() request_piece = PromptRequestPiece( role="user", @@ -134,6 +150,7 @@ async def test_send_prompt_async(): @pytest.mark.asyncio async def test_missing_chat_template_error(): hf_chat = HuggingFaceChatTarget(model_id="test_model", use_cuda=False) + await hf_chat.load_model_and_tokenizer() hf_chat.tokenizer.chat_template = None request_piece = PromptRequestPiece( @@ -169,8 +186,11 @@ def test_invalid_prompt_request_validation(): assert "This target only supports a single prompt request piece." in str(excinfo.value) -def test_load_with_missing_files(): +@pytest.mark.asyncio +async def test_load_with_missing_files(): hf_chat = HuggingFaceChatTarget(model_id="test_model", use_cuda=False, necessary_files=["file1", "file2"]) + await hf_chat.load_model_and_tokenizer() + assert hf_chat.model is not None assert hf_chat.tokenizer is not None
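
The sketch below is not part of the diff; it illustrates how the new awaitable download API introduced in pyrit/common/download_hf_model.py might be driven directly. The model ID and the HUGGINGFACE_TOKEN environment variable come from the patch itself; the local cache path is an assumption for illustration, since inside PyRIT the cache directory is resolved by HuggingFaceChatTarget.

import asyncio
import os
from pathlib import Path

from pyrit.common.download_hf_model import download_specific_files


async def main():
    token = os.environ["HUGGINGFACE_TOKEN"]
    # Illustrative path only; not taken from the PR.
    cache_dir = Path("hf_models/SmolLM-135M-Instruct")

    # file_patterns=None downloads every file in the repository; a list such as
    # ["config.json", "model.safetensors"] restricts the download to files whose
    # names contain one of the given patterns.
    await download_specific_files(
        "HuggingFaceTB/SmolLM-135M-Instruct",
        None,
        token,
        cache_dir,
    )


asyncio.run(main())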
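A second sketch, also an illustration rather than part of the diff, exercises the deferred model loading added to HuggingFaceChatTarget. Because __init__ now schedules load_model_and_tokenizer() with asyncio.create_task, the target must be constructed while an event loop is running, and the first send_prompt_async call awaits that task before generating; the usage mirrors the notebook cell above.

import asyncio

from pyrit.orchestrator import PromptSendingOrchestrator
from pyrit.prompt_target import HuggingFaceChatTarget


async def main():
    # Constructing the target schedules load_model_and_tokenizer() as a background
    # task; send_prompt_async (invoked by the orchestrator) awaits it before generating.
    target = HuggingFaceChatTarget(
        model_id="HuggingFaceTB/SmolLM-135M-Instruct",
        use_cuda=False,
        tensor_format="pt",
        max_new_tokens=30,
    )
    orchestrator = PromptSendingOrchestrator(prompt_target=target, verbose=False)

    await orchestrator.send_prompts_async(prompt_list=["What is 3*3? Give me the solution."])
    await orchestrator.print_conversations()


asyncio.run(main())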