diff --git a/doc/code/orchestrators/use_huggingface_chat_target.ipynb b/doc/code/orchestrators/use_huggingface_chat_target.ipynb index 6f4d94b10..14d61e48a 100644 --- a/doc/code/orchestrators/use_huggingface_chat_target.ipynb +++ b/doc/code/orchestrators/use_huggingface_chat_target.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "d623d73a", + "id": "066bb566", "metadata": { "lines_to_next_cell": 2 }, @@ -17,7 +17,7 @@ " - This notebook supports the following **instruct models** that follow a structured chat template. These are examples, and more instruct models are available on Hugging Face:\n", " - `HuggingFaceTB/SmolLM-360M-Instruct`\n", " - `microsoft/Phi-3-mini-4k-instruct`\n", - " \n", + "\n", " - `...`\n", "\n", "2. **Excluded Models**:\n", @@ -37,63 +37,116 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "a0d61a68", - "metadata": {}, - "outputs": [], + "execution_count": 1, + "id": "940f8d8a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-11T22:41:35.643730Z", + "iopub.status.busy": "2024-11-11T22:41:35.643730Z", + "iopub.status.idle": "2024-11-11T22:43:23.863745Z", + "shell.execute_reply": "2024-11-11T22:43:23.862727Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running model: HuggingFaceTB/SmolLM-135M-Instruct\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average response time for HuggingFaceTB/SmolLM-135M-Instruct: 37.12 seconds\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[22m\u001b[39mConversation ID: 5223e15e-f21c-4d15-88af-8c02d6558182\n", + "\u001b[1m\u001b[34muser: What is 4*4? Give me the solution.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[22m\u001b[33massistant: What a great question!\n", + "\n", + "The number 4*4 is a special number because it can be expressed as a product of two numbers,\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[22m\u001b[39mConversation ID: b0238d3e-ce2e-48c3-a5e1-eaebf2c58e6f\n", + "\u001b[1m\u001b[34muser: What is 3*3? Give me the solution.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[22m\u001b[33massistant: What a great question!\n", + "\n", + "The number 3*3 is a fascinating number that has been a subject of fascination for mathematicians and computer scientists for\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HuggingFaceTB/SmolLM-135M-Instruct: 37.12 seconds\n" + ] + } + ], "source": [ "import time\n", - "from pyrit.prompt_target import HuggingFaceChatTarget \n", + "from pyrit.prompt_target import HuggingFaceChatTarget\n", "from pyrit.orchestrator import PromptSendingOrchestrator\n", "\n", "# models to test\n", - "model_id = \"HuggingFaceTB/SmolLM-135M-Instruct\" \n", + "model_id = \"HuggingFaceTB/SmolLM-135M-Instruct\"\n", "\n", "# List of prompts to send\n", - "prompt_list = [\n", - " \"What is 3*3? Give me the solution.\",\n", - " \"What is 4*4? Give me the solution.\"\n", - " ]\n", + "prompt_list = [\"What is 3*3? Give me the solution.\", \"What is 4*4? 
Give me the solution.\"]\n", "\n", "# Dictionary to store average response times\n", "model_times = {}\n", - " \n", + "\n", "print(f\"Running model: {model_id}\")\n", - " \n", + "\n", "try:\n", " # Initialize HuggingFaceChatTarget with the current model\n", - " target = HuggingFaceChatTarget(\n", - " model_id=model_id, \n", - " use_cuda=False, \n", - " tensor_format=\"pt\",\n", - " max_new_tokens=30 \n", - " )\n", - " \n", + " target = HuggingFaceChatTarget(model_id=model_id, use_cuda=False, tensor_format=\"pt\", max_new_tokens=30)\n", + "\n", " # Initialize the orchestrator\n", - " orchestrator = PromptSendingOrchestrator(\n", - " prompt_target=target,\n", - " verbose=False\n", - " )\n", - " \n", + " orchestrator = PromptSendingOrchestrator(prompt_target=target, verbose=False)\n", + "\n", " # Record start time\n", " start_time = time.time()\n", - " \n", + "\n", " # Send prompts asynchronously\n", - " responses = await orchestrator.send_prompts_async(prompt_list=prompt_list) # type: ignore\n", - " \n", + " responses = await orchestrator.send_prompts_async(prompt_list=prompt_list) # type: ignore\n", + "\n", " # Record end time\n", " end_time = time.time()\n", - " \n", + "\n", " # Calculate total and average response time\n", " total_time = end_time - start_time\n", " avg_time = total_time / len(prompt_list)\n", " model_times[model_id] = avg_time\n", - " \n", + "\n", " print(f\"Average response time for {model_id}: {avg_time:.2f} seconds\\n\")\n", - " \n", + "\n", " # Print the conversations\n", - " await orchestrator.print_conversations() # type: ignore\n", - " \n", + " await orchestrator.print_conversations() # type: ignore\n", + "\n", "except Exception as e:\n", " print(f\"An error occurred with model {model_id}: {e}\\n\")\n", " model_times[model_id] = None\n", @@ -108,14 +161,12 @@ ], "metadata": { "jupytext": { - "cell_metadata_filter": "-all", - "main_language": "python", - "notebook_metadata_filter": "-all" + "cell_metadata_filter": "-all" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "pyrit-dev", "language": "python", - "name": "python3" + "name": "pyrit-dev" }, "language_info": { "codemirror_mode": { diff --git a/pyrit/common/download_hf_model.py b/pyrit/common/download_hf_model.py new file mode 100644 index 000000000..bcc7ecc74 --- /dev/null +++ b/pyrit/common/download_hf_model.py @@ -0,0 +1,112 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import asyncio +import logging +import os +import httpx +from pathlib import Path + +from huggingface_hub import HfApi + + +logger = logging.getLogger(__name__) + + +def get_available_files(model_id: str, token: str): + """Fetches available files for a model from the Hugging Face repository.""" + api = HfApi() + try: + model_info = api.model_info(model_id, token=token) + available_files = [file.rfilename for file in model_info.siblings] + + # Perform simple validation: raise a ValueError if no files are available + if not len(available_files): + raise ValueError(f"No available files found for the model: {model_id}") + + return available_files + except Exception as e: + logger.info(f"Error fetching model files for {model_id}: {e}") + return [] + + +async def download_specific_files(model_id: str, file_patterns: list, token: str, cache_dir: Path): + """ + Downloads specific files from a Hugging Face model repository. + If file_patterns is None, downloads all files. + + Returns: + List of URLs for the downloaded files. 
+ """ + os.makedirs(cache_dir, exist_ok=True) + + available_files = get_available_files(model_id, token) + # If no file patterns are provided, download all available files + if file_patterns is None: + files_to_download = available_files + logger.info(f"Downloading all files for model {model_id}.") + else: + # Filter files based on the patterns provided + files_to_download = [file for file in available_files if any(pattern in file for pattern in file_patterns)] + if not files_to_download: + logger.info(f"No files matched the patterns provided for model {model_id}.") + return + + # Generate download URLs directly + base_url = f"https://huggingface.co/{model_id}/resolve/main/" + urls = [base_url + file for file in files_to_download] + + # Download the files + await download_files(urls, token, cache_dir) + + +async def download_chunk(url, headers, start, end, client): + """Download a chunk of the file with a specified byte range.""" + range_header = {"Range": f"bytes={start}-{end}", **headers} + response = await client.get(url, headers=range_header) + response.raise_for_status() + return response.content + + +async def download_file(url, token, download_dir, num_splits): + """Download a file in multiple segments (splits) using byte-range requests.""" + headers = {"Authorization": f"Bearer {token}"} + async with httpx.AsyncClient(follow_redirects=True) as client: + # Get the file size to determine chunk size + response = await client.head(url, headers=headers) + response.raise_for_status() + file_size = int(response.headers["Content-Length"]) + chunk_size = file_size // num_splits + + # Prepare tasks for each chunk + tasks = [] + file_name = url.split("/")[-1] + file_path = Path(download_dir, file_name) + + for i in range(num_splits): + start = i * chunk_size + end = start + chunk_size - 1 if i < num_splits - 1 else file_size - 1 + tasks.append(download_chunk(url, headers, start, end, client)) + + # Download all chunks concurrently + chunks = await asyncio.gather(*tasks) + + # Write chunks to the file in order + with open(file_path, "wb") as f: + for chunk in chunks: + f.write(chunk) + logger.info(f"Downloaded {file_name} to {file_path}") + + +async def download_files(urls: list[str], token: str, download_dir: Path, num_splits=3, parallel_downloads=4): + """Download multiple files with parallel downloads and segmented downloading.""" + + # Limit the number of parallel downloads + semaphore = asyncio.Semaphore(parallel_downloads) + + async def download_with_limit(url): + async with semaphore: + await download_file(url, token, download_dir, num_splits) + + # Run downloads concurrently, but limit to parallel_downloads at a time + await asyncio.gather(*(download_with_limit(url) for url in urls)) diff --git a/pyrit/common/download_hf_model_with_aria2.py b/pyrit/common/download_hf_model_with_aria2.py deleted file mode 100644 index b5c7cb35f..000000000 --- a/pyrit/common/download_hf_model_with_aria2.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import logging -import os -from pathlib import Path -import subprocess -from typing import Optional - -from huggingface_hub import HfApi - - -logger = logging.getLogger(__name__) - - -def get_available_files(model_id: str, token: str): - """Fetches available files for a model from the Hugging Face repository.""" - api = HfApi() - try: - model_info = api.model_info(model_id, token=token) - available_files = [file.rfilename for file in model_info.siblings] - - # Perform simple validation: raise a ValueError if no files are available - if not len(available_files): - raise ValueError(f"No available files found for the model: {model_id}") - - return available_files - except Exception as e: - logger.info(f"Error fetching model files for {model_id}: {e}") - return [] - - -def download_files_with_aria2(urls: list, token: str, download_dir: Optional[Path] = None): - """Uses aria2 to download files from the given list of URLs.""" - - # Convert download_dir to string if it's a Path object - download_dir_str = str(download_dir) if isinstance(download_dir, Path) else download_dir - - aria2_command = [ - "aria2c", - "-d", - download_dir_str, - "-x", - "3", # Number of connections per server for each download. - "-s", - "5", # Number of splits for each file. - "-j", - "4", # Maximum number of parallel downloads. - "--continue=true", - "--enable-http-pipelining=true", - f"--header=Authorization: Bearer {token}", - "-i", - "-", # Use '-' to read input from stdin - ] - - try: - # Run aria2c with input from stdin - process = subprocess.Popen(aria2_command, stdin=subprocess.PIPE, text=True) - process.communicate("\n".join(urls)) # Pass URLs directly to stdin - if process.returncode == 0: - logger.info(f"\nFiles downloaded successfully to {download_dir}.") - else: - logger.info(f"Error downloading files with aria2, return code: {process.returncode}.") - raise subprocess.CalledProcessError(process.returncode, aria2_command) - except subprocess.CalledProcessError as e: - logger.info(f"Error downloading files with aria2: {e}") - raise - - -def download_specific_files_with_aria2(model_id: str, file_patterns: list, token: str, cache_dir: Optional[Path]): - """ - Downloads specific files from a Hugging Face model repository using aria2. - If file_patterns is None, downloads all files. - """ - os.makedirs(cache_dir, exist_ok=True) - - available_files = get_available_files(model_id, token) - # If no file patterns are provided, download all available files - if file_patterns is None: - files_to_download = available_files - logger.info(f"Downloading all files for model {model_id}.") - else: - # Filter files based on the patterns provided - files_to_download = [file for file in available_files if any(pattern in file for pattern in file_patterns)] - if not files_to_download: - logger.info(f"No files matched the patterns provided for model {model_id}.") - return - - # Generate download URLs directly - base_url = f"https://huggingface.co/{model_id}/resolve/main/" - urls = [base_url + file for file in files_to_download] - - # Use aria2c to download the files - download_files_with_aria2(urls, token, cache_dir) diff --git a/pyrit/prompt_target/hugging_face/hugging_face_chat_target.py b/pyrit/prompt_target/hugging_face/hugging_face_chat_target.py index e4a0af318..17fde4e58 100644 --- a/pyrit/prompt_target/hugging_face/hugging_face_chat_target.py +++ b/pyrit/prompt_target/hugging_face/hugging_face_chat_target.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+import asyncio import json import logging import os @@ -10,7 +11,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig from pyrit.prompt_target import PromptChatTarget -from pyrit.common.download_hf_model_with_aria2 import download_specific_files_with_aria2 +from pyrit.common.download_hf_model import download_specific_files from pyrit.models.prompt_request_response import PromptRequestResponse, construct_response_from_request from pyrit.exceptions import EmptyResponseException, pyrit_target_retry from pyrit.common import default_values @@ -30,7 +31,7 @@ class HuggingFaceChatTarget(PromptChatTarget): _cached_model_id = None # Class-level flag to enable or disable cache - _cache_enabled = False + _cache_enabled = True # Define the environment variable name for the Hugging Face token HUGGINGFACE_TOKEN_ENVIRONMENT_VARIABLE = "HUGGINGFACE_TOKEN" @@ -48,6 +49,8 @@ def __init__( top_p: float = 1.0, skip_special_tokens: bool = True, ) -> None: + super().__init__() + self.model_id = model_id self.use_cuda = use_cuda self.tensor_format = tensor_format @@ -73,8 +76,7 @@ def __init__( if self.use_cuda and not torch.cuda.is_available(): raise RuntimeError("CUDA requested but not available.") - # Load the model and tokenizer using the encapsulated method - self.load_model_and_tokenizer() + self.load_model_and_tokenizer_task = asyncio.create_task(self.load_model_and_tokenizer()) def is_model_id_valid(self) -> bool: """ @@ -89,7 +91,7 @@ def is_model_id_valid(self) -> bool: logger.error(f"Invalid HuggingFace model ID {self.model_id}: {e}") return False - def load_model_and_tokenizer(self): + async def load_model_and_tokenizer(self): """Loads the model and tokenizer, downloading if necessary. Downloads the model to the HF_MODELS_DIR folder if it does not exist, @@ -113,14 +115,12 @@ def load_model_and_tokenizer(self): if self.necessary_files is None: # Download all files if no specific files are provided - logger.info(f"Downloading all files for {self.model_id} using aria2...") - download_specific_files_with_aria2(self.model_id, None, self.huggingface_token, cache_dir) + logger.info(f"Downloading all files for {self.model_id}...") + await download_specific_files(self.model_id, None, self.huggingface_token, cache_dir) else: # Download only the necessary files - logger.info(f"Downloading specific files for {self.model_id} using aria2...") - download_specific_files_with_aria2( - self.model_id, self.necessary_files, self.huggingface_token, cache_dir - ) + logger.info(f"Downloading specific files for {self.model_id}...") + await download_specific_files(self.model_id, self.necessary_files, self.huggingface_token, cache_dir) # Load the tokenizer and model from the specified directory logger.info(f"Loading model {self.model_id} from cache path: {cache_dir}...") @@ -151,6 +151,9 @@ async def send_prompt_async(self, *, prompt_request: PromptRequestResponse) -> P """ Sends a normalized prompt asynchronously to the HuggingFace model. 
""" + # Load the model and tokenizer using the encapsulated method + await self.load_model_and_tokenizer_task + self._validate_request(prompt_request=prompt_request) request = prompt_request.request_pieces[0] prompt_template = request.converted_value @@ -162,20 +165,23 @@ async def send_prompt_async(self, *, prompt_request: PromptRequestResponse) -> P # Apply chat template via the _apply_chat_template method tokenized_chat = self._apply_chat_template(messages) + input_ids = tokenized_chat["input_ids"] + attention_mask = tokenized_chat["attention_mask"] - logger.info(f"Tokenized chat: {tokenized_chat}") + logger.info(f"Tokenized chat: {input_ids}") try: # Ensure model is on the correct device (should already be the case from `load_model_and_tokenizer`) self.model.to(self.device) # Record the length of the input tokens to later extract only the generated tokens - input_length = tokenized_chat.shape[-1] + input_length = input_ids.shape[-1] # Generate the response logger.info("Generating response from model...") generated_ids = self.model.generate( - input_ids=tokenized_chat, + input_ids=input_ids, + attention_mask=attention_mask, max_new_tokens=self.max_new_tokens, temperature=self.temperature, top_p=self.top_p, @@ -216,7 +222,11 @@ def _apply_chat_template(self, messages): # Apply the chat template to format and tokenize the messages tokenized_chat = self.tokenizer.apply_chat_template( - messages, tokenize=True, add_generation_prompt=True, return_tensors=self.tensor_format + messages, + tokenize=True, + add_generation_prompt=True, + return_tensors=self.tensor_format, + return_dict=True, ).to(self.device) return tokenized_chat else: diff --git a/tests/test_hf_model_downloads.py b/tests/test_hf_model_downloads.py index 4ee2b9c18..b18cf7f81 100644 --- a/tests/test_hf_model_downloads.py +++ b/tests/test_hf_model_downloads.py @@ -2,11 +2,12 @@ # Licensed under the MIT license. import os +from pathlib import Path import pytest from unittest.mock import patch # Import functions to test from local application files -from pyrit.common.download_hf_model_with_aria2 import download_specific_files_with_aria2 +from pyrit.common.download_hf_model import download_specific_files # Define constants for testing @@ -31,8 +32,10 @@ def setup_environment(): yield token -def test_download_specific_files_with_aria2(setup_environment): - """Test downloading specific files using aria2.""" +def test_download_specific_files(setup_environment): + """Test downloading specific files""" token = setup_environment # Get the token from the fixture - with pytest.raises(Exception): - download_specific_files_with_aria2(MODEL_ID, FILE_PATTERNS, token) + + with patch("os.makedirs"): + with patch("pyrit.common.download_hf_model.download_files"): + download_specific_files(MODEL_ID, FILE_PATTERNS, token, Path("")) diff --git a/tests/test_huggingface_chat_target.py b/tests/test_huggingface_chat_target.py index 87a80862a..ece9de2a5 100644 --- a/tests/test_huggingface_chat_target.py +++ b/tests/test_huggingface_chat_target.py @@ -1,8 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+from asyncio import Task import pytest -from unittest.mock import patch, MagicMock +from unittest.mock import patch, MagicMock, AsyncMock from pyrit.prompt_target import HuggingFaceChatTarget from pyrit.models.prompt_request_response import PromptRequestResponse, PromptRequestPiece @@ -22,12 +23,10 @@ def mock_get_required_value(request): yield -# Fixture to mock download_specific_files_with_aria2 globally for all tests +# Fixture to mock download_specific_files globally for all tests @pytest.fixture(autouse=True) -def mock_download_specific_files_with_aria2(): - with patch( - "pyrit.common.download_hf_model_with_aria2.download_specific_files_with_aria2", return_value=None - ) as mock_download: +def mock_download_specific_files(): + with patch("pyrit.common.download_hf_model.download_specific_files", return_value=None) as mock_download: yield mock_download @@ -73,6 +72,17 @@ def mock_pretrained_config(): yield +class AwaitableMock(AsyncMock): + def __await__(self): + return iter([]) + + +@pytest.fixture(autouse=True) +def mock_create_task(): + with patch("asyncio.create_task", return_value=AwaitableMock(spec=Task)): + yield + + def test_init_with_no_token_var_raises(monkeypatch): # Ensure the environment variable is unset monkeypatch.delenv("HUGGINGFACE_TOKEN", raising=False) @@ -83,12 +93,15 @@ def test_init_with_no_token_var_raises(monkeypatch): assert "Environment variable HUGGINGFACE_TOKEN is required" in str(excinfo.value) -def test_initialization(): +@pytest.mark.asyncio +async def test_initialization(): # Test the initialization without loading the actual models hf_chat = HuggingFaceChatTarget(model_id="test_model", use_cuda=False) assert hf_chat.model_id == "test_model" assert not hf_chat.use_cuda assert hf_chat.device == "cpu" + + await hf_chat.load_model_and_tokenizer() assert hf_chat.model is not None assert hf_chat.tokenizer is not None @@ -106,8 +119,10 @@ def test_is_model_id_valid_false(): assert not hf_chat.is_model_id_valid() -def test_load_model_and_tokenizer(): +@pytest.mark.asyncio +async def test_load_model_and_tokenizer(): hf_chat = HuggingFaceChatTarget(model_id="test_model", use_cuda=False) + await hf_chat.load_model_and_tokenizer() assert hf_chat.model is not None assert hf_chat.tokenizer is not None @@ -115,6 +130,7 @@ def test_load_model_and_tokenizer(): @pytest.mark.asyncio async def test_send_prompt_async(): hf_chat = HuggingFaceChatTarget(model_id="test_model", use_cuda=False) + await hf_chat.load_model_and_tokenizer() request_piece = PromptRequestPiece( role="user", @@ -134,6 +150,7 @@ async def test_send_prompt_async(): @pytest.mark.asyncio async def test_missing_chat_template_error(): hf_chat = HuggingFaceChatTarget(model_id="test_model", use_cuda=False) + await hf_chat.load_model_and_tokenizer() hf_chat.tokenizer.chat_template = None request_piece = PromptRequestPiece( @@ -169,8 +186,11 @@ def test_invalid_prompt_request_validation(): assert "This target only supports a single prompt request piece." in str(excinfo.value) -def test_load_with_missing_files(): +@pytest.mark.asyncio +async def test_load_with_missing_files(): hf_chat = HuggingFaceChatTarget(model_id="test_model", use_cuda=False, necessary_files=["file1", "file2"]) + await hf_chat.load_model_and_tokenizer() + assert hf_chat.model is not None assert hf_chat.tokenizer is not None
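
The sketch below is not part of the diff; it illustrates how the new awaitable download API introduced in pyrit/common/download_hf_model.py might be driven directly. The model ID and the HUGGINGFACE_TOKEN environment variable come from the patch itself; the local cache path is an assumption for illustration, since inside PyRIT the cache directory is resolved by HuggingFaceChatTarget.

import asyncio
import os
from pathlib import Path

from pyrit.common.download_hf_model import download_specific_files


async def main():
    token = os.environ["HUGGINGFACE_TOKEN"]
    # Illustrative path only; not taken from the PR.
    cache_dir = Path("hf_models/SmolLM-135M-Instruct")

    # file_patterns=None downloads every file in the repository; a list such as
    # ["config.json", "model.safetensors"] restricts the download to files whose
    # names contain one of the given patterns.
    await download_specific_files(
        "HuggingFaceTB/SmolLM-135M-Instruct",
        None,
        token,
        cache_dir,
    )


asyncio.run(main())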
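A second sketch, also an illustration rather than part of the diff, exercises the deferred model loading added to HuggingFaceChatTarget. Because __init__ now schedules load_model_and_tokenizer() with asyncio.create_task, the target must be constructed while an event loop is running, and the first send_prompt_async call awaits that task before generating; the usage mirrors the notebook cell above.

import asyncio

from pyrit.orchestrator import PromptSendingOrchestrator
from pyrit.prompt_target import HuggingFaceChatTarget


async def main():
    # Constructing the target schedules load_model_and_tokenizer() as a background
    # task; send_prompt_async (invoked by the orchestrator) awaits it before generating.
    target = HuggingFaceChatTarget(
        model_id="HuggingFaceTB/SmolLM-135M-Instruct",
        use_cuda=False,
        tensor_format="pt",
        max_new_tokens=30,
    )
    orchestrator = PromptSendingOrchestrator(prompt_target=target, verbose=False)

    await orchestrator.send_prompts_async(prompt_list=["What is 3*3? Give me the solution."])
    await orchestrator.print_conversations()


asyncio.run(main())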