From 36396f13988d9255ab8397616ba37f573661eb3c Mon Sep 17 00:00:00 2001 From: Lucain Date: Tue, 16 Jul 2024 18:00:37 +0200 Subject: [PATCH] [InferenceClient] Add support for `adapter_id` (text-generation) and `response_format` (chat-completion) (#2383) * types * Add adapter_id arg to text_generation * Add adapter_id to text-generation and response_format to chat_completion * update example * add test * fix quality * remove dummy * lint * b * lint --- .../en/package_reference/inference_types.md | 10 ++- .../ko/package_reference/inference_types.md | 10 ++- src/huggingface_hub/__init__.py | 10 ++- src/huggingface_hub/inference/_client.py | 73 ++++++++++++++++-- src/huggingface_hub/inference/_common.py | 2 - .../inference/_generated/_async_client.py | 75 +++++++++++++++++-- .../inference/_generated/types/__init__.py | 5 +- .../_generated/types/chat_completion.py | 62 +++++++++------ .../_generated/types/text_generation.py | 29 +++++++ ..._chat_completion_with_response_format.yaml | 60 +++++++++++++++ tests/test_inference_async_client.py | 2 - tests/test_inference_client.py | 37 ++++++++- 12 files changed, 326 insertions(+), 49 deletions(-) create mode 100644 tests/cassettes/InferenceClientVCRTest.test_chat_completion_with_response_format.yaml diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md index c0079d5e1d..8dd826336c 100644 --- a/docs/source/en/package_reference/inference_types.md +++ b/docs/source/en/package_reference/inference_types.md @@ -55,14 +55,20 @@ This part of the lib is still under development and will be improved in future r [[autodoc]] huggingface_hub.ChatCompletionInputFunctionDefinition +[[autodoc]] huggingface_hub.ChatCompletionInputFunctionName + +[[autodoc]] huggingface_hub.ChatCompletionInputGrammarType + [[autodoc]] huggingface_hub.ChatCompletionInputMessage -[[autodoc]] huggingface_hub.ChatCompletionInputTool +[[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk -[[autodoc]] huggingface_hub.ChatCompletionInputToolCall +[[autodoc]] huggingface_hub.ChatCompletionInputTool [[autodoc]] huggingface_hub.ChatCompletionInputToolTypeClass +[[autodoc]] huggingface_hub.ChatCompletionInputURL + [[autodoc]] huggingface_hub.ChatCompletionOutput [[autodoc]] huggingface_hub.ChatCompletionOutputComplete diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md index 548249b581..27e96cb491 100644 --- a/docs/source/ko/package_reference/inference_types.md +++ b/docs/source/ko/package_reference/inference_types.md @@ -54,14 +54,20 @@ rendered properly in your Markdown viewer. 
[[autodoc]] huggingface_hub.ChatCompletionInputFunctionDefinition +[[autodoc]] huggingface_hub.ChatCompletionInputFunctionName + +[[autodoc]] huggingface_hub.ChatCompletionInputGrammarType + [[autodoc]] huggingface_hub.ChatCompletionInputMessage -[[autodoc]] huggingface_hub.ChatCompletionInputTool +[[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk -[[autodoc]] huggingface_hub.ChatCompletionInputToolCall +[[autodoc]] huggingface_hub.ChatCompletionInputTool [[autodoc]] huggingface_hub.ChatCompletionInputToolTypeClass +[[autodoc]] huggingface_hub.ChatCompletionInputURL + [[autodoc]] huggingface_hub.ChatCompletionOutput [[autodoc]] huggingface_hub.ChatCompletionOutputComplete diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index 131496eb69..1b1a345a6d 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -280,10 +280,13 @@ "AutomaticSpeechRecognitionParameters", "ChatCompletionInput", "ChatCompletionInputFunctionDefinition", + "ChatCompletionInputFunctionName", + "ChatCompletionInputGrammarType", "ChatCompletionInputMessage", + "ChatCompletionInputMessageChunk", "ChatCompletionInputTool", - "ChatCompletionInputToolCall", "ChatCompletionInputToolTypeClass", + "ChatCompletionInputURL", "ChatCompletionOutput", "ChatCompletionOutputComplete", "ChatCompletionOutputFunctionDefinition", @@ -775,10 +778,13 @@ def __dir__(): AutomaticSpeechRecognitionParameters, # noqa: F401 ChatCompletionInput, # noqa: F401 ChatCompletionInputFunctionDefinition, # noqa: F401 + ChatCompletionInputFunctionName, # noqa: F401 + ChatCompletionInputGrammarType, # noqa: F401 ChatCompletionInputMessage, # noqa: F401 + ChatCompletionInputMessageChunk, # noqa: F401 ChatCompletionInputTool, # noqa: F401 - ChatCompletionInputToolCall, # noqa: F401 ChatCompletionInputToolTypeClass, # noqa: F401 + ChatCompletionInputURL, # noqa: F401 ChatCompletionOutput, # noqa: F401 ChatCompletionOutputComplete, # noqa: F401 ChatCompletionOutputFunctionDefinition, # noqa: F401 diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py index 5f3cca1b1b..a23b7d7f0a 100644 --- a/src/huggingface_hub/inference/_client.py +++ b/src/huggingface_hub/inference/_client.py @@ -78,6 +78,7 @@ AudioClassificationOutputElement, AudioToAudioOutputElement, AutomaticSpeechRecognitionOutput, + ChatCompletionInputGrammarType, ChatCompletionInputTool, ChatCompletionInputToolTypeClass, ChatCompletionOutput, @@ -103,7 +104,6 @@ ZeroShotClassificationOutputElement, ZeroShotImageClassificationOutputElement, ) -from huggingface_hub.inference._generated.types.chat_completion import ChatCompletionInputToolTypeEnum from huggingface_hub.inference._types import ( ConversationalOutput, # soon to be removed ) @@ -465,10 +465,11 @@ def chat_completion( # type: ignore max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -488,10 +489,11 @@ def chat_completion( # type: ignore max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: 
Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -511,10 +513,11 @@ def chat_completion( max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -534,10 +537,11 @@ def chat_completion( max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -584,6 +588,8 @@ def chat_completion( presence_penalty (`float`, *optional*): Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + response_format ([`ChatCompletionInputGrammarType`], *optional*): + Grammar constraints. Can be either a JSONSchema or a regex. seed (Optional[`int`], *optional*): Seed for reproducible control flow. Defaults to None. stop (Optional[`str`], *optional*): @@ -601,7 +607,7 @@ def chat_completion( top_p (`float`, *optional*): Fraction of the most likely next words to sample from. Must be between 0 and 1. Defaults to 1.0. - tool_choice ([`ChatCompletionInputToolTypeClass`] or [`ChatCompletionInputToolTypeEnum`], *optional*): + tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*): The tool to use for the completion. Defaults to "auto". tool_prompt (`str`, *optional*): A prompt to be appended before the tools. @@ -624,7 +630,6 @@ def chat_completion( Example: ```py - # Chat example >>> from huggingface_hub import InferenceClient >>> messages = [{"role": "user", "content": "What is the capital of France?"}] >>> client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") @@ -654,7 +659,13 @@ def chat_completion( total_tokens=25 ) ) + ``` + Example (stream=True): + ```py + >>> from huggingface_hub import InferenceClient + >>> messages = [{"role": "user", "content": "What is the capital of France?"}] + >>> client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") >>> for token in client.chat_completion(messages, max_tokens=10, stream=True): ... 
print(token) ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content='The', role='assistant'), index=0, finish_reason=None)], created=1710498504) @@ -770,6 +781,37 @@ def chat_completion( description=None ) ``` + + Example using response_format: + ```py + >>> from huggingface_hub import InferenceClient + >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct") + >>> messages = [ + ... { + ... "role": "user", + ... "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I saw and when?", + ... }, + ... ] + >>> response_format = { + ... "type": "json", + ... "value": { + ... "properties": { + ... "location": {"type": "string"}, + ... "activity": {"type": "string"}, + ... "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5}, + ... "animals": {"type": "array", "items": {"type": "string"}}, + ... }, + ... "required": ["location", "activity", "animals_seen", "animals"], + ... }, + ... } + >>> response = client.chat_completion( + ... messages=messages, + ... response_format=response_format, + ... max_tokens=500, + ) + >>> response.choices[0].message.content + '{\n\n"activity": "bike ride",\n"animals": ["puppy", "cat", "raccoon"],\n"animals_seen": 3,\n"location": "park"}' + ``` """ # Determine model # `self.xxx` takes precedence over the method argument only in `chat_completion` @@ -804,6 +846,7 @@ def chat_completion( max_tokens=max_tokens, n=n, presence_penalty=presence_penalty, + response_format=response_format, seed=seed, stop=stop, temperature=temperature, @@ -855,6 +898,11 @@ def chat_completion( "Tools are not supported by the model. This is due to the model not been served by a " "Text-Generation-Inference server. The provided tool parameters will be ignored." ) + if response_format is not None: + warnings.warn( + "Response format is not supported by the model. This is due to the model not been served by a " + "Text-Generation-Inference server. The provided response format will be ignored." 
+ ) # generate response text_generation_output = self.text_generation( @@ -873,7 +921,6 @@ def chat_completion( return ChatCompletionOutput( id="dummy", model="dummy", - object="dummy", system_fingerprint="dummy", usage=None, # type: ignore # set to `None` as we don't want to provide false information created=int(time.time()), @@ -1742,6 +1789,7 @@ def text_generation( # type: ignore stream: Literal[False] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1770,6 +1818,7 @@ def text_generation( # type: ignore stream: Literal[False] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1798,6 +1847,7 @@ def text_generation( # type: ignore stream: Literal[True] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1826,6 +1876,7 @@ def text_generation( # type: ignore stream: Literal[True] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1854,6 +1905,7 @@ def text_generation( stream: bool = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1881,6 +1933,7 @@ def text_generation( stream: bool = False, model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1932,6 +1985,8 @@ def text_generation( model (`str`, *optional*): The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None. + adapter_id (`str`, *optional*): + Lora adapter id. best_of (`int`, *optional*): Generate best_of sequences and return the one if the highest token logprobs. 
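
The `adapter_id` value introduced here is passed through untouched in the request `parameters` payload (see the payload construction later in this method), so a Text-Generation-Inference server that has the matching LoRA adapter loaded can apply it at generation time. A minimal usage sketch, assuming a TGI endpoint with an adapter available under the id shown (the endpoint URL, adapter id and prompt are all illustrative):

```py
from huggingface_hub import InferenceClient

# Hypothetical TGI endpoint started with a LoRA adapter preloaded on the server side.
client = InferenceClient("http://localhost:8080")

# `adapter_id` is forwarded as-is; the server resolves it to a loaded LoRA adapter.
output = client.text_generation(
    "Explain the difference between a list and a tuple in Python.",
    adapter_id="my-org/my-lora-adapter",  # illustrative adapter id
    max_new_tokens=100,
)
print(output)
```
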
decoder_input_details (`bool`, *optional*): @@ -2100,6 +2155,7 @@ def text_generation( # Build payload parameters = { + "adapter_id": adapter_id, "best_of": best_of, "decoder_input_details": decoder_input_details, "details": details, @@ -2170,6 +2226,7 @@ def text_generation( details=details, stream=stream, model=model, + adapter_id=adapter_id, best_of=best_of, decoder_input_details=decoder_input_details, do_sample=do_sample, diff --git a/src/huggingface_hub/inference/_common.py b/src/huggingface_hub/inference/_common.py index bd669e417a..f093d2e538 100644 --- a/src/huggingface_hub/inference/_common.py +++ b/src/huggingface_hub/inference/_common.py @@ -315,7 +315,6 @@ def _format_chat_completion_stream_output_from_text_generation( # explicitly set 'dummy' values to reduce expectations from users id="dummy", model="dummy", - object="dummy", system_fingerprint="dummy", choices=[ ChatCompletionStreamOutputChoice( @@ -335,7 +334,6 @@ def _format_chat_completion_stream_output_from_text_generation( # explicitly set 'dummy' values to reduce expectations from users id="dummy", model="dummy", - object="dummy", system_fingerprint="dummy", choices=[ ChatCompletionStreamOutputChoice( diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py index 982adf618e..be3d549ffa 100644 --- a/src/huggingface_hub/inference/_generated/_async_client.py +++ b/src/huggingface_hub/inference/_generated/_async_client.py @@ -64,6 +64,7 @@ AudioClassificationOutputElement, AudioToAudioOutputElement, AutomaticSpeechRecognitionOutput, + ChatCompletionInputGrammarType, ChatCompletionInputTool, ChatCompletionInputToolTypeClass, ChatCompletionOutput, @@ -89,7 +90,6 @@ ZeroShotClassificationOutputElement, ZeroShotImageClassificationOutputElement, ) -from huggingface_hub.inference._generated.types.chat_completion import ChatCompletionInputToolTypeEnum from huggingface_hub.inference._types import ( ConversationalOutput, # soon to be removed ) @@ -466,10 +466,11 @@ async def chat_completion( # type: ignore max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -489,10 +490,11 @@ async def chat_completion( # type: ignore max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -512,10 +514,11 @@ async def chat_completion( max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: 
Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -535,10 +538,11 @@ async def chat_completion( max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -585,6 +589,8 @@ async def chat_completion( presence_penalty (`float`, *optional*): Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + response_format ([`ChatCompletionInputGrammarType`], *optional*): + Grammar constraints. Can be either a JSONSchema or a regex. seed (Optional[`int`], *optional*): Seed for reproducible control flow. Defaults to None. stop (Optional[`str`], *optional*): @@ -602,7 +608,7 @@ async def chat_completion( top_p (`float`, *optional*): Fraction of the most likely next words to sample from. Must be between 0 and 1. Defaults to 1.0. - tool_choice ([`ChatCompletionInputToolTypeClass`] or [`ChatCompletionInputToolTypeEnum`], *optional*): + tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*): The tool to use for the completion. Defaults to "auto". tool_prompt (`str`, *optional*): A prompt to be appended before the tools. @@ -626,7 +632,6 @@ async def chat_completion( ```py # Must be run in an async context - # Chat example >>> from huggingface_hub import AsyncInferenceClient >>> messages = [{"role": "user", "content": "What is the capital of France?"}] >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") @@ -656,7 +661,14 @@ async def chat_completion( total_tokens=25 ) ) + ``` + Example (stream=True): + ```py + # Must be run in an async context + >>> from huggingface_hub import AsyncInferenceClient + >>> messages = [{"role": "user", "content": "What is the capital of France?"}] + >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") >>> async for token in await client.chat_completion(messages, max_tokens=10, stream=True): ... print(token) ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content='The', role='assistant'), index=0, finish_reason=None)], created=1710498504) @@ -774,6 +786,38 @@ async def chat_completion( description=None ) ``` + + Example using response_format: + ```py + # Must be run in an async context + >>> from huggingface_hub import AsyncInferenceClient + >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-70B-Instruct") + >>> messages = [ + ... { + ... "role": "user", + ... "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I saw and when?", + ... }, + ... ] + >>> response_format = { + ... "type": "json", + ... "value": { + ... "properties": { + ... 
"location": {"type": "string"}, + ... "activity": {"type": "string"}, + ... "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5}, + ... "animals": {"type": "array", "items": {"type": "string"}}, + ... }, + ... "required": ["location", "activity", "animals_seen", "animals"], + ... }, + ... } + >>> response = await client.chat_completion( + ... messages=messages, + ... response_format=response_format, + ... max_tokens=500, + ) + >>> response.choices[0].message.content + '{\n\n"activity": "bike ride",\n"animals": ["puppy", "cat", "raccoon"],\n"animals_seen": 3,\n"location": "park"}' + ``` """ # Determine model # `self.xxx` takes precedence over the method argument only in `chat_completion` @@ -808,6 +852,7 @@ async def chat_completion( max_tokens=max_tokens, n=n, presence_penalty=presence_penalty, + response_format=response_format, seed=seed, stop=stop, temperature=temperature, @@ -859,6 +904,11 @@ async def chat_completion( "Tools are not supported by the model. This is due to the model not been served by a " "Text-Generation-Inference server. The provided tool parameters will be ignored." ) + if response_format is not None: + warnings.warn( + "Response format is not supported by the model. This is due to the model not been served by a " + "Text-Generation-Inference server. The provided response format will be ignored." + ) # generate response text_generation_output = await self.text_generation( @@ -877,7 +927,6 @@ async def chat_completion( return ChatCompletionOutput( id="dummy", model="dummy", - object="dummy", system_fingerprint="dummy", usage=None, # type: ignore # set to `None` as we don't want to provide false information created=int(time.time()), @@ -1770,6 +1819,7 @@ async def text_generation( # type: ignore stream: Literal[False] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1798,6 +1848,7 @@ async def text_generation( # type: ignore stream: Literal[False] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1826,6 +1877,7 @@ async def text_generation( # type: ignore stream: Literal[True] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1854,6 +1906,7 @@ async def text_generation( # type: ignore stream: Literal[True] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1882,6 +1935,7 @@ async def text_generation( stream: bool = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1909,6 +1963,7 
@@ async def text_generation( stream: bool = False, model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1960,6 +2015,8 @@ async def text_generation( model (`str`, *optional*): The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None. + adapter_id (`str`, *optional*): + Lora adapter id. best_of (`int`, *optional*): Generate best_of sequences and return the one if the highest token logprobs. decoder_input_details (`bool`, *optional*): @@ -2129,6 +2186,7 @@ async def text_generation( # Build payload parameters = { + "adapter_id": adapter_id, "best_of": best_of, "decoder_input_details": decoder_input_details, "details": details, @@ -2199,6 +2257,7 @@ async def text_generation( details=details, stream=stream, model=model, + adapter_id=adapter_id, best_of=best_of, decoder_input_details=decoder_input_details, do_sample=do_sample, diff --git a/src/huggingface_hub/inference/_generated/types/__init__.py b/src/huggingface_hub/inference/_generated/types/__init__.py index e79930374b..db2793be23 100644 --- a/src/huggingface_hub/inference/_generated/types/__init__.py +++ b/src/huggingface_hub/inference/_generated/types/__init__.py @@ -20,10 +20,13 @@ from .chat_completion import ( ChatCompletionInput, ChatCompletionInputFunctionDefinition, + ChatCompletionInputFunctionName, + ChatCompletionInputGrammarType, ChatCompletionInputMessage, + ChatCompletionInputMessageChunk, ChatCompletionInputTool, - ChatCompletionInputToolCall, ChatCompletionInputToolTypeClass, + ChatCompletionInputURL, ChatCompletionOutput, ChatCompletionOutputComplete, ChatCompletionOutputFunctionDefinition, diff --git a/src/huggingface_hub/inference/_generated/types/chat_completion.py b/src/huggingface_hub/inference/_generated/types/chat_completion.py index e20dc11dd6..fa6e373140 100644 --- a/src/huggingface_hub/inference/_generated/types/chat_completion.py +++ b/src/huggingface_hub/inference/_generated/types/chat_completion.py @@ -10,33 +10,55 @@ @dataclass -class ChatCompletionInputFunctionDefinition(BaseInferenceType): - arguments: Any - name: str - description: Optional[str] = None +class ChatCompletionInputURL(BaseInferenceType): + url: str + + +ChatCompletionInputMessageChunkType = Literal["text", "image_url"] @dataclass -class ChatCompletionInputToolCall(BaseInferenceType): - function: ChatCompletionInputFunctionDefinition - id: int - type: str +class ChatCompletionInputMessageChunk(BaseInferenceType): + type: "ChatCompletionInputMessageChunkType" + image_url: Optional[ChatCompletionInputURL] = None + text: Optional[str] = None @dataclass class ChatCompletionInputMessage(BaseInferenceType): + content: Union[List[ChatCompletionInputMessageChunk], str] role: str - content: Optional[str] = None name: Optional[str] = None - tool_calls: Optional[List[ChatCompletionInputToolCall]] = None + + +ChatCompletionInputGrammarTypeType = Literal["json", "regex"] + + +@dataclass +class ChatCompletionInputGrammarType(BaseInferenceType): + type: "ChatCompletionInputGrammarTypeType" + value: Any + """A string that represents a [JSON Schema](https://json-schema.org/). + JSON Schema is a declarative language that allows to annotate JSON documents + with types and descriptions. 
+ """ + + +@dataclass +class ChatCompletionInputFunctionName(BaseInferenceType): + name: str @dataclass class ChatCompletionInputToolTypeClass(BaseInferenceType): - function_name: str + function: Optional[ChatCompletionInputFunctionName] = None -ChatCompletionInputToolTypeEnum = Literal["OneOf"] +@dataclass +class ChatCompletionInputFunctionDefinition(BaseInferenceType): + arguments: Any + name: str + description: Optional[str] = None @dataclass @@ -55,10 +77,6 @@ class ChatCompletionInput(BaseInferenceType): messages: List[ChatCompletionInputMessage] """A list of messages comprising the conversation so far.""" - model: str - """[UNUSED] ID of the model to use. See the model endpoint compatibility table for details - on which models work with the Chat API. - """ frequency_penalty: Optional[float] = None """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, @@ -83,6 +101,10 @@ class ChatCompletionInput(BaseInferenceType): """ max_tokens: Optional[int] = None """The maximum number of tokens that can be generated in the chat completion.""" + model: Optional[str] = None + """[UNUSED] ID of the model to use. See the model endpoint compatibility table for details + on which models work with the Chat API. + """ n: Optional[int] = None """UNUSED How many chat completion choices to generate for each input message. Note that you will @@ -94,6 +116,7 @@ class ChatCompletionInput(BaseInferenceType): appear in the text so far, increasing the model's likelihood to talk about new topics """ + response_format: Optional[ChatCompletionInputGrammarType] = None seed: Optional[int] = None stop: Optional[List[str]] = None """Up to 4 sequences where the API will stop generating further tokens.""" @@ -104,7 +127,7 @@ class ChatCompletionInput(BaseInferenceType): lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both. 
""" - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, "ChatCompletionInputToolTypeEnum"]] = None + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None tool_prompt: Optional[str] = None """A prompt to be appended before the tools""" tools: Optional[List[ChatCompletionInputTool]] = None @@ -153,7 +176,7 @@ class ChatCompletionOutputFunctionDefinition(BaseInferenceType): @dataclass class ChatCompletionOutputToolCall(BaseInferenceType): function: ChatCompletionOutputFunctionDefinition - id: int + id: str type: str @@ -161,7 +184,6 @@ class ChatCompletionOutputToolCall(BaseInferenceType): class ChatCompletionOutputMessage(BaseInferenceType): role: str content: Optional[str] = None - name: Optional[str] = None tool_calls: Optional[List[ChatCompletionOutputToolCall]] = None @@ -192,7 +214,6 @@ class ChatCompletionOutput(BaseInferenceType): created: int id: str model: str - object: str system_fingerprint: str usage: ChatCompletionOutputUsage @@ -256,5 +277,4 @@ class ChatCompletionStreamOutput(BaseInferenceType): created: int id: str model: str - object: str system_fingerprint: str diff --git a/src/huggingface_hub/inference/_generated/types/text_generation.py b/src/huggingface_hub/inference/_generated/types/text_generation.py index 0d63072590..27c70c7e2b 100644 --- a/src/huggingface_hub/inference/_generated/types/text_generation.py +++ b/src/huggingface_hub/inference/_generated/types/text_generation.py @@ -24,24 +24,53 @@ class TextGenerationInputGrammarType(BaseInferenceType): @dataclass class TextGenerationInputGenerateParameters(BaseInferenceType): + adapter_id: Optional[str] = None + """Lora adapter id""" best_of: Optional[int] = None + """Generate best_of sequences and return the one if the highest token logprobs.""" decoder_input_details: Optional[bool] = None + """Whether to return decoder input token logprobs and ids.""" details: Optional[bool] = None + """Whether to return generation details.""" do_sample: Optional[bool] = None + """Activate logits sampling.""" frequency_penalty: Optional[float] = None + """The parameter for frequency penalty. 1.0 means no penalty + Penalize new tokens based on their existing frequency in the text so far, + decreasing the model's likelihood to repeat the same line verbatim. + """ grammar: Optional[TextGenerationInputGrammarType] = None max_new_tokens: Optional[int] = None + """Maximum number of tokens to generate.""" repetition_penalty: Optional[float] = None + """The parameter for repetition penalty. 1.0 means no penalty. + See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + """ return_full_text: Optional[bool] = None + """Whether to prepend the prompt to the generated text""" seed: Optional[int] = None + """Random sampling seed.""" stop: Optional[List[str]] = None + """Stop generating tokens if a member of `stop` is generated.""" temperature: Optional[float] = None + """The value used to module the logits distribution.""" top_k: Optional[int] = None + """The number of highest probability vocabulary tokens to keep for top-k-filtering.""" top_n_tokens: Optional[int] = None + """The number of highest probability vocabulary tokens to keep for top-n-filtering.""" top_p: Optional[float] = None + """Top-p value for nucleus sampling.""" truncate: Optional[int] = None + """Truncate inputs tokens to the given size.""" typical_p: Optional[float] = None + """Typical Decoding mass + See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) + for more information. 
+ """ watermark: Optional[bool] = None + """Watermarking with [A Watermark for Large Language + Models](https://arxiv.org/abs/2301.10226). + """ @dataclass diff --git a/tests/cassettes/InferenceClientVCRTest.test_chat_completion_with_response_format.yaml b/tests/cassettes/InferenceClientVCRTest.test_chat_completion_with_response_format.yaml new file mode 100644 index 0000000000..0e7e2fcbbf --- /dev/null +++ b/tests/cassettes/InferenceClientVCRTest.test_chat_completion_with_response_format.yaml @@ -0,0 +1,60 @@ +interactions: +- request: + body: '{"model": "meta-llama/Meta-Llama-3-70B-Instruct", "messages": [{"role": + "user", "content": "I saw a puppy a cat and a raccoon during my bike ride in + the park. What did I saw and when?"}], "frequency_penalty": null, "logit_bias": + null, "logprobs": null, "max_tokens": 500, "n": null, "presence_penalty": null, + "response_format": {"type": "json", "value": {"properties": {"location": {"type": + "string"}, "activity": {"type": "string"}, "animals_seen": {"type": "integer", + "minimum": 1, "maximum": 5}, "animals": {"type": "array", "items": {"type": + "string"}}}, "required": ["location", "activity", "animals_seen", "animals"]}}, + "seed": null, "stop": null, "temperature": null, "tool_choice": null, "tool_prompt": + null, "tools": null, "top_logprobs": null, "top_p": null, "stream": false}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Connection: + - keep-alive + Content-Length: + - '785' + Content-Type: + - application/json + X-Amzn-Trace-Id: + - 64f96be3-d5ef-4a00-adf3-43b9d20d2ab1 + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B-Instruct/v1/chat/completions + response: + body: + string: '{"object":"chat.completion","id":"","created":1720527901,"model":"meta-llama/Meta-Llama-3-70B-Instruct","system_fingerprint":"2.1.1-dev0-sha-4327210","choices":[{"index":0,"message":{"role":"assistant","content":"{ + \n\n\"activity\": \"bike ride\", \n\"animals\": [\"puppy\", \"cat\", \"raccoon\"], + \n\"animals_seen\": 3, \n\"location\": \"park\" \n}"},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":35,"completion_tokens":45,"total_tokens":80}}' + headers: + Connection: + - keep-alive + Content-Length: + - '468' + Content-Type: + - application/json + Date: + - Tue, 09 Jul 2024 12:47:17 GMT + access-control-allow-credentials: + - 'true' + vary: + - Origin, Access-Control-Request-Method, Access-Control-Request-Headers + x-compute-time: + - '1.772222068' + x-compute-type: + - cache + x-request-id: + - NltuLUIFa5XCw_HIQyQsk + x-sha: + - 7129260dd854a80eb10ace5f61c20324b472b31c + status: + code: 200 + message: OK +version: 1 diff --git a/tests/test_inference_async_client.py b/tests/test_inference_async_client.py index c9ad204ec3..822dab8986 100644 --- a/tests/test_inference_async_client.py +++ b/tests/test_inference_async_client.py @@ -164,7 +164,6 @@ async def test_async_chat_completion_no_stream() -> None: assert output == ChatCompletionOutput( id="", model="HuggingFaceH4/zephyr-7b-beta", - object="text_completion", system_fingerprint="1.4.3-sha-e6bb3ff", usage=ChatCompletionOutputUsage(completion_tokens=10, prompt_tokens=47, total_tokens=57), choices=[ @@ -191,7 +190,6 @@ async def test_async_chat_completion_not_tgi_no_stream() -> None: assert output == ChatCompletionOutput( id="dummy", model="dummy", - object="dummy", system_fingerprint="dummy", 
usage=None, choices=[ diff --git a/tests/test_inference_client.py b/tests/test_inference_client.py index 43c91ff3d2..2c453b0880 100644 --- a/tests/test_inference_client.py +++ b/tests/test_inference_client.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import io +import json import time import unittest from pathlib import Path @@ -145,6 +146,26 @@ }, ] +CHAT_COMPLETION_RESPONSE_FORMAT_MESSAGE = [ + { + "role": "user", + "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I saw and when?", + }, +] + +CHAT_COMPLETION_RESPONSE_FORMAT = { + "type": "json", + "value": { + "properties": { + "location": {"type": "string"}, + "activity": {"type": "string"}, + "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5}, + "animals": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["location", "activity", "animals_seen", "animals"], + }, +} + class InferenceClientTest(unittest.TestCase): @classmethod @@ -266,7 +287,6 @@ def test_chat_completion_with_non_tgi(self) -> None: assert output == ChatCompletionOutput( id="dummy", model="dummy", - object="dummy", system_fingerprint="dummy", usage=None, choices=[ @@ -328,6 +348,21 @@ def test_chat_completion_with_tool(self) -> None: "location": "San Francisco, CA", } + def test_chat_completion_with_response_format(self) -> None: + response = self.client.chat_completion( + model="meta-llama/Meta-Llama-3-70B-Instruct", + messages=CHAT_COMPLETION_RESPONSE_FORMAT_MESSAGE, + response_format=CHAT_COMPLETION_RESPONSE_FORMAT, + max_tokens=500, + ) + output = response.choices[0].message.content + assert json.loads(output) == { + "activity": "bike ride", + "animals": ["puppy", "cat", "raccoon"], + "animals_seen": 3, + "location": "park", + } + def test_chat_completion_unprocessable_entity(self) -> None: """Regression test for #2225.
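
The JSON-schema grammar exercised in `test_chat_completion_with_response_format` is one of two grammar kinds accepted by `ChatCompletionInputGrammarType`, whose `type` literal is `"json"` or `"regex"`. A minimal sketch of the regex variant, assuming the backing server interprets `value` as the regular expression that constrains the generated text (the model id and pattern are illustrative):

```py
from huggingface_hub import InferenceClient

client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")

# Constrain the answer to match a simple pattern instead of a JSON schema.
# Assumes the model is served by a Text-Generation-Inference server that supports regex grammars.
response = client.chat_completion(
    messages=[{"role": "user", "content": "Name one primary color."}],
    response_format={"type": "regex", "value": "(red|blue|yellow)"},
    max_tokens=10,
)
print(response.choices[0].message.content)
```
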