From 36396f13988d9255ab8397616ba37f573661eb3c Mon Sep 17 00:00:00 2001 From: Lucain Date: Tue, 16 Jul 2024 18:00:37 +0200 Subject: [PATCH] [InferenceClient] Add support for `adapter_id` (text-generation) and `response_format` (chat-completion) (#2383) * types * Add adapter_id arg to text_generation * Add adapter_id to text-generation and response_format to chat_completion * update example * add test * fix quality * remove dummy * lint * b * lint --- .../en/package_reference/inference_types.md | 10 ++- .../ko/package_reference/inference_types.md | 10 ++- src/huggingface_hub/__init__.py | 10 ++- src/huggingface_hub/inference/_client.py | 73 ++++++++++++++++-- src/huggingface_hub/inference/_common.py | 2 - .../inference/_generated/_async_client.py | 75 +++++++++++++++++-- .../inference/_generated/types/__init__.py | 5 +- .../_generated/types/chat_completion.py | 62 +++++++++------ .../_generated/types/text_generation.py | 29 +++++++ ..._chat_completion_with_response_format.yaml | 60 +++++++++++++++ tests/test_inference_async_client.py | 2 - tests/test_inference_client.py | 37 ++++++++- 12 files changed, 326 insertions(+), 49 deletions(-) create mode 100644 tests/cassettes/InferenceClientVCRTest.test_chat_completion_with_response_format.yaml diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md index c0079d5e1d..8dd826336c 100644 --- a/docs/source/en/package_reference/inference_types.md +++ b/docs/source/en/package_reference/inference_types.md @@ -55,14 +55,20 @@ This part of the lib is still under development and will be improved in future r [[autodoc]] huggingface_hub.ChatCompletionInputFunctionDefinition +[[autodoc]] huggingface_hub.ChatCompletionInputFunctionName + +[[autodoc]] huggingface_hub.ChatCompletionInputGrammarType + [[autodoc]] huggingface_hub.ChatCompletionInputMessage -[[autodoc]] huggingface_hub.ChatCompletionInputTool +[[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk -[[autodoc]] huggingface_hub.ChatCompletionInputToolCall +[[autodoc]] huggingface_hub.ChatCompletionInputTool [[autodoc]] huggingface_hub.ChatCompletionInputToolTypeClass +[[autodoc]] huggingface_hub.ChatCompletionInputURL + [[autodoc]] huggingface_hub.ChatCompletionOutput [[autodoc]] huggingface_hub.ChatCompletionOutputComplete diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md index 548249b581..27e96cb491 100644 --- a/docs/source/ko/package_reference/inference_types.md +++ b/docs/source/ko/package_reference/inference_types.md @@ -54,14 +54,20 @@ rendered properly in your Markdown viewer. 
[[autodoc]] huggingface_hub.ChatCompletionInputFunctionDefinition +[[autodoc]] huggingface_hub.ChatCompletionInputFunctionName + +[[autodoc]] huggingface_hub.ChatCompletionInputGrammarType + [[autodoc]] huggingface_hub.ChatCompletionInputMessage -[[autodoc]] huggingface_hub.ChatCompletionInputTool +[[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk -[[autodoc]] huggingface_hub.ChatCompletionInputToolCall +[[autodoc]] huggingface_hub.ChatCompletionInputTool [[autodoc]] huggingface_hub.ChatCompletionInputToolTypeClass +[[autodoc]] huggingface_hub.ChatCompletionInputURL + [[autodoc]] huggingface_hub.ChatCompletionOutput [[autodoc]] huggingface_hub.ChatCompletionOutputComplete diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index 131496eb69..1b1a345a6d 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -280,10 +280,13 @@ "AutomaticSpeechRecognitionParameters", "ChatCompletionInput", "ChatCompletionInputFunctionDefinition", + "ChatCompletionInputFunctionName", + "ChatCompletionInputGrammarType", "ChatCompletionInputMessage", + "ChatCompletionInputMessageChunk", "ChatCompletionInputTool", - "ChatCompletionInputToolCall", "ChatCompletionInputToolTypeClass", + "ChatCompletionInputURL", "ChatCompletionOutput", "ChatCompletionOutputComplete", "ChatCompletionOutputFunctionDefinition", @@ -775,10 +778,13 @@ def __dir__(): AutomaticSpeechRecognitionParameters, # noqa: F401 ChatCompletionInput, # noqa: F401 ChatCompletionInputFunctionDefinition, # noqa: F401 + ChatCompletionInputFunctionName, # noqa: F401 + ChatCompletionInputGrammarType, # noqa: F401 ChatCompletionInputMessage, # noqa: F401 + ChatCompletionInputMessageChunk, # noqa: F401 ChatCompletionInputTool, # noqa: F401 - ChatCompletionInputToolCall, # noqa: F401 ChatCompletionInputToolTypeClass, # noqa: F401 + ChatCompletionInputURL, # noqa: F401 ChatCompletionOutput, # noqa: F401 ChatCompletionOutputComplete, # noqa: F401 ChatCompletionOutputFunctionDefinition, # noqa: F401 diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py index 5f3cca1b1b..a23b7d7f0a 100644 --- a/src/huggingface_hub/inference/_client.py +++ b/src/huggingface_hub/inference/_client.py @@ -78,6 +78,7 @@ AudioClassificationOutputElement, AudioToAudioOutputElement, AutomaticSpeechRecognitionOutput, + ChatCompletionInputGrammarType, ChatCompletionInputTool, ChatCompletionInputToolTypeClass, ChatCompletionOutput, @@ -103,7 +104,6 @@ ZeroShotClassificationOutputElement, ZeroShotImageClassificationOutputElement, ) -from huggingface_hub.inference._generated.types.chat_completion import ChatCompletionInputToolTypeEnum from huggingface_hub.inference._types import ( ConversationalOutput, # soon to be removed ) @@ -465,10 +465,11 @@ def chat_completion( # type: ignore max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -488,10 +489,11 @@ def chat_completion( # type: ignore max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: 
Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -511,10 +513,11 @@ def chat_completion( max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -534,10 +537,11 @@ def chat_completion( max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -584,6 +588,8 @@ def chat_completion( presence_penalty (`float`, *optional*): Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + response_format ([`ChatCompletionInputGrammarType`], *optional*): + Grammar constraints. Can be either a JSONSchema or a regex. seed (Optional[`int`], *optional*): Seed for reproducible control flow. Defaults to None. stop (Optional[`str`], *optional*): @@ -601,7 +607,7 @@ def chat_completion( top_p (`float`, *optional*): Fraction of the most likely next words to sample from. Must be between 0 and 1. Defaults to 1.0. - tool_choice ([`ChatCompletionInputToolTypeClass`] or [`ChatCompletionInputToolTypeEnum`], *optional*): + tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*): The tool to use for the completion. Defaults to "auto". tool_prompt (`str`, *optional*): A prompt to be appended before the tools. @@ -624,7 +630,6 @@ def chat_completion( Example: ```py - # Chat example >>> from huggingface_hub import InferenceClient >>> messages = [{"role": "user", "content": "What is the capital of France?"}] >>> client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") @@ -654,7 +659,13 @@ def chat_completion( total_tokens=25 ) ) + ``` + Example (stream=True): + ```py + >>> from huggingface_hub import InferenceClient + >>> messages = [{"role": "user", "content": "What is the capital of France?"}] + >>> client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") >>> for token in client.chat_completion(messages, max_tokens=10, stream=True): ... 
print(token) ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content='The', role='assistant'), index=0, finish_reason=None)], created=1710498504) @@ -770,6 +781,37 @@ def chat_completion( description=None ) ``` + + Example using response_format: + ```py + >>> from huggingface_hub import InferenceClient + >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct") + >>> messages = [ + ... { + ... "role": "user", + ... "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I saw and when?", + ... }, + ... ] + >>> response_format = { + ... "type": "json", + ... "value": { + ... "properties": { + ... "location": {"type": "string"}, + ... "activity": {"type": "string"}, + ... "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5}, + ... "animals": {"type": "array", "items": {"type": "string"}}, + ... }, + ... "required": ["location", "activity", "animals_seen", "animals"], + ... }, + ... } + >>> response = client.chat_completion( + ... messages=messages, + ... response_format=response_format, + ... max_tokens=500, + ) + >>> response.choices[0].message.content + '{\n\n"activity": "bike ride",\n"animals": ["puppy", "cat", "raccoon"],\n"animals_seen": 3,\n"location": "park"}' + ``` """ # Determine model # `self.xxx` takes precedence over the method argument only in `chat_completion` @@ -804,6 +846,7 @@ def chat_completion( max_tokens=max_tokens, n=n, presence_penalty=presence_penalty, + response_format=response_format, seed=seed, stop=stop, temperature=temperature, @@ -855,6 +898,11 @@ def chat_completion( "Tools are not supported by the model. This is due to the model not been served by a " "Text-Generation-Inference server. The provided tool parameters will be ignored." ) + if response_format is not None: + warnings.warn( + "Response format is not supported by the model. This is due to the model not been served by a " + "Text-Generation-Inference server. The provided response format will be ignored." 
+ ) # generate response text_generation_output = self.text_generation( @@ -873,7 +921,6 @@ def chat_completion( return ChatCompletionOutput( id="dummy", model="dummy", - object="dummy", system_fingerprint="dummy", usage=None, # type: ignore # set to `None` as we don't want to provide false information created=int(time.time()), @@ -1742,6 +1789,7 @@ def text_generation( # type: ignore stream: Literal[False] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1770,6 +1818,7 @@ def text_generation( # type: ignore stream: Literal[False] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1798,6 +1847,7 @@ def text_generation( # type: ignore stream: Literal[True] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1826,6 +1876,7 @@ def text_generation( # type: ignore stream: Literal[True] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1854,6 +1905,7 @@ def text_generation( stream: bool = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1881,6 +1933,7 @@ def text_generation( stream: bool = False, model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1932,6 +1985,8 @@ def text_generation( model (`str`, *optional*): The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None. + adapter_id (`str`, *optional*): + Lora adapter id. best_of (`int`, *optional*): Generate best_of sequences and return the one if the highest token logprobs. 
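
The `adapter_id` value introduced here is passed through untouched in the request `parameters` payload (see the payload construction later in this method), so a Text-Generation-Inference server that has the matching LoRA adapter loaded can apply it at generation time. A minimal usage sketch, assuming a TGI endpoint with an adapter available under the id shown (the endpoint URL, adapter id and prompt are all illustrative):

```py
from huggingface_hub import InferenceClient

# Hypothetical TGI endpoint started with a LoRA adapter preloaded on the server side.
client = InferenceClient("http://localhost:8080")

# `adapter_id` is forwarded as-is; the server resolves it to a loaded LoRA adapter.
output = client.text_generation(
    "Explain the difference between a list and a tuple in Python.",
    adapter_id="my-org/my-lora-adapter",  # illustrative adapter id
    max_new_tokens=100,
)
print(output)
```
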
decoder_input_details (`bool`, *optional*): @@ -2100,6 +2155,7 @@ def text_generation( # Build payload parameters = { + "adapter_id": adapter_id, "best_of": best_of, "decoder_input_details": decoder_input_details, "details": details, @@ -2170,6 +2226,7 @@ def text_generation( details=details, stream=stream, model=model, + adapter_id=adapter_id, best_of=best_of, decoder_input_details=decoder_input_details, do_sample=do_sample, diff --git a/src/huggingface_hub/inference/_common.py b/src/huggingface_hub/inference/_common.py index bd669e417a..f093d2e538 100644 --- a/src/huggingface_hub/inference/_common.py +++ b/src/huggingface_hub/inference/_common.py @@ -315,7 +315,6 @@ def _format_chat_completion_stream_output_from_text_generation( # explicitly set 'dummy' values to reduce expectations from users id="dummy", model="dummy", - object="dummy", system_fingerprint="dummy", choices=[ ChatCompletionStreamOutputChoice( @@ -335,7 +334,6 @@ def _format_chat_completion_stream_output_from_text_generation( # explicitly set 'dummy' values to reduce expectations from users id="dummy", model="dummy", - object="dummy", system_fingerprint="dummy", choices=[ ChatCompletionStreamOutputChoice( diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py index 982adf618e..be3d549ffa 100644 --- a/src/huggingface_hub/inference/_generated/_async_client.py +++ b/src/huggingface_hub/inference/_generated/_async_client.py @@ -64,6 +64,7 @@ AudioClassificationOutputElement, AudioToAudioOutputElement, AutomaticSpeechRecognitionOutput, + ChatCompletionInputGrammarType, ChatCompletionInputTool, ChatCompletionInputToolTypeClass, ChatCompletionOutput, @@ -89,7 +90,6 @@ ZeroShotClassificationOutputElement, ZeroShotImageClassificationOutputElement, ) -from huggingface_hub.inference._generated.types.chat_completion import ChatCompletionInputToolTypeEnum from huggingface_hub.inference._types import ( ConversationalOutput, # soon to be removed ) @@ -466,10 +466,11 @@ async def chat_completion( # type: ignore max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -489,10 +490,11 @@ async def chat_completion( # type: ignore max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -512,10 +514,11 @@ async def chat_completion( max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: 
Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -535,10 +538,11 @@ async def chat_completion( max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[float] = None, + response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, tool_prompt: Optional[str] = None, tools: Optional[List[ChatCompletionInputTool]] = None, top_logprobs: Optional[int] = None, @@ -585,6 +589,8 @@ async def chat_completion( presence_penalty (`float`, *optional*): Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + response_format ([`ChatCompletionInputGrammarType`], *optional*): + Grammar constraints. Can be either a JSONSchema or a regex. seed (Optional[`int`], *optional*): Seed for reproducible control flow. Defaults to None. stop (Optional[`str`], *optional*): @@ -602,7 +608,7 @@ async def chat_completion( top_p (`float`, *optional*): Fraction of the most likely next words to sample from. Must be between 0 and 1. Defaults to 1.0. - tool_choice ([`ChatCompletionInputToolTypeClass`] or [`ChatCompletionInputToolTypeEnum`], *optional*): + tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*): The tool to use for the completion. Defaults to "auto". tool_prompt (`str`, *optional*): A prompt to be appended before the tools. @@ -626,7 +632,6 @@ async def chat_completion( ```py # Must be run in an async context - # Chat example >>> from huggingface_hub import AsyncInferenceClient >>> messages = [{"role": "user", "content": "What is the capital of France?"}] >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") @@ -656,7 +661,14 @@ async def chat_completion( total_tokens=25 ) ) + ``` + Example (stream=True): + ```py + # Must be run in an async context + >>> from huggingface_hub import AsyncInferenceClient + >>> messages = [{"role": "user", "content": "What is the capital of France?"}] + >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-8B-Instruct") >>> async for token in await client.chat_completion(messages, max_tokens=10, stream=True): ... print(token) ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content='The', role='assistant'), index=0, finish_reason=None)], created=1710498504) @@ -774,6 +786,38 @@ async def chat_completion( description=None ) ``` + + Example using response_format: + ```py + # Must be run in an async context + >>> from huggingface_hub import AsyncInferenceClient + >>> client = AsyncInferenceClient("meta-llama/Meta-Llama-3-70B-Instruct") + >>> messages = [ + ... { + ... "role": "user", + ... "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I saw and when?", + ... }, + ... ] + >>> response_format = { + ... "type": "json", + ... "value": { + ... "properties": { + ... 
"location": {"type": "string"}, + ... "activity": {"type": "string"}, + ... "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5}, + ... "animals": {"type": "array", "items": {"type": "string"}}, + ... }, + ... "required": ["location", "activity", "animals_seen", "animals"], + ... }, + ... } + >>> response = await client.chat_completion( + ... messages=messages, + ... response_format=response_format, + ... max_tokens=500, + ) + >>> response.choices[0].message.content + '{\n\n"activity": "bike ride",\n"animals": ["puppy", "cat", "raccoon"],\n"animals_seen": 3,\n"location": "park"}' + ``` """ # Determine model # `self.xxx` takes precedence over the method argument only in `chat_completion` @@ -808,6 +852,7 @@ async def chat_completion( max_tokens=max_tokens, n=n, presence_penalty=presence_penalty, + response_format=response_format, seed=seed, stop=stop, temperature=temperature, @@ -859,6 +904,11 @@ async def chat_completion( "Tools are not supported by the model. This is due to the model not been served by a " "Text-Generation-Inference server. The provided tool parameters will be ignored." ) + if response_format is not None: + warnings.warn( + "Response format is not supported by the model. This is due to the model not been served by a " + "Text-Generation-Inference server. The provided response format will be ignored." + ) # generate response text_generation_output = await self.text_generation( @@ -877,7 +927,6 @@ async def chat_completion( return ChatCompletionOutput( id="dummy", model="dummy", - object="dummy", system_fingerprint="dummy", usage=None, # type: ignore # set to `None` as we don't want to provide false information created=int(time.time()), @@ -1770,6 +1819,7 @@ async def text_generation( # type: ignore stream: Literal[False] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1798,6 +1848,7 @@ async def text_generation( # type: ignore stream: Literal[False] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1826,6 +1877,7 @@ async def text_generation( # type: ignore stream: Literal[True] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1854,6 +1906,7 @@ async def text_generation( # type: ignore stream: Literal[True] = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1882,6 +1935,7 @@ async def text_generation( stream: bool = ..., model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1909,6 +1963,7 
@@ async def text_generation( stream: bool = False, model: Optional[str] = None, # Parameters from `TextGenerationInputGenerateParameters` (maintained manually) + adapter_id: Optional[str] = None, best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None, do_sample: Optional[bool] = False, # Manual default value @@ -1960,6 +2015,8 @@ async def text_generation( model (`str`, *optional*): The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None. + adapter_id (`str`, *optional*): + Lora adapter id. best_of (`int`, *optional*): Generate best_of sequences and return the one if the highest token logprobs. decoder_input_details (`bool`, *optional*): @@ -2129,6 +2186,7 @@ async def text_generation( # Build payload parameters = { + "adapter_id": adapter_id, "best_of": best_of, "decoder_input_details": decoder_input_details, "details": details, @@ -2199,6 +2257,7 @@ async def text_generation( details=details, stream=stream, model=model, + adapter_id=adapter_id, best_of=best_of, decoder_input_details=decoder_input_details, do_sample=do_sample, diff --git a/src/huggingface_hub/inference/_generated/types/__init__.py b/src/huggingface_hub/inference/_generated/types/__init__.py index e79930374b..db2793be23 100644 --- a/src/huggingface_hub/inference/_generated/types/__init__.py +++ b/src/huggingface_hub/inference/_generated/types/__init__.py @@ -20,10 +20,13 @@ from .chat_completion import ( ChatCompletionInput, ChatCompletionInputFunctionDefinition, + ChatCompletionInputFunctionName, + ChatCompletionInputGrammarType, ChatCompletionInputMessage, + ChatCompletionInputMessageChunk, ChatCompletionInputTool, - ChatCompletionInputToolCall, ChatCompletionInputToolTypeClass, + ChatCompletionInputURL, ChatCompletionOutput, ChatCompletionOutputComplete, ChatCompletionOutputFunctionDefinition, diff --git a/src/huggingface_hub/inference/_generated/types/chat_completion.py b/src/huggingface_hub/inference/_generated/types/chat_completion.py index e20dc11dd6..fa6e373140 100644 --- a/src/huggingface_hub/inference/_generated/types/chat_completion.py +++ b/src/huggingface_hub/inference/_generated/types/chat_completion.py @@ -10,33 +10,55 @@ @dataclass -class ChatCompletionInputFunctionDefinition(BaseInferenceType): - arguments: Any - name: str - description: Optional[str] = None +class ChatCompletionInputURL(BaseInferenceType): + url: str + + +ChatCompletionInputMessageChunkType = Literal["text", "image_url"] @dataclass -class ChatCompletionInputToolCall(BaseInferenceType): - function: ChatCompletionInputFunctionDefinition - id: int - type: str +class ChatCompletionInputMessageChunk(BaseInferenceType): + type: "ChatCompletionInputMessageChunkType" + image_url: Optional[ChatCompletionInputURL] = None + text: Optional[str] = None @dataclass class ChatCompletionInputMessage(BaseInferenceType): + content: Union[List[ChatCompletionInputMessageChunk], str] role: str - content: Optional[str] = None name: Optional[str] = None - tool_calls: Optional[List[ChatCompletionInputToolCall]] = None + + +ChatCompletionInputGrammarTypeType = Literal["json", "regex"] + + +@dataclass +class ChatCompletionInputGrammarType(BaseInferenceType): + type: "ChatCompletionInputGrammarTypeType" + value: Any + """A string that represents a [JSON Schema](https://json-schema.org/). + JSON Schema is a declarative language that allows to annotate JSON documents + with types and descriptions. 
+ """ + + +@dataclass +class ChatCompletionInputFunctionName(BaseInferenceType): + name: str @dataclass class ChatCompletionInputToolTypeClass(BaseInferenceType): - function_name: str + function: Optional[ChatCompletionInputFunctionName] = None -ChatCompletionInputToolTypeEnum = Literal["OneOf"] +@dataclass +class ChatCompletionInputFunctionDefinition(BaseInferenceType): + arguments: Any + name: str + description: Optional[str] = None @dataclass @@ -55,10 +77,6 @@ class ChatCompletionInput(BaseInferenceType): messages: List[ChatCompletionInputMessage] """A list of messages comprising the conversation so far.""" - model: str - """[UNUSED] ID of the model to use. See the model endpoint compatibility table for details - on which models work with the Chat API. - """ frequency_penalty: Optional[float] = None """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, @@ -83,6 +101,10 @@ class ChatCompletionInput(BaseInferenceType): """ max_tokens: Optional[int] = None """The maximum number of tokens that can be generated in the chat completion.""" + model: Optional[str] = None + """[UNUSED] ID of the model to use. See the model endpoint compatibility table for details + on which models work with the Chat API. + """ n: Optional[int] = None """UNUSED How many chat completion choices to generate for each input message. Note that you will @@ -94,6 +116,7 @@ class ChatCompletionInput(BaseInferenceType): appear in the text so far, increasing the model's likelihood to talk about new topics """ + response_format: Optional[ChatCompletionInputGrammarType] = None seed: Optional[int] = None stop: Optional[List[str]] = None """Up to 4 sequences where the API will stop generating further tokens.""" @@ -104,7 +127,7 @@ class ChatCompletionInput(BaseInferenceType): lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both. 
""" - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, "ChatCompletionInputToolTypeEnum"]] = None + tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None tool_prompt: Optional[str] = None """A prompt to be appended before the tools""" tools: Optional[List[ChatCompletionInputTool]] = None @@ -153,7 +176,7 @@ class ChatCompletionOutputFunctionDefinition(BaseInferenceType): @dataclass class ChatCompletionOutputToolCall(BaseInferenceType): function: ChatCompletionOutputFunctionDefinition - id: int + id: str type: str @@ -161,7 +184,6 @@ class ChatCompletionOutputToolCall(BaseInferenceType): class ChatCompletionOutputMessage(BaseInferenceType): role: str content: Optional[str] = None - name: Optional[str] = None tool_calls: Optional[List[ChatCompletionOutputToolCall]] = None @@ -192,7 +214,6 @@ class ChatCompletionOutput(BaseInferenceType): created: int id: str model: str - object: str system_fingerprint: str usage: ChatCompletionOutputUsage @@ -256,5 +277,4 @@ class ChatCompletionStreamOutput(BaseInferenceType): created: int id: str model: str - object: str system_fingerprint: str diff --git a/src/huggingface_hub/inference/_generated/types/text_generation.py b/src/huggingface_hub/inference/_generated/types/text_generation.py index 0d63072590..27c70c7e2b 100644 --- a/src/huggingface_hub/inference/_generated/types/text_generation.py +++ b/src/huggingface_hub/inference/_generated/types/text_generation.py @@ -24,24 +24,53 @@ class TextGenerationInputGrammarType(BaseInferenceType): @dataclass class TextGenerationInputGenerateParameters(BaseInferenceType): + adapter_id: Optional[str] = None + """Lora adapter id""" best_of: Optional[int] = None + """Generate best_of sequences and return the one if the highest token logprobs.""" decoder_input_details: Optional[bool] = None + """Whether to return decoder input token logprobs and ids.""" details: Optional[bool] = None + """Whether to return generation details.""" do_sample: Optional[bool] = None + """Activate logits sampling.""" frequency_penalty: Optional[float] = None + """The parameter for frequency penalty. 1.0 means no penalty + Penalize new tokens based on their existing frequency in the text so far, + decreasing the model's likelihood to repeat the same line verbatim. + """ grammar: Optional[TextGenerationInputGrammarType] = None max_new_tokens: Optional[int] = None + """Maximum number of tokens to generate.""" repetition_penalty: Optional[float] = None + """The parameter for repetition penalty. 1.0 means no penalty. + See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + """ return_full_text: Optional[bool] = None + """Whether to prepend the prompt to the generated text""" seed: Optional[int] = None + """Random sampling seed.""" stop: Optional[List[str]] = None + """Stop generating tokens if a member of `stop` is generated.""" temperature: Optional[float] = None + """The value used to module the logits distribution.""" top_k: Optional[int] = None + """The number of highest probability vocabulary tokens to keep for top-k-filtering.""" top_n_tokens: Optional[int] = None + """The number of highest probability vocabulary tokens to keep for top-n-filtering.""" top_p: Optional[float] = None + """Top-p value for nucleus sampling.""" truncate: Optional[int] = None + """Truncate inputs tokens to the given size.""" typical_p: Optional[float] = None + """Typical Decoding mass + See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) + for more information. 
+ """ watermark: Optional[bool] = None + """Watermarking with [A Watermark for Large Language + Models](https://arxiv.org/abs/2301.10226). + """ @dataclass diff --git a/tests/cassettes/InferenceClientVCRTest.test_chat_completion_with_response_format.yaml b/tests/cassettes/InferenceClientVCRTest.test_chat_completion_with_response_format.yaml new file mode 100644 index 0000000000..0e7e2fcbbf --- /dev/null +++ b/tests/cassettes/InferenceClientVCRTest.test_chat_completion_with_response_format.yaml @@ -0,0 +1,60 @@ +interactions: +- request: + body: '{"model": "meta-llama/Meta-Llama-3-70B-Instruct", "messages": [{"role": + "user", "content": "I saw a puppy a cat and a raccoon during my bike ride in + the park. What did I saw and when?"}], "frequency_penalty": null, "logit_bias": + null, "logprobs": null, "max_tokens": 500, "n": null, "presence_penalty": null, + "response_format": {"type": "json", "value": {"properties": {"location": {"type": + "string"}, "activity": {"type": "string"}, "animals_seen": {"type": "integer", + "minimum": 1, "maximum": 5}, "animals": {"type": "array", "items": {"type": + "string"}}}, "required": ["location", "activity", "animals_seen", "animals"]}}, + "seed": null, "stop": null, "temperature": null, "tool_choice": null, "tool_prompt": + null, "tools": null, "top_logprobs": null, "top_p": null, "stream": false}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, br + Connection: + - keep-alive + Content-Length: + - '785' + Content-Type: + - application/json + X-Amzn-Trace-Id: + - 64f96be3-d5ef-4a00-adf3-43b9d20d2ab1 + user-agent: + - unknown/None; hf_hub/0.24.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.15.0; + fastcore/1.5.23 + method: POST + uri: https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B-Instruct/v1/chat/completions + response: + body: + string: '{"object":"chat.completion","id":"","created":1720527901,"model":"meta-llama/Meta-Llama-3-70B-Instruct","system_fingerprint":"2.1.1-dev0-sha-4327210","choices":[{"index":0,"message":{"role":"assistant","content":"{ + \n\n\"activity\": \"bike ride\", \n\"animals\": [\"puppy\", \"cat\", \"raccoon\"], + \n\"animals_seen\": 3, \n\"location\": \"park\" \n}"},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":35,"completion_tokens":45,"total_tokens":80}}' + headers: + Connection: + - keep-alive + Content-Length: + - '468' + Content-Type: + - application/json + Date: + - Tue, 09 Jul 2024 12:47:17 GMT + access-control-allow-credentials: + - 'true' + vary: + - Origin, Access-Control-Request-Method, Access-Control-Request-Headers + x-compute-time: + - '1.772222068' + x-compute-type: + - cache + x-request-id: + - NltuLUIFa5XCw_HIQyQsk + x-sha: + - 7129260dd854a80eb10ace5f61c20324b472b31c + status: + code: 200 + message: OK +version: 1 diff --git a/tests/test_inference_async_client.py b/tests/test_inference_async_client.py index c9ad204ec3..822dab8986 100644 --- a/tests/test_inference_async_client.py +++ b/tests/test_inference_async_client.py @@ -164,7 +164,6 @@ async def test_async_chat_completion_no_stream() -> None: assert output == ChatCompletionOutput( id="", model="HuggingFaceH4/zephyr-7b-beta", - object="text_completion", system_fingerprint="1.4.3-sha-e6bb3ff", usage=ChatCompletionOutputUsage(completion_tokens=10, prompt_tokens=47, total_tokens=57), choices=[ @@ -191,7 +190,6 @@ async def test_async_chat_completion_not_tgi_no_stream() -> None: assert output == ChatCompletionOutput( id="dummy", model="dummy", - object="dummy", system_fingerprint="dummy", 
usage=None, choices=[ diff --git a/tests/test_inference_client.py b/tests/test_inference_client.py index 43c91ff3d2..2c453b0880 100644 --- a/tests/test_inference_client.py +++ b/tests/test_inference_client.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import io +import json import time import unittest from pathlib import Path @@ -145,6 +146,26 @@ }, ] +CHAT_COMPLETION_RESPONSE_FORMAT_MESSAGE = [ + { + "role": "user", + "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I saw and when?", + }, +] + +CHAT_COMPLETION_RESPONSE_FORMAT = { + "type": "json", + "value": { + "properties": { + "location": {"type": "string"}, + "activity": {"type": "string"}, + "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5}, + "animals": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["location", "activity", "animals_seen", "animals"], + }, +} + class InferenceClientTest(unittest.TestCase): @classmethod @@ -266,7 +287,6 @@ def test_chat_completion_with_non_tgi(self) -> None: assert output == ChatCompletionOutput( id="dummy", model="dummy", - object="dummy", system_fingerprint="dummy", usage=None, choices=[ @@ -328,6 +348,21 @@ def test_chat_completion_with_tool(self) -> None: "location": "San Francisco, CA", } + def test_chat_completion_with_response_format(self) -> None: + response = self.client.chat_completion( + model="meta-llama/Meta-Llama-3-70B-Instruct", + messages=CHAT_COMPLETION_RESPONSE_FORMAT_MESSAGE, + response_format=CHAT_COMPLETION_RESPONSE_FORMAT, + max_tokens=500, + ) + output = response.choices[0].message.content + assert json.loads(output) == { + "activity": "bike ride", + "animals": ["puppy", "cat", "raccoon"], + "animals_seen": 3, + "location": "park", + } + def test_chat_completion_unprocessable_entity(self) -> None: """Regression test for #2225.
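
The JSON-schema grammar exercised in `test_chat_completion_with_response_format` is one of two grammar kinds accepted by `ChatCompletionInputGrammarType`, whose `type` literal is `"json"` or `"regex"`. A minimal sketch of the regex variant, assuming the backing server interprets `value` as the regular expression that constrains the generated text (the model id and pattern are illustrative):

```py
from huggingface_hub import InferenceClient

client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")

# Constrain the answer to match a simple pattern instead of a JSON schema.
# Assumes the model is served by a Text-Generation-Inference server that supports regex grammars.
response = client.chat_completion(
    messages=[{"role": "user", "content": "Name one primary color."}],
    response_format={"type": "regex", "value": "(red|blue|yellow)"},
    max_tokens=10,
)
print(response.choices[0].message.content)
```
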