From f26447f74e14fb994c5195b58f5a2ba92a3e3d73 Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Tue, 7 May 2024 18:20:19 -0700 Subject: [PATCH 01/11] basic implementation of llama.cpp chat generation allows for constraining to json allows for function calling (not tested) streaming needs to be implemented when stream is set to true in generation_kwargs --- .../generators/llama_cpp/__init__.py | 3 +- .../llama_cpp/chat/chat_generator.py | 97 +++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/__init__.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/__init__.py index cac9235bd..10b20d363 100644 --- a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/__init__.py +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/__init__.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +from .chat.chat_generator import LlamaCppChatGenerator from .generator import LlamaCppGenerator -__all__ = ["LlamaCppGenerator"] +__all__ = ["LlamaCppGenerator", "LlamaCppChatGenerator"] diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py new file mode 100644 index 000000000..e1903ce38 --- /dev/null +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py @@ -0,0 +1,97 @@ +import logging +from typing import Any, Dict, List, Optional + +from haystack import component +from haystack.dataclasses import ChatMessage +from llama_cpp import Llama + +logger = logging.getLogger(__name__) + + +@component +class LlamaCppChatGenerator: + """ + Provides an interface to generate text using LLM via llama.cpp. + + [llama.cpp](https://github.com/ggerganov/llama.cpp) is a project written in C/C++ for efficient inference of LLMs. + It employs the quantized GGUF format, suitable for running these models on standard machines (even without GPUs). + + Usage example: + ```python + from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator + generator = LlamaCppGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512) + + print(generator.run("Who is the best American actor?", generation_kwargs={"max_tokens": 128})) + # {'replies': ['John Cusack'], 'meta': [{"object": "text_completion", ...}]} + ``` + """ + + def __init__( + self, + model: str, + n_ctx: Optional[int] = 0, + n_batch: Optional[int] = 512, + model_kwargs: Optional[Dict[str, Any]] = None, + generation_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + :param model: The path of a quantized model for text generation, for example, "zephyr-7b-beta.Q4_0.gguf". + If the model path is also specified in the `model_kwargs`, this parameter will be ignored. + :param n_ctx: The number of tokens in the context. When set to 0, the context will be taken from the model. + :param n_batch: Prompt processing maximum batch size. + :param model_kwargs: Dictionary containing keyword arguments used to initialize the LLM for text generation. + These keyword arguments provide fine-grained control over the model loading. 
+ In case of duplication, these kwargs override `model`, `n_ctx`, and `n_batch` init parameters. + For more information on the available kwargs, see + [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__). + :param generation_kwargs: A dictionary containing keyword arguments to customize text generation. + For more information on the available kwargs, see + [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion). + """ + + model_kwargs = model_kwargs or {} + generation_kwargs = generation_kwargs or {} + + # check if the model_kwargs contain the essential parameters + # otherwise, populate them with values from init parameters + model_kwargs.setdefault("model_path", model) + model_kwargs.setdefault("n_ctx", n_ctx) + model_kwargs.setdefault("n_batch", n_batch) + + self.model_path = model + self.n_ctx = n_ctx + self.n_batch = n_batch + self.model_kwargs = model_kwargs + self.generation_kwargs = generation_kwargs + self.model = None + + def warm_up(self): + if self.model is None: + self.model = Llama(**self.model_kwargs) + + @component.output_types(replies=List[ChatMessage]) + def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, Any]] = None): + """ + Run the text generation model on the given prompt. + + :param messages: + A list of ChatMessage instances representing the input messages. + :param generation_kwargs: A dictionary containing keyword arguments to customize text generation. + For more information on the available kwargs, see + [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion). + :returns: A dictionary with the following keys: + - `replies`: The responses from the model + """ + if self.model is None: + error_msg = "The model has not been loaded. Please call warm_up() before running." 
+ raise RuntimeError(error_msg) + + updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})} + formatted_messages = [msg.to_openai_format() for msg in messages] + + # Check if stream in generation_kwargs is set to True; handle streaming + + output = self.model.create_chat_completion(messages=formatted_messages, **updated_generation_kwargs) + replies = [ChatMessage.from_assistant(content=output["choices"][0]["message"]["content"])] + + return {"replies": replies} From 64792b51db7a24488fbfaa05776c2cde8e0e1083 Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Wed, 8 May 2024 22:32:26 -0700 Subject: [PATCH 02/11] add testing --- .../llama_cpp/chat/chat_generator.py | 31 ++- .../llama_cpp/tests/test_chat_generator.py | 253 ++++++++++++++++++ 2 files changed, 278 insertions(+), 6 deletions(-) create mode 100644 integrations/llama_cpp/tests/test_chat_generator.py diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py index e1903ce38..7261c9925 100644 --- a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List, Optional from haystack import component -from haystack.dataclasses import ChatMessage +from haystack.dataclasses import ChatMessage, ChatRole from llama_cpp import Llama logger = logging.getLogger(__name__) @@ -69,6 +69,10 @@ def warm_up(self): if self.model is None: self.model = Llama(**self.model_kwargs) + def stream_to_stdout(self, chunk): + """Print streamed data to stdout.""" + print(chunk.content, end='', flush=True) + @component.output_types(replies=List[ChatMessage]) def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, Any]] = None): """ @@ -86,12 +90,27 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, error_msg = "The model has not been loaded. Please call warm_up() before running." 
raise RuntimeError(error_msg) + if not messages: + return {"replies": []} + updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})} formatted_messages = [msg.to_openai_format() for msg in messages] - # Check if stream in generation_kwargs is set to True; handle streaming - - output = self.model.create_chat_completion(messages=formatted_messages, **updated_generation_kwargs) - replies = [ChatMessage.from_assistant(content=output["choices"][0]["message"]["content"])] - + response = self.model.create_chat_completion(messages=formatted_messages, **updated_generation_kwargs) + replies = [] + for choice in response["choices"]: + metadata = { + "response_id": response["id"], + "model": response["model"], + "created": response["created"], + "index": choice["index"], + "finish_reason": choice["finish_reason"], + "usage": response["usage"], + } + + content = choice["message"]["content"] + role = choice["message"]["role"].upper() + + chat_message = ChatMessage(content=content, role=ChatRole[role], name=None, meta=metadata) + replies.append(chat_message) return {"replies": replies} diff --git a/integrations/llama_cpp/tests/test_chat_generator.py b/integrations/llama_cpp/tests/test_chat_generator.py new file mode 100644 index 000000000..898b584c4 --- /dev/null +++ b/integrations/llama_cpp/tests/test_chat_generator.py @@ -0,0 +1,253 @@ +import os +import urllib.request +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.dynamic_chat_prompt_builder import DynamicChatPromptBuilder +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.dataclasses import ChatMessage, ChatRole +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator + + +@pytest.fixture +def model_path(): + return Path(__file__).parent / "models" + + +def download_file(file_link, filename, capsys): + # Checks if the file already exists before downloading + if not os.path.isfile(filename): + urllib.request.urlretrieve(file_link, filename) # noqa: S310 + with capsys.disabled(): + print("\nModel file downloaded successfully.") + else: + with capsys.disabled(): + print("\nModel file already exists.") + + +class TestLlamaCppChatGenerator: + @pytest.fixture + def generator(self, model_path, capsys): + gguf_model_path = ( + "https://huggingface.co/TheBloke/openchat-3.5-1210-GGUF/resolve/main/openchat-3.5-1210.Q3_K_S.gguf" + ) + filename = "openchat-3.5-1210.Q3_K_S.gguf" + + # Download GGUF model from HuggingFace + download_file(gguf_model_path, str(model_path / filename), capsys) + + model_path = str(model_path / filename) + generator = LlamaCppChatGenerator(model=model_path, n_ctx=128, n_batch=128) + generator.warm_up() + return generator + + @pytest.fixture + def generator_mock(self): + mock_model = MagicMock() + generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=2048, n_batch=512) + generator.model = mock_model + return generator, mock_model + + def test_default_init(self): + """ + Test default initialization parameters. 
+ """ + generator = LlamaCppChatGenerator(model="test_model.gguf") + + assert generator.model_path == "test_model.gguf" + assert generator.n_ctx == 0 + assert generator.n_batch == 512 + assert generator.model_kwargs == {"model_path": "test_model.gguf", "n_ctx": 0, "n_batch": 512} + assert generator.generation_kwargs == {} + + def test_custom_init(self): + """ + Test custom initialization parameters. + """ + generator = LlamaCppChatGenerator( + model="test_model.gguf", + n_ctx=2048, + n_batch=512, + ) + + assert generator.model_path == "test_model.gguf" + assert generator.n_ctx == 2048 + assert generator.n_batch == 512 + assert generator.model_kwargs == {"model_path": "test_model.gguf", "n_ctx": 2048, "n_batch": 512} + assert generator.generation_kwargs == {} + + def test_ignores_model_path_if_specified_in_model_kwargs(self): + """ + Test that model_path is ignored if already specified in model_kwargs. + """ + generator = LlamaCppChatGenerator( + model="test_model.gguf", + n_ctx=512, + n_batch=512, + model_kwargs={"model_path": "other_model.gguf"}, + ) + assert generator.model_kwargs["model_path"] == "other_model.gguf" + + def test_ignores_n_ctx_if_specified_in_model_kwargs(self): + """ + Test that n_ctx is ignored if already specified in model_kwargs. + """ + generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=512, n_batch=512, model_kwargs={"n_ctx": 1024}) + assert generator.model_kwargs["n_ctx"] == 1024 + + def test_ignores_n_batch_if_specified_in_model_kwargs(self): + """ + Test that n_batch is ignored if already specified in model_kwargs. + """ + generator = LlamaCppChatGenerator( + model="test_model.gguf", n_ctx=512, n_batch=512, model_kwargs={"n_batch": 1024} + ) + assert generator.model_kwargs["n_batch"] == 1024 + + def test_raises_error_without_warm_up(self): + """ + Test that the generator raises an error if warm_up() is not called before running. + """ + generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=512, n_batch=512) + with pytest.raises(RuntimeError): + generator.run("What is the capital of China?") + + def test_run_with_empty_message(self, generator_mock): + """ + Test that an empty message returns an empty list of replies. + """ + generator, _ = generator_mock + result = generator.run([]) + assert isinstance(result["replies"], list) + assert len(result["replies"]) == 0 + + def test_run_with_valid_message(self, generator_mock): + """ + Test that a valid message returns a list of replies. + """ + generator, mock_model = generator_mock + mock_output = { + "id": "unique-id-123", + "model": "Test Model Path", + "created": 1715226164, + "choices": [ + {"index": 0, "message": {"content": "Generated text", "role": "assistant"}, "finish_reason": "stop"} + ], + "usage": {"prompt_tokens": 14, "completion_tokens": 57, "total_tokens": 71}, + } + mock_model.create_chat_completion.return_value = mock_output + result = generator.run(messages=[ChatMessage.from_system("Test")]) + assert isinstance(result["replies"], list) + assert len(result["replies"]) == 1 + assert isinstance(result["replies"][0], ChatMessage) + assert result["replies"][0].content == "Generated text" + assert result["replies"][0].role == ChatRole.ASSISTANT + + def test_run_with_generation_kwargs(self, generator_mock): + """ + Test that a valid message and generation kwargs returns a list of replies. 
+ """ + generator, mock_model = generator_mock + mock_output = { + "id": "unique-id-123", + "model": "Test Model Path", + "created": 1715226164, + "choices": [ + {"index": 0, "message": {"content": "Generated text", "role": "assistant"}, "finish_reason": "length"} + ], + "usage": {"prompt_tokens": 14, "completion_tokens": 57, "total_tokens": 71}, + } + mock_model.create_chat_completion.return_value = mock_output + generation_kwargs = {"max_tokens": 128} + result = generator.run([ChatMessage.from_system("Write a 200 word paragraph.")], generation_kwargs) + assert result["replies"][0].content == "Generated text" + assert result["replies"][0].meta["finish_reason"] == "length" + + @pytest.mark.integration + def test_run(self, generator): + """ + Test that a valid message returns a list of replies. + """ + questions_and_answers = [ + ("What's the capital of France?", "Paris"), + ("What is the capital of Canada?", "Ottawa"), + ("What is the capital of Ghana?", "Accra"), + ] + + for question, answer in questions_and_answers: + chat_message = ChatMessage.from_system( + f"GPT4 Correct User: Answer in a single word. {question} <|end_of_turn|>\n GPT4 Correct Assistant:" + ) + result = generator.run([chat_message]) + + assert "replies" in result + assert isinstance(result["replies"], list) + assert len(result["replies"]) > 0 + assert any(answer.lower() in reply.content.lower() for reply in result["replies"]) + + @pytest.mark.integration + def test_run_rag_pipeline(self, generator): + """ + Test that a valid message returns a list of replies. + """ + user_message = ( + ChatMessage.from_user + ) = """GPT4 Correct User: Answer the question in a single word. {{question}} + Context: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|end_of_turn|> + GPT4 Correct Assistant: + """ + rag_pipeline = Pipeline() + rag_pipeline.add_component( + instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore(), top_k=1), name="retriever" + ) + rag_pipeline.add_component( + instance=DynamicChatPromptBuilder(runtime_variables=["query", "documents"]), name="prompt_builder" + ) + rag_pipeline.add_component(instance=generator, name="llm") + rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + rag_pipeline.connect("retriever", "prompt_builder.documents") + rag_pipeline.connect("prompt_builder", "llm") + rag_pipeline.connect("llm.replies", "answer_builder.replies") + rag_pipeline.connect("retriever", "answer_builder.documents") + + # Populate the document store + documents = [ + Document(content="The capital of France is Paris."), + Document(content="The capital of Canada is Ottawa."), + Document(content="The capital of Ghana is Accra."), + ] + rag_pipeline.get_component("retriever").document_store.write_documents(documents) + + # Query and assert + questions_and_answers = [ + ("What's the capital of France?", "Paris"), + ("What is the capital of Canada?", "Ottawa"), + ("What is the capital of Ghana?", "Accra"), + ] + + for question, answer in questions_and_answers: + result = rag_pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": { + "prompt_source": [user_message], + "query": question, + }, + "llm": {"generation_kwargs": {"temperature": 0.1}}, + } + ) + + assert len(result["answer_builder"]["answers"]) == 1 + generated_answer = result["answer_builder"]["answers"][0] + assert answer.lower() in generated_answer.data.lower() + assert generated_answer.query == question + assert hasattr(generated_answer, "documents") + assert hasattr(generated_answer, 
"meta") From 4a85312eefe811d7afbd785e76b95ff98ba63e5e Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Wed, 8 May 2024 22:33:42 -0700 Subject: [PATCH 03/11] remove unnecessary function --- .../components/generators/llama_cpp/chat/chat_generator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py index 7261c9925..7ea26535c 100644 --- a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py @@ -69,10 +69,6 @@ def warm_up(self): if self.model is None: self.model = Llama(**self.model_kwargs) - def stream_to_stdout(self, chunk): - """Print streamed data to stdout.""" - print(chunk.content, end='', flush=True) - @component.output_types(replies=List[ChatMessage]) def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, Any]] = None): """ From 6f58ad496839645733446f506c756b0c91129f03 Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Wed, 8 May 2024 22:47:41 -0700 Subject: [PATCH 04/11] slight documentation fix, comment out broken test --- .../llama_cpp/chat/chat_generator.py | 9 +- .../llama_cpp/tests/test_chat_generator.py | 124 +++++++++--------- 2 files changed, 67 insertions(+), 66 deletions(-) diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py index 7ea26535c..232bce8f3 100644 --- a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py @@ -18,11 +18,12 @@ class LlamaCppChatGenerator: Usage example: ```python - from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator + from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator + user_message = [ChatMessage.from_user("Who is the best American actor?")] generator = LlamaCppGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512) - print(generator.run("Who is the best American actor?", generation_kwargs={"max_tokens": 128})) - # {'replies': ['John Cusack'], 'meta': [{"object": "text_completion", ...}]} + print(generator.run(user_message, generation_kwargs={"max_tokens": 128})) + # {'replies': [ChatMessage(content='John Cusack', role=, name=None, meta={...}]} ``` """ @@ -72,7 +73,7 @@ def warm_up(self): @component.output_types(replies=List[ChatMessage]) def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, Any]] = None): """ - Run the text generation model on the given prompt. + Run the text generation model on the given list of ChatMessages. :param messages: A list of ChatMessage instances representing the input messages. 
diff --git a/integrations/llama_cpp/tests/test_chat_generator.py b/integrations/llama_cpp/tests/test_chat_generator.py index 898b584c4..703bb8d3b 100644 --- a/integrations/llama_cpp/tests/test_chat_generator.py +++ b/integrations/llama_cpp/tests/test_chat_generator.py @@ -189,65 +189,65 @@ def test_run(self, generator): assert len(result["replies"]) > 0 assert any(answer.lower() in reply.content.lower() for reply in result["replies"]) - @pytest.mark.integration - def test_run_rag_pipeline(self, generator): - """ - Test that a valid message returns a list of replies. - """ - user_message = ( - ChatMessage.from_user - ) = """GPT4 Correct User: Answer the question in a single word. {{question}} - Context: - {% for doc in documents %} - {{ doc.content }} - {% endfor %} - <|end_of_turn|> - GPT4 Correct Assistant: - """ - rag_pipeline = Pipeline() - rag_pipeline.add_component( - instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore(), top_k=1), name="retriever" - ) - rag_pipeline.add_component( - instance=DynamicChatPromptBuilder(runtime_variables=["query", "documents"]), name="prompt_builder" - ) - rag_pipeline.add_component(instance=generator, name="llm") - rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") - rag_pipeline.connect("retriever", "prompt_builder.documents") - rag_pipeline.connect("prompt_builder", "llm") - rag_pipeline.connect("llm.replies", "answer_builder.replies") - rag_pipeline.connect("retriever", "answer_builder.documents") - - # Populate the document store - documents = [ - Document(content="The capital of France is Paris."), - Document(content="The capital of Canada is Ottawa."), - Document(content="The capital of Ghana is Accra."), - ] - rag_pipeline.get_component("retriever").document_store.write_documents(documents) - - # Query and assert - questions_and_answers = [ - ("What's the capital of France?", "Paris"), - ("What is the capital of Canada?", "Ottawa"), - ("What is the capital of Ghana?", "Accra"), - ] - - for question, answer in questions_and_answers: - result = rag_pipeline.run( - { - "retriever": {"query": question}, - "prompt_builder": { - "prompt_source": [user_message], - "query": question, - }, - "llm": {"generation_kwargs": {"temperature": 0.1}}, - } - ) - - assert len(result["answer_builder"]["answers"]) == 1 - generated_answer = result["answer_builder"]["answers"][0] - assert answer.lower() in generated_answer.data.lower() - assert generated_answer.query == question - assert hasattr(generated_answer, "documents") - assert hasattr(generated_answer, "meta") + # @pytest.mark.integration + # def test_run_rag_pipeline(self, generator): + # """ + # Test that a valid message returns a list of replies. + # """ + # user_message = ( + # ChatMessage.from_user + # ) = """GPT4 Correct User: Answer the question in a single word. 
{{question}} + # Context: + # {% for doc in documents %} + # {{ doc.content }} + # {% endfor %} + # <|end_of_turn|> + # GPT4 Correct Assistant: + # """ + # rag_pipeline = Pipeline() + # rag_pipeline.add_component( + # instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore(), top_k=1), name="retriever" + # ) + # rag_pipeline.add_component( + # instance=DynamicChatPromptBuilder(runtime_variables=["query", "documents"]), name="prompt_builder" + # ) + # rag_pipeline.add_component(instance=generator, name="llm") + # rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + # rag_pipeline.connect("retriever", "prompt_builder.documents") + # rag_pipeline.connect("prompt_builder", "llm") + # rag_pipeline.connect("llm.replies", "answer_builder.replies") + # rag_pipeline.connect("retriever", "answer_builder.documents") + + # # Populate the document store + # documents = [ + # Document(content="The capital of France is Paris."), + # Document(content="The capital of Canada is Ottawa."), + # Document(content="The capital of Ghana is Accra."), + # ] + # rag_pipeline.get_component("retriever").document_store.write_documents(documents) + + # # Query and assert + # questions_and_answers = [ + # ("What's the capital of France?", "Paris"), + # ("What is the capital of Canada?", "Ottawa"), + # ("What is the capital of Ghana?", "Accra"), + # ] + + # for question, answer in questions_and_answers: + # result = rag_pipeline.run( + # { + # "retriever": {"query": question}, + # "prompt_builder": { + # "prompt_source": [user_message], + # "query": question, + # }, + # "llm": {"generation_kwargs": {"temperature": 0.1}}, + # } + # ) + + # assert len(result["answer_builder"]["answers"]) == 1 + # generated_answer = result["answer_builder"]["answers"][0] + # assert answer.lower() in generated_answer.data.lower() + # assert generated_answer.query == question + # assert hasattr(generated_answer, "documents") + # assert hasattr(generated_answer, "meta") From 788b3119c7df34018725ff9b603ce1b97805b8aa Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Sat, 11 May 2024 01:37:21 -0700 Subject: [PATCH 05/11] support for function calling through functionary also add a basic rag test --- integrations/llama_cpp/pyproject.toml | 5 +- .../llama_cpp/chat/chat_generator.py | 44 ++-- .../llama_cpp/tests/test_chat_generator.py | 201 +++++++++++------- 3 files changed, 162 insertions(+), 88 deletions(-) diff --git a/integrations/llama_cpp/pyproject.toml b/integrations/llama_cpp/pyproject.toml index 563af391d..20967a933 100644 --- a/integrations/llama_cpp/pyproject.toml +++ b/integrations/llama_cpp/pyproject.toml @@ -28,7 +28,10 @@ classifiers = [ ] dependencies = [ "haystack-ai", - "llama-cpp-python" + "llama-cpp-python", + "transformers", + "sentencepiece", + "protobuf" ] [project.urls] diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py index 232bce8f3..32797cc2e 100644 --- a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py @@ -1,9 +1,11 @@ +import json import logging from typing import Any, Dict, List, Optional from haystack import component from haystack.dataclasses import ChatMessage, ChatRole from llama_cpp import Llama +from 
llama_cpp.llama_tokenizer import LlamaHFTokenizer logger = logging.getLogger(__name__) @@ -53,6 +55,10 @@ def __init__( model_kwargs = model_kwargs or {} generation_kwargs = generation_kwargs or {} + if 'hf_tokenizer_path' in model_kwargs: + tokenizer = LlamaHFTokenizer.from_pretrained(model_kwargs['hf_tokenizer_path']) + model_kwargs['tokenizer'] = tokenizer + # check if the model_kwargs contain the essential parameters # otherwise, populate them with values from init parameters model_kwargs.setdefault("model_path", model) @@ -94,20 +100,26 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, formatted_messages = [msg.to_openai_format() for msg in messages] response = self.model.create_chat_completion(messages=formatted_messages, **updated_generation_kwargs) - replies = [] - for choice in response["choices"]: - metadata = { - "response_id": response["id"], - "model": response["model"], - "created": response["created"], - "index": choice["index"], - "finish_reason": choice["finish_reason"], - "usage": response["usage"], - } - - content = choice["message"]["content"] - role = choice["message"]["role"].upper() - - chat_message = ChatMessage(content=content, role=ChatRole[role], name=None, meta=metadata) - replies.append(chat_message) + replies = [ + ChatMessage( + content=choice["message"]["content"], + role=ChatRole[choice["message"]["role"].upper()], + name=None, + meta={ + "response_id": response["id"], + "model": response["model"], + "created": response["created"], + "index": choice["index"], + "finish_reason": choice["finish_reason"], + "usage": response["usage"], + }, + ) + for choice in response["choices"] + ] + + for reply, choice in zip(replies, response["choices"]): + tool_calls = choice.get("message", {}).get("tool_calls", []) + if tool_calls: + reply.meta["tool_calls"] = tool_calls + reply.name = tool_calls[0]["function"]["name"] if tool_calls else None return {"replies": replies} diff --git a/integrations/llama_cpp/tests/test_chat_generator.py b/integrations/llama_cpp/tests/test_chat_generator.py index 703bb8d3b..e37dd0bac 100644 --- a/integrations/llama_cpp/tests/test_chat_generator.py +++ b/integrations/llama_cpp/tests/test_chat_generator.py @@ -1,3 +1,4 @@ +import json import os import urllib.request from pathlib import Path @@ -5,7 +6,6 @@ import pytest from haystack import Document, Pipeline -from haystack.components.builders.answer_builder import AnswerBuilder from haystack.components.builders.dynamic_chat_prompt_builder import DynamicChatPromptBuilder from haystack.components.retrievers.in_memory import InMemoryBM25Retriever from haystack.dataclasses import ChatMessage, ChatRole @@ -41,7 +41,7 @@ def generator(self, model_path, capsys): download_file(gguf_model_path, str(model_path / filename), capsys) model_path = str(model_path / filename) - generator = LlamaCppChatGenerator(model=model_path, n_ctx=128, n_batch=128) + generator = LlamaCppChatGenerator(model=model_path, n_ctx=8192, n_batch=512) generator.warm_up() return generator @@ -70,14 +70,14 @@ def test_custom_init(self): """ generator = LlamaCppChatGenerator( model="test_model.gguf", - n_ctx=2048, + n_ctx=8192, n_batch=512, ) assert generator.model_path == "test_model.gguf" - assert generator.n_ctx == 2048 + assert generator.n_ctx == 8192 assert generator.n_batch == 512 - assert generator.model_kwargs == {"model_path": "test_model.gguf", "n_ctx": 2048, "n_batch": 512} + assert generator.model_kwargs == {"model_path": "test_model.gguf", "n_ctx": 8192, "n_batch": 512} assert 
generator.generation_kwargs == {} def test_ignores_model_path_if_specified_in_model_kwargs(self): @@ -86,7 +86,7 @@ def test_ignores_model_path_if_specified_in_model_kwargs(self): """ generator = LlamaCppChatGenerator( model="test_model.gguf", - n_ctx=512, + n_ctx=8192, n_batch=512, model_kwargs={"model_path": "other_model.gguf"}, ) @@ -96,15 +96,15 @@ def test_ignores_n_ctx_if_specified_in_model_kwargs(self): """ Test that n_ctx is ignored if already specified in model_kwargs. """ - generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=512, n_batch=512, model_kwargs={"n_ctx": 1024}) - assert generator.model_kwargs["n_ctx"] == 1024 + generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=512, n_batch=512, model_kwargs={"n_ctx": 8192}) + assert generator.model_kwargs["n_ctx"] == 8192 def test_ignores_n_batch_if_specified_in_model_kwargs(self): """ Test that n_batch is ignored if already specified in model_kwargs. """ generator = LlamaCppChatGenerator( - model="test_model.gguf", n_ctx=512, n_batch=512, model_kwargs={"n_batch": 1024} + model="test_model.gguf", n_ctx=8192, n_batch=512, model_kwargs={"n_batch": 1024} ) assert generator.model_kwargs["n_batch"] == 1024 @@ -189,65 +189,124 @@ def test_run(self, generator): assert len(result["replies"]) > 0 assert any(answer.lower() in reply.content.lower() for reply in result["replies"]) - # @pytest.mark.integration - # def test_run_rag_pipeline(self, generator): - # """ - # Test that a valid message returns a list of replies. - # """ - # user_message = ( - # ChatMessage.from_user - # ) = """GPT4 Correct User: Answer the question in a single word. {{question}} - # Context: - # {% for doc in documents %} - # {{ doc.content }} - # {% endfor %} - # <|end_of_turn|> - # GPT4 Correct Assistant: - # """ - # rag_pipeline = Pipeline() - # rag_pipeline.add_component( - # instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore(), top_k=1), name="retriever" - # ) - # rag_pipeline.add_component( - # instance=DynamicChatPromptBuilder(runtime_variables=["query", "documents"]), name="prompt_builder" - # ) - # rag_pipeline.add_component(instance=generator, name="llm") - # rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") - # rag_pipeline.connect("retriever", "prompt_builder.documents") - # rag_pipeline.connect("prompt_builder", "llm") - # rag_pipeline.connect("llm.replies", "answer_builder.replies") - # rag_pipeline.connect("retriever", "answer_builder.documents") - - # # Populate the document store - # documents = [ - # Document(content="The capital of France is Paris."), - # Document(content="The capital of Canada is Ottawa."), - # Document(content="The capital of Ghana is Accra."), - # ] - # rag_pipeline.get_component("retriever").document_store.write_documents(documents) - - # # Query and assert - # questions_and_answers = [ - # ("What's the capital of France?", "Paris"), - # ("What is the capital of Canada?", "Ottawa"), - # ("What is the capital of Ghana?", "Accra"), - # ] - - # for question, answer in questions_and_answers: - # result = rag_pipeline.run( - # { - # "retriever": {"query": question}, - # "prompt_builder": { - # "prompt_source": [user_message], - # "query": question, - # }, - # "llm": {"generation_kwargs": {"temperature": 0.1}}, - # } - # ) - - # assert len(result["answer_builder"]["answers"]) == 1 - # generated_answer = result["answer_builder"]["answers"][0] - # assert answer.lower() in generated_answer.data.lower() - # assert generated_answer.query == question - # assert 
hasattr(generated_answer, "documents") - # assert hasattr(generated_answer, "meta") + @pytest.mark.integration + def test_run_rag_pipeline(self, generator): + """ + Test that a valid message returns a list of replies. + """ + document_store = InMemoryDocumentStore() + documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors." + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves." + ), + ] + document_store.write_documents(documents=documents) + + pipeline = Pipeline() + pipeline.add_component( + instance=InMemoryBM25Retriever(document_store=document_store, top_k=1), + name="retriever", + ) + pipeline.add_component( + instance=DynamicChatPromptBuilder(runtime_variables=["query", "documents"]), name="prompt_builder" + ) + pipeline.add_component(instance=generator, name="llm") + pipeline.connect("retriever.documents", "prompt_builder.documents") + pipeline.connect("prompt_builder.prompt", "llm.messages") + + question = "How many languages are there?" + location = "Puerto Rico" + system_message = ChatMessage.from_system( + "You are a helpful assistant giving out valuable information to tourists." + ) + messages = [ + system_message, + ChatMessage.from_user( + """ + Given these documents and given that I am currently in {{ location }}, answer the question.\nDocuments: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + + \nQuestion: {{query}} + \nAnswer: + """ + ), + ] + question = "Can I see bioluminescent waves at my current location?" 
+ result = pipeline.run( + data={ + "retriever": {"query": question}, + "prompt_builder": { + "template_variables": {"location": location}, + "prompt_source": messages, + "query": question, + }, + } + ) + + replies = result['llm']['replies'] + assert len(replies) > 0 + assert any("bioluminescent waves" in reply.content for reply in replies) + assert all(reply.role == ChatRole.ASSISTANT for reply in replies) + + +class TestLlamaCppChatGeneratorFunctionCalls: + @pytest.fixture + def generator(self, model_path, capsys): + gguf_model_path = ( + "https://huggingface.co/meetkai/functionary-small-v2.4-GGUF/resolve/main/functionary-small-v2.4.Q4_0.gguf" + ) + filename = "functionary-small-v2.4.Q4_0.gguf" + download_file(gguf_model_path, str(model_path / filename), capsys) + model_path = str(model_path / filename) + hf_tokenizer_path = "meetkai/functionary-small-v2.4-GGUF" + generator = LlamaCppChatGenerator( + model=model_path, + n_ctx=8192, + n_batch=512, + model_kwargs={ + "chat_format": "functionary-v2", + "hf_tokenizer_path": hf_tokenizer_path, + }, + ) + generator.warm_up() + return generator + + @pytest.mark.integration + def test_function_call_scenario(self, generator): + tools = [ + { + "type": "function", + "function": { + "name": "get_user_info", + "parameters": { + "type": "object", + "properties": { + "username": {"type": "string", "description": "The username to retrieve information for."} + }, + "required": ["username"], + }, + "description": "Retrieves detailed information about a user.", + }, + } + ] + tool_choice = {"type": "function", "function": {"name": "get_user_info"}} + + messages = [ + ChatMessage.from_user("Get information for user john_doe"), + ] + generation_kwargs = {"tools": tools, "tool_choice": tool_choice} + + response = generator.run(messages=messages, generation_kwargs=generation_kwargs) + + assert "tool_calls" in response["replies"][0].meta + tool_calls = response["replies"][0].meta["tool_calls"] + assert len(tool_calls) > 0 + assert tool_calls[0]["function"]["name"] == "get_user_info" + assert "username" in json.loads(tool_calls[0]["function"]["arguments"]) + assert response["replies"][0].role == ChatRole.ASSISTANT From 40bf82b3f2f1e8c2ed2a9c81506caa5ebeafdc8c Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Sat, 11 May 2024 16:03:38 -0700 Subject: [PATCH 06/11] add function calling and execute test, it works! 
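
A rough sketch of the round trip this test exercises (model filename, tokenizer repo, and the get_current_temperature helper mirror the test's stand-ins and are placeholders, not a fixed API):

```python
import json

from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator

# Placeholder paths: any functionary-style GGUF model that emits OpenAI-compatible tool_calls.
generator = LlamaCppChatGenerator(
    model="functionary-small-v2.4.Q4_0.gguf",
    n_ctx=8192,
    model_kwargs={"chat_format": "functionary-v2", "hf_tokenizer_path": "meetkai/functionary-small-v2.4-GGUF"},
)
generator.warm_up()


def get_current_temperature(location):
    # Local stand-in for a real weather lookup, as in the test.
    return json.dumps({"location": location, "temperature": "72", "unit": "fahrenheit"})


tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_temperature",
            "description": "Get the current temperature in a given location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        },
    }
]

messages = [ChatMessage.from_user("What's the weather like in San Francisco?")]
reply = generator.run(messages=messages, generation_kwargs={"tools": tools})["replies"][0]

# Execute each requested tool call and feed the result back as a function message.
# (The model may also answer directly without calling the tool.)
for tool_call in reply.meta.get("tool_calls", []):
    name = tool_call["function"]["name"]
    args = json.loads(tool_call["function"]["arguments"])
    messages.append(ChatMessage.from_function(get_current_temperature(**args), name))

# Second pass: the model turns the tool output into a natural-language answer.
print(generator.run(messages=messages)["replies"][0].content)
```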
--- .../llama_cpp/chat/chat_generator.py | 9 +-- .../llama_cpp/tests/test_chat_generator.py | 76 +++++++++++++++++-- 2 files changed, 73 insertions(+), 12 deletions(-) diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py index 32797cc2e..e305c2a3d 100644 --- a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py @@ -1,4 +1,3 @@ -import json import logging from typing import Any, Dict, List, Optional @@ -25,7 +24,7 @@ class LlamaCppChatGenerator: generator = LlamaCppGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512) print(generator.run(user_message, generation_kwargs={"max_tokens": 128})) - # {'replies': [ChatMessage(content='John Cusack', role=, name=None, meta={...}]} + # {"replies": [ChatMessage(content="John Cusack", role=, name=None, meta={...}]} ``` """ @@ -55,9 +54,9 @@ def __init__( model_kwargs = model_kwargs or {} generation_kwargs = generation_kwargs or {} - if 'hf_tokenizer_path' in model_kwargs: - tokenizer = LlamaHFTokenizer.from_pretrained(model_kwargs['hf_tokenizer_path']) - model_kwargs['tokenizer'] = tokenizer + if "hf_tokenizer_path" in model_kwargs: + tokenizer = LlamaHFTokenizer.from_pretrained(model_kwargs["hf_tokenizer_path"]) + model_kwargs["tokenizer"] = tokenizer # check if the model_kwargs contain the essential parameters # otherwise, populate them with values from init parameters diff --git a/integrations/llama_cpp/tests/test_chat_generator.py b/integrations/llama_cpp/tests/test_chat_generator.py index e37dd0bac..c3ed8ea54 100644 --- a/integrations/llama_cpp/tests/test_chat_generator.py +++ b/integrations/llama_cpp/tests/test_chat_generator.py @@ -198,10 +198,12 @@ def test_run_rag_pipeline(self, generator): documents = [ Document(content="There are over 7,000 languages spoken around the world today."), Document( - content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors." + content="""Elephants have been observed to behave in a way that indicates a high + level of self-awareness, such as recognizing themselves in mirrors.""" ), Document( - content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves." 
+ content="""In certain parts of the world, like the Maldives, Puerto Rico, + and San Diego, you can witness the phenomenon of bioluminescent waves.""" ), ] document_store.write_documents(documents=documents) @@ -249,13 +251,24 @@ def test_run_rag_pipeline(self, generator): } ) - replies = result['llm']['replies'] + replies = result["llm"]["replies"] assert len(replies) > 0 assert any("bioluminescent waves" in reply.content for reply in replies) assert all(reply.role == ChatRole.ASSISTANT for reply in replies) class TestLlamaCppChatGeneratorFunctionCalls: + def get_current_temperature(self, location): + """Get the current temperature in a given location""" + if "tokyo" in location.lower(): + return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"}) + elif "san francisco" in location.lower(): + return json.dumps({"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}) + elif "paris" in location.lower(): + return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"}) + else: + return json.dumps({"location": location, "temperature": "unknown"}) + @pytest.fixture def generator(self, model_path, capsys): gguf_model_path = ( @@ -278,7 +291,7 @@ def generator(self, model_path, capsys): return generator @pytest.mark.integration - def test_function_call_scenario(self, generator): + def test_function_call(self, generator): tools = [ { "type": "function", @@ -300,9 +313,7 @@ def test_function_call_scenario(self, generator): messages = [ ChatMessage.from_user("Get information for user john_doe"), ] - generation_kwargs = {"tools": tools, "tool_choice": tool_choice} - - response = generator.run(messages=messages, generation_kwargs=generation_kwargs) + response = generator.run(messages=messages, generation_kwargs={"tools": tools, "tool_choice": tool_choice}) assert "tool_calls" in response["replies"][0].meta tool_calls = response["replies"][0].meta["tool_calls"] @@ -310,3 +321,54 @@ def test_function_call_scenario(self, generator): assert tool_calls[0]["function"]["name"] == "get_user_info" assert "username" in json.loads(tool_calls[0]["function"]["arguments"]) assert response["replies"][0].role == ChatRole.ASSISTANT + + def test_function_call_and_execute(self, generator): + messages = [ChatMessage.from_user("What's the weather like in San Francisco?")] + tools = [ + { + "type": "function", + "function": { + "name": "get_current_temperature", + "description": "Get the current temperature in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. 
San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location"], + }, + }, + } + ] + + response = generator.run(messages=messages, generation_kwargs={"tools": tools}) + + available_functions = { + "get_current_temperature": self.get_current_temperature, + } + + assert "replies" in response + assert len(response["replies"]) > 0 + + first_reply = response["replies"][0] + assert "tool_calls" in first_reply.meta + tool_calls = first_reply.meta["tool_calls"] + + for tool_call in tool_calls: + function_name = tool_call["function"]["name"] + function_args = json.loads(tool_call["function"]["arguments"]) + assert function_name in available_functions + function_response = available_functions[function_name](**function_args) + function_message = ChatMessage.from_function(function_response, function_name) + messages.append(function_message) + + second_response = generator.run(messages=messages) + print(second_response) + assert "replies" in second_response + assert len(second_response["replies"]) > 0 + assert any("current temperature" in reply.content for reply in second_response["replies"]) + assert any("72" in reply.content for reply in second_response["replies"]) From 93a4dd64e37160740a11f4e675b1df665cf7079b Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Sat, 11 May 2024 22:27:44 -0700 Subject: [PATCH 07/11] add json test, add chatml test --- .../llama_cpp/tests/test_chat_generator.py | 111 +++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/integrations/llama_cpp/tests/test_chat_generator.py b/integrations/llama_cpp/tests/test_chat_generator.py index c3ed8ea54..a1dd41abd 100644 --- a/integrations/llama_cpp/tests/test_chat_generator.py +++ b/integrations/llama_cpp/tests/test_chat_generator.py @@ -256,8 +256,53 @@ def test_run_rag_pipeline(self, generator): assert any("bioluminescent waves" in reply.content for reply in replies) assert all(reply.role == ChatRole.ASSISTANT for reply in replies) + @pytest.mark.integration + def test_json_constraining(self, generator): + """ + Test that the generator can output valid JSON. + """ + messages = [ChatMessage.from_system("Output valid json only. 
List 2 people with their name and age.")] + json_schema = { + "type": "object", + "properties": { + "people": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "number"}, + }, + }, + }, + }, + "required": ["people"], + } -class TestLlamaCppChatGeneratorFunctionCalls: + result = generator.run( + messages=messages, + generation_kwargs={ + "response_format": {"type": "json_object", "schema": json_schema}, + }, + ) + + assert "replies" in result + assert isinstance(result["replies"], list) + assert len(result["replies"]) > 0 + assert all(reply.role == ChatRole.ASSISTANT for reply in result["replies"]) + for reply in result["replies"]: + assert json.loads(reply.content) + assert isinstance(json.loads(reply.content), dict) + assert "people" in json.loads(reply.content) + assert isinstance(json.loads(reply.content)["people"], list) + assert all(isinstance(person, dict) for person in json.loads(reply.content)["people"]) + assert all("name" in person for person in json.loads(reply.content)["people"]) + assert all("age" in person for person in json.loads(reply.content)["people"]) + assert all(isinstance(person["name"], str) for person in json.loads(reply.content)["people"]) + assert all(isinstance(person["age"], int) for person in json.loads(reply.content)["people"]) + + +class TestLlamaCppChatGeneratorFunctionary: def get_current_temperature(self, location): """Get the current temperature in a given location""" if "tokyo" in location.lower(): @@ -372,3 +417,67 @@ def test_function_call_and_execute(self, generator): assert len(second_response["replies"]) > 0 assert any("current temperature" in reply.content for reply in second_response["replies"]) assert any("72" in reply.content for reply in second_response["replies"]) + + +class TestLlamaCppChatGeneratorChatML: + + @pytest.fixture + def generator(self, model_path, capsys): + gguf_model_path = ( + "https://huggingface.co/TheBloke/openchat-3.5-1210-GGUF/resolve/main/openchat-3.5-1210.Q3_K_S.gguf" + ) + filename = "openchat-3.5-1210.Q3_K_S.gguf" + download_file(gguf_model_path, str(model_path / filename), capsys) + model_path = str(model_path / filename) + generator = LlamaCppChatGenerator( + model=model_path, + n_ctx=8192, + n_batch=512, + model_kwargs={ + "chat_format": "chatml-function-calling", + }, + ) + generator.warm_up() + return generator + + @pytest.mark.integration + def test_function_call_chatml(self, generator): + messages = [ + ChatMessage.from_system( + """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, + detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate + input when necessary""" + ), + ChatMessage.from_user("Extract Jason is 25 years old"), + ] + + tools = [ + { + "type": "function", + "function": { + "name": "UserDetail", + "parameters": { + "type": "object", + "title": "UserDetail", + "properties": { + "name": {"title": "Name", "type": "string"}, + "age": {"title": "Age", "type": "integer"}, + }, + "required": ["name", "age"], + }, + }, + } + ] + + tool_choice = {"type": "function", "function": {"name": "UserDetail"}} + + response = generator.run(messages=messages, generation_kwargs={"tools": tools, "tool_choice": tool_choice}) + for reply in response["replies"]: + assert "tool_calls" in reply.meta + tool_calls = reply.meta["tool_calls"] + assert len(tool_calls) > 0 + assert tool_calls[0]["function"]["name"] == "UserDetail" + assert "name" in json.loads(tool_calls[0]["function"]["arguments"]) + assert "age" in json.loads(tool_calls[0]["function"]["arguments"]) + assert "Jason" in json.loads(tool_calls[0]["function"]["arguments"])["name"] + assert 25 == json.loads(tool_calls[0]["function"]["arguments"])["age"] From bdc23c9e7ae6f4fbff01df378dfc2c5a5ae30d4b Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Sat, 11 May 2024 22:44:07 -0700 Subject: [PATCH 08/11] make function call and execute more deterministic --- integrations/llama_cpp/tests/test_chat_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/llama_cpp/tests/test_chat_generator.py b/integrations/llama_cpp/tests/test_chat_generator.py index a1dd41abd..5666f109a 100644 --- a/integrations/llama_cpp/tests/test_chat_generator.py +++ b/integrations/llama_cpp/tests/test_chat_generator.py @@ -415,7 +415,7 @@ def test_function_call_and_execute(self, generator): print(second_response) assert "replies" in second_response assert len(second_response["replies"]) > 0 - assert any("current temperature" in reply.content for reply in second_response["replies"]) + assert any("San Francisco" in reply.content for reply in second_response["replies"]) assert any("72" in reply.content for reply in second_response["replies"]) From 0dda139279fde398d0f1168c0185d3eec9f7405a Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Mon, 13 May 2024 13:01:13 +0200 Subject: [PATCH 09/11] try removing additional deps --- integrations/llama_cpp/pyproject.toml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/integrations/llama_cpp/pyproject.toml b/integrations/llama_cpp/pyproject.toml index 20967a933..563af391d 100644 --- a/integrations/llama_cpp/pyproject.toml +++ b/integrations/llama_cpp/pyproject.toml @@ -28,10 +28,7 @@ classifiers = [ ] dependencies = [ "haystack-ai", - "llama-cpp-python", - "transformers", - "sentencepiece", - "protobuf" + "llama-cpp-python" ] [project.urls] From 9f8447edbc6b92c540e0ab46daa9be6a52a52c4e Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Mon, 13 May 2024 13:22:00 +0200 Subject: [PATCH 10/11] revert --- integrations/llama_cpp/pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/llama_cpp/pyproject.toml b/integrations/llama_cpp/pyproject.toml index 563af391d..50b8040b2 100644 --- a/integrations/llama_cpp/pyproject.toml +++ b/integrations/llama_cpp/pyproject.toml @@ -28,7 +28,8 @@ classifiers = [ ] dependencies = [ "haystack-ai", - "llama-cpp-python" + "llama-cpp-python", + "transformers[sentencepiece]" ] [project.urls] From 869c36c8c4436c21746c929d7932e6538b416221 Mon Sep 17 00:00:00 2001 From: 
Stefano Fiorucci Date: Mon, 13 May 2024 13:30:21 +0200 Subject: [PATCH 11/11] make transformers a tests-only dependency --- integrations/llama_cpp/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/llama_cpp/pyproject.toml b/integrations/llama_cpp/pyproject.toml index 50b8040b2..a90118ee4 100644 --- a/integrations/llama_cpp/pyproject.toml +++ b/integrations/llama_cpp/pyproject.toml @@ -28,8 +28,7 @@ classifiers = [ ] dependencies = [ "haystack-ai", - "llama-cpp-python", - "transformers[sentencepiece]" + "llama-cpp-python" ] [project.urls] @@ -53,6 +52,7 @@ dependencies = [ "coverage[toml]>=6.5", "pytest", "haystack-pydoc-tools", + "transformers[sentencepiece]" ] [tool.hatch.envs.default.scripts] test = "pytest {args:tests}"
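
A minimal usage sketch of the JSON-constrained generation covered by the new tests, assuming a locally downloaded GGUF model (the model filename and schema below are illustrative placeholders):

```python
import json

from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator

# Placeholder model path; any chat-capable GGUF model works here.
generator = LlamaCppChatGenerator(model="openchat-3.5-1210.Q3_K_S.gguf", n_ctx=2048)
generator.warm_up()

# Illustrative schema: a list of people, each with a name and an age.
json_schema = {
    "type": "object",
    "properties": {
        "people": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {"name": {"type": "string"}, "age": {"type": "number"}},
            },
        },
    },
    "required": ["people"],
}

messages = [ChatMessage.from_system("Output valid json only. List 2 people with their name and age.")]
result = generator.run(
    messages=messages,
    generation_kwargs={"response_format": {"type": "json_object", "schema": json_schema}},
)

# The reply content is valid JSON matching the schema, so it can be parsed directly.
print(json.loads(result["replies"][0].content))
```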