From f26447f74e14fb994c5195b58f5a2ba92a3e3d73 Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Tue, 7 May 2024 18:20:19 -0700 Subject: [PATCH 01/11] basic implementation of llama.cpp chat generation allows for constraining to json allows for function calling (not tested) streaming needs to be implemented when stream is set to true in generation_kwargs --- .../generators/llama_cpp/__init__.py | 3 +- .../llama_cpp/chat/chat_generator.py | 97 +++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/__init__.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/__init__.py index cac9235bd..10b20d363 100644 --- a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/__init__.py +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/__init__.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +from .chat.chat_generator import LlamaCppChatGenerator from .generator import LlamaCppGenerator -__all__ = ["LlamaCppGenerator"] +__all__ = ["LlamaCppGenerator", "LlamaCppChatGenerator"] diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py new file mode 100644 index 000000000..e1903ce38 --- /dev/null +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py @@ -0,0 +1,97 @@ +import logging +from typing import Any, Dict, List, Optional + +from haystack import component +from haystack.dataclasses import ChatMessage +from llama_cpp import Llama + +logger = logging.getLogger(__name__) + + +@component +class LlamaCppChatGenerator: + """ + Provides an interface to generate text using LLM via llama.cpp. + + [llama.cpp](https://github.com/ggerganov/llama.cpp) is a project written in C/C++ for efficient inference of LLMs. + It employs the quantized GGUF format, suitable for running these models on standard machines (even without GPUs). + + Usage example: + ```python + from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator + generator = LlamaCppGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512) + + print(generator.run("Who is the best American actor?", generation_kwargs={"max_tokens": 128})) + # {'replies': ['John Cusack'], 'meta': [{"object": "text_completion", ...}]} + ``` + """ + + def __init__( + self, + model: str, + n_ctx: Optional[int] = 0, + n_batch: Optional[int] = 512, + model_kwargs: Optional[Dict[str, Any]] = None, + generation_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + :param model: The path of a quantized model for text generation, for example, "zephyr-7b-beta.Q4_0.gguf". + If the model path is also specified in the `model_kwargs`, this parameter will be ignored. + :param n_ctx: The number of tokens in the context. When set to 0, the context will be taken from the model. + :param n_batch: Prompt processing maximum batch size. + :param model_kwargs: Dictionary containing keyword arguments used to initialize the LLM for text generation. + These keyword arguments provide fine-grained control over the model loading. 
+ In case of duplication, these kwargs override `model`, `n_ctx`, and `n_batch` init parameters. + For more information on the available kwargs, see + [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__). + :param generation_kwargs: A dictionary containing keyword arguments to customize text generation. + For more information on the available kwargs, see + [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion). + """ + + model_kwargs = model_kwargs or {} + generation_kwargs = generation_kwargs or {} + + # check if the model_kwargs contain the essential parameters + # otherwise, populate them with values from init parameters + model_kwargs.setdefault("model_path", model) + model_kwargs.setdefault("n_ctx", n_ctx) + model_kwargs.setdefault("n_batch", n_batch) + + self.model_path = model + self.n_ctx = n_ctx + self.n_batch = n_batch + self.model_kwargs = model_kwargs + self.generation_kwargs = generation_kwargs + self.model = None + + def warm_up(self): + if self.model is None: + self.model = Llama(**self.model_kwargs) + + @component.output_types(replies=List[ChatMessage]) + def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, Any]] = None): + """ + Run the text generation model on the given prompt. + + :param messages: + A list of ChatMessage instances representing the input messages. + :param generation_kwargs: A dictionary containing keyword arguments to customize text generation. + For more information on the available kwargs, see + [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion). + :returns: A dictionary with the following keys: + - `replies`: The responses from the model + """ + if self.model is None: + error_msg = "The model has not been loaded. Please call warm_up() before running." 
+ raise RuntimeError(error_msg) + + updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})} + formatted_messages = [msg.to_openai_format() for msg in messages] + + # Check if stream in generation_kwargs is set to True; handle streaming + + output = self.model.create_chat_completion(messages=formatted_messages, **updated_generation_kwargs) + replies = [ChatMessage.from_assistant(content=output["choices"][0]["message"]["content"])] + + return {"replies": replies} From 64792b51db7a24488fbfaa05776c2cde8e0e1083 Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Wed, 8 May 2024 22:32:26 -0700 Subject: [PATCH 02/11] add testing --- .../llama_cpp/chat/chat_generator.py | 31 ++- .../llama_cpp/tests/test_chat_generator.py | 253 ++++++++++++++++++ 2 files changed, 278 insertions(+), 6 deletions(-) create mode 100644 integrations/llama_cpp/tests/test_chat_generator.py diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py index e1903ce38..7261c9925 100644 --- a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List, Optional from haystack import component -from haystack.dataclasses import ChatMessage +from haystack.dataclasses import ChatMessage, ChatRole from llama_cpp import Llama logger = logging.getLogger(__name__) @@ -69,6 +69,10 @@ def warm_up(self): if self.model is None: self.model = Llama(**self.model_kwargs) + def stream_to_stdout(self, chunk): + """Print streamed data to stdout.""" + print(chunk.content, end='', flush=True) + @component.output_types(replies=List[ChatMessage]) def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, Any]] = None): """ @@ -86,12 +90,27 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, error_msg = "The model has not been loaded. Please call warm_up() before running." 
raise RuntimeError(error_msg) + if not messages: + return {"replies": []} + updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})} formatted_messages = [msg.to_openai_format() for msg in messages] - # Check if stream in generation_kwargs is set to True; handle streaming - - output = self.model.create_chat_completion(messages=formatted_messages, **updated_generation_kwargs) - replies = [ChatMessage.from_assistant(content=output["choices"][0]["message"]["content"])] - + response = self.model.create_chat_completion(messages=formatted_messages, **updated_generation_kwargs) + replies = [] + for choice in response["choices"]: + metadata = { + "response_id": response["id"], + "model": response["model"], + "created": response["created"], + "index": choice["index"], + "finish_reason": choice["finish_reason"], + "usage": response["usage"], + } + + content = choice["message"]["content"] + role = choice["message"]["role"].upper() + + chat_message = ChatMessage(content=content, role=ChatRole[role], name=None, meta=metadata) + replies.append(chat_message) return {"replies": replies} diff --git a/integrations/llama_cpp/tests/test_chat_generator.py b/integrations/llama_cpp/tests/test_chat_generator.py new file mode 100644 index 000000000..898b584c4 --- /dev/null +++ b/integrations/llama_cpp/tests/test_chat_generator.py @@ -0,0 +1,253 @@ +import os +import urllib.request +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.dynamic_chat_prompt_builder import DynamicChatPromptBuilder +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.dataclasses import ChatMessage, ChatRole +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator + + +@pytest.fixture +def model_path(): + return Path(__file__).parent / "models" + + +def download_file(file_link, filename, capsys): + # Checks if the file already exists before downloading + if not os.path.isfile(filename): + urllib.request.urlretrieve(file_link, filename) # noqa: S310 + with capsys.disabled(): + print("\nModel file downloaded successfully.") + else: + with capsys.disabled(): + print("\nModel file already exists.") + + +class TestLlamaCppChatGenerator: + @pytest.fixture + def generator(self, model_path, capsys): + gguf_model_path = ( + "https://huggingface.co/TheBloke/openchat-3.5-1210-GGUF/resolve/main/openchat-3.5-1210.Q3_K_S.gguf" + ) + filename = "openchat-3.5-1210.Q3_K_S.gguf" + + # Download GGUF model from HuggingFace + download_file(gguf_model_path, str(model_path / filename), capsys) + + model_path = str(model_path / filename) + generator = LlamaCppChatGenerator(model=model_path, n_ctx=128, n_batch=128) + generator.warm_up() + return generator + + @pytest.fixture + def generator_mock(self): + mock_model = MagicMock() + generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=2048, n_batch=512) + generator.model = mock_model + return generator, mock_model + + def test_default_init(self): + """ + Test default initialization parameters. 
+ """ + generator = LlamaCppChatGenerator(model="test_model.gguf") + + assert generator.model_path == "test_model.gguf" + assert generator.n_ctx == 0 + assert generator.n_batch == 512 + assert generator.model_kwargs == {"model_path": "test_model.gguf", "n_ctx": 0, "n_batch": 512} + assert generator.generation_kwargs == {} + + def test_custom_init(self): + """ + Test custom initialization parameters. + """ + generator = LlamaCppChatGenerator( + model="test_model.gguf", + n_ctx=2048, + n_batch=512, + ) + + assert generator.model_path == "test_model.gguf" + assert generator.n_ctx == 2048 + assert generator.n_batch == 512 + assert generator.model_kwargs == {"model_path": "test_model.gguf", "n_ctx": 2048, "n_batch": 512} + assert generator.generation_kwargs == {} + + def test_ignores_model_path_if_specified_in_model_kwargs(self): + """ + Test that model_path is ignored if already specified in model_kwargs. + """ + generator = LlamaCppChatGenerator( + model="test_model.gguf", + n_ctx=512, + n_batch=512, + model_kwargs={"model_path": "other_model.gguf"}, + ) + assert generator.model_kwargs["model_path"] == "other_model.gguf" + + def test_ignores_n_ctx_if_specified_in_model_kwargs(self): + """ + Test that n_ctx is ignored if already specified in model_kwargs. + """ + generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=512, n_batch=512, model_kwargs={"n_ctx": 1024}) + assert generator.model_kwargs["n_ctx"] == 1024 + + def test_ignores_n_batch_if_specified_in_model_kwargs(self): + """ + Test that n_batch is ignored if already specified in model_kwargs. + """ + generator = LlamaCppChatGenerator( + model="test_model.gguf", n_ctx=512, n_batch=512, model_kwargs={"n_batch": 1024} + ) + assert generator.model_kwargs["n_batch"] == 1024 + + def test_raises_error_without_warm_up(self): + """ + Test that the generator raises an error if warm_up() is not called before running. + """ + generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=512, n_batch=512) + with pytest.raises(RuntimeError): + generator.run("What is the capital of China?") + + def test_run_with_empty_message(self, generator_mock): + """ + Test that an empty message returns an empty list of replies. + """ + generator, _ = generator_mock + result = generator.run([]) + assert isinstance(result["replies"], list) + assert len(result["replies"]) == 0 + + def test_run_with_valid_message(self, generator_mock): + """ + Test that a valid message returns a list of replies. + """ + generator, mock_model = generator_mock + mock_output = { + "id": "unique-id-123", + "model": "Test Model Path", + "created": 1715226164, + "choices": [ + {"index": 0, "message": {"content": "Generated text", "role": "assistant"}, "finish_reason": "stop"} + ], + "usage": {"prompt_tokens": 14, "completion_tokens": 57, "total_tokens": 71}, + } + mock_model.create_chat_completion.return_value = mock_output + result = generator.run(messages=[ChatMessage.from_system("Test")]) + assert isinstance(result["replies"], list) + assert len(result["replies"]) == 1 + assert isinstance(result["replies"][0], ChatMessage) + assert result["replies"][0].content == "Generated text" + assert result["replies"][0].role == ChatRole.ASSISTANT + + def test_run_with_generation_kwargs(self, generator_mock): + """ + Test that a valid message and generation kwargs returns a list of replies. 
+ """ + generator, mock_model = generator_mock + mock_output = { + "id": "unique-id-123", + "model": "Test Model Path", + "created": 1715226164, + "choices": [ + {"index": 0, "message": {"content": "Generated text", "role": "assistant"}, "finish_reason": "length"} + ], + "usage": {"prompt_tokens": 14, "completion_tokens": 57, "total_tokens": 71}, + } + mock_model.create_chat_completion.return_value = mock_output + generation_kwargs = {"max_tokens": 128} + result = generator.run([ChatMessage.from_system("Write a 200 word paragraph.")], generation_kwargs) + assert result["replies"][0].content == "Generated text" + assert result["replies"][0].meta["finish_reason"] == "length" + + @pytest.mark.integration + def test_run(self, generator): + """ + Test that a valid message returns a list of replies. + """ + questions_and_answers = [ + ("What's the capital of France?", "Paris"), + ("What is the capital of Canada?", "Ottawa"), + ("What is the capital of Ghana?", "Accra"), + ] + + for question, answer in questions_and_answers: + chat_message = ChatMessage.from_system( + f"GPT4 Correct User: Answer in a single word. {question} <|end_of_turn|>\n GPT4 Correct Assistant:" + ) + result = generator.run([chat_message]) + + assert "replies" in result + assert isinstance(result["replies"], list) + assert len(result["replies"]) > 0 + assert any(answer.lower() in reply.content.lower() for reply in result["replies"]) + + @pytest.mark.integration + def test_run_rag_pipeline(self, generator): + """ + Test that a valid message returns a list of replies. + """ + user_message = ( + ChatMessage.from_user + ) = """GPT4 Correct User: Answer the question in a single word. {{question}} + Context: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|end_of_turn|> + GPT4 Correct Assistant: + """ + rag_pipeline = Pipeline() + rag_pipeline.add_component( + instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore(), top_k=1), name="retriever" + ) + rag_pipeline.add_component( + instance=DynamicChatPromptBuilder(runtime_variables=["query", "documents"]), name="prompt_builder" + ) + rag_pipeline.add_component(instance=generator, name="llm") + rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + rag_pipeline.connect("retriever", "prompt_builder.documents") + rag_pipeline.connect("prompt_builder", "llm") + rag_pipeline.connect("llm.replies", "answer_builder.replies") + rag_pipeline.connect("retriever", "answer_builder.documents") + + # Populate the document store + documents = [ + Document(content="The capital of France is Paris."), + Document(content="The capital of Canada is Ottawa."), + Document(content="The capital of Ghana is Accra."), + ] + rag_pipeline.get_component("retriever").document_store.write_documents(documents) + + # Query and assert + questions_and_answers = [ + ("What's the capital of France?", "Paris"), + ("What is the capital of Canada?", "Ottawa"), + ("What is the capital of Ghana?", "Accra"), + ] + + for question, answer in questions_and_answers: + result = rag_pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": { + "prompt_source": [user_message], + "query": question, + }, + "llm": {"generation_kwargs": {"temperature": 0.1}}, + } + ) + + assert len(result["answer_builder"]["answers"]) == 1 + generated_answer = result["answer_builder"]["answers"][0] + assert answer.lower() in generated_answer.data.lower() + assert generated_answer.query == question + assert hasattr(generated_answer, "documents") + assert hasattr(generated_answer, 
"meta") From 4a85312eefe811d7afbd785e76b95ff98ba63e5e Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Wed, 8 May 2024 22:33:42 -0700 Subject: [PATCH 03/11] remove unnecessary function --- .../components/generators/llama_cpp/chat/chat_generator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py index 7261c9925..7ea26535c 100644 --- a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py @@ -69,10 +69,6 @@ def warm_up(self): if self.model is None: self.model = Llama(**self.model_kwargs) - def stream_to_stdout(self, chunk): - """Print streamed data to stdout.""" - print(chunk.content, end='', flush=True) - @component.output_types(replies=List[ChatMessage]) def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, Any]] = None): """ From 6f58ad496839645733446f506c756b0c91129f03 Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Wed, 8 May 2024 22:47:41 -0700 Subject: [PATCH 04/11] slight documentation fix, comment out broken test --- .../llama_cpp/chat/chat_generator.py | 9 +- .../llama_cpp/tests/test_chat_generator.py | 124 +++++++++--------- 2 files changed, 67 insertions(+), 66 deletions(-) diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py index 7ea26535c..232bce8f3 100644 --- a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py @@ -18,11 +18,12 @@ class LlamaCppChatGenerator: Usage example: ```python - from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator + from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator + user_message = [ChatMessage.from_user("Who is the best American actor?")] generator = LlamaCppGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512) - print(generator.run("Who is the best American actor?", generation_kwargs={"max_tokens": 128})) - # {'replies': ['John Cusack'], 'meta': [{"object": "text_completion", ...}]} + print(generator.run(user_message, generation_kwargs={"max_tokens": 128})) + # {'replies': [ChatMessage(content='John Cusack', role=, name=None, meta={...}]} ``` """ @@ -72,7 +73,7 @@ def warm_up(self): @component.output_types(replies=List[ChatMessage]) def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, Any]] = None): """ - Run the text generation model on the given prompt. + Run the text generation model on the given list of ChatMessages. :param messages: A list of ChatMessage instances representing the input messages. 
diff --git a/integrations/llama_cpp/tests/test_chat_generator.py b/integrations/llama_cpp/tests/test_chat_generator.py index 898b584c4..703bb8d3b 100644 --- a/integrations/llama_cpp/tests/test_chat_generator.py +++ b/integrations/llama_cpp/tests/test_chat_generator.py @@ -189,65 +189,65 @@ def test_run(self, generator): assert len(result["replies"]) > 0 assert any(answer.lower() in reply.content.lower() for reply in result["replies"]) - @pytest.mark.integration - def test_run_rag_pipeline(self, generator): - """ - Test that a valid message returns a list of replies. - """ - user_message = ( - ChatMessage.from_user - ) = """GPT4 Correct User: Answer the question in a single word. {{question}} - Context: - {% for doc in documents %} - {{ doc.content }} - {% endfor %} - <|end_of_turn|> - GPT4 Correct Assistant: - """ - rag_pipeline = Pipeline() - rag_pipeline.add_component( - instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore(), top_k=1), name="retriever" - ) - rag_pipeline.add_component( - instance=DynamicChatPromptBuilder(runtime_variables=["query", "documents"]), name="prompt_builder" - ) - rag_pipeline.add_component(instance=generator, name="llm") - rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") - rag_pipeline.connect("retriever", "prompt_builder.documents") - rag_pipeline.connect("prompt_builder", "llm") - rag_pipeline.connect("llm.replies", "answer_builder.replies") - rag_pipeline.connect("retriever", "answer_builder.documents") - - # Populate the document store - documents = [ - Document(content="The capital of France is Paris."), - Document(content="The capital of Canada is Ottawa."), - Document(content="The capital of Ghana is Accra."), - ] - rag_pipeline.get_component("retriever").document_store.write_documents(documents) - - # Query and assert - questions_and_answers = [ - ("What's the capital of France?", "Paris"), - ("What is the capital of Canada?", "Ottawa"), - ("What is the capital of Ghana?", "Accra"), - ] - - for question, answer in questions_and_answers: - result = rag_pipeline.run( - { - "retriever": {"query": question}, - "prompt_builder": { - "prompt_source": [user_message], - "query": question, - }, - "llm": {"generation_kwargs": {"temperature": 0.1}}, - } - ) - - assert len(result["answer_builder"]["answers"]) == 1 - generated_answer = result["answer_builder"]["answers"][0] - assert answer.lower() in generated_answer.data.lower() - assert generated_answer.query == question - assert hasattr(generated_answer, "documents") - assert hasattr(generated_answer, "meta") + # @pytest.mark.integration + # def test_run_rag_pipeline(self, generator): + # """ + # Test that a valid message returns a list of replies. + # """ + # user_message = ( + # ChatMessage.from_user + # ) = """GPT4 Correct User: Answer the question in a single word. 
{{question}} + # Context: + # {% for doc in documents %} + # {{ doc.content }} + # {% endfor %} + # <|end_of_turn|> + # GPT4 Correct Assistant: + # """ + # rag_pipeline = Pipeline() + # rag_pipeline.add_component( + # instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore(), top_k=1), name="retriever" + # ) + # rag_pipeline.add_component( + # instance=DynamicChatPromptBuilder(runtime_variables=["query", "documents"]), name="prompt_builder" + # ) + # rag_pipeline.add_component(instance=generator, name="llm") + # rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + # rag_pipeline.connect("retriever", "prompt_builder.documents") + # rag_pipeline.connect("prompt_builder", "llm") + # rag_pipeline.connect("llm.replies", "answer_builder.replies") + # rag_pipeline.connect("retriever", "answer_builder.documents") + + # # Populate the document store + # documents = [ + # Document(content="The capital of France is Paris."), + # Document(content="The capital of Canada is Ottawa."), + # Document(content="The capital of Ghana is Accra."), + # ] + # rag_pipeline.get_component("retriever").document_store.write_documents(documents) + + # # Query and assert + # questions_and_answers = [ + # ("What's the capital of France?", "Paris"), + # ("What is the capital of Canada?", "Ottawa"), + # ("What is the capital of Ghana?", "Accra"), + # ] + + # for question, answer in questions_and_answers: + # result = rag_pipeline.run( + # { + # "retriever": {"query": question}, + # "prompt_builder": { + # "prompt_source": [user_message], + # "query": question, + # }, + # "llm": {"generation_kwargs": {"temperature": 0.1}}, + # } + # ) + + # assert len(result["answer_builder"]["answers"]) == 1 + # generated_answer = result["answer_builder"]["answers"][0] + # assert answer.lower() in generated_answer.data.lower() + # assert generated_answer.query == question + # assert hasattr(generated_answer, "documents") + # assert hasattr(generated_answer, "meta") From 788b3119c7df34018725ff9b603ce1b97805b8aa Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Sat, 11 May 2024 01:37:21 -0700 Subject: [PATCH 05/11] support for function calling through functionary also add a basic rag test --- integrations/llama_cpp/pyproject.toml | 5 +- .../llama_cpp/chat/chat_generator.py | 44 ++-- .../llama_cpp/tests/test_chat_generator.py | 201 +++++++++++------- 3 files changed, 162 insertions(+), 88 deletions(-) diff --git a/integrations/llama_cpp/pyproject.toml b/integrations/llama_cpp/pyproject.toml index 563af391d..20967a933 100644 --- a/integrations/llama_cpp/pyproject.toml +++ b/integrations/llama_cpp/pyproject.toml @@ -28,7 +28,10 @@ classifiers = [ ] dependencies = [ "haystack-ai", - "llama-cpp-python" + "llama-cpp-python", + "transformers", + "sentencepiece", + "protobuf" ] [project.urls] diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py index 232bce8f3..32797cc2e 100644 --- a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py @@ -1,9 +1,11 @@ +import json import logging from typing import Any, Dict, List, Optional from haystack import component from haystack.dataclasses import ChatMessage, ChatRole from llama_cpp import Llama +from 
llama_cpp.llama_tokenizer import LlamaHFTokenizer logger = logging.getLogger(__name__) @@ -53,6 +55,10 @@ def __init__( model_kwargs = model_kwargs or {} generation_kwargs = generation_kwargs or {} + if 'hf_tokenizer_path' in model_kwargs: + tokenizer = LlamaHFTokenizer.from_pretrained(model_kwargs['hf_tokenizer_path']) + model_kwargs['tokenizer'] = tokenizer + # check if the model_kwargs contain the essential parameters # otherwise, populate them with values from init parameters model_kwargs.setdefault("model_path", model) @@ -94,20 +100,26 @@ def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, formatted_messages = [msg.to_openai_format() for msg in messages] response = self.model.create_chat_completion(messages=formatted_messages, **updated_generation_kwargs) - replies = [] - for choice in response["choices"]: - metadata = { - "response_id": response["id"], - "model": response["model"], - "created": response["created"], - "index": choice["index"], - "finish_reason": choice["finish_reason"], - "usage": response["usage"], - } - - content = choice["message"]["content"] - role = choice["message"]["role"].upper() - - chat_message = ChatMessage(content=content, role=ChatRole[role], name=None, meta=metadata) - replies.append(chat_message) + replies = [ + ChatMessage( + content=choice["message"]["content"], + role=ChatRole[choice["message"]["role"].upper()], + name=None, + meta={ + "response_id": response["id"], + "model": response["model"], + "created": response["created"], + "index": choice["index"], + "finish_reason": choice["finish_reason"], + "usage": response["usage"], + }, + ) + for choice in response["choices"] + ] + + for reply, choice in zip(replies, response["choices"]): + tool_calls = choice.get("message", {}).get("tool_calls", []) + if tool_calls: + reply.meta["tool_calls"] = tool_calls + reply.name = tool_calls[0]["function"]["name"] if tool_calls else None return {"replies": replies} diff --git a/integrations/llama_cpp/tests/test_chat_generator.py b/integrations/llama_cpp/tests/test_chat_generator.py index 703bb8d3b..e37dd0bac 100644 --- a/integrations/llama_cpp/tests/test_chat_generator.py +++ b/integrations/llama_cpp/tests/test_chat_generator.py @@ -1,3 +1,4 @@ +import json import os import urllib.request from pathlib import Path @@ -5,7 +6,6 @@ import pytest from haystack import Document, Pipeline -from haystack.components.builders.answer_builder import AnswerBuilder from haystack.components.builders.dynamic_chat_prompt_builder import DynamicChatPromptBuilder from haystack.components.retrievers.in_memory import InMemoryBM25Retriever from haystack.dataclasses import ChatMessage, ChatRole @@ -41,7 +41,7 @@ def generator(self, model_path, capsys): download_file(gguf_model_path, str(model_path / filename), capsys) model_path = str(model_path / filename) - generator = LlamaCppChatGenerator(model=model_path, n_ctx=128, n_batch=128) + generator = LlamaCppChatGenerator(model=model_path, n_ctx=8192, n_batch=512) generator.warm_up() return generator @@ -70,14 +70,14 @@ def test_custom_init(self): """ generator = LlamaCppChatGenerator( model="test_model.gguf", - n_ctx=2048, + n_ctx=8192, n_batch=512, ) assert generator.model_path == "test_model.gguf" - assert generator.n_ctx == 2048 + assert generator.n_ctx == 8192 assert generator.n_batch == 512 - assert generator.model_kwargs == {"model_path": "test_model.gguf", "n_ctx": 2048, "n_batch": 512} + assert generator.model_kwargs == {"model_path": "test_model.gguf", "n_ctx": 8192, "n_batch": 512} assert 
generator.generation_kwargs == {} def test_ignores_model_path_if_specified_in_model_kwargs(self): @@ -86,7 +86,7 @@ def test_ignores_model_path_if_specified_in_model_kwargs(self): """ generator = LlamaCppChatGenerator( model="test_model.gguf", - n_ctx=512, + n_ctx=8192, n_batch=512, model_kwargs={"model_path": "other_model.gguf"}, ) @@ -96,15 +96,15 @@ def test_ignores_n_ctx_if_specified_in_model_kwargs(self): """ Test that n_ctx is ignored if already specified in model_kwargs. """ - generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=512, n_batch=512, model_kwargs={"n_ctx": 1024}) - assert generator.model_kwargs["n_ctx"] == 1024 + generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=512, n_batch=512, model_kwargs={"n_ctx": 8192}) + assert generator.model_kwargs["n_ctx"] == 8192 def test_ignores_n_batch_if_specified_in_model_kwargs(self): """ Test that n_batch is ignored if already specified in model_kwargs. """ generator = LlamaCppChatGenerator( - model="test_model.gguf", n_ctx=512, n_batch=512, model_kwargs={"n_batch": 1024} + model="test_model.gguf", n_ctx=8192, n_batch=512, model_kwargs={"n_batch": 1024} ) assert generator.model_kwargs["n_batch"] == 1024 @@ -189,65 +189,124 @@ def test_run(self, generator): assert len(result["replies"]) > 0 assert any(answer.lower() in reply.content.lower() for reply in result["replies"]) - # @pytest.mark.integration - # def test_run_rag_pipeline(self, generator): - # """ - # Test that a valid message returns a list of replies. - # """ - # user_message = ( - # ChatMessage.from_user - # ) = """GPT4 Correct User: Answer the question in a single word. {{question}} - # Context: - # {% for doc in documents %} - # {{ doc.content }} - # {% endfor %} - # <|end_of_turn|> - # GPT4 Correct Assistant: - # """ - # rag_pipeline = Pipeline() - # rag_pipeline.add_component( - # instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore(), top_k=1), name="retriever" - # ) - # rag_pipeline.add_component( - # instance=DynamicChatPromptBuilder(runtime_variables=["query", "documents"]), name="prompt_builder" - # ) - # rag_pipeline.add_component(instance=generator, name="llm") - # rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") - # rag_pipeline.connect("retriever", "prompt_builder.documents") - # rag_pipeline.connect("prompt_builder", "llm") - # rag_pipeline.connect("llm.replies", "answer_builder.replies") - # rag_pipeline.connect("retriever", "answer_builder.documents") - - # # Populate the document store - # documents = [ - # Document(content="The capital of France is Paris."), - # Document(content="The capital of Canada is Ottawa."), - # Document(content="The capital of Ghana is Accra."), - # ] - # rag_pipeline.get_component("retriever").document_store.write_documents(documents) - - # # Query and assert - # questions_and_answers = [ - # ("What's the capital of France?", "Paris"), - # ("What is the capital of Canada?", "Ottawa"), - # ("What is the capital of Ghana?", "Accra"), - # ] - - # for question, answer in questions_and_answers: - # result = rag_pipeline.run( - # { - # "retriever": {"query": question}, - # "prompt_builder": { - # "prompt_source": [user_message], - # "query": question, - # }, - # "llm": {"generation_kwargs": {"temperature": 0.1}}, - # } - # ) - - # assert len(result["answer_builder"]["answers"]) == 1 - # generated_answer = result["answer_builder"]["answers"][0] - # assert answer.lower() in generated_answer.data.lower() - # assert generated_answer.query == question - # assert 
hasattr(generated_answer, "documents") - # assert hasattr(generated_answer, "meta") + @pytest.mark.integration + def test_run_rag_pipeline(self, generator): + """ + Test that a valid message returns a list of replies. + """ + document_store = InMemoryDocumentStore() + documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors." + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves." + ), + ] + document_store.write_documents(documents=documents) + + pipeline = Pipeline() + pipeline.add_component( + instance=InMemoryBM25Retriever(document_store=document_store, top_k=1), + name="retriever", + ) + pipeline.add_component( + instance=DynamicChatPromptBuilder(runtime_variables=["query", "documents"]), name="prompt_builder" + ) + pipeline.add_component(instance=generator, name="llm") + pipeline.connect("retriever.documents", "prompt_builder.documents") + pipeline.connect("prompt_builder.prompt", "llm.messages") + + question = "How many languages are there?" + location = "Puerto Rico" + system_message = ChatMessage.from_system( + "You are a helpful assistant giving out valuable information to tourists." + ) + messages = [ + system_message, + ChatMessage.from_user( + """ + Given these documents and given that I am currently in {{ location }}, answer the question.\nDocuments: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + + \nQuestion: {{query}} + \nAnswer: + """ + ), + ] + question = "Can I see bioluminescent waves at my current location?" 
+ result = pipeline.run( + data={ + "retriever": {"query": question}, + "prompt_builder": { + "template_variables": {"location": location}, + "prompt_source": messages, + "query": question, + }, + } + ) + + replies = result['llm']['replies'] + assert len(replies) > 0 + assert any("bioluminescent waves" in reply.content for reply in replies) + assert all(reply.role == ChatRole.ASSISTANT for reply in replies) + + +class TestLlamaCppChatGeneratorFunctionCalls: + @pytest.fixture + def generator(self, model_path, capsys): + gguf_model_path = ( + "https://huggingface.co/meetkai/functionary-small-v2.4-GGUF/resolve/main/functionary-small-v2.4.Q4_0.gguf" + ) + filename = "functionary-small-v2.4.Q4_0.gguf" + download_file(gguf_model_path, str(model_path / filename), capsys) + model_path = str(model_path / filename) + hf_tokenizer_path = "meetkai/functionary-small-v2.4-GGUF" + generator = LlamaCppChatGenerator( + model=model_path, + n_ctx=8192, + n_batch=512, + model_kwargs={ + "chat_format": "functionary-v2", + "hf_tokenizer_path": hf_tokenizer_path, + }, + ) + generator.warm_up() + return generator + + @pytest.mark.integration + def test_function_call_scenario(self, generator): + tools = [ + { + "type": "function", + "function": { + "name": "get_user_info", + "parameters": { + "type": "object", + "properties": { + "username": {"type": "string", "description": "The username to retrieve information for."} + }, + "required": ["username"], + }, + "description": "Retrieves detailed information about a user.", + }, + } + ] + tool_choice = {"type": "function", "function": {"name": "get_user_info"}} + + messages = [ + ChatMessage.from_user("Get information for user john_doe"), + ] + generation_kwargs = {"tools": tools, "tool_choice": tool_choice} + + response = generator.run(messages=messages, generation_kwargs=generation_kwargs) + + assert "tool_calls" in response["replies"][0].meta + tool_calls = response["replies"][0].meta["tool_calls"] + assert len(tool_calls) > 0 + assert tool_calls[0]["function"]["name"] == "get_user_info" + assert "username" in json.loads(tool_calls[0]["function"]["arguments"]) + assert response["replies"][0].role == ChatRole.ASSISTANT From 40bf82b3f2f1e8c2ed2a9c81506caa5ebeafdc8c Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Sat, 11 May 2024 16:03:38 -0700 Subject: [PATCH 06/11] add function calling and execute test, it works! 
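
A rough sketch of the round trip this test exercises (model filename, tokenizer repo, and the get_current_temperature helper mirror the test's stand-ins and are placeholders, not a fixed API):

```python
import json

from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator

# Placeholder paths: any functionary-style GGUF model that emits OpenAI-compatible tool_calls.
generator = LlamaCppChatGenerator(
    model="functionary-small-v2.4.Q4_0.gguf",
    n_ctx=8192,
    model_kwargs={"chat_format": "functionary-v2", "hf_tokenizer_path": "meetkai/functionary-small-v2.4-GGUF"},
)
generator.warm_up()


def get_current_temperature(location):
    # Local stand-in for a real weather lookup, as in the test.
    return json.dumps({"location": location, "temperature": "72", "unit": "fahrenheit"})


tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_temperature",
            "description": "Get the current temperature in a given location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        },
    }
]

messages = [ChatMessage.from_user("What's the weather like in San Francisco?")]
reply = generator.run(messages=messages, generation_kwargs={"tools": tools})["replies"][0]

# Execute each requested tool call and feed the result back as a function message.
# (The model may also answer directly without calling the tool.)
for tool_call in reply.meta.get("tool_calls", []):
    name = tool_call["function"]["name"]
    args = json.loads(tool_call["function"]["arguments"])
    messages.append(ChatMessage.from_function(get_current_temperature(**args), name))

# Second pass: the model turns the tool output into a natural-language answer.
print(generator.run(messages=messages)["replies"][0].content)
```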
--- .../llama_cpp/chat/chat_generator.py | 9 +-- .../llama_cpp/tests/test_chat_generator.py | 76 +++++++++++++++++-- 2 files changed, 73 insertions(+), 12 deletions(-) diff --git a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py index 32797cc2e..e305c2a3d 100644 --- a/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +++ b/integrations/llama_cpp/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py @@ -1,4 +1,3 @@ -import json import logging from typing import Any, Dict, List, Optional @@ -25,7 +24,7 @@ class LlamaCppChatGenerator: generator = LlamaCppGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512) print(generator.run(user_message, generation_kwargs={"max_tokens": 128})) - # {'replies': [ChatMessage(content='John Cusack', role=, name=None, meta={...}]} + # {"replies": [ChatMessage(content="John Cusack", role=, name=None, meta={...}]} ``` """ @@ -55,9 +54,9 @@ def __init__( model_kwargs = model_kwargs or {} generation_kwargs = generation_kwargs or {} - if 'hf_tokenizer_path' in model_kwargs: - tokenizer = LlamaHFTokenizer.from_pretrained(model_kwargs['hf_tokenizer_path']) - model_kwargs['tokenizer'] = tokenizer + if "hf_tokenizer_path" in model_kwargs: + tokenizer = LlamaHFTokenizer.from_pretrained(model_kwargs["hf_tokenizer_path"]) + model_kwargs["tokenizer"] = tokenizer # check if the model_kwargs contain the essential parameters # otherwise, populate them with values from init parameters diff --git a/integrations/llama_cpp/tests/test_chat_generator.py b/integrations/llama_cpp/tests/test_chat_generator.py index e37dd0bac..c3ed8ea54 100644 --- a/integrations/llama_cpp/tests/test_chat_generator.py +++ b/integrations/llama_cpp/tests/test_chat_generator.py @@ -198,10 +198,12 @@ def test_run_rag_pipeline(self, generator): documents = [ Document(content="There are over 7,000 languages spoken around the world today."), Document( - content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors." + content="""Elephants have been observed to behave in a way that indicates a high + level of self-awareness, such as recognizing themselves in mirrors.""" ), Document( - content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves." 
+ content="""In certain parts of the world, like the Maldives, Puerto Rico, + and San Diego, you can witness the phenomenon of bioluminescent waves.""" ), ] document_store.write_documents(documents=documents) @@ -249,13 +251,24 @@ def test_run_rag_pipeline(self, generator): } ) - replies = result['llm']['replies'] + replies = result["llm"]["replies"] assert len(replies) > 0 assert any("bioluminescent waves" in reply.content for reply in replies) assert all(reply.role == ChatRole.ASSISTANT for reply in replies) class TestLlamaCppChatGeneratorFunctionCalls: + def get_current_temperature(self, location): + """Get the current temperature in a given location""" + if "tokyo" in location.lower(): + return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"}) + elif "san francisco" in location.lower(): + return json.dumps({"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}) + elif "paris" in location.lower(): + return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"}) + else: + return json.dumps({"location": location, "temperature": "unknown"}) + @pytest.fixture def generator(self, model_path, capsys): gguf_model_path = ( @@ -278,7 +291,7 @@ def generator(self, model_path, capsys): return generator @pytest.mark.integration - def test_function_call_scenario(self, generator): + def test_function_call(self, generator): tools = [ { "type": "function", @@ -300,9 +313,7 @@ def test_function_call_scenario(self, generator): messages = [ ChatMessage.from_user("Get information for user john_doe"), ] - generation_kwargs = {"tools": tools, "tool_choice": tool_choice} - - response = generator.run(messages=messages, generation_kwargs=generation_kwargs) + response = generator.run(messages=messages, generation_kwargs={"tools": tools, "tool_choice": tool_choice}) assert "tool_calls" in response["replies"][0].meta tool_calls = response["replies"][0].meta["tool_calls"] @@ -310,3 +321,54 @@ def test_function_call_scenario(self, generator): assert tool_calls[0]["function"]["name"] == "get_user_info" assert "username" in json.loads(tool_calls[0]["function"]["arguments"]) assert response["replies"][0].role == ChatRole.ASSISTANT + + def test_function_call_and_execute(self, generator): + messages = [ChatMessage.from_user("What's the weather like in San Francisco?")] + tools = [ + { + "type": "function", + "function": { + "name": "get_current_temperature", + "description": "Get the current temperature in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. 
San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location"], + }, + }, + } + ] + + response = generator.run(messages=messages, generation_kwargs={"tools": tools}) + + available_functions = { + "get_current_temperature": self.get_current_temperature, + } + + assert "replies" in response + assert len(response["replies"]) > 0 + + first_reply = response["replies"][0] + assert "tool_calls" in first_reply.meta + tool_calls = first_reply.meta["tool_calls"] + + for tool_call in tool_calls: + function_name = tool_call["function"]["name"] + function_args = json.loads(tool_call["function"]["arguments"]) + assert function_name in available_functions + function_response = available_functions[function_name](**function_args) + function_message = ChatMessage.from_function(function_response, function_name) + messages.append(function_message) + + second_response = generator.run(messages=messages) + print(second_response) + assert "replies" in second_response + assert len(second_response["replies"]) > 0 + assert any("current temperature" in reply.content for reply in second_response["replies"]) + assert any("72" in reply.content for reply in second_response["replies"]) From 93a4dd64e37160740a11f4e675b1df665cf7079b Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Sat, 11 May 2024 22:27:44 -0700 Subject: [PATCH 07/11] add json test, add chatml test --- .../llama_cpp/tests/test_chat_generator.py | 111 +++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/integrations/llama_cpp/tests/test_chat_generator.py b/integrations/llama_cpp/tests/test_chat_generator.py index c3ed8ea54..a1dd41abd 100644 --- a/integrations/llama_cpp/tests/test_chat_generator.py +++ b/integrations/llama_cpp/tests/test_chat_generator.py @@ -256,8 +256,53 @@ def test_run_rag_pipeline(self, generator): assert any("bioluminescent waves" in reply.content for reply in replies) assert all(reply.role == ChatRole.ASSISTANT for reply in replies) + @pytest.mark.integration + def test_json_constraining(self, generator): + """ + Test that the generator can output valid JSON. + """ + messages = [ChatMessage.from_system("Output valid json only. 
List 2 people with their name and age.")] + json_schema = { + "type": "object", + "properties": { + "people": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "number"}, + }, + }, + }, + }, + "required": ["people"], + } -class TestLlamaCppChatGeneratorFunctionCalls: + result = generator.run( + messages=messages, + generation_kwargs={ + "response_format": {"type": "json_object", "schema": json_schema}, + }, + ) + + assert "replies" in result + assert isinstance(result["replies"], list) + assert len(result["replies"]) > 0 + assert all(reply.role == ChatRole.ASSISTANT for reply in result["replies"]) + for reply in result["replies"]: + assert json.loads(reply.content) + assert isinstance(json.loads(reply.content), dict) + assert "people" in json.loads(reply.content) + assert isinstance(json.loads(reply.content)["people"], list) + assert all(isinstance(person, dict) for person in json.loads(reply.content)["people"]) + assert all("name" in person for person in json.loads(reply.content)["people"]) + assert all("age" in person for person in json.loads(reply.content)["people"]) + assert all(isinstance(person["name"], str) for person in json.loads(reply.content)["people"]) + assert all(isinstance(person["age"], int) for person in json.loads(reply.content)["people"]) + + +class TestLlamaCppChatGeneratorFunctionary: def get_current_temperature(self, location): """Get the current temperature in a given location""" if "tokyo" in location.lower(): @@ -372,3 +417,67 @@ def test_function_call_and_execute(self, generator): assert len(second_response["replies"]) > 0 assert any("current temperature" in reply.content for reply in second_response["replies"]) assert any("72" in reply.content for reply in second_response["replies"]) + + +class TestLlamaCppChatGeneratorChatML: + + @pytest.fixture + def generator(self, model_path, capsys): + gguf_model_path = ( + "https://huggingface.co/TheBloke/openchat-3.5-1210-GGUF/resolve/main/openchat-3.5-1210.Q3_K_S.gguf" + ) + filename = "openchat-3.5-1210.Q3_K_S.gguf" + download_file(gguf_model_path, str(model_path / filename), capsys) + model_path = str(model_path / filename) + generator = LlamaCppChatGenerator( + model=model_path, + n_ctx=8192, + n_batch=512, + model_kwargs={ + "chat_format": "chatml-function-calling", + }, + ) + generator.warm_up() + return generator + + @pytest.mark.integration + def test_function_call_chatml(self, generator): + messages = [ + ChatMessage.from_system( + """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, + detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate + input when necessary""" + ), + ChatMessage.from_user("Extract Jason is 25 years old"), + ] + + tools = [ + { + "type": "function", + "function": { + "name": "UserDetail", + "parameters": { + "type": "object", + "title": "UserDetail", + "properties": { + "name": {"title": "Name", "type": "string"}, + "age": {"title": "Age", "type": "integer"}, + }, + "required": ["name", "age"], + }, + }, + } + ] + + tool_choice = {"type": "function", "function": {"name": "UserDetail"}} + + response = generator.run(messages=messages, generation_kwargs={"tools": tools, "tool_choice": tool_choice}) + for reply in response["replies"]: + assert "tool_calls" in reply.meta + tool_calls = reply.meta["tool_calls"] + assert len(tool_calls) > 0 + assert tool_calls[0]["function"]["name"] == "UserDetail" + assert "name" in json.loads(tool_calls[0]["function"]["arguments"]) + assert "age" in json.loads(tool_calls[0]["function"]["arguments"]) + assert "Jason" in json.loads(tool_calls[0]["function"]["arguments"])["name"] + assert 25 == json.loads(tool_calls[0]["function"]["arguments"])["age"] From bdc23c9e7ae6f4fbff01df378dfc2c5a5ae30d4b Mon Sep 17 00:00:00 2001 From: Ulises M <30765968+lbux@users.noreply.github.com> Date: Sat, 11 May 2024 22:44:07 -0700 Subject: [PATCH 08/11] make function call and execute more deterministic --- integrations/llama_cpp/tests/test_chat_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/llama_cpp/tests/test_chat_generator.py b/integrations/llama_cpp/tests/test_chat_generator.py index a1dd41abd..5666f109a 100644 --- a/integrations/llama_cpp/tests/test_chat_generator.py +++ b/integrations/llama_cpp/tests/test_chat_generator.py @@ -415,7 +415,7 @@ def test_function_call_and_execute(self, generator): print(second_response) assert "replies" in second_response assert len(second_response["replies"]) > 0 - assert any("current temperature" in reply.content for reply in second_response["replies"]) + assert any("San Francisco" in reply.content for reply in second_response["replies"]) assert any("72" in reply.content for reply in second_response["replies"]) From 0dda139279fde398d0f1168c0185d3eec9f7405a Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Mon, 13 May 2024 13:01:13 +0200 Subject: [PATCH 09/11] try removing additional deps --- integrations/llama_cpp/pyproject.toml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/integrations/llama_cpp/pyproject.toml b/integrations/llama_cpp/pyproject.toml index 20967a933..563af391d 100644 --- a/integrations/llama_cpp/pyproject.toml +++ b/integrations/llama_cpp/pyproject.toml @@ -28,10 +28,7 @@ classifiers = [ ] dependencies = [ "haystack-ai", - "llama-cpp-python", - "transformers", - "sentencepiece", - "protobuf" + "llama-cpp-python" ] [project.urls] From 9f8447edbc6b92c540e0ab46daa9be6a52a52c4e Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Mon, 13 May 2024 13:22:00 +0200 Subject: [PATCH 10/11] revert --- integrations/llama_cpp/pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/llama_cpp/pyproject.toml b/integrations/llama_cpp/pyproject.toml index 563af391d..50b8040b2 100644 --- a/integrations/llama_cpp/pyproject.toml +++ b/integrations/llama_cpp/pyproject.toml @@ -28,7 +28,8 @@ classifiers = [ ] dependencies = [ "haystack-ai", - "llama-cpp-python" + "llama-cpp-python", + "transformers[sentencepiece]" ] [project.urls] From 869c36c8c4436c21746c929d7932e6538b416221 Mon Sep 17 00:00:00 2001 From: 
Stefano Fiorucci Date: Mon, 13 May 2024 13:30:21 +0200 Subject: [PATCH 11/11] make transformers a tests-only dependency --- integrations/llama_cpp/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/llama_cpp/pyproject.toml b/integrations/llama_cpp/pyproject.toml index 50b8040b2..a90118ee4 100644 --- a/integrations/llama_cpp/pyproject.toml +++ b/integrations/llama_cpp/pyproject.toml @@ -28,8 +28,7 @@ classifiers = [ ] dependencies = [ "haystack-ai", - "llama-cpp-python", - "transformers[sentencepiece]" + "llama-cpp-python" ] [project.urls] @@ -53,6 +52,7 @@ dependencies = [ "coverage[toml]>=6.5", "pytest", "haystack-pydoc-tools", + "transformers[sentencepiece]" ] [tool.hatch.envs.default.scripts] test = "pytest {args:tests}"
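
A minimal usage sketch of the JSON-constrained generation covered by the new tests, assuming a locally downloaded GGUF model (the model filename and schema below are illustrative placeholders):

```python
import json

from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator

# Placeholder model path; any chat-capable GGUF model works here.
generator = LlamaCppChatGenerator(model="openchat-3.5-1210.Q3_K_S.gguf", n_ctx=2048)
generator.warm_up()

# Illustrative schema: a list of people, each with a name and an age.
json_schema = {
    "type": "object",
    "properties": {
        "people": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {"name": {"type": "string"}, "age": {"type": "number"}},
            },
        },
    },
    "required": ["people"],
}

messages = [ChatMessage.from_system("Output valid json only. List 2 people with their name and age.")]
result = generator.run(
    messages=messages,
    generation_kwargs={"response_format": {"type": "json_object", "schema": json_schema}},
)

# The reply content is valid JSON matching the schema, so it can be parsed directly.
print(json.loads(result["replies"][0].content))
```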