diff --git a/haystack/components/builders/__init__.py b/haystack/components/builders/__init__.py index 47a6bf388a..c898d5734a 100644 --- a/haystack/components/builders/__init__.py +++ b/haystack/components/builders/__init__.py @@ -1,6 +1,7 @@ from haystack.components.builders.answer_builder import AnswerBuilder from haystack.components.builders.prompt_builder import PromptBuilder from haystack.components.builders.dynamic_prompt_builder import DynamicPromptBuilder +from haystack.components.builders.metadata_builder import MetadataBuilder from haystack.components.builders.dynamic_chat_prompt_builder import DynamicChatPromptBuilder -__all__ = ["AnswerBuilder", "PromptBuilder", "DynamicPromptBuilder", "DynamicChatPromptBuilder"] +__all__ = ["AnswerBuilder", "PromptBuilder", "DynamicPromptBuilder", "DynamicChatPromptBuilder", "MetadataBuilder"] diff --git a/haystack/components/builders/metadata_builder.py b/haystack/components/builders/metadata_builder.py new file mode 100644 index 0000000000..ffc4752943 --- /dev/null +++ b/haystack/components/builders/metadata_builder.py @@ -0,0 +1,40 @@ +import logging +from typing import Any, Dict, List, Optional + +from haystack import component +from haystack.dataclasses import Document + +logger = logging.getLogger(__name__) + + +class MetadataBuilder: + def __init__(self, meta_keys: List[str]): + self.meta_keys = meta_keys + + @component.output_types(documents=List[Document]) + def run( + self, documents: List[Document], data: Dict[str, Any], meta: Optional[List[Dict[str, Any]]] = None + ) -> Dict[str, List[Document]]: + """ + The MetadataBuilder component takes a list of Documents, the output of a component to which these Documents were passed, + and adds the output from the component as metadata to the Documents. + The MetadataBuilder component takes these replies and metadata and adds them to the Documents. + It does this by adding the replies and metadata to the metadata of the Document. + :param documents: The documents used as input to the Generator. A list of `Document` objects. + :param data: The output of a component (Generator , TextEmbedder, EntityExtractor). + :param meta: The metadata returned by the component. + """ + if not meta: + meta = [{}] * len(data) + + if not len(documents) == len(data) == len(meta): + raise ValueError( + f"Number of Documents ({len(documents)}), data ({len(data)}), and metadata ({len(meta)})" " must match." + ) + + meta = {key: data[key] for key in self.meta_keys} + + for i, doc in enumerate(documents): + doc.meta.update(meta) + + return {"documents": documents} diff --git a/releasenotes/notes/add_metadatabuilder-9ae9a11fc754f58a.yaml b/releasenotes/notes/add_metadatabuilder-9ae9a11fc754f58a.yaml new file mode 100644 index 0000000000..a9a289d71d --- /dev/null +++ b/releasenotes/notes/add_metadatabuilder-9ae9a11fc754f58a.yaml @@ -0,0 +1,3 @@ +features: + - | + Add MetadataBuilder component. diff --git a/test/components/builders/test_metadata_builder.py b/test/components/builders/test_metadata_builder.py new file mode 100644 index 0000000000..6681e0ebee --- /dev/null +++ b/test/components/builders/test_metadata_builder.py @@ -0,0 +1,125 @@ +import pytest +from haystack.dataclasses import Document +from haystack.components.builders import MetadataBuilder + + +class TestMetadataBuilder: + def test_recieves_list_of_summaries_entities(self): + metadata_builder = MetadataBuilder(meta_keys=["entities", "summary"]) + documents = [Document(content="document_0"), Document(content="document_1")] + data = {"entities": ["entity1", "entity2", "entity3"], "summary": ["Summary 1", "Summary 2", "Summary3"]} + metadata = [{"key_0": "value_0"}, {"key_1": "value_1"}] + + result = metadata_builder.run(documents=documents, data=data, meta=metadata) + assert len(result["documents"]) == 2 + + def test_receives_list_of_replies_and_no_metadata(self): + """ + The component receives only a list of Documents and replies and no metadata. + """ + metadata_builder = MetadataBuilder(meta_keys=["summary"]) + + documents = [Document(content="document_0")] + data = {"summary": ["reply_0", "reply_1", "reply_2"]} + meta = [{"key_0": "value_0"}] + # Invoke the run method without providing metadata + result = metadata_builder.run(documents=documents, data=data, meta=meta) + + assert isinstance(result["documents"], list) + assert len(result["documents"]) == 1 + + def test_receives_list_of_replies_and_metadata(self): + """ + The component receives a list of Documents, replies and metadata. + """ + metadata_builder = MetadataBuilder(meta_keys=["replies"]) + # Single document, one + documents = [Document(content="document_0")] + data = {"replies": ["reply1", "reply2"]} + metadata = [{"key_0": "value_0"}] + + result = metadata_builder.run(documents=documents, meta=metadata, data=data) + + assert isinstance(result["documents"], list) + assert len(result["documents"]) == 1 + + def test_recieves_replies_and_no_metadata(self): + """ + The component receives only a list of Documents and replies and no metadata. + """ + metadata_builder = MetadataBuilder(meta_keys=["replies"]) + + documents = [Document(content="document_0")] + data = {"replies": ["reply1", "reply2"]} + + # Invoke the run method without providing metadata + result = metadata_builder.run(documents=documents, data=data) + + assert isinstance(result["documents"], list) + assert len(result["documents"]) == 1 + + def test_mismatched_documents_replies_and_no_metadata(self): + """ + If the length of the Document list and the replies list are different having no metadata, the component raises a ValueError. + """ + metadata_builder = MetadataBuilder(meta_keys=["replies"]) + documents = [Document(content="document_0"), Document(content="document_1")] + data = {"replies": ["reply1", "reply2", "reply3"]} + + # Check that a ValueError is raised when invoking the run method + with pytest.raises(ValueError): + metadata_builder.run(documents=documents, data=data) + + def test_mismatched_documents_replies(self): + """ + If the length of the Document list and the replies list are different, having metadata the component raises a ValueError. + """ + metadata_builder = MetadataBuilder(meta_keys=["replies"]) + + documents = [Document(content="document_0"), Document(content="document_1")] + data = {"replies": ["reply1", "reply2", "reply3"]} + metadata = [{"key_0": "value_0"}, {"key_1": "value_1"}, {"key_2": "value_2"}] + + # Check that a ValueError is raised when invoking the run method + with pytest.raises(ValueError): + metadata_builder.run(documents=documents, data=data, meta=metadata) + + def test_mismatched_documents_metadata(self): + """ + If the length of the Document list and the metadata list are different, the component raises a ValueError. + """ + metadata_builder = MetadataBuilder(meta_keys=["replies"]) + + documents = [Document(content="document_0"), Document(content="document_1"), Document(content="document_2")] + data = {"replies": ["reply1", "reply2", "reply3"]} + metadata = [{"key_0": "value_0"}, {"key_1": "value_1"}] + + # Check that a ValueError is raised when invoking the run method + with pytest.raises(ValueError): + metadata_builder.run(documents=documents, data=data, meta=metadata) + + def test_mismatched_documents_replies_metadata(self): + """ + If the length of the Document list, replies list and the metadata list are all different, the component raises a ValueError. + """ + metadata_builder = MetadataBuilder(meta_keys=["replies"]) + + documents = [Document(content="document_0"), Document(content="document_1")] + data = {"replies": ["reply0"]} + metadata = [{"key_0": "value_0"}, {"key_1": "value_1"}, {"key_2": "value_2"}] + + # Check that a ValueError is raised when invoking the run method + with pytest.raises(ValueError): + metadata_builder.run(documents=documents, data=data, meta=metadata) + + def test_metadata_with_same_keys(self): + """ + The component should correctly add the metadata if the Document metadata already has a reply. + """ + metadata_builder = MetadataBuilder(meta_keys=["replies"]) + data = {"replies": ["reply_0"]} + documents = [Document(content="document content", meta={"reply": "original text"})] + result = metadata_builder.run(documents=documents, data=data) + + assert isinstance(result["documents"], list) + assert len(result["documents"]) == 1