Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add MetadataBuilder #6636

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion haystack/components/builders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.builders.dynamic_prompt_builder import DynamicPromptBuilder
from haystack.components.builders.metadata_builder import MetadataBuilder
from haystack.components.builders.dynamic_chat_prompt_builder import DynamicChatPromptBuilder

__all__ = ["AnswerBuilder", "PromptBuilder", "DynamicPromptBuilder", "DynamicChatPromptBuilder"]
__all__ = ["AnswerBuilder", "PromptBuilder", "DynamicPromptBuilder", "DynamicChatPromptBuilder", "MetadataBuilder"]
40 changes: 40 additions & 0 deletions haystack/components/builders/metadata_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import logging
from typing import Any, Dict, List, Optional

from haystack import component
from haystack.dataclasses import Document

logger = logging.getLogger(__name__)


@component
class MetadataBuilder:
    """
    Copy selected entries of another component's output into the metadata of Documents.

    The component is configured with a list of keys. On every run it looks those keys up in the
    `data` dictionary (the output of a component such as a Generator, TextEmbedder or
    EntityExtractor) and writes the resulting key/value pairs into `Document.meta`.
    """

    def __init__(self, meta_keys: List[str]):
        """
        :param meta_keys: Keys of the `data` dictionary passed to `run` whose values are written
            into the metadata of every Document.
        """
        self.meta_keys = meta_keys

    @component.output_types(documents=List[Document])
    def run(
        self, documents: List[Document], data: Dict[str, Any], meta: Optional[List[Dict[str, Any]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Add the selected `data` entries, and the optional per-Document `meta`, to each Document.

        The Documents are modified in place and the same objects are returned.

        :param documents: The Documents that were passed to the upstream component.
        :param data: The output of that component. Every key listed in `meta_keys` must be present.
        :param meta: Optional metadata returned by the component, one dictionary per Document.
            When omitted or empty, an empty dictionary is used for each slot.
        :returns: A dictionary with the key `documents` holding the updated Documents.
        :raises ValueError: If the number of Documents, the number of keys in `data` and the
            number of `meta` entries are not all equal.
        :raises KeyError: If one of the configured `meta_keys` is missing from `data`.
        """
        if not meta:
            # One independent dict per slot; `[{}] * n` would alias a single dict n times.
            meta = [{} for _ in range(len(data))]

        # NOTE(review): `len(data)` is the number of keys in the `data` dict, not the number of
        # values stored under a key. The existing tests rely on this, so it is kept as is;
        # confirm whether a per-key length check was intended.
        if not len(documents) == len(data) == len(meta):
            raise ValueError(
                f"Number of Documents ({len(documents)}), data ({len(data)}), "
                f"and metadata ({len(meta)}) must match."
            )

        # Kept under its own name so the caller-supplied `meta` list is not overwritten.
        selected = {key: data[key] for key in self.meta_keys}

        for doc, doc_meta in zip(documents, meta):
            # Per-Document metadata goes in first so the explicitly configured keys win on collision.
            doc.meta.update(doc_meta)
            doc.meta.update(selected)

        return {"documents": documents}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
features:
- |
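Add the `MetadataBuilder` component, which copies selected keys from another component's output into the metadata of the given Documents.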
125 changes: 125 additions & 0 deletions test/components/builders/test_metadata_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import pytest
from haystack.dataclasses import Document
from haystack.components.builders import MetadataBuilder


class TestMetadataBuilder:
    """Unit tests for `MetadataBuilder.run`: metadata propagation and input-size validation."""

    def test_recieves_list_of_summaries_entities(self):
        """
        Two Documents, two configured data keys and two metadata entries: both keys are copied to every Document.
        """
        metadata_builder = MetadataBuilder(meta_keys=["entities", "summary"])
        documents = [Document(content="document_0"), Document(content="document_1")]
        data = {"entities": ["entity1", "entity2", "entity3"], "summary": ["Summary 1", "Summary 2", "Summary3"]}
        metadata = [{"key_0": "value_0"}, {"key_1": "value_1"}]

        result = metadata_builder.run(documents=documents, data=data, meta=metadata)

        assert len(result["documents"]) == 2
        for doc in result["documents"]:
            assert doc.meta["entities"] == data["entities"]
            assert doc.meta["summary"] == data["summary"]

    def test_receives_list_of_replies_and_no_metadata(self):
        """
        The component receives one Document, one data key and one metadata entry.
        """
        metadata_builder = MetadataBuilder(meta_keys=["summary"])

        documents = [Document(content="document_0")]
        data = {"summary": ["reply_0", "reply_1", "reply_2"]}
        meta = [{"key_0": "value_0"}]
        # Invoke the run method with an explicit metadata list
        result = metadata_builder.run(documents=documents, data=data, meta=meta)

        assert isinstance(result["documents"], list)
        assert len(result["documents"]) == 1
        assert result["documents"][0].meta["summary"] == ["reply_0", "reply_1", "reply_2"]

    def test_receives_list_of_replies_and_metadata(self):
        """
        The component receives a list of Documents, replies and metadata.
        """
        metadata_builder = MetadataBuilder(meta_keys=["replies"])
        # Single Document, a single data key and a single metadata entry
        documents = [Document(content="document_0")]
        data = {"replies": ["reply1", "reply2"]}
        metadata = [{"key_0": "value_0"}]

        result = metadata_builder.run(documents=documents, meta=metadata, data=data)

        assert isinstance(result["documents"], list)
        assert len(result["documents"]) == 1
        assert result["documents"][0].meta["replies"] == ["reply1", "reply2"]

    def test_recieves_replies_and_no_metadata(self):
        """
        The component receives only a list of Documents and replies and no metadata.
        """
        metadata_builder = MetadataBuilder(meta_keys=["replies"])

        documents = [Document(content="document_0")]
        data = {"replies": ["reply1", "reply2"]}

        # Invoke the run method without providing metadata
        result = metadata_builder.run(documents=documents, data=data)

        assert isinstance(result["documents"], list)
        assert len(result["documents"]) == 1
        assert result["documents"][0].meta["replies"] == ["reply1", "reply2"]

    def test_mismatched_documents_replies_and_no_metadata(self):
        """
        With no metadata given, two Documents and a single data key do not match, so the component raises a ValueError.
        """
        metadata_builder = MetadataBuilder(meta_keys=["replies"])
        documents = [Document(content="document_0"), Document(content="document_1")]
        data = {"replies": ["reply1", "reply2", "reply3"]}

        # Check that a ValueError is raised when invoking the run method
        with pytest.raises(ValueError):
            metadata_builder.run(documents=documents, data=data)

    def test_mismatched_documents_replies(self):
        """
        Two Documents, a single data key and three metadata entries do not match, so the component raises a ValueError.
        """
        metadata_builder = MetadataBuilder(meta_keys=["replies"])

        documents = [Document(content="document_0"), Document(content="document_1")]
        data = {"replies": ["reply1", "reply2", "reply3"]}
        metadata = [{"key_0": "value_0"}, {"key_1": "value_1"}, {"key_2": "value_2"}]

        # Check that a ValueError is raised when invoking the run method
        with pytest.raises(ValueError):
            metadata_builder.run(documents=documents, data=data, meta=metadata)

    def test_mismatched_documents_metadata(self):
        """
        If the length of the Document list and the metadata list are different, the component raises a ValueError.
        """
        metadata_builder = MetadataBuilder(meta_keys=["replies"])

        documents = [Document(content="document_0"), Document(content="document_1"), Document(content="document_2")]
        data = {"replies": ["reply1", "reply2", "reply3"]}
        metadata = [{"key_0": "value_0"}, {"key_1": "value_1"}]

        # Check that a ValueError is raised when invoking the run method
        with pytest.raises(ValueError):
            metadata_builder.run(documents=documents, data=data, meta=metadata)

    def test_mismatched_documents_replies_metadata(self):
        """
        If the Document count, the data size and the metadata list length are all different, the component raises a ValueError.
        """
        metadata_builder = MetadataBuilder(meta_keys=["replies"])

        documents = [Document(content="document_0"), Document(content="document_1")]
        data = {"replies": ["reply0"]}
        metadata = [{"key_0": "value_0"}, {"key_1": "value_1"}, {"key_2": "value_2"}]

        # Check that a ValueError is raised when invoking the run method
        with pytest.raises(ValueError):
            metadata_builder.run(documents=documents, data=data, meta=metadata)

    def test_metadata_with_same_keys(self):
        """
        The component adds the new metadata and keeps the metadata the Document already carries.
        """
        metadata_builder = MetadataBuilder(meta_keys=["replies"])
        data = {"replies": ["reply_0"]}
        documents = [Document(content="document content", meta={"reply": "original text"})]
        result = metadata_builder.run(documents=documents, data=data)

        assert isinstance(result["documents"], list)
        assert len(result["documents"]) == 1
        # The pre-existing "reply" entry survives and the configured "replies" key is added next to it.
        assert result["documents"][0].meta["reply"] == "original text"
        assert result["documents"][0].meta["replies"] == ["reply_0"]