From 8d15314890e0cc0b6bcae0e21960b9036e445c5f Mon Sep 17 00:00:00 2001 From: Ashwin Mathur <97467100+awinml@users.noreply.github.com> Date: Thu, 19 Oct 2023 15:17:58 +0530 Subject: [PATCH 1/2] Enrich documents with embeddings instead of recreating documents (#42) --- .../instructor_embedders/instructor_document_embedder.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py index 7fd369cd2..fc2d43b93 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -126,11 +126,7 @@ def run(self, documents: List[Document]): normalize_embeddings=self.normalize_embeddings, ) - documents_with_embeddings = [] for doc, emb in zip(documents, embeddings): - doc_as_dict = doc.to_dict() - doc_as_dict["embedding"] = emb - del doc_as_dict["id"] - documents_with_embeddings.append(Document.from_dict(doc_as_dict)) + doc.embedding = emb - return {"documents": documents_with_embeddings} + return {"documents": documents} From a66add2a6ab03a57978d4e6f55893c1633a4a2a2 Mon Sep 17 00:00:00 2001 From: Ashwin Mathur <97467100+awinml@users.noreply.github.com> Date: Mon, 6 Nov 2023 16:08:11 +0530 Subject: [PATCH 2/2] refactor: Update `INSTRUCTOR Embedders` to work with new `Document` class (#44) * Refactor Document to content and meta * Update docstrings: Add usage examples * Update code --- .../instructor_document_embedder.py | 60 +++++++++++++++++-- .../instructor_text_embedder.py | 30 ++++++++-- .../tests/test_instructor_backend.py | 4 +- .../test_instructor_document_embedder.py | 16 ++--- .../tests/test_instructor_text_embedder.py | 8 ++- 5 files changed, 95 insertions(+), 23 deletions(-) diff --git 
a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py index fc2d43b93..083986385 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -10,7 +10,50 @@ class InstructorDocumentEmbedder: """ A component for computing Document embeddings using INSTRUCTOR embedding models. The embedding of each Document is stored in the `embedding` field of the Document. - """ + + Usage example: + ```python + # To use this component, install the "instructor-embedders-haystack" package. + # pip install instructor-embedders-haystack + + from instructor_embedders.instructor_document_embedder import InstructorDocumentEmbedder + from haystack.preview.dataclasses import Document + + + doc_embedding_instruction = "Represent the Medical Document for retrieval:" + + doc_embedder = InstructorDocumentEmbedder( + model_name_or_path="hkunlp/instructor-base", + instruction=doc_embedding_instruction, + batch_size=32, + device="cpu", + ) + + doc_embedder.warm_up() + + # Text taken from PubMed QA Dataset (https://huggingface.co/datasets/pubmed_qa) + document_list = [ + Document( + content="Oxidative stress generated within inflammatory joints can produce autoimmune phenomena and joint destruction. Radical species with oxidative activity, including reactive nitrogen species, represent mediators of inflammation and cartilage damage.", + meta={ + "pubid": "25,445,628", + "long_answer": "yes", + }, + ), + Document( + content="Plasma levels of pancreatic polypeptide (PP) rise upon food intake. 
Although other pancreatic islet hormones, such as insulin and glucagon, have been extensively investigated, PP secretion and actions are still poorly understood.", + meta={ + "pubid": "25,445,712", + "long_answer": "yes", + }, + ), + ] + + result = doc_embedder.run(document_list) + print(f"Document Text: {result['documents'][0].content}") + print(f"Document Embedding: {result['documents'][0].embedding}") + print(f"Embedding Dimension: {len(result['documents'][0].embedding)}") + """ # noqa: E501 def __init__( self, @@ -100,8 +143,10 @@ def run(self, documents: List[Document]): The embedding of each Document is stored in the `embedding` field of the Document. """ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): - msg = ("InstructorDocumentEmbedder expects a list of Documents as input. " - "In case you want to embed a list of strings, please use the InstructorTextEmbedder.") + msg = ( + "InstructorDocumentEmbedder expects a list of Documents as input. " + "In case you want to embed a list of strings, please use the InstructorTextEmbedder." + ) raise TypeError(msg) if not hasattr(self, "embedding_backend"): msg = "The embedding model has not been loaded. Please call warm_up() before running."
@@ -112,11 +157,14 @@ def run(self, documents: List[Document]): texts_to_embed = [] for doc in documents: meta_values_to_embed = [ - str(doc.metadata[key]) + str(doc.meta[key]) for key in self.metadata_fields_to_embed - if key in doc.metadata and doc.metadata[key] is not None + if key in doc.meta and doc.meta[key] is not None + ] + text_to_embed = [ + self.instruction, + self.embedding_separator.join([*meta_values_to_embed, doc.content or ""]), ] - text_to_embed = [self.instruction, self.embedding_separator.join([*meta_values_to_embed, doc.text or ""])] texts_to_embed.append(text_to_embed) embeddings = self.embedding_backend.embed( diff --git a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py index dd0ec48c3..693ef57bb 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py +++ b/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py @@ -8,8 +8,28 @@ @component class InstructorTextEmbedder: """ - A component for embedding strings using Sentence Transformers models. - """ + A component for embedding strings using INSTRUCTOR embedding models. + + Usage example: + ```python + # To use this component, install the "instructor-embedders-haystack" package. + # pip install instructor-embedders-haystack + + from instructor_embedders.instructor_text_embedder import InstructorTextEmbedder + + text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!" 
+ instruction = ( + "Represent the Amazon comment for classifying the sentence as positive or negative" + ) + + text_embedder = InstructorTextEmbedder( + model_name_or_path="hkunlp/instructor-base", instruction=instruction, + device="cpu" + ) + + embedding = text_embedder.run(text) + ``` + """ # noqa: E501 def __init__( self, @@ -88,8 +108,10 @@ def warm_up(self): def run(self, text: str): """Embed a string.""" if not isinstance(text, str): - msg = ("InstructorTextEmbedder expects a string as input. " - "In case you want to embed a list of Documents, please use the InstructorDocumentEmbedder.") + msg = ( + "InstructorTextEmbedder expects a string as input. " + "In case you want to embed a list of Documents, please use the InstructorDocumentEmbedder." + ) raise TypeError(msg) if not hasattr(self, "embedding_backend"): msg = "The embedding model has not been loaded. Please call warm_up() before running." diff --git a/components/instructor-embedders/tests/test_instructor_backend.py b/components/instructor-embedders/tests/test_instructor_backend.py index 334e02f6f..6cd9d8b77 100644 --- a/components/instructor-embedders/tests/test_instructor_backend.py +++ b/components/instructor-embedders/tests/test_instructor_backend.py @@ -7,7 +7,7 @@ @pytest.mark.unit @patch("instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR") -def test_factory_behavior(mock_instructor): # noqa: ARG001 +def test_factory_behavior(mock_instructor): # noqa: ARG001 embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( model_name_or_path="hkunlp/instructor-large", device="cpu" ) @@ -33,7 +33,7 @@ def test_model_initialization(mock_instructor): @pytest.mark.unit @patch("instructor_embedders.embedding_backend.instructor_backend.INSTRUCTOR") -def test_embedding_function_with_kwargs(mock_instructor): # noqa: ARG001 +def test_embedding_function_with_kwargs(mock_instructor): # noqa: ARG001 embedding_backend = _InstructorEmbeddingBackendFactory.get_embedding_backend( 
model_name_or_path="hkunlp/instructor-base" ) diff --git a/components/instructor-embedders/tests/test_instructor_document_embedder.py b/components/instructor-embedders/tests/test_instructor_document_embedder.py index faa9d715b..f65d81b5f 100644 --- a/components/instructor-embedders/tests/test_instructor_document_embedder.py +++ b/components/instructor-embedders/tests/test_instructor_document_embedder.py @@ -198,7 +198,7 @@ def test_embed(self): embedder.embedding_backend = MagicMock() embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005 - documents = [Document(text=f"Sample-document text {i}") for i in range(5)] + documents = [Document(content=f"Sample-document text {i}") for i in range(5)] result = embedder.run(documents=documents) @@ -239,9 +239,7 @@ def test_embed_metadata(self): ) embedder.embedding_backend = MagicMock() - documents = [ - Document(text=f"document-number {i}", metadata={"meta_field": f"meta_value {i}"}) for i in range(5) - ] + documents = [Document(content=f"document-number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)] embedder.run(documents=documents) @@ -260,12 +258,14 @@ def test_embed_metadata(self): @pytest.mark.integration def test_run(self): - embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base", - device="cpu", - instruction="Represent the Science document for retrieval") + embedder = InstructorDocumentEmbedder( + model_name_or_path="hkunlp/instructor-base", + device="cpu", + instruction="Represent the Science document for retrieval", + ) embedder.warm_up() - doc = Document(text="Parton energy loss in QCD matter") + doc = Document(content="Parton energy loss in QCD matter") result = embedder.run(documents=[doc]) embedding = result["documents"][0].embedding diff --git a/components/instructor-embedders/tests/test_instructor_text_embedder.py b/components/instructor-embedders/tests/test_instructor_text_embedder.py index e3afe91e8..4481fcb97 
100644 --- a/components/instructor-embedders/tests/test_instructor_text_embedder.py +++ b/components/instructor-embedders/tests/test_instructor_text_embedder.py @@ -200,9 +200,11 @@ def test_run_wrong_incorrect_format(self): @pytest.mark.integration def test_run(self): - embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base", - device="cpu", - instruction="Represent the Science sentence for retrieval") + embedder = InstructorTextEmbedder( + model_name_or_path="hkunlp/instructor-base", + device="cpu", + instruction="Represent the Science sentence for retrieval", + ) embedder.warm_up() text = "Parton energy loss in QCD matter"