Skip to content

Commit

Permalink
feat(FastEmbed): renaming SPLADE to Sparse because it makes more sense
Browse files Browse the repository at this point in the history
  • Loading branch information
lambda-science committed Mar 14, 2024
1 parent 8e20cee commit d4f836a
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 75 deletions.
10 changes: 5 additions & 5 deletions integrations/fastembed/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,24 +43,24 @@ doc = Document(content="fastembed is supported by and maintained by Qdrant.", me
result = embedder.run(documents=[doc])
```

You can use `FastembedTextSPLADEEmbedder` and `FastembedDocumentSPLADEEmbedder` by importing as:
You can use `FastembedSparseTextEmbedder` and `FastembedSparseDocumentEmbedder` by importing them as follows:

```python
from haystack_integrations.components.embedders.fastembed import FastembedTextSPLADEEmbedder
from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder

text = "fastembed is supported by and maintained by Qdrant."
text_embedder = FastembedTextSPLADEEmbedder(
text_embedder = FastembedSparseTextEmbedder(
model="prithvida/SPLADE_PP_en_v1"
)
text_embedder.warm_up()
embedding = text_embedder.run(text)["embedding"]
```

```python
from haystack_integrations.components.embedders.fastembed import FastembedDocumentSPLADEEmbedder
from haystack_integrations.components.embedders.fastembed import FastembedSparseDocumentEmbedder
from haystack.dataclasses import Document

embedder = FastembedDocumentSPLADEEmbedder(
embedder = FastembedSparseDocumentEmbedder(
model="prithvida/SPLADE_PP_en_v1",
)
embedder.warm_up()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
from .fastembed_document_embedder import FastembedDocumentEmbedder
from .fastembed_text_embedder import FastembedTextEmbedder
from .fastembed_document_SPLADE_embedder import FastembedDocumentSPLADEEmbedder
from .fastembed_text_SPLADE_embedder import FastembedTextSPLADEEmbedder
from .fastembed_sparse_document_embedder import FastembedSparseDocumentEmbedder
from .fastembed_sparse_text_embedder import FastembedSparseTextEmbedder

__all__ = ["FastembedDocumentEmbedder", "FastembedTextEmbedder", "FastembedDocumentSPLADEEmbedder", "FastembedTextSPLADEEmbedder"]
__all__ = ["FastembedDocumentEmbedder", "FastembedTextEmbedder", "FastembedSparseDocumentEmbedder", "FastembedSparseTextEmbedder"]
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@


@component
class FastembedDocumentSPLADEEmbedder:
class FastembedSparseDocumentEmbedder:
"""
FastembedDocumentSPLADEEmbedder computes Document embeddings using Fastembed SPLADE models.
FastembedSparseDocumentEmbedder computes Document embeddings using Fastembed sparse models.
The embedding of each Document is stored in the `meta["_sparse_vector"]` field of the Document.
Expand All @@ -17,10 +17,10 @@ class FastembedDocumentSPLADEEmbedder:
# To use this component, install the "fastembed-haystack" package.
# pip install fastembed-haystack
from haystack_integrations.components.embedders.fastembed import FastembedDocumentSPLADEEmbedder
from haystack_integrations.components.embedders.fastembed import FastembedSparseDocumentEmbedder
from haystack.dataclasses import Document
doc_embedder = FastembedDocumentSPLADEEmbedder(
doc_embedder = FastembedSparseDocumentEmbedder(
model="prithvida/SPLADE_PP_en_v1",
batch_size=256,
)
Expand Down Expand Up @@ -150,7 +150,7 @@ def run(self, documents: List[Document]):
"""
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
msg = (
"FastembedDocumentSPLADEEmbedder expects a list of Documents as input. "
"FastembedSparseDocumentEmbedder expects a list of Documents as input. "
"In case you want to embed a list of strings, please use the FastembedTextEmbedder."
)
raise TypeError(msg)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@


@component
class FastembedTextSPLADEEmbedder:
class FastembedSparseTextEmbedder:
"""
FastembedTextSPLADEEmbedder computes string embedding using fastembed SPLADE models.
FastembedSparseTextEmbedder computes the embedding of a string using Fastembed sparse models.
Usage example:
```python
# To use this component, install the "fastembed-haystack" package.
# pip install fastembed-haystack
from haystack_integrations.components.embedders.fastembed import FastembedTextSPLADEEmbedder
from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder
text = "It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. Do Not order this if you have a Mac!!"
text_embedder = FastembedTextSPLADEEmbedder(
text_embedder = FastembedSparseTextEmbedder(
model="prithvida/SPLADE_PP_en_v1"
)
text_embedder.warm_up()
Expand All @@ -40,7 +40,7 @@ def __init__(
parallel: Optional[int] = None,
):
"""
Create a FastembedTextSPLADEEmbedder component.
Create a FastembedSparseTextEmbedder component.
:param model: Local path or name of the model in Fastembed's model hub, such as `prithvida/SPLADE_PP_en_v1`
:param cache_dir: The path to the cache directory.
Expand Down Expand Up @@ -107,7 +107,7 @@ def run(self, text: str):
"""
if not isinstance(text, str):
msg = (
"FastembedTextSPLADEEmbedder expects a string as input. "
"FastembedSparseTextEmbedder expects a string as input. "
"In case you want to embed a list of Documents, please use the FastembedDocumentEmbedder."
)
raise TypeError(msg)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@
import numpy as np
import pytest
from haystack import Document, default_from_dict
from haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder import (
FastembedDocumentSPLADEEmbedder,
from haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder import (
FastembedSparseDocumentEmbedder,
)


class TestFastembedDocumentSPLADEEmbedderDoc:
class TestFastembedSparseDocumentEmbedderDoc:
def test_init_default(self):
"""
Test default initialization parameters for FastembedDocumentSPLADEEmbedder.
Test default initialization parameters for FastembedSparseDocumentEmbedder.
"""
embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1")
assert embedder.model_name == "prithvida/SPLADE_PP_en_v1"
assert embedder.cache_dir is None
assert embedder.threads is None
Expand All @@ -27,9 +27,9 @@ def test_init_default(self):

def test_init_with_parameters(self):
"""
Test custom initialization parameters for FastembedDocumentSPLADEEmbedder.
Test custom initialization parameters for FastembedSparseDocumentEmbedder.
"""
embedder = FastembedDocumentSPLADEEmbedder(
embedder = FastembedSparseDocumentEmbedder(
model="prithvida/SPLADE_PP_en_v1",
cache_dir="fake_dir",
threads=2,
Expand All @@ -54,12 +54,12 @@ def test_init_with_parameters(self):

def test_to_dict(self):
"""
Test serialization of FastembedDocumentSPLADEEmbedder to a dictionary, using default initialization parameters.
Test serialization of FastembedSparseDocumentEmbedder to a dictionary, using default initialization parameters.
"""
embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1")
embedder_dict = embedder.to_dict()
assert embedder_dict == {
"type": "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder.FastembedDocumentSPLADEEmbedder", # noqa
"type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa
"init_parameters": {
"model": "prithvida/SPLADE_PP_en_v1",
"cache_dir": None,
Expand All @@ -76,9 +76,9 @@ def test_to_dict(self):

def test_to_dict_with_custom_init_parameters(self):
"""
Test serialization of FastembedDocumentSPLADEEmbedder to a dictionary, using custom initialization parameters.
Test serialization of FastembedSparseDocumentEmbedder to a dictionary, using custom initialization parameters.
"""
embedder = FastembedDocumentSPLADEEmbedder(
embedder = FastembedSparseDocumentEmbedder(
model="prithvida/SPLADE_PP_en_v1",
cache_dir="fake_dir",
threads=2,
Expand All @@ -92,7 +92,7 @@ def test_to_dict_with_custom_init_parameters(self):
)
embedder_dict = embedder.to_dict()
assert embedder_dict == {
"type": "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder.FastembedDocumentSPLADEEmbedder", # noqa
"type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa
"init_parameters": {
"model": "prithvida/SPLADE_PP_en_v1",
"cache_dir": "fake_dir",
Expand All @@ -109,10 +109,10 @@ def test_to_dict_with_custom_init_parameters(self):

def test_from_dict(self):
"""
Test deserialization of FastembedDocumentSPLADEEmbedder from a dictionary, using default initialization parameters.
Test deserialization of FastembedSparseDocumentEmbedder from a dictionary, using default initialization parameters.
"""
embedder_dict = {
"type": "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder.FastembedDocumentSPLADEEmbedder", # noqa
"type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa
"init_parameters": {
"model": "prithvida/SPLADE_PP_en_v1",
"cache_dir": None,
Expand All @@ -126,7 +126,7 @@ def test_from_dict(self):
"embedding_separator": "\n",
},
}
embedder = default_from_dict(FastembedDocumentSPLADEEmbedder, embedder_dict)
embedder = default_from_dict(FastembedSparseDocumentEmbedder, embedder_dict)
assert embedder.model_name == "prithvida/SPLADE_PP_en_v1"
assert embedder.cache_dir is None
assert embedder.threads is None
Expand All @@ -140,10 +140,10 @@ def test_from_dict(self):

def test_from_dict_with_custom_init_parameters(self):
"""
Test deserialization of FastembedDocumentSPLADEEmbedder from a dictionary, using custom initialization parameters.
Test deserialization of FastembedSparseDocumentEmbedder from a dictionary, using custom initialization parameters.
"""
embedder_dict = {
"type": "haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder.FastembedDocumentSPLADEEmbedder", # noqa
"type": "haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder.FastembedSparseDocumentEmbedder", # noqa
"init_parameters": {
"model": "prithvida/SPLADE_PP_en_v1",
"cache_dir": "fake_dir",
Expand All @@ -157,7 +157,7 @@ def test_from_dict_with_custom_init_parameters(self):
"embedding_separator": " | ",
},
}
embedder = default_from_dict(FastembedDocumentSPLADEEmbedder, embedder_dict)
embedder = default_from_dict(FastembedSparseDocumentEmbedder, embedder_dict)
assert embedder.model_name == "prithvida/SPLADE_PP_en_v1"
assert embedder.cache_dir == "fake_dir"
assert embedder.threads == 2
Expand All @@ -170,27 +170,27 @@ def test_from_dict_with_custom_init_parameters(self):
assert embedder.embedding_separator == " | "

@patch(
"haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder._FastembedSparseEmbeddingBackendFactory"
"haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder._FastembedSparseEmbeddingBackendFactory"
)
def test_warmup(self, mocked_factory):
"""
Test for checking embedder instances after warm-up.
"""
embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1")
mocked_factory.get_embedding_backend.assert_not_called()
embedder.warm_up()
mocked_factory.get_embedding_backend.assert_called_once_with(
model_name="prithvida/SPLADE_PP_en_v1", cache_dir=None, threads=None
)

@patch(
"haystack_integrations.components.embedders.fastembed.fastembed_document_SPLADE_embedder._FastembedSparseEmbeddingBackendFactory"
"haystack_integrations.components.embedders.fastembed.fastembed_sparse_document_embedder._FastembedSparseEmbeddingBackendFactory"
)
def test_warmup_does_not_reload(self, mocked_factory):
"""
Test for checking backend instances after multiple warm-ups.
"""
embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1")
mocked_factory.get_embedding_backend.assert_not_called()
embedder.warm_up()
embedder.warm_up()
Expand All @@ -211,7 +211,7 @@ def test_embed(self):
"""
Test for checking output dimensions and embedding dimensions.
"""
embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1")
embedder.embedding_backend = MagicMock()
embedder.embedding_backend.embed = lambda x, **kwargs: self._generate_mocked_sparse_embedding(len(x)) # noqa: ARG005

Expand All @@ -233,20 +233,20 @@ def test_embed_incorrect_input_format(self):
"""
Test for checking incorrect input format when creating embedding.
"""
embedder = FastembedDocumentSPLADEEmbedder(model="prithvida/SPLADE_PP_en_v1")
embedder = FastembedSparseDocumentEmbedder(model="prithvida/SPLADE_PP_en_v1")

string_input = "text"
list_integers_input = [1, 2, 3]

with pytest.raises(
TypeError,
match="FastembedDocumentSPLADEEmbedder expects a list of Documents as input.",
match="FastembedSparseDocumentEmbedder expects a list of Documents as input.",
):
embedder.run(documents=string_input)

with pytest.raises(
TypeError,
match="FastembedDocumentSPLADEEmbedder expects a list of Documents as input.",
match="FastembedSparseDocumentEmbedder expects a list of Documents as input.",
):
embedder.run(documents=list_integers_input)

Expand All @@ -255,7 +255,7 @@ def test_embed_metadata(self):
Test for checking output dimensions and embedding dimensions for documents
with a custom instruction and metadata.
"""
embedder = FastembedDocumentSPLADEEmbedder(
embedder = FastembedSparseDocumentEmbedder(
model="model",
meta_fields_to_embed=["meta_field"],
embedding_separator="\n",
Expand All @@ -281,7 +281,7 @@ def test_embed_metadata(self):

@pytest.mark.integration
def test_run(self):
embedder = FastembedDocumentSPLADEEmbedder(
embedder = FastembedSparseDocumentEmbedder(
model="prithvida/SPLADE_PP_en_v1",
)
embedder.warm_up()
Expand Down
Loading

0 comments on commit d4f836a

Please sign in to comment.