Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore!: Rename model_name_or_path to model in the Instructor integration #229

Merged
merged 5 commits into from
Jan 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class InstructorDocumentEmbedder:
doc_embedding_instruction = "Represent the Medical Document for retrieval:"

doc_embedder = InstructorDocumentEmbedder(
model_name_or_path="hkunlp/instructor-base",
model="hkunlp/instructor-base",
instruction=doc_embedding_instruction,
batch_size=32,
device="cpu",
Expand Down Expand Up @@ -60,7 +60,7 @@ class InstructorDocumentEmbedder:

def __init__(
self,
model_name_or_path: str = "hkunlp/instructor-base",
model: str = "hkunlp/instructor-base",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
instruction: str = "Represent the document",
Expand All @@ -73,7 +73,7 @@ def __init__(
"""
Create an InstructorDocumentEmbedder component.

:param model_name_or_path: Local path or name of the model in Hugging Face's model hub,
:param model: Local path or name of the model in Hugging Face's model hub,
such as ``'hkunlp/instructor-base'``.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
If None, the device falls back to 'cpu'.
Expand All @@ -95,7 +95,7 @@ def __init__(
:param embedding_separator: Separator used to concatenate the meta fields to the Document content.
"""

self.model_name_or_path = model_name_or_path
shadeMe marked this conversation as resolved.
Show resolved Hide resolved
self.model_name_or_path = model
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
Expand All @@ -112,7 +112,7 @@ def to_dict(self) -> Dict[str, Any]:
"""
return default_to_dict(
self,
model_name_or_path=self.model_name_or_path,
model=self.model_name_or_path,
device=self.device,
use_auth_token=self.use_auth_token,
instruction=self.instruction,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class InstructorTextEmbedder:
)

text_embedder = InstructorTextEmbedder(
model_name_or_path="hkunlp/instructor-base", instruction=instruction,
model="hkunlp/instructor-base", instruction=instruction,
device="cpu"
)

Expand All @@ -36,7 +36,7 @@ class InstructorTextEmbedder:

def __init__(
self,
model_name_or_path: str = "hkunlp/instructor-base",
model: str = "hkunlp/instructor-base",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
instruction: str = "Represent the sentence",
Expand All @@ -47,7 +47,7 @@ def __init__(
"""
Create an InstructorTextEmbedder component.

:param model_name_or_path: Local path or name of the model in Hugging Face's model hub,
:param model: Local path or name of the model in Hugging Face's model hub,
such as ``'hkunlp/instructor-base'``.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
If None, the device falls back to 'cpu'.
Expand All @@ -67,7 +67,7 @@ def __init__(
:param normalize_embeddings: If set to true, returned vectors will have the length of 1.
"""

self.model_name_or_path = model_name_or_path
shadeMe marked this conversation as resolved.
Show resolved Hide resolved
self.model_name_or_path = model
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
Expand All @@ -82,7 +82,7 @@ def to_dict(self) -> Dict[str, Any]:
"""
return default_to_dict(
self,
model_name_or_path=self.model_name_or_path,
model=self.model_name_or_path,
device=self.device,
use_auth_token=self.use_auth_token,
instruction=self.instruction,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_init_default(self):
"""
Test default initialization parameters for InstructorDocumentEmbedder.
"""
embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base")
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")
assert embedder.model_name_or_path == "hkunlp/instructor-base"
assert embedder.device == "cpu"
assert embedder.use_auth_token is None
Expand All @@ -28,7 +28,7 @@ def test_init_with_parameters(self):
Test custom initialization parameters for InstructorDocumentEmbedder.
"""
embedder = InstructorDocumentEmbedder(
model_name_or_path="hkunlp/instructor-base",
model="hkunlp/instructor-base",
device="cuda",
use_auth_token=True,
instruction="Represent the 'domain' 'text_type' for 'task_objective'",
Expand All @@ -52,12 +52,12 @@ def test_to_dict(self):
"""
Test serialization of InstructorDocumentEmbedder to a dictionary, using default initialization parameters.
"""
embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base")
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")
embedder_dict = embedder.to_dict()
assert embedder_dict == {
"type": "instructor_embedders_haystack.instructor_document_embedder.InstructorDocumentEmbedder",
"init_parameters": {
"model_name_or_path": "hkunlp/instructor-base",
"model": "hkunlp/instructor-base",
"device": "cpu",
"use_auth_token": None,
"instruction": "Represent the document",
Expand All @@ -74,7 +74,7 @@ def test_to_dict_with_custom_init_parameters(self):
Test serialization of InstructorDocumentEmbedder to a dictionary, using custom initialization parameters.
"""
embedder = InstructorDocumentEmbedder(
model_name_or_path="hkunlp/instructor-base",
model="hkunlp/instructor-base",
device="cuda",
use_auth_token=True,
instruction="Represent the financial document for retrieval",
Expand All @@ -88,7 +88,7 @@ def test_to_dict_with_custom_init_parameters(self):
assert embedder_dict == {
"type": "instructor_embedders_haystack.instructor_document_embedder.InstructorDocumentEmbedder",
"init_parameters": {
"model_name_or_path": "hkunlp/instructor-base",
"model": "hkunlp/instructor-base",
"device": "cuda",
"use_auth_token": True,
"instruction": "Represent the financial document for retrieval",
Expand All @@ -107,7 +107,7 @@ def test_from_dict(self):
embedder_dict = {
"type": "instructor_embedders_haystack.instructor_document_embedder.InstructorDocumentEmbedder",
"init_parameters": {
"model_name_or_path": "hkunlp/instructor-base",
"model": "hkunlp/instructor-base",
"device": "cpu",
"use_auth_token": None,
"instruction": "Represent the 'domain' 'text_type' for 'task_objective'",
Expand Down Expand Up @@ -136,7 +136,7 @@ def test_from_dict_with_custom_init_parameters(self):
embedder_dict = {
"type": "instructor_embedders_haystack.instructor_document_embedder.InstructorDocumentEmbedder",
"init_parameters": {
"model_name_or_path": "hkunlp/instructor-base",
"model": "hkunlp/instructor-base",
"device": "cuda",
"use_auth_token": True,
"instruction": "Represent the financial document for retrieval",
Expand All @@ -163,7 +163,7 @@ def test_warmup(self, mocked_factory):
"""
Test for checking embedder instances after warm-up.
"""
embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base")
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")
mocked_factory.get_embedding_backend.assert_not_called()
embedder.warm_up()
mocked_factory.get_embedding_backend.assert_called_once_with(
Expand All @@ -175,7 +175,7 @@ def test_warmup_does_not_reload(self, mocked_factory):
"""
Test for checking backend instances after multiple warm-ups.
"""
embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base")
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")
mocked_factory.get_embedding_backend.assert_not_called()
embedder.warm_up()
embedder.warm_up()
Expand All @@ -185,7 +185,7 @@ def test_embed(self):
"""
Test for checking output dimensions and embedding dimensions.
"""
embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-large")
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-large")
embedder.embedding_backend = MagicMock()
embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005

Expand All @@ -204,7 +204,7 @@ def test_embed_incorrect_input_format(self):
"""
Test for checking incorrect input format when creating embedding.
"""
embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base")
embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-base")

string_input = "text"
list_integers_input = [1, 2, 3]
Expand All @@ -221,7 +221,7 @@ def test_embed_metadata(self):
with a custom instruction and metadata.
"""
embedder = InstructorDocumentEmbedder(
model_name_or_path="model",
model="model",
instruction="Represent the financial document for retrieval",
meta_fields_to_embed=["meta_field"],
embedding_separator="\n",
Expand All @@ -248,7 +248,7 @@ def test_embed_metadata(self):
@pytest.mark.integration
def test_run(self):
embedder = InstructorDocumentEmbedder(
model_name_or_path="hkunlp/instructor-base",
model="hkunlp/instructor-base",
device="cpu",
instruction="Represent the Science document for retrieval",
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_init_default(self):
"""
Test default initialization parameters for InstructorTextEmbedder.
"""
embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base")
embedder = InstructorTextEmbedder(model="hkunlp/instructor-base")
assert embedder.model_name_or_path == "hkunlp/instructor-base"
assert embedder.device == "cpu"
assert embedder.use_auth_token is None
Expand All @@ -25,7 +25,7 @@ def test_init_with_parameters(self):
Test custom initialization parameters for InstructorTextEmbedder.
"""
embedder = InstructorTextEmbedder(
model_name_or_path="hkunlp/instructor-base",
model="hkunlp/instructor-base",
device="cuda",
use_auth_token=True,
instruction="Represent the 'domain' 'text_type' for 'task_objective'",
Expand All @@ -45,12 +45,12 @@ def test_to_dict(self):
"""
Test serialization of InstructorTextEmbedder to a dictionary, using default initialization parameters.
"""
embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base")
embedder = InstructorTextEmbedder(model="hkunlp/instructor-base")
embedder_dict = embedder.to_dict()
assert embedder_dict == {
"type": "instructor_embedders_haystack.instructor_text_embedder.InstructorTextEmbedder",
"init_parameters": {
"model_name_or_path": "hkunlp/instructor-base",
"model": "hkunlp/instructor-base",
"device": "cpu",
"use_auth_token": None,
"instruction": "Represent the sentence",
Expand All @@ -65,7 +65,7 @@ def test_to_dict_with_custom_init_parameters(self):
Test serialization of InstructorTextEmbedder to a dictionary, using custom initialization parameters.
"""
embedder = InstructorTextEmbedder(
model_name_or_path="hkunlp/instructor-base",
model="hkunlp/instructor-base",
device="cuda",
use_auth_token=True,
instruction="Represent the financial document for retrieval",
Expand All @@ -77,7 +77,7 @@ def test_to_dict_with_custom_init_parameters(self):
assert embedder_dict == {
"type": "instructor_embedders_haystack.instructor_text_embedder.InstructorTextEmbedder",
"init_parameters": {
"model_name_or_path": "hkunlp/instructor-base",
"model": "hkunlp/instructor-base",
"device": "cuda",
"use_auth_token": True,
"instruction": "Represent the financial document for retrieval",
Expand All @@ -94,7 +94,7 @@ def test_from_dict(self):
embedder_dict = {
"type": "instructor_embedders_haystack.instructor_text_embedder.InstructorTextEmbedder",
"init_parameters": {
"model_name_or_path": "hkunlp/instructor-base",
"model": "hkunlp/instructor-base",
"device": "cpu",
"use_auth_token": None,
"instruction": "Represent the 'domain' 'text_type' for 'task_objective'",
Expand All @@ -119,7 +119,7 @@ def test_from_dict_with_custom_init_parameters(self):
embedder_dict = {
"type": "instructor_embedders_haystack.instructor_text_embedder.InstructorTextEmbedder",
"init_parameters": {
"model_name_or_path": "hkunlp/instructor-base",
"model": "hkunlp/instructor-base",
"device": "cuda",
"use_auth_token": True,
"instruction": "Represent the financial document for retrieval",
Expand All @@ -142,7 +142,7 @@ def test_warmup(self, mocked_factory):
"""
Test for checking embedder instances after warm-up.
"""
embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base")
embedder = InstructorTextEmbedder(model="hkunlp/instructor-base")
mocked_factory.get_embedding_backend.assert_not_called()
embedder.warm_up()
mocked_factory.get_embedding_backend.assert_called_once_with(
Expand All @@ -154,7 +154,7 @@ def test_warmup_does_not_reload(self, mocked_factory):
"""
Test for checking backend instances after multiple warm-ups.
"""
embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base")
embedder = InstructorTextEmbedder(model="hkunlp/instructor-base")
mocked_factory.get_embedding_backend.assert_not_called()
embedder.warm_up()
embedder.warm_up()
Expand All @@ -164,7 +164,7 @@ def test_embed(self):
"""
Test for checking output dimensions and embedding dimensions.
"""
embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-large")
embedder = InstructorTextEmbedder(model="hkunlp/instructor-large")
embedder.embedding_backend = MagicMock()
embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist() # noqa: ARG005

Expand All @@ -180,7 +180,7 @@ def test_run_wrong_incorrect_format(self):
"""
Test for checking incorrect input format when creating embedding.
"""
embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-large")
embedder = InstructorTextEmbedder(model="hkunlp/instructor-large")
embedder.embedding_backend = MagicMock()

list_integers_input = [1, 2, 3]
Expand All @@ -191,7 +191,7 @@ def test_run_wrong_incorrect_format(self):
@pytest.mark.integration
def test_run(self):
embedder = InstructorTextEmbedder(
model_name_or_path="hkunlp/instructor-base",
model="hkunlp/instructor-base",
device="cpu",
instruction="Represent the Science sentence for retrieval",
)
Expand Down