Skip to content

Commit

Permalink
refactor: adopt token instead of use_auth_token in HF components (d…
Browse files Browse the repository at this point in the history
…eepset-ai#6040)

* move embedding backends

* use token in Sentence Transformers embeddings

* more compact token handling

* token parameter in reader

* add token to ranker

* release note

* add test for reader
  • Loading branch information
anakin87 authored Oct 17, 2023
1 parent 4e4af99 commit 21d894d
Show file tree
Hide file tree
Showing 9 changed files with 153 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def __init__(
self,
model_name_or_path: str = "sentence-transformers/all-mpnet-base-v2",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
token: Union[bool, str, None] = None,
prefix: str = "",
suffix: str = "",
batch_size: int = 32,
Expand All @@ -33,7 +33,7 @@ def __init__(
such as ``'sentence-transformers/all-mpnet-base-v2'``.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
Defaults to CPU.
:param use_auth_token: The API token used to download private models from Hugging Face.
:param token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param prefix: A string to add to the beginning of each Document text before embedding.
Expand All @@ -48,7 +48,7 @@ def __init__(
self.model_name_or_path = model_name_or_path
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
self.token = token
self.prefix = prefix
self.suffix = suffix
self.batch_size = batch_size
Expand All @@ -71,7 +71,7 @@ def to_dict(self) -> Dict[str, Any]:
self,
model_name_or_path=self.model_name_or_path,
device=self.device,
use_auth_token=self.use_auth_token,
token=self.token if not isinstance(self.token, str) else None, # don't serialize valid tokens
prefix=self.prefix,
suffix=self.suffix,
batch_size=self.batch_size,
Expand All @@ -94,7 +94,7 @@ def warm_up(self):
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.token
)

@component.output_types(documents=List[Document])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def __init__(
self,
model_name_or_path: str = "sentence-transformers/all-mpnet-base-v2",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
token: Union[bool, str, None] = None,
prefix: str = "",
suffix: str = "",
batch_size: int = 32,
Expand All @@ -30,7 +30,7 @@ def __init__(
such as ``'sentence-transformers/all-mpnet-base-v2'``.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
Defaults to CPU.
:param use_auth_token: The API token used to download private models from Hugging Face.
:param token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param prefix: A string to add to the beginning of each text.
Expand All @@ -43,7 +43,7 @@ def __init__(
self.model_name_or_path = model_name_or_path
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
self.token = token
self.prefix = prefix
self.suffix = suffix
self.batch_size = batch_size
Expand All @@ -64,7 +64,7 @@ def to_dict(self) -> Dict[str, Any]:
self,
model_name_or_path=self.model_name_or_path,
device=self.device,
use_auth_token=self.use_auth_token,
token=self.token if not isinstance(self.token, str) else None, # don't serialize valid tokens
prefix=self.prefix,
suffix=self.suffix,
batch_size=self.batch_size,
Expand All @@ -85,7 +85,7 @@ def warm_up(self):
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.token
)

@component.output_types(embedding=List[float])
Expand Down
21 changes: 16 additions & 5 deletions haystack/preview/components/rankers/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,19 @@ class SimilarityRanker:
def __init__(
self,
model_name_or_path: Union[str, Path] = "cross-encoder/ms-marco-MiniLM-L-6-v2",
top_k: int = 10,
device: str = "cpu",
token: Union[bool, str, None] = None,
top_k: int = 10,
):
"""
Creates an instance of SimilarityRanker.
:param model_name_or_path: Path to a pre-trained sentence-transformers model.
:param top_k: The maximum number of documents to return per query.
:param device: torch device (for example, cuda:0, cpu, mps) to limit model inference to a specific device.
:param token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param top_k: The maximum number of documents to return per query.
"""
torch_and_transformers_import.check()

Expand All @@ -53,6 +57,7 @@ def __init__(
raise ValueError(f"top_k must be > 0, but got {top_k}")
self.top_k = top_k
self.device = device
self.token = token
self.model = None
self.tokenizer = None

Expand All @@ -67,16 +72,22 @@ def warm_up(self):
Warm up the model and tokenizer used in scoring the documents.
"""
if self.model_name_or_path and not self.model:
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path)
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path, token=self.token)
self.model = self.model.to(self.device)
self.model.eval()
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, token=self.token)

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
"""
return default_to_dict(self, top_k=self.top_k, device=self.device, model_name_or_path=self.model_name_or_path)
return default_to_dict(
self,
device=self.device,
model_name_or_path=self.model_name_or_path,
token=self.token if not isinstance(self.token, str) else None, # don't serialize valid tokens
top_k=self.top_k,
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "SimilarityRanker":
Expand Down
12 changes: 10 additions & 2 deletions haystack/preview/components/readers/extractive.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def __init__(
self,
model_name_or_path: Union[Path, str] = "deepset/roberta-base-squad2-distilled",
device: Optional[str] = None,
token: Union[bool, str, None] = None,
top_k: int = 20,
confidence_threshold: Optional[float] = None,
max_seq_length: int = 384,
Expand All @@ -40,6 +41,9 @@ def __init__(
Can either be a path to a folder containing the model files or an identifier for the HF hub
Default: `'deepset/roberta-base-squad2-distilled'`
:param device: Pytorch device string. Uses GPU by default if available
:param token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param top_k: Number of answers to return per query.
It is required even if confidence_threshold is set. Defaults to 20.
:param confidence_threshold: Answers with a confidence score below this value will not be returned
Expand All @@ -58,6 +62,7 @@ def __init__(
self.model_name_or_path = str(model_name_or_path)
self.model = None
self.device = device
self.token = token
self.max_seq_length = max_seq_length
self.top_k = top_k
self.confidence_threshold = confidence_threshold
Expand All @@ -81,6 +86,7 @@ def to_dict(self) -> Dict[str, Any]:
self,
model_name_or_path=self.model_name_or_path,
device=self.device,
token=self.token if not isinstance(self.token, str) else None,
max_seq_length=self.max_seq_length,
top_k=self.top_k,
confidence_threshold=self.confidence_threshold,
Expand All @@ -104,8 +110,10 @@ def warm_up(self):
self.device = self.device or "cuda:0"
else:
self.device = self.device or "cpu:0"
self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_name_or_path).to(self.device)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_name_or_path, token=self.token).to(
self.device
)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, token=self.token)

def _flatten_documents(
self, queries: List[str], documents: List[List[Document]]
Expand Down
8 changes: 8 additions & 0 deletions releasenotes/notes/adopt-hf-token-770edaccf6278ad9.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
preview:
- |
Adopt the Hugging Face `token` parameter in place of the deprecated `use_auth_token`.
Add this parameter to `ExtractiveReader` and `SimilarityRanker` so that
private models can be loaded.
Handle `token` safely during serialization: if it is a string (a possibly valid token),
it is not serialized and `None` is written in its place.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def test_init_default(self):
embedder = SentenceTransformersDocumentEmbedder(model_name_or_path="model")
assert embedder.model_name_or_path == "model"
assert embedder.device == "cpu"
assert embedder.use_auth_token is None
assert embedder.token is None
assert embedder.prefix == ""
assert embedder.suffix == ""
assert embedder.batch_size == 32
Expand All @@ -28,7 +28,7 @@ def test_init_with_parameters(self):
embedder = SentenceTransformersDocumentEmbedder(
model_name_or_path="model",
device="cuda",
use_auth_token=True,
token=True,
prefix="prefix",
suffix="suffix",
batch_size=64,
Expand All @@ -39,7 +39,7 @@ def test_init_with_parameters(self):
)
assert embedder.model_name_or_path == "model"
assert embedder.device == "cuda"
assert embedder.use_auth_token is True
assert embedder.token is True
assert embedder.prefix == "prefix"
assert embedder.suffix == "suffix"
assert embedder.batch_size == 64
Expand All @@ -57,7 +57,7 @@ def test_to_dict(self):
"init_parameters": {
"model_name_or_path": "model",
"device": "cpu",
"use_auth_token": None,
"token": None,
"prefix": "",
"suffix": "",
"batch_size": 32,
Expand All @@ -73,7 +73,7 @@ def test_to_dict_with_custom_init_parameters(self):
component = SentenceTransformersDocumentEmbedder(
model_name_or_path="model",
device="cuda",
use_auth_token="the-token",
token="the-token",
prefix="prefix",
suffix="suffix",
batch_size=64,
Expand All @@ -83,12 +83,13 @@ def test_to_dict_with_custom_init_parameters(self):
embedding_separator=" - ",
)
data = component.to_dict()

assert data == {
"type": "SentenceTransformersDocumentEmbedder",
"init_parameters": {
"model_name_or_path": "model",
"device": "cuda",
"use_auth_token": "the-token",
"token": None, # the token is not serialized
"prefix": "prefix",
"suffix": "suffix",
"batch_size": 64,
Expand All @@ -106,7 +107,7 @@ def test_from_dict(self):
"init_parameters": {
"model_name_or_path": "model",
"device": "cuda",
"use_auth_token": "the-token",
"token": None,
"prefix": "prefix",
"suffix": "suffix",
"batch_size": 64,
Expand All @@ -119,7 +120,7 @@ def test_from_dict(self):
component = SentenceTransformersDocumentEmbedder.from_dict(data)
assert component.model_name_or_path == "model"
assert component.device == "cuda"
assert component.use_auth_token == "the-token"
assert component.token is None
assert component.prefix == "prefix"
assert component.suffix == "suffix"
assert component.batch_size == 64
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_init_default(self):
embedder = SentenceTransformersTextEmbedder(model_name_or_path="model")
assert embedder.model_name_or_path == "model"
assert embedder.device == "cpu"
assert embedder.use_auth_token is None
assert embedder.token is None
assert embedder.prefix == ""
assert embedder.suffix == ""
assert embedder.batch_size == 32
Expand All @@ -24,7 +24,7 @@ def test_init_with_parameters(self):
embedder = SentenceTransformersTextEmbedder(
model_name_or_path="model",
device="cuda",
use_auth_token=True,
token=True,
prefix="prefix",
suffix="suffix",
batch_size=64,
Expand All @@ -33,7 +33,7 @@ def test_init_with_parameters(self):
)
assert embedder.model_name_or_path == "model"
assert embedder.device == "cuda"
assert embedder.use_auth_token is True
assert embedder.token is True
assert embedder.prefix == "prefix"
assert embedder.suffix == "suffix"
assert embedder.batch_size == 64
Expand All @@ -49,7 +49,7 @@ def test_to_dict(self):
"init_parameters": {
"model_name_or_path": "model",
"device": "cpu",
"use_auth_token": None,
"token": None,
"prefix": "",
"suffix": "",
"batch_size": 32,
Expand All @@ -63,7 +63,7 @@ def test_to_dict_with_custom_init_parameters(self):
component = SentenceTransformersTextEmbedder(
model_name_or_path="model",
device="cuda",
use_auth_token=True,
token=True,
prefix="prefix",
suffix="suffix",
batch_size=64,
Expand All @@ -76,7 +76,7 @@ def test_to_dict_with_custom_init_parameters(self):
"init_parameters": {
"model_name_or_path": "model",
"device": "cuda",
"use_auth_token": True,
"token": True,
"prefix": "prefix",
"suffix": "suffix",
"batch_size": 64,
Expand All @@ -85,14 +85,32 @@ def test_to_dict_with_custom_init_parameters(self):
},
}

@pytest.mark.unit
def test_to_dict_not_serialize_token(self):
component = SentenceTransformersTextEmbedder(model_name_or_path="model", token="awesome-token")
data = component.to_dict()
assert data == {
"type": "SentenceTransformersTextEmbedder",
"init_parameters": {
"model_name_or_path": "model",
"device": "cpu",
"token": None,
"prefix": "",
"suffix": "",
"batch_size": 32,
"progress_bar": True,
"normalize_embeddings": False,
},
}

@pytest.mark.unit
def test_from_dict(self):
data = {
"type": "SentenceTransformersTextEmbedder",
"init_parameters": {
"model_name_or_path": "model",
"device": "cuda",
"use_auth_token": True,
"token": True,
"prefix": "prefix",
"suffix": "suffix",
"batch_size": 64,
Expand All @@ -103,7 +121,7 @@ def test_from_dict(self):
component = SentenceTransformersTextEmbedder.from_dict(data)
assert component.model_name_or_path == "model"
assert component.device == "cuda"
assert component.use_auth_token is True
assert component.token is True
assert component.prefix == "prefix"
assert component.suffix == "suffix"
assert component.batch_size == 64
Expand Down
Loading

0 comments on commit 21d894d

Please sign in to comment.