From 0f800eba2b0b4dfdf470df6fbe2d2e31522647b7 Mon Sep 17 00:00:00 2001
From: Charles Pierse
Date: Tue, 2 Apr 2024 19:35:14 +0100
Subject: [PATCH 1/6] Add support for mixedbread-ai/mxbai-embed-large-v1,
 update docs

---
 .github/workflows/main.yaml | 14 ++++++++++----
 README.md                   |  2 ++
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 0419302..c5c05bc 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -5,7 +5,7 @@ on:
     branches:
       - main
     tags:
-      - '**'
+      - "**"
   paths-ignore:
     - README.md
     - LICENSE
@@ -96,6 +96,12 @@ jobs:
           - model_name: sentence-transformers/all-MiniLM-L6-v2
             model_tag_name: sentence-transformers-all-MiniLM-L6-v2
             onnx_runtime: true
+          - model_name: mixedbread-ai/mxbai-embed-large-v1
+            model_tag_name: mixedbread-ai-mxbai-embed-large-v1
+            onnx_runtime: false
+          - model_name: mixedbread-ai/mxbai-embed-large-v1
+            model_tag_name: mixedbread-ai-mxbai-embed-large-v1
+            onnx_runtime: true
     env:
       LOCAL_REPO: transformers-inference
       REMOTE_REPO: semitechnologies/transformers-inference
@@ -107,9 +113,9 @@ jobs:
       - uses: actions/setup-python@v4
         with:
           python-version: "3.11"
-          cache: 'pip' # caching pip dependencies
+          cache: "pip" # caching pip dependencies
       - name: Login to Docker Hub
-        if: ${{ !github.event.pull_request.head.repo.fork }}  # no PRs from fork
+        if: ${{ !github.event.pull_request.head.repo.fork }} # no PRs from fork
         uses: docker/login-action@v2
         with:
           username: ${{secrets.DOCKER_USERNAME}}
@@ -137,7 +143,7 @@ jobs:
         with:
           python-version: "3.11"
       - name: Login to Docker Hub
-        if: ${{ !github.event.pull_request.head.repo.fork }}  # no PRs from fork
+        if: ${{ !github.event.pull_request.head.repo.fork }} # no PRs from fork
         uses: docker/login-action@v2
         with:
           username: ${{secrets.DOCKER_USERNAME}}
diff --git a/README.md b/README.md
index ed90c30..3a6336a 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,7 @@ The pre-built models include:
 |`google/flan-t5-large` ([Info](https://huggingface.co/google/flan-t5-large))|`semitechnologies/transformers-inference:sentence-transformers-gtr-t5-large`|
 |`BAAI/bge-small-en-v1.5` ([Info](https://huggingface.co/BAAI/bge-small-en-v1.5))|`semitechnologies/transformers-inference:baai-bge-small-en-v1.5`|
 |`BAAI/bge-base-en-v1.5` ([Info](https://huggingface.co/BAAI/bge-base-en-v1.5))|`semitechnologies/transformers-inference:baai-bge-base-en-v1.5`|
+|`mixedbread-ai/mxbai-embed-large-v1` ([Info](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1))|`semitechnologies/transformers-inference:mixedbread-ai-mxbai-embed-large-v1`|
 |DPR Models|
 |`facebook/dpr-ctx_encoder-single-nq-base` ([Info](https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base))|`semitechnologies/transformers-inference:facebook-dpr-ctx_encoder-single-nq-base`|
 |`facebook/dpr-question_encoder-single-nq-base` ([Info](https://huggingface.co/facebook/dpr-question_encoder-single-nq-base))|`semitechnologies/transformers-inference:facebook-dpr-question_encoder-single-nq-base`|
@@ -54,6 +55,7 @@ The pre-built models include:
 |`BAAI/bge-base-en-v1.5` ([Info](https://huggingface.co/BAAI/bge-base-en-v1.5))|`semitechnologies/transformers-inference:baai-bge-base-en-v1.5-onnx`|
 |`BAAI/bge-m3` ([Info](https://huggingface.co/BAAI/bge-m3))|`semitechnologies/transformers-inference:baai-bge-m3-onnx`|
 |`sentence-transformers/all-MiniLM-L6-v2` ([Info](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2))|`semitechnologies/transformers-inference:sentence-transformers-all-MiniLM-L6-v2-onnx`|
+|`mixedbread-ai/mxbai-embed-large-v1` ([Info](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1))|`semitechnologies/transformers-inference:mixedbread-ai-mxbai-embed-large-v1-onnx`|
 
 The above image names always point to the latest version of the inference
 container including the model.

From 96086ec0c6160b59b0cf2335fa86e388c66112ab Mon Sep 17 00:00:00 2001
From: Charles Pierse
Date: Tue, 2 Apr 2024 19:35:29 +0100
Subject: [PATCH 2/6] Bump sentence_transformer version

---
 requirements-test.txt | 2 +-
 requirements.txt      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index fa5867a..fa104cd 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -5,7 +5,7 @@ uvicorn==0.27.1
 nltk==3.8.1
 torch==2.0.1
 sentencepiece==0.2.0
-sentence-transformers==2.2.2
+sentence-transformers==2.6.1
 optimum==1.17.1
 onnxruntime==1.17.1
 onnx==1.15.0
diff --git a/requirements.txt b/requirements.txt
index 835f458..25db8a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ uvicorn==0.27.1
 nltk==3.8.1
 torch==2.0.1
 sentencepiece==0.2.0
-sentence-transformers==2.2.2
+sentence-transformers==2.6.1
 optimum==1.17.1
 onnxruntime==1.17.1
 onnx==1.15.0

From 99b4bda9b501745a0e51cbdb9a1ae52568c3e981 Mon Sep 17 00:00:00 2001
From: Charles Pierse
Date: Tue, 2 Apr 2024 20:04:50 +0100
Subject: [PATCH 3/6] Revert tag change

---
 .github/workflows/main.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index c5c05bc..dd120a7 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -5,7 +5,7 @@ on:
     branches:
       - main
     tags:
-      - "**"
+      - '**'
   paths-ignore:
     - README.md
     - LICENSE

From 38d7c04797d3f92aee2f27e321131658338eb3b0 Mon Sep 17 00:00:00 2001
From: Charles Pierse
Date: Tue, 2 Apr 2024 20:06:05 +0100
Subject: [PATCH 4/6] Revert more formatting changes

---
 .github/workflows/main.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index dd120a7..f6b45ae 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -113,9 +113,9 @@ jobs:
       - uses: actions/setup-python@v4
         with:
           python-version: "3.11"
-          cache: "pip" # caching pip dependencies
+          cache: 'pip' # caching pip dependencies
       - name: Login to Docker Hub
-        if: ${{ !github.event.pull_request.head.repo.fork }} # no PRs from fork
+        if: ${{ !github.event.pull_request.head.repo.fork }}  # no PRs from fork
         uses: docker/login-action@v2
         with:
           username: ${{secrets.DOCKER_USERNAME}}
@@ -143,7 +143,7 @@ jobs:
         with:
           python-version: "3.11"
       - name: Login to Docker Hub
-        if: ${{ !github.event.pull_request.head.repo.fork }} # no PRs from fork
+        if: ${{ !github.event.pull_request.head.repo.fork }}  # no PRs from fork
         uses: docker/login-action@v2
         with:
           username: ${{secrets.DOCKER_USERNAME}}

From 38c323711eb9c227e452926f4cb4aba35054fa1e Mon Sep 17 00:00:00 2001
From: Marcin Antas
Date: Wed, 3 Apr 2024 13:06:39 +0200
Subject: [PATCH 5/6] Adjust SentenceTransformer vectorizer implementation

---
 .github/workflows/main.yaml |  2 ++
 app.py                      |  9 +++++----
 meta.py                     | 13 ++++++++-----
 requirements.txt            |  2 +-
 vectorizer.py               |  9 +++++----
 5 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index f6b45ae..83e39e7 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -99,6 +99,7 @@ jobs:
           - model_name: mixedbread-ai/mxbai-embed-large-v1
             model_tag_name: mixedbread-ai-mxbai-embed-large-v1
             onnx_runtime: false
+            use_sentence_transformers_vectorizer: true
           - model_name: mixedbread-ai/mxbai-embed-large-v1
             model_tag_name: mixedbread-ai-mxbai-embed-large-v1
             onnx_runtime: true
@@ -108,6 +109,7 @@ jobs:
       MODEL_NAME: ${{matrix.model_name}}
       MODEL_TAG_NAME: ${{matrix.model_tag_name}}
       ONNX_RUNTIME: ${{matrix.onnx_runtime}}
+      USE_SENTENCE_TRANSFORMERS_VECTORIZER: ${{matrix.use_sentence_transformers_vectorizer}}
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
diff --git a/app.py b/app.py
index bf4968f..752e0c9 100644
--- a/app.py
+++ b/app.py
@@ -48,7 +48,7 @@ def get_model_directory() -> (str, bool):
     if os.path.exists(f"{model_dir}/model_name"):
         with open(f"{model_dir}/model_name", "r") as f:
             model_name = f.read()
-            return f"{model_dir}/{model_name}", True
+            return model_name, True
     # Default model directory is ./models/model
     return model_dir, False
 
@@ -67,14 +67,15 @@ def log_info_about_onnx(onnx_runtime: bool):
                 onnx_quantization_info = f.read()
         logger.info(f"Running ONNX vectorizer with quantized model for {onnx_quantization_info}")
 
-model_dir, use_sentence_transformer_vectorizer = get_model_directory()
+model_name, use_sentence_transformer_vectorizer = get_model_directory()
 onnx_runtime = get_onnx_runtime()
 log_info_about_onnx(onnx_runtime)
 
-meta_config = Meta(model_dir)
+meta_config = Meta(model_dir, model_name, use_sentence_transformer_vectorizer)
 vec = Vectorizer(model_dir, cuda_support, cuda_core, cuda_per_process_memory_fraction,
                  meta_config.get_model_type(), meta_config.get_architecture(),
-                 direct_tokenize, onnx_runtime, use_sentence_transformer_vectorizer)
+                 direct_tokenize, onnx_runtime, use_sentence_transformer_vectorizer,
+                 model_name)
 
 
 @app.get("/.well-known/live", response_class=Response)
diff --git a/meta.py b/meta.py
index 1574b03..eb12f5f 100644
--- a/meta.py
+++ b/meta.py
@@ -4,20 +4,23 @@ class Meta:
     config: AutoConfig
 
-    def __init__(self, model_path):
-        self.config = AutoConfig.from_pretrained(model_path)
+    def __init__(self, model_path: str, model_name: str, use_sentence_transformer_vectorizer: bool):
+        if use_sentence_transformer_vectorizer:
+            self.config = {"model_name": model_name, "model_type": None}
+        else:
+            self.config = AutoConfig.from_pretrained(model_path).to_dict()
 
     def get(self):
         return {
-            'model': self.config.to_dict()
+            'model': self.config
         }
 
     def get_model_type(self):
-        return self.config.to_dict()['model_type']
+        return self.config['model_type']
 
     def get_architecture(self):
         architecture = None
-        conf = self.config.to_dict()
+        conf = self.config
         if "architectures" in conf:
             architecture = conf["architectures"][0]
         return architecture
 
diff --git a/requirements.txt b/requirements.txt
index 25db8a0..e168d3d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-transformers==4.38.2
+transformers==4.39.3
 fastapi==0.110.0
 uvicorn==0.27.1
 nltk==3.8.1
diff --git a/vectorizer.py b/vectorizer.py
index 6e403db..b4ee5a6 100644
--- a/vectorizer.py
+++ b/vectorizer.py
@@ -38,13 +38,14 @@ class Vectorizer:
     executor: ThreadPoolExecutor
 
     def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float,
-                 model_type: str, architecture: str, direct_tokenize: bool, onnx_runtime: bool, use_sentence_transformer_vectorizer: bool):
+                 model_type: str, architecture: str, direct_tokenize: bool, onnx_runtime: bool,
+                 use_sentence_transformer_vectorizer: bool, model_name: str):
         self.executor = ThreadPoolExecutor()
         if onnx_runtime:
             self.vectorizer = ONNXVectorizer(model_path)
         else:
             if model_type == 't5' or use_sentence_transformer_vectorizer:
-                self.vectorizer = SentenceTransformerVectorizer(model_path, cuda_core)
+                self.vectorizer = SentenceTransformerVectorizer(model_path, model_name, cuda_core)
             else:
                 self.vectorizer = HuggingFaceVectorizer(model_path, cuda_support, cuda_core,
                     cuda_per_process_memory_fraction, model_type, architecture, direct_tokenize)
@@ -56,9 +57,9 @@ class SentenceTransformerVectorizer:
     model: SentenceTransformer
     cuda_core: str
 
-    def __init__(self, model_path: str, cuda_core: str):
+    def __init__(self, model_path: str, model_name: str, cuda_core: str):
         self.cuda_core = cuda_core
-        self.model = SentenceTransformer(model_path, device=self.get_device())
+        self.model = SentenceTransformer(model_name, cache_folder=model_path, device=self.get_device())
         self.model.eval()  # make sure we're in inference mode, not training
 
     def get_device(self) -> Optional[str]:

From ed938ac008c5f9f7cecd2a22ad3a684366727cd2 Mon Sep 17 00:00:00 2001
From: Charles Pierse
Date: Wed, 3 Apr 2024 14:03:31 +0100
Subject: [PATCH 6/6] Remove str replacement on model name

---
 download.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/download.py b/download.py
index e6267de..cd6bc5c 100755
--- a/download.py
+++ b/download.py
@@ -89,7 +89,7 @@ def download_model(model_name: str, model_dir: str):
     if (model_type is not None and model_type == "t5") or use_sentence_transformers_vectorizer.lower() == "true":
         SentenceTransformer(model_name, cache_folder=model_dir)
         with open(f"{model_dir}/model_name", "w") as f:
-            f.write(model_name.replace("/", "_"))
+            f.write(model_name)
     else:
         if config.architectures and not force_automodel:
             print(f"Using class {config.architectures[0]} to load model weights")
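
Note on the vectorizer change in PATCH 5/6: SentenceTransformerVectorizer now
loads weights by Hugging Face model name and treats the local model directory
purely as a cache_folder, instead of reading from a pre-flattened local path
derived from the model name (which is why PATCH 6/6 can drop the "/"-to-"_"
replacement when writing the model_name file). The sketch below is
illustrative only and not part of the patches: it assumes
sentence-transformers>=2.6.1 is installed (the version pinned in PATCH 2/6),
the input sentence is a made-up placeholder, and the output shape comment
reflects the 1024-dimensional embeddings reported for mxbai-embed-large-v1.

    # Minimal sketch of the new loading path (illustrative only).
    from sentence_transformers import SentenceTransformer

    model_name = "mixedbread-ai/mxbai-embed-large-v1"
    model_dir = "./models/model"  # default model directory used by app.py

    # As in the patched SentenceTransformerVectorizer: load by model name,
    # pointing cache_folder at the local model directory so weights baked
    # into the image at build time (download.py) are reused rather than
    # re-downloaded at container startup.
    model = SentenceTransformer(model_name, cache_folder=model_dir)
    model.eval()  # inference mode, matching the vectorizer

    vectors = model.encode(["Weaviate is a vector database."])
    print(vectors.shape)  # (1, 1024) for mxbai-embed-large-v1

Because download.py (PATCH 6/6) calls SentenceTransformer with the same
cache_folder at build time, the startup-time load above resolves to the
cached snapshot rather than the network.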