diff --git a/Cargo.lock b/Cargo.lock
index 42514a6..90f8280 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -848,7 +848,7 @@ dependencies = [
 
 [[package]]
 name = "embed_anything_python"
-version = "0.4.4"
+version = "0.4.10"
 dependencies = [
  "embed_anything",
  "pyo3",
diff --git a/pyproject.toml b/pyproject.toml
index 4056555..d32c010 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,15 +7,7 @@ name = "embed_anything"
 requires-python = ">=3.8"
 description = "Embed anything at lightning speed"
 readme = "README.md"
-classifiers = [
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
-    "Programming Language :: Python :: 3.10",
-    "Programming Language :: Python :: 3.11",
-    "Programming Language :: Python :: 3.12",
-    "License :: OSI Approved :: MIT License"
-]
 dynamic = ["version"]
 license = {file = "LICENSE"}
@@ -27,4 +19,5 @@ manifest-path = "python/Cargo.toml"
 module-name = "embed_anything._embed_anything"
 
 [project.urls]
-Homepage = "https://github.com/StarlightSearch/EmbedAnything/tree/main"
\ No newline at end of file
+Homepage = "https://github.com/StarlightSearch/EmbedAnything/tree/main"
+
diff --git a/python/Cargo.toml b/python/Cargo.toml
index a13d4bc..236bc39 100644
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "embed_anything_python"
-version = "0.4.4"
+version = "0.4.10"
 edition = "2021"
 
 [lib]
diff --git a/test.ipynb b/test.ipynb
deleted file mode 100644
index 2028af3..0000000
--- a/test.ipynb
+++ /dev/null
@@ -1,348 +0,0 @@
[deleted: a 348-line benchmark notebook. Its first cell created a bench/ directory and downloaded three arXiv PDFs: 1706.03762 (attention.pdf, 2.1M), 2407.01449 (colpali.pdf, 8.9M), and 2310.06825 (mistral.pdf, 3.6M). The saved wget logs and huggingface/tokenizers fork warnings are omitted here.]
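For reference, a sketch of that setup cell as a plain script. The arXiv URLs and bench/ filenames are verbatim from the deleted source; `urllib.request.urlretrieve` stands in for the original `!wget` calls, and `os.makedirs(..., exist_ok=True)` replaces the original `os.path.exists("bench") == False` check:

```python
import os
import urllib.request

# Create the bench/ directory and fetch the three benchmark PDFs.
os.makedirs("bench", exist_ok=True)

papers = {
    "attention.pdf": "https://arxiv.org/pdf/1706.03762",
    "colpali.pdf": "https://arxiv.org/pdf/2407.01449",
    "mistral.pdf": "https://arxiv.org/pdf/2310.06825",
}
for name, url in papers.items():
    urllib.request.urlretrieve(url, os.path.join("bench", name))
```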
[deleted: next came a fully commented-out baseline cell (sentence-transformers BAAI/bge-small-en-v1.5 with pymupdf and semantic_text_splitter), a cell that ran gc.collect() and torch.cuda.empty_cache(), and a fastembed benchmark cell whose saved output was a long ONNX traceback, condensed below.]
[deleted: the traceback shows TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2", providers=["CUDAExecutionProvider", "CPUExecutionProvider"]) failing inside onnxruntime with NoSuchFile: "Load model from /tmp/fastembed_cache/fast-all-MiniLM-L6-v2/model.onnx failed ... File doesn't exist", so the cell's %timeit -n 20 fastembed() run has no recorded timing.]
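A runnable reconstruction of that fastembed cell. The model name, providers, splitter capacity, and the extract-chunk-embed loop are verbatim from the deleted source; the `time.perf_counter` harness is my stand-in for the notebook's `%timeit -n 20` magic:

```python
import os
import time

import pymupdf
from fastembed import TextEmbedding
from semantic_text_splitter import TextSplitter

splitter = TextSplitter(1000)

# The notebook requested CUDA with a CPU fallback; this is also the call that
# raised NoSuchFile when model.onnx was missing from the local cache.
model = TextEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)

def fastembed_bench():
    # For every PDF under bench/: extract text, chunk to ~1000 chars, embed.
    for file in os.listdir("bench"):
        doc = pymupdf.open(os.path.join("bench", file))
        text = " ".join(page.get_text() for page in doc)
        chunks = splitter.chunks(text)
        embeddings = list(model.embed(chunks))

start = time.perf_counter()
fastembed_bench()
print(f"{time.perf_counter() - start:.2f} s")  # the notebook used %timeit -n 20
```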
[deleted: an embed_anything ONNX benchmark cell committed with unresolved merge-conflict markers. The HEAD side used EmbeddingModel.from_pretrained_onnx(WhichModel.Bert, "AllMiniLML6V2") with TextEmbedConfig(batch_size=256) and recorded 207 ms ± 6.01 ms per loop; the side from commit c650961 used "BGESmallENV15Q" with a default TextEmbedConfig() and recorded 212 ms ± 3.45 ms per loop (both mean ± std. dev. of 7 runs, 20 loops each, printing "Session is using CUDAExecutionProvider"). A final cell benchmarked the candle backend, EmbeddingModel.from_pretrained_hf(WhichModel.Bert, "sentence-transformers/all-MiniLM-L6-v2"), at 1 s ± 19.4 ms per loop. The notebook metadata (Python 3.11.8, nbformat 4) is likewise omitted.]
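And the two embed_anything cells as one sketch. This resolves the merge conflict in favor of the HEAD side ("AllMiniLML6V2" with batch_size=256); the other branch's "BGESmallENV15Q" variant is noted in a comment. All API calls appear as in the deleted source:

```python
import os

import pymupdf
from embed_anything import EmbeddingModel, TextEmbedConfig, WhichModel, embed_query
from semantic_text_splitter import TextSplitter

splitter = TextSplitter(1000)

# ONNX backend, HEAD side of the conflict; the other branch used
# "BGESmallENV15Q" with a default TextEmbedConfig() instead.
onnx_model = EmbeddingModel.from_pretrained_onnx(WhichModel.Bert, "AllMiniLML6V2")
onnx_config = TextEmbedConfig(batch_size=256)

# Candle backend, loading safetensors weights from the Hugging Face Hub.
hf_model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Bert, "sentence-transformers/all-MiniLM-L6-v2"
)
hf_config = TextEmbedConfig()

def embed_anything_bench(model, config):
    # Same extract -> chunk -> embed loop as the fastembed benchmark.
    for file in os.listdir("bench"):
        doc = pymupdf.open(os.path.join("bench", file))
        text = " ".join(page.get_text() for page in doc)
        chunks = splitter.chunks(text)
        embed_query(chunks, model, config)

embed_anything_bench(onnx_model, onnx_config)  # notebook: ~207 ms/loop on CUDA
embed_anything_bench(hf_model, hf_config)      # notebook: ~1 s/loop
```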