From 2139d0197e0928f252616cad0cf20eabded85475 Mon Sep 17 00:00:00 2001 From: Jeff Huber Date: Tue, 18 Jul 2023 17:20:54 -0700 Subject: [PATCH] upgrade chroma to 0.4.0 (#7749) ** This should land Monday the 17th ** Chroma is upgrading from `0.3.29` to `0.4.0`. `0.4.0` is easier to build, more durable, faster, smaller, and more extensible. This comes with a few changes: 1. A simplified and improved client setup. Instead of having to remember weird settings, users can just do `EphemeralClient`, `PersistentClient` or `HttpClient` (the underlying direct `Client` implementation is also still accessible) 2. We migrated data stores away from `duckdb` and `clickhouse`. This changes the API for the `PersistentClient` that used to reference `chroma_db_impl="duckdb+parquet"`. Now we simply set `is_persistent=True`. `is_persistent` is set for you to `True` if you use `PersistentClient`. 3. Because we migrated away from `duckdb` and `clickhouse` - this also means that users need to migrate their data into the new layout and schema. Chroma is committed to providing extensive notification and tooling around any schema and data migrations (for example - this PR!). After upgrading to `0.4.0` - if users try to access their data that was stored in the previous regime, the system will throw an `Exception` and instruct them on how to use the migration assistant to migrate their data. The migration assistant is a pip-installable CLI: `pip install chroma_migrate`, and is runnable by calling `chroma_migrate` -- TODO ADD here is a short video demonstrating how it works. Please reference the readme at [chroma-core/chroma-migrate](https://github.com/chroma-core/chroma-migrate) to see a full write-up of our philosophy on migrations as well as more details about this particular migration. Please direct any users facing issues upgrading to our Discord channel called [#get-help](https://discord.com/channels/1073293645303795742/1129200523111841883).
We have also created an [email listserv](https://airtable.com/shrHaErIs1j9F97BE) to notify developers directly in the future about breaking changes. --------- Co-authored-by: Bagatur --- .../integrations/merger_retriever.ipynb | 2 +- poetry.lock | 46 +++++++++++++++---- pyproject.toml | 2 +- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/docs/extras/modules/data_connection/retrievers/integrations/merger_retriever.ipynb b/docs/extras/modules/data_connection/retrievers/integrations/merger_retriever.ipynb index 7dfbaaab0c56c..0189c2d46d2d8 100644 --- a/docs/extras/modules/data_connection/retrievers/integrations/merger_retriever.ipynb +++ b/docs/extras/modules/data_connection/retrievers/integrations/merger_retriever.ipynb @@ -43,7 +43,7 @@ "\n", "# Instantiate 2 diff cromadb indexs, each one with a diff embedding.\n", "client_settings = chromadb.config.Settings(\n", - " chroma_db_impl=\"duckdb+parquet\",\n", + " is_persistent=True,\n", " persist_directory=DB_DIR,\n", " anonymized_telemetry=False,\n", ")\n", diff --git a/poetry.lock b/poetry.lock index 273f7fc77a642..a535c298c7cbf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1610,31 +1610,46 @@ files = [ {file = "charset_normalizer-3.1.0-py3-none-any.whl", hash = "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d"}, ] +[[package]] +name = "chroma-hnswlib" +version = "0.7.1" +description = "Chromas fork of hnswlib" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "chroma-hnswlib-0.7.1.tar.gz", hash = "sha256:f72592dc7d0522c25cc1f8864db7a3781f179ba989f209cc3ea01694c0d76493"}, + {file = "chroma_hnswlib-0.7.1-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:38f51585d81a5072db70b17207afd1f57670c209836d0fbbf2a1aa7e8bece6b7"}, +] + +[package.dependencies] +numpy = "*" + [[package]] name = "chromadb" -version = "0.3.26" +version = "0.4.1" description = "Chroma."
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "chromadb-0.3.26-py3-none-any.whl", hash = "sha256:45a7848ee3ed8b694ca5789e5fd723406b76a13fa46f9a9a769f93317f29894c"}, - {file = "chromadb-0.3.26.tar.gz", hash = "sha256:a9b596d507f081993f2e32a7dcacabbbec2f6aebc2b6defe524442b07e265296"}, + {file = "chromadb-0.4.1-py3-none-any.whl", hash = "sha256:980e776bfbb76a2689418b03a254e7edb888961f57b7615f815c8d95f048b396"}, + {file = "chromadb-0.4.1.tar.gz", hash = "sha256:9b1a76d615dd2280e7b30ff82101ed31c26782a4d832070046309fde82515385"}, ] [package.dependencies] -clickhouse-connect = ">=0.5.7" -duckdb = ">=0.7.1" -fastapi = ">=0.85.1" +chroma-hnswlib = "0.7.1" +fastapi = ">=0.95.2,<0.100.0" graphlib-backport = {version = ">=1.0.3", markers = "python_version < \"3.9\""} -hnswlib = ">=0.7" +importlib-resources = "*" numpy = ">=1.21.6" onnxruntime = ">=1.14.1" overrides = ">=7.3.1" pandas = ">=1.3" posthog = ">=2.4.0" pulsar-client = ">=3.1.0" -pydantic = ">=1.9" +pydantic = ">=1.9,<2.0" +pypika = ">=0.48.9" requests = ">=2.28" tokenizers = ">=0.13.2" tqdm = ">=4.65.0" @@ -3604,7 +3619,7 @@ name = "hnswlib" version = "0.7.0" description = "hnswlib" category = "main" -optional = false +optional = true python-versions = "*" files = [ {file = "hnswlib-0.7.0.tar.gz", hash = "sha256:bc459668e7e44bb7454b256b90c98c5af750653919d9a91698dafcf416cf64c4"}, @@ -8563,6 +8578,17 @@ files = [ doc = ["sphinx", "sphinx_rtd_theme"] test = ["flake8", "isort", "pytest"] +[[package]] +name = "pypika" +version = "0.48.9" +description = "A SQL query builder API for Python" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "PyPika-0.48.9.tar.gz", hash = "sha256:838836a61747e7c8380cd1b7ff638694b7a7335345d0f559b04b2cd832ad5378"}, +] + [[package]] name = "pyproject-hooks" version = "1.0.0" @@ -12857,4 +12883,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = 
"f322b36103013bd59c34dddadf84209292ea61ed73bd26fbfa355d372011238b" +content-hash = "aee2f0c85636738d08d512c53fd551ab43a2e94c1ebf14c6178c9534da75dcaa" diff --git a/pyproject.toml b/pyproject.toml index ebfce6c09e620..35947d68c0939 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -193,7 +193,7 @@ deeplake = "^3.6.8" libdeeplake = "^0.0.60" weaviate-client = "^3.15.5" torch = "^1.0.0" -chromadb = "^0.3.21" +chromadb = "^0.4.0" tiktoken = "^0.3.3" python-dotenv = "^1.0.0" sentence-transformers = "^2"