Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Rust Sqlite migrations #3599

Merged
merged 9 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
523 changes: 514 additions & 9 deletions Cargo.lock

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[workspace]
resolver = "2"

members = ["rust/benchmark", "rust/blockstore", "rust/cache", "rust/chroma", "rust/config", "rust/distance", "rust/error", "rust/frontend", "rust/garbage_collector", "rust/index", "rust/load", "rust/log", "rust/memberlist", "rust/storage", "rust/system", "rust/sysdb", "rust/types", "rust/worker", "rust/segment", "rust/python_bindings", "rust/mdac", "rust/tracing"]
members = ["rust/benchmark", "rust/blockstore", "rust/cache", "rust/chroma", "rust/config", "rust/distance", "rust/error", "rust/frontend", "rust/garbage_collector", "rust/index", "rust/load", "rust/log", "rust/memberlist", "rust/storage", "rust/system", "rust/sysdb", "rust/types", "rust/worker", "rust/segment", "rust/python_bindings", "rust/mdac", "rust/tracing", "rust/sqlite"]

[workspace.dependencies]
arrow = "52.2.0"
Expand All @@ -21,7 +21,6 @@ opentelemetry_sdk = { version = "0.27", features = ["rt-tokio"] }
parking_lot = { version = "0.12.3", features = ["serde"] }
prost = "0.13"
prost-types = "0.12"
regex = "1.11"
roaring = "0.10.6"
serde = { version = "1.0.215", features = ["derive"] }
serde_json = "1.0.133"
Expand All @@ -37,6 +36,10 @@ tracing-bunyan-formatter = "0.3"
tracing-opentelemetry = "0.28.0"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
uuid = { version = "1.11.0", features = ["v4", "fast-rng", "macro-diagnostics", "serde"] }
sqlx = { version = "0.8.3", features = [ "runtime-tokio", "sqlite"] }
sha2 = "0.10.8"
md5 = "0.7.0"
regex = "1.11.1"
tower-http = { version = "0.6.2", features = ["trace"] }

chroma-benchmark = { path = "rust/benchmark" }
Expand All @@ -55,6 +58,7 @@ chroma-system = { path = "rust/system" }
chroma-sysdb = { path = "rust/sysdb" }
chroma-tracing = { path = "rust/tracing" }
chroma-types = { path = "rust/types" }
chroma-sqlite = { path = "rust/sqlite" }
mdac = { path = "rust/mdac" }
worker = { path = "rust/worker" }

Expand Down
16 changes: 16 additions & 0 deletions rust/sqlite/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[package]
# Must match the `chroma-sqlite` key used for this crate in the workspace
# [workspace.dependencies] table: Cargo resolves a path dependency by package name.
name = "chroma-sqlite"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please name chroma-sqlite.

version = "0.1.0"
edition = "2021"

[dependencies]
sqlx = { workspace = true }
sha2 = { workspace = true}
regex = { workspace = true }
tokio = { workspace = true}
md5 = { workspace = true}
rust-embed = {version = "8.5.0", features = ["include-exclude"]}
thiserror = { workspace = true }

[dev-dependencies]
tempfile = { workspace = true }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unneeded

Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- Creates the embeddings_queue table.
-- seq_id is INTEGER PRIMARY KEY, so SQLite treats it as an alias for the rowid and
-- assigns a value automatically when an insert omits it.
CREATE TABLE embeddings_queue (
seq_id INTEGER PRIMARY KEY,
-- Filled in by SQLite (UTC) when the insert does not supply a value.
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-- Integer operation code; the code-to-meaning mapping is not defined in this file.
operation INTEGER NOT NULL,
topic TEXT NOT NULL,
id TEXT NOT NULL,
-- The three payload columns below are nullable.
-- NOTE(review): presumably `encoding` names the format of `vector` -- confirm.
vector BLOB,
encoding TEXT,
metadata TEXT
);
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- Creates embeddings_queue_config: an integer-keyed table with one nullable text column.
-- NOTE(review): the column name suggests a JSON-serialized configuration; nothing in the
-- schema validates that the stored text is well-formed JSON -- confirm with the writer.
CREATE TABLE embeddings_queue_config (
id INTEGER PRIMARY KEY,
config_json_str TEXT
);
24 changes: 24 additions & 0 deletions rust/sqlite/migrations/metadb/00001-embedding-metadata.sqlite.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
-- Creates the embeddings table. `id` is INTEGER PRIMARY KEY, i.e. an alias for the SQLite
-- rowid, and is the key that embedding_metadata.id refers to.
CREATE TABLE embeddings (
id INTEGER PRIMARY KEY,
segment_id TEXT NOT NULL,
embedding_id TEXT NOT NULL,
-- Stored as a BLOB rather than an INTEGER.
-- NOTE(review): the byte encoding of seq_id is not defined in this file -- confirm with the writer.
seq_id BLOB NOT NULL,
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-- An embedding_id may appear at most once within a segment.
UNIQUE (segment_id, embedding_id)
);

-- Key/value metadata rows attached to an embeddings row; at most one row per (id, key).
-- The typed value columns are all nullable; the schema does not require exactly one to be set.
CREATE TABLE embedding_metadata (
-- No ON DELETE action is declared, so SQLite's default (NO ACTION) applies, and the
-- reference is only enforced on connections that enable PRAGMA foreign_keys.
id INTEGER REFERENCES embeddings(id),
key TEXT NOT NULL,
string_value TEXT,
int_value INTEGER,
float_value REAL,
PRIMARY KEY (id, key)
);

-- Holds a single seq_id per segment (segment_id is the primary key).
-- NOTE(review): by its name this presumably tracks the highest seq_id handled for the
-- segment -- confirm against the code that writes it.
CREATE TABLE max_seq_id (
segment_id TEXT PRIMARY KEY,
seq_id BLOB NOT NULL
);

-- Full-text index with two columns (id, string_value); requires an SQLite build that
-- includes the FTS5 extension. FTS5 columns are untyped and carry no constraints, so `id`
-- here is not a foreign key.
CREATE VIRTUAL TABLE embedding_fulltext USING fts5(id, string_value);
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- SQLite does not support adding a CHECK constraint with ALTER TABLE; adding one would
-- involve creating a new table and copying the data over, which is overkill for adding a
-- single boolean column. The application code that writes to this table must therefore
-- ensure data integrity itself.
-- NOTE(review): the statement below has no terminating semicolon. That is valid for the last
-- statement of a script, but confirm the migration runner does not depend on one.
ALTER TABLE embedding_metadata ADD COLUMN bool_value INTEGER
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Replaces the original FTS5 table with one that uses the trigram tokenizer
-- (available in SQLite 3.34.0 and later).
CREATE VIRTUAL TABLE embedding_fulltext_search USING fts5(string_value, tokenize='trigram');
-- Backfill from the existing metadata, reusing embedding_metadata's rowid as the FTS rowid
-- (presumably so the two tables can be joined on it). There is no WHERE clause, so rows
-- whose string_value is NULL are copied as well.
-- NOTE(review): embedding_metadata has no INTEGER PRIMARY KEY, and SQLite may renumber such
-- rowids during VACUUM -- confirm nothing depends on this mapping staying stable.
INSERT INTO embedding_fulltext_search (rowid, string_value) SELECT rowid, string_value FROM embedding_metadata;
-- The old index is dropped; nothing is copied out of it.
DROP TABLE embedding_fulltext;
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Partial indexes on (key, <typed value>) for embedding_metadata. Each index only contains
-- the rows whose value column is non-NULL, so SQLite can use it only for queries whose
-- WHERE clause implies that condition. IF NOT EXISTS makes the statements safe to re-run.
CREATE INDEX IF NOT EXISTS embedding_metadata_int_value ON embedding_metadata (key, int_value) WHERE int_value IS NOT NULL;
CREATE INDEX IF NOT EXISTS embedding_metadata_float_value ON embedding_metadata (key, float_value) WHERE float_value IS NOT NULL;
CREATE INDEX IF NOT EXISTS embedding_metadata_string_value ON embedding_metadata (key, string_value) WHERE string_value IS NOT NULL;
15 changes: 15 additions & 0 deletions rust/sqlite/migrations/sysdb/00001-collections.sqlite.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- Creates the collections table: text primary key plus a name that must be unique
-- across the whole table.
CREATE TABLE collections (
id TEXT PRIMARY KEY,
name TEXT NOT NULL,
topic TEXT NOT NULL,
UNIQUE (name)
);

-- Key/value metadata rows for a collection; at most one row per (collection_id, key).
-- The typed value columns are all nullable; the schema does not require exactly one to be set.
CREATE TABLE collection_metadata (
-- Rows are removed together with their collection (ON DELETE CASCADE); SQLite only acts on
-- this when PRAGMA foreign_keys is enabled on the connection.
collection_id TEXT REFERENCES collections(id) ON DELETE CASCADE,
key TEXT NOT NULL,
str_value TEXT,
int_value INTEGER,
float_value REAL,
PRIMARY KEY (collection_id, key)
);
16 changes: 16 additions & 0 deletions rust/sqlite/migrations/sysdb/00002-segments.sqlite.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
-- Creates the segments table. `topic` and `collection` are nullable here.
CREATE TABLE segments (
id TEXT PRIMARY KEY,
type TEXT NOT NULL,
scope TEXT NOT NULL,
topic TEXT,
-- NOTE(review): this references a table named `collection`, while the collections migration
-- creates `collections` (plural). SQLite accepts the DDL because the parent table is not
-- resolved at CREATE time -- confirm whether this is intentional before relying on the FK.
collection TEXT REFERENCES collection(id)
);

-- Key/value metadata rows for a segment; at most one row per (segment_id, key).
-- The typed value columns are all nullable; the schema does not require exactly one to be set.
CREATE TABLE segment_metadata (
-- Rows are removed together with their segment (ON DELETE CASCADE); SQLite only acts on
-- this when PRAGMA foreign_keys is enabled on the connection.
segment_id TEXT REFERENCES segments(id) ON DELETE CASCADE,
key TEXT NOT NULL,
str_value TEXT,
int_value INTEGER,
float_value REAL,
PRIMARY KEY (segment_id, key)
);
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- Adds a nullable integer `dimension` column to collections; rows that already exist
-- read NULL for it because no default is declared.
ALTER TABLE collections ADD COLUMN dimension INTEGER;
29 changes: 29 additions & 0 deletions rust/sqlite/migrations/sysdb/00004-tenants-databases.sqlite.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- Creates the tenants table; a tenant is represented solely by its id.
-- The table-level UNIQUE (id) repeats what PRIMARY KEY already guarantees.
CREATE TABLE IF NOT EXISTS tenants (
id TEXT PRIMARY KEY,
UNIQUE (id)
);

-- Creates the databases table. Every database belongs to exactly one tenant, and deleting
-- the tenant deletes its databases (ON DELETE CASCADE, which SQLite only acts on when
-- PRAGMA foreign_keys is enabled on the connection).
CREATE TABLE IF NOT EXISTS databases (
id TEXT PRIMARY KEY, -- unique globally
name TEXT NOT NULL, -- unique per tenant
tenant_id TEXT NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
UNIQUE (tenant_id, name) -- Ensure that a tenant has only one database with a given name
);

-- New shape for the collections table: id, name, topic and dimension plus a mandatory
-- database_id. Name uniqueness is scoped to a database instead of being global.
-- Created under a temporary name; later statements in this migration copy the existing
-- rows in and rename it to `collections`.
CREATE TABLE IF NOT EXISTS collections_tmp (
id TEXT PRIMARY KEY, -- unique globally
name TEXT NOT NULL, -- unique per database
topic TEXT NOT NULL,
dimension INTEGER,
database_id TEXT NOT NULL REFERENCES databases(id) ON DELETE CASCADE,
UNIQUE (name, database_id)
);

-- Create default tenant and database
INSERT OR REPLACE INTO tenants (id) VALUES ('default_tenant'); -- The default tenant id is 'default_tenant' others are UUIDs
INSERT OR REPLACE INTO databases (id, name, tenant_id) VALUES ('00000000-0000-0000-0000-000000000000', 'default_database', 'default_tenant');

-- Copy every existing collection into the new table, assigning all of them to the default
-- database inserted above.
INSERT OR REPLACE INTO collections_tmp (id, name, topic, dimension, database_id)
SELECT id, name, topic, dimension, '00000000-0000-0000-0000-000000000000' FROM collections;
-- Swap the new table in under the original name.
-- NOTE(review): collection_metadata references collections(id) ON DELETE CASCADE. When
-- foreign keys are enabled SQLite performs an implicit DELETE as part of DROP TABLE, which
-- can fire that cascade -- confirm foreign_keys is off while this runs, or that losing the
-- metadata rows is acceptable.
DROP TABLE collections;
ALTER TABLE collections_tmp RENAME TO collections;
4 changes: 4 additions & 0 deletions rust/sqlite/migrations/sysdb/00005-remove-topic.sqlite.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- Remove the topic column from the Collections and Segments tables
-- ALTER TABLE ... DROP COLUMN requires SQLite 3.35.0 or later.

ALTER TABLE collections DROP COLUMN topic;
ALTER TABLE segments DROP COLUMN topic;
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- SQLite does not support adding a CHECK constraint with ALTER TABLE; adding one would
-- involve creating a new table and copying the data over, which is overkill for adding a
-- single boolean column. The application code that writes to these tables must therefore
-- ensure data integrity itself.
ALTER TABLE collection_metadata ADD COLUMN bool_value INTEGER;
ALTER TABLE segment_metadata ADD COLUMN bool_value INTEGER;
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Stores collection configuration dictionaries.
-- The column is nullable TEXT with no default, so rows that already exist read NULL;
-- nothing in the schema validates that the stored text is well-formed JSON.
ALTER TABLE collections ADD COLUMN config_json_str TEXT;
7 changes: 7 additions & 0 deletions rust/sqlite/migrations/sysdb/00008-maintenance-log.sqlite.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- Records when database maintenance operations are performed.
-- At time of creation, this table is only used to record vacuum operations.
CREATE TABLE maintenance_log (
-- NOTE(review): declared INT, not INTEGER, so this column is not a rowid alias: SQLite will
-- not auto-assign it and will accept NULL here -- confirm how inserts populate id.
id INT PRIMARY KEY,
-- NOTE(review): integer timestamp; its unit and epoch are not defined in this file -- confirm.
timestamp INT NOT NULL,
operation TEXT NOT NULL
);
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
-- This makes segments.collection non-nullable.
-- The table is rebuilt to do so: create the new shape, copy the rows, drop the old table,
-- then rename the new one into place.
CREATE TABLE segments_temp (
id TEXT PRIMARY KEY,
type TEXT NOT NULL,
scope TEXT NOT NULL,
-- NOTE(review): this references a table named `collection`, while the collections migration
-- creates `collections` (plural) -- confirm whether this is intentional.
collection TEXT REFERENCES collection(id) NOT NULL
);

-- Positional copy: relies on segments having the same four columns, in the same order, as
-- segments_temp. The statement fails if any existing row has a NULL collection.
INSERT INTO segments_temp SELECT * FROM segments;
-- NOTE(review): segment_metadata references segments(id) ON DELETE CASCADE. When foreign keys
-- are enabled SQLite performs an implicit DELETE as part of DROP TABLE, which can fire that
-- cascade -- confirm foreign_keys is off while this runs, or that the loss is acceptable.
DROP TABLE segments;
ALTER TABLE segments_temp RENAME TO segments;
25 changes: 25 additions & 0 deletions rust/sqlite/src/config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/// Settings for a SQLite database: where it lives and how its migrations are handled.
#[derive(Clone)]
pub struct SqliteDBConfig {
/// The SQLite database URL
pub url: String,
/// Hash function used for the migration files (see `MigrationHash`).
pub hash_type: MigrationHash,
/// Whether migrations are applied or only validated (see `MigrationMode`).
pub migration_mode: MigrationMode,
}

/// Migration mode for the database
/// - Apply: Apply the migrations
/// - Validate: Validate the applied migrations and ensure none are unapplied
#[derive(Clone, PartialEq)]
pub enum MigrationMode {
Apply,
Validate,
}

/// The hash function to use for the migration files
/// - SHA256: Use SHA256 hash
/// - MD5: Use MD5 hash
///
/// This enum only selects the algorithm; the hashing itself is not implemented here.
#[derive(Clone)]
pub enum MigrationHash {
SHA256,
MD5,
}
Loading
Loading