diff --git a/Cargo.lock b/Cargo.lock index 0a742595630..9245b027ef1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1027,6 +1027,9 @@ name = "bitflags" version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +dependencies = [ + "serde", +] [[package]] name = "bitpacking" @@ -1067,6 +1070,16 @@ dependencies = [ "bit-vec 0.4.4", ] +[[package]] +name = "bstr" +version = "1.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "531a9155a481e2ee699d4f98f43c0ca4ff8ee1bfd55c31e9e98fb29d2b176fe0" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "bumpalo" version = "3.15.4" @@ -1704,6 +1717,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32c" version = "0.6.8" @@ -1788,6 +1816,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.20" @@ -1943,6 +1980,17 @@ dependencies = [ "zeroize", ] +[[package]] +name = "der" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f55bf8e7b65898637379c1b74eb1551107c8294ed26d855ceb9fd1a09cfc9bc0" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + [[package]] name = "deranged" version = "0.3.11" @@ -1982,6 +2030,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", + "const-oid", "crypto-common", "subtle", ] @@ -2018,6 +2067,12 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "dotenvy" +version = "0.15.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" + [[package]] name = "downcast-rs" version = "1.2.1" @@ -2036,10 +2091,10 @@ version = "0.14.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" dependencies = [ - "der", + "der 0.6.1", "elliptic-curve", "rfc6979", - "signature", + "signature 1.6.4", ] [[package]] @@ -2047,6 +2102,9 @@ name = "either" version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +dependencies = [ + "serde", +] [[package]] name = "elliptic-curve" @@ -2056,12 +2114,12 @@ checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" dependencies = [ "base16ct", "crypto-bigint 0.4.9", - "der", + "der 0.6.1", "digest", "ff", "generic-array", "group", - "pkcs8", + "pkcs8 0.9.0", "rand_core", "sec1", "subtle", @@ -2099,6 +2157,17 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "etcetera" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" +dependencies = [ + "cfg-if", + "home", + "windows-sys 0.48.0", +] + [[package]] name = "event-listener" version = "5.3.1" @@ -2486,6 +2555,17 @@ dependencies = [ "futures-util", ] +[[package]] +name = "futures-intrusive" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d930c203dd0b6ff06e0201a4a2fe9149b43c684fd4420555b26d21b1a02956f" +dependencies = [ + "futures-core", + "lock_api", + "parking_lot", +] + [[package]] name = "futures-io" version = "0.3.31" @@ -2635,6 +2715,19 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "globset" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15f1ce686646e7f1e19bf7d5533fe443a45dbfb990e00629110797578b42fb19" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", +] + [[package]] name = "gloo-timers" version = "0.3.0" @@ -2749,6 +2842,15 @@ dependencies = [ "foldhash", ] +[[package]] +name = "hashlink" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" +dependencies = [ + "hashbrown 0.15.2", +] + [[package]] name = "heck" version = "0.4.1" @@ -2779,6 +2881,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac", +] + [[package]] name = "hmac" version = "0.12.1" @@ -3483,6 +3594,9 @@ name = "lazy_static" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +dependencies = [ + "spin", +] [[package]] name = "levenshtein_automata" @@ -3576,6 +3690,17 @@ dependencies = [ "libc", ] +[[package]] +name = "libsqlite3-sys" +version = "0.30.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e99fb7a497b1e3339bc746195567ed8d3e24945ecd636e3619d20b9de9e9149" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -3742,6 +3867,12 @@ dependencies = [ "digest", ] +[[package]] +name = "md5" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" + [[package]] name = "mdac" version = "0.1.0" @@ -3958,6 +4089,23 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand", + "smallvec", + "zeroize", +] + [[package]] name = "num-complex" version = "0.4.6" @@ -4351,6 +4499,15 @@ dependencies = [ "serde", ] +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -4444,14 +4601,35 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der 0.7.9", + "pkcs8 0.10.2", + "spki 0.7.3", +] + [[package]] name = "pkcs8" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" dependencies = [ - "der", - "spki", + "der 0.6.1", + "spki 0.6.0", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der 0.7.9", + "spki 0.7.3", ] [[package]] @@ -5100,12 +5278,67 @@ dependencies = [ "byteorder", ] +[[package]] +name = "rsa" +version = "0.9.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47c75d7c5c6b673e58bf54d8544a9f432e3a925b0e80f7cd3602ab5c50c55519" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8 0.10.2", + "rand_core", + "signature 2.2.0", + "spki 0.7.3", + "subtle", + "zeroize", +] + [[package]] name = "rtrb" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3f94e84c073f3b85d4012b44722fa8842b9986d741590d4f2636ad0a5b14143" +[[package]] +name = "rust-embed" +version = "8.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa66af4a4fdd5e7ebc276f115e895611a34739a9c1c01028383d612d550953c0" +dependencies = [ + "rust-embed-impl", + "rust-embed-utils", + "walkdir", +] + +[[package]] +name = "rust-embed-impl" +version = "8.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6125dbc8867951125eec87294137f4e9c2c96566e61bf72c45095a7c77761478" +dependencies = [ + "proc-macro2", + "quote", + "rust-embed-utils", + "syn 2.0.89", + "walkdir", +] + +[[package]] +name = "rust-embed-utils" +version = "8.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e5347777e9aacb56039b0e1f28785929a8a3b709e87482e7442c72e7c12529d" +dependencies = [ + "globset", + "sha2", + "walkdir", +] + [[package]] name = "rust-stemmers" version = "1.2.0" @@ -5371,9 +5604,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" dependencies = [ "base16ct", - "der", + "der 0.6.1", "generic-array", - "pkcs8", + "pkcs8 0.9.0", "subtle", "zeroize", ] @@ -5627,6 +5860,16 @@ dependencies = [ "rand_core", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core", +] + [[package]] name = "siphasher" version = "1.0.1" @@ -5656,6 +5899,9 @@ name = "smallvec" version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +dependencies = [ + "serde", +] [[package]] name = "snafu" @@ -5704,7 +5950,218 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" dependencies = [ "base64ct", - "der", + "der 0.6.1", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der 0.7.9", +] + +[[package]] +name = "sqlite" +version = "0.1.0" +dependencies = [ + "md5", + "regex", + "rust-embed", + "sha2", + "sqlx", + "tempfile", + "thiserror 1.0.69", + "tokio", +] + +[[package]] +name = "sqlx" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4410e73b3c0d8442c5f99b425d7a435b5ee0ae4167b3196771dd3f7a01be745f" +dependencies = [ + "sqlx-core", + "sqlx-macros", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", +] + +[[package]] +name = "sqlx-core" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a007b6936676aa9ab40207cde35daab0a04b823be8ae004368c0793b96a61e0" +dependencies = [ + "bytes", + "crc", + "crossbeam-queue", + "either", + "event-listener", + "futures-core", + "futures-intrusive", + "futures-io", + "futures-util", + "hashbrown 0.15.2", + "hashlink", + "indexmap 2.6.0", + "log", + "memchr", + "once_cell", + "percent-encoding", + "serde", + "serde_json", + "sha2", + "smallvec", + "thiserror 2.0.4", + "tokio", + "tokio-stream", + "tracing", + "url", +] + +[[package]] +name = "sqlx-macros" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3112e2ad78643fef903618d78cf0aec1cb3134b019730edb039b69eaf531f310" +dependencies = [ + "proc-macro2", + "quote", + "sqlx-core", + "sqlx-macros-core", + "syn 2.0.89", +] + +[[package]] +name = "sqlx-macros-core" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e9f90acc5ab146a99bf5061a7eb4976b573f560bc898ef3bf8435448dd5e7ad" +dependencies = [ + "dotenvy", + "either", + "heck 0.5.0", + "hex", + "once_cell", + "proc-macro2", + "quote", + "serde", + "serde_json", + "sha2", + "sqlx-core", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", + "syn 2.0.89", + "tempfile", + "tokio", + "url", +] + +[[package]] +name = "sqlx-mysql" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4560278f0e00ce64938540546f59f590d60beee33fffbd3b9cd47851e5fff233" +dependencies = [ + "atoi", + "base64 0.22.1", + "bitflags 2.6.0", + "byteorder", + "bytes", + "crc", + "digest", + "dotenvy", + "either", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "generic-array", + "hex", + "hkdf", + "hmac", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "percent-encoding", + "rand", + "rsa", + "serde", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror 2.0.4", + "tracing", + "whoami", +] + +[[package]] +name = "sqlx-postgres" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5b98a57f363ed6764d5b3a12bfedf62f07aa16e1856a7ddc2a0bb190a959613" +dependencies = [ + "atoi", + "base64 0.22.1", + "bitflags 2.6.0", + "byteorder", + "crc", + "dotenvy", + "etcetera", + "futures-channel", + "futures-core", + "futures-util", + "hex", + "hkdf", + "hmac", + "home", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "rand", + "serde", + "serde_json", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror 2.0.4", + "tracing", + "whoami", +] + +[[package]] +name = "sqlx-sqlite" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f85ca71d3a5b24e64e1d08dd8fe36c6c95c339a896cc33068148906784620540" +dependencies = [ + "atoi", + "flume", + "futures-channel", + "futures-core", + "futures-executor", + "futures-intrusive", + "futures-util", + "libsqlite3-sys", + "log", + "percent-encoding", + "serde", + "serde_urlencoded", + "sqlx-core", + "tracing", + "url", ] [[package]] @@ -5719,6 +6176,17 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "stringprep" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" +dependencies = [ + "unicode-bidi", + "unicode-normalization", + "unicode-properties", +] + [[package]] name = "strsim" version = "0.10.0" @@ -6546,12 +7014,33 @@ dependencies = [ "version_check", ] +[[package]] +name = "unicode-bidi" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" + [[package]] name = "unicode-ident" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-properties" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" + [[package]] name = "unicode-width" version = "0.1.14" @@ -6704,6 +7193,12 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + [[package]] name = "wasm-bindgen" version = "0.2.92" @@ -6821,6 +7316,16 @@ dependencies = [ "rustix", ] +[[package]] +name = "whoami" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "372d5b87f58ec45c384ba03563b03544dc5fadc3983e434b286913f5b4a9bb6d" +dependencies = [ + "redox_syscall", + "wasite", +] + [[package]] name = "winapi" version = "0.3.9" diff --git a/Cargo.toml b/Cargo.toml index e5de48dc943..c5db2f843fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [workspace] resolver = "2" -members = ["rust/benchmark", "rust/blockstore", "rust/cache", "rust/chroma", "rust/config", "rust/distance", "rust/error", "rust/frontend", "rust/garbage_collector", "rust/index", "rust/load", "rust/log", "rust/memberlist", "rust/storage", "rust/system", "rust/sysdb", "rust/types", "rust/worker", "rust/segment", "rust/python_bindings", "rust/mdac", "rust/tracing"] +members = ["rust/benchmark", "rust/blockstore", "rust/cache", "rust/chroma", "rust/config", "rust/distance", "rust/error", "rust/frontend", "rust/garbage_collector", "rust/index", "rust/load", "rust/log", "rust/memberlist", "rust/storage", "rust/system", "rust/sysdb", "rust/types", "rust/worker", "rust/segment", "rust/python_bindings", "rust/mdac", "rust/tracing", "rust/sqlite"] [workspace.dependencies] arrow = "52.2.0" @@ -21,7 +21,6 @@ opentelemetry_sdk = { version = "0.27", features = ["rt-tokio"] } parking_lot = { version = "0.12.3", features = ["serde"] } prost = "0.13" prost-types = "0.12" -regex = "1.11" roaring = "0.10.6" serde = { version = "1.0.215", features = ["derive"] } serde_json = "1.0.133" @@ -37,6 +36,10 @@ tracing-bunyan-formatter = "0.3" tracing-opentelemetry = "0.28.0" tracing-subscriber = { version = "0.3", features = ["env-filter"] } uuid = { version = "1.11.0", features = ["v4", "fast-rng", "macro-diagnostics", "serde"] } +sqlx = { version = "0.8.3", features = [ "runtime-tokio", "sqlite"] } +sha2 = "0.10.8" +md5 = "0.7.0" +regex = "1.11.1" tower-http = { version = "0.6.2", features = ["trace"] } chroma-benchmark = { path = "rust/benchmark" } @@ -55,6 +58,7 @@ chroma-system = { path = "rust/system" } chroma-sysdb = { path = "rust/sysdb" } chroma-tracing = { path = "rust/tracing" } chroma-types = { path = "rust/types" } +chroma-sqlite = { path = "rust/sqlite" } mdac = { path = "rust/mdac" } worker = { path = "rust/worker" } diff --git a/rust/sqlite/Cargo.toml b/rust/sqlite/Cargo.toml new file mode 100644 index 00000000000..8aef73436ec --- /dev/null +++ b/rust/sqlite/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "sqlite" +version = "0.1.0" +edition = "2021" + +[dependencies] +sqlx = { workspace = true } +sha2 = { workspace = true} +regex = { workspace = true } +tokio = { workspace = true} +md5 = { workspace = true} +rust-embed = {version = "8.5.0", features = ["include-exclude"]} +thiserror = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/rust/sqlite/migrations/__init__.py b/rust/sqlite/migrations/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/rust/sqlite/migrations/embeddings_queue/00001-embeddings.sqlite.sql b/rust/sqlite/migrations/embeddings_queue/00001-embeddings.sqlite.sql new file mode 100644 index 00000000000..078bd897f98 --- /dev/null +++ b/rust/sqlite/migrations/embeddings_queue/00001-embeddings.sqlite.sql @@ -0,0 +1,10 @@ +CREATE TABLE embeddings_queue ( + seq_id INTEGER PRIMARY KEY, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + operation INTEGER NOT NULL, + topic TEXT NOT NULL, + id TEXT NOT NULL, + vector BLOB, + encoding TEXT, + metadata TEXT +); diff --git a/rust/sqlite/migrations/embeddings_queue/00002-embeddings-queue-config.sqlite.sql b/rust/sqlite/migrations/embeddings_queue/00002-embeddings-queue-config.sqlite.sql new file mode 100644 index 00000000000..dde76515414 --- /dev/null +++ b/rust/sqlite/migrations/embeddings_queue/00002-embeddings-queue-config.sqlite.sql @@ -0,0 +1,4 @@ +CREATE TABLE embeddings_queue_config ( + id INTEGER PRIMARY KEY, + config_json_str TEXT +); diff --git a/rust/sqlite/migrations/metadb/00001-embedding-metadata.sqlite.sql b/rust/sqlite/migrations/metadb/00001-embedding-metadata.sqlite.sql new file mode 100644 index 00000000000..cf2e820da64 --- /dev/null +++ b/rust/sqlite/migrations/metadb/00001-embedding-metadata.sqlite.sql @@ -0,0 +1,24 @@ +CREATE TABLE embeddings ( + id INTEGER PRIMARY KEY, + segment_id TEXT NOT NULL, + embedding_id TEXT NOT NULL, + seq_id BLOB NOT NULL, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + UNIQUE (segment_id, embedding_id) +); + +CREATE TABLE embedding_metadata ( + id INTEGER REFERENCES embeddings(id), + key TEXT NOT NULL, + string_value TEXT, + int_value INTEGER, + float_value REAL, + PRIMARY KEY (id, key) +); + +CREATE TABLE max_seq_id ( + segment_id TEXT PRIMARY KEY, + seq_id BLOB NOT NULL +); + +CREATE VIRTUAL TABLE embedding_fulltext USING fts5(id, string_value); diff --git a/rust/sqlite/migrations/metadb/00002-embedding-metadata.sqlite.sql b/rust/sqlite/migrations/metadb/00002-embedding-metadata.sqlite.sql new file mode 100644 index 00000000000..9684b14ad6d --- /dev/null +++ b/rust/sqlite/migrations/metadb/00002-embedding-metadata.sqlite.sql @@ -0,0 +1,5 @@ +-- SQLite does not support adding check with alter table, as a result, adding a check +-- involve creating a new table and copying the data over. It is over kill with adding +-- a boolean type column. The application write to the table needs to ensure the data +-- integrity. +ALTER TABLE embedding_metadata ADD COLUMN bool_value INTEGER diff --git a/rust/sqlite/migrations/metadb/00003-full-text-tokenize.sqlite.sql b/rust/sqlite/migrations/metadb/00003-full-text-tokenize.sqlite.sql new file mode 100644 index 00000000000..2b8aa2111ad --- /dev/null +++ b/rust/sqlite/migrations/metadb/00003-full-text-tokenize.sqlite.sql @@ -0,0 +1,3 @@ +CREATE VIRTUAL TABLE embedding_fulltext_search USING fts5(string_value, tokenize='trigram'); +INSERT INTO embedding_fulltext_search (rowid, string_value) SELECT rowid, string_value FROM embedding_metadata; +DROP TABLE embedding_fulltext; diff --git a/rust/sqlite/migrations/metadb/00004-metadata-indices.sqlite.sql b/rust/sqlite/migrations/metadb/00004-metadata-indices.sqlite.sql new file mode 100644 index 00000000000..52bf53a50ea --- /dev/null +++ b/rust/sqlite/migrations/metadb/00004-metadata-indices.sqlite.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS embedding_metadata_int_value ON embedding_metadata (key, int_value) WHERE int_value IS NOT NULL; +CREATE INDEX IF NOT EXISTS embedding_metadata_float_value ON embedding_metadata (key, float_value) WHERE float_value IS NOT NULL; +CREATE INDEX IF NOT EXISTS embedding_metadata_string_value ON embedding_metadata (key, string_value) WHERE string_value IS NOT NULL; diff --git a/rust/sqlite/migrations/sysdb/00001-collections.sqlite.sql b/rust/sqlite/migrations/sysdb/00001-collections.sqlite.sql new file mode 100644 index 00000000000..99abeaab194 --- /dev/null +++ b/rust/sqlite/migrations/sysdb/00001-collections.sqlite.sql @@ -0,0 +1,15 @@ +CREATE TABLE collections ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + topic TEXT NOT NULL, + UNIQUE (name) +); + +CREATE TABLE collection_metadata ( + collection_id TEXT REFERENCES collections(id) ON DELETE CASCADE, + key TEXT NOT NULL, + str_value TEXT, + int_value INTEGER, + float_value REAL, + PRIMARY KEY (collection_id, key) +); diff --git a/rust/sqlite/migrations/sysdb/00002-segments.sqlite.sql b/rust/sqlite/migrations/sysdb/00002-segments.sqlite.sql new file mode 100644 index 00000000000..4f4b8c25d0c --- /dev/null +++ b/rust/sqlite/migrations/sysdb/00002-segments.sqlite.sql @@ -0,0 +1,16 @@ +CREATE TABLE segments ( + id TEXT PRIMARY KEY, + type TEXT NOT NULL, + scope TEXT NOT NULL, + topic TEXT, + collection TEXT REFERENCES collection(id) +); + +CREATE TABLE segment_metadata ( + segment_id TEXT REFERENCES segments(id) ON DELETE CASCADE, + key TEXT NOT NULL, + str_value TEXT, + int_value INTEGER, + float_value REAL, + PRIMARY KEY (segment_id, key) +); diff --git a/rust/sqlite/migrations/sysdb/00003-collection-dimension.sqlite.sql b/rust/sqlite/migrations/sysdb/00003-collection-dimension.sqlite.sql new file mode 100644 index 00000000000..cb793f49702 --- /dev/null +++ b/rust/sqlite/migrations/sysdb/00003-collection-dimension.sqlite.sql @@ -0,0 +1 @@ +ALTER TABLE collections ADD COLUMN dimension INTEGER; diff --git a/rust/sqlite/migrations/sysdb/00004-tenants-databases.sqlite.sql b/rust/sqlite/migrations/sysdb/00004-tenants-databases.sqlite.sql new file mode 100644 index 00000000000..43372bf97a8 --- /dev/null +++ b/rust/sqlite/migrations/sysdb/00004-tenants-databases.sqlite.sql @@ -0,0 +1,29 @@ +CREATE TABLE IF NOT EXISTS tenants ( + id TEXT PRIMARY KEY, + UNIQUE (id) +); + +CREATE TABLE IF NOT EXISTS databases ( + id TEXT PRIMARY KEY, -- unique globally + name TEXT NOT NULL, -- unique per tenant + tenant_id TEXT NOT NULL REFERENCES tenants(id) ON DELETE CASCADE, + UNIQUE (tenant_id, name) -- Ensure that a tenant has only one database with a given name +); + +CREATE TABLE IF NOT EXISTS collections_tmp ( + id TEXT PRIMARY KEY, -- unique globally + name TEXT NOT NULL, -- unique per database + topic TEXT NOT NULL, + dimension INTEGER, + database_id TEXT NOT NULL REFERENCES databases(id) ON DELETE CASCADE, + UNIQUE (name, database_id) +); + +-- Create default tenant and database +INSERT OR REPLACE INTO tenants (id) VALUES ('default_tenant'); -- The default tenant id is 'default_tenant' others are UUIDs +INSERT OR REPLACE INTO databases (id, name, tenant_id) VALUES ('00000000-0000-0000-0000-000000000000', 'default_database', 'default_tenant'); + +INSERT OR REPLACE INTO collections_tmp (id, name, topic, dimension, database_id) + SELECT id, name, topic, dimension, '00000000-0000-0000-0000-000000000000' FROM collections; +DROP TABLE collections; +ALTER TABLE collections_tmp RENAME TO collections; diff --git a/rust/sqlite/migrations/sysdb/00005-remove-topic.sqlite.sql b/rust/sqlite/migrations/sysdb/00005-remove-topic.sqlite.sql new file mode 100644 index 00000000000..3ed0e028423 --- /dev/null +++ b/rust/sqlite/migrations/sysdb/00005-remove-topic.sqlite.sql @@ -0,0 +1,4 @@ +-- Remove the topic column from the Collections and Segments tables + +ALTER TABLE collections DROP COLUMN topic; +ALTER TABLE segments DROP COLUMN topic; diff --git a/rust/sqlite/migrations/sysdb/00006-collection-segment-metadata.sqlite.sql b/rust/sqlite/migrations/sysdb/00006-collection-segment-metadata.sqlite.sql new file mode 100644 index 00000000000..8d0d5d603d8 --- /dev/null +++ b/rust/sqlite/migrations/sysdb/00006-collection-segment-metadata.sqlite.sql @@ -0,0 +1,6 @@ +-- SQLite does not support adding check with alter table, as a result, adding a check +-- involve creating a new table and copying the data over. It is over kill with adding +-- a boolean type column. The application write to the table needs to ensure the data +-- integrity. +ALTER TABLE collection_metadata ADD COLUMN bool_value INTEGER; +ALTER TABLE segment_metadata ADD COLUMN bool_value INTEGER; diff --git a/rust/sqlite/migrations/sysdb/00007-collection-config.sqlite.sql b/rust/sqlite/migrations/sysdb/00007-collection-config.sqlite.sql new file mode 100644 index 00000000000..35cab052bbb --- /dev/null +++ b/rust/sqlite/migrations/sysdb/00007-collection-config.sqlite.sql @@ -0,0 +1,2 @@ +-- Stores collection configuration dictionaries. +ALTER TABLE collections ADD COLUMN config_json_str TEXT; diff --git a/rust/sqlite/migrations/sysdb/00008-maintenance-log.sqlite.sql b/rust/sqlite/migrations/sysdb/00008-maintenance-log.sqlite.sql new file mode 100644 index 00000000000..8ea44941676 --- /dev/null +++ b/rust/sqlite/migrations/sysdb/00008-maintenance-log.sqlite.sql @@ -0,0 +1,7 @@ +-- Records when database maintenance operations are performed. +-- At time of creation, this table is only used to record vacuum operations. +CREATE TABLE maintenance_log ( + id INT PRIMARY KEY, + timestamp INT NOT NULL, + operation TEXT NOT NULL +); diff --git a/rust/sqlite/migrations/sysdb/00009-segment-collection-not-null.sqlite.sql b/rust/sqlite/migrations/sysdb/00009-segment-collection-not-null.sqlite.sql new file mode 100644 index 00000000000..4f15f8d17a9 --- /dev/null +++ b/rust/sqlite/migrations/sysdb/00009-segment-collection-not-null.sqlite.sql @@ -0,0 +1,11 @@ +-- This makes segments.collection non-nullable. +CREATE TABLE segments_temp ( + id TEXT PRIMARY KEY, + type TEXT NOT NULL, + scope TEXT NOT NULL, + collection TEXT REFERENCES collection(id) NOT NULL +); + +INSERT INTO segments_temp SELECT * FROM segments; +DROP TABLE segments; +ALTER TABLE segments_temp RENAME TO segments; diff --git a/rust/sqlite/src/config.rs b/rust/sqlite/src/config.rs new file mode 100644 index 00000000000..b8be4ebe8dc --- /dev/null +++ b/rust/sqlite/src/config.rs @@ -0,0 +1,25 @@ +#[derive(Clone)] +pub struct SqliteDBConfig { + // The SQLite database URL + pub url: String, + pub hash_type: MigrationHash, + pub migration_mode: MigrationMode, +} + +/// Migration mode for the database +/// - Apply: Apply the migrations +/// - Validate: Validate the applied migrations and ensure none are unappliued +#[derive(Clone, PartialEq)] +pub enum MigrationMode { + Apply, + Validate, +} + +/// The hash function to use for the migration files +/// - SHA256: Use SHA256 hash +/// - MD5: Use MD5 hash +#[derive(Clone)] +pub enum MigrationHash { + SHA256, + MD5, +} diff --git a/rust/sqlite/src/db.rs b/rust/sqlite/src/db.rs new file mode 100644 index 00000000000..d06c12398c5 --- /dev/null +++ b/rust/sqlite/src/db.rs @@ -0,0 +1,438 @@ +use crate::config::{MigrationMode, SqliteDBConfig}; +use crate::migrations::{GetSourceMigrationsError, Migration, MigrationDir, MIGRATION_DIRS}; +use sqlx::sqlite::{SqliteConnectOptions, SqlitePool}; +use sqlx::{Executor, Row}; +use thiserror::Error; + +// // TODO: +// // - support memory mode, add concurrency tests +pub struct SqliteDb { + conn: SqlitePool, +} + +impl SqliteDb { + pub async fn try_from_config(config: &SqliteDBConfig) -> Result { + // TODO: copy all other pragmas from python and add basic tests + let options = SqliteConnectOptions::new() + .filename(&config.url) + // Due to a bug in the python code, foreign_keys is turned off + // The python code enabled it in a transaction, however, + // https://www.sqlite.org/pragma.html states that foreign_keys + // is a no-op in a transaction. In order to be able to run our migrations + // we turn it off + .pragma("foreign_keys", "OFF") + .pragma("case_sensitive_like", "ON") + .create_if_missing(true); + let conn = SqlitePool::connect_with(options) + .await + .map_err(SqliteCreationError::SqlxError)?; + + let db = Self { conn }; + + db.initialize_migrations_table().await?; + match config.migration_mode { + MigrationMode::Apply => { + let mut all_unapplied_migrations = Vec::new(); + for dir in MIGRATION_DIRS.iter() { + let applied_migrations = db.get_existing_migrations(dir).await; + let source_migrations = dir + .get_source_migrations(&config.hash_type) + .map_err(SqliteCreationError::GetSourceMigrationsError)?; + let unapplied = db + .validate_migrations_and_get_unapplied( + applied_migrations, + source_migrations, + ) + .map_err(SqliteCreationError::MigrationValidationError)?; + all_unapplied_migrations.extend(unapplied); + } + db.apply_migrations(all_unapplied_migrations).await?; + } + MigrationMode::Validate => { + // This should realistically never happen, since we just initialized the migrations table + // above in an idempotent way. This is defensive. + if !db.has_initialized_migrations().await { + return Err(SqliteCreationError::MigrationsTableNotInitialized); + } + for dir in MIGRATION_DIRS.iter() { + let applied_migrations = db.get_existing_migrations(dir).await; + let source_migrations = dir + .get_source_migrations(&config.hash_type) + .map_err(SqliteCreationError::GetSourceMigrationsError)?; + let unapplied = db.validate_migrations_and_get_unapplied( + applied_migrations, + source_migrations, + )?; + if !unapplied.is_empty() { + return Err(SqliteCreationError::UnappliedMigrationsFound); + } + } + } + } + Ok(db) + } + + //////////////////////// Migrations //////////////////////// + + /// Apply all migrations in a transaction + /// Arguments: + /// - migrations: Vec - The migrations to apply + async fn apply_migrations(&self, migrations: Vec) -> Result<(), sqlx::Error> { + let mut tx = self.conn.begin().await?; + for migration in migrations { + // Apply the migration + // TODO(hammadb): Determine how to handle foreign keys on + // this is copied over from the python code but it does + // not work in a transaction + tx.execute("PRAGMA foreign_keys = ON").await?; + tx.execute(sqlx::query(&migration.sql)).await?; + + // Bookkeeping + let query = r#" + INSERT INTO migrations (dir, version, filename, sql, hash) + VALUES ($1, $2, $3, $4, $5) + "#; + let query = sqlx::query(query) + .bind(&migration.dir) + .bind(migration.version) + .bind(&migration.filename) + .bind(&migration.sql) + .bind(&migration.hash); + tx.execute(query).await?; + } + tx.commit().await?; + Ok(()) + } + + /// Validate migration sequence and get the migrations that need to be applied + /// ## Arguments: + /// - applied_migrations: Vec - The migrations that have been applied, in ascending version order + /// - source_migrations: Vec - The migrations that are on disk, in ascending version order + /// ## Returns: + /// - Vec - The migrations that need to be applied + fn validate_migrations_and_get_unapplied( + &self, + applied_migrations: Vec, + source_migrations: Vec, + ) -> Result, MigrationValidationError> { + for (db_migration, source_migration) in + applied_migrations.iter().zip(source_migrations.iter()) + { + if db_migration.version != source_migration.version { + return Err(MigrationValidationError::InconsistentVersion( + db_migration.version, + source_migration.version, + )); + } + if db_migration.hash != source_migration.hash { + return Err(MigrationValidationError::InconsistentHash( + db_migration.hash.clone(), + source_migration.hash.clone(), + )); + } + if db_migration.sql != source_migration.sql { + return Err(MigrationValidationError::InconsistentHash( + db_migration.hash.clone(), + source_migration.hash.clone(), + )); + } + } + + let unapplied = source_migrations[applied_migrations.len()..].to_vec(); + Ok(unapplied) + } + + /// Initialize the migrations table + /// Note: + /// - This function is idempotent + async fn initialize_migrations_table(&self) -> Result<(), sqlx::Error> { + let query = r#" + CREATE TABLE IF NOT EXISTS migrations ( + dir TEXT NOT NULL, + version INTEGER NOT NULL, + filename TEXT NOT NULL, + sql TEXT NOT NULL, + hash TEXT NOT NULL, + PRIMARY KEY (dir, version) + ) + "#; + sqlx::query(query).execute(&self.conn).await?; + Ok(()) + } + + /// Check if the migrations table has been initialized + /// Returns: + /// - bool - True if the migrations table has been initialized + async fn has_initialized_migrations(&self) -> bool { + let query = r#" + SELECT name FROM sqlite_master WHERE type='table' AND name='migrations' + "#; + let row = sqlx::query(query) + .fetch_one(&self.conn) + .await + .expect("Expect it to be fetched"); + let name: String = row.get("name"); + name == "migrations" // Sanity check + } + + /// Get existing migrations for a given directory + /// Arguments: + /// - dir_name: str - The name of the directory that contains the migrations + /// ## Returns: + /// - Vec - A list of migrations + /// ## Notes + /// - dir_name has to be held constant for a given migration directory + /// - The migrations are sorted by version in ascending order + /// - The dir_name is consistent with the python implementation + async fn get_existing_migrations(&self, dir: &MigrationDir) -> Vec { + let query = r#" + SELECT dir, version, filename, sql, hash + FROM migrations + WHERE dir = $1 + ORDER BY version ASC + "#; + let rows = sqlx::query(query) + .bind(dir.as_str()) + .fetch_all(&self.conn) + .await + .expect("Expect it to be fetched"); + + let mut migrations = Vec::new(); + for row in rows { + let dir: String = row.get("dir"); + let version: i32 = row.get("version"); + let filename: String = row.get("filename"); + let sql: String = row.get("sql"); + let hash: String = row.get("hash"); + migrations.push(Migration::new(dir, filename, version, sql, hash)); + } + migrations + } +} + +//////////////////////// Error Types //////////////////////// + +#[derive(Error, Debug)] +pub enum SqliteCreationError { + #[error(transparent)] + SqlxError(#[from] sqlx::Error), + #[error(transparent)] + GetSourceMigrationsError(#[from] GetSourceMigrationsError), + #[error(transparent)] + MigrationValidationError(#[from] MigrationValidationError), + #[error("Migrations table not initialized")] + MigrationsTableNotInitialized, + #[error("Unapplied migrations found")] + UnappliedMigrationsFound, +} + +#[derive(Error, Debug)] +pub enum MigrationValidationError { + #[error("Inconsistent version: db={0}, source={1}")] + InconsistentVersion(i32, i32), + #[error("Inconsistent hash: db={0}, source={1}")] + InconsistentHash(String, String), +} + +//////////////////////// Tests //////////////////////// +#[cfg(test)] +mod tests { + use super::*; + use crate::config::MigrationHash; + use sqlx::Row; + use std::path::PathBuf; + use tempfile::tempdir; + + //////////////////////// Test Helpers //////////////////////// + + fn test_migration_dir() -> PathBuf { + let migration_dir = "migrations/".to_string(); + PathBuf::from(migration_dir) + } + + fn new_test_db_path() -> String { + let path = tempdir().unwrap().into_path().join("test.db"); + path.to_str().unwrap().to_string() + } + + //////////////////////// SqliteDb //////////////////////// + + #[tokio::test] + async fn test_sqlite_db() { + let config = SqliteDBConfig { + url: new_test_db_path(), + hash_type: MigrationHash::MD5, + migration_mode: MigrationMode::Apply, + }; + let db = SqliteDb::try_from_config(&config) + .await + .expect("Expect it to be created"); + + // Check if migrations table exists + let query = r#" + SELECT name FROM sqlite_master WHERE type='table' AND name='migrations' + "#; + let row = sqlx::query(query) + .fetch_one(&db.conn) + .await + .expect("Expect it to be fetched"); + let name: String = row.get("name"); + assert_eq!(name, "migrations"); + } + + #[tokio::test] + async fn test_it_initializes_and_validates() { + let config: SqliteDBConfig = SqliteDBConfig { + url: new_test_db_path(), + hash_type: MigrationHash::MD5, + migration_mode: MigrationMode::Apply, + }; + let db = SqliteDb::try_from_config(&config) + .await + .expect("Expect it to be created"); + + // Check if migrations table exists + let query = r#" + SELECT name FROM sqlite_master WHERE type='table' AND name='migrations' + "#; + let row = sqlx::query(query) + .fetch_one(&db.conn) + .await + .expect("Expect it to be fetched"); + let name: String = row.get("name"); + assert_eq!(name, "migrations"); + } + + #[tokio::test] + async fn test_migrations_get_applied_on_new_db() { + let test_db_path = new_test_db_path(); + let config = SqliteDBConfig { + url: test_db_path.clone(), + hash_type: MigrationHash::MD5, + migration_mode: MigrationMode::Apply, + }; + let db = SqliteDb::try_from_config(&config) + .await + .expect("Expect it to be created"); + + // Ensure the migrations were applied by checking the count of migrations we see + // after creating the db + for dir in MIGRATION_DIRS.iter() { + let migrations = db.get_existing_migrations(dir).await; + let on_disk_path = test_migration_dir().join(dir.as_str()); + // See how many files are in the directory + let files = std::fs::read_dir(on_disk_path).expect("Expect it to be read"); + let num_files = files.count(); + assert_eq!(migrations.len(), num_files); + } + + // Ensure validate mode works + let config = SqliteDBConfig { + url: test_db_path, + hash_type: MigrationHash::MD5, + migration_mode: MigrationMode::Validate, + }; + + let _ = SqliteDb::try_from_config(&config) + .await + .expect("Expect it to be created & validated"); + } + + #[tokio::test] + async fn test_migrations_tampered() { + let test_db_path = new_test_db_path(); + let config = SqliteDBConfig { + url: test_db_path.clone(), + hash_type: MigrationHash::MD5, + migration_mode: MigrationMode::Apply, + }; + let db = SqliteDb::try_from_config(&config) + .await + .expect("Expect it to be created"); + + // Tamper with a migration file in the db + let dir = &MIGRATION_DIRS[0]; + let migrations = db.get_existing_migrations(dir).await; + let mut tampered_migration = migrations[0].clone(); + tampered_migration.sql = "SELECT 1".to_string(); + let query = r#" + UPDATE migrations + SET sql = $1 + WHERE dir = $2 AND version = $3 + "#; + let query = sqlx::query(query) + .bind(&tampered_migration.sql) + .bind(&tampered_migration.dir) + .bind(tampered_migration.version); + db.conn + .execute(query) + .await + .expect("Expect it to be executed"); + + // Ensure validate mode fails + let config = SqliteDBConfig { + url: test_db_path, + hash_type: MigrationHash::MD5, + migration_mode: MigrationMode::Validate, + }; + + let result = SqliteDb::try_from_config(&config).await; + match result { + Ok(_) => panic!("Expect it to fail"), + Err(SqliteCreationError::MigrationValidationError( + MigrationValidationError::InconsistentHash(_, _), + )) => {} + Err(_) => panic!("Expect it to be a MigrationValidationError"), + } + } + + #[tokio::test] + async fn test_migrations_reorder() { + let test_db_path = new_test_db_path(); + let config = SqliteDBConfig { + url: test_db_path.clone(), + hash_type: MigrationHash::MD5, + migration_mode: MigrationMode::Apply, + }; + let db = SqliteDb::try_from_config(&config) + .await + .expect("Expect it to be created"); + + // Reorder the migrations in the db + let dir = &MIGRATION_DIRS[0]; + let migrations = db.get_existing_migrations(dir).await; + let mut reordered_migrations = migrations.clone(); + reordered_migrations.reverse(); + for (i, migration) in reordered_migrations.iter().enumerate() { + let query = r#" + UPDATE migrations + SET version = $1 + WHERE dir = $2 AND version = $3 + "#; + let query = sqlx::query(query) + .bind((i + reordered_migrations.len()) as u32) + .bind(&migration.dir) + .bind(migration.version); + db.conn + .execute(query) + .await + .expect("Expect it to be executed"); + } + + // Ensure validate mode fails + let config = SqliteDBConfig { + url: test_db_path, + hash_type: MigrationHash::MD5, + migration_mode: MigrationMode::Validate, + }; + + let result = SqliteDb::try_from_config(&config).await; + match result { + Ok(_) => panic!("Expect it to fail"), + Err(SqliteCreationError::MigrationValidationError( + MigrationValidationError::InconsistentVersion(_, _), + )) => {} + Err(_) => panic!("Expect it to be a MigrationValidationError"), + } + } +} diff --git a/rust/sqlite/src/lib.rs b/rust/sqlite/src/lib.rs new file mode 100644 index 00000000000..8a4ce59418d --- /dev/null +++ b/rust/sqlite/src/lib.rs @@ -0,0 +1,3 @@ +mod config; +pub mod db; +mod migrations; diff --git a/rust/sqlite/src/migrations.rs b/rust/sqlite/src/migrations.rs new file mode 100644 index 00000000000..0441f2f8335 --- /dev/null +++ b/rust/sqlite/src/migrations.rs @@ -0,0 +1,230 @@ +use crate::config::MigrationHash; +use core::str; +use regex::Regex; +use rust_embed::Embed; +use sha2::{Digest, Sha256}; +use std::{borrow::Cow, sync::LazyLock}; +use thiserror::Error; + +///////////// Migration Types ////////////// + +// A migration is a single SQL file that is executed to update the database schema +// ## Fields +// - dir: The directory where the migration file is located. One of "sysdb", "metadb", "embeddings_queue" +// - filename: The name of the migration file +// - version: The version of the migration file +// - sql: The SQL content of the migration file +// - hash: The hash of the migration file content +// ## Note +// - Due to legacy naming from the python codebase, the "log" table is known +// as "embeddings_queue" in the Rust codebase. Only in the sql files is it referred to as "embeddings_queue" +// Elsewhere in our code we should refer to it as "log" +#[derive(Clone, Debug)] +pub(crate) struct Migration { + pub(crate) dir: String, + pub(crate) filename: String, + pub(crate) version: i32, + pub(crate) sql: String, + pub(crate) hash: String, +} + +impl Migration { + pub(crate) fn new( + dir: String, + filename: String, + version: i32, + sql: String, + hash: String, + ) -> Self { + Self { + dir, + filename, + version, + sql, + hash, + } + } +} + +// A migration dir is a directory that contains migration files +// for a given subsystem +pub(crate) enum MigrationDir { + SysDb, + MetaDb, + EmbeddingsQueue, +} + +pub(crate) const MIGRATION_DIRS: [MigrationDir; 3] = [ + MigrationDir::SysDb, + MigrationDir::MetaDb, + MigrationDir::EmbeddingsQueue, +]; + +#[derive(Error, Debug)] +pub enum GetSourceMigrationsError { + #[error(transparent)] + ParseMigrationFilenameError(#[from] ParseMigrationFilenameError), + #[error("{0}")] + NoSuchMigrationFile(String), + #[error("Failed to get migration file: {0}")] + FailedToGetMigrationFile(String), +} + +impl MigrationDir { + pub(crate) fn as_str(&self) -> &str { + match self { + Self::SysDb => "sysdb", + Self::MetaDb => "metadb", + Self::EmbeddingsQueue => "embeddings_queue", + } + } + + fn iter(&self) -> Box>> { + match self { + Self::SysDb => Box::new(SysDbMigrationsFolder::iter()), + Self::MetaDb => Box::new(MetaDbMigrationsFolder::iter()), + Self::EmbeddingsQueue => Box::new(EmbeddingsQueueMigrationsFolder::iter()), + } + } + + fn get_file(&self, name: &str) -> Option { + match self { + Self::SysDb => SysDbMigrationsFolder::get(name), + Self::MetaDb => MetaDbMigrationsFolder::get(name), + Self::EmbeddingsQueue => EmbeddingsQueueMigrationsFolder::get(name), + } + } + + /// Get the migrations that are on disk + /// Arguments: + /// - migration_hash: MigrationHash - The hash function to use for the migration files + /// ## Returns: + /// - Vec - A list of migrations found on disk, sorted by version in ascending order + /// ## Notes: + /// - Uses the migrations_root_dir of this SqlDB instance + pub(crate) fn get_source_migrations( + &self, + migration_hash: &MigrationHash, + ) -> Result, GetSourceMigrationsError> { + let mut migrations = Vec::new(); + + for migration_name in self.iter() { + let (version, _) = parse_migration_filename(&migration_name) + .map_err(GetSourceMigrationsError::ParseMigrationFilenameError)?; + let sql = match self.get_file(&migration_name) { + Some(sql) => str::from_utf8(&sql.data) + .map_err(|_| { + GetSourceMigrationsError::FailedToGetMigrationFile( + migration_name.to_string(), + ) + })? + .to_string(), + None => { + return Err(GetSourceMigrationsError::NoSuchMigrationFile( + migration_name.to_string(), + )) + } + }; + let hash = match migration_hash { + MigrationHash::SHA256 => { + let mut hasher = Sha256::new(); + hasher.update(sql.as_bytes()); + format!("{:x}", hasher.finalize()) + } + MigrationHash::MD5 => { + let hash = md5::compute(sql.as_bytes()); + format!("{:x}", hash) + } + }; + migrations.push(Migration::new( + self.as_str().to_string(), + migration_name.to_string(), + version, + sql, + hash, + )); + } + + migrations.sort_by(|a, b| a.version.cmp(&b.version)); + Ok(migrations) + } +} + +///////////// MigrationDir Helpers ////////////// + +#[derive(Error, Debug)] +pub enum ParseMigrationFilenameError { + #[error("Invalid migration filename: {0}")] + InvalidMigrationFilename(String), + #[error("Failed to find version")] + FailedToFindVersion, + #[error("Failed to find scope")] + FailedToFindScope, +} + +// Parse the migration filename +// Arguments: +// - filename: str - The filename of the migration +// Returns: +// - (i32, str) - The version and scope of the migration +// Notes +// - Format is -..sql +// - e.g, 00001-users.sqlite.sql +// - scope is unused, it is legacy from the python implementation. It is +// written but never read +fn parse_migration_filename(filename: &str) -> Result<(i32, String), ParseMigrationFilenameError> { + let regex_match = MIGRATION_FILENAME_REGEX.captures(filename); + let groups = match regex_match { + Some(groups) => groups, + None => { + return Err(ParseMigrationFilenameError::InvalidMigrationFilename( + filename.to_string(), + )) + } + }; + + // Parse version + let version = match groups.get(1) { + Some(version) => version, + None => return Err(ParseMigrationFilenameError::FailedToFindVersion), + }; + let version = match version.as_str().parse::() { + Ok(version) => version, + Err(e) => { + return Err(ParseMigrationFilenameError::InvalidMigrationFilename( + e.to_string(), + )) + } + }; + + // Parse scope + let scope = match groups.get(3) { + Some(scope) => scope, + None => return Err(ParseMigrationFilenameError::FailedToFindScope), + }; + let scope = scope.as_str().to_string(); + + Ok((version, scope)) +} + +static MIGRATION_FILENAME_REGEX: LazyLock = + LazyLock::new(|| Regex::new(r"(\d+)-(.+)\.(.+)\.sql").expect("Failed to compile regex")); + +///////////// Rust Embed Migrations Data ////////////// +// The migration files are embedded in the binary using the `rust_embed` crate +// These are internal to this file and should not be used elsewhere + +#[derive(Embed)] +#[folder = "./migrations/sysdb/"] +#[include = "*.sql"] +struct SysDbMigrationsFolder; + +#[derive(Embed)] +#[folder = "./migrations/metadb/"] +#[include = "*.sql"] +struct MetaDbMigrationsFolder; + +#[derive(Embed)] +#[folder = "./migrations/embeddings_queue/"] +#[include = "*.sql"] +struct EmbeddingsQueueMigrationsFolder;