# Builds `vsag_wrapper`, a static C++ library compiled from src/wrapper.cpp,
# and links it against VSAG, which is downloaded and built via FetchContent.

# FetchContent_MakeAvailable() was added in CMake 3.14; with an older minimum
# the configure step fails with an "unknown command" error instead of a clear
# version diagnostic.
cmake_minimum_required(VERSION 3.14)
project(vsag_wrapper)

set(CMAKE_CXX_STANDARD 11)

# download and build vsag
include(FetchContent)
FetchContent_Declare(
    vsag
    GIT_REPOSITORY https://github.com/alipay/vsag
    GIT_TAG v0.11.6
)
FetchContent_MakeAvailable(vsag)

add_library(vsag_wrapper STATIC src/wrapper.cpp)

# NOTE(review): the first two entries were reduced to a bare `$` in the text
# this was recovered from; they are reconstructed here as the conventional
# build/install interface pair for the project's `include/` directory
# (which holds wrapper.h) -- confirm against the repository.
target_include_directories(vsag_wrapper PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:include>
    ${vsag_SOURCE_DIR}/include
)

# link with vsag
target_link_libraries(vsag_wrapper PUBLIC vsag)
+version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "bindgen" +version = "0.69.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", + "which", +] + +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cc" +version = "1.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50d2eb3cd3d1bf4529e31c215ee6f93ec5a3d536d9f578f93d9d33ee19562932" +dependencies = [ + "shlex", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "cmake" 
+version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +dependencies = [ + "cc", +] + +[[package]] +name = "distances" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6e6d14d6c64164e1457a5765821c991c327abb47099a934832f756c84a71572" +dependencies = [ + "rand 0.8.5", + "serde", +] + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fuchsia-cprng" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + 
+[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.158" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" + +[[package]] +name = "libloading" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" +dependencies = [ + "cfg-if", + "windows-targets", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479cf940fbbb3426c32c5d5176f62ad57549a0bb84773423ba8be9d089f5faba" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" +dependencies = [ + "fuchsia-cprng", + "libc", + "rand_core 0.3.1", + "rdrand", + "winapi", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_core" +version = "0.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" +dependencies = [ + "rand_core 0.4.2", +] + +[[package]] +name = "rand_core" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rdrand" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" +dependencies = [ + "rand_core 0.3.1", +] + +[[package]] +name = "regex" +version = "1.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "serde" +version = "1.0.209" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.209" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "simsimd" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efc843bc8f12d9c8e6b734a0fe8918fc497b42f6ae0f347dbfdad5b5138ab9b4" +dependencies = [ + "cc", +] + +[[package]] +name = "syn" +version = "2.0.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempdir" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15f2b5fb00ccdf689e0149d1b1b3c03fead81c2b37735d812fa8bddbbf41b6d8" +dependencies = [ + "rand 0.4.6", + "remove_dir_all", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + 
+[[package]] +name = "vsag-sys" +version = "0.0.1" +dependencies = [ + "bindgen", + "cmake", + "distances", + "libc", + "rand 0.8.5", + "simsimd", + "tempdir", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + 
"windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + 
+[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..b87ac46 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "vsag-sys" +version = "0.0.1" +edition = "2021" +license = "Apache-2.0" + +[dependencies] +libc = "*" + +[build-dependencies] +cmake = "0.1" +bindgen = "0.69" + +[dev-dependencies] +rand = "0.8" +distances = "1" +simsimd = "4" +tempdir = "0.3" diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..8481549 --- /dev/null +++ b/README.md @@ -0,0 +1,57 @@ +# VSAG Rust Binding + +A Rust binding for [VSAG](https://github.com/alipay/vsag). + +## Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +vsag-sys = "0.0.1" +``` + +Try the example: + +```rust +use vsag_sys::VsagIndex; + +let index_type = "hnsw"; +let con_params = r#"{ + "dtype": "float32", + "metric_type": "l2", + "dim": 128, + "hnsw": { + "max_degree": 16, + "ef_construction": 100 + } +}"#; +let search_params = r#"{ + "hnsw": { + "ef_search": 100 + } +}"#; + +let index = VsagIndex::new(index_type, con_params).unwrap(); + +let ids: Vec<i64> = (0..num_vectors as i64).collect(); +let vectors = (0..num_vectors) + .map(|_| { + (0..dim) + .map(|_| rand::random::<f32>()) + .collect::<Vec<_>>() + }) + .collect::<Vec<_>>(); +let vectors_for_index: Vec<f32> = vectors.iter().flat_map(|v| v.iter().copied()).collect(); + +let failed_ids = index + .build(num_vectors, dim, &ids, &vectors_for_index) + .unwrap(); +assert_eq!(failed_ids.len(), 0); + +let query_vector: Vec<f32> = (0..dim).map(|_| rand::random()).collect(); +let k = 10; +let output = index.knn_search(&query_vector, k, search_params).unwrap(); +assert_eq!(output.ids.len(), k.min(num_vectors)); +assert_eq!(output.distances.len(), k.min(num_vectors)); +``` diff --git a/build.rs b/build.rs new file mode 100644 index
0000000..e472466 --- /dev/null +++ b/build.rs @@ -0,0 +1,33 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +fn main() { + println!("cargo:rerun-if-changed=include/wrapper.h"); + println!("cargo:rerun-if-changed=src/wrapper.cpp"); + + let dst = cmake::Config::new("") + .build_target("vsag_wrapper") + // Cargo sets TARGET to the target triple + // but building openblas via cmake will fail if it's set + .env("TARGET", "") + .build(); + + println!("cargo:rustc-link-search=native={}/build", dst.display()); + println!( + "cargo:rustc-link-search=native={}/build/_deps/vsag-build/src", + dst.display() + ); + println!("cargo:rustc-link-lib=dylib=vsag"); + println!("cargo:rustc-link-lib=static=vsag_wrapper"); +} diff --git a/include/wrapper.h b/include/wrapper.h new file mode 100644 index 0000000..81d26bb --- /dev/null +++ b/include/wrapper.h @@ -0,0 +1,79 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef WRAPPER_H
+#define WRAPPER_H
+
+// size_t and int64_t are used in the declarations below.
+#include <cstddef>
+#include <cstdint>
+
+// C ABI consumed by the Rust side (src/ffi.rs).
+//
+// Conventions shared by every function that returns `const CError*`:
+//   - nullptr means success; the out-parameters have been written.
+//   - non-null means failure; the caller must release it with free_error().
+//   - `in_*` parameters are borrowed for the duration of the call only.
+//   - `out_*` arrays are allocated by the wrapper and must be released with
+//     free_i64_vector() / free_f32_vector().
+extern "C" {
+
+// Error returned across the FFI boundary.
+struct CError {
+    // Numeric value of the vsag error type.
+    int type_;
+    // NUL-terminated message; released together with the struct by free_error().
+    const char* message;
+
+    CError(int type, const char* message) : type_(type), message(message) {}
+};
+
+// Creates an index of `in_index_type` configured by the JSON `in_parameters`.
+// On success `*out_index_ptr` receives an opaque handle to pass to the other
+// functions and finally to free_index().
+const CError* create_index(
+    const char* in_index_type,
+    const char* in_parameters,
+
+    void** out_index_ptr
+);
+
+// Builds the index from `in_num_vectors` vectors of `in_dim` floats each.
+// `in_ids` must hold `in_num_vectors` elements and `in_vectors` must hold
+// `in_num_vectors * in_dim` elements.
+// On success `*out_failed_ids` / `*out_num_failed` describe the IDs reported
+// as failed by the index; release the array with free_i64_vector().
+const CError* build_index(
+    void* in_index_ptr,
+    size_t in_num_vectors,
+    size_t in_dim,
+    const int64_t* in_ids,
+    const float* in_vectors,
+
+    const int64_t** out_failed_ids,
+    size_t* out_num_failed
+);
+
+// Searches the `in_k` nearest neighbors of one query vector of `in_dim` floats.
+// `in_search_parameters` is a JSON string.
+// On success `*out_ids` and `*out_distances` each hold `*out_num_results`
+// elements; release them with free_i64_vector() / free_f32_vector().
+const CError* knn_search_index(
+    void* in_index_ptr,
+    size_t in_dim,
+    const float* in_query_vector,
+    size_t in_k,
+    const char* in_search_parameters,
+
+    const int64_t** out_ids,
+    const float** out_distances,
+    size_t* out_num_results
+);
+
+// Serializes the index into the single file at `in_file_path`.
+const CError* dump_index(
+    void* in_index_ptr,
+    const char* in_file_path
+);
+
+// Creates an index from `in_index_type` / `in_parameters` and fills it from a
+// file previously written by dump_index(). On success `*out_index_ptr`
+// receives a handle that must be released with free_index().
+const CError* load_index(
+    const char* in_file_path,
+    const char* in_index_type,
+    const char* in_parameters,
+
+    void** out_index_ptr
+);
+
+// Release functions; each accepts nullptr as a no-op.
+void free_error(const CError*);
+void free_index(void* index_ptr);
+void free_i64_vector(int64_t* vector);
+void free_f32_vector(float* vector);
+} // extern "C"
+
+#endif // WRAPPER_H
\ No newline at end of file
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..64d94de
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1,2 @@
+group_imports = "StdExternalCrate"
+imports_granularity = "Module"
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..23411a0
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,57 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// Result type used throughout this crate; the error is always [`Error`].
+pub type Result<T> = std::result::Result<T, Error>;
+
+/// Error reported by the underlying index library or by this binding.
+#[derive(Debug)]
+pub struct Error {
+    /// Category of the failure.
+    pub error_type: ErrorType,
+    /// Human-readable description of the failure.
+    pub message: String,
+}
+
+impl std::fmt::Display for Error {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}: {}", self.error_type, self.message)
+    }
+}
+
+// Lets `Error` be used with `?` into `Box<dyn std::error::Error>` and with
+// error-handling crates that require the standard trait.
+impl std::error::Error for Error {}
+
+/// Category of an [`Error`].
+///
+/// The discriminants start at 1 and increase by one per variant; they are the
+/// integer codes carried by `CError::type_` on the C side.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[repr(C)]
+pub enum ErrorType {
+    // [common errors]
+    /// unknown error
+    UnknownError = 1,
+    /// an internal error occurred in the algorithm
+    InternalError,
+    /// invalid argument
+    InvalidArgument,
+
+    // [behavior errors]
+    /// index has already been built, cannot build again
+    BuildTwice,
+    /// index object is NOT empty, so it must not be deserialized into
+    IndexNotEmpty,
+    /// trying to create an unsupported index
+    UnsupportedIndex,
+    /// the index does not support this function
+    UnsupportedIndexOperation,
+    /// the dimension of the add/build/search request is NOT equal to the index's
+    DimensionNotEqual,
+    /// index is empty, cannot search or serialize
+    IndexEmpty,
+
+    // [runtime errors]
+    /// failed to allocate memory
+    NoEnoughMemory,
+    /// cannot read from binary
+    ReadError,
+    /// some file is missing in index diskann deserialization
+    MissingFile,
+    /// the content of the binary is invalid
+    InvalidBinary,
+}
diff --git a/src/ffi.rs b/src/ffi.rs
new file mode 100644
index 0000000..6fa1377
--- /dev/null
+++ b/src/ffi.rs
@@ -0,0 +1,105 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::os::raw::{c_char, c_int, c_void};
+
+// Raw declarations of the C ABI exported by src/wrapper.cpp.
+//
+// Every function returning `*const CError` returns null on success and an
+// error that must be released with `free_error` on failure.
+extern "C" {
+    pub fn create_index(
+        in_index_type: *const c_char,
+        in_parameters: *const c_char,
+
+        out_index_ptr: *mut *const c_void,
+    ) -> *const CError;
+
+    pub fn build_index(
+        in_index_ptr: *const c_void,
+        in_num_vectors: usize,
+        in_dim: usize,
+        in_ids: *const i64,
+        in_vectors: *const f32,
+
+        out_failed_ids: *mut *const i64,
+        out_num_failed: *mut usize,
+    ) -> *const CError;
+
+    pub fn knn_search_index(
+        in_index_ptr: *const c_void,
+        in_dim: usize,
+        in_query_vector: *const f32,
+        in_k: usize,
+        in_search_parameters: *const c_char,
+
+        out_ids: *mut *const i64,
+        out_distances: *mut *const f32,
+        out_num_results: *mut usize,
+    ) -> *const CError;
+
+    pub fn dump_index(in_index_ptr: *const c_void, in_file_path: *const c_char) -> *const CError;
+
+    pub fn load_index(
+        in_file_path: *const c_char,
+        in_index_type: *const c_char,
+        in_parameters: *const c_char,
+
+        out_index_ptr: *mut *const c_void,
+    ) -> *const CError;
+
+    pub fn free_index(index_ptr: *const c_void);
+    pub fn free_error(error: *const CError);
+    pub fn free_i64_vector(vector: *const i64);
+    pub fn free_f32_vector(vector: *const f32);
+}
+
+/// Mirror of the C `CError` struct.
+#[repr(C)]
+pub struct CError {
+    pub type_: c_int,
+    pub message: *const c_char,
+}
+
+/// Maps a raw error code onto [`crate::error::ErrorType`].
+///
+/// Codes that do not correspond to a known variant become `UnknownError`.
+/// Transmuting such a code would create an invalid enum value, which is
+/// undefined behaviour, so the conversion is done by comparison instead.
+fn error_type_from_code(code: c_int) -> crate::error::ErrorType {
+    use crate::error::ErrorType as T;
+
+    match code {
+        c if c == T::UnknownError as c_int => T::UnknownError,
+        c if c == T::InternalError as c_int => T::InternalError,
+        c if c == T::InvalidArgument as c_int => T::InvalidArgument,
+        c if c == T::BuildTwice as c_int => T::BuildTwice,
+        c if c == T::IndexNotEmpty as c_int => T::IndexNotEmpty,
+        c if c == T::UnsupportedIndex as c_int => T::UnsupportedIndex,
+        c if c == T::UnsupportedIndexOperation as c_int => T::UnsupportedIndexOperation,
+        c if c == T::DimensionNotEqual as c_int => T::DimensionNotEqual,
+        c if c == T::IndexEmpty as c_int => T::IndexEmpty,
+        c if c == T::NoEnoughMemory as c_int => T::NoEnoughMemory,
+        c if c == T::ReadError as c_int => T::ReadError,
+        c if c == T::MissingFile as c_int => T::MissingFile,
+        c if c == T::InvalidBinary as c_int => T::InvalidBinary,
+        _ => T::UnknownError,
+    }
+}
+
+/// Converts a non-null `CError` returned by the wrapper into a Rust error and
+/// releases the C allocation.
+///
+/// `err` must be a non-null pointer obtained from one of the wrapper functions
+/// and must not be used after this call.
+pub fn from_c_error(err: *const CError) -> crate::error::Error {
+    // Copy everything out of the C struct before it is freed.
+    let (code, message) = unsafe {
+        let raw = &*err;
+        let message = if raw.message.is_null() {
+            // The message is produced with strdup() on the C++ side, which
+            // can return null when allocation fails.
+            String::new()
+        } else {
+            std::ffi::CStr::from_ptr(raw.message)
+                .to_string_lossy()
+                .into_owned()
+        };
+        (raw.type_, message)
+    };
+    unsafe {
+        free_error(err);
+    }
+
+    crate::error::Error {
+        error_type: error_type_from_code(code),
+        message,
+    }
+}
+
+/// Copies `len` elements out of a wrapper-allocated `i64` array and frees it.
+///
+/// A null `vector` yields an empty `Vec`; building a slice from a null
+/// pointer is undefined behaviour even when `len` is 0.
+pub fn from_c_i64_vector(vector: *const i64, len: usize) -> Vec<i64> {
+    if vector.is_null() {
+        return Vec::new();
+    }
+    let vec = unsafe { std::slice::from_raw_parts(vector, len) }.to_vec();
+    unsafe {
+        free_i64_vector(vector);
+    }
+    vec
+}
+
+/// Copies `len` elements out of a wrapper-allocated `f32` array and frees it.
+///
+/// A null `vector` yields an empty `Vec`.
+pub fn from_c_f32_vector(vector: *const f32, len: usize) -> Vec<f32> {
+    if vector.is_null() {
+        return Vec::new();
+    }
+    let vec = unsafe { std::slice::from_raw_parts(vector, len) }.to_vec();
+    unsafe {
+        free_f32_vector(vector);
+    }
+    vec
+}
+
+/// Converts `s` into a NUL-terminated C string.
+///
+/// Panics if `s` contains an interior NUL byte.
+pub fn to_c_string(s: &str) -> std::ffi::CString {
+    std::ffi::CString::new(s).expect("0 byte in string")
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..7630c6f
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,394 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pub mod error;
+mod ffi;
+
+use std::os::raw::c_void;
+
+use ffi::dump_index;
+
+use crate::error::Result;
+use crate::ffi::{
+    build_index, create_index, free_index, from_c_error, from_c_f32_vector, from_c_i64_vector,
+    knn_search_index, to_c_string,
+};
+
+/// `VsagIndex` is a wrapper around the C++ index object.
+///
+/// When the `VsagIndex` is dropped, the C++ index object is freed.
+pub struct VsagIndex {
+    /// Pointer to the C++ index object.
+    ptr: *const c_void,
+}
+
+impl VsagIndex {
+    /// Creates a new vsag index.
+    ///
+    /// `index_type` is the type of index to create.
Currently supported values are:
+    /// - `hnsw`
+    /// - `diskann`
+    ///
+    /// HNSW.params in JSON format:
+    /// - dtype: string, required, one of [float32]
+    /// - metric_type: string, required, one of [l2, ip, cosine]
+    /// - dim: integer, required
+    /// - hnsw.max_degree: integer, required
+    /// - hnsw.ef_construction: integer, required
+    ///
+    /// e.g.,
+    /// ```json
+    /// {
+    ///     "dtype": "float32",
+    ///     "metric_type": "l2",
+    ///     "dim": 128,
+    ///     "hnsw": {
+    ///         "max_degree": 16,
+    ///         "ef_construction": 200
+    ///     }
+    /// }
+    /// ```
+    ///
+    /// DiskANN.params in JSON format:
+    /// - dtype: string, required, one of [float32]
+    /// - metric_type: string, required, one of [l2, ip]
+    /// - dim: integer, required
+    /// - diskann.max_degree: integer, required
+    /// - diskann.ef_construction: integer, required
+    /// - diskann.pq_dims: integer, required
+    /// - diskann.pq_sample_rate: floating number, required, in range (0.0, 1.0]
+    ///
+    /// e.g.,
+    /// ```json
+    /// {
+    ///     "dtype": "float32",
+    ///     "metric_type": "l2",
+    ///     "dim": 128,
+    ///     "diskann": {
+    ///         "max_degree": 16,
+    ///         "ef_construction": 200,
+    ///         "pq_dims": 64,
+    ///         "pq_sample_rate": 0.5
+    ///     }
+    /// }
+    /// ```
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index_type` or `params` contains an interior NUL byte.
+    pub fn new(index_type: &str, params: &str) -> Result<VsagIndex> {
+        let index_type_c = to_c_string(index_type);
+        let parameters_c = to_c_string(params);
+
+        let mut index_ptr: *const c_void = std::ptr::null();
+        let err = unsafe {
+            create_index(
+                index_type_c.as_ptr(),
+                parameters_c.as_ptr(),
+                &mut index_ptr,
+            )
+        };
+
+        if !err.is_null() {
+            return Err(from_c_error(err));
+        }
+        Ok(VsagIndex { ptr: index_ptr })
+    }
+
+    /// Builds index with all vectors
+    ///
+    /// All vectors are passed as a single slice of f32. If you have `num_vectors` vectors of dimension `dim`,
+    /// you should pass a `vectors` slice of length `num_vectors * dim` and `ids` slice of length `num_vectors`.
+    ///
+    /// Returns IDs of vectors that failed to be added to the index.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidArgument` without calling into the index if `ids` holds fewer than
+    /// `num_vectors` elements or `vectors` holds fewer than `num_vectors * dim` elements.
+    pub fn build(
+        &self,
+        num_vectors: usize,
+        dim: usize,
+        ids: &[i64],
+        vectors: &[f32],
+    ) -> Result<Vec<i64>> {
+        // The C++ side reads `num_vectors` ids and `num_vectors * dim` floats
+        // through raw pointers, so slices that are too short would be read out
+        // of bounds. `checked_mul` also rejects a product that overflows.
+        let vectors_long_enough =
+            matches!(num_vectors.checked_mul(dim), Some(needed) if vectors.len() >= needed);
+        if ids.len() < num_vectors || !vectors_long_enough {
+            return Err(crate::error::Error {
+                error_type: crate::error::ErrorType::InvalidArgument,
+                message: format!(
+                    "expected at least {} ids and {} * {} floats, got {} ids and {} floats",
+                    num_vectors,
+                    num_vectors,
+                    dim,
+                    ids.len(),
+                    vectors.len()
+                ),
+            });
+        }
+
+        let mut failed_ids: *const i64 = std::ptr::null();
+        let mut num_failed: usize = 0;
+        let err = unsafe {
+            build_index(
+                self.ptr,
+                num_vectors,
+                dim,
+                ids.as_ptr(),
+                vectors.as_ptr(),
+                &mut failed_ids,
+                &mut num_failed,
+            )
+        };
+
+        if !err.is_null() {
+            return Err(from_c_error(err));
+        }
+        Ok(from_c_i64_vector(failed_ids, num_failed))
+    }
+
+    /// Searches for the `k` nearest neighbors of the `query_vector`.
+    ///
+    /// The length of `query_vector` is passed to the index as the query dimension.
+    ///
+    /// `search_params` is a JSON string that specifies the search parameters.
+    ///
+    /// HNSW.search_params in JSON format:
+    /// - hnsw.ef_search: integer, required
+    /// - hnsw.use_conjugate_graph_search: boolean, optional, default is true
+    ///
+    /// e.g.,
+    /// ```json
+    /// {
+    ///     "hnsw": {
+    ///         "ef_search": 100,
+    ///         "use_conjugate_graph_search": true
+    ///     }
+    /// }
+    /// ```
+    ///
+    /// DiskANN.search_params in JSON format:
+    /// - diskann.ef_search: integer, required
+    /// - diskann.beam_search: integer, required
+    /// - diskann.io_limit: integer, required
+    /// - diskann.use_reorder: boolean, optional, default is false
+    ///
+    /// e.g.,
+    /// ```json
+    /// {
+    ///     "diskann": {
+    ///         "ef_search": 100,
+    ///         "beam_search": 4,
+    ///         "io_limit": 200,
+    ///         "use_reorder": false
+    ///     }
+    /// }
+    /// ```
+    ///
+    /// # Panics
+    ///
+    /// Panics if `search_params` contains an interior NUL byte.
+    pub fn knn_search(
+        &self,
+        query_vector: &[f32],
+        k: usize,
+        search_params: &str,
+    ) -> Result<KnnSearchOutput> {
+        let search_params = to_c_string(search_params);
+
+        let mut ids: *const i64 = std::ptr::null();
+        let mut distances: *const f32 = std::ptr::null();
+        let mut num_results: usize = 0;
+        let err = unsafe {
+            knn_search_index(
+                self.ptr,
+                query_vector.len(),
+                query_vector.as_ptr(),
+                k,
+                search_params.as_ptr(),
+                &mut ids,
+                &mut distances,
+                &mut num_results,
+            )
+        };
+
+        if !err.is_null() {
+            return Err(from_c_error(err));
+        }
+        Ok(KnnSearchOutput {
+            ids: from_c_i64_vector(ids, num_results),
+            distances: from_c_f32_vector(distances, num_results),
+        })
+    }
+
+    /// Dumps the index to the file at `path`.
+    ///
+    /// This consumes the index: it is freed when the call returns. Use
+    /// [`VsagIndex::load`] to get a usable index back from the file.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `path` contains an interior NUL byte.
+    pub fn dump(self, path: &str) -> Result<()> {
+        let path = to_c_string(path);
+
+        let err = unsafe { dump_index(self.ptr, path.as_ptr()) };
+        if !err.is_null() {
+            return Err(from_c_error(err));
+        }
+        Ok(())
+    }
+
+    /// Loads an index from the file at `path`.
+    ///
+    /// `index_type` and `params` should be the same as the ones used to create the index.
+    ///
+    /// # Panics
+    ///
+    /// Panics if any argument contains an interior NUL byte.
+    pub fn load(path: &str, index_type: &str, params: &str) -> Result<VsagIndex> {
+        let path = to_c_string(path);
+        let index_type = to_c_string(index_type);
+        let params = to_c_string(params);
+
+        let mut index_ptr: *const c_void = std::ptr::null();
+        let err = unsafe {
+            ffi::load_index(
+                path.as_ptr(),
+                index_type.as_ptr(),
+                params.as_ptr(),
+                &mut index_ptr,
+            )
+        };
+
+        if !err.is_null() {
+            return Err(from_c_error(err));
+        }
+        Ok(VsagIndex { ptr: index_ptr })
+    }
+}
+
+impl Drop for VsagIndex {
+    /// Frees the C++ index object, if any.
+    fn drop(&mut self) {
+        if !self.ptr.is_null() {
+            unsafe {
+                free_index(self.ptr);
+            }
+        }
+    }
+}
+
+/// Output of a k-NN search.
+pub struct KnnSearchOutput {
+    /// IDs of the k-NNs.
+    pub ids: Vec<i64>,
+    /// Distances of the k-NNs.
+    pub distances: Vec<f32>,
+}
+
+#[cfg(test)]
+mod tests {
+    use simsimd::SpatialSimilarity;
+
+    use super::*;
+
+    /// Builds an HNSW index with the L2 metric, searches it, then checks that
+    /// a dump/load round trip returns the same search results.
+    #[test]
+    fn test_create_build_search_index_hnsw_l2() {
+        let index_type = "hnsw";
+        let con_params = r#"{
+            "dtype": "float32",
+            "metric_type": "l2",
+            "dim": 128,
+            "hnsw": {
+                "max_degree": 16,
+                "ef_construction": 100
+            }
+        }"#;
+        let search_params = r#"{
+            "hnsw": {
+                "ef_search": 100
+            }
+        }"#;
+
+        let index = VsagIndex::new(index_type, con_params).unwrap();
+
+        let num_vectors: usize = 1000;
+        let dim: usize = 128;
+
+        let ids: Vec<i64> = (0..num_vectors as i64).collect();
+        let vectors = (0..num_vectors)
+            .map(|_| {
+                (0..dim)
+                    .map(|_| rand::random::<f32>())
+                    .collect::<Vec<_>>()
+            })
+            .collect::<Vec<_>>();
+        let vectors_for_index: Vec<f32> = vectors.iter().flat_map(|v| v.iter().copied()).collect();
+
+        let failed_ids = index
+            .build(num_vectors, dim, &ids, &vectors_for_index)
+            .unwrap();
+        assert_eq!(failed_ids.len(), 0);
+
+        let query_vector: Vec<f32> = (0..dim).map(|_| rand::random()).collect();
+        let k = 10;
+        let output = index.knn_search(&query_vector, k, search_params).unwrap();
+        assert_eq!(output.ids.len(), k.min(num_vectors));
+        assert_eq!(output.distances.len(), k.min(num_vectors));
+
+        // Brute-force nearest neighbors of the query.
+        // NOTE(review): this ground truth is computed but never compared with
+        // `output`; decide whether to assert on it or remove it.
+        let mut distances = vectors
+            .iter()
+            .zip(ids.iter())
+            .map(|(v, id)| {
+                let d: f32 = f32::l2sq(&query_vector, &v).unwrap() as _;
+                (d, *id)
+            })
+            .collect::<Vec<_>>();
+        distances.sort_by(|(a, _), (b, _)| a.total_cmp(b));
+        distances.truncate(k.min(num_vectors));
+
+        // dump
+        let dir = tempdir::TempDir::new("test_create_build_search_index_l2_").unwrap();
+        let path = dir.path().join("index");
+        index.dump(path.to_str().unwrap()).unwrap();
+
+        // load
+        let index = VsagIndex::load(path.to_str().unwrap(), index_type, con_params).unwrap();
+        let output2 = index.knn_search(&query_vector, k, search_params).unwrap();
+        assert_eq!(output.ids, output2.ids);
+        assert_eq!(output.distances, output2.distances);
+    }
+
+    /// Same scenario as the L2 test, with the cosine metric.
+    #[test]
+    fn test_create_build_search_index_cos() {
+        let index_type = "hnsw";
+        let con_params = r#"{
+            "dtype": "float32",
+            "metric_type": "cosine",
+            "dim": 128,
+            "hnsw": {
+                "max_degree": 16,
+                "ef_construction": 100
+            }
+        }"#;
+        let search_params = r#"{
+            "hnsw": {
+                "ef_search": 100
+            }
+        }"#;
+        let index = VsagIndex::new(index_type, con_params).unwrap();
+
+        let num_vectors: usize = 1000;
+        let dim: usize = 128;
+
+        let ids: Vec<i64> = (0..num_vectors as i64).collect();
+        let vectors = (0..num_vectors)
+            .map(|_| {
+                (0..dim)
+                    .map(|_| rand::random::<f32>())
+                    .collect::<Vec<_>>()
+            })
+            .collect::<Vec<_>>();
+        let vectors_for_index: Vec<f32> = vectors.iter().flat_map(|v| v.iter().copied()).collect();
+
+        let failed_ids = index
+            .build(num_vectors, dim, &ids, &vectors_for_index)
+            .unwrap();
+        assert_eq!(failed_ids.len(), 0);
+
+        let query_vector: Vec<f32> = (0..dim).map(|_| rand::random()).collect();
+        let k = 10;
+
+        let output = index.knn_search(&query_vector, k, search_params).unwrap();
+        assert_eq!(output.ids.len(), k.min(num_vectors));
+        assert_eq!(output.distances.len(), k.min(num_vectors));
+
+        // Brute-force nearest neighbors of the query.
+        // NOTE(review): this ground truth is computed but never compared with
+        // `output`; decide whether to assert on it or remove it.
+        let mut distances = vectors
+            .iter()
+            .zip(ids.iter())
+            .map(|(v, id)| {
+                let d: f32 = <f32 as SpatialSimilarity>::cos(&query_vector, &v).unwrap() as _;
+                (d, *id)
+            })
+            .collect::<Vec<_>>();
+        distances.sort_by(|(a, _), (b, _)| a.total_cmp(b));
+        distances.truncate(k.min(num_vectors));
+
+        // dump
+        let dir = tempdir::TempDir::new("test_create_build_search_index_cos").unwrap();
+        let path = dir.path().join("index");
+        index.dump(path.to_str().unwrap()).unwrap();
+
+        // load
+        let index = VsagIndex::load(path.to_str().unwrap(), index_type, con_params).unwrap();
+        let output2 = index.knn_search(&query_vector, k, search_params).unwrap();
+        assert_eq!(output.ids, output2.ids);
+        assert_eq!(output.distances, output2.distances);
+    }
+}
diff --git a/src/wrapper.cpp b/src/wrapper.cpp
new file mode 100644
index 0000000..58409cd
--- /dev/null
+++ b/src/wrapper.cpp
@@ -0,0 +1,287 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you
may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "wrapper.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "vsag/index.h"
+#include "vsag/factory.h"
+
+// Writes the raw bytes of a trivially copyable value to `out`.
+template <typename T>
+static void
+writeBinaryPOD(std::ostream& out, const T& podRef) {
+    out.write(reinterpret_cast<const char*>(&podRef), sizeof(T));
+}
+
+// Reads the raw bytes of a trivially copyable value from `in`.
+template <typename T>
+static void
+readBinaryPOD(std::istream& in, T& podRef) {
+    in.read(reinterpret_cast<char*>(&podRef), sizeof(T));
+}
+
+// Allocates a CError whose message is always a strdup() copy.
+// free_error() passes the message to free(), so it must never point at a
+// string literal or any other memory that did not come from malloc().
+static const CError*
+make_error(vsag::ErrorType type, const std::string& message) {
+    return new CError{static_cast<int>(type), strdup(message.c_str())};
+}
+
+// Converts a vsag error object (anything exposing `.type` and `.message`).
+template <typename E>
+static const CError*
+make_error(const E& err) {
+    return make_error(err.type, err.message);
+}
+
+// Error returned when a required pointer argument is null.
+static const CError*
+null_argument_error() {
+    return make_error(vsag::ErrorType::INVALID_ARGUMENT, "Invalid null argument.");
+}
+
+// Error returned when a dumped index file fails validation.
+static const CError*
+invalid_file_error(const char* path, const char* reason) {
+    return make_error(vsag::ErrorType::INVALID_BINARY,
+                      std::string("Invalid index file '") + path + "': " + reason);
+}
+
+extern "C" {
+
+// Creates an index; on success stores a heap-allocated
+// std::shared_ptr<vsag::Index> in *out_index_ptr (released by free_index()).
+const CError* create_index(const char* in_index_type, const char* in_parameters, void** out_index_ptr) {
+    if (!in_index_type || !in_parameters || !out_index_ptr) {
+        return null_argument_error();
+    }
+
+    auto result = vsag::Factory::CreateIndex(in_index_type, in_parameters);
+    if (!result.has_value()) {
+        return make_error(result.error());
+    }
+
+    auto pIndex = new std::shared_ptr<vsag::Index>(result.value());
+    *out_index_ptr = static_cast<void*>(pIndex);
+
+    return nullptr;  // Success: Return NULL
+}
+
+// Builds the index from caller-owned ids/vectors; on success returns the
+// failed IDs in a new[]-allocated array (released by free_i64_vector()).
+const CError* build_index(
+    void* in_index_ptr,
+    size_t in_num_vectors,
+    size_t in_dim,
+    const int64_t* in_ids,
+    const float* in_vectors,
+
+    const int64_t** out_failed_ids,
+    size_t* out_num_failed
+) {
+    if (!in_index_ptr || !in_ids || !in_vectors || !out_failed_ids || !out_num_failed) {
+        return null_argument_error();
+    }
+
+    // Cast the void pointer back to the original pointer type, std::shared_ptr<vsag::Index>*
+    auto pIndex = static_cast<std::shared_ptr<vsag::Index>*>(in_index_ptr);
+
+    // Owner(false): the dataset must not free the caller's buffers.
+    auto base = vsag::Dataset::Make();
+    base->NumElements(in_num_vectors)->Dim(in_dim)->Ids(in_ids)->Float32Vectors(in_vectors)->Owner(false);
+    auto result = (*pIndex)->Build(base);
+    if (!result.has_value()) {
+        return make_error(result.error());
+    }
+
+    // Copy the failed IDs to the output array
+    const auto& failed_ids = result.value();
+    auto failed_ids_array = new int64_t[failed_ids.size()];
+    std::copy(failed_ids.begin(), failed_ids.end(), failed_ids_array);
+    *out_failed_ids = failed_ids_array;
+    *out_num_failed = static_cast<size_t>(failed_ids.size());
+
+    return nullptr;  // Success: Return NULL
+}
+
+// Runs a k-NN search for a single query vector; on success returns the ids
+// and distances in new[]-allocated arrays of *out_num_results elements.
+const CError* knn_search_index(
+    void* in_index_ptr,
+    size_t in_dim,
+    const float* in_query_vector,
+    size_t in_k,
+    const char* in_search_parameters,
+
+    const int64_t** out_ids,
+    const float** out_distances,
+    size_t* out_num_results
+) {
+    if (!in_index_ptr || !in_query_vector || !in_search_parameters || !out_ids || !out_distances || !out_num_results) {
+        return null_argument_error();
+    }
+
+    // Cast the void pointer back to the original pointer type, std::shared_ptr<vsag::Index>*
+    auto pIndex = static_cast<std::shared_ptr<vsag::Index>*>(in_index_ptr);
+
+    auto query = vsag::Dataset::Make();
+    query->NumElements(1)->Dim(in_dim)->Float32Vectors(in_query_vector)->Owner(false);
+    auto result = (*pIndex)->KnnSearch(query, in_k, in_search_parameters);
+    if (!result.has_value()) {
+        return make_error(result.error());
+    }
+
+    auto dataset = result.value();
+    // NOTE(review): the number of hits is taken from GetDim() of the result
+    // dataset; presumably that is how vsag reports the result count for a
+    // single query. Confirm against the vsag Dataset documentation.
+    auto num = dataset->GetDim();
+    *out_num_results = num;
+
+    auto ids_array = new int64_t[num];
+    auto ids = dataset->GetIds();
+    std::copy(ids, ids + num, ids_array);
+    auto distances_array = new float[num];
+    auto distances = dataset->GetDistances();
+    std::copy(distances, distances + num, distances_array);
+
+    *out_ids = ids_array;
+    *out_distances = distances_array;
+
+    return nullptr;  // Success: Return NULL
+}
+
+// Serializes the index into one file with the layout
+//   [len][data...] ...                  one record per binary
+//   [key_len][key...][offset] ...       footer: one entry per binary
+//   [num_keys][footer_offset]           trailer, last 16 bytes of the file
+// All integers are 8 bytes wide and written in host byte order.
+const CError* dump_index(void* in_index_ptr, const char* in_file_path) {
+    if (!in_index_ptr || !in_file_path) {
+        return null_argument_error();
+    }
+
+    // Cast the void pointer back to the original pointer type, std::shared_ptr<vsag::Index>*
+    auto pIndex = static_cast<std::shared_ptr<vsag::Index>*>(in_index_ptr);
+
+    auto bs = (*pIndex)->Serialize();
+    if (!bs.has_value()) {
+        return make_error(bs.error());
+    }
+
+    std::ofstream file(in_file_path, std::ios::binary);
+    if (!file.is_open()) {
+        return make_error(vsag::ErrorType::UNKNOWN_ERROR,
+                          std::string("Failed to open file for writing: ") + in_file_path);
+    }
+
+    const auto keys = bs->GetKeys();
+    std::vector<uint64_t> offsets;
+    offsets.reserve(keys.size());
+
+    uint64_t offset = 0;
+    for (const auto& key : keys) {
+        // [len][data...][len][data...]...
+        vsag::Binary b = bs->Get(key);
+        // Fixed 8-byte width: load_index() skips sizeof(uint64_t) per record.
+        const uint64_t len = b.size;
+        writeBinaryPOD(file, len);
+        file.write(reinterpret_cast<const char*>(b.data.get()), b.size);
+        offsets.push_back(offset);
+        offset += sizeof(len) + len;
+    }
+    // footer
+    for (size_t i = 0; i < keys.size(); ++i) {
+        // [len][key...][offset][len][key...][offset]...
+        const auto& key = keys[i];
+        const int64_t len = static_cast<int64_t>(key.length());
+        writeBinaryPOD(file, len);
+        file.write(key.data(), len);
+        writeBinaryPOD(file, offsets[i]);
+    }
+    // [num_keys][footer_offset]$
+    writeBinaryPOD(file, static_cast<uint64_t>(keys.size()));
+    writeBinaryPOD(file, offset);
+    file.close();
+
+    // A failed write (e.g. disk full) would otherwise be reported as success.
+    if (file.fail()) {
+        return make_error(vsag::ErrorType::UNKNOWN_ERROR,
+                          std::string("Failed to write index file: ") + in_file_path);
+    }
+
+    return nullptr;  // Success: Return NULL
+}
+
+// Creates an index and deserializes into it a file written by dump_index().
+// The file content is validated before use: sizes and offsets read from it
+// are untrusted.
+const CError* load_index(
+    const char* in_file_path,
+    const char* in_index_type,
+    const char* in_parameters,
+
+    void** out_index_ptr
+) {
+    if (!in_file_path || !in_index_type || !in_parameters || !out_index_ptr) {
+        return null_argument_error();
+    }
+
+    // Binary mode: the file holds raw bytes, not text.
+    std::ifstream file(in_file_path, std::ios::in | std::ios::binary);
+    if (!file.is_open()) {
+        return make_error(vsag::ErrorType::READ_ERROR,
+                          std::string("Failed to open index file: ") + in_file_path);
+    }
+
+    // Trailer: [num_keys][footer_offset]
+    const int64_t kTrailerSize = 2 * sizeof(uint64_t);
+    // Smallest possible footer entry: [key_len][offset] with an empty key.
+    const uint64_t kMinFooterEntrySize = sizeof(int64_t) + sizeof(uint64_t);
+
+    file.seekg(0, std::ios::end);
+    const int64_t file_size = static_cast<int64_t>(file.tellg());
+    if (file_size < kTrailerSize) {
+        return invalid_file_error(in_file_path, "file is too small");
+    }
+    const uint64_t footer_end = static_cast<uint64_t>(file_size - kTrailerSize);
+
+    file.seekg(static_cast<std::streamoff>(footer_end), std::ios::beg);
+    uint64_t num_keys = 0;
+    uint64_t footer_offset = 0;
+    readBinaryPOD(file, num_keys);
+    readBinaryPOD(file, footer_offset);
+    if (!file || footer_offset > footer_end ||
+        num_keys > (footer_end - footer_offset) / kMinFooterEntrySize) {
+        return invalid_file_error(in_file_path, "corrupted trailer");
+    }
+
+    file.seekg(static_cast<std::streamoff>(footer_offset), std::ios::beg);
+
+    std::vector<std::string> keys;
+    std::vector<uint64_t> offsets;
+    keys.reserve(num_keys);
+    offsets.reserve(num_keys);
+    for (uint64_t i = 0; i < num_keys; ++i) {
+        int64_t key_len = 0;
+        readBinaryPOD(file, key_len);
+        // A key cannot be longer than the footer that contains it.
+        if (!file || key_len < 0 || static_cast<uint64_t>(key_len) > footer_end - footer_offset) {
+            return invalid_file_error(in_file_path, "corrupted key length");
+        }
+        std::string key(static_cast<size_t>(key_len), '\0');
+        if (key_len > 0) {
+            file.read(&key[0], key_len);
+        }
+
+        uint64_t offset = 0;
+        readBinaryPOD(file, offset);
+        if (!file) {
+            return invalid_file_error(in_file_path, "truncated footer");
+        }
+        keys.push_back(std::move(key));
+        offsets.push_back(offset);
+    }
+
+    vsag::ReaderSet rs;
+    for (uint64_t i = 0; i < num_keys; ++i) {
+        // Each record is [len][data...]; its data ends where the next record
+        // (or, for the last one, the footer) begins.
+        const uint64_t record_end = (i + 1 == num_keys) ? footer_offset : offsets[i + 1];
+        if (record_end > footer_offset || offsets[i] > record_end ||
+            record_end - offsets[i] < sizeof(uint64_t)) {
+            return invalid_file_error(in_file_path, "corrupted record offset");
+        }
+        const uint64_t data_begin = offsets[i] + sizeof(uint64_t);
+        auto file_reader = vsag::Factory::CreateLocalFileReader(
+            in_file_path, static_cast<int64_t>(data_begin), static_cast<int64_t>(record_end - data_begin));
+        rs.Set(keys[i], file_reader);
+    }
+
+    auto created = vsag::Factory::CreateIndex(in_index_type, in_parameters);
+    if (!created.has_value()) {
+        return make_error(created.error());
+    }
+    std::shared_ptr<vsag::Index> index = created.value();
+
+    auto res = index->Deserialize(rs);
+    if (!res.has_value()) {
+        return make_error(res.error());
+    }
+
+    auto pIndex = new std::shared_ptr<vsag::Index>(index);
+    *out_index_ptr = static_cast<void*>(pIndex);
+
+    return nullptr;  // Success: Return NULL
+}
+
+// Releases an error returned by any of the functions above.
+void free_error(const CError* error) {
+    if (error) {
+        free(const_cast<char*>(error->message));  // The message is always a strdup() copy
+        delete error;  // Deallocate the error struct
+    }
+}
+
+// Releases an index handle created by create_index() or load_index().
+void free_index(void* index_ptr) {
+    if (index_ptr) {
+        // Cast the void pointer back to the original pointer type, std::shared_ptr<vsag::Index>*
+        std::shared_ptr<vsag::Index>* pIndex = static_cast<std::shared_ptr<vsag::Index>*>(index_ptr);
+
+        // Delete the std::shared_ptr which was dynamically allocated
+        delete pIndex;
+
+        // Note: Deleting the std::shared_ptr will automatically handle
+        // the decrement of the reference count and will delete the managed Index object
+        // if the reference count goes to zero.
+    }
+}
+
+// Releases an array returned through an `out_*` int64_t parameter.
+void free_i64_vector(int64_t* vector) {
+    if (vector) {
+        delete[] vector;
+    }
+}
+
+// Releases an array returned through an `out_*` float parameter.
+void free_f32_vector(float* vector) {
+    if (vector) {
+        delete[] vector;
+    }
+}
+
+}  // extern "C"
+
+