Skip to content

Commit

Permalink
use codegen-units=1 for cli, update cluster read vec to iterator
Browse files Browse the repository at this point in the history
Signed-off-by: Keming <[email protected]>
  • Loading branch information
kemingy committed Oct 19, 2024
1 parent a2b3fec commit cdc024e
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 29 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,5 @@ panic = "abort"
inherits = "release"
debug = true
lto = false
codegen-units = 16
codegen-units = 1 # this is important when `cli` is in another crate
panic = "unwind"
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@
- [x] disk-based RaBitQ
- [x] HTTP service
- [ ] insert & update & delete
- [ ] cosine similarity distance
42 changes: 14 additions & 28 deletions scripts/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,6 @@
from tqdm import tqdm


def default_filter(i, vec):
    """Accept every candidate: a picker that unconditionally returns ``True``.

    Both arguments are ignored; they exist only so the function matches the
    two-argument picker call signature.
    """
    return True


def reservoir_sampling(iterator, k: int):
"""Reservoir sampling from an iterator."""
res = []
Expand All @@ -23,6 +19,10 @@ def reservoir_sampling(iterator, k: int):
return res


def default_filter(i, vec):
    """Accept every candidate: a picker that unconditionally returns ``True``.

    Both arguments are ignored; they exist only so the function matches the
    two-argument picker call signature.
    """
    return True


def read_vec_yield(
filepath: str, vec_type: np.dtype = np.float32, picker=default_filter
):
Expand All @@ -45,28 +45,6 @@ def read_vec_yield(
i += 1


def read_vec(filepath: str, vec_type: np.dtype = np.float32):
    """Read vectors from a file. Support `fvecs`, `ivecs` and `bvecs` format.

    Each record is a 4-byte little-endian signed integer ``dim`` followed by
    ``dim`` items of ``vec_type``. Reading is best-effort: when a record is
    truncated or its header is invalid, a message is printed and the vectors
    read so far are returned.

    Args:
        filepath: The path of the file.
        vec_type: The type of the vectors.

    Returns:
        A NumPy array built from the list of decoded vectors.
    """
    size = np.dtype(vec_type).itemsize
    vecs = []
    with open(filepath, "rb") as f:
        while True:
            header = f.read(4)
            if not header:
                # Clean end of file.
                break
            if len(header) < 4:
                print(f"truncated record header in {filepath}, stop reading")
                break
            dim = unpack("<i", header)[0]
            if dim < 0:
                # A negative size would make ``f.read`` consume the whole
                # remainder of the file instead of a single record.
                print(f"invalid dimension {dim} in {filepath}, stop reading")
                break
            expected = dim * size
            payload = f.read(expected)
            if len(payload) != expected:
                # Drop the incomplete trailing record so every row keeps the
                # length its header announced.
                print(f"truncated vector data in {filepath}, stop reading")
                break
            vecs.append(np.frombuffer(payload, dtype=vec_type))
    return np.array(vecs)


def write_vec(filepath: str, vecs: np.ndarray, vec_type: np.dtype = np.float32):
"""Write vectors to a file. Support `fvecs`, `ivecs` and `bvecs` format."""
with open(filepath, "wb") as f:
Expand All @@ -75,8 +53,16 @@ def write_vec(filepath: str, vecs: np.ndarray, vec_type: np.dtype = np.float32):
f.write(vec.tobytes())


def hierarchical_kmeans(vecs, n_cluster_top, n_cluster_down):
dim = vecs.shape[1]
def inspect_vecs_file_dim(filename: str) -> int:
    """Return the dimension stored in the leading header of a vecs file.

    The first 4 bytes of the file are decoded as a little-endian signed
    32-bit integer.

    Args:
        filename: The path of the file to inspect.

    Returns:
        The decoded dimension.

    Raises:
        struct.error: If the file holds fewer than 4 bytes.
        ValueError: If the decoded dimension is not positive.
    """
    with open(filename, "rb") as f:
        header = f.read(4)
    (dim,) = unpack("<i", header)
    if dim <= 0:
        # Fail here with a clear message rather than handing a bogus size to
        # the caller.
        raise ValueError(f"invalid vector dimension {dim} in {filename!r}")
    return dim


def hierarchical_kmeans(filename: str, n_cluster_top: int, n_cluster_down: int):
dim = inspect_vecs_file_dim(filename)
vecs = np.fromiter(read_vec_yield(filename), dtype=np.dtype((float, dim)))
top = Kmeans(dim, n_cluster_top)
top.train(vecs)
_, labels = top.assign(vecs)
Expand Down

0 comments on commit cdc024e

Please sign in to comment.