From 8fd4df5717b2181993ff460f09102e1fe7060323 Mon Sep 17 00:00:00 2001 From: Jonathan Ellis Date: Tue, 24 Dec 2024 11:42:42 -0600 Subject: [PATCH 1/4] update UPGRADING --- UPGRADING.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/UPGRADING.md b/UPGRADING.md index 1e61221f..e5f5beb8 100644 --- a/UPGRADING.md +++ b/UPGRADING.md @@ -1,6 +1,6 @@ -# Upgrading from 3.0.x to 3.1.x +# Upgrading from 3.0.x to 3.0.6 -## Critical API changes +## API changes - `VectorCompressor.encodeAll()` now returns a `CompressedVectors` object instead of a `ByteSequence[]`. This provides better encapsulation of the compression functionality while also allowing for more efficient @@ -8,9 +8,13 @@ - The `ByteSequence` interface now includes an `offset()` method to provide offset information for the sequence. any time the method `ByteSequence::get` is called, the full backing data is returned, and as such, the `offset()` method is necessary to determine the offset of the data in the backing array. -- `PQVectors` constructor has been updated to support immutable instances and explicit chunking parameters. +- `PQVectors` has been split into `MutablePQVectors` and `ImmutablePQVectors`. Generally you will use `new MutablePQVectors()` + directly, while `ImmutablePQVectors` will usually be accessed via `PQVectors.load` (whose signature has not changed). + These changes allow PQVectors to represent compressed vectors more efficiently under the hood. +- `BQVectors` has similarly been split into mutable and immutable implementations. - The `VectorCompressor.createCompressedVectors(Object[])` method is now deprecated in favor of the new API that returns `CompressedVectors` directly from `encodeAll()`. 
+- `PQVectors::getProductQuantization` is removed; it duplicated `CompressedVectors::getCompressor` unnecessarily. # Upgrading from 2.0.x to 3.0.x From 06870579c48ed9d3abeef49ba2542337f5286e6c Mon Sep 17 00:00:00 2001 From: Jonathan Ellis Date: Tue, 24 Dec 2024 11:42:49 -0600 Subject: [PATCH 2/4] formatting --- .../jbellis/jvector/pq/ProductQuantization.java | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/pq/ProductQuantization.java b/jvector-base/src/main/java/io/github/jbellis/jvector/pq/ProductQuantization.java index f086f4f5..c56d91f2 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/pq/ProductQuantization.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/pq/ProductQuantization.java @@ -106,14 +106,13 @@ public static ProductQuantization compute(RandomAccessVectorValues ravv, int M, * the number of physical cores. * @param parallelExecutor ForkJoinPool instance for parallel stream operations */ - public static ProductQuantization compute( - RandomAccessVectorValues ravv, - int M, - int clusterCount, - boolean globallyCenter, - float anisotropicThreshold, - ForkJoinPool simdExecutor, - ForkJoinPool parallelExecutor) + public static ProductQuantization compute(RandomAccessVectorValues ravv, + int M, + int clusterCount, + boolean globallyCenter, + float anisotropicThreshold, + ForkJoinPool simdExecutor, + ForkJoinPool parallelExecutor) { var subvectorSizesAndOffsets = getSubvectorSizesAndOffsets(ravv.dimension(), M); var vectors = extractTrainingVectors(ravv, parallelExecutor); From 0a25715c9f9d1e462c4d51569059a56777c23a4f Mon Sep 17 00:00:00 2001 From: Jonathan Ellis Date: Tue, 24 Dec 2024 12:22:59 -0600 Subject: [PATCH 3/4] MutableBQVectors grows incrementally like MutablePQVectors --- .../github/jbellis/jvector/pq/BQVectors.java | 15 ++++------ .../jvector/pq/ImmutableBQVectors.java | 5 ++++ .../jbellis/jvector/pq/MutableBQVectors.java | 30 
+++++++++++++++---- 3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/pq/BQVectors.java b/jvector-base/src/main/java/io/github/jbellis/jvector/pq/BQVectors.java index bf868b2e..a79d8b70 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/pq/BQVectors.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/pq/BQVectors.java @@ -31,29 +31,24 @@ public abstract class BQVectors implements CompressedVectors { protected final BinaryQuantization bq; protected long[][] compressedVectors; - protected int vectorCount; protected BQVectors(BinaryQuantization bq) { this.bq = bq; } - @Override - public int count() { - return vectorCount; - } - @Override public void write(DataOutput out, int version) throws IOException { // BQ centering data bq.write(out, version); // compressed vectors - out.writeInt(compressedVectors.length); - if (compressedVectors.length <= 0) { + out.writeInt(count()); + if (count() <= 0) { return; } out.writeInt(compressedVectors[0].length); - for (var v : compressedVectors) { + for (int i = 0; i < count(); i++) { + var v = compressedVectors[i]; for (long l : v) { out.writeLong(l); } @@ -134,7 +129,7 @@ public BinaryQuantization getCompressor() { @Override public long ramBytesUsed() { - return compressedVectors.length * RamUsageEstimator.sizeOf(compressedVectors[0]); + return count() * RamUsageEstimator.sizeOf(compressedVectors[0]); } @Override diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/pq/ImmutableBQVectors.java b/jvector-base/src/main/java/io/github/jbellis/jvector/pq/ImmutableBQVectors.java index e390cb66..85b2313f 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/pq/ImmutableBQVectors.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/pq/ImmutableBQVectors.java @@ -21,4 +21,9 @@ public ImmutableBQVectors(BinaryQuantization bq, long[][] compressedVectors) { super(bq); this.compressedVectors = compressedVectors; } + 
+ @Override + public int count() { + return compressedVectors.length; + } } diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/pq/MutableBQVectors.java b/jvector-base/src/main/java/io/github/jbellis/jvector/pq/MutableBQVectors.java index 5c6e39ca..a9cf8a47 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/pq/MutableBQVectors.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/pq/MutableBQVectors.java @@ -17,27 +17,47 @@ package io.github.jbellis.jvector.pq; public class MutableBQVectors extends BQVectors implements MutableCompressedVectors { + private static final int INITIAL_CAPACITY = 1024; + private static final float GROWTH_FACTOR = 1.5f; + + protected int vectorCount; + /** - * Construct a mutable BQVectors instance with the given BinaryQuantization and maximum number of vectors - * that will be stored in this instance. + * Construct a mutable BQVectors instance with the given BinaryQuantization. + * The vector storage grows dynamically as needed. 
* @param bq the BinaryQuantization to use - * @param maximumVectorCount the maximum number of vectors that will be stored in this instance */ - public MutableBQVectors(BinaryQuantization bq, int maximumVectorCount) { + public MutableBQVectors(BinaryQuantization bq) { super(bq); - this.compressedVectors = new long[maximumVectorCount][]; + this.compressedVectors = new long[INITIAL_CAPACITY][]; this.vectorCount = 0; } + private void ensureCapacity(int ordinal) { + if (ordinal >= compressedVectors.length) { + int newCapacity = Math.max(ordinal + 1, (int)(compressedVectors.length * GROWTH_FACTOR)); + long[][] newVectors = new long[newCapacity][]; + System.arraycopy(compressedVectors, 0, newVectors, 0, compressedVectors.length); + compressedVectors = newVectors; + } + } + @Override public void encodeAndSet(int ordinal, long[] vector) { + ensureCapacity(ordinal); compressedVectors[ordinal] = vector; vectorCount = Math.max(vectorCount, ordinal + 1); } @Override public void setZero(int ordinal) { + ensureCapacity(ordinal); compressedVectors[ordinal] = new long[bq.compressedVectorSize()]; vectorCount = Math.max(vectorCount, ordinal + 1); } + + @Override + public int count() { + return vectorCount; + } } From 431538e934661267bf96667344a2c6648acffe1d Mon Sep 17 00:00:00 2001 From: Jonathan Ellis Date: Thu, 2 Jan 2025 12:57:38 -0600 Subject: [PATCH 4/4] ada2-1M --- .../example/util/MultiFileDatasource.java | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java index 08845939..22a6e2e5 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java @@ -50,7 +50,7 @@ public DataSet load() throws IOException { var 
baseVectors = SiftLoader.readFvecs("fvec/" + basePath); var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath); var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath); - return DataSet.getScrubbedDataSet(name, VectorSimilarityFunction.COSINE, baseVectors, queryVectors, gtVectors); + return DataSet.getScrubbedDataSet(name, VectorSimilarityFunction.DOT_PRODUCT, baseVectors, queryVectors, gtVectors); } public static Map byName = new HashMap<>() {{ @@ -83,24 +83,28 @@ public DataSet load() throws IOException { "wikipedia_squad/100k/text-embedding-3-small_1536_100000_query_vectors_10000.fvec", "wikipedia_squad/100k/text-embedding-3-small_1536_100000_indices_query_10000.ivec")); put("ada002-100k", new MultiFileDatasource("ada002-100k", - "wikipedia_squad/100k/ada_002_100000_base_vectors.fvec", - "wikipedia_squad/100k/ada_002_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/ada_002_100000_indices_query_10000.ivec")); + "wikipedia_squad/100k/ada_002_100000_base_vectors.fvec", + "wikipedia_squad/100k/ada_002_100000_query_vectors_10000.fvec", + "wikipedia_squad/100k/ada_002_100000_indices_query_10000.ivec")); + put("ada002-1M", new MultiFileDatasource("ada002-1M", + "wikipedia_squad/1M/ada_002_1000000_base_vectors.fvec", + "wikipedia_squad/1M/ada_002_1000000_query_vectors_10000.fvec", + "wikipedia_squad/1M/ada_002_1000000_indices_query_10000.ivec")); put("e5-small-v2-100k", new MultiFileDatasource("e5-small-v2-100k", - "wikipedia_squad/100k/intfloat_e5-small-v2_100000_base_vectors.fvec", - "wikipedia_squad/100k/intfloat_e5-small-v2_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/intfloat_e5-small-v2_100000_indices_query_10000.ivec")); + "wikipedia_squad/100k/intfloat_e5-small-v2_100000_base_vectors.fvec", + "wikipedia_squad/100k/intfloat_e5-small-v2_100000_query_vectors_10000.fvec", + "wikipedia_squad/100k/intfloat_e5-small-v2_100000_indices_query_10000.ivec")); put("e5-base-v2-100k", new MultiFileDatasource("e5-base-v2-100k", - 
"wikipedia_squad/100k/intfloat_e5-base-v2_100000_base_vectors.fvec", - "wikipedia_squad/100k/intfloat_e5-base-v2_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/intfloat_e5-base-v2_100000_indices_query_10000.ivec")); + "wikipedia_squad/100k/intfloat_e5-base-v2_100000_base_vectors.fvec", + "wikipedia_squad/100k/intfloat_e5-base-v2_100000_query_vectors_10000.fvec", + "wikipedia_squad/100k/intfloat_e5-base-v2_100000_indices_query_10000.ivec")); put("e5-large-v2-100k", new MultiFileDatasource("e5-large-v2-100k", - "wikipedia_squad/100k/intfloat_e5-large-v2_100000_base_vectors.fvec", - "wikipedia_squad/100k/intfloat_e5-large-v2_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/intfloat_e5-large-v2_100000_indices_query_10000.ivec")); + "wikipedia_squad/100k/intfloat_e5-large-v2_100000_base_vectors.fvec", + "wikipedia_squad/100k/intfloat_e5-large-v2_100000_query_vectors_10000.fvec", + "wikipedia_squad/100k/intfloat_e5-large-v2_100000_indices_query_10000.ivec")); put("gecko-100k", new MultiFileDatasource("gecko-100k", - "wikipedia_squad/100k/textembedding-gecko_100000_base_vectors.fvec", - "wikipedia_squad/100k/textembedding-gecko_100000_query_vectors_10000.fvec", - "wikipedia_squad/100k/textembedding-gecko_100000_indices_query_10000.ivec")); + "wikipedia_squad/100k/textembedding-gecko_100000_base_vectors.fvec", + "wikipedia_squad/100k/textembedding-gecko_100000_query_vectors_10000.fvec", + "wikipedia_squad/100k/textembedding-gecko_100000_indices_query_10000.ivec")); }}; }