Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into nuveq
Browse files Browse the repository at this point in the history
  • Loading branch information
marianotepper committed Jan 2, 2025
2 parents 3115b42 + 431538e commit 45b8241
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 42 deletions.
10 changes: 7 additions & 3 deletions UPGRADING.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
# Upgrading from 3.0.x to 3.1.x
# Upgrading from 3.0.x to 3.0.6

## Critical API changes
## API changes

- `VectorCompressor.encodeAll()` now returns a `CompressedVectors` object instead of a `ByteSequence<?>[]`.
This provides better encapsulation of the compression functionality while also allowing for more efficient
creation of the `CompressedVectors` object.
- The `ByteSequence` interface now includes an `offset()` method to provide offset information for the sequence.
Any time the method `ByteSequence::get` is called, the full backing data is returned; as such, the `offset()`
method is necessary to determine the offset of the data in the backing array.
- `PQVectors` constructor has been updated to support immutable instances and explicit chunking parameters.
- `PQVectors` has been split into `MutablePQVectors` and `ImmutablePQVectors`. Generally you will use `new MutablePQVectors()`
directly, while `ImmutablePQVectors` will usually be accessed via `PQVectors.load` (whose signature has not changed).
These changes allow PQVectors to represent compressed vectors more efficiently under the hood.
- `BQVectors` has similarly been split into mutable and immutable implementations.
- The `VectorCompressor.createCompressedVectors(Object[])` method is now deprecated in favor of the new API that returns
`CompressedVectors` directly from `encodeAll()`.
- `PQVectors::getProductQuantization` is removed; it duplicated `CompressedVectors::getCompressor` unnecessarily.

## New features
- Support for Non-uniform Vector Quantization (NVQ, pronounced as "new vec"). This new technique quantizes the values
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,29 +31,24 @@
public abstract class BQVectors implements CompressedVectors {
protected final BinaryQuantization bq;
protected long[][] compressedVectors;
protected int vectorCount;

/**
 * Create a BQVectors container backed by the given quantizer.
 *
 * @param bq the BinaryQuantization used to encode/decode the vectors held by this instance
 */
protected BQVectors(BinaryQuantization bq) {
this.bq = bq;
}

@Override
public int count() {
return vectorCount;
}

@Override
public void write(DataOutput out, int version) throws IOException {
// BQ centering data
bq.write(out, version);

// compressed vectors
out.writeInt(compressedVectors.length);
if (compressedVectors.length <= 0) {
out.writeInt(count());
if (count() <= 0) {
return;
}
out.writeInt(compressedVectors[0].length);
for (var v : compressedVectors) {
for (int i = 0; i < count(); i++) {
var v = compressedVectors[i];
for (long l : v) {
out.writeLong(l);
}
Expand Down Expand Up @@ -134,7 +129,7 @@ public BinaryQuantization getCompressor() {

@Override
public long ramBytesUsed() {
    // Guard against an empty instance: compressedVectors[0] would be null (mutable
    // instances pre-allocate null slots) or out of bounds, and sizeOf would throw.
    if (count() == 0) {
        return 0;
    }
    // All compressed vectors have the same length, so size one representative and multiply.
    // Use count() rather than the backing array length: a mutable instance may be
    // over-allocated, and only the first count() slots are populated.
    return count() * RamUsageEstimator.sizeOf(compressedVectors[0]);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,9 @@ public ImmutableBQVectors(BinaryQuantization bq, long[][] compressedVectors) {
super(bq);
this.compressedVectors = compressedVectors;
}

@Override
public int count() {
    // An immutable instance is fully populated at construction time,
    // so the stored vector count is exactly the backing array's length.
    return this.compressedVectors.length;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,47 @@
package io.github.jbellis.jvector.quantization;

public class MutableBQVectors extends BQVectors implements MutableCompressedVectors<long[]> {
private static final int INITIAL_CAPACITY = 1024;
private static final float GROWTH_FACTOR = 1.5f;

protected int vectorCount;

/**
* Construct a mutable BQVectors instance with the given BinaryQuantization and maximum number of vectors
* that will be stored in this instance.
* Construct a mutable BQVectors instance with the given BinaryQuantization.
* The vectors storage will grow dynamically as needed.
* @param bq the BinaryQuantization to use
* @param maximumVectorCount the maximum number of vectors that will be stored in this instance
*/
public MutableBQVectors(BinaryQuantization bq, int maximumVectorCount) {
public MutableBQVectors(BinaryQuantization bq) {
super(bq);
this.compressedVectors = new long[maximumVectorCount][];
this.compressedVectors = new long[INITIAL_CAPACITY][];
this.vectorCount = 0;
}

private void ensureCapacity(int ordinal) {
if (ordinal >= compressedVectors.length) {
int newCapacity = Math.max(ordinal + 1, (int)(compressedVectors.length * GROWTH_FACTOR));
long[][] newVectors = new long[newCapacity][];
System.arraycopy(compressedVectors, 0, newVectors, 0, compressedVectors.length);
compressedVectors = newVectors;
}
}

@Override
public void encodeAndSet(int ordinal, long[] vector) {
ensureCapacity(ordinal);
compressedVectors[ordinal] = vector;
vectorCount = Math.max(vectorCount, ordinal + 1);
}

@Override
public void setZero(int ordinal) {
ensureCapacity(ordinal);
compressedVectors[ordinal] = new long[bq.compressedVectorSize()];
vectorCount = Math.max(vectorCount, ordinal + 1);
}

@Override
public int count() {
return vectorCount;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -106,14 +106,13 @@ public static ProductQuantization compute(RandomAccessVectorValues ravv, int M,
* the number of physical cores.
* @param parallelExecutor ForkJoinPool instance for parallel stream operations
*/
public static ProductQuantization compute(
RandomAccessVectorValues ravv,
int M,
int clusterCount,
boolean globallyCenter,
float anisotropicThreshold,
ForkJoinPool simdExecutor,
ForkJoinPool parallelExecutor)
public static ProductQuantization compute(RandomAccessVectorValues ravv,
int M,
int clusterCount,
boolean globallyCenter,
float anisotropicThreshold,
ForkJoinPool simdExecutor,
ForkJoinPool parallelExecutor)
{
var subvectorSizesAndOffsets = getSubvectorSizesAndOffsets(ravv.dimension(), M);
var vectors = extractTrainingVectors(ravv, parallelExecutor);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public DataSet load() throws IOException {
var baseVectors = SiftLoader.readFvecs("fvec/" + basePath);
var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath);
var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath);
return DataSet.getScrubbedDataSet(name, VectorSimilarityFunction.COSINE, baseVectors, queryVectors, gtVectors);
return DataSet.getScrubbedDataSet(name, VectorSimilarityFunction.DOT_PRODUCT, baseVectors, queryVectors, gtVectors);
}

public static Map<String, MultiFileDatasource> byName = new HashMap<>() {{
Expand Down Expand Up @@ -83,24 +83,28 @@ public DataSet load() throws IOException {
"wikipedia_squad/100k/text-embedding-3-small_1536_100000_query_vectors_10000.fvec",
"wikipedia_squad/100k/text-embedding-3-small_1536_100000_indices_query_10000.ivec"));
put("ada002-100k", new MultiFileDatasource("ada002-100k",
"wikipedia_squad/100k/ada_002_100000_base_vectors.fvec",
"wikipedia_squad/100k/ada_002_100000_query_vectors_10000.fvec",
"wikipedia_squad/100k/ada_002_100000_indices_query_10000.ivec"));
"wikipedia_squad/100k/ada_002_100000_base_vectors.fvec",
"wikipedia_squad/100k/ada_002_100000_query_vectors_10000.fvec",
"wikipedia_squad/100k/ada_002_100000_indices_query_10000.ivec"));
put("ada002-1M", new MultiFileDatasource("ada002-1M",
"wikipedia_squad/1M/ada_002_1000000_base_vectors.fvec",
"wikipedia_squad/1M/ada_002_1000000_query_vectors_10000.fvec",
"wikipedia_squad/1M/ada_002_1000000_indices_query_10000.ivec"));
put("e5-small-v2-100k", new MultiFileDatasource("e5-small-v2-100k",
"wikipedia_squad/100k/intfloat_e5-small-v2_100000_base_vectors.fvec",
"wikipedia_squad/100k/intfloat_e5-small-v2_100000_query_vectors_10000.fvec",
"wikipedia_squad/100k/intfloat_e5-small-v2_100000_indices_query_10000.ivec"));
"wikipedia_squad/100k/intfloat_e5-small-v2_100000_base_vectors.fvec",
"wikipedia_squad/100k/intfloat_e5-small-v2_100000_query_vectors_10000.fvec",
"wikipedia_squad/100k/intfloat_e5-small-v2_100000_indices_query_10000.ivec"));
put("e5-base-v2-100k", new MultiFileDatasource("e5-base-v2-100k",
"wikipedia_squad/100k/intfloat_e5-base-v2_100000_base_vectors.fvec",
"wikipedia_squad/100k/intfloat_e5-base-v2_100000_query_vectors_10000.fvec",
"wikipedia_squad/100k/intfloat_e5-base-v2_100000_indices_query_10000.ivec"));
"wikipedia_squad/100k/intfloat_e5-base-v2_100000_base_vectors.fvec",
"wikipedia_squad/100k/intfloat_e5-base-v2_100000_query_vectors_10000.fvec",
"wikipedia_squad/100k/intfloat_e5-base-v2_100000_indices_query_10000.ivec"));
put("e5-large-v2-100k", new MultiFileDatasource("e5-large-v2-100k",
"wikipedia_squad/100k/intfloat_e5-large-v2_100000_base_vectors.fvec",
"wikipedia_squad/100k/intfloat_e5-large-v2_100000_query_vectors_10000.fvec",
"wikipedia_squad/100k/intfloat_e5-large-v2_100000_indices_query_10000.ivec"));
"wikipedia_squad/100k/intfloat_e5-large-v2_100000_base_vectors.fvec",
"wikipedia_squad/100k/intfloat_e5-large-v2_100000_query_vectors_10000.fvec",
"wikipedia_squad/100k/intfloat_e5-large-v2_100000_indices_query_10000.ivec"));
put("gecko-100k", new MultiFileDatasource("gecko-100k",
"wikipedia_squad/100k/textembedding-gecko_100000_base_vectors.fvec",
"wikipedia_squad/100k/textembedding-gecko_100000_query_vectors_10000.fvec",
"wikipedia_squad/100k/textembedding-gecko_100000_indices_query_10000.ivec"));
"wikipedia_squad/100k/textembedding-gecko_100000_base_vectors.fvec",
"wikipedia_squad/100k/textembedding-gecko_100000_query_vectors_10000.fvec",
"wikipedia_squad/100k/textembedding-gecko_100000_indices_query_10000.ivec"));
}};
}

0 comments on commit 45b8241

Please sign in to comment.