Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deletes #117

Merged
merged 42 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
4545399
rename
jbellis Oct 6, 2023
596d133
new node value won't change, pull it out of loop
jbellis Oct 6, 2023
28db65e
TODO
jbellis Oct 6, 2023
430f057
wip
jbellis Oct 6, 2023
4c81c1e
Add markNodeDeleted
jbellis Oct 6, 2023
3b22f17
remove View.getSortedNodes
jbellis Oct 6, 2023
73525a1
wip
jbellis Oct 6, 2023
a7e258e
merge
jbellis Oct 6, 2023
9ec01ff
merge and get it building
jbellis Oct 7, 2023
b0f21d3
formatting
jbellis Oct 8, 2023
38bd370
replace validateGraph with assertGraphEquals
jbellis Oct 8, 2023
118c5b1
clean out vestigial document cruft from mock vectorvalues
jbellis Oct 8, 2023
4600ccf
format
jbellis Oct 8, 2023
18b99db
r/m numVectors field (always equal to array length)
jbellis Oct 8, 2023
f37bd49
createRandom[]Vectors no longer leaves null entries that need to be c…
jbellis Oct 8, 2023
b7db4a3
formatting
jbellis Oct 8, 2023
2cec2e2
first test for deletions
jbellis Oct 8, 2023
3abdfa9
wiring in the purge. almost passes tests
jbellis Oct 8, 2023
35bf868
fix mergeNeighbors to not add duplicate nodes, and fix test to check …
jbellis Oct 8, 2023
cb6ac31
- fix removeDeletedNeighbors
jbellis Oct 8, 2023
557d9d6
- fix removeDeletedNeighbors
jbellis Oct 8, 2023
291aefe
merge from main
jbellis Oct 11, 2023
9437684
finish implementing renumbering for writes
jbellis Oct 11, 2023
e05cd0c
rename nsize0 -> maxDegree
jbellis Oct 12, 2023
5cf1d74
show input vectors when assert fails
jbellis Oct 12, 2023
feec7f7
re-use buildSequentially
jbellis Oct 12, 2023
cc33203
encapsulate OHGI better
jbellis Oct 12, 2023
31a54e9
instead of renumbering implicitly, let caller provide remapper
jbellis Oct 12, 2023
1788350
add save and load methods for OHGI
jbellis Oct 12, 2023
15003b0
Merge remote-tracking branch 'origin/main' into deletes
jbellis Oct 12, 2023
d01d737
r/m unused CNS.insert method with confusing semantics
jbellis Oct 13, 2023
14ace18
Merge remote-tracking branch 'origin/deletes' into deletes
jbellis Oct 13, 2023
be714fd
fix insertDiverse ignoring current neighbors
jbellis Oct 13, 2023
bdda8a2
ram freed is proportional to nodes removed
jbellis Oct 13, 2023
952fe1a
merge ConcurrentNeighborArray into NeighborArray
jbellis Oct 13, 2023
fea613b
fix node-present check
jbellis Oct 13, 2023
bf989c1
make getSequentialRenumbering public
jbellis Oct 13, 2023
046c799
add failing testRenumberingOnDelete
jbellis Oct 13, 2023
ff77ff5
refactor to take Map instead of Function; sort writes by new ordinal …
jbellis Oct 13, 2023
2e0c63f
fix ci bitching about javadoc
jbellis Oct 13, 2023
a0fa7ac
fix typos
jbellis Oct 19, 2023
7020849
merge
jbellis Oct 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import io.github.jbellis.jvector.graph.GraphIndex;
import io.github.jbellis.jvector.graph.NodesIterator;
import io.github.jbellis.jvector.util.Accountable;
import io.github.jbellis.jvector.util.Bits;

import java.io.IOException;
import java.io.UncheckedIOException;
Expand Down Expand Up @@ -106,13 +107,8 @@ public int entryNode() {
}

@Override
public int[] getSortedNodes() {
return View.super.getSortedNodes();
}

@Override
public int getNeighborCount(int node) {
return View.super.getNeighborCount(node);
public Bits liveNodes() {
return view.liveNodes();
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,17 @@

import io.github.jbellis.jvector.graph.GraphIndex;
import io.github.jbellis.jvector.graph.NodesIterator;
import io.github.jbellis.jvector.graph.OnHeapGraphIndex;
import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
import io.github.jbellis.jvector.util.Accountable;
import io.github.jbellis.jvector.util.Bits;

import java.io.DataOutput;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.HashSet;
import java.util.function.Function;
import java.util.stream.IntStream;

public class OnDiskGraphIndex<T> implements GraphIndex<T>, AutoCloseable, Accountable
{
Expand Down Expand Up @@ -118,6 +123,11 @@ public int entryNode() {
return OnDiskGraphIndex.this.entryNode;
}

@Override
public Bits liveNodes() {
return Bits.ALL;
}

@Override
public void close() throws IOException {
reader.close();
Expand All @@ -127,7 +137,7 @@ public void close() throws IOException {
@Override
public NodesIterator getNodes()
{
throw new UnsupportedOperationException();
return NodesIterator.fromPrimitiveIterator(IntStream.range(0, size).iterator(), size);
}

@Override
Expand All @@ -139,11 +149,29 @@ public void close() throws IOException {
readerSupplier.close();
}

// takes Graph and Vectors separately since I'm reluctant to introduce a Vectors reference
// to OnHeapGraphIndex just for this method. Maybe that will end up the best solution,
// but I'm not sure yet.
public static <T> void write(GraphIndex<T> graph, RandomAccessVectorValues<T> vectors, DataOutput out) throws IOException {
assert graph.size() == vectors.size() : String.format("graph size %d != vectors size %d", graph.size(), vectors.size());
/**
* @param graph the graph to write
* @param vectors the vectors associated with each node
* @param ordinalMapper A function that maps from the graph's ordinals to the ordinals in the output.
* For simple use cases this can be the identity function. To deal with deleted nodes,
* or to renumber the nodes to match rows or documents elsewhere, you may provide
* something custom. The mapper must map from the graph's ordinals
* [0..getMaxNodeId()], to the output's ordinals [0..size()), with no gaps and
* no duplicates.
* @param out the output to write to
*/
public static <T> void write(GraphIndex<T> graph,
RandomAccessVectorValues<T> vectors,
Function<Integer, Integer> ordinalMapper,
DataOutput out)
throws IOException
{
if (graph instanceof OnHeapGraphIndex) {
var ohgi = (OnHeapGraphIndex<T>) graph;
if (ohgi.getDeletedNodes().cardinality() > 0) {
throw new IllegalArgumentException("Run builder.cleanup() before writing the graph");
}
}

var view = graph.getView();

Expand All @@ -154,15 +182,22 @@ public static <T> void write(GraphIndex<T> graph, RandomAccessVectorValues<T> ve
out.writeInt(graph.maxDegree());

// for each graph node, write the associated vector and its neighbors
for (int node = 0; node < graph.size(); node++) {
out.writeInt(node); // unnecessary, but a reasonable sanity check
Io.writeFloats(out, (float[]) vectors.vectorValue(node));
var newOrdinals = new HashSet<Integer>();
for (int originalOrdinal = 0; originalOrdinal <= graph.getMaxNodeId(); originalOrdinal++) {
jbellis marked this conversation as resolved.
Show resolved Hide resolved
if (!graph.containsNode(originalOrdinal)) {
continue;
}

int newOrdinal = ordinalMapper.apply(originalOrdinal);
newOrdinals.add(newOrdinal);
out.writeInt(newOrdinal); // unnecessary, but a reasonable sanity check
Io.writeFloats(out, (float[]) vectors.vectorValue(originalOrdinal));

var neighbors = view.getNeighborsIterator(node);
var neighbors = view.getNeighborsIterator(originalOrdinal);
out.writeInt(neighbors.size());
int n = 0;
for ( ; n < neighbors.size(); n++) {
out.writeInt(neighbors.nextInt());
out.writeInt(ordinalMapper.apply(neighbors.nextInt()));
}
assert !neighbors.hasNext();

Expand All @@ -171,5 +206,16 @@ public static <T> void write(GraphIndex<T> graph, RandomAccessVectorValues<T> ve
out.writeInt(-1);
}
}

// verify that the provided mapper was well-behaved
if (newOrdinals.size() > graph.size()) {
throw new IllegalArgumentException("graph modified during write");
}
if (newOrdinals.size() < graph.size()) {
throw new IllegalArgumentException("ordinalMapper resulted in duplicate entries");
}
if (graph.size() > 0 && newOrdinals.stream().mapToInt(i -> i).max().getAsInt() != graph.size() - 1) {
throw new IllegalArgumentException("ordinalMapper produced out-of-range entries");
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@
package io.github.jbellis.jvector.graph;

import io.github.jbellis.jvector.util.BitSet;
import io.github.jbellis.jvector.util.Bits;
import io.github.jbellis.jvector.util.DocIdSetIterator;
import io.github.jbellis.jvector.util.FixedBitSet;

import java.util.HashSet;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;

Expand Down Expand Up @@ -48,17 +50,25 @@ public class ConcurrentNeighborSet {
/** the proportion of edges that are diverse at alpha=1.0. updated by removeAllNonDiverse */
private float shortEdges = Float.NaN;

public ConcurrentNeighborSet(
int nodeId, int maxConnections, NeighborSimilarity similarity, float alpha) {
public ConcurrentNeighborSet(int nodeId, int maxConnections, NeighborSimilarity similarity) {
this(nodeId, maxConnections, similarity, 1.0f);
}

public ConcurrentNeighborSet(int nodeId, int maxConnections, NeighborSimilarity similarity, float alpha) {
this(nodeId, maxConnections, similarity, alpha, new ConcurrentNeighborArray(maxConnections));
}

ConcurrentNeighborSet(int nodeId,
int maxConnections,
NeighborSimilarity similarity,
float alpha,
ConcurrentNeighborArray neighbors)
{
this.nodeId = nodeId;
this.maxConnections = maxConnections;
this.similarity = similarity;
neighborsRef = new AtomicReference<>(new ConcurrentNeighborArray(maxConnections));
this.alpha = alpha;
}

public ConcurrentNeighborSet(int nodeId, int maxConnections, NeighborSimilarity similarity) {
this(nodeId, maxConnections, similarity, 1.0f);
this.neighborsRef = new AtomicReference<>(neighbors);
}

private ConcurrentNeighborSet(ConcurrentNeighborSet old) {
Expand Down Expand Up @@ -100,6 +110,35 @@ public void cleanup() {
neighborsRef.getAndUpdate(this::removeAllNonDiverse);
}

/**
* @return true if we had deleted neighbors
*/
public boolean removeDeletedNeighbors(Bits deletedNodes) {
AtomicBoolean found = new AtomicBoolean();
jbellis marked this conversation as resolved.
Show resolved Hide resolved
neighborsRef.getAndUpdate(current -> {
// build a set of the entries we want to retain
var toRetain = new FixedBitSet(current.size);
for (int i = 0; i < current.size; i++) {
if (deletedNodes.get(current.node[i])) {
found.set(true);
} else {
toRetain.set(i);
}
}

// if we're retaining everything, no need to make a copy
if (!found.get()) {
return current;
}

// copy and purge the deleted ones
var next = current.copy();
next.retain(toRetain);
return next;
});
return found.get();
}

private static class NeighborIterator extends NodesIterator {
private final NeighborArray neighbors;
private int i;
Expand Down Expand Up @@ -141,18 +180,35 @@ public void insertDiverse(NeighborArray natural, NeighborArray concurrent) {
}

neighborsRef.getAndUpdate(current -> {
// if either natural or concurrent is empty, skip the merge
NeighborArray toMerge;
if (concurrent.size == 0) {
toMerge = natural;
} else if (natural.size == 0) {
toMerge = concurrent;
} else {
toMerge = mergeNeighbors(natural, concurrent);
}

// merge all the candidates into a single array and compute the diverse ones to keep
// from that. we do this first by selecting the ones to keep, and then by copying
// only those into a new NeighborArray. This is less expensive than doing the
// diversity computation in-place, since we are going to do multiple passes and
// pruning back extras is expensive.
var merged = mergeNeighbors(mergeNeighbors(natural, current), concurrent);
var merged = mergeNeighbors(natural, toMerge);
jbellis marked this conversation as resolved.
Show resolved Hide resolved
BitSet selected = selectDiverse(merged);
merged.retain(selected);
return merged;
});
}

void padWithRandom(NeighborArray connections) {
// we deliberately do not perform diversity checks here
// (it will be invoked when the cleanup code calls insertDiverse later
// with the results of the nn descent rebuild)
neighborsRef.getAndUpdate(current -> mergeNeighbors(current, connections));
}

/**
* Copies the selected neighbors from the merged array into a new array.
*/
Expand Down Expand Up @@ -201,7 +257,7 @@ private BitSet selectDiverse(NeighborArray neighbors) {
return selected;
}

public ConcurrentNeighborArray getCurrent() {
ConcurrentNeighborArray getCurrent() {
return neighborsRef.get();
}

Expand Down Expand Up @@ -411,7 +467,7 @@ private boolean duplicateExistsNear(int insertionPoint, int newNode, float newSc
* @param selected A BitSet where the bit at index i is set if the i-th element should be retained.
* (Thus, the elements of selected represent positions in the NeighborArray, NOT node ids.)
*/
public void retain(BitSet selected) {
public void retain(Bits selected) {
int writeIdx = 0; // index for where to write the next retained element

for (int readIdx = 0; readIdx < size; readIdx++) {
Expand All @@ -422,7 +478,6 @@ public void retain(BitSet selected) {
score[writeIdx] = score[readIdx];
}
// else { we haven't created any gaps in the backing arrays yet, so we don't need to move anything }

writeIdx++;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@

package io.github.jbellis.jvector.graph;

import io.github.jbellis.jvector.util.Bits;

import java.io.IOException;
import java.util.Arrays;

/**
* Represents a graph-based vector index. Nodes are represented as ints, and edges are
Expand Down Expand Up @@ -59,6 +60,21 @@ public interface GraphIndex<T> extends AutoCloseable {
*/
int maxDegree();

/**
* @return the maximum node id in the graph. May be different from size() if nodes are
* being added concurrently, or if nodes have been deleted (and cleaned up).
*/
default int getMaxNodeId() {
return size();
}

/**
* @return true iff the graph contains the node with the given ordinal id
*/
default boolean containsNode(int nodeId) {
return nodeId >= 0 && nodeId < size();
}

@Override
void close() throws IOException;

Expand All @@ -69,8 +85,14 @@ interface View<T> extends AutoCloseable {
*/
NodesIterator getNeighborsIterator(int node);

/**
* @return the number of nodes in the graph
*/
int size();

/**
* @return the node of the graph to start searches at
*/
int entryNode();

/**
Expand All @@ -82,16 +104,17 @@ interface View<T> extends AutoCloseable {
*/
T getVector(int node);

// for compatibility with Cassandra's ExtendedHnswGraph. Not sure if we still need it
default int[] getSortedNodes() {
int[] sortedNodes = new int[size()];
Arrays.setAll(sortedNodes, i -> i);
return sortedNodes;
}
/**
* Return a Bits instance indicating which nodes are live. The result is undefined for
* ordinals that do not correspond to nodes in the graph.
*/
Bits liveNodes();

// for compatibility with Cassandra's ExtendedHnswGraph. Not sure if we still want/need it
default int getNeighborCount(int node) {
return getNeighborsIterator(node).size();
/**
* @return the largest ordinal id in the graph. May be different from size() if nodes have been deleted.
*/
default int getMaxNodeId() {
return size();
}
}

Expand Down
Loading