From 4ab39a8b1fbf6818458340f4c96a4efb315ea886 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Thu, 27 Jun 2024 11:55:18 -0400 Subject: [PATCH 1/3] feat: referenceId (#360) --- pkg/pointer/referenced_value.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pkg/pointer/referenced_value.go b/pkg/pointer/referenced_value.go index 6ff83b0..eb7f861 100644 --- a/pkg/pointer/referenced_value.go +++ b/pkg/pointer/referenced_value.go @@ -3,6 +3,7 @@ package pointer import ( "bytes" "fmt" + "github.com/kevmo314/appendable/pkg/hnsw" ) type ReferencedValue struct { @@ -18,10 +19,19 @@ type ReferencedValue struct { Value []byte } +type ReferencedId struct { + DataPointer MemoryPointer + Value hnsw.Id +} + func (rv ReferencedValue) String() string { return fmt.Sprintf("ReferencedValue@%s{%s}", rv.DataPointer, rv.Value) } +func (rv ReferencedId) String() string { + return fmt.Sprintf("ReferencedId@%s{%d}", rv.DataPointer, rv.Value) +} + func CompareReferencedValues(a, b ReferencedValue) int { if cmp := bytes.Compare(a.Value, b.Value); cmp != 0 { return cmp @@ -36,3 +46,13 @@ func CompareReferencedValues(a, b ReferencedValue) int { } return 0 } + +func CompareReferencedId(a, b ReferencedId) int { + if a.Value > b.Value { + return 1 + } else if a.Value < b.Value { + return -1 + } + + return 0 +} From e7763246fe8cefd0a6e104585851cd67f5cfc71c Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Thu, 27 Jun 2024 12:00:39 -0400 Subject: [PATCH 2/3] leaf (#361) --- pkg/btree/node.go | 11 ++++++++--- pkg/pointer/referenced_value.go | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pkg/btree/node.go b/pkg/btree/node.go index c3653dc..0b73018 100644 --- a/pkg/btree/node.go +++ b/pkg/btree/node.go @@ -2,21 +2,26 @@ package btree import ( "github.com/kevmo314/appendable/pkg/hnsw" + "github.com/kevmo314/appendable/pkg/pointer" "io" ) type BTreeNode 
struct { - Ids []hnsw.Id + Ids []pointer.ReferencedId Vectors []hnsw.Point - Pointers []uint64 - Width uint16 + Offsets []uint64 + Width uint16 } func (n *BTreeNode) Size() int64 { return 0 } +func (n *BTreeNode) Leaf() bool { + return len(n.Offsets) == 0 +} + // MarshalBinary TODO! func (n *BTreeNode) MarshalBinary() ([]byte, error) { b := []byte{} diff --git a/pkg/pointer/referenced_value.go b/pkg/pointer/referenced_value.go index eb7f861..16ee262 100644 --- a/pkg/pointer/referenced_value.go +++ b/pkg/pointer/referenced_value.go @@ -47,7 +47,7 @@ func CompareReferencedValues(a, b ReferencedValue) int { return 0 } -func CompareReferencedId(a, b ReferencedId) int { +func CompareReferencedIds(a, b ReferencedId) int { if a.Value > b.Value { return 1 } else if a.Value < b.Value { From 64f1b73ed635ca4f0541cfbb29d8e64be548238b Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Thu, 27 Jun 2024 14:02:58 -0400 Subject: [PATCH 3/3] feat: marshal, unmarshal, size (#362) --- pkg/bptree/node.go | 1 - pkg/btree/btree.go | 4 +- pkg/btree/node.go | 129 ++++++++++++++++++++++++++++++++++++++--- pkg/btree/node_test.go | 88 ++++++++++++++++++++++++++++ 4 files changed, 212 insertions(+), 10 deletions(-) create mode 100644 pkg/btree/node_test.go diff --git a/pkg/bptree/node.go b/pkg/bptree/node.go index db8c0ae..ad167f7 100644 --- a/pkg/bptree/node.go +++ b/pkg/bptree/node.go @@ -41,7 +41,6 @@ func (n *BPTreeNode) NumPointers() int { } func (n *BPTreeNode) Size() int64 { - size := 4 // number of keys for _, k := range n.Keys { o := encoding.SizeVarint(uint64(k.DataPointer.Offset)) diff --git a/pkg/btree/btree.go b/pkg/btree/btree.go index e8c72de..3e63e84 100644 --- a/pkg/btree/btree.go +++ b/pkg/btree/btree.go @@ -11,6 +11,8 @@ type BTree struct { MetaPage metapage.MetaPage PageFile pagefile.ReadWriteSeekPager + VectorDim uint64 + Width uint16 } @@ -33,7 +35,7 @@ func (t *BTree) readNode(offset uint64) (*BTreeNode, error) { return 
nil, err } - node := &BTreeNode{Width: t.Width} + node := &BTreeNode{Width: t.Width, VectorDim: t.VectorDim} buf := make([]byte, t.PageFile.PageSize()) if _, err := t.PageFile.Read(buf); err != nil { diff --git a/pkg/btree/node.go b/pkg/btree/node.go index 0b73018..37b2768 100644 --- a/pkg/btree/node.go +++ b/pkg/btree/node.go @@ -1,36 +1,149 @@ package btree import ( + "encoding/binary" + "fmt" + "github.com/kevmo314/appendable/pkg/encoding" "github.com/kevmo314/appendable/pkg/hnsw" "github.com/kevmo314/appendable/pkg/pointer" "io" + "math" ) type BTreeNode struct { Ids []pointer.ReferencedId Vectors []hnsw.Point - Offsets []uint64 - Width uint16 + Offsets []uint64 + Width uint16 + VectorDim uint64 } func (n *BTreeNode) Size() int64 { - return 0 + size := 4 // fixed 4-byte header holding the signed id count + + for _, k := range n.Ids { + size += encoding.SizeVarint(k.DataPointer.Offset) + size += encoding.SizeVarint(uint64(k.DataPointer.Length)) + size += encoding.SizeVarint(uint64(k.Value)) + } + + for _, offset := range n.Offsets { + size += encoding.SizeVarint(offset) + } + + size += encoding.SizeVarint(n.VectorDim) + size += len(n.Vectors) * (4 * int(n.VectorDim)) + + return int64(size) } func (n *BTreeNode) Leaf() bool { - return len(n.Offsets) == 0 + return n.Offsets == nil || len(n.Offsets) == 0 } -// MarshalBinary TODO! 
func (n *BTreeNode) MarshalBinary() ([]byte, error) { - b := []byte{} + size := int32(len(n.Ids)) + + if size == 0 { + panic("writing empty node, no ids found!") + } + + buf := make([]byte, n.Size()) + + if n.Leaf() { + binary.LittleEndian.PutUint32(buf[:4], uint32(-size)) // a negative id count marks a leaf + } else { + binary.LittleEndian.PutUint32(buf[:4], uint32(size)) + } + + ct := 4 + for _, k := range n.Ids { + on := binary.PutUvarint(buf[ct:], k.DataPointer.Offset) + ln := binary.PutUvarint(buf[ct+on:], uint64(k.DataPointer.Length)) + vn := binary.PutUvarint(buf[ct+on+ln:], uint64(k.Value)) + ct += on + ln + vn + } + + for _, offset := range n.Offsets { + on := binary.PutUvarint(buf[ct:], offset) + ct += on + } + + vdn := binary.PutUvarint(buf[ct:], n.VectorDim) + ct += vdn + + for _, v := range n.Vectors { + for _, elem := range v { + binary.LittleEndian.PutUint32(buf[ct:], math.Float32bits(elem)) + ct += 4 + } + } + + if ct != len(buf) { + panic(fmt.Sprintf("size mismatch. ct: %v, size: %v", ct, len(buf))) + } - return b, nil + return buf, nil } -// UnmarshalBinary TODO! 
func (n *BTreeNode) UnmarshalBinary(buf []byte) error { + size := int32(binary.LittleEndian.Uint32(buf[:4])) + leaf := size < 0 + + if leaf { + n.Ids = make([]pointer.ReferencedId, -size) + n.Vectors = make([]hnsw.Point, -size) + n.Offsets = make([]uint64, 0) + } else { + n.Ids = make([]pointer.ReferencedId, size) + n.Vectors = make([]hnsw.Point, size) + n.Offsets = make([]uint64, size+1) + } + + if size == 0 { + panic("empty node") + } + + m := 4 + for i := range n.Ids { + o, on := binary.Uvarint(buf[m:]) + l, ln := binary.Uvarint(buf[m+on:]) + + n.Ids[i].DataPointer.Offset = o + n.Ids[i].DataPointer.Length = uint32(l) + + m += on + ln + + v, vn := binary.Uvarint(buf[m:]) + n.Ids[i].Value = hnsw.Id(v) + + m += vn + } + + if !leaf { + for i := range n.Offsets { + o, on := binary.Uvarint(buf[m:]) + n.Offsets[i] = o + m += on + } + } + + vecdim, vdn := binary.Uvarint(buf[m:]) + n.VectorDim = vecdim + m += vdn + + for i := range n.Vectors { + vector := make(hnsw.Point, vecdim) + + for vi := range vector { + vector[vi] = math.Float32frombits(binary.LittleEndian.Uint32(buf[m:])) // reinterpret the stored IEEE 754 bits; float32(x) would convert the integer numerically + m += 4 + } + + n.Vectors[i] = vector + } + return nil } diff --git a/pkg/btree/node_test.go b/pkg/btree/node_test.go new file mode 100644 index 0000000..516f677 --- /dev/null +++ b/pkg/btree/node_test.go @@ -0,0 +1,88 @@ +package btree + +import ( + "bytes" + "github.com/kevmo314/appendable/pkg/hnsw" + "github.com/kevmo314/appendable/pkg/pointer" + "reflect" + "testing" +) + +func TestBTreeNode_Size(t *testing.T) { + t.Run("node size", func(t *testing.T) { + n := &BTreeNode{ // 4 + Ids: []pointer.ReferencedId{{Value: 1}, {Value: 2}, {Value: 3}}, // 3 * (3) + Vectors: []hnsw.Point{{1, 1}, {2, 2}, {3, 3}}, // 6 * 4 == 3 * 2 * 4 // 24 + Offsets: make([]uint64, 0), + VectorDim: 2, // 1 + } + + if n.Size() != 38 { + t.Fatalf("wrong size: %d", n.Size()) + } + }) +} + +func TestBTreeNode_MarshalBinary(t *testing.T) { + t.Run("leaf node", func(t *testing.T) { + n := &BTreeNode{ + Ids: []pointer.ReferencedId{ + 
{Value: 1}, + {Value: 2}, + {Value: 3}, + }, + Vectors: []hnsw.Point{{0.5, 1}, {2, 2.25}, {3, -3}}, + Offsets: make([]uint64, 0), + VectorDim: 2, + } + + buf := &bytes.Buffer{} + if _, err := n.WriteTo(buf); err != nil { + t.Fatal(err) + } + + m := &BTreeNode{} + if err := m.UnmarshalBinary(buf.Bytes()); err != nil { + t.Fatal(err) + } + + if !m.Leaf() { + t.Fatalf("expected leaf node, but got %v offsets", len(m.Offsets)) + } + + if !reflect.DeepEqual(n, m) { + t.Fatalf("encoded\n%#v\ndecoded\n%#v", n, m) + } + }) + + t.Run("intermediate node", func(t *testing.T) { + n := &BTreeNode{ + Ids: []pointer.ReferencedId{ + {Value: 1}, + {Value: 2}, + {Value: 3}, + }, + Vectors: []hnsw.Point{{0.5, 1}, {2, 2.25}, {3, -3}}, + Offsets: []uint64{0, 4096, 8192, 6969}, + VectorDim: 2, + } + + buf := &bytes.Buffer{} + if _, err := n.WriteTo(buf); err != nil { + t.Fatal(err) + } + + m := &BTreeNode{} + if err := m.UnmarshalBinary(buf.Bytes()); err != nil { + t.Fatal(err) + } + + if m.Leaf() { + t.Fatal("expected intermediate node") + } + + if !reflect.DeepEqual(n, m) { + t.Fatalf("encoded\n%#v\ndecoded\n%#v", n, m) + } + }) +}