diff --git a/README.md b/README.md index b624630..516fad8 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,8 @@ The [`lsh.Fit`/`lsh.FitWide`](https://pkg.go.dev/github.com/keilerkonzept/bitkn If your vectors are longer than 64 bits, you can still use `bitknn` if you [pack](https://pkg.go.dev/github.com/keilerkonzept/bitknn/pack) them into `[]uint64`. The [`pack` package](https://pkg.go.dev/github.com/keilerkonzept/bitknn/pack) defines helper functions to pack `string`s and `[]byte`s into `[]uint64`s. +> It's faster to use a `[][]uint64` allocated using a flat backing slice, laid out in one contiguous memory block. If you already have a non-contiguous `[][]uint64`, you can use [`pack.ReallocateFlat`](https://pkg.go.dev/github.com/keilerkonzept/bitknn/pack#ReallocateFlat) to re-allocate the dataset using a flat 1d backing slice. + The exact k-NN model in `bitknn` and the approximate-NN model in `lsh` each have a `Wide` variant that accepts slice-valued data points: ```go diff --git a/lsh/model_test.go b/lsh/model_test.go index 91285e6..f7690f8 100644 --- a/lsh/model_test.go +++ b/lsh/model_test.go @@ -16,7 +16,7 @@ func Test_Model_NoHash_IsExact(t *testing.T) { var h0 lsh.ConstantHash id := func(a uint64) uint64 { return a } rapid.Check(t, func(t *rapid.T) { - k := rapid.IntRange(3, 1001).Draw(t, "k") + k := rapid.IntRange(1, 1001).Draw(t, "k") data := rapid.SliceOfNDistinct(rapid.Uint64(), 3, 1000, id).Draw(t, "data") labels := rapid.SliceOfN(rapid.IntRange(0, 3), len(data), len(data)).Draw(t, "labels") values := rapid.SliceOfN(rapid.Float64(), len(data), len(data)).Draw(t, "values") diff --git a/lsh/model_wide_test.go b/lsh/model_wide_test.go index 2c1f9b1..1dbbe93 100644 --- a/lsh/model_wide_test.go +++ b/lsh/model_wide_test.go @@ -14,7 +14,7 @@ import ( func Test_WideModel_64bit_Equal_To_Narrow(t *testing.T) { id := func(a uint64) uint64 { return a } rapid.Check(t, func(t *rapid.T) { - k := rapid.IntRange(3, 1001).Draw(t, "k") + k := rapid.IntRange(1, 
1001).Draw(t, "k") data := rapid.SliceOfNDistinct(rapid.Uint64(), 3, 1000, id).Draw(t, "data") dataWide := make([][]uint64, len(data)) for i := range data { diff --git a/pack/bytes.go b/pack/bytes.go index 5340de8..3ee6a6b 100644 --- a/pack/bytes.go +++ b/pack/bytes.go @@ -1,17 +1,15 @@ // Package pack provides helpers to pack bytes and strings into []uint64 slices. package pack -// Bytes packs a byte slice into a uint64 slice. -// If the length of the byte slice is not a multiple of 8, it will pad the remaining bytes with zeroes. -func Bytes(data []byte) []uint64 { +// BytesInto packs a byte slice into the given pre-allocated uint64 slice. +// The output slice should have length >=[BytesPackedLength](data). +func BytesInto(data []byte, out []uint64) { n := len(data) - dims := (n + 7) / 8 // round up division - - out := make([]uint64, dims) i := 0 + j := 0 for ; i+8 <= n; i += 8 { - out[i/8] = uint64(data[i]) | + out[j] = uint64(data[i]) | uint64(data[i+1])<<8 | uint64(data[i+2])<<16 | uint64(data[i+3])<<24 | @@ -19,6 +17,7 @@ func Bytes(data []byte) []uint64 { uint64(data[i+5])<<40 | uint64(data[i+6])<<48 | uint64(data[i+7])<<56 + j++ } if i < n { @@ -46,9 +45,23 @@ func Bytes(data []byte) []uint64 { case 1: packed |= uint64(data[i]) } - out[i/8] = packed + out[j] = packed } +} + +// BytesPackedLength returns the packed length of the given byte slice. +func BytesPackedLength(data []byte) int { + return (len(data) + 7) / 8 +} + +// Bytes packs a byte slice into a uint64 slice. +// If the length of the byte slice is not a multiple of 8, it will pad the remaining bytes with zeroes. 
+func Bytes(data []byte) []uint64 { + n := len(data) + dims := (n + 7) / 8 // round up division + out := make([]uint64, dims) + BytesInto(data, out) return out } diff --git a/pack/bytes_test.go b/pack/bytes_test.go index 68f11b4..7b58627 100644 --- a/pack/bytes_test.go +++ b/pack/bytes_test.go @@ -14,7 +14,7 @@ func TestPackBytes(t *testing.T) { // Property 1: Length of packed []uint64 should be (len(bytes) + 7) / 8 packed := pack.Bytes(bytesInput) - expectedLength := (len(bytesInput) + 7) / 8 + expectedLength := pack.BytesPackedLength(bytesInput) if len(packed) != expectedLength { t.Fatalf("Expected packed length: %d, got: %d", expectedLength, len(packed)) } diff --git a/pack/compact.go b/pack/compact.go new file mode 100644 index 0000000..63fcf08 --- /dev/null +++ b/pack/compact.go @@ -0,0 +1,16 @@ +package pack + +// ReallocateFlat re-allocates the given 2d slice with a flat backing slice. +func ReallocateFlat[T any](d [][]T) { + n := 0 + for _, d := range d { + n += len(d) + } + flat := make([]T, n) + j := 0 + for i, row := range d { + copy(flat[j:], row) + d[i] = flat[j : j+len(row)] + j += len(row) + } +} diff --git a/pack/compact_test.go b/pack/compact_test.go new file mode 100644 index 0000000..693c9ac --- /dev/null +++ b/pack/compact_test.go @@ -0,0 +1,27 @@ +package pack_test + +import ( + "reflect" + "slices" + "testing" + + "github.com/keilerkonzept/bitknn/pack" + "pgregory.net/rapid" +) + +func TestPackReallocateFlat(t *testing.T) { + rapid.Check(t, func(t *rapid.T) { + dims := rapid.IntRange(3, 100).Draw(t, "dims") + n := rapid.IntRange(0, 1000).Draw(t, "n") + data := rapid.SliceOfN(rapid.SliceOfN(rapid.Uint64(), dims, dims), n, n).Draw(t, "data") + + dataCopy := make([][]uint64, len(data)) + for i := range dataCopy { + dataCopy[i] = slices.Clone(data[i]) + } + pack.ReallocateFlat(data) + if !reflect.DeepEqual(data, dataCopy) { + t.Fatalf("Original: %v, Packed: %v", dataCopy, data) + } + }) +} diff --git a/pack/string.go b/pack/string.go index 
b33acb9..42813ce 100644 --- a/pack/string.go +++ b/pack/string.go @@ -9,6 +9,18 @@ func String(data string) []uint64 { return Bytes(b) } +// StringPackedLength returns the packed length of the given string. +func StringPackedLength(data string) int { + return (len(data) + 7) / 8 +} + +// StringInto packs a string into the given pre-allocated uint64 slice. +// The output slice should have length >=[StringPackedLength](data). +func StringInto(data string, out []uint64) { + b := unsafe.Slice(unsafe.StringData(data), len(data)) + BytesInto(b, out) +} + // StringInv unpacks a []uint64 slice as packed by [String], func StringInv(data []uint64, originalLengthBytes int) string { b := BytesInv(data, originalLengthBytes) diff --git a/pack/string_test.go b/pack/string_test.go index ff2cd1a..3a186fc 100644 --- a/pack/string_test.go +++ b/pack/string_test.go @@ -12,8 +12,8 @@ func TestPackString(t *testing.T) { data := rapid.String().Draw(t, "data") // Property 1: Length of packed []uint64 should be (len(data) + 7) / 8 + expectedLength := pack.StringPackedLength(data) packed := pack.String(data) - expectedLength := (len(data) + 7) / 8 if len(packed) != expectedLength { t.Fatalf("Expected packed length: %d, got: %d", expectedLength, len(packed)) } @@ -25,3 +25,19 @@ func TestPackString(t *testing.T) { } }) } + +func TestPackStringInto(t *testing.T) { + rapid.Check(t, func(t *rapid.T) { + data := rapid.String().Draw(t, "data") + + n := pack.StringPackedLength(data) + packed := make([]uint64, n) + pack.StringInto(data, packed) + + // Property 2: Roundtrip + unpacked := pack.StringInv(packed, len(data)) + if data != unpacked { + t.Fatalf("Original string: %v, Unpacked string: %v", data, unpacked) + } + }) +}