Merge pull request #236 from ashvardanian/main-dev
Infer boolean types in Python
ashvardanian authored Nov 19, 2024
2 parents 364e736 + daa41bd commit 1d0d2ac
Showing 7 changed files with 52 additions and 33 deletions.
20 changes: 16 additions & 4 deletions README.md
@@ -162,10 +162,22 @@ dist = simsimd.cosine(vec1, vec2, "int8")
dist = simsimd.cosine(vec1, vec2, "float16")
dist = simsimd.cosine(vec1, vec2, "float32")
dist = simsimd.cosine(vec1, vec2, "float64")
dist = simsimd.hamming(vec1, vec2, "bit8")
dist = simsimd.hamming(vec1, vec2, "bin8")
```

Binary distance functions are computed at the bit level.
That means a vector of 10x 8-bit integers is treated as a sequence of 80 individual bits, or dimensions.
This differs from NumPy, which can't handle smaller-than-byte types, but you can still avoid the explicit `bin8` argument by reinterpreting the packed vector as booleans:

```py
# Pack 80 random bits into 10 bytes, then view the bytes as booleans so the binary type is inferred
vec1 = np.packbits(np.random.randint(2, size=80).astype(np.uint8)).view(np.bool_)
vec2 = np.packbits(np.random.randint(2, size=80).astype(np.uint8)).view(np.bool_)
hamming_distance = simsimd.hamming(vec1, vec2)
jaccard_distance = simsimd.jaccard(vec1, vec2)
```

Other frameworks, like PyTorch, offer a richer type system than NumPy, but their limited CPython buffer interoperability makes it hard to pass data without copies.
Here is an example of using SimSIMD with PyTorch to compute the cosine similarity between two `bfloat16` vectors:

```py
import numpy as np
@@ -181,7 +193,7 @@ torch.randn(8, out=vec2)

# Both libs will look into the same memory buffers and report the same results
dist_slow = 1 - torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
dist_fast = simsimd.cosine(buf1, buf2, "bf16")
dist_fast = simsimd.cosine(buf1, buf2, "bfloat16")
```

It also allows using SimSIMD for half-precision complex numbers, which NumPy does not support.
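For instance, a minimal sketch of that use case, assuming SimSIMD accepts the `complex32` type string for interleaved `float16` real/imaginary pairs (this snippet is illustrative and not part of the diff):

```py
import numpy as np
import simsimd

# Interleaved (real, imag) pairs stored as plain `float16` values,
# since NumPy has no half-precision complex dtype of its own
vec1 = np.random.randn(768).astype(np.float16)
vec2 = np.random.randn(768).astype(np.float16)

dist = simsimd.dot(vec1, vec2, "complex32")   # complex dot product
dist = simsimd.vdot(vec1, vec2, "complex32")  # conjugate (Hermitian) dot product
```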
@@ -254,9 +266,9 @@ distances: DistancesTensor = simsimd.cdist(matrix1, matrix2, metric="cosine")
distances_array: np.ndarray = np.array(distances, copy=True) # now managed by NumPy
```

### Elementwise Kernels
### Element-wise Kernels

SimSIMD also provides mixed-precision elementwise kernels, where the input vectors and the output have the same numeric type, but the intermediate accumulators are of a higher precision.
SimSIMD also provides mixed-precision element-wise kernels, where the input vectors and the output have the same numeric type, but the intermediate accumulators are of a higher precision.

```py
import numpy as np
8 changes: 4 additions & 4 deletions include/simsimd/binary.h
@@ -89,10 +89,10 @@ SIMSIMD_INTERNAL simsimd_u32_t _simsimd_reduce_u8x16_neon(uint8x16_t vec) {
// Sum the widened halves
uint16x8_t sum16 = vaddq_u16(low_half, high_half);

// Now reduce the `uint16x8_t` to a single `uint32_t`
uint32x4_t sum32 = vpaddlq_u16(sum16); // pairwise add into 32-bit integers
uint64x2_t sum64 = vpaddlq_u32(sum32); // pairwise add into 64-bit integers
uint32_t final_sum = vaddvq_u64(sum64); // final horizontal add to 32-bit result
// Now reduce the `uint16x8_t` to a single `simsimd_u32_t`
uint32x4_t sum32 = vpaddlq_u16(sum16); // pairwise add into 32-bit integers
uint64x2_t sum64 = vpaddlq_u32(sum32); // pairwise add into 64-bit integers
simsimd_u32_t final_sum = vaddvq_u64(sum64); // final horizontal add to 32-bit result
return final_sum;
}

8 changes: 4 additions & 4 deletions include/simsimd/dot.h
@@ -362,9 +362,9 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_neon(simsimd_i8_t const *a, simsimd_i8_t cons
}

// Take care of the tail:
int32_t ab = vaddvq_s32(ab_vec);
simsimd_i32_t ab = vaddvq_s32(ab_vec);
for (; i < n; ++i) {
int32_t ai = a[i], bi = b[i];
simsimd_i32_t ai = a[i], bi = b[i];
ab += ai * bi;
}

@@ -383,9 +383,9 @@ SIMSIMD_PUBLIC void simsimd_dot_u8_neon(simsimd_u8_t const *a, simsimd_u8_t cons
}

// Take care of the tail:
uint32_t ab = vaddvq_u32(ab_vec);
simsimd_u32_t ab = vaddvq_u32(ab_vec);
for (; i < n; ++i) {
uint32_t ai = a[i], bi = b[i];
simsimd_u32_t ai = a[i], bi = b[i];
ab += ai * bi;
}

2 changes: 1 addition & 1 deletion include/simsimd/simsimd.h
@@ -434,7 +434,7 @@ SIMSIMD_PUBLIC simsimd_capability_t _simsimd_capabilities_x86(void) {
SIMSIMD_PUBLIC simsimd_capability_t _simsimd_capabilities_arm(void) {
#if defined(_SIMSIMD_DEFINED_APPLE)
// On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API.
uint32_t supports_neon = 0, supports_fp16 = 0, supports_bf16 = 0, supports_i8mm = 0;
unsigned supports_neon = 0, supports_fp16 = 0, supports_bf16 = 0, supports_i8mm = 0;
size_t size = sizeof(supports_neon);
if (sysctlbyname("hw.optional.neon", &supports_neon, &size, NULL, 0) != 0) supports_neon = 0;
if (sysctlbyname("hw.optional.arm.FEAT_FP16", &supports_fp16, &size, NULL, 0) != 0) supports_fp16 = 0;
36 changes: 18 additions & 18 deletions include/simsimd/spatial.h
@@ -595,10 +595,10 @@ SIMSIMD_PUBLIC void simsimd_l2sq_i8_neon(simsimd_i8_t const *a, simsimd_i8_t con
uint8x16_t d_vec = vreinterpretq_u8_s8(vabdq_s8(a_vec, b_vec));
d2_vec = vdotq_u32(d2_vec, d_vec, d_vec);
}
uint32_t d2 = vaddvq_u32(d2_vec);
simsimd_u32_t d2 = vaddvq_u32(d2_vec);
for (; i < n; ++i) {
int32_t n = (int32_t)a[i] - b[i];
d2 += (uint32_t)(n * n);
simsimd_i32_t n = (simsimd_i32_t)a[i] - b[i];
d2 += (simsimd_u32_t)(n * n);
}
*result = d2;
}
@@ -693,9 +693,9 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const *a, simsimd_i8_t cons
// products_high_vec = vmmlaq_s32(products_high_vec, v_vec, y_w_vecs.val[1]);
// }
// int32x4_t products_vec = vaddq_s32(products_high_vec, products_low_vec);
// int32_t a2 = products_vec[0];
// int32_t ab = products_vec[1];
// int32_t b2 = products_vec[3];
// simsimd_i32_t a2 = products_vec[0];
// simsimd_i32_t ab = products_vec[1];
// simsimd_i32_t b2 = products_vec[3];
//
// That solution is elegant, but it requires the additional `+i8mm` extension and is currently slower,
// at least on AWS Graviton 3.
@@ -709,13 +709,13 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const *a, simsimd_i8_t cons
a2_vec = vdotq_s32(a2_vec, a_vec, a_vec);
b2_vec = vdotq_s32(b2_vec, b_vec, b_vec);
}
int32_t ab = vaddvq_s32(ab_vec);
int32_t a2 = vaddvq_s32(a2_vec);
int32_t b2 = vaddvq_s32(b2_vec);
simsimd_i32_t ab = vaddvq_s32(ab_vec);
simsimd_i32_t a2 = vaddvq_s32(a2_vec);
simsimd_i32_t b2 = vaddvq_s32(b2_vec);

// Take care of the tail:
for (; i < n; ++i) {
int32_t ai = a[i], bi = b[i];
simsimd_i32_t ai = a[i], bi = b[i];
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
}

@@ -737,10 +737,10 @@ SIMSIMD_PUBLIC void simsimd_l2sq_u8_neon(simsimd_u8_t const *a, simsimd_u8_t con
uint8x16_t d_vec = vabdq_u8(a_vec, b_vec);
d2_vec = vdotq_u32(d2_vec, d_vec, d_vec);
}
uint32_t d2 = vaddvq_u32(d2_vec);
simsimd_u32_t d2 = vaddvq_u32(d2_vec);
for (; i < n; ++i) {
int32_t n = (int32_t)a[i] - b[i];
d2 += (uint32_t)(n * n);
simsimd_i32_t n = (simsimd_i32_t)a[i] - b[i];
d2 += (simsimd_u32_t)(n * n);
}
*result = d2;
}
@@ -759,13 +759,13 @@ SIMSIMD_PUBLIC void simsimd_cos_u8_neon(simsimd_u8_t const *a, simsimd_u8_t cons
a2_vec = vdotq_u32(a2_vec, a_vec, a_vec);
b2_vec = vdotq_u32(b2_vec, b_vec, b_vec);
}
uint32_t ab = vaddvq_u32(ab_vec);
uint32_t a2 = vaddvq_u32(a2_vec);
uint32_t b2 = vaddvq_u32(b2_vec);
simsimd_u32_t ab = vaddvq_u32(ab_vec);
simsimd_u32_t a2 = vaddvq_u32(a2_vec);
simsimd_u32_t b2 = vaddvq_u32(b2_vec);

// Take care of the tail:
for (; i < n; ++i) {
uint32_t ai = a[i], bi = b[i];
simsimd_u32_t ai = a[i], bi = b[i];
ab += ai * bi, a2 += ai * ai, b2 += bi * bi;
}

@@ -1050,7 +1050,7 @@ SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f32_haswell(simsimd_f
// Load the squares into an __m128 register for single-precision floating-point operations
__m128 squares = _mm_set_ps(a2, b2, a2, b2); // We replicate to make use of full register

// Compute the reciprocal square root of the squares using _mm_rsqrt_ps (single-precision)
// Compute the reciprocal square root of the squares using `_mm_rsqrt_ps` (single-precision)
__m128 rsqrts = _mm_rsqrt_ps(squares);

// Perform one iteration of Newton-Raphson refinement to improve the precision of rsqrt:
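The refinement itself is cut off by the truncated hunk. The standard Newton-Raphson step for the reciprocal square root, presumably what follows, is `y_new = y * (1.5 - 0.5 * x * y * y)`, which roughly doubles the ~12 bits of precision that `_mm_rsqrt_ps` provides.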
4 changes: 2 additions & 2 deletions python/lib.c
@@ -190,7 +190,7 @@ simsimd_datatype_t python_string_to_datatype(char const *name) {

//! Boolean values:
else if (same_string(name, "bin8") || // SimSIMD-specific
same_string(name, "c")) // Named type
same_string(name, "?")) // Named type
return simsimd_datatype_b8_k;

// Signed integers:
@@ -276,7 +276,7 @@ char const *datatype_to_python_string(simsimd_datatype_t dtype) {
case simsimd_datatype_f32c_k: return "Zf";
case simsimd_datatype_f16c_k: return "Ze";
// Boolean values:
case simsimd_datatype_b8_k: return "c";
case simsimd_datatype_b8_k: return "?";
// Signed integers:
case simsimd_datatype_i8_k: return "b";
case simsimd_datatype_i16_k: return "h";
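For context on the `?` mapping above (not part of the diff): `?` is the buffer-protocol format character for C `_Bool`, and it is exactly what NumPy reports for `np.bool_` arrays, which is what lets the boolean type be inferred automatically. A quick check:

```py
import numpy as np

# NumPy boolean arrays expose the '?' format code through the buffer protocol,
# while unsigned bytes expose 'B'
print(memoryview(np.zeros(4, dtype=np.bool_)).format)  # '?'
print(memoryview(np.zeros(4, dtype=np.uint8)).format)  # 'B'
```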
7 changes: 7 additions & 0 deletions scripts/test.py
@@ -839,6 +839,13 @@ def test_dense_bits(ndim, metric, capability, stats_fixture):
np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
collect_errors(metric, ndim, "bin8", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture)

# Aside from overriding the `dtype` parameter, we can also pass the packed buffers viewed as booleans
result_dt, result = profile(simd_kernel, np.packbits(a).view(np.bool_), np.packbits(b).view(np.bool_))
result = np.array(result)

np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
collect_errors(metric, ndim, "bin8", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture)


@pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
@pytest.mark.skipif(not scipy_available, reason="SciPy is not installed")
