From ce1e918d50b6b1cf867142aff8058728d5b3bc65 Mon Sep 17 00:00:00 2001
From: Cheng <git@zcbenz.com>
Date: Sat, 7 Dec 2024 11:01:04 +0900
Subject: [PATCH] Update to MLX 0.21.0

---
 deps/mlx                 |  2 +-
 lib/nn/layers/pooling.ts | 66 +++++++++++-----------------------------
 src/fast.cc              | 17 +----------
 3 files changed, 20 insertions(+), 65 deletions(-)

diff --git a/deps/mlx b/deps/mlx
index cb431dfc..bb303c45 160000
--- a/deps/mlx
+++ b/deps/mlx
@@ -1 +1 @@
-Subproject commit cb431dfc9fcd63f5fe0a7b33e8efd19ad7d44d07
+Subproject commit bb303c45a55d7147bc261e9aa8be218d49500d09
diff --git a/lib/nn/layers/pooling.ts b/lib/nn/layers/pooling.ts
index 60f2785d..d37cb9c6 100644
--- a/lib/nn/layers/pooling.ts
+++ b/lib/nn/layers/pooling.ts
@@ -97,12 +97,8 @@ class Pool3d extends Pool {
  *
  * @remarks
  *
- * Assuming an input of shape `(N, L, C)` and `kernelSize` is `k`, the output is
- * a tensor of shape `(N, L_out, C)`, given by:
- *
- * `out(N_i, t, C_j) = max_{m=0,...,k-1} input(N_i, stride * t + m, C_j)`
- *
- * where `L_out = floor((L + 2 * padding - kernelSize) / stride) + 1`.
+ * Spatially downsamples the input by taking the maximum of a sliding window
+ * of size `kernelSize` and sliding stride `stride`.
  *
  * @param kernelSize - The size of the pooling window kernel.
  * @param stride - The stride of the pooling window. Default: `kernelSize`.
@@ -122,12 +118,8 @@ export class MaxPool1d extends Pool1d {
  *
  * @remarks
  *
- * Assuming an input of shape `(N, L, C)` and `kernelSize` is `k`, the output is
- * a tensor of shape `(N, L_out, C)`, given by:
- *
- * `out(N_i, t, C_j) = 1/k * sum_{m=0,...,k-1} input(N_i, stride * t + m, C_j)`
- *
- * where `L_out = floor((L + 2 * padding - kernelSize) / stride) + 1`.
+ * Spatially downsamples the input by taking the average of a sliding window
+ * of size `kernelSize` and sliding stride `stride`.
  *
  * @param kernelSize - The size of the pooling window kernel.
  * @param stride - The stride of the pooling window. Default: `kernelSize`.
@@ -147,15 +139,11 @@ export class AvgPool1d extends Pool1d {
  *
  * @remarks
  *
- * Assuming an input of shape `(N, H, W, C)` and `kernelSize` is `(k_H, k_W)`,
- * the output is a tensor of shape `(N, H_out, W_out, C)`, given by:
+ * Spatially downsamples the input by taking the maximum of a sliding window
+ * of size `kernelSize` and sliding stride `stride`.
  *
- * `out(N_i, h, w, C_j) = max_{m=0,...,k_H-1} max_{n=0,...,k_W-1} input(N_i, stride[0] * h + m, stride[1] * w + n, C_j)`
+ * The parameters `kernelSize`, `stride` and `padding` can either be:
  *
- * where `H_out = floor((H + 2 * padding[0] - kernelSize[0]) / stride[0]) + 1`
- *       `W_out = floor((W + 2 * padding[1] - kernelSize[1]) / stride[1]) + 1`
- *
- * The parameters `kernelSize`, `stride`, `padding`, can either be:
  *   - a single `number` -- in which case the same value is used for both the
  *     height and width axis;
  *   - a `tuple` of two `numbers`s -- in which case, the first `number` is used
@@ -179,16 +167,10 @@ export class MaxPool2d extends Pool2d {
  *
  * @remarks
  *
- * Assuming an input of shape `(N, H, W, C)` and `kernelSize` is `(kH, kW)`,
- * the output is a tensor of shape `(N, H_out, W_out, C)`, given by:
- *
- * `out(N_i, h, w, C_j) = 1/(kH*kW) * sum_{m=0,...,kH-1} sum_{n=0,...,kW-1}
- *    input(N_i, stride[0] * h + m, stride[1] * w + n, C_j)`
+ * Spatially downsamples the input by taking the average of a sliding window
+ * of size `kernelSize` and sliding stride `stride`.
  *
- * where `H_out = floor((H + 2 * padding[0] - kernelSize[0]) / stride[0]) + 1`,
- * `W_out = floor((W + 2 * padding[1] - kernelSize[1]) / stride[1]) + 1`.
- *
- * The parameters `kernelSize`, `stride`, `padding`, can either be:
+ * The parameters `kernelSize`, `stride` and `padding` can either be:
  *
  * - a single `number` -- in which case the same value is used for both the
  *   height and width axis
@@ -213,22 +195,16 @@ export class AvgPool2d extends Pool2d {
  *
  * @remarks
  *
- * Assuming an input of shape `(N, D, H, W, C)` and `kernelSize` is `(k_D, k_H, k_W)`,
- * the output is a tensor of shape `(N, D_out, H_out, W_out, C)`, given by:
- *
- * `out(N_i, d, h, w, C_j) = max_{l=0,...,k_D-1} max_{m=0,...,k_H-1} max_{n=0,...,k_W-1}
- *                           input(N_i, stride[0] * d + l, stride[1] * h + m, stride[2] * w + n, C_j)`
+ * Spatially downsamples the input by taking the maximum of a sliding window
+ * of size `kernelSize` and sliding stride `stride`.
  *
- * where `D_out = floor((D + 2 * padding[0] - kernelSize[0]) / stride[0]) + 1`
- *       `H_out = floor((H + 2 * padding[1] - kernelSize[1]) / stride[1]) + 1`
- *       `W_out = floor((W + 2 * padding[2] - kernelSize[2]) / stride[2]) + 1`
+ * The parameters `kernelSize`, `stride` and `padding` can either be:
  *
- * The parameters `kernelSize`, `stride`, `padding`, can either be:
  *   - a single `number` -- in which case the same value is used for the depth,
  *     height and width axis;
- *   - a `tuple` of three `numbers`s -- in which case, the first `number` is used
- *     for the depth axis, the second `number` for the height axis, and the third
- *     `number` for the width axis.
+ *   - a `tuple` of three `number`s -- in which case, the first `number` is
+ *     used for the depth axis, the second `number` for the height axis, and the
+ *     third `number` for the width axis.
  *
  * @param kernelSize - The size of the pooling window.
  * @param stride - The stride of the pooling window. Default: `kernelSize`.
@@ -248,14 +224,8 @@ export class MaxPool3d extends Pool3d {
  *
  * @remarks
  *
- * Assuming an input of shape `(N, D, H, W, C)` and `kernelSize` is `(k_D, k_H, k_W)`,
- * the output is a tensor of shape `(N, D_out, H_out, W_out, C)`, given by:
- *
- * `out(N_i, d, h, w, C_j) = (1 / (k_D * k_H * k_W)) * sum_{l=0,...,k_D-1} sum_{m=0,...,k_H-1} sum_{n=0,...,k_W-1} input(N_i, stride[0] * d + l, stride[1] * h + m, stride[2] * w + n, C_j)`
- *
- * where `D_out = floor((D + 2 * padding[0] - kernelSize[0]) / stride[0]) + 1`
- *       `H_out = floor((H + 2 * padding[1] - kernelSize[1]) / stride[1]) + 1`
- *       `W_out = floor((W + 2 * padding[2] - kernelSize[2]) / stride[2]) + 1`
+ * Spatially downsamples the input by taking the average of a sliding window
+ * of size `kernelSize` and sliding stride `stride`.
  *
  * The parameters `kernelSize`, `stride`, `padding`, can either be:
  *
diff --git a/src/fast.cc b/src/fast.cc
index 4db0c8e7..943f8d44 100644
--- a/src/fast.cc
+++ b/src/fast.cc
@@ -1,20 +1,6 @@
 #include "src/array.h"
 #include "src/stream.h"
 
-namespace fast_ops {
-
-mx::array AffineQuantize(const mx::array& w,
-                         const mx::array& scales,
-                         const mx::array& biases,
-                         std::optional<int> group_size,
-                         std::optional<int> bits,
-                         mx::StreamOrDevice s) {
-  return mx::fast::affine_quantize(w, scales, biases, group_size.value_or(64),
-                                   bits.value_or(4));
-}
-
-}  // namespace fast_ops
-
 void InitFast(napi_env env, napi_value exports) {
   napi_value fast = ki::CreateObject(env);
   ki::Set(env, exports, "fast", fast);
@@ -23,6 +9,5 @@ void InitFast(napi_env env, napi_value exports) {
           "rmsNorm", &mx::fast::rms_norm,
           "layerNorm", &mx::fast::layer_norm,
           "rope", &mx::fast::rope,
-          "scaledDotProductAttention", &mx::fast::scaled_dot_product_attention,
-          "affineQuantize", &fast_ops::AffineQuantize);
+          "scaledDotProductAttention", &mx::fast::scaled_dot_product_attention);
 }