From 10ebaab9fcb796319df88dbf0db1a05d13a812c5 Mon Sep 17 00:00:00 2001
From: TFLM-bot <tflm-github-bot@google.com>
Date: Tue, 7 Jan 2025 11:10:40 -0800
Subject: [PATCH 1/4] Automated sync from github.com/tensorflow/tensorflow
 (#3031)

BUG=automated sync from upstream
NO_CHECK_TFLITE_FILES=automated sync from upstream
---
 .../internal/reference/integer_ops/fully_connected.h      | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
index 3a74402ed98..c6d06077934 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
@@ -42,12 +42,14 @@ void FullyConnectedPerChannel(
   const int32_t output_activation_min = params.quantized_activation_min;
   const int32_t output_activation_max = params.quantized_activation_max;
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
 
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
   const int filter_dim_count = filter_shape.DimensionsCount();
-  const int batches = output_shape.Dims(0);
-  const int output_depth = output_shape.Dims(1);
+
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = output_shape.Dims(output_dim_count - 1);
   TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
   const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
   for (int b = 0; b < batches; ++b) {

From 9b79b9faf208fddb509a0efc671bf338b5675ab9 Mon Sep 17 00:00:00 2001
From: chase <mzheng086@gmail.com>
Date: Thu, 16 Jan 2025 13:56:18 -0800
Subject: [PATCH 2/4] Allow signed char for Cortex M cores (#3035)

BUG=384562154
---
 .../tools/make/targets/cortex_m_generic_makefile.inc      | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc b/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc
index 8c43b4159f5..99d72c49463 100644
--- a/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc
+++ b/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc
@@ -28,6 +28,7 @@ endif
 
 FLOAT := soft
 GCC_TARGET_ARCH := $(TARGET_ARCH)
+SIGNED_CHAR := false
 
 # Explicitly set this to true to include the kissfft symbols.
 INCLUDE_MICRO_SPEECH := false
@@ -174,7 +175,6 @@ PLATFORM_FLAGS = \
   -DTF_LITE_MCU_DEBUG_LOG \
   -mthumb \
   -mfloat-abi=$(FLOAT) \
-  -funsigned-char \
   -mlittle-endian \
   -Wno-type-limits \
   -Wno-unused-private-field \
@@ -182,6 +182,12 @@ PLATFORM_FLAGS = \
   -MD \
   -DCPU_$(CORE)=1
 
+ifeq ($(SIGNED_CHAR), false)
+  PLATFORM_FLAGS += -funsigned-char
+else
+  PLATFORM_FLAGS += -fsigned-char
+endif
+
 # For DWT/PMU counters. Header file name is depending on target architecture.
 PLATFORM_FLAGS += -DCMSIS_DEVICE_ARM_CORTEX_M_XX_HEADER_FILE=\"$(ARM_CPU).h\"
 PLATFORM_FLAGS += -D$(ARM_CPU)

From 740cef3881cdd1f3954be97c2982c2cfcb3f81e0 Mon Sep 17 00:00:00 2001
From: Ryan OShea <86965113+ArmRyan@users.noreply.github.com>
Date: Wed, 5 Feb 2025 02:33:16 +0100
Subject: [PATCH 3/4] CMSIS-NN Min Max int8 support (#2753)

* Moves common functions to new maximum_minimum.h
  * Creates cmsis-nn/maximum_minimum.cc

BUG=#2752
Change-Id: Ifbb3fedf53043b2f8d4c48d73c2ca44c7f0f87ca
---
 tensorflow/lite/micro/kernels/BUILD           |   1 +
 .../micro/kernels/cmsis_nn/maximum_minimum.cc | 247 ++++++++++++++++++
 .../lite/micro/kernels/maximum_minimum.cc     |  50 +---
 .../lite/micro/kernels/maximum_minimum.h      | 105 ++++++++
 .../lite/micro/micro_mutable_op_resolver.h    |  22 +-
 5 files changed, 367 insertions(+), 58 deletions(-)
 create mode 100644 tensorflow/lite/micro/kernels/cmsis_nn/maximum_minimum.cc
 create mode 100644 tensorflow/lite/micro/kernels/maximum_minimum.h

diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD
index 1615d774907..8562d8bb53f 100644
--- a/tensorflow/lite/micro/kernels/BUILD
+++ b/tensorflow/lite/micro/kernels/BUILD
@@ -333,6 +333,7 @@ tflm_kernel_cc_library(
         "logistic.h",
         "lstm_eval.h",
         "lstm_shared.h",
+        "maximum_minimum.h",
         "micro_ops.h",
         "mul.h",
         "pad.h",
diff --git a/tensorflow/lite/micro/kernels/cmsis_nn/maximum_minimum.cc b/tensorflow/lite/micro/kernels/cmsis_nn/maximum_minimum.cc
new file mode 100644
index 00000000000..a6affaa11bb
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/cmsis_nn/maximum_minimum.cc
@@ -0,0 +1,247 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/micro/kernels/maximum_minimum.h"
+
+#include "Include/arm_nnfunctions.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_log.h"
+
+namespace tflite {
+
+namespace {
+
+cmsis_nn_dims FillVariableShape(int32_t rank, int32_t* tensor_dims) {
+  if (rank == 4) {
+    return {tensor_dims[0], tensor_dims[1], tensor_dims[2], tensor_dims[3]};
+  } else if (rank == 3) {
+    return {1, tensor_dims[0], tensor_dims[1], tensor_dims[2]};
+  } else if (rank == 2) {
+    return {1, 1, tensor_dims[0], tensor_dims[1]};
+  }
+  // Rank 1 keeps its length in the channel slot; anything else is {1,1,1,1}.
+  return {1, 1, 1, rank == 1 ? tensor_dims[0] : 1};
+}
+
+TfLiteStatus EvalMaximum(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  RuntimeShape input_1_shape = tflite::micro::GetTensorShape(input1);
+  RuntimeShape input_2_shape = tflite::micro::GetTensorShape(input2);
+  RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
+
+  cmsis_nn_dims input_1_dims = FillVariableShape(
+      input_1_shape.DimensionsCount(), input_1_shape.DimsData());
+  cmsis_nn_dims input_2_dims = FillVariableShape(
+      input_2_shape.DimensionsCount(), input_2_shape.DimsData());
+  cmsis_nn_dims output_dims = FillVariableShape(output_shape.DimensionsCount(),
+                                                output_shape.DimsData());
+
+  switch (op_context.output->type) {
+    case kTfLiteInt8:
+      cmsis_nn_context ctx;
+      ctx.buf = nullptr;
+      ctx.size = 0;
+
+      arm_maximum_s8(
+          &ctx, tflite::micro::GetTensorData<int8_t>(input1), &input_1_dims,
+          tflite::micro::GetTensorData<int8_t>(input2), &input_2_dims,
+          tflite::micro::GetTensorData<int8_t>(output), &output_dims);
+      break;
+    case kTfLiteFloat32:
+      TFLiteOperation<float, MaximumOp>(context, node, op_context);
+      break;
+    case kTfLiteInt16:
+      TFLiteOperation<int16_t, MaximumOp>(context, node, op_context);
+      break;
+    case kTfLiteInt32:
+      TFLiteOperation<int32_t, MaximumOp>(context, node, op_context);
+      break;
+    case kTfLiteInt64:
+      TFLiteOperation<int64_t, MaximumOp>(context, node, op_context);
+      break;
+    default:
+      MicroPrintf("Type %s (%d) is not supported by Maximum/Minimum.",
+                  TfLiteTypeGetName(op_context.output->type),
+                  op_context.output->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalMaximumInt8(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  RuntimeShape input_1_shape = tflite::micro::GetTensorShape(input1);
+  RuntimeShape input_2_shape = tflite::micro::GetTensorShape(input2);
+  RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
+
+  cmsis_nn_dims input_1_dims = FillVariableShape(
+      input_1_shape.DimensionsCount(), input_1_shape.DimsData());
+  cmsis_nn_dims input_2_dims = FillVariableShape(
+      input_2_shape.DimensionsCount(), input_2_shape.DimsData());
+  cmsis_nn_dims output_dims = FillVariableShape(output_shape.DimensionsCount(),
+                                                output_shape.DimsData());
+
+  switch (op_context.output->type) {
+    case kTfLiteInt8:
+      cmsis_nn_context ctx;
+      ctx.buf = nullptr;
+      ctx.size = 0;
+
+      arm_maximum_s8(
+          &ctx, tflite::micro::GetTensorData<int8_t>(input1), &input_1_dims,
+          tflite::micro::GetTensorData<int8_t>(input2), &input_2_dims,
+          tflite::micro::GetTensorData<int8_t>(output), &output_dims);
+      break;
+    default:
+      MicroPrintf("Type %s (%d) is not supported by Maximum Int8 Registration.",
+                  TfLiteTypeGetName(op_context.output->type),
+                  op_context.output->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalMinimum(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  RuntimeShape input_1_shape = tflite::micro::GetTensorShape(input1);
+  RuntimeShape input_2_shape = tflite::micro::GetTensorShape(input2);
+  RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
+
+  cmsis_nn_dims input_1_dims = FillVariableShape(
+      input_1_shape.DimensionsCount(), input_1_shape.DimsData());
+  cmsis_nn_dims input_2_dims = FillVariableShape(
+      input_2_shape.DimensionsCount(), input_2_shape.DimsData());
+  cmsis_nn_dims output_dims = FillVariableShape(output_shape.DimensionsCount(),
+                                                output_shape.DimsData());
+
+  switch (op_context.output->type) {
+    case kTfLiteInt8:
+      cmsis_nn_context ctx;
+      ctx.buf = nullptr;
+      ctx.size = 0;
+
+      arm_minimum_s8(
+          &ctx, tflite::micro::GetTensorData<int8_t>(input1), &input_1_dims,
+          tflite::micro::GetTensorData<int8_t>(input2), &input_2_dims,
+          tflite::micro::GetTensorData<int8_t>(output), &output_dims);
+      break;
+    case kTfLiteFloat32:
+      TFLiteOperation<float, MinimumOp>(context, node, op_context);
+      break;
+    case kTfLiteInt16:
+      TFLiteOperation<int16_t, MinimumOp>(context, node, op_context);
+      break;
+    case kTfLiteInt32:
+      TFLiteOperation<int32_t, MinimumOp>(context, node, op_context);
+      break;
+    case kTfLiteInt64:
+      TFLiteOperation<int64_t, MinimumOp>(context, node, op_context);
+      break;
+    default:
+      MicroPrintf("Type %s (%d) is not supported by Maximum/Minimum.",
+                  TfLiteTypeGetName(op_context.output->type),
+                  op_context.output->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalMinimumInt8(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  RuntimeShape input_1_shape = tflite::micro::GetTensorShape(input1);
+  RuntimeShape input_2_shape = tflite::micro::GetTensorShape(input2);
+  RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
+
+  cmsis_nn_dims input_1_dims = FillVariableShape(
+      input_1_shape.DimensionsCount(), input_1_shape.DimsData());
+  cmsis_nn_dims input_2_dims = FillVariableShape(
+      input_2_shape.DimensionsCount(), input_2_shape.DimsData());
+  cmsis_nn_dims output_dims = FillVariableShape(output_shape.DimensionsCount(),
+                                                output_shape.DimsData());
+
+  switch (op_context.output->type) {
+    case kTfLiteInt8:
+      cmsis_nn_context ctx;
+      ctx.buf = nullptr;
+      ctx.size = 0;
+
+      arm_minimum_s8(
+          &ctx, tflite::micro::GetTensorData<int8_t>(input1), &input_1_dims,
+          tflite::micro::GetTensorData<int8_t>(input2), &input_2_dims,
+          tflite::micro::GetTensorData<int8_t>(output), &output_dims);
+      break;
+    default:
+      MicroPrintf("Type %s (%d) is not supported by Minimum Int8 registration.",
+                  TfLiteTypeGetName(op_context.output->type),
+                  op_context.output->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TFLMRegistration Register_MAXIMUM() {
+  return tflite::micro::RegisterOp(nullptr, nullptr, EvalMaximum);
+}
+
+TFLMRegistration Register_MINIMUM() {
+  return tflite::micro::RegisterOp(nullptr, nullptr, EvalMinimum);
+}
+
+TFLMRegistration Register_MAXIMUM_INT8() {
+  return tflite::micro::RegisterOp(nullptr, nullptr, EvalMaximumInt8);
+}
+
+TFLMRegistration Register_MINIMUM_INT8() {
+  return tflite::micro::RegisterOp(nullptr, nullptr, EvalMinimumInt8);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/maximum_minimum.cc b/tensorflow/lite/micro/kernels/maximum_minimum.cc
index 4dc87b40148..ef4a0a6a522 100644
--- a/tensorflow/lite/micro/kernels/maximum_minimum.cc
+++ b/tensorflow/lite/micro/kernels/maximum_minimum.cc
@@ -1,4 +1,4 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -23,59 +23,13 @@ limitations under the License.
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
 #include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/maximum_minimum.h"
 #include "tensorflow/lite/micro/micro_log.h"
 
 namespace tflite {
 
 namespace {
 
-// This file has a reference implementation of TFMaximum/TFMinimum.
-enum KernelType {
-  kReference,
-};
-
-constexpr int kInputTensor1 = 0;
-constexpr int kInputTensor2 = 1;
-constexpr int kOutputTensor = 0;
-
-struct OpContext {
-  OpContext(TfLiteContext* context, TfLiteNode* node) {
-    input1 = tflite::micro::GetEvalInput(context, node, kInputTensor1);
-    input2 = tflite::micro::GetEvalInput(context, node, kInputTensor2);
-    output = tflite::micro::GetEvalOutput(context, node, kOutputTensor);
-  }
-  const TfLiteEvalTensor* input1;
-  const TfLiteEvalTensor* input2;
-  TfLiteEvalTensor* output;
-};
-
-struct MaximumOp {
-  template <typename data_type>
-  static data_type op(data_type el1, data_type el2) {
-    return el1 > el2 ? el1 : el2;
-  }
-};
-
-struct MinimumOp {
-  template <typename data_type>
-  static data_type op(data_type el1, data_type el2) {
-    return el1 < el2 ? el1 : el2;
-  }
-};
-
-template <typename data_type, typename op_type>
-void TFLiteOperation(TfLiteContext* context, TfLiteNode* node,
-                     const OpContext& op_context) {
-  reference_ops::MaximumMinimumBroadcastSlow(
-      tflite::micro::GetTensorShape(op_context.input1),
-      tflite::micro::GetTensorData<data_type>(op_context.input1),
-      tflite::micro::GetTensorShape(op_context.input2),
-      tflite::micro::GetTensorData<data_type>(op_context.input2),
-      tflite::micro::GetTensorShape(op_context.output),
-      tflite::micro::GetTensorData<data_type>(op_context.output),
-      op_type::template op<data_type>);
-}
-
 template <KernelType kernel_type, typename OpType>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   OpContext op_context(context, node);
diff --git a/tensorflow/lite/micro/kernels/maximum_minimum.h b/tensorflow/lite/micro/kernels/maximum_minimum.h
new file mode 100644
index 00000000000..34d7e2399f3
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/maximum_minimum.h
@@ -0,0 +1,105 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_MAXIMUM_MINIMUM_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_MAXIMUM_MINIMUM_H_
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/maximum_minimum.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_log.h"
+
+namespace tflite {
+
+// Helpers shared by the reference and optimized Maximum/Minimum kernels.
+enum KernelType {
+  kReference,
+};
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+struct OpContext {
+  OpContext(TfLiteContext* context, TfLiteNode* node) {
+    input1 = tflite::micro::GetEvalInput(context, node, kInputTensor1);
+    input2 = tflite::micro::GetEvalInput(context, node, kInputTensor2);
+    output = tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+  }
+  const TfLiteEvalTensor* input1;
+  const TfLiteEvalTensor* input2;
+  TfLiteEvalTensor* output;
+};
+
+struct MaximumOp {
+  template <typename data_type>
+  static data_type op(data_type el1, data_type el2) {
+    return el1 > el2 ? el1 : el2;
+  }
+};
+
+struct MinimumOp {
+  template <typename data_type>
+  static data_type op(data_type el1, data_type el2) {
+    return el1 < el2 ? el1 : el2;
+  }
+};
+
+template <typename data_type, typename op_type>
+void TFLiteOperation(TfLiteContext* context, TfLiteNode* node,
+                     const OpContext& op_context) {
+  reference_ops::MaximumMinimumBroadcastSlow(
+      tflite::micro::GetTensorShape(op_context.input1),
+      tflite::micro::GetTensorData<data_type>(op_context.input1),
+      tflite::micro::GetTensorShape(op_context.input2),
+      tflite::micro::GetTensorData<data_type>(op_context.input2),
+      tflite::micro::GetTensorShape(op_context.output),
+      tflite::micro::GetTensorData<data_type>(op_context.output),
+      op_type::template op<data_type>);
+}
+
+TFLMRegistration Register_MAXIMUM();
+
+TFLMRegistration Register_MINIMUM();
+
+#if defined(CMSIS_NN)
+// Returns a TFLMRegistration struct for kernel variant that only supports
+// int8.
+TFLMRegistration Register_MAXIMUM_INT8();
+
+// Returns a TFLMRegistration struct for kernel variant that only supports
+// int8.
+TFLMRegistration Register_MINIMUM_INT8();
+
+#else
+// Note that while this block gets used for both reference and optimized kernels
+// that do not have any specialized implementations, the only goal here is to
+// define fallback implementations that allow reference kernels to still be used
+// from applications that call a more specific kernel variant.
+inline TFLMRegistration Register_MAXIMUM_INT8() { return Register_MAXIMUM(); }
+
+inline TFLMRegistration Register_MINIMUM_INT8() { return Register_MINIMUM(); }
+
+#endif
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_MAXIMUM_MINIMUM_H_
diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h
index f5f6e38e003..ad642ddbc06 100644
--- a/tensorflow/lite/micro/micro_mutable_op_resolver.h
+++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h
@@ -1,4 +1,4 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/lite/micro/kernels/depthwise_conv.h"
 #include "tensorflow/lite/micro/kernels/ethosu.h"
 #include "tensorflow/lite/micro/kernels/fully_connected.h"
+#include "tensorflow/lite/micro/kernels/maximum_minimum.h"
 #include "tensorflow/lite/micro/kernels/micro_ops.h"
 #include "tensorflow/lite/micro/kernels/mul.h"
 #include "tensorflow/lite/micro/kernels/pooling.h"
@@ -414,9 +415,9 @@ class MicroMutableOpResolver : public MicroOpResolver {
                       tflite::Register_LOG_SOFTMAX(), ParseLogSoftmax);
   }
 
-  TfLiteStatus AddMaximum() {
-    return AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM(),
-                      ParseMaximum);
+  TfLiteStatus AddMaximum(
+      const TFLMRegistration& registration = Register_MAXIMUM()) {
+    return AddBuiltin(BuiltinOperator_MAXIMUM, registration, ParseMaximum);
   }
 
   TfLiteStatus AddMaxPool2D(
@@ -433,9 +434,9 @@ class MicroMutableOpResolver : public MicroOpResolver {
     return AddBuiltin(BuiltinOperator_MEAN, Register_MEAN(), ParseReducer);
   }
 
-  TfLiteStatus AddMinimum() {
-    return AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM(),
-                      ParseMinimum);
+  TfLiteStatus AddMinimum(
+      const TFLMRegistration& registration = Register_MINIMUM()) {
+    return AddBuiltin(BuiltinOperator_MINIMUM, registration, ParseMinimum);
   }
 
   TfLiteStatus AddMul(const TFLMRegistration& registration = Register_MUL()) {
@@ -452,7 +453,8 @@ class MicroMutableOpResolver : public MicroOpResolver {
   }
 
   TfLiteStatus AddOverlapAdd() {
-    // TODO(b/286250473): change back name to "OverlapAdd" and remove namespace
+    // TODO(b/286250473): change back name to "OverlapAdd" and remove
+    // namespace
     return AddCustom("SignalOverlapAdd",
                      tflite::tflm_signal::Register_OVERLAP_ADD());
   }
@@ -684,8 +686,8 @@ class MicroMutableOpResolver : public MicroOpResolver {
     }
 
     registrations_[registrations_len_] = registration;
-    // Strictly speaking, the builtin_code is not necessary for TFLM but filling
-    // it in regardless.
+    // Strictly speaking, the builtin_code is not necessary for TFLM but
+    // filling it in regardless.
     registrations_[registrations_len_].builtin_code = op;
     registrations_len_++;
 

From ef64591270691022a329cf04ba9e73ecfb15ddb8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= <mans.nilsson@arm.com>
Date: Fri, 7 Feb 2025 04:02:49 +0100
Subject: [PATCH 4/4] Fix quant specific op registration for some ops (#2770)

BUG=Quantization-specific registrations for BatchMatmul, SVDF and LSTM were not working correctly.
---
 tensorflow/lite/micro/kernels/BUILD           |   2 +
 tensorflow/lite/micro/kernels/batch_matmul.cc |   2 +
 tensorflow/lite/micro/kernels/batch_matmul.h  |  97 ++------------
 .../lite/micro/kernels/batch_matmul_common.cc | 119 ++++++++++++++++++
 .../lite/micro/micro_mutable_op_resolver.h    |  10 +-
 tensorflow/lite/micro/tools/make/Makefile     |   1 +
 6 files changed, 140 insertions(+), 91 deletions(-)
 create mode 100644 tensorflow/lite/micro/kernels/batch_matmul_common.cc

diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD
index 8562d8bb53f..d0bf2cbc0a2 100644
--- a/tensorflow/lite/micro/kernels/BUILD
+++ b/tensorflow/lite/micro/kernels/BUILD
@@ -222,6 +222,7 @@ tflm_kernel_cc_library(
         "arg_min_max.cc",
         "assign_variable.cc",
         "batch_matmul.cc",
+        "batch_matmul_common.cc",
         "batch_to_space_nd.cc",
         "broadcast_args.cc",
         "broadcast_to.cc",
@@ -347,6 +348,7 @@ tflm_kernel_cc_library(
         "sub.h",
         "svdf.h",
         "transpose_conv.h",
+        "unidirectional_sequence_lstm.h",
     ] + select({
         xtensa_fusion_f1_config(): glob(["xtensa/**/*.h"]),
         xtensa_hifi_3_config(): glob(["xtensa/**/*.h"]),
diff --git a/tensorflow/lite/micro/kernels/batch_matmul.cc b/tensorflow/lite/micro/kernels/batch_matmul.cc
index 15112e3b4cd..bbb1c0b0a7e 100644
--- a/tensorflow/lite/micro/kernels/batch_matmul.cc
+++ b/tensorflow/lite/micro/kernels/batch_matmul.cc
@@ -24,7 +24,9 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/reference/transpose.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/micro/kernels/batch_matmul.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
 #include "tensorflow/lite/micro/micro_log.h"
 
 namespace tflite {
diff --git a/tensorflow/lite/micro/kernels/batch_matmul.h b/tensorflow/lite/micro/kernels/batch_matmul.h
index 198b1d48ead..5e811fa3782 100644
--- a/tensorflow/lite/micro/kernels/batch_matmul.h
+++ b/tensorflow/lite/micro/kernels/batch_matmul.h
@@ -16,22 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_MICRO_KERNELS_BATCH_MATMUL_H_
 #define TENSORFLOW_LITE_MICRO_KERNELS_BATCH_MATMUL_H_
 
-#include <cstdint>
-
 #include "tensorflow/lite/c/builtin_op_data.h"
-#include "tensorflow/lite/kernels/internal/reference/transpose.h"
 #include "tensorflow/lite/kernels/internal/types.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/micro/kernels/kernel_util.h"
 #include "tensorflow/lite/micro/micro_common.h"
-#include "tensorflow/lite/micro/micro_log.h"
 
 namespace tflite {
 
-extern constexpr int kBatchMatmulInputLhsTensor = 0;
-extern constexpr int kBatchMatmulInputRhsTensor = 1;
-extern constexpr int kBatchMatmulOutputTensor = 0;
-
 struct QuantizationOpDataBatchMatmul {
   // The scaling factor from input to output (aka the 'real multiplier') can
   // be represented as a fixed point multiplier plus a left shift.
@@ -59,98 +49,29 @@ struct OpDataBatchMatmul {
   bool rhs_is_constant_tensor;
 };
 
+extern const int kBatchMatmulInputLhsTensor;
+extern const int kBatchMatmulInputRhsTensor;
+extern const int kBatchMatmulOutputTensor;
+
 TfLiteStatus ReshapeOutputTensor(TfLiteContext* context, TfLiteNode* node,
                                  const RuntimeShape& extended_lhs_shape,
                                  const RuntimeShape& extended_rhs_shape,
                                  bool adj_x, bool adj_y, int output_rank,
-                                 TfLiteTensor* output) {
-  int64_t orig_size = NumElements(output);
-
-  // make sure the new output dims rank does not exceed the original rank
-  TF_LITE_ENSURE(context, output_rank <= NumDimensions(output));
-
-  // make sure output tensor dims are not in the FlatBuffer
-  TfLiteEvalTensor* output_eval =
-      tflite::micro::GetEvalOutput(context, node, kBatchMatmulOutputTensor);
-  TF_LITE_ENSURE_OK(context, tflite::micro::CreateWritableTensorDimsWithCopy(
-                                 context, output, output_eval));
-
-  // Fill in any broadcast dimensions.
-  for (int i = 0; i < output_rank - 2; ++i) {
-    const int lhs_dim = extended_lhs_shape.Dims(i);
-    const int rhs_dim = extended_rhs_shape.Dims(i);
-    int broadcast_dim = lhs_dim;
-    if ((lhs_dim != rhs_dim) && (lhs_dim == 1)) {
-      broadcast_dim = rhs_dim;
-    }
-    output->dims->data[i] = broadcast_dim;
-  }
-  // Fill in the matmul dimensions.
-  int lhs_rows_index = adj_x ? output_rank - 1 : output_rank - 2;
-  int rhs_cols_index = adj_y ? output_rank - 2 : output_rank - 1;
-
-  output->dims->data[output_rank - 2] = extended_lhs_shape.Dims(lhs_rows_index);
-  output->dims->data[output_rank - 1] = extended_rhs_shape.Dims(rhs_cols_index);
-  output->dims->size = output_rank;
-
-  // Check that output tensor has not been resized
-  // since TFLM doesn't support tensor resizing.
-  TF_LITE_ENSURE_EQ(context, orig_size, NumElements(output));
-
-  return kTfLiteOk;
-}
+                                 TfLiteTensor* output);
 
 template <typename T>
 void TransposeRowsColumnsImpl(const TfLiteEvalTensor& tensor_in,
-                              TfLiteEvalTensor* tensor_out) {
-  const T* input = tflite::micro::GetTensorData<T>(&tensor_in);
-  T* output = tflite::micro::GetTensorData<T>(tensor_out);
-  RuntimeShape transposed_shape(tflite::micro::GetTensorShape(&tensor_in));
-  RuntimeShape shape(transposed_shape);
-  TransposeParams params;
-  const int rank = shape.DimensionsCount();
-  params.perm_count = rank;
-  for (int i = 0; i < rank - 2; ++i) {
-    params.perm[i] = i;
-  }
-  // Transpose the last two dimensions.
-  params.perm[rank - 2] = rank - 1;
-  params.perm[rank - 1] = rank - 2;
-  transposed_shape.SetDim(rank - 1, shape.Dims(rank - 2));
-  transposed_shape.SetDim(rank - 2, shape.Dims(rank - 1));
-  reference_ops::Transpose(params, shape, input, transposed_shape, output);
-}
+                              TfLiteEvalTensor* tensor_out);
 
 TfLiteStatus TransposeRowsColumns(const TfLiteEvalTensor& tensor_in,
-                                  TfLiteEvalTensor* tensor_out) {
-  if (tensor_in.type == kTfLiteFloat32) {
-    TransposeRowsColumnsImpl<float>(tensor_in, tensor_out);
-    return kTfLiteOk;
-  } else if (tensor_in.type == kTfLiteInt8) {
-    TransposeRowsColumnsImpl<int8_t>(tensor_in, tensor_out);
-    return kTfLiteOk;
-  } else if (tensor_in.type == kTfLiteInt16) {
-    TransposeRowsColumnsImpl<int16_t>(tensor_in, tensor_out);
-    return kTfLiteOk;
-  } else {
-    MicroPrintf(
-        "BATCH_MATMUL can only transpose tensors with FLOAT32, INT8, INT16 "
-        "type.");
-  }
-  return kTfLiteError;
-}
+                                  TfLiteEvalTensor* tensor_out);
 
-RuntimeShape SwapRowColumnDims(const RuntimeShape& shape) {
-  RuntimeShape swapped_shape(shape);
-  const int32_t dims = shape.DimensionsCount();
-  swapped_shape.SetDim(dims - 2, shape.Dims(dims - 1));
-  swapped_shape.SetDim(dims - 1, shape.Dims(dims - 2));
-  return swapped_shape;
-}
+RuntimeShape SwapRowColumnDims(const RuntimeShape& shape);
 
 TFLMRegistration Register_BATCH_MATMUL();
 
 #if defined(CMSIS_NN)
+
 // Returns a TFLMRegistration struct for kernel variant that only supports
 // int8 matrix multiplication and uses the latency optimized
 // implementations.
diff --git a/tensorflow/lite/micro/kernels/batch_matmul_common.cc b/tensorflow/lite/micro/kernels/batch_matmul_common.cc
new file mode 100644
index 00000000000..1447cd489e9
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/batch_matmul_common.cc
@@ -0,0 +1,119 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+
+#include "tensorflow/lite/kernels/internal/reference/transpose.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/batch_matmul.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_log.h"
+
+namespace tflite {
+
+const int kBatchMatmulInputLhsTensor = 0;  // presumably LHS input index
+const int kBatchMatmulInputRhsTensor = 1;  // presumably RHS input index
+const int kBatchMatmulOutputTensor = 0;    // output index for GetEvalOutput
+
+TfLiteStatus ReshapeOutputTensor(TfLiteContext* context, TfLiteNode* node,
+                                 const RuntimeShape& extended_lhs_shape,
+                                 const RuntimeShape& extended_rhs_shape,
+                                 bool adj_x, bool adj_y, int output_rank,
+                                 TfLiteTensor* output) {
+  // Rewrites output->dims in place from the extended operand shapes. Fails if
+  // output_rank exceeds the current rank or the element count would change.
+  const int64_t orig_size = NumElements(output);
+  TF_LITE_ENSURE(context, output_rank <= NumDimensions(output));
+
+  // The dims must not live in the FlatBuffer before they are written to.
+  TF_LITE_ENSURE_OK(
+      context,
+      tflite::micro::CreateWritableTensorDimsWithCopy(
+          context, output,
+          tflite::micro::GetEvalOutput(context, node,
+                                       kBatchMatmulOutputTensor)));
+
+  int* const out_dims = output->dims->data;
+  const int row_slot = output_rank - 2;
+  const int col_slot = output_rank - 1;
+
+  // Batch dims: where the lhs extent is 1 the rhs extent is used; otherwise
+  // the lhs extent is kept.
+  for (int axis = 0; axis < row_slot; ++axis) {
+    const int lhs_extent = extended_lhs_shape.Dims(axis);
+    const int rhs_extent = extended_rhs_shape.Dims(axis);
+    out_dims[axis] = (lhs_extent == 1) ? rhs_extent : lhs_extent;
+  }
+
+  // Matmul dims: rows from lhs, columns from rhs, honoring adj_x / adj_y.
+  out_dims[row_slot] = extended_lhs_shape.Dims(adj_x ? col_slot : row_slot);
+  out_dims[col_slot] = extended_rhs_shape.Dims(adj_y ? row_slot : col_slot);
+  output->dims->size = output_rank;
+
+  // TFLM cannot resize tensors, so the element count must be unchanged.
+  TF_LITE_ENSURE_EQ(context, orig_size, NumElements(output));
+
+  return kTfLiteOk;
+}
+
+template <typename T>
+void TransposeRowsColumnsImpl(const TfLiteEvalTensor& tensor_in,
+                              TfLiteEvalTensor* tensor_out) {
+  // Transposes tensor_in into tensor_out, exchanging the last two dimensions.
+  const RuntimeShape src_shape = tflite::micro::GetTensorShape(&tensor_in);
+  const int row_axis = src_shape.DimensionsCount() - 2;
+  const int col_axis = row_axis + 1;
+  TransposeParams params;
+  params.perm_count = src_shape.DimensionsCount();
+  // Leading axes keep their position; only the trailing pair is swapped.
+  for (int axis = 0; axis < row_axis; ++axis) params.perm[axis] = axis;
+  params.perm[row_axis] = col_axis;
+  params.perm[col_axis] = row_axis;
+  RuntimeShape dst_shape(src_shape);
+  dst_shape.SetDim(row_axis, src_shape.Dims(col_axis));
+  dst_shape.SetDim(col_axis, src_shape.Dims(row_axis));
+  reference_ops::Transpose(
+      params, src_shape, tflite::micro::GetTensorData<T>(&tensor_in),
+      dst_shape, tflite::micro::GetTensorData<T>(tensor_out));
+}
+
+TfLiteStatus TransposeRowsColumns(const TfLiteEvalTensor& tensor_in,
+                                  TfLiteEvalTensor* tensor_out) {
+  const TfLiteType type = tensor_in.type;  // element type to dispatch on
+  if (type != kTfLiteFloat32 && type != kTfLiteInt8 && type != kTfLiteInt16) {
+    MicroPrintf(
+        "BATCH_MATMUL can only transpose tensors with FLOAT32, INT8, INT16 "
+        "type.");
+    return kTfLiteError;
+  }
+  if (type == kTfLiteFloat32) {
+    TransposeRowsColumnsImpl<float>(tensor_in, tensor_out);
+  } else if (type == kTfLiteInt8) {
+    TransposeRowsColumnsImpl<int8_t>(tensor_in, tensor_out);
+  } else {  // kTfLiteInt16, the only type left after the guard above
+    TransposeRowsColumnsImpl<int16_t>(tensor_in, tensor_out);
+  }
+  return kTfLiteOk;
+}
+
+RuntimeShape SwapRowColumnDims(const RuntimeShape& shape) {
+  const int32_t last = shape.DimensionsCount() - 1;
+  RuntimeShape result(shape);  // copy; only the last two dims are changed
+  result.SetDim(last, shape.Dims(last - 1));
+  result.SetDim(last - 1, shape.Dims(last));
+  return result;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h
index ad642ddbc06..f3f2080f0aa 100644
--- a/tensorflow/lite/micro/micro_mutable_op_resolver.h
+++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/op_macros.h"
 #include "tensorflow/lite/micro/compatibility.h"
 #include "tensorflow/lite/micro/kernels/add.h"
+#include "tensorflow/lite/micro/kernels/batch_matmul.h"
 #include "tensorflow/lite/micro/kernels/conv.h"
 #include "tensorflow/lite/micro/kernels/depthwise_conv.h"
 #include "tensorflow/lite/micro/kernels/ethosu.h"
@@ -34,7 +35,9 @@ limitations under the License.
 #include "tensorflow/lite/micro/kernels/pooling.h"
 #include "tensorflow/lite/micro/kernels/reduce.h"
 #include "tensorflow/lite/micro/kernels/softmax.h"
+#include "tensorflow/lite/micro/kernels/svdf.h"
 #include "tensorflow/lite/micro/kernels/transpose_conv.h"
+#include "tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.h"
 #include "tensorflow/lite/micro/micro_log.h"
 #include "tensorflow/lite/micro/micro_op_resolver.h"
 #include "tensorflow/lite/schema/schema_generated.h"
@@ -146,9 +149,10 @@ class MicroMutableOpResolver : public MicroOpResolver {
     return AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, registration, ParsePool);
   }
 
-  TfLiteStatus AddBatchMatMul() {
-    return AddBuiltin(BuiltinOperator_BATCH_MATMUL,
-                      tflite::Register_BATCH_MATMUL(), ParseBatchMatMul);
+  TfLiteStatus AddBatchMatMul(
+      const TFLMRegistration& registration = Register_BATCH_MATMUL()) {
+    return AddBuiltin(BuiltinOperator_BATCH_MATMUL, registration,
+                      ParseBatchMatMul);  // parser is fixed; kernel may vary
   }
 
   TfLiteStatus AddBatchToSpaceNd() {
diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile
index e6912e91705..1b4f9d4bf2c 100644
--- a/tensorflow/lite/micro/tools/make/Makefile
+++ b/tensorflow/lite/micro/tools/make/Makefile
@@ -365,6 +365,7 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/add_n.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/arg_min_max.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/assign_variable.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/batch_matmul.cc \
+$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/batch_matmul_common.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/batch_to_space_nd.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/broadcast_args.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/broadcast_to.cc \