From d1d620be24b814989b71a58ded8012b1733a0510 Mon Sep 17 00:00:00 2001 From: Adrian Lundell Date: Wed, 20 Mar 2024 15:32:38 +0100 Subject: [PATCH 1/2] Add CMSIS-NN int8 and int16 batch matmul * Moves some common functions with ref to new header file * Creates new cmsis_nn batch_matmul.cc Signed-off-by: Ryan O'Shea Co-authored-by: Adrian Lundell Change-Id: I88de8284bb10abaebf7b649436c5375474e1d44d --- tensorflow/lite/micro/kernels/BUILD | 1 + tensorflow/lite/micro/kernels/batch_matmul.cc | 119 +--- tensorflow/lite/micro/kernels/batch_matmul.h | 175 ++++++ .../micro/kernels/cmsis_nn/batch_matmul.cc | 530 ++++++++++++++++++ .../tools/make/ext_libs/cmsis_nn_download.sh | 4 +- 5 files changed, 730 insertions(+), 99 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/batch_matmul.h create mode 100644 tensorflow/lite/micro/kernels/cmsis_nn/batch_matmul.cc diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD index f2ccb067a36..29a369eda33 100644 --- a/tensorflow/lite/micro/kernels/BUILD +++ b/tensorflow/lite/micro/kernels/BUILD @@ -300,6 +300,7 @@ tflm_kernel_cc_library( hdrs = [ "activations.h", "add.h", + "batch_matmul.h", "circular_buffer.h", "conv.h", "depthwise_conv.h", diff --git a/tensorflow/lite/micro/kernels/batch_matmul.cc b/tensorflow/lite/micro/kernels/batch_matmul.cc index bd621f4c2cb..4a242054df4 100644 --- a/tensorflow/lite/micro/kernels/batch_matmul.cc +++ b/tensorflow/lite/micro/kernels/batch_matmul.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,60 +24,31 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/transpose.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/types.h" -#include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/batch_matmul.h" #include "tensorflow/lite/micro/micro_log.h" namespace tflite { namespace { -constexpr int kInputLhsTensor = 0; -constexpr int kInputRhsTensor = 1; -constexpr int kOutputTensor = 0; - -struct QuantizationOpData { - // The scaling factor from input to output (aka the 'real multiplier') can - // be represented as a fixed point multiplier plus a left shift. - int32_t output_multiplier; - int output_shift; // exponent - - // The range of the fused activation layer. For example for kNone and - // int8_t these would be -128 and 127. 
- int32_t output_activation_min; - int32_t output_activation_max; - - int32_t lhs_zero_point; - int32_t rhs_zero_point; - int32_t output_zero_point; -}; - -struct OpData { - QuantizationOpData* quantization; - - // Transpose tensors and state - TfLiteEvalTensor* lhs_transposed_tensor; - TfLiteEvalTensor* rhs_transposed_tensor; - bool rhs_is_transposed; - bool lhs_is_constant_tensor; - bool rhs_is_constant_tensor; -}; - struct OpContext { OpContext(TfLiteContext* context, TfLiteNode* node) : params(static_cast(node->builtin_data)), - op_data(static_cast(node->user_data)) {} + op_data(static_cast(node->user_data)) {} TfLiteBatchMatMulParams* params; - OpData* op_data; + OpDataBatchMatmul* op_data; }; struct PrepareOpContext : OpContext { PrepareOpContext(TfLiteContext* context, TfLiteNode* node) : OpContext(context, node), micro_context_(GetMicroContext(context)), - lhs(micro_context_->AllocateTempInputTensor(node, kInputLhsTensor)), - rhs(micro_context_->AllocateTempInputTensor(node, kInputRhsTensor)), - output(micro_context_->AllocateTempOutputTensor(node, kOutputTensor)) {} + lhs(micro_context_->AllocateTempInputTensor( + node, kBatchMatmulInputLhsTensor)), + rhs(micro_context_->AllocateTempInputTensor( + node, kBatchMatmulInputRhsTensor)), + output(micro_context_->AllocateTempOutputTensor( + node, kBatchMatmulOutputTensor)) {} ~PrepareOpContext() { if (lhs != nullptr) { @@ -103,56 +74,18 @@ struct PrepareOpContext : OpContext { struct EvalOpContext : OpContext { EvalOpContext(TfLiteContext* context, TfLiteNode* node) : OpContext(context, node), - lhs(tflite::micro::GetEvalInput(context, node, kInputLhsTensor)), - rhs(tflite::micro::GetEvalInput(context, node, kInputRhsTensor)), - output(tflite::micro::GetEvalOutput(context, node, kOutputTensor)) {} + lhs(tflite::micro::GetEvalInput(context, node, + kBatchMatmulInputLhsTensor)), + rhs(tflite::micro::GetEvalInput(context, node, + kBatchMatmulInputRhsTensor)), + output(tflite::micro::GetEvalOutput(context, node, + kBatchMatmulOutputTensor)) {} const TfLiteEvalTensor* lhs; const TfLiteEvalTensor* rhs; TfLiteEvalTensor* output; }; -TfLiteStatus ReshapeOutputTensor(TfLiteContext* context, TfLiteNode* node, - const RuntimeShape& extended_lhs_shape, - const RuntimeShape& extended_rhs_shape, - bool adj_x, bool adj_y, int output_rank, - TfLiteTensor* output) { - int64_t orig_size = NumElements(output); - - // make sure the new output dims rank does not exceed the original rank - TF_LITE_ENSURE(context, output_rank <= NumDimensions(output)); - - // make sure output tensor dims are not in the FlatBuffer - TfLiteEvalTensor* output_eval = - tflite::micro::GetEvalOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_OK(context, tflite::micro::CreateWritableTensorDimsWithCopy( - context, output, output_eval)); - - // Fill in any broadcast dimensions. - for (int i = 0; i < output_rank - 2; ++i) { - const int lhs_dim = extended_lhs_shape.Dims(i); - const int rhs_dim = extended_rhs_shape.Dims(i); - int broadcast_dim = lhs_dim; - if ((lhs_dim != rhs_dim) && (lhs_dim == 1)) { - broadcast_dim = rhs_dim; - } - output->dims->data[i] = broadcast_dim; - } - // Fill in the matmul dimensions. - int lhs_rows_index = adj_x ? output_rank - 1 : output_rank - 2; - int rhs_cols_index = adj_y ? 
output_rank - 2 : output_rank - 1; - - output->dims->data[output_rank - 2] = extended_lhs_shape.Dims(lhs_rows_index); - output->dims->data[output_rank - 1] = extended_rhs_shape.Dims(rhs_cols_index); - output->dims->size = output_rank; - - // Check that output tensor has not been resized - // since TFLM doesn't support tensor resizing. - TF_LITE_ENSURE_EQ(context, orig_size, NumElements(output)); - - return kTfLiteOk; -} - TfLiteEvalTensor* AllocInitTransposeTensorFromTfLiteTensor( TfLiteContext* context, const TfLiteTensor& tensor) { MicroContext* micro_context = GetMicroContext(context); @@ -195,7 +128,7 @@ TfLiteEvalTensor* AllocInitTransposeTensorFromTfLiteTensor( // Allocate normal quantization data if needed. TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node, const PrepareOpContext& op_context) { - OpData* op_data = op_context.op_data; + OpDataBatchMatmul* op_data = op_context.op_data; const TfLiteTensor* lhs = op_context.lhs; const TfLiteTensor* rhs = op_context.rhs; MicroContext* micro_context = GetMicroContext(context); @@ -271,14 +204,6 @@ TfLiteStatus TransposeRowsColumns(const TfLiteEvalTensor& tensor_in, return kTfLiteError; } -RuntimeShape SwapRowColumnDims(const RuntimeShape& shape) { - RuntimeShape swapped_shape(shape); - const int32_t dims = shape.DimensionsCount(); - swapped_shape.SetDim(dims - 2, shape.Dims(dims - 1)); - swapped_shape.SetDim(dims - 1, shape.Dims(dims - 2)); - return swapped_shape; -} - void* BatchMatMulInit(TfLiteContext* context, const char* buffer, size_t length) { // This is a builtin op, so we don't use the contents in 'buffer', if any. @@ -286,7 +211,7 @@ void* BatchMatMulInit(TfLiteContext* context, const char* buffer, // Eval(). TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); MicroContext* micro_context = GetMicroContext(context); - return micro_context->AllocatePersistentBuffer(sizeof(OpData)); + return micro_context->AllocatePersistentBuffer(sizeof(OpDataBatchMatmul)); } TfLiteStatus BatchMatMulPrepare(TfLiteContext* context, TfLiteNode* node) { @@ -323,7 +248,7 @@ TfLiteStatus BatchMatMulPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, op_context)); - OpData* op_data = op_context.op_data; + OpDataBatchMatmul* op_data = op_context.op_data; // If the RHS is constant, we only transpose once. op_data->rhs_is_transposed = false; op_data->lhs_is_constant_tensor = IsConstantTensor(lhs_data); @@ -393,7 +318,7 @@ TfLiteStatus BatchMatMulPrepare(TfLiteContext* context, TfLiteNode* node) { return status; } -TfLiteStatus EvalInt8(TfLiteContext* context, const OpData& data, +TfLiteStatus EvalInt8(TfLiteContext* context, const OpDataBatchMatmul& data, const RuntimeShape& lhs_shape, const TfLiteEvalTensor& lhs, const RuntimeShape& rhs_shape, @@ -423,7 +348,7 @@ TfLiteStatus EvalInt8(TfLiteContext* context, const OpData& data, return kTfLiteOk; } -TfLiteStatus EvalInt16(TfLiteContext* context, const OpData& data, +TfLiteStatus EvalInt16(TfLiteContext* context, const OpDataBatchMatmul& data, const RuntimeShape& lhs_shape, const TfLiteEvalTensor& lhs, const RuntimeShape& rhs_shape, @@ -466,7 +391,7 @@ TfLiteStatus EvalInt16(TfLiteContext* context, const OpData& data, // A X C row-oriented. 
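Both the reference kernel above and the CMSIS-NN kernel below fold the LHS, RHS and output scales into the single output_multiplier/output_shift pair stored in QuantizationOpDataBatchMatmul (QuantizeMultiplier in Prepare). The following standalone sketch shows that decomposition with made-up scale values; DecomposeMultiplier is a hypothetical stand-in for QuantizeMultiplier, not part of this patch.

    // Sketch only: decompose a real multiplier into a Q31 multiplier plus shift,
    // i.e. real_multiplier ~= (quantized_multiplier / 2^31) * 2^shift.
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    void DecomposeMultiplier(double real_multiplier,
                             int32_t* quantized_multiplier, int* shift) {
      if (real_multiplier == 0.0) {
        *quantized_multiplier = 0;
        *shift = 0;
        return;
      }
      // frexp gives real_multiplier == q * 2^shift with q in [0.5, 1).
      const double q = std::frexp(real_multiplier, shift);
      int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
      if (q_fixed == (1ll << 31)) {  // Rounding can push q up to exactly 2^31.
        q_fixed /= 2;
        ++*shift;
      }
      *quantized_multiplier = static_cast<int32_t>(q_fixed);
    }

    int main() {
      const double lhs_scale = 0.02;     // made-up int8 LHS scale
      const double rhs_scale = 0.01;     // made-up int8 RHS scale
      const double output_scale = 0.05;  // made-up int8 output scale
      const double real_multiplier = lhs_scale * rhs_scale / output_scale;

      int32_t output_multiplier = 0;
      int output_shift = 0;
      DecomposeMultiplier(real_multiplier, &output_multiplier, &output_shift);
      // For these scales: multiplier ~= 1099511628 (0.512 in Q31), shift = -7.
      std::printf("multiplier=%ld shift=%d\n",
                  static_cast<long>(output_multiplier), output_shift);
      return 0;
    }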
TfLiteStatus BatchMatMulEval(TfLiteContext* context, TfLiteNode* node) { EvalOpContext op_context(context, node); - OpData* op_data = op_context.op_data; + OpDataBatchMatmul* op_data = op_context.op_data; const TfLiteEvalTensor* lhs = op_context.lhs; const TfLiteEvalTensor* rhs = op_context.rhs; TfLiteEvalTensor* output = op_context.output; diff --git a/tensorflow/lite/micro/kernels/batch_matmul.h b/tensorflow/lite/micro/kernels/batch_matmul.h new file mode 100644 index 00000000000..f4934534b52 --- /dev/null +++ b/tensorflow/lite/micro/kernels/batch_matmul.h @@ -0,0 +1,175 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_BATCH_MATMUL_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_BATCH_MATMUL_H_ + +#include + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/kernels/internal/reference/transpose.h" +#include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_common.h" + +namespace tflite { + +extern constexpr int kBatchMatmulInputLhsTensor = 0; +extern constexpr int kBatchMatmulInputRhsTensor = 1; +extern constexpr int kBatchMatmulOutputTensor = 0; + +struct QuantizationOpDataBatchMatmul { + // The scaling factor from input to output (aka the 'real multiplier') can + // be represented as a fixed point multiplier plus a left shift. + int32_t output_multiplier; + int output_shift; // exponent + + // The range of the fused activation layer. For example for kNone and + // int8_t these would be -128 and 127. + int32_t output_activation_min; + int32_t output_activation_max; + + int32_t lhs_zero_point; + int32_t rhs_zero_point; + int32_t output_zero_point; +}; + +struct OpDataBatchMatmul { + QuantizationOpDataBatchMatmul* quantization; + + // Transpose tensors and state + TfLiteEvalTensor* lhs_transposed_tensor; + TfLiteEvalTensor* rhs_transposed_tensor; + bool rhs_is_transposed; + bool lhs_is_constant_tensor; + bool rhs_is_constant_tensor; +}; + +TfLiteStatus ReshapeOutputTensor(TfLiteContext* context, TfLiteNode* node, + const RuntimeShape& extended_lhs_shape, + const RuntimeShape& extended_rhs_shape, + bool adj_x, bool adj_y, int output_rank, + TfLiteTensor* output) { + int64_t orig_size = NumElements(output); + + // make sure the new output dims rank does not exceed the original rank + TF_LITE_ENSURE(context, output_rank <= NumDimensions(output)); + + // make sure output tensor dims are not in the FlatBuffer + TfLiteEvalTensor* output_eval = + tflite::micro::GetEvalOutput(context, node, kBatchMatmulOutputTensor); + TF_LITE_ENSURE_OK(context, tflite::micro::CreateWritableTensorDimsWithCopy( + context, output, output_eval)); + + // Fill in any broadcast dimensions. 
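As a concrete illustration of the shape rule implemented by the loop that follows (batch dimensions broadcast 1-vs-N, the two trailing dimensions come from the LHS rows and RHS columns), here is a minimal standalone sketch; the shapes are made up for the example.

    // Standalone sketch of ReshapeOutputTensor's shape logic, adj_x == adj_y == false.
    #include <cstdio>

    int main() {
      const int kRank = 4;
      const int lhs[kRank] = {2, 1, 3, 4};  // extended LHS shape [..., rows, depth]
      const int rhs[kRank] = {1, 5, 4, 6};  // extended RHS shape [..., depth, cols]
      int out[kRank] = {};

      // Batch dims: keep the LHS dim unless it is 1 and differs from the RHS dim.
      for (int i = 0; i < kRank - 2; ++i) {
        out[i] = (lhs[i] != rhs[i] && lhs[i] == 1) ? rhs[i] : lhs[i];
      }
      // Matmul dims: LHS rows and RHS columns.
      out[kRank - 2] = lhs[kRank - 2];
      out[kRank - 1] = rhs[kRank - 1];

      // Prints "output shape: [2, 5, 3, 6]" for these example shapes.
      std::printf("output shape: [%d, %d, %d, %d]\n", out[0], out[1], out[2],
                  out[3]);
      return 0;
    }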
+  for (int i = 0; i < output_rank - 2; ++i) {
+    const int lhs_dim = extended_lhs_shape.Dims(i);
+    const int rhs_dim = extended_rhs_shape.Dims(i);
+    int broadcast_dim = lhs_dim;
+    if ((lhs_dim != rhs_dim) && (lhs_dim == 1)) {
+      broadcast_dim = rhs_dim;
+    }
+    output->dims->data[i] = broadcast_dim;
+  }
+  // Fill in the matmul dimensions.
+  int lhs_rows_index = adj_x ? output_rank - 1 : output_rank - 2;
+  int rhs_cols_index = adj_y ? output_rank - 2 : output_rank - 1;
+
+  output->dims->data[output_rank - 2] = extended_lhs_shape.Dims(lhs_rows_index);
+  output->dims->data[output_rank - 1] = extended_rhs_shape.Dims(rhs_cols_index);
+  output->dims->size = output_rank;
+
+  // Check that output tensor has not been resized
+  // since TFLM doesn't support tensor resizing.
+  TF_LITE_ENSURE_EQ(context, orig_size, NumElements(output));
+
+  return kTfLiteOk;
+}
+
+template <typename T>
+void TransposeRowsColumnsImpl(const TfLiteEvalTensor& tensor_in,
+                              TfLiteEvalTensor* tensor_out) {
+  const T* input = tflite::micro::GetTensorData<T>(&tensor_in);
+  T* output = tflite::micro::GetTensorData<T>(tensor_out);
+  RuntimeShape transposed_shape(tflite::micro::GetTensorShape(&tensor_in));
+  RuntimeShape shape(transposed_shape);
+  TransposeParams params;
+  const int rank = shape.DimensionsCount();
+  params.perm_count = rank;
+  for (int i = 0; i < rank - 2; ++i) {
+    params.perm[i] = i;
+  }
+  // Transpose the last two dimensions.
+  params.perm[rank - 2] = rank - 1;
+  params.perm[rank - 1] = rank - 2;
+  transposed_shape.SetDim(rank - 1, shape.Dims(rank - 2));
+  transposed_shape.SetDim(rank - 2, shape.Dims(rank - 1));
+  reference_ops::Transpose(params, shape, input, transposed_shape, output);
+}
+
+TfLiteStatus TransposeRowsColumns(const TfLiteEvalTensor& tensor_in,
+                                  TfLiteEvalTensor* tensor_out) {
+  if (tensor_in.type == kTfLiteFloat32) {
+    TransposeRowsColumnsImpl<float>(tensor_in, tensor_out);
+    return kTfLiteOk;
+  } else if (tensor_in.type == kTfLiteInt8) {
+    TransposeRowsColumnsImpl<int8_t>(tensor_in, tensor_out);
+    return kTfLiteOk;
+  } else if (tensor_in.type == kTfLiteInt16) {
+    TransposeRowsColumnsImpl<int16_t>(tensor_in, tensor_out);
+    return kTfLiteOk;
+  } else {
+    MicroPrintf(
+        "BATCH_MATMUL can only transpose tensors with FLOAT32, INT8, INT16 "
+        "type.");
+  }
+  return kTfLiteError;
+}
+
+RuntimeShape SwapRowColumnDims(const RuntimeShape& shape) {
+  RuntimeShape swapped_shape(shape);
+  const int32_t dims = shape.DimensionsCount();
+  swapped_shape.SetDim(dims - 2, shape.Dims(dims - 1));
+  swapped_shape.SetDim(dims - 1, shape.Dims(dims - 2));
+  return swapped_shape;
+}
+
+TFLMRegistration Register_BATCH_MATMUL();
+
+#if defined(CMSIS_NN)
+// Returns a TFLMRegistration struct for kernel variant that only supports
+// int8 matrix multiplication and uses the latency optimized
+// implementations.
+TFLMRegistration Register_BATCH_MATMUL_INT8();
+
+// Returns a TFLMRegistration struct for kernel variant that only supports
+// int16 matrix multiplication and uses the latency optimized
+// implementations.
+TFLMRegistration Register_BATCH_MATMUL_INT16(); + +#else +inline TFLMRegistration Register_BATCH_MATMUL_INT8() { + return Register_BATCH_MATMUL(); +} + +inline TFLMRegistration Register_BATCH_MATMUL_INT16() { + return Register_BATCH_MATMUL(); +} +#endif // defined(CMSIS_NN) + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_KERNELS_BATCH_MATMUL_H_ diff --git a/tensorflow/lite/micro/kernels/cmsis_nn/batch_matmul.cc b/tensorflow/lite/micro/kernels/cmsis_nn/batch_matmul.cc new file mode 100644 index 00000000000..f73ceed57c2 --- /dev/null +++ b/tensorflow/lite/micro/kernels/cmsis_nn/batch_matmul.cc @@ -0,0 +1,530 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/batch_matmul.h" + +#include "Include/arm_nnfunctions.h" +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/batch_matmul.h" +#include "tensorflow/lite/kernels/internal/reference/transpose.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_arena_constants.h" +#include "tensorflow/lite/micro/micro_log.h" + +namespace tflite { +namespace { + +struct OpData { + OpDataBatchMatmul reference_op_data; + + cmsis_nn_dims output_shape; + + int buffer_idx; +}; + +cmsis_nn_dims FillVariableShape(int32_t rank, int32_t* tensor_dims) { + if (rank == 4) { + return {tensor_dims[0], tensor_dims[1], tensor_dims[2], tensor_dims[3]}; + } else if (rank == 3) { + return {1, tensor_dims[0], tensor_dims[1], tensor_dims[2]}; + } else if (rank == 2) { + return {1, 1, tensor_dims[0], tensor_dims[1]}; + } else { + return {1, 1, 1, 1}; + } +} + +inline TfLiteStatus PopulateEvalData( + TfLiteContext* context, OpData* data, const TfLiteBatchMatMulParams* params, + TfLiteNode* node, const TfLiteEvalTensor* original_lhs_input, + RuntimeShape* lhs_shape, TfLiteEvalTensor** updated_lhs_input, + const TfLiteEvalTensor* original_rhs_input, RuntimeShape* rhs_shape, + TfLiteEvalTensor** updated_rhs_input, const TfLiteEvalTensor* output) { + RuntimeShape orig_out_shape = tflite::micro::GetTensorShape(output); + + *updated_rhs_input = params->adj_y + ? const_cast(original_rhs_input) + : data->reference_op_data.rhs_transposed_tensor; + *updated_lhs_input = params->adj_x + ? 
data->reference_op_data.lhs_transposed_tensor + : const_cast(original_lhs_input); + + TF_LITE_ENSURE(context, *updated_rhs_input != nullptr); + TF_LITE_ENSURE(context, *updated_lhs_input != nullptr); + if (!params->adj_y) { + // TODO(b/154760341): Constant tensors should already be transposed, but + // we transpose once if necessary for now. + if (!(data->reference_op_data.rhs_is_constant_tensor && + data->reference_op_data.rhs_is_transposed)) { + TransposeRowsColumns(*original_rhs_input, *updated_rhs_input); + data->reference_op_data.rhs_is_transposed = true; + } + } + if (params->adj_x) { + TransposeRowsColumns(*original_lhs_input, *updated_lhs_input); + } + + // Compress BatchMatMul when third from last RHS dimension is one. + int32_t rhs_dims_count = rhs_shape->DimensionsCount(); + int32_t lhs_dims_count = lhs_shape->DimensionsCount(); + int32_t out_dims_count = orig_out_shape.DimensionsCount(); + // Compress ops where rhs shape is [..., 1, X, Y] and lhs shape is + // [..., Q, R, S] which is equivalent to rhs: [..., X, Y] and + // lhs: [..., Q * R, S]. + if (rhs_dims_count > 2 && lhs_dims_count > 2) { + int rhs_one = rhs_shape->DimsData()[rhs_dims_count - 3]; + if (rhs_one == 1) { + int32_t* lhs_dims = lhs_shape->DimsData(); + int32_t* rhs_dims = rhs_shape->DimsData(); + int32_t* out_dims = orig_out_shape.DimsData(); + RuntimeShape tmp_l(lhs_dims_count - 1, lhs_dims); + tmp_l.SetDim(lhs_dims_count - 3, + lhs_dims[lhs_dims_count - 3] * lhs_dims[lhs_dims_count - 2]); + tmp_l.SetDim(lhs_dims_count - 2, lhs_dims[lhs_dims_count - 1]); + lhs_shape->ReplaceWith(tmp_l.DimensionsCount(), tmp_l.DimsData()); + RuntimeShape tmp_r(rhs_dims_count - 1, rhs_shape->DimsData()); + tmp_r.SetDim(rhs_dims_count - 3, rhs_dims[rhs_dims_count - 2]); + tmp_r.SetDim(rhs_dims_count - 2, rhs_dims[rhs_dims_count - 1]); + rhs_shape->ReplaceWith(tmp_r.DimensionsCount(), tmp_r.DimsData()); + rhs_dims_count = rhs_shape->DimensionsCount(); + lhs_dims_count = lhs_shape->DimensionsCount(); + + RuntimeShape tmp_o(out_dims_count - 1, out_dims); + tmp_o.SetDim(out_dims_count - 3, lhs_shape->Dims(lhs_dims_count - 2)); + tmp_o.SetDim(out_dims_count - 2, orig_out_shape.Dims(out_dims_count - 1)); + orig_out_shape.ReplaceWith(tmp_o.DimensionsCount(), tmp_o.DimsData()); + out_dims_count = orig_out_shape.DimensionsCount(); + data->output_shape = + FillVariableShape(out_dims_count, orig_out_shape.DimsData()); + } + } + + if (!params->adj_y) { + RuntimeShape tmp_r = SwapRowColumnDims(*rhs_shape); + rhs_shape->ReplaceWith(tmp_r.DimensionsCount(), tmp_r.DimsData()); + } + if (!params->adj_x) { + RuntimeShape tmp_l = SwapRowColumnDims(*lhs_shape); + lhs_shape->ReplaceWith(tmp_l.DimensionsCount(), tmp_l.DimsData()); + } + + return kTfLiteOk; +} + +TfLiteEvalTensor* AllocInitTransposeTensorFromTfLiteTensor( + TfLiteContext* context, MicroContext* micro_context, + const TfLiteTensor& tensor) { + TfLiteEvalTensor* eval_tensor = static_cast( + micro_context->AllocatePersistentBuffer(sizeof(TfLiteEvalTensor))); + if (eval_tensor == nullptr) { + return nullptr; + } + + eval_tensor->type = tensor.type; + + const int tensor_rank = NumDimensions(&tensor); + const size_t eval_dims_size = TfLiteIntArrayGetSizeInBytes(tensor_rank); + eval_tensor->dims = static_cast( + micro_context->AllocatePersistentBuffer(eval_dims_size)); + if (eval_tensor->dims == nullptr) { + return nullptr; + } + eval_tensor->dims->size = tensor_rank; + for (int i = 0; i < tensor_rank - 2; ++i) { + eval_tensor->dims->data[i] = tensor.dims->data[i]; + } + // Swap last two 
dimensions. + eval_tensor->dims->data[tensor_rank - 2] = tensor.dims->data[tensor_rank - 1]; + eval_tensor->dims->data[tensor_rank - 1] = tensor.dims->data[tensor_rank - 2]; + + const size_t eval_data_size = static_cast(NumElements(&tensor)) * + TfLiteTypeGetSize(tensor.type); + eval_tensor->data.data = + micro_context->AllocatePersistentBuffer(eval_data_size); + if (eval_tensor->data.data == nullptr) { + return nullptr; + } + + return eval_tensor; +} + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(OpData)); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + OpData* data = static_cast(node->user_data); + const auto params = + static_cast(node->builtin_data); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* lhs_input = + micro_context->AllocateTempInputTensor(node, kBatchMatmulInputLhsTensor); + TF_LITE_ENSURE(context, lhs_input != nullptr); + TfLiteTensor* rhs_input = + micro_context->AllocateTempInputTensor(node, kBatchMatmulInputRhsTensor); + TF_LITE_ENSURE(context, rhs_input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kBatchMatmulOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + + TF_LITE_ENSURE_TYPES_EQ(context, lhs_input->type, rhs_input->type); + TF_LITE_ENSURE_EQ(context, lhs_input->type, output->type); + TF_LITE_ENSURE_MSG(context, + lhs_input->type == kTfLiteFloat32 || + lhs_input->type == kTfLiteInt16 || + lhs_input->type == kTfLiteInt8, + "Input data type not supported"); + + const int lhs_rank = NumDimensions(lhs_input); + const int rhs_rank = NumDimensions(rhs_input); + + TF_LITE_ENSURE(context, lhs_rank >= 2); + TF_LITE_ENSURE(context, lhs_rank <= 4); + TF_LITE_ENSURE(context, rhs_rank >= 2); + TF_LITE_ENSURE(context, rhs_rank <= 4); + + data->reference_op_data.rhs_is_transposed = false; + data->reference_op_data.lhs_is_constant_tensor = IsConstantTensor(lhs_input); + data->reference_op_data.rhs_is_constant_tensor = IsConstantTensor(rhs_input); + + const int output_rank = std::max(lhs_rank, rhs_rank); + TFLITE_DCHECK_GE(output_rank, 2); + TFLITE_DCHECK_LE(output_rank, 4); + + const RuntimeShape extended_lhs_shape = + RuntimeShape::ExtendedShape(output_rank, GetTensorShape(lhs_input)); + const RuntimeShape extended_rhs_shape = + RuntimeShape::ExtendedShape(output_rank, GetTensorShape(rhs_input)); + + // Ensure any batch dimensions obey broacasting rules. + for (int i = 0; i < output_rank - 2; ++i) { + const int lhs_dim = extended_lhs_shape.Dims(i); + const int rhs_dim = extended_rhs_shape.Dims(i); + if (lhs_dim != rhs_dim) { + if (lhs_dim != 1) { + TF_LITE_ENSURE_EQ(context, rhs_dim, 1); + } + } + } + + bool adj_x = params->adj_x; + bool adj_y = params->adj_y; + // Ensure other dimensions work for matrix multiplication. + int accum_dim_lhs = adj_x ? extended_lhs_shape.Dims(output_rank - 2) + : extended_lhs_shape.Dims(output_rank - 1); + int accum_dim_rhs = adj_y ? 
extended_rhs_shape.Dims(output_rank - 1) + : extended_rhs_shape.Dims(output_rank - 2); + + TF_LITE_ENSURE_EQ(context, accum_dim_lhs, accum_dim_rhs); + + // Tensor for transposed LHS; + if (adj_x) { + data->reference_op_data.lhs_transposed_tensor = + AllocInitTransposeTensorFromTfLiteTensor(context, micro_context, + *lhs_input); + TF_LITE_ENSURE(context, + data->reference_op_data.lhs_transposed_tensor != nullptr); + } + + // If RHS needs to be transposed, then it is actually in the correct shape + // already. + if (!adj_y) { + data->reference_op_data.rhs_transposed_tensor = + AllocInitTransposeTensorFromTfLiteTensor(context, micro_context, + *rhs_input); + TF_LITE_ENSURE(context, + data->reference_op_data.rhs_transposed_tensor != nullptr); + } + + TF_LITE_ENSURE_STATUS(ReshapeOutputTensor(context, node, extended_lhs_shape, + extended_rhs_shape, adj_x, adj_y, + output_rank, output)); + + data->output_shape = FillVariableShape( + output_rank, reinterpret_cast(output->dims->data)); + + int buf_size = 0; + if (lhs_input->type != kTfLiteFloat32 && rhs_input->type != kTfLiteFloat32) { + data->reference_op_data.quantization = + static_castreference_op_data.quantization)>( + micro_context->AllocatePersistentBuffer( + sizeof(*data->reference_op_data.quantization))); + TF_LITE_ENSURE(context, data->reference_op_data.quantization != nullptr); + + double real_multiplier = 0.0; + TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( + context, lhs_input, rhs_input, output, &real_multiplier)); + QuantizeMultiplier(real_multiplier, + &data->reference_op_data.quantization->output_multiplier, + &data->reference_op_data.quantization->output_shift); + + data->reference_op_data.quantization->lhs_zero_point = + lhs_input->params.zero_point; + data->reference_op_data.quantization->rhs_zero_point = + rhs_input->params.zero_point; + data->reference_op_data.quantization->output_zero_point = + output->params.zero_point; + + if (lhs_input->type == kTfLiteInt8) { + data->reference_op_data.quantization->output_activation_min = + std::numeric_limits::min(); + data->reference_op_data.quantization->output_activation_max = + std::numeric_limits::max(); + + data->buffer_idx = -1; + buf_size = arm_fully_connected_s8_get_buffer_size(&data->output_shape); + } else { + data->reference_op_data.quantization->output_activation_min = + std::numeric_limits::min(); + data->reference_op_data.quantization->output_activation_max = + std::numeric_limits::max(); + + TF_LITE_ENSURE_EQ(context, lhs_input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, rhs_input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + } + } + + if (buf_size > 0) { + TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena( + context, buf_size, &data->buffer_idx)); + } + + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(lhs_input); + micro_context->DeallocateTempTfLiteTensor(rhs_input); + + return kTfLiteOk; +} + +TfLiteStatus EvalInt8(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + const TfLiteEvalTensor* original_lhs_input = + tflite::micro::GetEvalInput(context, node, kBatchMatmulInputLhsTensor); + const TfLiteEvalTensor* original_rhs_input = + tflite::micro::GetEvalInput(context, node, kBatchMatmulInputRhsTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kBatchMatmulOutputTensor); + + OpData& data = *(static_cast(node->user_data)); + const auto* 
params = + static_cast(node->builtin_data); + + RuntimeShape rhs_shape = tflite::micro::GetTensorShape(original_rhs_input); + RuntimeShape lhs_shape = tflite::micro::GetTensorShape(original_lhs_input); + TfLiteEvalTensor* updated_lhs_input; + TfLiteEvalTensor* updated_rhs_input; + + TF_LITE_ENSURE_STATUS( + PopulateEvalData(context, &data, params, node, original_lhs_input, + &lhs_shape, &updated_lhs_input, original_rhs_input, + &rhs_shape, &updated_rhs_input, output)); + + cmsis_nn_dims rhs_dims = + FillVariableShape(rhs_shape.DimensionsCount(), rhs_shape.DimsData()); + cmsis_nn_dims lhs_dims = + FillVariableShape(lhs_shape.DimensionsCount(), lhs_shape.DimsData()); + + cmsis_nn_per_tensor_quant_params quant_params = { + data.reference_op_data.quantization->output_multiplier, + data.reference_op_data.quantization->output_shift}; + cmsis_nn_context ctx; + ctx.buf = nullptr; + ctx.size = 0; + + if (data.buffer_idx > -1) { + ctx.buf = context->GetScratchBuffer(context, data.buffer_idx); + // Note: ctx.size is currently not used in cmsis_nn. + // The buffer should be allocated in the prepare function through + // the corresponding arm_convolve_wrapper_[type]_get_buffer_size + } + + cmsis_nn_fc_params fc_params; + fc_params.input_offset = -data.reference_op_data.quantization->lhs_zero_point; + fc_params.filter_offset = + -data.reference_op_data.quantization->rhs_zero_point; + fc_params.output_offset = + data.reference_op_data.quantization->output_zero_point; + + cmsis_nn_activation activation; + activation.min = data.reference_op_data.quantization->output_activation_min; + activation.max = data.reference_op_data.quantization->output_activation_max; + fc_params.activation = activation; + + cmsis_nn_bmm_params bmm_params = { + params->adj_x, + params->adj_y, + fc_params, + }; + + TF_LITE_ENSURE_EQ( + context, + arm_batch_matmul_s8( + &ctx, &bmm_params, &quant_params, &lhs_dims, + tflite::micro::GetTensorData(updated_lhs_input), &rhs_dims, + tflite::micro::GetTensorData(updated_rhs_input), + &data.output_shape, tflite::micro::GetTensorData(output)), + ARM_CMSIS_NN_SUCCESS); + + return kTfLiteOk; +} + +TfLiteStatus EvalInt16(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + const TfLiteEvalTensor* original_lhs_input = + tflite::micro::GetEvalInput(context, node, kBatchMatmulInputLhsTensor); + const TfLiteEvalTensor* original_rhs_input = + tflite::micro::GetEvalInput(context, node, kBatchMatmulInputRhsTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kBatchMatmulOutputTensor); + + OpData& data = *(static_cast(node->user_data)); + const auto* params = + static_cast(node->builtin_data); + + RuntimeShape rhs_shape = tflite::micro::GetTensorShape(original_rhs_input); + RuntimeShape lhs_shape = tflite::micro::GetTensorShape(original_lhs_input); + + // These pointers will be updated to point at the actual tensor being used in + // the batch matmul function + TfLiteEvalTensor* updated_lhs_input; + TfLiteEvalTensor* updated_rhs_input; + + TF_LITE_ENSURE_STATUS( + PopulateEvalData(context, &data, params, node, original_lhs_input, + &lhs_shape, &updated_lhs_input, original_rhs_input, + &rhs_shape, &updated_rhs_input, output)); + + cmsis_nn_dims rhs_dims = + FillVariableShape(rhs_shape.DimensionsCount(), rhs_shape.DimsData()); + cmsis_nn_dims lhs_dims = + FillVariableShape(lhs_shape.DimensionsCount(), lhs_shape.DimsData()); + + cmsis_nn_per_tensor_quant_params quant_params = { + 
data.reference_op_data.quantization->output_multiplier, + data.reference_op_data.quantization->output_shift}; + cmsis_nn_context ctx; + ctx.buf = nullptr; + ctx.size = 0; + + cmsis_nn_fc_params fc_params; + fc_params.input_offset = -data.reference_op_data.quantization->lhs_zero_point; + fc_params.filter_offset = + -data.reference_op_data.quantization->rhs_zero_point; + fc_params.output_offset = + data.reference_op_data.quantization->output_zero_point; + + cmsis_nn_activation activation; + activation.min = data.reference_op_data.quantization->output_activation_min; + activation.max = data.reference_op_data.quantization->output_activation_max; + fc_params.activation = activation; + + cmsis_nn_bmm_params bmm_params = { + params->adj_x, + params->adj_y, + fc_params, + }; + + TF_LITE_ENSURE_EQ( + context, + arm_batch_matmul_s16( + &ctx, &bmm_params, &quant_params, &lhs_dims, + tflite::micro::GetTensorData(updated_lhs_input), &rhs_dims, + tflite::micro::GetTensorData(updated_rhs_input), + &data.output_shape, tflite::micro::GetTensorData(output)), + ARM_CMSIS_NN_SUCCESS); + + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + // Checks in Prepare ensure input, output and filter types are all the same. + TFLITE_DCHECK(node->builtin_data != nullptr); + const TfLiteEvalTensor* original_lhs_input = + tflite::micro::GetEvalInput(context, node, kBatchMatmulInputLhsTensor); + switch (original_lhs_input->type) { + case kTfLiteFloat32: { + const TfLiteEvalTensor* original_rhs_input = tflite::micro::GetEvalInput( + context, node, kBatchMatmulInputRhsTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kBatchMatmulOutputTensor); + + TFLITE_DCHECK(node->user_data != nullptr); + OpData& data = *(static_cast(node->user_data)); + const auto* params = + static_cast(node->builtin_data); + + RuntimeShape rhs_shape = + tflite::micro::GetTensorShape(original_rhs_input); + RuntimeShape lhs_shape = + tflite::micro::GetTensorShape(original_lhs_input); + TfLiteEvalTensor* updated_lhs_input; + TfLiteEvalTensor* updated_rhs_input; + + TF_LITE_ENSURE_STATUS( + PopulateEvalData(context, &data, params, node, original_lhs_input, + &lhs_shape, &updated_lhs_input, original_rhs_input, + &rhs_shape, &updated_rhs_input, output)); + + // Note we pass RHS args first, LHS args second. 
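By this point PopulateEvalData has already transposed the RHS (when adj_y is false), so both operands store the accumulation dimension contiguously. The standalone sketch below illustrates that layout with made-up values; it shows the data arrangement only, not the reference kernel's actual loop structure or argument convention.

    // LHS is [rows, depth] row-major; RHS is stored transposed as [cols, depth],
    // so every output element is a dot product of two contiguous rows.
    #include <cstdio>

    int main() {
      const int rows = 2, depth = 3, cols = 2;
      const float lhs[rows * depth] = {1, 2, 3, 4, 5, 6};    // [rows, depth]
      const float rhs_t[cols * depth] = {1, 0, 1, 0, 1, 0};  // [cols, depth]
      float out[rows * cols] = {};

      for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
          float acc = 0.f;
          for (int k = 0; k < depth; ++k) {
            acc += lhs[i * depth + k] * rhs_t[j * depth + k];
          }
          out[i * cols + j] = acc;
        }
      }
      // lhs (2x3) times rhs (3x2, stored transposed) -> [[4, 2], [10, 5]].
      std::printf("[[%g, %g], [%g, %g]]\n", out[0], out[1], out[2], out[3]);
      return 0;
    }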
+ reference_ops::BatchMatMul( + rhs_shape, tflite::micro::GetTensorData(updated_rhs_input), + lhs_shape, tflite::micro::GetTensorData(updated_lhs_input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } break; + case kTfLiteInt8: + return EvalInt8(context, node); + case kTfLiteInt16: + return EvalInt16(context, node); + default: { + MicroPrintf("CMSIS-NN Batch Matmul: Type %s (%d) not supported.", + TfLiteTypeGetName(original_lhs_input->type), + original_lhs_input->type); + return kTfLiteError; + } + } + return kTfLiteOk; +} + +} // namespace + +TFLMRegistration Register_BATCH_MATMUL() { + return tflite::micro::RegisterOp(Init, Prepare, Eval); +} + +TFLMRegistration Register_BATCH_MATMUL_INT8() { + return tflite::micro::RegisterOp(Init, Prepare, EvalInt8); +} + +TFLMRegistration Register_BATCH_MATMUL_INT16() { + return tflite::micro::RegisterOp(Init, Prepare, EvalInt16); +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn_download.sh b/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn_download.sh index fb0ad928bd6..40dc0fdc50d 100755 --- a/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn_download.sh +++ b/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn_download.sh @@ -38,9 +38,9 @@ source ${TENSORFLOW_ROOT}tensorflow/lite/micro/tools/make/bash_helpers.sh DOWNLOADS_DIR=${1} DOWNLOADED_CMSIS_NN_PATH=${DOWNLOADS_DIR}/cmsis_nn -ZIP_PREFIX_NN="01dee38e6d6bfbbf202f0cd425bbea1731747d51" +ZIP_PREFIX_NN="9d924bdaee51ca8e0c4e86779bbb6d0c9644e555" CMSIS_NN_URL="http://github.com/ARM-software/CMSIS-NN/archive/${ZIP_PREFIX_NN}.zip" -CMSIS_NN_MD5="f20be93ededf42bb704c19f699a24313" +CMSIS_NN_MD5="03174b48831dfbe4f9de1c2c23119ef5" should_download=$(check_should_download ${DOWNLOADS_DIR}) From 1b262f1d6ab707eddb37a2605c2b0bf21c62e563 Mon Sep 17 00:00:00 2001 From: Ryan O'Shea Date: Mon, 9 Sep 2024 18:05:14 +0200 Subject: [PATCH 2/2] Fix batch matmul review comments and update cmsis sha Signed-off-by: Ryan O'Shea --- tensorflow/lite/micro/kernels/batch_matmul.cc | 40 ------------------- .../tools/make/ext_libs/cmsis_nn_download.sh | 4 +- 2 files changed, 2 insertions(+), 42 deletions(-) diff --git a/tensorflow/lite/micro/kernels/batch_matmul.cc b/tensorflow/lite/micro/kernels/batch_matmul.cc index 4a242054df4..15112e3b4cd 100644 --- a/tensorflow/lite/micro/kernels/batch_matmul.cc +++ b/tensorflow/lite/micro/kernels/batch_matmul.cc @@ -164,46 +164,6 @@ TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node, return kTfLiteOk; } -template -void TransposeRowsColumnsImpl(const TfLiteEvalTensor& tensor_in, - TfLiteEvalTensor* tensor_out) { - const Scalar* input = tflite::micro::GetTensorData(&tensor_in); - Scalar* output = tflite::micro::GetTensorData(tensor_out); - RuntimeShape transposed_shape(tflite::micro::GetTensorShape(&tensor_in)); - RuntimeShape shape(transposed_shape); - TransposeParams params; - const int rank = shape.DimensionsCount(); - params.perm_count = rank; - for (int i = 0; i < rank - 2; ++i) { - params.perm[i] = i; - } - // Transpose the last two dimensions. 
- params.perm[rank - 2] = rank - 1; - params.perm[rank - 1] = rank - 2; - transposed_shape.SetDim(rank - 1, shape.Dims(rank - 2)); - transposed_shape.SetDim(rank - 2, shape.Dims(rank - 1)); - reference_ops::Transpose(params, shape, input, transposed_shape, output); -} - -TfLiteStatus TransposeRowsColumns(const TfLiteEvalTensor& tensor_in, - TfLiteEvalTensor* tensor_out) { - if (tensor_in.type == kTfLiteFloat32) { - TransposeRowsColumnsImpl(tensor_in, tensor_out); - return kTfLiteOk; - } else if (tensor_in.type == kTfLiteInt8) { - TransposeRowsColumnsImpl(tensor_in, tensor_out); - return kTfLiteOk; - } else if (tensor_in.type == kTfLiteInt16) { - TransposeRowsColumnsImpl(tensor_in, tensor_out); - return kTfLiteOk; - } else { - MicroPrintf( - "BATCH_MATMUL can only transpose tensors with FLOAT32, INT8, INT16 " - "type."); - } - return kTfLiteError; -} - void* BatchMatMulInit(TfLiteContext* context, const char* buffer, size_t length) { // This is a builtin op, so we don't use the contents in 'buffer', if any. diff --git a/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn_download.sh b/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn_download.sh index 40dc0fdc50d..393c184d1e7 100755 --- a/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn_download.sh +++ b/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn_download.sh @@ -38,9 +38,9 @@ source ${TENSORFLOW_ROOT}tensorflow/lite/micro/tools/make/bash_helpers.sh DOWNLOADS_DIR=${1} DOWNLOADED_CMSIS_NN_PATH=${DOWNLOADS_DIR}/cmsis_nn -ZIP_PREFIX_NN="9d924bdaee51ca8e0c4e86779bbb6d0c9644e555" +ZIP_PREFIX_NN="95f293df19c9a38806868fe12a64a4f9b457f9c1" CMSIS_NN_URL="http://github.com/ARM-software/CMSIS-NN/archive/${ZIP_PREFIX_NN}.zip" -CMSIS_NN_MD5="03174b48831dfbe4f9de1c2c23119ef5" +CMSIS_NN_MD5="5e0c4cd60a5f074c4d26d1be236caefd" should_download=$(check_should_download ${DOWNLOADS_DIR})
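A minimal application-side sketch of how the registrations added by this patch can be selected. The helper function and the int8-only decision are assumptions for illustration, not part of the patch; with CMSIS_NN defined the INT8/INT16 variants resolve to the optimized cmsis_nn kernel, otherwise the inline fallbacks in batch_matmul.h forward to Register_BATCH_MATMUL().

    // Sketch: pick a batch-matmul kernel registration to hand to an op resolver.
    #include "tensorflow/lite/micro/kernels/batch_matmul.h"

    // 'auto' return avoids spelling the registration type's namespace here.
    auto SelectBatchMatMulKernel(bool graph_uses_only_int8_batch_matmul) {
      // A graph known to contain only int8 BATCH_MATMUL nodes can register the
      // int8-only kernel variant; otherwise use the full-featured one.
      return graph_uses_only_int8_batch_matmul
                 ? tflite::Register_BATCH_MATMUL_INT8()
                 : tflite::Register_BATCH_MATMUL();
    }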