pytorch · Erik-Lundell · Jan 9, 2026 · Apr 15, 2026 · Apr 15, 2026 · May 19, 2026
@@ -7,6 +7,7 @@
 # Optional parameter:
 # --build_type= "Release" | "Debug" | "RelWithDebInfo" | "UndefinedSanitizer" | "AddressSanitizer"
 # --etdump      build with devtools-etdump support
+# --cmake-args= Additional arguments passed to cmake configure
 
 set -eu
 
@@ -24,6 +25,7 @@ build_type="Release"
 build_devtools=OFF
 build_with_etdump=OFF
 is_linux_musl=0
+extra_cmake_args=()
 
 help() {
     echo "Usage: $(basename $0) [options]"
@@ -32,6 +34,7 @@ help() {
     echo "  --build_type=<TYPE>       Build with Release, Debug, RelWithDebInfo, UndefinedSanitizer or AddressSanitizer, default is ${build_type}"
     echo "  --devtools                Build Devtools libs"
     echo "  --etdump                  Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log"
+    echo "  --cmake-args=<ARGS>       Additional arguments passed to cmake configure"
     echo "  --toolchain=<TOOLCHAIN>   Toolchain can be specified (arm-none-eabi-gcc, arm-zephyr-eabi-gcc, aarch64-linux-musl-gcc). Default: ${toolchain}"
     exit 0
 }
@@ -43,6 +46,10 @@ for arg in "$@"; do
       --build_type=*) build_type="${arg#*=}";;
       --devtools) build_devtools=ON ;;
       --etdump) build_with_etdump=ON ;;
+      --cmake-args=*)
+        # shellcheck disable=SC2206
+        extra_cmake_args+=(${arg#*=})  # word-split on purpose; append so repeated --cmake-args flags accumulate
+        ;;
       --toolchain=*) toolchain="${arg#*=}";;
       *)
       ;;
@@ -85,6 +92,7 @@ cmake_args=(
     -DCMAKE_BUILD_TYPE=${build_type}
     -DEXECUTORCH_BUILD_DEVTOOLS=${build_devtools}
     -DEXECUTORCH_BUILD_ARM_ETDUMP=${build_with_etdump}
+    ${extra_cmake_args[@]+"${extra_cmake_args[@]}"} # guard: empty-array expansion trips 'set -u' on bash < 4.4
 )
 
 if [[ ${is_linux_musl} -eq 1 ]]; then

@@ -30,6 +30,10 @@ set(CMSIS_NN_LOCAL_PATH
     ""
     CACHE PATH "Path to existing local CMSIS-NN installation"
 )
+option(CORTEX_M_ENABLE_ASSERTS
+       "Enable additional Cortex-M runtime assertions and validation checks"
+       OFF
+)
 
 # Try to find existing / local CMSIS-NN installation. This is useful for
 # debugging and testing with local changes. This is not common, as the CMSIS-NN
@@ -87,6 +91,10 @@ target_link_libraries(
   PRIVATE executorch
   PRIVATE kernels_util_all_deps
 )
+target_compile_definitions(
+  cortex_m_kernels
+  PRIVATE $<$<BOOL:${CORTEX_M_ENABLE_ASSERTS}>:CORTEX_M_ENABLE_ASSERTS>
+)
 
 # Include directories for cortex_m_kernels
 target_include_directories(

@@ -1,6 +1,7 @@
 /*
  * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
+ * Copyright 2026 Arm Limited and/or its affiliates.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -71,6 +72,7 @@ Tensor& quantized_batch_matmul_out(
     int64_t output_offset,
     int64_t output_multiplier,
     int64_t output_shift,
+    const Tensor& scratch,
     Tensor& out) {
   if (!validate_batch_matmul_arguments(context, lhs, rhs_transposed, out)) {
     return out;
@@ -100,25 +102,26 @@ Tensor& quantized_batch_matmul_out(
   quant_params.multiplier = static_cast<int32_t>(output_multiplier);
   quant_params.shift = static_cast<int32_t>(output_shift);
 
-  const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&out_dims);
-
   cmsis_nn_context ctx;
   ctx.buf = nullptr;
-  ctx.size = 0;
-
-  if (buf_size > 0) {
-    auto buffer_or_error = context.allocate_temp(buf_size);
-    if (!buffer_or_error.ok()) {
-      ET_LOG(
-          Error,
-          "quantized_batch_matmul: failed to allocate scratch buffer (%d bytes)",
-          buf_size);
-      context.fail(buffer_or_error.error());
-      return out;
-    }
-    ctx.buf = buffer_or_error.get();
-    ctx.size = buf_size;
+  ctx.size = scratch.nbytes();
+  if (ctx.size > 0) {
+    ctx.buf = scratch.mutable_data_ptr<int8_t>();
+  }
+
+#ifdef CORTEX_M_ENABLE_ASSERTS
+  const int32_t runtime_buffer_bytes =
+      arm_fully_connected_s8_get_buffer_size(&out_dims);
+  if (ctx.size != static_cast<size_t>(runtime_buffer_bytes)) {
+    ET_LOG(
+        Error,
+        "quantized_batch_matmul: scratch buffer size incorrect - actual: (%d) needed: (%d)",
+        static_cast<int>(ctx.size),
+        runtime_buffer_bytes);
+    context.fail(Error::Internal);
+    return out;
   }
+#endif
 
   const arm_cmsis_nn_status status = arm_batch_matmul_s8(
       &ctx,

@@ -112,6 +112,7 @@ Tensor& quantized_conv2d_out(
     const Tensor& requantize_shifts,
     const int64_t activation_min,
     const int64_t activation_max,
+    const Tensor& scratch,
     Tensor& out) {
   if (!validate_conv2d_arguments(
           context,
@@ -182,31 +183,30 @@ Tensor& quantized_conv2d_out(
 
   cmsis_nn_context cmsis_context;
   cmsis_context.buf = nullptr;
-  cmsis_context.size = 0;
+  cmsis_context.size = scratch.nbytes();
+  if (cmsis_context.size > 0) {
+    cmsis_context.buf = scratch.mutable_data_ptr<int8_t>();
+  }
 
-  const int32_t buffer_bytes = arm_convolve_wrapper_s8_get_buffer_size(
+#ifdef CORTEX_M_ENABLE_ASSERTS
+  const int32_t runtime_buffer_bytes = arm_convolve_wrapper_s8_get_buffer_size(
       &conv_params, &input_dims, &filter_dims, &output_dims);
-  if (buffer_bytes < 0) {
+  if (runtime_buffer_bytes < 0) {
     ET_LOG(
         Error, "quantized_conv2d_out: CMSIS-NN buffer size calculation failed");
     context.fail(Error::Internal);
     return out;
   }
-  if (buffer_bytes > 0) {
-    auto buffer_or_error =
-        context.allocate_temp(buffer_bytes, kCortexMMveAlignment);
-    if (!buffer_or_error.ok()) {
-      ET_LOG(
-          Error,
-          "quantized_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)",
-          static_cast<int>(buffer_bytes),
-          static_cast<int>(buffer_or_error.error()));
-      context.fail(buffer_or_error.error());
-      return out;
-    }
-    cmsis_context.buf = buffer_or_error.get();
-    cmsis_context.size = buffer_bytes;
+  if (scratch.nbytes() != static_cast<size_t>(runtime_buffer_bytes)) {
+    ET_LOG(
+        Error,
+        "quantized_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)",
+        static_cast<int>(scratch.nbytes()),
+        static_cast<int>(runtime_buffer_bytes));
+    context.fail(Error::Internal);
+    return out;
   }
+#endif
 
   const arm_cmsis_nn_status status = arm_convolve_wrapper_s8(
       &cmsis_context,

@@ -150,6 +150,7 @@ Tensor& quantized_depthwise_conv2d_out(
     const Tensor& requantize_shifts,
     const int64_t activation_min,
     const int64_t activation_max,
+    const Tensor& scratch,
     Tensor& out) {
   if (!validate_depthwise_conv2d_arguments(
           context,
@@ -220,32 +221,32 @@ Tensor& quantized_depthwise_conv2d_out(
 
   cmsis_nn_context cmsis_context;
   cmsis_context.buf = nullptr;
-  cmsis_context.size = 0;
+  cmsis_context.size = scratch.nbytes();
+  if (cmsis_context.size > 0) {
+    cmsis_context.buf = scratch.mutable_data_ptr<int8_t>();
+  }
 
-  const int32_t buffer_bytes = arm_depthwise_conv_wrapper_s8_get_buffer_size(
-      &dw_conv_params, &input_dims, &filter_dims, &output_dims);
-  if (buffer_bytes < 0) {
+#ifdef CORTEX_M_ENABLE_ASSERTS
+  const int32_t runtime_buffer_bytes =
+      arm_depthwise_conv_wrapper_s8_get_buffer_size(
+          &dw_conv_params, &input_dims, &filter_dims, &output_dims);
+  if (runtime_buffer_bytes < 0) {
     ET_LOG(
         Error,
         "quantized_depthwise_conv2d_out: CMSIS-NN buffer size calculation failed");
     context.fail(Error::Internal);
     return out;
   }
-
-  auto buffer_or_error = context.allocate_temp(
-      static_cast<size_t>(buffer_bytes), kCortexMMveAlignment);
-  if (!buffer_or_error.ok()) {
+  if (scratch.nbytes() != static_cast<size_t>(runtime_buffer_bytes)) {
     ET_LOG(
         Error,
-        "quantized_depthwise_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)",
-        static_cast<int>(buffer_bytes),
-        static_cast<int>(buffer_or_error.error()));
-    context.fail(buffer_or_error.error());
+        "quantized_depthwise_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)",
+        static_cast<int>(scratch.nbytes()),
+        static_cast<int>(runtime_buffer_bytes));
+    context.fail(Error::Internal);
     return out;
   }
-  cmsis_context.buf = buffer_or_error.get();
-  cmsis_context.size = buffer_bytes;
-
+#endif
   const arm_cmsis_nn_status status = arm_depthwise_conv_wrapper_s8(
       &cmsis_context,
       &dw_conv_params,

@@ -1,6 +1,7 @@
 /*
  * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
+ * Copyright 2026 Arm Limited and/or its affiliates.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -97,6 +98,8 @@ Tensor& quantized_transpose_conv2d_out(
     const Tensor& requantize_shifts,
     const int64_t activation_min,
     const int64_t activation_max,
+    const Tensor& scratch,
+    const Tensor& output_scratch,
     Tensor& out) {
   if (!validate_transpose_conv2d_arguments(
           context,
@@ -179,44 +182,43 @@ Tensor& quantized_transpose_conv2d_out(
 
   cmsis_nn_context cmsis_context;
   cmsis_context.buf = nullptr;
-  cmsis_context.size = 0;
+  cmsis_context.size = scratch.nbytes();
+  if (cmsis_context.size > 0) {
+    cmsis_context.buf = scratch.mutable_data_ptr<int8_t>();
+  }
 
   cmsis_nn_context output_context;
   output_context.buf = nullptr;
-  output_context.size = 0;
-
+  output_context.size = output_scratch.nbytes();
+  if (output_context.size > 0) {
+    output_context.buf = output_scratch.mutable_data_ptr<int8_t>();
+  }
+#ifdef CORTEX_M_ENABLE_ASSERTS
   const int32_t buffer_bytes = arm_transpose_conv_s8_get_buffer_size(
       &transpose_conv_params, &input_dims, &filter_dims, &output_dims);
-  auto buffer_or_error = context.allocate_temp(
-      static_cast<size_t>(buffer_bytes), kCortexMMveAlignment);
-  if (!buffer_or_error.ok()) {
+  if (scratch.nbytes() != static_cast<size_t>(buffer_bytes)) {
     ET_LOG(
         Error,
-        "quantized_transpose_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)",
-        buffer_bytes,
-        static_cast<int>(buffer_or_error.error()));
-    context.fail(buffer_or_error.error());
+        "quantized_transpose_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)",
+        static_cast<int>(scratch.nbytes()),
+        buffer_bytes);
+    context.fail(Error::Internal);
     return out;
   }
-  cmsis_context.buf = buffer_or_error.get();
-  cmsis_context.size = buffer_bytes;
 
   const int32_t output_buffer_bytes =
       arm_transpose_conv_s8_get_reverse_conv_buffer_size(
           &transpose_conv_params, &input_dims, &filter_dims);
-  auto output_buffer_or_error = context.allocate_temp(
-      static_cast<size_t>(output_buffer_bytes), kCortexMMveAlignment);
-  if (!output_buffer_or_error.ok()) {
+  if (output_scratch.nbytes() != static_cast<size_t>(output_buffer_bytes)) {
     ET_LOG(
         Error,
-        "quantized_transpose_conv2d_out: failed to allocate output scratch buffer (%d bytes, error %d)",
-        output_buffer_bytes,
-        static_cast<int>(output_buffer_or_error.error()));
-    context.fail(output_buffer_or_error.error());
+        "quantized_transpose_conv2d_out: output scratch buffer size incorrect - actual: (%d) needed: (%d)",
+        static_cast<int>(output_scratch.nbytes()),
+        output_buffer_bytes);
+    context.fail(Error::Internal);
     return out;
   }
-  output_context.buf = output_buffer_or_error.get();
-  output_context.size = output_buffer_bytes;
+#endif
 
   const arm_cmsis_nn_status status = arm_transpose_conv_wrapper_s8(
       &cmsis_context,