diff --git a/backends/arm/scripts/build_executorch.sh b/backends/arm/scripts/build_executorch.sh index cf7e327b9ce..3d672a041f3 100755 --- a/backends/arm/scripts/build_executorch.sh +++ b/backends/arm/scripts/build_executorch.sh @@ -7,6 +7,7 @@ # Optional parameter: # --build_type= "Release" | "Debug" | "RelWithDebInfo" | "UndefinedSanitizer" | "AddressSanitizer" # --etdump build with devtools-etdump support +# --cmake-args= Additional arguments passed to cmake configure set -eu @@ -24,6 +25,7 @@ build_type="Release" build_devtools=OFF build_with_etdump=OFF is_linux_musl=0 +extra_cmake_args=() help() { echo "Usage: $(basename $0) [options]" @@ -32,6 +34,7 @@ help() { echo " --build_type= Build with Release, Debug, RelWithDebInfo, UndefinedSanitizer or AddressSanitizer, default is ${build_type}" echo " --devtools Build Devtools libs" echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" + echo " --cmake-args= Additional arguments passed to cmake configure" echo " --toolchain= Toolchain can be specified (arm-none-eabi-gcc, arm-zephyr-eabi-gcc, aarch64-linux-musl-gcc). Default: ${toolchain}" exit 0 } @@ -43,6 +46,10 @@ for arg in "$@"; do --build_type=*) build_type="${arg#*=}";; --devtools) build_devtools=ON ;; --etdump) build_with_etdump=ON ;; + --cmake-args=*) + # shellcheck disable=SC2206 + extra_cmake_args=(${arg#*=}) + ;; --toolchain=*) toolchain="${arg#*=}";; *) ;; @@ -85,6 +92,7 @@ cmake_args=( -DCMAKE_BUILD_TYPE=${build_type} -DEXECUTORCH_BUILD_DEVTOOLS=${build_devtools} -DEXECUTORCH_BUILD_ARM_ETDUMP=${build_with_etdump} + "${extra_cmake_args[@]}" ) if [[ ${is_linux_musl} -eq 1 ]]; then diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt index 8c8255b7b1b..75dc0b49e27 100644 --- a/backends/cortex_m/CMakeLists.txt +++ b/backends/cortex_m/CMakeLists.txt @@ -30,6 +30,10 @@ set(CMSIS_NN_LOCAL_PATH "" CACHE PATH "Path to existing local CMSIS-NN installation" ) +option(CORTEX_M_ENABLE_ASSERTS + "Enable additional Cortex-M runtime assertions and validation checks" + OFF +) # Try to find existing / local CMSIS-NN installation. This is useful for # debugging and testing with local changes. This is not common, as the CMSIS-NN @@ -87,6 +91,10 @@ target_link_libraries( PRIVATE executorch PRIVATE kernels_util_all_deps ) +target_compile_definitions( + cortex_m_kernels + PRIVATE $<$:CORTEX_M_ENABLE_ASSERTS> +) # Include directories for cortex_m_kernels target_include_directories( diff --git a/backends/cortex_m/ops/op_quantized_batch_matmul.cpp b/backends/cortex_m/ops/op_quantized_batch_matmul.cpp index e6bc5a949ce..0aea882b107 100644 --- a/backends/cortex_m/ops/op_quantized_batch_matmul.cpp +++ b/backends/cortex_m/ops/op_quantized_batch_matmul.cpp @@ -1,6 +1,7 @@ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. + * Copyright 2026 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. @@ -71,6 +72,7 @@ Tensor& quantized_batch_matmul_out( int64_t output_offset, int64_t output_multiplier, int64_t output_shift, + const Tensor& scratch, Tensor& out) { if (!validate_batch_matmul_arguments(context, lhs, rhs_transposed, out)) { return out; @@ -100,25 +102,26 @@ Tensor& quantized_batch_matmul_out( quant_params.multiplier = static_cast(output_multiplier); quant_params.shift = static_cast(output_shift); - const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&out_dims); - cmsis_nn_context ctx; ctx.buf = nullptr; - ctx.size = 0; - - if (buf_size > 0) { - auto buffer_or_error = context.allocate_temp(buf_size); - if (!buffer_or_error.ok()) { - ET_LOG( - Error, - "quantized_batch_matmul: failed to allocate scratch buffer (%d bytes)", - buf_size); - context.fail(buffer_or_error.error()); - return out; - } - ctx.buf = buffer_or_error.get(); - ctx.size = buf_size; + ctx.size = scratch.nbytes(); + if (ctx.size > 0) { + ctx.buf = scratch.mutable_data_ptr(); + } + +#ifdef CORTEX_M_ENABLE_ASSERTS + const int32_t runtime_buffer_bytes = + arm_fully_connected_s8_get_buffer_size(&out_dims); + if (ctx.size != static_cast(runtime_buffer_bytes)) { + ET_LOG( + Error, + "quantized_batch_matmul: scratch buffer size incorrect - actual: (%d) needed: (%d)", + static_cast(ctx.size), + runtime_buffer_bytes); + context.fail(Error::Internal); + return out; } +#endif const arm_cmsis_nn_status status = arm_batch_matmul_s8( &ctx, diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp index 7d4433690f6..b94c49b1d74 100644 --- a/backends/cortex_m/ops/op_quantized_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp @@ -112,6 +112,7 @@ Tensor& quantized_conv2d_out( const Tensor& requantize_shifts, const int64_t activation_min, const int64_t activation_max, + const Tensor& scratch, Tensor& out) { if (!validate_conv2d_arguments( context, @@ -182,31 +183,30 @@ Tensor& quantized_conv2d_out( cmsis_nn_context cmsis_context; cmsis_context.buf = nullptr; - cmsis_context.size = 0; + cmsis_context.size = scratch.nbytes(); + if (cmsis_context.size > 0) { + cmsis_context.buf = scratch.mutable_data_ptr(); + } - const int32_t buffer_bytes = arm_convolve_wrapper_s8_get_buffer_size( +#ifdef CORTEX_M_ENABLE_ASSERTS + const int32_t runtime_buffer_bytes = arm_convolve_wrapper_s8_get_buffer_size( &conv_params, &input_dims, &filter_dims, &output_dims); - if (buffer_bytes < 0) { + if (runtime_buffer_bytes < 0) { ET_LOG( Error, "quantized_conv2d_out: CMSIS-NN buffer size calculation failed"); context.fail(Error::Internal); return out; } - if (buffer_bytes > 0) { - auto buffer_or_error = - context.allocate_temp(buffer_bytes, kCortexMMveAlignment); - if (!buffer_or_error.ok()) { - ET_LOG( - Error, - "quantized_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)", - static_cast(buffer_bytes), - static_cast(buffer_or_error.error())); - context.fail(buffer_or_error.error()); - return out; - } - cmsis_context.buf = buffer_or_error.get(); - cmsis_context.size = buffer_bytes; + if (scratch.nbytes() != static_cast(runtime_buffer_bytes)) { + ET_LOG( + Error, + "quantized_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)", + static_cast(scratch.nbytes()), + static_cast(runtime_buffer_bytes)); + context.fail(Error::Internal); + return out; } +#endif const arm_cmsis_nn_status status = arm_convolve_wrapper_s8( &cmsis_context, diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp index 8dec61e0af1..7ed761bad9d 100644 --- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp @@ -150,6 +150,7 @@ Tensor& quantized_depthwise_conv2d_out( const Tensor& requantize_shifts, const int64_t activation_min, const int64_t activation_max, + const Tensor& scratch, Tensor& out) { if (!validate_depthwise_conv2d_arguments( context, @@ -220,32 +221,32 @@ Tensor& quantized_depthwise_conv2d_out( cmsis_nn_context cmsis_context; cmsis_context.buf = nullptr; - cmsis_context.size = 0; + cmsis_context.size = scratch.nbytes(); + if (cmsis_context.size > 0) { + cmsis_context.buf = scratch.mutable_data_ptr(); + } - const int32_t buffer_bytes = arm_depthwise_conv_wrapper_s8_get_buffer_size( - &dw_conv_params, &input_dims, &filter_dims, &output_dims); - if (buffer_bytes < 0) { +#ifdef CORTEX_M_ENABLE_ASSERTS + const int32_t runtime_buffer_bytes = + arm_depthwise_conv_wrapper_s8_get_buffer_size( + &dw_conv_params, &input_dims, &filter_dims, &output_dims); + if (runtime_buffer_bytes < 0) { ET_LOG( Error, "quantized_depthwise_conv2d_out: CMSIS-NN buffer size calculation failed"); context.fail(Error::Internal); return out; } - - auto buffer_or_error = context.allocate_temp( - static_cast(buffer_bytes), kCortexMMveAlignment); - if (!buffer_or_error.ok()) { + if (scratch.nbytes() != static_cast(runtime_buffer_bytes)) { ET_LOG( Error, - "quantized_depthwise_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)", - static_cast(buffer_bytes), - static_cast(buffer_or_error.error())); - context.fail(buffer_or_error.error()); + "quantized_depthwise_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)", + static_cast(scratch.nbytes()), + static_cast(runtime_buffer_bytes)); + context.fail(Error::Internal); return out; } - cmsis_context.buf = buffer_or_error.get(); - cmsis_context.size = buffer_bytes; - +#endif const arm_cmsis_nn_status status = arm_depthwise_conv_wrapper_s8( &cmsis_context, &dw_conv_params, diff --git a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp index e3f6135c7b9..ebb713531b8 100644 --- a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp @@ -1,6 +1,7 @@ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. + * Copyright 2026 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. @@ -97,6 +98,8 @@ Tensor& quantized_transpose_conv2d_out( const Tensor& requantize_shifts, const int64_t activation_min, const int64_t activation_max, + const Tensor& scratch, + const Tensor& output_scratch, Tensor& out) { if (!validate_transpose_conv2d_arguments( context, @@ -179,44 +182,43 @@ Tensor& quantized_transpose_conv2d_out( cmsis_nn_context cmsis_context; cmsis_context.buf = nullptr; - cmsis_context.size = 0; + cmsis_context.size = scratch.nbytes(); + if (cmsis_context.size > 0) { + cmsis_context.buf = scratch.mutable_data_ptr(); + } cmsis_nn_context output_context; output_context.buf = nullptr; - output_context.size = 0; - + output_context.size = output_scratch.nbytes(); + if (output_context.size > 0) { + output_context.buf = output_scratch.mutable_data_ptr(); + } +#ifdef CORTEX_M_ENABLE_ASSERTS const int32_t buffer_bytes = arm_transpose_conv_s8_get_buffer_size( &transpose_conv_params, &input_dims, &filter_dims, &output_dims); - auto buffer_or_error = context.allocate_temp( - static_cast(buffer_bytes), kCortexMMveAlignment); - if (!buffer_or_error.ok()) { + if (scratch.nbytes() != static_cast(buffer_bytes)) { ET_LOG( Error, - "quantized_transpose_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)", - buffer_bytes, - static_cast(buffer_or_error.error())); - context.fail(buffer_or_error.error()); + "quantized_transpose_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)", + static_cast(scratch.nbytes()), + buffer_bytes); + context.fail(Error::Internal); return out; } - cmsis_context.buf = buffer_or_error.get(); - cmsis_context.size = buffer_bytes; const int32_t output_buffer_bytes = arm_transpose_conv_s8_get_reverse_conv_buffer_size( &transpose_conv_params, &input_dims, &filter_dims); - auto output_buffer_or_error = context.allocate_temp( - static_cast(output_buffer_bytes), kCortexMMveAlignment); - if (!output_buffer_or_error.ok()) { + if (output_scratch.nbytes() != static_cast(output_buffer_bytes)) { ET_LOG( Error, - "quantized_transpose_conv2d_out: failed to allocate output scratch buffer (%d bytes, error %d)", - output_buffer_bytes, - static_cast(output_buffer_or_error.error())); - context.fail(output_buffer_or_error.error()); + "quantized_transpose_conv2d_out: output scratch buffer size incorrect - actual: (%d) needed: (%d)", + static_cast(output_scratch.nbytes()), + output_buffer_bytes); + context.fail(Error::Internal); return out; } - output_context.buf = output_buffer_or_error.get(); - output_context.size = output_buffer_bytes; +#endif const arm_cmsis_nn_status status = arm_transpose_conv_wrapper_s8( &cmsis_context, diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py index 2c35ed8730b..d4393bc7ada 100644 --- a/backends/cortex_m/ops/operators.py +++ b/backends/cortex_m/ops/operators.py @@ -271,13 +271,15 @@ def quantized_mul_impl( "quantized_batch_matmul(" "Tensor lhs, int lhs_zero_point, " "Tensor rhs_transposed, int rhs_zero_point, " - "int output_zero_point, int output_multiplier, int output_shift) -> Tensor" + "int output_zero_point, int output_multiplier, int output_shift, " + "Tensor scratch) -> Tensor" ) lib.define( "quantized_batch_matmul.out(" "Tensor lhs, int lhs_zero_point, " "Tensor rhs_transposed, int rhs_zero_point, " "int output_zero_point, int output_multiplier, int output_shift, " + "Tensor scratch, " "*, Tensor(a!) out) -> Tensor(a!)" ) @@ -291,6 +293,7 @@ def quantized_batch_matmul_meta( output_zero_point: int, output_multiplier: int, output_shift: int, + scratch: torch.Tensor, ) -> torch.Tensor: batch, lhs_rows, inner = lhs.shape batch_rhs, rhs_cols, inner_rhs = rhs_transposed.shape @@ -307,6 +310,7 @@ def quantized_batch_matmul_impl( output_zero_point: int, output_multiplier: int, output_shift: int, + scratch: torch.Tensor, ) -> torch.Tensor: # Offsets are negated zero points (CMSIS-NN convention) lhs_fp = lhs.to(torch.float32) + float(lhs_zero_point) @@ -638,7 +642,8 @@ def pad_impl( "Tensor requantize_multipliers, " "Tensor requantize_shifts, " "int activation_min, " - "int activation_max" + "int activation_max, " + "Tensor scratch" ") -> Tensor" ) @@ -657,6 +662,7 @@ def pad_impl( "Tensor requantize_shifts, " "int activation_min, " "int activation_max, " + "Tensor scratch, " "*, Tensor(a!) out" ") -> Tensor(a!)" ) @@ -733,6 +739,7 @@ def quantized_conv2d_meta( requantize_shifts: torch.Tensor, activation_min: int, activation_max: int, + scratch: torch.Tensor, ) -> torch.Tensor: stride_vals = list(stride) padding_vals = list(padding) @@ -762,6 +769,7 @@ def quantized_conv2d_impl( requantize_shifts: torch.Tensor, activation_min: int, activation_max: int, + scratch: torch.Tensor, ) -> torch.Tensor: if input.dim() != 4 or weight.dim() != 4: raise RuntimeError("quantized_conv2d expects 4D input and weight tensors") @@ -830,7 +838,8 @@ def quantized_conv2d_impl( "Tensor requantize_multipliers, " "Tensor requantize_shifts, " "int activation_min, " - "int activation_max" + "int activation_max, " + "Tensor scratch" ") -> Tensor" ) @@ -850,6 +859,7 @@ def quantized_conv2d_impl( "Tensor requantize_shifts, " "int activation_min, " "int activation_max, " + "Tensor scratch, " "*, Tensor(a!) out" ") -> Tensor(a!)" ) @@ -870,6 +880,7 @@ def quantized_depthwise_conv2d_meta( requantize_shifts: torch.Tensor, activation_min: int, activation_max: int, + scratch: torch.Tensor, ) -> torch.Tensor: stride_vals = list(stride) padding_vals = list(padding) @@ -900,6 +911,7 @@ def quantized_depthwise_conv2d_impl( requantize_shifts: torch.Tensor, activation_min: int, activation_max: int, + scratch: torch.Tensor, ) -> torch.Tensor: if input.dim() != 4 or weight.dim() != 4: raise RuntimeError( @@ -973,7 +985,9 @@ def quantized_depthwise_conv2d_impl( "Tensor requantize_multipliers, " "Tensor requantize_shifts, " "int activation_min, " - "int activation_max" + "int activation_max, " + "Tensor scratch, " + "Tensor output_scratch" ") -> Tensor" ) @@ -992,6 +1006,8 @@ def quantized_depthwise_conv2d_impl( "Tensor requantize_shifts, " "int activation_min, " "int activation_max, " + "Tensor scratch, " + "Tensor output_scratch, " "*, Tensor(a!) out) -> Tensor(a!)" ) @@ -1057,6 +1073,8 @@ def quantized_transpose_conv2d_meta( requantize_shifts: torch.Tensor, activation_min: int, activation_max: int, + scratch: torch.Tensor, + output_scratch: torch.Tensor, ) -> torch.Tensor: stride_vals = list(stride) padding_vals = list(padding) @@ -1095,6 +1113,8 @@ def quantized_transpose_conv2d_impl( requantize_shifts: torch.Tensor, activation_min: int, activation_max: int, + scratch: torch.Tensor, + output_scratch: torch.Tensor, ) -> torch.Tensor: """ Reference implementation of quantized transposed convolution. diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml index e0ebbfab868..8db109dea43 100644 --- a/backends/cortex_m/ops/operators.yaml +++ b/backends/cortex_m/ops/operators.yaml @@ -65,19 +65,20 @@ - arg_meta: null kernel_name: cortex_m::pad_out -- func: cortex_m::quantized_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!) +- func: cortex_m::quantized_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, Tensor scratch, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null kernel_name: cortex_m::quantized_conv2d_out -- func: cortex_m::quantized_depthwise_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int depth_multiplier, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!) + +- func: cortex_m::quantized_depthwise_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int depth_multiplier, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, Tensor scratch, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null kernel_name: cortex_m::quantized_depthwise_conv2d_out -- func: cortex_m::quantized_transpose_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!) +- func: cortex_m::quantized_transpose_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, Tensor scratch, Tensor output_scratch, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null @@ -94,7 +95,7 @@ - arg_meta: null kernel_name: cortex_m::quantized_max_pool2d_out -- func: cortex_m::quantized_batch_matmul.out(Tensor lhs, int lhs_zero_point, Tensor rhs_transposed, int rhs_zero_point, int output_zero_point, int output_multiplier, int output_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cortex_m::quantized_batch_matmul.out(Tensor lhs, int lhs_zero_point, Tensor rhs_transposed, int rhs_zero_point, int output_zero_point, int output_multiplier, int output_shift, Tensor scratch, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null diff --git a/backends/cortex_m/passes/__init__.py b/backends/cortex_m/passes/__init__.py index 92179ec6654..c379461949f 100644 --- a/backends/cortex_m/passes/__init__.py +++ b/backends/cortex_m/passes/__init__.py @@ -33,6 +33,7 @@ def _ensure_cortex_m_dependencies() -> None: _ensure_cortex_m_dependencies() +from .cortex_m_pass import CortexMPass # noqa # usort: skip from .activation_fusion_pass import ActivationFusionPass # noqa from .clamp_hardswish_pass import ClampHardswishPass # noqa from .convert_to_cortex_m_pass import ConvertToCortexMPass # noqa diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py index 418f6cd63ff..e61ddaf63bc 100644 --- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py +++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py @@ -6,25 +6,32 @@ # LICENSE file in the root directory of this source tree. import executorch.backends.cortex_m.ops.operators # noqa +import executorch.exir as exir import torch import torch.fx from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor + +from executorch.backends.cortex_m.passes import CortexMPass from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot +from executorch.backends.cortex_m.passes.scratch_buffer_sizes import ( + required_cmsis_nn_buffer_sizes, +) from executorch.backends.transforms.utils import ( create_constant_placeholder, get_param_tensor, is_param_node, ) - -from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.passes import make_alloc_node +from torch._subclasses.fake_tensor import FakeTensorMode + from torch.export.graph_signature import InputKind from torch.fx.passes.infra.pass_manager import PassResult -class ConvertToCortexMPass(XNNPACKPass): +class ConvertToCortexMPass(CortexMPass): """ Cortex-M backend pass for replacing supported quantized kernels with Cortex-M accelerated kernels. @@ -33,6 +40,15 @@ class ConvertToCortexMPass(XNNPACKPass): by call_operator. """ + def _create_uninitialized_alloc_node(self): + """Create an unitialized alloc node to be initialize at a later point.""" + with FakeTensorMode() as mode: + return make_alloc_node( + self.exported_program.graph_module, + mode.from_tensor(torch.empty(0)), + None, + ) + def _compute_kernel_sum(self, weights, bias, input_offset, weight_offset): """ Computes the precomputed kernel sum term (bias optional) @@ -238,6 +254,9 @@ def _get_convolution_replacement(self, node): torch.tensor(quantized_shifts, dtype=torch.int32), ) + with node.graph.inserting_before(node): + scratch = self._create_uninitialized_alloc_node() + if use_depthwise_conv: # Compute depth_multiplier for depthwise convolution # For depthwise: output_channels = input_channels * depth_multiplier @@ -263,6 +282,7 @@ def _get_convolution_replacement(self, node): quantized_shift_tensor, output_qmin, output_qmax, + scratch, ) return exir_ops.edge.cortex_m.quantized_depthwise_conv2d.default, new_args else: @@ -280,9 +300,36 @@ def _get_convolution_replacement(self, node): quantized_shift_tensor, output_qmin, output_qmax, + scratch, ) return exir_ops.edge.cortex_m.quantized_conv2d.default, new_args + def _initialize_alloc_node_size(self, node: torch.fx.Node) -> None: + """For nodes with a registered buffer size function for node.target, set the buffer sizes + of the last n args, which should be exir.memory.alloc nodes. For nodes without a + registered function, do nothing. + """ + + scratch_buffer_sizes = required_cmsis_nn_buffer_sizes( + node, self.target_config.backend + ) + if scratch_buffer_sizes is None: + return + + # Assume that scratch_buffer_sizes are given from left to right in the call signature of node.target. + for i, scratch_buffer_size in enumerate(reversed(scratch_buffer_sizes)): + scratch_arg = node.args[-(i + 1)] + if ( + not isinstance(scratch_arg, torch.fx.Node) + or scratch_arg.target != exir.memory.alloc + ): + raise RuntimeError( + f"Expected scratch alloc node as final argument(s) for {node.target}, got {scratch_arg}." + ) + + # buffer size is given in bytes, always use uint8 as dtype. + scratch_arg.args = (((scratch_buffer_size,), torch.uint8),) + def _get_transpose_conv2d_replacement(self, node): """ Transform aten.convolution with transposed=True to cortex_m.quantized_transpose_conv2d @@ -363,6 +410,10 @@ def _get_transpose_conv2d_replacement(self, node): torch.tensor(quantized_shifts, dtype=torch.int32), ) + with node.graph.inserting_before(node): + scratch = self._create_uninitialized_alloc_node() + output_scratch = self._create_uninitialized_alloc_node() + new_args = ( x, weight_nhwc, @@ -377,6 +428,8 @@ def _get_transpose_conv2d_replacement(self, node): quantized_shift_tensor, output_qmin, output_qmax, + scratch, + output_scratch, ) return exir_ops.edge.cortex_m.quantized_transpose_conv2d.default, new_args @@ -415,6 +468,9 @@ def _get_bmm_replacement(self, node): args=(rhs_node, [0, 2, 1]), ) + with node.graph.inserting_before(node): + scratch = self._create_uninitialized_alloc_node() + args = ( lhs_node, -lhs_zp, @@ -423,6 +479,7 @@ def _get_bmm_replacement(self, node): output_zp, output_mult, output_shift, + scratch, ) return exir_ops.edge.cortex_m.quantized_batch_matmul.default, args @@ -459,6 +516,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: args=args, kwargs={}, ) + self._initialize_alloc_node_size(cortex_m_op) node.replace_all_uses_with(cortex_m_op) graph_module.graph.erase_node(node) diff --git a/backends/cortex_m/passes/scratch_buffer_sizes.py b/backends/cortex_m/passes/scratch_buffer_sizes.py new file mode 100644 index 00000000000..36f3f8bbc17 --- /dev/null +++ b/backends/cortex_m/passes/scratch_buffer_sizes.py @@ -0,0 +1,266 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from collections.abc import Callable +from typing import Any, cast + +import cmsis_nn # type: ignore[import-not-found, import-untyped] +import executorch.backends.cortex_m.ops.operators # noqa + +import torch +import torch.fx + +from executorch.exir.dialects._ops import ops as exir_ops + +BufferSizeFunction = Callable[[cmsis_nn.Backend, torch.fx.Node], list[int]] + + +def _tensor_from_node(node: torch.fx.Node) -> torch.Tensor: + if "val" in node.meta: + return node.meta["val"] + elif node.op == "call_function": + args = ( + _tensor_from_node(arg) if isinstance(arg, torch.fx.Node) else arg + for arg in node.args + ) + return node.target(*args, **node.kwargs) # type: ignore[operator] + else: + raise RuntimeError("Encountered non-call_function without 'val' meta.") + + +def _shape_from_node(node: torch.fx.Node) -> torch.Size: + return _tensor_from_node(node).shape + + +def _get_common_conv_buffer_size_inputs( + conv_node: torch.fx.Node, + *, + stride_arg_idx: int = 3, + padding_arg_idx: int = 4, + dilation_arg_idx: int = 5, +) -> tuple[ + list[int], + list[int], + list[int], + list[int], + list[int], + list[int], +]: + x = cast(torch.fx.Node, conv_node.args[0]) + weight = cast(torch.fx.Node, conv_node.args[1]) + stride = cast(list[int], conv_node.args[stride_arg_idx]) + padding = cast(list[int], conv_node.args[padding_arg_idx]) + dilation = cast(list[int], conv_node.args[dilation_arg_idx]) + + # Input is NCHW (PyTorch); CMSIS-NN wants NHWC dims. + n, c_in, height, width = _shape_from_node(x) + + weight_shape = _shape_from_node(weight) + + # Output is NCHW; convert to NHWC dims. + out_n, out_c, out_h, out_w = _shape_from_node(conv_node) + + input_nhwc = [n, height, width, c_in] + output_nhwc = [out_n, out_h, out_w, out_c] + stride_hw = [int(stride[0]), int(stride[1])] + padding_hw = [int(padding[0]), int(padding[1])] + dilation_hw = [int(dilation[0]), int(dilation[1])] + + return ( + input_nhwc, + list(weight_shape), + output_nhwc, + stride_hw, + padding_hw, + dilation_hw, + ) + + +def cmsis_nn_conv_buffer_size( + backend: cmsis_nn.Backend, + conv_node: torch.fx.Node, +) -> list[int]: + ( + input_nhwc, + weight_shape, + output_nhwc, + stride_hw, + padding_hw, + dilation_hw, + ) = _get_common_conv_buffer_size_inputs(conv_node=conv_node) + input_offset = cast(int, conv_node.args[6]) + output_offset = cast(int, conv_node.args[7]) + output_qmin = cast(int, conv_node.args[10]) + output_qmax = cast(int, conv_node.args[11]) + + # Weight is in OHWI layout after conversion. + c_out, kernel_h, kernel_w, c_in = weight_shape + filter_nhwc = [c_out, kernel_h, kernel_w, c_in] + + return [ + int( + cmsis_nn.convolve_wrapper_buffer_size( + backend, + cmsis_nn.DataType.A8W8, + input_nhwc=input_nhwc, + filter_nhwc=filter_nhwc, + output_nhwc=output_nhwc, + padding_hw=padding_hw, + stride_hw=stride_hw, + dilation_hw=dilation_hw, + input_offset=input_offset, + output_offset=output_offset, + activation_min=output_qmin, + activation_max=output_qmax, + ) + ) + ] + + +def cmsis_nn_depthwise_conv_buffer_size( + backend: cmsis_nn.Backend, + conv_node: torch.fx.Node, +) -> list[int]: + ( + input_nhwc, + weight_shape, + output_nhwc, + stride_hw, + padding_hw, + dilation_hw, + ) = _get_common_conv_buffer_size_inputs(conv_node=conv_node) + depth_multiplier = cast(int, conv_node.args[6]) + input_offset = cast(int, conv_node.args[7]) + output_offset = cast(int, conv_node.args[8]) + output_qmin = cast(int, conv_node.args[11]) + output_qmax = cast(int, conv_node.args[12]) + + # Weight is in IHWO layout after conversion. + _, kernel_h, kernel_w, c_out = weight_shape + filter_nhwc = [c_out, kernel_h, kernel_w, 1] + + return [ + int( + cmsis_nn.depthwise_conv_wrapper_buffer_size( + backend, + cmsis_nn.DataType.A8W8, + input_nhwc=input_nhwc, + filter_nhwc=filter_nhwc, + output_nhwc=output_nhwc, + padding_hw=padding_hw, + stride_hw=stride_hw, + dilation_hw=dilation_hw, + ch_mult=depth_multiplier, + input_offset=input_offset, + output_offset=output_offset, + activation_min=output_qmin, + activation_max=output_qmax, + ) + ) + ] + + +def cmsis_nn_batch_matmul_buffer_size( + backend: cmsis_nn.Backend, + matmul_node: torch.fx.Node, +) -> list[int]: + rhs_transposed = cast(torch.fx.Node, matmul_node.args[2]) + rhs_shape = _shape_from_node(rhs_transposed) + + _, rhs_cols, inner = rhs_shape + + return [ + int( + cmsis_nn.fully_connected_buffer_size( + backend, + cmsis_nn.DataType.A8W8, + filter_nhwc=[inner, -1, -1, rhs_cols], # H and W values are unused. + ) + ) + ] + + +def cmsis_nn_transpose_conv_buffer_size( + backend: cmsis_nn.Backend, + conv_node: torch.fx.Node, +) -> list[int]: + ( + input_nhwc, + weight_shape, + output_nhwc, + stride_hw, + padding_hw, + dilation_hw, + ) = _get_common_conv_buffer_size_inputs( + conv_node=conv_node, + stride_arg_idx=3, + padding_arg_idx=4, + dilation_arg_idx=6, + ) + output_padding = cast(list[int], conv_node.args[5]) + input_offset = cast(int, conv_node.args[7]) + output_offset = cast(int, conv_node.args[8]) + output_qmin = cast(int, conv_node.args[11]) + output_qmax = cast(int, conv_node.args[12]) + c_out, kernel_h, kernel_w, kernel_c_in = weight_shape + filter_nhwc = [c_out, kernel_h, kernel_w, kernel_c_in] + padding_offsets_hw = [int(output_padding[0]), int(output_padding[1])] + + return [ + int( + cmsis_nn.transpose_conv_buffer_size( + backend, + cmsis_nn.DataType.A8W8, + input_nhwc=input_nhwc, + filter_nhwc=filter_nhwc, + output_nhwc=output_nhwc, + padding_hw=padding_hw, + stride_hw=stride_hw, + dilation_hw=dilation_hw, + padding_offsets_hw=padding_offsets_hw, + input_offset=input_offset, + output_offset=output_offset, + activation_min=output_qmin, + activation_max=output_qmax, + ) + ), + int( + cmsis_nn.transpose_conv_reverse_conv_buffer_size( + backend, + cmsis_nn.DataType.A8W8, + input_nhwc=input_nhwc, + filter_nhwc=filter_nhwc, + padding_hw=padding_hw, + stride_hw=stride_hw, + dilation_hw=dilation_hw, + padding_offsets_hw=padding_offsets_hw, + input_offset=input_offset, + output_offset=output_offset, + activation_min=output_qmin, + activation_max=output_qmax, + ) + ), + ] + + +_target_to_buffer_sizes_registry: dict[Any, BufferSizeFunction] = { + exir_ops.edge.cortex_m.quantized_conv2d.default: cmsis_nn_conv_buffer_size, + exir_ops.edge.cortex_m.quantized_depthwise_conv2d.default: cmsis_nn_depthwise_conv_buffer_size, + exir_ops.edge.cortex_m.quantized_batch_matmul.default: cmsis_nn_batch_matmul_buffer_size, + exir_ops.edge.cortex_m.quantized_transpose_conv2d.default: cmsis_nn_transpose_conv_buffer_size, +} + + +def required_cmsis_nn_buffer_sizes( + node: torch.fx.Node, backend: cmsis_nn.Backend +) -> list[int] | None: + """Returns a sequence of scratch buffer sizes required by node, in bytes. + If no function is registered to compute this for the target of the node, return None. + """ + if node.target not in _target_to_buffer_sizes_registry: + return None + + buffer_size_function = _target_to_buffer_sizes_registry[node.target] + return buffer_size_function(backend, node) diff --git a/backends/cortex_m/test/build_test_runner.sh b/backends/cortex_m/test/build_test_runner.sh index 2505f83c9da..4080c481c1f 100755 --- a/backends/cortex_m/test/build_test_runner.sh +++ b/backends/cortex_m/test/build_test_runner.sh @@ -12,7 +12,7 @@ set -eu script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")") et_root_dir=$(realpath "${script_dir}/../../..") build_executorch="${et_root_dir}/backends/arm/scripts/build_executorch.sh" -${build_executorch} --devtools +${build_executorch} --devtools --cmake-args="-DCORTEX_M_ENABLE_ASSERTS=ON" # Build executor runner with selected aten ops and semi hosting build_dir="${et_root_dir}/arm_test" @@ -32,4 +32,4 @@ aten::unsqueeze_copy.out,\ aten::select_copy.int_out,\ aten::amax.out" -${build_executor_runner} --pte=semihosting --bundleio --target=ethos-u55-128 --output="${build_root_test_dir}" --select_ops_list="${select_ops_list}" --extra_build_flags="-DET_ATOL=5.0 -DET_RTOL=1.0" +${build_executor_runner} --pte=semihosting --bundleio --target=ethos-u55-128 --output="${build_root_test_dir}" --select_ops_list="${select_ops_list}" --extra_build_flags="-DET_ATOL=5.0 -DET_RTOL=1.0 -DET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=0"