Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions backends/arm/scripts/build_executorch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Optional parameter:
# --build_type= "Release" | "Debug" | "RelWithDebInfo" | "UndefinedSanitizer" | "AddressSanitizer"
# --etdump build with devtools-etdump support
# --cmake-args= Additional arguments passed to cmake configure

set -eu

Expand All @@ -24,6 +25,7 @@ build_type="Release"
build_devtools=OFF
build_with_etdump=OFF
is_linux_musl=0
extra_cmake_args=()

help() {
echo "Usage: $(basename $0) [options]"
Expand All @@ -32,6 +34,7 @@ help() {
echo " --build_type=<TYPE> Build with Release, Debug, RelWithDebInfo, UndefinedSanitizer or AddressSanitizer, default is ${build_type}"
echo " --devtools Build Devtools libs"
echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log"
echo " --cmake-args=<ARGS> Additional arguments passed to cmake configure"
echo " --toolchain=<TOOLCHAIN> Toolchain can be specified (arm-none-eabi-gcc, arm-zephyr-eabi-gcc, aarch64-linux-musl-gcc). Default: ${toolchain}"
exit 0
}
Expand All @@ -43,6 +46,10 @@ for arg in "$@"; do
--build_type=*) build_type="${arg#*=}";;
--devtools) build_devtools=ON ;;
--etdump) build_with_etdump=ON ;;
--cmake-args=*)
# shellcheck disable=SC2206
extra_cmake_args=(${arg#*=})
;;
--toolchain=*) toolchain="${arg#*=}";;
*)
;;
Expand Down Expand Up @@ -85,6 +92,7 @@ cmake_args=(
-DCMAKE_BUILD_TYPE=${build_type}
-DEXECUTORCH_BUILD_DEVTOOLS=${build_devtools}
-DEXECUTORCH_BUILD_ARM_ETDUMP=${build_with_etdump}
"${extra_cmake_args[@]}"
)

if [[ ${is_linux_musl} -eq 1 ]]; then
Expand Down
8 changes: 8 additions & 0 deletions backends/cortex_m/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ set(CMSIS_NN_LOCAL_PATH
""
CACHE PATH "Path to existing local CMSIS-NN installation"
)
option(CORTEX_M_ENABLE_ASSERTS
"Enable additional Cortex-M runtime assertions and validation checks"
OFF
)

# Try to find existing / local CMSIS-NN installation. This is useful for
# debugging and testing with local changes. This is not common, as the CMSIS-NN
Expand Down Expand Up @@ -87,6 +91,10 @@ target_link_libraries(
PRIVATE executorch
PRIVATE kernels_util_all_deps
)
target_compile_definitions(
cortex_m_kernels
PRIVATE $<$<BOOL:${CORTEX_M_ENABLE_ASSERTS}>:CORTEX_M_ENABLE_ASSERTS>
)

# Include directories for cortex_m_kernels
target_include_directories(
Expand Down
35 changes: 19 additions & 16 deletions backends/cortex_m/ops/op_quantized_batch_matmul.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
* Copyright 2026 Arm Limited and/or its affiliates.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
Expand Down Expand Up @@ -71,6 +72,7 @@ Tensor& quantized_batch_matmul_out(
int64_t output_offset,
int64_t output_multiplier,
int64_t output_shift,
const Tensor& scratch,
Tensor& out) {
if (!validate_batch_matmul_arguments(context, lhs, rhs_transposed, out)) {
return out;
Expand Down Expand Up @@ -100,25 +102,26 @@ Tensor& quantized_batch_matmul_out(
quant_params.multiplier = static_cast<int32_t>(output_multiplier);
quant_params.shift = static_cast<int32_t>(output_shift);

const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&out_dims);

cmsis_nn_context ctx;
ctx.buf = nullptr;
ctx.size = 0;

if (buf_size > 0) {
auto buffer_or_error = context.allocate_temp(buf_size);
if (!buffer_or_error.ok()) {
ET_LOG(
Error,
"quantized_batch_matmul: failed to allocate scratch buffer (%d bytes)",
buf_size);
context.fail(buffer_or_error.error());
return out;
}
ctx.buf = buffer_or_error.get();
ctx.size = buf_size;
ctx.size = scratch.nbytes();
if (ctx.size > 0) {
ctx.buf = scratch.mutable_data_ptr<int8_t>();
}

#ifdef CORTEX_M_ENABLE_ASSERTS
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As much as I want to limit the runtime checks, I think it'd be good to have this check always on and non-optional. Without this, we could wind up writing past the end of buffers.

Also, naming nit: technically this is not an assert, as it should not crash the program if it fails. Maybe ENABLE_RUNTIME_CHECKS?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea is that we should be confident that we are doing the correct allocation after testing. Users can turn this on to verify, for example, that they have not mixed up cmsis_nn versions, but then skip it in production. That's also why I want it to be a crash. If there is a mismatch here, I want to enforce a fix. Also, when we have this flag available, we can use it in more places.

const int32_t runtime_buffer_bytes =
arm_fully_connected_s8_get_buffer_size(&out_dims);
if (ctx.size != static_cast<size_t>(runtime_buffer_bytes)) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WDYT about doing ctx.size < runtime_buffer_bytes here? Essentially, if we have more memory than what we actually need, do we still want to error out?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To me, this is a correctness assert. We don't just want to avoid failure; we want to ensure correctness.

ET_LOG(
Error,
"quantized_batch_matmul: scratch buffer size incorrect - actual: (%d) needed: (%d)",
static_cast<int>(ctx.size),
runtime_buffer_bytes);
context.fail(Error::Internal);
return out;
}
#endif

const arm_cmsis_nn_status status = arm_batch_matmul_s8(
&ctx,
Expand Down
34 changes: 17 additions & 17 deletions backends/cortex_m/ops/op_quantized_conv2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ Tensor& quantized_conv2d_out(
const Tensor& requantize_shifts,
const int64_t activation_min,
const int64_t activation_max,
const Tensor& scratch,
Tensor& out) {
if (!validate_conv2d_arguments(
context,
Expand Down Expand Up @@ -182,31 +183,30 @@ Tensor& quantized_conv2d_out(

cmsis_nn_context cmsis_context;
cmsis_context.buf = nullptr;
cmsis_context.size = 0;
cmsis_context.size = scratch.nbytes();
if (cmsis_context.size > 0) {
cmsis_context.buf = scratch.mutable_data_ptr<int8_t>();
}

const int32_t buffer_bytes = arm_convolve_wrapper_s8_get_buffer_size(
#ifdef CORTEX_M_ENABLE_ASSERTS
const int32_t runtime_buffer_bytes = arm_convolve_wrapper_s8_get_buffer_size(
&conv_params, &input_dims, &filter_dims, &output_dims);
if (buffer_bytes < 0) {
if (runtime_buffer_bytes < 0) {
ET_LOG(
Error, "quantized_conv2d_out: CMSIS-NN buffer size calculation failed");
context.fail(Error::Internal);
return out;
}
if (buffer_bytes > 0) {
auto buffer_or_error =
context.allocate_temp(buffer_bytes, kCortexMMveAlignment);
if (!buffer_or_error.ok()) {
ET_LOG(
Error,
"quantized_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)",
static_cast<int>(buffer_bytes),
static_cast<int>(buffer_or_error.error()));
context.fail(buffer_or_error.error());
return out;
}
cmsis_context.buf = buffer_or_error.get();
cmsis_context.size = buffer_bytes;
if (scratch.nbytes() != static_cast<size_t>(runtime_buffer_bytes)) {
ET_LOG(
Error,
"quantized_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)",
static_cast<int>(scratch.nbytes()),
static_cast<int>(runtime_buffer_bytes));
context.fail(Error::Internal);
return out;
}
#endif

const arm_cmsis_nn_status status = arm_convolve_wrapper_s8(
&cmsis_context,
Expand Down
31 changes: 16 additions & 15 deletions backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ Tensor& quantized_depthwise_conv2d_out(
const Tensor& requantize_shifts,
const int64_t activation_min,
const int64_t activation_max,
const Tensor& scratch,
Tensor& out) {
if (!validate_depthwise_conv2d_arguments(
context,
Expand Down Expand Up @@ -220,32 +221,32 @@ Tensor& quantized_depthwise_conv2d_out(

cmsis_nn_context cmsis_context;
cmsis_context.buf = nullptr;
cmsis_context.size = 0;
cmsis_context.size = scratch.nbytes();
if (cmsis_context.size > 0) {
cmsis_context.buf = scratch.mutable_data_ptr<int8_t>();
}

const int32_t buffer_bytes = arm_depthwise_conv_wrapper_s8_get_buffer_size(
&dw_conv_params, &input_dims, &filter_dims, &output_dims);
if (buffer_bytes < 0) {
#ifdef CORTEX_M_ENABLE_ASSERTS
const int32_t runtime_buffer_bytes =
arm_depthwise_conv_wrapper_s8_get_buffer_size(
&dw_conv_params, &input_dims, &filter_dims, &output_dims);
if (runtime_buffer_bytes < 0) {
ET_LOG(
Error,
"quantized_depthwise_conv2d_out: CMSIS-NN buffer size calculation failed");
context.fail(Error::Internal);
return out;
}

auto buffer_or_error = context.allocate_temp(
static_cast<size_t>(buffer_bytes), kCortexMMveAlignment);
if (!buffer_or_error.ok()) {
if (scratch.nbytes() != static_cast<size_t>(runtime_buffer_bytes)) {
ET_LOG(
Error,
"quantized_depthwise_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)",
static_cast<int>(buffer_bytes),
static_cast<int>(buffer_or_error.error()));
context.fail(buffer_or_error.error());
"quantized_depthwise_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)",
static_cast<int>(scratch.nbytes()),
static_cast<int>(runtime_buffer_bytes));
context.fail(Error::Internal);
return out;
}
cmsis_context.buf = buffer_or_error.get();
cmsis_context.size = buffer_bytes;

#endif
const arm_cmsis_nn_status status = arm_depthwise_conv_wrapper_s8(
&cmsis_context,
&dw_conv_params,
Expand Down
44 changes: 23 additions & 21 deletions backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
* Copyright 2026 Arm Limited and/or its affiliates.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
Expand Down Expand Up @@ -97,6 +98,8 @@ Tensor& quantized_transpose_conv2d_out(
const Tensor& requantize_shifts,
const int64_t activation_min,
const int64_t activation_max,
const Tensor& scratch,
const Tensor& output_scratch,
Tensor& out) {
if (!validate_transpose_conv2d_arguments(
context,
Expand Down Expand Up @@ -179,44 +182,43 @@ Tensor& quantized_transpose_conv2d_out(

cmsis_nn_context cmsis_context;
cmsis_context.buf = nullptr;
cmsis_context.size = 0;
cmsis_context.size = scratch.nbytes();
if (cmsis_context.size > 0) {
cmsis_context.buf = scratch.mutable_data_ptr<int8_t>();
}

cmsis_nn_context output_context;
output_context.buf = nullptr;
output_context.size = 0;

output_context.size = output_scratch.nbytes();
if (output_context.size > 0) {
output_context.buf = output_scratch.mutable_data_ptr<int8_t>();
}
#ifdef CORTEX_M_ENABLE_ASSERTS
const int32_t buffer_bytes = arm_transpose_conv_s8_get_buffer_size(
&transpose_conv_params, &input_dims, &filter_dims, &output_dims);
auto buffer_or_error = context.allocate_temp(
static_cast<size_t>(buffer_bytes), kCortexMMveAlignment);
if (!buffer_or_error.ok()) {
if (scratch.nbytes() != static_cast<size_t>(buffer_bytes)) {
ET_LOG(
Error,
"quantized_transpose_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)",
buffer_bytes,
static_cast<int>(buffer_or_error.error()));
context.fail(buffer_or_error.error());
"quantized_transpose_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)",
static_cast<int>(scratch.nbytes()),
buffer_bytes);
context.fail(Error::Internal);
return out;
}
cmsis_context.buf = buffer_or_error.get();
cmsis_context.size = buffer_bytes;

const int32_t output_buffer_bytes =
arm_transpose_conv_s8_get_reverse_conv_buffer_size(
&transpose_conv_params, &input_dims, &filter_dims);
auto output_buffer_or_error = context.allocate_temp(
static_cast<size_t>(output_buffer_bytes), kCortexMMveAlignment);
if (!output_buffer_or_error.ok()) {
if (output_scratch.nbytes() != static_cast<size_t>(output_buffer_bytes)) {
ET_LOG(
Error,
"quantized_transpose_conv2d_out: failed to allocate output scratch buffer (%d bytes, error %d)",
output_buffer_bytes,
static_cast<int>(output_buffer_or_error.error()));
context.fail(output_buffer_or_error.error());
"quantized_transpose_conv2d_out: output scratch buffer size incorrect - actual: (%d) needed: (%d)",
static_cast<int>(output_scratch.nbytes()),
output_buffer_bytes);
context.fail(Error::Internal);
return out;
}
output_context.buf = output_buffer_or_error.get();
output_context.size = output_buffer_bytes;
#endif

const arm_cmsis_nn_status status = arm_transpose_conv_wrapper_s8(
&cmsis_context,
Expand Down
Loading
Loading