Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@ compile_commands.json
python/_task_interface*.so
python/_task_interface*.dylib
.claude/scheduled_tasks.lock

# Log files
*.log
profiling_logs_*/
8 changes: 7 additions & 1 deletion simpler_setup/build_runtimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def build_all(
raise

for platform in platforms:
arch, variant = parse_platform(platform)
arch, _ = parse_platform(platform)
runtimes = discover_runtimes(arch)

if not runtimes:
Expand All @@ -152,6 +152,12 @@ def build_all(
logger.error(f" Failed to build {platform}/{runtime_name}: {e}")
raise

# No device-side deployment step here. The dispatcher SO is uploaded
# into the main aicpu_scheduler at runtime, on the first
# DeviceRunner::ensure_binaries_loaded call, via
# LoadAicpuOp::BootstrapDispatcher (see src/common/host/load_aicpu_op.cpp
# and src/common/aicpu_dispatcher/aicpu_dispatcher.h for architecture).


def main():
parser = argparse.ArgumentParser(description="Pre-build runtime binaries for available platforms")
Expand Down
1 change: 1 addition & 0 deletions simpler_setup/runtime_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ def _compile_target(target: str) -> Path:
source_dirs,
build_dir=str(cache_dir),
output_dir=output_dir,
runtime_name=name,
)

logger.info("Compiling AICore, AICPU, Host in parallel...")
Expand Down
33 changes: 30 additions & 3 deletions simpler_setup/runtime_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,27 @@ def get_root_dir(self) -> str:
def get_binary_name(self) -> str:
return self._binary_name

def gen_cmake_args(self, include_dirs: list[str], source_dirs: list[str]) -> list[str]:
"""Generate CMake arguments list from toolchain args + custom directories."""
def gen_cmake_args(
    self,
    include_dirs: list[str],
    source_dirs: list[str],
    runtime_name: Optional[str] = None,
) -> list[str]:
    """Assemble the CMake command-line arguments for this target.

    The result starts with the toolchain's own arguments and is followed by
    ``-DCUSTOM_INCLUDE_DIRS`` and ``-DCUSTOM_SOURCE_DIRS``, each carrying the
    given directories as absolute paths joined with ``;``.

    Args:
        include_dirs: Extra include directories; made absolute before use.
        source_dirs: Extra source directories; made absolute before use.
        runtime_name: When not ``None``, forwarded to CMake as
            ``-DRUNTIME_NAME=<name>``. What the CMake project does with the
            value is up to the CMakeLists being configured.

    Returns:
        The argument list. ``--log-level=VERBOSE`` is appended last when the
        module logger is enabled for DEBUG.
    """
    include_list = ";".join(map(os.path.abspath, include_dirs))
    source_list = ";".join(map(os.path.abspath, source_dirs))

    custom_defines = [
        f"-DCUSTOM_INCLUDE_DIRS={include_list}",
        f"-DCUSTOM_SOURCE_DIRS={source_list}",
    ]
    # Concatenate rather than extend so the toolchain's list is not mutated.
    cmake_args = self.toolchain.get_cmake_args() + custom_defines

    if runtime_name is not None:
        cmake_args.append(f"-DRUNTIME_NAME={runtime_name}")

    # Mirror Python-side debug logging into CMake's own output verbosity.
    if logger.isEnabledFor(logging.DEBUG):
        cmake_args.append("--log-level=VERBOSE")

    return cmake_args
Expand Down Expand Up @@ -201,6 +214,7 @@ def compile(
source_dirs: list[str],
build_dir: Optional[str] = None,
output_dir: Optional[Union[str, Path]] = None,
runtime_name: Optional[str] = None,
) -> Union[bytes, Path]:
"""
Compile binary for the specified target platform.
Expand Down Expand Up @@ -231,7 +245,7 @@ def compile(
else:
raise ValueError(f"Invalid target platform: {target_platform}. Must be 'aicore', 'aicpu', or 'host'.")

cmake_args = target.gen_cmake_args(include_dirs, source_dirs)
cmake_args = target.gen_cmake_args(include_dirs, source_dirs, runtime_name=runtime_name)
cmake_source_dir = target.get_root_dir()
binary_name = target.get_binary_name()
platform = target_platform.upper()
Expand All @@ -249,6 +263,19 @@ def _build(actual_build_dir: str) -> Union[bytes, Path]:
od.mkdir(parents=True, exist_ok=True)
dest = od / binary_name
shutil.copy2(binary_path, dest)
# The AICPU dispatcher SO has a stable, runtime-invariant name.
# Host BootstrapDispatcher uploads it into the main aicpu_scheduler
# at process startup (no tar.gz / sudo), and the dispatcher
# self-deploys into /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/.
# Per-runtime AICPU kernel SOs (libaicpu_kernel.so) are uploaded
# by host at runtime via DeviceArgs.aicpu_so_bin and lazily
# loaded by the dispatcher.
dispatcher_name = "libsimpler_aicpu_dispatcher.so"
dispatcher_so = Path(actual_build_dir) / dispatcher_name
if dispatcher_so.is_file():
dest_dispatcher = od / dispatcher_name
shutil.copy2(dispatcher_so, dest_dispatcher)
subprocess.run(["strip", "-s", str(dest_dispatcher)], check=True)
return dest
else:
with open(binary_path, "rb") as f:
Expand Down
45 changes: 45 additions & 0 deletions src/a2a3/platform/onboard/aicpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface")
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include")
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm")
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher")
if(DEFINED CUSTOM_INCLUDE_DIRS)
foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS})
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}")
Expand Down Expand Up @@ -84,3 +85,47 @@ target_link_directories(aicpu_kernel

# Output name
set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel)

# Build dispatcher SO (direction 1: stable single dispatcher, runtime kernel
# uploaded at runtime). The dispatcher has NO runtime-specific code; it
# receives the per-runtime AICPU kernel SO bytes via DeviceArgs.aicpu_so_bin
# at Null phase, writes them to disk, dlopens, and dlsyms the inner
# simpler_aicpu_init / simpler_aicpu_exec symbols. Cache key is
# (aicpu_so_bin device address, aicpu_so_len) — different ChipWorker
# instances in the same process get separate cache entries, enabling
# single-process multi-runtime without firstCreatSo_-style locks.
#
# Output name is fixed ("simpler_aicpu_dispatcher"). Host bootstrap uploads
# this SO into /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/ at process
# startup via LoadAicpuOp::BootstrapDispatcher — no tar.gz, no sudo.
# Building per-runtime libaicpu_kernel.so stays in this same CMakeLists
# (aicpu_kernel target above).
# The dispatcher is built from a single source file that lives in the shared
# common/aicpu_dispatcher directory rather than in this platform directory.
set(AICPU_DISPATCHER_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp"
)
# Separate shared-library target, independent of the aicpu_kernel target above.
add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES})

# NOTE(review): -rdynamic is normally a link-time option; listed under
# compile options it may have no effect — confirm whether it was intended
# for the link step instead.
target_compile_options(aicpu_dispatcher
PRIVATE
-Wall
-Wextra
-rdynamic
-O3
-fPIC
-g
$<$<COMPILE_LANGUAGE:CXX>:-std=gnu++17>
)

# Same include roots as the rest of this CMakeLists (including any
# CUSTOM_INCLUDE_DIRS appended to CMAKE_CUSTOM_INCLUDE_DIRS), plus the
# headers under ASCEND_HOME_PATH.
target_include_directories(aicpu_dispatcher
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CUSTOM_INCLUDE_DIRS}
${ASCEND_HOME_PATH}/include
)

# libdl supplies dlopen/dlsym, which the dispatcher uses to load the
# per-runtime inner kernel SO (see the description preceding this target).
target_link_libraries(aicpu_dispatcher PRIVATE dl)

# Fixed output base name; presumably yields libsimpler_aicpu_dispatcher.so
# with the default shared-library prefix/suffix on Linux. --build-id asks the
# linker to embed a build ID in the resulting SO.
set_target_properties(aicpu_dispatcher PROPERTIES
LINK_FLAGS "-Wl,--build-id"
OUTPUT_NAME "simpler_aicpu_dispatcher"
)
43 changes: 16 additions & 27 deletions src/a2a3/platform/onboard/aicpu/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
#include "runtime.h"

// Run-wall capture: g_device_start_cycle is set once in
// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread
// of the multi-threaded DynTileFwkBackendKernelServer writes the converted
// simpler_aicpu_init (single-threaded launch); each thread
// of the multi-threaded simpler_aicpu_exec writes the converted
// (end - start) into KernelArgs.device_wall_ns on exit. Plain stores —
// last-writer-wins is fine for wall measurement (concurrent exiting threads'
// `my_end` values differ by µs, the final overwrite is within benchmark
Expand All @@ -35,27 +35,18 @@ static uint64_t g_device_start_cycle = 0;
// Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp)
extern "C" int aicpu_execute(Runtime *arg);

// Exported C entry point that only validates its argument: logs an error and
// returns -1 when `arg` is null; otherwise returns 0 without touching `arg`.
extern "C" __attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *arg) {
    if (arg != nullptr) {
        return 0;
    }

    LOG_ERROR("%s", "Invalid kernel arguments: null pointer");
    return -1;
}

/**
* AICPU kernel initialization entry point
* AICPU kernel initialization entry point.
*
* This function is called once during kernel initialization by the CANN
* runtime. It initializes logging and validates kernel arguments.
*
* Note: Function name is hardcoded in libaicpu_extend_kernels.so
* Called once by simpler_dispatcher in the Init phase. The dispatcher
* dlsym's "simpler_aicpu_init" inside this inner SO (an internal
* dispatcher↔inner protocol — independent of CANN's preinstalled
* libaicpu_extend_kernels contract, which only binds the dispatcher itself).
*
* @param arg Pointer to KernelArgs structure
* @return 0 on success, -1 on error
*/
extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServerInit(void *arg) {
extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *arg) {
init_log_switch();
if (arg == nullptr) {
LOG_ERROR("%s", "Invalid kernel arguments: null pointer");
Expand All @@ -67,7 +58,7 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer

// Init is launched single-threaded (block_dim=1), so the race-free spot
// to capture run start and reset the wall accumulator. Subsequent
// DynTileFwkBackendKernelServer threads stamp end on their way out, via
// simpler_aicpu_exec threads stamp end on their way out, via
// the device-resident 8-byte buffer addressed by device_wall_data_base.
g_device_start_cycle = get_sys_cnt_aicpu();
if (k_args->device_wall_data_base != 0) {
Expand All @@ -79,17 +70,15 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
}

/**
* AICPU kernel main execution entry point
*
* This is the main entry point for the AICPU runtime executor kernel.
* It extracts the Runtime from KernelArgs and delegates to AicpuExecute.
* AICPU kernel main execution entry point.
*
* Note: Function name is hardcoded in libaicpu_extend_kernels.so
* Called per-thread by simpler_dispatcher in the Run phase via dlsym
* "simpler_aicpu_exec" on the inner SO.
*
* @param arg Pointer to KernelArgs structure containing runtime_args
* @return 0 on success, non-zero on error
*/
extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServer(void *arg) {
extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *arg) {
if (arg == nullptr) {
LOG_ERROR("%s", "Invalid kernel arguments: null pointer");
return -1;
Expand Down Expand Up @@ -128,13 +117,13 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
return 0;
}

LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: Calling aicpu_execute with Runtime");
LOG_INFO_V0("%s", "simpler_aicpu_exec: Calling aicpu_execute with Runtime");
int rc = aicpu_execute(runtime);
if (rc != 0) {
LOG_ERROR("DynTileFwkBackendKernelServer: aicpu_execute failed with rc=%d", rc);
LOG_ERROR("simpler_aicpu_exec: aicpu_execute failed with rc=%d", rc);
return rc;
}
LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully");
LOG_INFO_V0("%s", "simpler_aicpu_exec: aicpu_execute completed successfully");

// Stamp end into the device_wall buffer (addressed via
// device_wall_data_base). Last-writer-wins across threads — wall
Expand Down
26 changes: 22 additions & 4 deletions src/a2a3/platform/onboard/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../c
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/worker")
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include")
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm")
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host")
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher")
if(DEFINED CUSTOM_INCLUDE_DIRS)
foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS})
list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}")
Expand Down Expand Up @@ -59,6 +61,10 @@ list(APPEND HOST_RUNTIME_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/dep_gen_collector.cpp"
)
# Add common/host sources (LoadAicpuOp)
list(APPEND HOST_RUNTIME_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp"
)
if(DEFINED CUSTOM_SOURCE_DIRS)
foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS})
file(GLOB_RECURSE DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c")
Expand Down Expand Up @@ -108,15 +114,21 @@ target_include_directories(host_runtime
${ASCEND_HOME_PATH}/include
${ASCEND_HOME_PATH}/pkg_inc
${ASCEND_HOME_PATH}/pkg_inc/runtime
# pkg_inc/runtime/runtime exposes rts_kernel.h + kernel.h (CANN 7.0+
# rtsLaunchCpuKernel API used by LoadAicpuOp).
${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime
${ASCEND_HOME_PATH}/pkg_inc/profiling
${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/asc/include
${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver
)

target_link_directories(host_runtime
PRIVATE
${ASCEND_HOME_PATH}/lib64
${ASCEND_HOME_PATH}/runtime/lib64
# Stable dispatcher basename. The dispatcher SO is bundled with the host
# runtime and resolved next to host_runtime.so via dladdr at runtime;
# LoadAicpuOp::BootstrapDispatcher uploads it (along with the per-runtime
# AICPU kernel SO bytes) into the main aicpu_scheduler at host process
# startup via libaicpu_extend_kernels — no tar.gz, no sudo.
target_compile_definitions(host_runtime PRIVATE
SIMPLER_AICPU_BASENAME="libsimpler_aicpu_dispatcher.so"
)

if(SIMPLER_ENABLE_PTO_SDMA_WORKSPACE)
Expand Down Expand Up @@ -156,4 +168,10 @@ if(SIMPLER_ENABLE_PTO_SDMA_WORKSPACE)
target_link_libraries(host_runtime PRIVATE nnopbase)
endif()

target_link_directories(host_runtime
PRIVATE
${ASCEND_HOME_PATH}/lib64
${ASCEND_HOME_PATH}/runtime/lib64
)

set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime")
Loading
Loading