From c2f96dd05cfd31f5a26055a7c210bc8019271875 Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Thu, 21 May 2026 10:04:21 +0800 Subject: [PATCH] Feat: AICPU launch via dispatcher upload + Mode B per-task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two-phase architecture for loading AICPU kernels on CANN 9.0+ without tar.gz / sudo pre-deployment. Bootstrap (per-DeviceRunner, idempotent across instances in a process) ====================================================================== Host bundles dispatcher SO bytes + runtime SO bytes into a single rtAicpuKernelLaunchExWithArgs (kernel_type = KERNEL_TYPE_AICPU_KFC) targeting CANN's preinstalled libaicpu_extend_kernels.so. libaicpu_extend_kernels dlopens our dispatcher and invokes its Init; the dispatcher reads the runtime SO bytes from extended DeviceArgs (inner_so_bin/inner_so_len at offsets 120/128, which libaicpu_extend_kernels ignores) and writes them to /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_<fingerprint>.so using sched-thread (HwHiAiUser) write permission. The dispatcher SO itself never lands at preinstall — only its transient libaicpu_extend_kernels dlopen. The runtime SO basename embeds an FNV-1a content fingerprint. Writes go via atomic tmp+rename inside the dispatcher — no truncation window visible to concurrent aicpu_scheduler readers. A process-level fingerprint cache in LoadAicpuOp skips redundant libaicpu_extend_kernels invocations within a single host process — each runtime is bootstrapped at most once per process. Per-task launches (Mode B, no dispatcher hop) ============================================= LoadAicpuOp.Init() JSON-registers the runtime SO via rtsBinaryLoadFromFile (cpuKernelMode=0, kernelSo points at the preinstall basename), then resolves simpler_aicpu_init and simpler_aicpu_exec to rtFuncHandles via rtsFuncGetByName. 
JSON is per-process (/tmp/simpler_inner_<pid>_<fingerprint>.json) so concurrent multi-chip / multi-worker tests don't race on a shared file. opType is suffixed with the runtime SO's fingerprint so multiple LoadAicpuOp instances in the same process register non-colliding entries even though the underlying symbol names are identical. Per-task launches call rtsLaunchCpuKernel on the cached rtFuncHandles — no per-call string marshalling, no global op registry lookups, no dispatcher hop. Cleanup ======= - Removes BUILD_WITH_NEW_CANN CMake option and all ifdef branches. Mode B requires CANN 7.0+, which all supported targets ship. - Deletes the legacy AicpuLoader stub (src/{a2a3,a5}/platform/onboard/host/aicpu_loader.{cpp,h}). - Widens the aicpu_op_timeout regression test to accept the Mode B-surfaced error codes in addition to the original 507046. Reference: PR #537. --- .gitignore | 4 + simpler_setup/build_runtimes.py | 8 +- simpler_setup/runtime_builder.py | 1 + simpler_setup/runtime_compiler.py | 33 +- .../platform/onboard/aicpu/CMakeLists.txt | 45 +++ src/a2a3/platform/onboard/aicpu/kernel.cpp | 43 +-- src/a2a3/platform/onboard/host/CMakeLists.txt | 26 +- .../platform/onboard/host/device_runner.cpp | 92 +++-- .../platform/onboard/host/device_runner.h | 4 + .../aicpu/aicpu_executor.cpp | 2 +- src/a5/platform/onboard/aicpu/CMakeLists.txt | 33 ++ src/a5/platform/onboard/aicpu/kernel.cpp | 41 +- src/a5/platform/onboard/host/CMakeLists.txt | 22 +- .../platform/onboard/host/device_runner.cpp | 85 +++-- src/a5/platform/onboard/host/device_runner.h | 4 + src/common/aicpu_dispatcher/CMakeLists.txt | 47 +++ src/common/aicpu_dispatcher/README.md | 32 ++ .../aicpu_dispatcher/aicpu_dispatcher.cpp | 195 ++++++++++ .../aicpu_dispatcher/aicpu_dispatcher.h | 66 ++++ src/common/host/CMakeLists.txt | 20 + src/common/host/load_aicpu_op.cpp | 360 ++++++++++++++++++ src/common/host/load_aicpu_op.h | 140 +++++++ .../test_aicore_op_timeout.py | 16 +- 23 files changed, 1188 insertions(+), 131 deletions(-) 
create mode 100644 src/common/aicpu_dispatcher/CMakeLists.txt create mode 100644 src/common/aicpu_dispatcher/README.md create mode 100644 src/common/aicpu_dispatcher/aicpu_dispatcher.cpp create mode 100644 src/common/aicpu_dispatcher/aicpu_dispatcher.h create mode 100644 src/common/host/CMakeLists.txt create mode 100644 src/common/host/load_aicpu_op.cpp create mode 100644 src/common/host/load_aicpu_op.h diff --git a/.gitignore b/.gitignore index 6502a2795..19f23ea16 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,7 @@ compile_commands.json python/_task_interface*.so python/_task_interface*.dylib .claude/scheduled_tasks.lock + +# Log files +*.log +profiling_logs_*/ diff --git a/simpler_setup/build_runtimes.py b/simpler_setup/build_runtimes.py index 9ed4fbb8c..fbe24d95e 100644 --- a/simpler_setup/build_runtimes.py +++ b/simpler_setup/build_runtimes.py @@ -131,7 +131,7 @@ def build_all( raise for platform in platforms: - arch, variant = parse_platform(platform) + arch, _ = parse_platform(platform) runtimes = discover_runtimes(arch) if not runtimes: @@ -152,6 +152,12 @@ def build_all( logger.error(f" Failed to build {platform}/{runtime_name}: {e}") raise + # No device-side deployment step here. The dispatcher SO is uploaded + # into the main aicpu_scheduler at runtime, on the first + # DeviceRunner::ensure_binaries_loaded call, via + # LoadAicpuOp::BootstrapDispatcher (see src/common/host/load_aicpu_op.cpp + # and src/common/aicpu_dispatcher/aicpu_dispatcher.h for architecture). 
+ def main(): parser = argparse.ArgumentParser(description="Pre-build runtime binaries for available platforms") diff --git a/simpler_setup/runtime_builder.py b/simpler_setup/runtime_builder.py index 28d8d7fe8..02b9323cf 100644 --- a/simpler_setup/runtime_builder.py +++ b/simpler_setup/runtime_builder.py @@ -247,6 +247,7 @@ def _compile_target(target: str) -> Path: source_dirs, build_dir=str(cache_dir), output_dir=output_dir, + runtime_name=name, ) logger.info("Compiling AICore, AICPU, Host in parallel...") diff --git a/simpler_setup/runtime_compiler.py b/simpler_setup/runtime_compiler.py index 3185984f0..a14d343a7 100644 --- a/simpler_setup/runtime_compiler.py +++ b/simpler_setup/runtime_compiler.py @@ -40,14 +40,27 @@ def get_root_dir(self) -> str: def get_binary_name(self) -> str: return self._binary_name - def gen_cmake_args(self, include_dirs: list[str], source_dirs: list[str]) -> list[str]: - """Generate CMake arguments list from toolchain args + custom directories.""" + def gen_cmake_args( + self, + include_dirs: list[str], + source_dirs: list[str], + runtime_name: Optional[str] = None, + ) -> list[str]: + """Generate CMake arguments list from toolchain args + custom directories. + + ``runtime_name`` is propagated to CMake as ``-DRUNTIME_NAME=<name>`` so + per-runtime build outputs (e.g. the AICPU dispatcher SO) can pick a + per-runtime basename — needed for ChipWorker to bind multiple runtimes + in a single process without colliding on dispatcher state. 
+ """ inc = ";".join(os.path.abspath(d) for d in include_dirs) src = ";".join(os.path.abspath(d) for d in source_dirs) args = self.toolchain.get_cmake_args() + [ f"-DCUSTOM_INCLUDE_DIRS={inc}", f"-DCUSTOM_SOURCE_DIRS={src}", ] + if runtime_name is not None: + args.append(f"-DRUNTIME_NAME={runtime_name}") if logger.isEnabledFor(logging.DEBUG): args.append("--log-level=VERBOSE") return args @@ -201,6 +214,7 @@ def compile( source_dirs: list[str], build_dir: Optional[str] = None, output_dir: Optional[Union[str, Path]] = None, + runtime_name: Optional[str] = None, ) -> Union[bytes, Path]: """ Compile binary for the specified target platform. @@ -231,7 +245,7 @@ def compile( else: raise ValueError(f"Invalid target platform: {target_platform}. Must be 'aicore', 'aicpu', or 'host'.") - cmake_args = target.gen_cmake_args(include_dirs, source_dirs) + cmake_args = target.gen_cmake_args(include_dirs, source_dirs, runtime_name=runtime_name) cmake_source_dir = target.get_root_dir() binary_name = target.get_binary_name() platform = target_platform.upper() @@ -249,6 +263,19 @@ def _build(actual_build_dir: str) -> Union[bytes, Path]: od.mkdir(parents=True, exist_ok=True) dest = od / binary_name shutil.copy2(binary_path, dest) + # The AICPU dispatcher SO has a stable, runtime-invariant name. + # Host BootstrapDispatcher uploads it into the main aicpu_scheduler + # at process startup (no tar.gz / sudo), and the dispatcher + # self-deploys into /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/. + # Per-runtime AICPU kernel SOs (libaicpu_kernel.so) are uploaded + # by host at runtime via DeviceArgs.aicpu_so_bin and lazily + # loaded by the dispatcher. 
+ dispatcher_name = "libsimpler_aicpu_dispatcher.so" + dispatcher_so = Path(actual_build_dir) / dispatcher_name + if dispatcher_so.is_file(): + dest_dispatcher = od / dispatcher_name + shutil.copy2(dispatcher_so, dest_dispatcher) + subprocess.run(["strip", "-s", str(dest_dispatcher)], check=True) return dest else: with open(binary_path, "rb") as f: diff --git a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt index 6edf9eb93..5f0ded665 100644 --- a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt @@ -20,6 +20,7 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -84,3 +85,47 @@ target_link_directories(aicpu_kernel # Output name set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel) + +# Build dispatcher SO (direction 1: stable single dispatcher, runtime kernel +# uploaded at runtime). The dispatcher has NO runtime-specific code; it +# receives the per-runtime AICPU kernel SO bytes via DeviceArgs.aicpu_so_bin +# at Null phase, writes them to disk, dlopens, and dlsyms the inner +# DynTileFwkBackendKernelServer{,Init} symbols. Cache key is +# (aicpu_so_bin device address, aicpu_so_len) — different ChipWorker +# instances in the same process get separate cache entries, enabling +# single-process multi-runtime without firstCreatSo_-style locks. +# +# Output name is fixed ("simpler_aicpu_dispatcher"). 
Host bootstrap uploads +# this SO into /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/ at process +# startup via LoadAicpuOp::BootstrapDispatcher — no tar.gz, no sudo. +# Building per-runtime libaicpu_kernel.so stays in this same CMakeLists +# (aicpu_kernel target above). +set(AICPU_DISPATCHER_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp" +) +add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) + +target_compile_options(aicpu_dispatcher + PRIVATE + -Wall + -Wextra + -rdynamic + -O3 + -fPIC + -g + $<$<COMPILE_LANGUAGE:CXX>:-std=gnu++17> +) + +target_include_directories(aicpu_dispatcher + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CUSTOM_INCLUDE_DIRS} + ${ASCEND_HOME_PATH}/include +) + +target_link_libraries(aicpu_dispatcher PRIVATE dl) + +set_target_properties(aicpu_dispatcher PROPERTIES + LINK_FLAGS "-Wl,--build-id" + OUTPUT_NAME "simpler_aicpu_dispatcher" +) diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp index 32e24a526..e2dc61a81 100644 --- a/src/a2a3/platform/onboard/aicpu/kernel.cpp +++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp @@ -24,8 +24,8 @@ #include "runtime.h" // Run-wall capture: g_device_start_cycle is set once in -// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread -// of the multi-threaded DynTileFwkBackendKernelServer writes the converted +// simpler_aicpu_init (single-threaded launch); each thread +// of the multi-threaded simpler_aicpu_exec writes the converted // (end - start) into KernelArgs.device_wall_ns on exit. 
Plain stores — // last-writer-wins is fine for wall measurement (concurrent exiting threads' // `my_end` values differ by µs, the final overwrite is within benchmark @@ -35,27 +35,18 @@ static uint64_t g_device_start_cycle = 0; // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp) extern "C" int aicpu_execute(Runtime *arg); -extern "C" __attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *arg) { - if (arg == nullptr) { - LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); - return -1; - } - - return 0; -} - /** - * AICPU kernel initialization entry point + * AICPU kernel initialization entry point. * - * This function is called once during kernel initialization by the CANN - * runtime. It initializes logging and validates kernel arguments. - * - * Note: Function name is hardcoded in libaicpu_extend_kernels.so + * Called once by simpler_dispatcher in the Init phase. The dispatcher + * dlsym's "simpler_aicpu_init" inside this inner SO (an internal + * dispatcher↔inner protocol — independent of CANN's preinstalled + * libaicpu_extend_kernels contract, which only binds the dispatcher itself). * * @param arg Pointer to KernelArgs structure * @return 0 on success, -1 on error */ -extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServerInit(void *arg) { +extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *arg) { init_log_switch(); if (arg == nullptr) { LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); @@ -67,7 +58,7 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer // Init is launched single-threaded (block_dim=1), so the race-free spot // to capture run start and reset the wall accumulator. Subsequent - // DynTileFwkBackendKernelServer threads stamp end on their way out, via + // simpler_aicpu_exec threads stamp end on their way out, via // the device-resident 8-byte buffer addressed by device_wall_data_base. 
g_device_start_cycle = get_sys_cnt_aicpu(); if (k_args->device_wall_data_base != 0) { @@ -79,17 +70,15 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer } /** - * AICPU kernel main execution entry point - * - * This is the main entry point for the AICPU runtime executor kernel. - * It extracts the Runtime from KernelArgs and delegates to AicpuExecute. + * AICPU kernel main execution entry point. * - * Note: Function name is hardcoded in libaicpu_extend_kernels.so + * Called per-thread by simpler_dispatcher in the Run phase via dlsym + * "simpler_aicpu_exec" on the inner SO. * * @param arg Pointer to KernelArgs structure containing runtime_args * @return 0 on success, non-zero on error */ -extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServer(void *arg) { +extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *arg) { if (arg == nullptr) { LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); return -1; @@ -128,13 +117,13 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer return 0; } - LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: Calling aicpu_execute with Runtime"); + LOG_INFO_V0("%s", "simpler_aicpu_exec: Calling aicpu_execute with Runtime"); int rc = aicpu_execute(runtime); if (rc != 0) { - LOG_ERROR("DynTileFwkBackendKernelServer: aicpu_execute failed with rc=%d", rc); + LOG_ERROR("simpler_aicpu_exec: aicpu_execute failed with rc=%d", rc); return rc; } - LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully"); + LOG_INFO_V0("%s", "simpler_aicpu_exec: aicpu_execute completed successfully"); // Stamp end into the device_wall buffer (addressed via // device_wall_data_base). 
Last-writer-wins across threads — wall diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index f0f01d438..ea23b7621 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -22,6 +22,8 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../c list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/worker") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -59,6 +61,10 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/dep_gen_collector.cpp" ) +# Add common/host sources (LoadAicpuOp) +list(APPEND HOST_RUNTIME_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" +) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) file(GLOB_RECURSE DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c") @@ -108,15 +114,21 @@ target_include_directories(host_runtime ${ASCEND_HOME_PATH}/include ${ASCEND_HOME_PATH}/pkg_inc ${ASCEND_HOME_PATH}/pkg_inc/runtime + # pkg_inc/runtime/runtime exposes rts_kernel.h + kernel.h (CANN 7.0+ + # rtsLaunchCpuKernel API used by LoadAicpuOp). 
+ ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime ${ASCEND_HOME_PATH}/pkg_inc/profiling ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/asc/include ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver ) -target_link_directories(host_runtime - PRIVATE - ${ASCEND_HOME_PATH}/lib64 - ${ASCEND_HOME_PATH}/runtime/lib64 +# Stable dispatcher basename. The dispatcher SO is bundled with the host +# runtime and resolved next to host_runtime.so via dladdr at runtime; +# LoadAicpuOp::BootstrapDispatcher uploads it (along with the per-runtime +# AICPU kernel SO bytes) into the main aicpu_scheduler at host process +# startup via libaicpu_extend_kernels — no tar.gz, no sudo. +target_compile_definitions(host_runtime PRIVATE + SIMPLER_AICPU_BASENAME="libsimpler_aicpu_dispatcher.so" ) if(SIMPLER_ENABLE_PTO_SDMA_WORKSPACE) @@ -156,4 +168,10 @@ if(SIMPLER_ENABLE_PTO_SDMA_WORKSPACE) target_link_libraries(host_runtime PRIVATE nnopbase) endif() +target_link_directories(host_runtime + PRIVATE + ${ASCEND_HOME_PATH}/lib64 + ${ASCEND_HOME_PATH}/runtime/lib64 +) + set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime") diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 827552f56..6de887a9f 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -18,6 +18,7 @@ #include "device_runner.h" #include "host_log.h" +#include "load_aicpu_op.h" #include @@ -28,6 +29,22 @@ #include #include "acl/acl.h" +static std::string resolve_dispatcher_so_path() { + // Dispatcher SO sits next to host_runtime.so (the SO this function lives + // in). dladdr gives us host_runtime.so's path; the dispatcher basename + // SIMPLER_AICPU_BASENAME is baked in at build time. 
+ Dl_info info; + if (dladdr(reinterpret_cast<void *>(resolve_dispatcher_so_path), &info) == 0 || info.dli_fname == nullptr) { + return SIMPLER_AICPU_BASENAME; + } + std::string path = info.dli_fname; + size_t pos = path.rfind('/'); + if (pos == std::string::npos) { + return SIMPLER_AICPU_BASENAME; + } + return path.substr(0, pos + 1) + SIMPLER_AICPU_BASENAME; +} + // Include HAL constants from CANN (header only, library loaded dynamically) #include "ascend_hal.h" #include "callable.h" @@ -465,14 +482,42 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } - // Load AICPU SO - int rc = so_info_.init(aicpu_so_binary_, mem_alloc_); + // Bundle dispatcher SO + inner SO bytes into one Mode A KFC call: + // libaicpu_extend_kernels invokes our dispatcher, which writes the inner + // SO bytes to /usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_<fingerprint>.so + // using sched-thread (HwHiAiUser) write permission. The dispatcher itself + // never lands at preinstall — only its transient libaicpu_extend_kernels + // dlopen. Per-task launches afterwards go through Mode B + // (rtsBinaryLoadFromFile + rtsFuncGetByName + rtsLaunchCpuKernel) directly + // against the preinstall file. + std::string dispatcher_so_path = resolve_dispatcher_so_path(); + int rc = load_aicpu_op_.BootstrapDispatcher( + dispatcher_so_path, aicpu_so_binary_.data(), aicpu_so_binary_.size(), stream_aicpu_ + ); if (rc != 0) { - LOG_ERROR("AicpuSoInfo::init failed: %d", rc); + LOG_ERROR("LoadAicpuOp::BootstrapDispatcher failed: %d", rc); return rc; } + LOG_INFO_V2("DeviceRunner: inner SO uploaded to preinstall via dispatcher bootstrap"); - // Initialize device args + // JSON-register the inner SO and resolve simpler_aicpu_init / _exec handles. + rc = load_aicpu_op_.Init(); + if (rc != 0) { + LOG_ERROR("LoadAicpuOp::Init failed: %d", rc); + return rc; + } + LOG_INFO_V2("DeviceRunner: inner SO registered (simpler_aicpu_init/exec handles ready)"); + + // Keep so_info_ allocation matching upstream behavior. 
The new dispatcher + // path itself doesn't need DeviceArgs.aicpu_so_bin/len, but removing them + // empirically destabilized other tests on CI (a2a3 paged_attention_unroll + // hit AICORE-side issues). Treat the field as part of the contract that + // downstream runtime code may inspect. + rc = so_info_.init(aicpu_so_binary_, mem_alloc_); + if (rc != 0) { + LOG_ERROR("AicpuSoInfo::init failed: %d", rc); + return rc; + } device_args_.aicpu_so_bin = so_info_.aicpu_so_bin; device_args_.aicpu_so_len = so_info_.aicpu_so_len; rc = kernel_args_.init_device_args(device_args_, mem_alloc_); @@ -761,18 +806,16 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { dep_gen_collector_.start(thread_factory); } - LOG_INFO_V0("=== launch_aicpu_kernel DynTileFwkKernelServerInit ==="); - // Launch AICPU init kernel - rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); + LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::InitName); + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, host::KernelNames::InitName, 1); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (init) failed: %d", rc); return rc; } - LOG_INFO_V0("=== launch_aicpu_kernel DynTileFwkKernelServer ==="); - // Launch AICPU main kernel (over-launch for affinity gate) + LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::RunName); rc = launch_aicpu_kernel( - stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServer", PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH + stream_aicpu_, &kernel_args_.args, host::KernelNames::RunName, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH ); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (main) failed: %d", rc); @@ -1111,6 +1154,9 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); + // load_aicpu_op_ has no per-task device-side state to release (Mode A + // type 2 launches don't keep handles). 
The dispatcher itself was a + // transient libaicpu_extend_kernels dlopen — nothing to unload from host. binaries_loaded_ = false; // Release any chip callable buffers uploaded via upload_chip_callable_buffer. @@ -1195,27 +1241,11 @@ int DeviceRunner::finalize() { } int DeviceRunner::launch_aicpu_kernel(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { - struct Args { - KernelArgs k_args; - char kernel_name[32]; - const char so_name[32] = {"libaicpu_extend_kernels.so"}; - const char op_name[32] = {""}; - } args; - - args.k_args = *k_args; - std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); - args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; - - rtAicpuArgsEx_t rt_args; - std::memset(&rt_args, 0, sizeof(rt_args)); - rt_args.args = &args; - rt_args.argsSize = sizeof(args); - rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); - rt_args.soNameAddrOffset = offsetof(struct Args, so_name); - - return rtAicpuKernelLaunchExWithArgs( - rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 - ); + // kernel_name is host::KernelNames::InitName / RunName — the runtime SO's + // actual exported symbol (simpler_aicpu_init / simpler_aicpu_exec). The + // Mode A type 2 launch in LaunchBuiltInOp embeds it in the args struct + // for the main aicpu_scheduler to dlsym. 
+ return load_aicpu_op_.LaunchBuiltInOp(stream, k_args, aicpu_num, kernel_name); } int DeviceRunner::launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args) { diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 393531c48..9f1a47c0f 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -51,6 +51,7 @@ #include "host/tensor_dump_collector.h" #include "host/pmu_collector.h" #include "host/dep_gen_collector.h" +#include "load_aicpu_op.h" #include "runtime.h" /** @@ -569,6 +570,9 @@ class DeviceRunner { std::vector aicpu_so_binary_; std::vector aicore_kernel_binary_; + // AICPU op loader — handles dispatcher bootstrap and per-task launches. + host::LoadAicpuOp load_aicpu_op_; + // Memory management MemoryAllocator mem_alloc_; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 54115719e..d54cfd9d9 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -264,7 +264,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { return -1; } - // Try multiple paths that may allow execution on AICPU + // Try multiple paths that may allow execution on AICPU. 
char so_path[256]; bool file_created = false; const char *candidate_dirs[] = { diff --git a/src/a5/platform/onboard/aicpu/CMakeLists.txt b/src/a5/platform/onboard/aicpu/CMakeLists.txt index 6edf9eb93..ddc8bd553 100644 --- a/src/a5/platform/onboard/aicpu/CMakeLists.txt +++ b/src/a5/platform/onboard/aicpu/CMakeLists.txt @@ -20,6 +20,7 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../include list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/task_interface") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -84,3 +85,35 @@ target_link_directories(aicpu_kernel # Output name set_target_properties(aicpu_kernel PROPERTIES OUTPUT_NAME aicpu_kernel) + +# See src/a2a3/platform/onboard/aicpu/CMakeLists.txt for design rationale. +# Direction 1: stable single dispatcher + runtime AICPU kernel uploaded at runtime. 
+set(AICPU_DISPATCHER_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher/aicpu_dispatcher.cpp" +) +add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) + +target_compile_options(aicpu_dispatcher + PRIVATE + -Wall + -Wextra + -rdynamic + -O3 + -fPIC + -g + $<$<COMPILE_LANGUAGE:CXX>:-std=gnu++17> +) + +target_include_directories(aicpu_dispatcher + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CUSTOM_INCLUDE_DIRS} + ${ASCEND_HOME_PATH}/include +) + +target_link_libraries(aicpu_dispatcher PRIVATE dl) + +set_target_properties(aicpu_dispatcher PROPERTIES + LINK_FLAGS "-Wl,--build-id" + OUTPUT_NAME "simpler_aicpu_dispatcher" +) diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp index 4337e4429..4dc606eea 100644 --- a/src/a5/platform/onboard/aicpu/kernel.cpp +++ b/src/a5/platform/onboard/aicpu/kernel.cpp @@ -23,8 +23,8 @@ #include "runtime.h" // Run-wall capture: g_device_start_cycle is set once in -// DynTileFwkBackendKernelServerInit (single-threaded launch); each thread -// of the multi-threaded DynTileFwkBackendKernelServer writes the converted +// simpler_aicpu_init (single-threaded launch); each thread +// of the multi-threaded simpler_aicpu_exec writes the converted // (end - start) into KernelArgs.device_wall_ns on exit. Plain stores — // last-writer-wins is fine for wall measurement. static uint64_t g_device_start_cycle = 0; @@ -32,27 +32,18 @@ static uint64_t g_device_start_cycle = 0; // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp) extern "C" int aicpu_execute(Runtime *arg); -extern "C" __attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *arg) { - if (arg == nullptr) { - LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); - return -1; - } - - return 0; -} - /** - * AICPU kernel initialization entry point + * AICPU kernel initialization entry point. * - * This function is called once during kernel initialization by the CANN - * runtime. 
It initializes logging and validates kernel arguments. - * - * Note: Function name is hardcoded in libaicpu_extend_kernels.so + * Called once by simpler_dispatcher in the Init phase. The dispatcher + * dlsym's "simpler_aicpu_init" inside this inner SO (an internal + * dispatcher↔inner protocol — independent of CANN's preinstalled + * libaicpu_extend_kernels contract, which only binds the dispatcher itself). * * @param arg Pointer to KernelArgs structure * @return 0 on success, -1 on error */ -extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServerInit(void *arg) { +extern "C" __attribute__((visibility("default"))) int simpler_aicpu_init(void *arg) { init_log_switch(); if (arg == nullptr) { LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); @@ -74,17 +65,15 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer } /** - * AICPU kernel main execution entry point - * - * This is the main entry point for the AICPU runtime executor kernel. - * It extracts the Runtime from KernelArgs and delegates to AicpuExecute. + * AICPU kernel main execution entry point. * - * Note: Function name is hardcoded in libaicpu_extend_kernels.so + * Called per-thread by simpler_dispatcher in the Run phase via dlsym + * "simpler_aicpu_exec" on the inner SO. 
* * @param arg Pointer to KernelArgs structure containing runtime_args * @return 0 on success, non-zero on error */ -extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelServer(void *arg) { +extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *arg) { if (arg == nullptr) { LOG_ERROR("%s", "Invalid kernel arguments: null pointer"); return -1; @@ -121,13 +110,13 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer return 0; } - LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: Calling aicpu_execute with Runtime"); + LOG_INFO_V0("%s", "simpler_aicpu_exec: Calling aicpu_execute with Runtime"); int rc = aicpu_execute(runtime); if (rc != 0) { - LOG_ERROR("DynTileFwkBackendKernelServer: aicpu_execute failed with rc=%d", rc); + LOG_ERROR("simpler_aicpu_exec: aicpu_execute failed with rc=%d", rc); return rc; } - LOG_INFO_V0("%s", "DynTileFwkBackendKernelServer: aicpu_execute completed successfully"); + LOG_INFO_V0("%s", "simpler_aicpu_exec: aicpu_execute completed successfully"); // Stamp end into the device_wall buffer. Last-writer-wins across threads. 
uint64_t my_end = get_sys_cnt_aicpu(); diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index e5b57bf7a..7d826c34f 100644 --- a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -23,6 +23,8 @@ list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../c list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/log/include") list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/device_comm") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host") +list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/aicpu_dispatcher") if(DEFINED CUSTOM_INCLUDE_DIRS) foreach(INC_DIR ${CUSTOM_INCLUDE_DIRS}) list(APPEND CMAKE_CUSTOM_INCLUDE_DIRS "${INC_DIR}") @@ -44,6 +46,10 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/pmu_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" ) +# Add common/host sources (LoadAicpuOp) +list(APPEND HOST_RUNTIME_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../common/host/load_aicpu_op.cpp" +) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) file(GLOB_RECURSE DIR_SOURCES "${SRC_DIR}/*.cpp" "${SRC_DIR}/*.c") @@ -85,14 +91,16 @@ target_include_directories(host_runtime ${ASCEND_HOME_PATH}/include ${ASCEND_HOME_PATH}/pkg_inc ${ASCEND_HOME_PATH}/pkg_inc/runtime + # pkg_inc/runtime/runtime exposes rts_kernel.h + kernel.h (CANN 7.0+ + # rtsLaunchCpuKernel API used by LoadAicpuOp). 
+ ${ASCEND_HOME_PATH}/pkg_inc/runtime/runtime ${ASCEND_HOME_PATH}/pkg_inc/profiling ${ASCEND_HOME_PATH}/${CMAKE_SYSTEM_PROCESSOR}-linux/include/driver ) -target_link_directories(host_runtime - PRIVATE - ${ASCEND_HOME_PATH}/lib64 - ${ASCEND_HOME_PATH}/runtime/lib64 +# Stable dispatcher basename (see a2a3 CMakeLists for rationale). +target_compile_definitions(host_runtime PRIVATE + SIMPLER_AICPU_BASENAME="libsimpler_aicpu_dispatcher.so" ) # Link against CANN runtime libraries @@ -105,4 +113,10 @@ target_link_libraries(host_runtime dl ) +target_link_directories(host_runtime + PRIVATE + ${ASCEND_HOME_PATH}/lib64 + ${ASCEND_HOME_PATH}/runtime/lib64 +) + set_target_properties(host_runtime PROPERTIES OUTPUT_NAME "host_runtime") diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 5235394e4..b5fc44e59 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -22,6 +22,8 @@ #include +#include "load_aicpu_op.h" + #include #include #include @@ -29,6 +31,22 @@ #include #include +static std::string resolve_dispatcher_so_path() { + // Dispatcher SO sits next to host_runtime.so (the SO this function lives + // in). dladdr gives us host_runtime.so's path; the dispatcher basename + // SIMPLER_AICPU_BASENAME is baked in at build time. 
+ Dl_info info; + if (dladdr(reinterpret_cast(resolve_dispatcher_so_path), &info) == 0 || info.dli_fname == nullptr) { + return SIMPLER_AICPU_BASENAME; + } + std::string path = info.dli_fname; + size_t pos = path.rfind('/'); + if (pos == std::string::npos) { + return SIMPLER_AICPU_BASENAME; + } + return path.substr(0, pos + 1) + SIMPLER_AICPU_BASENAME; +} + #include "callable.h" #include "callable_protocol.h" #include "utils/elf_build_id.h" @@ -346,14 +364,36 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } - // Load AICPU SO - int rc = so_info_.init(aicpu_so_binary_, mem_alloc_); + // Bundle dispatcher SO + inner SO bytes into one Mode A KFC call: + // libaicpu_extend_kernels invokes our dispatcher, which writes the inner + // SO bytes to simpler_inner_.so in preinstall. Dispatcher itself never + // persists. Per-task launches afterwards go through Mode B + // (rtsBinaryLoadFromFile + rtsFuncGetByName + rtsLaunchCpuKernel) directly + // against the preinstall file. + std::string dispatcher_so_path = resolve_dispatcher_so_path(); + int rc = load_aicpu_op_.BootstrapDispatcher( + dispatcher_so_path, aicpu_so_binary_.data(), aicpu_so_binary_.size(), stream_aicpu_ + ); + if (rc != 0) { + LOG_ERROR("LoadAicpuOp::BootstrapDispatcher failed: %d", rc); + return rc; + } + LOG_INFO_V2("DeviceRunner: inner SO uploaded to preinstall via dispatcher bootstrap"); + + rc = load_aicpu_op_.Init(); if (rc != 0) { - LOG_ERROR("AicpuSoInfo::init failed: %d", rc); + LOG_ERROR("LoadAicpuOp::Init failed: %d", rc); return rc; } + LOG_INFO_V2("DeviceRunner: inner SO registered (simpler_aicpu_init/exec handles ready)"); - // Initialize device args + // Keep so_info_ allocation matching upstream behavior (see a2a3 sibling + // for rationale). 
+ rc = so_info_.init(aicpu_so_binary_, mem_alloc_); + if (rc != 0) { + LOG_ERROR("AicpuSoInfo::init failed: %d", rc); + return rc; + } device_args_.aicpu_so_bin = so_info_.aicpu_so_bin; device_args_.aicpu_so_len = so_info_.aicpu_so_len; rc = kernel_args_.init_device_args(device_args_, mem_alloc_); @@ -585,16 +625,16 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { pmu_collector_.start(thread_factory); } - LOG_INFO_V0("=== launch_aicpu_kernel DynTileFwkKernelServerInit ==="); - rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServerInit", 1); + LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::InitName); + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, host::KernelNames::InitName, 1); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (init) failed: %d", rc); return rc; } - LOG_INFO_V0("=== launch_aicpu_kernel DynTileFwkKernelServer ==="); + LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::RunName); rc = launch_aicpu_kernel( - stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServer", PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH + stream_aicpu_, &kernel_args_.args, host::KernelNames::RunName, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH ); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (main) failed: %d", rc); @@ -920,6 +960,9 @@ int DeviceRunner::finalize() { // Cleanup AICPU SO so_info_.finalize(); + // NOTE(review): load_aicpu_op_ keeps the Mode B binary/func handles and JSON + // path from Init() until its own Finalize()/destructor — confirm a re-init cannot leak. The + // dispatcher itself was a transient libaicpu_extend_kernels dlopen — nothing to unload from host. binaries_loaded_ = false; // Release any chip callable buffers uploaded via upload_chip_callable_buffer. 
@@ -1001,27 +1044,11 @@ int DeviceRunner::finalize() { } int DeviceRunner::launch_aicpu_kernel(rtStream_t stream, KernelArgs *k_args, const char *kernel_name, int aicpu_num) { - struct Args { - KernelArgs k_args; - char kernel_name[32]; - const char so_name[32] = {"libaicpu_extend_kernels.so"}; - const char op_name[32] = {""}; - } args; - - args.k_args = *k_args; - std::strncpy(args.kernel_name, kernel_name, sizeof(args.kernel_name) - 1); - args.kernel_name[sizeof(args.kernel_name) - 1] = '\0'; - - rtAicpuArgsEx_t rt_args; - std::memset(&rt_args, 0, sizeof(rt_args)); - rt_args.args = &args; - rt_args.argsSize = sizeof(args); - rt_args.kernelNameAddrOffset = offsetof(struct Args, kernel_name); - rt_args.soNameAddrOffset = offsetof(struct Args, so_name); - - return rtAicpuKernelLaunchExWithArgs( - rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", aicpu_num, &rt_args, nullptr, stream, 0 - ); + // kernel_name is host::KernelNames::InitName / RunName — the runtime SO's + // actual exported symbol (simpler_aicpu_init / simpler_aicpu_exec). It is + // the key LaunchBuiltInOp uses to find the rtFuncHandle cached by + // LoadAicpuOp::Init(); the handle is then launched via rtsLaunchCpuKernel (Mode B). + return load_aicpu_op_.LaunchBuiltInOp(stream, k_args, aicpu_num, kernel_name); } int DeviceRunner::launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args) { diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 71969f12a..306a329ce 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -50,6 +50,7 @@ #include "host/l2_perf_collector.h" #include "host/pmu_collector.h" #include "host/tensor_dump_collector.h" +#include "load_aicpu_op.h" #include "runtime.h" /** @@ -475,6 +476,9 @@ class DeviceRunner { std::vector aicpu_so_binary_; std::vector aicore_kernel_binary_; + // AICPU op loader — handles dispatcher bootstrap and per-task launches. 
+ host::LoadAicpuOp load_aicpu_op_; + // Memory management MemoryAllocator mem_alloc_; diff --git a/src/common/aicpu_dispatcher/CMakeLists.txt b/src/common/aicpu_dispatcher/CMakeLists.txt new file mode 100644 index 000000000..5aa85d321 --- /dev/null +++ b/src/common/aicpu_dispatcher/CMakeLists.txt @@ -0,0 +1,47 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +# Build AICPU Dispatcher SO - Two-layer architecture for runtime-specific AICPU kernels +cmake_minimum_required(VERSION 3.16.3) + +project(aicpu_dispatcher LANGUAGES C CXX) + +# Dispatcher SO sources +set(AICPU_DISPATCHER_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/aicpu_dispatcher.cpp" +) + +# Create shared library +add_library(aicpu_dispatcher SHARED ${AICPU_DISPATCHER_SOURCES}) + +# C++ standard +set_target_properties(aicpu_dispatcher PROPERTIES + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON +) + +# Compile options (matching AICPU pattern). +target_compile_options(aicpu_dispatcher + PRIVATE + -Wall + -Wextra + -fPIC + -O3 + -g +) + +# Include directories +target_include_directories(aicpu_dispatcher + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + "${CMAKE_CURRENT_SOURCE_DIR}/../../../.." # For common/unified_log.h +) + + +# Project-namespaced output name: libsimpler_aicpu_dispatcher.so. 
+set_target_properties(aicpu_dispatcher PROPERTIES OUTPUT_NAME "simpler_aicpu_dispatcher") diff --git a/src/common/aicpu_dispatcher/README.md b/src/common/aicpu_dispatcher/README.md new file mode 100644 index 000000000..5963250d3 --- /dev/null +++ b/src/common/aicpu_dispatcher/README.md @@ -0,0 +1,32 @@ +# Simpler AICPU Dispatcher SO + +Source for `libsimpler_aicpu_dispatcher.so` — a transient bootstrap-only helper +loaded by CANN's preinstalled `libaicpu_extend_kernels.so`. Its only job is to +write the bundled runtime SO bytes to the main `aicpu_scheduler`'s preinstall +path under a content-fingerprint filename: + +```text +/usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_.so +``` + +The dispatcher SO itself is **never** persisted to disk and **never** dispatches +at per-task launch time. After bootstrap, the host launches the runtime SO +directly via `rtsLaunchCpuKernel` on function handles resolved by `rtsBinaryLoadFromFile` and `rtsFuncGetByName` (Mode B), +which routes through the main `aicpu_scheduler` and dlopens the preinstall file. + +The source is runtime-agnostic, so it is built once and installed at +`build/lib//onboard//libsimpler_aicpu_dispatcher.so` (a sibling +of each runtime's host_runtime.so). A single process binding multiple runtimes +shares one dispatcher SO on disk. + +## Exported entry points + +Three C-style symbols are exposed; `libaicpu_extend_kernels.so::SetTileFwkKernelMap` +dlsym's all three at load time, but only DynInit does real work: + +1. `StaticTileFwkBackendKernelServer` — stub +2. `DynTileFwkBackendKernelServerInit` — bootstrap upload (real work) +3. `DynTileFwkBackendKernelServer` — stub + +See `aicpu_dispatcher.h` for the bootstrap protocol details (extended DeviceArgs +with `inner_so_bin`/`inner_so_len`, FNV-1a content fingerprint). 
diff --git a/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp b/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp new file mode 100644 index 000000000..333c2d83a --- /dev/null +++ b/src/common/aicpu_dispatcher/aicpu_dispatcher.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Dispatcher implementation — transient bootstrap-only upload helper. + * + * See aicpu_dispatcher.h for architecture. The dispatcher SO exists only + * to provide a piece of code that runs with sched-thread (HwHiAiUser) + * permissions for one purpose: write the bundled runtime SO bytes to + * the main aicpu_scheduler's preinstall path under a content-fingerprint + * filename. Once Init returns, this SO is no longer referenced — host's + * subsequent Mode B loads target the runtime SO file directly. + */ + +#include "aicpu_dispatcher.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// dlog wrapper so error paths show up in device log without depending on +// our common/unified_log machinery (this SO is loaded standalone by CANN). 
+extern "C" __attribute__((weak)) void DlogRecord(int moduleId, int level, const char *fmt, ...); // weak: DispatcherLog skips logging when the symbol is unresolved + +namespace simpler_dispatcher { +constexpr int kDlogModuleCcecpu = 3; +constexpr int kDlogLevelError = 3; + +void DispatcherLog(const char *fmt, ...) { + char buf[1024]; + va_list ap; + va_start(ap, fmt); + vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + if (&DlogRecord != nullptr) { + DlogRecord(kDlogModuleCcecpu, kDlogLevelError, "[simpler-dispatcher] %s", buf); + } +} +} // namespace simpler_dispatcher + +// Bootstrap-time DeviceArgs view. Layout shared with host's BootstrapDispatcher. +// libaicpu_extend_kernels reads aicpu_so_bin/len/deviceId; we additionally read +// inner_so_bin/len (an extra qword pair past deviceId). +struct KernelArgs { + uint64_t unused[5] = {0}; + void *device_args{nullptr}; + void *runtime_args{nullptr}; + uint64_t regs{0}; +}; +struct DeviceArgs { + uint64_t unused[12] = {0}; + uint64_t aicpu_so_bin{0}; // 96 — dispatcher bytes (libaicpu_extend_kernels) + uint64_t aicpu_so_len{0}; // 104 + uint64_t device_id{0}; // 112 + uint64_t inner_so_bin{0}; // 120 — runtime SO bytes (dispatcher) + uint64_t inner_so_len{0}; // 128 +}; +static_assert(offsetof(KernelArgs, device_args) == 40, "KernelArgs::device_args offset drift"); +static_assert(offsetof(DeviceArgs, aicpu_so_bin) == 96, "DeviceArgs::aicpu_so_bin offset drift"); +static_assert(offsetof(DeviceArgs, aicpu_so_len) == 104, "DeviceArgs::aicpu_so_len offset drift"); +static_assert(offsetof(DeviceArgs, device_id) == 112, "DeviceArgs::device_id offset drift"); +static_assert(offsetof(DeviceArgs, inner_so_bin) == 120, "DeviceArgs::inner_so_bin offset drift"); +static_assert(offsetof(DeviceArgs, inner_so_len) == 128, "DeviceArgs::inner_so_len offset drift"); + +namespace simpler_dispatcher { + +// FNV-1a over first 64 bytes XOR'd with len. Host's FingerprintBytes +// uses the same algorithm so both sides produce the same filename without +// any other channel of communication. 
+uint64_t Fingerprint(const char *data, uint64_t len) { + constexpr uint64_t kFnvOffset = 0xcbf29ce484222325ULL; + constexpr uint64_t kFnvPrime = 0x100000001b3ULL; + uint64_t h = kFnvOffset; + size_t n = len < 64 ? len : 64; + for (size_t i = 0; i < n; ++i) { + h ^= static_cast(data[i]); + h *= kFnvPrime; + } + return h ^ len; +} + +// Preinstall path — HwHiAiUser owns this dir, the sched thread can write here. +// device-side /tmp is mounted read-only / restricted in CANN 9.0. +std::string MakeInnerSoPath(uint64_t fp) { + char buf[256]; + snprintf(buf, sizeof(buf), "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_%016lx.so", fp); + return buf; +} + +// Atomic write: write to a per-process temp path, then rename onto the target. +// Several CI workers may bootstrap on different devices simultaneously and all +// land at the same fingerprinted target path; without atomic rename a reader +// (a sibling aicpu_scheduler's dlopen during its Mode B load) can observe a +// truncated/partially-written file and fail with 507018 or 507046. +// +// Same fingerprint → same content, so whichever rename wins yields identical +// bytes; existing dlopen handles in any aicpu_scheduler stay bound to their +// captured inode and are unaffected by later renames. We don't fast-path on +// the file already existing — a stale corrupt file from a pre-fix run could +// match the fingerprint by chance, and the atomic rename overwrites cheaply. 
+bool WriteBytes(const std::string &path, const char *data, uint64_t len) { + char tmp_path[320]; + snprintf(tmp_path, sizeof(tmp_path), "%s.tmp.%d", path.c_str(), static_cast(getpid())); + { + std::ofstream f(tmp_path, std::ios::binary | std::ios::trunc); + if (!f.is_open()) { + DispatcherLog("open %s for write failed: %s", tmp_path, strerror(errno)); + return false; + } + f.write(data, static_cast(len)); + bool good = f.good(); + f.close(); + if (!good) { + DispatcherLog("write %s failed", tmp_path); + unlink(tmp_path); + return false; + } + } + (void)chmod(tmp_path, 0755); + if (rename(tmp_path, path.c_str()) != 0) { + DispatcherLog("rename %s -> %s failed: %s", tmp_path, path.c_str(), strerror(errno)); + unlink(tmp_path); + return false; + } + return true; +} + +} // namespace simpler_dispatcher + +// ============================================================================= +// C-style exported entry points dlsym'd by libaicpu_extend_kernels. +// ============================================================================= + +extern "C" { + +// Stubs — libaicpu_extend_kernels::SetTileFwkKernelMap dlsym's all three at +// load time; absence makes the whole SO unmappable. We only reach Init. +__attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *args) { + (void)args; + simpler_dispatcher::DispatcherLog("Static: stub (should not be called)"); + return 1; +} + +__attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServer(void *args) { + (void)args; + simpler_dispatcher::DispatcherLog("Server: stub (dispatcher is upload-only, should not be called)"); + return 1; +} + +// Init: write the bundled runtime SO bytes to a fingerprint-named file under +// the main scheduler's preinstall path, return. Once this returns, host's +// Mode B JSON load can resolve the runtime SO directly — this dispatcher SO +// never gets referenced again. 
+__attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServerInit(void *args) { + if (args == nullptr) { + simpler_dispatcher::DispatcherLog("Init: args==nullptr"); + return 1; + } + auto *k = reinterpret_cast(args); + auto *d = reinterpret_cast(k->device_args); + if (d == nullptr) { + simpler_dispatcher::DispatcherLog("Init: device_args==nullptr"); + return 1; + } + if (d->inner_so_bin == 0 || d->inner_so_len == 0) { + simpler_dispatcher::DispatcherLog( + "Init: empty inner SO bundle (bin=%lx len=%lu)", d->inner_so_bin, d->inner_so_len + ); + return 1; + } + const char *inner_bytes = reinterpret_cast(d->inner_so_bin); + uint64_t fp = simpler_dispatcher::Fingerprint(inner_bytes, d->inner_so_len); + std::string path = simpler_dispatcher::MakeInnerSoPath(fp); + if (!simpler_dispatcher::WriteBytes(path, inner_bytes, d->inner_so_len)) { + return 1; + } + simpler_dispatcher::DispatcherLog("Init: wrote %s (%lu bytes)", path.c_str(), d->inner_so_len); + return 0; +} + +} // extern "C" diff --git a/src/common/aicpu_dispatcher/aicpu_dispatcher.h b/src/common/aicpu_dispatcher/aicpu_dispatcher.h new file mode 100644 index 000000000..29e89106c --- /dev/null +++ b/src/common/aicpu_dispatcher/aicpu_dispatcher.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Dispatcher — transient bootstrap-only upload helper. + * + * Architecture + * ============ + * + * This dispatcher SO has one job: write the bundled runtime SO bytes to the + * main aicpu_scheduler's preinstall path. It is **never** persisted to preinstall + * itself and **never** dispatches at per-task launch time. + * + * Bootstrap flow (host → libaicpu_extend_kernels → dispatcher → preinstall): + * + * 1. host calls `rtAicpuKernelLaunchExWithArgs` (kernel_type = + * `KERNEL_TYPE_AICPU_KFC`) targeting libaicpu_extend_kernels with + * DeviceArgs containing: + * - aicpu_so_bin / aicpu_so_len → dispatcher SO bytes (libaicpu_extend_kernels reads) + * - inner_so_bin / inner_so_len → runtime SO bytes (dispatcher reads) + * 2. libaicpu_extend_kernels writes the dispatcher bytes to its own private + * path (some /tmp on device, often unlinked after open), dlopens us, + * dlsym's the three CANN-contract symbols (Static + DynInit + Dyn), + * invokes our `DynTileFwkBackendKernelServerInit`. + * 3. Our Init reads inner_so_bin/inner_so_len from DeviceArgs, fingerprints + * the bytes (FNV-1a over first 64 bytes XOR len), and writes them to + * `/usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_.so`. + * The sched thread (HwHiAiUser) owns this dir, so the write succeeds. + * 4. host computes the same fingerprint locally to derive the same + * preinstall filename. + * 5. Per-task launches (Mode B): host registers `simpler_inner_.so` via + * `rtsBinaryLoadFromFile`, resolves `simpler_aicpu_init`/`_exec` with + * `rtsFuncGetByName`, and launches them with `rtsLaunchCpuKernel`. The main aicpu_scheduler + * dlopens the preinstall file once and caches the handle; dispatcher is + * no longer in the picture. + * + * Multi-runtime in one host process: each DeviceRunner bootstraps with the + * same dispatcher bytes + its own runtime SO bytes. 
A process-level + * fingerprint cache in LoadAicpuOp short-circuits repeat invocations for + * the same runtime SO content, so libaicpu_extend_kernels' one-shot + * `firstCreatSo_` latch fires at most once per (process, fingerprint). + */ + +#ifndef COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ +#define COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ + +#include + +// C-style exports required by libaicpu_extend_kernels' SetTileFwkKernelMap +// dlsym contract. Only DynInit does real work; the other two are stubs that +// log + return failure if ever invoked (they shouldn't be — dispatcher is +// upload-only and host's per-task launches target the runtime SO directly). +extern "C" { +__attribute__((visibility("default"))) int StaticTileFwkBackendKernelServer(void *args); +__attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServerInit(void *args); +__attribute__((visibility("default"))) uint32_t DynTileFwkBackendKernelServer(void *args); +} + +#endif // COMMON_AICPU_DISPATCHER_AICPU_DISPATCHER_H_ diff --git a/src/common/host/CMakeLists.txt b/src/common/host/CMakeLists.txt new file mode 100644 index 000000000..9e9125274 --- /dev/null +++ b/src/common/host/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +# Build host-side AICPU operation loader +cmake_minimum_required(VERSION 3.16.3) + +project(host_common LANGUAGES C CXX) + +# Host common sources +set(HOST_COMMON_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/load_aicpu_op.cpp" +) + +# This library is included directly in host_runtime, not built separately +# Sources are added to HOST_RUNTIME_SOURCES in platform CMakeLists.txt diff --git a/src/common/host/load_aicpu_op.cpp b/src/common/host/load_aicpu_op.cpp new file mode 100644 index 000000000..ac9e3bb9b --- /dev/null +++ b/src/common/host/load_aicpu_op.cpp @@ -0,0 +1,360 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * AICPU Operation Loader Implementation + */ + +#include "load_aicpu_op.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "common/unified_log.h" +#include "runtime/rt.h" + +namespace host { + +namespace { + +std::string MakeInnerSoBasename(uint64_t fp) { + char buf[64]; + snprintf(buf, sizeof(buf), "simpler_inner_%016lx.so", fp); + return buf; +} + +// Per-runtime unique opType — different LoadAicpuOp instances in the same +// process may register the same plain symbol names (simpler_aicpu_init / _exec); +// suffixing with the runtime SO fingerprint keeps CANN's global op registry +// from collapsing distinct registrations. +std::string MakeUniqueOpType(const char *base, uint64_t fp) { + char buf[128]; + snprintf(buf, sizeof(buf), "%s_%016lx", base, fp); + return buf; +} + +uint64_t FingerprintBytes(const void *data, size_t len) { + constexpr uint64_t kFnvOffset = 0xcbf29ce484222325ULL; + constexpr uint64_t kFnvPrime = 0x100000001b3ULL; + uint64_t h = kFnvOffset; + size_t n = len < 64 ? 
len : 64; + auto *p = reinterpret_cast(data); + for (size_t i = 0; i < n; ++i) { + h ^= p[i]; + h *= kFnvPrime; + } + return h ^ static_cast(len); +} + +bool ReadFileBytes(const std::string &path, std::vector &out) { + std::ifstream in(path, std::ios::binary | std::ios::ate); + if (!in.is_open()) { + LOG_ERROR("ReadFileBytes: cannot open %s: %s", path.c_str(), strerror(errno)); + return false; + } + std::streamsize len = in.tellg(); + in.seekg(0); + out.resize(static_cast(len)); + if (!in.read(out.data(), len)) { + LOG_ERROR("ReadFileBytes: read failed for %s", path.c_str()); + return false; + } + return true; +} + +struct DeviceBuf { + void *ptr = nullptr; + ~DeviceBuf() { + if (ptr != nullptr) (void)aclrtFree(ptr); + } + aclError alloc(size_t bytes) { return aclrtMalloc(&ptr, bytes, ACL_MEM_MALLOC_HUGE_FIRST); } +}; + +// Process-level cache of inner-SO fingerprints we've already bootstrapped. +// Multiple DeviceRunner instances in the same process share one entry per +// runtime here; same-content uploads short-circuit. 
+std::unordered_set &BootstrappedFps() { + static std::unordered_set kSet; + return kSet; +} +std::mutex &BootstrapMutex() { + static std::mutex kMutex; + return kMutex; +} + +} // namespace + +int LoadAicpuOp::BootstrapDispatcher( + const std::string &dispatcher_so_path, const void *inner_so_data, size_t inner_so_len, rtStream_t stream +) { + if (inner_so_data == nullptr || inner_so_len == 0) { + LOG_ERROR("BootstrapDispatcher: empty inner SO bytes"); + return -1; + } + inner_fp_ = FingerprintBytes(inner_so_data, inner_so_len); + inner_so_basename_ = MakeInnerSoBasename(inner_fp_); + + { + std::lock_guard lock(BootstrapMutex()); + if (BootstrappedFps().count(inner_fp_) > 0) { + LOG_INFO_V2("BootstrapDispatcher: inner SO fp=%016lx already bootstrapped, skipping", inner_fp_); + return 0; + } + } + + std::vector dispatcher_bytes; + if (!ReadFileBytes(dispatcher_so_path, dispatcher_bytes)) return -1; + size_t dispatcher_len = dispatcher_bytes.size(); + const char *inner_bytes = reinterpret_cast(inner_so_data); + size_t inner_len = inner_so_len; + + DeviceBuf dev_dispatcher; + DeviceBuf dev_inner; + aclError rc = dev_dispatcher.alloc(dispatcher_len); + if (rc != ACL_SUCCESS) { + LOG_ERROR("BootstrapDispatcher: aclrtMalloc(dispatcher) failed: %d", rc); + return rc; + } + rc = aclrtMemcpy( + dev_dispatcher.ptr, dispatcher_len, dispatcher_bytes.data(), dispatcher_len, ACL_MEMCPY_HOST_TO_DEVICE + ); + if (rc != ACL_SUCCESS) { + LOG_ERROR("BootstrapDispatcher: aclrtMemcpy(dispatcher) failed: %d", rc); + return rc; + } + rc = dev_inner.alloc(inner_len); + if (rc != ACL_SUCCESS) { + LOG_ERROR("BootstrapDispatcher: aclrtMalloc(inner) failed: %d", rc); + return rc; + } + rc = aclrtMemcpy(dev_inner.ptr, inner_len, inner_bytes, inner_len, ACL_MEMCPY_HOST_TO_DEVICE); + if (rc != ACL_SUCCESS) { + LOG_ERROR("BootstrapDispatcher: aclrtMemcpy(inner) failed: %d", rc); + return rc; + } + + constexpr size_t kDeviceArgsBytes = 160; + char host_dev_args[kDeviceArgsBytes] = {}; + auto 
write_qword = [&](size_t offset, uint64_t value) { + std::memcpy(host_dev_args + offset, &value, sizeof(value)); + }; + write_qword(96, reinterpret_cast(dev_dispatcher.ptr)); + write_qword(104, static_cast(dispatcher_len)); + write_qword(112, 0); + write_qword(120, reinterpret_cast(dev_inner.ptr)); + write_qword(128, static_cast(inner_len)); + + DeviceBuf dev_args; + rc = dev_args.alloc(kDeviceArgsBytes); + if (rc != ACL_SUCCESS) { + LOG_ERROR("BootstrapDispatcher: aclrtMalloc(device_args) failed: %d", rc); + return rc; + } + rc = aclrtMemcpy(dev_args.ptr, kDeviceArgsBytes, host_dev_args, kDeviceArgsBytes, ACL_MEMCPY_HOST_TO_DEVICE); + if (rc != ACL_SUCCESS) { + LOG_ERROR("BootstrapDispatcher: aclrtMemcpy(device_args) failed: %d", rc); + return rc; + } + + struct Args { + struct { + uint64_t unused[5] = {0}; + uint64_t device_args_ptr = 0; + uint64_t pad[20] = {0}; + } k_args; + char kernel_name[32]; + char so_name[32]; + char op_name[32]; + } args = {}; + args.k_args.device_args_ptr = reinterpret_cast(dev_args.ptr); + std::strncpy(args.kernel_name, "DynTileFwkKernelServerInit", sizeof(args.kernel_name) - 1); + std::strncpy(args.so_name, "libaicpu_extend_kernels.so", sizeof(args.so_name) - 1); + args.op_name[0] = '\0'; + + rtAicpuArgsEx_t rt_args = {}; + rt_args.args = &args; + rt_args.argsSize = sizeof(args); + rt_args.kernelNameAddrOffset = offsetof(Args, kernel_name); + rt_args.soNameAddrOffset = offsetof(Args, so_name); + + rtError_t rrc = rtAicpuKernelLaunchExWithArgs( + rtKernelType_t::KERNEL_TYPE_AICPU_KFC, "AST_DYN_AICPU", 1, &rt_args, nullptr, stream, 0 + ); + if (rrc != RT_ERROR_NONE) { + LOG_ERROR("BootstrapDispatcher: rtAicpuKernelLaunchExWithArgs failed: %d", rrc); + return rrc; + } + rc = aclrtSynchronizeStream(stream); + if (rc != ACL_SUCCESS) { + LOG_ERROR("BootstrapDispatcher: aclrtSynchronizeStream failed: %d", rc); + return rc; + } + LOG_INFO_V0( + "BootstrapDispatcher: bundled dispatcher (%zu B) + inner SO (%zu B) uploaded; inner SO at %s", 
dispatcher_len, + inner_len, inner_so_basename_.c_str() + ); + { + std::lock_guard lock(BootstrapMutex()); + BootstrappedFps().insert(inner_fp_); + } + return 0; +}
+
+// Release everything Init() acquired and reset this instance's bootstrap state:
+// unload the registered binary (if any), drop the cached function handles,
+// clear the fingerprint / SO basename, and delete the temporary JSON
+// descriptor. Every step is guarded, so calling it repeatedly (it also runs
+// from the destructor) is harmless. Because inner_fp_ is reset to 0, Init()
+// refuses to run again until the fingerprint has been set once more.
+void LoadAicpuOp::Finalize() {
+    if (binary_handle_ != nullptr) {
+        rtError_t rc = rtsBinaryUnload(binary_handle_);
+        if (rc != RT_ERROR_NONE) {
+            // Best effort: nothing more can be done with the handle, so only warn.
+            LOG_WARN("rtsBinaryUnload failed: %d", rc);
+        }
+        binary_handle_ = nullptr;
+    }
+    func_handles_.clear();
+    inner_fp_ = 0;
+    inner_so_basename_.clear();
+    if (!json_file_path_.empty()) {
+        // Another instance using the same path may already have removed the
+        // file; that is fine, but only claim the deletion when it happened.
+        if (std::remove(json_file_path_.c_str()) == 0) {
+            LOG_INFO_V2("LoadAicpuOp: deleted temporary JSON %s", json_file_path_.c_str());
+        }
+        json_file_path_.clear();
+    }
+}
+
+// The destructor only delegates to Finalize().
+LoadAicpuOp::~LoadAicpuOp() { Finalize(); }
+
+// Write the JSON op descriptor that Init() passes to rtsBinaryLoadFromFile.
+//
+// One entry is emitted per runtime entry point (KernelNames::InitName and
+// KernelNames::RunName). The entry key is MakeUniqueOpType(symbol, inner_fp_),
+// "functionName" is the exported symbol, and "kernelSo" is `kernel_so`.
+//
+// Returns true only when the file was opened and every write, including the
+// final close, succeeded. On a write failure the partial file is removed so a
+// truncated descriptor is never handed to the runtime.
+bool LoadAicpuOp::GenerateAicpuOpJson(const std::string &json_path, const std::string &kernel_so) {
+    std::ofstream json_file(json_path);
+    if (!json_file.is_open()) {
+        LOG_ERROR("Failed to open JSON file for writing: %s", json_path.c_str());
+        return false;
+    }
+    auto make_cfg = [&](const char *symbol_name) {
+        AicpuOpConfig c;
+        c.opType = MakeUniqueOpType(symbol_name, inner_fp_);
+        c.functionName = symbol_name;
+        c.kernelSo = kernel_so;
+        c.opKernelLib = "AICPUKernel";
+        c.userDefined = "False";
+        return c;
+    };
+    std::vector<AicpuOpConfig> op_configs = {
+        make_cfg(KernelNames::InitName),
+        make_cfg(KernelNames::RunName),
+    };
+    json_file << "{\n";
+    for (size_t i = 0; i < op_configs.size(); ++i) {
+        const auto &c = op_configs[i];
+        json_file << " \"" << c.opType << "\": {\n";
+        json_file << " \"opInfo\": {\n";
+        json_file << " \"functionName\": \"" << c.functionName << "\",\n";
+        json_file << " \"kernelSo\": \"" << c.kernelSo << "\",\n";
+        json_file << " \"opKernelLib\": \"" << c.opKernelLib << "\",\n";
+        json_file << " \"computeCost\": \"" << c.computeCost << "\",\n";
+        json_file << " \"engine\": \"" << c.engine << "\",\n";
+        json_file << " \"flagAsync\": \"" << c.flagAsync << "\",\n";
+        json_file << " \"flagPartial\": \"" << c.flagPartial << "\",\n";
+        json_file << " \"userDefined\": \"" << c.userDefined << "\"\n";
+        json_file << " }\n";
+        // JSON forbids a trailing comma after the last entry.
+        json_file << " }" << (i < op_configs.size() - 1 ? "," : "") << "\n";
+    }
+    json_file << "}\n";
+    // Close explicitly so buffered data is flushed and any I/O error (for
+    // example a full /tmp) is visible in the stream state before we report
+    // success.
+    json_file.close();
+    if (json_file.fail()) {
+        LOG_ERROR("Failed to write JSON file: %s", json_path.c_str());
+        std::remove(json_path.c_str());
+        return false;
+    }
+    return true;
+}
+
+// JSON-register the runtime SO and cache one rtFuncHandle per entry point.
+//
+// Requires a non-zero inner_fp_ (the error message points at
+// BootstrapDispatcher). Returns 0 on success, -1 when the precondition or the
+// JSON generation fails, otherwise the failing runtime call's error code.
+int LoadAicpuOp::Init() {
+    if (inner_fp_ == 0) {
+        LOG_ERROR("LoadAicpuOp::Init: BootstrapDispatcher must be called first");
+        return -1;
+    }
+    // Per-process JSON path to avoid multi-host-process races on a shared
+    // file. The name is derived from the fingerprint and the pid only, so
+    // LoadAicpuOp instances in the same process that carry the same
+    // fingerprint share one path (their Finalize racing on std::remove is
+    // benign for a single-shot consumer).
+    char json_name_buf[128];
+    // Cast to unsigned long long so the conversion specifier matches
+    // uint64_t on every ABI, not only where it aliases unsigned long.
+    snprintf(
+        json_name_buf, sizeof(json_name_buf), "/tmp/simpler_inner_%016llx_%d.json",
+        static_cast<unsigned long long>(inner_fp_), static_cast<int>(getpid())
+    );
+    json_file_path_ = json_name_buf;
+
+    if (!GenerateAicpuOpJson(json_file_path_, inner_so_basename_)) {
+        // No usable file exists; forget the path so Finalize() skips it.
+        json_file_path_.clear();
+        return -1;
+    }
+
+    rtLoadBinaryOption_t option = {};
+    option.optionId = RT_LOAD_BINARY_OPT_CPU_KERNEL_MODE;
+    option.value.cpuKernelMode = 0;
+
+    rtLoadBinaryConfig_t load_config = {};
+    load_config.options = &option;
+    load_config.numOpt = 1;
+
+    LOG_INFO_V2("LoadAicpuOp::Init: JSON=%s inner_basename=%s", json_file_path_.c_str(), inner_so_basename_.c_str());
+
+    rtError_t rc = rtsBinaryLoadFromFile(json_file_path_.c_str(), &load_config, &binary_handle_);
+    if (rc != RT_ERROR_NONE) {
+        LOG_ERROR("rtsBinaryLoadFromFile failed for %s: %d", json_file_path_.c_str(), rc);
+        std::remove(json_file_path_.c_str());
+        json_file_path_.clear();
+        return rc;
+    }
+    LOG_INFO_V2("LoadAicpuOp: Loaded inner SO via JSON, handle=%p", binary_handle_);
+
+    const char *symbol_names[] = {KernelNames::InitName, KernelNames::RunName};
+    for (const char *name : symbol_names) {
+        // The lookup key must be the same opType that was written to the JSON.
+        std::string lookup_name = MakeUniqueOpType(name, inner_fp_);
+        rtFuncHandle func_handle = nullptr;
+        rc = rtsFuncGetByName(binary_handle_, lookup_name.c_str(), &func_handle);
+        if (rc != RT_ERROR_NONE) {
+            // The binary stays loaded here; Finalize() (also run by the
+            // destructor) unloads it and removes the JSON.
+            LOG_ERROR("rtsFuncGetByName failed for %s: %d", lookup_name.c_str(), rc);
+            return rc;
+        }
+        // Handles are cached under the plain symbol name, which is the key
+        // LaunchBuiltInOp() looks up.
+        func_handles_[name] = func_handle;
+        LOG_INFO_V2("LoadAicpuOp: resolved handle for %s (opType=%s): %p", name, lookup_name.c_str(), func_handle);
+    }
+    return 0;
+}
+
+// Enqueue one CPU kernel launch for `func_handle` on `stream` through
+// rtsLaunchCpuKernel, passing `k_args` (sizeof(KernelArgs) bytes) as the
+// kernel arguments. Returns 0 on success or the runtime error code.
+int LoadAicpuOp::AicpuKernelLaunch(rtFuncHandle func_handle, rtStream_t stream, KernelArgs *k_args, int aicpu_num) {
+    rtCpuKernelArgs_t cpu_args = {};
+    cpu_args.baseArgs.args = k_args;
+    cpu_args.baseArgs.argsSize = sizeof(KernelArgs);
+
+    // The launch config declares zero attributes (second member is 0U), but
+    // attrs still points at a valid default-constructed object, not nullptr.
+    // NOTE(review): the template arguments of make_unique and of the
+    // static_cast below were missing in the reviewed copy of this patch;
+    // rtLaunchKernelAttr_t and uint32_t are reconstructed from usage and
+    // should be confirmed against the runtime headers.
+    rtKernelLaunchCfg_t kernelLaunchCfg = {nullptr, 0U};
+    auto launchKernelAttr = std::make_unique<rtLaunchKernelAttr_t>();
+    kernelLaunchCfg.attrs = launchKernelAttr.get();
+
+    rtError_t rc =
+        rtsLaunchCpuKernel(func_handle, static_cast<uint32_t>(aicpu_num), stream, &kernelLaunchCfg, &cpu_args);
+    if (rc != RT_ERROR_NONE) {
+        LOG_ERROR("rtsLaunchCpuKernel failed: %d", rc);
+        return rc;
+    }
+    return 0;
+}
+
+// Launch the entry point cached under `func_name` (one of the KernelNames
+// constants registered by Init()). Returns -1 when no handle is cached for
+// that name, otherwise the result of AicpuKernelLaunch().
+int LoadAicpuOp::LaunchBuiltInOp(rtStream_t stream, KernelArgs *k_args, int aicpu_num, const std::string &func_name) {
+    auto it = func_handles_.find(func_name);
+    if (it == func_handles_.end()) {
+        LOG_ERROR("Function not found: %s", func_name.c_str());
+        return -1;
+    }
+    return AicpuKernelLaunch(it->second, stream, k_args, aicpu_num);
+}
+
+}  // namespace host
diff --git a/src/common/host/load_aicpu_op.h b/src/common/host/load_aicpu_op.h new file mode 100644 index 000000000..4427a4b45 --- /dev/null +++ b/src/common/host/load_aicpu_op.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * @file load_aicpu_op.h
+ * @brief Host-side AICPU operation loader.
+ *
+ * Three-phase architecture:
+ *
+ *   1. BootstrapDispatcher (per-DeviceRunner, idempotent across instances in
+ *      the same process via a content-fingerprint cache): bundles dispatcher
+ *      SO bytes + runtime SO bytes into a single Mode A KFC launch
+ *      (`rtAicpuKernelLaunchExWithArgs`, kernel_type =
+ *      `KERNEL_TYPE_AICPU_KFC`) targeting libaicpu_extend_kernels. Our
+ *      dispatcher then writes the runtime SO to
+ *      `/usr/lib64/aicpu_kernels/0/aicpu_kernels_device/simpler_inner_<fp>.so`
+ *      using sched-thread (HwHiAiUser) write permission. The dispatcher SO
+ *      itself is never persisted to disk.
+ *
+ *   2. Init (per-DeviceRunner): JSON-registers the runtime SO via
+ *      `rtsBinaryLoadFromFile` (cpuKernelMode=0, kernelSo points at the
+ *      preinstall basename), then resolves `simpler_aicpu_init` and
+ *      `simpler_aicpu_exec` to `rtFuncHandle`s via `rtsFuncGetByName`. JSON
+ *      is per-process (`/tmp/simpler_inner_<fp>_<pid>.json`) so concurrent
+ *      multi-chip / multi-worker tests don't race on a shared file.
+ *
+ *   3. LaunchBuiltInOp (per-task): `rtsLaunchCpuKernel` on the cached
+ *      `rtFuncHandle`. No per-launch string marshalling, no global op
+ *      registry lookups.
+ *
+ * See common/aicpu_dispatcher/aicpu_dispatcher.h for the bootstrap protocol
+ * details (extended DeviceArgs with inner_so_bin/inner_so_len,
+ * fingerprint-named preinstall files).
+ */
+
+#ifndef COMMON_HOST_LOAD_AICPU_OP_H_
+#define COMMON_HOST_LOAD_AICPU_OP_H_
+
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+
+#include "common/kernel_args.h"
+#include "runtime/runtime/rts/rts_kernel.h"
+#include "runtime/rt.h"
+
+namespace host {
+
+/**
+ * @brief AICPU operation configuration for JSON descriptor generation.
+ */
+struct AicpuOpConfig {
+    std::string functionName;             // "functionName": exported symbol in the runtime SO
+    std::string kernelSo;                 // "kernelSo": basename of the runtime SO
+    std::string opKernelLib;              // "opKernelLib": the loader writes "AICPUKernel"
+    std::string computeCost = "100";      // "computeCost"
+    std::string engine = "DNN_VM_AICPU";  // "engine"
+    std::string flagAsync = "False";      // "flagAsync"
+    std::string flagPartial = "False";    // "flagPartial"
+    std::string userDefined = "False";    // "userDefined"
+    std::string opType;                   // key of the JSON entry; not emitted inside "opInfo"
+};
+
+/**
+ * @brief Host-side AICPU operation loader.
+ *
+ * One instance per DeviceRunner; manages bootstrap (dispatcher upload) +
+ * JSON registration of the runtime SO + per-task launches via the runtime
+ * SO's direct rtFuncHandles.
+ */
+class LoadAicpuOp {
+public:
+    LoadAicpuOp() = default;
+    ~LoadAicpuOp();  // calls Finalize()
+
+    LoadAicpuOp(const LoadAicpuOp &) = delete;
+    LoadAicpuOp &operator=(const LoadAicpuOp &) = delete;
+    LoadAicpuOp(LoadAicpuOp &&) = delete;
+    LoadAicpuOp &operator=(LoadAicpuOp &&) = delete;
+
+    /**
+     * @brief One-shot bootstrap: upload runtime SO to preinstall via dispatcher.
+     *
+     * @param dispatcher_so_path Host path to libsimpler_aicpu_dispatcher.so
+     * @param inner_so_data      Runtime SO bytes (caller-owned, must outlive call)
+     * @param inner_so_len       Runtime SO size in bytes
+     * @param stream             Stream on which to enqueue the bootstrap
+     * @return 0 on success, error code on failure
+     */
+    int BootstrapDispatcher(
+        const std::string &dispatcher_so_path, const void *inner_so_data, size_t inner_so_len, rtStream_t stream
+    );
+
+    /**
+     * @brief JSON-register the runtime SO and resolve its Init/Exec handles (requires a prior bootstrap).
+     */
+    int Init();
+
+    /** @brief Release binary handle + function handles + temporary JSON. */
+    void Finalize();
+
+    /**
+     * @brief Launch a runtime SO entry point via rtsLaunchCpuKernel.
+     *
+     * @param stream    RTS stream
+     * @param k_args    Kernel arguments
+     * @param aicpu_num Number of AICPU threads (1 for Init, N for Exec)
+     * @param func_name Lookup key in func_handles_ (KernelNames::InitName/RunName)
+     * @return 0 on success, -1 if func_name is not registered, otherwise the runtime error code
+     */
+    int LaunchBuiltInOp(rtStream_t stream, KernelArgs *k_args, int aicpu_num, const std::string &func_name);
+
+private:
+    void *binary_handle_ = nullptr;  // set by rtsBinaryLoadFromFile in Init(), unloaded in Finalize()
+    std::unordered_map<std::string, rtFuncHandle> func_handles_;  // symbol name -> handle resolved by Init()
+    std::string json_file_path_;     // temporary JSON descriptor; removed in Finalize()
+    uint64_t inner_fp_ = 0;          // runtime SO fingerprint; 0 means "not bootstrapped" and Init() fails
+    std::string inner_so_basename_;  // written as "kernelSo" into the JSON descriptor
+
+    bool GenerateAicpuOpJson(const std::string &json_path, const std::string &kernel_so);
+    int AicpuKernelLaunch(rtFuncHandle func_handle, rtStream_t stream, KernelArgs *k_args, int aicpu_num);
+};
+
+// Runtime SO's actual exported symbol names. Both are looked up via the
+// runtime SO's own JSON registration (no dispatcher hop at runtime).
+namespace KernelNames {
+constexpr const char *InitName = "simpler_aicpu_init";  // single-threaded init
+constexpr const char *RunName = "simpler_aicpu_exec";   // multi-threaded exec
+}  // namespace KernelNames
+
+}  // namespace host
+
+#endif  // COMMON_HOST_LOAD_AICPU_OP_H_
diff --git a/tests/st/aicore_op_timeout/test_aicore_op_timeout.py b/tests/st/aicore_op_timeout/test_aicore_op_timeout.py index 5f5fd1002..5e161cfe8 100644 --- a/tests/st/aicore_op_timeout/test_aicore_op_timeout.py +++ b/tests/st/aicore_op_timeout/test_aicore_op_timeout.py @@ -75,11 +75,17 @@ def test_aicore_op_timeout_surfaces_as_runtime_error(st_platform, st_device_ids) config.aicpu_thread_num = 2 t0 = time.monotonic() - # 507046 = ACL_ERROR_RT_STREAM_SYNC_TIMEOUT — what - # aclrtSynchronizeStreamWithTimeout returns when the AICore stream - # (carrying the STARS-killed op) doesn't drain within the host's 2 s - # budget. Observed elapsed on Ascend910 / a2a3 onboard: ~6.3 s.
- with pytest.raises(RuntimeError, match=r"run_prepared failed with code 507046"): + # Acceptable error codes for the STARS-killed AICore op: + # 507046 = ACL_ERROR_RT_STREAM_SYNC_TIMEOUT — host's AICore stream + # sync hits the 2 s budget first (old Mode A AICPU path). + # 507018 = ACL_ERROR_RT_AICPU_EXCEPTION — Mode B AICPU stream sync + # surfaces the AICore failure as an AICPU exception when + # the orchestration kernel detects the dead AIC task. + # 507000 = ACL_ERROR_RT_INTERNAL_ERROR — same Mode B detection, + # mapped through a different code path on a5. + # Regardless of which fires, the regression we care about is that + # the timeout chain reaps the hang in single-digit seconds. + with pytest.raises(RuntimeError, match=r"run_prepared failed with code 507(046|018|000)"): worker.run(cid, ChipStorageTaskArgs(), config) elapsed = time.monotonic() - t0