diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index f6e39a63b92..840c72dd5ca 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -release/2.11 \ No newline at end of file +da74fecc24c85f9694061e961858303c44be4338 diff --git a/runtime/core/portable_type/c10/c10/util/BFloat16-math.h b/runtime/core/portable_type/c10/c10/util/BFloat16-math.h index 8291cd74481..bce89e8acd9 100644 --- a/runtime/core/portable_type/c10/c10/util/BFloat16-math.h +++ b/runtime/core/portable_type/c10/c10/util/BFloat16-math.h @@ -181,7 +181,7 @@ template < typename T, typename std::enable_if_t, int> = 0> inline T rsqrt(T a) { - return 1.0 / std::sqrt(float(a)); + return 1.0f / std::sqrt(float(a)); } template < typename T, diff --git a/runtime/core/portable_type/c10/c10/util/complex_math.h b/runtime/core/portable_type/c10/c10/util/complex_math.h index d369df50592..2b9bbea6c71 100644 --- a/runtime/core/portable_type/c10/c10/util/complex_math.h +++ b/runtime/core/portable_type/c10/c10/util/complex_math.h @@ -327,7 +327,7 @@ C10_HOST_DEVICE inline c10::complex atanh(const c10::complex& x) { template C10_HOST_DEVICE inline c10::complex log1p(const c10::complex& z) { #if defined(__APPLE__) || defined(__MACOSX) || defined(__CUDACC__) || \ - defined(__HIPCC__) + defined(__HIPCC__) || defined(__SYCL_DEVICE_ONLY__) // For Mac, the new implementation yielded a high relative error. Falling back // to the old version for now. // See https://github.com/numpy/numpy/pull/22611#issuecomment-1667945354 diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h index 63aa0d20d8e..cef99df3f56 100644 --- a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h +++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h @@ -325,41 +325,88 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; #define C10_HIP_HOST_DEVICE #endif -#if defined(USE_ROCM) // C10_WARP_SIZE is only allowed for device code. -// Host code _must_ use at::cuda::warp_size() +// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size(). +// Host or device statically-sized arrays _must_ use either +// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed. +// // HIP header used to define warpSize as a constexpr that was either 32 or 64 // depending on the target device, and then always set it to 64 for host code. -// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we -// set it to something unreasonable to trigger obvious host code errors. - +// For a time, that allowed C10_WARP_SIZE to be defined like so: +// +// #ifdef USE_ROCM +// #define C10_WARP_SIZE warpSize +// #else +// #define C10_WARP_SIZE 32 +// #endif +// +// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior. +// We can now only use warpSize for C10_WARP_SIZE in device code and this is +// enforced by using __device__ in its definition. In host code where +// C10_WARP_SIZE was previously used as a compile-time constant, this will now +// cause a compile-time error. +// +// If an array was previously expected to be sized at compile-time using +// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or +// C10_WARP_SIZE_LOWER_BOUND depending on the situation. +// +// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users +// must now use at::cuda::warp_size() for the dynamic runtime query. +// +// Unfortunately, C10_WARP_SIZE has been public and available for both host and +// device since approximately 2019, so forcing it to be device-only would break +// existing code in the wild. +#if defined(USE_ROCM) namespace at::cuda { TORCH_CUDA_CPP_API int warp_size(); } -#ifdef __HIPCC__ -static inline int __host__ C10_WARP_SIZE_INTERNAL() { +#if defined(__HIPCC__) +static __host__ inline int C10_WARP_SIZE_INTERNAL() { return at::cuda::warp_size(); } - -static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() { +// NOTE: __device__ C10_WARP_SIZE_INTERNAL +// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__, +// we can use constexpr. This matches prior behavior. We preserve this for +// backward compatibility instead of forcing old code to use dynamic warpSize +// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv +// could expose where C10_WARP_SIZE was used incorrectly where the dynamic +// warpSize is not allowed. +#if defined(__SPIRV__) +static __device__ inline int C10_WARP_SIZE_INTERNAL() { + return warpSize; +} +#else // __SPIRV__ +static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() { #if defined(__GFX9__) return 64; #else // __GFX9__ return 32; #endif // __GFX9__ } -#else // __HIPCC__ +#endif // __SPIRV__ +#if defined(__SPIRV__) +#define C10_WARP_SIZE_LOWER_BOUND 32 +#define C10_WARP_SIZE_UPPER_BOUND 64 +#elif defined(__GFX9__) +#define C10_WARP_SIZE_LOWER_BOUND 64 +#define C10_WARP_SIZE_UPPER_BOUND 64 +#else +#define C10_WARP_SIZE_LOWER_BOUND 32 +#define C10_WARP_SIZE_UPPER_BOUND 32 +#endif +#else // !__HIPCC__ static inline int C10_WARP_SIZE_INTERNAL() { return at::cuda::warp_size(); } +#define C10_WARP_SIZE_LOWER_BOUND 32 +#define C10_WARP_SIZE_UPPER_BOUND 64 #endif // __HIPCC__ - #define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL()) -#define C10_WARP_SIZE_STATIC 64 - -#else // defined(USE_ROCM) +#else // !USE_ROCM #define C10_WARP_SIZE 32 -#endif +#define C10_WARP_SIZE_LOWER_BOUND 32 +#define C10_WARP_SIZE_UPPER_BOUND 32 +#endif // USE_ROCM #if defined(_MSC_VER) && _MSC_VER <= 1900 #define __func__ __FUNCTION__ @@ -629,7 +676,7 @@ __host__ __device__ // This macro is used to find older C++ compilers // that don't support move optimization for return values. -#if (defined(__GNUC__) && __GNUC__ < 13) || \ +#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \ (defined(__clang_major__) && __clang_major__ < 13) #define C10_RETURN_MOVE_IF_OLD_COMPILER 1 #else diff --git a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h index 64479ba36f1..9aa08c265bd 100644 --- a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h +++ b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h @@ -12,7 +12,7 @@ #include #include -#if defined(__CUDACC__) && !defined(USE_ROCM) +#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702)) #include #endif @@ -46,7 +46,7 @@ struct alignas(2) BFloat16 { /* implicit */ inline C10_HOST_DEVICE BFloat16(float value); inline C10_HOST_DEVICE operator float() const; -#if defined(__CUDACC__) && !defined(USE_ROCM) +#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702)) inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value); explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const; #endif @@ -124,8 +124,9 @@ C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") /// Constructors inline C10_HOST_DEVICE BFloat16::BFloat16(float value) : -#if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \ - __CUDA_ARCH__ >= 800 +#if defined(__CUDACC__) && \ + (!defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 || \ + defined(USE_ROCM) && (TORCH_HIP_VERSION >= 702)) x(__bfloat16_as_ushort(__float2bfloat16(value))) #elif defined(__SYCL_DEVICE_ONLY__) && \ defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) @@ -139,7 +140,7 @@ inline C10_HOST_DEVICE BFloat16::BFloat16(float value) /// Implicit conversions inline C10_HOST_DEVICE BFloat16::operator float() const { -#if defined(__CUDACC__) && !defined(USE_ROCM) +#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702)) return __bfloat162float(*reinterpret_cast(&x)); #elif defined(__SYCL_DEVICE_ONLY__) && \ defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) @@ -149,7 +150,7 @@ inline C10_HOST_DEVICE BFloat16::operator float() const { #endif } -#if defined(__CUDACC__) && !defined(USE_ROCM) +#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702)) inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) { x = *reinterpret_cast(&value); } diff --git a/torch_pin.py b/torch_pin.py index 3575d9a376d..007df1aca89 100644 --- a/torch_pin.py +++ b/torch_pin.py @@ -1,2 +1,2 @@ TORCH_VERSION = "2.11.0" -# NIGHTLY_VERSION = "dev20260318" Temporarily pinning to stable release candidate. Revert https://github.com/pytorch/executorch/pull/18287 +NIGHTLY_VERSION = "dev20260517"