Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .ci/docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ esac
TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt)
BUILD_DOCS=1

if [[ "${GCC_VERSION:-}" == "11" && -z "${SKIP_PYTORCH:-}" ]]; then
PYTORCH_BUILD_MAX_JOBS=6
fi

# Copy requirements-lintrunner.txt from root to here
cp ../../requirements-lintrunner.txt ./

Expand All @@ -109,6 +113,7 @@ docker build \
--build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
--build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
--build-arg "TORCH_VERSION=${TORCH_VERSION}" \
--build-arg "PYTORCH_BUILD_MAX_JOBS=${PYTORCH_BUILD_MAX_JOBS:-}" \
--build-arg "BUCK2_VERSION=${BUCK2_VERSION}" \
--build-arg "LINTRUNNER=${LINTRUNNER:-}" \
--build-arg "BUILD_DOCS=${BUILD_DOCS}" \
Expand Down
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/pytorch.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
release/2.11
release/2.12
3 changes: 3 additions & 0 deletions .ci/docker/common/install_cache.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ init_sccache() {
# This is the remote cache bucket
export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
export SCCACHE_S3_KEY_PREFIX=executorch
export SCCACHE_REGION=us-east-1
export AWS_REGION=us-east-1
export AWS_DEFAULT_REGION=us-east-1
export SCCACHE_IDLE_TIMEOUT=0
export SCCACHE_ERROR_LOG=/tmp/sccache_error.log
export RUST_LOG=sccache::server=error
Expand Down
8 changes: 7 additions & 1 deletion .ci/docker/common/install_pytorch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,20 @@ install_pytorch_and_domains() {
chown -R ci-user .

export _GLIBCXX_USE_CXX11_ABI=1
if [[ "$(uname -m)" == "aarch64" ]]; then
export BUILD_IGNORE_SVE_UNAVAILABLE=1
fi
if [[ -n "${PYTORCH_BUILD_MAX_JOBS:-}" ]]; then
export MAX_JOBS="${PYTORCH_BUILD_MAX_JOBS}"
fi
# Then build and install PyTorch
conda_run python setup.py bdist_wheel
pip_install "$(echo dist/*.whl)"

# Grab the pinned audio and vision commits from PyTorch
TORCHAUDIO_VERSION=release/2.11
export TORCHAUDIO_VERSION
TORCHVISION_VERSION=release/0.26
TORCHVISION_VERSION=release/0.27
export TORCHVISION_VERSION

install_domains
Expand Down
3 changes: 3 additions & 0 deletions .ci/docker/ubuntu/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,12 @@ RUN bash ./install_cache.sh && rm install_cache.sh utils.sh
ENV SCCACHE_BUCKET ossci-compiler-cache-circleci-v2
ENV SCCACHE_S3_KEY_PREFIX executorch
ENV SCCACHE_REGION us-east-1
ENV AWS_REGION us-east-1
ENV AWS_DEFAULT_REGION us-east-1

ARG TORCH_VERSION
ARG SKIP_PYTORCH
ARG PYTORCH_BUILD_MAX_JOBS
COPY ./common/install_pytorch.sh install_pytorch.sh
COPY ./common/utils.sh utils.sh
RUN if [ -z "${SKIP_PYTORCH}" ]; then bash ./install_pytorch.sh; fi && rm install_pytorch.sh utils.sh
Expand Down
7 changes: 5 additions & 2 deletions .ci/scripts/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ install_pytorch_and_domains() {
local torch_release=$(cat version.txt)
# Download key must match the upload key below (basename of dist/*.whl,
# which always carries setup.py's resolved +gitHASH). Branch-ref pins
# like `release/2.11` would otherwise produce `+gitrelease` here and
# like `release/2.12` would otherwise produce `+gitrelease` here and
# never hit the cache.
local torch_short_hash=$(git rev-parse --short=7 HEAD)
local torch_wheel_path="cached_artifacts/pytorch/executorch/pytorch_wheels/${system_name}/${python_version}"
Expand All @@ -132,6 +132,9 @@ install_pytorch_and_domains() {
# (e.g. executorch's requirements-ci.txt).
pip install -r requirements-build.txt
git submodule update --init --recursive
if [[ "$(uname -m)" == "aarch64" ]]; then
export BUILD_IGNORE_SVE_UNAVAILABLE=1
fi
USE_DISTRIBUTED=1 python setup.py bdist_wheel
pip install "$(echo dist/*.whl)"

Expand Down Expand Up @@ -175,7 +178,7 @@ install_pytorch_and_domains() {
# Grab the pinned audio and vision commits from PyTorch
TORCHAUDIO_VERSION=release/2.11
export TORCHAUDIO_VERSION
TORCHVISION_VERSION=release/0.26
TORCHVISION_VERSION=release/0.27
export TORCHVISION_VERSION

install_domains
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/mlx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,10 @@ jobs:
--prompt-len 4 \
--max-new-tokens 5 2>&1)
echo "$OUTPUT"
if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 167, 81, 167, 81\]"; then
if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 94, 253, 88, 227\]"; then
echo "Success: Qwen 3.5 MoE MLX export + inference completed with expected output"
else
echo "Failed: unexpected output (expected [167, 167, 81, 167, 81])"
echo "Failed: unexpected output (expected [167, 94, 253, 88, 227])"
exit 1
fi
echo "::endgroup::"
Expand Down
38 changes: 38 additions & 0 deletions backends/arm/_passes/arm_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
from abc import abstractmethod
from typing import Any, List, Optional, Set, Type

import torch
from executorch.backends.arm.constants import DISALLOW_TFA_META_KEY
from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue
from torch.fx import GraphModule
from torch.fx.passes.infra.pass_base import PassResult
from torch.utils import _pytree as pytree


class ArmPass(ExportPass):
Expand Down Expand Up @@ -79,6 +81,13 @@ def get_name(pass_) -> str:
)

def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False):
if (
op == exir_ops.edge.aten.bmm.default
and isinstance(meta, NodeMetadata)
and len(meta.data.get("input_qparams", {})) > 0
):
return self._call_quantized_bmm_without_fake_kernel(op, args, kwargs, meta)

if not updated:
return super().call_operator(op, args, kwargs, meta)

Expand All @@ -91,6 +100,35 @@ def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False)
new_meta["stack_trace"] = f"{old_stack_trace}\n{traceback.format_stack()[-2]}"
return super().call_operator(op, args, kwargs, NodeMetadata(new_meta))

def _call_quantized_bmm_without_fake_kernel(
    self,
    op,
    args: tuple[ProxyValue, ...],
    kwargs: dict[str, Any],
    meta: NodeMetadata,
) -> ProxyValue:
    """Emit a quantized ``bmm`` node directly through the tracer.

    The operator is not evaluated on the argument values. The result value
    is an uninitialized tensor created with ``torch.empty_like`` from
    ``meta.data["val"]``.

    Args:
        op: Operator recorded as the target of the new ``call_function`` node.
        args: Positional arguments; every ``ProxyValue`` is replaced by its
            ``.proxy`` before the node is created.
        kwargs: Keyword arguments, unwrapped the same way as ``args``.
        meta: Node metadata. ``meta.data["val"]`` must be present; the
            optional ``"output_qparams"`` mapping selects the result dtype.

    Returns:
        A ``ProxyValue`` pairing the placeholder tensor with the new proxy.
    """
    reference_val = meta.data["val"]
    out_qparams = meta.data.get("output_qparams", {})

    # Use the dtype of the first output qparams entry when there is one;
    # otherwise keep the dtype already recorded on the node value.
    if len(out_qparams) > 0:
        result_dtype = next(iter(out_qparams.values())).dtype
    else:
        result_dtype = reference_val.dtype
    placeholder = torch.empty_like(reference_val, dtype=result_dtype)

    def _unwrap(value):
        return value.proxy

    proxy_args, proxy_kwargs = pytree.tree_map_only(
        ProxyValue, _unwrap, (args, kwargs)
    )
    node_proxy = self.tracer.create_proxy(
        "call_function", op, proxy_args, proxy_kwargs
    )

    # Copy the incoming metadata onto the new node, then let the tracer
    # record the placeholder tensor for it.
    node_proxy.node.meta.update(meta.data)
    self.tracer.set_metadata(node_proxy.node, placeholder)
    return ProxyValue(placeholder, node_proxy)

def call_submodule(
self, graph_module: GraphModule, inputs: tuple[Any, ...]
) -> PassResult:
Expand Down
15 changes: 10 additions & 5 deletions backends/nxp/tests/generic_tests/test_per_channel_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,14 +169,19 @@ def test_per_channel_convolution(self, _, use_qat: bool):
atol=1.0,
)

nodes = list(exported_program.graph.nodes)

conv_nodes = [
node
for node in exported_program.graph.nodes
if node.target == exir_ops.edge.aten.convolution.default
]
assert len(conv_nodes) == 1

conv_node = conv_nodes[0]
assert (
nodes[8].target
conv_node.args[1].target
== exir_ops.edge.quantized_decomposed.dequantize_per_channel.default
)
assert (
nodes[9].target
conv_node.args[2].target
== exir_ops.edge.quantized_decomposed.dequantize_per_channel.default
)
assert nodes[10].target == exir_ops.edge.aten.convolution.default
14 changes: 7 additions & 7 deletions examples/models/llama3_2_vision/text_decoder/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,19 +181,19 @@ def get_example_kwarg_inputs(self):
return None

def get_dynamic_shapes(self):
batch_size = 1
static = torch.export.Dim.STATIC
dim_seq_len = torch.export.Dim("token_dim", min=1, max=self.max_seq_len)
# Hardcoding # of tiles to be 2. image tokens per tile is 1601.
if self.use_kv_cache:
dynamic_shapes = {
"tokens": {0: batch_size, 1: dim_seq_len},
"encoder_input": None,
"encoder_mask": {0: 1, 1: dim_seq_len, 2: None},
"mask": {0: batch_size, 1: dim_seq_len, 2: None},
"input_pos": {0: batch_size, 1: dim_seq_len},
"tokens": {0: static, 1: dim_seq_len},
"encoder_input": {0: static, 1: static, 2: static},
"encoder_mask": {0: static, 1: dim_seq_len, 2: static},
"mask": {0: static, 1: dim_seq_len, 2: static},
"input_pos": {0: static, 1: dim_seq_len},
}
else:
dynamic_shapes = {
"tokens": {0: batch_size, 1: dim_seq_len},
"tokens": {0: static, 1: dim_seq_len},
}
return dynamic_shapes
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ def test_llama3_2_text_decoder_aoti(self) -> None:
encoder,
model.get_example_inputs(),
kwargs=model.get_example_kwarg_inputs(),
dynamic_shapes=model.get_dynamic_shapes(),
strict=True,
)
with tempfile.TemporaryDirectory() as tmpdir:
Expand Down
4 changes: 2 additions & 2 deletions examples/models/parakeet/export_parakeet_tdt.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,8 +360,8 @@ def export_all(
preprocessor_wrapper,
(sample_audio, sample_length),
dynamic_shapes={
# min=1600 samples = 0.1 sec @ 16kHz, max aligned with encoder limit
"audio": {0: Dim("audio_len", min=1600, max=max_audio_samples)},
# min=1600 samples = 0.1 sec @ 16kHz, max aligned with encoder limit.
"audio": {0: Dim.AUTO(min=1600, max=max_audio_samples)},
"length": {},
},
strict=False,
Expand Down
8 changes: 3 additions & 5 deletions extension/llm/modules/attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,11 +302,9 @@ def false_fn(y):
k, v = calculate_kv(y)
else:
# Expecting the k, v returning here to be the same size of self.kv_cache
# In eager, we expect this predicate to specialize. In export, this will
# become a SymBool so it's not specialized.
k, v, cache_pos = torch.cond(
torch.isnan(y).all().item(), true_fn, false_fn, (y,)
)
# In eager, we expect this predicate to specialize. In export, keep it
# as a tensor predicate so AOTI does not introduce unbacked symbols.
k, v, cache_pos = torch.cond(torch.isnan(y).all(), true_fn, false_fn, (y,))
# Update key-value cache
self.kv_cache.k_cache.copy_(k)
self.kv_cache.v_cache.copy_(v)
Expand Down
4 changes: 2 additions & 2 deletions install_requirements.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def install_requirements(use_pytorch_nightly):
# Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
# that we don't need to set any version number there because they have already
# been installed on CI before this step, so pip won't reinstall them
("torch==2.11.0" if use_pytorch_nightly else "torch"),
("torch==2.12.0" if use_pytorch_nightly else "torch"),
]

# Install the requirements for core ExecuTorch package.
Expand Down Expand Up @@ -112,7 +112,7 @@ def install_optional_example_requirements(use_pytorch_nightly):

print("Installing torch domain libraries")
DOMAIN_LIBRARIES = [
("torchvision==0.26.0" if use_pytorch_nightly else "torchvision"),
("torchvision==0.27.0" if use_pytorch_nightly else "torchvision"),
("torchaudio==2.11.0" if use_pytorch_nightly else "torchaudio"),
]
# Then install domain libraries
Expand Down
79 changes: 63 additions & 16 deletions runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -325,41 +325,88 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
#define C10_HIP_HOST_DEVICE
#endif

#if defined(USE_ROCM)
// C10_WARP_SIZE is only allowed for device code.
// Host code _must_ use at::cuda::warp_size()
// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size().
// Host or device statically-sized arrays _must_ use either
// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
//
// HIP header used to define warpSize as a constexpr that was either 32 or 64
// depending on the target device, and then always set it to 64 for host code.
// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
// set it to something unreasonable to trigger obvious host code errors.

// For a time, that allowed C10_WARP_SIZE to be defined like so:
//
// #ifdef USE_ROCM
// #define C10_WARP_SIZE warpSize
// #else
// #define C10_WARP_SIZE 32
// #endif
//
// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
// We can now only use warpSize for C10_WARP_SIZE in device code and this is
// enforced by using __device__ in its definition. In host code where
// C10_WARP_SIZE was previously used as a compile-time constant, this will now
// cause a compile-time error.
//
// If an array was previously expected to be sized at compile-time using
// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
// C10_WARP_SIZE_LOWER_BOUND depending on the situation.
//
// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
// must now use at::cuda::warp_size() for the dynamic runtime query.
//
// Unfortunately, C10_WARP_SIZE has been public and available for both host and
// device since approximately 2019, so forcing it to be device-only would break
// existing code in the wild.
#if defined(USE_ROCM)
namespace at::cuda {
TORCH_CUDA_CPP_API int warp_size();
}
#ifdef __HIPCC__
static inline int __host__ C10_WARP_SIZE_INTERNAL() {
#if defined(__HIPCC__)
static __host__ inline int C10_WARP_SIZE_INTERNAL() {
return at::cuda::warp_size();
}

static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
// NOTE: __device__ C10_WARP_SIZE_INTERNAL
// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
// we can use constexpr. This matches prior behavior. We preserve this for
// backward compatibility instead of forcing old code to use dynamic warpSize
// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
// could expose where C10_WARP_SIZE was used incorrectly where the dynamic
// warpSize is not allowed.
#if defined(__SPIRV__)
static __device__ inline int C10_WARP_SIZE_INTERNAL() {
return warpSize;
}
#else // __SPIRV__
static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() {
#if defined(__GFX9__)
return 64;
#else // __GFX9__
return 32;
#endif // __GFX9__
}
#else // __HIPCC__
#endif // __SPIRV__
#if defined(__SPIRV__)
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 64
#elif defined(__GFX9__)
#define C10_WARP_SIZE_LOWER_BOUND 64
#define C10_WARP_SIZE_UPPER_BOUND 64
#else
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 32
#endif
#else // !__HIPCC__
static inline int C10_WARP_SIZE_INTERNAL() {
return at::cuda::warp_size();
}
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 64
#endif // __HIPCC__

#define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
#define C10_WARP_SIZE_STATIC 64

#else // defined(USE_ROCM)
#else // !USE_ROCM
#define C10_WARP_SIZE 32
#endif
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 32
#endif // USE_ROCM

#if defined(_MSC_VER) && _MSC_VER <= 1900
#define __func__ __FUNCTION__
Expand Down Expand Up @@ -629,7 +676,7 @@ __host__ __device__
// This macro is used to find older C++ compilers
// that don't support move optimization for return values.

#if (defined(__GNUC__) && __GNUC__ < 13) || \
#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \
(defined(__clang_major__) && __clang_major__ < 13)
#define C10_RETURN_MOVE_IF_OLD_COMPILER 1
#else
Expand Down
Loading
Loading