diff --git a/pyproject.toml b/pyproject.toml
index 959699f9..8c6fee88 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["scikit-build-core", "pybind11", "libclang"]
+requires = ["scikit-build-core", "pybind11", "libclang", "PyYAML"]
 build-backend = "scikit_build_core.build"
 
 [project]
diff --git a/scripts/generate_torch_ops.py b/scripts/generate_torch_ops.py
new file mode 100644
index 00000000..387698a0
--- /dev/null
+++ b/scripts/generate_torch_ops.py
@@ -0,0 +1,740 @@
+"""Generate InfiniOps PyTorch wrappers from ATen `native_functions.yaml`.
+
+For each op listed in `scripts/torch_ops.yaml`, this script finds the `.out`
+variant in PyTorch's `native_functions.yaml` (fetched on demand from the
+PyTorch GitHub release matching `_PYTORCH_VERSION`), parses its schema,
+and emits:
+
+  - `generated/base/<op>.h` — the InfiniOps base class
+    `class <Op> : public Operator<<Op>>`, with a constructor and pure-virtual
+    `operator()` mirroring the ATen schema.
+  - `generated/torch/<op>/<op>.h` and `<op>.cc` — the PyTorch backend
+    `Operator<<Op>, kDev, 8>` that calls `at::<op>_out(out, ...)`.
+  - `generated/torch_ops_metadata.json` — the kind (`unary` / `binary` /
+    `binary_alpha`) of every successfully-generated op, consumed by the
+    parametrized test suite.
+
+Slot 8 is the reserved convention for PyTorch backends; slots 0-7 are
+left for native or vendor implementations.  (The slot must also be > 0
+to side-step a partial-specialization-after-instantiation conflict with
+the primary template `Operator<<Op>>` instantiated at index 0.)
+
+The generated files are not committed; CMake regenerates them at configure
+time when `WITH_TORCH=ON`.
+"""
+
+import argparse
+import dataclasses
+import json
+import pathlib
+import re
+import shutil
+import sys
+import urllib.request
+
+import yaml
+
+_SCRIPTS_DIR = pathlib.Path(__file__).resolve().parent
+_REPO_ROOT = _SCRIPTS_DIR.parent
+_OPS_YAML_PATH = _SCRIPTS_DIR / "torch_ops.yaml"
+_BASE_DIR = _REPO_ROOT / "src" / "base"
+_GENERATED_DIR = _REPO_ROOT / "generated"
+_GENERATED_BASE_DIR = _GENERATED_DIR / "base"
+_GENERATED_TORCH_DIR = _GENERATED_DIR / "torch"
+_METADATA_PATH = _GENERATED_DIR / "torch_ops_metadata.json"
+
+# Reserved slot for PyTorch backends.  Native and vendor implementations
+# claim slots 0-7; PyTorch wrappers always live at 8.
+_PYTORCH_SLOT = 8
+
+# ATen uses symbolic names for some `int`/`float` defaults (e.g.
+# `reduction=Mean`).  Map them to C++ identifiers usable in a call.
+_ENUM_DEFAULTS = {
+    "Mean": "at::Reduction::Mean",
+    "Sum": "at::Reduction::Sum",
+    "Contiguous": "at::MemoryFormat::Contiguous",
+}
+
+# PyTorch release tag whose `native_functions.yaml` defines the schemas
+# we generate against.  Bump in lockstep with the minimum PyTorch version
+# the generated wrappers should target.
+_PYTORCH_VERSION = "v2.4.0"
+_ATEN_YAML_URL = (
+    f"https://raw.githubusercontent.com/pytorch/pytorch/{_PYTORCH_VERSION}"
+    "/aten/src/ATen/native/native_functions.yaml"
+)
+_ATEN_YAML_CACHE = (
+    _REPO_ROOT / "generated" / ".cache" / f"native_functions-{_PYTORCH_VERSION}.yaml"
+)
+
+# Order matches the device list in existing hand-written torch backends
+# (see `src/torch/add/add.cc`).
+_DEVICE_TYPES = (
+    "kCpu",
+    "kNvidia",
+    "kCambricon",
+    "kAscend",
+    "kMetax",
+    "kMoore",
+    "kIluvatar",
+    "kKunlun",
+    "kHygon",
+    "kQy",
+)
+
+# YAML scalar-type tokens → C++ types.  Reference types (e.g. `const Scalar&`)
+# are not used so the generated signatures match the existing hand-written
+# ones, which pass by value to keep pybind11 binding generation simple.
+_SCALAR_TYPE_MAP = {
+    # `at::Scalar` is implicitly constructible from `double`, so we expose
+    # scalars as `double` in the base class to keep it torch-independent.
+    "Scalar": "double",
+    "int": "int64_t",
+    "bool": "bool",
+    "float": "double",
+    # `SymInt` / `SymInt[]` exist for `torch.compile` internals; at runtime
+    # they're just `int64`/IntArrayRef.
+    "SymInt": "int64_t",
+    # `str` for required string params (e.g. `index_reduce.reduce`).
+    # `std::string` marshals through pybind11 cleanly and converts
+    # implicitly to ATen's `c10::string_view`.
+    "str": "std::string",
+}
+
+# `Dimname` overloads (named-tensor dim) are skipped — passing them
+# from Python to ATen requires a wrapper conversion through
+# `at::Dimname::fromSymbol(...)` that doesn't fit the cleanly-rendered
+# 1:1 arg model, and named tensors remain experimental in PyTorch.
+# The int-dim overload is always emitted alongside, so we lose nothing
+# user-visible.
+
+# Optional ATen types we hide from the user-facing API and pass as a
+# typed empty optional at the call site.  Covers the common "full
+# default" case for most reductions and activations.  We use a typed
+# `c10::optional<T>{}` rather than bare `at::nullopt` so the compiler
+# can disambiguate ops with multiple `_out` overloads (e.g. `clamp_out`
+# accepts both `optional<Scalar>` and `optional<Tensor>` for `min`/`max`).
+_NULLOPT_BY_TYPE = {
+    "Scalar?": "c10::optional<at::Scalar>{}",
+    "int?": "c10::optional<int64_t>{}",
+    "bool?": "c10::optional<bool>{}",
+    "float?": "c10::optional<double>{}",
+    "str?": "c10::optional<c10::string_view>{}",
+    "ScalarType?": "c10::optional<at::ScalarType>{}",
+    "MemoryFormat?": "c10::optional<at::MemoryFormat>{}",
+    "Layout?": "c10::optional<at::Layout>{}",
+    "Device?": "c10::optional<at::Device>{}",
+    "Generator?": "c10::optional<at::Generator>{}",
+    "Tensor?": "c10::optional<at::Tensor>{}",
+    "Tensor?[]": "c10::List<c10::optional<at::Tensor>>{}",
+    "int[]?": "c10::optional<at::IntArrayRef>{}",
+    "int[1]?": "c10::optional<at::IntArrayRef>{}",
+    "int[2]?": "c10::optional<at::IntArrayRef>{}",
+    "int[3]?": "c10::optional<at::IntArrayRef>{}",
+    "SymInt?": "c10::optional<int64_t>{}",
+    "SymInt[]?": "c10::optional<at::IntArrayRef>{}",
+    "SymInt[1]?": "c10::optional<at::IntArrayRef>{}",
+    "SymInt[2]?": "c10::optional<at::IntArrayRef>{}",
+    "SymInt[3]?": "c10::optional<at::IntArrayRef>{}",
+    "float[]?": "c10::optional<at::ArrayRef<double>>{}",
+}
+_HARDCODE_NULLOPT_TYPES = frozenset(_NULLOPT_BY_TYPE)
+
+
+@dataclasses.dataclass
+class Param:
+    name: str
+    aten_type: str
+    default: str | None
+    keyword_only: bool
+
+    @property
+    def is_tensor(self) -> bool:
+        # Real tensors only.  `Tensor?` is optional and falls through to
+        # the hidden-param path (substituted with `at::nullopt`).
+        return self.aten_type == "Tensor" or self.aten_type.startswith("Tensor(")
+
+    @property
+    def is_out(self) -> bool:
+        # Mutable tensors carry `!` in their alias annotation, e.g. `Tensor(a!)`.
+        return self.is_tensor and "!" in self.aten_type
+
+    @property
+    def is_hardcoded_nullopt(self) -> bool:
+        """If `True`, the param is omitted from the user-facing API and
+        passed as `at::nullopt` to ATen."""
+        return self.aten_type in _HARDCODE_NULLOPT_TYPES
+
+    @property
+    def is_hidden(self) -> bool:
+        """True if the param is omitted from the user-facing API.  Covers
+        hardcoded-nullopt plus `bool`s and `int`/`float`s with a numeric
+        default (typical for `keepdim`-style flags and `reduction`-style
+        enums).  Also hides `int[]`/`int[1]` with a `[]` default (empty
+        dim list means "all dims" for reductions like `amax`).  `Scalar`
+        defaults are kept visible so ops like `sub(..., alpha=1)` expose
+        `alpha` meaningfully."""
+        if self.is_hardcoded_nullopt:
+            return True
+        if self.aten_type == "bool" and self.default in {"False", "True"}:
+            return True
+        if self.aten_type in {"int", "float", "SymInt"} and self.default is not None:
+            return True
+        if (
+            self.aten_type.startswith("int[") or self.aten_type.startswith("SymInt[")
+        ) and self.default is not None:
+            return True
+        if self.aten_type == "str" and self.default is not None:
+            return True
+        return False
+
+    def hidden_value(self) -> str:
+        """C++ literal substituted for a hidden param in the ATen call."""
+        if self.is_hardcoded_nullopt:
+            return _NULLOPT_BY_TYPE[self.aten_type]
+        if self.default == "True":
+            return "true"
+        if self.default == "False":
+            return "false"
+        if self.aten_type.startswith(("int[", "SymInt[")) and self.default is not None:
+            # `int[N]=[a, b, c]` → `{a, b, c}`; `int[N]=0` (scalar default
+            # for list type) → `{0, 0, ...}` replicated to size N.
+            if self.default.startswith("["):
+                return "{" + self.default[1:-1] + "}"
+            size_match = re.search(r"\[(\d+)\]", self.aten_type)
+            n = int(size_match.group(1)) if size_match else 1
+            return "{" + ", ".join([self.default] * n) + "}"
+        if self.aten_type == "str" and self.default is not None:
+            # YAML uses single-quoted strings (e.g. `'none'`); C++ char
+            # literals also use single quotes, so swap to doubles.
+            return '"' + self.default.strip("'\"") + '"'
+        if self.aten_type in {"int", "float", "SymInt"} and self.default is not None:
+            # Translate known ATen enum defaults to their C++ identifiers.
+            return _ENUM_DEFAULTS.get(self.default, self.default)
+        raise AssertionError(
+            f"param {self.name!r} of type {self.aten_type!r} with default "
+            f"{self.default!r} is not hidden"
+        )
+
+    @property
+    def cpp_type(self) -> str:
+        if self.is_tensor:
+            # `Tensor[]` / `Tensor(a!)[]` would need `std::vector<Tensor>` and a
+            # different ATen call shape — not yet supported, so reject so the
+            # whole overload gets skipped instead of emitting code that calls
+            # `at::<op>_out(at::Tensor, ...)` against an `at::TensorList`
+            # signature.
+            if self.aten_type.endswith("[]"):
+                raise NotImplementedError(
+                    f"`Tensor[]` param {self.name!r} not supported yet"
+                )
+            return "Tensor"
+        if self.is_hidden:
+            # Not exposed — the ATen call substitutes a hardcoded value
+            # so the `cpp_type` is irrelevant.
+            return "void"
+        bare = self.aten_type.rstrip("?")
+        # Required `int[N]` / `SymInt[N]` (no default) — pybind11 accepts
+        # a Python list of ints into `std::vector<int64_t>`, which ATen
+        # promotes to `IntArrayRef` implicitly.
+        if bare.startswith(("int[", "SymInt[")) or bare in {"int[]", "SymInt[]"}:
+            return "std::vector<int64_t>"
+        try:
+            return _SCALAR_TYPE_MAP[bare]
+        except KeyError as exc:
+            raise NotImplementedError(
+                f"unsupported ATen type {self.aten_type!r} for param {self.name!r}"
+            ) from exc
+
+
+@dataclasses.dataclass
+class Op:
+    aten_name: str
+    overload: str
+    params: list[Param]
+
+    @property
+    def pascal_name(self) -> str:
+        return _snake_to_pascal(self.infini_name)
+
+    @property
+    def infini_name(self) -> str:
+        """InfiniOps op name.  Includes the overload to disambiguate
+        between schemas of the same ATen op
+        (e.g. `pow.Tensor_Tensor_out` → `pow_tensor_tensor`,
+        `pow.Tensor_Scalar_out` → `pow_tensor_scalar`,
+        `div.out_mode` → `div_mode`).  The `out` suffix/prefix used by
+        ATen to disambiguate the out-variant carries no semantic info
+        and is stripped."""
+        suffix = self.overload
+        suffix = suffix.removesuffix("_out").removeprefix("out_")
+        if suffix and suffix != "out":
+            return f"{self.aten_name}_{suffix.lower()}"
+        return self.aten_name
+
+    @property
+    def tensor_params(self) -> list[Param]:
+        return [p for p in self.params if p.is_tensor]
+
+    @property
+    def out_params(self) -> list[Param]:
+        """Mutable tensor outputs.  Most ops have one (`Tensor(a!) out`);
+        multi-output ops like `frexp` or `sort` have several
+        (`Tensor(a!) values`, `Tensor(b!) indices`)."""
+        return [p for p in self.params if p.is_out]
+
+    @property
+    def out_param(self) -> Param:
+        """Single-output convenience.  Asserts there's exactly one."""
+        outs = self.out_params
+        assert len(outs) == 1, f"op {self.aten_name!r} has {len(outs)} out tensors"
+        return outs[0]
+
+    @property
+    def visible_params(self) -> list[Param]:
+        """Params the wrapper exposes to the user; hidden ones (hardcoded
+        optional nullopt, default-`False`/`True` bools) are filtered."""
+        return [p for p in self.params if not p.is_hidden]
+
+    @property
+    def is_testable(self) -> bool:
+        """Cheap structural check: at least one out tensor, and the first
+        constructor parameter is a tensor.  The latter is needed because
+        `Operator::Make(Tensor tensor, Args... args)` dispatches on
+        `tensor.device()`, so an op like `pow.Scalar_out(Scalar self,
+        Tensor exponent, *, Tensor(a!) out)` cannot be wired up without
+        a separate dispatch path.  Generators like `arange` / `linspace`
+        also fall under this rule (no input tensors at all)."""
+        if not self.out_params:
+            return False
+        # `params` includes out tensors at the end; check the first
+        # non-out param.  If there are no non-out params (`empty.out`,
+        # `arange.out`), this op also fails the dispatch precondition.
+        non_out = [p for p in self.params if not p.is_out]
+        if not non_out:
+            return False
+        return non_out[0].is_tensor
+
+
+_FUNC_RE = re.compile(
+    r"^(?P<name>[a-zA-Z_][a-zA-Z0-9_]*)"
+    r"(?:\.(?P<overload>\w+))?"
+    r"\((?P<args>.*)\)\s*->\s*.+$"
+)
+
+_ARG_RE = re.compile(
+    r"^(?P<type>\S+(?:\([^)]*\))?\??)"  # type with optional alias and `?`
+    r"\s+(?P<name>\w+)"
+    r"(?:\s*=\s*(?P<default>.+))?$"
+)
+
+
+def _parse_func(func_str: str) -> Op:
+    m = _FUNC_RE.match(func_str)
+    if not m:
+        raise ValueError(f"could not parse func: {func_str!r}")
+    return Op(
+        aten_name=m.group("name"),
+        overload=m.group("overload") or "",
+        params=_parse_args(m.group("args")),
+    )
+
+
+def _parse_args(args_str: str) -> list[Param]:
+    params: list[Param] = []
+    keyword_only = False
+    for token in _split_args(args_str):
+        if token == "*":
+            keyword_only = True
+            continue
+        params.append(_parse_one_arg(token, keyword_only))
+    return params
+
+
+def _split_args(args_str: str) -> list[str]:
+    """Split on top-level commas, respecting `(...)` and `[...]`."""
+    parts: list[str] = []
+    depth = 0
+    current: list[str] = []
+    for ch in args_str:
+        if ch in "([":
+            depth += 1
+            current.append(ch)
+        elif ch in ")]":
+            depth -= 1
+            current.append(ch)
+        elif ch == "," and depth == 0:
+            piece = "".join(current).strip()
+            if piece:
+                parts.append(piece)
+            current = []
+        else:
+            current.append(ch)
+    tail = "".join(current).strip()
+    if tail:
+        parts.append(tail)
+    return parts
+
+
+def _parse_one_arg(token: str, keyword_only: bool) -> Param:
+    m = _ARG_RE.match(token)
+    if not m:
+        raise ValueError(f"could not parse arg: {token!r}")
+    return Param(
+        name=m.group("name"),
+        aten_type=m.group("type"),
+        default=m.group("default"),
+        keyword_only=keyword_only,
+    )
+
+
+def _snake_to_pascal(s: str) -> str:
+    return "".join(p.capitalize() for p in s.split("_"))
+
+
+def _base_path(op_name: str) -> pathlib.Path:
+    return _BASE_DIR / f"{op_name}.h"
+
+
+def _load_aten_yaml() -> str:
+    """Return the contents of `native_functions.yaml`, fetching and caching
+    the version pinned by `_PYTORCH_VERSION` on the first call."""
+    if not _ATEN_YAML_CACHE.exists():
+        _ATEN_YAML_CACHE.parent.mkdir(parents=True, exist_ok=True)
+        print(
+            f"fetching `native_functions.yaml` ({_PYTORCH_VERSION})...",
+            file=sys.stderr,
+        )
+        with urllib.request.urlopen(_ATEN_YAML_URL) as response:
+            _ATEN_YAML_CACHE.write_bytes(response.read())
+    return _ATEN_YAML_CACHE.read_text()
+
+
+def _find_out_entries(entries: list[dict], op_name: str) -> list[dict]:
+    """Return all out-variant entries for `op_name`, with the bare
+    `<name>.out(` form first and overload-suffixed variants
+    (e.g. `pow.Tensor_Tensor_out(`, `kthvalue.values(`) after.  An
+    entry counts as an out-variant when it (a) is named
+    `<op_name>.out`, (b) ends in `_out`, or (c) carries a
+    `Tensor(<letter>!)` mutability annotation — that last case covers
+    multi-output ops named after their output tensors
+    (`kthvalue.values`, `mode.values`, …)."""
+    bare_prefix = f"{op_name}.out("
+    op_overload = re.compile(rf"^{re.escape(op_name)}\.\w+\(")
+    mut_tensor = re.compile(r"Tensor\([a-z]!\)")
+    bare: list[dict] = []
+    others: list[dict] = []
+    for entry in entries:
+        func = entry.get("func", "")
+        if func.startswith(bare_prefix):
+            bare.append(entry)
+        elif op_overload.match(func) and (
+            func.split("(", 1)[0].endswith("_out") or mut_tensor.search(func)
+        ):
+            others.append(entry)
+    return bare + others
+
+
+def _format_signature(op: Op, *, include_defaults: bool = False) -> str:
+    parts = []
+    for param in op.visible_params:
+        prefix = "" if param.is_out else "const "
+        text = f"{prefix}{param.cpp_type} {param.name}"
+        if include_defaults and param.default is not None:
+            text += f" = {_translate_default(param)}"
+        parts.append(text)
+    return ", ".join(parts)
+
+
+def _normalize_cxx_signature(text: str) -> str:
+    return re.sub(r"\s+", " ", text.strip())
+
+
+def _has_compatible_base(op: Op) -> bool:
+    path = _base_path(op.infini_name)
+    if not path.exists():
+        return False
+
+    text = _normalize_cxx_signature(path.read_text())
+    ctor_signature = _normalize_cxx_signature(
+        f"{op.pascal_name}({_format_signature(op)})"
+    )
+    op_call_signature = _normalize_cxx_signature(f"operator()({_format_signature(op)})")
+
+    return (
+        f"class {op.pascal_name} : public Operator<{op.pascal_name}>" in text
+        and ctor_signature in text
+        and op_call_signature in text
+    )
+
+
+def _translate_default(param: Param) -> str:
+    """Translate a YAML default literal to a C++ literal."""
+    raw = param.default
+    if raw == "True":
+        return "true"
+    if raw == "False":
+        return "false"
+    if raw == "None":
+        return "{}"
+    return raw  # numeric literals (`0`, `1`, `1.0`) pass through
+
+
+def _generate_base_header(op: Op) -> str:
+    init_pieces = []
+    member_decls = []
+    for param in op.tensor_params:
+        init_pieces.append(f"        {param.name}_shape_{{{param.name}.shape()}}")
+        init_pieces.append(f"        {param.name}_strides_{{{param.name}.strides()}}")
+        init_pieces.append(f"        {param.name}_type_{{{param.name}.dtype()}}")
+        member_decls.append(f"  Tensor::Shape {param.name}_shape_;")
+        member_decls.append(f"  Tensor::Strides {param.name}_strides_;")
+        member_decls.append(f"  DataType {param.name}_type_;")
+    # All out tensors share a device; use the first one.
+    init_pieces.append(
+        f"        device_index_{{{op.out_params[0].name}.device().index()}}"
+    )
+    member_decls.append("  int device_index_{0};")
+
+    init_list = ",\n".join(init_pieces).lstrip()
+
+    return _BASE_TEMPLATE.format(
+        name_uc=op.infini_name.upper(),
+        pascal=op.pascal_name,
+        ctor_signature=_format_signature(op),
+        init_list=init_list,
+        op_call_signature=_format_signature(op),
+        member_decls="\n".join(member_decls),
+    )
+
+
+def _generate_torch_header(op: Op) -> str:
+    return _TORCH_HEADER_TEMPLATE.format(
+        name_uc=op.infini_name.upper(),
+        name=op.infini_name,
+        pascal=op.pascal_name,
+        op_call_signature=_format_signature(op),
+        slot=_PYTORCH_SLOT,
+    )
+
+
+def _generate_torch_source(op: Op) -> str:
+    conversion_lines = []
+    for param in op.tensor_params:
+        data_expr = (
+            f"{param.name}.data()"
+            if param.is_out
+            else f"const_cast<void*>({param.name}.data())"
+        )
+        conversion_lines.append(
+            f"  auto at_{param.name} = ToAtenTensor<kDev>(\n"
+            f"      {data_expr}, {param.name}_shape_, {param.name}_strides_,\n"
+            f"      {param.name}_type_, device_index_);"
+        )
+
+    # ATen `_out` form puts all out tensors first, then non-out params
+    # in YAML order.  Hardcoded-nullopt params become `at::nullopt`.
+    arg_order = op.out_params + [p for p in op.params if not p.is_out]
+
+    def _render_arg(p):
+        if p.is_hidden:
+            return p.hidden_value()
+        if p.is_tensor:
+            return f"at_{p.name}"
+        return p.name
+
+    aten_args = ", ".join(_render_arg(p) for p in arg_order)
+
+    instantiations = "\n".join(
+        f"template class Operator<{op.pascal_name}, "
+        f"Device::Type::{dev}, {_PYTORCH_SLOT}>;"
+        for dev in _DEVICE_TYPES
+    )
+
+    return _TORCH_SOURCE_TEMPLATE.format(
+        name=op.infini_name,
+        pascal=op.pascal_name,
+        op_call_signature=_format_signature(op),
+        tensor_conversions="\n".join(conversion_lines),
+        # `at::<aten_name>_out` resolves the right kernel via C++ overload
+        # resolution from the argument types we pass.
+        aten_call=f"{op.aten_name}_out({aten_args})",
+        slot=_PYTORCH_SLOT,
+        instantiations=instantiations,
+    )
+
+
+_BASE_TEMPLATE = """\
+// AUTO-GENERATED by `scripts/generate_torch_ops.py` — DO NOT EDIT.
+#ifndef INFINI_OPS_BASE_{name_uc}_H_
+#define INFINI_OPS_BASE_{name_uc}_H_
+
+#include "operator.h"
+
+namespace infini::ops {{
+
+class {pascal} : public Operator<{pascal}> {{
+ public:
+  {pascal}({ctor_signature})
+      : {init_list} {{}}
+
+  virtual void operator()({op_call_signature}) const = 0;
+
+ protected:
+{member_decls}
+}};
+
+}}  // namespace infini::ops
+
+#endif
+"""
+
+
+_TORCH_HEADER_TEMPLATE = """\
+// AUTO-GENERATED by `scripts/generate_torch_ops.py` — DO NOT EDIT.
+#ifndef INFINI_OPS_TORCH_{name_uc}_H_
+#define INFINI_OPS_TORCH_{name_uc}_H_
+
+#include "base/{name}.h"
+
+namespace infini::ops {{
+
+template <Device::Type kDev>
+class Operator<{pascal}, kDev, {slot}> : public {pascal} {{
+ public:
+  using {pascal}::{pascal};
+
+  void operator()({op_call_signature}) const override;
+}};
+
+}}  // namespace infini::ops
+
+#endif
+"""
+
+
+_TORCH_SOURCE_TEMPLATE = """\
+// AUTO-GENERATED by `scripts/generate_torch_ops.py` — DO NOT EDIT.
+#include "torch/{name}/{name}.h"
+
+#include "torch/tensor_.h"
+
+namespace infini::ops {{
+
+template <Device::Type kDev>
+void Operator<{pascal}, kDev, {slot}>::operator()({op_call_signature}) const {{
+{tensor_conversions}
+
+  at::{aten_call};
+}}
+
+{instantiations}
+
+}}  // namespace infini::ops
+"""
+
+
+def _emit(op: Op, *, emit_base: bool) -> None:
+    base_path = _GENERATED_BASE_DIR / f"{op.infini_name}.h"
+    torch_dir = _GENERATED_TORCH_DIR / op.infini_name
+    torch_header_path = torch_dir / f"{op.infini_name}.h"
+    torch_source_path = torch_dir / f"{op.infini_name}.cc"
+
+    if emit_base:
+        _GENERATED_BASE_DIR.mkdir(parents=True, exist_ok=True)
+        base_path.write_text(_generate_base_header(op))
+
+    torch_dir.mkdir(parents=True, exist_ok=True)
+
+    torch_header_path.write_text(_generate_torch_header(op))
+    torch_source_path.write_text(_generate_torch_source(op))
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
+    parser.add_argument(
+        "--ops",
+        nargs="*",
+        help="Override the op allowlist. If omitted, reads `scripts/torch_ops.yaml`.",
+    )
+    args = parser.parse_args()
+
+    op_names = args.ops or yaml.safe_load(_OPS_YAML_PATH.read_text())
+    aten_entries = yaml.safe_load(_load_aten_yaml())
+
+    # Wipe previous outputs so files for ops that have since been removed,
+    # renamed, or rejected by `cpp_type` don't linger and get picked up by
+    # the CMake glob.  Both `generated/base/` and `generated/torch/` are
+    # written exclusively by this script.
+    if _GENERATED_BASE_DIR.exists():
+        shutil.rmtree(_GENERATED_BASE_DIR)
+    if _GENERATED_TORCH_DIR.exists():
+        shutil.rmtree(_GENERATED_TORCH_DIR)
+
+    skipped: list[tuple[str, str]] = []
+    metadata: list[dict] = []
+
+    for name in op_names:
+        candidates = _find_out_entries(aten_entries, name)
+        if not candidates:
+            skipped.append((name, f"no `.out` variant for `{name}` in YAML"))
+            continue
+
+        usable: list[Op] = []
+        last_reason = ""
+        for entry in candidates:
+            try:
+                op = _parse_func(entry["func"])
+                for param in op.params:
+                    _ = param.cpp_type  # eagerly raise on unsupported types
+            except (NotImplementedError, ValueError) as exc:
+                last_reason = str(exc)
+                continue
+            if not op.is_testable:
+                last_reason = "no testable tensor input/output pair"
+                continue
+            if _base_path(op.infini_name).exists() and not _has_compatible_base(op):
+                last_reason = (
+                    f"`src/base/{op.infini_name}.h` exists but does not match "
+                    "the generated torch wrapper signature"
+                )
+                continue
+            usable.append(op)
+
+        if not usable:
+            skipped.append((name, last_reason or "no usable overload"))
+            continue
+
+        # Emit one InfiniOps wrapper per usable overload — `pow.Tensor_Tensor_out`
+        # and `pow.Tensor_Scalar_out` become distinct classes
+        # (`PowTensorTensor`, `PowTensorScalar`) so users get the right
+        # behaviour by naming the variant they want.
+        for op in usable:
+            _emit(op, emit_base=not _has_compatible_base(op))
+            metadata.append(
+                {
+                    "name": op.infini_name,
+                    "aten_name": op.aten_name,
+                    "params": [
+                        {
+                            "name": p.name,
+                            "type": p.aten_type,
+                            "is_tensor": p.is_tensor,
+                            "is_out": p.is_out,
+                        }
+                        for p in op.visible_params
+                    ],
+                }
+            )
+
+    _GENERATED_DIR.mkdir(parents=True, exist_ok=True)
+    _METADATA_PATH.write_text(json.dumps({"ops": metadata}, indent=2) + "\n")
+
+    print(f"generated {len(metadata)} ops: {[m['name'] for m in metadata]}")
+    for name, reason in skipped:
+        print(f"  skipped {name!r}: {reason}", file=sys.stderr)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py
index 49b6c199..38bf26c3 100644
--- a/scripts/generate_wrappers.py
+++ b/scripts/generate_wrappers.py
@@ -15,6 +15,11 @@
 
 _GENERATION_DIR = pathlib.Path("generated")
 
+# Base headers emitted by `generate_torch_ops.py` live alongside the
+# hand-written ones in `src/base/`, but in a parallel tree under
+# `generated/base/` so they are not committed.
+_GENERATED_BASE_DIR = _GENERATION_DIR / "base"
+
 _BINDINGS_DIR = _GENERATION_DIR / "bindings"
 
 _GENERATED_SRC_DIR = _GENERATION_DIR / "src"
@@ -24,6 +29,18 @@
 _INDENTATION = "  "
 
 
+def _find_base_header(op_name):
+    """Return the base header for `op_name`, looking under both `src/base/`
+    and `generated/base/` (preferring the hand-written one)."""
+    src_path = _BASE_DIR / f"{op_name}.h"
+    if src_path.exists():
+        return src_path
+    generated_path = _GENERATED_BASE_DIR / f"{op_name}.h"
+    if generated_path.exists():
+        return generated_path
+    raise FileNotFoundError(f"no base header for op {op_name!r}")
+
+
 class _OperatorExtractor:
     def __call__(self, op_name):
         def _get_system_include_flags():
@@ -53,8 +70,16 @@ def _get_compilers():
         system_include_flags = _get_system_include_flags()
 
         index = clang.cindex.Index.create()
-        args = ("-std=c++17", "-x", "c++", "-I", "src") + tuple(system_include_flags)
-        translation_unit = index.parse(f"src/base/{op_name}.h", args=args)
+        args = (
+            "-std=c++17",
+            "-x",
+            "c++",
+            "-I",
+            "src",
+            "-I",
+            str(_GENERATION_DIR),
+        ) + tuple(system_include_flags)
+        translation_unit = index.parse(str(_find_base_header(op_name)), args=args)
 
         nodes = tuple(type(self)._find(translation_unit.cursor, op_name))
 
@@ -98,7 +123,7 @@ def _find_optional_tensor_params(op_name):
     headers are not fully available, so we fall back to a regex scan of the
     source text.
     """
-    source = (_BASE_DIR / f"{op_name}.h").read_text()
+    source = _find_base_header(op_name).read_text()
 
     return set(re.findall(r"std::optional<Tensor>\s+(\w+)", source))
 
@@ -107,7 +132,7 @@ def _find_vector_tensor_params(op_name):
     """Return a set of parameter names declared as `std::vector<Tensor>` in
     the base header.
     """
-    source = (_BASE_DIR / f"{op_name}.h").read_text()
+    source = _find_base_header(op_name).read_text()
 
     return set(re.findall(r"std::vector<Tensor>\s+(\w+)", source))
 
@@ -216,8 +241,10 @@ def _generate_call(op_name, call, method=True):
                 f'  }}, {py_args_str}py::kw_only(), py::arg("stream") = 0, py::arg("implementation_index") = 0);'
             )
 
-        return f"""      .def("__call__", [](const Self& self, {call_params}) {{
-        return static_cast<const Operator<Self>&>(self)({call_args});
+        # Use `op` rather than `self` for the operator instance — `self` is
+        # a common ATen parameter name and would collide.
+        return f"""      .def("__call__", [](const Self& op, {call_params}) {{
+        return static_cast<const Operator<Self>&>(op)({call_args});
       }})"""
 
     inits = "\n".join(
@@ -268,8 +295,15 @@ def _generate_call(op_name, call, method=True):
 
 def _generate_legacy_c(operator, paths):
     def _generate_source(operator):
+        def _to_include_path(path):
+            text = str(path)
+            for prefix in ("src/", "generated/"):
+                if text.startswith(prefix):
+                    return text[len(prefix) :]
+            return text
+
         impl_includes = "\n".join(
-            f'#include "{str(path).removeprefix("src/")}"' for path in paths
+            f'#include "{_to_include_path(path)}"' for path in paths
         )
 
         return f"""#include "../../handle.h"
@@ -452,20 +486,36 @@ def _get_all_ops(devices, with_torch=False):
 
     ops = {}
 
-    for file_path in _BASE_DIR.iterdir():
-        if not file_path.is_file():
-            continue
+    base_dirs = [_BASE_DIR]
+    if _GENERATED_BASE_DIR.exists():
+        base_dirs.append(_GENERATED_BASE_DIR)
 
-        op_name = file_path.stem
+    impl_roots = [_SRC_DIR]
+    if with_torch and (_GENERATION_DIR / "torch").exists():
+        impl_roots.append(_GENERATION_DIR)
 
-        ops[op_name] = []
+    for base_dir in base_dirs:
+        for file_path in base_dir.iterdir():
+            if not file_path.is_file():
+                continue
 
-        for file_path in _SRC_DIR.rglob("*.h"):
-            if file_path.parent.parent.name not in scan_dirs:
+            op_name = file_path.stem
+
+            if op_name in ops:
                 continue
 
-            if f"class Operator<{_snake_to_pascal(op_name)}" in file_path.read_text():
-                ops[op_name].append(file_path)
+            ops[op_name] = []
+
+            for impl_root in impl_roots:
+                for impl_path in impl_root.rglob("*.h"):
+                    if impl_path.parent.parent.name not in scan_dirs:
+                        continue
+
+                    if (
+                        f"class Operator<{_snake_to_pascal(op_name)}"
+                        in impl_path.read_text()
+                    ):
+                        ops[op_name].append(impl_path)
 
     return ops
 
@@ -489,9 +539,16 @@ def _get_all_ops(devices, with_torch=False):
 
     args = parser.parse_args()
 
-    _BINDINGS_DIR.mkdir(parents=True, exist_ok=True)
-    _GENERATED_SRC_DIR.mkdir(parents=True, exist_ok=True)
-    _INCLUDE_DIR.mkdir(parents=True, exist_ok=True)
+    # Wipe previous outputs so files for ops that have since been removed
+    # don't linger and cause build failures (the per-op `.h` includes are
+    # written from `header_paths`, but `ops.cc`'s `impl_includes` reads
+    # from the live `ops` map — a stale `<op>/<op>.h` file referenced by
+    # a previous run's `ops.cc` is harmless, but a stale source file in
+    # `generated/src/<op>/` can still get globbed by the build).
+    for d in (_BINDINGS_DIR, _GENERATED_SRC_DIR, _INCLUDE_DIR):
+        if d.exists():
+            shutil.rmtree(d)
+        d.mkdir(parents=True)
 
     ops_json = pathlib.Path("ops.json")
 
diff --git a/scripts/merge_base_branches.py b/scripts/merge_base_branches.py
new file mode 100644
index 00000000..9664dc52
--- /dev/null
+++ b/scripts/merge_base_branches.py
@@ -0,0 +1,77 @@
+"""Create an integration branch from multiple independent base-op branches."""
+
+import argparse
+import subprocess
+
+
+def _run(args: list[str]) -> None:
+    print("+", " ".join(args))
+    subprocess.run(args, check=True)
+
+
+def _branch_ref(branch: str) -> str:
+    result = subprocess.run(
+        ["git", "rev-parse", "--verify", "--quiet", branch],
+        check=False,
+        stdout=subprocess.DEVNULL,
+    )
+
+    if result.returncode == 0:
+        return branch
+
+    remote_ref = f"origin/{branch}"
+    result = subprocess.run(
+        ["git", "rev-parse", "--verify", "--quiet", remote_ref],
+        check=False,
+        stdout=subprocess.DEVNULL,
+    )
+
+    if result.returncode == 0:
+        return remote_ref
+
+    _run(["git", "fetch", "origin", branch])
+
+    return remote_ref
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("branches", nargs="+", help="Branches to integrate.")
+    parser.add_argument(
+        "--base",
+        default="feat/torch-codegen",
+        help="Base branch for the integration branch.",
+    )
+    parser.add_argument(
+        "--target",
+        default="codex/integrate-base-branches",
+        help="Integration branch to create or reset.",
+    )
+    parser.add_argument(
+        "--strategy",
+        choices=("merge", "cherry-pick"),
+        default="merge",
+        help="How to apply each branch.",
+    )
+    parser.add_argument(
+        "--reset-target",
+        action="store_true",
+        help="Reset the target branch if it already exists.",
+    )
+
+    args = parser.parse_args()
+
+    switch_flag = "--force-create" if args.reset_target else "--create"
+    _run(["git", "switch", switch_flag, args.target, _branch_ref(args.base)])
+
+    for branch in args.branches:
+        ref = _branch_ref(branch)
+
+        if args.strategy == "merge":
+            _run(["git", "merge", "--no-edit", ref])
+        else:
+            _run(["git", "cherry-pick", ref])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/torch_ops.yaml b/scripts/torch_ops.yaml
new file mode 100644
index 00000000..73867355
--- /dev/null
+++ b/scripts/torch_ops.yaml
@@ -0,0 +1,468 @@
+# Allowlist of ATen ops to expose as InfiniOps operators.
+#
+# Auto-discovered: every base op name with at least one parsable
+# `.out` overload using the supported type vocabulary.  The
+# generator emits one InfiniOps wrapper per overload, so this
+# file lists ~390 base names but produces 500+ wrappers.
+#
+# To exclude an op, comment out its line.
+
+- abs
+- absolute
+- acos
+- acosh
+- adaptive_avg_pool2d
+- adaptive_avg_pool3d
+- adaptive_avg_pool3d_backward
+- adaptive_max_pool2d
+- adaptive_max_pool2d_backward
+- adaptive_max_pool3d
+- adaptive_max_pool3d_backward
+- add
+- addbmm
+- addcdiv
+- addcmul
+- addmm
+- addmv
+- addr
+- all
+- amax
+- amin
+- aminmax
+- angle
+- any
+- arange
+- arccos
+- arccosh
+- arcsin
+- arcsinh
+- arctan
+- arctan2
+- arctanh
+- argmax
+- argmin
+- asin
+- asinh
+- atan
+- atan2
+- atanh
+- avg_pool2d
+- avg_pool2d_backward
+- avg_pool3d
+- avg_pool3d_backward
+- baddbmm
+- batch_norm_elemt
+- bernoulli
+- binary_cross_entropy
+- binary_cross_entropy_backward
+- bitwise_and
+- bitwise_left_shift
+- bitwise_not
+- bitwise_or
+- bitwise_right_shift
+- bitwise_xor
+- bmm
+- bucketize
+- ceil
+- cholesky
+- cholesky_inverse
+- cholesky_solve
+- clamp
+- clamp_max
+- clamp_min
+- clip
+- col2im
+- complex
+- conj_physical
+- copysign
+- cos
+- cosh
+- cross
+- cudnn_convolution
+- cummax
+- cummin
+- cumprod
+- cumsum
+- deg2rad
+- diag
+- diff
+- digamma
+- div
+- divide
+- dot
+- elu
+- elu_backward
+- empty
+- eq
+- erf
+- erfc
+- erfinv
+- exp
+- exp2
+- expm1
+- eye
+- fft_fft
+- fft_fft2
+- fft_fftfreq
+- fft_fftn
+- fft_hfft
+- fft_hfft2
+- fft_hfftn
+- fft_ifft
+- fft_ifft2
+- fft_ifftn
+- fft_ihfft
+- fft_ihfft2
+- fft_ihfftn
+- fft_irfft
+- fft_irfft2
+- fft_irfftn
+- fft_rfft
+- fft_rfft2
+- fft_rfftfreq
+- fft_rfftn
+- fix
+- float_power
+- floor
+- floor_divide
+- fmax
+- fmin
+- fmod
+- frac
+- fractional_max_pool2d
+- fractional_max_pool2d_backward
+- fractional_max_pool3d
+- fractional_max_pool3d_backward
+- frexp
+- frobenius_norm
+- full
+- gather
+- gcd
+- ge
+- gelu
+- gelu_backward
+- geqrf
+- ger
+- glu
+- glu_backward
+- greater
+- greater_equal
+- gt
+- hardshrink
+- hardshrink_backward
+- hardsigmoid
+- hardsigmoid_backward
+- hardswish
+- hardtanh
+- hardtanh_backward
+- heaviside
+- histc
+- histogram
+- hspmm
+- huber_loss
+- huber_loss_backward
+- hypot
+- i0
+- igamma
+- igammac
+- im2col
+- index
+- index_add
+- index_copy
+- index_reduce
+- index_select
+- inner
+- inverse
+- isin
+- isneginf
+- isposinf
+- kron
+- kthvalue
+- lcm
+- ldexp
+- le
+- leaky_relu
+- leaky_relu_backward
+- lerp
+- less
+- less_equal
+- lgamma
+- linalg_cholesky
+- linalg_cholesky_ex
+- linalg_cond
+- linalg_cross
+- linalg_det
+- linalg_eig
+- linalg_eigh
+- linalg_eigvals
+- linalg_eigvalsh
+- linalg_householder_product
+- linalg_inv
+- linalg_inv_ex
+- linalg_ldl_factor
+- linalg_ldl_factor_ex
+- linalg_ldl_solve
+- linalg_lstsq
+- linalg_lu
+- linalg_lu_factor
+- linalg_lu_factor_ex
+- linalg_lu_solve
+- linalg_matmul
+- linalg_matrix_norm
+- linalg_matrix_power
+- linalg_matrix_rank
+- linalg_norm
+- linalg_pinv
+- linalg_qr
+- linalg_slogdet
+- linalg_solve
+- linalg_solve_ex
+- linalg_solve_triangular
+- linalg_svd
+- linalg_svdvals
+- linalg_tensorinv
+- linalg_tensorsolve
+- linalg_vecdot
+- linalg_vector_norm
+- linear
+- linspace
+- log
+- log10
+- log1p
+- log2
+- log_sigmoid
+- log_sigmoid_backward
+- log_sigmoid_forward
+- log_softmax
+- logaddexp
+- logaddexp2
+- logcumsumexp
+- logical_and
+- logical_not
+- logical_or
+- logical_xor
+- logit
+- logit_backward
+- logspace
+- logsumexp
+- lt
+- lu_solve
+- lu_unpack
+- masked_select
+- matmul
+- matrix_power
+- max
+- max_pool2d_with_indices
+- max_pool2d_with_indices_backward
+- max_pool3d_with_indices
+- max_pool3d_with_indices_backward
+- max_unpool2d
+- max_unpool3d
+- maximum
+- mean
+- median
+- min
+- minimum
+- mish
+- mkldnn_adaptive_avg_pool2d
+- mm
+- mode
+- mse_loss
+- mse_loss_backward
+- msort
+- mul
+- multi_margin_loss
+- multi_margin_loss_backward
+- multilabel_margin_loss
+- multilabel_margin_loss_backward
+- multilabel_margin_loss_forward
+- multinomial
+- multiply
+- mv
+- mvlgamma
+- nan_to_num
+- nanmean
+- nanmedian
+- nanquantile
+- nansum
+- narrow_copy
+- native_batch_norm
+- ne
+- neg
+- negative
+- nextafter
+- nll_loss
+- nll_loss2d
+- nll_loss2d_backward
+- nll_loss2d_forward
+- nll_loss_backward
+- nll_loss_forward
+- nonzero
+- nonzero_static
+- norm
+- normal
+- not_equal
+- nuclear_norm
+- ones
+- orgqr
+- ormqr
+- outer
+- polar
+- polygamma
+- pow
+- prod
+- qr
+- quantile
+- rad2deg
+- rand
+- randint
+- randn
+- randperm
+- range
+- reciprocal
+- reflection_pad1d
+- reflection_pad1d_backward
+- reflection_pad2d
+- reflection_pad2d_backward
+- reflection_pad3d
+- reflection_pad3d_backward
+- remainder
+- renorm
+- replication_pad1d
+- replication_pad1d_backward
+- replication_pad2d
+- replication_pad2d_backward
+- replication_pad3d
+- replication_pad3d_backward
+- round
+- rrelu_with_noise
+- rsqrt
+- scatter
+- scatter_add
+- scatter_reduce
+- searchsorted
+- sgn
+- sigmoid
+- sigmoid_backward
+- sign
+- signbit
+- silu
+- silu_backward
+- sin
+- sinc
+- sinh
+- slogdet
+- slow_conv3d
+- slow_conv3d_forward
+- slow_conv_transpose2d
+- slow_conv_transpose3d
+- smooth_l1_loss
+- smooth_l1_loss_backward
+- soft_margin_loss
+- soft_margin_loss_backward
+- softmax
+- softplus
+- softplus_backward
+- softshrink
+- softshrink_backward
+- sort
+- sparse_sampled_addmm
+- special_airy_ai
+- special_bessel_j0
+- special_bessel_j1
+- special_bessel_y0
+- special_bessel_y1
+- special_chebyshev_polynomial_t
+- special_chebyshev_polynomial_u
+- special_chebyshev_polynomial_v
+- special_chebyshev_polynomial_w
+- special_digamma
+- special_entr
+- special_erf
+- special_erfc
+- special_erfcx
+- special_erfinv
+- special_exp2
+- special_expit
+- special_expm1
+- special_gammainc
+- special_gammaincc
+- special_gammaln
+- special_hermite_polynomial_h
+- special_hermite_polynomial_he
+- special_i0
+- special_i0e
+- special_i1
+- special_i1e
+- special_laguerre_polynomial_l
+- special_legendre_polynomial_p
+- special_log1p
+- special_log_ndtr
+- special_logit
+- special_logsumexp
+- special_modified_bessel_i0
+- special_modified_bessel_i1
+- special_modified_bessel_k0
+- special_modified_bessel_k1
+- special_multigammaln
+- special_ndtr
+- special_ndtri
+- special_polygamma
+- special_psi
+- special_round
+- special_scaled_modified_bessel_k0
+- special_scaled_modified_bessel_k1
+- special_shifted_chebyshev_polynomial_t
+- special_shifted_chebyshev_polynomial_u
+- special_shifted_chebyshev_polynomial_v
+- special_shifted_chebyshev_polynomial_w
+- special_sinc
+- special_spherical_bessel_j0
+- special_xlog1py
+- special_xlogy
+- special_zeta
+- split_copy
+- split_with_sizes_copy
+- sqrt
+- square
+- sspaddmm
+- std
+- sub
+- subtract
+- sum
+- svd
+- take
+- take_along_dim
+- tan
+- tanh
+- tanh_backward
+- tensordot
+- thnn_conv2d
+- threshold
+- threshold_backward
+- topk
+- triangular_solve
+- tril
+- triu
+- true_divide
+- trunc
+- unbind_copy
+- upsample_bicubic2d
+- upsample_bicubic2d_backward
+- upsample_bilinear2d
+- upsample_bilinear2d_backward
+- upsample_linear1d
+- upsample_linear1d_backward
+- upsample_nearest1d
+- upsample_nearest1d_backward
+- upsample_nearest2d
+- upsample_nearest2d_backward
+- upsample_nearest3d
+- upsample_nearest3d_backward
+- upsample_trilinear3d
+- upsample_trilinear3d_backward
+- var
+- vdot
+- where
+- xlogy
+- zeros
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 32c92949..0181e134 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -252,11 +252,30 @@ if(WITH_ASCEND)
 endif()
 
 if(WITH_TORCH)
-    file(GLOB_RECURSE TORCH_SOURCES CONFIGURE_DEPENDS "torch/*.cc" "torch/*.cpp")
+    # Generate ATen-backed operator wrappers from `scripts/torch_ops.yaml`.
+    find_package(Python COMPONENTS Interpreter REQUIRED)
+    execute_process(
+        COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/scripts/generate_torch_ops.py
+        WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+        RESULT_VARIABLE _torch_ops_result
+    )
+    if(NOT _torch_ops_result EQUAL 0)
+        message(FATAL_ERROR "Generating torch op wrappers - failed")
+    endif()
+    message(STATUS "Generating torch op wrappers - done")
+
+    file(GLOB_RECURSE TORCH_SOURCES CONFIGURE_DEPENDS
+        "torch/*.cc" "torch/*.cpp"
+        "${PROJECT_SOURCE_DIR}/generated/torch/*.cc"
+        "${PROJECT_SOURCE_DIR}/generated/torch/*.cpp"
+    )
 
     target_compile_definitions(infiniops PUBLIC WITH_TORCH=1)
     target_link_libraries(infiniops PUBLIC ${TORCH_LIBRARIES})
-    target_include_directories(infiniops PUBLIC ${TORCH_INCLUDE_DIRS})
+    target_include_directories(infiniops PUBLIC
+        ${TORCH_INCLUDE_DIRS}
+        ${PROJECT_SOURCE_DIR}/generated
+    )
 
     if(WITH_METAX OR WITH_MOORE)
         # Vendor compilers (`mxcc`/`mcc`) cannot compile vendor-forked `torch`
@@ -297,6 +316,7 @@ if(WITH_TORCH)
                 COMMAND ${SYSTEM_CXX}
                     -std=c++17 -fPIC -O2
                     "-I${CMAKE_CURRENT_SOURCE_DIR}"
+                    "-I${PROJECT_SOURCE_DIR}/generated"
                     ${_torch_include_flags}
                     ${_torch_extra_flags}
                     -c "${_src}" -o "${_obj}"
diff --git a/src/base/xlogy_outscalar_other.h b/src/base/xlogy_outscalar_other.h
new file mode 100644
index 00000000..e5e672c8
--- /dev/null
+++ b/src/base/xlogy_outscalar_other.h
@@ -0,0 +1,40 @@
+#ifndef INFINI_OPS_BASE_XLOGY_OUTSCALAR_OTHER_H_
+#define INFINI_OPS_BASE_XLOGY_OUTSCALAR_OTHER_H_
+
+#include "operator.h"
+
+namespace infini::ops {
+
+class XlogyOutscalarOther : public Operator<XlogyOutscalarOther> {
+ public:
+  XlogyOutscalarOther(const Tensor self, const double other, Tensor out)
+      : self_shape_{self.shape()},
+        self_strides_{self.strides()},
+        self_type_{self.dtype()},
+        out_shape_{out.shape()},
+        out_strides_{out.strides()},
+        out_type_{out.dtype()},
+        device_index_{out.device().index()} {}
+
+  virtual void operator()(const Tensor self, const double other,
+                          Tensor out) const = 0;
+
+ protected:
+  Tensor::Shape self_shape_;
+
+  Tensor::Strides self_strides_;
+
+  DataType self_type_;
+
+  Tensor::Shape out_shape_;
+
+  Tensor::Strides out_strides_;
+
+  DataType out_type_;
+
+  int device_index_{0};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/cuda/swiglu/kernel.cuh b/src/cuda/swiglu/kernel.cuh
index 36b9f975..c765b06b 100644
--- a/src/cuda/swiglu/kernel.cuh
+++ b/src/cuda/swiglu/kernel.cuh
@@ -7,10 +7,15 @@
 
 namespace infini::ops {
 
+namespace detail {
+
 // Optimized sigmoid function with support for FP16 and BF16 types.
 // TODO: The unified FP16/BF16 branch uses `Caster` and scalar float
 // arithmetic instead of native vectorized intrinsics (e.g. `h2rcp`,
 // `__hmul2`). Profile and restore specialized paths if needed.
+//
+// Lives in `detail::` to avoid colliding with the auto-generated
+// `infini::ops::Sigmoid` operator class.
 template <Device::Type kDev, typename T>
 __device__ __forceinline__ T Sigmoid(const T& x) {
   if constexpr (IsFP16<kDev, T> || IsBFloat16<kDev, T>) {
@@ -24,6 +29,8 @@ __device__ __forceinline__ T Sigmoid(const T& x) {
   }
 }
 
+}  // namespace detail
+
 // SwiGLU(x, gate) = Swish(x) * gate = (x * sigmoid(x)) * gate.
 template <Device::Type kDev, typename T, unsigned int BLOCK_SIZE>
 __global__ void SwigluKernel(T* __restrict__ out, const T* __restrict__ a,
@@ -70,9 +77,10 @@ __global__ void SwigluKernel(T* __restrict__ out, const T* __restrict__ a,
       out[out_idx] = Caster<kDev>::template Cast<T>(
           __fmul_rn(__fmul_rn(gatef, sigf), upf));
     } else if constexpr (std::is_same_v<T, float>) {
-      out[out_idx] = __fmul_rn(__fmul_rn(gate, Sigmoid<kDev>(gate)), up);
+      out[out_idx] =
+          __fmul_rn(__fmul_rn(gate, detail::Sigmoid<kDev>(gate)), up);
     } else {
-      out[out_idx] = gate * Sigmoid<kDev>(gate) * up;
+      out[out_idx] = gate * detail::Sigmoid<kDev>(gate) * up;
     }
   }
 }
diff --git a/tests/conftest.py b/tests/conftest.py
index d995459f..7e8ba698 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -290,7 +290,16 @@ def pytest_pyfunc_call(pyfuncitem):
         rtol = payload.rtol
         atol = payload.atol
 
-        assert torch.allclose(output, expected, rtol=rtol, atol=atol)
+        # `torch.allclose` rejects `bool` dtypes — use `torch.equal` for
+        # non-floating outputs (bool, int) so comparison ops work.  Pass
+        # `equal_nan=True` so NaN-in-both-positions (common for special
+        # functions fed out-of-domain inputs) doesn't fail the test.
+        if output.dtype.is_floating_point:
+            assert torch.allclose(
+                output, expected, rtol=rtol, atol=atol, equal_nan=True
+            )
+        else:
+            assert torch.equal(output, expected)
 
         return True
 
diff --git a/tests/test_torch_ops.py b/tests/test_torch_ops.py
new file mode 100644
index 00000000..67b7866c
--- /dev/null
+++ b/tests/test_torch_ops.py
@@ -0,0 +1,384 @@
+"""Unified test for every operator emitted by `generate_torch_ops.py`.
+
+The generator writes `generated/torch_ops_metadata.json` listing every op
+with full per-parameter info (`name`, `type`, `is_tensor`, `is_out`).
+A single parametrized test reads that metadata, builds inputs from the
+parameter list, calls the InfiniOps wrapper and the torch reference, and
+compares each output tensor.  Adding an op to `scripts/torch_ops.yaml`
+extends coverage with no test changes.
+"""
+
+import json
+import pathlib
+import re
+
+import infini.ops
+import pytest
+import torch
+
+from tests.utils import randn_strided
+
+# PyTorch backends are emitted at this slot — see `_PYTORCH_SLOT` in
+# `scripts/generate_torch_ops.py`.
+_PYTORCH_SLOT = 8
+
+_METADATA_PATH = (
+    pathlib.Path(__file__).resolve().parent.parent
+    / "generated"
+    / "torch_ops_metadata.json"
+)
+_METADATA = (
+    json.loads(_METADATA_PATH.read_text()) if _METADATA_PATH.exists() else {"ops": []}
+)
+
+_SHAPES = (
+    (13, 4),
+    (13, 4, 4),
+    (4, 4, 5632),
+)
+
+_DTYPES = (
+    (torch.float32, 1e-5, 1e-5),
+    (torch.float16, 1e-2, 1e-2),
+    (torch.bfloat16, 1e-2, 1e-2),
+)
+
+# Op-specific input shapes for matrix ops (`mm` etc.) which cannot use
+# `randn_strided(shape)` for both inputs.  The tuple is one shape per
+# tensor input, in YAML order.
+_TENSOR_SHAPES = {
+    "mm": ((8, 16), (16, 12)),
+    "bmm": ((4, 8, 16), (4, 16, 12)),
+    "matmul": ((8, 16), (16, 12)),
+    "dot": ((16,), (16,)),
+    "vdot": ((16,), (16,)),
+    "mv": ((8, 16), (16,)),
+    "inner": ((8, 16), (8, 16)),
+    "outer": ((8,), (12,)),
+    "ger": ((8,), (12,)),
+    "kron": ((3, 4), (2, 3)),
+}
+
+# Per-(op, param-name) values for non-tensor inputs.  Lookup falls back
+# to a type-based default if no entry exists.
+_SCALAR_VALUES = {
+    ("clamp_min", "min"): -0.5,
+    ("clamp_max", "max"): 0.5,
+    ("leaky_relu", "negative_slope"): 0.01,
+    ("hardshrink", "lambd"): 0.5,
+    ("softshrink", "lambd"): 0.5,
+    ("mvlgamma", "p"): 2,
+    ("prod", "dim"): 0,
+    ("cumsum", "dim"): 0,
+    ("cumprod", "dim"): 0,
+    ("logcumsumexp", "dim"): 0,
+    ("cummax", "dim"): 0,
+    ("cummin", "dim"): 0,
+    ("softmax", "dim"): -1,
+    ("log_softmax", "dim"): -1,
+    ("threshold", "threshold"): 0.0,
+    ("threshold", "value"): 0.0,
+    ("hardtanh", "min_val"): -1.0,
+    ("hardtanh", "max_val"): 1.0,
+    ("softplus", "beta"): 1.0,
+    ("softplus", "threshold"): 20.0,
+    ("elu", "alpha"): 1.0,
+    ("elu", "scale"): 1.0,
+    ("elu", "input_scale"): 1.0,
+    ("sub", "alpha"): 1.0,
+    ("addcmul", "value"): 1.0,
+    ("addcdiv", "value"): 1.0,
+    # `str reduce` modes accepted by the corresponding ATen kernels.
+    ("index_reduce", "reduce"): "amax",
+    ("scatter_reduce", "reduce"): "amax",
+    ("scatter_reduce_two", "reduce"): "amax",
+    # `int dim` for ops where 0 is a safe choice for our test shapes.
+    ("kthvalue_values", "k"): 1,
+    ("kthvalue_values", "dim"): 0,
+    ("mode_values", "dim"): 0,
+}
+
+_TYPE_DEFAULTS = {"int": 0, "SymInt": 0, "bool": False, "str": "none"}
+
+# Mirrors `kStringToDataType` in `src/data_type.h`.  Any tensor passed to
+# an InfiniOps op must have one of these dtypes; others (`bool`, complex,
+# quantised types) abort the process inside `DataTypeFromString`.
+_SUPPORTED_DTYPES = frozenset(
+    {
+        torch.int8,
+        torch.int16,
+        torch.int32,
+        torch.int64,
+        torch.uint8,
+        torch.uint16,
+        torch.uint32,
+        torch.uint64,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+        torch.float64,
+    }
+)
+
+
+_LIST_SIZE_RE = re.compile(r"\[(\d+)\]")
+
+
+def _list_default(aten_type):
+    """Default value for a required `int[N]` / `SymInt[N]` param.  Most
+    such params name a `dim` or `kernel_size`; `[0]` works for `dim` and
+    causes `kernel_size`-style ops to fail their reference call cleanly,
+    which the test then skips."""
+    size_match = _LIST_SIZE_RE.search(aten_type)
+    n = int(size_match.group(1)) if size_match else 1
+    return [0] * n
+
+
+# Errors emitted by upstream PyTorch and vendor-forked variants for
+# unsupported (op, dtype, device) combinations.  We skip rather than fail
+# on these — the gap is in PyTorch, not InfiniOps.
+_VENDOR_SKIP_PATTERNS = (
+    "not implemented for",  # upstream PyTorch
+    "CNNL_STATUS_BAD_PARAM",  # `torch_mlu` (Cambricon)
+    "MUDNN failed",  # `torch_musa` (Moore)
+    "Could not run",  # missing dispatcher entry on this backend
+    "don't support tensor dtype",  # `torch_mlu` dtype check
+    "result requires dtype",  # output dtype mismatch (e.g. `float_power`)
+    # ATen kernels for some loss ops (`mse_loss`, `huber_loss`, …) use
+    # the `out` buffer as intermediate scratch and resize it before the
+    # final reduction.  Our `from_blob` outputs are non-resizable, so
+    # the kernel aborts the call with this message.  Skip these — the
+    # zero-copy wrapper can't drive that codepath.
+    "Trying to resize storage that is not resizable",
+)
+
+# Random-sampling ops never match a fresh torch reference call —
+# they consume RNG state and return different draws.  Skip rather
+# than try to align the two PRNG streams.
+_RANDOM_OPS = frozenset(
+    {
+        "bernoulli",
+        "multinomial",
+        "normal",
+        "rand",
+        "randn",
+        "randint",
+        "randperm",
+        "rrelu_with_noise",
+    }
+)
+
+# Full reductions with low-precision inputs diverge between the functional
+# (`torch.<op>(x)`) and `_out` paths because of intermediate-precision
+# choices we cannot align from outside ATen.
+_LARGE_REDUCTION_OPS = frozenset(
+    {"sum", "mean", "nansum", "nanmean", "prod", "std", "var"}
+)
+
+# Ops with input-domain `TORCH_CHECK` macros that fire as device-side
+# `assert` on CUDA when our generic random fp32 inputs fall outside the
+# expected range.  The Python-side `RuntimeError` is catchable, but the
+# CUDA context is left poisoned and every subsequent test errors at
+# setup.  Skip these on cuda; the CPU path raises a clean exception
+# that the existing harness already handles.
+_DEVICE_ASSERTING_OPS = frozenset(
+    {
+        "binary_cross_entropy",  # requires inputs in [0, 1]
+        "multi_margin_loss",
+        "multilabel_margin_loss",
+        "nll_loss",
+        "nll_loss2d",
+        # cuDNN paths divide by `kernel_size`/`stride` and SIGFPE on the
+        # `[0, 0]` defaults our harness substitutes for required `int[N]`
+        # parameters.
+        "cudnn_convolution",
+        "slow_conv3d",
+        "slow_conv_transpose2d",
+        "slow_conv_transpose3d",
+        "thnn_conv2d",
+        "im2col",
+        "col2im",
+        "max_unpool2d",
+        "max_unpool3d",
+        "reflection_pad1d",
+        "reflection_pad2d",
+        "reflection_pad3d",
+        "replication_pad1d",
+        "replication_pad2d",
+        "replication_pad3d",
+        "upsample_bicubic2d",
+        "upsample_bilinear2d",
+        "upsample_linear1d",
+        "upsample_nearest1d",
+        "upsample_nearest2d",
+        "upsample_nearest3d",
+        "upsample_trilinear3d",
+        "avg_pool2d",
+        "avg_pool3d",
+        "max_pool2d_with_indices",
+        "max_pool3d_with_indices",
+        "adaptive_max_pool2d",
+        "adaptive_max_pool3d",
+        "adaptive_avg_pool2d",
+        "adaptive_avg_pool3d",
+    }
+)
+
+
+def _torch_func(op_name):
+    """Resolve the reference function across `torch`, `torch.special`,
+    and `torch.nn.functional`.  `special_<x>` falls through to
+    `torch.special.<x>` with the prefix stripped."""
+    candidates = [
+        (torch, op_name),
+        (torch.special, op_name),
+        (torch.nn.functional, op_name),
+    ]
+    if op_name.startswith("special_"):
+        candidates.append((torch.special, op_name.removeprefix("special_")))
+    for namespace, attr in candidates:
+        func = getattr(namespace, attr, None)
+        if func is not None:
+            return func
+    pytest.skip(f"no reference function for `{op_name}` in PyTorch")
+
+
+def _pascal(snake_name):
+    return "".join(part.capitalize() for part in snake_name.split("_"))
+
+
+def _skip_if_not_active(op_name, device):
+    op_class = getattr(infini.ops, _pascal(op_name), None)
+    if op_class is None:
+        pytest.skip(f"`{op_name}` class not exposed on this build")
+    if _PYTORCH_SLOT not in op_class.active_implementation_indices(device):
+        pytest.skip(f"`{op_name}` slot {_PYTORCH_SLOT} not active on `{device}`")
+
+
+def _skip_low_precision_reduction(op_name, dtype, device):
+    if op_name in _LARGE_REDUCTION_OPS:
+        if dtype in (torch.float16, torch.bfloat16):
+            pytest.skip(f"`{op_name}` precision diverges on fp16/bf16")
+        if device == "musa":
+            pytest.skip(f"`{op_name}` on `torch_musa` diverges from CPU reference")
+
+
+def _build_input_value(op_name, param, shape, dtype, device, tensor_idx):
+    """Build the value passed to a non-out parameter."""
+    if param["is_tensor"]:
+        per_op = _TENSOR_SHAPES.get(op_name)
+        tshape = per_op[tensor_idx] if per_op is not None else shape
+        return randn_strided(tshape, None, dtype=dtype, device=device)
+    key = (op_name, param["name"])
+    if key in _SCALAR_VALUES:
+        return _SCALAR_VALUES[key]
+    t = param["type"]
+    if t.startswith(("int[", "SymInt[")) or t in {"int[]", "SymInt[]"}:
+        return _list_default(t)
+    return _TYPE_DEFAULTS.get(t, 0.5)
+
+
+def _call_infini(op_name, *args):
+    try:
+        getattr(infini.ops, op_name)(*args, implementation_index=_PYTORCH_SLOT)
+    except RuntimeError as exc:
+        if any(p in str(exc) for p in _VENDOR_SKIP_PATTERNS):
+            pytest.skip(f"`{op_name}` unsupported by torch on this device/dtype")
+        raise
+
+
+def _assert_close(actual, expected, rtol, atol):
+    if actual.dtype.is_floating_point:
+        assert torch.allclose(actual, expected, rtol=rtol, atol=atol, equal_nan=True)
+    else:
+        assert torch.equal(actual, expected)
+
+
+def _testable_ops():
+    """Filter out ops the harness can't drive — currently just bool-output
+    ops, since InfiniOps `DataType` has no `kBool`.  Unknown until runtime,
+    so we skip-at-test-time rather than filter here."""
+    return _METADATA.get("ops", [])
+
+
+@pytest.mark.parametrize("op_meta", _testable_ops(), ids=lambda m: m["name"])
+@pytest.mark.parametrize("shape", _SHAPES, ids=lambda s: "x".join(map(str, s)))
+@pytest.mark.parametrize(("dtype", "rtol", "atol"), _DTYPES)
+def test_op(op_meta, shape, dtype, device, rtol, atol):
+    op_name = op_meta["name"]
+    aten_name = op_meta.get("aten_name", op_name)
+    _skip_if_not_active(op_name, device)
+    _skip_low_precision_reduction(aten_name, dtype, device)
+    if aten_name in _RANDOM_OPS:
+        pytest.skip(f"`{aten_name}` is non-deterministic (independent draws diverge)")
+    if device == "cuda" and aten_name in _DEVICE_ASSERTING_OPS:
+        pytest.skip(
+            f"`{aten_name}` triggers a CUDA device-side assert on random inputs"
+        )
+
+    in_params = [p for p in op_meta["params"] if not p["is_out"]]
+    out_params = [p for p in op_meta["params"] if p["is_out"]]
+
+    # Build inputs in YAML order.
+    inputs = []
+    tensor_idx = 0
+    for p in in_params:
+        inputs.append(
+            _build_input_value(aten_name, p, shape, dtype, device, tensor_idx)
+        )
+        if p["is_tensor"]:
+            tensor_idx += 1
+
+    # Run the reference to discover output shape(s)/dtype(s).
+    # An op may reject our generic `randn(shape)` input with any of these
+    # exception types — the gap is in our test harness's input synthesis,
+    # not in the InfiniOps wrapper.
+    try:
+        ref = _torch_func(aten_name)(*inputs)
+    except (
+        RuntimeError,
+        TypeError,
+        ValueError,
+        IndexError,
+        NotImplementedError,
+    ) as exc:
+        pytest.skip(f"`torch.{aten_name}` rejects these inputs: {exc}")
+
+    ref_outs = ref if isinstance(ref, tuple) else (ref,)
+    if len(ref_outs) != len(out_params):
+        # The Python-facing function (e.g. `F.adaptive_max_pool2d`) often
+        # exposes a subset of the ATen `_out` schema's outputs (returning
+        # only `out`, hiding `indices` behind a `return_indices=True`
+        # kwarg).  Without a per-op map of how to coax the full tuple
+        # out, skip — the InfiniOps wrapper itself is fine.
+        pytest.skip(
+            f"`{aten_name}` reference produced {len(ref_outs)} output(s); "
+            f"schema declares {len(out_params)}"
+        )
+
+    # InfiniOps `DataType` enumerates only int{8,16,32,64}, uint{8,16,32,64},
+    # float{16,32,64}, and bfloat16.  Tensors with any other torch dtype
+    # (`bool`, `complex64`, `complex128`, …) abort on `DataTypeFromString`,
+    # so skip the test rather than crash the process.
+    tensors = [*ref_outs, *(x for x in inputs if isinstance(x, torch.Tensor))]
+    unsupported = next(
+        (t.dtype for t in tensors if t.dtype not in _SUPPORTED_DTYPES), None
+    )
+    if unsupported is not None:
+        pytest.skip(
+            f"`{op_name}` uses dtype {unsupported} — not in InfiniOps `DataType`"
+        )
+
+    # On CUDA, `torch.empty_like` of a 0-element tensor gives a tensor
+    # whose `data_ptr()` is unregistered with the device; passing it
+    # through to the wrapper trips "pointer resides on host memory".
+    if any(t.numel() == 0 for t in ref_outs):
+        pytest.skip(
+            f"`{op_name}` produced 0-element output (unregistered data_ptr on cuda)"
+        )
+
+    outs = [torch.empty_like(t) for t in ref_outs]
+    _call_infini(op_name, *inputs, *outs)
+
+    for actual, expected in zip(outs, ref_outs):
+        _assert_close(actual, expected, rtol, atol)