jesus

kainzhong · kainzhong · commit 465737154d22 · 2026-04-06T02:22:26.000Z
Signed-off-by: Kaining Zhong &lt;kainingz@nvidia.com&gt;
diff --git a/tests/pytorch/test_mhc.py b/tests/pytorch/test_mhc.py
@@ -23,7 +23,6 @@
 torch.backends.cuda.matmul.allow_tf32 = False
 
 
-@torch.compile
 def mHCProjectionRef(x, phi):
     """
     Reference operator for mHC's projection building operation.
@@ -48,7 +47,6 @@ def mHCProjectionRef(x, phi):
     return Hs.to(x_dtype), ms
 
 
-@torch.compile
 def mHCScaleRef(H, alpha, beta, ms, n):
     """
     Reference operator for mHC's pre and post calculations
@@ -101,7 +99,6 @@ def mHCScaleRef(H, alpha, beta, ms, n):
     return out.to(H_dtype)
 
 
-@torch.compile
 def mHCSinkhornRef(H_res, n=4, iterations=20):
     """
     Sinkhorn-Knopp algorithm to convert a matrix into a doubly stochastic matrix.
@@ -136,7 +133,6 @@ def mHCSinkhornRef(H_res, n=4, iterations=20):
     return H_res_out
 
 
-@torch.compile
 def mHCAggregateRef(x, H_pre, n):
     """
     Reference operator for applying mHC's pre matrix H to a vector x.
@@ -153,7 +149,6 @@ def mHCAggregateRef(x, H_pre, n):
 
     return out
 
-@torch.compile
 def mHCExpandCombineRef(f, bias, H_post, x, H_res, n):
     """
     Reference operator for applying mHC's post transformation and residual transformation
@@ -167,15 +162,25 @@ def mHCExpandCombineRef(f, bias, H_post, x, H_res, n):
 
     s, b, C, n = x.shape
 
+    # The Triton kernels accumulate in fp32 (FMA and MMA instructions) for the bf16 test cases,
+    # which is numerically more stable than running this PyTorch reference directly in bf16.
+    # Cast the inputs to fp32 here so the reference reaches the same accuracy as the kernels.
+    input_dtype = f.dtype
+    f = f.to(torch.float32)
+    bias = bias.to(torch.float32) if bias is not None else None
+    H_post = H_post.to(torch.float32)
+    x = x.to(torch.float32)
+    H_res = H_res.to(torch.float32)
+
     if bias is not None:
-        f = f + bias
+        f = f + bias[None, None, :]
 
     f = f.view(s, b, C, 1)
     H_post = H_post.view(s, b, 1, n)
 
     out = f @ H_post + x @ H_res  # (s, b, C, n)
 
-    return out
+    return out.to(input_dtype)
 
 @dataclass
 class MHCConfig:
@@ -232,27 +237,27 @@ def desc(cfg):
     MHCConfig(
         8,
         128,
-        16 * 192,
+        5129,
     ),
     MHCConfig(
         8,
-        1,
-        16 * 500,
+        512,
+        8000,
     ),
     MHCConfig(
-        8,
-        128,
-        16 * 512,
+        4,
+        1024,
+        8192,
     ),
     MHCConfig(
-        8,
-        1,
-        16 * 376,
+        2,
+        4096,
+        8192,
     ),
     MHCConfig(
         8,
         128,
-        16 * 1024,
+        16384,
     ),
 ]
 
@@ -449,8 +454,7 @@ def test_mhc_expand_combine(cfg: MHCConfig, dtype, with_bias):
     f = torch.randn(s, b, C, device="cuda", requires_grad=True, dtype=dtype)
     bias = None
     if with_bias:
-        bias_raw = torch.randn(C, device="cuda", requires_grad=True, dtype=dtype) * 0.1
-        bias = bias_raw.detach().clone().requires_grad_(True)
+        bias = torch.randn(C, device="cuda", requires_grad=True, dtype=dtype)
     H_post = torch.randn(s, b, n, device="cuda", requires_grad=True, dtype=dtype)
     x = torch.randn(s, b, C, n, device="cuda", requires_grad=True, dtype=dtype)
     H_res = torch.randn(s, b, n, n, device="cuda", requires_grad=True, dtype=dtype)
diff --git a/transformer_engine/common/triton/mhc.py b/transformer_engine/common/triton/mhc.py
@@ -203,11 +203,12 @@ def _mhc_projection_bwd_fused(
     phi = tl.load(
         phi_ptrs, mask=(offs_n_full[:, None] < N) & mask_k[None, :], other=0.0
     )  # (BLOCK_SIZE_N, BLOCK_SIZE_K)
+    grad_ms = tl.load(grad_ms_ptrs, mask=offs_r < M, other=0.0, cache_modifier=".ca")  # (BLOCK_SIZE_M,)
+
+    grad_x = x * (grad_ms * 2 / tl.cast(K, tl.float32))[:, None]
     grad_x = tl.dot(
-        grad_h, phi, input_precision=precision, out_dtype=tl.float32
+        grad_h, phi, acc=grad_x, input_precision=precision, out_dtype=tl.float32
     )  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-    grad_ms = tl.load(grad_ms_ptrs, mask=offs_r < M, other=0.0, cache_modifier=".ca")  # (BLOCK_SIZE_M,)
-    grad_x += x * (grad_ms * 2 / tl.cast(K, tl.float32))[:, None]
     grad_x_ptrs = grad_x_ptr + offs_m[:, None] * stride_grad_xm + offs_k[None, :] * stride_grad_xk
     grad_x = grad_x.to(x.dtype)
     tl.store(grad_x_ptrs, grad_x, mask=mask_m[:, None] & mask_k[None, :])
@@ -1179,7 +1180,8 @@ def _mhc_expand_combine_fwd(
     # Residual connection path: res_out = f @ H_post:
     # (BLOCK_SIZE_M, BLOCK_SIZE_C, 1) @ (BLOCK_SIZE_M, 1, n)  = (BLOCK_SIZE_M, n, BLOCK_SIZE_C)
     # Due to broadcasting, it's equivalent to a multiplicaiton
-    res_out = f[:, :, None ] * H_post[:, None, :]  # (BLOCK_SIZE_M, BLOCK_SIZE_C, n)
+    out_acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_C, n), dtype=tl.float32)
+    out_acc = tl.fma(f[:, :, None], H_post[:, None, :], out_acc)
 
     H_res_offs = pid_m * BLOCK_SIZE_M * n * n + tl.arange(0, BLOCK_SIZE_M * n * n)
     H_res = tl.load(H_res_ptr + H_res_offs, mask=H_res_offs < M * n * n, other=0.0, cache_modifier=".ca")
@@ -1199,7 +1201,6 @@ def _mhc_expand_combine_fwd(
     #           + x[:, :, 1] @ H_res[:, 1, :]
     #           + x[:, :, 2] @ H_res[:, 2, :]
     #           + x[:, :, 3] @ H_res[:, 3, :]
-    manifold_out_acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_C, n), dtype=tl.float32)
 
     x_reshape = tl.reshape(x, (BLOCK_SIZE_M, BLOCK_SIZE_C, 2, 2))
     x01, x23 = tl.split(x_reshape)  # (BLOCK_SIZE_M, BLOCK_SIZE_C, 2), (BLOCK_SIZE_M, BLOCK_SIZE_C, 2)
@@ -1211,14 +1212,12 @@ def _mhc_expand_combine_fwd(
     H_res0, H_res1 = tl.split(H_res01)  # (BLOCK_SIZE_M, n), (BLOCK_SIZE_M, n)
     H_res2, H_res3 = tl.split(H_res23)  # (BLOCK_SIZE_M, n), (BLOCK_SIZE_M, n)
 
-    manifold_out_acc = tl.fma(x0[:, :, None], H_res0[:, None, :], manifold_out_acc)
-    manifold_out_acc = tl.fma(x1[:, :, None], H_res1[:, None, :], manifold_out_acc)
-    manifold_out_acc = tl.fma(x2[:, :, None], H_res2[:, None, :], manifold_out_acc)
-    manifold_out_acc = tl.fma(x3[:, :, None], H_res3[:, None, :], manifold_out_acc)
+    out_acc = tl.fma(x0[:, :, None], H_res0[:, None, :], out_acc)
+    out_acc = tl.fma(x1[:, :, None], H_res1[:, None, :], out_acc)
+    out_acc = tl.fma(x2[:, :, None], H_res2[:, None, :], out_acc)
+    out_acc = tl.fma(x3[:, :, None], H_res3[:, None, :], out_acc)
 
-    manifold_out = manifold_out_acc.to(x.dtype)
-
-    out = manifold_out + res_out
+    out = out_acc.to(x.dtype)
     out = tl.reshape(out, (BLOCK_SIZE_M, BLOCK_SIZE_C * n))  # (BLOCK_SIZE_M, BLOCK_SIZE_C*n)
 
     output_ptrs = (
@@ -1486,7 +1485,6 @@ def _mhc_expand_combine_with_bias_fwd(
     f_ptrs = f_ptr + offs_m[:, None] * stride_fm + offs_c[None, :] * stride_fc
     f = tl.load(f_ptrs, mask=mask_m[:, None] & mask_c[None, :], other=0.0)
     bias = tl.load(bias_ptr + offs_c * stride_bias, mask=mask_c, other=0.0)  # (BLOCK_SIZE_C,)
-    f = f + bias[None, :]  # (BLOCK_SIZE_M, BLOCK_SIZE_C)
 
     offs_H_post = pid_m * BLOCK_SIZE_M * n + tl.arange(0, BLOCK_SIZE_M * n)
     H_post = tl.load(H_post_ptr + offs_H_post, mask=offs_H_post < M * n, other=0.0, cache_modifier=".ca")
@@ -1495,7 +1493,9 @@ def _mhc_expand_combine_with_bias_fwd(
     # Residual connection path: res_out = f @ H_post + bias @ H_post:
     # (BLOCK_SIZE_M, BLOCK_SIZE_C, 1) @ (BLOCK_SIZE_M, 1, n)  = (BLOCK_SIZE_M, n, BLOCK_SIZE_C)
     # Due to broadcasting, it's equivalent to a multiplicaiton
-    res_out = f[:, :, None] * H_post[:, None, :]  # (BLOCK_SIZE_M, BLOCK_SIZE_C, n)
+    out_acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_C, n), dtype=tl.float32)
+    out_acc = tl.fma(bias[None, :, None], H_post[:, None, :], out_acc)
+    out_acc = tl.fma(f[:, :, None], H_post[:, None, :], out_acc)
 
     H_res_offs = pid_m * BLOCK_SIZE_M * n * n + tl.arange(0, BLOCK_SIZE_M * n * n)
     H_res = tl.load(H_res_ptr + H_res_offs, mask=H_res_offs < M * n * n, other=0.0, cache_modifier=".ca")
@@ -1515,7 +1515,6 @@ def _mhc_expand_combine_with_bias_fwd(
     #           + x[:, :, 1] @ H_res[:, 1, :]
     #           + x[:, :, 2] @ H_res[:, 2, :]
     #           + x[:, :, 3] @ H_res[:, 3, :]
-    manifold_out_acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_C, n), dtype=tl.float32)
 
     x_reshape = tl.reshape(x, (BLOCK_SIZE_M, BLOCK_SIZE_C, 2, 2))
     x01, x23 = tl.split(x_reshape)  # (BLOCK_SIZE_M, BLOCK_SIZE_C, 2), (BLOCK_SIZE_M, BLOCK_SIZE_C, 2)
@@ -1527,14 +1526,12 @@ def _mhc_expand_combine_with_bias_fwd(
     H_res0, H_res1 = tl.split(H_res01)  # (BLOCK_SIZE_M, n), (BLOCK_SIZE_M, n)
     H_res2, H_res3 = tl.split(H_res23)  # (BLOCK_SIZE_M, n), (BLOCK_SIZE_M, n)
 
-    manifold_out_acc = tl.fma(x0[:, :, None], H_res0[:, None, :], manifold_out_acc)
-    manifold_out_acc = tl.fma(x1[:, :, None], H_res1[:, None, :], manifold_out_acc)
-    manifold_out_acc = tl.fma(x2[:, :, None], H_res2[:, None, :], manifold_out_acc)
-    manifold_out_acc = tl.fma(x3[:, :, None], H_res3[:, None, :], manifold_out_acc)
-
-    manifold_out = manifold_out_acc.to(x.dtype)
+    out_acc = tl.fma(x0[:, :, None], H_res0[:, None, :], out_acc)
+    out_acc = tl.fma(x1[:, :, None], H_res1[:, None, :], out_acc)
+    out_acc = tl.fma(x2[:, :, None], H_res2[:, None, :], out_acc)
+    out_acc = tl.fma(x3[:, :, None], H_res3[:, None, :], out_acc)
 
-    out = manifold_out + res_out
+    out = out_acc.to(x.dtype)
     out = tl.reshape(out, (BLOCK_SIZE_M, BLOCK_SIZE_C * n))  # (BLOCK_SIZE_M, BLOCK_SIZE_C*n)
 
     output_ptrs = (
@@ -1636,7 +1633,6 @@ def _mhc_expand_combine_with_bias_bwd(
     f = tl.load(f_ptrs, mask=mask_m[:, None] & mask_c[None, :], other=0.0)
 
     bias = tl.load(bias_ptr + offs_c * stride_bias, mask=mask_c, other=0.0)  # (BLOCK_SIZE_C,)
-    f = f + bias[None, :]  # (BLOCK_SIZE_M, BLOCK_SIZE_C)
 
     H_post_offs = pid_m * BLOCK_SIZE_M * n + tl.arange(0, BLOCK_SIZE_M * n)
     H_post = tl.load(H_post_ptr + H_post_offs, mask=H_post_offs < M * n, other=0.0)
@@ -1665,6 +1661,13 @@ def _mhc_expand_combine_with_bias_bwd(
         input_precision=precision,
         out_dtype=tl.float32,
     ) # (BLOCK_SIZE_M, 1, n)
+    grad_H_post = tl.dot(
+        tl.broadcast_to(bias[None, None, :], (BLOCK_SIZE_M, 1, BLOCK_SIZE_C)),
+        tl.reshape(grad_out, (BLOCK_SIZE_M, BLOCK_SIZE_C, n)),
+        acc=grad_H_post,
+        input_precision=precision,
+        out_dtype=tl.float32,
+    ) # (BLOCK_SIZE_M, 1, n)
     grad_H_post = tl.reshape(grad_H_post, (BLOCK_SIZE_M * n,))  # (BLOCK_SIZE_M * n)
     offs_grad_H_post = pid_m * BLOCK_SIZE_M * n + tl.arange(0, BLOCK_SIZE_M * n)
     grad_H_post_ptrs = grad_H_post_ptr + offs_grad_H_post