
Commit dd8f247

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent fc6b65f commit dd8f247

8 files changed

Lines changed: 155 additions & 127 deletions


SOL.py

Lines changed: 10 additions & 8 deletions
@@ -18,8 +18,8 @@
 hardware_specs = {
     "B200": {
         "MEM_BANDWIDTH": 8.0e12, # 8 TB/s
-        "FP32_FLOPS": 75.0e12, # 75 TFLOPS (Vector FP32 for tiny inner dims)
-        "TF32_FLOPS": 1.125e15, # 1,125 TFLOPS (Dense TF32 Tensor Core)
+        "FP32_FLOPS": 75.0e12, # 75 TFLOPS (Vector FP32 for tiny inner dims)
+        "TF32_FLOPS": 1.125e15, # 1,125 TFLOPS (Dense TF32 Tensor Core)
     }
 }

@@ -30,9 +30,10 @@
 print(f" Format: {dtype} ({bytes_per_elem} bytes/elem)")
 print(f"================================================================================\n")

+
 def print_sol_breakdown(name, mem_gb, flops_g=None, use_tf32=False):
     bw_gb_s = hardware_specs[GPU]["MEM_BANDWIDTH"] / 1e9
-
+
     # Select peak FLOPs based on TensorCore utilization
     if use_tf32:
         peak_flops_g = hardware_specs[GPU]["TF32_FLOPS"] / 1e9
@@ -42,13 +43,13 @@ def print_sol_breakdown(name, mem_gb, flops_g=None, use_tf32=False):
         math_type = "FP32 Vector"

     time_mem_ms = (mem_gb / bw_gb_s) * 1000
-
+
     print(f"[{name}]")
     if flops_g is not None:
         time_math_ms = (flops_g / peak_flops_g) * 1000
         sol_time = max(time_mem_ms, time_math_ms)
         bound = "FLOPS bounded" if time_math_ms > time_mem_ms else "Memory bounded"
-
+
         print(f" ├─ Architecture : {math_type}")
         print(f" ├─ Total Mem R/W: {mem_gb:8.4f} GB")
         print(f" ├─ Total Math : {flops_g:8.4f} GFLOPS")
@@ -62,6 +63,7 @@ def print_sol_breakdown(name, mem_gb, flops_g=None, use_tf32=False):
         print(f" ├─ Mem Time : {time_mem_ms:8.4f} ms")
         print(f" └─ SOL Time : {sol_time:8.4f} ms ({bound})\n")

+
 # ---------------------------------------------------------
 # 1. Projection kernel: (B, T, n*C) @ (n*C, 32)
 # ---------------------------------------------------------
@@ -133,14 +135,14 @@ def print_sol_breakdown(name, mem_gb, flops_g=None, use_tf32=False):
 post_in1_2_gb = B * T * 1 * C * bytes_per_elem / 1e9
 post_in2_1_gb = B * T * n * n * bytes_per_elem / 1e9
 post_in2_2_gb = B * T * n * C * bytes_per_elem / 1e9
-post_out_gb = B * T * n * C * bytes_per_elem / 1e9
+post_out_gb = B * T * n * C * bytes_per_elem / 1e9

 post_mem_gb = post_in1_1_gb + post_in1_2_gb + post_in2_1_gb + post_in2_2_gb + post_out_gb

 flops_term1_g = B * T * (2 * n * 1 * C) / 1e9
 flops_term2_g = B * T * (2 * n * n * C) / 1e9
-flops_add_g = B * T * n * C / 1e9
-post_flops_g = flops_term1_g + flops_term2_g + flops_add_g
+flops_add_g = B * T * n * C / 1e9
+post_flops_g = flops_term1_g + flops_term2_g + flops_add_g

 print(f"================================================================================")
 print(f"5. Post + Res Kernel (Fused): (B, T, n, 1) @ (B, T, 1, C) + (B, T, n, n) @ (B, T, n, C)")

cutile_kernels.py

Lines changed: 1 addition & 1 deletion
@@ -961,4 +961,4 @@ def fused_proj_rms(x: Tensor, weight: Tensor, eps: float = 1e-6) -> Tuple[Tensor
     proj: [M, N] = x @ weight^T
     r: [M, 1] = 1 / (||x|| / sqrt(K) + eps)
     """
-    return FusedProjRms.apply(x, weight, eps)
+    return FusedProjRms.apply(x, weight, eps)
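
The docstring in this hunk spells out the math the fused kernel computes. A hedged, unfused PyTorch equivalent of that formula could look like the sketch below (proj_rms_reference is a made-up name; the real implementation fuses both outputs into one pass through FusedProjRms):

import math
import torch


def proj_rms_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6):
    # proj: [M, N] = x @ weight^T, as in the fused_proj_rms docstring.
    proj = x @ weight.t()
    # r: [M, 1] = 1 / (||x|| / sqrt(K) + eps), where K is the reduction dim.
    K = x.shape[-1]
    r = 1.0 / (x.norm(dim=-1, keepdim=True) / math.sqrt(K) + eps)
    return proj, r


x = torch.randn(4, 64)
weight = torch.randn(32, 64)
proj, r = proj_rms_reference(x, weight)
print(proj.shape, r.shape)  # torch.Size([4, 32]) torch.Size([4, 1])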

mhc_bench.py

Lines changed: 22 additions & 2 deletions
@@ -26,9 +26,10 @@
     mHCSinkhornRef,
     mHCElementwiseRef,
     mHCPreRef,
-    mHCPostResRef
+    mHCPostResRef,
 )

+
 def run_sinkhorn_triton(B, T, n, dtype, device, iters, do_backward):
     nvtx.range_push("mhc_sinkhorn_triton_fwd")
     x = torch.randn((B, T, n, n), device=device, dtype=dtype, requires_grad=do_backward)
@@ -39,6 +40,7 @@ def run_sinkhorn_triton(B, T, n, dtype, device, iters, do_backward):
         y.sum().backward()
         nvtx.range_pop()

+
 def run_sinkhorn_cutile(B, T, n, dtype, device, iters, do_backward):
     nvtx.range_push("mhc_sinkhorn_cutile_fwd")
     x = torch.randn((B, T, n, n), device=device, dtype=dtype, requires_grad=do_backward)
@@ -49,6 +51,7 @@ def run_sinkhorn_cutile(B, T, n, dtype, device, iters, do_backward):
         y.sum().backward()
         nvtx.range_pop()

+
 def run_sinkhorn_compile(B, T, n, dtype, device, iters, do_backward):
     nvtx.range_push("mhc_sinkhorn_compile_fwd")
     x = torch.randn((B, T, n, n), device=device, dtype=dtype, requires_grad=do_backward)
@@ -59,6 +62,7 @@ def run_sinkhorn_compile(B, T, n, dtype, device, iters, do_backward):
         y.sum().backward()
         nvtx.range_pop()

+
 def run_sinkhorn(B, T, n, dtype, device, iters, do_backward):
     run_sinkhorn_cutile(B, T, n, dtype, device, iters, do_backward)
     run_sinkhorn_triton(B, T, n, dtype, device, iters, do_backward)
@@ -78,6 +82,7 @@ def run_projection_triton(B, T, n, C, dtype, device, do_backward):
         (Hs.sum() + r.sum()).backward()
         nvtx.range_pop()

+
 def run_projection_cutile(B, T, n, C, dtype, device, do_backward):
     nC = n * C
     N = 2 * n + n * n
@@ -91,6 +96,7 @@
         (Hs.sum() + r.sum()).backward()
         nvtx.range_pop()

+
 def run_projection_compile(B, T, n, C, dtype, device, do_backward):
     nC = n * C
     N = 2 * n + n * n
@@ -104,6 +110,7 @@
         (Hs.sum() + r.sum()).backward()
         nvtx.range_pop()

+
 def run_projection(B, T, n, C, dtype, device, do_backward):
     run_projection_cutile(B, T, n, C, dtype, device, do_backward)
     run_projection_triton(B, T, n, C, dtype, device, do_backward)
@@ -124,6 +131,7 @@ def run_elementwise_triton(B, T, n, dtype, device, do_backward):
         out.sum().backward()
         nvtx.range_pop()

+
 def run_elementwise_compile(B, T, n, dtype, device, do_backward):
     N = 2 * n + n * n
     nvtx.range_push("mhc_elementwise_compile_fwd")
@@ -138,6 +146,7 @@
         out.sum().backward()
         nvtx.range_pop()

+
 def run_elementwise(B, T, n, dtype, device, do_backward):
     run_elementwise_triton(B, T, n, dtype, device, do_backward)
     run_elementwise_compile(B, T, n, dtype, device, do_backward)
@@ -154,6 +163,7 @@ def run_pre_triton(B, T, n, C, dtype, device, do_backward):
         out.sum().backward()
         nvtx.range_pop()

+
 def run_pre_cutile(B, T, n, C, dtype, device, do_backward):
     nvtx.range_push("mhc_pre_cutile_fwd")
     x = torch.randn(B, T, n, C, dtype=dtype, requires_grad=True, device=device)
@@ -165,6 +175,7 @@
         out.sum().backward()
         nvtx.range_pop()

+
 def run_pre_compile(B, T, n, C, dtype, device, do_backward):
     nvtx.range_push("mhc_pre_compile_fwd")
     x = torch.randn(B, T, n, C, dtype=dtype, requires_grad=True, device=device)
@@ -176,6 +187,7 @@
         out.sum().backward()
         nvtx.range_pop()

+
 def run_pre(B, T, n, C, dtype, device, do_backward):
     run_pre_cutile(B, T, n, C, dtype, device, do_backward)
     run_pre_triton(B, T, n, C, dtype, device, do_backward)
@@ -195,6 +207,7 @@ def run_post_res_triton(B, T, n, C, dtype, device, do_backward):
         out.sum().backward()
         nvtx.range_pop()

+
 def run_post_res_cutile(B, T, n, C, dtype, device, do_backward):
     nvtx.range_push("mhc_post_res_cutile_fwd")
     x = torch.randn(B, T, n, C, dtype=dtype, requires_grad=True, device=device)
@@ -208,6 +221,7 @@
         out.sum().backward()
         nvtx.range_pop()

+
 def run_post_res_compile(B, T, n, C, dtype, device, do_backward):
     nvtx.range_push("mhc_post_res_compile_fwd")
     x = torch.randn(B, T, n, C, dtype=dtype, requires_grad=True, device=device)
@@ -221,14 +235,20 @@
         out.sum().backward()
         nvtx.range_pop()

+
 def run_post_res(B, T, n, C, dtype, device, do_backward):
     run_post_res_cutile(B, T, n, C, dtype, device, do_backward)
     run_post_res_triton(B, T, n, C, dtype, device, do_backward)
     run_post_res_compile(B, T, n, C, dtype, device, do_backward)

+
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--operation", choices=["sinkhorn", "projection", "elementwise", "pre", "post_res", "all"], required=True)
+    parser.add_argument(
+        "--operation",
+        choices=["sinkhorn", "projection", "elementwise", "pre", "post_res", "all"],
+        required=True,
+    )
     parser.add_argument("--dtype", choices=["float32", "bfloat16"], default="float32")
     parser.add_argument("--warmup", type=int, default=2)
     parser.add_argument("--iters", type=int, default=1)

native_kernels.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,7 @@
 import torch
 import torch.nn.functional as F

+
 @torch.compile
 def mHCProjectionRef(x, phi):
     """
@@ -151,4 +152,3 @@ def mHCPostResRef(f, H_post, x, H_res, n):
     out = H_post @ f + H_res @ x # (B, T, n, C)

     return out
-
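
The context line here is the whole reference computation: a rank-1 expand term plus a residual mixing term, both as batched matmuls. A small hedged sketch using the shapes from the inline comment and the SOL.py section-5 header (the B, T, n, C values are arbitrary; the repository's reference additionally runs under @torch.compile):

import torch

# Shapes follow mHCPostResRef's comment and the SOL.py section-5 header:
# (B, T, n, 1) @ (B, T, 1, C) + (B, T, n, n) @ (B, T, n, C) -> (B, T, n, C).
B, T, n, C = 2, 16, 4, 32
H_post = torch.randn(B, T, n, 1)
f = torch.randn(B, T, 1, C)
H_res = torch.randn(B, T, n, n)
x = torch.randn(B, T, n, C)

out = H_post @ f + H_res @ x  # batched matmul over the (B, T) dims
print(out.shape)  # torch.Size([2, 16, 4, 32])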

tests/pytorch/test_mhc.py

Lines changed: 4 additions & 1 deletion
@@ -153,6 +153,7 @@ def mHCAggregateRef(x, H_pre, n):

     return out

+
 @torch.compile
 def mHCExpandCombineRef(f, bias, H_post, x, H_res, n):
     """
@@ -177,6 +178,7 @@ def mHCExpandCombineRef(f, bias, H_post, x, H_res, n):

     return out

+
 @dataclass
 class MHCConfig:
     s: int = 2048 # Sequence length
@@ -413,6 +415,7 @@ def test_mhc_sinkhorn_knopp(cfg: MHCConfig, dtype, recompute):

     torch.testing.assert_close(x.grad, x_ref.grad, **tols)

+
 @pytest.mark.parametrize("cfg", mhc_configs, ids=MHCConfig.desc)
 @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16], ids=["fp32", "bf16"])
 def test_mhc_aggregate(cfg: MHCConfig, dtype):
@@ -456,7 +459,7 @@ def test_mhc_expand_combine(cfg: MHCConfig, dtype, with_bias):
     H_res = torch.randn(s, b, n, n, device="cuda", requires_grad=True, dtype=dtype)

     f_ref = f.detach().clone().requires_grad_(True)
-    bias_ref = None if bias is None else bias.detach().clone().requires_grad_(True)
+    bias_ref = None if bias is None else bias.detach().clone().requires_grad_(True)
     H_post_ref = H_post.detach().clone().requires_grad_(True)
     x_ref = x.detach().clone().requires_grad_(True)
     H_res_ref = H_res.detach().clone().requires_grad_(True)
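
The reformatted line belongs to the reference-cloning pattern these tests rely on: each input is duplicated with detach().clone().requires_grad_(True) so the kernel under test and the reference build independent autograd graphs, and their gradients are then compared with torch.testing.assert_close. A minimal sketch of that pattern with placeholder computations:

import torch

x = torch.randn(8, 4, requires_grad=True)
x_ref = x.detach().clone().requires_grad_(True)  # independent leaf, same values

# Placeholder "kernel" and "reference"; in the tests these are the cuTile/Triton
# kernels and the torch.compile reference implementations.
y = (x * 2).sum()
y_ref = (x_ref * 2).sum()

y.backward()
y_ref.backward()

torch.testing.assert_close(x.grad, x_ref.grad)  # same gradient check as in the tests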
