Comm gemm fixes (#2818)

almogsegal · web-flow · commit 86edac47c5c5 · 2026-04-06T10:28:40.000-07:00
* Fix GemmRs B descriptor lld for transb=true

With a row_major (1×P) grid there is only one process row, so all rows
of B reside on that single process row. The local leading dimension must
therefore be n (the full row count), not block_size(n), which is n/P.

Signed-off-by: Almog Segal <asegal@nvidia.com>

* Set GemmRs communication type to output data type

Match the UserBuffers behavior where the reduce-scatter operates
in the output precision rather than FP32.

Signed-off-by: Almog Segal <asegal@nvidia.com>

---------

Signed-off-by: Almog Segal <asegal@nvidia.com>
diff --git a/transformer_engine/common/comm_gemm/comm_gemm.cpp b/transformer_engine/common/comm_gemm/comm_gemm.cpp
@@ -186,9 +186,9 @@ void GemmRsInitMatrices(NVTECommGemmCtx* ctx, int64_t* ldd, int64_t m, int64_t n
   }
   if (transb) {
     NVTE_CHECK(b1 == n, "Unsupported tensor dimension in B: expected ", n, ", got ", b1);
-    NVTE_CHECK_CUBLASMP(cublasMpMatrixDescriptorInit(
-        n, k, block_size(ctx, n), block_size(ctx, k), 0, 0, block_size(ctx, n),
-        get_cuda_dtype(b->dtype()), ctx->grid_row_major.get(), ctx->b_desc.get()));
+    NVTE_CHECK_CUBLASMP(cublasMpMatrixDescriptorInit(n, k, block_size(ctx, n), block_size(ctx, k),
+                                                     0, 0, n, get_cuda_dtype(b->dtype()),
+                                                     ctx->grid_row_major.get(), ctx->b_desc.get()));
   } else {
     NVTE_CHECK(b0 == n, "Unsupported tensor dimension in B: expected ", n, ", got ", b0);
     NVTE_CHECK_CUBLASMP(cublasMpMatrixDescriptorInit(
@@ -200,6 +200,11 @@ void GemmRsInitMatrices(NVTECommGemmCtx* ctx, int64_t* ldd, int64_t m, int64_t n
   NVTE_CHECK_CUBLASMP(cublasMpMatrixDescriptorInit(m, n, m, block_size(ctx, n), 0, 0, *ldd,
                                                    get_cuda_dtype(d->dtype()),
                                                    ctx->grid_row_major.get(), ctx->d_desc.get()));
+
+  const cudaDataType_t comm_type = get_cuda_dtype(d->dtype());
+  NVTE_CHECK_CUBLASMP(cublasMpMatmulDescriptorSetAttribute(
+      ctx->matmul_desc.get(), CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_COMMUNICATION_TYPE, &comm_type,
+      sizeof comm_type));
 }
 
 void GemmArInitMatrices(NVTECommGemmCtx* ctx, int64_t* ldd, int64_t m, int64_t n, int64_t k,