From e40c006e52652eba9c7478164f7a53d4df4c908d Mon Sep 17 00:00:00 2001
From: wangpengcheng <wangpengcheng@example.com>
Date: Fri, 8 May 2026 08:38:49 +0000
Subject: [PATCH] issue/1155 - add topksoftmax interface of infinicore

---
 include/infinicore/ops.hpp                    |   1 +
 include/infinicore/ops/topksoftmax.hpp        |  14 ++
 src/infinicore/ops/topksoftmax/topksoftmax.cc |  30 ++++
 .../ops/topksoftmax/topksoftmax_infiniop.cc   |  56 +++++++
 src/infinicore/pybind11/ops.hpp               |   2 +
 src/infinicore/pybind11/ops/topksoftmax.hpp   |  32 ++++
 test/infinicore/ops/topksoftmax.py            | 147 ++++++++++++++++++
 7 files changed, 282 insertions(+)
 create mode 100644 include/infinicore/ops/topksoftmax.hpp
 create mode 100644 src/infinicore/ops/topksoftmax/topksoftmax.cc
 create mode 100644 src/infinicore/ops/topksoftmax/topksoftmax_infiniop.cc
 create mode 100644 src/infinicore/pybind11/ops/topksoftmax.hpp
 create mode 100644 test/infinicore/ops/topksoftmax.py
diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
index 6019c3a9b..832f48683 100644
--- a/include/infinicore/ops.hpp
+++ b/include/infinicore/ops.hpp
@@ -45,3 +45,4 @@
 #include "ops/silu_and_mul.hpp"
 #include "ops/softmax.hpp"
 #include "ops/swiglu.hpp"
+#include "ops/topksoftmax.hpp"
diff --git a/include/infinicore/ops/topksoftmax.hpp b/include/infinicore/ops/topksoftmax.hpp
new file mode 100644
index 000000000..2cd3ac7c5
--- /dev/null
+++ b/include/infinicore/ops/topksoftmax.hpp
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "../device.hpp"
+#include "../graph/graph.hpp"
+#include "../tensor.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+// Declares the graph-op class `Topksoftmax`; the argument types mirror topksoftmax() below.
+INFINICORE_GRAPH_OP_CLASS(Topksoftmax, Tensor, Tensor, const Tensor &, const size_t, const int);
+// Top-k softmax of `x`; results are written into `values` and `indices`. `norm` defaults to 0.
+void topksoftmax(Tensor values, Tensor indices, const Tensor &x, const size_t topk, const int norm = 0);
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/topksoftmax/topksoftmax.cc b/src/infinicore/ops/topksoftmax/topksoftmax.cc
new file mode 100644
index 000000000..c0572ca83
--- /dev/null
+++ b/src/infinicore/ops/topksoftmax/topksoftmax.cc
@@ -0,0 +1,30 @@
+#include "infinicore/ops/topksoftmax.hpp"
+
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Topksoftmax);
+
+// Asserts (via macro) that all three tensors are on one device, then dispatches
+// on the device type of `values`.
+Topksoftmax::Topksoftmax(Tensor values, Tensor indices, const Tensor &x,
+                         const size_t topk, const int norm) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(values, indices, x);
+    INFINICORE_GRAPH_OP_DISPATCH(values->device().getType(), values, indices, x, topk, norm);
+}
+
+// Class-level entry point; INFINICORE_GRAPH_OP_RECORD_OR_RUN receives the op
+// type together with the unchanged argument list.
+void Topksoftmax::execute(Tensor values, Tensor indices, const Tensor &x,
+                          const size_t topk, const int norm) {
+    INFINICORE_GRAPH_OP_RECORD_OR_RUN(Topksoftmax, values, indices, x, topk, norm);
+}
+
+// Public free function: forwards every argument to Topksoftmax::execute.
+void topksoftmax(Tensor values, Tensor indices, const Tensor &x,
+                 const size_t topk, const int norm) {
+    Topksoftmax::execute(values, indices, x, topk, norm);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/topksoftmax/topksoftmax_infiniop.cc b/src/infinicore/ops/topksoftmax/topksoftmax_infiniop.cc
new file mode 100644
index 000000000..1c016538c
--- /dev/null
+++ b/src/infinicore/ops/topksoftmax/topksoftmax_infiniop.cc
@@ -0,0 +1,56 @@
+#include "infinicore/ops/topksoftmax.hpp"
+
+#include "../infiniop_impl.hpp"
+
+namespace infinicore::op::topksoftmax_impl::infiniop {
+
+INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Topksoftmax, 100);
+
+// Everything run() needs to execute one planned topksoftmax call.
+struct PlannedMeta {
+    std::shared_ptr<Descriptor> descriptor;
+    graph::GraphTensor workspace, values, indices, x;
+    size_t topk;
+    int norm;
+};
+
+// Plans one call and returns a heap-allocated PlannedMeta as an opaque pointer.
+// Ownership passes to the caller, who must release it through cleanup().
+void *plan(Tensor values, Tensor indices, const Tensor &x, const size_t topk, const int norm) {
+    // All three tensors feed the cache key, but only x's descriptor is handed
+    // to the get-or-create macro below.
+    size_t seed = hash_combine(values, indices, x);
+
+    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
+        Descriptor, descriptor, Topksoftmax, seed, x->desc());
+
+    INFINIOP_WORKSPACE_TENSOR(workspace, Topksoftmax, descriptor);
+
+    return new PlannedMeta{
+        descriptor, graph::GraphTensor(workspace), graph::GraphTensor(values),
+        graph::GraphTensor(indices), graph::GraphTensor(x), topk, norm};
+}
+
+// Runs a planned call, passing the stream obtained from context::getStream().
+void run(void *planned_meta) {
+    // Expected to be the pointer returned by plan(); static_cast restores its type.
+    auto *planned = static_cast<PlannedMeta *>(planned_meta);
+
+    INFINICORE_CHECK_ERROR(infiniopTopksoftmax(
+        planned->descriptor->desc, planned->workspace->data(), planned->workspace->numel(),
+        planned->values->data(), planned->indices->data(), planned->x->data(),
+        planned->topk, planned->norm, context::getStream()));
+}
+
+// Destroys the PlannedMeta allocated by plan() and nulls the caller's pointer.
+void cleanup(void **planned_meta_ptr) {
+    // Load the stored void* and convert the value. Casting void** to
+    // PlannedMeta** and dereferencing would read a void* object through an
+    // lvalue of a different pointer type, which the aliasing rules do not allow.
+    delete static_cast<PlannedMeta *>(*planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
+}
+
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Topksoftmax, &plan, &run, &cleanup);
+
+} // namespace infinicore::op::topksoftmax_impl::infiniop
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index c9c780aad..383429f8f 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -97,6 +97,7 @@
 #include "ops/tan.hpp"
 #include "ops/tanhshrink.hpp"
 #include "ops/topk.hpp"
+#include "ops/topksoftmax.hpp"
 #include "ops/triplet_margin_loss.hpp"
 #include "ops/triplet_margin_with_distance_loss.hpp"
 #include "ops/unfold.hpp"
@@ -218,6 +219,7 @@ inline void bind(py::module &m) {
     bind_selu(m);
     bind_sinh(m);
     bind_layer_norm(m);
+    bind_topksoftmax(m);
 }
 
 } // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/topksoftmax.hpp b/src/infinicore/pybind11/ops/topksoftmax.hpp
new file mode 100644
index 000000000..ca93edff0
--- /dev/null
+++ b/src/infinicore/pybind11/ops/topksoftmax.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/topksoftmax.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+// Registers `topksoftmax` on module `m`, bound directly to op::topksoftmax.
+inline void bind_topksoftmax(py::module &m) {
+    static constexpr const char *kDoc = R"doc(In-place Top-k Softmax.
+
+Writes results to pre-allocated values and indices tensors.
+
+Args:
+    values: Output tensor for softmax weights [N, topk]
+    indices: Output tensor for selected indices [N, topk], int32
+    x: Input tensor [N, width], router logits
+    topk: Number of top values to select
+    norm: Whether to re-normalize top-k probabilities (1=yes, 0=no), default 0
+)doc";
+
+    // `norm` carries a Python-side default of 0, matching the C++ declaration.
+    m.def("topksoftmax", &op::topksoftmax,
+          py::arg("values"), py::arg("indices"), py::arg("x"), py::arg("topk"),
+          py::arg("norm") = 0,
+          kDoc);
+}
+
+} // namespace infinicore::ops
diff --git a/test/infinicore/ops/topksoftmax.py b/test/infinicore/ops/topksoftmax.py
new file mode 100644
index 000000000..76582ecf0
--- /dev/null
+++ b/test/infinicore/ops/topksoftmax.py
@@ -0,0 +1,147 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import torch
+import torch.nn.functional as F
+from framework import (
+    BaseOperatorTest,
+    GenericTestRunner,
+    TensorInitializer,
+    TensorSpec,
+    TestCase,
+    is_broadcast,
+)
+from infinicore.lib import _infinicore
+
+import infinicore
+
+# Each row is (input_shape, input_strides, topk, norm); norm is 0/1 for the C++ binding (infiniop bool).
+# Strides are always None: only contiguous layouts are exercised, as in test/infiniop/topksoftmax.py.
+_TEST_CASES_DATA = [
+    ((1, 10), None, 7, 1),
+    ((8, 20), None, 4, 1),
+    ((2, 64), None, 6, 1),
+    ((4, 16), None, 3, 0),
+]
+# Comparison tolerances keyed by input dtype; all three dtypes currently share 1e-3.
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 1e-3, "rtol": 1e-3},
+    infinicore.float32: {"atol": 1e-3, "rtol": 1e-3},
+    infinicore.bfloat16: {"atol": 1e-3, "rtol": 1e-3},
+}
+# Input dtypes that every row of _TEST_CASES_DATA is expanded over.
+_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
+
+
+def torch_topksoftmax(router_logits, top_k, norm_topk_prob=False):
+    """Reference top-k softmax: float32 softmax over the last dim, then top-k."""
+    probs = F.softmax(router_logits, dim=-1, dtype=torch.float32)
+    top_probs, top_idx = probs.topk(top_k, dim=-1)
+    if norm_topk_prob:  # rescale the kept weights by their per-row sum
+        top_probs = top_probs / top_probs.sum(dim=-1, keepdim=True)
+    return top_probs, top_idx.to(torch.int32)
+
+
+def parse_test_cases():
+    """Build the TestCase list: every data row crossed with every dtype.
+
+    Each combination yields an out-of-place case, plus an explicit-output
+    ("out") case when neither output spec has a broadcast layout.
+    """
+    cases = []
+    for shape, in_strides, topk, norm in _TEST_CASES_DATA:
+        rows, width = shape[0], shape[1]
+        out_shape = (rows, topk)
+        # Description suffix depends only on the data row, not on the dtype.
+        parts = [f"topk={topk}", f"norm={norm}"]
+        if in_strides:
+            parts.append(f"strides={in_strides}")
+        suffix = ", ".join(parts)
+
+        for dtype in _TENSOR_DTYPES:
+            tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-3, "rtol": 1e-3})
+            # Deterministic ramp 0.0, 0.5, 1.0, ... reshaped to `shape`.
+            ramp = torch.arange(0, rows * width, dtype=torch.float32)
+            input_spec = TensorSpec.from_tensor(
+                shape,
+                in_strides,
+                dtype,
+                init_mode=TensorInitializer.MANUAL,
+                set_tensor=ramp.reshape(shape) * 0.5,
+            )
+            kwargs = {"topk": topk, "norm": norm}
+            cases.append(
+                TestCase(
+                    inputs=[input_spec],
+                    kwargs=kwargs,
+                    output_spec=None,
+                    comparison_target=None,
+                    tolerance=tol,
+                    description=f"topksoftmax - OUT_OF_PLACE - {suffix}",
+                    output_count=2,
+                )
+            )
+
+            # Output specs: float32 weights and int32 indices, no explicit strides.
+            values_spec = TensorSpec.from_tensor(out_shape, None, infinicore.float32)
+            indices_spec = TensorSpec.from_tensor(out_shape, None, infinicore.int32)
+            if is_broadcast(values_spec.strides) or is_broadcast(indices_spec.strides):
+                continue
+            cases.append(
+                TestCase(
+                    inputs=[input_spec],
+                    kwargs=kwargs.copy(),
+                    output_specs=[values_spec, indices_spec],
+                    comparison_target="out",
+                    tolerance=tol,
+                    description=f"topksoftmax - INPLACE(out) - {suffix}",
+                    output_count=2,
+                )
+            )
+
+    return cases
+
+
+class OpTest(BaseOperatorTest):
+    """Operator test comparing _infinicore.topksoftmax with the torch reference."""
+
+    def __init__(self):
+        super().__init__("topksoftmax")
+
+    def get_test_cases(self):
+        return parse_test_cases()
+
+    def torch_operator(self, x, topk, norm=0, out=None, **kwargs):
+        # Any non-zero `norm` enables re-normalisation in the reference.
+        values, indices = torch_topksoftmax(x, topk, norm != 0)
+        if out is not None:
+            out_values, out_indices = out
+            out_values.copy_(values)
+            out_indices.copy_(indices)
+        return values, indices
+
+    def infinicore_operator(self, x, topk, norm=0, out=None, **kwargs):
+        rows = x.shape[0]
+        if out is not None:
+            values, indices = out[0], out[1]
+        else:
+            # No outputs supplied: allocate (N, topk) float32 / int32 on x's device.
+            out_shape = (rows, topk)
+            values = infinicore.empty(out_shape, dtype=infinicore.float32, device=x.device)
+            indices = infinicore.empty(out_shape, dtype=infinicore.int32, device=x.device)
+
+        _infinicore.topksoftmax(
+            values._underlying, indices._underlying, x._underlying, topk, int(norm)
+        )
+        return values, indices
+
+
+def main():
+    """Run OpTest through GenericTestRunner.run_and_exit()."""
+    GenericTestRunner(OpTest).run_and_exit()
+
+
+if __name__ == "__main__":
+    main()