diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp index 832f48683..d72e8b5c3 100644 --- a/include/infinicore/ops.hpp +++ b/include/infinicore/ops.hpp @@ -6,12 +6,18 @@ #include "ops/addcmul.hpp" #include "ops/asin.hpp" #include "ops/asinh.hpp" +#include "ops/asum.hpp" #include "ops/atanh.hpp" #include "ops/attention.hpp" #include "ops/avg_pool1d.hpp" +#include "ops/axpy.hpp" #include "ops/baddbmm.hpp" #include "ops/bilinear.hpp" #include "ops/binary_cross_entropy_with_logits.hpp" +#include "ops/blas_amax.hpp" +#include "ops/blas_amin.hpp" +#include "ops/blas_copy.hpp" +#include "ops/blas_dot.hpp" #include "ops/causal_softmax.hpp" #include "ops/cdist.hpp" #include "ops/conv2d.hpp" @@ -28,6 +34,7 @@ #include "ops/layer_norm.hpp" #include "ops/linear.hpp" #include "ops/matmul.hpp" +#include "ops/nrm2.hpp" #include "ops/ones.hpp" #include "ops/paged_attention.hpp" #include "ops/paged_attention_prefill.hpp" @@ -41,8 +48,14 @@ #include "ops/relu.hpp" #include "ops/rms_norm.hpp" #include "ops/rope.hpp" +#include "ops/rot.hpp" +#include "ops/rotg.hpp" +#include "ops/rotm.hpp" +#include "ops/rotmg.hpp" +#include "ops/scal.hpp" #include "ops/silu.hpp" #include "ops/silu_and_mul.hpp" #include "ops/softmax.hpp" +#include "ops/swap.hpp" #include "ops/swiglu.hpp" #include "ops/topksoftmax.hpp" diff --git a/include/infinicore/ops/asum.hpp b/include/infinicore/ops/asum.hpp new file mode 100644 index 000000000..6471e6fe4 --- /dev/null +++ b/include/infinicore/ops/asum.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(Asum, const Tensor &, Tensor); + +Tensor asum(const Tensor &x); +void asum_(const Tensor &x, Tensor result); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/axpy.hpp b/include/infinicore/ops/axpy.hpp new file mode 100644 index 000000000..280d5ab60 --- /dev/null +++ b/include/infinicore/ops/axpy.hpp @@ 
-0,0 +1,13 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(Axpy, const Tensor &, const Tensor &, Tensor); + +void axpy_(const Tensor &alpha, const Tensor &x, Tensor y); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/blas_amax.hpp b/include/infinicore/ops/blas_amax.hpp new file mode 100644 index 000000000..a6a571f95 --- /dev/null +++ b/include/infinicore/ops/blas_amax.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(BlasAmax, const Tensor &, Tensor); + +Tensor blas_amax(const Tensor &x); +void blas_amax_(const Tensor &x, Tensor result); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/blas_amin.hpp b/include/infinicore/ops/blas_amin.hpp new file mode 100644 index 000000000..a2ed21c7b --- /dev/null +++ b/include/infinicore/ops/blas_amin.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(BlasAmin, const Tensor &, Tensor); + +Tensor blas_amin(const Tensor &x); +void blas_amin_(const Tensor &x, Tensor result); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/blas_copy.hpp b/include/infinicore/ops/blas_copy.hpp new file mode 100644 index 000000000..dd32646a0 --- /dev/null +++ b/include/infinicore/ops/blas_copy.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(BlasCopy, const Tensor &, Tensor); + +void blas_copy_(const Tensor &x, Tensor y); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/blas_dot.hpp b/include/infinicore/ops/blas_dot.hpp new file mode 100644 index 000000000..157c167f6 --- /dev/null 
+++ b/include/infinicore/ops/blas_dot.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(BlasDot, const Tensor &, const Tensor &, Tensor); + +Tensor blas_dot(const Tensor &x, const Tensor &y); +void blas_dot_(const Tensor &x, const Tensor &y, Tensor result); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/nrm2.hpp b/include/infinicore/ops/nrm2.hpp new file mode 100644 index 000000000..c5552a4c1 --- /dev/null +++ b/include/infinicore/ops/nrm2.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(Nrm2, const Tensor &, Tensor); + +Tensor nrm2(const Tensor &x); +void nrm2_(const Tensor &x, Tensor result); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/rot.hpp b/include/infinicore/ops/rot.hpp new file mode 100644 index 000000000..ff473f33e --- /dev/null +++ b/include/infinicore/ops/rot.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(Rot, Tensor, Tensor, const Tensor &, const Tensor &); + +void rot_(Tensor x, Tensor y, const Tensor &c, const Tensor &s); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/rotg.hpp b/include/infinicore/ops/rotg.hpp new file mode 100644 index 000000000..c65f211dd --- /dev/null +++ b/include/infinicore/ops/rotg.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Rotg { +public: + using schema = void (*)(Tensor, Tensor, Tensor, Tensor); + static void execute(Tensor x, Tensor y, Tensor c, Tensor s); + static common::OpDispatcher &dispatcher(); +}; + +void rotg_(Tensor x, Tensor y, Tensor c, Tensor s); + +} // namespace infinicore::op 
diff --git a/include/infinicore/ops/rotm.hpp b/include/infinicore/ops/rotm.hpp new file mode 100644 index 000000000..813fc27ee --- /dev/null +++ b/include/infinicore/ops/rotm.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Rotm { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor x, Tensor y, Tensor param); + static common::OpDispatcher &dispatcher(); +}; + +void rotm_(Tensor x, Tensor y, Tensor param); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/rotmg.hpp b/include/infinicore/ops/rotmg.hpp new file mode 100644 index 000000000..e245840a4 --- /dev/null +++ b/include/infinicore/ops/rotmg.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Rotmg { +public: + using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor); + static void execute(Tensor d1, Tensor d2, Tensor x1, Tensor y1, Tensor param); + static common::OpDispatcher &dispatcher(); +}; + +void rotmg_(Tensor d1, Tensor d2, Tensor x1, Tensor y1, Tensor param); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/scal.hpp b/include/infinicore/ops/scal.hpp new file mode 100644 index 000000000..d6cb5ce8c --- /dev/null +++ b/include/infinicore/ops/scal.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_CLASS(Scal, const Tensor &, Tensor); + +void scal_(const Tensor &alpha, Tensor x); + +} // namespace infinicore::op diff --git a/include/infinicore/ops/swap.hpp b/include/infinicore/ops/swap.hpp new file mode 100644 index 000000000..aba3ad563 --- /dev/null +++ b/include/infinicore/ops/swap.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + 
+INFINICORE_GRAPH_OP_CLASS(Swap, Tensor, Tensor); + +void swap_(Tensor x, Tensor y); + +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index 0ec995823..de33a7a4b 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -15,11 +15,17 @@ #include "infiniop/ops/all.h" #include "infiniop/ops/asin.h" #include "infiniop/ops/asinh.h" +#include "infiniop/ops/asum.h" #include "infiniop/ops/atanh.h" #include "infiniop/ops/attention.h" #include "infiniop/ops/avg_pool1d.h" #include "infiniop/ops/avg_pool3d.h" +#include "infiniop/ops/axpy.h" #include "infiniop/ops/binary_cross_entropy_with_logits.h" +#include "infiniop/ops/blas_amax.h" +#include "infiniop/ops/blas_amin.h" +#include "infiniop/ops/blas_copy.h" +#include "infiniop/ops/blas_dot.h" #include "infiniop/ops/block_diag.h" #include "infiniop/ops/broadcast_to.h" #include "infiniop/ops/causal_softmax.h" @@ -78,6 +84,7 @@ #include "infiniop/ops/matrix_power.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/multi_margin_loss.h" +#include "infiniop/ops/nrm2.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/pad.h" #include "infiniop/ops/paged_attention.h" @@ -93,6 +100,11 @@ #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" +#include "infiniop/ops/rot.h" +#include "infiniop/ops/rotg.h" +#include "infiniop/ops/rotm.h" +#include "infiniop/ops/rotmg.h" +#include "infiniop/ops/scal.h" #include "infiniop/ops/scatter.h" #include "infiniop/ops/selu.h" #include "infiniop/ops/sigmoid.h" @@ -105,6 +117,7 @@ #include "infiniop/ops/softsign.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/sum.h" +#include "infiniop/ops/swap.h" #include "infiniop/ops/swiglu.h" #include "infiniop/ops/take.h" #include "infiniop/ops/tan.h" diff --git a/include/infiniop/ops/asum.h b/include/infiniop/ops/asum.h new file mode 100644 index 000000000..89336bce7 --- /dev/null +++ b/include/infiniop/ops/asum.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ASUM_API_H__ 
+#define __INFINIOP_ASUM_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAsumDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateAsumDescriptor(infiniopHandle_t handle, + infiniopAsumDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t result); + +__INFINI_C __export infiniStatus_t infiniopGetAsumWorkspaceSize(infiniopAsumDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopAsum(infiniopAsumDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyAsumDescriptor(infiniopAsumDescriptor_t desc); + +#endif // __INFINIOP_ASUM_API_H__ diff --git a/include/infiniop/ops/axpy.h b/include/infiniop/ops/axpy.h new file mode 100644 index 000000000..ce6b2a23b --- /dev/null +++ b/include/infiniop/ops/axpy.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_AXPY_API_H__ +#define __INFINIOP_AXPY_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAxpyDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateAxpyDescriptor(infiniopHandle_t handle, + infiniopAxpyDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t alpha, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y); + +__INFINI_C __export infiniStatus_t infiniopGetAxpyWorkspaceSize(infiniopAxpyDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopAxpy(infiniopAxpyDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *alpha, + const void *x, + void *y, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyAxpyDescriptor(infiniopAxpyDescriptor_t desc); + +#endif // __INFINIOP_AXPY_API_H__ diff --git a/include/infiniop/ops/blas_amax.h b/include/infiniop/ops/blas_amax.h new file mode 100644 index 000000000..9981c7fb7 --- /dev/null +++ b/include/infiniop/ops/blas_amax.h @@ -0,0 +1,24 @@ +#ifndef 
__INFINIOP_BLAS_AMAX_API_H__ +#define __INFINIOP_BLAS_AMAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopBlasAmaxDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateBlasAmaxDescriptor(infiniopHandle_t handle, + infiniopBlasAmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t result); + +__INFINI_C __export infiniStatus_t infiniopGetBlasAmaxWorkspaceSize(infiniopBlasAmaxDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopBlasAmax(infiniopBlasAmaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyBlasAmaxDescriptor(infiniopBlasAmaxDescriptor_t desc); + +#endif // __INFINIOP_BLAS_AMAX_API_H__ diff --git a/include/infiniop/ops/blas_amin.h b/include/infiniop/ops/blas_amin.h new file mode 100644 index 000000000..6bc8680ba --- /dev/null +++ b/include/infiniop/ops/blas_amin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_BLAS_AMIN_API_H__ +#define __INFINIOP_BLAS_AMIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopBlasAminDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateBlasAminDescriptor(infiniopHandle_t handle, + infiniopBlasAminDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t result); + +__INFINI_C __export infiniStatus_t infiniopGetBlasAminWorkspaceSize(infiniopBlasAminDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopBlasAmin(infiniopBlasAminDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyBlasAminDescriptor(infiniopBlasAminDescriptor_t desc); + +#endif // __INFINIOP_BLAS_AMIN_API_H__ diff --git a/include/infiniop/ops/blas_copy.h b/include/infiniop/ops/blas_copy.h new file mode 100644 index 
000000000..7c6f3611c --- /dev/null +++ b/include/infiniop/ops/blas_copy.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_BLAS_COPY_API_H__ +#define __INFINIOP_BLAS_COPY_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopBlasCopyDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateBlasCopyDescriptor(infiniopHandle_t handle, + infiniopBlasCopyDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y); + +__INFINI_C __export infiniStatus_t infiniopGetBlasCopyWorkspaceSize(infiniopBlasCopyDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopBlasCopy(infiniopBlasCopyDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *x, + void *y, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyBlasCopyDescriptor(infiniopBlasCopyDescriptor_t desc); + +#endif // __INFINIOP_BLAS_COPY_API_H__ diff --git a/include/infiniop/ops/blas_dot.h b/include/infiniop/ops/blas_dot.h new file mode 100644 index 000000000..9d03af2c0 --- /dev/null +++ b/include/infiniop/ops/blas_dot.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_BLAS_DOT_API_H__ +#define __INFINIOP_BLAS_DOT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopBlasDotDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateBlasDotDescriptor(infiniopHandle_t handle, + infiniopBlasDotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t result); + +__INFINI_C __export infiniStatus_t infiniopGetBlasDotWorkspaceSize(infiniopBlasDotDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopBlasDot(infiniopBlasDotDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *x, + const void *y, + void *result, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyBlasDotDescriptor(infiniopBlasDotDescriptor_t desc); + +#endif // 
__INFINIOP_BLAS_DOT_API_H__ diff --git a/include/infiniop/ops/nrm2.h b/include/infiniop/ops/nrm2.h new file mode 100644 index 000000000..2f1eed348 --- /dev/null +++ b/include/infiniop/ops/nrm2.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_NRM2_API_H__ +#define __INFINIOP_NRM2_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopNrm2Descriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateNrm2Descriptor(infiniopHandle_t handle, + infiniopNrm2Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t result); + +__INFINI_C __export infiniStatus_t infiniopGetNrm2WorkspaceSize(infiniopNrm2Descriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopNrm2(infiniopNrm2Descriptor_t desc, + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyNrm2Descriptor(infiniopNrm2Descriptor_t desc); + +#endif // __INFINIOP_NRM2_API_H__ diff --git a/include/infiniop/ops/rot.h b/include/infiniop/ops/rot.h new file mode 100644 index 000000000..0ecbae52d --- /dev/null +++ b/include/infiniop/ops/rot.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_ROT_API_H__ +#define __INFINIOP_ROT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopRotDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateRotDescriptor(infiniopHandle_t handle, + infiniopRotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t s); + +__INFINI_C __export infiniStatus_t infiniopGetRotWorkspaceSize(infiniopRotDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopRot(infiniopRotDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *x, + void *y, + const void *c, + const void *s, + void *stream); + +__INFINI_C __export infiniStatus_t 
infiniopDestroyRotDescriptor(infiniopRotDescriptor_t desc); + +#endif // __INFINIOP_ROT_API_H__ diff --git a/include/infiniop/ops/rotg.h b/include/infiniop/ops/rotg.h new file mode 100644 index 000000000..63c2dad46 --- /dev/null +++ b/include/infiniop/ops/rotg.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_ROTG_API_H__ +#define __INFINIOP_ROTG_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopRotgDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateRotgDescriptor(infiniopHandle_t handle, + infiniopRotgDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t s); + +__INFINI_C __export infiniStatus_t infiniopGetRotgWorkspaceSize(infiniopRotgDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopRotg(infiniopRotgDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *x, + void *y, + void *c, + void *s, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyRotgDescriptor(infiniopRotgDescriptor_t desc); + +#endif // __INFINIOP_ROTG_API_H__ diff --git a/include/infiniop/ops/rotm.h b/include/infiniop/ops/rotm.h new file mode 100644 index 000000000..6cc6a636c --- /dev/null +++ b/include/infiniop/ops/rotm.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_ROTM_API_H__ +#define __INFINIOP_ROTM_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopRotmDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateRotmDescriptor(infiniopHandle_t handle, + infiniopRotmDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t param); + +__INFINI_C __export infiniStatus_t infiniopGetRotmWorkspaceSize(infiniopRotmDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopRotm(infiniopRotmDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *x, + void *y, + const 
void *param, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyRotmDescriptor(infiniopRotmDescriptor_t desc); + +#endif // __INFINIOP_ROTM_API_H__ diff --git a/include/infiniop/ops/rotmg.h b/include/infiniop/ops/rotmg.h new file mode 100644 index 000000000..0295339cb --- /dev/null +++ b/include/infiniop/ops/rotmg.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_ROTMG_API_H__ +#define __INFINIOP_ROTMG_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopRotmgDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateRotmgDescriptor(infiniopHandle_t handle, + infiniopRotmgDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t d1, + infiniopTensorDescriptor_t d2, + infiniopTensorDescriptor_t x1, + infiniopTensorDescriptor_t y1, + infiniopTensorDescriptor_t param); + +__INFINI_C __export infiniStatus_t infiniopGetRotmgWorkspaceSize(infiniopRotmgDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopRotmg(infiniopRotmgDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *d1, + void *d2, + void *x1, + const void *y1, + void *param, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyRotmgDescriptor(infiniopRotmgDescriptor_t desc); + +#endif // __INFINIOP_ROTMG_API_H__ diff --git a/include/infiniop/ops/scal.h b/include/infiniop/ops/scal.h new file mode 100644 index 000000000..f7903de56 --- /dev/null +++ b/include/infiniop/ops/scal.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SCAL_API_H__ +#define __INFINIOP_SCAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopScalDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateScalDescriptor(infiniopHandle_t handle, + infiniopScalDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t alpha, + infiniopTensorDescriptor_t x); + +__INFINI_C __export infiniStatus_t infiniopGetScalWorkspaceSize(infiniopScalDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t 
infiniopScal(infiniopScalDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *alpha, + void *x, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyScalDescriptor(infiniopScalDescriptor_t desc); + +#endif // __INFINIOP_SCAL_API_H__ diff --git a/include/infiniop/ops/swap.h b/include/infiniop/ops/swap.h new file mode 100644 index 000000000..7eb14b57a --- /dev/null +++ b/include/infiniop/ops/swap.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SWAP_API_H__ +#define __INFINIOP_SWAP_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSwapDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateSwapDescriptor(infiniopHandle_t handle, + infiniopSwapDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y); + +__INFINI_C __export infiniStatus_t infiniopGetSwapWorkspaceSize(infiniopSwapDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopSwap(infiniopSwapDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *x, + void *y, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroySwapDescriptor(infiniopSwapDescriptor_t desc); + +#endif // __INFINIOP_SWAP_API_H__ diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 9c87a5108..8c9adc64c 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -58,14 +58,20 @@ from infinicore.ops.argwhere import argwhere from infinicore.ops.asin import asin from infinicore.ops.asinh import asinh +from infinicore.ops.asum import asum from infinicore.ops.atanh import atanh from infinicore.ops.attention import attention +from infinicore.ops.axpy import axpy from infinicore.ops.baddbmm import baddbmm from infinicore.ops.bilinear import bilinear from infinicore.ops.binary_cross_entropy_with_logits import ( binary_cross_entropy_with_logits, ) from infinicore.ops.bitwise_right_shift import bitwise_right_shift +from 
infinicore.ops.blas_amax import blas_amax +from infinicore.ops.blas_amin import blas_amin +from infinicore.ops.blas_copy import blas_copy +from infinicore.ops.blas_dot import blas_dot from infinicore.ops.block_diag import block_diag from infinicore.ops.broadcast_to import broadcast_to from infinicore.ops.cat import cat @@ -102,15 +108,22 @@ from infinicore.ops.mha_varlen import mha_varlen from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow +from infinicore.ops.nrm2 import nrm2 from infinicore.ops.paged_attention import paged_attention from infinicore.ops.paged_attention_prefill import paged_attention_prefill from infinicore.ops.paged_caching import paged_caching from infinicore.ops.rearrange import rearrange from infinicore.ops.reciprocal import reciprocal +from infinicore.ops.rot import rot +from infinicore.ops.rotg import rotg +from infinicore.ops.rotm import rotm +from infinicore.ops.rotmg import rotmg +from infinicore.ops.scal import scal from infinicore.ops.scatter import scatter from infinicore.ops.sinh import sinh from infinicore.ops.squeeze import squeeze from infinicore.ops.sum import sum +from infinicore.ops.swap import swap from infinicore.ops.take import take from infinicore.ops.tan import tan from infinicore.ops.topk import topk @@ -185,6 +198,12 @@ "add_rms_norm", "argwhere", "asin", + "asum", + "axpy", + "blas_amax", + "blas_amin", + "blas_copy", + "blas_dot", "acos", "addbmm", "floor", @@ -210,6 +229,7 @@ "dist", "logdet", "narrow", + "nrm2", "ldexp", "lerp", "kthvalue", @@ -231,6 +251,11 @@ "float_power", "flipud", "scatter", + "rot", + "rotg", + "rotm", + "rotmg", + "scal", "logcumsumexp", "logical_not", "logical_and", @@ -243,6 +268,7 @@ "index_add", "take", "sinh", + "swap", "ones", "broadcast_to", "strided_empty", diff --git a/python/infinicore/ops/asum.py b/python/infinicore/ops/asum.py new file mode 100644 index 000000000..589a02129 --- /dev/null +++ b/python/infinicore/ops/asum.py @@ -0,0 +1,11 @@ +from infinicore.lib 
import _infinicore +from infinicore.tensor import Tensor + + +def asum(x: Tensor, *, out=None): + if out is None: + return Tensor(_infinicore.asum(x._underlying)) + + _infinicore.asum_(x._underlying, out._underlying) + + return out diff --git a/python/infinicore/ops/axpy.py b/python/infinicore/ops/axpy.py new file mode 100644 index 000000000..3457038fb --- /dev/null +++ b/python/infinicore/ops/axpy.py @@ -0,0 +1,7 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def axpy(alpha: Tensor, x: Tensor, y: Tensor): + _infinicore.axpy_(alpha._underlying, x._underlying, y._underlying) + return y diff --git a/python/infinicore/ops/blas_amax.py b/python/infinicore/ops/blas_amax.py new file mode 100644 index 000000000..65279963d --- /dev/null +++ b/python/infinicore/ops/blas_amax.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def blas_amax(x: Tensor, *, out=None): + if out is None: + return Tensor(_infinicore.blas_amax(x._underlying)) + + _infinicore.blas_amax_(x._underlying, out._underlying) + + return out diff --git a/python/infinicore/ops/blas_amin.py b/python/infinicore/ops/blas_amin.py new file mode 100644 index 000000000..472313faa --- /dev/null +++ b/python/infinicore/ops/blas_amin.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def blas_amin(x: Tensor, *, out=None): + if out is None: + return Tensor(_infinicore.blas_amin(x._underlying)) + + _infinicore.blas_amin_(x._underlying, out._underlying) + + return out diff --git a/python/infinicore/ops/blas_copy.py b/python/infinicore/ops/blas_copy.py new file mode 100644 index 000000000..75d4abef3 --- /dev/null +++ b/python/infinicore/ops/blas_copy.py @@ -0,0 +1,7 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def blas_copy(x: Tensor, y: Tensor): + _infinicore.blas_copy_(x._underlying, y._underlying) + return y diff --git 
a/python/infinicore/ops/blas_dot.py b/python/infinicore/ops/blas_dot.py new file mode 100644 index 000000000..60f0541b0 --- /dev/null +++ b/python/infinicore/ops/blas_dot.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def blas_dot(x: Tensor, y: Tensor, *, out=None): + if out is None: + return Tensor(_infinicore.blas_dot(x._underlying, y._underlying)) + + _infinicore.blas_dot_(x._underlying, y._underlying, out._underlying) + + return out diff --git a/python/infinicore/ops/nrm2.py b/python/infinicore/ops/nrm2.py new file mode 100644 index 000000000..fcc3a0d3a --- /dev/null +++ b/python/infinicore/ops/nrm2.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def nrm2(x: Tensor, *, out=None): + if out is None: + return Tensor(_infinicore.nrm2(x._underlying)) + + _infinicore.nrm2_(x._underlying, out._underlying) + + return out diff --git a/python/infinicore/ops/rot.py b/python/infinicore/ops/rot.py new file mode 100644 index 000000000..091775c4e --- /dev/null +++ b/python/infinicore/ops/rot.py @@ -0,0 +1,7 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def rot(x: Tensor, y: Tensor, c: Tensor, s: Tensor): + _infinicore.rot_(x._underlying, y._underlying, c._underlying, s._underlying) + return x, y diff --git a/python/infinicore/ops/rotg.py b/python/infinicore/ops/rotg.py new file mode 100644 index 000000000..8299e6205 --- /dev/null +++ b/python/infinicore/ops/rotg.py @@ -0,0 +1,7 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def rotg(x: Tensor, y: Tensor, c: Tensor, s: Tensor): + _infinicore.rotg_(x._underlying, y._underlying, c._underlying, s._underlying) + return x, y, c, s diff --git a/python/infinicore/ops/rotm.py b/python/infinicore/ops/rotm.py new file mode 100644 index 000000000..68acfc231 --- /dev/null +++ b/python/infinicore/ops/rotm.py @@ -0,0 +1,7 @@ +from infinicore.lib import 
_infinicore +from infinicore.tensor import Tensor + + +def rotm(x: Tensor, y: Tensor, param: Tensor): + _infinicore.rotm_(x._underlying, y._underlying, param._underlying) + return x, y diff --git a/python/infinicore/ops/rotmg.py b/python/infinicore/ops/rotmg.py new file mode 100644 index 000000000..468dd687e --- /dev/null +++ b/python/infinicore/ops/rotmg.py @@ -0,0 +1,13 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def rotmg(d1: Tensor, d2: Tensor, x1: Tensor, y1: Tensor, param: Tensor): + _infinicore.rotmg_( + d1._underlying, + d2._underlying, + x1._underlying, + y1._underlying, + param._underlying, + ) + return d1, d2, x1, param diff --git a/python/infinicore/ops/scal.py b/python/infinicore/ops/scal.py new file mode 100644 index 000000000..8302e74a5 --- /dev/null +++ b/python/infinicore/ops/scal.py @@ -0,0 +1,8 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def scal(x: Tensor, alpha: Tensor): + _infinicore.scal_(alpha._underlying, x._underlying) + + return x diff --git a/python/infinicore/ops/swap.py b/python/infinicore/ops/swap.py new file mode 100644 index 000000000..f773a34eb --- /dev/null +++ b/python/infinicore/ops/swap.py @@ -0,0 +1,7 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def swap(x: Tensor, y: Tensor): + _infinicore.swap_(x._underlying, y._underlying) + return x, y diff --git a/src/infinicore/ops/asum/asum.cc b/src/infinicore/ops/asum/asum.cc new file mode 100644 index 000000000..c757574bb --- /dev/null +++ b/src/infinicore/ops/asum/asum.cc @@ -0,0 +1,28 @@ +#include "infinicore/ops/asum.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Asum); + +Asum::Asum(const Tensor &x, Tensor result) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, result); + INFINICORE_GRAPH_OP_DISPATCH(result->device().getType(), x, result); +} + +void Asum::execute(const Tensor &x, Tensor result) { + 
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Asum, x, result); +} + +Tensor asum(const Tensor &x) { + auto result = Tensor::empty({}, x->dtype(), x->device()); + asum_(x, result); + return result; +} + +void asum_(const Tensor &x, Tensor result) { + Asum::execute(x, result); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/asum/asum_infiniop.cc b/src/infinicore/ops/asum/asum_infiniop.cc new file mode 100644 index 000000000..0cfd8b721 --- /dev/null +++ b/src/infinicore/ops/asum/asum_infiniop.cc @@ -0,0 +1,50 @@ +#include "infinicore/ops/asum.hpp" + +#include "../infiniop_impl.hpp" + +namespace infinicore::op::asum_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Asum, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, x, result; +}; + +void *plan(const Tensor &x, Tensor result) { + size_t seed = hash_combine(x, result); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, Asum, + seed, + x->desc(), result->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, Asum, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(x), + graph::GraphTensor(result)}; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopAsum( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->x->data(), + planned->result->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Asum, &plan, &run, &cleanup); + +} // namespace infinicore::op::asum_impl::infiniop diff --git a/src/infinicore/ops/axpy/axpy.cc b/src/infinicore/ops/axpy/axpy.cc new file mode 100644 index 000000000..d5d3241ec --- /dev/null +++ b/src/infinicore/ops/axpy/axpy.cc @@ -0,0 +1,22 @@ +#include "infinicore/ops/axpy.hpp" + +#include 
"../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Axpy); + +Axpy::Axpy(const Tensor &alpha, const Tensor &x, Tensor y) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(alpha, x, y); + INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), alpha, x, y); +} + +void Axpy::execute(const Tensor &alpha, const Tensor &x, Tensor y) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(Axpy, alpha, x, y); +} + +void axpy_(const Tensor &alpha, const Tensor &x, Tensor y) { + Axpy::execute(alpha, x, y); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/axpy/axpy_infiniop.cc b/src/infinicore/ops/axpy/axpy_infiniop.cc new file mode 100644 index 000000000..8d54c3823 --- /dev/null +++ b/src/infinicore/ops/axpy/axpy_infiniop.cc @@ -0,0 +1,52 @@ +#include "infinicore/ops/axpy.hpp" + +#include "../infiniop_impl.hpp" + +namespace infinicore::op::axpy_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Axpy, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, alpha, x, y; +}; + +void *plan(const Tensor &alpha, const Tensor &x, Tensor y) { + size_t seed = hash_combine(y, alpha, x); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, Axpy, + seed, + alpha->desc(), x->desc(), y->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, Axpy, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(alpha), + graph::GraphTensor(x), + graph::GraphTensor(y)}; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopAxpy( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->alpha->data(), + planned->x->data(), + planned->y->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Axpy, &plan, &run, 
&cleanup); + +} // namespace infinicore::op::axpy_impl::infiniop diff --git a/src/infinicore/ops/blas_amax/blas_amax.cc b/src/infinicore/ops/blas_amax/blas_amax.cc new file mode 100644 index 000000000..9579589ff --- /dev/null +++ b/src/infinicore/ops/blas_amax/blas_amax.cc @@ -0,0 +1,28 @@ +#include "infinicore/ops/blas_amax.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(BlasAmax); + +BlasAmax::BlasAmax(const Tensor &x, Tensor result) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, result); + INFINICORE_GRAPH_OP_DISPATCH(result->device().getType(), x, result); +} + +void BlasAmax::execute(const Tensor &x, Tensor result) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(BlasAmax, x, result); +} + +Tensor blas_amax(const Tensor &x) { + auto result = Tensor::empty({}, DataType::I32, x->device()); + blas_amax_(x, result); + return result; +} + +void blas_amax_(const Tensor &x, Tensor result) { + BlasAmax::execute(x, result); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/blas_amax/blas_amax_infiniop.cc b/src/infinicore/ops/blas_amax/blas_amax_infiniop.cc new file mode 100644 index 000000000..780fca744 --- /dev/null +++ b/src/infinicore/ops/blas_amax/blas_amax_infiniop.cc @@ -0,0 +1,50 @@ +#include "infinicore/ops/blas_amax.hpp" + +#include "../infiniop_impl.hpp" + +namespace infinicore::op::blas_amax_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, BlasAmax, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, x, result; +}; + +void *plan(const Tensor &x, Tensor result) { + size_t seed = hash_combine(x, result); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, BlasAmax, + seed, + x->desc(), result->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, BlasAmax, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(x), + graph::GraphTensor(result)}; +} + +void run(void 
*planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopBlasAmax( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->x->data(), + planned->result->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(BlasAmax, &plan, &run, &cleanup); + +} // namespace infinicore::op::blas_amax_impl::infiniop diff --git a/src/infinicore/ops/blas_amin/blas_amin.cc b/src/infinicore/ops/blas_amin/blas_amin.cc new file mode 100644 index 000000000..e18e22739 --- /dev/null +++ b/src/infinicore/ops/blas_amin/blas_amin.cc @@ -0,0 +1,28 @@ +#include "infinicore/ops/blas_amin.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(BlasAmin); + +BlasAmin::BlasAmin(const Tensor &x, Tensor result) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, result); + INFINICORE_GRAPH_OP_DISPATCH(result->device().getType(), x, result); +} + +void BlasAmin::execute(const Tensor &x, Tensor result) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(BlasAmin, x, result); +} + +Tensor blas_amin(const Tensor &x) { + auto result = Tensor::empty({}, DataType::I32, x->device()); + blas_amin_(x, result); + return result; +} + +void blas_amin_(const Tensor &x, Tensor result) { + BlasAmin::execute(x, result); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/blas_amin/blas_amin_infiniop.cc b/src/infinicore/ops/blas_amin/blas_amin_infiniop.cc new file mode 100644 index 000000000..00abf77e1 --- /dev/null +++ b/src/infinicore/ops/blas_amin/blas_amin_infiniop.cc @@ -0,0 +1,50 @@ +#include "infinicore/ops/blas_amin.hpp" + +#include "../infiniop_impl.hpp" + +namespace infinicore::op::blas_amin_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, BlasAmin, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + 
graph::GraphTensor workspace, x, result; +}; + +void *plan(const Tensor &x, Tensor result) { + size_t seed = hash_combine(x, result); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, BlasAmin, + seed, + x->desc(), result->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, BlasAmin, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(x), + graph::GraphTensor(result)}; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopBlasAmin( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->x->data(), + planned->result->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(BlasAmin, &plan, &run, &cleanup); + +} // namespace infinicore::op::blas_amin_impl::infiniop diff --git a/src/infinicore/ops/blas_copy/blas_copy.cc b/src/infinicore/ops/blas_copy/blas_copy.cc new file mode 100644 index 000000000..77b2e7a5f --- /dev/null +++ b/src/infinicore/ops/blas_copy/blas_copy.cc @@ -0,0 +1,22 @@ +#include "infinicore/ops/blas_copy.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(BlasCopy); + +BlasCopy::BlasCopy(const Tensor &x, Tensor y) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, y); + INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), x, y); +} + +void BlasCopy::execute(const Tensor &x, Tensor y) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(BlasCopy, x, y); +} + +void blas_copy_(const Tensor &x, Tensor y) { + BlasCopy::execute(x, y); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/blas_copy/blas_copy_infiniop.cc b/src/infinicore/ops/blas_copy/blas_copy_infiniop.cc new file mode 100644 index 000000000..33a70523b --- /dev/null +++ 
b/src/infinicore/ops/blas_copy/blas_copy_infiniop.cc @@ -0,0 +1,50 @@ +#include "infinicore/ops/blas_copy.hpp" + +#include "../infiniop_impl.hpp" + +namespace infinicore::op::blas_copy_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, BlasCopy, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, x, y; +}; + +void *plan(const Tensor &x, Tensor y) { + size_t seed = hash_combine(x, y); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, BlasCopy, + seed, + x->desc(), y->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, BlasCopy, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(x), + graph::GraphTensor(y)}; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopBlasCopy( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->x->data(), + planned->y->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(BlasCopy, &plan, &run, &cleanup); + +} // namespace infinicore::op::blas_copy_impl::infiniop diff --git a/src/infinicore/ops/blas_dot/blas_dot.cc b/src/infinicore/ops/blas_dot/blas_dot.cc new file mode 100644 index 000000000..8dba37acc --- /dev/null +++ b/src/infinicore/ops/blas_dot/blas_dot.cc @@ -0,0 +1,28 @@ +#include "infinicore/ops/blas_dot.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(BlasDot); + +BlasDot::BlasDot(const Tensor &x, const Tensor &y, Tensor result) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, y, result); + INFINICORE_GRAPH_OP_DISPATCH(result->device().getType(), x, y, result); +} + +void BlasDot::execute(const Tensor &x, const Tensor &y, Tensor result) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(BlasDot, x, y, 
result); +} + +Tensor blas_dot(const Tensor &x, const Tensor &y) { + auto result = Tensor::empty({}, x->dtype(), x->device()); + blas_dot_(x, y, result); + return result; +} + +void blas_dot_(const Tensor &x, const Tensor &y, Tensor result) { + BlasDot::execute(x, y, result); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/blas_dot/blas_dot_infiniop.cc b/src/infinicore/ops/blas_dot/blas_dot_infiniop.cc new file mode 100644 index 000000000..78559eea7 --- /dev/null +++ b/src/infinicore/ops/blas_dot/blas_dot_infiniop.cc @@ -0,0 +1,52 @@ +#include "infinicore/ops/blas_dot.hpp" + +#include "../infiniop_impl.hpp" + +namespace infinicore::op::blas_dot_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, BlasDot, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, x, y, result; +}; + +void *plan(const Tensor &x, const Tensor &y, Tensor result) { + size_t seed = hash_combine(x, y, result); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, BlasDot, + seed, + x->desc(), y->desc(), result->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, BlasDot, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(x), + graph::GraphTensor(y), + graph::GraphTensor(result)}; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopBlasDot( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->x->data(), + planned->y->data(), + planned->result->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(BlasDot, &plan, &run, &cleanup); + +} // namespace infinicore::op::blas_dot_impl::infiniop diff --git a/src/infinicore/ops/nrm2/nrm2.cc b/src/infinicore/ops/nrm2/nrm2.cc new file mode 100644 index 
000000000..276e1015e --- /dev/null +++ b/src/infinicore/ops/nrm2/nrm2.cc @@ -0,0 +1,28 @@ +#include "infinicore/ops/nrm2.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Nrm2); + +Nrm2::Nrm2(const Tensor &x, Tensor result) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, result); + INFINICORE_GRAPH_OP_DISPATCH(result->device().getType(), x, result); +} + +void Nrm2::execute(const Tensor &x, Tensor result) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(Nrm2, x, result); +} + +Tensor nrm2(const Tensor &x) { + auto result = Tensor::empty({}, x->dtype(), x->device()); + nrm2_(x, result); + return result; +} + +void nrm2_(const Tensor &x, Tensor result) { + Nrm2::execute(x, result); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/nrm2/nrm2_infiniop.cc b/src/infinicore/ops/nrm2/nrm2_infiniop.cc new file mode 100644 index 000000000..3f3ca8c74 --- /dev/null +++ b/src/infinicore/ops/nrm2/nrm2_infiniop.cc @@ -0,0 +1,50 @@ +#include "infinicore/ops/nrm2.hpp" + +#include "../infiniop_impl.hpp" + +namespace infinicore::op::nrm2_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Nrm2, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, x, result; +}; + +void *plan(const Tensor &x, Tensor result) { + size_t seed = hash_combine(x, result); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, Nrm2, + seed, + x->desc(), result->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, Nrm2, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(x), + graph::GraphTensor(result)}; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopNrm2( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->x->data(), + planned->result->data(), + context::getStream())); +} + +void cleanup(void 
**planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Nrm2, &plan, &run, &cleanup); + +} // namespace infinicore::op::nrm2_impl::infiniop diff --git a/src/infinicore/ops/rot/rot.cc b/src/infinicore/ops/rot/rot.cc new file mode 100644 index 000000000..262cce001 --- /dev/null +++ b/src/infinicore/ops/rot/rot.cc @@ -0,0 +1,22 @@ +#include "infinicore/ops/rot.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Rot); + +Rot::Rot(Tensor x, Tensor y, const Tensor &c, const Tensor &s) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, y, c, s); + INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, y, c, s); +} + +void Rot::execute(Tensor x, Tensor y, const Tensor &c, const Tensor &s) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(Rot, x, y, c, s); +} + +void rot_(Tensor x, Tensor y, const Tensor &c, const Tensor &s) { + Rot::execute(x, y, c, s); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/rot/rot_infiniop.cc b/src/infinicore/ops/rot/rot_infiniop.cc new file mode 100644 index 000000000..a114bf110 --- /dev/null +++ b/src/infinicore/ops/rot/rot_infiniop.cc @@ -0,0 +1,54 @@ +#include "infinicore/ops/rot.hpp" + +#include "../infiniop_impl.hpp" + +namespace infinicore::op::rot_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Rot, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, x, y, c, s; +}; + +void *plan(Tensor x, Tensor y, const Tensor &c, const Tensor &s) { + size_t seed = hash_combine(x, y, c, s); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, Rot, + seed, + x->desc(), y->desc(), c->desc(), s->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, Rot, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(x), + graph::GraphTensor(y), + graph::GraphTensor(c), + graph::GraphTensor(s)}; +} + 
+void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopRot( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->x->data(), + planned->y->data(), + planned->c->data(), + planned->s->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Rot, &plan, &run, &cleanup); + +} // namespace infinicore::op::rot_impl::infiniop diff --git a/src/infinicore/ops/rotg/rotg.cc b/src/infinicore/ops/rotg/rotg.cc new file mode 100644 index 000000000..233177b68 --- /dev/null +++ b/src/infinicore/ops/rotg/rotg.cc @@ -0,0 +1,22 @@ +#include "infinicore/ops/rotg.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Rotg::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Rotg::execute(Tensor x, Tensor y, Tensor c, Tensor s) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, y, c, s); + infinicore::context::setDevice(x->device()); + dispatcher().lookup(x->device().getType())(x, y, c, s); +} + +void rotg_(Tensor x, Tensor y, Tensor c, Tensor s) { + Rotg::execute(x, y, c, s); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/rotg/rotg_infiniop.cc b/src/infinicore/ops/rotg/rotg_infiniop.cc new file mode 100644 index 000000000..d5e7a0e99 --- /dev/null +++ b/src/infinicore/ops/rotg/rotg_infiniop.cc @@ -0,0 +1,56 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/rotg.hpp" +#include + +namespace infinicore::op::rotg_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopRotgDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyRotgDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor x, Tensor y, 
Tensor c, Tensor s) { + size_t seed = hash_combine(x, y, c, s); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopRotgDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateRotgDescriptor( + context::getInfiniopHandle(x->device()), &desc, + x->desc(), y->desc(), c->desc(), s->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetRotgWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopRotg( + desc, workspace->data(), workspace_size, + x->data(), y->data(), c->data(), s->data(), context::getStream())); +} + +static bool registered = []() { + Rotg::dispatcher().registerDevice({Device::Type::CPU, + Device::Type::CAMBRICON, + Device::Type::METAX}, + &calculate, + false); + return true; +}(); + +} // namespace infinicore::op::rotg_impl::infiniop diff --git a/src/infinicore/ops/rotm/rotm.cc b/src/infinicore/ops/rotm/rotm.cc new file mode 100644 index 000000000..2b1921f8b --- /dev/null +++ b/src/infinicore/ops/rotm/rotm.cc @@ -0,0 +1,22 @@ +#include "infinicore/ops/rotm.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Rotm::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Rotm::execute(Tensor x, Tensor y, Tensor param) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, y, param); + infinicore::context::setDevice(x->device()); + dispatcher().lookup(x->device().getType())(x, y, param); +} + +void rotm_(Tensor x, Tensor y, Tensor param) { + Rotm::execute(x, y, param); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/rotm/rotm_infiniop.cc b/src/infinicore/ops/rotm/rotm_infiniop.cc new file mode 100644 index 
000000000..9df5c9ab1 --- /dev/null +++ b/src/infinicore/ops/rotm/rotm_infiniop.cc @@ -0,0 +1,56 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/rotm.hpp" +#include + +namespace infinicore::op::rotm_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopRotmDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyRotmDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor x, Tensor y, Tensor param) { + size_t seed = hash_combine(x, y, param); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopRotmDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateRotmDescriptor( + context::getInfiniopHandle(x->device()), &desc, + x->desc(), y->desc(), param->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetRotmWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopRotm( + desc, workspace->data(), workspace_size, + x->data(), y->data(), param->data(), context::getStream())); +} + +static bool registered = []() { + Rotm::dispatcher().registerDevice({Device::Type::CPU, + Device::Type::CAMBRICON, + Device::Type::METAX}, + &calculate, + false); + return true; +}(); + +} // namespace infinicore::op::rotm_impl::infiniop diff --git a/src/infinicore/ops/rotmg/rotmg.cc b/src/infinicore/ops/rotmg/rotmg.cc new file mode 100644 index 000000000..8a2860d63 --- /dev/null +++ b/src/infinicore/ops/rotmg/rotmg.cc @@ -0,0 +1,22 @@ +#include "infinicore/ops/rotmg.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher 
&Rotmg::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Rotmg::execute(Tensor d1, Tensor d2, Tensor x1, Tensor y1, Tensor param) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(d1, d2, x1, y1, param); + infinicore::context::setDevice(d1->device()); + dispatcher().lookup(d1->device().getType())(d1, d2, x1, y1, param); +} + +void rotmg_(Tensor d1, Tensor d2, Tensor x1, Tensor y1, Tensor param) { + Rotmg::execute(d1, d2, x1, y1, param); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/rotmg/rotmg_infiniop.cc b/src/infinicore/ops/rotmg/rotmg_infiniop.cc new file mode 100644 index 000000000..ade0bd7aa --- /dev/null +++ b/src/infinicore/ops/rotmg/rotmg_infiniop.cc @@ -0,0 +1,56 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/rotmg.hpp" +#include + +namespace infinicore::op::rotmg_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopRotmgDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyRotmgDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor d1, Tensor d2, Tensor x1, Tensor y1, Tensor param) { + size_t seed = hash_combine(d1, d2, x1, y1, param); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopRotmgDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateRotmgDescriptor( + context::getInfiniopHandle(d1->device()), &desc, + d1->desc(), d2->desc(), x1->desc(), y1->desc(), param->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetRotmgWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + 
INFINICORE_CHECK_ERROR(infiniopRotmg( + desc, workspace->data(), workspace_size, + d1->data(), d2->data(), x1->data(), y1->data(), param->data(), context::getStream())); +} + +static bool registered = []() { + Rotmg::dispatcher().registerDevice({Device::Type::CPU, + Device::Type::CAMBRICON, + Device::Type::METAX}, + &calculate, + false); + return true; +}(); + +} // namespace infinicore::op::rotmg_impl::infiniop diff --git a/src/infinicore/ops/scal/scal.cc b/src/infinicore/ops/scal/scal.cc new file mode 100644 index 000000000..21258af2a --- /dev/null +++ b/src/infinicore/ops/scal/scal.cc @@ -0,0 +1,22 @@ +#include "infinicore/ops/scal.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Scal); + +Scal::Scal(const Tensor &alpha, Tensor x) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(alpha, x); + INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), alpha, x); +} + +void Scal::execute(const Tensor &alpha, Tensor x) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(Scal, alpha, x); +} + +void scal_(const Tensor &alpha, Tensor x) { + Scal::execute(alpha, x); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/scal/scal_infiniop.cc b/src/infinicore/ops/scal/scal_infiniop.cc new file mode 100644 index 000000000..25521f4ee --- /dev/null +++ b/src/infinicore/ops/scal/scal_infiniop.cc @@ -0,0 +1,50 @@ +#include "infinicore/ops/scal.hpp" + +#include "../infiniop_impl.hpp" + +namespace infinicore::op::scal_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Scal, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, alpha, x; +}; + +void *plan(const Tensor &alpha, Tensor x) { + size_t seed = hash_combine(alpha, x); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, Scal, + seed, + alpha->desc(), x->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, Scal, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + 
graph::GraphTensor(alpha), + graph::GraphTensor(x)}; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopScal( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->alpha->data(), + planned->x->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Scal, &plan, &run, &cleanup); + +} // namespace infinicore::op::scal_impl::infiniop diff --git a/src/infinicore/ops/swap/swap.cc b/src/infinicore/ops/swap/swap.cc new file mode 100644 index 000000000..6ce4b86b9 --- /dev/null +++ b/src/infinicore/ops/swap/swap.cc @@ -0,0 +1,22 @@ +#include "infinicore/ops/swap.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Swap); + +Swap::Swap(Tensor x, Tensor y) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, y); + INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, y); +} + +void Swap::execute(Tensor x, Tensor y) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(Swap, x, y); +} + +void swap_(Tensor x, Tensor y) { + Swap::execute(x, y); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/swap/swap_infiniop.cc b/src/infinicore/ops/swap/swap_infiniop.cc new file mode 100644 index 000000000..32a163f65 --- /dev/null +++ b/src/infinicore/ops/swap/swap_infiniop.cc @@ -0,0 +1,50 @@ +#include "infinicore/ops/swap.hpp" + +#include "../infiniop_impl.hpp" + +namespace infinicore::op::swap_impl::infiniop { + +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Swap, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, x, y; +}; + +void *plan(Tensor x, Tensor y) { + size_t seed = hash_combine(x, y); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, Swap, + seed, + x->desc(), y->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, 
Swap, descriptor); + + return new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(x), + graph::GraphTensor(y)}; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + INFINICORE_CHECK_ERROR(infiniopSwap( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->x->data(), + planned->y->data(), + context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Swap, &plan, &run, &cleanup); + +} // namespace infinicore::op::swap_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 383429f8f..198821594 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -16,13 +16,19 @@ #include "ops/argwhere.hpp" #include "ops/asin.hpp" #include "ops/asinh.hpp" +#include "ops/asum.hpp" #include "ops/atanh.hpp" #include "ops/attention.hpp" #include "ops/avg_pool1d.hpp" +#include "ops/axpy.hpp" #include "ops/baddbmm.hpp" #include "ops/bilinear.hpp" #include "ops/binary_cross_entropy_with_logits.hpp" #include "ops/bitwise_right_shift.hpp" +#include "ops/blas_amax.hpp" +#include "ops/blas_amin.hpp" +#include "ops/blas_copy.hpp" +#include "ops/blas_dot.hpp" #include "ops/block_diag.hpp" #include "ops/broadcast_to.hpp" #include "ops/cat.hpp" @@ -72,6 +78,7 @@ #include "ops/mha_varlen.hpp" #include "ops/mul.hpp" #include "ops/multi_margin_loss.hpp" +#include "ops/nrm2.hpp" #include "ops/pad.hpp" #include "ops/paged_attention.hpp" #include "ops/paged_attention_prefill.hpp" @@ -83,6 +90,11 @@ #include "ops/relu6.hpp" #include "ops/rms_norm.hpp" #include "ops/rope.hpp" +#include "ops/rot.hpp" +#include "ops/rotg.hpp" +#include "ops/rotm.hpp" +#include "ops/rotmg.hpp" +#include "ops/scal.hpp" #include "ops/scatter.hpp" #include "ops/selu.hpp" #include "ops/silu.hpp" @@ -92,6 +104,7 
@@ #include "ops/softplus.hpp" #include "ops/softsign.hpp" #include "ops/sum.hpp" +#include "ops/swap.hpp" #include "ops/swiglu.hpp" #include "ops/take.hpp" #include "ops/tan.hpp" @@ -127,8 +140,14 @@ inline void bind(py::module &m) { bind_adaptive_avg_pool1d(m); bind_attention(m); bind_asinh(m); + bind_asum(m); + bind_axpy(m); bind_baddbmm(m); bind_bilinear(m); + bind_blas_amax(m); + bind_blas_amin(m); + bind_blas_copy(m); + bind_blas_dot(m); bind_block_diag(m); bind_bitwise_right_shift(m); bind_causal_softmax(m); @@ -153,6 +172,7 @@ inline void bind(py::module &m) { bind_matmul(m); bind_kron(m); bind_mul(m); + bind_nrm2(m); bind_mha_kvcache(m); bind_mha_varlen(m); bind_hardswish(m); @@ -185,11 +205,16 @@ inline void bind(py::module &m) { bind_vander(m); bind_unfold(m); bind_rope(m); + bind_rot(m); + bind_rotg(m); + bind_rotm(m); + bind_rotmg(m); bind_floor_divide(m); bind_float_power(m); bind_flipud(m); bind_multi_margin_loss(m); bind_scatter(m); + bind_scal(m); bind_broadcast_to(m); bind_softplus(m); bind_softsign(m); @@ -217,6 +242,7 @@ inline void bind(py::module &m) { bind_lerp(m); bind_triplet_margin_loss(m); bind_selu(m); + bind_swap(m); bind_sinh(m); bind_layer_norm(m); bind_topksoftmax(m); diff --git a/src/infinicore/pybind11/ops/asum.hpp b/src/infinicore/pybind11/ops/asum.hpp new file mode 100644 index 000000000..b094d12f5 --- /dev/null +++ b/src/infinicore/pybind11/ops/asum.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/asum.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_asum(py::module &m) { + m.def("asum", + &op::asum, + py::arg("x"), + R"doc(BLAS level-1 asum.)doc"); + + m.def("asum_", + &op::asum_, + py::arg("x"), + py::arg("result"), + R"doc(In-place BLAS level-1 asum.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/axpy.hpp b/src/infinicore/pybind11/ops/axpy.hpp new file mode 100644 index 000000000..fd0aff633 --- /dev/null +++ 
b/src/infinicore/pybind11/ops/axpy.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include + +#include "infinicore/ops/axpy.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_axpy(py::module &m) { + m.def("axpy_", + &op::axpy_, + py::arg("alpha"), + py::arg("x"), + py::arg("y"), + R"doc(In-place BLAS level-1 axpy, updating y.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/blas_amax.hpp b/src/infinicore/pybind11/ops/blas_amax.hpp new file mode 100644 index 000000000..51e8cfe4d --- /dev/null +++ b/src/infinicore/pybind11/ops/blas_amax.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/blas_amax.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_blas_amax(py::module &m) { + m.def("blas_amax", + &op::blas_amax, + py::arg("x"), + R"doc(BLAS level-1 amax.)doc"); + + m.def("blas_amax_", + &op::blas_amax_, + py::arg("x"), + py::arg("result"), + R"doc(In-place BLAS level-1 amax.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/blas_amin.hpp b/src/infinicore/pybind11/ops/blas_amin.hpp new file mode 100644 index 000000000..8961a9363 --- /dev/null +++ b/src/infinicore/pybind11/ops/blas_amin.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/blas_amin.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_blas_amin(py::module &m) { + m.def("blas_amin", + &op::blas_amin, + py::arg("x"), + R"doc(BLAS level-1 amin.)doc"); + + m.def("blas_amin_", + &op::blas_amin_, + py::arg("x"), + py::arg("result"), + R"doc(In-place BLAS level-1 amin.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/blas_copy.hpp b/src/infinicore/pybind11/ops/blas_copy.hpp new file mode 100644 index 000000000..c348ac38b --- /dev/null +++ b/src/infinicore/pybind11/ops/blas_copy.hpp @@ -0,0 +1,19 @@ +#pragma once + +#include + +#include "infinicore/ops/blas_copy.hpp" + 
+namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_blas_copy(py::module &m) { + m.def("blas_copy_", + &op::blas_copy_, + py::arg("x"), + py::arg("y"), + R"doc(In-place BLAS level-1 copy from x to y.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/blas_dot.hpp b/src/infinicore/pybind11/ops/blas_dot.hpp new file mode 100644 index 000000000..73b4f0bc9 --- /dev/null +++ b/src/infinicore/pybind11/ops/blas_dot.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/blas_dot.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_blas_dot(py::module &m) { + m.def("blas_dot", + &op::blas_dot, + py::arg("x"), + py::arg("y"), + R"doc(BLAS level-1 dot.)doc"); + + m.def("blas_dot_", + &op::blas_dot_, + py::arg("x"), + py::arg("y"), + py::arg("result"), + R"doc(In-place BLAS level-1 dot.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/nrm2.hpp b/src/infinicore/pybind11/ops/nrm2.hpp new file mode 100644 index 000000000..02b21f53b --- /dev/null +++ b/src/infinicore/pybind11/ops/nrm2.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/nrm2.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_nrm2(py::module &m) { + m.def("nrm2", + &op::nrm2, + py::arg("x"), + R"doc(BLAS level-1 nrm2.)doc"); + + m.def("nrm2_", + &op::nrm2_, + py::arg("x"), + py::arg("result"), + R"doc(In-place BLAS level-1 nrm2.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/rot.hpp b/src/infinicore/pybind11/ops/rot.hpp new file mode 100644 index 000000000..359cb9745 --- /dev/null +++ b/src/infinicore/pybind11/ops/rot.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "infinicore/ops/rot.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_rot(py::module &m) { + m.def("rot_", + &op::rot_, + py::arg("x"), + py::arg("y"), + py::arg("c"), + 
py::arg("s"), + R"doc(In-place BLAS level-1 rot, updating x and y.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/rotg.hpp b/src/infinicore/pybind11/ops/rotg.hpp new file mode 100644 index 000000000..a37e79336 --- /dev/null +++ b/src/infinicore/pybind11/ops/rotg.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "infinicore/ops/rotg.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_rotg(py::module &m) { + m.def("rotg_", + py::overload_cast(&op::rotg_), + py::arg("x"), + py::arg("y"), + py::arg("c"), + py::arg("s"), + R"doc(In-place BLAS level-1 rotg, updating x, y, c, and s.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/rotm.hpp b/src/infinicore/pybind11/ops/rotm.hpp new file mode 100644 index 000000000..a88db38ba --- /dev/null +++ b/src/infinicore/pybind11/ops/rotm.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include + +#include "infinicore/ops/rotm.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_rotm(py::module &m) { + m.def("rotm_", + py::overload_cast(&op::rotm_), + py::arg("x"), + py::arg("y"), + py::arg("param"), + R"doc(In-place BLAS level-1 rotm, updating x and y.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/rotmg.hpp b/src/infinicore/pybind11/ops/rotmg.hpp new file mode 100644 index 000000000..72b816751 --- /dev/null +++ b/src/infinicore/pybind11/ops/rotmg.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include "infinicore/ops/rotmg.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_rotmg(py::module &m) { + m.def("rotmg_", + py::overload_cast(&op::rotmg_), + py::arg("d1"), + py::arg("d2"), + py::arg("x1"), + py::arg("y1"), + py::arg("param"), + R"doc(In-place BLAS level-1 rotmg, updating d1, d2, x1, and param.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/scal.hpp 
b/src/infinicore/pybind11/ops/scal.hpp new file mode 100644 index 000000000..75a2c5ca8 --- /dev/null +++ b/src/infinicore/pybind11/ops/scal.hpp @@ -0,0 +1,19 @@ +#pragma once + +#include + +#include "infinicore/ops/scal.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_scal(py::module &m) { + m.def("scal_", + &op::scal_, + py::arg("alpha"), + py::arg("x"), + R"doc(In-place BLAS level-1 scal, updating x.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/swap.hpp b/src/infinicore/pybind11/ops/swap.hpp new file mode 100644 index 000000000..0b8f2ae3e --- /dev/null +++ b/src/infinicore/pybind11/ops/swap.hpp @@ -0,0 +1,19 @@ +#pragma once + +#include + +#include "infinicore/ops/swap.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_swap(py::module &m) { + m.def("swap_", + &op::swap_, + py::arg("x"), + py::arg("y"), + R"doc(In-place BLAS level-1 swap.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/devices/metax/metax_ht2mc.h b/src/infiniop/devices/metax/metax_ht2mc.h index d391f61d2..888f1c72f 100644 --- a/src/infiniop/devices/metax/metax_ht2mc.h +++ b/src/infiniop/devices/metax/metax_ht2mc.h @@ -1,6 +1,7 @@ #ifdef ENABLE_METAX_MC_API #define hpccDataType macaDataType #define HPCC_R_32F MACA_R_32F +#define HPCC_R_64F MACA_R_64F #define HPCC_R_16F MACA_R_16F #define HPCC_R_16BF MACA_R_16BF #define hpcc_bfloat162 maca_bfloat162 @@ -118,6 +119,28 @@ #define hcblasGemmEx mcblasGemmEx #define hcblasCreate mcblasCreate #define hcblasComputeType_t mcblasComputeType_t +#define hcblasSetPointerMode mcblasSetPointerMode +#define hcblasIsamax mcblasIsamax +#define hcblasIdamax mcblasIdamax +#define hcblasIsamin mcblasIsamin +#define hcblasIdamin mcblasIdamin +#define hcblasSasum mcblasSasum +#define hcblasDasum mcblasDasum +#define hcblasAxpyEx mcblasAxpyEx +#define hcblasScopy mcblasScopy +#define hcblasDcopy mcblasDcopy +#define hcblasDotEx mcblasDotEx 
+#define hcblasNrm2Ex mcblasNrm2Ex +#define hcblasRotEx mcblasRotEx +#define hcblasSrotg mcblasSrotg +#define hcblasDrotg mcblasDrotg +#define hcblasSrotm mcblasSrotm +#define hcblasDrotm mcblasDrotm +#define hcblasSrotmg mcblasSrotmg +#define hcblasDrotmg mcblasDrotmg +#define hcblasScalEx mcblasScalEx +#define hcblasSswap mcblasSswap +#define hcblasDswap mcblasDswap #define HCBLAS_STATUS_SUCCESS MCBLAS_STATUS_SUCCESS #define HCBLAS_OP_T MCBLAS_OP_T #define HCBLAS_OP_N MCBLAS_OP_N @@ -125,6 +148,8 @@ #define HCBLAS_GEMM_DEFAULT MCBLAS_GEMM_DEFAULT #define HCBLAS_COMPUTE_32F_FAST_TF32 MCBLAS_COMPUTE_32F_FAST_TF32 #define HCBLAS_COMPUTE_32F MCBLAS_COMPUTE_32F +#define HCBLAS_POINTER_MODE_DEVICE MCBLAS_POINTER_MODE_DEVICE +#define HCBLAS_POINTER_MODE_HOST MCBLAS_POINTER_MODE_HOST #define __hpcc_fp8_e4m3 __maca_fp8_e4m3 #define __hpcc_bfloat16 __maca_bfloat16 #endif diff --git a/src/infiniop/ops/asum/asum.h b/src/infiniop/ops/asum/asum.h new file mode 100644 index 000000000..dd7aef22e --- /dev/null +++ b/src/infiniop/ops/asum/asum.h @@ -0,0 +1,47 @@ +#ifndef __ASUM_H__ +#define __ASUM_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::asum::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AsumInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + AsumInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(std::move(info)), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t result_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t 
workspace_size, \ + const void *x, \ + void *result, \ + void *stream) const; \ + }; \ + } + +#endif // __ASUM_H__ diff --git a/src/infiniop/ops/asum/bang/asum_bang.h b/src/infiniop/ops/asum/bang/asum_bang.h new file mode 100644 index 000000000..bf388c744 --- /dev/null +++ b/src/infiniop/ops/asum/bang/asum_bang.h @@ -0,0 +1,8 @@ +#ifndef __ASUM_BANG_H__ +#define __ASUM_BANG_H__ + +#include "../asum.h" + +DESCRIPTOR(bang) + +#endif // __ASUM_BANG_H__ diff --git a/src/infiniop/ops/asum/bang/asum_bang.mlu b/src/infiniop/ops/asum/bang/asum_bang.mlu new file mode 100644 index 000000000..1c8cba0c3 --- /dev/null +++ b/src/infiniop/ops/asum/bang/asum_bang.mlu @@ -0,0 +1,95 @@ +#include "../../../devices/bang/common_bang.h" +#include "asum_bang.h" +#include "asum_bang_kernel.mlu" + +namespace op::asum::bang { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = AsumInfo::createAsumInfo(x_desc, result_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateAsum( + const AsumInfo &info, + const Tdata *x, + Tdata *result, + cnrtQueue_t queue) { + + const int n = utils::cast(info.n); + const int incx = utils::cast(info.incx); + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = cnrtFuncTypeUnion1; + + if (incx == 1) { + asumKernelContiguous<<>>( + n, + x, + result); + } else { + asumKernelStrided<<>>( + n, + x, + incx, + result); + } + + cnrtQueueSync(queue); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_ASUM(TDATA) \ + calculateAsum(_info, \ + (const TDATA *)x, \ + (TDATA *)result, \ + (cnrtQueue_t)stream) + +infiniStatus_t 
Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_ASUM(half); + case INFINI_DTYPE_F32: + return CALCULATE_ASUM(float); + case INFINI_DTYPE_BF16: + return CALCULATE_ASUM(bfloat16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_ASUM + +} // namespace op::asum::bang diff --git a/src/infiniop/ops/asum/bang/asum_bang_kernel.mlu b/src/infiniop/ops/asum/bang/asum_bang_kernel.mlu new file mode 100644 index 000000000..633ad767d --- /dev/null +++ b/src/infiniop/ops/asum/bang/asum_bang_kernel.mlu @@ -0,0 +1,155 @@ +#include "../../../devices/bang/common_bang.h" +#include "asum_bang.h" + +#include + +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_device__ void asumToCompute(float *dst, const Tdata *src, int size) { + if constexpr (std::is_same_v) { + __bang_half2float(dst, src, size); + } else if constexpr (std::is_same_v) { + __bang_bfloat162float(dst, src, size); + } else { + __memcpy(dst, src, size * sizeof(float), NRAM2NRAM); + } +} + +template +__mlu_device__ float asumToCompute(Tdata value) { + if constexpr (std::is_same_v) { + return __half2float(value); + } else if constexpr (std::is_same_v) { + return __bfloat162float(value); + } else { + return static_cast(value); + } +} + +template +__mlu_device__ void asumStoreResult(Tdata *result, Tdata *nram_result, float *nram_compute, float value) { + nram_compute[0] = value; + if constexpr (std::is_same_v) { + __bang_float2half(nram_result, nram_compute, 1); + result[0] = nram_result[0]; + } else if constexpr (std::is_same_v) { + __bang_float2bfloat16(nram_result, nram_compute, 1); + result[0] = nram_result[0]; + } else { + result[0] = nram_compute[0]; + } +} + +template +__mlu_global__ void asumKernelContiguous( + int n, + const Tdata *x, + Tdata *result) { + + __mlu_shared__ float 
shared_partial_sum[4]; + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + size_t nram_usable = NRAM_MAX_SIZE - (nram_aligned - nram_buffer); + size_t max_chunk_elements = nram_usable / (sizeof(Tdata) + sizeof(float)); + + size_t align_elements = ALIGN_SIZE / sizeof(Tdata); + if (align_elements == 0) { + align_elements = 1; + } + int chunk_size = (int)((max_chunk_elements / align_elements) * align_elements); + + Tdata *nram_x = (Tdata *)nram_aligned; + float *nram_compute = (float *)(nram_x + chunk_size); + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int core_elements = elements_per_core + (taskId < remain ? 1 : 0); + int core_offset = taskId < remain ? taskId * core_elements : taskId * elements_per_core + remain; + + int chunks = core_elements / chunk_size; + int chunk_rem = core_elements % chunk_size; + + float partial_sum = 0.0f; + + for (int c = 0; c < chunks; c++) { + int current_offset = core_offset + c * chunk_size; + + __memcpy(nram_x, x + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + + asumToCompute(nram_compute, nram_x, chunk_size); + __bang_abs(nram_compute, nram_compute, chunk_size); + + partial_sum += __bang_sum(nram_compute, chunk_size); + } + + if (chunk_rem > 0) { + int current_offset = core_offset + chunks * chunk_size; + + __memcpy(nram_x, x + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + + asumToCompute(nram_compute, nram_x, chunk_rem); + __bang_abs(nram_compute, nram_compute, chunk_rem); + + partial_sum += __bang_sum(nram_compute, chunk_rem); + } + + shared_partial_sum[coreId] = partial_sum; + + __sync_cluster(); + + if (coreId == 0) { + float cluster_sum = 0.0f; + + for (int i = 0; i < coreDim; i++) { + cluster_sum += shared_partial_sum[i]; + } + + asumStoreResult(result, nram_x, nram_compute, cluster_sum); + } +} + +template +__mlu_global__ void asumKernelStrided( + int n, + const Tdata *x, + int incx, + Tdata *result) { + + __mlu_shared__ float 
shared_partial_sum[4]; + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + float *nram_compute = (float *)nram_aligned; + Tdata *nram_result = (Tdata *)(nram_compute + 1); + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int actual_tasks = elements_per_core + (taskId < remain ? 1 : 0); + int start_idx = taskId < remain ? taskId * actual_tasks : taskId * elements_per_core + remain; + + float partial_sum = 0.0f; + + for (int i = start_idx; i < start_idx + actual_tasks; ++i) { + int offset = i * incx; + float x_val = asumToCompute(x[offset]); + float abs_val = x_val > 0.0f ? x_val : -x_val; + + partial_sum += abs_val; + } + + shared_partial_sum[coreId] = partial_sum; + + __sync_cluster(); + + if (coreId == 0) { + float cluster_sum = 0.0f; + + for (int i = 0; i < coreDim; i++) { + cluster_sum += shared_partial_sum[i]; + } + + asumStoreResult(result, nram_result, nram_compute, cluster_sum); + } +} diff --git a/src/infiniop/ops/asum/cpu/asum_cpu.cc b/src/infiniop/ops/asum/cpu/asum_cpu.cc new file mode 100644 index 000000000..c1178a74c --- /dev/null +++ b/src/infiniop/ops/asum/cpu/asum_cpu.cc @@ -0,0 +1,91 @@ +#include "asum_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::asum::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = AsumInfo::createAsumInfo(x_desc, result_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateAsum( + const AsumInfo &info, + const Tdata *x, + Tdata *result) { + + const size_t n = info.n; + const ptrdiff_t incx = info.incx; + + if constexpr (std::is_same::value || 
std::is_same::value) { + float total_sum = 0.0; + + for (size_t i = 0; i < n; ++i) { + const ptrdiff_t idx = utils::cast(i) * incx; + total_sum += std::abs(utils::cast(x[idx])); + } + + result[0] = utils::cast(total_sum); + } else { + Tdata total_sum = 0.0; + + for (size_t i = 0; i < n; ++i) { + const ptrdiff_t idx = utils::cast(i) * incx; + total_sum += std::abs(x[idx]); + } + + result[0] = total_sum; + } + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_ASUM(TDATA) \ + calculateAsum(_info, \ + (const TDATA *)x, \ + (TDATA *)result) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_ASUM(fp16_t); + case INFINI_DTYPE_BF16: + return CALCULATE_ASUM(bf16_t); + case INFINI_DTYPE_F32: + return CALCULATE_ASUM(float); + case INFINI_DTYPE_F64: + return CALCULATE_ASUM(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_ASUM + +} // namespace op::asum::cpu diff --git a/src/infiniop/ops/asum/cpu/asum_cpu.h b/src/infiniop/ops/asum/cpu/asum_cpu.h new file mode 100644 index 000000000..84b7572d8 --- /dev/null +++ b/src/infiniop/ops/asum/cpu/asum_cpu.h @@ -0,0 +1,8 @@ +#ifndef __ASUM_CPU_H__ +#define __ASUM_CPU_H__ + +#include "../asum.h" + +DESCRIPTOR(cpu) + +#endif // __ASUM_CPU_H__ diff --git a/src/infiniop/ops/asum/info.h b/src/infiniop/ops/asum/info.h new file mode 100644 index 000000000..3efcc0e39 --- /dev/null +++ b/src/infiniop/ops/asum/info.h @@ -0,0 +1,41 @@ +#ifndef __ASUM_INFO_H__ +#define __ASUM_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class AsumInfo { +private: + AsumInfo() = default; + +public: + size_t n; + ptrdiff_t incx; + infiniDtype_t data_type; + + static utils::Result createAsumInfo( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + 
CHECK_OR_RETURN(x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(result_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = x_desc->dtype(); + + CHECK_OR_RETURN(result_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(x_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(result_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + + auto n = x_desc->numel(); + auto incx = x_desc->stride(0); + + return utils::Result(AsumInfo{ + n, + incx, + data_type}); + } +}; + +#endif // __ASUM_INFO_H__ diff --git a/src/infiniop/ops/asum/metax/asum_metax.cc b/src/infiniop/ops/asum/metax/asum_metax.cc new file mode 100644 index 000000000..4a084fbf9 --- /dev/null +++ b/src/infiniop/ops/asum/metax/asum_metax.cc @@ -0,0 +1,71 @@ +#include "asum_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +namespace op::asum::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = AsumInfo::createAsumInfo(x_desc, result_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const int n = utils::cast(_info.n); + const int incx = utils::cast(_info.incx); + const infiniDtype_t data_type = _info.data_type; + + 
CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode(handle, HCBLAS_POINTER_MODE_DEVICE)); + + switch (data_type) { + case INFINI_DTYPE_F32: + CHECK_MCBLAS(hcblasSasum(handle, n, (const float *)x, incx, (float *)result)); + break; + case INFINI_DTYPE_F64: + CHECK_MCBLAS(hcblasDasum(handle, n, (const double *)x, incx, (double *)result)); + break; + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::asum::metax diff --git a/src/infiniop/ops/asum/metax/asum_metax.h b/src/infiniop/ops/asum/metax/asum_metax.h new file mode 100644 index 000000000..f5ade8e58 --- /dev/null +++ b/src/infiniop/ops/asum/metax/asum_metax.h @@ -0,0 +1,8 @@ +#ifndef __ASUM_METAX_H__ +#define __ASUM_METAX_H__ + +#include "../asum.h" + +DESCRIPTOR(metax) + +#endif // __ASUM_METAX_H__ diff --git a/src/infiniop/ops/asum/operator.cc b/src/infiniop/ops/asum/operator.cc new file mode 100644 index 000000000..51769f0d3 --- /dev/null +++ b/src/infiniop/ops/asum/operator.cc @@ -0,0 +1,124 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asum.h" + +#ifdef ENABLE_CPU_API +#include "cpu/asum_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/asum_metax.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/asum_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateAsumDescriptor( + infiniopHandle_t handle, + infiniopAsumDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asum::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_desc, \ + result_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef 
ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetAsumWorkspaceSize(infiniopAsumDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C infiniStatus_t infiniopAsum( + infiniopAsumDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, x, result, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t +infiniopDestroyAsumDescriptor(infiniopAsumDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git 
a/src/infiniop/ops/axpy/axpy.h b/src/infiniop/ops/axpy/axpy.h new file mode 100644 index 000000000..617e9d01b --- /dev/null +++ b/src/infiniop/ops/axpy/axpy.h @@ -0,0 +1,49 @@ +#ifndef __AXPY_H__ +#define __AXPY_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::axpy::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AxpyInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + AxpyInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(std::move(info)), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t alpha_desc, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t y_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + const void *alpha, \ + const void *x, \ + void *y, \ + void *stream) const; \ + }; \ + } + +#endif // __AXPY_H__ diff --git a/src/infiniop/ops/axpy/bang/axpy_bang.h b/src/infiniop/ops/axpy/bang/axpy_bang.h new file mode 100644 index 000000000..dbce7e5ca --- /dev/null +++ b/src/infiniop/ops/axpy/bang/axpy_bang.h @@ -0,0 +1,8 @@ +#ifndef __AXPY_BANG_H__ +#define __AXPY_BANG_H__ + +#include "../axpy.h" + +DESCRIPTOR(bang) + +#endif // __AXPY_BANG_H__ diff --git a/src/infiniop/ops/axpy/bang/axpy_bang.mlu b/src/infiniop/ops/axpy/bang/axpy_bang.mlu new file mode 100644 index 000000000..a7cd75bf7 --- /dev/null +++ b/src/infiniop/ops/axpy/bang/axpy_bang.mlu @@ -0,0 +1,104 @@ +#include "../../../devices/bang/common_bang.h" +#include "axpy_bang.h" +#include "axpy_bang_kernel.mlu" + +namespace op::axpy::bang { + 
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t alpha_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = AxpyInfo::createAxpyInfo(alpha_desc, x_desc, y_desc); + CHECK_RESULT(result); + + // Create descriptor + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateAxpy( + const AxpyInfo &info, + const Tdata *alpha, + const Tdata *x, + Tdata *y, + cnrtQueue_t queue) { + + const int n = utils::cast(info.n); + const int incx = utils::cast(info.incx); + const int incy = utils::cast(info.incy); + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = cnrtFuncTypeUnion1; + + if (incx == 1 && incy == 1) { + axpyKernelContiguous<<>>( + n, + alpha, + x, + y); + } else { + axpyKernelStrided<<>>( + n, + alpha, + x, + incx, + y, + incy); + } + + cnrtQueueSync(queue); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_AXPY(TDATA) \ + calculateAxpy(_info, \ + (const TDATA *)alpha, \ + (const TDATA *)x, \ + (TDATA *)y, \ + (cnrtQueue_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *alpha, + const void *x, + void *y, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_AXPY(half); + case INFINI_DTYPE_F32: + return CALCULATE_AXPY(float); + case INFINI_DTYPE_BF16: + return CALCULATE_AXPY(bfloat16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_AXPY + +} // namespace op::axpy::bang diff --git a/src/infiniop/ops/axpy/bang/axpy_bang_kernel.mlu b/src/infiniop/ops/axpy/bang/axpy_bang_kernel.mlu new file mode 100644 index 
000000000..df0409987 --- /dev/null +++ b/src/infiniop/ops/axpy/bang/axpy_bang_kernel.mlu @@ -0,0 +1,85 @@ +#include "../../../devices/bang/common_bang.h" +#include "axpy_bang.h" + +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_global__ void axpyKernelContiguous( + int n, + const Tdata *alpha, + const Tdata *x, + Tdata *y) { + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + size_t nram_usable = NRAM_MAX_SIZE - (nram_aligned - nram_buffer); + size_t max_chunk_elements = nram_usable / (2 * sizeof(Tdata)); + + size_t align_elements = ALIGN_SIZE / sizeof(Tdata); + if (align_elements == 0) { + align_elements = 1; + } + int chunk_size = (int)((max_chunk_elements / align_elements) * align_elements); + + Tdata *nram_x = (Tdata *)nram_aligned; + Tdata *nram_y = nram_x + chunk_size; + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int core_elements = elements_per_core + (taskId < remain ? 1 : 0); + int core_offset = taskId < remain ? 
taskId * core_elements : taskId * elements_per_core + remain; + + if (core_elements <= 0) { + return; + } + + int chunks = core_elements / chunk_size; + int chunk_rem = core_elements % chunk_size; + + for (int c = 0; c < chunks; c++) { + int current_offset = core_offset + c * chunk_size; + + __memcpy(nram_x, x + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + __memcpy(nram_y, y + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + + __bang_mul_scalar(nram_x, nram_x, alpha[0], chunk_size); + __bang_add(nram_y, nram_y, nram_x, chunk_size); + + __memcpy(y + current_offset, nram_y, chunk_size * sizeof(Tdata), NRAM2GDRAM); + } + + if (chunk_rem > 0) { + int current_offset = core_offset + chunks * chunk_size; + int align_rem = ((chunk_rem + align_elements - 1) / align_elements) * align_elements; + + __memcpy(nram_x, x + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + __memcpy(nram_y, y + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + + __bang_mul_scalar(nram_x, nram_x, alpha[0], align_rem); + __bang_add(nram_y, nram_y, nram_x, align_rem); + + __memcpy(y + current_offset, nram_y, chunk_rem * sizeof(Tdata), NRAM2GDRAM); + } +} + +template +__mlu_global__ void axpyKernelStrided( + int n, + const Tdata *alpha, + const Tdata *x, + int incx, + Tdata *y, + int incy) { + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int actual_tasks = elements_per_core + (taskId < remain ? 1 : 0); + int start_idx = taskId < remain ? 
taskId * actual_tasks : taskId * elements_per_core + remain; + + for (int i = start_idx; i < start_idx + actual_tasks; ++i) { + int idx_x = i * incx; + int idx_y = i * incy; + + y[idx_y] += alpha[0] * x[idx_x]; + } +} diff --git a/src/infiniop/ops/axpy/cpu/axpy_cpu.cc b/src/infiniop/ops/axpy/cpu/axpy_cpu.cc new file mode 100644 index 000000000..d2f97f8df --- /dev/null +++ b/src/infiniop/ops/axpy/cpu/axpy_cpu.cc @@ -0,0 +1,95 @@ +#include "axpy_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::axpy::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t alpha_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = AxpyInfo::createAxpyInfo(alpha_desc, x_desc, y_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateAxpy( + const AxpyInfo &info, + const Tdata *alpha, + const Tdata *x, + Tdata *y) { + + const size_t n = info.n; + const ptrdiff_t incx = info.incx; + const ptrdiff_t incy = info.incy; + + if constexpr (std::is_same::value || std::is_same::value) { + const float alpha_f = utils::cast(alpha[0]); + for (size_t i = 0; i < n; ++i) { + const ptrdiff_t x_idx = utils::cast(i) * incx; + const ptrdiff_t y_idx = utils::cast(i) * incy; + const float x_f = utils::cast(x[x_idx]); + const float y_f = utils::cast(y[y_idx]); + y[y_idx] = utils::cast(alpha_f * x_f + y_f); + } + } else { + const Tdata alpha_v = alpha[0]; + for (size_t i = 0; i < n; ++i) { + const ptrdiff_t x_idx = utils::cast(i) * incx; + const ptrdiff_t y_idx = utils::cast(i) * incy; + y[y_idx] = alpha_v * x[x_idx] + y[y_idx]; + } + } + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_AXPY(TDATA) \ + calculateAxpy(_info, \ + (const 
TDATA *)alpha, \ + (const TDATA *)x, \ + (TDATA *)y) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *alpha, + const void *x, + void *y, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_AXPY(fp16_t); + case INFINI_DTYPE_F32: + return CALCULATE_AXPY(float); + case INFINI_DTYPE_F64: + return CALCULATE_AXPY(double); + case INFINI_DTYPE_BF16: + return CALCULATE_AXPY(bf16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_AXPY + +} // namespace op::axpy::cpu diff --git a/src/infiniop/ops/axpy/cpu/axpy_cpu.h b/src/infiniop/ops/axpy/cpu/axpy_cpu.h new file mode 100644 index 000000000..f25b49ef3 --- /dev/null +++ b/src/infiniop/ops/axpy/cpu/axpy_cpu.h @@ -0,0 +1,8 @@ +#ifndef __AXPY_CPU_H__ +#define __AXPY_CPU_H__ + +#include "../axpy.h" + +DESCRIPTOR(cpu) + +#endif // __AXPY_CPU_H__ diff --git a/src/infiniop/ops/axpy/info.h b/src/infiniop/ops/axpy/info.h new file mode 100644 index 000000000..9dfaa6c07 --- /dev/null +++ b/src/infiniop/ops/axpy/info.h @@ -0,0 +1,49 @@ +#ifndef __AXPY_INFO_H__ +#define __AXPY_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class AxpyInfo { +private: + AxpyInfo() = default; + +public: + size_t n; + ptrdiff_t incx; + ptrdiff_t incy; + infiniDtype_t data_type; + + static utils::Result createAxpyInfo( + infiniopTensorDescriptor_t alpha_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + + CHECK_OR_RETURN(alpha_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(y_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = x_desc->dtype(); + + CHECK_OR_RETURN(alpha_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(x_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + 
CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(alpha_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(x_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(y_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(x_desc->numel() == y_desc->numel(), INFINI_STATUS_BAD_TENSOR_SHAPE); + + auto n = x_desc->numel(); + auto incx = x_desc->stride(0); + auto incy = y_desc->stride(0); + + return utils::Result(AxpyInfo{ + n, + incx, + incy, + data_type}); + } +}; + +#endif // __AXPY_INFO_H__ diff --git a/src/infiniop/ops/axpy/metax/axpy_metax.cc b/src/infiniop/ops/axpy/metax/axpy_metax.cc new file mode 100644 index 000000000..55f173cb0 --- /dev/null +++ b/src/infiniop/ops/axpy/metax/axpy_metax.cc @@ -0,0 +1,102 @@ +#include "axpy_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +namespace op::axpy::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t alpha_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = AxpyInfo::createAxpyInfo(alpha_desc, x_desc, y_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *alpha, + const void *x, + void *y, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const int n = utils::cast(_info.n); + const int incx = utils::cast(_info.incx); + const int incy = utils::cast(_info.incy); + const infiniDtype_t data_type = 
_info.data_type; + + hpccDataType alpha_type, x_type, y_type; + hpccDataType execution_type; + + switch (data_type) { + case INFINI_DTYPE_F16: + alpha_type = x_type = y_type = HPCC_R_16F; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_BF16: + alpha_type = x_type = y_type = HPCC_R_16BF; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_F32: + alpha_type = x_type = y_type = HPCC_R_32F; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_F64: + alpha_type = x_type = y_type = HPCC_R_64F; + execution_type = HPCC_R_64F; + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode( + handle, + HCBLAS_POINTER_MODE_DEVICE)); + + CHECK_MCBLAS(hcblasAxpyEx( + handle, + n, + alpha, + alpha_type, + x, + x_type, + incx, + y, + y_type, + incy, + execution_type)); + + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::axpy::metax diff --git a/src/infiniop/ops/axpy/metax/axpy_metax.h b/src/infiniop/ops/axpy/metax/axpy_metax.h new file mode 100644 index 000000000..8129ca2e4 --- /dev/null +++ b/src/infiniop/ops/axpy/metax/axpy_metax.h @@ -0,0 +1,8 @@ +#ifndef __AXPY_METAX_H__ +#define __AXPY_METAX_H__ + +#include "../axpy.h" + +DESCRIPTOR(metax) + +#endif // __AXPY_METAX_H__ diff --git a/src/infiniop/ops/axpy/operator.cc b/src/infiniop/ops/axpy/operator.cc new file mode 100644 index 000000000..cd57223e4 --- /dev/null +++ b/src/infiniop/ops/axpy/operator.cc @@ -0,0 +1,127 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/axpy.h" + +#ifdef ENABLE_CPU_API +#include "cpu/axpy_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/axpy_metax.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/axpy_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateAxpyDescriptor( + infiniopHandle_t handle, + infiniopAxpyDescriptor_t *desc_ptr, + 
infiniopTensorDescriptor_t alpha_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::axpy::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + alpha_desc, \ + x_desc, \ + y_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetAxpyWorkspaceSize(infiniopAxpyDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C infiniStatus_t infiniopAxpy( + infiniopAxpyDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *alpha, + const void *x, + void *y, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, alpha, x, y, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t 
+infiniopDestroyAxpyDescriptor(infiniopAxpyDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/blas_amax/bang/blas_amax_bang.h b/src/infiniop/ops/blas_amax/bang/blas_amax_bang.h new file mode 100644 index 000000000..8bc6ca2c5 --- /dev/null +++ b/src/infiniop/ops/blas_amax/bang/blas_amax_bang.h @@ -0,0 +1,8 @@ +#ifndef __BLAS_AMAX_BANG_H__ +#define __BLAS_AMAX_BANG_H__ + +#include "../blas_amax.h" + +DESCRIPTOR(bang) + +#endif // __BLAS_AMAX_BANG_H__ diff --git a/src/infiniop/ops/blas_amax/bang/blas_amax_bang.mlu b/src/infiniop/ops/blas_amax/bang/blas_amax_bang.mlu new file mode 100644 index 000000000..26cc3c501 --- /dev/null +++ b/src/infiniop/ops/blas_amax/bang/blas_amax_bang.mlu @@ -0,0 +1,96 @@ +#include "../../../devices/bang/common_bang.h" +#include "blas_amax_bang.h" +#include "blas_amax_bang_kernel.mlu" + +namespace op::blas_amax::bang { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = BlasAmaxInfo::createBlasAmaxInfo(x_desc, result_desc); + CHECK_RESULT(result); + + // Create descriptor + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateBlasAmax( + const BlasAmaxInfo &info, + const Tdata *x, + int *result, + cnrtQueue_t queue) { + + const int n = utils::cast(info.n); + 
const int incx = utils::cast(info.incx); + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = cnrtFuncTypeUnion1; + + if (incx == 1) { + blasAmaxKernelContiguous<<>>( + n, + x, + result); + } else { + blasAmaxKernelStrided<<>>( + n, + x, + incx, + result); + } + + cnrtQueueSync(queue); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_BLAS_AMAX(TDATA) \ + calculateBlasAmax(_info, \ + (const TDATA *)x, \ + (int *)result, \ + (cnrtQueue_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_BLAS_AMAX(half); + case INFINI_DTYPE_F32: + return CALCULATE_BLAS_AMAX(float); + case INFINI_DTYPE_BF16: + return CALCULATE_BLAS_AMAX(bfloat16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_BLAS_AMAX + +} // namespace op::blas_amax::bang diff --git a/src/infiniop/ops/blas_amax/bang/blas_amax_bang_kernel.mlu b/src/infiniop/ops/blas_amax/bang/blas_amax_bang_kernel.mlu new file mode 100644 index 000000000..2939f2dd7 --- /dev/null +++ b/src/infiniop/ops/blas_amax/bang/blas_amax_bang_kernel.mlu @@ -0,0 +1,136 @@ +#include "../../../devices/bang/common_bang.h" +#include "blas_amax_bang.h" + +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_global__ void blasAmaxKernelContiguous( + int n, + const Tdata *x, + int *result) { + + __mlu_shared__ int shared_max_index[4]; + __mlu_shared__ Tdata shared_max_value[4]; + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + size_t nram_usable = NRAM_MAX_SIZE - (nram_aligned - nram_buffer); + size_t max_chunk_elements = nram_usable / sizeof(Tdata); + + size_t align_elements = ALIGN_SIZE / sizeof(Tdata); + if (align_elements == 0) { + align_elements = 1; + } + int chunk_size = 
(int)((max_chunk_elements / align_elements) * align_elements); + + Tdata *nram_x = (Tdata *)nram_aligned; + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int core_elements = elements_per_core + (taskId < remain ? 1 : 0); + int core_offset = taskId < remain ? taskId * core_elements : taskId * elements_per_core + remain; + + int chunks = core_elements / chunk_size; + int chunk_rem = core_elements % chunk_size; + + int max_index = -1; + Tdata max_value = static_cast(0); + + for (int c = 0; c < chunks; c++) { + int current_offset = core_offset + c * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + + __bang_abs(nram_x, nram_x, chunk_size); + + for (int i = 0; i < chunk_size; i++) { + Tdata abs_val = nram_x[i]; + if (abs_val > max_value) { + max_value = abs_val; + max_index = current_offset + i; + } + } + } + + if (chunk_rem > 0) { + int current_offset = core_offset + chunks * chunk_size; + + __memcpy(nram_x, x + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + + __bang_abs(nram_x, nram_x, chunk_rem); + + for (int i = 0; i < chunk_rem; i++) { + Tdata abs_val = nram_x[i]; + if (abs_val > max_value) { + max_value = abs_val; + max_index = current_offset + i; + } + } + } + + shared_max_index[coreId] = max_index; + shared_max_value[coreId] = max_value; + + __sync_cluster(); + + if (coreId == 0) { + int cluster_max_index = -1; + Tdata cluster_max_value = static_cast(0); + + for (int i = 0; i < coreDim; i++) { + if (shared_max_value[i] > cluster_max_value) { + cluster_max_value = shared_max_value[i]; + cluster_max_index = shared_max_index[i]; + } + } + + result[0] = cluster_max_index + 1; // Convert to 1-based index + } +} + +template +__mlu_global__ void blasAmaxKernelStrided( + int n, + const Tdata *x, + int incx, + int *result) { + + __mlu_shared__ int shared_max_index[4]; + __mlu_shared__ Tdata shared_max_value[4]; + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int 
actual_tasks = elements_per_core + (taskId < remain ? 1 : 0); + int start_idx = taskId < remain ? taskId * actual_tasks : taskId * elements_per_core + remain; + + int max_index = -1; + Tdata max_value = static_cast(0); + + for (int i = start_idx; i < start_idx + actual_tasks; ++i) { + int offset = i * incx; + Tdata abs_val = x[offset] > static_cast(0) ? x[offset] : -x[offset]; + + if (abs_val > max_value) { + max_value = abs_val; + max_index = i; + } + } + + shared_max_index[coreId] = max_index; + shared_max_value[coreId] = max_value; + + __sync_cluster(); + + if (coreId == 0) { + int cluster_max_index = -1; + Tdata cluster_max_value = static_cast(0); + + for (int i = 0; i < coreDim; i++) { + if (shared_max_value[i] > cluster_max_value) { + cluster_max_value = shared_max_value[i]; + cluster_max_index = shared_max_index[i]; + } + } + + result[0] = cluster_max_index + 1; // Convert to 1-based index + } +} diff --git a/src/infiniop/ops/blas_amax/blas_amax.h b/src/infiniop/ops/blas_amax/blas_amax.h new file mode 100644 index 000000000..627b7b754 --- /dev/null +++ b/src/infiniop/ops/blas_amax/blas_amax.h @@ -0,0 +1,47 @@ +#ifndef __BLAS_AMAX_H__ +#define __BLAS_AMAX_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::blas_amax::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + BlasAmaxInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + BlasAmaxInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(std::move(info)), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + 
infiniopTensorDescriptor_t result_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + const void *x, \ + void *result, \ + void *stream) const; \ + }; \ + } + +#endif // __BLAS_AMAX_H__ diff --git a/src/infiniop/ops/blas_amax/cpu/blas_amax_cpu.cc b/src/infiniop/ops/blas_amax/cpu/blas_amax_cpu.cc new file mode 100644 index 000000000..4b3553041 --- /dev/null +++ b/src/infiniop/ops/blas_amax/cpu/blas_amax_cpu.cc @@ -0,0 +1,105 @@ +#include "blas_amax_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::blas_amax::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = BlasAmaxInfo::createBlasAmaxInfo(x_desc, result_desc); + CHECK_RESULT(result); + + // Create descriptor + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateBlasAmax( + const BlasAmaxInfo &info, + const Tdata *x, + int *result) { + + const size_t n = info.n; + const ptrdiff_t incx = info.incx; + + if (n < 1 || incx == 0) { + result[0] = 0; + return INFINI_STATUS_SUCCESS; + } + + size_t max_index = 0; + if constexpr (std::is_same::value || std::is_same::value) { + float max_value = std::abs(utils::cast(x[0])); + + for (size_t i = 1; i < n; ++i) { + const ptrdiff_t idx = utils::cast(i) * incx; + float current_value = std::abs(utils::cast(x[idx])); + if (current_value > max_value) { + max_value = current_value; + max_index = i; + } + } + } else { + Tdata max_value = std::abs(x[0]); + + for (size_t i = 1; i < n; ++i) { + const ptrdiff_t idx = utils::cast(i) * incx; + Tdata current_value = std::abs(x[idx]); + if (current_value > max_value) { + max_value = current_value; + max_index = i; + } + } + } + + result[0] 
= utils::cast(max_index) + 1; + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_BLAS_AMAX(TDATA) \ + calculateBlasAmax(_info, \ + (const TDATA *)x, \ + (int *)result) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_BLAS_AMAX(fp16_t); + case INFINI_DTYPE_F32: + return CALCULATE_BLAS_AMAX(float); + case INFINI_DTYPE_F64: + return CALCULATE_BLAS_AMAX(double); + case INFINI_DTYPE_BF16: + return CALCULATE_BLAS_AMAX(bf16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_BLAS_AMAX + +} // namespace op::blas_amax::cpu diff --git a/src/infiniop/ops/blas_amax/cpu/blas_amax_cpu.h b/src/infiniop/ops/blas_amax/cpu/blas_amax_cpu.h new file mode 100644 index 000000000..66197c9c1 --- /dev/null +++ b/src/infiniop/ops/blas_amax/cpu/blas_amax_cpu.h @@ -0,0 +1,8 @@ +#ifndef __BLAS_AMAX_CPU_H__ +#define __BLAS_AMAX_CPU_H__ + +#include "../blas_amax.h" + +DESCRIPTOR(cpu) + +#endif // __BLAS_AMAX_CPU_H__ diff --git a/src/infiniop/ops/blas_amax/info.h b/src/infiniop/ops/blas_amax/info.h new file mode 100644 index 000000000..b10f84046 --- /dev/null +++ b/src/infiniop/ops/blas_amax/info.h @@ -0,0 +1,41 @@ +#ifndef __BLAS_AMAX_INFO_H__ +#define __BLAS_AMAX_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class BlasAmaxInfo { +private: + BlasAmaxInfo() = default; + +public: + size_t n; + ptrdiff_t incx; + infiniDtype_t data_type; + + static utils::Result createBlasAmaxInfo( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + CHECK_OR_RETURN(x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(result_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = x_desc->dtype(); + auto itype = result_desc->dtype(); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, 
INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_DTYPE(itype, INFINI_DTYPE_I32); + + CHECK_OR_RETURN(x_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(result_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + + auto n = x_desc->numel(); + auto incx = x_desc->stride(0); + + return utils::Result(BlasAmaxInfo{ + n, + incx, + data_type}); + } +}; + +#endif // __BLAS_AMAX_INFO_H__ diff --git a/src/infiniop/ops/blas_amax/metax/blas_amax_metax.cc b/src/infiniop/ops/blas_amax/metax/blas_amax_metax.cc new file mode 100644 index 000000000..05816020d --- /dev/null +++ b/src/infiniop/ops/blas_amax/metax/blas_amax_metax.cc @@ -0,0 +1,71 @@ +#include "blas_amax_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +namespace op::blas_amax::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = BlasAmaxInfo::createBlasAmaxInfo(x_desc, result_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const int n = utils::cast(_info.n); + const int incx = utils::cast(_info.incx); + const infiniDtype_t data_type = _info.data_type; + + CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode(handle, HCBLAS_POINTER_MODE_DEVICE)); + + switch (data_type) { + case INFINI_DTYPE_F32: + 
CHECK_MCBLAS(hcblasIsamax(handle, n, (const float *)x, incx, (int *)result)); + break; + case INFINI_DTYPE_F64: + CHECK_MCBLAS(hcblasIdamax(handle, n, (const double *)x, incx, (int *)result)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::blas_amax::metax diff --git a/src/infiniop/ops/blas_amax/metax/blas_amax_metax.h b/src/infiniop/ops/blas_amax/metax/blas_amax_metax.h new file mode 100644 index 000000000..19e79851f --- /dev/null +++ b/src/infiniop/ops/blas_amax/metax/blas_amax_metax.h @@ -0,0 +1,8 @@ +#ifndef __BLAS_AMAX_METAX_H__ +#define __BLAS_AMAX_METAX_H__ + +#include "../blas_amax.h" + +DESCRIPTOR(metax) + +#endif // __BLAS_AMAX_METAX_H__ diff --git a/src/infiniop/ops/blas_amax/operator.cc b/src/infiniop/ops/blas_amax/operator.cc new file mode 100644 index 000000000..c6b48eeb4 --- /dev/null +++ b/src/infiniop/ops/blas_amax/operator.cc @@ -0,0 +1,122 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/blas_amax.h" + +#ifdef ENABLE_CPU_API +#include "cpu/blas_amax_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/blas_amax_metax.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/blas_amax_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateBlasAmaxDescriptor( + infiniopHandle_t handle, + infiniopBlasAmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::blas_amax::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_desc, \ + result_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + 
+#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetBlasAmaxWorkspaceSize(infiniopBlasAmaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopBlasAmax( + infiniopBlasAmaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, x, result, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyBlasAmaxDescriptor(infiniopBlasAmaxDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/blas_amin/bang/blas_amin_bang.h b/src/infiniop/ops/blas_amin/bang/blas_amin_bang.h new file mode 100644 index 000000000..ba9dbaa21 --- /dev/null +++ 
b/src/infiniop/ops/blas_amin/bang/blas_amin_bang.h @@ -0,0 +1,8 @@ +#ifndef __BLAS_AMIN_BANG_H__ +#define __BLAS_AMIN_BANG_H__ + +#include "../blas_amin.h" + +DESCRIPTOR(bang) + +#endif // __BLAS_AMIN_BANG_H__ diff --git a/src/infiniop/ops/blas_amin/bang/blas_amin_bang.mlu b/src/infiniop/ops/blas_amin/bang/blas_amin_bang.mlu new file mode 100644 index 000000000..dfd2a7e64 --- /dev/null +++ b/src/infiniop/ops/blas_amin/bang/blas_amin_bang.mlu @@ -0,0 +1,96 @@ +#include "../../../devices/bang/common_bang.h" +#include "blas_amin_bang.h" +#include "blas_amin_bang_kernel.mlu" + +namespace op::blas_amin::bang { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = BlasAminInfo::createBlasAminInfo(x_desc, result_desc); + CHECK_RESULT(result); + + // Create descriptor + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateBlasAmin( + const BlasAminInfo &info, + const Tdata *x, + int *result, + cnrtQueue_t queue) { + + const int n = utils::cast(info.n); + const int incx = utils::cast(info.incx); + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = cnrtFuncTypeUnion1; + + if (incx == 1) { + blasAminKernelContiguous<<>>( + n, + x, + result); + } else { + blasAminKernelStrided<<>>( + n, + x, + incx, + result); + } + + cnrtQueueSync(queue); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_BLAS_AMIN(TDATA) \ + calculateBlasAmin(_info, \ + (const TDATA *)x, \ + (int *)result, \ + (cnrtQueue_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) const { + + (void)workspace; + 
(void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_BLAS_AMIN(half); + case INFINI_DTYPE_F32: + return CALCULATE_BLAS_AMIN(float); + case INFINI_DTYPE_BF16: + return CALCULATE_BLAS_AMIN(bfloat16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_BLAS_AMIN + +} // namespace op::blas_amin::bang diff --git a/src/infiniop/ops/blas_amin/bang/blas_amin_bang_kernel.mlu b/src/infiniop/ops/blas_amin/bang/blas_amin_bang_kernel.mlu new file mode 100644 index 000000000..2c5ff7c48 --- /dev/null +++ b/src/infiniop/ops/blas_amin/bang/blas_amin_bang_kernel.mlu @@ -0,0 +1,135 @@ +#include "../../../devices/bang/common_bang.h" +#include "blas_amin_bang.h" + +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_global__ void blasAminKernelContiguous( + int n, + const Tdata *x, + int *result) { + + __mlu_shared__ int shared_min_index[4]; + __mlu_shared__ Tdata shared_min_value[4]; + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + size_t nram_usable = NRAM_MAX_SIZE - (nram_aligned - nram_buffer); + size_t max_chunk_elements = nram_usable / sizeof(Tdata); + + size_t align_elements = ALIGN_SIZE / sizeof(Tdata); + if (align_elements == 0) { + align_elements = 1; + } + int chunk_size = (int)((max_chunk_elements / align_elements) * align_elements); + + Tdata *nram_x = (Tdata *)nram_aligned; + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int core_elements = elements_per_core + (taskId < remain ? 1 : 0); + int core_offset = taskId < remain ? 
taskId * core_elements : taskId * elements_per_core + remain; + + int chunks = core_elements / chunk_size; + int chunk_rem = core_elements % chunk_size; + + int min_index = -1; + Tdata min_value = static_cast(0); + bool initialized = false; + + for (int c = 0; c < chunks; c++) { + int current_offset = core_offset + c * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + + __bang_abs(nram_x, nram_x, chunk_size); + + for (int i = 0; i < chunk_size; i++) { + Tdata abs_val = nram_x[i]; + if (!initialized || abs_val < min_value) { + min_value = abs_val; + min_index = current_offset + i; + initialized = true; + } + } + } + + if (chunk_rem > 0) { + int current_offset = core_offset + chunks * chunk_size; + + __memcpy(nram_x, x + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + + __bang_abs(nram_x, nram_x, chunk_rem); + + for (int i = 0; i < chunk_rem; i++) { + Tdata abs_val = nram_x[i]; + if (!initialized || abs_val < min_value) { + min_value = abs_val; + min_index = current_offset + i; + initialized = true; + } + } + } + + shared_min_index[coreId] = min_index; + shared_min_value[coreId] = min_value; + + __sync_cluster(); + + if (coreId == 0) { + for (int i = 1; i < coreDim; i++) { + if (shared_min_index[i] >= 0 && shared_min_value[i] < min_value) { + min_value = shared_min_value[i]; + min_index = shared_min_index[i]; + } + } + + result[0] = min_index + 1; // Convert to 1-based index + } +} + +template +__mlu_global__ void blasAminKernelStrided( + int n, + const Tdata *x, + int incx, + int *result) { + + __mlu_shared__ int shared_min_index[4]; + __mlu_shared__ Tdata shared_min_value[4]; + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int actual_tasks = elements_per_core + (taskId < remain ? 1 : 0); + int start_idx = taskId < remain ? 
taskId * actual_tasks : taskId * elements_per_core + remain; + + int min_index = -1; + Tdata min_value = static_cast(0); + bool initialized = false; + + for (int i = start_idx; i < start_idx + actual_tasks; ++i) { + int offset = i * incx; + Tdata abs_val = x[offset] > static_cast(0) ? x[offset] : -x[offset]; + + if (!initialized || abs_val < min_value) { + min_value = abs_val; + min_index = i; + initialized = true; + } + } + + shared_min_index[coreId] = min_index; + shared_min_value[coreId] = min_value; + + __sync_cluster(); + + if (coreId == 0) { + for (int i = 1; i < coreDim; i++) { + if (shared_min_index[i] >= 0 && shared_min_value[i] < min_value) { + min_value = shared_min_value[i]; + min_index = shared_min_index[i]; + } + } + + result[0] = min_index + 1; // Convert to 1-based index + } +} diff --git a/src/infiniop/ops/blas_amin/blas_amin.h b/src/infiniop/ops/blas_amin/blas_amin.h new file mode 100644 index 000000000..5128fcf85 --- /dev/null +++ b/src/infiniop/ops/blas_amin/blas_amin.h @@ -0,0 +1,47 @@ +#ifndef __BLAS_AMIN_H__ +#define __BLAS_AMIN_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::blas_amin::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + BlasAminInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + BlasAminInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(std::move(info)), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t result_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + const 
void *x, \ + void *result, \ + void *stream) const; \ + }; \ + } + +#endif // __BLAS_AMIN_H__ diff --git a/src/infiniop/ops/blas_amin/cpu/blas_amin_cpu.cc b/src/infiniop/ops/blas_amin/cpu/blas_amin_cpu.cc new file mode 100644 index 000000000..07cad0461 --- /dev/null +++ b/src/infiniop/ops/blas_amin/cpu/blas_amin_cpu.cc @@ -0,0 +1,105 @@ +#include "blas_amin_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::blas_amin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = BlasAminInfo::createBlasAminInfo(x_desc, result_desc); + CHECK_RESULT(result); + + // Create descriptor + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateBlasAmin( + const BlasAminInfo &info, + const Tdata *x, + int *result) { + + const size_t n = info.n; + const ptrdiff_t incx = info.incx; + + if (n < 1 || incx == 0) { + result[0] = 0; + return INFINI_STATUS_SUCCESS; + } + + size_t min_index = 0; + if constexpr (std::is_same::value || std::is_same::value) { + float min_value = std::abs(utils::cast(x[0])); + + for (size_t i = 1; i < n; ++i) { + const ptrdiff_t idx = utils::cast(i) * incx; + float current_value = std::abs(utils::cast(x[idx])); + if (current_value < min_value) { + min_value = current_value; + min_index = i; + } + } + } else { + Tdata min_value = std::abs(x[0]); + + for (size_t i = 1; i < n; ++i) { + const ptrdiff_t idx = utils::cast(i) * incx; + Tdata current_value = std::abs(x[idx]); + if (current_value < min_value) { + min_value = current_value; + min_index = i; + } + } + } + + result[0] = utils::cast(min_index) + 1; + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_BLAS_AMIN(TDATA) \ + 
calculateBlasAmin(_info, \ + (const TDATA *)x, \ + (int *)result) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_BLAS_AMIN(fp16_t); + case INFINI_DTYPE_F32: + return CALCULATE_BLAS_AMIN(float); + case INFINI_DTYPE_F64: + return CALCULATE_BLAS_AMIN(double); + case INFINI_DTYPE_BF16: + return CALCULATE_BLAS_AMIN(bf16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_BLAS_AMIN + +} // namespace op::blas_amin::cpu diff --git a/src/infiniop/ops/blas_amin/cpu/blas_amin_cpu.h b/src/infiniop/ops/blas_amin/cpu/blas_amin_cpu.h new file mode 100644 index 000000000..c5e4936d7 --- /dev/null +++ b/src/infiniop/ops/blas_amin/cpu/blas_amin_cpu.h @@ -0,0 +1,8 @@ +#ifndef __BLAS_AMIN_CPU_H__ +#define __BLAS_AMIN_CPU_H__ + +#include "../blas_amin.h" + +DESCRIPTOR(cpu) + +#endif // __BLAS_AMIN_CPU_H__ diff --git a/src/infiniop/ops/blas_amin/info.h b/src/infiniop/ops/blas_amin/info.h new file mode 100644 index 000000000..0522edecb --- /dev/null +++ b/src/infiniop/ops/blas_amin/info.h @@ -0,0 +1,41 @@ +#ifndef __BLAS_AMIN_INFO_H__ +#define __BLAS_AMIN_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class BlasAminInfo { +private: + BlasAminInfo() = default; + +public: + size_t n; + ptrdiff_t incx; + infiniDtype_t data_type; + + static utils::Result createBlasAminInfo( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + CHECK_OR_RETURN(x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(result_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = x_desc->dtype(); + auto itype = result_desc->dtype(); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_DTYPE(itype, INFINI_DTYPE_I32); + + 
CHECK_OR_RETURN(x_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(result_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + + auto n = x_desc->numel(); + auto incx = x_desc->stride(0); + + return utils::Result(BlasAminInfo{ + n, + incx, + data_type}); + } +}; + +#endif // __BLAS_AMIN_INFO_H__ diff --git a/src/infiniop/ops/blas_amin/metax/blas_amin_metax.cc b/src/infiniop/ops/blas_amin/metax/blas_amin_metax.cc new file mode 100644 index 000000000..dadd2706d --- /dev/null +++ b/src/infiniop/ops/blas_amin/metax/blas_amin_metax.cc @@ -0,0 +1,71 @@ +#include "blas_amin_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +namespace op::blas_amin::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = BlasAminInfo::createBlasAminInfo(x_desc, result_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const int n = utils::cast(_info.n); + const int incx = utils::cast(_info.incx); + const infiniDtype_t data_type = _info.data_type; + + CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode(handle, HCBLAS_POINTER_MODE_DEVICE)); + + switch (data_type) { + case INFINI_DTYPE_F32: + CHECK_MCBLAS(hcblasIsamin(handle, n, (const float *)x, incx, (int *)result)); + break; + case INFINI_DTYPE_F64: + 
CHECK_MCBLAS(hcblasIdamin(handle, n, (const double *)x, incx, (int *)result)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::blas_amin::metax diff --git a/src/infiniop/ops/blas_amin/metax/blas_amin_metax.h b/src/infiniop/ops/blas_amin/metax/blas_amin_metax.h new file mode 100644 index 000000000..42a5b5fe9 --- /dev/null +++ b/src/infiniop/ops/blas_amin/metax/blas_amin_metax.h @@ -0,0 +1,8 @@ +#ifndef __BLAS_AMIN_METAX_H__ +#define __BLAS_AMIN_METAX_H__ + +#include "../blas_amin.h" + +DESCRIPTOR(metax) + +#endif // __BLAS_AMIN_METAX_H__ diff --git a/src/infiniop/ops/blas_amin/operator.cc b/src/infiniop/ops/blas_amin/operator.cc new file mode 100644 index 000000000..7f960b773 --- /dev/null +++ b/src/infiniop/ops/blas_amin/operator.cc @@ -0,0 +1,122 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/blas_amin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/blas_amin_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/blas_amin_metax.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/blas_amin_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateBlasAminDescriptor( + infiniopHandle_t handle, + infiniopBlasAminDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::blas_amin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_desc, \ + result_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetBlasAminWorkspaceSize(infiniopBlasAminDescriptor_t desc, 
size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopBlasAmin( + infiniopBlasAminDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, x, result, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyBlasAminDescriptor(infiniopBlasAminDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/blas_copy/bang/blas_copy_bang.h b/src/infiniop/ops/blas_copy/bang/blas_copy_bang.h new file mode 100644 index 000000000..fb326fb3d --- /dev/null +++ b/src/infiniop/ops/blas_copy/bang/blas_copy_bang.h @@ -0,0 +1,8 @@ +#ifndef __BLAS_COPY_BANG_H__ +#define __BLAS_COPY_BANG_H__ + 
+#include "../blas_copy.h" + +DESCRIPTOR(bang) + +#endif // __BLAS_COPY_BANG_H__ diff --git a/src/infiniop/ops/blas_copy/bang/blas_copy_bang.mlu b/src/infiniop/ops/blas_copy/bang/blas_copy_bang.mlu new file mode 100644 index 000000000..fe47729c0 --- /dev/null +++ b/src/infiniop/ops/blas_copy/bang/blas_copy_bang.mlu @@ -0,0 +1,96 @@ +#include "../../../devices/bang/common_bang.h" +#include "blas_copy_bang.h" +#include "blas_copy_bang_kernel.mlu" + +namespace op::blas_copy::bang { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = BlasCopyInfo::createBlasCopyInfo(x_desc, y_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateBlasCopy( + const BlasCopyInfo &info, + const Tdata *x, + Tdata *y, + cnrtQueue_t queue) { + + const int n = utils::cast(info.n); + const int incx = utils::cast(info.incx); + const int incy = utils::cast(info.incy); + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = cnrtFuncTypeUnion1; + + if (incx == 1 && incy == 1) { + blasCopyKernelContiguous<<>>( + n, + x, + y); + } else { + blasCopyKernelStrided<<>>( + n, + x, + incx, + y, + incy); + } + + cnrtQueueSync(queue); + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_BLAS_COPY(TDATA) \ + calculateBlasCopy(_info, \ + (const TDATA *)x, \ + (TDATA *)y, \ + (cnrtQueue_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *y, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_BLAS_COPY(half); + case INFINI_DTYPE_F32: + 
return CALCULATE_BLAS_COPY(float); + case INFINI_DTYPE_BF16: + return CALCULATE_BLAS_COPY(bfloat16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_BLAS_COPY + +} // namespace op::blas_copy::bang diff --git a/src/infiniop/ops/blas_copy/bang/blas_copy_bang_kernel.mlu b/src/infiniop/ops/blas_copy/bang/blas_copy_bang_kernel.mlu new file mode 100644 index 000000000..63bc22a66 --- /dev/null +++ b/src/infiniop/ops/blas_copy/bang/blas_copy_bang_kernel.mlu @@ -0,0 +1,66 @@ +#include "../../../devices/bang/common_bang.h" +#include "blas_copy_bang.h" + +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_global__ void blasCopyKernelContiguous( + int n, + const Tdata *x, + Tdata *y) { + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + size_t nram_usable = NRAM_MAX_SIZE - (nram_aligned - nram_buffer); + size_t max_chunk_elements = nram_usable / (2 * sizeof(Tdata)); + + size_t align_elements = ALIGN_SIZE / sizeof(Tdata); + if (align_elements == 0) { + align_elements = 1; + } + int chunk_size = (int)((max_chunk_elements / align_elements) * align_elements); + + Tdata *nram_x = (Tdata *)nram_aligned; + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int core_elements = elements_per_core + (taskId < remain ? 1 : 0); + int core_offset = taskId < remain ? 
taskId * core_elements : taskId * elements_per_core + remain; + + if (core_elements <= 0) { + return; + } + + int chunks = core_elements / chunk_size; + int chunk_rem = core_elements % chunk_size; + + for (int c = 0; c < chunks; c++) { + int current_offset = core_offset + c * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + __memcpy(y + current_offset, nram_x, chunk_size * sizeof(Tdata), NRAM2GDRAM); + } + + if (chunk_rem > 0) { + int current_offset = core_offset + chunks * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + __memcpy(y + current_offset, nram_x, chunk_rem * sizeof(Tdata), NRAM2GDRAM); + } +} + +template +__mlu_global__ void blasCopyKernelStrided( + int n, + const Tdata *x, + int incx, + Tdata *y, + int incy) { + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int actual_tasks = elements_per_core + (taskId < remain ? 1 : 0); + int start_idx = taskId < remain ? taskId * actual_tasks : taskId * elements_per_core + remain; + + for (int i = start_idx; i < start_idx + actual_tasks; ++i) { + y[i * incy] = x[i * incx]; + } +} diff --git a/src/infiniop/ops/blas_copy/blas_copy.h b/src/infiniop/ops/blas_copy/blas_copy.h new file mode 100644 index 000000000..3670ba204 --- /dev/null +++ b/src/infiniop/ops/blas_copy/blas_copy.h @@ -0,0 +1,47 @@ +#ifndef __BLAS_COPY_H__ +#define __BLAS_COPY_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::blas_copy::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + BlasCopyInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + BlasCopyInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(std::move(info)), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ 
+ ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t y_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + const void *x, \ + void *y, \ + void *stream) const; \ + }; \ + } + +#endif // __BLAS_COPY_H__ diff --git a/src/infiniop/ops/blas_copy/cpu/blas_copy_cpu.cc b/src/infiniop/ops/blas_copy/cpu/blas_copy_cpu.cc new file mode 100644 index 000000000..47123d371 --- /dev/null +++ b/src/infiniop/ops/blas_copy/cpu/blas_copy_cpu.cc @@ -0,0 +1,77 @@ +#include "blas_copy_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::blas_copy::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = BlasCopyInfo::createBlasCopyInfo(x_desc, y_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateBlasCopy( + const BlasCopyInfo &info, + const Tdata *x, + Tdata *y) { + + const size_t n = info.n; + + for (size_t i = 0; i < n; ++i) { + ptrdiff_t x_idx = utils::cast(i) * info.incx; + ptrdiff_t y_idx = utils::cast(i) * info.incy; + y[y_idx] = x[x_idx]; + } + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_BLAS_COPY(TDATA) \ + calculateBlasCopy(_info, \ + (const TDATA *)x, \ + (TDATA *)y) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *y, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return 
CALCULATE_BLAS_COPY(fp16_t); + case INFINI_DTYPE_F32: + return CALCULATE_BLAS_COPY(float); + case INFINI_DTYPE_F64: + return CALCULATE_BLAS_COPY(double); + case INFINI_DTYPE_BF16: + return CALCULATE_BLAS_COPY(bf16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_BLAS_COPY + +} // namespace op::blas_copy::cpu diff --git a/src/infiniop/ops/blas_copy/cpu/blas_copy_cpu.h b/src/infiniop/ops/blas_copy/cpu/blas_copy_cpu.h new file mode 100644 index 000000000..7cfe48752 --- /dev/null +++ b/src/infiniop/ops/blas_copy/cpu/blas_copy_cpu.h @@ -0,0 +1,8 @@ +#ifndef __BLAS_COPY_CPU_H__ +#define __BLAS_COPY_CPU_H__ + +#include "../blas_copy.h" + +DESCRIPTOR(cpu) + +#endif // __BLAS_COPY_CPU_H__ diff --git a/src/infiniop/ops/blas_copy/info.h b/src/infiniop/ops/blas_copy/info.h new file mode 100644 index 000000000..585138ab1 --- /dev/null +++ b/src/infiniop/ops/blas_copy/info.h @@ -0,0 +1,45 @@ +#ifndef __BLAS_COPY_INFO_H__ +#define __BLAS_COPY_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class BlasCopyInfo { +private: + BlasCopyInfo() = default; + +public: + size_t n; + ptrdiff_t incx; + ptrdiff_t incy; + infiniDtype_t data_type; + + static utils::Result createBlasCopyInfo( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + + CHECK_OR_RETURN(x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(y_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = x_desc->dtype(); + + CHECK_OR_RETURN(y_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(x_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(y_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(x_desc->numel() == y_desc->numel(), INFINI_STATUS_BAD_TENSOR_SHAPE); + + auto n = x_desc->numel(); + auto incx = x_desc->stride(0); + auto incy = y_desc->stride(0); + + 
return utils::Result(BlasCopyInfo{ + n, + incx, + incy, + data_type}); + } +}; + +#endif // __BLAS_COPY_INFO_H__ diff --git a/src/infiniop/ops/blas_copy/metax/blas_copy_metax.cc b/src/infiniop/ops/blas_copy/metax/blas_copy_metax.cc new file mode 100644 index 000000000..e51ec95e9 --- /dev/null +++ b/src/infiniop/ops/blas_copy/metax/blas_copy_metax.cc @@ -0,0 +1,72 @@ +#include "blas_copy_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +namespace op::blas_copy::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = BlasCopyInfo::createBlasCopyInfo(x_desc, y_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *y, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const int n = utils::cast(_info.n); + const int incx = utils::cast(_info.incx); + const int incy = utils::cast(_info.incy); + const infiniDtype_t data_type = _info.data_type; + + CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode(handle, HCBLAS_POINTER_MODE_DEVICE)); + + switch (data_type) { + case INFINI_DTYPE_F32: + CHECK_MCBLAS(hcblasScopy(handle, n, (const float *)x, incx, (float *)y, incy)); + break; + case INFINI_DTYPE_F64: + CHECK_MCBLAS(hcblasDcopy(handle, n, (const double *)x, incx, (double *)y, incy)); + break; + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + return 
INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::blas_copy::metax diff --git a/src/infiniop/ops/blas_copy/metax/blas_copy_metax.h b/src/infiniop/ops/blas_copy/metax/blas_copy_metax.h new file mode 100644 index 000000000..88f118dbf --- /dev/null +++ b/src/infiniop/ops/blas_copy/metax/blas_copy_metax.h @@ -0,0 +1,8 @@ +#ifndef __BLAS_COPY_METAX_H__ +#define __BLAS_COPY_METAX_H__ + +#include "../blas_copy.h" + +DESCRIPTOR(metax) + +#endif // __BLAS_COPY_METAX_H__ diff --git a/src/infiniop/ops/blas_copy/operator.cc b/src/infiniop/ops/blas_copy/operator.cc new file mode 100644 index 000000000..394adc665 --- /dev/null +++ b/src/infiniop/ops/blas_copy/operator.cc @@ -0,0 +1,121 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/blas_copy.h" + +#ifdef ENABLE_CPU_API +#include "cpu/blas_copy_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/blas_copy_metax.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/blas_copy_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateBlasCopyDescriptor( + infiniopHandle_t handle, + infiniopBlasCopyDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::blas_copy::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_desc, y_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetBlasCopyWorkspaceSize(infiniopBlasCopyDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch 
(desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopBlasCopy( + infiniopBlasCopyDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *x, + void *y, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, x, y, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyBlasCopyDescriptor(infiniopBlasCopyDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/blas_dot/bang/blas_dot_bang.h b/src/infiniop/ops/blas_dot/bang/blas_dot_bang.h new file mode 100644 index 000000000..1c2c18019 --- /dev/null +++ b/src/infiniop/ops/blas_dot/bang/blas_dot_bang.h @@ -0,0 +1,8 @@ +#ifndef __BLAS_DOT_BANG_H__ +#define __BLAS_DOT_BANG_H__ + +#include "../blas_dot.h" + +DESCRIPTOR(bang) + +#endif // __BLAS_DOT_BANG_H__ diff --git a/src/infiniop/ops/blas_dot/bang/blas_dot_bang.mlu 
b/src/infiniop/ops/blas_dot/bang/blas_dot_bang.mlu new file mode 100644 index 000000000..8d7f1b816 --- /dev/null +++ b/src/infiniop/ops/blas_dot/bang/blas_dot_bang.mlu @@ -0,0 +1,103 @@ +#include "../../../devices/bang/common_bang.h" +#include "blas_dot_bang.h" +#include "blas_dot_bang_kernel.mlu" + +namespace op::blas_dot::bang { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = BlasDotInfo::createBlasDotInfo(x_desc, y_desc, result_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateBlasDot( + const BlasDotInfo &info, + const Tdata *x, + const Tdata *y, + Tdata *result, + cnrtQueue_t queue) { + + const int n = utils::cast(info.n); + const int incx = utils::cast(info.incx); + const int incy = utils::cast(info.incy); + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = cnrtFuncTypeUnion1; + + if (incx == 1 && incy == 1) { + blasDotKernelContiguous<<>>( + n, + x, + y, + result); + } else { + blasDotKernelStrided<<>>( + n, + x, + incx, + y, + incy, + result); + } + + cnrtQueueSync(queue); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_BLAS_DOT(TDATA) \ + calculateBlasDot(_info, \ + (const TDATA *)x, \ + (const TDATA *)y, \ + (TDATA *)result, \ + (cnrtQueue_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + const void *y, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_BLAS_DOT(half); + case INFINI_DTYPE_F32: + return 
CALCULATE_BLAS_DOT(float); + case INFINI_DTYPE_BF16: + return CALCULATE_BLAS_DOT(bfloat16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_BLAS_DOT + +} // namespace op::blas_dot::bang diff --git a/src/infiniop/ops/blas_dot/bang/blas_dot_bang_kernel.mlu b/src/infiniop/ops/blas_dot/bang/blas_dot_bang_kernel.mlu new file mode 100644 index 000000000..bf249101e --- /dev/null +++ b/src/infiniop/ops/blas_dot/bang/blas_dot_bang_kernel.mlu @@ -0,0 +1,157 @@ +#include "../../../devices/bang/common_bang.h" +#include "blas_dot_bang.h" + +#include + +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_device__ void blasDotToCompute(float *dst, const Tdata *src, int size) { + if constexpr (std::is_same_v) { + __bang_half2float(dst, src, size); + } else if constexpr (std::is_same_v) { + __bang_bfloat162float(dst, src, size); + } else { + __memcpy(dst, src, size * sizeof(float), NRAM2NRAM); + } +} + +template +__mlu_device__ float blasDotToCompute(Tdata value) { + if constexpr (std::is_same_v) { + return __half2float(value); + } else if constexpr (std::is_same_v) { + return __bfloat162float(value); + } else { + return static_cast(value); + } +} + +template +__mlu_device__ void blasDotStoreResult(Tdata *result, Tdata *nram_result, float *nram_compute, float value) { + nram_compute[0] = value; + if constexpr (std::is_same_v) { + __bang_float2half(nram_result, nram_compute, 1); + result[0] = nram_result[0]; + } else if constexpr (std::is_same_v) { + __bang_float2bfloat16(nram_result, nram_compute, 1); + result[0] = nram_result[0]; + } else { + result[0] = nram_compute[0]; + } +} + +template +__mlu_global__ void blasDotKernelContiguous( + int n, + const Tdata *x, + const Tdata *y, + Tdata *result) { + + __mlu_shared__ float shared_partial_sum[4]; + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + size_t nram_usable = NRAM_MAX_SIZE - (nram_aligned - nram_buffer); + size_t max_chunk_elements = 
nram_usable / (2 * sizeof(Tdata) + 2 * sizeof(float)); + + size_t align_elements = ALIGN_SIZE / sizeof(Tdata); + if (align_elements == 0) { + align_elements = 1; + } + int chunk_size = (int)((max_chunk_elements / align_elements) * align_elements); + + Tdata *nram_x = (Tdata *)nram_aligned; + Tdata *nram_y = nram_x + chunk_size; + float *nram_compute_x = (float *)(nram_y + chunk_size); + float *nram_compute_y = nram_compute_x + chunk_size; + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int core_elements = elements_per_core + (taskId < remain ? 1 : 0); + int core_offset = taskId < remain ? taskId * core_elements : taskId * elements_per_core + remain; + + int chunks = core_elements / chunk_size; + int chunk_rem = core_elements % chunk_size; + + float partial_sum = 0.0f; + + for (int c = 0; c < chunks; c++) { + int current_offset = core_offset + c * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + __memcpy(nram_y, y + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + + blasDotToCompute(nram_compute_x, nram_x, chunk_size); + blasDotToCompute(nram_compute_y, nram_y, chunk_size); + __bang_mul(nram_compute_x, nram_compute_x, nram_compute_y, chunk_size); + partial_sum += __bang_sum(nram_compute_x, chunk_size); + } + + if (chunk_rem > 0) { + int current_offset = core_offset + chunks * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + __memcpy(nram_y, y + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + + blasDotToCompute(nram_compute_x, nram_x, chunk_rem); + blasDotToCompute(nram_compute_y, nram_y, chunk_rem); + __bang_mul(nram_compute_x, nram_compute_x, nram_compute_y, chunk_rem); + partial_sum += __bang_sum(nram_compute_x, chunk_rem); + } + + shared_partial_sum[coreId] = partial_sum; + + __sync_cluster(); + + if (coreId == 0) { + float cluster_sum = 0.0f; + + for (int i = 0; i < coreDim; i++) { + cluster_sum += shared_partial_sum[i]; + } + + 
blasDotStoreResult(result, nram_x, nram_compute_x, cluster_sum); + } +} + +template +__mlu_global__ void blasDotKernelStrided( + int n, + const Tdata *x, + int incx, + const Tdata *y, + int incy, + Tdata *result) { + __mlu_shared__ float shared_partial_sum[4]; + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + float *nram_compute = (float *)nram_aligned; + Tdata *nram_result = (Tdata *)(nram_compute + 1); + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int core_elements = elements_per_core + (taskId < remain ? 1 : 0); + int start_idx = taskId < remain ? taskId * core_elements : taskId * elements_per_core + remain; + + float partial_sum = 0.0f; + int x_offset = start_idx * incx; + int y_offset = start_idx * incy; + + for (int i = 0; i < core_elements; ++i) { + partial_sum += blasDotToCompute(x[x_offset]) * blasDotToCompute(y[y_offset]); + x_offset += incx; + y_offset += incy; + } + + shared_partial_sum[coreId] = partial_sum; + + __sync_cluster(); + + if (coreId == 0) { + float cluster_sum = 0.0f; + for (int i = 0; i < coreDim; ++i) { + cluster_sum += shared_partial_sum[i]; + } + blasDotStoreResult(result, nram_result, nram_compute, cluster_sum); + } +} diff --git a/src/infiniop/ops/blas_dot/blas_dot.h b/src/infiniop/ops/blas_dot/blas_dot.h new file mode 100644 index 000000000..09e81a7fc --- /dev/null +++ b/src/infiniop/ops/blas_dot/blas_dot.h @@ -0,0 +1,49 @@ +#ifndef __BLAS_DOT_H__ +#define __BLAS_DOT_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::blas_dot::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + BlasDotInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + BlasDotInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + 
_info(std::move(info)), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t result_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + const void *x, \ + const void *y, \ + void *result, \ + void *stream) const; \ + }; \ + } + +#endif // __BLAS_DOT_H__ diff --git a/src/infiniop/ops/blas_dot/cpu/blas_dot_cpu.cc b/src/infiniop/ops/blas_dot/cpu/blas_dot_cpu.cc new file mode 100644 index 000000000..e250aa2ee --- /dev/null +++ b/src/infiniop/ops/blas_dot/cpu/blas_dot_cpu.cc @@ -0,0 +1,102 @@ +#include "blas_dot_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::blas_dot::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = BlasDotInfo::createBlasDotInfo(x_desc, y_desc, result_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateBlasDot( + const BlasDotInfo &info, + const Tdata *x, + const Tdata *y, + Tdata *result) { + + const size_t n = info.n; + const ptrdiff_t incx = info.incx; + const ptrdiff_t incy = info.incy; + + ptrdiff_t ix = (incx < 0) ? (1 - utils::cast(n)) * incx : 0; + ptrdiff_t iy = (incy < 0) ? 
(1 - utils::cast(n)) * incy : 0; + + if constexpr (std::is_same::value || std::is_same::value) { + float total = 0.0f; + + for (size_t i = 0; i < n; ++i) { + total += utils::cast(x[ix]) * utils::cast(y[iy]); + ix += incx; + iy += incy; + } + + result[0] = utils::cast(total); + } else { + Tdata total = utils::cast(0); + + for (size_t i = 0; i < n; ++i) { + total += x[ix] * y[iy]; + ix += incx; + iy += incy; + } + + result[0] = total; + } + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_BLAS_DOT(TDATA) \ + calculateBlasDot(_info, \ + (const TDATA *)x, \ + (const TDATA *)y, \ + (TDATA *)result) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + const void *y, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_BLAS_DOT(fp16_t); + case INFINI_DTYPE_F32: + return CALCULATE_BLAS_DOT(float); + case INFINI_DTYPE_F64: + return CALCULATE_BLAS_DOT(double); + case INFINI_DTYPE_BF16: + return CALCULATE_BLAS_DOT(bf16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_BLAS_DOT + +} // namespace op::blas_dot::cpu diff --git a/src/infiniop/ops/blas_dot/cpu/blas_dot_cpu.h b/src/infiniop/ops/blas_dot/cpu/blas_dot_cpu.h new file mode 100644 index 000000000..0f09f8d08 --- /dev/null +++ b/src/infiniop/ops/blas_dot/cpu/blas_dot_cpu.h @@ -0,0 +1,8 @@ +#ifndef __BLAS_DOT_CPU_H__ +#define __BLAS_DOT_CPU_H__ + +#include "../blas_dot.h" + +DESCRIPTOR(cpu) + +#endif // __BLAS_DOT_CPU_H__ diff --git a/src/infiniop/ops/blas_dot/info.h b/src/infiniop/ops/blas_dot/info.h new file mode 100644 index 000000000..01e145f6e --- /dev/null +++ b/src/infiniop/ops/blas_dot/info.h @@ -0,0 +1,49 @@ +#ifndef __BLAS_DOT_INFO_H__ +#define __BLAS_DOT_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class BlasDotInfo { +private: + BlasDotInfo() = default; + +public: + size_t n; + 
ptrdiff_t incx; + ptrdiff_t incy; + infiniDtype_t data_type; + + static utils::Result createBlasDotInfo( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t result_desc) { + + CHECK_OR_RETURN(x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(y_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(result_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = x_desc->dtype(); + + CHECK_OR_RETURN(y_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(result_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(x_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(y_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(x_desc->numel() == y_desc->numel(), INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(result_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + + auto n = x_desc->numel(); + auto incx = x_desc->stride(0); + auto incy = y_desc->stride(0); + + return utils::Result(BlasDotInfo{ + n, + incx, + incy, + data_type}); + } +}; + +#endif // __BLAS_DOT_INFO_H__ diff --git a/src/infiniop/ops/blas_dot/metax/blas_dot_metax.cc b/src/infiniop/ops/blas_dot/metax/blas_dot_metax.cc new file mode 100644 index 000000000..28e7f301a --- /dev/null +++ b/src/infiniop/ops/blas_dot/metax/blas_dot_metax.cc @@ -0,0 +1,102 @@ +#include "blas_dot_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +namespace op::blas_dot::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t result_desc) { + + 
auto handle = reinterpret_cast(handle_); + auto result = BlasDotInfo::createBlasDotInfo(x_desc, y_desc, result_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + const void *y, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const int n = utils::cast(_info.n); + const int incx = utils::cast(_info.incx); + const int incy = utils::cast(_info.incy); + const infiniDtype_t data_type = _info.data_type; + + hpccDataType x_type, y_type, result_type; + hpccDataType execution_type; + + switch (data_type) { + case INFINI_DTYPE_F16: + x_type = y_type = result_type = HPCC_R_16F; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_BF16: + x_type = y_type = result_type = HPCC_R_16BF; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_F32: + x_type = y_type = result_type = HPCC_R_32F; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_F64: + x_type = y_type = result_type = HPCC_R_64F; + execution_type = HPCC_R_64F; + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode( + handle, + HCBLAS_POINTER_MODE_DEVICE)); + + CHECK_MCBLAS(hcblasDotEx( + handle, + n, + x, + x_type, + incx, + y, + y_type, + incy, + result, + result_type, + execution_type)); + + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::blas_dot::metax diff --git a/src/infiniop/ops/blas_dot/metax/blas_dot_metax.h b/src/infiniop/ops/blas_dot/metax/blas_dot_metax.h new file mode 100644 index 000000000..0c5cefbf8 --- /dev/null +++ b/src/infiniop/ops/blas_dot/metax/blas_dot_metax.h @@ -0,0 +1,8 @@ +#ifndef 
__BLAS_DOT_METAX_H__ +#define __BLAS_DOT_METAX_H__ + +#include "../blas_dot.h" + +DESCRIPTOR(metax) + +#endif // __BLAS_DOT_METAX_H__ diff --git a/src/infiniop/ops/blas_dot/operator.cc b/src/infiniop/ops/blas_dot/operator.cc new file mode 100644 index 000000000..e28943b8b --- /dev/null +++ b/src/infiniop/ops/blas_dot/operator.cc @@ -0,0 +1,125 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/blas_dot.h" + +#ifdef ENABLE_CPU_API +#include "cpu/blas_dot_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/blas_dot_metax.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/blas_dot_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateBlasDotDescriptor( + infiniopHandle_t handle, + infiniopBlasDotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t result_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::blas_dot::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_desc, \ + y_desc, \ + result_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetBlasDotWorkspaceSize(infiniopBlasDotDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C 
infiniStatus_t infiniopBlasDot( + infiniopBlasDotDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *x, + const void *y, + void *result, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, x, y, result, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyBlasDotDescriptor(infiniopBlasDotDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/nrm2/bang/nrm2_bang.h b/src/infiniop/ops/nrm2/bang/nrm2_bang.h new file mode 100644 index 000000000..1a9a4761f --- /dev/null +++ b/src/infiniop/ops/nrm2/bang/nrm2_bang.h @@ -0,0 +1,8 @@ +#ifndef __NRM2_BANG_H__ +#define __NRM2_BANG_H__ + +#include "../nrm2.h" + +DESCRIPTOR(bang) + +#endif // __NRM2_BANG_H__ diff --git a/src/infiniop/ops/nrm2/bang/nrm2_bang.mlu b/src/infiniop/ops/nrm2/bang/nrm2_bang.mlu new file mode 100644 index 000000000..0d6e54517 --- /dev/null +++ b/src/infiniop/ops/nrm2/bang/nrm2_bang.mlu @@ -0,0 +1,96 @@ +#include "../../../devices/bang/common_bang.h" +#include "nrm2_bang.h" +#include "nrm2_bang_kernel.mlu" + +namespace op::nrm2::bang { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( 
+ infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = Nrm2Info::createNrm2Info(x_desc, result_desc); + CHECK_RESULT(result); + + // Create descriptor + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateNrm2( + const Nrm2Info &info, + const Tdata *x, + Tdata *result, + cnrtQueue_t queue) { + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + const int n = utils::cast(info.n); + const int incx = utils::cast(info.incx); + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = cnrtFuncTypeUnion1; + + if (incx == 1) { + Nrm2KernelContiguous<<>>( + n, + x, + result); + } else { + Nrm2KernelStrided<<>>( + n, + x, + incx, + result); + } + + cnrtQueueSync(queue); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_NRM2(TDATA) \ + calculateNrm2(_info, \ + (const TDATA *)x, \ + (TDATA *)result, \ + (cnrtQueue_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_NRM2(half); + case INFINI_DTYPE_F32: + return CALCULATE_NRM2(float); + case INFINI_DTYPE_BF16: + return CALCULATE_NRM2(bfloat16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_NRM2 + +} // namespace op::nrm2::bang diff --git a/src/infiniop/ops/nrm2/bang/nrm2_bang_kernel.mlu b/src/infiniop/ops/nrm2/bang/nrm2_bang_kernel.mlu new file mode 100644 index 000000000..3778b24ec --- /dev/null +++ b/src/infiniop/ops/nrm2/bang/nrm2_bang_kernel.mlu @@ -0,0 +1,172 @@ +#include "../../../devices/bang/common_bang.h" +#include "nrm2_bang.h" + +#include + +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template 
+__mlu_device__ void nrm2ToCompute(float *dst, const Tdata *src, int size) { + if constexpr (std::is_same_v) { + __bang_half2float(dst, src, size); + } else if constexpr (std::is_same_v) { + __bang_bfloat162float(dst, src, size); + } else { + __memcpy(dst, src, size * sizeof(float), NRAM2NRAM); + } +} + +template +__mlu_device__ void nrm2StoreResult(Tdata *result, Tdata *nram_result, float *nram_compute, float value) { + nram_compute[0] = value; + if constexpr (std::is_same_v) { + __bang_float2half(nram_result, nram_compute, 1); + result[0] = nram_result[0]; + } else if constexpr (std::is_same_v) { + __bang_float2bfloat16(nram_result, nram_compute, 1); + result[0] = nram_result[0]; + } else { + result[0] = nram_compute[0]; + } +} + +template +__mlu_global__ void Nrm2KernelContiguous( + int n, + const Tdata *x, + Tdata *result) { + + __mlu_shared__ float shared_partial_sum[4]; + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + size_t nram_usable = NRAM_MAX_SIZE - (nram_aligned - nram_buffer); + size_t max_chunk_elements = nram_usable / (sizeof(Tdata) + sizeof(float)); + + size_t align_elements = ALIGN_SIZE / sizeof(Tdata); + if (align_elements == 0) { + align_elements = 1; + } + int chunk_size = (int)((max_chunk_elements / align_elements) * align_elements); + + Tdata *nram_x = (Tdata *)nram_aligned; + float *nram_compute = (float *)(nram_x + chunk_size); + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int core_elements = elements_per_core + (taskId < remain ? 1 : 0); + int core_offset = taskId < remain ? 
taskId * core_elements : taskId * elements_per_core + remain; + + int chunks = core_elements / chunk_size; + int chunk_rem = core_elements % chunk_size; + + float partial_sum = 0.0f; + + for (int c = 0; c < chunks; c++) { + int current_offset = core_offset + c * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + + nrm2ToCompute(nram_compute, nram_x, chunk_size); + __bang_square(nram_compute, nram_compute, chunk_size); + + partial_sum += __bang_sum(nram_compute, chunk_size); + } + + if (chunk_rem > 0) { + int current_offset = core_offset + chunks * chunk_size; + + __memcpy(nram_x, x + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + + nrm2ToCompute(nram_compute, nram_x, chunk_rem); + __bang_square(nram_compute, nram_compute, chunk_rem); + + partial_sum += __bang_sum(nram_compute, chunk_rem); + } + + shared_partial_sum[coreId] = partial_sum; + + __sync_cluster(); + + if (coreId == 0) { + float cluster_sum = 0.0f; + + for (int i = 0; i < coreDim; i++) { + cluster_sum += shared_partial_sum[i]; + } + + nrm2StoreResult(result, nram_x, nram_compute, std::sqrt(cluster_sum)); + } +} + +template +__mlu_global__ void Nrm2KernelStrided( + int n, + const Tdata *x, + int incx, + Tdata *result) { + + __mlu_shared__ float shared_partial_sum[4]; + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + size_t nram_usable = NRAM_MAX_SIZE - (nram_aligned - nram_buffer); + size_t max_chunk_elements = nram_usable / (sizeof(Tdata) + sizeof(float)); + + size_t align_elements = ALIGN_SIZE / sizeof(Tdata); + if (align_elements == 0) { + align_elements = 1; + } + int chunk_size = (int)((max_chunk_elements / align_elements) * align_elements); + + Tdata *nram_x = (Tdata *)nram_aligned; + float *nram_compute = (float *)(nram_x + chunk_size); + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int actual_tasks = elements_per_core + (taskId < remain ? 
1 : 0); + int start_idx = taskId < remain ? taskId * actual_tasks : taskId * elements_per_core + remain; + + float partial_sum = 0.0f; + + int chunks = actual_tasks / chunk_size; + int chunk_rem = actual_tasks % chunk_size; + + for (int c = 0; c < chunks; c++) { + int current_elements = chunk_size; + int current_start = start_idx + c * chunk_size; + for (int i = 0; i < current_elements; ++i) { + nram_x[i] = x[(current_start + i) * incx]; + } + + nrm2ToCompute(nram_compute, nram_x, current_elements); + __bang_square(nram_compute, nram_compute, current_elements); + + partial_sum += __bang_sum(nram_compute, current_elements); + } + + if (chunk_rem > 0) { + int current_start = start_idx + chunks * chunk_size; + for (int i = 0; i < chunk_rem; ++i) { + nram_x[i] = x[(current_start + i) * incx]; + } + + nrm2ToCompute(nram_compute, nram_x, chunk_rem); + __bang_square(nram_compute, nram_compute, chunk_rem); + + partial_sum += __bang_sum(nram_compute, chunk_rem); + } + + shared_partial_sum[coreId] = partial_sum; + + __sync_cluster(); + + if (coreId == 0) { + float cluster_sum = 0.0f; + + for (int i = 0; i < coreDim; i++) { + cluster_sum += shared_partial_sum[i]; + } + + nrm2StoreResult(result, nram_x, nram_compute, std::sqrt(cluster_sum)); + } +} diff --git a/src/infiniop/ops/nrm2/cpu/nrm2_cpu.cc b/src/infiniop/ops/nrm2/cpu/nrm2_cpu.cc new file mode 100644 index 000000000..b494fe7d3 --- /dev/null +++ b/src/infiniop/ops/nrm2/cpu/nrm2_cpu.cc @@ -0,0 +1,164 @@ +#include "nrm2_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include +#include + +namespace op::nrm2::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = Nrm2Info::createNrm2Info(x_desc, result_desc); + CHECK_RESULT(result); + + // Create descriptor + *desc_ptr = new Descriptor( 
+ result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateNrm2( + const Nrm2Info &info, + const Tdata *x, + Tdata *result) { + + using Tcompute = std::conditional_t, double, float>; + + const size_t n = info.n; + const ptrdiff_t incx = info.incx; + + // Blue's scaling constants (float vs double) + constexpr Tcompute tsml = [] { + if constexpr (std::is_same_v) { + return Tcompute(0x1p-63f); // 2^-63 + } else { + return Tcompute(0x1p-511); // 2^-511 + } + }(); + constexpr Tcompute tbig = [] { + if constexpr (std::is_same_v) { + return Tcompute(0x1p52f); // 2^52 + } else { + return Tcompute(0x1p486); // 2^486 + } + }(); + constexpr Tcompute ssml = [] { + if constexpr (std::is_same_v) { + return Tcompute(0x1p75f); // 2^75 + } else { + return Tcompute(0x1p600); // 2^600 + } + }(); + constexpr Tcompute sbig = [] { + if constexpr (std::is_same_v) { + return Tcompute(0x1p-76f); // 2^-76 + } else { + return Tcompute(0x1p-601); // 2^-601 + } + }(); + + Tcompute scl = Tcompute(1); + Tcompute sumsq = Tcompute(0); + + bool notbig = true; + Tcompute asml = Tcompute(0); + Tcompute amed = Tcompute(0); + Tcompute abig = Tcompute(0); + + // 0-based index; handle negative stride + ptrdiff_t ix = (incx < 0) ? 
(ptrdiff_t(1) - utils::cast(n)) * incx : 0; + + for (size_t i = 0; i < n; ++i) { + Tcompute ax = std::abs(utils::cast(x[ix])); + + if (ax > tbig) { + const Tcompute y = ax * sbig; + abig += y * y; + notbig = false; + } else if (ax < tsml) { + if (notbig) { + const Tcompute y = ax * ssml; + asml += y * y; + } + } else { + amed += ax * ax; + } + + ix += incx; + } + + if (abig > Tcompute(0)) { + if (amed > Tcompute(0) || std::isinf(amed) || std::isnan(amed)) { + abig += (amed * sbig) * sbig; + } + scl = Tcompute(1) / sbig; + sumsq = abig; + } else if (asml > Tcompute(0)) { + if (amed > Tcompute(0) || std::isinf(amed) || std::isnan(amed)) { + amed = std::sqrt(amed); + asml = std::sqrt(asml) / ssml; + + const Tcompute ymin = std::min(amed, asml); + const Tcompute ymax = std::max(amed, asml); + + scl = Tcompute(1); + sumsq = (ymax * ymax) * (Tcompute(1) + (ymin / ymax) * (ymin / ymax)); + } else { + scl = Tcompute(1) / ssml; + sumsq = asml; + } + } else { + scl = Tcompute(1); + sumsq = amed; + } + + result[0] = utils::cast(scl * std::sqrt(sumsq)); + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_NRM2(TDATA) \ + calculateNrm2(_info, \ + (const TDATA *)x, \ + (TDATA *)result) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_NRM2(fp16_t); + case INFINI_DTYPE_F32: + return CALCULATE_NRM2(float); + case INFINI_DTYPE_F64: + return CALCULATE_NRM2(double); + case INFINI_DTYPE_BF16: + return CALCULATE_NRM2(bf16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_NRM2 + +} // namespace op::nrm2::cpu diff --git a/src/infiniop/ops/nrm2/cpu/nrm2_cpu.h b/src/infiniop/ops/nrm2/cpu/nrm2_cpu.h new file mode 100644 index 000000000..cf1e3fdef --- /dev/null +++ b/src/infiniop/ops/nrm2/cpu/nrm2_cpu.h @@ -0,0 +1,8 @@ +#ifndef 
__NRM2_CPU_H__ +#define __NRM2_CPU_H__ + +#include "../nrm2.h" + +DESCRIPTOR(cpu) + +#endif // __NRM2_CPU_H__ diff --git a/src/infiniop/ops/nrm2/info.h b/src/infiniop/ops/nrm2/info.h new file mode 100644 index 000000000..04a64abee --- /dev/null +++ b/src/infiniop/ops/nrm2/info.h @@ -0,0 +1,41 @@ +#ifndef __NRM2_INFO_H__ +#define __NRM2_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class Nrm2Info { +private: + Nrm2Info() = default; + +public: + size_t n; + ptrdiff_t incx; + infiniDtype_t data_type; + + static utils::Result createNrm2Info( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + + CHECK_OR_RETURN(x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(result_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = x_desc->dtype(); + + CHECK_OR_RETURN(result_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(x_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(result_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + + auto n = x_desc->numel(); + auto incx = x_desc->stride(0); + + return utils::Result(Nrm2Info{ + n, + incx, + data_type}); + } +}; + +#endif // __NRM2_INFO_H__ diff --git a/src/infiniop/ops/nrm2/metax/nrm2_metax.cc b/src/infiniop/ops/nrm2/metax/nrm2_metax.cc new file mode 100644 index 000000000..6b23502bf --- /dev/null +++ b/src/infiniop/ops/nrm2/metax/nrm2_metax.cc @@ -0,0 +1,96 @@ +#include "nrm2_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +namespace op::nrm2::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + 
+ auto handle = reinterpret_cast(handle_); + auto result = Nrm2Info::createNrm2Info(x_desc, result_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const int n = utils::cast(_info.n); + const int incx = utils::cast(_info.incx); + const infiniDtype_t data_type = _info.data_type; + + hpccDataType x_type, result_type; + hpccDataType execution_type; + + switch (data_type) { + case INFINI_DTYPE_F16: + x_type = result_type = HPCC_R_16F; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_BF16: + x_type = result_type = HPCC_R_16BF; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_F32: + x_type = result_type = HPCC_R_32F; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_F64: + x_type = result_type = HPCC_R_64F; + execution_type = HPCC_R_64F; + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode( + handle, + HCBLAS_POINTER_MODE_DEVICE)); + + CHECK_MCBLAS(hcblasNrm2Ex( + handle, + n, + x, + x_type, + incx, + result, + result_type, + execution_type)); + + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::nrm2::metax diff --git a/src/infiniop/ops/nrm2/metax/nrm2_metax.h b/src/infiniop/ops/nrm2/metax/nrm2_metax.h new file mode 100644 index 000000000..5ebf0aaf2 --- /dev/null +++ b/src/infiniop/ops/nrm2/metax/nrm2_metax.h @@ -0,0 +1,8 @@ +#ifndef __NRM2_METAX_H__ +#define __NRM2_METAX_H__ + +#include "../nrm2.h" + +DESCRIPTOR(metax) + +#endif // __NRM2_METAX_H__ diff --git a/src/infiniop/ops/nrm2/nrm2.h 
b/src/infiniop/ops/nrm2/nrm2.h new file mode 100644 index 000000000..a4094cd67 --- /dev/null +++ b/src/infiniop/ops/nrm2/nrm2.h @@ -0,0 +1,47 @@ +#ifndef __NRM2_H__ +#define __NRM2_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::nrm2::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + Nrm2Info _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Nrm2Info info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(std::move(info)), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t result_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + const void *x, \ + void *result, \ + void *stream) const; \ + }; \ + } + +#endif // __NRM2_H__ diff --git a/src/infiniop/ops/nrm2/operator.cc b/src/infiniop/ops/nrm2/operator.cc new file mode 100644 index 000000000..d1ecfa2bb --- /dev/null +++ b/src/infiniop/ops/nrm2/operator.cc @@ -0,0 +1,121 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/nrm2.h" + +#ifdef ENABLE_CPU_API +#include "cpu/nrm2_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/nrm2_metax.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/nrm2_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateNrm2Descriptor( + infiniopHandle_t handle, + infiniopNrm2Descriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t result_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::nrm2::NAMESPACE::Descriptor::create( \ + handle, \ + 
reinterpret_cast(desc_ptr), \ + x_desc, result_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetNrm2WorkspaceSize(infiniopNrm2Descriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopNrm2( + infiniopNrm2Descriptor_t desc, + void *workspace, + size_t workspace_size, + const void *x, + void *result, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, x, result, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyNrm2Descriptor(infiniopNrm2Descriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef 
ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/rot/bang/rot_bang.h b/src/infiniop/ops/rot/bang/rot_bang.h new file mode 100644 index 000000000..7ef66bb62 --- /dev/null +++ b/src/infiniop/ops/rot/bang/rot_bang.h @@ -0,0 +1,8 @@ +#ifndef __ROT_BANG_H__ +#define __ROT_BANG_H__ + +#include "../rot.h" + +DESCRIPTOR(bang) + +#endif // __ROT_BANG_H__ diff --git a/src/infiniop/ops/rot/bang/rot_bang.mlu b/src/infiniop/ops/rot/bang/rot_bang.mlu new file mode 100644 index 000000000..b9601bde3 --- /dev/null +++ b/src/infiniop/ops/rot/bang/rot_bang.mlu @@ -0,0 +1,107 @@ +#include "../../../devices/bang/common_bang.h" +#include "rot_bang.h" +#include "rot_bang_kernel.mlu" + +namespace op::rot::bang { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t s_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = RotInfo::createRotInfo(x_desc, y_desc, c_desc, s_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateRot( + const RotInfo &info, + Tdata *x, + Tdata *y, + const Tdata *c, + const Tdata *s, + cnrtQueue_t queue) { + + const int n = utils::cast(info.n); + const int incx = utils::cast(info.incx); + const int incy = utils::cast(info.incy); + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = cnrtFuncTypeUnion1; + + if (incx == 1 && incy == 1) { + rotKernelContiguous<<>>( + n, + x, + y, + c, + s); + } else { + rotKernelStrided<<>>( + n, + x, + incx, + y, + incy, + c, + s); + } + + cnrtQueueSync(queue); + 
return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_ROT(TDATA) \ + calculateRot(_info, \ + (TDATA *)x, \ + (TDATA *)y, \ + (const TDATA *)c, \ + (const TDATA *)s, \ + (cnrtQueue_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *x, + void *y, + const void *c, + const void *s, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_ROT(half); + case INFINI_DTYPE_F32: + return CALCULATE_ROT(float); + case INFINI_DTYPE_BF16: + return CALCULATE_ROT(bfloat16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_ROT + +} // namespace op::rot::bang diff --git a/src/infiniop/ops/rot/bang/rot_bang_kernel.mlu b/src/infiniop/ops/rot/bang/rot_bang_kernel.mlu new file mode 100644 index 000000000..24cf3ead3 --- /dev/null +++ b/src/infiniop/ops/rot/bang/rot_bang_kernel.mlu @@ -0,0 +1,102 @@ +#include "../../../devices/bang/common_bang.h" + +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_global__ void rotKernelContiguous( + int n, + Tdata *x, + Tdata *y, + const Tdata *c, + const Tdata *s) { + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + size_t nram_usable = NRAM_MAX_SIZE - (nram_aligned - nram_buffer); + size_t max_chunk_elements = nram_usable / (4 * sizeof(Tdata)); + + size_t align_elements = ALIGN_SIZE / sizeof(Tdata); + if (align_elements == 0) { + align_elements = 1; + } + int chunk_size = (int)((max_chunk_elements / align_elements) * align_elements); + + Tdata *nram_x = (Tdata *)nram_aligned; + Tdata *nram_y = nram_x + chunk_size; + Tdata *nram_x_out = nram_y + chunk_size; + Tdata *nram_y_out = nram_x_out + chunk_size; + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int core_elements = elements_per_core + (taskId < remain ? 1 : 0); + int core_offset = taskId < remain ? 
taskId * core_elements : taskId * elements_per_core + remain; + + if (core_elements <= 0) { + return; + } + + int chunks = core_elements / chunk_size; + int chunk_rem = core_elements % chunk_size; + + for (int ck = 0; ck < chunks; ck++) { + int current_offset = core_offset + ck * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + __memcpy(nram_y, y + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + + __bang_mul_scalar(nram_x_out, nram_x, c[0], chunk_size); + __bang_mul_scalar(nram_y_out, nram_y, s[0], chunk_size); + __bang_add(nram_x_out, nram_x_out, nram_y_out, chunk_size); + + __memcpy(x + current_offset, nram_x_out, chunk_size * sizeof(Tdata), NRAM2GDRAM); + + __bang_mul_scalar(nram_y_out, nram_y, c[0], chunk_size); + __bang_mul_scalar(nram_x_out, nram_x, s[0], chunk_size); + __bang_sub(nram_y_out, nram_y_out, nram_x_out, chunk_size); + + __memcpy(y + current_offset, nram_y_out, chunk_size * sizeof(Tdata), NRAM2GDRAM); + } + + if (chunk_rem > 0) { + int current_offset = core_offset + chunks * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + __memcpy(nram_y, y + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + + __bang_mul_scalar(nram_x_out, nram_x, c[0], chunk_rem); + __bang_mul_scalar(nram_y_out, nram_y, s[0], chunk_rem); + __bang_add(nram_x_out, nram_x_out, nram_y_out, chunk_rem); + + __memcpy(x + current_offset, nram_x_out, chunk_rem * sizeof(Tdata), NRAM2GDRAM); + + __bang_mul_scalar(nram_y_out, nram_y, c[0], chunk_rem); + __bang_mul_scalar(nram_x_out, nram_x, s[0], chunk_rem); + __bang_sub(nram_y_out, nram_y_out, nram_x_out, chunk_rem); + + __memcpy(y + current_offset, nram_y_out, chunk_rem * sizeof(Tdata), NRAM2GDRAM); + } +} + +template +__mlu_global__ void rotKernelStrided( + int n, + Tdata *x, + int incx, + Tdata *y, + int incy, + const Tdata *c, + const Tdata *s) { + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int 
actual_tasks = elements_per_core + (taskId < remain ? 1 : 0); + int start_idx = taskId < remain ? taskId * actual_tasks : taskId * elements_per_core + remain; + + for (int i = start_idx; i < start_idx + actual_tasks; ++i) { + int x_idx = i * incx; + int y_idx = i * incy; + Tdata x_val = x[x_idx]; + Tdata y_val = y[y_idx]; + + x[x_idx] = c[0] * x_val + s[0] * y_val; + y[y_idx] = c[0] * y_val - s[0] * x_val; + } +} diff --git a/src/infiniop/ops/rot/cpu/rot_cpu.cc b/src/infiniop/ops/rot/cpu/rot_cpu.cc new file mode 100644 index 000000000..ce804ee0f --- /dev/null +++ b/src/infiniop/ops/rot/cpu/rot_cpu.cc @@ -0,0 +1,104 @@ +#include "rot_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::rot::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t s_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = RotInfo::createRotInfo(x_desc, y_desc, c_desc, s_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateRot( + const RotInfo &info, + Tdata *x, + Tdata *y, + const Tdata *c, + const Tdata *s) { + + using Tcompute = std::conditional_t, double, float>; + + const Tcompute c_val = utils::cast(c[0]); + const Tcompute s_val = utils::cast(s[0]); + + const size_t n = info.n; + const ptrdiff_t incx = info.incx; + const ptrdiff_t incy = info.incy; + + if (n == 0) { + return INFINI_STATUS_SUCCESS; + } + + const ptrdiff_t ix = incx >= 0 ? 0 : utils::cast(n - 1) * (-incx); + const ptrdiff_t iy = incy >= 0 ? 
0 : utils::cast(n - 1) * (-incy); + + for (size_t i = 0; i < n; ++i) { + const ptrdiff_t x_idx = ix + utils::cast(i) * incx; + const ptrdiff_t y_idx = iy + utils::cast(i) * incy; + + const Tcompute x_val = utils::cast(x[x_idx]); + const Tcompute y_val = utils::cast(y[y_idx]); + const Tcompute temp = c_val * x_val + s_val * y_val; + y[y_idx] = utils::cast(c_val * y_val - s_val * x_val); + x[x_idx] = utils::cast(temp); + } + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_ROT(TDATA) \ + calculateRot(_info, \ + (TDATA *)x, \ + (TDATA *)y, \ + (const TDATA *)c, \ + (const TDATA *)s) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *x, + void *y, + const void *c, + const void *s, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_ROT(fp16_t); + case INFINI_DTYPE_F32: + return CALCULATE_ROT(float); + case INFINI_DTYPE_F64: + return CALCULATE_ROT(double); + case INFINI_DTYPE_BF16: + return CALCULATE_ROT(bf16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_ROT + +} // namespace op::rot::cpu diff --git a/src/infiniop/ops/rot/cpu/rot_cpu.h b/src/infiniop/ops/rot/cpu/rot_cpu.h new file mode 100644 index 000000000..2a5bd0ab8 --- /dev/null +++ b/src/infiniop/ops/rot/cpu/rot_cpu.h @@ -0,0 +1,8 @@ +#ifndef __ROT_CPU_H__ +#define __ROT_CPU_H__ + +#include "../rot.h" + +DESCRIPTOR(cpu) + +#endif // __ROT_CPU_H__ diff --git a/src/infiniop/ops/rot/info.h b/src/infiniop/ops/rot/info.h new file mode 100644 index 000000000..5a1ddf5ed --- /dev/null +++ b/src/infiniop/ops/rot/info.h @@ -0,0 +1,53 @@ +#ifndef __ROT_INFO_H__ +#define __ROT_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class RotInfo { +private: + RotInfo() = default; + +public: + size_t n; + ptrdiff_t incx; + ptrdiff_t incy; + infiniDtype_t data_type; + + static utils::Result createRotInfo( + 
infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t s_desc) { + + CHECK_OR_RETURN(c_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(s_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(y_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = x_desc->dtype(); + + CHECK_OR_RETURN(c_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(s_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(y_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(c_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(s_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(x_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(y_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(x_desc->numel() == y_desc->numel(), INFINI_STATUS_BAD_TENSOR_SHAPE); + + auto n = x_desc->numel(); + auto incx = x_desc->stride(0); + auto incy = y_desc->stride(0); + + return utils::Result(RotInfo{ + n, + incx, + incy, + data_type}); + } +}; + +#endif // __ROT_INFO_H__ diff --git a/src/infiniop/ops/rot/metax/rot_metax.cc b/src/infiniop/ops/rot/metax/rot_metax.cc new file mode 100644 index 000000000..a1c3e13ef --- /dev/null +++ b/src/infiniop/ops/rot/metax/rot_metax.cc @@ -0,0 +1,105 @@ +#include "rot_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +namespace op::rot::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + 
infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t s_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = RotInfo::createRotInfo(x_desc, y_desc, c_desc, s_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *x, + void *y, + const void *c, + const void *s, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const int n = utils::cast(_info.n); + const int incx = utils::cast(_info.incx); + const int incy = utils::cast(_info.incy); + const infiniDtype_t data_type = _info.data_type; + + hpccDataType x_type, y_type, cs_type; + hpccDataType execution_type; + + switch (data_type) { + case INFINI_DTYPE_F16: + x_type = y_type = cs_type = HPCC_R_16F; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_BF16: + x_type = y_type = cs_type = HPCC_R_16BF; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_F32: + x_type = y_type = cs_type = HPCC_R_32F; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_F64: + x_type = y_type = cs_type = HPCC_R_64F; + execution_type = HPCC_R_64F; + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode( + handle, + HCBLAS_POINTER_MODE_DEVICE)); + + CHECK_MCBLAS(hcblasRotEx( + handle, + n, + x, + x_type, + incx, + y, + y_type, + incy, + c, + s, + cs_type, + execution_type)); + + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::rot::metax diff --git a/src/infiniop/ops/rot/metax/rot_metax.h b/src/infiniop/ops/rot/metax/rot_metax.h new file mode 100644 index 000000000..b6bdd1553 --- /dev/null +++ 
b/src/infiniop/ops/rot/metax/rot_metax.h @@ -0,0 +1,8 @@ +#ifndef __ROT_METAX_H__ +#define __ROT_METAX_H__ + +#include "../rot.h" + +DESCRIPTOR(metax) + +#endif // __ROT_METAX_H__ diff --git a/src/infiniop/ops/rot/operator.cc b/src/infiniop/ops/rot/operator.cc new file mode 100644 index 000000000..6c1345d7a --- /dev/null +++ b/src/infiniop/ops/rot/operator.cc @@ -0,0 +1,128 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/rot.h" + +#ifdef ENABLE_CPU_API +#include "cpu/rot_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/rot_metax.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/rot_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateRotDescriptor( + infiniopHandle_t handle, + infiniopRotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t s_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::rot::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_desc, \ + y_desc, \ + c_desc, \ + s_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetRotWorkspaceSize(infiniopRotDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef 
GET +} + +__INFINI_C infiniStatus_t infiniopRot( + infiniopRotDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *x, + void *y, + const void *c, + const void *s, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, x, y, c, s, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyRotDescriptor(infiniopRotDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/rot/rot.h b/src/infiniop/ops/rot/rot.h new file mode 100644 index 000000000..8304442d7 --- /dev/null +++ b/src/infiniop/ops/rot/rot.h @@ -0,0 +1,51 @@ +#ifndef __ROT_H__ +#define __ROT_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::rot::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + RotInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + RotInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(std::move(info)), \ + _workspace_size(workspace_size_) {} \ + \ 
+ public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t c_desc, \ + infiniopTensorDescriptor_t s_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *x, \ + void *y, \ + const void *c, \ + const void *s, \ + void *stream) const; \ + }; \ + } + +#endif // __ROT_H__ diff --git a/src/infiniop/ops/rotg/bang/rotg_bang.h b/src/infiniop/ops/rotg/bang/rotg_bang.h new file mode 100644 index 000000000..e74696b9e --- /dev/null +++ b/src/infiniop/ops/rotg/bang/rotg_bang.h @@ -0,0 +1,8 @@ +#ifndef __ROTG_BANG_H__ +#define __ROTG_BANG_H__ + +#include "../rotg.h" + +DESCRIPTOR(bang) + +#endif // __ROTG_BANG_H__ diff --git a/src/infiniop/ops/rotg/bang/rotg_bang.mlu b/src/infiniop/ops/rotg/bang/rotg_bang.mlu new file mode 100644 index 000000000..b0f271786 --- /dev/null +++ b/src/infiniop/ops/rotg/bang/rotg_bang.mlu @@ -0,0 +1,89 @@ +#include "../../../devices/bang/common_bang.h" +#include "rotg_bang.h" +#include "rotg_bang_kernel.mlu" + +namespace op::rotg::bang { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t s_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = RotgInfo::createRotgInfo(x_desc, y_desc, c_desc, s_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateRotg( + Tdata *x, + Tdata *y, + Tdata *c, + Tdata *s, + cnrtQueue_t queue) { + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + k_dim.x = 1; + 
k_dim.y = 1; + k_dim.z = 1; + k_type = cnrtFuncTypeBlock; + + rotgKernel<<>>( + x, + y, + c, + s); + + cnrtQueueSync(queue); + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_ROTG(TDATA) \ + calculateRotg((TDATA *)x, \ + (TDATA *)y, \ + (TDATA *)c, \ + (TDATA *)s, \ + (cnrtQueue_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *x, + void *y, + void *c, + void *s, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_ROTG(half); + case INFINI_DTYPE_BF16: + return CALCULATE_ROTG(bfloat16_t); + case INFINI_DTYPE_F32: + return CALCULATE_ROTG(float); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_ROTG + +} // namespace op::rotg::bang diff --git a/src/infiniop/ops/rotg/bang/rotg_bang_kernel.mlu b/src/infiniop/ops/rotg/bang/rotg_bang_kernel.mlu new file mode 100644 index 000000000..25fb6a8f4 --- /dev/null +++ b/src/infiniop/ops/rotg/bang/rotg_bang_kernel.mlu @@ -0,0 +1,76 @@ +#include "../../../devices/bang/common_bang.h" + +#include +#include + +template +__mlu_func__ float rotgToCompute(Tdata value) { + if constexpr (std::is_same_v) { + return __half2float(value); + } else if constexpr (std::is_same_v) { + return __bfloat162float(value); + } else { + return static_cast(value); + } +} + +template +__mlu_func__ Tdata rotgFromCompute(float value) { + if constexpr (std::is_same_v) { + return __float2half(value); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(value); + } else { + return static_cast(value); + } +} + +template +__mlu_global__ void rotgKernel( + Tdata *x, + Tdata *y, + Tdata *c, + Tdata *s) { + + const float zero = 0.0f; + const float one = 1.0f; + const float safmin = std::numeric_limits::min(); + const float safmax = std::numeric_limits::max(); + + const float x_val = rotgToCompute(*x); + const float y_val = rotgToCompute(*y); + + const float xnorm = 
std::fabs(x_val); + const float ynorm = std::fabs(y_val); + + if (ynorm == zero) { + *c = rotgFromCompute(one); + *s = rotgFromCompute(zero); + *y = rotgFromCompute(zero); + } else if (xnorm == zero) { + *c = rotgFromCompute(zero); + *s = rotgFromCompute(one); + *x = rotgFromCompute(y_val); + *y = rotgFromCompute(one); + } else { + const float scl = std::min(safmax, std::max(safmin, std::max(xnorm, ynorm))); + const float sigma = xnorm > ynorm ? std::copysign(one, x_val) : std::copysign(one, y_val); + const float r = sigma * (scl * std::sqrt((x_val / scl) * (x_val / scl) + (y_val / scl) * (y_val / scl))); + const float c_val = x_val / r; + const float s_val = y_val / r; + + float z; + if (xnorm > ynorm) { + z = s_val; + } else if (c_val != zero) { + z = one / c_val; + } else { + z = one; + } + + *x = rotgFromCompute(r); + *y = rotgFromCompute(z); + *c = rotgFromCompute(c_val); + *s = rotgFromCompute(s_val); + } +} diff --git a/src/infiniop/ops/rotg/cpu/rotg_cpu.cc b/src/infiniop/ops/rotg/cpu/rotg_cpu.cc new file mode 100644 index 000000000..bf95a5c3e --- /dev/null +++ b/src/infiniop/ops/rotg/cpu/rotg_cpu.cc @@ -0,0 +1,121 @@ +#include "rotg_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +#include + +namespace op::rotg::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t s_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = RotgInfo::createRotgInfo(x_desc, y_desc, c_desc, s_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateRotg( + Tdata *x, + Tdata *y, + Tdata *c, + Tdata *s) { + + using Tcompute = std::conditional_t, double, float>; + + const Tcompute zero = 
utils::cast(0.0f); + const Tcompute one = utils::cast(1.0f); + + Tcompute x_val = utils::cast(x[0]); + Tcompute y_val = utils::cast(y[0]); + + const Tcompute anorm = std::abs(x_val); + const Tcompute bnorm = std::abs(y_val); + + if (bnorm == zero) { + c[0] = utils::cast(one); + s[0] = utils::cast(zero); + y[0] = utils::cast(zero); + return INFINI_STATUS_SUCCESS; + } + + if (anorm == zero) { + c[0] = utils::cast(zero); + s[0] = utils::cast(one); + x[0] = utils::cast(y_val); + y[0] = utils::cast(one); + return INFINI_STATUS_SUCCESS; + } + + const Tcompute sigma = anorm > bnorm ? std::copysign(one, x_val) : std::copysign(one, y_val); + const Tcompute r = sigma * std::hypot(x_val, y_val); + const Tcompute c_val = x_val / r; + const Tcompute s_val = y_val / r; + + Tcompute z; + if (anorm > bnorm) { + z = s_val; + } else if (c_val != zero) { + z = one / c_val; + } else { + z = one; + } + + x[0] = utils::cast(r); + y[0] = utils::cast(z); + c[0] = utils::cast(c_val); + s[0] = utils::cast(s_val); + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_ROTG(TDATA) \ + calculateRotg((TDATA *)x, \ + (TDATA *)y, \ + (TDATA *)c, \ + (TDATA *)s) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *x, + void *y, + void *c, + void *s, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_ROTG(fp16_t); + case INFINI_DTYPE_F32: + return CALCULATE_ROTG(float); + case INFINI_DTYPE_F64: + return CALCULATE_ROTG(double); + case INFINI_DTYPE_BF16: + return CALCULATE_ROTG(bf16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_ROTG + +} // namespace op::rotg::cpu diff --git a/src/infiniop/ops/rotg/cpu/rotg_cpu.h b/src/infiniop/ops/rotg/cpu/rotg_cpu.h new file mode 100644 index 000000000..a83cb2612 --- /dev/null +++ b/src/infiniop/ops/rotg/cpu/rotg_cpu.h @@ -0,0 +1,8 @@ +#ifndef __ROTG_CPU_H__ +#define __ROTG_CPU_H__ 
+ +#include "../rotg.h" + +DESCRIPTOR(cpu) + +#endif // __ROTG_CPU_H__ diff --git a/src/infiniop/ops/rotg/info.h b/src/infiniop/ops/rotg/info.h new file mode 100644 index 000000000..99e486a1b --- /dev/null +++ b/src/infiniop/ops/rotg/info.h @@ -0,0 +1,41 @@ +#ifndef __ROTG_INFO_H__ +#define __ROTG_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class RotgInfo { +private: + RotgInfo() = default; + +public: + infiniDtype_t data_type; + + static utils::Result createRotgInfo( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t s_desc) { + + CHECK_OR_RETURN(x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(y_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(c_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(s_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = x_desc->dtype(); + + CHECK_OR_RETURN(y_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(c_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(s_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_OR_RETURN(x_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(y_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(c_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(s_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + + return utils::Result(RotgInfo{ + data_type}); + } +}; + +#endif // __ROTG_INFO_H__ diff --git a/src/infiniop/ops/rotg/metax/rotg_metax.cc b/src/infiniop/ops/rotg/metax/rotg_metax.cc new file mode 100644 index 000000000..32b12704b --- /dev/null +++ b/src/infiniop/ops/rotg/metax/rotg_metax.cc @@ -0,0 +1,73 @@ +#include "rotg_metax.h" +#include "../../../devices/metax/metax_common.h" +#include 
"../../../devices/metax/metax_handle.h" + +namespace op::rotg::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t s_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = RotgInfo::createRotgInfo(x_desc, y_desc, c_desc, s_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *x, + void *y, + void *c, + void *s, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const infiniDtype_t data_type = _info.data_type; + + CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode(handle, HCBLAS_POINTER_MODE_DEVICE)); + + switch (data_type) { + case INFINI_DTYPE_F32: + CHECK_MCBLAS(hcblasSrotg(handle, (float *)x, (float *)y, (float *)c, (float *)s)); + break; + case INFINI_DTYPE_F64: + CHECK_MCBLAS(hcblasDrotg(handle, (double *)x, (double *)y, (double *)c, (double *)s)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::rotg::metax diff --git a/src/infiniop/ops/rotg/metax/rotg_metax.h b/src/infiniop/ops/rotg/metax/rotg_metax.h new file mode 100644 index 000000000..aaf5f2612 --- /dev/null +++ b/src/infiniop/ops/rotg/metax/rotg_metax.h @@ -0,0 +1,8 @@ +#ifndef __ROTG_METAX_H__ +#define __ROTG_METAX_H__ + +#include "../rotg.h" + +DESCRIPTOR(metax) + +#endif // __ROTG_METAX_H__ diff --git a/src/infiniop/ops/rotg/operator.cc 
b/src/infiniop/ops/rotg/operator.cc new file mode 100644 index 000000000..0fa83d664 --- /dev/null +++ b/src/infiniop/ops/rotg/operator.cc @@ -0,0 +1,125 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/rotg.h" + +#ifdef ENABLE_CPU_API +#include "cpu/rotg_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/rotg_metax.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/rotg_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateRotgDescriptor( + infiniopHandle_t handle, + infiniopRotgDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t s_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::rotg::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_desc, y_desc, c_desc, s_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetRotgWorkspaceSize(infiniopRotgDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopRotg( + infiniopRotgDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *x, + void *y, + void *c, + void *s, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ 
+ case CASE: \ + return reinterpret_cast<op::rotg::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, x, y, c, s, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyRotgDescriptor(infiniopRotgDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::rotg::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/rotg/rotg.h b/src/infiniop/ops/rotg/rotg.h new file mode 100644 index 000000000..9aa0a59e0 --- /dev/null +++ b/src/infiniop/ops/rotg/rotg.h @@ -0,0 +1,51 @@ +#ifndef __ROTG_H__ +#define __ROTG_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::rotg::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + RotgInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + RotgInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(std::move(info)), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t 
x_desc, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t c_desc, \ + infiniopTensorDescriptor_t s_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *x, \ + void *y, \ + void *c, \ + void *s, \ + void *stream) const; \ + }; \ + } + +#endif // __ROTG_H__ diff --git a/src/infiniop/ops/rotm/bang/rotm_bang.h b/src/infiniop/ops/rotm/bang/rotm_bang.h new file mode 100644 index 000000000..49cbd6789 --- /dev/null +++ b/src/infiniop/ops/rotm/bang/rotm_bang.h @@ -0,0 +1,8 @@ +#ifndef __ROTM_BANG_H__ +#define __ROTM_BANG_H__ + +#include "../rotm.h" + +DESCRIPTOR(bang) + +#endif // __ROTM_BANG_H__ diff --git a/src/infiniop/ops/rotm/bang/rotm_bang.mlu b/src/infiniop/ops/rotm/bang/rotm_bang.mlu new file mode 100644 index 000000000..2f3ee92f6 --- /dev/null +++ b/src/infiniop/ops/rotm/bang/rotm_bang.mlu @@ -0,0 +1,101 @@ +#include "../../../devices/bang/common_bang.h" +#include "rotm_bang.h" +#include "rotm_bang_kernel.mlu" + +namespace op::rotm::bang { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t param_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = RotmInfo::createRotmInfo(x_desc, y_desc, param_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateRotm( + const RotmInfo &info, + Tdata *x, + Tdata *y, + const Tdata *param, + cnrtQueue_t queue) { + + const int n = utils::cast(info.n); + const int incx = utils::cast(info.incx); + const int incy = utils::cast(info.incy); + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = cnrtFuncTypeUnion1; + + if (incx == 1 && incy == 1) { + rotmKernelContiguous<<>>( 
+ n, + x, + y, + param); + } else { + rotmKernelStrided<<>>( + n, + x, + incx, + y, + incy, + param); + } + + cnrtQueueSync(queue); + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_ROTM(TDATA) \ + calculateRotm(_info, \ + (TDATA *)x, \ + (TDATA *)y, \ + (const TDATA *)param, \ + (cnrtQueue_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *x, + void *y, + const void *param, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_ROTM(half); + case INFINI_DTYPE_BF16: + return CALCULATE_ROTM(bfloat16_t); + case INFINI_DTYPE_F32: + return CALCULATE_ROTM(float); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_ROTM + +} // namespace op::rotm::bang diff --git a/src/infiniop/ops/rotm/bang/rotm_bang_kernel.mlu b/src/infiniop/ops/rotm/bang/rotm_bang_kernel.mlu new file mode 100644 index 000000000..e943d1464 --- /dev/null +++ b/src/infiniop/ops/rotm/bang/rotm_bang_kernel.mlu @@ -0,0 +1,183 @@ +#include "../../../devices/bang/common_bang.h" + +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_global__ void rotmKernelContiguous( + int n, + Tdata *x, + Tdata *y, + const Tdata *param) { + + const Tdata flag = param[0]; + if (n == 0 || (flag + static_cast(2) == static_cast(0))) { + return; + } + + Tdata h11 = static_cast(0); + Tdata h12 = static_cast(0); + Tdata h21 = static_cast(0); + Tdata h22 = static_cast(0); + + if (flag < static_cast(0)) { + h11 = param[1]; + h12 = param[3]; + h21 = param[2]; + h22 = param[4]; + } else if (flag == static_cast(0)) { + h12 = param[3]; + h21 = param[2]; + } else { + h11 = param[1]; + h22 = param[4]; + } + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + size_t nram_usable = NRAM_MAX_SIZE - (nram_aligned - nram_buffer); + size_t max_chunk_elements = nram_usable / (4 * sizeof(Tdata)); + + size_t align_elements = 
ALIGN_SIZE / sizeof(Tdata); + if (align_elements == 0) { + align_elements = 1; + } + int chunk_size = (int)((max_chunk_elements / align_elements) * align_elements); + + Tdata *nram_x = (Tdata *)nram_aligned; + Tdata *nram_y = nram_x + chunk_size; + Tdata *nram_x_out = nram_y + chunk_size; + Tdata *nram_y_out = nram_x_out + chunk_size; + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int core_elements = elements_per_core + (taskId < remain ? 1 : 0); + int core_offset = taskId < remain ? taskId * core_elements : taskId * elements_per_core + remain; + + if (core_elements <= 0) { + return; + } + + int chunks = core_elements / chunk_size; + int chunk_rem = core_elements % chunk_size; + + for (int c = 0; c < chunks; c++) { + int current_offset = core_offset + c * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + __memcpy(nram_y, y + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + + if (flag < static_cast(0)) { + __bang_mul_scalar(nram_x_out, nram_x, h11, chunk_size); + __bang_mul_scalar(nram_y_out, nram_y, h12, chunk_size); + __bang_add(nram_x_out, nram_x_out, nram_y_out, chunk_size); + + __bang_mul_scalar(nram_y_out, nram_x, h21, chunk_size); + __bang_mul_scalar(nram_y, nram_y, h22, chunk_size); + __bang_add(nram_y_out, nram_y_out, nram_y, chunk_size); + } else if (flag == static_cast(0)) { + __bang_mul_scalar(nram_x_out, nram_y, h12, chunk_size); + __bang_add(nram_x_out, nram_x, nram_x_out, chunk_size); + + __bang_mul_scalar(nram_y_out, nram_x, h21, chunk_size); + __bang_add(nram_y_out, nram_y_out, nram_y, chunk_size); + } else { + __bang_mul_scalar(nram_x_out, nram_x, h11, chunk_size); + __bang_add(nram_x_out, nram_x_out, nram_y, chunk_size); + + __bang_mul_scalar(nram_y_out, nram_y, h22, chunk_size); + __bang_sub(nram_y_out, nram_y_out, nram_x, chunk_size); + } + + __memcpy(x + current_offset, nram_x_out, chunk_size * sizeof(Tdata), NRAM2GDRAM); + __memcpy(y + current_offset, nram_y_out, 
chunk_size * sizeof(Tdata), NRAM2GDRAM); + } + + if (chunk_rem > 0) { + int current_offset = core_offset + chunks * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + __memcpy(nram_y, y + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + + if (flag < static_cast(0)) { + __bang_mul_scalar(nram_x_out, nram_x, h11, chunk_rem); + __bang_mul_scalar(nram_y_out, nram_y, h12, chunk_rem); + __bang_add(nram_x_out, nram_x_out, nram_y_out, chunk_rem); + + __bang_mul_scalar(nram_y_out, nram_x, h21, chunk_rem); + __bang_mul_scalar(nram_y, nram_y, h22, chunk_rem); + __bang_add(nram_y_out, nram_y_out, nram_y, chunk_rem); + } else if (flag == static_cast(0)) { + __bang_mul_scalar(nram_x_out, nram_y, h12, chunk_rem); + __bang_add(nram_x_out, nram_x, nram_x_out, chunk_rem); + + __bang_mul_scalar(nram_y_out, nram_x, h21, chunk_rem); + __bang_add(nram_y_out, nram_y_out, nram_y, chunk_rem); + } else { + __bang_mul_scalar(nram_x_out, nram_x, h11, chunk_rem); + __bang_add(nram_x_out, nram_x_out, nram_y, chunk_rem); + + __bang_mul_scalar(nram_y_out, nram_y, h22, chunk_rem); + __bang_sub(nram_y_out, nram_y_out, nram_x, chunk_rem); + } + + __memcpy(x + current_offset, nram_x_out, chunk_rem * sizeof(Tdata), NRAM2GDRAM); + __memcpy(y + current_offset, nram_y_out, chunk_rem * sizeof(Tdata), NRAM2GDRAM); + } +} + +template +__mlu_global__ void rotmKernelStrided( + int n, + Tdata *x, + int incx, + Tdata *y, + int incy, + const Tdata *param) { + + const Tdata flag = param[0]; + if (n == 0 || (flag + static_cast(2) == static_cast(0))) { + return; + } + + Tdata h11 = static_cast(0); + Tdata h12 = static_cast(0); + Tdata h21 = static_cast(0); + Tdata h22 = static_cast(0); + + if (flag < static_cast(0)) { + h11 = param[1]; + h12 = param[3]; + h21 = param[2]; + h22 = param[4]; + } else if (flag == static_cast(0)) { + h12 = param[3]; + h21 = param[2]; + } else { + h11 = param[1]; + h22 = param[4]; + } + + const int task = taskId; + const int tasks = 
taskDim; + const int per_task = n / tasks; + const int remain = n % tasks; + const int begin = task < remain ? task * (per_task + 1) : task * per_task + remain; + const int count = per_task + (task < remain ? 1 : 0); + + for (int i = 0; i < count; ++i) { + const int index = begin + i; + const int x_idx = index * incx; + const int y_idx = index * incy; + const Tdata w = x[x_idx]; + const Tdata z = y[y_idx]; + + if (flag < static_cast(0)) { + x[x_idx] = w * h11 + z * h12; + y[y_idx] = w * h21 + z * h22; + } else if (flag == static_cast(0)) { + x[x_idx] = w + z * h12; + y[y_idx] = w * h21 + z; + } else { + x[x_idx] = w * h11 + z; + y[y_idx] = -w + h22 * z; + } + } +} diff --git a/src/infiniop/ops/rotm/cpu/rotm_cpu.cc b/src/infiniop/ops/rotm/cpu/rotm_cpu.cc new file mode 100644 index 000000000..1850a654d --- /dev/null +++ b/src/infiniop/ops/rotm/cpu/rotm_cpu.cc @@ -0,0 +1,169 @@ +#include "rotm_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::rotm::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t param_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = RotmInfo::createRotmInfo(x_desc, y_desc, param_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateRotm( + const RotmInfo &info, + Tdata *x, + Tdata *y, + const Tdata *param) { + + using Tcompute = std::conditional_t, double, float>; + + const Tcompute zero = utils::cast(0.0f); + const Tcompute two = utils::cast(2.0f); + + Tcompute sflag = utils::cast(param[0]); + + if (info.n == 0 || (sflag + two == zero)) { + return INFINI_STATUS_SUCCESS; + } + + const size_t n = info.n; + const ptrdiff_t incx = info.incx; + const ptrdiff_t incy = 
info.incy; + const ptrdiff_t kx = incx >= 0 ? 0 : utils::cast(n - 1) * (-incx); + const ptrdiff_t ky = incy >= 0 ? 0 : utils::cast(n - 1) * (-incy); + + Tcompute sh11 = zero; + Tcompute sh12 = zero; + Tcompute sh21 = zero; + Tcompute sh22 = zero; + + if (incx == incy && incx > 0) { + const ptrdiff_t nsteps = utils::cast(n) * incx; + if (sflag < zero) { + sh11 = utils::cast(param[1]); + sh12 = utils::cast(param[3]); + sh21 = utils::cast(param[2]); + sh22 = utils::cast(param[4]); + for (ptrdiff_t i = 0; i < nsteps; i += incx) { + const Tcompute w = utils::cast(x[i]); + const Tcompute z = utils::cast(y[i]); + x[i] = utils::cast(w * sh11 + z * sh12); + y[i] = utils::cast(w * sh21 + z * sh22); + } + } else if (sflag == zero) { + sh12 = utils::cast(param[3]); + sh21 = utils::cast(param[2]); + for (ptrdiff_t i = 0; i < nsteps; i += incx) { + const Tcompute w = utils::cast(x[i]); + const Tcompute z = utils::cast(y[i]); + x[i] = utils::cast(w + z * sh12); + y[i] = utils::cast(w * sh21 + z); + } + } else { + sh11 = utils::cast(param[1]); + sh22 = utils::cast(param[4]); + for (ptrdiff_t i = 0; i < nsteps; i += incx) { + const Tcompute w = utils::cast(x[i]); + const Tcompute z = utils::cast(y[i]); + x[i] = utils::cast(w * sh11 + z); + y[i] = utils::cast(-w + sh22 * z); + } + } + } else { + ptrdiff_t ix = kx; + ptrdiff_t iy = ky; + + if (sflag < zero) { + sh11 = utils::cast(param[1]); + sh12 = utils::cast(param[3]); + sh21 = utils::cast(param[2]); + sh22 = utils::cast(param[4]); + for (size_t i = 0; i < n; ++i) { + const Tcompute w = utils::cast(x[ix]); + const Tcompute z = utils::cast(y[iy]); + x[ix] = utils::cast(w * sh11 + z * sh12); + y[iy] = utils::cast(w * sh21 + z * sh22); + ix += incx; + iy += incy; + } + } else if (sflag == zero) { + sh12 = utils::cast(param[3]); + sh21 = utils::cast(param[2]); + for (size_t i = 0; i < n; ++i) { + const Tcompute w = utils::cast(x[ix]); + const Tcompute z = utils::cast(y[iy]); + x[ix] = utils::cast(w + z * sh12); + y[iy] = utils::cast(w 
* sh21 + z); + ix += incx; + iy += incy; + } + } else { + sh11 = utils::cast(param[1]); + sh22 = utils::cast(param[4]); + for (size_t i = 0; i < n; ++i) { + const Tcompute w = utils::cast(x[ix]); + const Tcompute z = utils::cast(y[iy]); + x[ix] = utils::cast(w * sh11 + z); + y[iy] = utils::cast(-w + sh22 * z); + ix += incx; + iy += incy; + } + } + } + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_ROTM(TDATA) \ + calculateRotm(_info, \ + (TDATA *)x, \ + (TDATA *)y, \ + (const TDATA *)param) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *x, + void *y, + const void *param, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_ROTM(fp16_t); + case INFINI_DTYPE_BF16: + return CALCULATE_ROTM(bf16_t); + case INFINI_DTYPE_F32: + return CALCULATE_ROTM(float); + case INFINI_DTYPE_F64: + return CALCULATE_ROTM(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_ROTM + +} // namespace op::rotm::cpu diff --git a/src/infiniop/ops/rotm/cpu/rotm_cpu.h b/src/infiniop/ops/rotm/cpu/rotm_cpu.h new file mode 100644 index 000000000..972bd4c8a --- /dev/null +++ b/src/infiniop/ops/rotm/cpu/rotm_cpu.h @@ -0,0 +1,8 @@ +#ifndef __ROTM_CPU_H__ +#define __ROTM_CPU_H__ + +#include "../rotm.h" + +DESCRIPTOR(cpu) + +#endif // __ROTM_CPU_H__ diff --git a/src/infiniop/ops/rotm/info.h b/src/infiniop/ops/rotm/info.h new file mode 100644 index 000000000..0cf44af91 --- /dev/null +++ b/src/infiniop/ops/rotm/info.h @@ -0,0 +1,50 @@ +#ifndef __ROTM_INFO_H__ +#define __ROTM_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class RotmInfo { +private: + RotmInfo() = default; + +public: + size_t n; + ptrdiff_t incx; + ptrdiff_t incy; + infiniDtype_t data_type; + + static utils::Result createRotmInfo( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + 
infiniopTensorDescriptor_t param_desc) { + + CHECK_OR_RETURN(x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(y_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(param_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = x_desc->dtype(); + + CHECK_OR_RETURN(y_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(param_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_OR_RETURN(x_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(y_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(param_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(x_desc->numel() == y_desc->numel(), INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(param_desc->numel() == 5, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(param_desc->stride(0) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES); + + auto n = x_desc->numel(); + auto incx = x_desc->stride(0); + auto incy = y_desc->stride(0); + + return utils::Result<RotmInfo>(RotmInfo{ + n, + incx, + incy, + data_type}); + } +}; + +#endif // __ROTM_INFO_H__ diff --git a/src/infiniop/ops/rotm/metax/rotm_metax.cc b/src/infiniop/ops/rotm/metax/rotm_metax.cc new file mode 100644 index 000000000..0911623e9 --- /dev/null +++ b/src/infiniop/ops/rotm/metax/rotm_metax.cc @@ -0,0 +1,74 @@ +#include "rotm_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +namespace op::rotm::metax { + +struct Descriptor::Opaque { + std::shared_ptr<device::metax::Handle::Internal> internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t param_desc) { + + auto handle = reinterpret_cast<device::metax::Handle *>(handle_); + auto result = 
RotmInfo::createRotmInfo(x_desc, y_desc, param_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *x, + void *y, + const void *param, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const int n = utils::cast(_info.n); + const int incx = utils::cast(_info.incx); + const int incy = utils::cast(_info.incy); + const infiniDtype_t data_type = _info.data_type; + + CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode(handle, HCBLAS_POINTER_MODE_DEVICE)); + + switch (data_type) { + case INFINI_DTYPE_F32: + CHECK_MCBLAS(hcblasSrotm(handle, n, (float *)x, incx, (float *)y, incy, (const float *)param)); + break; + case INFINI_DTYPE_F64: + CHECK_MCBLAS(hcblasDrotm(handle, n, (double *)x, incx, (double *)y, incy, (const double *)param)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::rotm::metax diff --git a/src/infiniop/ops/rotm/metax/rotm_metax.h b/src/infiniop/ops/rotm/metax/rotm_metax.h new file mode 100644 index 000000000..07b031336 --- /dev/null +++ b/src/infiniop/ops/rotm/metax/rotm_metax.h @@ -0,0 +1,8 @@ +#ifndef __ROTM_METAX_H__ +#define __ROTM_METAX_H__ + +#include "../rotm.h" + +DESCRIPTOR(metax) + +#endif // __ROTM_METAX_H__ diff --git a/src/infiniop/ops/rotm/operator.cc b/src/infiniop/ops/rotm/operator.cc new file mode 100644 index 000000000..fcfe2060c --- /dev/null +++ b/src/infiniop/ops/rotm/operator.cc @@ -0,0 +1,129 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/rotm.h" + +#ifdef ENABLE_CPU_API +#include "cpu/rotm_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include 
"metax/rotm_metax.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/rotm_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateRotmDescriptor( + infiniopHandle_t handle, + infiniopRotmDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t param_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::rotm::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_desc, y_desc, param_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetRotmWorkspaceSize(infiniopRotmDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopRotm( + infiniopRotmDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *x, + void *y, + const void *param, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, x, y, param, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + + 
default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyRotmDescriptor(infiniopRotmDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/rotm/rotm.h b/src/infiniop/ops/rotm/rotm.h new file mode 100644 index 000000000..65b02e3a8 --- /dev/null +++ b/src/infiniop/ops/rotm/rotm.h @@ -0,0 +1,49 @@ +#ifndef __ROTM_H__ +#define __ROTM_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::rotm::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + RotmInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + RotmInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(std::move(info)), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t param_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *x, \ + void *y, \ + const void *param, \ + void *stream) const; \ + }; \ + } + +#endif // __ROTM_H__ diff --git a/src/infiniop/ops/rotmg/bang/rotmg_bang.h 
b/src/infiniop/ops/rotmg/bang/rotmg_bang.h new file mode 100644 index 000000000..8d1c678d6 --- /dev/null +++ b/src/infiniop/ops/rotmg/bang/rotmg_bang.h @@ -0,0 +1,8 @@ +#ifndef __ROTMG_BANG_H__ +#define __ROTMG_BANG_H__ + +#include "../rotmg.h" + +DESCRIPTOR(bang) + +#endif // __ROTMG_BANG_H__ diff --git a/src/infiniop/ops/rotmg/bang/rotmg_bang.mlu b/src/infiniop/ops/rotmg/bang/rotmg_bang.mlu new file mode 100644 index 000000000..53f7134b4 --- /dev/null +++ b/src/infiniop/ops/rotmg/bang/rotmg_bang.mlu @@ -0,0 +1,95 @@ +#include "../../../devices/bang/common_bang.h" +#include "rotmg_bang.h" +#include "rotmg_bang_kernel.mlu" + +namespace op::rotmg::bang { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t d1_desc, + infiniopTensorDescriptor_t d2_desc, + infiniopTensorDescriptor_t x1_desc, + infiniopTensorDescriptor_t y1_desc, + infiniopTensorDescriptor_t param_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = RotmgInfo::createRotmgInfo(d1_desc, d2_desc, x1_desc, y1_desc, param_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateRotmg( + Tdata *d1, + Tdata *d2, + Tdata *x1, + const Tdata *y1, + Tdata *param, + cnrtQueue_t queue) { + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + k_dim.x = 1; + k_dim.y = 1; + k_dim.z = 1; + k_type = cnrtFuncTypeBlock; + + rotmgKernel<<>>( + d1, + d2, + x1, + y1, + param); + + cnrtQueueSync(queue); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_ROTMG(TDATA) \ + calculateRotmg((TDATA *)d1, \ + (TDATA *)d2, \ + (TDATA *)x1, \ + (const TDATA *)y1, \ + (TDATA *)param, \ + (cnrtQueue_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *d1, + void *d2, + void *x1, + const void *y1, + void *param, + 
void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_ROTMG(half); + case INFINI_DTYPE_F32: + return CALCULATE_ROTMG(float); + case INFINI_DTYPE_BF16: + return CALCULATE_ROTMG(bfloat16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_ROTMG + +} // namespace op::rotmg::bang diff --git a/src/infiniop/ops/rotmg/bang/rotmg_bang_kernel.mlu b/src/infiniop/ops/rotmg/bang/rotmg_bang_kernel.mlu new file mode 100644 index 000000000..a9237e2d9 --- /dev/null +++ b/src/infiniop/ops/rotmg/bang/rotmg_bang_kernel.mlu @@ -0,0 +1,156 @@ +#include "../../../devices/bang/common_bang.h" + +#include +#include + +template +__mlu_global__ void rotmgKernel( + Tdata *d1, + Tdata *d2, + Tdata *x1, + const Tdata *y1, + Tdata *param) { + + using Tcompute = std::conditional_t, double, float>; + + const Tcompute zero = static_cast(0.0f); + const Tcompute one = static_cast(1.0f); + const Tcompute two = static_cast(2.0f); + const Tcompute gam = static_cast(4096.0f); + const Tcompute gamsq = static_cast(1.67772e7f); + const Tcompute rgamsq = static_cast(5.96046e-8f); + + Tcompute d1_val = static_cast(*d1); + Tcompute d2_val = static_cast(*d2); + Tcompute x1_val = static_cast(*x1); + const Tcompute y1_val = static_cast(*y1); + + Tcompute sflag = zero; + Tcompute sh11 = zero; + Tcompute sh12 = zero; + Tcompute sh21 = zero; + Tcompute sh22 = zero; + + if (d1_val < zero) { + sflag = -one; + d1_val = zero; + d2_val = zero; + x1_val = zero; + } else { + const Tcompute sp2 = d2_val * y1_val; + if (sp2 == zero) { + param[0] = static_cast(-two); + return; + } + + const Tcompute sp1 = d1_val * x1_val; + const Tcompute sq2 = sp2 * y1_val; + const Tcompute sq1 = sp1 * x1_val; + + if (std::fabs(sq1) > std::fabs(sq2)) { + sh21 = -y1_val / x1_val; + sh12 = sp2 / sp1; + const Tcompute su = one - sh12 * sh21; + + if (su > zero) { + sflag = zero; + d1_val = d1_val / su; + d2_val = d2_val / su; + 
x1_val = x1_val * su; + } else { + sflag = -one; + sh11 = zero; + sh12 = zero; + sh21 = zero; + sh22 = zero; + d1_val = zero; + d2_val = zero; + x1_val = zero; + } + } else { + if (sq2 < zero) { + sflag = -one; + d1_val = zero; + d2_val = zero; + x1_val = zero; + } else { + sflag = one; + sh11 = sp1 / sp2; + sh22 = x1_val / y1_val; + const Tcompute su = one + sh11 * sh22; + const Tcompute stemp = d2_val / su; + d2_val = d1_val / su; + d1_val = stemp; + x1_val = y1_val * su; + } + } + + if (d1_val != zero) { + while (d1_val <= rgamsq || d1_val >= gamsq) { + if (sflag == zero) { + sh11 = one; + sh22 = one; + sflag = -one; + } else { + sh21 = -one; + sh12 = one; + sflag = -one; + } + + if (d1_val <= rgamsq) { + d1_val = d1_val * gam * gam; + x1_val = x1_val / gam; + sh11 = sh11 / gam; + sh12 = sh12 / gam; + } else { + d1_val = d1_val / (gam * gam); + x1_val = x1_val * gam; + sh11 = sh11 * gam; + sh12 = sh12 * gam; + } + } + } + + if (d2_val != zero) { + while (std::fabs(d2_val) <= rgamsq || std::fabs(d2_val) >= gamsq) { + if (sflag == zero) { + sh11 = one; + sh22 = one; + sflag = -one; + } else { + sh21 = -one; + sh12 = one; + sflag = -one; + } + + if (std::fabs(d2_val) <= rgamsq) { + d2_val = d2_val * gam * gam; + sh21 = sh21 / gam; + sh22 = sh22 / gam; + } else { + d2_val = d2_val / (gam * gam); + sh21 = sh21 * gam; + sh22 = sh22 * gam; + } + } + } + } + + if (sflag < zero) { + param[1] = static_cast(sh11); + param[2] = static_cast(sh21); + param[3] = static_cast(sh12); + param[4] = static_cast(sh22); + } else if (sflag == zero) { + param[2] = static_cast(sh21); + param[3] = static_cast(sh12); + } else { + param[1] = static_cast(sh11); + param[4] = static_cast(sh22); + } + + param[0] = static_cast(sflag); + *d1 = static_cast(d1_val); + *d2 = static_cast(d2_val); + *x1 = static_cast(x1_val); +} diff --git a/src/infiniop/ops/rotmg/cpu/rotmg_cpu.cc b/src/infiniop/ops/rotmg/cpu/rotmg_cpu.cc new file mode 100644 index 000000000..258a936bf --- /dev/null +++ 
b/src/infiniop/ops/rotmg/cpu/rotmg_cpu.cc @@ -0,0 +1,221 @@ +#include "rotmg_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +#include + +namespace op::rotmg::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t d1_desc, + infiniopTensorDescriptor_t d2_desc, + infiniopTensorDescriptor_t x1_desc, + infiniopTensorDescriptor_t y1_desc, + infiniopTensorDescriptor_t param_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = RotmgInfo::createRotmgInfo(d1_desc, d2_desc, x1_desc, y1_desc, param_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateRotmg( + Tdata *d1, + Tdata *d2, + Tdata *x1, + const Tdata *y1, + Tdata *param) { + + using Tcompute = std::conditional_t, double, float>; + + const Tcompute zero = utils::cast(0.0f); + const Tcompute one = utils::cast(1.0f); + const Tcompute two = utils::cast(2.0f); + const Tcompute gam = utils::cast(4096.0f); + const Tcompute gamsq = utils::cast(1.67772e7f); + const Tcompute rgamsq = utils::cast(5.96046e-8f); + + Tcompute d1_val = utils::cast(d1[0]); + Tcompute d2_val = utils::cast(d2[0]); + Tcompute x1_val = utils::cast(x1[0]); + const Tcompute y1_val = utils::cast(y1[0]); + + Tcompute sflag; + Tcompute sh11 = zero; + Tcompute sh12 = zero; + Tcompute sh21 = zero; + Tcompute sh22 = zero; + + if (d1_val < zero) { + sflag = -one; + d1_val = zero; + d2_val = zero; + x1_val = zero; + } else { + const Tcompute sp2 = d2_val * y1_val; + if (sp2 == zero) { + param[0] = utils::cast(-two); + return INFINI_STATUS_SUCCESS; + } + + const Tcompute sp1 = d1_val * x1_val; + const Tcompute sq2 = sp2 * y1_val; + const Tcompute sq1 = sp1 * x1_val; + + if (std::abs(sq1) > std::abs(sq2)) { + sh21 = -y1_val / x1_val; + sh12 = sp2 / sp1; + const Tcompute su = one 
- sh12 * sh21; + + if (su > zero) { + sflag = zero; + d1_val = d1_val / su; + d2_val = d2_val / su; + x1_val = x1_val * su; + } else { + sflag = -one; + sh11 = zero; + sh12 = zero; + sh21 = zero; + sh22 = zero; + d1_val = zero; + d2_val = zero; + x1_val = zero; + } + } else { + if (sq2 < zero) { + sflag = -one; + d1_val = zero; + d2_val = zero; + x1_val = zero; + } else { + sflag = one; + sh11 = sp1 / sp2; + sh22 = x1_val / y1_val; + const Tcompute su = one + sh11 * sh22; + const Tcompute stemp = d2_val / su; + d2_val = d1_val / su; + d1_val = stemp; + x1_val = y1_val * su; + } + } + + if (d1_val != zero) { + while (d1_val <= rgamsq || d1_val >= gamsq) { + if (sflag == zero) { + sh11 = one; + sh22 = one; + sflag = -one; + } else { + sh21 = -one; + sh12 = one; + sflag = -one; + } + if (d1_val <= rgamsq) { + d1_val = d1_val * gam * gam; + x1_val = x1_val / gam; + sh11 = sh11 / gam; + sh12 = sh12 / gam; + } else { + d1_val = d1_val / (gam * gam); + x1_val = x1_val * gam; + sh11 = sh11 * gam; + sh12 = sh12 * gam; + } + } + } + + if (d2_val != zero) { + while (std::abs(d2_val) <= rgamsq || std::abs(d2_val) >= gamsq) { + if (sflag == zero) { + sh11 = one; + sh22 = one; + sflag = -one; + } else { + sh21 = -one; + sh12 = one; + sflag = -one; + } + if (std::abs(d2_val) <= rgamsq) { + d2_val = d2_val * gam * gam; + sh21 = sh21 / gam; + sh22 = sh22 / gam; + } else { + d2_val = d2_val / (gam * gam); + sh21 = sh21 * gam; + sh22 = sh22 * gam; + } + } + } + } + + if (sflag < zero) { + param[1] = utils::cast(sh11); + param[2] = utils::cast(sh21); + param[3] = utils::cast(sh12); + param[4] = utils::cast(sh22); + } else if (sflag == zero) { + param[2] = utils::cast(sh21); + param[3] = utils::cast(sh12); + } else { + param[1] = utils::cast(sh11); + param[4] = utils::cast(sh22); + } + + param[0] = utils::cast(sflag); + d1[0] = utils::cast(d1_val); + d2[0] = utils::cast(d2_val); + x1[0] = utils::cast(x1_val); + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_ROTMG(TDATA) \ + 
calculateRotmg((TDATA *)d1, \ + (TDATA *)d2, \ + (TDATA *)x1, \ + (const TDATA *)y1, \ + (TDATA *)param) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *d1, + void *d2, + void *x1, + const void *y1, + void *param, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_ROTMG(fp16_t); + case INFINI_DTYPE_F32: + return CALCULATE_ROTMG(float); + case INFINI_DTYPE_F64: + return CALCULATE_ROTMG(double); + case INFINI_DTYPE_BF16: + return CALCULATE_ROTMG(bf16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_ROTMG + +} // namespace op::rotmg::cpu diff --git a/src/infiniop/ops/rotmg/cpu/rotmg_cpu.h b/src/infiniop/ops/rotmg/cpu/rotmg_cpu.h new file mode 100644 index 000000000..49acbc062 --- /dev/null +++ b/src/infiniop/ops/rotmg/cpu/rotmg_cpu.h @@ -0,0 +1,8 @@ +#ifndef __ROTMG_CPU_H__ +#define __ROTMG_CPU_H__ + +#include "../rotmg.h" + +DESCRIPTOR(cpu) + +#endif // __ROTMG_CPU_H__ diff --git a/src/infiniop/ops/rotmg/info.h b/src/infiniop/ops/rotmg/info.h new file mode 100644 index 000000000..e7fdb8532 --- /dev/null +++ b/src/infiniop/ops/rotmg/info.h @@ -0,0 +1,48 @@ +#ifndef __ROTMG_INFO_H__ +#define __ROTMG_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class RotmgInfo { +private: + RotmgInfo() = default; + +public: + infiniDtype_t data_type; + + static utils::Result createRotmgInfo( + infiniopTensorDescriptor_t d1_desc, + infiniopTensorDescriptor_t d2_desc, + infiniopTensorDescriptor_t x1_desc, + infiniopTensorDescriptor_t y1_desc, + infiniopTensorDescriptor_t param_desc) { + + CHECK_OR_RETURN(d1_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(d2_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(x1_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(y1_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(param_desc 
!= nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = d1_desc->dtype(); + + CHECK_OR_RETURN(d2_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(x1_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(y1_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(param_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_OR_RETURN(param_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + + CHECK_OR_RETURN(d1_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(d2_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(x1_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(y1_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(param_desc->numel() == 5, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(param_desc->stride(0) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES); + + return utils::Result(RotmgInfo{ + data_type}); + } +}; + +#endif // __ROTMG_INFO_H__ diff --git a/src/infiniop/ops/rotmg/metax/rotmg_metax.cc b/src/infiniop/ops/rotmg/metax/rotmg_metax.cc new file mode 100644 index 000000000..8fe2a881c --- /dev/null +++ b/src/infiniop/ops/rotmg/metax/rotmg_metax.cc @@ -0,0 +1,75 @@ +#include "rotmg_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +namespace op::rotmg::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t d1_desc, + infiniopTensorDescriptor_t d2_desc, + infiniopTensorDescriptor_t x1_desc, + infiniopTensorDescriptor_t y1_desc, + infiniopTensorDescriptor_t param_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = 
RotmgInfo::createRotmgInfo(d1_desc, d2_desc, x1_desc, y1_desc, param_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *d1, + void *d2, + void *x1, + const void *y1, + void *param, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const infiniDtype_t data_type = _info.data_type; + + CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode(handle, HCBLAS_POINTER_MODE_DEVICE)); + + switch (data_type) { + case INFINI_DTYPE_F32: + CHECK_MCBLAS(hcblasSrotmg(handle, (float *)d1, (float *)d2, (float *)x1, (const float *)y1, (float *)param)); + break; + case INFINI_DTYPE_F64: + CHECK_MCBLAS(hcblasDrotmg(handle, (double *)d1, (double *)d2, (double *)x1, (const double *)y1, (double *)param)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::rotmg::metax diff --git a/src/infiniop/ops/rotmg/metax/rotmg_metax.h b/src/infiniop/ops/rotmg/metax/rotmg_metax.h new file mode 100644 index 000000000..37c26c968 --- /dev/null +++ b/src/infiniop/ops/rotmg/metax/rotmg_metax.h @@ -0,0 +1,8 @@ +#ifndef __ROTMG_METAX_H__ +#define __ROTMG_METAX_H__ + +#include "../rotmg.h" + +DESCRIPTOR(metax) + +#endif // __ROTMG_METAX_H__ diff --git a/src/infiniop/ops/rotmg/operator.cc b/src/infiniop/ops/rotmg/operator.cc new file mode 100644 index 000000000..98c25bd5b --- /dev/null +++ b/src/infiniop/ops/rotmg/operator.cc @@ -0,0 +1,127 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/rotmg.h" + +#ifdef ENABLE_CPU_API +#include "cpu/rotmg_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/rotmg_metax.h" +#endif +#ifdef 
ENABLE_CAMBRICON_API +#include "bang/rotmg_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateRotmgDescriptor( + infiniopHandle_t handle, + infiniopRotmgDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t d1_desc, + infiniopTensorDescriptor_t d2_desc, + infiniopTensorDescriptor_t x1_desc, + infiniopTensorDescriptor_t y1_desc, + infiniopTensorDescriptor_t param_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::rotmg::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + d1_desc, d2_desc, x1_desc, y1_desc, param_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetRotmgWorkspaceSize(infiniopRotmgDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopRotmg( + infiniopRotmgDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *d1, + void *d2, + void *x1, + const void *y1, + void *param, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, d1, d2, x1, y1, param, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, 
metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyRotmgDescriptor(infiniopRotmgDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/rotmg/rotmg.h b/src/infiniop/ops/rotmg/rotmg.h new file mode 100644 index 000000000..4cfc97ef0 --- /dev/null +++ b/src/infiniop/ops/rotmg/rotmg.h @@ -0,0 +1,53 @@ +#ifndef __ROTMG_H__ +#define __ROTMG_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::rotmg::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + RotmgInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + RotmgInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(std::move(info)), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t d1_desc, \ + infiniopTensorDescriptor_t d2_desc, \ + infiniopTensorDescriptor_t x1_desc, \ + infiniopTensorDescriptor_t y1_desc, \ + infiniopTensorDescriptor_t param_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + 
void *d1, \ + void *d2, \ + void *x1, \ + const void *y1, \ + void *param, \ + void *stream) const; \ + }; \ + } + +#endif // __ROTMG_H__ diff --git a/src/infiniop/ops/scal/bang/scal_bang.h b/src/infiniop/ops/scal/bang/scal_bang.h new file mode 100644 index 000000000..726790b8a --- /dev/null +++ b/src/infiniop/ops/scal/bang/scal_bang.h @@ -0,0 +1,8 @@ +#ifndef __SCAL_BANG_H__ +#define __SCAL_BANG_H__ + +#include "../scal.h" + +DESCRIPTOR(bang) + +#endif // __SCAL_BANG_H__ diff --git a/src/infiniop/ops/scal/bang/scal_bang.mlu b/src/infiniop/ops/scal/bang/scal_bang.mlu new file mode 100644 index 000000000..a49fb3196 --- /dev/null +++ b/src/infiniop/ops/scal/bang/scal_bang.mlu @@ -0,0 +1,95 @@ +#include "../../../devices/bang/common_bang.h" +#include "scal_bang.h" +#include "scal_bang_kernel.mlu" + +namespace op::scal::bang { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t alpha_desc, + infiniopTensorDescriptor_t x_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = ScalInfo::createScalInfo(alpha_desc, x_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateScal( + const ScalInfo &info, + const Tdata *alpha, + Tdata *x, + cnrtQueue_t queue) { + + const int n = utils::cast(info.n); + const int incx = utils::cast(info.incx); + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z = 1; + k_type = cnrtFuncTypeUnion1; + + if (incx == 1) { + scalKernelContiguous<<>>( + n, + alpha, + x); + } else { + scalKernelStrided<<>>( + n, + alpha, + x, + incx); + } + + cnrtQueueSync(queue); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_SCAL(TDATA) \ + calculateScal(_info, \ + (const TDATA *)alpha, \ + (TDATA *)x, \ + (cnrtQueue_t)stream) + +infiniStatus_t 
Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *alpha, + void *x, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_SCAL(half); + case INFINI_DTYPE_F32: + return CALCULATE_SCAL(float); + case INFINI_DTYPE_BF16: + return CALCULATE_SCAL(bfloat16_t); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_SCAL + +} // namespace op::scal::bang diff --git a/src/infiniop/ops/scal/bang/scal_bang_kernel.mlu b/src/infiniop/ops/scal/bang/scal_bang_kernel.mlu new file mode 100644 index 000000000..d320d3332 --- /dev/null +++ b/src/infiniop/ops/scal/bang/scal_bang_kernel.mlu @@ -0,0 +1,75 @@ +#include "../../../devices/bang/common_bang.h" +#include "scal_bang.h" + +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_global__ void scalKernelContiguous( + int n, + const Tdata *alpha, + Tdata *x) { + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + size_t nram_usable = NRAM_MAX_SIZE - (nram_aligned - nram_buffer); + size_t max_chunk_elements = nram_usable / sizeof(Tdata); + + size_t align_elements = ALIGN_SIZE / sizeof(Tdata); + if (align_elements == 0) { + align_elements = 1; + } + int chunk_size = (int)((max_chunk_elements / align_elements) * align_elements); + + Tdata *nram_x = (Tdata *)nram_aligned; + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int core_elements = elements_per_core + (taskId < remain ? 1 : 0); + int core_offset = taskId < remain ? 
taskId * core_elements : taskId * elements_per_core + remain; + + if (core_elements <= 0) { + return; + } + + int chunks = core_elements / chunk_size; + int chunk_rem = core_elements % chunk_size; + + for (int c = 0; c < chunks; c++) { + int current_offset = core_offset + c * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + + __bang_mul_scalar(nram_x, nram_x, alpha[0], chunk_size); + + __memcpy(x + current_offset, nram_x, chunk_size * sizeof(Tdata), NRAM2GDRAM); + } + + if (chunk_rem > 0) { + int current_offset = core_offset + chunks * chunk_size; + int align_rem = ((chunk_rem + align_elements - 1) / align_elements) * align_elements; + + __memcpy(nram_x, x + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + + __bang_mul_scalar(nram_x, nram_x, alpha[0], align_rem); + + __memcpy(x + current_offset, nram_x, chunk_rem * sizeof(Tdata), NRAM2GDRAM); + } +} + +template +__mlu_global__ void scalKernelStrided( + int n, + const Tdata *alpha, + Tdata *x, + int incx) { + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int actual_tasks = elements_per_core + (taskId < remain ? 1 : 0); + int start_idx = taskId < remain ? 
taskId * actual_tasks : taskId * elements_per_core + remain; + + for (int i = start_idx; i < start_idx + actual_tasks; ++i) { + int offset = i * incx; + + x[offset] *= alpha[0]; + } +} diff --git a/src/infiniop/ops/scal/cpu/scal_cpu.cc b/src/infiniop/ops/scal/cpu/scal_cpu.cc new file mode 100644 index 000000000..3f26bf483 --- /dev/null +++ b/src/infiniop/ops/scal/cpu/scal_cpu.cc @@ -0,0 +1,81 @@ +#include "scal_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::scal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t alpha_desc, + infiniopTensorDescriptor_t x_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = ScalInfo::createScalInfo(alpha_desc, x_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateScal( + const ScalInfo &info, + const Tdata *alpha, + Tdata *x) { + + const size_t n = info.n; + const ptrdiff_t incx = info.incx; + + for (size_t i = 0; i < n; ++i) { + const ptrdiff_t idx = utils::cast(i) * incx; + + if constexpr (std::is_same_v || std::is_same_v) { + x[idx] = utils::cast(utils::cast(x[idx]) * utils::cast(alpha[0])); + } else { + x[idx] = x[idx] * alpha[0]; + } + } + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_SCAL(TDATA) \ + calculateScal(_info, \ + (const TDATA *)alpha, \ + (TDATA *)x) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *alpha, + void *x, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_SCAL(fp16_t); + case INFINI_DTYPE_F32: + return CALCULATE_SCAL(float); + case INFINI_DTYPE_F64: + return CALCULATE_SCAL(double); + case INFINI_DTYPE_BF16: + return CALCULATE_SCAL(bf16_t); + 
default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_SCAL + +} // namespace op::scal::cpu diff --git a/src/infiniop/ops/scal/cpu/scal_cpu.h b/src/infiniop/ops/scal/cpu/scal_cpu.h new file mode 100644 index 000000000..1a0f2fe6b --- /dev/null +++ b/src/infiniop/ops/scal/cpu/scal_cpu.h @@ -0,0 +1,8 @@ +#ifndef __SCAL_CPU_H__ +#define __SCAL_CPU_H__ + +#include "../scal.h" + +DESCRIPTOR(cpu) + +#endif // __SCAL_CPU_H__ diff --git a/src/infiniop/ops/scal/info.h b/src/infiniop/ops/scal/info.h new file mode 100644 index 000000000..c9c3122b7 --- /dev/null +++ b/src/infiniop/ops/scal/info.h @@ -0,0 +1,40 @@ +#ifndef __SCAL_INFO_H__ +#define __SCAL_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class ScalInfo { +private: + ScalInfo() = default; + +public: + size_t n; + ptrdiff_t incx; + infiniDtype_t data_type; + + static utils::Result createScalInfo( + infiniopTensorDescriptor_t alpha_desc, + infiniopTensorDescriptor_t x_desc) { + + CHECK_OR_RETURN(alpha_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = x_desc->dtype(); + + CHECK_OR_RETURN(alpha_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_OR_RETURN(alpha_desc->numel() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(x_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + + auto n = x_desc->numel(); + auto incx = x_desc->stride(0); + + return utils::Result(ScalInfo{ + n, + incx, + data_type}); + } +}; + +#endif // __SCAL_INFO_H__ diff --git a/src/infiniop/ops/scal/metax/scal_metax.cc b/src/infiniop/ops/scal/metax/scal_metax.cc new file mode 100644 index 000000000..5e67f0a05 --- /dev/null +++ b/src/infiniop/ops/scal/metax/scal_metax.cc @@ -0,0 +1,96 @@ +#include "scal_metax.h" +#include "../../../devices/metax/metax_common.h" +#include 
"../../../devices/metax/metax_handle.h" + +namespace op::scal::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t alpha_desc, + infiniopTensorDescriptor_t x_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = ScalInfo::createScalInfo(alpha_desc, x_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + const void *alpha, + void *x, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const int n = utils::cast(_info.n); + const int incx = utils::cast(_info.incx); + const infiniDtype_t data_type = _info.data_type; + + hpccDataType alpha_type, x_type; + hpccDataType execution_type; + + switch (data_type) { + case INFINI_DTYPE_F16: + alpha_type = x_type = HPCC_R_16F; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_BF16: + alpha_type = x_type = HPCC_R_16BF; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_F32: + alpha_type = x_type = HPCC_R_32F; + execution_type = HPCC_R_32F; + break; + case INFINI_DTYPE_F64: + alpha_type = x_type = HPCC_R_64F; + execution_type = HPCC_R_64F; + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode( + handle, + HCBLAS_POINTER_MODE_DEVICE)); + + CHECK_MCBLAS(hcblasScalEx( + handle, + n, + alpha, + alpha_type, + x, + x_type, + incx, + execution_type)); + + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::scal::metax diff --git a/src/infiniop/ops/scal/metax/scal_metax.h 
b/src/infiniop/ops/scal/metax/scal_metax.h new file mode 100644 index 000000000..1e5760ffe --- /dev/null +++ b/src/infiniop/ops/scal/metax/scal_metax.h @@ -0,0 +1,8 @@ +#ifndef __SCAL_METAX_H__ +#define __SCAL_METAX_H__ + +#include "../scal.h" + +DESCRIPTOR(metax) + +#endif // __SCAL_METAX_H__ diff --git a/src/infiniop/ops/scal/operator.cc b/src/infiniop/ops/scal/operator.cc new file mode 100644 index 000000000..33fa8618a --- /dev/null +++ b/src/infiniop/ops/scal/operator.cc @@ -0,0 +1,121 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/scal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/scal_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/scal_metax.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/scal_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateScalDescriptor( + infiniopHandle_t handle, + infiniopScalDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t alpha_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::scal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + alpha_desc, x_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetScalWorkspaceSize(infiniopScalDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopScal( + infiniopScalDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *alpha, + void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, alpha, x, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroyScalDescriptor(infiniopScalDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/scal/scal.h b/src/infiniop/ops/scal/scal.h new file mode 100644 index 000000000..e94bbe2b2 --- /dev/null +++ b/src/infiniop/ops/scal/scal.h @@ -0,0 +1,47 @@ +#ifndef __SCAL_H__ +#define __SCAL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::scal::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ScalInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + ScalInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(std::move(info)), \ 
+ _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t alpha_desc, \ + infiniopTensorDescriptor_t x_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + const void *alpha, \ + void *x, \ + void *stream) const; \ + }; \ + } + +#endif // __SCAL_H__ diff --git a/src/infiniop/ops/swap/bang/swap_bang.h b/src/infiniop/ops/swap/bang/swap_bang.h new file mode 100644 index 000000000..dd908caa2 --- /dev/null +++ b/src/infiniop/ops/swap/bang/swap_bang.h @@ -0,0 +1,8 @@ +#ifndef __SWAP_BANG_H__ +#define __SWAP_BANG_H__ + +#include "../swap.h" + +DESCRIPTOR(bang) + +#endif // __SWAP_BANG_H__ diff --git a/src/infiniop/ops/swap/bang/swap_bang.mlu b/src/infiniop/ops/swap/bang/swap_bang.mlu new file mode 100644 index 000000000..ecd4a418a --- /dev/null +++ b/src/infiniop/ops/swap/bang/swap_bang.mlu @@ -0,0 +1,96 @@ +#include "../../../devices/bang/common_bang.h" +#include "swap_bang.h" +#include "swap_bang_kernel.mlu" + +namespace op::swap::bang { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = SwapInfo::createSwapInfo(x_desc, y_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateSwap( + const SwapInfo &info, + Tdata *x, + Tdata *y, + cnrtQueue_t queue) { + + const int n = utils::cast(info.n); + const int incx = utils::cast(info.incx); + const int incy = utils::cast(info.incy); + + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + + k_dim.x = 4; + k_dim.y = 1; + k_dim.z 
= 1; + k_type = cnrtFuncTypeUnion1; + + if (incx == 1 && incy == 1) { + swapKernelContiguous<<>>( + n, + x, + y); + } else { + swapKernelStrided<<>>( + n, + x, + incx, + y, + incy); + } + + cnrtQueueSync(queue); + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_SWAP(TDATA) \ + calculateSwap(_info, \ + (TDATA *)x, \ + (TDATA *)y, \ + (cnrtQueue_t)stream) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *x, + void *y, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_SWAP(half); + case INFINI_DTYPE_BF16: + return CALCULATE_SWAP(bfloat16_t); + case INFINI_DTYPE_F32: + return CALCULATE_SWAP(float); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_SWAP + +} // namespace op::swap::bang diff --git a/src/infiniop/ops/swap/bang/swap_bang_kernel.mlu b/src/infiniop/ops/swap/bang/swap_bang_kernel.mlu new file mode 100644 index 000000000..6a7b86e2c --- /dev/null +++ b/src/infiniop/ops/swap/bang/swap_bang_kernel.mlu @@ -0,0 +1,75 @@ +#include "../../../devices/bang/common_bang.h" +#include "swap_bang.h" + +__nram__ char nram_buffer[NRAM_MAX_SIZE]; + +template +__mlu_global__ void swapKernelContiguous( + int n, + Tdata *x, + Tdata *y) { + + char *nram_aligned = (char *)(((size_t)nram_buffer + ALIGN_SIZE - 1) & ~(ALIGN_SIZE - 1)); + + size_t nram_usable = NRAM_MAX_SIZE - (nram_aligned - nram_buffer); + size_t max_chunk_elements = nram_usable / (2 * sizeof(Tdata)); + + size_t align_elements = ALIGN_SIZE / sizeof(Tdata); + if (align_elements == 0) { + align_elements = 1; + } + int chunk_size = (int)((max_chunk_elements / align_elements) * align_elements); + + Tdata *nram_x = (Tdata *)nram_aligned; + Tdata *nram_y = nram_x + chunk_size; + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int core_elements = elements_per_core + (taskId < remain ? 1 : 0); + int core_offset = taskId < remain ? 
taskId * core_elements : taskId * elements_per_core + remain; + + if (core_elements <= 0) { + return; + } + + int chunks = core_elements / chunk_size; + int chunk_rem = core_elements % chunk_size; + + for (int c = 0; c < chunks; c++) { + int current_offset = core_offset + c * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + __memcpy(nram_y, y + current_offset, chunk_size * sizeof(Tdata), GDRAM2NRAM); + __memcpy(x + current_offset, nram_y, chunk_size * sizeof(Tdata), NRAM2GDRAM); + __memcpy(y + current_offset, nram_x, chunk_size * sizeof(Tdata), NRAM2GDRAM); + } + + if (chunk_rem > 0) { + int current_offset = core_offset + chunks * chunk_size; + __memcpy(nram_x, x + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + __memcpy(nram_y, y + current_offset, chunk_rem * sizeof(Tdata), GDRAM2NRAM); + __memcpy(x + current_offset, nram_y, chunk_rem * sizeof(Tdata), NRAM2GDRAM); + __memcpy(y + current_offset, nram_x, chunk_rem * sizeof(Tdata), NRAM2GDRAM); + } +} + +template +__mlu_global__ void swapKernelStrided( + int n, + Tdata *x, + int incx, + Tdata *y, + int incy) { + + int elements_per_core = n / taskDim; + int remain = n % taskDim; + int actual_tasks = elements_per_core + (taskId < remain ? 1 : 0); + int start_idx = taskId < remain ? 
taskId * actual_tasks : taskId * elements_per_core + remain; + + for (int i = start_idx; i < start_idx + actual_tasks; ++i) { + int x_idx = i * incx; + int y_idx = i * incy; + Tdata temp = x[x_idx]; + x[x_idx] = y[y_idx]; + y[y_idx] = temp; + } +} diff --git a/src/infiniop/ops/swap/cpu/swap_cpu.cc b/src/infiniop/ops/swap/cpu/swap_cpu.cc new file mode 100644 index 000000000..1f55bcc98 --- /dev/null +++ b/src/infiniop/ops/swap/cpu/swap_cpu.cc @@ -0,0 +1,81 @@ +#include "swap_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::swap::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = SwapInfo::createSwapInfo(x_desc, y_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateSwap( + const SwapInfo &info, + Tdata *x, + Tdata *y) { + + const size_t n = info.n; + const ptrdiff_t incx = info.incx; + const ptrdiff_t incy = info.incy; + + for (size_t i = 0; i < n; ++i) { + const ptrdiff_t x_idx = utils::cast(i) * incx; + const ptrdiff_t y_idx = utils::cast(i) * incy; + Tdata temp = x[x_idx]; + x[x_idx] = y[y_idx]; + y[y_idx] = temp; + } + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_SWAP(TDATA) \ + calculateSwap(_info, \ + (TDATA *)x, \ + (TDATA *)y) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *x, + void *y, + void *stream) const { + + (void)workspace; + (void)workspace_size; + (void)stream; + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + return CALCULATE_SWAP(fp16_t); + case INFINI_DTYPE_BF16: + return CALCULATE_SWAP(bf16_t); + case INFINI_DTYPE_F32: + return CALCULATE_SWAP(float); + case INFINI_DTYPE_F64: + return 
CALCULATE_SWAP(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +#undef CALCULATE_SWAP + +} // namespace op::swap::cpu diff --git a/src/infiniop/ops/swap/cpu/swap_cpu.h b/src/infiniop/ops/swap/cpu/swap_cpu.h new file mode 100644 index 000000000..b37295614 --- /dev/null +++ b/src/infiniop/ops/swap/cpu/swap_cpu.h @@ -0,0 +1,8 @@ +#ifndef __SWAP_CPU_H__ +#define __SWAP_CPU_H__ + +#include "../swap.h" + +DESCRIPTOR(cpu) + +#endif // __SWAP_CPU_H__ diff --git a/src/infiniop/ops/swap/info.h b/src/infiniop/ops/swap/info.h new file mode 100644 index 000000000..0dad381f7 --- /dev/null +++ b/src/infiniop/ops/swap/info.h @@ -0,0 +1,44 @@ +#ifndef __SWAP_INFO_H__ +#define __SWAP_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" + +class SwapInfo { +private: + SwapInfo() = default; + +public: + size_t n; + ptrdiff_t incx; + ptrdiff_t incy; + infiniDtype_t data_type; + + static utils::Result createSwapInfo( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + + CHECK_OR_RETURN(x_desc != nullptr, INFINI_STATUS_NULL_POINTER); + CHECK_OR_RETURN(y_desc != nullptr, INFINI_STATUS_NULL_POINTER); + + auto data_type = x_desc->dtype(); + + CHECK_OR_RETURN(y_desc->dtype() == data_type, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_OR_RETURN(x_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(y_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(x_desc->numel() == y_desc->numel(), INFINI_STATUS_BAD_TENSOR_SHAPE); + + auto n = x_desc->numel(); + auto incx = x_desc->stride(0); + auto incy = y_desc->stride(0); + + return utils::Result(SwapInfo{ + n, + incx, + incy, + data_type}); + } +}; + +#endif // __SWAP_INFO_H__ diff --git a/src/infiniop/ops/swap/metax/swap_metax.cc b/src/infiniop/ops/swap/metax/swap_metax.cc new file mode 100644 index 000000000..b84411df6 --- /dev/null +++ 
b/src/infiniop/ops/swap/metax/swap_metax.cc @@ -0,0 +1,72 @@ +#include "swap_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +namespace op::swap::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + + auto handle = reinterpret_cast(handle_); + auto result = SwapInfo::createSwapInfo(x_desc, y_desc); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + result.take(), + 0, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *x, + void *y, + void *stream) const { + + (void)workspace; + (void)workspace_size; + + const int n = utils::cast(_info.n); + const int incx = utils::cast(_info.incx); + const int incy = utils::cast(_info.incy); + const infiniDtype_t data_type = _info.data_type; + + CHECK_STATUS(_opaque->internal->useMcblas( + (hcStream_t)stream, + [&](hcblasHandle_t handle) { + CHECK_MCBLAS(hcblasSetPointerMode(handle, HCBLAS_POINTER_MODE_DEVICE)); + + switch (data_type) { + case INFINI_DTYPE_F32: + CHECK_MCBLAS(hcblasSswap(handle, n, (float *)x, incx, (float *)y, incy)); + break; + case INFINI_DTYPE_F64: + CHECK_MCBLAS(hcblasDswap(handle, n, (double *)x, incx, (double *)y, incy)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::swap::metax diff --git a/src/infiniop/ops/swap/metax/swap_metax.h b/src/infiniop/ops/swap/metax/swap_metax.h new file mode 100644 index 000000000..db8817c9a --- /dev/null +++ b/src/infiniop/ops/swap/metax/swap_metax.h @@ -0,0 +1,8 @@ +#ifndef __SWAP_METAX_H__ +#define 
__SWAP_METAX_H__ + +#include "../swap.h" + +DESCRIPTOR(metax) + +#endif // __SWAP_METAX_H__ diff --git a/src/infiniop/ops/swap/operator.cc b/src/infiniop/ops/swap/operator.cc new file mode 100644 index 000000000..22d688021 --- /dev/null +++ b/src/infiniop/ops/swap/operator.cc @@ -0,0 +1,121 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/swap.h" + +#ifdef ENABLE_CPU_API +#include "cpu/swap_cpu.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/swap_metax.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/swap_bang.h" +#endif + +__INFINI_C infiniStatus_t infiniopCreateSwapDescriptor( + infiniopHandle_t handle, + infiniopSwapDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::swap::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + x_desc, y_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C infiniStatus_t infiniopGetSwapWorkspaceSize(infiniopSwapDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__INFINI_C infiniStatus_t infiniopSwap( + infiniopSwapDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *x, + void *y, + void *stream) { + +#define 
CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, x, y, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C infiniStatus_t infiniopDestroySwapDescriptor(infiniopSwapDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/swap/swap.h b/src/infiniop/ops/swap/swap.h new file mode 100644 index 000000000..f9eadfff9 --- /dev/null +++ b/src/infiniop/ops/swap/swap.h @@ -0,0 +1,47 @@ +#ifndef __SWAP_H__ +#define __SWAP_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::swap::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + SwapInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + SwapInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(std::move(info)), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + 
infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t y_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *x, \ + void *y, \ + void *stream) const; \ + }; \ + } + +#endif // __SWAP_H__ diff --git a/src/infinirt/bang/infinirt_bang.cc b/src/infinirt/bang/infinirt_bang.cc index 5384add19..fe43e15cf 100644 --- a/src/infinirt/bang/infinirt_bang.cc +++ b/src/infinirt/bang/infinirt_bang.cc @@ -172,4 +172,12 @@ infiniStatus_t graphLuanch(infinirtGraphExec_t graph_exec, infinirtStream_t stre return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } +infiniStatus_t getMemInfo(int device_id, size_t *free_bytes, size_t *total_bytes) { + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +infiniStatus_t getDeviceResourceSnapshot(int device_id, infinirtDeviceResourceSnapshot_t *snapshot) { + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + } // namespace infinirt::bang diff --git a/test/infinicore/framework/base.py b/test/infinicore/framework/base.py index 80dcb3eb1..e57e8ec11 100644 --- a/test/infinicore/framework/base.py +++ b/test/infinicore/framework/base.py @@ -80,6 +80,9 @@ def run_tests(self, devices, test_func, test_type="Test"): print(f"Testing {test_type} on {InfiniDeviceNames[device]}") print(f"{'='*60}") + # Keep InfiniCore's runtime aligned with the selected test device. 
+ infinicore.set_device(infinicore.device(torch_device_map[device], 0)) + for test_case in self.test_cases: try: print(f"{test_case}") diff --git a/test/infinicore/ops/asum.py b/test/infinicore/ops/asum.py new file mode 100644 index 000000000..26c25d699 --- /dev/null +++ b/test/infinicore/ops/asum.py @@ -0,0 +1,114 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) + +import infinicore + +# ======================================================================= +# Test cases format: (shape, x_strides_or_None) +# ======================================================================= + +_TEST_CASES_DATA = [ + ((13,), None), + ((13,), (10,)), + ((16,), None), + ((16,), (4,)), + ((255,), None), + ((5632,), None), +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-3, "rtol": 1e-2}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}, + infinicore.float64: {"atol": 1e-9, "rtol": 1e-6}, + infinicore.bfloat16: {"atol": 1e-2, "rtol": 5e-2}, +} + +_TENSOR_DTYPES = [ + # infinicore.float16, + infinicore.float32, + # infinicore.float64, + # infinicore.bfloat16, +] + + +def torch_asum(x, *, out=None): + def _asum(x, out): + out.copy_(torch.sum(x.abs())) + + if out is None: + out = torch.empty(1, dtype=x.dtype, device=x.device) + + _asum(x, out) + return out + + +def parse_test_cases(): + test_cases = [] + for data in _TEST_CASES_DATA: + shape = data[0] + x_strides = data[1] if len(data) > 1 else None + + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + x_spec = TensorSpec.from_tensor(shape, x_strides, dtype) + out_spec = TensorSpec.from_tensor((), None, dtype) + + test_cases.append( + TestCase( + inputs=[x_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tol, + description="asum - OUT_OF_PLACE", + ) + ) + + test_cases.append( + TestCase( + 
inputs=[x_spec], + kwargs={}, + output_spec=out_spec, + comparison_target="out", + tolerance=tol, + description="asum - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 asum operator test""" + + def __init__(self): + super().__init__("Asum") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch_asum(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.asum(*args, **kwargs) + + +def main(): + """Main entry point""" + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/axpy.py b/test/infinicore/ops/axpy.py new file mode 100644 index 000000000..cf37c8100 --- /dev/null +++ b/test/infinicore/ops/axpy.py @@ -0,0 +1,92 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) +from framework.tensor import TensorInitializer + +import infinicore + +_TEST_CASES_DATA = [ + ((3,), None, None), + ((8,), (2,), (3,)), + ((32,), None, (2,)), + ((257,), (3,), None), + ((65535,), None, None), +] + +_TENSOR_DTYPES = [ + infinicore.float16, + infinicore.float32, + # infinicore.float64, + infinicore.bfloat16, +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-3, "rtol": 1e-3}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-5}, + infinicore.float64: {"atol": 1e-9, "rtol": 1e-9}, + infinicore.bfloat16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def torch_axpy(alpha, x, y): + y.add_(x, alpha=alpha.item()) + + return y + + +def parse_test_cases(): + test_cases = [] + for shape, x_strides, y_strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + alpha_spec = TensorSpec.from_tensor( + (), None, dtype, init_mode=TensorInitializer.ONES + ) + x_spec = 
TensorSpec.from_tensor(shape, x_strides, dtype) + y_spec = TensorSpec.from_tensor(shape, y_strides, dtype) + + test_cases.append( + TestCase( + inputs=[alpha_spec, x_spec, y_spec], + kwargs={}, + output_spec=None, + comparison_target=2, + tolerance=tol, + description="axpy - INPLACE", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 axpy operator test""" + + def __init__(self): + super().__init__("Axpy") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch_axpy(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.axpy(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/blas_amax.py b/test/infinicore/ops/blas_amax.py new file mode 100644 index 000000000..7e0412c9a --- /dev/null +++ b/test/infinicore/ops/blas_amax.py @@ -0,0 +1,100 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) +from framework.tensor import TensorInitializer + +import infinicore + +_TEST_CASES_DATA = [ + ((3,), None), + ((8,), (2,)), + ((32,), None), + ((257,), (3,)), + ((65535,), None), +] + +_TENSOR_DTYPES = [ + # infinicore.float16, + infinicore.float32, + # infinicore.float64, + # infinicore.bfloat16, +] + +_TOLERANCE = {"atol": 0, "rtol": 0} + + +def torch_blas_amax(x, *, out=None): + result = torch.argmax(x.abs()).to(torch.int32) + 1 + if out is None: + return result + + out.copy_(result) + return out + + +def parse_test_cases(): + test_cases = [] + for shape, x_strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + x_spec = TensorSpec.from_tensor(shape, x_strides, dtype) + out_spec = TensorSpec.from_tensor( + (), None, infinicore.int32, init_mode=TensorInitializer.ZEROS + ) + + 
test_cases.append( + TestCase( + inputs=[x_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=_TOLERANCE, + description="blas_amax - OUT_OF_PLACE", + ) + ) + + test_cases.append( + TestCase( + inputs=[x_spec], + kwargs={}, + output_spec=out_spec, + comparison_target="out", + tolerance=_TOLERANCE, + description="blas_amax - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 amax operator test""" + + def __init__(self): + super().__init__("BlasAmax") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch_blas_amax(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.blas_amax(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/blas_amin.py b/test/infinicore/ops/blas_amin.py new file mode 100644 index 000000000..6063508c5 --- /dev/null +++ b/test/infinicore/ops/blas_amin.py @@ -0,0 +1,100 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) +from framework.tensor import TensorInitializer + +import infinicore + +_TEST_CASES_DATA = [ + ((3,), None), + ((8,), (2,)), + ((32,), None), + ((257,), (3,)), + ((65535,), None), +] + +_TENSOR_DTYPES = [ + # infinicore.float16, + infinicore.float32, + # infinicore.float64, + # infinicore.bfloat16, +] + +_TOLERANCE = {"atol": 0, "rtol": 0} + + +def torch_blas_amin(x, *, out=None): + result = torch.argmin(x.abs()).to(torch.int32) + 1 + if out is None: + return result + + out.copy_(result) + return out + + +def parse_test_cases(): + test_cases = [] + for shape, x_strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + x_spec = TensorSpec.from_tensor(shape, x_strides, dtype) + out_spec = 
TensorSpec.from_tensor( + (), None, infinicore.int32, init_mode=TensorInitializer.ZEROS + ) + + test_cases.append( + TestCase( + inputs=[x_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=_TOLERANCE, + description="blas_amin - OUT_OF_PLACE", + ) + ) + + test_cases.append( + TestCase( + inputs=[x_spec], + kwargs={}, + output_spec=out_spec, + comparison_target="out", + tolerance=_TOLERANCE, + description="blas_amin - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 amin operator test""" + + def __init__(self): + super().__init__("BlasAmin") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch_blas_amin(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.blas_amin(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/blas_copy.py b/test/infinicore/ops/blas_copy.py new file mode 100644 index 000000000..62b9b81a8 --- /dev/null +++ b/test/infinicore/ops/blas_copy.py @@ -0,0 +1,87 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) + +import infinicore + +_TEST_CASES_DATA = [ + ((3,), None, None), + ((8,), (2,), (3,)), + ((32,), None, (2,)), + ((257,), (3,), None), + ((65535,), None, None), +] + +_TENSOR_DTYPES = [ + # infinicore.float16, + infinicore.float32, + # infinicore.float64, + # infinicore.bfloat16, +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-3, "rtol": 1e-3}, + infinicore.float32: {"atol": 1e-7, "rtol": 1e-7}, + infinicore.float64: {"atol": 1e-15, "rtol": 1e-15}, + infinicore.bfloat16: {"atol": 5e-3, "rtol": 1e-2}, +} + + +def torch_blas_copy(x, y): + y.copy_(x) + return y + + +def parse_test_cases(): + test_cases = 
[] + for shape, x_strides, y_strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + x_spec = TensorSpec.from_tensor(shape, x_strides, dtype) + y_spec = TensorSpec.from_tensor(shape, y_strides, dtype) + + test_cases.append( + TestCase( + inputs=[x_spec, y_spec], + kwargs={}, + output_spec=None, + comparison_target=1, + tolerance=tol, + description="blas_copy - INPLACE", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 copy operator test""" + + def __init__(self): + super().__init__("BlasCopy") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch_blas_copy(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.blas_copy(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/blas_dot.py b/test/infinicore/ops/blas_dot.py new file mode 100644 index 000000000..edc4e9ff5 --- /dev/null +++ b/test/infinicore/ops/blas_dot.py @@ -0,0 +1,111 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) +from framework.tensor import TensorInitializer + +import infinicore + +_TEST_CASES_DATA = [ + ((3,), None, None), + ((8,), (2,), (3,)), + ((32,), None, (2,)), + ((257,), (3,), None), + ((65535,), None, None), +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-3, "rtol": 1e-3}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-5}, + infinicore.float64: {"atol": 1e-9, "rtol": 1e-9}, + infinicore.bfloat16: {"atol": 1e-2, "rtol": 1e-2}, +} + +_TENSOR_DTYPES = [ + infinicore.float16, + infinicore.float32, + # infinicore.float64, + infinicore.bfloat16, +] + + +def torch_blas_dot(x, y, *, out=None): + if x.dtype in 
(torch.float16, torch.bfloat16): + result = torch.dot(x.float(), y.float()).to(x.dtype) + else: + result = torch.dot(x, y) + + if out is None: + return result + + out.copy_(result) + return out + + +def parse_test_cases(): + test_cases = [] + for shape, x_strides, y_strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + x_spec = TensorSpec.from_tensor(shape, x_strides, dtype) + y_spec = TensorSpec.from_tensor(shape, y_strides, dtype) + out_spec = TensorSpec.from_tensor( + (), None, dtype, init_mode=TensorInitializer.ZEROS + ) + + test_cases.append( + TestCase( + inputs=[x_spec, y_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tol, + description="blas_dot - OUT_OF_PLACE", + ) + ) + + test_cases.append( + TestCase( + inputs=[x_spec, y_spec], + kwargs={}, + output_spec=out_spec, + comparison_target="out", + tolerance=tol, + description="blas_dot - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 dot operator test""" + + def __init__(self): + super().__init__("BlasDot") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch_blas_dot(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.blas_dot(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/nrm2.py b/test/infinicore/ops/nrm2.py new file mode 100644 index 000000000..ca5d4a5fc --- /dev/null +++ b/test/infinicore/ops/nrm2.py @@ -0,0 +1,104 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) + +import infinicore + +_TEST_CASES_DATA = [ + ((13,), None), + ((13,), (10,)), + ((5632,), None), + ((5632,), 
(5,)), + ((16,), (4,)), + ((5632,), (32,)), +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-3, "rtol": 1e-3}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-5}, + infinicore.float64: {"atol": 1e-7, "rtol": 1e-7}, + infinicore.bfloat16: {"atol": 1e-2, "rtol": 1e-2}, +} + +_TENSOR_DTYPES = [ + infinicore.float16, + infinicore.float32, + # infinicore.float64, + infinicore.bfloat16, +] + + +def torch_nrm2(x, *, out=None): + result = torch.norm(x, p=2) + if out is None: + return result + + out.copy_(result) + return out + + +def parse_test_cases(): + test_cases = [] + for shape, x_strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + x_spec = TensorSpec.from_tensor(shape, x_strides, dtype) + out_spec = TensorSpec.from_tensor((), None, dtype) + + test_cases.append( + TestCase( + inputs=[x_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tol, + description="nrm2 - OUT_OF_PLACE", + ) + ) + + test_cases.append( + TestCase( + inputs=[x_spec], + kwargs={}, + output_spec=out_spec, + comparison_target="out", + tolerance=tol, + description="nrm2 - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 nrm2 operator test""" + + def __init__(self): + super().__init__("Nrm2") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch_nrm2(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.nrm2(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/rot.py b/test/infinicore/ops/rot.py new file mode 100644 index 000000000..bace043ee --- /dev/null +++ b/test/infinicore/ops/rot.py @@ -0,0 +1,107 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import torch +from framework 
import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) +from framework.tensor import TensorInitializer + +import infinicore + +_TEST_CASES_DATA = [ + ((13,), None, None), + ((13,), (10,), (10,)), + ((5632,), None, None), + ((5632,), (5,), (5,)), + ((16,), (4,), (4,)), + ((5632,), (32,), (32,)), +] + +_TENSOR_DTYPES = [ + infinicore.float16, + infinicore.float32, + # infinicore.float64, + infinicore.bfloat16, +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-3, "rtol": 1e-3}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-5}, + infinicore.float64: {"atol": 1e-9, "rtol": 1e-9}, + infinicore.bfloat16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def torch_rot(x, y, c, s): + x0 = x.clone() + y0 = y.clone() + x.copy_(c * x0 + s * y0) + y.copy_(c * y0 - s * x0) + return x, y + + +def parse_test_cases(): + test_cases = [] + for shape, x_strides, y_strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + x_spec = TensorSpec.from_tensor(shape, x_strides, dtype) + y_spec = TensorSpec.from_tensor(shape, y_strides, dtype) + c_spec = TensorSpec.from_tensor( + (), + None, + dtype, + init_mode=TensorInitializer.MANUAL, + set_tensor=torch.tensor(0.6), + ) + s_spec = TensorSpec.from_tensor( + (), + None, + dtype, + init_mode=TensorInitializer.MANUAL, + set_tensor=torch.tensor(0.8), + ) + + test_cases.append( + TestCase( + inputs=[x_spec, y_spec, c_spec, s_spec], + kwargs={}, + comparison_target=[0, 1], + tolerance=tol, + output_count=2, + description="rot - INPLACE", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 rot operator test""" + + def __init__(self): + super().__init__("Rot") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch_rot(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.rot(*args, **kwargs) + + +def main(): + runner = 
GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/rotg.py b/test/infinicore/ops/rotg.py new file mode 100644 index 000000000..3b269925b --- /dev/null +++ b/test/infinicore/ops/rotg.py @@ -0,0 +1,145 @@ +import math +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) +from framework.tensor import TensorInitializer + +import infinicore + +_TEST_CASES_DATA = [ + (0.0, 0.0), + (3.0, 4.0), + (-2.5, 5.0), + (7.0, -1.5), + (-3.2, -8.4), +] + +_TENSOR_DTYPES = [ + # infinicore.float16, + infinicore.float32, + # infinicore.float64, + # infinicore.bfloat16, +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-3, "rtol": 1e-3}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-5}, + infinicore.float64: {"atol": 1e-7, "rtol": 1e-7}, + infinicore.bfloat16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def torch_rotg(a, b, c, s): + a0 = a.item() + b0 = b.item() + anorm = abs(a0) + bnorm = abs(b0) + if bnorm == 0.0: + a.fill_(a0) + b.zero_() + c.fill_(1.0) + s.zero_() + return a, b, c, s + if anorm == 0.0: + a.fill_(b0) + b.fill_(1.0) + c.zero_() + s.fill_(1.0) + return a, b, c, s + + sigma = math.copysign(1.0, a0 if anorm > bnorm else b0) + r = sigma * math.hypot(a0, b0) + c0 = a0 / r + s0 = b0 / r + if anorm > bnorm: + z = s0 + elif c0 != 0.0: + z = 1.0 / c0 + else: + z = 1.0 + + a.fill_(r) + b.fill_(z) + c.fill_(c0) + s.fill_(s0) + return a, b, c, s + + +def parse_test_cases(): + test_cases = [] + for a_value, b_value in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + a_spec = TensorSpec.from_tensor( + (1,), + None, + dtype, + init_mode=TensorInitializer.MANUAL, + set_tensor=torch.tensor([a_value]), + ) + b_spec = TensorSpec.from_tensor( + (1,), + None, + dtype, + 
init_mode=TensorInitializer.MANUAL, + set_tensor=torch.tensor([b_value]), + ) + c_spec = TensorSpec.from_tensor( + (), + None, + dtype, + init_mode=TensorInitializer.ZEROS, + ) + s_spec = TensorSpec.from_tensor( + (), + None, + dtype, + init_mode=TensorInitializer.ZEROS, + ) + + test_cases.append( + TestCase( + inputs=[a_spec, b_spec, c_spec, s_spec], + kwargs={}, + comparison_target=[0, 1, 2, 3], + tolerance=tol, + output_count=4, + description="rotg - INPLACE", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 rotg operator test""" + + def __init__(self): + super().__init__("Rotg") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch_rotg(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.rotg(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/rotm.py b/test/infinicore/ops/rotm.py new file mode 100644 index 000000000..36c3ca019 --- /dev/null +++ b/test/infinicore/ops/rotm.py @@ -0,0 +1,110 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) +from framework.tensor import TensorInitializer + +import infinicore + +_TEST_CASES_DATA = [ + ((13,), None, None, (-1.0, 1.2, -0.3, 0.4, 0.8)), + ((13,), (10,), (10,), (0.0, 0.0, -0.25, 0.5, 0.0)), + ((5632,), None, None, (1.0, 1.1, 0.0, 0.0, 0.9)), + ((5632,), (5,), (5,), (-2.0, 0.0, 0.0, 0.0, 0.0)), +] + +_TENSOR_DTYPES = [ + # infinicore.float16, + infinicore.float32, + # infinicore.float64, + # infinicore.bfloat16, +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-3, "rtol": 1e-3}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-5}, + infinicore.float64: {"atol": 1e-9, "rtol": 1e-9}, + infinicore.bfloat16: 
{"atol": 1e-2, "rtol": 1e-2}, +} + + +def torch_rotm(x, y, param): + sflag, sh11, sh21, sh12, sh22 = param + if sflag == -2.0: + return x, y + + w = x.clone() + z = y.clone() + + if sflag < 0.0: + x.copy_(w * sh11 + z * sh12) + y.copy_(w * sh21 + z * sh22) + elif sflag == 0.0: + x.copy_(w + z * sh12) + y.copy_(w * sh21 + z) + else: + x.copy_(w * sh11 + z) + y.copy_(-w + sh22 * z) + return x, y + + +def parse_test_cases(): + test_cases = [] + for shape, x_strides, y_strides, param in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + x_spec = TensorSpec.from_tensor(shape, x_strides, dtype) + y_spec = TensorSpec.from_tensor(shape, y_strides, dtype) + param_spec = TensorSpec.from_tensor( + (5,), + None, + dtype, + init_mode=TensorInitializer.MANUAL, + set_tensor=torch.tensor(param), + ) + + test_cases.append( + TestCase( + inputs=[x_spec, y_spec, param_spec], + kwargs={}, + comparison_target=[0, 1], + tolerance=tol, + output_count=2, + description="rotm - INPLACE", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 rotm operator test""" + + def __init__(self): + super().__init__("Rotm") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch_rotm(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.rotm(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/rotmg.py b/test/infinicore/ops/rotmg.py new file mode 100644 index 000000000..52fa1c4c3 --- /dev/null +++ b/test/infinicore/ops/rotmg.py @@ -0,0 +1,233 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) +from framework.tensor import TensorInitializer + 
+import infinicore + +_TEST_CASES_DATA = [ + (1.0, 2.0, 3.0, 4.0), + (2.5, 0.5, -1.2, 0.8), + (3.0, 4.0, 0.0, 2.0), + (1.5, 1.5, 2.0, -3.0), +] + +_TENSOR_DTYPES = [ + # infinicore.float16, + infinicore.float32, + # infinicore.float64, + # infinicore.bfloat16, +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-3, "rtol": 1e-3}, + infinicore.float32: {"atol": 1e-7, "rtol": 1e-7}, + infinicore.float64: {"atol": 1e-12, "rtol": 1e-12}, + infinicore.bfloat16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def _rotmg_values(d1, d2, x1, y1): + zero = 0.0 + one = 1.0 + two = 2.0 + gam = 4096.0 + gamsq = 1.67772e7 + rgamsq = 5.96046e-8 + + param = [0.0] * 5 + sh11 = sh12 = sh21 = sh22 = 0.0 + + if d1 < zero: + sflag = -one + d1 = d2 = x1 = zero + else: + sp2 = d2 * y1 + if sp2 == zero: + param[0] = -two + return d1, d2, x1, param + + sp1 = d1 * x1 + sq2 = sp2 * y1 + sq1 = sp1 * x1 + + if abs(sq1) > abs(sq2): + sh21 = -y1 / x1 + sh12 = sp2 / sp1 + su = one - sh12 * sh21 + if su > zero: + sflag = zero + d1 = d1 / su + d2 = d2 / su + x1 = x1 * su + else: + sflag = -one + sh11 = sh12 = sh21 = sh22 = zero + d1 = d2 = x1 = zero + else: + if sq2 < zero: + sflag = -one + d1 = d2 = x1 = zero + else: + sflag = one + sh11 = sp1 / sp2 + sh22 = x1 / y1 + su = one + sh11 * sh22 + stemp = d2 / su + d2 = d1 / su + d1 = stemp + x1 = y1 * su + + if d1 != zero: + while d1 <= rgamsq or d1 >= gamsq: + if sflag == zero: + sh11 = one + sh22 = one + sflag = -one + else: + sh21 = -one + sh12 = one + sflag = -one + if d1 <= rgamsq: + d1 = d1 * gam * gam + x1 = x1 / gam + sh11 = sh11 / gam + sh12 = sh12 / gam + else: + d1 = d1 / (gam * gam) + x1 = x1 * gam + sh11 = sh11 * gam + sh12 = sh12 * gam + + if d2 != zero: + while abs(d2) <= rgamsq or abs(d2) >= gamsq: + if sflag == zero: + sh11 = one + sh22 = one + sflag = -one + else: + sh21 = -one + sh12 = one + sflag = -one + if abs(d2) <= rgamsq: + d2 = d2 * gam * gam + sh21 = sh21 / gam + sh22 = sh22 / gam + else: + d2 = d2 / (gam * gam) + sh21 = sh21 * 
gam + sh22 = sh22 * gam + + if sflag < zero: + param[1] = sh11 + param[2] = sh21 + param[3] = sh12 + param[4] = sh22 + elif sflag == zero: + param[2] = sh21 + param[3] = sh12 + else: + param[1] = sh11 + param[4] = sh22 + + param[0] = sflag + return d1, d2, x1, param + + +def torch_rotmg(d1, d2, x1, y1, param): + out_d1, out_d2, out_x1, out_param = _rotmg_values( + d1.item(), d2.item(), x1.item(), y1.item() + ) + d1.fill_(out_d1) + d2.fill_(out_d2) + x1.fill_(out_x1) + param.copy_(torch.tensor(out_param, dtype=param.dtype, device=param.device)) + return d1, d2, x1, param + + +def parse_test_cases(): + test_cases = [] + for d1_value, d2_value, x1_value, y1_value in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + d1_spec = TensorSpec.from_tensor( + (1,), + None, + dtype, + init_mode=TensorInitializer.MANUAL, + set_tensor=torch.tensor([d1_value]), + ) + d2_spec = TensorSpec.from_tensor( + (1,), + None, + dtype, + init_mode=TensorInitializer.MANUAL, + set_tensor=torch.tensor([d2_value]), + ) + x1_spec = TensorSpec.from_tensor( + (1,), + None, + dtype, + init_mode=TensorInitializer.MANUAL, + set_tensor=torch.tensor([x1_value]), + ) + y1_spec = TensorSpec.from_tensor( + (1,), + None, + dtype, + init_mode=TensorInitializer.MANUAL, + set_tensor=torch.tensor([y1_value]), + ) + param_spec = TensorSpec.from_tensor( + (5,), + None, + dtype, + init_mode=TensorInitializer.ZEROS, + ) + + test_cases.append( + TestCase( + inputs=[d1_spec, d2_spec, x1_spec, y1_spec, param_spec], + kwargs={}, + comparison_target=[0, 1, 2, 4], + tolerance=tol, + output_count=4, + description="rotmg - INPLACE", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 rotmg operator test""" + + def __init__(self): + super().__init__("Rotmg") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch_rotmg(*args, **kwargs) + + def 
infinicore_operator(self, *args, **kwargs): + return infinicore.rotmg(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/scal.py b/test/infinicore/ops/scal.py new file mode 100644 index 000000000..c044989a8 --- /dev/null +++ b/test/infinicore/ops/scal.py @@ -0,0 +1,91 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) +from framework.tensor import TensorInitializer + +import infinicore + +_TEST_CASES_DATA = [ + ((13,), None), + ((13,), (10,)), + ((5632,), None), + ((5632,), (5,)), + ((16,), (4,)), + ((5632,), (32,)), +] + +_TENSOR_DTYPES = [ + infinicore.float16, + infinicore.float32, + # infinicore.float64, + infinicore.bfloat16, +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-3, "rtol": 1e-3}, + infinicore.float32: {"atol": 1e-7, "rtol": 1e-7}, + infinicore.float64: {"atol": 1e-15, "rtol": 1e-15}, + infinicore.bfloat16: {"atol": 5e-3, "rtol": 1e-2}, +} + + +def torch_scal(x, alpha): + x.mul_(alpha) + return x + + +def parse_test_cases(): + test_cases = [] + for shape, x_strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + x_spec = TensorSpec.from_tensor(shape, x_strides, dtype) + alpha_spec = TensorSpec.from_tensor( + (1,), None, dtype, init_mode=TensorInitializer.ONES + ) + + test_cases.append( + TestCase( + inputs=[x_spec, alpha_spec], + kwargs={}, + output_spec=None, + comparison_target=0, + tolerance=tol, + description="scal - INPLACE", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 scal operator test""" + + def __init__(self): + super().__init__("Scal") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch_scal(*args, 
**kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.scal(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/swap.py b/test/infinicore/ops/swap.py new file mode 100644 index 000000000..b30f1290d --- /dev/null +++ b/test/infinicore/ops/swap.py @@ -0,0 +1,90 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, +) + +import infinicore + +_TEST_CASES_DATA = [ + ((13,), None, None), + ((13,), (10,), (10,)), + ((5632,), None, None), + ((5632,), (5,), (5,)), + ((16,), (4,), (4,)), + ((5632,), (32,), (32,)), +] + +_TENSOR_DTYPES = [ + # infinicore.float16, + infinicore.float32, + # infinicore.float64, + # infinicore.bfloat16, +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-3, "rtol": 1e-3}, + infinicore.float32: {"atol": 1e-7, "rtol": 1e-7}, + infinicore.float64: {"atol": 1e-15, "rtol": 1e-15}, + infinicore.bfloat16: {"atol": 5e-3, "rtol": 1e-2}, +} + + +def torch_swap(x, y): + tmp = x.clone() + x.copy_(y) + y.copy_(tmp) + return x, y + + +def parse_test_cases(): + test_cases = [] + for shape, x_strides, y_strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + x_spec = TensorSpec.from_tensor(shape, x_strides, dtype) + y_spec = TensorSpec.from_tensor(shape, y_strides, dtype) + + test_cases.append( + TestCase( + inputs=[x_spec, y_spec], + kwargs={}, + comparison_target=[0, 1], + tolerance=tol, + output_count=2, + description="swap - INPLACE", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + """BLAS Level-1 swap operator test""" + + def __init__(self): + super().__init__("Swap") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return 
torch_swap(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.swap(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infiniop/asum.py b/test/infiniop/asum.py new file mode 100644 index 000000000..189b3dc7a --- /dev/null +++ b/test/infiniop/asum.py @@ -0,0 +1,143 @@ +import ctypes +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration +# ============================================================================== + +_TEST_CASES = [ + # n, x_stride + (3, None), + (8, (2,)), + (32, None), + (257, (3,)), + (65535, None), +] + +_TENSOR_DTYPES = [ + # InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + # InfiniDtype.BF16, +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.F64: {"atol": 1e-9, "rtol": 1e-9}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + n, + x_stride=None, + dtype=torch.float16, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + x = TestTensor((n,), x_stride, dtype, device) + result = TestTensor(tuple(), None, dtype, device, mode="zeros") + + print( + f"Testing asum on {InfiniDeviceNames[device]} with n:{n} x_stride:{x_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + result_ref = torch.sum(x.torch_tensor().abs()) + result.update_torch_tensor(result_ref) + + if sync is not None: + sync() + + descriptor 
= infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAsumDescriptor( + handle, + ctypes.byref(descriptor), + x.descriptor, + result.descriptor, + ) + ) + + for tensor in [x, result]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAsumWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, result.device) + + def lib_asum(): + check_error( + LIBINFINIOP.infiniopAsum( + descriptor, + workspace.data(), + workspace.size(), + x.data(), + result.data(), + None, + ) + ) + + lib_asum() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(result.actual_tensor(), result.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + result.actual_tensor(), result.torch_tensor(), atol=atol, rtol=rtol + ) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.sum(x.torch_tensor().abs()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_asum(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAsumDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") diff --git a/test/infiniop/axpy.py b/test/infiniop/axpy.py new file mode 100644 index 000000000..d0258b39b --- /dev/null +++ b/test/infiniop/axpy.py @@ -0,0 +1,145 @@ +import ctypes +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration +# ============================================================================== + +_TEST_CASES = [ + # n, x_stride, y_stride + (3, None, None), + (8, (2,), (3,)), + (32, None, (2,)), + (257, (3,), None), + (65535, None, None), +] + +_TENSOR_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + InfiniDtype.BF16, +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.F64: {"atol": 1e-9, "rtol": 1e-9}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + n, + x_stride=None, + y_stride=None, + dtype=torch.float16, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + alpha = TestTensor(tuple(), None, dtype, device) + x = TestTensor((n,), x_stride, dtype, device) + y = TestTensor((n,), y_stride, dtype, device) + + print( + f"Testing axpy on {InfiniDeviceNames[device]} with n:{n} x_stride:{x_stride} y_stride:{y_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + y_ref = alpha.torch_tensor() * x.torch_tensor() + y.torch_tensor() + y.update_torch_tensor(y_ref) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAxpyDescriptor( + handle, + 
ctypes.byref(descriptor), + alpha.descriptor, + x.descriptor, + y.descriptor, + ) + ) + + for tensor in [alpha, x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAxpyWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_axpy(): + check_error( + LIBINFINIOP.infiniopAxpy( + descriptor, + workspace.data(), + workspace.size(), + alpha.data(), + x.data(), + y.data(), + None, + ) + ) + + lib_axpy() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: alpha.torch_tensor() * x.torch_tensor() + y.torch_tensor(), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_axpy(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAxpyDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") diff --git a/test/infiniop/blas_amax.py b/test/infiniop/blas_amax.py new file mode 100644 index 000000000..d4d806305 --- /dev/null +++ b/test/infiniop/blas_amax.py @@ -0,0 +1,132 @@ +import ctypes +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration +# ============================================================================== + +_TEST_CASES = [ + # n, x_stride + (3, None), + (8, (2,)), + (32, None), + (257, (3,)), + (65535, None), +] + +_TENSOR_DTYPES = [ + # InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + # InfiniDtype.BF16, +] + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + n, + x_stride=None, + dtype=torch.float16, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + x = TestTensor((n,), x_stride, dtype, device) + result = TestTensor(tuple(), None, InfiniDtype.I32, device, mode="zeros") + + print( + f"Testing blas_amax on {InfiniDeviceNames[device]} with n:{n} x_stride:{x_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + result_ref = torch.argmax(x.torch_tensor().abs()).to(torch.int32) + 1 + result.update_torch_tensor(result_ref) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateBlasAmaxDescriptor( + handle, + ctypes.byref(descriptor), + x.descriptor, + result.descriptor, + ) + ) + + for tensor in [x, result]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetBlasAmaxWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = 
TestWorkspace(workspace_size.value, result.device) + + def lib_blas_amax(): + check_error( + LIBINFINIOP.infiniopBlasAmax( + descriptor, + workspace.data(), + workspace.size(), + x.data(), + result.data(), + None, + ) + ) + + lib_blas_amax() + + if DEBUG: + debug(result.actual_tensor(), result.torch_tensor()) + assert torch.equal(result.actual_tensor(), result.torch_tensor()) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.argmax(x.torch_tensor().abs()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_blas_amax(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyBlasAmaxDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! \033[0m") diff --git a/test/infiniop/blas_amin.py b/test/infiniop/blas_amin.py new file mode 100644 index 000000000..4899f85dd --- /dev/null +++ b/test/infiniop/blas_amin.py @@ -0,0 +1,132 @@ +import ctypes +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration +# ============================================================================== + +_TEST_CASES = [ + # n, x_stride + (3, None), + (8, (2,)), + (32, None), + (257, (3,)), + (65535, None), +] + +_TENSOR_DTYPES = [ + # InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + # InfiniDtype.BF16, +] + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def 
test( + handle, + device, + n, + x_stride=None, + dtype=torch.float16, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + x = TestTensor((n,), x_stride, dtype, device) + result = TestTensor(tuple(), None, InfiniDtype.I32, device, mode="zeros") + + print( + f"Testing blas_amin on {InfiniDeviceNames[device]} with n:{n} x_stride:{x_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + result_ref = torch.argmin(x.torch_tensor().abs()).to(torch.int32) + 1 + result.update_torch_tensor(result_ref) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateBlasAminDescriptor( + handle, + ctypes.byref(descriptor), + x.descriptor, + result.descriptor, + ) + ) + + for tensor in [x, result]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetBlasAminWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, result.device) + + def lib_blas_amin(): + check_error( + LIBINFINIOP.infiniopBlasAmin( + descriptor, + workspace.data(), + workspace.size(), + x.data(), + result.data(), + None, + ) + ) + + lib_blas_amin() + + if DEBUG: + debug(result.actual_tensor(), result.torch_tensor()) + assert torch.equal(result.actual_tensor(), result.torch_tensor()) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.argmax(x.torch_tensor().abs()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_blas_amin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyBlasAminDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") diff --git a/test/infiniop/blas_copy.py b/test/infiniop/blas_copy.py new file mode 100644 index 000000000..c1d083d2d --- /dev/null +++ b/test/infiniop/blas_copy.py @@ -0,0 +1,151 @@ +import ctypes +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration +# ============================================================================== + +_TEST_CASES = [ + # n, x_stride, y_stride + (3, None, None), + (8, (2,), (3,)), + (32, None, (2,)), + (257, (3,), None), + (65535, None, None), +] + +_TENSOR_DTYPES = [ + # InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + # InfiniDtype.BF16, +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.F64: {"atol": 1e-9, "rtol": 1e-9}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_copy(x, y): + y.copy_(x) + + +def test( + handle, + device, + n, + x_stride=None, + y_stride=None, + dtype=torch.float16, + sync=None, +): + x = TestTensor((n,), x_stride, dtype, device) + y = TestTensor((n,), y_stride, dtype, device) + + if x.is_broadcast(): + return + + print( + f"Testing BlasCopy on {InfiniDeviceNames[device]} with n:{n} x_stride:{x_stride} " + f"y_stride:{y_stride} dtype:{InfiniDtypeNames[dtype]}" + ) + + torch_copy(x.torch_tensor(), y.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateBlasCopyDescriptor( + handle, + ctypes.byref(descriptor), + x.descriptor, + y.descriptor, + ) + ) + + x.destroy_desc() 
+ y.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetBlasCopyWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_copy(): + check_error( + LIBINFINIOP.infiniopBlasCopy( + descriptor, + workspace.data(), + workspace.size(), + x.data(), + y.data(), + None, + ) + ) + + lib_copy() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + profile_operation( + "PyTorch", + lambda: torch_copy(x.torch_tensor(), y.torch_tensor()), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_copy(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyBlasCopyDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/blas_dot.py b/test/infiniop/blas_dot.py new file mode 100644 index 000000000..2d51b883d --- /dev/null +++ b/test/infiniop/blas_dot.py @@ -0,0 +1,154 @@ +import ctypes +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration +# ============================================================================== + +_TEST_CASES = [ + # n, x_stride, y_stride + (3, 
None, None), + (8, (2,), (3,)), + (32, None, (2,)), + (257, (3,), None), + (65535, None, None), +] + +_TENSOR_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + InfiniDtype.BF16, +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.F64: {"atol": 1e-9, "rtol": 1e-9}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + n, + x_stride=None, + y_stride=None, + dtype=torch.float16, + sync=None, +): + torch.manual_seed(0) + if device != 0: + torch.cuda.manual_seed_all(0) + + x = TestTensor((n,), x_stride, dtype, device) + y = TestTensor((n,), y_stride, dtype, device) + result = TestTensor(tuple(), None, dtype, device, mode="zeros") + + print( + f"Testing blas_dot on {InfiniDeviceNames[device]} with n:{n} x_stride:{x_stride} y_stride:{y_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + def torch_blas_dot(): + if dtype in (InfiniDtype.F16, InfiniDtype.BF16): + return torch.dot(x.torch_tensor().float(), y.torch_tensor().float()).to( + x.torch_tensor().dtype + ) + return torch.dot(x.torch_tensor(), y.torch_tensor()) + + result_ref = torch_blas_dot() + result.update_torch_tensor(result_ref) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateBlasDotDescriptor( + handle, + ctypes.byref(descriptor), + x.descriptor, + y.descriptor, + result.descriptor, + ) + ) + + for tensor in [x, result]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetBlasDotWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, result.device) + + def lib_blas_dot(): + check_error( + LIBINFINIOP.infiniopBlasDot( + descriptor, + workspace.data(), + workspace.size(), + x.data(), + y.data(), + result.data(), + None, 
+ ) + ) + + lib_blas_dot() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(result.actual_tensor(), result.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + result.actual_tensor(), result.torch_tensor(), atol=atol, rtol=rtol + ) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_blas_dot(), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_blas_dot(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyBlasDotDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! \033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 2802bc5bc..e43f3a18d 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -1353,6 +1353,7 @@ def gptq_qyblas_gemm_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def softplus_(lib): lib.infiniopCreateSoftplusDescriptor.restype = c_int32 @@ -2159,3 +2160,439 @@ def fused_ffn_(lib): lib.infiniopDestroyFusedFFNDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + + +@OpRegister.operator +def blas_amax_(lib): + lib.infiniopCreateBlasAmaxDescriptor.restype = c_int32 + lib.infiniopCreateBlasAmaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetBlasAmaxWorkspaceSize.restype = c_int32 + lib.infiniopGetBlasAmaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopBlasAmax.restype = c_int32 + lib.infiniopBlasAmax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + 
c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyBlasAmaxDescriptor.restype = c_int32 + lib.infiniopDestroyBlasAmaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def blas_amin_(lib): + lib.infiniopCreateBlasAminDescriptor.restype = c_int32 + lib.infiniopCreateBlasAminDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetBlasAminWorkspaceSize.restype = c_int32 + lib.infiniopGetBlasAminWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopBlasAmin.restype = c_int32 + lib.infiniopBlasAmin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyBlasAminDescriptor.restype = c_int32 + lib.infiniopDestroyBlasAminDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def asum_(lib): + lib.infiniopCreateAsumDescriptor.restype = c_int32 + lib.infiniopCreateAsumDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetAsumWorkspaceSize.restype = c_int32 + lib.infiniopGetAsumWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAsum.restype = c_int32 + lib.infiniopAsum.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAsumDescriptor.restype = c_int32 + lib.infiniopDestroyAsumDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def axpy_(lib): + lib.infiniopCreateAxpyDescriptor.restype = c_int32 + lib.infiniopCreateAxpyDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + 
+ lib.infiniopGetAxpyWorkspaceSize.restype = c_int32 + lib.infiniopGetAxpyWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAxpy.restype = c_int32 + lib.infiniopAxpy.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAxpyDescriptor.restype = c_int32 + lib.infiniopDestroyAxpyDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def blas_copy_(lib): + lib.infiniopCreateBlasCopyDescriptor.restype = c_int32 + lib.infiniopCreateBlasCopyDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetBlasCopyWorkspaceSize.restype = c_int32 + lib.infiniopGetBlasCopyWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopBlasCopy.restype = c_int32 + lib.infiniopBlasCopy.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyBlasCopyDescriptor.restype = c_int32 + lib.infiniopDestroyBlasCopyDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def blas_dot_(lib): + lib.infiniopCreateBlasDotDescriptor.restype = c_int32 + lib.infiniopCreateBlasDotDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetBlasDotWorkspaceSize.restype = c_int32 + lib.infiniopGetBlasDotWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopBlasDot.restype = c_int32 + lib.infiniopBlasDot.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyBlasDotDescriptor.restype = c_int32 + 
lib.infiniopDestroyBlasDotDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def nrm2_(lib): + lib.infiniopCreateNrm2Descriptor.restype = c_int32 + lib.infiniopCreateNrm2Descriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetNrm2WorkspaceSize.restype = c_int32 + lib.infiniopGetNrm2WorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopNrm2.restype = c_int32 + lib.infiniopNrm2.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyNrm2Descriptor.restype = c_int32 + lib.infiniopDestroyNrm2Descriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def rot_(lib): + lib.infiniopCreateRotDescriptor.restype = c_int32 + lib.infiniopCreateRotDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetRotWorkspaceSize.restype = c_int32 + lib.infiniopGetRotWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopRot.restype = c_int32 + lib.infiniopRot.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyRotDescriptor.restype = c_int32 + lib.infiniopDestroyRotDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def rotg_(lib): + lib.infiniopCreateRotgDescriptor.restype = c_int32 + lib.infiniopCreateRotgDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + 
lib.infiniopGetRotgWorkspaceSize.restype = c_int32 + lib.infiniopGetRotgWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopRotg.restype = c_int32 + lib.infiniopRotg.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyRotgDescriptor.restype = c_int32 + lib.infiniopDestroyRotgDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def rotm_(lib): + lib.infiniopCreateRotmDescriptor.restype = c_int32 + lib.infiniopCreateRotmDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetRotmWorkspaceSize.restype = c_int32 + lib.infiniopGetRotmWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopRotm.restype = c_int32 + lib.infiniopRotm.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyRotmDescriptor.restype = c_int32 + lib.infiniopDestroyRotmDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def rotmg_(lib): + lib.infiniopCreateRotmgDescriptor.restype = c_int32 + lib.infiniopCreateRotmgDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetRotmgWorkspaceSize.restype = c_int32 + lib.infiniopGetRotmgWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopRotmg.restype = c_int32 + lib.infiniopRotmg.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + 
lib.infiniopDestroyRotmgDescriptor.restype = c_int32 + lib.infiniopDestroyRotmgDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def scal_(lib): + lib.infiniopCreateScalDescriptor.restype = c_int32 + lib.infiniopCreateScalDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetScalWorkspaceSize.restype = c_int32 + lib.infiniopGetScalWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopScal.restype = c_int32 + lib.infiniopScal.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyScalDescriptor.restype = c_int32 + lib.infiniopDestroyScalDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def swap_(lib): + lib.infiniopCreateSwapDescriptor.restype = c_int32 + lib.infiniopCreateSwapDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSwapWorkspaceSize.restype = c_int32 + lib.infiniopGetSwapWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSwap.restype = c_int32 + lib.infiniopSwap.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySwapDescriptor.restype = c_int32 + lib.infiniopDestroySwapDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] diff --git a/test/infiniop/nrm2.py b/test/infiniop/nrm2.py new file mode 100644 index 000000000..7c45fda1f --- /dev/null +++ b/test/infiniop/nrm2.py @@ -0,0 +1,152 @@ +import ctypes +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + 
get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration +# ============================================================================== +# Format: (shape, x_stride) +_TEST_CASES = [ + ((13,), None), + ((13,), (10,)), + ((5632,), None), + ((5632,), (5,)), + ((16,), (4,)), + ((5632,), (32,)), +] + +_TENSOR_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + InfiniDtype.BF16, +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.F64: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_nrm2(x): + return torch.norm(x, p=2) + + +def test( + handle, + device, + shape, + x_stride=None, + dtype=InfiniDtype.F32, + sync=None, +): + x = TestTensor(shape, x_stride, dtype, device) + + result = TestTensor(tuple(), None, dtype, device, mode="zeros") + + print( + f"Testing Nrm2 on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + result_ref = torch_nrm2(x.torch_tensor()) + result.update_torch_tensor(result_ref) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateNrm2Descriptor( + handle, + ctypes.byref(descriptor), + x.descriptor, + result.descriptor, + ) + ) + + for tensor in [x, result]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetNrm2WorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_nrm2(): + check_error( + LIBINFINIOP.infiniopNrm2( + descriptor, + workspace.data(), + workspace.size(), + x.data(), + result.data(), + None, + 
) + ) + + lib_nrm2() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + + if DEBUG: + debug(result.actual_tensor(), result.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose( + result.actual_tensor(), result.torch_tensor(), atol=atol, rtol=rtol + ) + + if PROFILE: + profile_operation( + "PyTorch", + lambda: torch_nrm2(x.torch_tensor()), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_nrm2(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyNrm2Descriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/rot.py b/test/infiniop/rot.py new file mode 100644 index 000000000..e9d6f63f3 --- /dev/null +++ b/test/infiniop/rot.py @@ -0,0 +1,160 @@ +import ctypes +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +_TEST_CASES = [ + ((13,), None, None), + ((13,), (10,), (10,)), + ((5632,), None, None), + ((5632,), (5,), (5,)), + ((16,), (4,), (4,)), + ((5632,), (32,), (32,)), +] + +_TENSOR_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + InfiniDtype.BF16, +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.F64: {"atol": 1e-9, "rtol": 1e-9}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_rot(x, y, c, s): + x0 = x.clone() + y0 
= y.clone() + x.copy_(c * x0 + s * y0) + y.copy_(c * y0 - s * x0) + + +def test( + handle, + device, + shape, + x_stride=None, + y_stride=None, + dtype=torch.float32, + sync=None, +): + x = TestTensor(shape, x_stride, dtype, device) + y = TestTensor(shape, y_stride, dtype, device) + c = TestTensor(tuple(), None, dtype, device) + s = TestTensor(tuple(), None, dtype, device) + + if x.is_broadcast() or y.is_broadcast(): + return + + print( + f"Testing Rot on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} " + f"y_stride:{y_stride} dtype:{InfiniDtypeNames[dtype]}" + ) + + torch_rot(x.torch_tensor(), y.torch_tensor(), c.torch_tensor(), s.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateRotDescriptor( + handle, + ctypes.byref(descriptor), + x.descriptor, + y.descriptor, + c.descriptor, + s.descriptor, + ) + ) + + for tensor in [c, s, x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetRotWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_rot(): + check_error( + LIBINFINIOP.infiniopRot( + descriptor, + workspace.data(), + workspace.size(), + x.data(), + y.data(), + c.data(), + s.data(), + None, + ) + ) + + lib_rot() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(x.actual_tensor(), x.torch_tensor(), atol=atol, rtol=rtol) + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(x.actual_tensor(), x.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + profile_operation( + "PyTorch", + lambda: torch_rot( + x.torch_tensor(), y.torch_tensor(), c.torch_tensor(), s.torch_tensor() + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_rot(), device, 
NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyRotDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/rotg.py b/test/infiniop/rotg.py new file mode 100644 index 000000000..da71dbe2b --- /dev/null +++ b/test/infiniop/rotg.py @@ -0,0 +1,147 @@ +import ctypes +import math +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + test_operator, +) + +_TEST_CASES = [ + (0.0, 0.0), + (3.0, 4.0), + (-2.5, 5.0), + (7.0, -1.5), + (-3.2, -8.4), +] + +_TENSOR_DTYPES = [ + # InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + # InfiniDtype.BF16, +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.F64: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def torch_rotg(a, b): + anorm = abs(a) + bnorm = abs(b) + if bnorm == 0.0: + return a, 0.0, 1.0, 0.0 + if anorm == 0.0: + return b, 1.0, 0.0, 1.0 + + sigma = math.copysign(1.0, a if anorm > bnorm else b) + r = sigma * math.hypot(a, b) + c = a / r + s = b / r + if anorm > bnorm: + z = s + elif c != 0.0: + z = 1.0 / c + else: + z = 1.0 + return r, z, c, s + + +def test(handle, device, a0, b0, dtype=torch.float32, sync=None): + a_torch = torch.tensor([a0]) + b_torch = torch.tensor([b0]) + a = TestTensor( + a_torch.shape, + a_torch.stride(), + dtype, + device, + mode="manual", + set_tensor=a_torch, + ) + b = TestTensor( + b_torch.shape, + b_torch.stride(), + dtype, 
+ device, + mode="manual", + set_tensor=b_torch, + ) + c = TestTensor(tuple(), None, dtype, device, mode="zeros") + s = TestTensor(tuple(), None, dtype, device, mode="zeros") + + exp_a, exp_b, exp_c, exp_s = torch_rotg(a0, b0) + + print( + f"Testing Rotg on {InfiniDeviceNames[device]} with a:{a0} b:{b0} dtype:{InfiniDtypeNames[dtype]}" + ) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateRotgDescriptor( + handle, + ctypes.byref(descriptor), + a.descriptor, + b.descriptor, + c.descriptor, + s.descriptor, + ) + ) + + a.destroy_desc() + b.destroy_desc() + c.destroy_desc() + s.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetRotgWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + check_error( + LIBINFINIOP.infiniopRotg( + descriptor, + workspace.data(), + workspace.size(), + a.data(), + b.data(), + c.data(), + s.data(), + None, + ) + ) + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + assert math.isclose(a.actual_tensor().item(), exp_a, rel_tol=rtol, abs_tol=atol) + assert math.isclose(b.actual_tensor().item(), exp_b, rel_tol=rtol, abs_tol=atol) + assert math.isclose(c.actual_tensor().item(), exp_c, rel_tol=rtol, abs_tol=atol) + assert math.isclose(s.actual_tensor().item(), exp_s, rel_tol=rtol, abs_tol=atol) + + check_error(LIBINFINIOP.infiniopDestroyRotgDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/rotm.py b/test/infiniop/rotm.py new file mode 100644 index 000000000..79637f671 --- /dev/null +++ b/test/infiniop/rotm.py @@ -0,0 +1,180 @@ +import ctypes +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + 
InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +_TEST_CASES = [ + ((13,), None, None, (-1.0, 1.2, -0.3, 0.4, 0.8)), + ((13,), (10,), (10,), (0.0, 0.0, -0.25, 0.5, 0.0)), + ((5632,), None, None, (1.0, 1.1, 0.0, 0.0, 0.9)), + ((5632,), (5,), (5,), (-2.0, 0.0, 0.0, 0.0, 0.0)), +] + +_TENSOR_DTYPES = [ + # InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + # InfiniDtype.BF16, +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.F64: {"atol": 1e-9, "rtol": 1e-9}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_rotm(x, y, param): + sflag, sh11, sh21, sh12, sh22 = param + if sflag == -2.0: + return + + w = x.clone() + z = y.clone() + + if sflag < 0.0: + x.copy_(w * sh11 + z * sh12) + y.copy_(w * sh21 + z * sh22) + elif sflag == 0.0: + x.copy_(w + z * sh12) + y.copy_(w * sh21 + z) + else: + x.copy_(w * sh11 + z) + y.copy_(-w + sh22 * z) + + +def _torch_dtype(dtype): + if dtype == InfiniDtype.F64: + return torch.float64 + return torch.float32 + + +def test( + handle, + device, + shape, + x_stride=None, + y_stride=None, + param=(-1.0, 1.2, -0.3, 0.4, 0.8), + dtype=torch.float32, + sync=None, +): + x = TestTensor(shape, x_stride, dtype, device) + y = TestTensor(shape, y_stride, dtype, device) + param_tensor = TestTensor( + (5,), + (1,), + dtype, + device, + mode="manual", + set_tensor=torch.tensor(param, dtype=_torch_dtype(dtype)), + ) + + if x.is_broadcast() or y.is_broadcast(): + return + + print( + f"Testing Rotm on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} " + f"y_stride:{y_stride} param:{param} dtype:{InfiniDtypeNames[dtype]}" + ) + + torch_rotm(x.torch_tensor(), y.torch_tensor(), param) + + if sync is not None: + 
sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateRotmDescriptor( + handle, + ctypes.byref(descriptor), + x.descriptor, + y.descriptor, + param_tensor.descriptor, + ) + ) + + x.destroy_desc() + y.destroy_desc() + param_tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetRotmWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_rotm(): + check_error( + LIBINFINIOP.infiniopRotm( + descriptor, + workspace.data(), + workspace.size(), + x.data(), + y.data(), + param_tensor.data(), + None, + ) + ) + + lib_rotm() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(x.actual_tensor(), x.torch_tensor(), atol=atol, rtol=rtol) + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(x.actual_tensor(), x.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + profile_operation( + "PyTorch", + lambda: torch_rotm(x.torch_tensor(), y.torch_tensor(), param), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_rotm(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyRotmDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/rotmg.py b/test/infiniop/rotmg.py new file mode 100644 index 000000000..ff0dc8976 --- /dev/null +++ b/test/infiniop/rotmg.py @@ -0,0 +1,262 @@ +import ctypes +import math +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + 
InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + test_operator, +) + +_TEST_CASES = [ + (1.0, 2.0, 3.0, 4.0), + (2.5, 0.5, -1.2, 0.8), + (3.0, 4.0, 0.0, 2.0), + (1.5, 1.5, 2.0, -3.0), +] + +_TENSOR_DTYPES = [ + # InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + # InfiniDtype.BF16, +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 1e-12, "rtol": 1e-12}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def _torch_dtype(dtype): + if dtype == InfiniDtype.F64: + return torch.float64 + return torch.float32 + + +def torch_rotmg(d1, d2, x1, y1): + zero = 0.0 + one = 1.0 + two = 2.0 + gam = 4096.0 + gamsq = 1.67772e7 + rgamsq = 5.96046e-8 + + sparam = [0.0] * 5 + sh11 = sh12 = sh21 = sh22 = 0.0 + + if d1 < zero: + sflag = -one + d1 = d2 = x1 = zero + else: + sp2 = d2 * y1 + if sp2 == zero: + sparam[0] = -two + return d1, d2, x1, sparam + + sp1 = d1 * x1 + sq2 = sp2 * y1 + sq1 = sp1 * x1 + + if abs(sq1) > abs(sq2): + sh21 = -y1 / x1 + sh12 = sp2 / sp1 + su = one - sh12 * sh21 + if su > zero: + sflag = zero + d1 = d1 / su + d2 = d2 / su + x1 = x1 * su + else: + sflag = -one + sh11 = sh12 = sh21 = sh22 = zero + d1 = d2 = x1 = zero + else: + if sq2 < zero: + sflag = -one + d1 = d2 = x1 = zero + else: + sflag = one + sh11 = sp1 / sp2 + sh22 = x1 / y1 + su = one + sh11 * sh22 + stemp = d2 / su + d2 = d1 / su + d1 = stemp + x1 = y1 * su + + if d1 != zero: + while d1 <= rgamsq or d1 >= gamsq: + if sflag == zero: + sh11 = one + sh22 = one + sflag = -one + else: + sh21 = -one + sh12 = one + sflag = -one + if d1 <= rgamsq: + d1 = d1 * gam * gam + x1 = x1 / gam + sh11 = sh11 / gam + sh12 = sh12 / gam + else: + d1 = d1 / (gam * gam) + x1 = x1 * gam + sh11 = sh11 * gam + sh12 = sh12 * gam + + if d2 != zero: + while abs(d2) <= rgamsq or abs(d2) >= gamsq: + if 
sflag == zero: + sh11 = one + sh22 = one + sflag = -one + else: + sh21 = -one + sh12 = one + sflag = -one + if abs(d2) <= rgamsq: + d2 = d2 * gam * gam + sh21 = sh21 / gam + sh22 = sh22 / gam + else: + d2 = d2 / (gam * gam) + sh21 = sh21 * gam + sh22 = sh22 * gam + + if sflag < zero: + sparam[1] = sh11 + sparam[2] = sh21 + sparam[3] = sh12 + sparam[4] = sh22 + elif sflag == zero: + sparam[2] = sh21 + sparam[3] = sh12 + else: + sparam[1] = sh11 + sparam[4] = sh22 + + sparam[0] = sflag + return d1, d2, x1, sparam + + +def test(handle, device, d1_0, d2_0, x1_0, y1_0, dtype=torch.float32, sync=None): + exp_d1, exp_d2, exp_x1, exp_sparam = torch_rotmg(d1_0, d2_0, x1_0, y1_0) + + scalar_dtype = _torch_dtype(dtype) + d1_torch = torch.tensor([d1_0], dtype=scalar_dtype) + d2_torch = torch.tensor([d2_0], dtype=scalar_dtype) + x1_torch = torch.tensor([x1_0], dtype=scalar_dtype) + y1_torch = torch.tensor([y1_0], dtype=scalar_dtype) + d1 = TestTensor( + d1_torch.shape, + d1_torch.stride(), + dtype, + device, + mode="manual", + set_tensor=d1_torch, + ) + d2 = TestTensor( + d2_torch.shape, + d2_torch.stride(), + dtype, + device, + mode="manual", + set_tensor=d2_torch, + ) + x1 = TestTensor( + x1_torch.shape, + x1_torch.stride(), + dtype, + device, + mode="manual", + set_tensor=x1_torch, + ) + y1 = TestTensor( + y1_torch.shape, + y1_torch.stride(), + dtype, + device, + mode="manual", + set_tensor=y1_torch, + ) + param = TestTensor((5,), (1,), dtype, device, mode="zeros") + + print( + f"Testing Rotmg on {InfiniDeviceNames[device]} with d1:{d1_0} d2:{d2_0} x1:{x1_0} y1:{y1_0} dtype:{InfiniDtypeNames[dtype]}" + ) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateRotmgDescriptor( + handle, + ctypes.byref(descriptor), + d1.descriptor, + d2.descriptor, + x1.descriptor, + y1.descriptor, + param.descriptor, + ) + ) + + d1.destroy_desc() + d2.destroy_desc() + x1.destroy_desc() + y1.destroy_desc() + 
param.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetRotmgWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + check_error( + LIBINFINIOP.infiniopRotmg( + descriptor, + workspace.data(), + workspace.size(), + d1.data(), + d2.data(), + x1.data(), + y1.data(), + param.data(), + None, + ) + ) + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + assert math.isclose(d1.actual_tensor().item(), exp_d1, rel_tol=rtol, abs_tol=atol) + assert math.isclose(d2.actual_tensor().item(), exp_d2, rel_tol=rtol, abs_tol=atol) + assert math.isclose(x1.actual_tensor().item(), exp_x1, rel_tol=rtol, abs_tol=atol) + for i in range(5): + assert math.isclose( + param.actual_tensor()[i].item(), exp_sparam[i], rel_tol=rtol, abs_tol=atol + ) + + check_error(LIBINFINIOP.infiniopDestroyRotmgDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/scal.py b/test/infiniop/scal.py new file mode 100644 index 000000000..ff5c6ed32 --- /dev/null +++ b/test/infiniop/scal.py @@ -0,0 +1,164 @@ +import ctypes +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +# Format: (shape, x_stride, alpha) +_TEST_CASES = [ + ((13,), None, 2.5), + ((13,), (10,), 2.5), + ((5632,), None, 2.5), + ((5632,), 
(5,), 2.5), + ((16,), (4,), 2.5), + ((5632,), (32,), 2.5), +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + InfiniDtype.BF16, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 1e-15, "rtol": 1e-15}, + InfiniDtype.BF16: {"atol": 5e-3, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def scal(x, alpha): + x.mul_(alpha) + + +def test( + handle, + device, + shape, + x_stride=None, + alpha_value=2.5, + dtype=torch.float16, + sync=None, +): + alpha_torch = torch.tensor([alpha_value]) + alpha = TestTensor( + alpha_torch.shape, + alpha_torch.stride(), + dtype, + device, + mode="manual", + set_tensor=alpha_torch, + ) + x = TestTensor(shape, x_stride, dtype, device) + + if x.is_broadcast(): + return + + print( + f"Testing Scal on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + # Compute PyTorch reference + scal(x.torch_tensor(), alpha.torch_tensor()) + + if sync is not None: + sync() + + # Create Descriptor + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateScalDescriptor( + handle, + ctypes.byref(descriptor), + alpha.descriptor, + x.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + alpha.destroy_desc() + x.destroy_desc() + + # Allocate Workspace + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetScalWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + # Execute C library op + def lib_scal(): + check_error( + LIBINFINIOP.infiniopScal( + descriptor, + workspace.data(), + workspace.size(), + alpha.data(), + x.data(), + None, + ) + ) + + 
lib_scal() + + # Compare results + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(x.actual_tensor(), x.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(x.actual_tensor(), x.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: scal(x.torch_tensor(), alpha.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_scal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyScalDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/swap.py b/test/infiniop/swap.py new file mode 100644 index 000000000..4d92668d7 --- /dev/null +++ b/test/infiniop/swap.py @@ -0,0 +1,151 @@ +import ctypes +from ctypes import c_uint64 + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +_TEST_CASES = [ + ((13,), None, None), + ((13,), (10,), (10,)), + ((5632,), None, None), + ((5632,), (5,), (5,)), + ((16,), (4,), (4,)), + ((5632,), (32,), (32,)), +] + +_TENSOR_DTYPES = [ + # InfiniDtype.F16, + InfiniDtype.F32, + # InfiniDtype.F64, + # InfiniDtype.BF16, +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 1e-15, "rtol": 1e-15}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 
+NUM_ITERATIONS = 1000 + + +def torch_swap(x, y): + tmp = x.clone() + x.copy_(y) + y.copy_(tmp) + + +def test( + handle, + device, + shape, + x_stride=None, + y_stride=None, + dtype=torch.float16, + sync=None, +): + x = TestTensor(shape, x_stride, dtype, device) + y = TestTensor(shape, y_stride, dtype, device) + + if x.is_broadcast() or y.is_broadcast(): + return + + print( + f"Testing Swap on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} " + f"y_stride:{y_stride} dtype:{InfiniDtypeNames[dtype]}" + ) + + torch_swap(x.torch_tensor(), y.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSwapDescriptor( + handle, + ctypes.byref(descriptor), + x.descriptor, + y.descriptor, + ) + ) + + x.destroy_desc() + y.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSwapWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_swap(): + check_error( + LIBINFINIOP.infiniopSwap( + descriptor, + workspace.data(), + workspace.size(), + x.data(), + y.data(), + None, + ) + ) + + lib_swap() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(x.actual_tensor(), x.torch_tensor(), atol=atol, rtol=rtol) + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(x.actual_tensor(), x.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + profile_operation( + "PyTorch", + lambda: torch_swap(x.torch_tensor(), y.torch_tensor()), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_swap(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroySwapDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + 
NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m")