Expose non-template BF16/Int16 query transposition wrappers in tiled attention

Krzysztof Rymski · copybara-github · commit 3eb606096ca0 · 2026-03-24T10:07:57.000-07:00
PiperOrigin-RevId: 888724073
diff --git a/gemma/tiled_attention.cc b/gemma/tiled_attention.cc
@@ -502,6 +502,22 @@ TransposeQueriesToGroupsOfNBF16orInt16(hwy::Span<float*> queries_ptrs,
                          std::move(q_scales));
 }
 
+std::tuple<std::vector<int16_t, hwy::AlignedAllocator<int16_t>>,
+           std::vector<int16_t*>, AlignedFloatVector>
+TransposeQueriesToGroupsOfNInt16(hwy::Span<float*> queries_ptrs, int qkv_dim,
+                                 size_t group_size) {  // Fixes OutT = int16_t.
+  return TransposeQueriesToGroupsOfNBF16orInt16<int16_t>(queries_ptrs, qkv_dim,
+                                                         group_size);
+}
+
+std::tuple<std::vector<BF16, hwy::AlignedAllocator<BF16>>, std::vector<BF16*>,
+           AlignedFloatVector>
+TransposeQueriesToGroupsOfNBF16(hwy::Span<float*> queries_ptrs, int qkv_dim,
+                                size_t group_size) {  // Fixes OutT = BF16.
+  return TransposeQueriesToGroupsOfNBF16orInt16<BF16>(queries_ptrs, qkv_dim,
+                                                      group_size);
+}
+
 std::pair<AlignedBF16Vector, std::vector<BF16*>>
 TransposeTransposedQueriesAndPackIntoBF16(hwy::Span<float*> queries_ptrs,
                                           int qkv_dim, int num_queries) {
diff --git a/gemma/tiled_attention.h b/gemma/tiled_attention.h
@@ -15,26 +15,36 @@
 namespace gcpp {
 
 // Passed to HWY_VISIT_TARGETS; declares for one target.
-#define GEMMA_DECL_TILED_ATTENTION(TARGET, NAMESPACE)                        \
-  namespace NAMESPACE {                                                      \
-  void TiledAttention(AttentionImpl attention_impl, size_t num_tokens,       \
-                      size_t layer_idx, const LayerWeightsPtrs& layer,       \
-                      AttentionActivationsPtrs& activations, QBatch& qbatch, \
-                      MatMulEnv& env, int flags);                            \
-  void TransposeStridedQueries(hwy::Span<float*> queries, int qkv_dim,       \
-                               hwy::Span<float> transposed_queries);         \
-  void LocalAttentionForAllHeadsTokensAndBatch(                              \
-      AttentionImpl attention_impl, const size_t num_tokens,                 \
-      const size_t layer_idx, const LayerWeightsPtrs& layer,                 \
-      AttentionActivationsPtrs& activations, QBatch& qbatch,                 \
-      ThreadingContext& ctx);                                                \
-                                                                             \
-  template <typename OutT>                                                   \
-  std::tuple<std::vector<OutT, hwy::AlignedAllocator<OutT>>,                 \
-             std::vector<OutT*>, AlignedFloatVector>                         \
-  TransposeQueriesToGroupsOfNBF16orInt16(hwy::Span<float*> queries_ptrs,     \
-                                         int qkv_dim, size_t group_size);    \
-  /* NOLINTNEXTLINE(google-readability-namespace-comments) */                \
+#define GEMMA_DECL_TILED_ATTENTION(TARGET, NAMESPACE)                          \
+  namespace NAMESPACE {                                                        \
+  void TiledAttention(AttentionImpl attention_impl, size_t num_tokens,         \
+                      size_t layer_idx, const LayerWeightsPtrs& layer,         \
+                      AttentionActivationsPtrs& activations, QBatch& qbatch,   \
+                      MatMulEnv& env, int flags);                              \
+  void TransposeStridedQueries(hwy::Span<float*> queries, int qkv_dim,         \
+                               hwy::Span<float> transposed_queries);           \
+  void LocalAttentionForAllHeadsTokensAndBatch(                                \
+      AttentionImpl attention_impl, const size_t num_tokens,                   \
+      const size_t layer_idx, const LayerWeightsPtrs& layer,                   \
+      AttentionActivationsPtrs& activations, QBatch& qbatch,                   \
+      ThreadingContext& ctx);                                                  \
+  /* Generic form, templated on the output element type OutT. */               \
+  template <typename OutT>                                                     \
+  std::tuple<std::vector<OutT, hwy::AlignedAllocator<OutT>>,                   \
+             std::vector<OutT*>, AlignedFloatVector>                           \
+  TransposeQueriesToGroupsOfNBF16orInt16(hwy::Span<float*> queries_ptrs,       \
+                                         int qkv_dim, size_t group_size);      \
+  /* Non-template wrapper: TransposeQueriesToGroupsOfNBF16orInt16<int16_t>. */ \
+  std::tuple<std::vector<int16_t, hwy::AlignedAllocator<int16_t>>,             \
+             std::vector<int16_t*>, AlignedFloatVector>                        \
+  TransposeQueriesToGroupsOfNInt16(hwy::Span<float*> queries_ptrs,             \
+                                   int qkv_dim, size_t group_size);            \
+  /* Non-template wrapper: TransposeQueriesToGroupsOfNBF16orInt16<BF16>. */    \
+  std::tuple<std::vector<BF16, hwy::AlignedAllocator<BF16>>,                   \
+             std::vector<BF16*>, AlignedFloatVector>                           \
+  TransposeQueriesToGroupsOfNBF16(hwy::Span<float*> queries_ptrs, int qkv_dim, \
+                                  size_t group_size);                          \
+  /* NOLINTNEXTLINE(google-readability-namespace-comments) */                  \
   }  // namespace NAMESPACE
 
 // Function declarations for each SIMD target. Allows direct call from the