From 5ee172aa8446ce37bf9ac51e40ebdf9859c116a8 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 08:14:10 +0000 Subject: [PATCH 1/2] feat(pr-x2): soa_struct! gains #[soa(pad_to_lanes = N)] attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Worker B of PR-X2 per .claude/knowledge/pr-x2-design.md § "Worker decomposition" line 459. SIMD-staged kernels need each SoA field's underlying Vec to be a multiple of the lane width N so the consumer walks the buffer with one uniform N-lane loop — no scalar tail-case branch. Pre-PR-X2 callers achieved this by hand (W3-W6 GaussianBatch::with_capacity + eager-zero fill); this PR makes it declarative on the field. Macro surface: soa_struct! { pub struct Cells { #[soa(pad_to_lanes = 8)] pub palette: u8, pub label: u32, // unpadded } } let mut c = Cells::new(); c.push(7, 100); assert_eq!(c.len(), 1); // logical row count assert_eq!(c.palette.len(), 8); // physical, rounded to lane 8 assert_eq!(c.label.len(), 1); // unpadded: physical == logical Implementation: - Added optional `$(#[soa(pad_to_lanes = $pad:literal)])?` per field in the macro_rules! head — Rust 1.32+ optional-meta repetition. - Generated struct grows a private `_logical_len: usize` so `len()` / `is_empty()` return the **semantic** row count independent of any field's lane padding. - `push()` dispatches per-field through internal `@push_field` arms: • padded arm grows the Vec to `(logical+1).div_ceil(N)*N` filling with `<$ty as Default>::default()`, then writes the new value at `[logical]` • plain arm is the pre-PR-X2 `Vec::push` call The dispatch uses macro_rules! tt-munching with literal-token separators (`pad = $pad`) so a single repetition handles both shapes. - Compile-time guard: `const { assert!($pad > 0) }` inside the padded arm — `pad_to_lanes = 0` is rejected at expansion, not at runtime. - `with_capacity(cap)` reserves `cap` on each field but does NOT pre-pad; padding happens lazily on push (matches the original `with_capacity` semantics modulo the lane-tail). - `clear()` resets _logical_len + .clear() on each field. Re-pushing rebuilds padding from scratch. Breaking change: `len()` no longer mirrors `self..len()` after direct field mutation (e.g. `s.x.push(...)` bypasses `_logical_len`). The canonical entry point is the macro-generated `push`. Pre-existing `macro_public_visibility_passthrough` test updated to use `push`. New tests (`src/hpc/soa.rs`, 5 added): - pad_to_lanes_single_push_grows_to_lane — mixed cadence 8+16+none - pad_to_lanes_crosses_lane_boundary — 9 pushes against lane 8 - pad_to_lanes_clear_resets_both — clear() round-trips - pad_to_lanes_uniform_cadence — all-padded variant - pad_to_lanes_with_capacity_empty — empty state invariants Plus a `# Example — #[soa(pad_to_lanes = N)] field attribute` doctest on the `soa_struct!` macro itself. Verified: cargo test -p ndarray --lib hpc::soa 38 passed cargo test --doc -p ndarray hpc::soa 14 passed cargo fmt --check clean cargo clippy --features approx,serde,rayon -- -D warnings clean --- src/hpc/soa.rs | 250 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 231 insertions(+), 19 deletions(-) diff --git a/src/hpc/soa.rs b/src/hpc/soa.rs index 2392c10e..86f568fb 100644 --- a/src/hpc/soa.rs +++ b/src/hpc/soa.rs @@ -323,52 +323,119 @@ impl<'a, T, const N: usize> Iterator for SoaChunks<'a, T, N> { /// assert_eq!(b.means_y.as_slice(), &[2.0, 5.0]); /// assert_eq!(b.means_z.as_slice(), &[3.0, 6.0]); /// ``` +/// +/// # Example — `#[soa(pad_to_lanes = N)]` field attribute (PR-X2 Worker B) +/// +/// Tag a field with `#[soa(pad_to_lanes = N)]` to make `push` pad the +/// underlying `Vec` up to the next multiple of `N` (filling with +/// `Default::default()`). SIMD-staged kernels then walk the field with +/// one uniform N-lane loop — no tail-case branch. +/// +/// `len()` returns the **logical** row count (unchanged by padding); +/// `self..len()` returns the **physical** Vec length. The difference +/// is the lane-alignment tail. +/// +/// ``` +/// use ndarray::soa_struct; +/// +/// soa_struct! { +/// pub struct Cells { +/// #[soa(pad_to_lanes = 8)] +/// pub palette: u8, +/// pub label: u32, // unpadded +/// } +/// } +/// +/// let mut c = Cells::new(); +/// c.push(7, 100); +/// assert_eq!(c.len(), 1); // logical: 1 row +/// assert_eq!(c.palette.len(), 8); // physical: rounded up to lane 8 +/// assert_eq!(c.label.len(), 1); // unpadded: physical == logical +/// assert_eq!(c.palette[0], 7); +/// assert_eq!(c.palette[1..8], [0u8; 7]); // padded tail is Default::default() +/// ``` #[macro_export] macro_rules! soa_struct { ( $(#[$meta:meta])* $vis:vis struct $name:ident { - $($field_vis:vis $field:ident : $ty:ty),* $(,)? + $( + $(#[soa(pad_to_lanes = $pad:literal)])? + $field_vis:vis $field:ident : $ty:ty + ),* $(,)? } ) => { $(#[$meta])* $vis struct $name { - $($field_vis $field: ::std::vec::Vec<$ty>),* + $($field_vis $field: ::std::vec::Vec<$ty>,)* + /// Shared logical row count across all fields. Padded fields may + /// have `self..len() > _logical_len` after `push`. + /// Updated by `push` / `clear`; treat as private. + #[doc(hidden)] + _logical_len: usize, } impl $name { /// Construct an empty instance. pub fn new() -> Self { - Self { $($field: ::std::vec::Vec::new()),* } + Self { + $($field: ::std::vec::Vec::new(),)* + _logical_len: 0, + } } /// Construct with each field pre-allocated to `cap`. + /// + /// Padded fields per `#[soa(pad_to_lanes = N)]` get + /// `cap` worth of physical capacity, not `cap.div_ceil(N) * N` — + /// the lane padding happens lazily inside `push` so the up-front + /// reservation is a hint, not a hard size guarantee. pub fn with_capacity(cap: usize) -> Self { - Self { $($field: ::std::vec::Vec::with_capacity(cap)),* } + Self { + $($field: ::std::vec::Vec::with_capacity(cap),)* + _logical_len: 0, + } } /// Append one row across all fields. + /// + /// For fields tagged `#[soa(pad_to_lanes = N)]`, the underlying + /// `Vec` is padded with `<$ty as Default>::default()` up to the + /// next multiple of `N` before the new value is written. Padded + /// elements occupy slots `[_logical_len + 1 .. padded_len)` and + /// are guaranteed to compare equal to `Default::default()`. #[allow(clippy::too_many_arguments)] pub fn push(&mut self, $($field: $ty),*) { - $(self.$field.push($field);)* + let logical = self._logical_len; + $( + $crate::soa_struct!(@push_field + self, $field, $field, $ty, logical + $(, pad = $pad)? + ); + )* + self._logical_len = logical + 1; } - /// Length (all fields share this length; debug-asserted). + /// Logical row count (shared across all fields). + /// + /// For padded fields this may be **less than** `self..len()`; + /// the difference is the lane-alignment tail. Use `len()` for the + /// semantic count, `self..len()` for the physical Vec length. pub fn len(&self) -> usize { - let lens = [$(self.$field.len()),*]; - debug_assert!( - lens.iter().all(|&l| l == lens[0]), - concat!(stringify!($name), ": field-length invariant violated") - ); - lens[0] + self._logical_len } - /// Returns `true` if there are zero rows. - pub fn is_empty(&self) -> bool { self.len() == 0 } + /// Returns `true` if there are zero logical rows. + pub fn is_empty(&self) -> bool { self._logical_len == 0 } - /// Clear all fields. Capacity is retained. + /// Clear all fields. Capacity is retained; logical length resets to 0. + /// + /// Padded fields' physical `Vec`s are cleared along with the + /// unpadded ones — re-pushing into a cleared struct rebuilds the + /// padding from scratch. pub fn clear(&mut self) { $(self.$field.clear();)* + self._logical_len = 0; } } @@ -376,6 +443,25 @@ macro_rules! soa_struct { fn default() -> Self { Self::new() } } }; + + // Internal — padded field push: grow Vec to the next multiple of $pad + // with Default::default() before writing the new value at `logical`. + (@push_field $self:ident, $vec:ident, $val:ident, $ty:ty, $logical:ident, pad = $pad:literal) => {{ + const _: () = { + // Compile-time guard: pad_to_lanes = 0 is nonsensical. + assert!($pad > 0, "soa_struct! #[soa(pad_to_lanes = N)] requires N > 0"); + }; + let needed = ($logical + 1).div_ceil($pad) * $pad; + while $self.$vec.len() < needed { + $self.$vec.push(<$ty as ::std::default::Default>::default()); + } + $self.$vec[$logical] = $val; + }}; + + // Internal — plain (unpadded) field push. + (@push_field $self:ident, $vec:ident, $val:ident, $ty:ty, $logical:ident) => {{ + $self.$vec.push($val); + }}; } /// Deinterleave an AoS slice into a [`SoaVec`] by extracting `N` @@ -791,12 +877,16 @@ mod tests { #[test] fn macro_public_visibility_passthrough() { // Soa3 has `pub` fields; verify the field is accessible - // (compilation alone proves visibility). + // (compilation alone proves visibility). Use the macro-generated + // `push` (canonical entry point) so `_logical_len` stays in sync; + // direct `s.x.push(...)` would bypass `_logical_len` since PR-X2 B. let mut s = Soa3::new(); - s.x.push(1.0); - s.y.push(2.0); - s.z.push(3.0); + s.push(1.0, 2.0, 3.0); assert_eq!(s.len(), 1); + // Field access works (visibility test): + assert_eq!(s.x.as_slice(), &[1.0]); + assert_eq!(s.y.as_slice(), &[2.0]); + assert_eq!(s.z.as_slice(), &[3.0]); } #[test] @@ -994,6 +1084,128 @@ mod tests { assert_eq!(back, aos); } + // ------------------------------------------------------------------ + // PR-X2 Worker B — `#[soa(pad_to_lanes = N)]` field attribute + // ------------------------------------------------------------------ + + soa_struct! { + /// 3-field SoA with two padded fields at different lane widths and + /// one unpadded field. Exercises the mixed-cadence macro arm. + pub struct PadMixed { + #[soa(pad_to_lanes = 8)] + pub palette: u8, + #[soa(pad_to_lanes = 16)] + pub depth: u16, + pub label: u32, + } + } + + /// Single push into a `pad_to_lanes = 8` field rounds the physical Vec + /// up to 8 elements; logical len is 1. + #[test] + fn pad_to_lanes_single_push_grows_to_lane() { + let mut s = PadMixed::new(); + s.push(7u8, 0x1234u16, 99u32); + assert_eq!(s.len(), 1, "logical len = 1"); + assert_eq!(s.palette.len(), 8, "palette padded to lane 8"); + assert_eq!(s.depth.len(), 16, "depth padded to lane 16"); + assert_eq!(s.label.len(), 1, "label unpadded — physical = logical"); + assert_eq!(s.palette[0], 7); + assert_eq!(s.depth[0], 0x1234); + assert_eq!(s.label[0], 99); + // Padded tail is Default::default(). + for &b in &s.palette[1..8] { + assert_eq!(b, 0u8); + } + for &d in &s.depth[1..16] { + assert_eq!(d, 0u16); + } + } + + /// Crossing a lane boundary on a padded field grows the Vec by another N. + #[test] + fn pad_to_lanes_crosses_lane_boundary() { + let mut s = PadMixed::new(); + for i in 0..9u8 { + s.push(i, i as u16, i as u32); + } + assert_eq!(s.len(), 9); + // palette: 9 pushes → next multiple of 8 is 16 + assert_eq!(s.palette.len(), 16); + // depth: 9 pushes → still inside lane 16 + assert_eq!(s.depth.len(), 16); + // label: unpadded + assert_eq!(s.label.len(), 9); + // first 9 slots carry user values + for i in 0..9 { + assert_eq!(s.palette[i], i as u8); + assert_eq!(s.depth[i], i as u16); + assert_eq!(s.label[i], i as u32); + } + // tail is default-zeroed + for &b in &s.palette[9..16] { + assert_eq!(b, 0u8); + } + } + + /// `clear()` resets logical_len and clears physical Vecs. + #[test] + fn pad_to_lanes_clear_resets_both() { + let mut s = PadMixed::new(); + s.push(1, 2, 3); + s.push(4, 5, 6); + assert_eq!(s.len(), 2); + s.clear(); + assert_eq!(s.len(), 0); + assert!(s.is_empty()); + assert_eq!(s.palette.len(), 0); + assert_eq!(s.depth.len(), 0); + assert_eq!(s.label.len(), 0); + // Reuse after clear works — padding rebuilds from scratch. + s.push(99, 0xFFFF, 7); + assert_eq!(s.len(), 1); + assert_eq!(s.palette.len(), 8); + assert_eq!(s.depth.len(), 16); + } + + soa_struct! { + /// All-padded variant — every field gets the same lane width. + pub struct PadUniform { + #[soa(pad_to_lanes = 4)] + pub a: i32, + #[soa(pad_to_lanes = 4)] + pub b: i32, + } + } + + /// All-padded struct: every field grows in sync with the lane cadence. + #[test] + fn pad_to_lanes_uniform_cadence() { + let mut s = PadUniform::new(); + s.push(10, 20); + s.push(30, 40); + s.push(50, 60); + assert_eq!(s.len(), 3); + // 3 pushes → next multiple of 4 is 4 + assert_eq!(s.a.len(), 4); + assert_eq!(s.b.len(), 4); + assert_eq!(s.a[0..3], [10, 30, 50]); + assert_eq!(s.b[0..3], [20, 40, 60]); + assert_eq!(s.a[3], 0); + assert_eq!(s.b[3], 0); + } + + /// `with_capacity` initialises an empty padded struct correctly. + #[test] + fn pad_to_lanes_with_capacity_empty() { + let s = PadMixed::with_capacity(64); + assert_eq!(s.len(), 0); + assert!(s.is_empty()); + assert_eq!(s.palette.len(), 0); + assert_eq!(s.depth.len(), 0); + assert_eq!(s.label.len(), 0); + } + /// Inference-only entry: caller relies on closure return-type ascription, /// no turbofish at all. #[test] From 9d492db536fa8a7c8b88b8c6651f329a1f48e160 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 08:30:30 +0000 Subject: [PATCH 2/2] fix(pr-x2): split soa_struct! into unpadded + padded arms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 codex review on PR #169: unconditional `_logical_len` injection turned previously all-public generated structs into structs with a private field, breaking downstream code that constructs them via struct literals (`MyBatch { a: vec![], b: vec![] }`) and exhaustive pattern matches — even for callers who never use `#[soa(pad_to_lanes = N)]`. Fix: split `soa_struct!` into two arms. Arm 1 — unpadded (no `#[soa(...)]` anywhere): byte-for-byte the pre-PR-X2 emit. No `_logical_len` field. `len()` reads field lengths under `debug_assert`. Struct-literal construction works exactly as before. macro_rules! tries this arm first; if any field carries a `#[soa(pad_to_lanes = N)]` attribute, the no-attribute pattern fails to match and the macro falls through to arm 2. Arm 2 — padded (at least one `#[soa(...)]`): the new PR-X2 B emit with `_logical_len` + lane-padded `push`. Reached only when the user opted in by tagging a field — struct-literal construction was never going to work for these anyway because the user can't fill the lane-padded `Vec` slots themselves. The pre-existing `macro_public_visibility_passthrough` test is restored to its original form (direct `s.x.push(...)` on the unpadded `Soa3`) since arm 1 is byte-identical to pre-PR-X2. Verified: cargo test -p ndarray --lib hpc::soa 38 passed cargo test --doc -p ndarray hpc::soa 14 passed cargo fmt --check clean cargo clippy --features approx,serde,rayon -- -D warnings clean Resolves PR #169 P1 review thread r3272307622. --- src/hpc/soa.rs | 89 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 9 deletions(-) diff --git a/src/hpc/soa.rs b/src/hpc/soa.rs index 86f568fb..e5158f53 100644 --- a/src/hpc/soa.rs +++ b/src/hpc/soa.rs @@ -356,6 +356,74 @@ impl<'a, T, const N: usize> Iterator for SoaChunks<'a, T, N> { /// ``` #[macro_export] macro_rules! soa_struct { + // ─────────────────────────────────────────────────────────────────── + // Arm 1 — unpadded (no `#[soa(...)]` attribute on any field). + // This is byte-for-byte the pre-PR-X2 emit: no `_logical_len` field, + // `len()` reads from field lengths under `debug_assert`. Existing + // callers (struct-literal construction, exhaustive patterns) are + // unaffected. macro_rules! tries this arm first; if any field has + // a `#[soa(pad_to_lanes = N)]` attribute the pattern fails to match + // and arm 2 is tried. + // ─────────────────────────────────────────────────────────────────── + ( + $(#[$meta:meta])* + $vis:vis struct $name:ident { + $($field_vis:vis $field:ident : $ty:ty),* $(,)? + } + ) => { + $(#[$meta])* + $vis struct $name { + $($field_vis $field: ::std::vec::Vec<$ty>),* + } + + impl $name { + /// Construct an empty instance. + pub fn new() -> Self { + Self { $($field: ::std::vec::Vec::new()),* } + } + + /// Construct with each field pre-allocated to `cap`. + pub fn with_capacity(cap: usize) -> Self { + Self { $($field: ::std::vec::Vec::with_capacity(cap)),* } + } + + /// Append one row across all fields. + #[allow(clippy::too_many_arguments)] + pub fn push(&mut self, $($field: $ty),*) { + $(self.$field.push($field);)* + } + + /// Length (all fields share this length; debug-asserted). + pub fn len(&self) -> usize { + let lens = [$(self.$field.len()),*]; + debug_assert!( + lens.iter().all(|&l| l == lens[0]), + concat!(stringify!($name), ": field-length invariant violated") + ); + lens[0] + } + + /// Returns `true` if there are zero rows. + pub fn is_empty(&self) -> bool { self.len() == 0 } + + /// Clear all fields. Capacity is retained. + pub fn clear(&mut self) { + $(self.$field.clear();)* + } + } + + impl ::std::default::Default for $name { + fn default() -> Self { Self::new() } + } + }; + + // ─────────────────────────────────────────────────────────────────── + // Arm 2 — padded (at least one field has `#[soa(pad_to_lanes = N)]`). + // Adds a `#[doc(hidden)] _logical_len: usize` field so `len()` can + // return the semantic row count independent of lane-tail padding. + // Reached only when arm 1's no-attribute pattern fails to match — + // existing callers without padding never see this struct shape. + // ─────────────────────────────────────────────────────────────────── ( $(#[$meta:meta])* $vis:vis struct $name:ident { @@ -371,6 +439,10 @@ macro_rules! soa_struct { /// Shared logical row count across all fields. Padded fields may /// have `self..len() > _logical_len` after `push`. /// Updated by `push` / `clear`; treat as private. + /// + /// Only present on padded structs (at least one field has + /// `#[soa(pad_to_lanes = N)]`); unpadded structs keep the + /// pre-PR-X2 all-public shape. #[doc(hidden)] _logical_len: usize, } @@ -458,7 +530,8 @@ macro_rules! soa_struct { $self.$vec[$logical] = $val; }}; - // Internal — plain (unpadded) field push. + // Internal — plain (unpadded) field push inside a padded struct + // (mixed cadence: some fields padded, others not). (@push_field $self:ident, $vec:ident, $val:ident, $ty:ty, $logical:ident) => {{ $self.$vec.push($val); }}; @@ -877,16 +950,14 @@ mod tests { #[test] fn macro_public_visibility_passthrough() { // Soa3 has `pub` fields; verify the field is accessible - // (compilation alone proves visibility). Use the macro-generated - // `push` (canonical entry point) so `_logical_len` stays in sync; - // direct `s.x.push(...)` would bypass `_logical_len` since PR-X2 B. + // (compilation alone proves visibility). Soa3 is unpadded → uses + // arm 1 of the macro → fields drive `len()` directly, so pushing + // into individual fields still gives the right count. let mut s = Soa3::new(); - s.push(1.0, 2.0, 3.0); + s.x.push(1.0); + s.y.push(2.0); + s.z.push(3.0); assert_eq!(s.len(), 1); - // Field access works (visibility test): - assert_eq!(s.x.as_slice(), &[1.0]); - assert_eq!(s.y.as_slice(), &[2.0]); - assert_eq!(s.z.as_slice(), &[3.0]); } #[test]