diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs
index 87a165c..aec8dd8 100644
--- a/crates/synth-synthesis/src/instruction_selector.rs
+++ b/crates/synth-synthesis/src/instruction_selector.rs
@@ -5758,6 +5758,336 @@ impl InstructionSelector {
                 stack.push(dst);
             }
+
+            // ============================================================
+            // i64 comparisons (binary: pop 2 i64 pairs, push 1 i32 result)
+            //
+            // Issue #103: previously these fell through to `select_default`,
+            // which hardcodes the operand pairs at R0:R1 / R2:R3 and the
+            // result at R0 — clobbering any AAPCS param register the user
+            // hasn't read yet via `LocalGet`. The fix is to pop the actual
+            // register pairs the stack tracker assigned to the operands and
+            // allocate a result register with `alloc_temp_safe`, which
+            // already skips live stack values.
+            //
+            // Same class as PR #86's i64-const fix in `optimizer_bridge`,
+            // applied here to every i64 op that hardcoded R0..R3.
+            // ============================================================
+            I64Eq | I64Ne | I64LtS | I64LtU | I64LeS | I64LeU | I64GtS | I64GtU | I64GeS
+            | I64GeU => {
+                let b_lo = stack.pop().ok_or_else(|| {
+                    synth_core::Error::synthesis(
+                        "stack underflow in i64 comparison".to_string(),
+                    )
+                })?;
+                let a_lo = stack.pop().ok_or_else(|| {
+                    synth_core::Error::synthesis(
+                        "stack underflow in i64 comparison".to_string(),
+                    )
+                })?;
+                let b_hi = i64_pair_hi(b_lo)?;
+                let a_hi = i64_pair_hi(a_lo)?;
+                // Result is a single i32. alloc_temp_safe avoids any reg
+                // still on the wasm stack, but the popped operand halves
+                // are NO LONGER on the stack — they may be reused by the
+                // allocator. That is fine for I64SetCond which encodes to
+                // a sequence that reads all four operand halves before
+                // writing rd (see arm_encoder; the CMP chain is fully
+                // resolved before SetCond writes the byte).
+                let dst = if idx == wasm_ops.len() - 1 {
+                    Reg::R0
+                } else {
+                    alloc_temp_safe(&mut next_temp, &stack)?
+                };
+                let cond = match op {
+                    I64Eq => Condition::EQ,
+                    I64Ne => Condition::NE,
+                    I64LtS => Condition::LT,
+                    I64LtU => Condition::LO,
+                    I64LeS => Condition::LE,
+                    I64LeU => Condition::LS,
+                    I64GtS => Condition::GT,
+                    I64GtU => Condition::HI,
+                    I64GeS => Condition::GE,
+                    I64GeU => Condition::HS,
+                    _ => unreachable!(),
+                };
+                instructions.push(ArmInstruction {
+                    op: ArmOp::I64SetCond {
+                        rd: dst,
+                        rn_lo: a_lo,
+                        rn_hi: a_hi,
+                        rm_lo: b_lo,
+                        rm_hi: b_hi,
+                        cond,
+                    },
+                    source_line: Some(idx),
+                });
+                cf.add_instruction();
+                stack.push(dst);
+            }
+
+            // ============================================================
+            // i64 multiply (binary: pop 2 i64 pairs, push 1 i64 pair)
+            //
+            // Issue #103: was hardcoding R0:R1 (operands and result low),
+            // R2:R3 (second operand). Now uses the stack-tracked pairs
+            // and a fresh consecutive pair for the destination.
+            // ============================================================
+            I64Mul => {
+                let b_lo = stack.pop().ok_or_else(|| {
+                    synth_core::Error::synthesis("stack underflow in I64Mul".to_string())
+                })?;
+                let a_lo = stack.pop().ok_or_else(|| {
+                    synth_core::Error::synthesis("stack underflow in I64Mul".to_string())
+                })?;
+                let b_hi = i64_pair_hi(b_lo)?;
+                let a_hi = i64_pair_hi(a_lo)?;
+                // I64Mul encodes to UMULL + MLA cross products: rd_lo/rd_hi
+                // are written, and ALL four operand halves are read. dst
+                // must not overlap any operand half before the encoded
+                // sequence reads it.
+                let (dst_lo, dst_hi) =
+                    alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?;
+                instructions.push(ArmInstruction {
+                    op: ArmOp::I64Mul {
+                        rd_lo: dst_lo,
+                        rd_hi: dst_hi,
+                        rn_lo: a_lo,
+                        rn_hi: a_hi,
+                        rm_lo: b_lo,
+                        rm_hi: b_hi,
+                    },
+                    source_line: Some(idx),
+                });
+                cf.add_instruction();
+                stack.push(dst_lo);
+            }
+
+            // ============================================================
+            // i64 divide / remainder (binary: pop 2 i64 pairs, push 1 pair)
+            //
+            // Issue #103: was hardcoding R0:R1 / R2:R3. The encoded
+            // sequence for these ops is a libcall-style helper that
+            // reads/writes the operand and result registers — using the
+            // stack-tracked pairs keeps AAPCS params intact.
+            // ============================================================
+            I64DivS | I64DivU | I64RemS | I64RemU => {
+                let b_lo = stack.pop().ok_or_else(|| {
+                    synth_core::Error::synthesis("stack underflow in i64 div/rem".to_string())
+                })?;
+                let a_lo = stack.pop().ok_or_else(|| {
+                    synth_core::Error::synthesis("stack underflow in i64 div/rem".to_string())
+                })?;
+                let b_hi = i64_pair_hi(b_lo)?;
+                let a_hi = i64_pair_hi(a_lo)?;
+                let (dst_lo, dst_hi) =
+                    alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?;
+                let arm_op = match op {
+                    I64DivS => ArmOp::I64DivS {
+                        rdlo: dst_lo,
+                        rdhi: dst_hi,
+                        rnlo: a_lo,
+                        rnhi: a_hi,
+                        rmlo: b_lo,
+                        rmhi: b_hi,
+                    },
+                    I64DivU => ArmOp::I64DivU {
+                        rdlo: dst_lo,
+                        rdhi: dst_hi,
+                        rnlo: a_lo,
+                        rnhi: a_hi,
+                        rmlo: b_lo,
+                        rmhi: b_hi,
+                    },
+                    I64RemS => ArmOp::I64RemS {
+                        rdlo: dst_lo,
+                        rdhi: dst_hi,
+                        rnlo: a_lo,
+                        rnhi: a_hi,
+                        rmlo: b_lo,
+                        rmhi: b_hi,
+                    },
+                    I64RemU => ArmOp::I64RemU {
+                        rdlo: dst_lo,
+                        rdhi: dst_hi,
+                        rnlo: a_lo,
+                        rnhi: a_hi,
+                        rmlo: b_lo,
+                        rmhi: b_hi,
+                    },
+                    _ => unreachable!(),
+                };
+                instructions.push(ArmInstruction {
+                    op: arm_op,
+                    source_line: Some(idx),
+                });
+                cf.add_instruction();
+                stack.push(dst_lo);
+            }
+
+            // ============================================================
+            // i64 rotations (binary: pop 2 i64 pairs, push 1 pair)
+            //
+            // Issue #103: was hardcoding R0:R1 / R2. ArmOp::I64Rotl/Rotr
+            // takes a SINGLE shift reg (the low half of the i64 shift
+            // amount) — WASM's i64.rotl takes an i64 shift amount, but the
+            // encoded helper only reads the low 32 bits, since the rotate
+            // amount is taken modulo 64.
+            // We pop both halves of `b` for stack correctness and pass
+            // b_lo as the shift reg, matching the pre-fix `select_default`
+            // contract (which assumed shift in R2).
+            // ============================================================
+            I64Rotl | I64Rotr => {
+                let b_lo = stack.pop().ok_or_else(|| {
+                    synth_core::Error::synthesis("stack underflow in i64 rotate".to_string())
+                })?;
+                let a_lo = stack.pop().ok_or_else(|| {
+                    synth_core::Error::synthesis("stack underflow in i64 rotate".to_string())
+                })?;
+                let b_hi = i64_pair_hi(b_lo)?;
+                let a_hi = i64_pair_hi(a_lo)?;
+                let (dst_lo, dst_hi) =
+                    alloc_consecutive_pair(&mut next_temp, &stack, &[a_lo, a_hi, b_lo, b_hi])?;
+                let arm_op = match op {
+                    I64Rotl => ArmOp::I64Rotl {
+                        rdlo: dst_lo,
+                        rdhi: dst_hi,
+                        rnlo: a_lo,
+                        rnhi: a_hi,
+                        shift: b_lo,
+                    },
+                    I64Rotr => ArmOp::I64Rotr {
+                        rdlo: dst_lo,
+                        rdhi: dst_hi,
+                        rnlo: a_lo,
+                        rnhi: a_hi,
+                        shift: b_lo,
+                    },
+                    _ => unreachable!(),
+                };
+                instructions.push(ArmInstruction {
+                    op: arm_op,
+                    source_line: Some(idx),
+                });
+                cf.add_instruction();
+                stack.push(dst_lo);
+            }
+
+            // ============================================================
+            // i64 unary bit ops (pop 1 i64 pair, push 1 i32 result)
+            //
+            // I64Clz / I64Ctz / I64Popcnt return a 32-bit count. Was
+            // hardcoding R0 (operand lo + result) and R1 (operand hi).
+            // ============================================================
+            I64Clz | I64Ctz | I64Popcnt => {
+                let src_lo = stack.pop().ok_or_else(|| {
+                    synth_core::Error::synthesis(
+                        "stack underflow in i64 unary bit op".to_string(),
+                    )
+                })?;
+                let src_hi = i64_pair_hi(src_lo)?;
+                let dst = if idx == wasm_ops.len() - 1 {
+                    Reg::R0
+                } else {
+                    alloc_temp_safe(&mut next_temp, &stack)?
+                };
+                let arm_op = match op {
+                    I64Clz => ArmOp::I64Clz {
+                        rd: dst,
+                        rnlo: src_lo,
+                        rnhi: src_hi,
+                    },
+                    I64Ctz => ArmOp::I64Ctz {
+                        rd: dst,
+                        rnlo: src_lo,
+                        rnhi: src_hi,
+                    },
+                    I64Popcnt => ArmOp::I64Popcnt {
+                        rd: dst,
+                        rnlo: src_lo,
+                        rnhi: src_hi,
+                    },
+                    _ => unreachable!(),
+                };
+                instructions.push(ArmInstruction {
+                    op: arm_op,
+                    source_line: Some(idx),
+                });
+                cf.add_instruction();
+                stack.push(dst);
+            }
+
+            // ============================================================
+            // i64 in-place sign extension (pop 1 i64 pair, push 1 pair)
+            //
+            // I64Extend{8,16,32}S take an i64 (the upper bits are
+            // ignored) and sign-extend the low N bits to 64. Was
+            // hardcoding R0:R1 for both operand and result.
+            // ============================================================
+            I64Extend8S | I64Extend16S | I64Extend32S => {
+                let src_lo = stack.pop().ok_or_else(|| {
+                    synth_core::Error::synthesis(
+                        "stack underflow in i64 sign-extend".to_string(),
+                    )
+                })?;
+                let _src_hi = i64_pair_hi(src_lo)?;
+                // dst must not overlap src_lo before the encoded sequence
+                // reads it (the encoder issues a SXTB/SXTH/MOV + ASR #31
+                // pattern that reads src_lo first then writes rdlo/rdhi).
+                let (dst_lo, dst_hi) =
+                    alloc_consecutive_pair(&mut next_temp, &stack, &[src_lo])?;
+                let arm_op = match op {
+                    I64Extend8S => ArmOp::I64Extend8S {
+                        rdlo: dst_lo,
+                        rdhi: dst_hi,
+                        rnlo: src_lo,
+                    },
+                    I64Extend16S => ArmOp::I64Extend16S {
+                        rdlo: dst_lo,
+                        rdhi: dst_hi,
+                        rnlo: src_lo,
+                    },
+                    I64Extend32S => ArmOp::I64Extend32S {
+                        rdlo: dst_lo,
+                        rdhi: dst_hi,
+                        rnlo: src_lo,
+                    },
+                    _ => unreachable!(),
+                };
+                instructions.push(ArmInstruction {
+                    op: arm_op,
+                    source_line: Some(idx),
+                });
+                cf.add_instruction();
+                stack.push(dst_lo);
+            }
+
+            // ============================================================
+            // i64 → i32 wrap (pop 1 i64 pair, push 1 i32)
+            //
+            // I32WrapI64 keeps the low half. Was hardcoding R0 for both
+            // operand low and result.
+            // ============================================================
+            I32WrapI64 => {
+                let src_lo = stack.pop().ok_or_else(|| {
+                    synth_core::Error::synthesis("stack underflow in I32WrapI64".to_string())
+                })?;
+                let _src_hi = i64_pair_hi(src_lo)?;
+                let dst = if idx == wasm_ops.len() - 1 {
+                    Reg::R0
+                } else {
+                    alloc_temp_safe(&mut next_temp, &stack)?
+                };
+                instructions.push(ArmInstruction {
+                    op: ArmOp::I32WrapI64 {
+                        rd: dst,
+                        rnlo: src_lo,
+                    },
+                    source_line: Some(idx),
+                });
+                cf.add_instruction();
+                stack.push(dst);
+            }
+
             // For other operations, fall back to default behavior.
             // Stack tracking is approximate after this point: select_default
             // uses its own register allocator and doesn't update the virtual stack.
diff --git a/crates/synth-synthesis/tests/issue_103_i64_aapcs.rs b/crates/synth-synthesis/tests/issue_103_i64_aapcs.rs
new file mode 100644
index 0000000..2fa8c24
--- /dev/null
+++ b/crates/synth-synthesis/tests/issue_103_i64_aapcs.rs
@@ -0,0 +1,385 @@
+//! Regression test for issue #103: i64 ops hardcoded R0..R3, clobbering AAPCS
+//! param registers before LocalGet reads them.
+//!
+//! The cargo-fuzz harness `i64_lowering_doesnt_clobber_params` (PR #100) caught
+//! this for `I64SetCond` with the crash signature:
+//!
+//! ```text
+//! AAPCS clobber: ARM instr at wasm line 1 writes param reg R0 before
+//! LocalGet(0) at line 4.
+//! Op: I64SetCond { rd: R0, rn_lo: R0, rn_hi: R1, rm_lo: R2, rm_hi: R3, cond: LT }
+//! ```
+//!
+//! Root cause: in `instruction_selector::select_with_stack` (the `--no-optimize`
+//! lowering path), the wildcard fallthrough handed every unhandled i64 op off
+//! to `select_default`, which hardcodes R0:R1 / R2:R3 for the operand pairs and
+//! R0 for the result. This is the same class of AAPCS-clobber bug that PR #86
+//! fixed for `I64Const` in the optimizer path — applied to every other i64 op
+//! that wasn't explicitly handled.
+//!
+//! Affected ops (all confirmed by audit):
+//! I64Eq, I64Ne, I64LtS, I64LtU, I64LeS, I64LeU, I64GtS, I64GtU, I64GeS, I64GeU
+//! I64Mul, I64DivS, I64DivU, I64RemS, I64RemU
+//! I64Rotl, I64Rotr
+//! I64Clz, I64Ctz, I64Popcnt
+//! I64Extend8S, I64Extend16S, I64Extend32S
+//! I32WrapI64
+//!
+//! The fix is to add explicit handlers in `select_with_stack` for each, using
+//! the stack-tracked operand pairs and a fresh `alloc_temp_safe` /
+//! `alloc_consecutive_pair` allocation for the destination.
+
+use synth_synthesis::{ArmInstruction, ArmOp, InstructionSelector, Reg, RuleDatabase, WasmOp};
+
+/// Compile the WASM op sequence through the `--no-optimize` lowering path
+/// (`select_with_stack`) and return the emitted ARM ops. This is the path the
+/// fuzz harness exercises.
+fn compile_no_optimize(wasm_ops: &[WasmOp], num_params: u32) -> Vec<ArmInstruction> {
+    let db = RuleDatabase::new();
+    let mut selector = InstructionSelector::new(db.rules().to_vec());
+    selector
+        .select_with_stack(wasm_ops, num_params)
+        .expect("select_with_stack should succeed for valid input")
+}
+
+/// Get the (write_set, read_set) of an ArmOp at the granularity needed for the
+/// AAPCS-clobber check. write_set = registers the op writes; read_set =
+/// registers the op reads. For multi-register pseudo-ops we conservatively
+/// include all halves.
+fn rw_sets(op: &ArmOp) -> (Vec<Reg>, Vec<Reg>) {
+    match op {
+        ArmOp::I64SetCond {
+            rd,
+            rn_lo,
+            rn_hi,
+            rm_lo,
+            rm_hi,
+            ..
+        } => (vec![*rd], vec![*rn_lo, *rn_hi, *rm_lo, *rm_hi]),
+        ArmOp::I64SetCondZ { rd, rn_lo, rn_hi } => (vec![*rd], vec![*rn_lo, *rn_hi]),
+        ArmOp::I64Mul {
+            rd_lo,
+            rd_hi,
+            rn_lo,
+            rn_hi,
+            rm_lo,
+            rm_hi,
+        } => (vec![*rd_lo, *rd_hi], vec![*rn_lo, *rn_hi, *rm_lo, *rm_hi]),
+        ArmOp::I64DivS {
+            rdlo,
+            rdhi,
+            rnlo,
+            rnhi,
+            rmlo,
+            rmhi,
+        }
+        | ArmOp::I64DivU {
+            rdlo,
+            rdhi,
+            rnlo,
+            rnhi,
+            rmlo,
+            rmhi,
+        }
+        | ArmOp::I64RemS {
+            rdlo,
+            rdhi,
+            rnlo,
+            rnhi,
+            rmlo,
+            rmhi,
+        }
+        | ArmOp::I64RemU {
+            rdlo,
+            rdhi,
+            rnlo,
+            rnhi,
+            rmlo,
+            rmhi,
+        } => (vec![*rdlo, *rdhi], vec![*rnlo, *rnhi, *rmlo, *rmhi]),
+        ArmOp::I64Rotl {
+            rdlo,
+            rdhi,
+            rnlo,
+            rnhi,
+            shift,
+        }
+        | ArmOp::I64Rotr {
+            rdlo,
+            rdhi,
+            rnlo,
+            rnhi,
+            shift,
+        } => (vec![*rdlo, *rdhi], vec![*rnlo, *rnhi, *shift]),
+        ArmOp::I64Clz { rd, rnlo, rnhi }
+        | ArmOp::I64Ctz { rd, rnlo, rnhi }
+        | ArmOp::I64Popcnt { rd, rnlo, rnhi } => (vec![*rd], vec![*rnlo, *rnhi]),
+        ArmOp::I64Extend8S { rdlo, rdhi, rnlo }
+        | ArmOp::I64Extend16S { rdlo, rdhi, rnlo }
+        | ArmOp::I64Extend32S { rdlo, rdhi, rnlo } => (vec![*rdlo, *rdhi], vec![*rnlo]),
+        ArmOp::I32WrapI64 { rd, rnlo } => (vec![*rd], vec![*rnlo]),
+        // For ops we don't specifically model, we record no reads or writes
+        // at all. The harness's purpose is to catch I64-op clobber
+        // regressions; if other ops sneak in we miss them here, but they're
+        // outside this issue's scope.
+        _ => (vec![], vec![]),
+    }
+}
+
+/// Check that no ARM instruction emitted from `op_idx == 0..first_local_get`
+/// writes to any AAPCS param register R0..R{num_params-1} before that param
+/// is read.
+///
+/// This mirrors PR #100's `i64_lowering_doesnt_clobber_params` invariant:
+/// the prologue is allowed to push/pop param regs (preserving them), but no
+/// data-flow instruction may write a param reg before LocalGet has been
+/// emitted to read it.
+fn assert_no_param_clobber_before_localget(
+    wasm_ops: &[WasmOp],
+    arm: &[ArmInstruction],
+    num_params: u32,
+) {
+    let param_regs: Vec<Reg> = (0..num_params.min(4))
+        .map(|i| match i {
+            0 => Reg::R0,
+            1 => Reg::R1,
+            2 => Reg::R2,
+            3 => Reg::R3,
+            _ => unreachable!(),
+        })
+        .collect();
+
+    // For each param i, find the earliest WASM line that does LocalGet(i).
+    let mut earliest_read: std::collections::HashMap<u32, usize> =
+        std::collections::HashMap::new();
+    for (idx, op) in wasm_ops.iter().enumerate() {
+        if let WasmOp::LocalGet(p) = op
+            && *p < num_params
+        {
+            earliest_read.entry(*p).or_insert(idx);
+        }
+    }
+
+    // Walk every emitted ARM instruction. For each instruction whose
+    // `source_line` is earlier than the earliest LocalGet for some param,
+    // assert it does NOT write that param's reg.
+    for instr in arm {
+        let Some(wasm_line) = instr.source_line else {
+            // Prologue / epilogue ops without source_line are exempt.
+            continue;
+        };
+        let (writes, _reads) = rw_sets(&instr.op);
+        for (param_idx, &param_reg) in param_regs.iter().enumerate() {
+            let earliest = earliest_read.get(&(param_idx as u32)).copied();
+            // If the param is never read there is nothing to check: the
+            // invariant only flags writes that strictly precede a real
+            // LocalGet, so skip it.
+            let Some(earliest_read_line) = earliest else {
+                continue;
+            };
+            if wasm_line < earliest_read_line && writes.contains(&param_reg) {
+                panic!(
+                    "AAPCS clobber: ARM instr at wasm line {wasm_line} writes \
+                     param reg {:?} before LocalGet({param_idx}) at line {earliest_read_line}. \
+                     Op: {:?}",
+                    param_reg, instr.op
+                );
+            }
+        }
+    }
+}
+
+/// The exact reproducer from issue #103: an i64 LtS op runs before LocalGet(0)
+/// reads the first param. Pre-fix, `select_default` emits I64SetCond with
+/// `rd: R0, rn_lo: R0, rn_hi: R1, rm_lo: R2, rm_hi: R3`, clobbering R0.
+#[test]
+fn issue_103_i64_lt_s_does_not_clobber_r0() {
+    let wasm = vec![
+        WasmOp::I64Const(0),
+        WasmOp::I64Const(1),
+        WasmOp::I64LtS,
+        WasmOp::Drop,
+        WasmOp::LocalGet(0),
+        WasmOp::Drop,
+    ];
+    let arm = compile_no_optimize(&wasm, /*num_params=*/ 1);
+    assert_no_param_clobber_before_localget(&wasm, &arm, 1);
+}
+
+/// Class-level audit: every i64 op the fuzz harness flags should pass the
+/// no-clobber invariant. We run each in isolation with `num_params` from 1 to
+/// 4 (4 is the worst case — all of R0..R3 are reserved AAPCS params).
+#[test]
+fn issue_103_all_i64_ops_preserve_params() {
+    // Each entry: (label, wasm-op prefix that puts the op's operands on the
+    // stack and runs it). LocalGet(0) and Drop are appended below so we can
+    // prove the param was preserved.
+    //
+    // For binary i64 ops we push two i64 consts; for unary i64 ops one; for
+    // I32WrapI64 we push one i64 const.
+    let cases: Vec<(&str, Vec<WasmOp>)> = vec![
+        (
+            "I64Eq",
+            vec![WasmOp::I64Const(0), WasmOp::I64Const(0), WasmOp::I64Eq],
+        ),
+        (
+            "I64Ne",
+            vec![WasmOp::I64Const(0), WasmOp::I64Const(1), WasmOp::I64Ne],
+        ),
+        (
+            "I64LtS",
+            vec![WasmOp::I64Const(0), WasmOp::I64Const(1), WasmOp::I64LtS],
+        ),
+        (
+            "I64LtU",
+            vec![WasmOp::I64Const(0), WasmOp::I64Const(1), WasmOp::I64LtU],
+        ),
+        (
+            "I64LeS",
+            vec![WasmOp::I64Const(0), WasmOp::I64Const(1), WasmOp::I64LeS],
+        ),
+        (
+            "I64LeU",
+            vec![WasmOp::I64Const(0), WasmOp::I64Const(1), WasmOp::I64LeU],
+        ),
+        (
+            "I64GtS",
+            vec![WasmOp::I64Const(1), WasmOp::I64Const(0), WasmOp::I64GtS],
+        ),
+        (
+            "I64GtU",
+            vec![WasmOp::I64Const(1), WasmOp::I64Const(0), WasmOp::I64GtU],
+        ),
+        (
+            "I64GeS",
+            vec![WasmOp::I64Const(1), WasmOp::I64Const(0), WasmOp::I64GeS],
+        ),
+        (
+            "I64GeU",
+            vec![WasmOp::I64Const(1), WasmOp::I64Const(0), WasmOp::I64GeU],
+        ),
+        (
+            "I64Mul",
+            vec![WasmOp::I64Const(2), WasmOp::I64Const(3), WasmOp::I64Mul],
+        ),
+        (
+            "I64DivS",
+            vec![WasmOp::I64Const(10), WasmOp::I64Const(3), WasmOp::I64DivS],
+        ),
+        (
+            "I64DivU",
+            vec![WasmOp::I64Const(10), WasmOp::I64Const(3), WasmOp::I64DivU],
+        ),
+        (
+            "I64RemS",
+            vec![WasmOp::I64Const(10), WasmOp::I64Const(3), WasmOp::I64RemS],
+        ),
+        (
+            "I64RemU",
+            vec![WasmOp::I64Const(10), WasmOp::I64Const(3), WasmOp::I64RemU],
+        ),
+        (
+            "I64Rotl",
+            vec![
+                WasmOp::I64Const(0x123),
+                WasmOp::I64Const(4),
+                WasmOp::I64Rotl,
+            ],
+        ),
+        (
+            "I64Rotr",
+            vec![
+                WasmOp::I64Const(0x123),
+                WasmOp::I64Const(4),
+                WasmOp::I64Rotr,
+            ],
+        ),
+        ("I64Clz", vec![WasmOp::I64Const(0x123), WasmOp::I64Clz]),
+        ("I64Ctz", vec![WasmOp::I64Const(0x123), WasmOp::I64Ctz]),
+        (
+            "I64Popcnt",
+            vec![WasmOp::I64Const(0x123), WasmOp::I64Popcnt],
+        ),
+        (
+            "I64Extend8S",
+            vec![WasmOp::I64Const(0x7f), WasmOp::I64Extend8S],
+        ),
+        (
+            "I64Extend16S",
+            vec![WasmOp::I64Const(0x7fff), WasmOp::I64Extend16S],
+        ),
+        (
+            "I64Extend32S",
+            vec![WasmOp::I64Const(0x7fff_ffff), WasmOp::I64Extend32S],
+        ),
+        (
+            "I32WrapI64",
+            vec![WasmOp::I64Const(0xdead_beef), WasmOp::I32WrapI64],
+        ),
+    ];
+
+    for num_params in [1u32, 2, 3, 4] {
+        for (label, prefix) in &cases {
+            // Build a function:
+            //
+            //     ...case ops...    (computes the op's result, i32 or i64)
+            //     drop              (one Drop is enough; see below)
+            //     local.get 0       (would surface a clobber of R0)
+            //     drop
+            //
+            // We avoid trying to size Drop precisely — the select_with_stack
+            // path will pop whatever's on the wasm stack and stop. The
+            // AAPCS-clobber invariant only depends on what's BETWEEN line 0
+            // and the LocalGet line.
+            let mut wasm = prefix.clone();
+            // Drop the result (i32 or i64; select_with_stack's Drop pops a
+            // single stack slot, which covers either). Any items left on the
+            // virtual stack don't perturb the check, because they don't
+            // change which registers get written.
+            wasm.push(WasmOp::Drop);
+            // Read param 0 so a clobber of R0 would surface, then drop it.
+            wasm.push(WasmOp::LocalGet(0));
+            wasm.push(WasmOp::Drop);
+
+            let arm = compile_no_optimize(&wasm, num_params);
+            // Wrap in catch_unwind so a failure names the offending case and
+            // num_params instead of just the raw assert message.
+            let result = std::panic::catch_unwind(|| {
+                assert_no_param_clobber_before_localget(&wasm, &arm, num_params);
+            });
+            if let Err(payload) = result {
+                let msg = payload
+                    .downcast_ref::<String>()
+                    .cloned()
+                    .or_else(|| payload.downcast_ref::<&str>().map(|s| s.to_string()))
+                    .unwrap_or_else(|| "".to_string());
+                panic!("issue #103 regression for {label} with num_params={num_params}:\n{msg}");
+            }
+        }
+    }
+}
+
+/// Sanity check: the I64Eqz op (already handled correctly in select_with_stack
+/// before this fix, in the vicinity of PR #86's changes) still passes the
+/// invariant. Guards against accidentally regressing the working case.
+#[test]
+fn i64_eqz_still_preserves_params() {
+    let wasm = vec![
+        WasmOp::I64Const(42),
+        WasmOp::I64Eqz,
+        WasmOp::Drop,
+        WasmOp::LocalGet(0),
+        WasmOp::Drop,
+    ];
+    for num_params in [1u32, 4] {
+        let arm = compile_no_optimize(&wasm, num_params);
+        assert_no_param_clobber_before_localget(&wasm, &arm, num_params);
+    }
+}