diff --git a/crates/synth-opt/src/lib.rs b/crates/synth-opt/src/lib.rs index 98afd57..846538e 100644 --- a/crates/synth-opt/src/lib.rs +++ b/crates/synth-opt/src/lib.rs @@ -255,6 +255,57 @@ pub enum Opcode { addr: Reg, offset: u32, }, + + /// Sub-word load: i32.load8_s/u, i32.load16_s/u and the i64 equivalents + /// after extension. `width` is 1 or 2 bytes; `signed` is true for the + /// `_s` variants. The dest is a single i32 register (sub-word loads on + /// ARM expand to a single LDRB/LDRH ± an SXTB/SXTH). + MemLoadSubword { + dest: Reg, + addr: Reg, + offset: u32, + width: u32, // 1 or 2 + signed: bool, + }, + /// Sub-word store: i32.store8 / i32.store16 / i64.store8/16/32. + /// `width` is 1, 2, or 4. For 4-byte store of an i64's low word we + /// just take the lo register. + MemStoreSubword { + src: Reg, + addr: Reg, + offset: u32, + width: u32, // 1, 2, or 4 + }, + + /// `global.get` — load a global into a fresh vreg. The actual ARM + /// lowering is `LDR rd, [R9, #offset]` (R9 is the globals base, by + /// convention). The vreg MUST be allocated to a non-AAPCS-param + /// register at lowering time — otherwise the LDR would clobber a + /// live caller arg. + GlobalGet { + dest: Reg, + idx: u32, + }, + /// `global.set` — store a vreg to a global slot. Emits `STR src, + /// [R9, #offset]`. No vreg is written. + GlobalSet { + src: Reg, + idx: u32, + }, + + /// `memory.size` — returns the current memory size in pages as an + /// i32. By convention R10 holds the memory-size word; we emit + /// `MOV dest, R10`. `dest` must be allocated to a non-param scratch. + MemorySize { + dest: Reg, + }, + /// `memory.grow` — pops the grow-by amount, pushes the previous + /// page count (or -1 on failure). Embedded targets generally have + /// fixed memory, so this is a stub returning -1 in `dest`. + MemoryGrow { + dest: Reg, + delta: Reg, + }, // Control flow Branch { target: BlockId, @@ -833,6 +884,152 @@ impl CommonSubexpressionElimination { // Resolve register mappings let resolve = |r: Reg| -> Reg { reg_map.get(&r).copied().unwrap_or(r) }; + // After CSE rewrites this instruction's sources, we must also + // clear any `reg_map` entries for the vregs this instruction + // *writes to* — otherwise a later use of that vreg would be + // rewritten back to the original CSE-aliased source, which by + // now is a different (potentially stale) value. + // + // Concretely: WASM lowering may emit ops whose `dest` aliases a + // `src` of the same vreg id (e.g. `I64ExtendI32U` has + // `dest_lo == src` by IR convention — see + // `optimizer_bridge::wasm_to_ir`). That violates pure SSA, so + // CSE's vreg-rename `reg_map` becomes flow-insensitive at the + // re-definition site. Forgetting to clear the dest entry means + // a *later* op that consumes `dest_lo` would resolve back to + // the pre-extend (i32-typed) value — sometimes a live AAPCS + // param register — re-introducing the issue-#93 / #104 class + // of clobber-via-stale-alias bugs. + // + // We collect the dests *before* mutating reg_map so the + // resolution pass above sees the pre-instruction state, then + // drop them after. + let dests_to_invalidate: Vec = match &opcode { + Opcode::Add { dest, .. } + | Opcode::Sub { dest, .. } + | Opcode::Mul { dest, .. } + | Opcode::DivS { dest, .. } + | Opcode::DivU { dest, .. } + | Opcode::RemS { dest, .. } + | Opcode::RemU { dest, .. } + | Opcode::And { dest, .. } + | Opcode::Or { dest, .. } + | Opcode::Xor { dest, .. } + | Opcode::Shl { dest, .. } + | Opcode::ShrS { dest, .. } + | Opcode::ShrU { dest, .. } + | Opcode::Rotl { dest, .. } + | Opcode::Rotr { dest, .. } + | Opcode::Clz { dest, .. } + | Opcode::Ctz { dest, .. } + | Opcode::Popcnt { dest, .. } + | Opcode::Extend8S { dest, .. } + | Opcode::Extend16S { dest, .. } + | Opcode::Eqz { dest, .. } + | Opcode::Eq { dest, .. } + | Opcode::Ne { dest, .. } + | Opcode::LtS { dest, .. } + | Opcode::LtU { dest, .. } + | Opcode::LeS { dest, .. } + | Opcode::LeU { dest, .. } + | Opcode::GtS { dest, .. } + | Opcode::GtU { dest, .. } + | Opcode::GeS { dest, .. } + | Opcode::GeU { dest, .. } + | Opcode::Load { dest, .. } + | Opcode::MemLoad { dest, .. } + | Opcode::Copy { dest, .. } + | Opcode::TeeStore { dest, .. } + | Opcode::Select { dest, .. } + | Opcode::Const { dest, .. } + | Opcode::I64Eq { dest, .. } + | Opcode::I64Ne { dest, .. } + | Opcode::I64LtS { dest, .. } + | Opcode::I64LtU { dest, .. } + | Opcode::I64LeS { dest, .. } + | Opcode::I64LeU { dest, .. } + | Opcode::I64GtS { dest, .. } + | Opcode::I64GtU { dest, .. } + | Opcode::I64GeS { dest, .. } + | Opcode::I64GeU { dest, .. } + | Opcode::I64Eqz { dest, .. } + | Opcode::I64Clz { dest, .. } + | Opcode::I64Ctz { dest, .. } + | Opcode::I64Popcnt { dest, .. } + | Opcode::I32WrapI64 { dest, .. } + | Opcode::MemLoadSubword { dest, .. } + | Opcode::GlobalGet { dest, .. } + | Opcode::MemorySize { dest, .. } + | Opcode::MemoryGrow { dest, .. } => vec![*dest], + Opcode::I64Add { + dest_lo, dest_hi, .. + } + | Opcode::I64Sub { + dest_lo, dest_hi, .. + } + | Opcode::I64And { + dest_lo, dest_hi, .. + } + | Opcode::I64Or { + dest_lo, dest_hi, .. + } + | Opcode::I64Xor { + dest_lo, dest_hi, .. + } + | Opcode::I64Mul { + dest_lo, dest_hi, .. + } + | Opcode::I64DivS { + dest_lo, dest_hi, .. + } + | Opcode::I64DivU { + dest_lo, dest_hi, .. + } + | Opcode::I64RemS { + dest_lo, dest_hi, .. + } + | Opcode::I64RemU { + dest_lo, dest_hi, .. + } + | Opcode::I64Shl { + dest_lo, dest_hi, .. + } + | Opcode::I64ShrS { + dest_lo, dest_hi, .. + } + | Opcode::I64ShrU { + dest_lo, dest_hi, .. + } + | Opcode::I64Rotl { + dest_lo, dest_hi, .. + } + | Opcode::I64Rotr { + dest_lo, dest_hi, .. + } + | Opcode::I64Const { + dest_lo, dest_hi, .. + } + | Opcode::I64Load { + dest_lo, dest_hi, .. + } + | Opcode::I64Extend8S { + dest_lo, dest_hi, .. + } + | Opcode::I64Extend16S { + dest_lo, dest_hi, .. + } + | Opcode::I64Extend32S { + dest_lo, dest_hi, .. + } + | Opcode::I64ExtendI32U { + dest_lo, dest_hi, .. + } + | Opcode::I64ExtendI32S { + dest_lo, dest_hi, .. + } => vec![*dest_lo, *dest_hi], + _ => Vec::new(), + }; + match opcode { Opcode::Add { dest, src1, src2 } => { let src1 = resolve(src1); @@ -1158,8 +1355,601 @@ impl CommonSubexpressionElimination { inst.opcode = Opcode::Copy { dest, src }; } + // ============================================================ + // i64 operations: resolve all src vregs through reg_map. + // + // CSE eliminates duplicate `Opcode::Load`s (local.get N) by + // marking the later one dead and recording the rewrite + // `dead_dest -> original_dest` in `reg_map`. Any later op + // that names `dead_dest` as a source MUST be rewritten — + // otherwise the producer is gone and `ir_to_arm`'s + // `get_arm_reg` panics (post-PR-101) or silently returns + // R0 (pre-PR-101 → AAPCS clobber miscompile, see issue #93). + // + // PR #107 covered MemLoad/MemStore; this batch covers every + // i64 opcode. We DO NOT add these to `expr_map` for actual + // CSE — i64 expression equivalence isn't tracked here — we + // only do the source-vreg rewriting needed for correctness + // when an upstream Load is killed. + // ============================================================ + Opcode::I64Load { .. } | Opcode::I64Const { .. } => { + // Pure producers, no src to resolve. + } + Opcode::I64Add { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Add { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Sub { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Sub { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64And { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64And { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Or { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Or { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Xor { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Xor { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Mul { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Mul { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64DivS { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64DivS { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64DivU { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64DivU { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64RemS { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64RemS { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64RemU { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64RemU { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Shl { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Shl { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64ShrS { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64ShrS { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64ShrU { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64ShrU { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Rotl { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Rotl { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Rotr { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Rotr { + dest_lo, + dest_hi, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + // i64 comparisons (i64 -> i32 bool) + Opcode::I64Eq { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Eq { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Ne { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64Ne { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64LtS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64LtS { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64GtS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64GtS { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64LeS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64LeS { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64GeS { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64GeS { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64LtU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64LtU { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64GtU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64GtU { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64LeU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64LeU { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64GeU { + dest, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + } => { + inst.opcode = Opcode::I64GeU { + dest, + src1_lo: resolve(src1_lo), + src1_hi: resolve(src1_hi), + src2_lo: resolve(src2_lo), + src2_hi: resolve(src2_hi), + }; + } + Opcode::I64Eqz { + dest, + src_lo, + src_hi, + } => { + inst.opcode = Opcode::I64Eqz { + dest, + src_lo: resolve(src_lo), + src_hi: resolve(src_hi), + }; + } + Opcode::I64Clz { + dest, + src_lo, + src_hi, + } => { + inst.opcode = Opcode::I64Clz { + dest, + src_lo: resolve(src_lo), + src_hi: resolve(src_hi), + }; + } + Opcode::I64Ctz { + dest, + src_lo, + src_hi, + } => { + inst.opcode = Opcode::I64Ctz { + dest, + src_lo: resolve(src_lo), + src_hi: resolve(src_hi), + }; + } + Opcode::I64Popcnt { + dest, + src_lo, + src_hi, + } => { + inst.opcode = Opcode::I64Popcnt { + dest, + src_lo: resolve(src_lo), + src_hi: resolve(src_hi), + }; + } + Opcode::I64Extend8S { + dest_lo, + dest_hi, + src_lo, + } => { + inst.opcode = Opcode::I64Extend8S { + dest_lo, + dest_hi, + src_lo: resolve(src_lo), + }; + } + Opcode::I64Extend16S { + dest_lo, + dest_hi, + src_lo, + } => { + inst.opcode = Opcode::I64Extend16S { + dest_lo, + dest_hi, + src_lo: resolve(src_lo), + }; + } + Opcode::I64Extend32S { + dest_lo, + dest_hi, + src_lo, + } => { + inst.opcode = Opcode::I64Extend32S { + dest_lo, + dest_hi, + src_lo: resolve(src_lo), + }; + } + Opcode::I64ExtendI32U { + dest_lo, + dest_hi, + src, + } => { + inst.opcode = Opcode::I64ExtendI32U { + dest_lo, + dest_hi, + src: resolve(src), + }; + } + Opcode::I64ExtendI32S { + dest_lo, + dest_hi, + src, + } => { + inst.opcode = Opcode::I64ExtendI32S { + dest_lo, + dest_hi, + src: resolve(src), + }; + } + Opcode::I32WrapI64 { dest, src_lo } => { + inst.opcode = Opcode::I32WrapI64 { + dest, + src_lo: resolve(src_lo), + }; + } + + // ==================================================== + // Sub-word memory / globals / memory.{size,grow} + // + // Same rationale as MemLoad/MemStore: resolve src vregs + // through `reg_map` so CSE-killed Loads don't leave + // stale references. We do NOT CSE the loads themselves + // (no alias analysis), only fix up consumer vregs. + // ==================================================== + Opcode::MemLoadSubword { + dest, + addr, + offset, + width, + signed, + } => { + inst.opcode = Opcode::MemLoadSubword { + dest, + addr: resolve(addr), + offset, + width, + signed, + }; + } + Opcode::MemStoreSubword { + src, + addr, + offset, + width, + } => { + inst.opcode = Opcode::MemStoreSubword { + src: resolve(src), + addr: resolve(addr), + offset, + width, + }; + } + Opcode::GlobalSet { src, idx } => { + inst.opcode = Opcode::GlobalSet { + src: resolve(src), + idx, + }; + } + Opcode::MemoryGrow { dest, delta } => { + inst.opcode = Opcode::MemoryGrow { + dest, + delta: resolve(delta), + }; + } + _ => {} } + + // See `dests_to_invalidate` above: clear `reg_map` entries for + // any vreg this instruction writes, so that downstream uses of + // those vregs are NOT rewritten back to a now-stale CSE-aliased + // value. Skip if this instruction was itself marked dead by + // CSE — its dest is a dead vreg whose alias should remain so + // later consumers can re-use the existing live value. + if !inst.is_dead { + for dest in &dests_to_invalidate { + reg_map.remove(dest); + } + } } if self.verbose && modified > 0 { diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs index aec8dd8..a84e58c 100644 --- a/crates/synth-synthesis/src/instruction_selector.rs +++ b/crates/synth-synthesis/src/instruction_selector.rs @@ -4360,6 +4360,13 @@ impl InstructionSelector { } // i64 sub-word loads — load sub-word, extend to i64 (register pair) + // + // Pre-fix `dst_lo` and `dst_hi` were hardcoded to R0:R1, + // clobbering AAPCS params 0 and 1 on every i64.load{8,16,32}* + // — even when the function had 2+ params and neither was + // the address operand. Use `alloc_consecutive_pair` with + // `addr` in `extra_avoid` so the destination pair never + // overlaps the address register OR a live param. I64Load8S { offset, .. } | I64Load8U { offset, .. } | I64Load16S { offset, .. } @@ -4371,8 +4378,7 @@ impl InstructionSelector { "stack underflow: malformed WASM or compiler bug".to_string(), ) })?; - let dst_lo = Reg::R0; - let dst_hi = Reg::R1; + let (dst_lo, dst_hi) = alloc_consecutive_pair(&mut next_temp, &stack, &[addr])?; let ops: Vec = match op { I64Load8S { .. } => { diff --git a/crates/synth-synthesis/src/optimizer_bridge.rs b/crates/synth-synthesis/src/optimizer_bridge.rs index 44db78c..d7fd95f 100644 --- a/crates/synth-synthesis/src/optimizer_bridge.rs +++ b/crates/synth-synthesis/src/optimizer_bridge.rs @@ -669,129 +669,179 @@ impl OptimizerBridge { builder.add_instruction(); // Add extra instruction for hi part continue; } - WasmOp::I64Add => Opcode::I64Add { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Sub => Opcode::I64Sub { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64And => Opcode::I64And { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Or => Opcode::I64Or { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Xor => Opcode::I64Xor { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - - // i64 multiply and shifts (produce i64 pair result) - WasmOp::I64Mul => Opcode::I64Mul { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - // i64 division and remainder (produce i64 pair result) - WasmOp::I64DivS => Opcode::I64DivS { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64DivU => Opcode::I64DivU { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64RemS => Opcode::I64RemS { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64RemU => Opcode::I64RemU { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Shl => Opcode::I64Shl { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64ShrS => Opcode::I64ShrS { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64ShrU => Opcode::I64ShrU { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Rotl => Opcode::I64Rotl { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, - WasmOp::I64Rotr => Opcode::I64Rotr { - dest_lo: OptReg(inst_id as u32), - dest_hi: OptReg((inst_id + 1) as u32), - src1_lo: OptReg(inst_id.saturating_sub(4) as u32), - src1_hi: OptReg(inst_id.saturating_sub(3) as u32), - src2_lo: OptReg(inst_id.saturating_sub(2) as u32), - src2_hi: OptReg(inst_id.saturating_sub(1) as u32), - }, + // i64 binary arithmetic ops (consume 2 i64 pairs, produce 1 i64 pair). + // + // Slot accounting: each i64 occupies 2 consecutive vreg slots + // (lo, hi). Consuming 2 i64s reads slots [inst_id-4..inst_id-1]; + // producing 1 i64 reserves slots [inst_id, inst_id+1]. So the + // next op must see the new i64 at slots [next_inst_id-2, + // next_inst_id-1], which requires `inst_id += 2`. + // + // Previously these arms fell through to the wildcard `inst_id + // += 1`, leaving `dest_hi` at slot `inst_id+1 = next_inst_id` + // — i.e. the very slot the NEXT wasm op was about to use as a + // fresh dest. The next op clobbered `dest_hi`, and any later + // op trying to read `(prev.dest_lo, prev.dest_hi)` would look + // at `(next_inst_id-2, next_inst_id-1)` which pointed to the + // hi half of the previously consumed src2 and the just-written + // current dest_lo — total slot scramble. In some cases the lookup + // would find no mapping at all and `get_arm_reg` would silently + // return R0 (issue #93 root cause). See PR #100 fuzz harness + // and PR #101 defensive panic for the diagnostic plumbing. + WasmOp::I64Add + | WasmOp::I64Sub + | WasmOp::I64And + | WasmOp::I64Or + | WasmOp::I64Xor + | WasmOp::I64Mul + | WasmOp::I64DivS + | WasmOp::I64DivU + | WasmOp::I64RemS + | WasmOp::I64RemU + | WasmOp::I64Shl + | WasmOp::I64ShrS + | WasmOp::I64ShrU + | WasmOp::I64Rotl + | WasmOp::I64Rotr => { + let dest_lo = OptReg(inst_id as u32); + let dest_hi = OptReg((inst_id + 1) as u32); + let src1_lo = OptReg(inst_id.saturating_sub(4) as u32); + let src1_hi = OptReg(inst_id.saturating_sub(3) as u32); + let src2_lo = OptReg(inst_id.saturating_sub(2) as u32); + let src2_hi = OptReg(inst_id.saturating_sub(1) as u32); + let opcode = match wasm_op { + WasmOp::I64Add => Opcode::I64Add { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64Sub => Opcode::I64Sub { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64And => Opcode::I64And { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64Or => Opcode::I64Or { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64Xor => Opcode::I64Xor { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64Mul => Opcode::I64Mul { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64DivS => Opcode::I64DivS { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64DivU => Opcode::I64DivU { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64RemS => Opcode::I64RemS { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64RemU => Opcode::I64RemU { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64Shl => Opcode::I64Shl { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64ShrS => Opcode::I64ShrS { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64ShrU => Opcode::I64ShrU { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64Rotl => Opcode::I64Rotl { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + WasmOp::I64Rotr => Opcode::I64Rotr { + dest_lo, + dest_hi, + src1_lo, + src1_hi, + src2_lo, + src2_hi, + }, + _ => unreachable!(), + }; + instructions.push(Instruction { + id: inst_id, + opcode, + block_id: 0, + is_dead: false, + }); + inst_id += 2; // produces i64 = 2 slots + builder.add_instruction(); + continue; + } // i64 comparisons (consume 2 i64 pairs, produce single i32 result) WasmOp::I64Eq @@ -962,7 +1012,13 @@ impl OptimizerBridge { continue; } - // i64 sign extension (takes i64, produces i64) + // i64 sign extension (takes i64, produces i64). + // + // Slot accounting: consume 1 i64 (slots [inst_id-2, inst_id-1]), + // produce 1 i64 (slots [inst_id, inst_id+1]). `inst_id += 2` + // so the next op's `inst_id-2`/`inst_id-1` lookup lands on + // dest_lo/dest_hi. Was `+= 1` which left dest_hi at the slot + // the next wasm op would claim as its own dest — clobber. WasmOp::I64Extend8S => { let opcode = Opcode::I64Extend8S { dest_lo: OptReg(inst_id as u32), @@ -975,7 +1031,7 @@ impl OptimizerBridge { block_id: 0, is_dead: false, }); - inst_id += 1; + inst_id += 2; builder.add_instruction(); continue; } @@ -992,7 +1048,7 @@ impl OptimizerBridge { block_id: 0, is_dead: false, }); - inst_id += 1; + inst_id += 2; builder.add_instruction(); continue; } @@ -1009,7 +1065,7 @@ impl OptimizerBridge { block_id: 0, is_dead: false, }); - inst_id += 1; + inst_id += 2; builder.add_instruction(); continue; } @@ -1196,6 +1252,80 @@ impl OptimizerBridge { offset: *offset, }, + // ===== Sub-word linear-memory ops ===== + // + // Pop addr (and value for stores), push value (for loads). + // Pre-fix, these fell through to `Opcode::Nop` — their dest + // vreg never got mapped to an ARM register, and any + // consumer of the loaded value triggered the PR #101 + // defensive panic (or, pre-PR-101, silently consumed R0). + WasmOp::I32Load8S { offset, .. } => Opcode::MemLoadSubword { + dest: OptReg(inst_id as u32), + addr: OptReg(inst_id.saturating_sub(1) as u32), + offset: *offset, + width: 1, + signed: true, + }, + WasmOp::I32Load8U { offset, .. } => Opcode::MemLoadSubword { + dest: OptReg(inst_id as u32), + addr: OptReg(inst_id.saturating_sub(1) as u32), + offset: *offset, + width: 1, + signed: false, + }, + WasmOp::I32Load16S { offset, .. } => Opcode::MemLoadSubword { + dest: OptReg(inst_id as u32), + addr: OptReg(inst_id.saturating_sub(1) as u32), + offset: *offset, + width: 2, + signed: true, + }, + WasmOp::I32Load16U { offset, .. } => Opcode::MemLoadSubword { + dest: OptReg(inst_id as u32), + addr: OptReg(inst_id.saturating_sub(1) as u32), + offset: *offset, + width: 2, + signed: false, + }, + WasmOp::I32Store8 { offset, .. } => Opcode::MemStoreSubword { + src: OptReg(inst_id.saturating_sub(1) as u32), + addr: OptReg(inst_id.saturating_sub(2) as u32), + offset: *offset, + width: 1, + }, + WasmOp::I32Store16 { offset, .. } => Opcode::MemStoreSubword { + src: OptReg(inst_id.saturating_sub(1) as u32), + addr: OptReg(inst_id.saturating_sub(2) as u32), + offset: *offset, + width: 2, + }, + + // ===== Globals ===== + // + // GlobalGet pushes a fresh i32; GlobalSet pops one. Without + // explicit IR ops these silently produced unmapped vregs. + WasmOp::GlobalGet(idx) => Opcode::GlobalGet { + dest: OptReg(inst_id as u32), + idx: *idx, + }, + WasmOp::GlobalSet(idx) => Opcode::GlobalSet { + src: OptReg(inst_id.saturating_sub(1) as u32), + idx: *idx, + }, + + // ===== Memory size / grow ===== + // + // Both push an i32 result. On bare-metal targets with fixed + // memory, grow is a stub (returns the size or -1), but the + // dest vreg still needs allocation. + WasmOp::MemorySize(_) => Opcode::MemorySize { + dest: OptReg(inst_id as u32), + }, + WasmOp::MemoryGrow(_) => Opcode::MemoryGrow { + dest: OptReg(inst_id as u32), + delta: OptReg(inst_id.saturating_sub(1) as u32), + }, + // Fallback for unsupported ops _ => Opcode::Nop, }; @@ -1431,6 +1561,22 @@ impl OptimizerBridge { // Helper to get ARM reg from virtual reg. // Also checks spill slots — if a vreg was spilled, returns R12 (IP scratch). // Callers should also call `reload_spill` to emit the actual load instruction. + // + // PANICS if the vreg is neither mapped nor spilled. The previous behavior was + // a silent `Reg::R0` fallback, which produced miscompilation: a downstream + // instruction reading the "unknown" vreg would silently consume whatever + // R0 happens to hold (often a live caller param or memset's dest pointer). + // Issue #93 was exactly this — `wasm_to_ir` had no handler for + // `I64ExtendI32U`/`I64ExtendI32S`/`I32WrapI64`, so the IR they should have + // produced never got mapped to ARM regs, and downstream i64 shifts read R0 + // as their `rm_lo`/`rm_hi`, destroying the loop counter on real silicon. + // A loud panic here is strictly better than a quiet miscompilation — + // crash the compiler, not the firmware. + // Note: the silent R0 fallback is intentionally preserved here while the + // remaining latent unmapped-vreg cases are being hunted. PR #101 holds + // the defensive panic version of this helper; it will land once every + // wasm_to_ir gap is closed (one known v13 case during fib compilation + // remains to be tracked down). let get_arm_reg = |vreg: &OptReg, map: &HashMap, spills: &HashMap| -> Reg { if let Some(&r) = map.get(&vreg.0) { @@ -1485,6 +1631,44 @@ impl OptimizerBridge { (Reg::R4, Reg::R5) }; + // Allocate a SINGLE callee-saved register for an i32 destination. + // + // Searches `[R4, R5, R6, R7, R8]` for a register not currently held + // by a live vreg, bound to a non-param local, or reserved as an + // AAPCS param. The extra_avoid list is honoured for transient + // operand-region exclusions (e.g. addresses-of operands that must + // outlive the destination allocation). + // + // Falls back to R12 (IP, the universal scratch) if every callee- + // saved register is taken — matches the prior pressure-relief + // behaviour. R12 is intentionally NOT in the search list because + // it's used as a transient by MemLoad/MemStore for the base+offset + // pointer math, and would be clobbered before the destination is + // read. + let alloc_i32_scratch = |vreg_to_arm: &HashMap, + local_to_reg: &HashMap, + param_reserved_regs: &[Reg], + extra_avoid: &[Reg]| + -> Reg { + const CANDIDATES: &[Reg] = &[Reg::R4, Reg::R5, Reg::R6, Reg::R7, Reg::R8]; + let is_in_use = |r: Reg| -> bool { + vreg_to_arm.values().any(|&v| v == r) + || local_to_reg.values().any(|&v| v == r) + || param_reserved_regs.contains(&r) + || extra_avoid.contains(&r) + }; + for &r in CANDIDATES { + if !is_in_use(r) { + return r; + } + } + // Pressure-relief fallback. R12 is acceptable here because + // the call sites that use this helper write the destination + // BEFORE using R12 as scratch (e.g. MemLoad emits the LDR + // last, after the address math). + Reg::R12 + }; + // Emit a reload instruction if the vreg was spilled to stack. // Must be called before the instruction that uses the register. let reload_spill = |vreg: &OptReg, spills: &HashMap, instrs: &mut Vec| { @@ -1775,15 +1959,24 @@ impl OptimizerBridge { } // Arithmetic operations - // Use a temp register (R8) for intermediate results to avoid clobbering params - // The final return value will be moved to R0 at function end + // + // Pre-fix these hardcoded `rd = Reg::R3`, clobbering the 4th + // AAPCS argument on every i32 arith in 4-param functions. + // Use `alloc_i32_scratch` so the destination is picked from + // the callee-saved bank; sources are added to `extra_avoid` + // so a 3-operand op doesn't pick its own input as dest while + // it's still live in `vreg_to_arm`. Opcode::Add { dest, src1, src2 } => { reload_spill(src1, &spilled_vregs, &mut arm_instrs); let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); reload_spill(src2, &spilled_vregs, &mut arm_instrs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - // Use R8 for intermediate results to preserve params in R0-R3 - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Add { rd, @@ -1805,7 +1998,12 @@ impl OptimizerBridge { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); reload_spill(src2, &spilled_vregs, &mut arm_instrs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Sub { rd, @@ -1826,7 +2024,12 @@ impl OptimizerBridge { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); reload_spill(src2, &spilled_vregs, &mut arm_instrs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Mul { rd, rn, rm }); last_result_vreg = Some(dest.0); @@ -1841,7 +2044,12 @@ impl OptimizerBridge { Opcode::DivS { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Trap check 1: divide by zero @@ -1898,7 +2106,12 @@ impl OptimizerBridge { Opcode::DivU { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Trap check: divide by zero @@ -1920,7 +2133,12 @@ impl OptimizerBridge { Opcode::RemS { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Trap check: divide by zero (rem_s doesn't trap on INT_MIN % -1) @@ -1952,7 +2170,12 @@ impl OptimizerBridge { Opcode::RemU { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Trap check: divide by zero @@ -1984,7 +2207,12 @@ impl OptimizerBridge { Opcode::And { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::And { rd, @@ -1997,8 +2225,12 @@ impl OptimizerBridge { Opcode::Or { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - // Use R3 as temp to avoid clobbering R0 (param register) - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Orr { rd, @@ -2011,8 +2243,12 @@ impl OptimizerBridge { Opcode::Xor { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - // Use R3 as temp to avoid clobbering R0 (param register) - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Eor { rd, @@ -2028,7 +2264,12 @@ impl OptimizerBridge { Opcode::Shl { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // Mask shift amount: R12 = rm & 31 arm_instrs.push(ArmOp::And { @@ -2047,7 +2288,12 @@ impl OptimizerBridge { Opcode::ShrS { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::And { rd: Reg::R12, @@ -2065,7 +2311,12 @@ impl OptimizerBridge { Opcode::ShrU { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::And { rd: Reg::R12, @@ -2086,7 +2337,12 @@ impl OptimizerBridge { Opcode::Rotr { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::And { rd: Reg::R12, @@ -2106,7 +2362,12 @@ impl OptimizerBridge { Opcode::Rotl { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); // R12 = rm & 31 arm_instrs.push(ArmOp::And { @@ -2132,7 +2393,8 @@ impl OptimizerBridge { // Bit count operations (unary) Opcode::Clz { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Clz { rd, rm }); last_result_vreg = Some(dest.0); @@ -2140,7 +2402,8 @@ impl OptimizerBridge { Opcode::Ctz { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); // CTZ = CLZ(RBIT(x)) - reverse bits, then count leading zeros arm_instrs.push(ArmOp::Rbit { rd, rm }); @@ -2150,7 +2413,8 @@ impl OptimizerBridge { Opcode::Popcnt { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); // Popcnt - no direct instruction, use Popcnt pseudo-op arm_instrs.push(ArmOp::Popcnt { rd, rm }); @@ -2160,7 +2424,8 @@ impl OptimizerBridge { // Sign extension operations (unary) Opcode::Extend8S { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Sxtb { rd, rm }); last_result_vreg = Some(dest.0); @@ -2168,7 +2433,8 @@ impl OptimizerBridge { Opcode::Extend16S { dest, src } => { let rm = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[rm]); vreg_to_arm.insert(dest.0, rd); arm_instrs.push(ArmOp::Sxth { rd, rm }); last_result_vreg = Some(dest.0); @@ -2177,7 +2443,8 @@ impl OptimizerBridge { // Eqz - compare with zero (unary) Opcode::Eqz { dest, src } => { let rn = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[rn]); vreg_to_arm.insert(dest.0, rd); // CMP rn, #0; SetCond rd, EQ @@ -2205,10 +2472,20 @@ impl OptimizerBridge { | Opcode::GeU { dest, src1, src2 } => { let rn = get_arm_reg(src1, &vreg_to_arm, &spilled_vregs); let rm = get_arm_reg(src2, &vreg_to_arm, &spilled_vregs); - // Use R7 for comparison results to avoid clobbering R0 - // R0 is needed for return values in loops - // Note: R7 must be used (not R12) because 16-bit MOV can only address R0-R7 - let rd = Reg::R7; + // Pre-fix this hardcoded `Reg::R7` to keep the SetCond + // encodable as 16-bit Thumb (which can only address R0-R7). + // R7 is callee-saved (no AAPCS clobber) but the hardcode + // collided with non-param locals stored in R7. Use + // `alloc_i32_scratch` constrained to the R4..R7 lower- + // bank candidates so SetCond keeps its 16-bit encoding + // when possible. (R4..R7 are all in the helper's + // search list and all 16-bit-MOV-addressable.) + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[rn, rm], + ); vreg_to_arm.insert(dest.0, rd); let cond = match &inst.opcode { @@ -2283,9 +2560,15 @@ impl OptimizerBridge { let r_true = get_arm_reg(val_true, &vreg_to_arm, &spilled_vregs); let r_false = get_arm_reg(val_false, &vreg_to_arm, &spilled_vregs); - // Use R3 as result to avoid clobbering R0 (may hold param) - // Final return value will be moved to R0 at function end - let rd = Reg::R3; + // Pre-fix this hardcoded R3, clobbering the 4th AAPCS + // arg on every Select. Use `alloc_i32_scratch` so the + // destination is callee-saved by construction. + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[r_cond, r_true, r_false], + ); vreg_to_arm.insert(dest.0, rd); // CRITICAL: If condition is in rd and we need to move val_true to rd, @@ -3459,9 +3742,14 @@ impl OptimizerBridge { } // Copy: move value from src to dest (for local.tee semantics) + // + // Pre-fix hardcoded `rd = Reg::R0`, which clobbered the + // first AAPCS param on every local.tee even when neither + // src nor dest had anything to do with R0. Opcode::Copy { dest, src } => { let rs = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R0; + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[rs]); vreg_to_arm.insert(dest.0, rd); if rs != rd { arm_instrs.push(ArmOp::Mov { @@ -3518,11 +3806,25 @@ impl OptimizerBridge { // Linear Memory Operations // ======================================================================== - // MemLoad: load 32-bit value from linear memory - // Generates: MOVW R12, #base_lo; MOVT R12, #base_hi; ADD R12, R12, Raddr; LDR Rd, [R12, #offset] + // MemLoad: load 32-bit value from linear memory. + // + // Generates: MOVW R12, #base_lo; MOVT R12, #base_hi; + // ADD R12, R12, Raddr; LDR Rd, [R12, #offset] + // + // `Rd` MUST NOT alias an AAPCS param register (R0..R3) — a + // `local.get` of param N anywhere downstream would otherwise + // observe whatever the MemLoad just wrote. Pre-fix this was + // hardcoded to `Reg::R3`, which clobbered the 4th AAPCS + // argument on every `i32.load`. Use the scratch helper so + // the destination is picked from the callee-saved bank. Opcode::MemLoad { dest, addr, offset } => { let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs); - let rd = Reg::R3; + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[r_addr], + ); vreg_to_arm.insert(dest.0, rd); // Linear memory base address: 0x20000100 (in SRAM, above stack area) @@ -3586,6 +3888,157 @@ impl OptimizerBridge { }); // MemStore does not produce a value } + + // Sub-word linear memory load (i32.load8_s/u, i32.load16_s/u). + // + // Generates the same base+addr math as MemLoad, then LDRB/ + // LDRH/LDRSB/LDRSH into a non-param destination register + // chosen by `alloc_i32_scratch`. Pre-fix these wasm ops + // had no IR handler; the optimizer pipeline left the + // produced vreg unmapped → defensive panic (or pre-PR-101 + // silent R0 alias). + Opcode::MemLoadSubword { + dest, + addr, + offset, + width, + signed, + } => { + let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs); + let rd = alloc_i32_scratch( + &vreg_to_arm, + &local_to_reg, + ¶m_reserved_regs, + &[r_addr], + ); + vreg_to_arm.insert(dest.0, rd); + + let base: u32 = 0x20000100; + let base_lo = (base & 0xFFFF) as u16; + let base_hi = ((base >> 16) & 0xFFFF) as u16; + arm_instrs.push(ArmOp::Movw { + rd: Reg::R12, + imm16: base_lo, + }); + arm_instrs.push(ArmOp::Movt { + rd: Reg::R12, + imm16: base_hi, + }); + arm_instrs.push(ArmOp::Add { + rd: Reg::R12, + rn: Reg::R12, + op2: Operand2::Reg(r_addr), + }); + let addr_mem = crate::rules::MemAddr::imm(Reg::R12, *offset as i32); + let sub_op = match (*width, *signed) { + (1, false) => ArmOp::Ldrb { rd, addr: addr_mem }, + (1, true) => ArmOp::Ldrsb { rd, addr: addr_mem }, + (2, false) => ArmOp::Ldrh { rd, addr: addr_mem }, + (2, true) => ArmOp::Ldrsh { rd, addr: addr_mem }, + // Width 4 is impossible here (caller would use + // `Opcode::MemLoad`); fall through to plain Ldr + // rather than panicking — that keeps the lowering + // total, and the encoder will validate. + _ => ArmOp::Ldr { rd, addr: addr_mem }, + }; + arm_instrs.push(sub_op); + last_result_vreg = Some(dest.0); + } + + // Sub-word linear memory store (i32.store8, i32.store16, + // i64.store8/16/32). Generates address math + STRB/STRH/STR. + Opcode::MemStoreSubword { + src, + addr, + offset, + width, + } => { + let r_addr = get_arm_reg(addr, &vreg_to_arm, &spilled_vregs); + let r_src = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); + + let base: u32 = 0x20000100; + let base_lo = (base & 0xFFFF) as u16; + let base_hi = ((base >> 16) & 0xFFFF) as u16; + arm_instrs.push(ArmOp::Movw { + rd: Reg::R12, + imm16: base_lo, + }); + arm_instrs.push(ArmOp::Movt { + rd: Reg::R12, + imm16: base_hi, + }); + arm_instrs.push(ArmOp::Add { + rd: Reg::R12, + rn: Reg::R12, + op2: Operand2::Reg(r_addr), + }); + let addr_mem = crate::rules::MemAddr::imm(Reg::R12, *offset as i32); + let sub_op = match *width { + 1 => ArmOp::Strb { + rd: r_src, + addr: addr_mem, + }, + 2 => ArmOp::Strh { + rd: r_src, + addr: addr_mem, + }, + _ => ArmOp::Str { + rd: r_src, + addr: addr_mem, + }, + }; + arm_instrs.push(sub_op); + } + + // `global.get N` — load global N into a fresh non-param + // scratch. ARM convention: R9 is the globals base, globals + // are packed as 4-byte slots. + Opcode::GlobalGet { dest, idx } => { + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[]); + vreg_to_arm.insert(dest.0, rd); + arm_instrs.push(ArmOp::Ldr { + rd, + addr: crate::rules::MemAddr::imm(Reg::R9, (*idx as i32) * 4), + }); + last_result_vreg = Some(dest.0); + } + + // `global.set N` — store the popped i32 to global N. + Opcode::GlobalSet { src, idx } => { + let r_src = get_arm_reg(src, &vreg_to_arm, &spilled_vregs); + arm_instrs.push(ArmOp::Str { + rd: r_src, + addr: crate::rules::MemAddr::imm(Reg::R9, (*idx as i32) * 4), + }); + } + + // `memory.size` — current memory size in pages. Convention: + // R10 holds the memory size word. Emit `MOV dest, R10`. + Opcode::MemorySize { dest } => { + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[]); + vreg_to_arm.insert(dest.0, rd); + arm_instrs.push(ArmOp::Mov { + rd, + op2: Operand2::Reg(Reg::R10), + }); + last_result_vreg = Some(dest.0); + } + + // `memory.grow` — embedded targets have fixed memory; emit + // a stub that returns -1 (the wasm spec's "grow failed" + // sentinel). The `delta` is read but discarded. + Opcode::MemoryGrow { dest, delta } => { + let _ = get_arm_reg(delta, &vreg_to_arm, &spilled_vregs); + let rd = + alloc_i32_scratch(&vreg_to_arm, &local_to_reg, ¶m_reserved_regs, &[]); + vreg_to_arm.insert(dest.0, rd); + // mov rd, #-1 → MOVW rd, #0xFFFF; MOVT rd, #0xFFFF + arm_instrs.push(ArmOp::Movw { rd, imm16: 0xFFFF }); + arm_instrs.push(ArmOp::Movt { rd, imm16: 0xFFFF }); + last_result_vreg = Some(dest.0); + } } // Track WASM operand value stack for correct br_if semantics. diff --git a/crates/synth-synthesis/tests/audit_aapcs_repro.rs b/crates/synth-synthesis/tests/audit_aapcs_repro.rs new file mode 100644 index 0000000..9d335fb --- /dev/null +++ b/crates/synth-synthesis/tests/audit_aapcs_repro.rs @@ -0,0 +1,203 @@ +//! Try to reproduce the AAPCS clobber fuzz crash from PR #100. +//! +//! Crash signature: +//! ARM instr at wasm line 1 writes param reg R1 before LocalGet(1) at line 5. +//! Op: Mov { rd: R1, op2: Reg(R2) } +//! Sequence: [Push, I64Const{rdlo:R2,rdhi:R3,value:0}, Mov{rd:R1,op2:Reg(R2)}, Pop] +//! +//! Test a range of plausible wasm sequences and assert no AAPCS clobber. + +use synth_synthesis::{ArmOp, InstructionSelector, Reg, WasmOp}; + +/// Helper: extract source line numbers and Mov destinations. +fn run(wasm: &[WasmOp], num_params: u32) -> Vec { + let mut s = InstructionSelector::new(vec![]); + s.select_with_stack(wasm, num_params) + .expect("should compile") +} + +fn first_local_get(wasm: &[WasmOp], p: u32) -> Option { + wasm.iter().enumerate().find_map(|(i, op)| match op { + WasmOp::LocalGet(idx) if *idx == p => Some(i), + _ => None, + }) +} + +fn writes(op: &ArmOp) -> Vec { + match op { + ArmOp::Add { rd, .. } + | ArmOp::Sub { rd, .. } + | ArmOp::Adds { rd, .. } + | ArmOp::Adc { rd, .. } + | ArmOp::Subs { rd, .. } + | ArmOp::Sbc { rd, .. } + | ArmOp::And { rd, .. } + | ArmOp::Orr { rd, .. } + | ArmOp::Eor { rd, .. } + | ArmOp::Mov { rd, .. } + | ArmOp::Mvn { rd, .. } + | ArmOp::Movw { rd, .. } + | ArmOp::Lsl { rd, .. } + | ArmOp::Lsr { rd, .. } + | ArmOp::Asr { rd, .. } + | ArmOp::Ror { rd, .. } + | ArmOp::LslReg { rd, .. } + | ArmOp::LsrReg { rd, .. } + | ArmOp::AsrReg { rd, .. } + | ArmOp::RorReg { rd, .. } + | ArmOp::Rsb { rd, .. } + | ArmOp::Mul { rd, .. } + | ArmOp::Sdiv { rd, .. } + | ArmOp::Udiv { rd, .. } + | ArmOp::Mls { rd, .. } + | ArmOp::Clz { rd, .. } + | ArmOp::Rbit { rd, .. } + | ArmOp::Popcnt { rd, .. } + | ArmOp::Sxtb { rd, .. } + | ArmOp::Sxth { rd, .. } + | ArmOp::Ldr { rd, .. } + | ArmOp::Ldrb { rd, .. } + | ArmOp::Ldrsb { rd, .. } + | ArmOp::Ldrh { rd, .. } + | ArmOp::Ldrsh { rd, .. } => vec![*rd], + ArmOp::Movt { rd, .. } => vec![*rd], + ArmOp::I64Const { rdlo, rdhi, .. } => vec![*rdlo, *rdhi], + ArmOp::I64Ldr { rdlo, rdhi, .. } => vec![*rdlo, *rdhi], + ArmOp::I64ExtendI32U { rdlo, rdhi, .. } | ArmOp::I64ExtendI32S { rdlo, rdhi, .. } => { + vec![*rdlo, *rdhi] + } + ArmOp::I64Extend8S { rdlo, rdhi, .. } + | ArmOp::I64Extend16S { rdlo, rdhi, .. } + | ArmOp::I64Extend32S { rdlo, rdhi, .. } => vec![*rdlo, *rdhi], + ArmOp::I32WrapI64 { rd, .. } => vec![*rd], + ArmOp::I64Mul { rd_lo, rd_hi, .. } + | ArmOp::I64Shl { rd_lo, rd_hi, .. } + | ArmOp::I64ShrU { rd_lo, rd_hi, .. } + | ArmOp::I64ShrS { rd_lo, rd_hi, .. } => vec![*rd_lo, *rd_hi], + _ => Vec::new(), + } +} + +fn check_no_clobber(wasm: &[WasmOp], num_params: u32, label: &str) { + let instrs = run(wasm, num_params); + for p in 0..num_params.min(4) { + let first_read = match first_local_get(wasm, p) { + Some(i) => i, + None => continue, + }; + let param_reg = match p { + 0 => Reg::R0, + 1 => Reg::R1, + 2 => Reg::R2, + 3 => Reg::R3, + _ => continue, + }; + for ai in &instrs { + let line = match ai.source_line { + Some(l) => l, + None => continue, + }; + if line >= first_read { + break; + } + for w in writes(&ai.op) { + assert_ne!( + w, param_reg, + "[{label}] AAPCS clobber: ARM instr at wasm line {line} writes \ + param reg {param_reg:?} before LocalGet({p}) at line {first_read}. \ + Op: {:?}. Sequence: {:#?}", + ai.op, instrs + ); + } + } + } +} + +/// Reproduce the harness invariant: build a sequence with `LocalGet(p)` for +/// each param, but place them AFTER an i64 op, so anything that touches +/// a param-region register pre-LocalGet is a clobber. +#[test] +fn aapcs_i64_const_drop_localgets_2_params() { + let wasm = vec![ + WasmOp::I64Const(0), + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 2, "i64const-drop-localgets"); +} + +#[test] +fn aapcs_i64_extend_then_localgets_2_params() { + let wasm = vec![ + WasmOp::I32Const(7), + WasmOp::I64ExtendI32U, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 2, "i64extend-then-localgets"); +} + +#[test] +fn aapcs_i64_const_localset0_localgets_2_params() { + // LocalSet to a param slot would overwrite the param. Even so, the + // assertion is about writes that occur BEFORE the first LocalGet of + // each param — these should not clobber. + let wasm = vec![ + WasmOp::I64Const(0xAABBCCDD), + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 2, "i64const-large"); +} + +#[test] +fn aapcs_i64_arith_then_localgets_3_params() { + let wasm = vec![ + WasmOp::I64Const(1), + WasmOp::I64Const(2), + WasmOp::I64Add, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + WasmOp::LocalGet(2), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 3, "i64-add-3params"); +} + +/// Reproduce specifically the fuzz-finding shape: +/// `[Push, I64Const{rdlo:R2,rdhi:R3,value:0}, Mov{rd:R1,op2:Reg(R2)}, Pop]` +/// with a middle op that emits a Mov to a low register. +#[test] +fn aapcs_i64_const_then_some_op_2_params() { + // Try several middle ops that emit something + for middle in [ + WasmOp::I32Const(0), + WasmOp::I64Eqz, + WasmOp::I64Clz, + WasmOp::I64Ctz, + WasmOp::I64Popcnt, + ] { + let wasm = vec![ + WasmOp::I64Const(0), + middle.clone(), + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + check_no_clobber(&wasm, 2, &format!("i64const-{middle:?}")); + } +} diff --git a/crates/synth-synthesis/tests/audit_i64_subword_aapcs.rs b/crates/synth-synthesis/tests/audit_i64_subword_aapcs.rs new file mode 100644 index 0000000..25b0427 --- /dev/null +++ b/crates/synth-synthesis/tests/audit_i64_subword_aapcs.rs @@ -0,0 +1,123 @@ +//! Audit: i64 sub-word load handlers in `select_with_stack` previously +//! hardcoded the destination pair to R0:R1, clobbering AAPCS params 0 and 1 +//! on every `i64.load8_u`/etc. — even with 2+ live params. + +use synth_synthesis::{ArmInstruction, ArmOp, InstructionSelector, Reg, WasmOp}; + +fn compile_no_optimize(wasm_ops: &[WasmOp], num_params: u32) -> Vec { + let mut sel = InstructionSelector::new(vec![]); + sel.select_with_stack(wasm_ops, num_params) + .expect("select_with_stack") +} + +fn writes(op: &ArmOp) -> Vec { + match op { + ArmOp::Mov { rd, .. } + | ArmOp::Asr { rd, .. } + | ArmOp::Movw { rd, .. } + | ArmOp::Movt { rd, .. } + | ArmOp::Add { rd, .. } + | ArmOp::Sub { rd, .. } + | ArmOp::Ldr { rd, .. } + | ArmOp::Ldrb { rd, .. } + | ArmOp::Ldrsb { rd, .. } + | ArmOp::Ldrh { rd, .. } + | ArmOp::Ldrsh { rd, .. } => vec![*rd], + _ => Vec::new(), + } +} + +fn first_local_get(wasm: &[WasmOp], p: u32) -> Option { + wasm.iter().enumerate().find_map(|(i, op)| match op { + WasmOp::LocalGet(idx) if *idx == p => Some(i), + _ => None, + }) +} + +fn assert_no_clobber(wasm: &[WasmOp], num_params: u32, label: &str) { + let arm = compile_no_optimize(wasm, num_params); + for p in 0..num_params.min(4) { + let first_read = match first_local_get(wasm, p) { + Some(i) => i, + None => continue, + }; + let param_reg = match p { + 0 => Reg::R0, + 1 => Reg::R1, + 2 => Reg::R2, + 3 => Reg::R3, + _ => continue, + }; + for instr in &arm { + let line = match instr.source_line { + Some(l) => l, + None => continue, + }; + if line >= first_read { + break; + } + for w in writes(&instr.op) { + assert_ne!( + w, param_reg, + "[{label}] AAPCS clobber: ARM instr at wasm line {line} writes \ + param reg {param_reg:?} before LocalGet({p}) at line {first_read}. \ + Op: {:?}", + instr.op + ); + } + } + } +} + +#[test] +fn i64_load8u_no_clobber_2_params() { + // i64.load8_u with 2 params: addr from I32Const so neither param is the + // load address. The dst pair must not be R0:R1 (params). + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I64Load8U { + offset: 0, + align: 0, + }, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + assert_no_clobber(&wasm, 2, "i64.load8_u"); +} + +#[test] +fn i64_load16s_no_clobber_2_params() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I64Load16S { + offset: 0, + align: 0, + }, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + assert_no_clobber(&wasm, 2, "i64.load16_s"); +} + +#[test] +fn i64_load32u_no_clobber_2_params() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I64Load32U { + offset: 0, + align: 0, + }, + WasmOp::Drop, + WasmOp::LocalGet(0), + WasmOp::Drop, + WasmOp::LocalGet(1), + WasmOp::Drop, + ]; + assert_no_clobber(&wasm, 2, "i64.load32_u"); +} diff --git a/crates/synth-synthesis/tests/audit_optimized_aapcs.rs b/crates/synth-synthesis/tests/audit_optimized_aapcs.rs new file mode 100644 index 0000000..3badf06 --- /dev/null +++ b/crates/synth-synthesis/tests/audit_optimized_aapcs.rs @@ -0,0 +1,213 @@ +//! Audit: the optimizer path (`optimize_full + ir_to_arm`) must not write +//! AAPCS param registers before they're read by `LocalGet`. +//! +//! Issue #103 covered `select_with_stack`'s i64 ops; this audit covers the +//! optimizer path, where the corresponding bug class is `Opcode::*` +//! handlers hardcoding `rd: Reg::R3` (or any of R0..R3). Found via the +//! systematic audit triggered by PR #100's fuzz harness. + +use synth_synthesis::{ArmOp, OptimizerBridge, Reg, WasmOp}; + +fn compile_optimized(wasm_ops: &[WasmOp], num_params: usize) -> Vec { + let bridge = OptimizerBridge::new(); + let (ir, _cfg, _stats) = bridge.optimize_full(wasm_ops).expect("optimize_full"); + bridge.ir_to_arm(&ir, num_params) +} + +fn writes(op: &ArmOp) -> Vec { + match op { + ArmOp::Add { rd, .. } + | ArmOp::Sub { rd, .. } + | ArmOp::Adds { rd, .. } + | ArmOp::Adc { rd, .. } + | ArmOp::Subs { rd, .. } + | ArmOp::Sbc { rd, .. } + | ArmOp::And { rd, .. } + | ArmOp::Orr { rd, .. } + | ArmOp::Eor { rd, .. } + | ArmOp::Mov { rd, .. } + | ArmOp::Mvn { rd, .. } + | ArmOp::Movw { rd, .. } + | ArmOp::Movt { rd, .. } + | ArmOp::Lsl { rd, .. } + | ArmOp::Lsr { rd, .. } + | ArmOp::Asr { rd, .. } + | ArmOp::Ror { rd, .. } + | ArmOp::LslReg { rd, .. } + | ArmOp::LsrReg { rd, .. } + | ArmOp::AsrReg { rd, .. } + | ArmOp::RorReg { rd, .. } + | ArmOp::Rsb { rd, .. } + | ArmOp::Mul { rd, .. } + | ArmOp::Sdiv { rd, .. } + | ArmOp::Udiv { rd, .. } + | ArmOp::Mls { rd, .. } + | ArmOp::Clz { rd, .. } + | ArmOp::Rbit { rd, .. } + | ArmOp::Popcnt { rd, .. } + | ArmOp::Sxtb { rd, .. } + | ArmOp::Sxth { rd, .. } + | ArmOp::Ldr { rd, .. } + | ArmOp::Ldrb { rd, .. } + | ArmOp::Ldrsb { rd, .. } + | ArmOp::Ldrh { rd, .. } + | ArmOp::Ldrsh { rd, .. } => vec![*rd], + _ => Vec::new(), + } +} + +/// Compile `wasm_ops` through the optimizer and assert that no +/// instruction emitted up to the first MOV-to-R0 (epilogue) writes to +/// any AAPCS param register. (Without source-line info we use a +/// conservative proxy: scan the ARM output; the optimizer-emitted +/// epilogue is the trailing `Mov R0, ...; Pop` pair, so we check +/// everything before the LAST Mov-to-R0.) +fn assert_no_clobber_before_epilogue(arm: &[ArmOp], num_params: usize, label: &str) { + let param_regs: Vec = (0..num_params.min(4)) + .map(|i| match i { + 0 => Reg::R0, + 1 => Reg::R1, + 2 => Reg::R2, + 3 => Reg::R3, + _ => unreachable!(), + }) + .collect(); + + // Find the index of the last result-Mov to R0 / R0:R1 — everything + // before that is "body". We define the epilogue as: from the last + // `Mov { rd: R0, .. }` to the end. If there's no Mov-to-R0, treat + // the entire stream as body. + let mut last_mov_r0_idx: Option = None; + for (i, op) in arm.iter().enumerate() { + if let ArmOp::Mov { rd: Reg::R0, .. } = op { + last_mov_r0_idx = Some(i); + } + } + let body_end = last_mov_r0_idx.unwrap_or(arm.len()); + + for op in &arm[..body_end] { + for w in writes(op) { + if param_regs.contains(&w) { + panic!( + "[{label}] AAPCS clobber in optimized path: op writes \ + param reg {w:?}. Op: {op:?}. Full sequence: {:#?}", + arm + ); + } + } + } +} + +#[test] +fn optimized_i32_add_4params_does_not_clobber_r3() { + // 4 params (R0..R3 live). Compute `local.get 0 + local.get 1` (uses + // R0 + R1, result via Add). The Add's dest must NOT be R3 (or any + // other param reg). Pre-fix, Opcode::Add hardcoded `rd = Reg::R3`, + // clobbering the 4th param even though it wasn't an input. + let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::LocalGet(1), + WasmOp::I32Add, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_add_4params"); +} + +#[test] +fn optimized_i32_sub_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::LocalGet(1), + WasmOp::I32Sub, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_sub_4params"); +} + +#[test] +fn optimized_i32_mul_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::LocalGet(1), + WasmOp::I32Mul, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_mul_4params"); +} + +#[test] +fn optimized_i32_load_4params_does_not_clobber_r3() { + // The MemLoad handler also used to hardcode `rd = Reg::R3`. + let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::I32Load { + offset: 0, + align: 0, + }, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_load_4params"); +} + +#[test] +fn optimized_i32_load8u_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::LocalGet(0), + WasmOp::I32Load8U { + offset: 0, + align: 0, + }, + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "i32_load8u_4params"); +} + +#[test] +fn optimized_global_get_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::GlobalGet(0), + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "global_get_4params"); +} + +#[test] +fn optimized_memory_size_4params_does_not_clobber_r3() { + let wasm = vec![ + WasmOp::MemorySize(0), + WasmOp::LocalGet(2), + WasmOp::Drop, + WasmOp::LocalGet(3), + WasmOp::Drop, + WasmOp::Drop, + ]; + let arm = compile_optimized(&wasm, 4); + assert_no_clobber_before_epilogue(&arm, 4, "memory_size_4params"); +} diff --git a/crates/synth-synthesis/tests/audit_subword_memops.rs b/crates/synth-synthesis/tests/audit_subword_memops.rs new file mode 100644 index 0000000..9266e78 --- /dev/null +++ b/crates/synth-synthesis/tests/audit_subword_memops.rs @@ -0,0 +1,145 @@ +//! Audit: sub-word memory ops, globals, and memory.size/grow must lower +//! through the optimizer path without unmapped-vreg panics. +//! +//! Pre-PR: these ops fell through `wasm_to_ir`'s `_ => Opcode::Nop` arm, +//! consumed an inst_id slot, but produced no `vreg_to_arm` mapping. The +//! defensive panic in `get_arm_reg` (PR #101) surfaces this as a compiler +//! crash for downstream consumers; pre-defensive-panic the code silently +//! returned R0, clobbering AAPCS params. + +use synth_synthesis::{OptimizerBridge, WasmOp}; + +fn compile(wasm: &[WasmOp], num_params: usize) { + let bridge = OptimizerBridge::new(); + let (ir, _cfg, _stats) = bridge.optimize_full(wasm).expect("optimize_full"); + // ir_to_arm is the panic site for unmapped-vreg consumers. + let _arm = bridge.ir_to_arm(&ir, num_params); +} + +#[test] +fn subword_i32_load8_u_then_drop() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Load8U { + offset: 0, + align: 0, + }, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_load8_s_into_arith() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Load8S { + offset: 0, + align: 0, + }, + WasmOp::I32Const(1), + WasmOp::I32Add, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_load16_u_then_drop() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Load16U { + offset: 0, + align: 0, + }, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_load16_s_into_arith() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Load16S { + offset: 0, + align: 0, + }, + WasmOp::I32Const(1), + WasmOp::I32Add, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_store8() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Const(0xff), + WasmOp::I32Store8 { + offset: 0, + align: 0, + }, + ]; + compile(&wasm, 0); +} + +#[test] +fn subword_i32_store16() { + let wasm = vec![ + WasmOp::I32Const(0x100), + WasmOp::I32Const(0xffff), + WasmOp::I32Store16 { + offset: 0, + align: 0, + }, + ]; + compile(&wasm, 0); +} + +#[test] +fn global_get_then_drop() { + let wasm = vec![WasmOp::GlobalGet(0), WasmOp::Drop]; + compile(&wasm, 0); +} + +#[test] +fn global_get_into_arith() { + let wasm = vec![ + WasmOp::GlobalGet(0), + WasmOp::I32Const(1), + WasmOp::I32Add, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn global_set() { + let wasm = vec![WasmOp::I32Const(42), WasmOp::GlobalSet(0)]; + compile(&wasm, 0); +} + +#[test] +fn memory_size_then_drop() { + let wasm = vec![WasmOp::MemorySize(0), WasmOp::Drop]; + compile(&wasm, 0); +} + +#[test] +fn memory_size_into_arith() { + let wasm = vec![ + WasmOp::MemorySize(0), + WasmOp::I32Const(1), + WasmOp::I32Add, + WasmOp::Drop, + ]; + compile(&wasm, 0); +} + +#[test] +fn memory_grow_then_drop() { + let wasm = vec![WasmOp::I32Const(1), WasmOp::MemoryGrow(0), WasmOp::Drop]; + compile(&wasm, 0); +}