Pack collision-round slots at shrinking per-round width

Each collision round consumes the leading 24-bit block, so a round-r output entry only carries 8-r meaningful words — but every round previously stored a fixed 8-word (32 B) slot, moving dead trailing words. Since the collision rounds are memory-bandwidth bound, that wasted DRAM traffic is wasted time. Pack each round's slots at w_out = w_in - 1 words. The XOR child producer still loads a full 256-bit register (over-reading up to SLOT words past a narrow tail slot — buffers carry a SLOT_SLACK pad so this stays in bounds) but masked-stores only the w_out live lanes so packed neighbours aren't clobbered. The over-read garbage only ever lands in non-stored lanes: storing out[0..w_out] = x[1..w_in] uses exclusively meaningful input words. Width is threaded through collide/collide_final/emit_bucket from solve_with (round 0 stays full 8-word). Measured (16 threads, clamp 16/32): ~9.2 -> ~8.4 s/solve (~9%); per-round time now shrinks r1~1230 -> r6~1045 ms. Cumulative with the parallel-partition change: ~13.4 -> ~8.4 s (-37%). Identical solution yield; xor_child_matches_scalar now covers every width + the masked-store no-clobber property; cross-clamp validity and full_solve_baseline pass. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 11:11:05 -04:00
parent 501527d3cb
commit 966ce3e262
1 changed files with 109 additions and 53 deletions
@@ -164,36 +164,53 @@ const LOW_BUCKETS: usize = 1 << LOW_BITS;
 /// Mask isolating the low resolved bits.
 const LOW_MASK: u32 = (LOW_BUCKETS - 1) as u32;

-/// Number of u32 words in a padded entry slot (32 bytes = one AVX2 register).
-/// Every round stores its residual hash words in a fixed 8-word slot so the XOR
-/// that produces a child is a single 256-bit load/xor/permute/store and every
-/// slot access is naturally aligned — xenoncat's packed-slot trick (and the same
-/// `uint4`-aligned-slot idea the CUDA backend already uses), on the CPU. The
-/// leading collision word lives in lane 0 and is mirrored into a dense parallel
-/// `keys[]` array so the histogram passes stream over 4 bytes/entry instead of
-/// striding the 32-byte slots.
+/// Number of u32 words in a round-0 entry slot (32 bytes = one AVX2 register).
+/// Round 0 stores all eight 24-bit blocks; the XOR that produces a child is a
+/// single 256-bit load/xor/permute — xenoncat's packed-slot trick (and the same
+/// `uint4`-aligned-slot idea the CUDA backend uses), on the CPU. The leading
+/// collision word lives in lane 0 and is mirrored into a dense parallel `keys[]`
+/// array so the histogram passes stream over 4 bytes/entry instead of striding
+/// the slots.
+///
+/// Later rounds use a *narrower* pitch: each collision round consumes the leading
+/// block, so a round-`r` output entry only carries `8 - r` meaningful words. The
+/// solver packs each round's slots at that width (see `collide`'s `w_out`),
+/// cutting the per-round slot-buffer DRAM traffic that bounds the collision rounds.
+/// The XOR producer still loads a full 256-bit register (over-reading up to
+/// `SLOT` words past a narrow tail slot, hence the `SLOT_SLACK` pad) but
+/// masked-stores only the `w_out` live lanes so packed neighbours aren't touched.
 const SLOT: usize = 8;

-/// Scalar child producer: `out[0..8] = (a XOR b)` rotated left one lane; returns
-/// the child's new leading word (lane 1 of the XOR). Lane 0 of the XOR is the
-/// just-collided block (always zero) and is rotated out.
+/// Trailing pad (in u32 words) on every slot buffer so the XOR producer's 256-bit
+/// load over a narrow tail slot stays in bounds. The over-read reaches at most
+/// `(n-1)*w + SLOT` words for pitch `w`, i.e. `SLOT - w ≤ SLOT` words past the
+/// `n*w` payload; `SLOT` words always covers it.
+const SLOT_SLACK: usize = SLOT;
+
+/// Scalar child producer: writes the `w_out` live words of `(a XOR b)` rotated
+/// left one lane into `out`, and returns the child's new leading word (lane 1 of
+/// the XOR). Lane 0 of the XOR is the just-collided block (always zero) and is
+/// rotated out. `a`/`b` are read a full `SLOT` words wide (the caller pads each
+/// slot buffer by `SLOT_SLACK`); only `out[0..w_out]` is written.
 #[inline]
-unsafe fn xor_child_scalar(out: *mut u32, a: *const u32, b: *const u32) -> u32 {
+unsafe fn xor_child_scalar(out: *mut u32, a: *const u32, b: *const u32, w_out: usize) -> u32 {
    let mut x = [0u32; SLOT];
    for t in 0..SLOT {
        x[t] = *a.add(t) ^ *b.add(t);
    }
-    for t in 0..SLOT {
-        *out.add(t) = x[(t + 1) % SLOT];
+    // out[t] = x[(t + 1) % SLOT]; for t < w_out <= SLOT-1 the modulo is a no-op.
+    for t in 0..w_out {
+        *out.add(t) = x[t + 1];
    }
    x[1]
 }

 /// AVX2 child producer: one `vpxor` + one `vpermd` (rotate the 8 lanes left by
-/// one) + one store. Replaces the per-word scalar XOR loop.
+/// one), then a masked store of the low `w_out` lanes so the packed, `w_out`-
+/// pitched output never clobbers the next slot.
 #[cfg(target_arch = "x86_64")]
 #[target_feature(enable = "avx2")]
-unsafe fn xor_child_avx2(out: *mut u32, a: *const u32, b: *const u32) -> u32 {
+unsafe fn xor_child_avx2(out: *mut u32, a: *const u32, b: *const u32, w_out: usize) -> u32 {
    use core::arch::x86_64::*;
    let x = _mm256_xor_si256(
        _mm256_loadu_si256(a as *const __m256i),
@@ -201,7 +218,12 @@ unsafe fn xor_child_avx2(out: *mut u32, a: *const u32, b: *const u32) -> u32 {
    );
    // rotate left by one 32-bit lane: out[i] = x[(i + 1) % 8]
    let p = _mm256_permutevar8x32_epi32(x, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0));
-    _mm256_storeu_si256(out as *mut __m256i, p);
+    // mask = lanes [0, w_out) -> all-ones; maskstore writes only those.
+    let mask = _mm256_cmpgt_epi32(
+        _mm256_set1_epi32(w_out as i32),
+        _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7),
+    );
+    _mm256_maskstore_epi32(out as *mut i32, mask, p);
    _mm_cvtsi128_si32(_mm256_castsi256_si128(p)) as u32
 }

@@ -383,9 +405,11 @@ fn low_group(keys: &[u32], run: &[u32], hist: &mut [u32], sorted: &mut Vec<u32>)
 /// Monomorphised over the XOR producer so the AVX2 intrinsics inline cleanly
 /// inside a `target_feature` wrapper while sharing one source of truth.
 macro_rules! emit_bucket_body {
-    ($keys:expr, $slots:expr, $sorted:expr, $kout:expr, $sout:expr, $pout:expr, $clamp:expr, $xor:path) => {{
+    ($keys:expr, $slots:expr, $sorted:expr, $kout:expr, $sout:expr, $pout:expr, $clamp:expr, $w_in:expr, $w_out:expr, $xor:path) => {{
        let s = $sorted;
        let m = s.len();
+        let w_in = $w_in;
+        let w_out = $w_out;
        let mut w = 0usize;
        let mut i = 0;
        while i < m {
@@ -400,9 +424,10 @@ macro_rules! emit_bucket_body {
                for b in (a + 1)..hi {
                    let mr = s[b] as usize;
                    let nk = $xor(
-                        $sout.as_mut_ptr().add(w * SLOT),
-                        $slots.as_ptr().add(l * SLOT),
-                        $slots.as_ptr().add(mr * SLOT),
+                        $sout.as_mut_ptr().add(w * w_out),
+                        $slots.as_ptr().add(l * w_in),
+                        $slots.as_ptr().add(mr * w_in),
+                        w_out,
                    );
                    $kout[w] = nk;
                    $pout[w] = ((l as u64) << 32) | mr as u64;
@@ -423,8 +448,10 @@ unsafe fn emit_bucket_scalar(
    sout: &mut [u32],
    pout: &mut [u64],
    clamp: usize,
+    w_in: usize,
+    w_out: usize,
 ) -> usize {
-    emit_bucket_body!(keys, slots, sorted, kout, sout, pout, clamp, xor_child_scalar)
+    emit_bucket_body!(keys, slots, sorted, kout, sout, pout, clamp, w_in, w_out, xor_child_scalar)
 }

 #[cfg(target_arch = "x86_64")]
@@ -437,11 +464,14 @@ unsafe fn emit_bucket_avx2(
    sout: &mut [u32],
    pout: &mut [u64],
    clamp: usize,
+    w_in: usize,
+    w_out: usize,
 ) -> usize {
-    emit_bucket_body!(keys, slots, sorted, kout, sout, pout, clamp, xor_child_avx2)
+    emit_bucket_body!(keys, slots, sorted, kout, sout, pout, clamp, w_in, w_out, xor_child_avx2)
 }

 /// Emit a partition's children, dispatching to the AVX2 producer when available.
+/// `w_in`/`w_out` are the input/output slot pitches (`w_out == w_in - 1`).
 unsafe fn emit_bucket(
    keys: &[u32],
    slots: &[u32],
@@ -450,22 +480,35 @@ unsafe fn emit_bucket(
    sout: &mut [u32],
    pout: &mut [u64],
    clamp: usize,
+    w_in: usize,
+    w_out: usize,
 ) -> usize {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
-            return emit_bucket_avx2(keys, slots, sorted, kout, sout, pout, clamp);
+            return emit_bucket_avx2(keys, slots, sorted, kout, sout, pout, clamp, w_in, w_out);
        }
    }
-    emit_bucket_scalar(keys, slots, sorted, kout, sout, pout, clamp)
+    emit_bucket_scalar(keys, slots, sorted, kout, sout, pout, clamp, w_in, w_out)
 }

 /// Group `n` entries by their leading block, then emit one child per colliding
-/// pair: the XOR of the residual blocks (rotated into a fresh 8-word slot) plus
-/// a packed `(l << 32) | mr` back-reference. Two passes — count, then emit
-/// directly into one pre-sized arena — so there is no per-partition allocation
-/// or final concatenation copy. Returns `(keys_out, slots_out, parents)`.
-fn collide(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> (Vec<u32>, Vec<u32>, Vec<u64>) {
+/// pair: the XOR of the residual blocks (rotated into a fresh slot) plus a packed
+/// `(l << 32) | mr` back-reference. Two passes — count, then emit directly into
+/// one pre-sized arena — so there is no per-partition allocation or final
+/// concatenation copy. Returns `(keys_out, slots_out, parents)`.
+///
+/// `w_in` is the input slot pitch; the round consumes one block, so the output is
+/// packed at `w_out = w_in - 1` words/slot. `slots_out` carries a `SLOT_SLACK`
+/// trailing pad so the next round's 256-bit over-read stays in bounds.
+fn collide(
+    keys: &[u32],
+    slots: &[u32],
+    n: usize,
+    clamp: usize,
+    w_in: usize,
+) -> (Vec<u32>, Vec<u32>, Vec<u64>) {
+    let w_out = w_in - 1;
    // Sub-phase timing, gated on `EQ_PROFILE`. Prints partition / count / alloc /
    // emit splits so we can see which part of the round dominates.
    let prof = std::env::var_os("EQ_PROFILE").is_some();
@@ -493,7 +536,8 @@ fn collide(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> (Vec<u32>, Ve
    }
    let total = out_starts[TOP_BUCKETS];
    let mut keys_out = vec![0u32; total];
-    let mut slots_out = vec![0u32; total * SLOT];
+    // Packed at `w_out` words/slot, plus a slack pad for the next round's over-read.
+    let mut slots_out = vec![0u32; total * w_out + SLOT_SLACK];
    let mut parents = vec![0u64; total];

    // Carve the output arena into disjoint per-partition sub-slices so workers
@@ -505,7 +549,7 @@ fn collide(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> (Vec<u32>, Ve
        let (mut kr, mut sr, mut pr) = (&mut keys_out[..], &mut slots_out[..], &mut parents[..]);
        for &c in &counts {
            let (kh, kt) = kr.split_at_mut(c);
-            let (sh, st) = sr.split_at_mut(c * SLOT);
+            let (sh, st) = sr.split_at_mut(c * w_out);
            let (ph, pt) = pr.split_at_mut(c);
            kparts.push(kh);
            sparts.push(sh);
@@ -529,7 +573,7 @@ fn collide(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> (Vec<u32>, Ve
            let mut hist = vec![0u32; LOW_BUCKETS + 1];
            let mut sorted = Vec::new();
            low_group(keys, run, &mut hist, &mut sorted);
-            let w = unsafe { emit_bucket(keys, slots, &sorted, kout, sout, pout, clamp) };
+            let w = unsafe { emit_bucket(keys, slots, &sorted, kout, sout, pout, clamp, w_in, w_out) };
            debug_assert_eq!(w, kout.len());
        });

@@ -547,10 +591,10 @@ fn collide(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> (Vec<u32>, Ve
    (keys_out, slots_out, parents)
 }

-/// Final round (slots hold `[w0, w1, …]`): among entries sharing leading block
-/// `w0`, a pair whose `w1` also matches XORs the last two blocks to zero — a
-/// candidate. Returns the `(l, mr)` parents of each candidate.
-fn collide_final(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> Vec<(u32, u32)> {
+/// Final round (slots hold `[w0, w1, …]` at pitch `w_in`): among entries sharing
+/// leading block `w0`, a pair whose `w1` also matches XORs the last two blocks to
+/// zero — a candidate. Returns the `(l, mr)` parents of each candidate.
+fn collide_final(keys: &[u32], slots: &[u32], n: usize, clamp: usize, w_in: usize) -> Vec<(u32, u32)> {
    let (starts, order) = partition_top(keys, n);

    (0..TOP_BUCKETS)
@@ -574,7 +618,7 @@ fn collide_final(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> Vec<(u3
                        let l = sorted[a] as usize;
                        for b in (a + 1)..hi {
                            let mr = sorted[b] as usize;
-                            if slots[l * SLOT + 1] == slots[mr * SLOT + 1] {
+                            if slots[l * w_in + 1] == slots[mr * w_in + 1] {
                                local.push((l as u32, mr as u32));
                            }
                        }
@@ -669,19 +713,23 @@ pub fn solve_with(header: &[u8], clamp: Option<usize>) -> Vec<Vec<u32>> {
    phase("round0-hash", n0);
    let mut parents: Vec<Vec<u64>> = Vec::with_capacity(K - 1);
    let mut n = n0;
+    // Round-0 slots carry all SLOT words; each collision round consumes the
+    // leading block, so the next round's slots are one word narrower.
+    let mut width = SLOT;
    for r in 0..(K - 1) {
-        let (ok, os, op) = collide(&keys, &slots, n, clamp);
+        let (ok, os, op) = collide(&keys, &slots, n, clamp, width);
        n = op.len();
        parents.push(op);
        keys = ok;
        slots = os;
+        width -= 1;
        phase(&format!("collide r{}", r + 1), n);
        if n == 0 {
            return Vec::new();
        }
    }

-    let candidates = collide_final(&keys, &slots, n, clamp);
+    let candidates = collide_final(&keys, &slots, n, clamp, width);
    phase("collide-final", candidates.len());
    if candidates.is_empty() {
        return Vec::new();
@@ -846,24 +894,32 @@ mod tests {
        assert_eq!(a[0], (src[0] as u32) << 16 | (src[1] as u32) << 8 | src[2] as u32);
    }

-    // AVX2 XOR-child (xor + rotate-left-one-lane) must match the scalar version.
+    // The XOR-child producers (scalar + AVX2) must agree for every output width,
+    // write exactly `w_out` words (rotate-left-one of the lane-wise XOR), and
+    // leave the rest of the packed buffer untouched (the AVX2 path masked-stores).
    #[test]
    fn xor_child_matches_scalar() {
+        const SENT: u32 = 0xDEAD_BEEF;
        let pa: [u32; SLOT] = [9, 8, 7, 6, 5, 4, 3, 2];
        let pb: [u32; SLOT] = [1, 2, 3, 4, 5, 6, 7, 8];
-        let mut o1 = [0u32; SLOT];
-        let mut o2 = [0u32; SLOT];
-        let k1 = unsafe { xor_child_scalar(o1.as_mut_ptr(), pa.as_ptr(), pb.as_ptr()) };
-        // Reference: rotate-left-one of the lane-wise XOR; new key = lane 1.
        let x: Vec<u32> = (0..SLOT).map(|i| pa[i] ^ pb[i]).collect();
-        let expect: Vec<u32> = (0..SLOT).map(|i| x[(i + 1) % SLOT]).collect();
-        assert_eq!(&o1[..], &expect[..]);
-        assert_eq!(k1, x[1]);
-        #[cfg(target_arch = "x86_64")]
-        if is_x86_feature_detected!("avx2") {
-            let k2 = unsafe { xor_child_avx2(o2.as_mut_ptr(), pa.as_ptr(), pb.as_ptr()) };
-            assert_eq!(o1, o2, "avx2 xor_child != scalar");
-            assert_eq!(k1, k2);
+        for w_out in 1..SLOT {
+            let mut o1 = [SENT; SLOT];
+            let k1 = unsafe { xor_child_scalar(o1.as_mut_ptr(), pa.as_ptr(), pb.as_ptr(), w_out) };
+            for t in 0..w_out {
+                assert_eq!(o1[t], x[(t + 1) % SLOT], "scalar word {t} (w_out={w_out})");
+            }
+            for t in w_out..SLOT {
+                assert_eq!(o1[t], SENT, "scalar wrote past w_out at {t} (w_out={w_out})");
+            }
+            assert_eq!(k1, x[1]);
+            #[cfg(target_arch = "x86_64")]
+            if is_x86_feature_detected!("avx2") {
+                let mut o2 = [SENT; SLOT];
+                let k2 = unsafe { xor_child_avx2(o2.as_mut_ptr(), pa.as_ptr(), pb.as_ptr(), w_out) };
+                assert_eq!(o1, o2, "avx2 xor_child != scalar (w_out={w_out})");
+                assert_eq!(k1, k2);
+            }
        }
    }