Pack collision-round slots at shrinking per-round width

Each collision round consumes the leading 24-bit block, so a round-r output
entry only carries 8-r meaningful words — but every round previously stored a
fixed 8-word (32 B) slot, moving dead trailing words. Since the collision
rounds are memory-bandwidth bound, that wasted DRAM traffic is wasted time.

Pack each round's slots at w_out = w_in - 1 words. The XOR child producer still
loads a full 256-bit register (over-reading up to SLOT words past a narrow tail
slot — buffers carry a SLOT_SLACK pad so this stays in bounds) but masked-stores
only the w_out live lanes so packed neighbours aren't clobbered. The over-read
garbage only ever lands in non-stored lanes: storing out[0..w_out] = x[1..w_in]
uses exclusively meaningful input words. Width is threaded through
collide/collide_final/emit_bucket from solve_with (round 0 stays full 8-word).

Measured (16 threads, clamp 16/32): ~9.2 -> ~8.4 s/solve (~9%); per-round time
now shrinks r1~1230 -> r6~1045 ms. Cumulative with the parallel-partition change:
~13.4 -> ~8.4 s (-37%). Identical solution yield; xor_child_matches_scalar now
covers every width + the masked-store no-clobber property; cross-clamp validity
and full_solve_baseline pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jackpotincorporated
2026-06-06 11:11:05 -04:00
parent 501527d3cb
commit 966ce3e262
+109 -53
View File
@@ -164,36 +164,53 @@ const LOW_BUCKETS: usize = 1 << LOW_BITS;
/// Mask isolating the low resolved bits. /// Mask isolating the low resolved bits.
const LOW_MASK: u32 = (LOW_BUCKETS - 1) as u32; const LOW_MASK: u32 = (LOW_BUCKETS - 1) as u32;
/// Number of u32 words in a padded entry slot (32 bytes = one AVX2 register). /// Number of u32 words in a round-0 entry slot (32 bytes = one AVX2 register).
/// Every round stores its residual hash words in a fixed 8-word slot so the XOR /// Round 0 stores all eight 24-bit blocks; the XOR that produces a child is a
/// that produces a child is a single 256-bit load/xor/permute/store and every /// single 256-bit load/xor/permute — xenoncat's packed-slot trick (and the same
/// slot access is naturally aligned — xenoncat's packed-slot trick (and the same /// `uint4`-aligned-slot idea the CUDA backend uses), on the CPU. The leading
/// `uint4`-aligned-slot idea the CUDA backend already uses), on the CPU. The /// collision word lives in lane 0 and is mirrored into a dense parallel `keys[]`
/// leading collision word lives in lane 0 and is mirrored into a dense parallel /// array so the histogram passes stream over 4 bytes/entry instead of striding
/// `keys[]` array so the histogram passes stream over 4 bytes/entry instead of /// the slots.
/// striding the 32-byte slots. ///
/// Later rounds use a *narrower* pitch: each collision round consumes the leading
/// block, so a round-`r` output entry only carries `8 - r` meaningful words. The
/// solver packs each round's slots at that width (see `collide`'s `w_out`),
/// cutting the per-round slot-buffer DRAM traffic that bounds the collision rounds.
/// The XOR producer still loads a full 256-bit register (over-reading up to
/// `SLOT` words past a narrow tail slot, hence the `SLOT_SLACK` pad) but
/// masked-stores only the `w_out` live lanes so packed neighbours aren't touched.
const SLOT: usize = 8; const SLOT: usize = 8;
/// Scalar child producer: `out[0..8] = (a XOR b)` rotated left one lane; returns /// Trailing pad (in u32 words) on every slot buffer so the XOR producer's 256-bit
/// the child's new leading word (lane 1 of the XOR). Lane 0 of the XOR is the /// load over a narrow tail slot stays in bounds. The over-read reaches at most
/// just-collided block (always zero) and is rotated out. /// `(n-1)*w + SLOT` words for pitch `w`, i.e. `SLOT - w ≤ SLOT` words past the
/// `n*w` payload; `SLOT` words always covers it.
const SLOT_SLACK: usize = SLOT;
/// Scalar child producer: writes the `w_out` live words of `(a XOR b)` rotated
/// left one lane into `out`, and returns the child's new leading word (lane 1 of
/// the XOR). Lane 0 of the XOR is the just-collided block (always zero) and is
/// rotated out. `a`/`b` are read a full `SLOT` words wide (the caller pads each
/// slot buffer by `SLOT_SLACK`); only `out[0..w_out]` is written.
#[inline] #[inline]
unsafe fn xor_child_scalar(out: *mut u32, a: *const u32, b: *const u32) -> u32 { unsafe fn xor_child_scalar(out: *mut u32, a: *const u32, b: *const u32, w_out: usize) -> u32 {
let mut x = [0u32; SLOT]; let mut x = [0u32; SLOT];
for t in 0..SLOT { for t in 0..SLOT {
x[t] = *a.add(t) ^ *b.add(t); x[t] = *a.add(t) ^ *b.add(t);
} }
for t in 0..SLOT { // out[t] = x[(t + 1) % SLOT]; for t < w_out <= SLOT-1 the modulo is a no-op.
*out.add(t) = x[(t + 1) % SLOT]; for t in 0..w_out {
*out.add(t) = x[t + 1];
} }
x[1] x[1]
} }
/// AVX2 child producer: one `vpxor` + one `vpermd` (rotate the 8 lanes left by /// AVX2 child producer: one `vpxor` + one `vpermd` (rotate the 8 lanes left by
/// one) + one store. Replaces the per-word scalar XOR loop. /// one), then a masked store of the low `w_out` lanes so the packed, `w_out`-
/// pitched output never clobbers the next slot.
#[cfg(target_arch = "x86_64")] #[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")] #[target_feature(enable = "avx2")]
unsafe fn xor_child_avx2(out: *mut u32, a: *const u32, b: *const u32) -> u32 { unsafe fn xor_child_avx2(out: *mut u32, a: *const u32, b: *const u32, w_out: usize) -> u32 {
use core::arch::x86_64::*; use core::arch::x86_64::*;
let x = _mm256_xor_si256( let x = _mm256_xor_si256(
_mm256_loadu_si256(a as *const __m256i), _mm256_loadu_si256(a as *const __m256i),
@@ -201,7 +218,12 @@ unsafe fn xor_child_avx2(out: *mut u32, a: *const u32, b: *const u32) -> u32 {
); );
// rotate left by one 32-bit lane: out[i] = x[(i + 1) % 8] // rotate left by one 32-bit lane: out[i] = x[(i + 1) % 8]
let p = _mm256_permutevar8x32_epi32(x, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0)); let p = _mm256_permutevar8x32_epi32(x, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0));
_mm256_storeu_si256(out as *mut __m256i, p); // mask = lanes [0, w_out) -> all-ones; maskstore writes only those.
let mask = _mm256_cmpgt_epi32(
_mm256_set1_epi32(w_out as i32),
_mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7),
);
_mm256_maskstore_epi32(out as *mut i32, mask, p);
_mm_cvtsi128_si32(_mm256_castsi256_si128(p)) as u32 _mm_cvtsi128_si32(_mm256_castsi256_si128(p)) as u32
} }
@@ -383,9 +405,11 @@ fn low_group(keys: &[u32], run: &[u32], hist: &mut [u32], sorted: &mut Vec<u32>)
/// Monomorphised over the XOR producer so the AVX2 intrinsics inline cleanly /// Monomorphised over the XOR producer so the AVX2 intrinsics inline cleanly
/// inside a `target_feature` wrapper while sharing one source of truth. /// inside a `target_feature` wrapper while sharing one source of truth.
macro_rules! emit_bucket_body { macro_rules! emit_bucket_body {
($keys:expr, $slots:expr, $sorted:expr, $kout:expr, $sout:expr, $pout:expr, $clamp:expr, $xor:path) => {{ ($keys:expr, $slots:expr, $sorted:expr, $kout:expr, $sout:expr, $pout:expr, $clamp:expr, $w_in:expr, $w_out:expr, $xor:path) => {{
let s = $sorted; let s = $sorted;
let m = s.len(); let m = s.len();
let w_in = $w_in;
let w_out = $w_out;
let mut w = 0usize; let mut w = 0usize;
let mut i = 0; let mut i = 0;
while i < m { while i < m {
@@ -400,9 +424,10 @@ macro_rules! emit_bucket_body {
for b in (a + 1)..hi { for b in (a + 1)..hi {
let mr = s[b] as usize; let mr = s[b] as usize;
let nk = $xor( let nk = $xor(
$sout.as_mut_ptr().add(w * SLOT), $sout.as_mut_ptr().add(w * w_out),
$slots.as_ptr().add(l * SLOT), $slots.as_ptr().add(l * w_in),
$slots.as_ptr().add(mr * SLOT), $slots.as_ptr().add(mr * w_in),
w_out,
); );
$kout[w] = nk; $kout[w] = nk;
$pout[w] = ((l as u64) << 32) | mr as u64; $pout[w] = ((l as u64) << 32) | mr as u64;
@@ -423,8 +448,10 @@ unsafe fn emit_bucket_scalar(
sout: &mut [u32], sout: &mut [u32],
pout: &mut [u64], pout: &mut [u64],
clamp: usize, clamp: usize,
w_in: usize,
w_out: usize,
) -> usize { ) -> usize {
emit_bucket_body!(keys, slots, sorted, kout, sout, pout, clamp, xor_child_scalar) emit_bucket_body!(keys, slots, sorted, kout, sout, pout, clamp, w_in, w_out, xor_child_scalar)
} }
#[cfg(target_arch = "x86_64")] #[cfg(target_arch = "x86_64")]
@@ -437,11 +464,14 @@ unsafe fn emit_bucket_avx2(
sout: &mut [u32], sout: &mut [u32],
pout: &mut [u64], pout: &mut [u64],
clamp: usize, clamp: usize,
w_in: usize,
w_out: usize,
) -> usize { ) -> usize {
emit_bucket_body!(keys, slots, sorted, kout, sout, pout, clamp, xor_child_avx2) emit_bucket_body!(keys, slots, sorted, kout, sout, pout, clamp, w_in, w_out, xor_child_avx2)
} }
/// Emit a partition's children, dispatching to the AVX2 producer when available. /// Emit a partition's children, dispatching to the AVX2 producer when available.
/// `w_in`/`w_out` are the input/output slot pitches (`w_out == w_in - 1`).
unsafe fn emit_bucket( unsafe fn emit_bucket(
keys: &[u32], keys: &[u32],
slots: &[u32], slots: &[u32],
@@ -450,22 +480,35 @@ unsafe fn emit_bucket(
sout: &mut [u32], sout: &mut [u32],
pout: &mut [u64], pout: &mut [u64],
clamp: usize, clamp: usize,
w_in: usize,
w_out: usize,
) -> usize { ) -> usize {
#[cfg(target_arch = "x86_64")] #[cfg(target_arch = "x86_64")]
{ {
if is_x86_feature_detected!("avx2") { if is_x86_feature_detected!("avx2") {
return emit_bucket_avx2(keys, slots, sorted, kout, sout, pout, clamp); return emit_bucket_avx2(keys, slots, sorted, kout, sout, pout, clamp, w_in, w_out);
} }
} }
emit_bucket_scalar(keys, slots, sorted, kout, sout, pout, clamp) emit_bucket_scalar(keys, slots, sorted, kout, sout, pout, clamp, w_in, w_out)
} }
/// Group `n` entries by their leading block, then emit one child per colliding /// Group `n` entries by their leading block, then emit one child per colliding
/// pair: the XOR of the residual blocks (rotated into a fresh 8-word slot) plus /// pair: the XOR of the residual blocks (rotated into a fresh slot) plus a packed
/// a packed `(l << 32) | mr` back-reference. Two passes — count, then emit /// `(l << 32) | mr` back-reference. Two passes — count, then emit directly into
/// directly into one pre-sized arena — so there is no per-partition allocation /// one pre-sized arena — so there is no per-partition allocation or final
/// or final concatenation copy. Returns `(keys_out, slots_out, parents)`. /// concatenation copy. Returns `(keys_out, slots_out, parents)`.
fn collide(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> (Vec<u32>, Vec<u32>, Vec<u64>) { ///
/// `w_in` is the input slot pitch; the round consumes one block, so the output is
/// packed at `w_out = w_in - 1` words/slot. `slots_out` carries a `SLOT_SLACK`
/// trailing pad so the next round's 256-bit over-read stays in bounds.
fn collide(
keys: &[u32],
slots: &[u32],
n: usize,
clamp: usize,
w_in: usize,
) -> (Vec<u32>, Vec<u32>, Vec<u64>) {
let w_out = w_in - 1;
// Sub-phase timing, gated on `EQ_PROFILE`. Prints partition / count / alloc / // Sub-phase timing, gated on `EQ_PROFILE`. Prints partition / count / alloc /
// emit splits so we can see which part of the round dominates. // emit splits so we can see which part of the round dominates.
let prof = std::env::var_os("EQ_PROFILE").is_some(); let prof = std::env::var_os("EQ_PROFILE").is_some();
@@ -493,7 +536,8 @@ fn collide(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> (Vec<u32>, Ve
} }
let total = out_starts[TOP_BUCKETS]; let total = out_starts[TOP_BUCKETS];
let mut keys_out = vec![0u32; total]; let mut keys_out = vec![0u32; total];
let mut slots_out = vec![0u32; total * SLOT]; // Packed at `w_out` words/slot, plus a slack pad for the next round's over-read.
let mut slots_out = vec![0u32; total * w_out + SLOT_SLACK];
let mut parents = vec![0u64; total]; let mut parents = vec![0u64; total];
// Carve the output arena into disjoint per-partition sub-slices so workers // Carve the output arena into disjoint per-partition sub-slices so workers
@@ -505,7 +549,7 @@ fn collide(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> (Vec<u32>, Ve
let (mut kr, mut sr, mut pr) = (&mut keys_out[..], &mut slots_out[..], &mut parents[..]); let (mut kr, mut sr, mut pr) = (&mut keys_out[..], &mut slots_out[..], &mut parents[..]);
for &c in &counts { for &c in &counts {
let (kh, kt) = kr.split_at_mut(c); let (kh, kt) = kr.split_at_mut(c);
let (sh, st) = sr.split_at_mut(c * SLOT); let (sh, st) = sr.split_at_mut(c * w_out);
let (ph, pt) = pr.split_at_mut(c); let (ph, pt) = pr.split_at_mut(c);
kparts.push(kh); kparts.push(kh);
sparts.push(sh); sparts.push(sh);
@@ -529,7 +573,7 @@ fn collide(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> (Vec<u32>, Ve
let mut hist = vec![0u32; LOW_BUCKETS + 1]; let mut hist = vec![0u32; LOW_BUCKETS + 1];
let mut sorted = Vec::new(); let mut sorted = Vec::new();
low_group(keys, run, &mut hist, &mut sorted); low_group(keys, run, &mut hist, &mut sorted);
let w = unsafe { emit_bucket(keys, slots, &sorted, kout, sout, pout, clamp) }; let w = unsafe { emit_bucket(keys, slots, &sorted, kout, sout, pout, clamp, w_in, w_out) };
debug_assert_eq!(w, kout.len()); debug_assert_eq!(w, kout.len());
}); });
@@ -547,10 +591,10 @@ fn collide(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> (Vec<u32>, Ve
(keys_out, slots_out, parents) (keys_out, slots_out, parents)
} }
/// Final round (slots hold `[w0, w1, …]`): among entries sharing leading block /// Final round (slots hold `[w0, w1, …]` at pitch `w_in`): among entries sharing
/// `w0`, a pair whose `w1` also matches XORs the last two blocks to zero — a /// leading block `w0`, a pair whose `w1` also matches XORs the last two blocks to
/// candidate. Returns the `(l, mr)` parents of each candidate. /// zero — a candidate. Returns the `(l, mr)` parents of each candidate.
fn collide_final(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> Vec<(u32, u32)> { fn collide_final(keys: &[u32], slots: &[u32], n: usize, clamp: usize, w_in: usize) -> Vec<(u32, u32)> {
let (starts, order) = partition_top(keys, n); let (starts, order) = partition_top(keys, n);
(0..TOP_BUCKETS) (0..TOP_BUCKETS)
@@ -574,7 +618,7 @@ fn collide_final(keys: &[u32], slots: &[u32], n: usize, clamp: usize) -> Vec<(u3
let l = sorted[a] as usize; let l = sorted[a] as usize;
for b in (a + 1)..hi { for b in (a + 1)..hi {
let mr = sorted[b] as usize; let mr = sorted[b] as usize;
if slots[l * SLOT + 1] == slots[mr * SLOT + 1] { if slots[l * w_in + 1] == slots[mr * w_in + 1] {
local.push((l as u32, mr as u32)); local.push((l as u32, mr as u32));
} }
} }
@@ -669,19 +713,23 @@ pub fn solve_with(header: &[u8], clamp: Option<usize>) -> Vec<Vec<u32>> {
phase("round0-hash", n0); phase("round0-hash", n0);
let mut parents: Vec<Vec<u64>> = Vec::with_capacity(K - 1); let mut parents: Vec<Vec<u64>> = Vec::with_capacity(K - 1);
let mut n = n0; let mut n = n0;
// Round-0 slots carry all SLOT words; each collision round consumes the
// leading block, so the next round's slots are one word narrower.
let mut width = SLOT;
for r in 0..(K - 1) { for r in 0..(K - 1) {
let (ok, os, op) = collide(&keys, &slots, n, clamp); let (ok, os, op) = collide(&keys, &slots, n, clamp, width);
n = op.len(); n = op.len();
parents.push(op); parents.push(op);
keys = ok; keys = ok;
slots = os; slots = os;
width -= 1;
phase(&format!("collide r{}", r + 1), n); phase(&format!("collide r{}", r + 1), n);
if n == 0 { if n == 0 {
return Vec::new(); return Vec::new();
} }
} }
let candidates = collide_final(&keys, &slots, n, clamp); let candidates = collide_final(&keys, &slots, n, clamp, width);
phase("collide-final", candidates.len()); phase("collide-final", candidates.len());
if candidates.is_empty() { if candidates.is_empty() {
return Vec::new(); return Vec::new();
@@ -846,24 +894,32 @@ mod tests {
assert_eq!(a[0], (src[0] as u32) << 16 | (src[1] as u32) << 8 | src[2] as u32); assert_eq!(a[0], (src[0] as u32) << 16 | (src[1] as u32) << 8 | src[2] as u32);
} }
// AVX2 XOR-child (xor + rotate-left-one-lane) must match the scalar version. // The XOR-child producers (scalar + AVX2) must agree for every output width,
// write exactly `w_out` words (rotate-left-one of the lane-wise XOR), and
// leave the rest of the packed buffer untouched (the AVX2 path masked-stores).
#[test] #[test]
fn xor_child_matches_scalar() { fn xor_child_matches_scalar() {
const SENT: u32 = 0xDEAD_BEEF;
let pa: [u32; SLOT] = [9, 8, 7, 6, 5, 4, 3, 2]; let pa: [u32; SLOT] = [9, 8, 7, 6, 5, 4, 3, 2];
let pb: [u32; SLOT] = [1, 2, 3, 4, 5, 6, 7, 8]; let pb: [u32; SLOT] = [1, 2, 3, 4, 5, 6, 7, 8];
let mut o1 = [0u32; SLOT];
let mut o2 = [0u32; SLOT];
let k1 = unsafe { xor_child_scalar(o1.as_mut_ptr(), pa.as_ptr(), pb.as_ptr()) };
// Reference: rotate-left-one of the lane-wise XOR; new key = lane 1.
let x: Vec<u32> = (0..SLOT).map(|i| pa[i] ^ pb[i]).collect(); let x: Vec<u32> = (0..SLOT).map(|i| pa[i] ^ pb[i]).collect();
let expect: Vec<u32> = (0..SLOT).map(|i| x[(i + 1) % SLOT]).collect(); for w_out in 1..SLOT {
assert_eq!(&o1[..], &expect[..]); let mut o1 = [SENT; SLOT];
assert_eq!(k1, x[1]); let k1 = unsafe { xor_child_scalar(o1.as_mut_ptr(), pa.as_ptr(), pb.as_ptr(), w_out) };
#[cfg(target_arch = "x86_64")] for t in 0..w_out {
if is_x86_feature_detected!("avx2") { assert_eq!(o1[t], x[(t + 1) % SLOT], "scalar word {t} (w_out={w_out})");
let k2 = unsafe { xor_child_avx2(o2.as_mut_ptr(), pa.as_ptr(), pb.as_ptr()) }; }
assert_eq!(o1, o2, "avx2 xor_child != scalar"); for t in w_out..SLOT {
assert_eq!(k1, k2); assert_eq!(o1[t], SENT, "scalar wrote past w_out at {t} (w_out={w_out})");
}
assert_eq!(k1, x[1]);
#[cfg(target_arch = "x86_64")]
if is_x86_feature_detected!("avx2") {
let mut o2 = [SENT; SLOT];
let k2 = unsafe { xor_child_avx2(o2.as_mut_ptr(), pa.as_ptr(), pb.as_ptr(), w_out) };
assert_eq!(o1, o2, "avx2 xor_child != scalar (w_out={w_out})");
assert_eq!(k1, k2);
}
} }
} }