From afd56bee1b72615e14841b56392541c949ffa1da Mon Sep 17 00:00:00 2001 From: jackpotincorporated Date: Sat, 6 Jun 2026 13:12:57 -0400 Subject: [PATCH] CPU solver: AVX-512 round-0 hashing + prefetch the emit gather MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Branch the CPU solver onto modern x86 extensions, runtime-dispatched with the existing AVX2/scalar fallbacks: - BatchHasher::hash8 — an 8-lane AVX-512 BLAKE2b final-block compression (native _mm512_ror_epi64 rotates), falling back to two AVX2 hash4s or scalar. Round 0 now hashes eight g-values per chunk. round0-hash drops ~1.45x on AVX-512 CPUs (≈225→155 ms here, AMD Zen4). - emit_bucket software-prefetches each collision group's randomly-gathered member slots (the ~1 GB slot arena is the round's cache-miss bottleneck), shaving a few percent off the dominant emit phase. Controlled A/B on this Zen4 box (same thermal state): ~4-5% faster overall. The collision rounds are memory-bandwidth bound, so SIMD width is not the limiter — the modern-ISA win is modest by nature. EQ_NO_AVX512 / EQ_NO_PREFETCH opt out per-CPU (e.g. parts where AVX-512 downclocks) and back the A/B harness. hash8 is validated against the scalar reference (batch_matches_reference) and full solves still find valid solutions in every dispatch configuration. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/blake.rs | 95 +++++++++++++++++++++++++++++++++++++++++++++++++ src/equihash.rs | 57 ++++++++++++++++++++++------- 2 files changed, 139 insertions(+), 13 deletions(-) diff --git a/src/blake.rs b/src/blake.rs index 798f250..27c67ab 100644 --- a/src/blake.rs +++ b/src/blake.rs @@ -215,6 +215,92 @@ impl BatchHasher { } } + /// Hash eight consecutive indices `g0..g0+8`, writing each 48-byte digest + /// into `out[0..8]`. Uses AVX-512 (one 8-wide BLAKE2b compression) when + /// available, else two AVX2 `hash4`s. Modern AMD (Zen4+) and Intel CPUs run + /// AVX-512 without the clock penalty older Intel parts had. + #[inline] + pub fn hash8(&self, g0: u32, out: &mut [[u8; HASH_OUTPUT]; 8]) { + #[cfg(target_arch = "x86_64")] + { + if is_x86_feature_detected!("avx512f") { + unsafe { self.hash8_avx512(g0, out) }; + return; + } + } + // Fallback: two 4-lane batches into the two halves of `out`. + let (lo, hi) = out.split_at_mut(4); + self.hash4(g0, (&mut lo[..4]).try_into().unwrap()); + self.hash4(g0 + 4, (&mut hi[..4]).try_into().unwrap()); + } + + /// Eight-lane BLAKE2b final-block compression (AVX-512). Same structure as + /// [`Self::hash4_avx2`] but 512-bit lanes and native 64-bit rotates. + #[cfg(target_arch = "x86_64")] + #[target_feature(enable = "avx512f")] + unsafe fn hash8_avx512(&self, g0: u32, out: &mut [[u8; HASH_OUTPUT]; 8]) { + use core::arch::x86_64::*; + + #[inline(always)] + unsafe fn g8( + v: &mut [__m512i; 16], + a: usize, b: usize, c: usize, d: usize, + x: __m512i, y: __m512i, + ) { + v[a] = _mm512_add_epi64(_mm512_add_epi64(v[a], v[b]), x); + v[d] = _mm512_ror_epi64::<32>(_mm512_xor_si512(v[d], v[a])); + v[c] = _mm512_add_epi64(v[c], v[d]); + v[b] = _mm512_ror_epi64::<24>(_mm512_xor_si512(v[b], v[c])); + v[a] = _mm512_add_epi64(_mm512_add_epi64(v[a], v[b]), y); + v[d] = _mm512_ror_epi64::<16>(_mm512_xor_si512(v[d], v[a])); + v[c] = _mm512_add_epi64(v[c], v[d]); + v[b] = _mm512_ror_epi64::<63>(_mm512_xor_si512(v[b], v[c])); + } + + // Only m0 and m1 are nonzero; m1's high 32 bits hold the per-lane `g`. + let tail0 = u64::from_le_bytes(self.tail[0..8].try_into().unwrap()); + let tail_hi = u32::from_le_bytes(self.tail[8..12].try_into().unwrap()) as u64; + let m1 = |g: u32| (tail_hi | ((g as u64) << 32)) as i64; + let mut m = [_mm512_setzero_si512(); 16]; + m[0] = _mm512_set1_epi64(tail0 as i64); + m[1] = _mm512_set_epi64( + m1(g0 + 7), m1(g0 + 6), m1(g0 + 5), m1(g0 + 4), + m1(g0 + 3), m1(g0 + 2), m1(g0 + 1), m1(g0), + ); + + let mut v = [_mm512_setzero_si512(); 16]; + for i in 0..8 { + v[i] = _mm512_set1_epi64(self.mid[i] as i64); + v[i + 8] = _mm512_set1_epi64(IV[i] as i64); + } + v[12] = _mm512_xor_si512(v[12], _mm512_set1_epi64(FINAL_COUNT as i64)); + v[14] = _mm512_xor_si512(v[14], _mm512_set1_epi64(-1)); // last-block flag + + for s in &SIGMA { + g8(&mut v, 0, 4, 8, 12, m[s[0]], m[s[1]]); + g8(&mut v, 1, 5, 9, 13, m[s[2]], m[s[3]]); + g8(&mut v, 2, 6, 10, 14, m[s[4]], m[s[5]]); + g8(&mut v, 3, 7, 11, 15, m[s[6]], m[s[7]]); + g8(&mut v, 0, 5, 10, 15, m[s[8]], m[s[9]]); + g8(&mut v, 1, 6, 11, 12, m[s[10]], m[s[11]]); + g8(&mut v, 2, 7, 8, 13, m[s[12]], m[s[13]]); + g8(&mut v, 3, 4, 9, 14, m[s[14]], m[s[15]]); + } + + // h[i] = mid[i] ^ v[i] ^ v[i+8]; first HASH_OUTPUT/8 words per lane, LE. + let mut tmp = [0u64; 8]; + for i in 0..HASH_OUTPUT / 8 { + let o = _mm512_xor_si512( + _mm512_xor_si512(_mm512_set1_epi64(self.mid[i] as i64), v[i]), + v[i + 8], + ); + _mm512_storeu_si512(tmp.as_mut_ptr() as *mut _, o); + for l in 0..8 { + out[l][i * 8..i * 8 + 8].copy_from_slice(&tmp[l].to_le_bytes()); + } + } + } + #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn hash4_avx2(&self, g0: u32, out: &mut [[u8; HASH_OUTPUT]; 4]) { @@ -316,5 +402,14 @@ mod tests { assert_eq!(out[l], generate_hash(&base, base_g + l as u32), "hash4 mismatch at g={}", base_g + l as u32); } } + + // hash8 (AVX-512 or hash4 fallback) must match for several batches. + for base_g in [0u32, 8, 64, 1_000_000] { + let mut out = [[0u8; HASH_OUTPUT]; 8]; + hasher.hash8(base_g, &mut out); + for l in 0..8 { + assert_eq!(out[l], generate_hash(&base, base_g + l as u32), "hash8 mismatch at g={}", base_g + l as u32); + } + } } } diff --git a/src/equihash.rs b/src/equihash.rs index a28e208..3dc23c4 100644 --- a/src/equihash.rs +++ b/src/equihash.rs @@ -438,7 +438,7 @@ fn low_group( /// Monomorphised over the XOR producer so the AVX2 intrinsics inline cleanly /// inside a `target_feature` wrapper while sharing one source of truth. macro_rules! emit_bucket_body { - ($keys_sorted:expr, $slots:expr, $sorted:expr, $kout:expr, $sout:expr, $pout:expr, $clamp:expr, $w_in:expr, $w_out:expr, $xor:path) => {{ + ($keys_sorted:expr, $slots:expr, $sorted:expr, $kout:expr, $sout:expr, $pout:expr, $clamp:expr, $w_in:expr, $w_out:expr, $prefetch:expr, $xor:path) => {{ let s = $sorted; let ks = $keys_sorted; // leading keys in `s`-order; group walk streams it let m = s.len(); @@ -453,6 +453,18 @@ macro_rules! emit_bucket_body { j += 1; } let hi = j.min(i.saturating_add($clamp)); + // The pair loops below read `slots[s[a]]` / `slots[s[b]]` at random + // global positions in the ~1 GB slot arena — each a likely cache + // miss. Prefetch this group's member slots up front so the misses + // overlap with the (L1-resident) pair XORs that follow. + #[cfg(target_arch = "x86_64")] + if $prefetch { + for a in i..hi { + core::arch::x86_64::_mm_prefetch::<{ core::arch::x86_64::_MM_HINT_T0 }>( + $slots.as_ptr().add(s[a] as usize * w_in) as *const i8, + ); + } + } for a in i..hi { let l = s[a] as usize; for b in (a + 1)..hi { @@ -484,8 +496,9 @@ unsafe fn emit_bucket_scalar( clamp: usize, w_in: usize, w_out: usize, + prefetch: bool, ) -> usize { - emit_bucket_body!(keys_sorted, slots, sorted, kout, sout, pout, clamp, w_in, w_out, xor_child_scalar) + emit_bucket_body!(keys_sorted, slots, sorted, kout, sout, pout, clamp, w_in, w_out, prefetch, xor_child_scalar) } #[cfg(target_arch = "x86_64")] @@ -500,12 +513,14 @@ unsafe fn emit_bucket_avx2( clamp: usize, w_in: usize, w_out: usize, + prefetch: bool, ) -> usize { - emit_bucket_body!(keys_sorted, slots, sorted, kout, sout, pout, clamp, w_in, w_out, xor_child_avx2) + emit_bucket_body!(keys_sorted, slots, sorted, kout, sout, pout, clamp, w_in, w_out, prefetch, xor_child_avx2) } /// Emit a partition's children, dispatching to the AVX2 producer when available. /// `w_in`/`w_out` are the input/output slot pitches (`w_out == w_in - 1`). +/// `prefetch` software-prefetches each group's randomly-gathered member slots. unsafe fn emit_bucket( keys_sorted: &[u32], slots: &[u32], @@ -516,14 +531,15 @@ unsafe fn emit_bucket( clamp: usize, w_in: usize, w_out: usize, + prefetch: bool, ) -> usize { #[cfg(target_arch = "x86_64")] { if is_x86_feature_detected!("avx2") { - return emit_bucket_avx2(keys_sorted, slots, sorted, kout, sout, pout, clamp, w_in, w_out); + return emit_bucket_avx2(keys_sorted, slots, sorted, kout, sout, pout, clamp, w_in, w_out, prefetch); } } - emit_bucket_scalar(keys_sorted, slots, sorted, kout, sout, pout, clamp, w_in, w_out) + emit_bucket_scalar(keys_sorted, slots, sorted, kout, sout, pout, clamp, w_in, w_out, prefetch) } /// Group `n` entries by their leading block, then emit one child per colliding @@ -546,6 +562,9 @@ fn collide( // Sub-phase timing, gated on `EQ_PROFILE`. Prints partition / count / alloc / // emit splits so we can see which part of the round dominates. let prof = std::env::var_os("EQ_PROFILE").is_some(); + // Software-prefetch the emit gather (on by default; `EQ_NO_PREFETCH` disables + // it for A/B benchmarking). Read once, not per group. + let do_prefetch = std::env::var_os("EQ_NO_PREFETCH").is_none(); let t0 = std::time::Instant::now(); let (starts, order, keys_part) = partition_top(keys, n); @@ -609,8 +628,9 @@ fn collide( let mut sorted = Vec::new(); let mut keys_sorted = Vec::new(); low_group(&keys_part[lo..hi], &order[lo..hi], &mut hist, &mut sorted, &mut keys_sorted); - let w = - unsafe { emit_bucket(&keys_sorted, slots, &sorted, kout, sout, pout, clamp, w_in, w_out) }; + let w = unsafe { + emit_bucket(&keys_sorted, slots, &sorted, kout, sout, pout, clamp, w_in, w_out, do_prefetch) + }; debug_assert_eq!(w, kout.len()); }); @@ -722,18 +742,29 @@ pub fn solve_with(header: &[u8], clamp: Option) -> Vec> { let hasher = BatchHasher::new(header); let mut keys: Vec = vec![0u32; n0]; let mut slots: Vec = vec![0u32; n0 * SLOT]; - let kgroup = 4 * INDICES_PER_HASH_OUTPUT; // eight entries + // Hash eight `g` values per chunk via `hash8`, which uses one AVX-512 + // compression where available and falls back to two AVX2 `hash4`s (or scalar) + // otherwise — so this single path covers every CPU. + let kgroup = 8 * INDICES_PER_HASH_OUTPUT; // sixteen entries let sgroup = kgroup * SLOT; - debug_assert_eq!(n0 % kgroup, 0, "round-0 buffer must split into whole 4-g groups"); + debug_assert_eq!(n0 % kgroup, 0, "round-0 buffer must split into whole 8-g groups"); + // `EQ_NO_AVX512` forces the AVX2 fallback (two hash4) for A/B benchmarking. + let use_avx512 = std::env::var_os("EQ_NO_AVX512").is_none(); slots .par_chunks_mut(sgroup) .zip(keys.par_chunks_mut(kgroup)) .enumerate() .for_each(|(c, (schunk, kchunk))| { - let g0 = (c * 4) as u32; - let mut hs = [[0u8; HASH_OUTPUT]; 4]; - hasher.hash4(g0, &mut hs); - for j in 0..4 { + let g0 = (c * 8) as u32; + let mut hs = [[0u8; HASH_OUTPUT]; 8]; + if use_avx512 { + hasher.hash8(g0, &mut hs); + } else { + let (lo, hi) = hs.split_at_mut(4); + hasher.hash4(g0, (&mut lo[..4]).try_into().unwrap()); + hasher.hash4(g0 + 4, (&mut hi[..4]).try_into().unwrap()); + } + for j in 0..8 { for i in 0..INDICES_PER_HASH_OUTPUT { let e = j * INDICES_PER_HASH_OUTPUT + i; let src = &hs[j][i * HASH_BYTES..i * HASH_BYTES + HASH_BYTES];