diff --git a/collab/jmprcx-solver/Cargo.toml b/collab/jmprcx-solver/Cargo.toml deleted file mode 100644 index 7b67928..0000000 --- a/collab/jmprcx-solver/Cargo.toml +++ /dev/null @@ -1,12 +0,0 @@ -[package] -name = "jmprcx-solver" -version = "0.1.0" -edition = "2021" -description = "Load and drive the Equihash 192,7 GPU solver fatbin via the CUDA Driver API" - -[[bin]] -name = "jmprcx-solver" -path = "src/main.rs" - -[dependencies] -# none on purpose: raw FFI to the system CUDA driver (libcuda), no network needed diff --git a/collab/jmprcx-solver/README.md b/collab/jmprcx-solver/README.md deleted file mode 100644 index ea71566..0000000 --- a/collab/jmprcx-solver/README.md +++ /dev/null @@ -1,85 +0,0 @@ -# miniz-solver-rs - -Basic Rust program that **uses the extracted miniZ Equihash 192,7 GPU solver**. -It loads the captured CUDA fatbin (`../miniz-dump/solver_192_7/equihash192_7.fatbin`) -through the CUDA Driver API (raw FFI to `libcuda`, no external crates) and drives -its kernels on the GPU. - -## Build & run - -```sh -cargo build --release -./target/release/jmprcx-solver # load + enumerate all 57 kernels -./target/release/jmprcx-solver --launch # also execute a real solver kernel -./target/release/jmprcx-solver --round0 # replay round 0 (digit_f) with a captured midstate -./target/release/jmprcx-solver /path/to.fatbin # use a different fatbin -``` - -Requires an NVIDIA GPU + driver (`/usr/lib/libcuda.so`). The fatbin contains -`sm_80`/`sm_86`/`sm_120` cubins; the driver auto-picks the one for your GPU. - -## What it does - -- `cuInit` → context on GPU#0 -- `cuModuleLoadData` on the raw fatbin (magic `0xBA55ED50`) -- `cuModuleEnumerateFunctions` + `cuFuncGetName` + `cuFuncGetAttribute`: - lists every kernel with regs / shared / local / max-threads and labels the - Wagner `n=192,k=7` pipeline: - `digit_f` (round 0: BLAKE2b + bucketing) → `digit_1..3`, `digit_4w/5w/6w` - (rounds 1–6) → `digit_l` (round 7: solution recovery) → `sort_and_compress`. 
-- with `--launch`: allocates a device buffer and launches the real - `cleanup<64>(void*, uint)` kernel, then `cuCtxSynchronize`. -- with `--round0`: drives the real **round 0** (`digit_f`) — allocates the four - buffers at their template sizes, launches the exact runtime variant - (grid=65536, block=256) with a BLAKE2b midstate captured from a live job, and - reads back the bucket counters. Verified output: **33,554,432 = 2^25** entries - bucketed into 12288 buckets (the correct 192,7 initial-entry count). -- with `--replay [rec.log]`: **runs the entire solver** — parses a recorded pass - (`recording.log`), allocates one arena, rebases every device pointer, and - executes all 10 kernels (`cleanup → digit_f → digit_1..6 → digit_l → - sort_and_compress`). All kernels complete; extracts a 128-index candidate. -- with `--header <hex>`: computes a BLAKE2b(192,7) midstate from a 140-byte - header, injects it, and runs the full pipeline (mint a new job). -- with `--selftest`: BLAKE2b-512 known-answer test (RFC 7693) — PASS. -- with `--verify-share`: verify a real pool-accepted share (BLAKE2b + Wagner) — VALID. -- with `--solve`: **the complete solver** — inject a known header's midstate+tail, - run the GPU pipeline, and harvest a solution from the container that the verifier - accepts. Reproducibly prints `SOLUTION HARVESTED FROM GPU — VALID ✓`. - -See `../miniz-dump/solver_192_7/ORCHESTRATION.md` for the full pipeline + recovery. - -### Status (honest) -- **Pipeline: complete.** All 10 kernels run standalone; round 0 verified bit-exact - (2^25 entries). Faithful end-to-end replay of miniZ's 192,7 solver. -- **Hash model + verification: SOLVED.** Captured live stratum (plaintext) via a - logging relay; a real pool-accepted share verifies exactly under - `hash(i) = BLAKE2b(header‖LE32(i/2), person="ZcashPoW"+LE32(192)+LE32(7), - digest=48)[(i%2)*24..]`. `--verify-share` reproduces VALID ✓ (192/192 zero bits, - all 7 Wagner levels) in Rust. 
So `--selftest`, `blake2b.rs`, `verify.rs` and the - solution decoder are all proven against ground truth. -- **Complete (`--solve`).** Container = 128 consecutive u32 indices at offset 0; - the midstate is textbook BLAKE2b-after-128B and the digit_f `uint` is the 4 - varying header-tail bytes (nonce[28..31]; nonce[20..27] are constant 0). So: - `header → midstate+tail → GPU pipeline → container[0..128] → VALID solution`, - reproducibly. The miniZ Equihash 192,7 solver is fully reverse-engineered. - -## What it does NOT do (scope) - -It does **not** mine or produce valid Equihash solutions. A working solver also -needs miniZ's host orchestration, which is not part of the extracted kernels: - -- exact device-buffer sizing per round (the kernels' template/array dims give the - bucket geometry, e.g. `uint4[180][6656][32]`, but the host owns allocation) -- the precise `digit_f → digit_1..6 → digit_l → sort_and_compress` launch - sequence with the correct grid/block dims and shared-mem config per round -- BLAKE2b midstate setup from the block header + nonce, and the `equi<...>` / - `ScontainerReal192` struct layouts passed between kernels - -That host logic lives in miniZ's encrypted blob. Reconstructing it (from the SASS -in `../miniz-dump/solver_192_7/equihash192_7.sm_120.sass` plus the kernel -signatures in `kernels_demangled.txt`) is the next step toward a standalone miner. - -## Files -- `src/cuda.rs` — minimal CUDA Driver API FFI bindings -- `src/main.rs` — loader / enumerator / launch demo -- `build.rs` — links `libcuda` diff --git a/collab/jmprcx-solver/build.rs b/collab/jmprcx-solver/build.rs deleted file mode 100644 index 1eec2d0..0000000 --- a/collab/jmprcx-solver/build.rs +++ /dev/null @@ -1,8 +0,0 @@ -// Link against the system CUDA driver (libcuda.so -> libcuda.so.1). -// Falls back to the CUDA toolkit stub if the driver symlink isn't in /usr/lib. 
-fn main() { - println!("cargo:rustc-link-search=native=/usr/lib"); - println!("cargo:rustc-link-search=native=/usr/lib64"); - println!("cargo:rustc-link-search=native=/opt/cuda/targets/x86_64-linux/lib/stubs"); - println!("cargo:rustc-link-lib=dylib=cuda"); -} diff --git a/collab/jmprcx-solver/src/blake2b.rs b/collab/jmprcx-solver/src/blake2b.rs deleted file mode 100644 index e0d4a34..0000000 --- a/collab/jmprcx-solver/src/blake2b.rs +++ /dev/null @@ -1,173 +0,0 @@ -//! BLAKE2b-512 with Equihash (ZcashPoW, n=192, k=7) personalization, plus the -//! "midstate" client hands to `digit_f`: the BLAKE2b state after compressing the -//! first 128-byte block of the block header. -//! -//! Equihash params for 192,7: -//! personalization = "ZcashPoW" || LE32(192) || LE32(7) -//! digest_length = (512/192)*192/8 = 48 bytes (2 indices x 24 bytes) -//! -//! The 140-byte header + 4-byte index = 144 bytes hashed per index. The first -//! 128 bytes are header-independent of the index, so the client precompresses them on -//! the CPU into the 64-byte (8x u64) midstate; the GPU finishes per index. 

/// BLAKE2b initialisation vector: the eight state words before the parameter
/// block is XORed in (see `init_state` and `selftest`).
const IV: [u64; 8] = [
    0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
    0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
];

/// Message-word schedule: row `r` gives the order in which the 16 message words
/// are fed to `g` during round `r`. Rows 10 and 11 repeat rows 0 and 1.
const SIGMA: [[usize; 16]; 12] = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
    [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
    [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
    [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
    [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
    [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
    [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
    [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
    [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
];

/// Mixing step: folds message words `x` and `y` into the four working-vector
/// words `v[a]`, `v[b]`, `v[c]`, `v[d]` using wrapping adds, XORs and the
/// rotations 32 / 24 / 16 / 63.
#[inline]
fn g(v: &mut [u64; 16], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
    v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
    v[d] = (v[d] ^ v[a]).rotate_right(32);
    v[c] = v[c].wrapping_add(v[d]);
    v[b] = (v[b] ^ v[c]).rotate_right(24);
    v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
    v[d] = (v[d] ^ v[a]).rotate_right(16);
    v[c] = v[c].wrapping_add(v[d]);
    v[b] = (v[b] ^ v[c]).rotate_right(63);
}

/// Read one little-endian `u64` from an 8-byte slice.
///
/// Panics if `bytes.len() != 8`; every caller passes a fixed 8-byte window.
fn le_u64(bytes: &[u8]) -> u64 {
    let mut buf = [0u8; 8];
    buf.copy_from_slice(bytes);
    u64::from_le_bytes(buf)
}

/// Serialise state words as little-endian bytes into `out`.
///
/// Writes `out.len() / 8` words, so a 48-byte `out` receives `words[0..6]` and
/// a 64-byte `out` receives all eight.
fn store_le_words(words: &[u64], out: &mut [u8]) {
    for (chunk, word) in out.chunks_exact_mut(8).zip(words) {
        chunk.copy_from_slice(&word.to_le_bytes());
    }
}

/// One BLAKE2b compression of a 128-byte block into state `h`.
///
/// `t` is the total number of message bytes hashed so far, including this
/// block. `last` marks the final block of the message.
fn compress(h: &mut [u64; 8], block: &[u8; 128], t: u128, last: bool) {
    // Message block as sixteen little-endian words.
    let mut m = [0u64; 16];
    for (word, chunk) in m.iter_mut().zip(block.chunks_exact(8)) {
        *word = le_u64(chunk);
    }

    // Working vector: current state in the low half, IV in the high half.
    let mut v = [0u64; 16];
    v[..8].copy_from_slice(&h[..]);
    v[8..].copy_from_slice(&IV);
    v[12] ^= t as u64; // low 64 bits of the byte counter (truncation intended)
    v[13] ^= (t >> 64) as u64; // high 64 bits of the byte counter
    if last {
        v[14] = !v[14]; // finalisation flag
    }

    for s in &SIGMA {
        // Four column steps, then four diagonal steps.
        g(&mut v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
        g(&mut v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
        g(&mut v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
        g(&mut v, 3, 7, 11, 15, m[s[6]], m[s[7]]);
        g(&mut v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
        g(&mut v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
        g(&mut v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
        g(&mut v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
    }

    // Feed-forward: fold both halves of the working vector into the state.
    let (low, high) = v.split_at(8);
    for ((word, a), b) in h.iter_mut().zip(low).zip(high) {
        *word ^= a ^ b;
    }
}

/// Known-answer self-test of the core compression: standard BLAKE2b-512("abc").
///
/// Returns `true` when the computed digest equals the RFC 7693 test vector.
pub fn selftest() -> bool {
    let mut h = IV;
    h[0] ^= 0x0101_0000 ^ 64; // digest_length=64, fanout=1, depth=1, no personalization
    let msg = b"abc";
    let mut block = [0u8; 128];
    block[..msg.len()].copy_from_slice(msg);
    compress(&mut h, &block, msg.len() as u128, true);
    let mut out = [0u8; 64];
    store_le_words(&h, &mut out);
    // RFC 7693 test vector for BLAKE2b-512("abc")
    let expect: [u8; 64] = [
        0xba, 0x80, 0xa5, 0x3f, 0x98, 0x1c, 0x4d, 0x0d, 0x6a, 0x27, 0x97, 0xb6, 0x9f, 0x12, 0xf6, 0xe9,
        0x4c, 0x21, 0x2f, 0x14, 0x68, 0x5a, 0xc4, 0xb7, 0x4b, 0x12, 0xbb, 0x6f, 0xdb, 0xff, 0xa2, 0xd1,
        0x7d, 0x87, 0xc5, 0x39, 0x2a, 0xab, 0x79, 0x2d, 0xc2, 0x52, 0xd5, 0xde, 0x45, 0x33, 0xcc, 0x95,
        0x18, 0xd3, 0x8a, 0xa8, 0xdb, 0xf1, 0x92, 0x5a, 0xb9, 0x23, 0x86, 0xed, 0xd4, 0x00, 0x99, 0x23,
    ];
    out == expect
}

/// Initial BLAKE2b state for Equihash(192,7).
///
/// The IV is XORed with a parameter block carrying digest_length=48, fanout=1,
/// depth=1 and the personalization `"ZcashPoW" || LE32(192) || LE32(7)`.
pub fn init_state() -> [u64; 8] {
    let mut personal = [0u8; 16];
    personal[..8].copy_from_slice(b"ZcashPoW");
    personal[8..12].copy_from_slice(&192u32.to_le_bytes());
    personal[12..].copy_from_slice(&7u32.to_le_bytes());

    let mut h = IV;
    // param block: digest_length=48, key=0, fanout=1, depth=1
    h[0] ^= 0x0101_0000 ^ 48;
    // words 6,7 hold the 16-byte personalization
    h[6] ^= le_u64(&personal[..8]);
    h[7] ^= le_u64(&personal[8..]);
    h
}

/// The 64-byte midstate digit_f expects: state after compressing header[0..128].
/// `header` must be the 140-byte block header.
///
/// # Panics
/// Panics if `header` is shorter than 128 bytes; only the first 128 bytes are read.
pub fn midstate(header: &[u8]) -> [u8; 64] {
    assert!(header.len() >= 128, "header must be >= 128 bytes");
    let mut h = init_state();
    let mut block = [0u8; 128];
    block.copy_from_slice(&header[..128]);
    compress(&mut h, &block, 128, false);
    let mut out = [0u8; 64];
    store_le_words(&h, &mut out);
    out
}

/// Finalize a hash directly from a 64-byte midstate (h[0..8]) plus a final block
/// whose first `idx_len` bytes are LE(idx_word) and the rest zero, with the
/// total byte counter `t_total`. Returns the 48-byte digest (h[0..6]).
/// Used to test the GPU's per-index hash construction (midstate + index, no tail).
///
/// `idx_len` is clamped to 4, the width of `idx_word`.
pub fn digest_from_midstate(mid: &[u8; 64], idx_word: u32, idx_len: usize, t_total: u128) -> [u8; 48] {
    let mut h = [0u64; 8];
    for (word, chunk) in h.iter_mut().zip(mid.chunks_exact(8)) {
        *word = le_u64(chunk);
    }
    let used = idx_len.min(4);
    let mut block = [0u8; 128];
    block[..used].copy_from_slice(&idx_word.to_le_bytes()[..used]);
    compress(&mut h, &block, t_total, true);
    let mut out = [0u8; 48];
    store_le_words(&h, &mut out);
    out
}

// Full Equihash(192,7) per-index hash: BLAKE2b(header || LE32(g)) -> 48 bytes,
// where g = index / 2.
Returns the 24-byte half selected by (index & 1). -/// Used for solution verification (reference path, midstate not required). -pub fn index_hash(header: &[u8], index: u32) -> [u8; 24] { - let mut h = init_state(); - // header is 140 bytes; append LE32(g) -> 144 bytes total = one full block + 16 - let g_word = (index / 2).to_le_bytes(); - let mut input = Vec::with_capacity(144); - input.extend_from_slice(&header[..140]); - input.extend_from_slice(&g_word); - // block 1 - let mut b0 = [0u8; 128]; - b0.copy_from_slice(&input[0..128]); - compress(&mut h, &b0, 128, false); - // final block (16 bytes used, rest zero), t = 144, last - let mut b1 = [0u8; 128]; - b1[..16].copy_from_slice(&input[128..144]); - compress(&mut h, &b1, 144, true); - let mut full = [0u8; 64]; - for i in 0..8 { - full[i * 8..i * 8 + 8].copy_from_slice(&h[i].to_le_bytes()); - } - // 48-byte digest -> two 24-byte index hashes - let half = (index & 1) as usize * 24; - full[half..half + 24].try_into().unwrap() -} diff --git a/collab/jmprcx-solver/src/cuda.rs b/collab/jmprcx-solver/src/cuda.rs deleted file mode 100644 index 52c98c8..0000000 --- a/collab/jmprcx-solver/src/cuda.rs +++ /dev/null @@ -1,93 +0,0 @@ -//! Minimal raw FFI bindings to the CUDA Driver API (libcuda) — only what we need -//! to load Equihash 192,7 fatbin and drive its kernels. 
- -#![allow(non_camel_case_types, non_snake_case, dead_code)] - -use std::ffi::{c_char, c_int, c_uint, c_void, CStr}; - -pub type CUresult = c_int; -pub type CUdevice = c_int; -pub type CUcontext = *mut c_void; -pub type CUmodule = *mut c_void; -pub type CUfunction = *mut c_void; -pub type CUstream = *mut c_void; -pub type CUdeviceptr = u64; - -pub const CUDA_SUCCESS: CUresult = 0; - -// CUfunction_attribute values (cuda.h) -pub const CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: c_int = 0; -pub const CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: c_int = 1; -pub const CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: c_int = 2; -pub const CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: c_int = 3; -pub const CU_FUNC_ATTRIBUTE_NUM_REGS: c_int = 4; -pub const CU_FUNC_ATTRIBUTE_PTX_VERSION: c_int = 5; -pub const CU_FUNC_ATTRIBUTE_BINARY_VERSION: c_int = 6; -pub const CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: c_int = 8; - -// cuLaunchKernel `extra` directives -pub const CU_LAUNCH_PARAM_END: usize = 0x00; -pub const CU_LAUNCH_PARAM_BUFFER_POINTER: usize = 0x01; -pub const CU_LAUNCH_PARAM_BUFFER_SIZE: usize = 0x02; - -extern "C" { - pub fn cuInit(flags: c_uint) -> CUresult; - pub fn cuDriverGetVersion(version: *mut c_int) -> CUresult; - pub fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult; - pub fn cuDeviceGetName(name: *mut c_char, len: c_int, dev: CUdevice) -> CUresult; - pub fn cuCtxCreate_v2(pctx: *mut CUcontext, flags: c_uint, dev: CUdevice) -> CUresult; - pub fn cuCtxDestroy_v2(ctx: CUcontext) -> CUresult; - pub fn cuCtxSynchronize() -> CUresult; - - // Module / kernel loading + introspection - pub fn cuModuleLoadData(module: *mut CUmodule, image: *const c_void) -> CUresult; - pub fn cuModuleUnload(module: CUmodule) -> CUresult; - pub fn cuModuleGetFunction(func: *mut CUfunction, module: CUmodule, name: *const c_char) -> CUresult; - pub fn cuModuleGetFunctionCount(count: *mut c_uint, module: CUmodule) -> CUresult; - pub fn cuModuleEnumerateFunctions(functions: *mut CUfunction, num: 
c_uint, module: CUmodule) -> CUresult; - pub fn cuFuncGetName(name: *mut *const c_char, func: CUfunction) -> CUresult; - pub fn cuFuncGetAttribute(pi: *mut c_int, attrib: c_int, func: CUfunction) -> CUresult; - pub fn cuFuncSetAttribute(func: CUfunction, attrib: c_int, value: c_int) -> CUresult; - - // Memory + launch - pub fn cuMemAlloc_v2(dptr: *mut CUdeviceptr, bytesize: usize) -> CUresult; - pub fn cuMemsetD8_v2(dptr: CUdeviceptr, uc: u8, n: usize) -> CUresult; - pub fn cuMemsetD32_v2(dptr: CUdeviceptr, ui: u32, n: usize) -> CUresult; - pub fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult; - pub fn cuMemcpyDtoH_v2(dst: *mut c_void, src: CUdeviceptr, n: usize) -> CUresult; - pub fn cuMemGetInfo_v2(free: *mut usize, total: *mut usize) -> CUresult; - pub fn cuLaunchKernel( - f: CUfunction, - gx: c_uint, gy: c_uint, gz: c_uint, - bx: c_uint, by: c_uint, bz: c_uint, - shared_mem_bytes: c_uint, - stream: CUstream, - kernel_params: *mut *mut c_void, - extra: *mut *mut c_void, - ) -> CUresult; - - pub fn cuGetErrorName(error: CUresult, s: *mut *const c_char) -> CUresult; - pub fn cuGetErrorString(error: CUresult, s: *mut *const c_char) -> CUresult; -} - -/// Human-readable "NAME: description" for a CUresult. -pub fn err_str(code: CUresult) -> String { - unsafe { - let mut name: *const c_char = std::ptr::null(); - let mut desc: *const c_char = std::ptr::null(); - cuGetErrorName(code, &mut name); - cuGetErrorString(code, &mut desc); - let n = if name.is_null() { "?".into() } else { CStr::from_ptr(name).to_string_lossy().into_owned() }; - let d = if desc.is_null() { "".into() } else { CStr::from_ptr(desc).to_string_lossy().into_owned() }; - format!("{n} ({code}): {d}") - } -} - -/// Turn a CUresult into a Result, with context. 
-pub fn check(code: CUresult, what: &str) -> Result<(), String> { - if code == CUDA_SUCCESS { - Ok(()) - } else { - Err(format!("{what} failed: {}", err_str(code))) - } -} diff --git a/collab/jmprcx-solver/src/equihash192_7.fatbin b/collab/jmprcx-solver/src/equihash192_7.fatbin deleted file mode 100644 index f25d64d..0000000 Binary files a/collab/jmprcx-solver/src/equihash192_7.fatbin and /dev/null differ diff --git a/collab/jmprcx-solver/src/main.rs b/collab/jmprcx-solver/src/main.rs deleted file mode 100644 index 0e4d0da..0000000 --- a/collab/jmprcx-solver/src/main.rs +++ /dev/null @@ -1,310 +0,0 @@ -//! jmprcx-solver — load the Equihash 192,7 GPU solver fatbin -//! and drive its kernels through the CUDA Driver API. -//! -//! What this does (always, safely): -//! * cuInit + create a context on the chosen GPU -//! * load the captured fatbin (auto-selects the cubin matching your GPU arch) -//! * enumerate every kernel, print attributes, and label the Wagner pipeline -//! -//! What this does with `--launch` (experimental): -//! * actually launches one real solver kernel (`cleanup<64>(void*, uint)`), -//! which has a simple, known signature, and synchronizes. -//! - -mod cuda; -mod round0; -mod replay; -mod blake2b; -mod verify; -use cuda::*; -use std::ffi::{c_void, CStr, CString}; -use std::ptr; - -const DEFAULT_FATBIN: &str = - "/home/access/RustroverProjects/zclminer/collab/jmprcx-solver/src/equihash192_7.fatbin"; -const CLEANUP_MANGLED: &str = "_Z7cleanupILj64EEvPvj"; // void cleanup<64u>(void*, unsigned) - -/// Best-effort role label from the (mangled) kernel name. 
fn role(name: &str) -> &'static str {
    // First matching fragment in KERNEL_FAMILIES wins; unknown names are "other".
    kernel_family(name).map_or("other", |(_, label)| label)
}

/// Family key (the matching mangled-name fragment, e.g. `"8digit_4w"`) for a
/// kernel name, or `"other"` when no known fragment occurs in it.
fn family(name: &str) -> &'static str {
    kernel_family(name).map_or("other", |(key, _)| key)
}

/// Mangled-name fragment -> role label, in match-priority order.
///
/// Single source of truth for both `role` and `family`, so the two can never
/// disagree about which kernels are known.
const KERNEL_FAMILIES: [(&str, &str); 10] = [
    ("7digit_f", "round 0 : BLAKE2b hash + initial bucketing"),
    ("7digit_1", "round 1 : Wagner collision"),
    ("7digit_2", "round 2 : Wagner collision"),
    ("7digit_3", "round 3 : Wagner collision"),
    ("8digit_4w", "round 4 : Wagner collision (wide)"),
    ("8digit_5w", "round 5 : Wagner collision (wide)"),
    ("8digit_6w", "round 6 : Wagner collision (wide)"),
    ("7digit_l", "round 7 : final collision + solution recovery"),
    ("sort_and_compress", "post : sort + compress solutions"),
    ("7cleanup", "util : buffer cleanup"),
];

/// First `(fragment, label)` entry whose fragment occurs in `name`, if any.
fn kernel_family(name: &str) -> Option<(&'static str, &'static str)> {
    KERNEL_FAMILIES.iter().copied().find(|&(key, _)| name.contains(key))
}

/// A real, pool-accepted 192,7 block header (job 19ae0) captured from the wire.
/// Used by `--solve` as a known-good header so the GPU output can be verified.
const KNOWN_HEADER: &str = "040000002ba84c97ffc202b55a5843d55837d256fdc32410390b8e95502bd8f648040000cb560c7083a13e06273570350805668e83c3e2362e39e131612fead6f4ea9937a19ceba5b597e2217d7e0c53ba24de3d36b92cf97743550c2745c9464f4dc847ba9e1e6a34cf101e80032bb40ae5118877fccacf8d961e648f6a228d0000000000000000ce856809";

// Scan a container dump for a 128-index group the verifier accepts, using the
// proven per-index hash as an oracle. The range filter (128 consecutive u32 all
// in (0, 2^25)) is effectively impossible for random GPU memory, so the
// expensive XOR check runs only on real solution-shaped windows.
-fn scan_container(header: &[u8], bytes: &[u8]) -> Option> { - let u: Vec = bytes.chunks_exact(4).map(|c| u32::from_le_bytes(c.try_into().unwrap())).collect(); - if u.len() < 128 { return None; } - let mut checked = 0u64; - for start in 0..=u.len() - 128 { - let w = &u[start..start + 128]; - if !w.iter().all(|&x| x > 0 && x < (1 << 25)) { continue; } - let mut d = w.to_vec(); d.sort_unstable(); d.dedup(); - if d.len() != 128 { continue; } - checked += 1; - if verify::top_xor_zero_bits(w, |i| blake2b::index_hash(header, i)) >= 168 { - let (ok, _) = verify::verify(w, |i| blake2b::index_hash(header, i)); - if ok { - println!(" found at u32 offset {start} (after {checked} solution-shaped windows)"); - return Some(w.to_vec()); - } - } - } - println!(" {checked} solution-shaped windows checked, none verified"); - None -} - -/// Decode an Equihash 192,7 stratum solution (varint length + 128 x 25-bit -/// big-endian indices) into 128 indices. -fn decode_solution(hex: &str) -> Vec { - let raw = parse_hex(hex); - // strip the compactsize/varint length prefix (0xfd => 2-byte LE length) - let body = if raw.first() == Some(&0xfd) { &raw[3..] } else { &raw[1..] 
}; - let (mut acc, mut bits, mut out) = (0u64, 0u32, Vec::with_capacity(128)); - for &b in body { - acc = (acc << 8) | b as u64; - bits += 8; - while bits >= 25 { - bits -= 25; - out.push(((acc >> bits) & 0x1ff_ffff) as u32); - } - } - out.truncate(128); - out -} - -fn verify_share() { - const SOLUTION: &str = "fd900101420199f2d450c74cdec6d8f3437c5bb217e1e37cb50bacf43cb332bb3ded21346edbc173c868e724d1496f04f3f38bab5705abbb7b168e947bc16b75d4043ce7fb16c10f417c6de5ce8306b1aa5dcd02b7c9e49e6001193aae954c3a733f4f55ce5a9703692af8dea5014a587a1ba2d3a0cf03902cfd212fe5846bc9096bdc615a22e4c1f232d9b945de079c2f29aa3a9c87d0681612d8804a8ccf24c752df1837d4c31bb61b5266328dafeb46af26f96ecc74f2d59ad96c9bff231b4a5e7d87aa33bd916270e703c1d6f090ad8ad02cb86c0550f37585042135ae202f5848bb0b0e695cfe638dfdf89c325833a98125c0f765c6d535e886c915cc01f775b9a35a5972c4ecc40afeb4ff083a7493ab8c238f188b2231218771810cb907f02506020d8f2525a627573126d20955d552328cd1557e34e225b4a2f09c411377055c039163df1c499a4e92a011bf71fc4e58839d23f5822d0a200f65ef194d0a3cf0919b35091b681db6db5293d49e2e12960994436d15300bef5f53799ba98e9e752af7842374f4abc6b5eecd5775de07"; - let header = parse_hex(KNOWN_HEADER); - let sol = decode_solution(SOLUTION); - println!("known-answer share (job 19ae0): header {} B, {} indices, {} distinct", - header.len(), sol.len(), { let mut s = sol.clone(); s.sort_unstable(); s.dedup(); s.len() }); - let zb = verify::top_xor_zero_bits(&sol, |i| blake2b::index_hash(&header, i)); - let (ok, msg) = verify::verify(&sol, |i| blake2b::index_hash(&header, i)); - println!(" full 128-leaf XOR leading zero bits = {zb} / 192"); - println!(" verify: {} — {msg}", if ok { "VALID ✓ (matches pool)" } else { "INVALID" }); -} - -fn parse_hex(s: &str) -> Vec { - let s: String = s.chars().filter(|c| c.is_ascii_hexdigit()).collect(); - (0..s.len() / 2).map(|i| u8::from_str_radix(&s[2 * i..2 * i + 2], 16).unwrap_or(0)).collect() -} - -fn main() { - if let Err(e) = run() { - eprintln!("\nerror: {e}"); - std::process::exit(1); - 
} -} - -fn run() -> Result<(), String> { - let args: Vec = std::env::args().collect(); - let do_launch = args.iter().any(|a| a == "--launch"); - let do_round0 = args.iter().any(|a| a == "--round0"); - let do_replay = args.iter().any(|a| a == "--replay"); - if args.iter().any(|a| a == "--selftest") { - println!("BLAKE2b-512 known-answer self-test: {}", - if blake2b::selftest() { "PASS" } else { "FAIL" }); - return Ok(()); - } - if args.iter().any(|a| a == "--verify-share") { - verify_share(); - return Ok(()); - } - let fatbin_path = args.iter().skip(1) - .find(|a| a.ends_with(".fatbin")) - .cloned() - .unwrap_or_else(|| DEFAULT_FATBIN.to_string()); - - // --- read the captured solver fatbin --- - let image = std::fs::read(&fatbin_path) - .map_err(|e| format!("reading fatbin {fatbin_path}: {e}"))?; - if image.len() < 4 || &image[0..4] != [0x50, 0xed, 0x55, 0xba] { - eprintln!("warning: {fatbin_path} does not start with the fatbin magic 0xBA55ED50"); - } - println!("== jmprcx Equihash 192,7 solver loader =="); - println!("fatbin : {fatbin_path} ({} bytes)", image.len()); - - unsafe { - // --- init driver + device + context --- - check(cuInit(0), "cuInit")?; - let mut ver = 0; - cuDriverGetVersion(&mut ver); - println!("driver : CUDA {}.{}", ver / 1000, (ver % 1000) / 10); - - let mut dev: CUdevice = 0; - check(cuDeviceGet(&mut dev, 0), "cuDeviceGet")?; - let mut name = [0i8; 128]; - cuDeviceGetName(name.as_mut_ptr() as *mut _, 128, dev); - let gpu = CStr::from_ptr(name.as_ptr() as *const _).to_string_lossy().into_owned(); - println!("device : GPU#0 {gpu}"); - - let mut ctx: CUcontext = ptr::null_mut(); - check(cuCtxCreate_v2(&mut ctx, 0, dev), "cuCtxCreate")?; - - // --- load the fatbin (driver picks the cubin matching this GPU's arch) --- - let mut module: CUmodule = ptr::null_mut(); - check(cuModuleLoadData(&mut module, image.as_ptr() as *const c_void), - "cuModuleLoadData") - .map_err(|e| format!("{e}\n(the fatbin has sm_80/sm_86/sm_120; the driver needs the cubin 
matching this GPU)"))?; - println!("module : loaded OK\n"); - - // --- enumerate every kernel in the solver --- - let mut count: u32 = 0; - check(cuModuleGetFunctionCount(&mut count, module), "cuModuleGetFunctionCount")?; - let mut funcs: Vec = vec![ptr::null_mut(); count as usize]; - check(cuModuleEnumerateFunctions(funcs.as_mut_ptr(), count, module), - "cuModuleEnumerateFunctions")?; - println!("solver exposes {count} device kernels:\n"); - println!(" {:<22} {:>5} {:>7} {:>7} {:>6} role", "name", "regs", "shared", "local", "maxT"); - println!(" {}", "-".repeat(86)); - - use std::collections::BTreeMap; - let mut by_family: BTreeMap<&str, u32> = BTreeMap::new(); - - for &f in &funcs { - let mut np: *const std::ffi::c_char = ptr::null(); - let fname = if cuFuncGetName(&mut np, f) == CUDA_SUCCESS && !np.is_null() { - CStr::from_ptr(np).to_string_lossy().into_owned() - } else { "".into() }; - - let attr = |a: i32| -> i32 { let mut v = 0; cuFuncGetAttribute(&mut v, a, f); v }; - let regs = attr(CU_FUNC_ATTRIBUTE_NUM_REGS); - let shared = attr(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES); - let local = attr(CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES); - let maxt = attr(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK); - - *by_family.entry(family(&fname)).or_insert(0) += 1; - - // show a short, stable handle: the C++ template head up to the first '<'/param block - let short: String = fname.chars().take(22).collect(); - println!(" {:<22} {:>5} {:>7} {:>7} {:>6} {}", short, regs, shared, local, maxt, role(&fname)); - } - - println!("\nkernel families (Wagner n=192, k=7 pipeline):"); - for (fam, n) in &by_family { - println!(" {:<20} x{:<3} {}", fam.trim_start_matches(char::is_numeric), n, role(fam)); - } - - // --- optional: actually launch one real solver kernel --- - if do_launch { - println!("\n--launch: running cleanup<64>(void*, uint) on the GPU ..."); - let cname = CString::new(CLEANUP_MANGLED).unwrap(); - let mut cf: CUfunction = ptr::null_mut(); - match check(cuModuleGetFunction(&mut cf, 
module, cname.as_ptr()), "cuModuleGetFunction(cleanup)") { - Err(e) => println!(" skipped: {e}"), - Ok(()) => { - let bytes: usize = 64 * 1024 * 1024; - let mut dptr: CUdeviceptr = 0; - check(cuMemAlloc_v2(&mut dptr, bytes), "cuMemAlloc")?; - check(cuMemsetD8_v2(dptr, 0xCC, bytes), "cuMemset")?; // poison so we can see it run - - let n: u32 = 1024; - let block: u32 = 64; - let grid: u32 = (n + block - 1) / block; - let mut p_buf: CUdeviceptr = dptr; - let mut p_n: u32 = n; - let mut params: [*mut c_void; 2] = [ - &mut p_buf as *mut _ as *mut c_void, - &mut p_n as *mut _ as *mut c_void, - ]; - let rc = cuLaunchKernel(cf, grid, 1, 1, block, 1, 1, 0, - ptr::null_mut(), params.as_mut_ptr(), ptr::null_mut()); - if rc != CUDA_SUCCESS { - println!(" launch returned: {}", err_str(rc)); - } else { - let sync = cuCtxSynchronize(); - if sync == CUDA_SUCCESS { - println!(" launch OK: grid={grid} block={block} — kernel executed and synchronized."); - } else { - println!(" launched, but sync error: {}", err_str(sync)); - println!(" (expected-ish: exact element count/indexing for cleanup is unverified)"); - } - } - cuMemFree_v2(dptr); - } - } - } else if !do_round0 { - println!("\n(tip: `--launch` runs cleanup<64>; `--round0` replays digit_f round 0)"); - } - - // --- optional: drive the real round-0 (digit_f) pipeline stage --- - if do_round0 { - if let Err(e) = round0::run(module) { - println!("round 0: {e}"); - } - } - - // --- replay the pipeline; optionally solve a known header via the verifier oracle --- - let header_hex = args.iter().position(|a| a == "--header").and_then(|i| args.get(i + 1)).cloned(); - let do_solve = args.iter().any(|a| a == "--solve"); - if do_replay || do_solve || header_hex.is_some() { - let rec_path = args.iter().skip(1).find(|a| a.ends_with(".log")).cloned() - .unwrap_or_else(|| "recording.log".to_string()); - match replay::parse_recording(&rec_path) { - Err(e) => println!("replay: {e}"), - Ok(rec) => { - // header to solve: --solve uses the 
captured known-good job; --header is user-supplied - let header: Option> = if do_solve { - Some(parse_hex(KNOWN_HEADER)) - } else { - header_hex.as_ref().map(|h| parse_hex(h)).filter(|h| h.len() >= 140) - }; - let inject = header.as_ref().map(|h| { - let mid = blake2b::midstate(h); - replay::Inject { midstate: mid, tail4: [h[136], h[137], h[138], h[139]] } - }); - if let Some(h) = &header { - println!("solving header ({} B); midstate=compress(header[0..128]), tail={:02x?}", - h.len(), &h[136..140]); - } - match replay::run(module, &rec, inject) { - Err(e) => println!("replay: {e}"), - Ok((_first, _mid, container)) => match &header { - None => println!("pipeline ran (no header to verify against)"), - Some(h) => { - println!("\nscanning container ({} MB) with the proven verifier as oracle...", container.len() / 1048576); - match scan_container(h, &container) { - Some(sol) => { - let (ok, msg) = verify::verify(&sol, |i| blake2b::index_hash(h, i)); - println!("\n*** SOLUTION HARVESTED FROM GPU — {} ***", if ok { "VALID ✓" } else { "?" }); - println!(" {msg}"); - println!(" indices: {:?}{}", &sol[..8], " ..."); - } - None => println!(" no verifying 128-index group in the dumped window"), - } - } - }, - } - } - } - } - - cuModuleUnload(module); - cuCtxDestroy_v2(ctx); - } - Ok(()) -} diff --git a/collab/jmprcx-solver/src/replay.rs b/collab/jmprcx-solver/src/replay.rs deleted file mode 100644 index a19fa6e..0000000 --- a/collab/jmprcx-solver/src/replay.rs +++ /dev/null @@ -1,213 +0,0 @@ -//! Full-pipeline replay of an Equihash 192,7 solve. -//! -//! The whole pipeline addresses a single ~16 GB arena, so here we: -//! 1. allocate our own arena, -//! 2. for each recorded launch, rebase every device pointer in its arg buffer -//! (arena_base + (ptr - recorded_arena_base)), -//! 3. launch the same kernel with the same grid/block/shared via the -//! `extra`/BUFFER_POINTER mechanism, -//! 4. run cleanup -> digit_f -> digit_1..6 -> digit_l -> sort_and_compress. -//! -//! 
`inject_midstate` (Some 64 bytes) overrides digit_f's midstate so a caller -//! can mint a new job from a header (see blake2b.rs). - -use crate::cuda::*; -use std::ffi::{c_void, CString}; -use std::ptr; - -pub struct Launch { - pub name: String, - pub grid: (u32, u32, u32), - pub block: (u32, u32, u32), - pub shared: u32, - pub arg: Vec, -} - -pub struct Recording { - pub allocs: Vec<(u64, u64)>, // (base, size) - pub pass: Vec, // first full 10-kernel pass -} - -fn triplet(s: &str) -> (u32, u32, u32) { - let v: Vec = s.split(',').filter_map(|x| x.parse().ok()).collect(); - (v[0], v[1], v[2]) -} - -pub fn parse_recording(path: &str) -> Result { - let text = std::fs::read_to_string(path).map_err(|e| format!("read {path}: {e}"))?; - let mut allocs = Vec::new(); - let mut launches = Vec::new(); - for line in text.lines() { - if let Some(rest) = line.strip_prefix("[alloc] ") { - // " bytes @ 0x" - let parts: Vec<&str> = rest.split_whitespace().collect(); - if parts.len() >= 4 { - if let (Ok(size), Some(hex)) = (parts[0].parse::(), parts[3].strip_prefix("0x")) { - if let Ok(base) = u64::from_str_radix(hex, 16) { - allocs.push((base, size)); - } - } - } - } else if let Some(rest) = line.strip_prefix("[REC] ") { - // " g=.. b=.. 
sh=N sz=N arg=" - let mut name = ""; - let (mut g, mut b, mut sh, mut arg) = ("", "", 0u32, ""); - for (i, tok) in rest.split_whitespace().enumerate() { - if i == 0 { name = tok; } - else if let Some(v) = tok.strip_prefix("g=") { g = v; } - else if let Some(v) = tok.strip_prefix("b=") { b = v; } - else if let Some(v) = tok.strip_prefix("sh=") { sh = v.parse().unwrap_or(0); } - else if let Some(v) = tok.strip_prefix("arg=") { arg = v; } - } - let bytes = (0..arg.len() / 2) - .map(|i| u8::from_str_radix(&arg[2 * i..2 * i + 2], 16).unwrap_or(0)) - .collect(); - launches.push(Launch { name: name.to_string(), grid: triplet(g), block: triplet(b), shared: sh, arg: bytes }); - } - } - // dedup consecutive duplicate allocs, take the first full pass (cleanup .. sort_and_compress) - let start = launches.iter().position(|l| l.name.contains("7cleanup")).ok_or("no cleanup launch in recording")?; - let end = launches[start..].iter().position(|l| l.name.contains("sort_and_compress")).ok_or("no sort_and_compress in recording")? + start; - let pass: Vec = launches.drain(start..=end).collect(); - Ok(Recording { allocs, pass }) -} - -/// number of bytes at the start of a kernel's arg buffer that are by-value -/// (not device pointers) and must NOT be rebased. -fn byval_prefix(name: &str) -> usize { - if name.contains("7digit_f") { 64 } // two ulonglong4 (BLAKE2b midstate) - else if name.contains("sort_and_compress") { 112 } // SHA256_CTX by value - else { 0 } -} - -/// Optional injection to make the GPU solve a header we know: -/// the 64-byte BLAKE2b midstate (= compress(header[0..128])) and the 4 header -/// tail bytes header[136..140] (digit_f's trailing `uint` arg; header[128..135] -/// are constant zero. 
-pub struct Inject { - pub midstate: [u8; 64], - pub tail4: [u8; 4], -} - -pub unsafe fn run(module: CUmodule, rec: &Recording, inject: Option) -> Result<(Vec, [u8; 64], Vec), String> { - println!("\n== full-pipeline replay ({} kernels) ==", rec.pass.len()); - - // identify the arena: the alloc that the most pass pointers fall into - let in_dev = |v: u64| (0x7000_0000_0000..0x8000_0000_0000).contains(&v); - let mut votes = vec![0u32; rec.allocs.len()]; - for l in &rec.pass { - let skip = byval_prefix(&l.name); - let mut off = skip; - while off + 8 <= l.arg.len() { - let v = u64::from_le_bytes(l.arg[off..off + 8].try_into().unwrap()); - if in_dev(v) { - if let Some(i) = rec.allocs.iter().position(|&(b, s)| v >= b && v < b + s) { - votes[i] += 1; - } - } - off += 8; - } - } - let ai = votes.iter().enumerate().max_by_key(|(_, &v)| v).map(|(i, _)| i).ok_or("no arena found")?; - let (arena_base, arena_size) = rec.allocs[ai]; - println!("arena : recorded base=0x{arena_base:x} size={} ({:.2} GB), {} ptrs", arena_size, arena_size as f64 / 1e9, votes[ai]); - - // allocate our arena: as much as fits (pipeline only touches the low ~7 GB) - let mut free = 0usize; let mut total = 0usize; - cuMemGetInfo_v2(&mut free, &mut total); - let alloc_size = (arena_size as usize).min(free.saturating_sub(1_500_000_000)); - let mut arena: CUdeviceptr = 0; - check(cuMemAlloc_v2(&mut arena, alloc_size), "alloc arena")?; - cuMemsetD8_v2(arena, 0, alloc_size); - println!("arena : allocated {:.2} GB at 0x{arena:x} (vram free {:.2} GB)", alloc_size as f64 / 1e9, free as f64 / 1e9); - - let rebase = |v: u64| -> u64 { arena + (v - arena_base) }; - - // replay every kernel - for (idx, l) in rec.pass.iter().enumerate() { - let cname = CString::new(l.name.clone()).unwrap(); - let mut f: CUfunction = ptr::null_mut(); - check(cuModuleGetFunction(&mut f, module, cname.as_ptr()), &format!("get {}", short(&l.name)))?; - - if l.shared > 0 { - // opt in to large dynamic shared memory (>48 KB) - 
cuFuncSetAttribute(f, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, l.shared as i32); - } - - // rebase pointers in a copy of the arg buffer - let mut arg = l.arg.clone(); - if l.name.contains("7digit_f") { - if let Some(inj) = &inject { - arg[0..64].copy_from_slice(&inj.midstate); // midstate - arg[96..100].copy_from_slice(&inj.tail4); // trailing uint = header[136..140] - } - } - let skip = byval_prefix(&l.name); - let mut off = skip; - let mut rebased = 0; - while off + 8 <= arg.len() { - let v = u64::from_le_bytes(arg[off..off + 8].try_into().unwrap()); - if in_dev(v) && v >= arena_base && v < arena_base + arena_size { - arg[off..off + 8].copy_from_slice(&rebase(v).to_le_bytes()); - rebased += 1; - } - off += 8; - } - - // launch via the extra / BUFFER_POINTER mechanism - let mut argsz = arg.len(); - let mut extra: [*mut c_void; 5] = [ - CU_LAUNCH_PARAM_BUFFER_POINTER as *mut c_void, - arg.as_mut_ptr() as *mut c_void, - CU_LAUNCH_PARAM_BUFFER_SIZE as *mut c_void, - &mut argsz as *mut _ as *mut c_void, - CU_LAUNCH_PARAM_END as *mut c_void, - ]; - let rc = cuLaunchKernel( - f, l.grid.0, l.grid.1, l.grid.2, l.block.0, l.block.1, l.block.2, - l.shared, ptr::null_mut(), ptr::null_mut(), extra.as_mut_ptr(), - ); - if rc != CUDA_SUCCESS { - cuMemFree_v2(arena); - return Err(format!("launch #{idx} {} failed: {}", short(&l.name), err_str(rc))); - } - let s = cuCtxSynchronize(); - if s != CUDA_SUCCESS { - cuMemFree_v2(arena); - return Err(format!("kernel #{idx} {} sync error: {}", short(&l.name), err_str(s))); - } - println!(" [{idx}] {:<18} grid={:<6} block={:<5} shmem={:<6} rebased {rebased} ptr(s) OK", - short(&l.name), l.grid.0, l.block.0, l.shared); - } - - // dump digit_l's container (+ first candidate) for oracle scanning - println!("\nreading digit_l container:"); - let mut sol: Vec = Vec::new(); - let mut container_bytes: Vec = Vec::new(); - if let Some(dl) = rec.pass.iter().find(|l| l.name.contains("7digit_l")) { - let p = |off: usize| 
u64::from_le_bytes(dl.arg[off..off + 8].try_into().unwrap()); - let counter = rebase(p(8)); - let container = rebase(p(16)); - let mut cnt = [0u32; 8]; - cuMemcpyDtoH_v2(cnt.as_mut_ptr() as *mut c_void, counter, 32); - let dump = 32 * 1024 * 1024usize; // 32 MB window of the container - container_bytes = vec![0u8; dump]; - cuMemcpyDtoH_v2(container_bytes.as_mut_ptr() as *mut c_void, container, dump); - sol = container_bytes[..512].chunks_exact(4).map(|c| u32::from_le_bytes(c.try_into().unwrap())).collect(); - println!(" counter[0]={} container[0..4]={:?} (dumped {} MB)", cnt[0], &sol[..4], dump / 1048576); - } - - // the midstate actually used by digit_f (injected, or from the recording) - let mut midstate = [0u8; 64]; - if let Some(df) = rec.pass.iter().find(|l| l.name.contains("7digit_f")) { - midstate.copy_from_slice(&df.arg[0..64]); - } - if let Some(inj) = &inject { midstate = inj.midstate; } - - cuMemFree_v2(arena); - Ok((sol, midstate, container_bytes)) -} - -fn short(name: &str) -> String { - name.split(['I', 'E']).next().unwrap_or(name).trim_start_matches('_').trim_start_matches("Z7").trim_start_matches("Z8").trim_start_matches("Z17").to_string() -} diff --git a/collab/jmprcx-solver/src/round0.rs b/collab/jmprcx-solver/src/round0.rs deleted file mode 100644 index ccac590..0000000 --- a/collab/jmprcx-solver/src/round0.rs +++ /dev/null @@ -1,109 +0,0 @@ -//! Round 0 (`digit_f`) standalone driver for the Equihash 192,7 solver. -//! -//! * launch config: grid=65536, block=256, shmem=0 -//! * argument layout: (ulonglong4 mid0, ulonglong4 mid1, uint4* A, uint4* B, -//! uchar* C, uint* counters, uint nonce) -//! * a real 64-byte BLAKE2b midstate + nonce captured from one job -//! * buffer sizes derived from the kernel template array dims -//! -//! We replay that exact job's round 0: hash + bucket on the GPU, then read back -//! the per-bucket counters to prove the round executed and distributed entries. 
- -use crate::cuda::*; -use std::ffi::{c_void, CString}; -use std::ptr; - -// Exact runtime variant (from the fatbin); demangled: -// void digit_f<656825858919744ul,2u,14u,12288u,3392u,1u,5498900316166ul, -// uint4[106][12288][32], uint4[106][12288][32], unsigned char[53][12288][64]> -// (ulonglong4, ulonglong4, uint4(*)[106][12288][32], uint4(*)[106][12288][32], -// unsigned char(*)[53][12288][64], unsigned int*, unsigned int) -const DIGIT_F: &str = "_Z7digit_fILm656825858919744ELj2ELj14ELj12288ELj3392ELj1ELm5498900316166EA106_A12288_A32_5uint4S3_A53_A12288_A64_hEv10ulonglong4S7_PT6_PT7_PT8_Pjj"; - -// 64-byte BLAKE2b midstate (8x u64 state) captured from a live job, passed as -// two ulonglong4 by value. -const MIDSTATE0: [u8; 32] = [ - 0x2d, 0xc6, 0x4e, 0x32, 0xef, 0x89, 0x19, 0x16, 0x30, 0xe1, 0x2d, 0x16, 0x17, 0xb9, 0xeb, 0xee, - 0x33, 0x8a, 0x63, 0xc6, 0xbb, 0xb3, 0x96, 0x33, 0xf1, 0x79, 0x25, 0x9a, 0x7a, 0x26, 0xae, 0x67, -]; -const MIDSTATE1: [u8; 32] = [ - 0x37, 0x5f, 0x85, 0x39, 0x46, 0x27, 0x08, 0xc0, 0xad, 0x3c, 0x08, 0xe3, 0xda, 0x65, 0xdf, 0xdd, - 0x27, 0x73, 0x1f, 0x13, 0x4d, 0x6f, 0xea, 0x58, 0x96, 0x0d, 0x8b, 0xf3, 0x7c, 0x29, 0x29, 0x9a, -]; -const NONCE_ARG: u32 = 1_508_556_231; - -// Buffer sizes from the template array dimensions. 
-const BUF_A: usize = 106 * 12288 * 32 * 16; // uint4[106][12288][32] ≈ 636 MB -const BUF_C: usize = 53 * 12288 * 64; // uchar[53][12288][64] ≈ 40 MB -const COUNTERS: usize = 64 * 1024 * 1024; // generous (observed array ≈ 1.5 MB) -const COUNT_READBACK: usize = 12288 * 32; // per-bucket-slot counters to inspect - -pub unsafe fn run(module: CUmodule) -> Result<(), String> { - println!("\n== round 0 (digit_f) standalone replay =="); - - let mut free: usize = 0; - let mut total: usize = 0; - cuMemGetInfo_v2(&mut free, &mut total); - println!( - "vram : {} MB free / {} MB total; need ~{} MB", - free / 1048576, total / 1048576, (2 * BUF_A + BUF_C + COUNTERS) / 1048576 - ); - - let cname = CString::new(DIGIT_F).unwrap(); - let mut f: CUfunction = ptr::null_mut(); - check(cuModuleGetFunction(&mut f, module, cname.as_ptr()), "cuModuleGetFunction(digit_f)")?; - println!("kernel : digit_f<...12288...> resolved, launching grid=65536 block=256"); - - // allocate the four device buffers - let (mut a, mut b, mut c, mut cnt): (CUdeviceptr, CUdeviceptr, CUdeviceptr, CUdeviceptr) = (0, 0, 0, 0); - check(cuMemAlloc_v2(&mut a, BUF_A), "alloc bufA")?; - check(cuMemAlloc_v2(&mut b, BUF_A), "alloc bufB")?; - check(cuMemAlloc_v2(&mut c, BUF_C), "alloc bufC")?; - check(cuMemAlloc_v2(&mut cnt, COUNTERS), "alloc counters")?; - cuMemsetD8_v2(a, 0, BUF_A); - cuMemsetD8_v2(b, 0, BUF_A); - cuMemsetD8_v2(c, 0, BUF_C); - cuMemsetD32_v2(cnt, 0, COUNTERS / 4); // cleanup<64> does this in the real pipeline - - let mut mid0 = MIDSTATE0; - let mut mid1 = MIDSTATE1; - let (mut pa, mut pb, mut pc, mut pcnt) = (a, b, c, cnt); - let mut nonce = NONCE_ARG; - let mut params: [*mut c_void; 7] = [ - mid0.as_mut_ptr() as *mut c_void, - mid1.as_mut_ptr() as *mut c_void, - &mut pa as *mut _ as *mut c_void, - &mut pb as *mut _ as *mut c_void, - &mut pc as *mut _ as *mut c_void, - &mut pcnt as *mut _ as *mut c_void, - &mut nonce as *mut _ as *mut c_void, - ]; - - let rc = cuLaunchKernel(f, 65536, 1, 1, 256, 1, 1, 
0, ptr::null_mut(), params.as_mut_ptr(), ptr::null_mut()); - let result = if rc != CUDA_SUCCESS { - Err(format!("launch failed: {}", err_str(rc))) - } else { - let s = cuCtxSynchronize(); - if s != CUDA_SUCCESS { - Err(format!("kernel sync error: {}", err_str(s))) - } else { - // read back the bucket counters and summarize - let mut host = vec![0u32; COUNT_READBACK]; - cuMemcpyDtoH_v2(host.as_mut_ptr() as *mut c_void, cnt, COUNT_READBACK * 4); - let nz = host.iter().filter(|&&x| x != 0).count(); - let sum: u64 = host.iter().map(|&x| x as u64).sum(); - let mx = host.iter().copied().max().unwrap_or(0); - println!("result : round 0 executed OK"); - println!(" {nz}/{COUNT_READBACK} counter slots non-zero"); - println!(" total bucketed entries = {sum} (max per slot = {mx})"); - println!(" (2^24 = {} threads each hashed; ~2^25 entries expected)", 1u64 << 24); - Ok(()) - } - }; - - cuMemFree_v2(a); - cuMemFree_v2(b); - cuMemFree_v2(c); - cuMemFree_v2(cnt); - result -} diff --git a/collab/jmprcx-solver/src/verify.rs b/collab/jmprcx-solver/src/verify.rs deleted file mode 100644 index 7344d78..0000000 --- a/collab/jmprcx-solver/src/verify.rs +++ /dev/null @@ -1,81 +0,0 @@ -//! Equihash (n=192, k=7) solution verification (Wagner tree). -//! -//! A solution is 2^k = 128 indices. With collision length c = n/(k+1) = 24 bits -//! and each per-index hash being n=192 bits (24 bytes): -//! * all indices distinct -//! * canonical ordering: at every tree node, the smallest index of the left -//! subtree < that of the right subtree -//! * at level r (1..=k), each block of 2^r leaves XORs to zero in its first -//! r*24 bits; the full 128-leaf XOR is zero over all 192 bits. - -const N_BITS: usize = 192; -const K: usize = 7; -const COLL: usize = N_BITS / (K + 1); // 24 - -/// number of leading zero bits in a 24-byte big-endian-ish hash (byte 0 = MSB). 
-fn leading_zero_bits(h: &[u8; 24]) -> usize { - let mut n = 0; - for &b in h { - if b == 0 { n += 8; } else { n += b.leading_zeros() as usize; break; } - } - n -} - -fn xor24(a: &[u8; 24], b: &[u8; 24]) -> [u8; 24] { - let mut o = [0u8; 24]; - for i in 0..24 { o[i] = a[i] ^ b[i]; } - o -} - -/// Verify a 128-index solution given a per-index hash function. -/// Returns (valid, diagnostic_string). -pub fn verify(indices: &[u32], hash: impl Fn(u32) -> [u8; 24]) -> (bool, String) { - if indices.len() != 128 { - return (false, format!("expected 128 indices, got {}", indices.len())); - } - // distinctness - let mut sorted = indices.to_vec(); - sorted.sort_unstable(); - sorted.dedup(); - if sorted.len() != 128 { - return (false, format!("indices not distinct ({} unique)", sorted.len())); - } - - // leaf hashes - let leaves: Vec<[u8; 24]> = indices.iter().map(|&i| hash(i)).collect(); - - // bottom-up: each level halves; check collision prefix grows by COLL bits - let mut level: Vec<[u8; 24]> = leaves.clone(); - let mut worst_zero = usize::MAX; - for r in 1..=K { - let need = r * COLL; - let mut next = Vec::with_capacity(level.len() / 2); - for pair in level.chunks(2) { - let x = xor24(&pair[0], &pair[1]); - let z = leading_zero_bits(&x); - worst_zero = worst_zero.min(z); - if z < need { - return (false, format!("level {r}: only {z} leading zero bits, need {need}")); - } - next.push(x); - } - level = next; - } - let full_zero = level.len() == 1 && level[0].iter().all(|&b| b == 0); - let msg = format!( - "all {K} levels pass collision checks; final XOR {} (min prefix zeros seen = {})", - if full_zero { "= 0 (VALID)" } else { "!= 0" }, worst_zero - ); - (full_zero, msg) -} - -/// Quick diagnostic when the hash model may be off: report the max leading-zero -/// bits of the full 128-leaf XOR (≈168+ means the hash model is correct). 
-pub fn top_xor_zero_bits(indices: &[u32], hash: impl Fn(u32) -> [u8; 24]) -> usize { - let mut acc = [0u8; 24]; - for &i in indices { - let h = hash(i); - for j in 0..24 { acc[j] ^= h[j]; } - } - leading_zero_bits(&acc) -}