//! CUDA Equihash 192,7 backend driving miniZ's extracted GPU solver.
//!
//! Unlike a hand-written solver, this backend loads miniZ's captured CUDA fatbin
//! (`miniz/equihash192_7.fatbin`) through the CUDA Driver API and replays its
//! exact 10-kernel Wagner pipeline. The launch sequence — grid/block/shared-mem
//! config and the pre-packed argument buffers for every kernel — was recorded at
//! the `libcuda` boundary from live mining runs; one recording per bundled
//! solver config is embedded from `miniz/configs/`. See `collab/jmprcx-solver/`
//! and `/home/access/code/miniz-dump/solver_192_7/ORCHESTRATION.md` for the
//! reverse-engineering work behind this.
//!
//! ## How a solve works
//!
//! Depending on the config, the whole pass addresses either a single large
//! arena (~15 GB as recorded) or several dedicated buffers. At init we allocate
//! our own buffer(s) and rebase every recorded device pointer into them
//! (`mine = my_base + (recorded - recorded_base)`). To solve a header we:
//! 1. compute the 64-byte BLAKE2b midstate = compress(header[0..128]) and the
//!    4 varying tail bytes header[136..140] on the CPU,
//! 2. inject them into `digit_f`'s argument buffer (arg[0..64] and arg[96..100]),
//! 3. replay `cleanup → digit_f → digit_1..3 → digit_4w/5w/6w → digit_l →
//!    sort_and_compress`,
//! 4. read `digit_l`'s solution counter and container (128 consecutive u32
//!    indices per solution at offset 0) back to the host,
//! 5. hand the recovered indices to [`equihash::filter_candidates`], which
//!    canonicalises and fully verifies each candidate against the real header.
//!
//! Step 5 is the correctness guarantee: only solutions that genuinely verify for
//! this exact header are ever returned, so the backend can never yield a bad
//! share. The kernel reconstructs the 8 header bytes [128..=135] (= nonce[20..=27])
//! as zero, matching miniZ's nonce layout; the standard miner nonce layout keeps
//! those bytes zero, so solutions verify. Any header whose bytes [128..=135] are
//! non-zero simply yields nothing (the verifier rejects the mismatched set)
rather than a wrong result. use std::ffi::{c_char, c_int, c_uint, c_void, CStr, CString}; use std::ptr; use anyhow::{anyhow, Result}; use crate::blake; use crate::equihash; use crate::params::HEADER_LEN; /// miniZ's captured Equihash 192,7 solver (sm_50/60/70/75/80/86/120 cubins; the /// driver picks the one matching the active GPU). sm_80/86/120 carry the full /// kernel set (all bucket configs); sm_50/60 also full; sm_70/75 carry a reduced /// set, so on those arches only a config whose kernels are present will replay. static FATBIN: &[u8] = include_bytes!("miniz/equihash192_7.fatbin"); /// One bundled solver configuration. miniZ ships several bucket geometries with /// different memory footprints; we pick the highest-capacity one that fits the /// card's free VRAM (see [`select_config`]). `table_capacity` is the number of /// table slots (higher ⇒ fewer dropped collisions ⇒ better solution yield). struct ConfigDef { name: &'static str, table_capacity: u64, recording: &'static str, } /// Bundled configs, captured from live miniZ runs (see `miniz/configs/README.md`). /// Ordered low→high capacity; selection scans for the best that fits. static CONFIGS: &[ConfigDef] = &[ ConfigDef { name: "2048x16960", table_capacity: 34_734_080, recording: include_str!("miniz/configs/config_2048x16960.log"), }, ConfigDef { name: "10000x4032", table_capacity: 40_325_000, recording: include_str!("miniz/configs/config_10000x32.log"), }, ConfigDef { name: "12288x3392", table_capacity: 41_713_664, recording: include_str!("miniz/configs/config_12288x32.log"), }, ]; /// VRAM held back for the CUDA context / driver and other processes. const VRAM_HEADROOM: usize = 1_500_000_000; /// Extra space past the highest pointer offset in a single over-allocated arena, /// to cover the buffer that lives at that offset. const ARENA_MARGIN: usize = 2 << 30; // 2 GiB /// Cap on solutions read back from the container per solve. 
The recorded /// container alloc is 1.5 MB = 3072 * 128 * 4 bytes, so this stays in-bounds. const MAX_SOLS: usize = 3072; // ---- CUDA Driver API FFI ---- type CUresult = c_int; type CUdevice = c_int; type CUcontext = *mut c_void; type CUmodule = *mut c_void; type CUfunction = *mut c_void; type CUstream = *mut c_void; type CUevent = *mut c_void; type CUdeviceptr = u64; const CUDA_SUCCESS: CUresult = 0; // CUfunction_attribute: opt in to >48 KB dynamic shared memory. const CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: c_int = 8; // cuLaunchKernel `extra` directives (miniZ passes a single pre-packed arg buffer). const CU_LAUNCH_PARAM_END: usize = 0x00; const CU_LAUNCH_PARAM_BUFFER_POINTER: usize = 0x01; const CU_LAUNCH_PARAM_BUFFER_SIZE: usize = 0x02; // The CUDA driver API, loaded at runtime via dlopen (see `crate::dylib`) rather // than linked at build time: the SONAME `libcuda.so.1` ships with the NVIDIA // driver (`nvcuda.dll` on Windows) and is absent on driver-less / AMD-only // hosts. `cuda_lib()` returns `None` when it can't be opened; the public entry // points below turn that into a clear error / empty device list, so the binary // still builds and starts everywhere. crate::dylib::dynamic_library! 
{ lib_struct: CudaLib, loader: cuda_lib, names: ["libcuda.so.1", "libcuda.so", "nvcuda.dll"], fn cuInit(flags: c_uint) -> CUresult; fn cuDeviceGetCount(count: *mut c_int) -> CUresult; fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult; fn cuDeviceGetName(name: *mut c_char, len: c_int, dev: CUdevice) -> CUresult; fn cuDeviceGetPCIBusId(pci_bus_id: *mut c_char, len: c_int, dev: CUdevice) -> CUresult; fn cuCtxCreate_v2(pctx: *mut CUcontext, flags: c_uint, dev: CUdevice) -> CUresult; fn cuCtxDestroy_v2(ctx: CUcontext) -> CUresult; fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult; fn cuModuleLoadData(module: *mut CUmodule, image: *const c_void) -> CUresult; fn cuModuleUnload(module: CUmodule) -> CUresult; fn cuModuleGetFunction(hfunc: *mut CUfunction, hmod: CUmodule, name: *const c_char) -> CUresult; fn cuFuncSetAttribute(func: CUfunction, attrib: c_int, value: c_int) -> CUresult; fn cuMemAlloc_v2(dptr: *mut CUdeviceptr, bytesize: usize) -> CUresult; fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult; fn cuMemsetD8_v2(dptr: CUdeviceptr, uc: u8, n: usize) -> CUresult; fn cuMemcpyDtoH_v2(dst: *mut c_void, src: CUdeviceptr, byte_count: usize) -> CUresult; fn cuMemcpyDtoHAsync_v2(dst: *mut c_void, src: CUdeviceptr, byte_count: usize, stream: CUstream) -> CUresult; fn cuMemAllocHost_v2(pp: *mut *mut c_void, bytesize: usize) -> CUresult; fn cuMemFreeHost(p: *mut c_void) -> CUresult; fn cuMemGetInfo_v2(free: *mut usize, total: *mut usize) -> CUresult; fn cuStreamCreate(stream: *mut CUstream, flags: c_uint) -> CUresult; fn cuStreamDestroy_v2(stream: CUstream) -> CUresult; fn cuEventCreate(event: *mut CUevent, flags: c_uint) -> CUresult; fn cuEventRecord(event: CUevent, stream: CUstream) -> CUresult; fn cuEventSynchronize(event: CUevent) -> CUresult; fn cuEventDestroy_v2(event: CUevent) -> CUresult; fn cuLaunchKernel( f: CUfunction, gx: c_uint, gy: c_uint, gz: c_uint, bx: c_uint, by: c_uint, bz: c_uint, shared_mem: c_uint, stream: CUstream, params: *mut *mut c_void, 
extra: *mut *mut c_void, ) -> CUresult; fn cuCtxSynchronize() -> CUresult; fn cuGetErrorName(error: CUresult, str: *mut *const c_char) -> CUresult; } /// Error returned when the CUDA driver library isn't present on the host. fn cuda_unavailable() -> anyhow::Error { anyhow!("CUDA driver library (libcuda.so.1) not found — is the NVIDIA driver installed?") } /// Turn a non-success `CUresult` into an error with the driver's symbolic name. fn check(code: CUresult, what: &str) -> Result<()> { if code == CUDA_SUCCESS { return Ok(()); } let name = unsafe { let mut p: *const c_char = ptr::null(); if cuGetErrorName(code, &mut p) == CUDA_SUCCESS && !p.is_null() { CStr::from_ptr(p).to_string_lossy().into_owned() } else { format!("CUDA error {code}") } }; Err(anyhow!("{what} failed: {name}")) } /// Number of CUDA devices (initialises the driver as a side effect). Returns an /// error if the CUDA driver library isn't installed. pub fn device_count() -> Result { cuda_lib().ok_or_else(cuda_unavailable)?; unsafe { check(cuInit(0), "cuInit")?; let mut n: c_int = 0; check(cuDeviceGetCount(&mut n), "cuDeviceGetCount")?; Ok(n as usize) } } /// List CUDA devices as human-readable strings. pub fn list_devices() -> Result> { let n = device_count()?; let mut out = Vec::with_capacity(n); unsafe { for i in 0..n { let mut dev: CUdevice = 0; let name = if cuDeviceGet(&mut dev, i as c_int) == CUDA_SUCCESS { let mut buf = [0i8; 128]; if cuDeviceGetName(buf.as_mut_ptr() as *mut c_char, 128, dev) == CUDA_SUCCESS { CStr::from_ptr(buf.as_ptr() as *const c_char).to_string_lossy().into_owned() } else { format!("CUDA device {i}") } } else { format!("CUDA device {i}") }; out.push(format!("[{i}] {name}")); } } Ok(out) } // ---- BLAKE2b midstate (Equihash 192,7 personalisation) ---- // // digit_f wants the 64-byte BLAKE2b state after compressing the first 128-byte // header block (not finalised); blake2b_simd doesn't expose that intermediate // state, so we compute it directly here. 
/// BLAKE2b initialisation vector: the eight 64-bit words the hash state starts
/// from and that seed the upper half of the working vector in [`compress`].
const BLAKE_IV: [u64; 8] = [
    0x6a09e667f3bcc908,
    0xbb67ae8584caa73b,
    0x3c6ef372fe94f82b,
    0xa54ff53a5f1d36f1,
    0x510e527fade682d1,
    0x9b05688c2b3e6c1f,
    0x1f83d9abfb41bd6b,
    0x5be0cd19137e2179,
];

/// Message-word schedule, one row per round. Rows 10 and 11 repeat rows 0 and 1.
const SIGMA: [[usize; 16]; 12] = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
    [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
    [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
    [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
    [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
    [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
    [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
    [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
    [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
];

/// The quarter-round: mixes message words `x` and `y` into the four working
/// words at indices `a`, `b`, `c`, `d`.
#[inline]
#[allow(clippy::too_many_arguments)]
fn mix(v: &mut [u64; 16], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
    // Two identical half-steps that differ only in the message word and the
    // pair of rotation distances.
    for &(word, rot_d, rot_b) in &[(x, 32u32, 24u32), (y, 16, 63)] {
        v[a] = v[a].wrapping_add(v[b]).wrapping_add(word);
        v[d] = (v[d] ^ v[a]).rotate_right(rot_d);
        v[c] = v[c].wrapping_add(v[d]);
        v[b] = (v[b] ^ v[c]).rotate_right(rot_b);
    }
}

/// One BLAKE2b compression of a 128-byte block into state `h`.
///
/// `t` is the byte counter fed into the working vector; `last` sets the
/// final-block flag (inverts word 14).
fn compress(h: &mut [u64; 8], block: &[u8; 128], t: u128, last: bool) {
    // Column steps first, then diagonal steps; lane `k` consumes schedule
    // entries `2k` and `2k + 1` of the current round.
    const LANES: [[usize; 4]; 8] = [
        [0, 4, 8, 12],
        [1, 5, 9, 13],
        [2, 6, 10, 14],
        [3, 7, 11, 15],
        [0, 5, 10, 15],
        [1, 6, 11, 12],
        [2, 7, 8, 13],
        [3, 4, 9, 14],
    ];

    // Sixteen little-endian message words.
    let mut m = [0u64; 16];
    for (word, bytes) in m.iter_mut().zip(block.chunks_exact(8)) {
        let mut le = [0u8; 8];
        le.copy_from_slice(bytes);
        *word = u64::from_le_bytes(le);
    }

    // Working vector: current state in the low half, IV in the high half.
    let mut v = [0u64; 16];
    v[..8].copy_from_slice(&h[..]);
    v[8..].copy_from_slice(&BLAKE_IV);
    v[12] ^= t as u64;
    v[13] ^= (t >> 64) as u64;
    if last {
        v[14] = !v[14];
    }

    for schedule in SIGMA.iter() {
        for (k, lane) in LANES.iter().enumerate() {
            let (x, y) = (m[schedule[2 * k]], m[schedule[2 * k + 1]]);
            mix(&mut v, lane[0], lane[1], lane[2], lane[3], x, y);
        }
    }

    // Fold both halves of the working vector back into the state.
    for (i, word) in h.iter_mut().enumerate() {
        *word ^= v[i] ^ v[i + 8];
    }
}

/// The 64-byte midstate `digit_f` expects: the BLAKE2b(192,7) state after
/// compressing header[0..128] (personalisation `"ZcashPoW"||LE32(192)||LE32(7)`,
/// digest length 48), serialised as eight little-endian words.
///
/// Panics if `header` is shorter than 128 bytes.
fn midstate(header: &[u8]) -> [u8; 64] {
    // digest_length=48, fanout=1, depth=1
    const PARAM_WORD0: u64 = 0x0101_0000 ^ 48;
    // LE32(192) || LE32(7) read back as one little-endian u64.
    const PERSONAL_HI: u64 = (7u64 << 32) | 192;

    let mut h = BLAKE_IV;
    h[0] ^= PARAM_WORD0;
    h[6] ^= u64::from_le_bytes(*b"ZcashPoW");
    h[7] ^= PERSONAL_HI;

    let mut first_block = [0u8; 128];
    first_block.copy_from_slice(&header[..128]);
    // Counter is 128 bytes consumed; not the final block, so no finalisation.
    compress(&mut h, &first_block, 128, false);

    let mut out = [0u8; 64];
    for (dst, word) in out.chunks_exact_mut(8).zip(h.iter()) {
        dst.copy_from_slice(&word.to_le_bytes());
    }
    out
}

// ---- Recording (driver-boundary launch trace) ----

/// One recorded `cuLaunchKernel`: kernel name, dims, dynamic shared bytes, and
/// the pre-packed argument buffer. As parsed, the buffer still holds the
/// recorded device pointers; `CudaSolver::new` rebases them into our own
/// buffers on a copy.
struct Launch { name: String, grid: (u32, u32, u32), block: (u32, u32, u32), shared: u32, arg: Vec, } /// Parsed recording: device allocations plus the first full 10-kernel pass. struct Recording { allocs: Vec<(u64, u64)>, // (base, size) pass: Vec, // cleanup .. sort_and_compress } fn triplet(s: &str) -> (u32, u32, u32) { let v: Vec = s.split(',').filter_map(|x| x.parse().ok()).collect(); ( v.first().copied().unwrap_or(1), v.get(1).copied().unwrap_or(1), v.get(2).copied().unwrap_or(1), ) } fn parse_recording(text: &str) -> Result { let mut allocs = Vec::new(); let mut launches = Vec::new(); for line in text.lines() { if let Some(rest) = line.strip_prefix("[alloc] ") { // " bytes @ 0x" let parts: Vec<&str> = rest.split_whitespace().collect(); if parts.len() >= 4 { if let (Ok(size), Some(hex)) = (parts[0].parse::(), parts[3].strip_prefix("0x")) { if let Ok(base) = u64::from_str_radix(hex, 16) { allocs.push((base, size)); } } } } else if let Some(rest) = line.strip_prefix("[REC] ") { // " g=.. b=.. sh=N sz=N arg=" let mut name = ""; let (mut g, mut b, mut sh, mut arg) = ("", "", 0u32, ""); for (i, tok) in rest.split_whitespace().enumerate() { if i == 0 { name = tok; } else if let Some(v) = tok.strip_prefix("g=") { g = v; } else if let Some(v) = tok.strip_prefix("b=") { b = v; } else if let Some(v) = tok.strip_prefix("sh=") { sh = v.parse().unwrap_or(0); } else if let Some(v) = tok.strip_prefix("arg=") { arg = v; } } let bytes = (0..arg.len() / 2) .map(|i| u8::from_str_radix(&arg[2 * i..2 * i + 2], 16).unwrap_or(0)) .collect(); launches.push(Launch { name: name.to_string(), grid: triplet(g), block: triplet(b), shared: sh, arg: bytes, }); } } // Take the first full pass: cleanup .. sort_and_compress. let start = launches .iter() .position(|l| l.name.contains("7cleanup")) .ok_or_else(|| anyhow!("no cleanup launch in recording"))?; let end = start + launches[start..] 
.iter() .position(|l| l.name.contains("sort_and_compress")) .ok_or_else(|| anyhow!("no sort_and_compress in recording"))?; let pass: Vec = launches.drain(start..=end).collect(); Ok(Recording { allocs, pass }) } /// Bytes at the start of a kernel's arg buffer that are by-value (not device /// pointers) and must NOT be rebased. fn byval_prefix(name: &str) -> usize { if name.contains("7digit_f") { 64 // two ulonglong4 (the BLAKE2b midstate) } else if name.contains("sort_and_compress") { 112 // SHA256_CTX by value } else { 0 } } /// Recorded device-address range used by miniZ's arena allocations. fn in_dev(v: u64) -> bool { (0x7000_0000_0000..0x8000_0000_0000).contains(&v) } /// A device buffer the pass references, with the highest offset dereferenced /// into it (`base`/`size` are the recorded allocation; `high_water` is the /// largest `ptr - base` seen). #[derive(Clone)] struct RefBuf { base: u64, size: u64, high_water: u64, } impl Recording { /// The distinct device buffers this pass references — the small configs use /// several separate allocations, the 12288 config one big arena. For each we /// track the highest offset dereferenced into it (its owner is the first /// recorded alloc that contains the pointer; reused addresses share a base, /// so the choice is unambiguous for rebasing). fn referenced_buffers(&self) -> Vec { let mut refs: Vec = Vec::new(); for l in &self.pass { let mut off = byval_prefix(&l.name); while off + 8 <= l.arg.len() { let v = u64::from_le_bytes(l.arg[off..off + 8].try_into().unwrap()); if in_dev(v) { if let Some(&(b, s)) = self.allocs.iter().find(|&&(b, s)| v >= b && v < b + s) { let hw = v - b; match refs.iter_mut().find(|r| r.base == b) { Some(r) => r.high_water = r.high_water.max(hw), None => refs.push(RefBuf { base: b, size: s, high_water: hw }), } } } off += 8; } } refs } } /// Minimum device memory a config needs. 
///
/// - One over-allocated arena (the 12288 config): only the region up to the
///   highest pointer plus the buffer there is touched, so we can cap it.
/// - Several dedicated buffers (the small configs): each is fully indexed by its
///   kernels, so all must be allocated at full size.
///
/// Concretely: with exactly one referenced buffer the result is
/// `high_water + ARENA_MARGIN`, clamped to the recorded size; otherwise it is
/// the sum of all recorded sizes (0 for an empty slice).
fn required_bytes(refs: &[RefBuf]) -> usize {
    if refs.len() == 1 {
        (refs[0].high_water as usize + ARENA_MARGIN).min(refs[0].size as usize)
    } else {
        refs.iter().map(|r| r.size as usize).sum()
    }
}

// NOTE(review): throughout the rest of this file the generic arguments of
// several types are missing in this view (`Vec,`, `Result>`, `Option>`), which
// looks like an extraction artefact. The tokens are left exactly as found;
// confirm against the real source before building.

/// A chosen, parsed config ready to allocate and replay.
struct Chosen {
    /// Config name, copied from the `ConfigDef`.
    name: &'static str,
    /// The parsed recording for this config.
    rec: Recording,
    /// Result of `rec.referenced_buffers()`.
    refs: Vec,
    /// Result of `required_bytes(&refs)`.
    required: usize,
}

/// Configs that fit `budget` bytes of usable VRAM, **highest table capacity
/// first**. `ZCL_CUDA_CONFIG=<name>` forces exactly one (even if it doesn't fit).
///
/// The caller picks the first of these whose kernels are actually present in the
/// active GPU's cubin (see [`config_present`]) — the legacy-arch cubins (sm_70/75)
/// ship a reduced kernel set, so the highest-capacity VRAM-fitting config may not
/// exist there and we fall through to one that does.
///
/// # Errors
/// - any recording fails to parse (configs are parsed in `CONFIGS` order, so a
///   bad earlier recording fails the call even when a later one is forced),
/// - `ZCL_CUDA_CONFIG` names no known config,
/// - no config fits `budget`.
fn candidate_configs(budget: usize) -> Result> {
    let forced = std::env::var("ZCL_CUDA_CONFIG").ok();
    let mut cands: Vec<(u64, Chosen)> = Vec::new();
    // Smallest requirement seen; only used for the "insufficient VRAM" message.
    let mut min_required = usize::MAX;
    for def in CONFIGS {
        let rec = parse_recording(def.recording)?;
        let refs = rec.referenced_buffers();
        let required = required_bytes(&refs);
        min_required = min_required.min(required);
        // A forced config short-circuits, bypassing the budget check.
        if forced.as_deref() == Some(def.name) {
            return Ok(vec![Chosen { name: def.name, rec, refs, required }]);
        }
        if forced.is_none() && required <= budget {
            cands.push((def.table_capacity, Chosen { name: def.name, rec, refs, required }));
        }
    }
    // Reaching here with a forced name means it matched nothing above.
    if let Some(f) = forced {
        return Err(anyhow!("ZCL_CUDA_CONFIG='{f}' is not a known config"));
    }
    // Highest table capacity first.
    cands.sort_by(|a, b| b.0.cmp(&a.0));
    if cands.is_empty() {
        return Err(anyhow!(
            "insufficient VRAM: ~{:.1} GB usable, but the smallest solver config needs ~{:.1} GB",
            budget as f64 / 1e9,
            min_required as f64 / 1e9
        ));
    }
    Ok(cands.into_iter().map(|(_, c)| c).collect())
}

/// Whether every kernel a config replays is present in `module` (this GPU's
/// cubin). Probing `cuModuleGetFunction` only resolves a handle — it allocates
/// nothing — so this is a cheap pre-flight before committing to a config.
///
/// # Safety
/// Calls the driver with `module`; presumably `module` must be a handle
/// returned by `cuModuleLoadData` with its context current — confirm against
/// the CUDA driver documentation.
unsafe fn config_present(module: CUmodule, rec: &Recording) -> bool {
    // Each distinct kernel name is probed once.
    let mut seen = std::collections::HashSet::new();
    for l in &rec.pass {
        if !seen.insert(l.name.as_str()) {
            continue;
        }
        // A name with an interior NUL cannot be looked up; treat as absent.
        let Ok(cname) = CString::new(l.name.as_str()) else {
            return false;
        };
        let mut f: CUfunction = ptr::null_mut();
        if cuModuleGetFunction(&mut f, module, cname.as_ptr()) != CUDA_SUCCESS {
            return false;
        }
    }
    true
}

/// From VRAM-fitting candidates (best first), pick the first fully present in the
/// loaded `module`. On full arches (sm_50/60/80/86/120) this is the highest-
/// capacity config; on reduced arches (sm_70/75) it falls through to a present one.
///
/// # Errors
/// Fails when no candidate passes [`config_present`].
fn pick_present_config(candidates: Vec, module: CUmodule) -> Result {
    let n = candidates.len();
    for c in candidates {
        if unsafe { config_present(module, &c.rec) } {
            return Ok(c);
        }
    }
    Err(anyhow!(
        "none of the {n} VRAM-fitting solver config(s) is fully present in this GPU's cubin \
         (reduced legacy-arch kernel set — try a smaller config or a fuller-arch GPU)"
    ))
}

// ---- Solver ----

/// A persistent CUDA solver bound to one device + context, holding the loaded
/// fatbin, its device buffers, and the resolved/rebased launch sequence.
pub struct CudaSolver {
    ctx: CUcontext,
    _module: CUmodule,
    /// Device buffers we allocated for the selected config (freed in `Drop`).
    bufs: Vec,
    /// Resolved launches: (function, grid, block, shared, rebased arg buffer).
    launches: Vec<(CUfunction, (u32, u32, u32), (u32, u32, u32), u32, Vec)>,
    /// Index of `digit_f` within `launches` (where we inject midstate + tail).
    digit_f: usize,
    /// Device pointers for `digit_l`'s solution counter and container.
    counter_ptr: CUdeviceptr,
    container_ptr: CUdeviceptr,

    // --- pipelining (used by enqueue/drain) ---
    /// Stream all pipelined launches + async copies run on.
    stream: CUstream,
    /// Double-buffered pinned host memory for the counter and container readback.
    host_counter: [*mut u32; 2],
    host_container: [*mut u32; 2],
    /// Completion event per buffer slot.
    event: [CUevent; 2],
    /// Buffer slot the next `enqueue` will use.
    slot: usize,
    /// The pass currently in flight: (its buffer slot, the header that produced it).
    pending: Option<(usize, Vec)>,

    /// Per-card GPU control (clocks/power/readout); None if unavailable.
    // NOTE(review): the inner type is not recoverable from this view; it is
    // whatever `crate::gpu_tune::open` returns.
    tuner: Option>,
    /// Whether tuning changed clock/power state (so we restore it on Drop).
    tuned: bool,
    /// Core / memory clock offsets (MHz) chosen by `--auto-tune`, for the
    /// dashboard to reflect; `None` if auto-tune didn't run or was skipped.
    auto_core_off: Option,
    auto_mem_off: Option,
}

// The context is created on, and only used from, the worker thread that owns the
// solver, so it is safe to move the solver to that thread.
unsafe impl Send for CudaSolver {}

impl CudaSolver {
    /// Initialise the driver, create a context on `device_index`, load the miniZ
    /// fatbin, select the config that fits free VRAM, allocate its buffers, and
    /// rebase the recorded launch sequence.
    ///
    /// # Errors
    /// Fails if the driver library is missing, any driver call reports an
    /// error, no config fits / is present, or the recorded pass lacks
    /// `digit_f` / `digit_l`.
    ///
    /// NOTE(review): nothing in this function releases `ctx`, `module`, the
    /// device buffers, the stream, events or pinned host memory when a later
    /// `?` returns early — the `Self` that would own them has not been built
    /// yet at that point.
    pub fn new(device_index: usize) -> Result {
        cuda_lib().ok_or_else(cuda_unavailable)?;
        unsafe {
            check(cuInit(0), "cuInit")?;
            let mut dev: CUdevice = 0;
            check(cuDeviceGet(&mut dev, device_index as c_int), "cuDeviceGet")?;

            // Resolve this card's PCI bus id so we can open a GPU control handle
            // (clocks/power/readout) for the matching physical GPU — CUDA and the
            // driver's NVML index orderings can differ — then apply the tuning
            // policy to that card.
            let tuner = {
                let mut buf = [0 as c_char; 32];
                if cuDeviceGetPCIBusId(buf.as_mut_ptr(), buf.len() as c_int, dev) == CUDA_SUCCESS {
                    let bus = CStr::from_ptr(buf.as_ptr()).to_string_lossy();
                    crate::gpu_tune::open(&bus)
                } else {
                    None
                }
            };
            let tuned = match &tuner {
                Some(t) => crate::gpu_tune::apply(t.as_ref(), device_index),
                None => false,
            };

            let mut ctx: CUcontext = ptr::null_mut();
            check(cuCtxCreate_v2(&mut ctx, 0, dev), "cuCtxCreate")?;
            check(cuCtxSetCurrent(ctx), "cuCtxSetCurrent")?;

            // Choose the config by available VRAM (highest capacity that fits).
            let mut free = 0usize;
            let mut total = 0usize;
            check(cuMemGetInfo_v2(&mut free, &mut total), "cuMemGetInfo")?;
            let budget = free.saturating_sub(VRAM_HEADROOM);
            let candidates = candidate_configs(budget)?;

            // Load the module first so we can pick a config whose kernels this
            // GPU's cubin actually contains (legacy arches ship a reduced set).
            let mut module: CUmodule = ptr::null_mut();
            check(
                cuModuleLoadData(&mut module, FATBIN.as_ptr() as *const c_void),
                "cuModuleLoadData (does this GPU's arch match the fatbin's sm_50/60/70/75/80/86/120 cubins?)",
            )?;

            let chosen = pick_present_config(candidates, module)?;
            log::info!(
                "CUDA device {device_index}: config '{}' ({} ref buffer(s), ~{:.1} GB; {:.1} GB free)",
                chosen.name,
                chosen.refs.len(),
                chosen.required as f64 / 1e9,
                free as f64 / 1e9,
            );

            // Allocate a device buffer per referenced recording buffer. A lone
            // over-allocated arena is capped to what the pipeline actually
            // touches; dedicated buffers are allocated at full size. `rebase`
            // maps each recorded pointer to its owning new buffer.
            let single_arena = chosen.refs.len() == 1;
            let mut bufs = Vec::with_capacity(chosen.refs.len()); // (orig_base, orig_size, my_base)
            let mut device_bufs = Vec::with_capacity(chosen.refs.len());
            let mut remaining = budget;
            for r in &chosen.refs {
                // NOTE(review): both branches also clamp to `remaining`, so a
                // forced config that exceeds the budget gets a smaller buffer
                // than recorded, while `rebase` below still accepts any offset
                // inside the recorded size.
                let want = if single_arena {
                    (r.high_water as usize + ARENA_MARGIN).min(r.size as usize).min(remaining)
                } else {
                    (r.size as usize).min(remaining)
                };
                let mut p: CUdeviceptr = 0;
                check(cuMemAlloc_v2(&mut p, want), "cuMemAlloc")?;
                // Zero-fill the fresh buffer.
                check(cuMemsetD8_v2(p, 0, want), "cuMemsetD8")?;
                remaining = remaining.saturating_sub(want);
                bufs.push((r.base, r.size, p));
                device_bufs.push(p);
            }
            // Map a recorded pointer to the same offset in our buffer; `None`
            // when it falls inside no referenced recorded allocation.
            let rebase = |v: u64| -> Option {
                bufs.iter()
                    .find(|&&(b, s, _)| v >= b && v < b + s)
                    .map(|&(b, _, mb)| mb + (v - b))
            };

            // Resolve every kernel, opt into large dynamic shared memory, and
            // rebase the device pointers in each arg buffer once.
            let mut launches = Vec::with_capacity(chosen.rec.pass.len());
            let mut digit_f = None;
            for (idx, l) in chosen.rec.pass.iter().enumerate() {
                let cname = CString::new(l.name.as_str()).map_err(|_| anyhow!("kernel name has NUL"))?;
                let mut f: CUfunction = ptr::null_mut();
                check(
                    cuModuleGetFunction(&mut f, module, cname.as_ptr()),
                    "cuModuleGetFunction",
                )?;
                if l.shared > 0 {
                    // NOTE(review): the result of this call is discarded.
                    cuFuncSetAttribute(f, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, l.shared as c_int);
                }
                let mut arg = l.arg.clone();
                // Skip the by-value prefix, then scan aligned 8-byte words.
                let mut off = byval_prefix(&l.name);
                while off + 8 <= arg.len() {
                    let v = u64::from_le_bytes(arg[off..off + 8].try_into().unwrap());
                    if in_dev(v) {
                        // A word that `rebase` cannot map is left untouched.
                        if let Some(nv) = rebase(v) {
                            arg[off..off + 8].copy_from_slice(&nv.to_le_bytes());
                        }
                    }
                    off += 8;
                }
                if l.name.contains("7digit_f") {
                    digit_f = Some(idx);
                }
                launches.push((f, l.grid, l.block, l.shared, arg));
            }
            let digit_f = digit_f.ok_or_else(|| anyhow!("no digit_f in recorded pass"))?;
            // NOTE(review): `launch_pass` and `profile` write digit_f's
            // arg[0..64] and arg[96..100]; that buffer's length is not
            // validated here.

            // digit_l: arg[8..16] = solution counter*, arg[16..24] = container*
            // (already rebased above).
            let dl = chosen
                .rec
                .pass
                .iter()
                .position(|l| l.name.contains("7digit_l"))
                .ok_or_else(|| anyhow!("no digit_l in recorded pass"))?;
            let dl_arg = &launches[dl].4;
            if dl_arg.len() < 24 {
                return Err(anyhow!("digit_l arg buffer too short"));
            }
            let counter_ptr = u64::from_le_bytes(dl_arg[8..16].try_into().unwrap());
            let container_ptr = u64::from_le_bytes(dl_arg[16..24].try_into().unwrap());

            // Pipelining resources: a stream + per-slot completion event and
            // pinned host buffers for async readback (overlaps the next pass's
            // GPU work with this pass's host-side verification).
            let mut stream: CUstream = ptr::null_mut();
            check(cuStreamCreate(&mut stream, 0), "cuStreamCreate")?;
            let alloc_host = |bytes: usize| -> Result<*mut u32> {
                let mut p: *mut c_void = ptr::null_mut();
                check(cuMemAllocHost_v2(&mut p, bytes), "cuMemAllocHost")?;
                Ok(p as *mut u32)
            };
            let mut event = [ptr::null_mut(); 2];
            let mut host_counter = [ptr::null_mut(); 2];
            let mut host_container = [ptr::null_mut(); 2];
            for s in 0..2 {
                check(cuEventCreate(&mut event[s], 0), "cuEventCreate")?;
                // 64 bytes are reserved although only 4 are ever copied in.
                host_counter[s] = alloc_host(64)?;
                host_container[s] = alloc_host(MAX_SOLS * 128 * 4)?;
            }

            let mut solver = Self {
                ctx,
                _module: module,
                bufs: device_bufs,
                launches,
                digit_f,
                counter_ptr,
                container_ptr,
                stream,
                host_counter,
                host_container,
                event,
                slot: 0,
                pending: None,
                tuner,
                tuned,
                auto_core_off: None,
                auto_mem_off: None,
            };
            // Optionally find this card's fastest stable clock offsets.
            if crate::gpu_tune::auto_tune_enabled() {
                if let Some((core, mem)) = solver.auto_tune_speed() {
                    solver.auto_core_off = Some(core);
                    solver.auto_mem_off = Some(mem);
                }
            }
            Ok(solver)
        }
    }

    /// Current board power draw in watts, or `None` if unavailable.
    pub fn power_watts(&self) -> Option {
        self.tuner.as_ref().and_then(|t| t.watts())
    }

    /// Current GPU core temperature in °C, or `None` if unavailable.
    pub fn temperature_c(&self) -> Option {
        self.tuner.as_ref().and_then(|t| t.temperature_c())
    }

    /// This card's product name (e.g. "NVIDIA GeForce RTX 5080"), if available.
    pub fn device_name(&self) -> Option {
        self.tuner.as_ref().map(|t| t.name())
    }

    /// Currently enforced power limit in watts, or `None` if unavailable.
    pub fn current_power_limit_w(&self) -> Option {
        self.tuner.as_ref().and_then(|t| t.current_power_limit_w())
    }

    /// This card's (min, max) settable power limit in watts, or `None`.
    pub fn power_limit_range_w(&self) -> Option<(u32, u32)> {
        self.tuner.as_ref().and_then(|t| t.power_limit_range_w())
    }

    /// Apply live hardware controls from the dashboard: absolute core/memory VF
    /// offsets (MHz) and an absolute power-limit target (watts; 0 = leave the
    /// power limit alone). Best-effort — needs elevated privileges, and the
    /// tuner clamps each value to the card's allowed range.
    ///
    /// Does nothing when no tuner is available; the setters' results are not
    /// inspected.
    pub fn apply_hw_controls(&self, core_off: i32, mem_off: i32, power_w: u32) {
        if let Some(t) = &self.tuner {
            t.set_core_offset_mhz(core_off);
            t.set_mem_offset_mhz(mem_off);
            if power_w > 0 {
                t.set_power_limit_w(power_w);
            }
        }
    }

    /// Sweep this card's core clock offset, then its memory clock offset, each
    /// upward to maximise solve throughput, locking in the best stable value of
    /// each. Best-effort: needs root to change clocks, and stops a sweep at the
    /// first sign of instability (a kernel error or no valid solutions). Records
    /// every step for the dashboard. Runs once at startup.
    ///
    /// Returns `Some((core_offset, mem_offset))`, or `None` when there is no
    /// tuner, the core offset cannot be set, or the baseline sample yields no
    /// solutions.
    pub fn auto_tune_speed(&self) -> Option<(i32, i32)> {
        use log::info;
        use std::time::Instant;

        // Per-step sample window. Longer windows average out per-pass jitter and
        // power fluctuation, so the efficiency/throughput numbers are stable.
        const SAMPLE_SECS: f64 = 5.0;

        let ctrl = self.tuner.as_ref()?;
        // A header whose nonce bytes [128..136) are zero, so the GPU yields valid
        // solutions we can both rate and use as a stability check.
        let mut header = vec![0x42u8; HEADER_LEN];
        for b in &mut header[128..136] {
            *b = 0;
        }

        // Throughput (passes/s), solution rate (Sol/s) and average board power
        // (W) over `secs`. `None` on a kernel fault (⇒ unstable).
        let measure = |secs: f64| -> Option<(f64, f64, f64)> {
            let t = Instant::now();
            let (mut passes, mut sols) = (0u64, 0u64);
            let (mut wsum, mut wcnt) = (0.0f64, 0u64);
            while t.elapsed().as_secs_f64() < secs {
                match self.solve(&header) {
                    Ok(s) => {
                        passes += 1;
                        sols += s.len() as u64;
                    }
                    Err(_) => return None,
                }
                // One power sample per pass, when a reading is available.
                if let Some(w) = self.power_watts() {
                    wsum += w;
                    wcnt += 1;
                }
            }
            // Guard the divisions below against a zero elapsed time.
            let el = t.elapsed().as_secs_f64().max(1e-9);
            let watts = if wcnt > 0 { wsum / wcnt as f64 } else { 0.0 };
            Some((passes as f64 / el, sols as f64 / el, watts))
        };

        if !ctrl.set_core_offset_mhz(0).applied() {
            info!("auto-tune: cannot set GPU clock offset (needs root/Administrator) — skipping");
            return None;
        }
        ctrl.set_mem_offset_mhz(0);

        info!("auto-tune: sampling solve rate ({SAMPLE_SECS:.0}s/step) before sweeping clock offsets...");
        let base_rate = match measure(SAMPLE_SECS) {
            Some((rate, sol_s, _)) if sol_s > 0.0 => rate,
            _ => {
                info!("auto-tune: baseline produced no solutions — skipping");
                return None;
            }
        };

        // Sweep one knob upward via `set` until throughput stops improving (two
        // stale steps) or the card goes unstable, returning the best offset.
        // `set_*_offset` clamps to the driver's allowed range.
        // `best_rate` is shared by both sweeps, so the memory sweep has to beat
        // the best rate the core sweep reached.
        let mut best_rate = base_rate;
        let mut sweep = |set: &dyn Fn(i32) -> bool, step: i32, cap: i32| -> i32 {
            let mut best = 0i32;
            let mut stale = 0;
            let mut off = 0;
            while off < cap {
                off += step;
                if !set(off) {
                    break;
                }
                match measure(SAMPLE_SECS) {
                    Some((rate, sol_s, _)) if sol_s > 0.0 => {
                        // Count only gains above 0.5% as an improvement.
                        if rate > best_rate * 1.005 {
                            best_rate = rate;
                            best = off;
                            stale = 0;
                        } else {
                            stale += 1;
                            if stale >= 2 {
                                break;
                            }
                        }
                    }
                    _ => break, // error or zero solutions ⇒ unstable; stop here
                }
            }
            best
        };

        // Phase 1: core offset (memory held at 0).
        let best_core = sweep(&|o| ctrl.set_core_offset_mhz(o).applied(), 45, 450);
        // Lock the winning core offset before sweeping memory.
        ctrl.set_core_offset_mhz(best_core);

        // Phase 2: memory offset (core held at the winner).
        let best_mem = sweep(&|o| ctrl.set_mem_offset_mhz(o).applied(), 200, 1600);
        ctrl.set_mem_offset_mhz(best_mem);

        info!(
            "auto-tune: core {best_core:+} MHz, memory {best_mem:+} MHz ({:+.1}% solve rate)",
            (best_rate / base_rate - 1.0) * 100.0
        );
        Some((best_core, best_mem))
    }

    /// The core clock offset (MHz) `--auto-tune` settled on, if it ran.
    pub fn tuned_core_offset(&self) -> Option {
        self.auto_core_off
    }

    /// The memory clock offset (MHz) `--auto-tune` settled on, if it ran.
    pub fn tuned_mem_offset(&self) -> Option {
        self.auto_mem_off
    }

    /// Current (SM core, memory) clock in MHz, each `None` if unavailable.
    pub fn current_clocks_mhz(&self) -> (Option, Option) {
        match &self.tuner {
            Some(t) => (t.core_clock_mhz(), t.mem_clock_mhz()),
            None => (None, None),
        }
    }

    /// Inject the per-header midstate + tail into `digit_f` and launch all 10
    /// kernels back-to-back on `stream`. They have strict data dependencies and
    /// the stream is in-order, so no host sync is needed between them.
    ///
    /// The stored arg buffer is not modified; the injection happens on a
    /// per-call copy.
    ///
    /// # Panics
    /// Panics if `header` is shorter than 140 bytes or `digit_f`'s arg buffer
    /// is shorter than 100 bytes.
    fn launch_pass(&self, header: &[u8], stream: CUstream) -> Result<()> {
        let mut digit_f_arg = self.launches[self.digit_f].4.clone();
        digit_f_arg[0..64].copy_from_slice(&midstate(header));
        digit_f_arg[96..100].copy_from_slice(&[header[136], header[137], header[138], header[139]]);
        unsafe {
            for (idx, (f, grid, block, shared, arg)) in self.launches.iter().enumerate() {
                let buf = if idx == self.digit_f { &digit_f_arg } else { arg };
                // `launch` is defined outside the part of the file shown here.
                launch(*f, *grid, *block, *shared, buf, stream)?;
            }
        }
        Ok(())
    }

    /// Synchronous one-shot replay: launch the pipeline and block on the readback.
    /// Used by `--benchmark`/`--gpu-debug`/`--selftest`; the mining loop uses the
    /// pipelined [`enqueue`](Self::enqueue) path instead.
    ///
    /// Returns the raw container contents: 128 `u32` indices for each of the
    /// reported solutions (capped at `MAX_SOLS`), or an empty vector when the
    /// counter is zero.
    fn run_pipeline(&self, header: &[u8]) -> Result> {
        unsafe {
            check(cuCtxSetCurrent(self.ctx), "cuCtxSetCurrent")?;
            self.launch_pass(header, ptr::null_mut())?;
            // Synchronous DtoH on the default stream blocks until the pipeline
            // finishes.
            let mut counter = [0u32; 1];
            check(
                cuMemcpyDtoH_v2(counter.as_mut_ptr() as *mut c_void, self.counter_ptr, 4),
                "cuMemcpyDtoH(counter)",
            )?;
            // Never read more than the container is sized for.
            let n = (counter[0] as usize).min(MAX_SOLS);
            if n == 0 {
                return Ok(Vec::new());
            }
            let mut out = vec![0u32; n * 128];
            check(
                cuMemcpyDtoH_v2(out.as_mut_ptr() as *mut c_void, self.container_ptr, n * 128 * 4),
                "cuMemcpyDtoH(container)",
            )?;
            Ok(out)
        }
    }

    /// Solve the puzzle for `header` (140 bytes), returning verified solutions.
    /// One-shot/synchronous; for mining throughput use [`enqueue`](Self::enqueue).
    ///
    /// # Panics
    /// Panics if `header.len() != HEADER_LEN`.
    pub fn solve(&self, header: &[u8]) -> Result>> {
        assert_eq!(header.len(), HEADER_LEN);
        let recovered = self.run_pipeline(header)?;
        if recovered.is_empty() {
            return Ok(Vec::new());
        }
        let base = blake::base_state(header);
        Ok(equihash::filter_candidates(&base, &recovered))
    }

    /// Pipelined solve: launch `header`'s pass and asynchronously copy its results
    /// back, then return the solutions of the pass enqueued one call ago (or empty
    /// on the first call). The GPU runs the new pass while the host verifies the
    /// previous one, keeping the device ~100% busy. Drain the final in-flight pass
    /// with [`drain`](Self::drain). Returns solutions for the header passed to the
    /// *previous* `enqueue`.
    ///
    /// # Panics
    /// Panics if `header.len() != HEADER_LEN`.
    ///
    /// NOTE(review): if reading the previous slot fails, the `?` returns before
    /// `pending` and `slot` are updated, so the pass queued by this call is
    /// never read back.
    pub fn enqueue(&mut self, header: &[u8]) -> Result>> {
        assert_eq!(header.len(), HEADER_LEN);
        let slot = self.slot;
        unsafe {
            check(cuCtxSetCurrent(self.ctx), "cuCtxSetCurrent")?;
            self.launch_pass(header, self.stream)?;
            // Queue the readback after the pass on the same stream, then mark it.
            check(
                cuMemcpyDtoHAsync_v2(self.host_counter[slot] as *mut c_void, self.counter_ptr, 4, self.stream),
                "cuMemcpyDtoHAsync(counter)",
            )?;
            // The count is not known on the host yet, so the whole container
            // is copied.
            check(
                cuMemcpyDtoHAsync_v2(self.host_container[slot] as *mut c_void, self.container_ptr, MAX_SOLS * 128 * 4, self.stream),
                "cuMemcpyDtoHAsync(container)",
            )?;
            check(cuEventRecord(self.event[slot], self.stream), "cuEventRecord")?;
        }
        // While the GPU runs the pass just queued, verify the previous one.
        let result = match self.pending.take() {
            Some((prev, prev_header)) => self.read_slot(prev, &prev_header)?,
            None => Vec::new(),
        };
        self.pending = Some((slot, header.to_vec()));
        // Alternate between the two buffer slots.
        self.slot = 1 - slot;
        Ok(result)
    }

    /// Wait for and verify the last in-flight pass (after the final `enqueue`).
    /// Returns an empty result when nothing is pending.
    pub fn drain(&mut self) -> Result>> {
        match self.pending.take() {
            Some((prev, prev_header)) => self.read_slot(prev, &prev_header),
            None => Ok(Vec::new()),
        }
    }

    /// Block on slot `s`'s completion event, then verify its recovered indices
    /// against `header`. Reads pinned host memory filled by `enqueue`.
    fn read_slot(&self, s: usize, header: &[u8]) -> Result>> {
        unsafe {
            check(cuEventSynchronize(self.event[s]), "cuEventSynchronize")?;
            // Clamp to MAX_SOLS: the host container holds MAX_SOLS * 128 words.
            let n = (*self.host_counter[s] as usize).min(MAX_SOLS);
            if n == 0 {
                return Ok(Vec::new());
            }
            let recovered = std::slice::from_raw_parts(self.host_container[s], n * 128);
            let base = blake::base_state(header);
            Ok(equihash::filter_candidates(&base, recovered))
        }
    }

    /// Time each GPU kernel individually (sync between launches).
    ///
    /// Logs one line per kernel with its wall-clock time in milliseconds.
    ///
    /// # Panics
    /// Unlike `solve`/`enqueue`, the header length is not asserted here; a
    /// header shorter than 140 bytes panics on indexing.
    pub fn profile(&self, header: &[u8]) -> Result<()> {
        use log::info;
        use std::time::Instant;
        let mid = midstate(header);
        let tail4 = [header[136], header[137], header[138], header[139]];
        unsafe {
            check(cuCtxSetCurrent(self.ctx), "cuCtxSetCurrent")?;
            // Synchronise before the first timed launch.
            check(cuCtxSynchronize(), "cuCtxSynchronize")?;
            for (idx, (f, grid, block, shared, arg)) in self.launches.iter().enumerate() {
                let mut a = arg.clone();
                if idx == self.digit_f {
                    a[0..64].copy_from_slice(&mid);
                    a[96..100].copy_from_slice(&tail4);
                }
                let t = Instant::now();
                launch(*f, *grid, *block, *shared, &a, ptr::null_mut())?;
                // The timed interval covers the launch plus this synchronise.
                check(cuCtxSynchronize(), "cuCtxSynchronize")?;
                info!(" kernel {idx:2} {:>6.1} ms", t.elapsed().as_secs_f64() * 1000.0);
            }
        }
        Ok(())
    }
}

impl Drop for CudaSolver {
    fn drop(&mut self) {
        // Restore default clocks/power if we changed them.
if self.tuned { if let Some(t) = &self.tuner { t.reset(); } } unsafe { cuCtxSetCurrent(self.ctx); for s in 0..2 { cuEventDestroy_v2(self.event[s]); cuMemFreeHost(self.host_counter[s] as *mut c_void); cuMemFreeHost(self.host_container[s] as *mut c_void); } cuStreamDestroy_v2(self.stream); for &b in &self.bufs { cuMemFree_v2(b); } cuModuleUnload(self._module); cuCtxDestroy_v2(self.ctx); } } } #[cfg(test)] mod tests { use super::*; const KNOWN_HEADER: &str = "040000002ba84c97ffc202b55a5843d55837d256fdc32410390b8e95502bd8f648040000cb560c7083a13e06273570350805668e83c3e2362e39e131612fead6f4ea9937a19ceba5b597e2217d7e0c53ba24de3d36b92cf97743550c2745c9464f4dc847ba9e1e6a34cf101e80032bb40ae5118877fccacf8d961e648f6a228d0000000000000000ce856809"; fn known_header() -> Vec { let h: Vec = (0..KNOWN_HEADER.len() / 2) .map(|i| u8::from_str_radix(&KNOWN_HEADER[2 * i..2 * i + 2], 16).unwrap()) .collect(); assert_eq!(h.len(), HEADER_LEN); h } /// End-to-end GPU harvest on a real, pool-accepted header (job 19ae0) for the /// auto-selected (default) config: drive the full pipeline and confirm at /// least one solution verifies. Ignored by default — needs an NVIDIA GPU with /// ~10 GB free whose arch matches the fatbin (sm_50/60/70/75/80/86/120). Run with: /// cargo test --no-default-features --features cuda -- --ignored --nocapture #[test] #[ignore] fn harvests_known_solution() { let header = known_header(); let solver = CudaSolver::new(0).expect("init CUDA device 0"); let sols = solver.solve(&header).expect("solve"); assert!(!sols.is_empty(), "expected at least one harvested solution"); let base = blake::base_state(&header); for s in &sols { assert!(equihash::is_valid_solution(&base, s), "harvested solution must verify"); } eprintln!("harvested {} valid solution(s) from the GPU", sols.len()); } /// Drive every bundled config (forced via `ZCL_CUDA_CONFIG`) on the known /// header and confirm each replays cleanly (no OOB from per-alloc rebasing) /// and returns only valid solutions. 
Ignored by default (needs a GPU). #[test] #[ignore] fn all_configs_replay_cleanly() { let header = known_header(); let base = blake::base_state(&header); for cfg in CONFIGS { std::env::set_var("ZCL_CUDA_CONFIG", cfg.name); let solver = CudaSolver::new(0).unwrap_or_else(|e| panic!("init config {}: {e}", cfg.name)); let sols = solver.solve(&header).unwrap_or_else(|e| panic!("solve {}: {e}", cfg.name)); for s in &sols { assert!(equihash::is_valid_solution(&base, s), "config {} produced an invalid solution", cfg.name); } eprintln!("config {:<12} -> {} valid solution(s)", cfg.name, sols.len()); } std::env::remove_var("ZCL_CUDA_CONFIG"); } /// The pipelined `enqueue`/`drain` path must produce the same valid solutions /// as the synchronous `solve`. Auto-selects a config that fits free VRAM. /// Ignored by default (needs a GPU). #[test] #[ignore] fn pipelined_matches_known() { let header = known_header(); let base = blake::base_state(&header); let mut solver = CudaSolver::new(0).expect("init CUDA device 0"); let r0 = solver.enqueue(&header).expect("enqueue 1"); // priming -> empty assert!(r0.is_empty(), "first enqueue should return no results yet"); let r1 = solver.enqueue(&header).expect("enqueue 2"); // results of enqueue 1 let r2 = solver.drain().expect("drain"); // results of enqueue 2 for (label, sols) in [("enqueue", &r1), ("drain", &r2)] { assert!(!sols.is_empty(), "pipelined {label} harvested no solutions"); for s in sols { assert!(equihash::is_valid_solution(&base, s), "pipelined {label} invalid solution"); } } eprintln!("pipelined: enqueue={} drain={} valid solution(s)", r1.len(), r2.len()); } } /// Launch a kernel via the `extra` / `BUFFER_POINTER` path (a single pre-packed /// argument buffer), matching how miniZ drives these kernels. The driver /// marshals the argument bytes during this call, so a shared `&[u8]` is fine. 
///
/// # Safety
///
/// `f` and `stream` are handed straight to `cuLaunchKernel` and `arg` is passed
/// as the kernel's raw parameter buffer; nothing here validates them, so the
/// caller is responsible for supplying handles and argument bytes the driver
/// can accept.
unsafe fn launch(
    f: CUfunction,
    grid: (u32, u32, u32),
    block: (u32, u32, u32),
    shared: u32,
    arg: &[u8],
    stream: CUstream,
) -> Result<()> {
    let (grid_x, grid_y, grid_z) = grid;
    let (block_x, block_y, block_z) = block;

    // The driver reads the buffer length through a pointer, so the length needs
    // an addressable home that stays alive until the launch call returns.
    let mut arg_len: usize = arg.len();
    let arg_len_ptr: *mut usize = &mut arg_len;

    // `extra` is a flat list of (tag, value) pairs closed by an END tag:
    // first the argument buffer pointer, then a pointer to its size.
    let mut config: [*mut c_void; 5] = [
        CU_LAUNCH_PARAM_BUFFER_POINTER as *mut c_void,
        arg.as_ptr() as *mut c_void,
        CU_LAUNCH_PARAM_BUFFER_SIZE as *mut c_void,
        arg_len_ptr as *mut c_void,
        CU_LAUNCH_PARAM_END as *mut c_void,
    ];

    // `kernelParams` is null because all arguments travel through `extra`.
    let status = cuLaunchKernel(
        f,
        grid_x,
        grid_y,
        grid_z,
        block_x,
        block_y,
        block_z,
        shared,
        stream,
        ptr::null_mut(),
        config.as_mut_ptr(),
    );
    check(status, "cuLaunchKernel")
}