Add AMD OpenCL kernel, runtime-loaded CUDA, mixed backend, portability

AMD GPU backend:
- Add the GCN-tuned equihash192_7.cl kernel (clearCounter/blake/round1..7/combine pipeline) and its host driver src/gpu_amd.rs. GpuSolver now dispatches AMD-vendor OpenCL devices to it and other devices to the existing kernel (force with ZCL_OPENCL_KERNEL=amd|legacy). Validated on an RX 9060 XT: GPU solutions match the CPU reference 1/1.
- Expose BatchHasher::midstate() for the kernel's ulong8 hashState arg.

Runtime-loaded GPU drivers (minimum host deps):
- dlopen libcuda / libnvidia-ml via libloading instead of linking them (src/dylib.rs macro; cuda.rs, nvml.rs, gpu_probe.rs). The binary now builds and starts on hosts without an NVIDIA driver and reports no CUDA devices gracefully; remove build.rs (its only job was linking those libs).
- Add Dockerfile.portable + build-portable.sh: build against Debian bullseye's glibc 2.31 for a binary that runs on older distros and drives both AMD (OpenCL) and NVIDIA (CUDA) cards. Document the build matrix in the README.

Mixed backend (default):
- Add --backend mixed (now the default): each card on its native backend (NVIDIA->CUDA, AMD/Intel->OpenCL), deduped so no card is mined twice. --devices indexes the unified list shown by --list-devices.

Misc:
- Stale-work timeout (--job-timeout) default 300s -> 600s (10 minutes).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 01:15:41 -04:00
parent f3ca6a1ee4
commit 4b5f84959c
18 changed files with 2949 additions and 109 deletions
@@ -14,6 +14,14 @@ mod tui;
 #[cfg(feature = "gpu")]
 mod gpu;

+// AMD-tuned OpenCL kernel driver (selected by GpuSolver for AMD-vendor devices).
+#[cfg(feature = "gpu")]
+mod gpu_amd;
+
+// Runtime dynamic-library loader (dlopen) for the CUDA driver + NVML.
+#[cfg(feature = "cuda")]
+mod dylib;
+
 #[cfg(feature = "cuda")]
 mod cuda;

@@ -79,8 +87,9 @@ struct Args {
    jackpot: Option<u32>,

    /// Pause mining if no new job arrives within this many seconds (stale work
-    /// guard); resumes automatically when fresh work arrives. 0 disables.
-    #[arg(long, value_name = "SECS", default_value_t = 300)]
+    /// guard); resumes automatically when fresh work arrives. Default 600 (10
+    /// minutes). 0 disables.
+    #[arg(long, value_name = "SECS", default_value_t = 600)]
    job_timeout: u64,

    /// Open a local control server on 127.0.0.1:<PORT> so the GUI config tool can
@@ -139,8 +148,11 @@ struct Args {
    #[arg(long, default_value = "all")]
    devices: String,

-    /// GPU backend: "opencl" or "cuda" (for nvidia cards).
-    #[arg(long, default_value = "cuda")]
+    /// GPU backend: "mixed" (default — each card on its native backend: NVIDIA
+    /// on CUDA, AMD/Intel on OpenCL), "opencl" (every card via OpenCL), or
+    /// "cuda" (NVIDIA only). In mixed mode `--devices` indexes the combined list
+    /// shown by --list-devices.
+    #[arg(long, default_value = "mixed")]
    backend: String,

    /// Force the OpenCL backend, disabling CUDA (overrides --backend).
@@ -610,6 +622,9 @@ fn main() -> Result<()> {
 /// Which GPU backend the user selected.
 enum BackendKind {
    Cpu,
+    /// Each physical card on its native backend (NVIDIA→CUDA, others→OpenCL).
+    #[cfg(any(feature = "gpu", feature = "cuda"))]
+    Mixed,
    #[cfg(feature = "gpu")]
    OpenCl,
    #[cfg(feature = "cuda")]
@@ -633,6 +648,16 @@ fn backend_kind(args: &Args) -> Result<BackendKind> {
        }
    }
    match args.backend.to_ascii_lowercase().as_str() {
+        "mixed" => {
+            // Each card on its native backend; falls back to whatever single GPU
+            // backend is compiled, or to CPU when none is.
+            #[cfg(any(feature = "gpu", feature = "cuda"))]
+            {
+                Ok(BackendKind::Mixed)
+            }
+            #[cfg(not(any(feature = "gpu", feature = "cuda")))]
+            Ok(BackendKind::Cpu)
+        }
        "cuda" => {
            #[cfg(feature = "cuda")]
            {
@@ -649,7 +674,7 @@ fn backend_kind(args: &Args) -> Result<BackendKind> {
            #[cfg(not(feature = "gpu"))]
            Ok(BackendKind::Cpu)
        }
-        other => Err(anyhow!("unknown --backend '{other}' (expected opencl or cuda)")),
+        other => Err(anyhow!("unknown --backend '{other}' (expected mixed, opencl, or cuda)")),
    }
 }

@@ -707,6 +732,8 @@ fn backend_specs(args: &Args, gpu_devices: &[GpuDeviceCfg]) -> Result<Vec<Backen
                let clamp = (args.cpu_clamp != 0).then_some(args.cpu_clamp);
                return Ok(vec![BackendSpec::Cpu(clamp)]);
            }
+            // Mixed builds its own unified list (each card on its native backend).
+            BackendKind::Mixed => return mixed_specs(args),
            #[cfg(feature = "cuda")]
            BackendKind::Cuda => (cuda::device_count()?, true),
            #[cfg(feature = "gpu")]
@@ -735,6 +762,71 @@ fn backend_specs(args: &Args, gpu_devices: &[GpuDeviceCfg]) -> Result<Vec<Backen
    }
 }

+/// The unified device list for the `mixed` backend, as `(label, spec)`: each
+/// physical GPU on its native backend, with no card mined twice. NVIDIA cards go
+/// to CUDA (listed first); the remaining OpenCL devices (AMD/Intel, plus NVIDIA
+/// when CUDA is unavailable) go to OpenCL. Shared by [`mixed_specs`] and
+/// [`list_devices`]; `--devices` indexes into this list.
+///
+/// Enumeration is best-effort: an error from either backend's `list_devices()`
+/// is treated as "no devices" rather than propagated, so the result may be
+/// empty. De-duplication is all-or-nothing: once CUDA reports at least one
+/// device, every OpenCL device flagged as NVIDIA is skipped.
+#[cfg(any(feature = "gpu", feature = "cuda"))]
+fn mixed_plan() -> Vec<(String, BackendSpec)> {
+    /// Drop a leading `"[<n>] "` index prefix from a backend's device label, so
+    /// the mixed list shows its own single index instead of two. A label that
+    /// does not start with `[`, or has no `"] "` after it, is returned unchanged.
+    fn strip_index(label: &str) -> &str {
+        label
+            .strip_prefix('[')
+            .and_then(|s| s.split_once("] "))
+            .map(|(_, rest)| rest)
+            .unwrap_or(label)
+    }
+
+    // NOTE(review): this function only exists when `gpu` or `cuda` is enabled,
+    // and each of those compiles a `plan.push(..)` below, so this allow looks
+    // purely defensive — confirm before removing it.
+    #[allow(unused_mut)]
+    let mut plan: Vec<(String, BackendSpec)> = Vec::new();
+
+    // NVIDIA cards via CUDA, when the backend is compiled and the driver loads.
+    // A listing error collapses to an empty list, i.e. "no CUDA devices".
+    #[cfg(feature = "cuda")]
+    let cuda_has_nvidia = {
+        let names = cuda::list_devices().unwrap_or_default();
+        for (i, label) in names.iter().enumerate() {
+            plan.push((format!("{} (CUDA)", strip_index(label)), BackendSpec::Cuda(i)));
+        }
+        // True only when CUDA actually enumerated at least one device.
+        !names.is_empty()
+    };
+    // Without the CUDA backend nothing can already be claimed by it.
+    #[cfg(not(feature = "cuda"))]
+    let cuda_has_nvidia = false;
+
+    // Remaining OpenCL cards via OpenCL; skip NVIDIA ones already on CUDA.
+    #[cfg(feature = "gpu")]
+    {
+        let names = gpu::list_devices().unwrap_or_default();
+        // Presumably one flag per OpenCL device, in the same order as
+        // `gpu::list_devices()` — TODO confirm against `gpu.rs`.
+        let nvidia = gpu::device_is_nvidia();
+        for (j, label) in names.iter().enumerate() {
+            // A device with no flag entry is treated as non-NVIDIA and kept.
+            if nvidia.get(j).copied().unwrap_or(false) && cuda_has_nvidia {
+                continue;
+            }
+            plan.push((format!("{} (OpenCL)", strip_index(label)), BackendSpec::Gpu(j)));
+        }
+    }
+    // `cuda_has_nvidia` is only consumed by the OpenCL branch above; touch it
+    // here so a CUDA-only build does not warn about an unused variable.
+    #[cfg(not(feature = "gpu"))]
+    let _ = cuda_has_nvidia;
+
+    plan
+}
+
+/// Resolve `--backend mixed` into worker specs, one per selected card.
+///
+/// `--devices` is parsed against the length of [`mixed_plan`]'s unified list,
+/// and each selected index is mapped to that entry's spec.
+///
+/// # Errors
+/// Fails when the unified list is empty, or when `parse_devices` rejects the
+/// `--devices` value.
+#[cfg(any(feature = "gpu", feature = "cuda"))]
+fn mixed_specs(args: &Args) -> Result<Vec<BackendSpec>> {
+    let unified = mixed_plan();
+    let total = unified.len();
+    if total == 0 {
+        return Err(anyhow!(
+            "no GPUs found for the mixed backend — none detected via CUDA or OpenCL"
+        ));
+    }
+
+    // Keep only the spec half of each selected `(label, spec)` entry.
+    let mut workers = Vec::new();
+    for idx in parse_devices(&args.devices, total)? {
+        workers.push(unified[idx].1);
+    }
+    Ok(workers)
+}
+
 /// Build a single GPU worker spec for `idx`, choosing CUDA or OpenCL, erroring if
 /// the requested backend wasn't compiled in.
 #[cfg(any(feature = "gpu", feature = "cuda"))]
@@ -821,6 +913,18 @@ fn list_devices() {
        Ok(_) => println!("no CUDA devices found"),
        Err(e) => println!("error listing CUDA devices: {e}"),
    }
+    // What the default `mixed` backend will mine, and the indices `--devices`
+    // selects from in that mode.
+    #[cfg(any(feature = "gpu", feature = "cuda"))]
+    {
+        let plan = mixed_plan();
+        if !plan.is_empty() {
+            println!("\nMixed backend (--backend mixed, the default) — `--devices` indexes this list:");
+            for (i, (label, _)) in plan.iter().enumerate() {
+                println!("  [{i}] {label}");
+            }
+        }
+    }
    #[cfg(not(any(feature = "gpu", feature = "cuda")))]
    println!("built without GPU support (rebuild with the `gpu` or `cuda` feature)");
 }
@@ -886,18 +990,24 @@ fn selftest(gpu_device: usize) -> Result<()> {
        let solver = gpu::GpuSolver::new(gpu_device)
            .with_context(|| format!("init OpenCL device {gpu_device}"))?;

-        // Spot-check the BLAKE2b kernel against the CPU reference.
-        let outputs = solver.hash_all(&header)?;
-        let step = params::BLAKE_CALLS / 64;
-        for k in 0..64 {
-            let g = (k * step) as u32;
-            let cpu = blake::generate_hash(&base, g);
-            let off = g as usize * params::HASH_OUTPUT;
-            if cpu != outputs[off..off + params::HASH_OUTPUT] {
-                return Err(anyhow!("GPU BLAKE2b mismatch at g={g}"));
+        // Spot-check the BLAKE2b kernel against the CPU reference. The AMD kernel
+        // buckets its round-0 output instead of exposing per-index digests, so
+        // the probe is skipped there (the solve-vs-CPU check below still runs).
+        if solver.supports_blake_probe() {
+            let outputs = solver.hash_all(&header)?;
+            let step = params::BLAKE_CALLS / 64;
+            for k in 0..64 {
+                let g = (k * step) as u32;
+                let cpu = blake::generate_hash(&base, g);
+                let off = g as usize * params::HASH_OUTPUT;
+                if cpu != outputs[off..off + params::HASH_OUTPUT] {
+                    return Err(anyhow!("GPU BLAKE2b mismatch at g={g}"));
+                }
            }
+            info!("GPU BLAKE2b kernel matches CPU");
+        } else {
+            info!("skipping BLAKE2b kernel probe (AMD kernel buckets round-0 output)");
        }
-        info!("GPU BLAKE2b kernel matches CPU");

        let gpu_solutions = solver.solve(&header)?;
        info!("GPU found {} valid solution(s)", gpu_solutions.len());