Add AMD OpenCL kernel, runtime-loaded CUDA, mixed backend, portability
AMD GPU backend: - Add the GCN-tuned equihash192_7.cl kernel (clearCounter/blake/round1..7/ combine pipeline) and its host driver src/gpu_amd.rs. GpuSolver now dispatches AMD-vendor OpenCL devices to it and other devices to the existing kernel (force with ZCL_OPENCL_KERNEL=amd|legacy). Validated on an RX 9060 XT: GPU solutions match the CPU reference 1/1. - Expose BatchHasher::midstate() for the kernel's ulong8 hashState arg. Runtime-loaded GPU drivers (minimum host deps): - dlopen libcuda / libnvidia-ml via libloading instead of linking them (src/dylib.rs macro; cuda.rs, nvml.rs, gpu_probe.rs). The binary now builds and starts on hosts without an NVIDIA driver and reports no CUDA devices gracefully; remove build.rs (its only job was linking those libs). - Add Dockerfile.portable + build-portable.sh: build against Debian bullseye's glibc 2.31 for a binary that runs on older distros and drives both AMD (OpenCL) and NVIDIA (CUDA) cards. Document the build matrix in the README. Mixed backend (default): - Add --backend mixed (now the default): each card on its native backend (NVIDIA->CUDA, AMD/Intel->OpenCL), deduped so no card is mined twice. --devices indexes the unified list shown by --list-devices. Misc: - Stale-work timeout (--job-timeout) default 300s -> 600s (10 minutes). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,7 @@
|
|||||||
|
# Keep the build context small: the portable build only needs the sources.
|
||||||
|
/target
|
||||||
|
/dist
|
||||||
|
/.git
|
||||||
|
/pearl-dump
|
||||||
|
/alpha-miner
|
||||||
|
*.log
|
||||||
@@ -1,6 +1,9 @@
|
|||||||
# Rust build artifacts
|
# Rust build artifacts
|
||||||
/target
|
/target
|
||||||
|
|
||||||
|
# Portable container build output (build-portable.sh)
|
||||||
|
/dist
|
||||||
|
|
||||||
# IDE / editor
|
# IDE / editor
|
||||||
/.idea
|
/.idea
|
||||||
|
|
||||||
|
|||||||
Generated
+1
@@ -2236,6 +2236,7 @@ dependencies = [
|
|||||||
"eframe",
|
"eframe",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"hex",
|
"hex",
|
||||||
|
"libloading 0.8.9",
|
||||||
"log",
|
"log",
|
||||||
"num_cpus",
|
"num_cpus",
|
||||||
"ocl",
|
"ocl",
|
||||||
|
|||||||
+7
-2
@@ -23,13 +23,18 @@ socket2 = "0.5"
|
|||||||
ocl = { version = "0.19", optional = true }
|
ocl = { version = "0.19", optional = true }
|
||||||
ratatui = "0.30.0"
|
ratatui = "0.30.0"
|
||||||
eframe = { version = "0.28", optional = true }
|
eframe = { version = "0.28", optional = true }
|
||||||
|
# Runtime loader for the CUDA driver / NVML (dlopen'd, not link-time, so the
|
||||||
|
# binary has no build- or load-time dependency on libcuda / libnvidia-ml).
|
||||||
|
libloading = { version = "0.8", optional = true }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = ["gpu", "cuda", "config-gui"]
|
default = ["gpu", "cuda", "config-gui"]
|
||||||
gpu = ["dep:ocl"]
|
gpu = ["dep:ocl"]
|
||||||
# CUDA backend: drives miniZ's embedded Equihash 192,7 fatbin via the CUDA driver
|
# CUDA backend: drives miniZ's embedded Equihash 192,7 fatbin via the CUDA driver
|
||||||
# API. build.rs only links libcuda (no nvcc / kernel compilation needed).
|
# API. The driver (libcuda) and NVML are dlopen'd at runtime via libloading, so
|
||||||
cuda = []
|
# there is no build-time or load-time dependency on them — the binary builds and
|
||||||
|
# starts on hosts without an NVIDIA driver and simply reports no CUDA devices.
|
||||||
|
cuda = ["dep:libloading"]
|
||||||
# Optional native GUI config editor (the `jackpotminer-config` binary). Off by
|
# Optional native GUI config editor (the `jackpotminer-config` binary). Off by
|
||||||
# default so the miner never pulls in the GUI toolkit.
|
# default so the miner never pulls in the GUI toolkit.
|
||||||
config-gui = ["dep:eframe"]
|
config-gui = ["dep:eframe"]
|
||||||
|
|||||||
@@ -0,0 +1,40 @@
|
|||||||
|
# Portable jackpotminer build.
|
||||||
|
#
|
||||||
|
# Links against Debian bullseye's glibc 2.31 (released 2020), so the resulting
|
||||||
|
# binary runs on essentially any Linux from the last several years instead of
|
||||||
|
# requiring the build host's (much newer) glibc. This is the real fix for the
|
||||||
|
# "version `GLIBC_2.39' not found" class of errors — static linking can't solve
|
||||||
|
# it for a GPU build, because the GPU driver libraries are glibc-only and load
|
||||||
|
# at runtime.
|
||||||
|
#
|
||||||
|
# The CUDA driver and NVML are dlopen'd at runtime (see src/dylib.rs), so this
|
||||||
|
# build needs NO NVIDIA toolkit — only the OpenCL ICD loader (to link libOpenCL).
|
||||||
|
# The result is one binary that drives AMD cards (OpenCL) and NVIDIA cards (CUDA,
|
||||||
|
# loaded if libcuda.so.1 is present at runtime).
|
||||||
|
#
|
||||||
|
# Build: DOCKER_BUILDKIT=1 docker build -f Dockerfile.portable \
|
||||||
|
# --output type=local,dest=dist .
|
||||||
|
# or just: ./build-portable.sh
|
||||||
|
# Output: dist/jackpotminer
|
||||||
|
|
||||||
|
FROM debian:bullseye-slim AS build
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
# gcc/g++ for the linker; ocl-icd-opencl-dev provides libOpenCL.so for linking
|
||||||
|
# (the runtime host supplies its own libOpenCL.so.1 via its GPU driver).
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
ca-certificates curl gcc g++ make pkg-config ocl-icd-opencl-dev \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
# Minimal stable Rust toolchain.
|
||||||
|
RUN curl -fsSL https://sh.rustup.rs | sh -s -- -y --profile minimal --default-toolchain stable
|
||||||
|
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||||
|
|
||||||
|
WORKDIR /src
|
||||||
|
COPY . .
|
||||||
|
# Miner only (no GUI config tool, to avoid pulling X11/Wayland/GL into the
|
||||||
|
# build): AMD OpenCL + dlopen'd CUDA. `--locked` keeps it reproducible.
|
||||||
|
RUN cargo build --release --locked --no-default-features --features gpu,cuda \
|
||||||
|
&& strip target/release/jackpotminer
|
||||||
|
|
||||||
|
# Export just the binary to the build output directory.
|
||||||
|
FROM scratch AS export
|
||||||
|
COPY --from=build /src/target/release/jackpotminer /jackpotminer
|
||||||
@@ -98,18 +98,47 @@ reverse-engineered Equihash 192,7 solver — see "CUDA backend" below.
|
|||||||
|
|
||||||
## Build
|
## Build
|
||||||
|
|
||||||
Requirements: a Rust toolchain and an OpenCL runtime (the NVIDIA driver ships
|
Requirements: a Rust toolchain and, for the OpenCL backend, the OpenCL ICD
|
||||||
`libOpenCL`). The CUDA backend only needs `libcuda` (the NVIDIA driver) — the
|
loader (`libOpenCL` — e.g. `ocl-icd-opencl-dev` on Debian/Ubuntu; the NVIDIA and
|
||||||
fatbin and launch trace it drives are embedded in the binary, so no CUDA toolkit
|
AMD drivers also ship it). The CUDA driver and NVML are **`dlopen`'d at runtime**
|
||||||
or `nvcc` is required.
|
(see `src/dylib.rs`), so the `cuda` feature needs no NVIDIA toolkit or libs to
|
||||||
|
build, and a `cuda`-enabled binary still builds and starts on hosts without an
|
||||||
|
NVIDIA driver — it simply reports no CUDA devices. The fatbin and launch trace
|
||||||
|
the CUDA backend drives are embedded, so no `nvcc` is required either.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cargo build --release # OpenCL backend (default)
|
cargo build --release # default: OpenCL + CUDA + GUI config tool
|
||||||
cargo build --release --features cuda # OpenCL + CUDA backends
|
cargo build --release --no-default-features --features gpu,cuda # miner only, both GPU backends
|
||||||
|
cargo build --release --no-default-features --features gpu # OpenCL only (AMD/Intel/NVIDIA)
|
||||||
cargo build --release --no-default-features --features cuda # CUDA only
|
cargo build --release --no-default-features --features cuda # CUDA only
|
||||||
cargo build --release --no-default-features # CPU-only (no GPU)
|
cargo build --release --no-default-features # CPU-only (no GPU)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Portable / distributable builds
|
||||||
|
|
||||||
|
The miner's only runtime dependencies are the C library and the OpenCL ICD loader
|
||||||
|
(`libOpenCL.so.1`, present wherever a GPU driver is); CUDA/NVML are loaded on
|
||||||
|
demand. So the main compatibility risk when shipping a Linux binary is the
|
||||||
|
**glibc version** it was built against — not the GPU libraries. To build one that
|
||||||
|
runs on older distros, compile against an old glibc in a container:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./build-portable.sh # → dist/jackpotminer (Docker, or ENGINE=podman)
|
||||||
|
```
|
||||||
|
|
||||||
|
This links against Debian bullseye's glibc 2.31 (runs on most Linux from ~2020
|
||||||
|
on) and yields a single miner that drives both AMD (OpenCL) and NVIDIA (CUDA)
|
||||||
|
cards. See `Dockerfile.portable`.
|
||||||
|
|
||||||
|
A fully *static* GPU binary isn't possible: the OpenCL/CUDA driver libraries are
|
||||||
|
glibc-only and must load at runtime. For a zero-dependency binary that runs
|
||||||
|
anywhere, build the **CPU-only** miner against musl:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
rustup target add x86_64-unknown-linux-musl
|
||||||
|
cargo build --release --target x86_64-unknown-linux-musl --no-default-features
|
||||||
|
```
|
||||||
|
|
||||||
### CUDA backend (miniZ fatbin replay)
|
### CUDA backend (miniZ fatbin replay)
|
||||||
|
|
||||||
`--features cuda` (selectable with `--backend cuda`) does **not** compile its own
|
`--features cuda` (selectable with `--backend cuda`) does **not** compile its own
|
||||||
@@ -181,7 +210,7 @@ on clean shutdown**. The per-card stats line shows live `Sol/s`, board `W`, and
|
|||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# List OpenCL devices
|
# List devices (and the default "mixed" backend's combined index list)
|
||||||
./target/release/jackpotminer --list-devices
|
./target/release/jackpotminer --list-devices
|
||||||
|
|
||||||
# Mine on one GPU
|
# Mine on one GPU
|
||||||
@@ -195,8 +224,11 @@ on clean shutdown**. The per-card stats line shows live `Sol/s`, board `W`, and
|
|||||||
./target/release/jackpotminer --url ... --user ... --devices 0,1
|
./target/release/jackpotminer --url ... --user ... --devices 0,1
|
||||||
./target/release/jackpotminer --url ... --user ... --devices all
|
./target/release/jackpotminer --url ... --user ... --devices all
|
||||||
|
|
||||||
# Use the CUDA backend instead of OpenCL (needs a --features cuda build)
|
# Default backend is "mixed": NVIDIA cards run on CUDA, AMD/Intel on OpenCL —
|
||||||
./target/release/jackpotminer --url ... --user ... --backend cuda --devices all
|
# so an AMD + NVIDIA rig just works. --devices indexes the combined list from
|
||||||
|
# --list-devices. Pin a single backend for every card with:
|
||||||
|
./target/release/jackpotminer --url ... --user ... --backend opencl # all via OpenCL
|
||||||
|
./target/release/jackpotminer --url ... --user ... --backend cuda # NVIDIA only
|
||||||
|
|
||||||
# Force the CPU backend
|
# Force the CPU backend
|
||||||
./target/release/jackpotminer --url ... --user ... --cpu
|
./target/release/jackpotminer --url ... --user ... --cpu
|
||||||
|
|||||||
Executable
+36
@@ -0,0 +1,36 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# Build a portable jackpotminer binary inside an old-glibc (Debian bullseye,
|
||||||
|
# glibc 2.31) container, so it runs on essentially any recent Linux regardless of
|
||||||
|
# the build host's glibc. CUDA is dlopen'd at runtime, so no NVIDIA toolkit is
|
||||||
|
# needed to build; the binary drives both AMD (OpenCL) and NVIDIA (CUDA) cards.
|
||||||
|
#
|
||||||
|
# Output: dist/jackpotminer
|
||||||
|
#
|
||||||
|
# Works with Docker (BuildKit) or Podman. Override the engine with ENGINE=podman.
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
ENGINE="${ENGINE:-docker}"
|
||||||
|
OUT="${OUT:-dist}"
|
||||||
|
|
||||||
|
mkdir -p "$OUT"
|
||||||
|
|
||||||
|
case "$ENGINE" in
|
||||||
|
podman)
|
||||||
|
# Podman builds the final `scratch` stage; extract the binary from it.
|
||||||
|
podman build -f Dockerfile.portable -t jackpotminer-portable .
|
||||||
|
# The exported image is `FROM scratch` with no CMD/ENTRYPOINT, so `podman create`
# needs an explicit command or it refuses to create the container. The container
# is never started; the path is only a placeholder so `podman cp` has a source.
cid=$(podman create jackpotminer-portable /jackpotminer)
|
||||||
|
podman cp "$cid:/jackpotminer" "$OUT/jackpotminer"
|
||||||
|
podman rm "$cid" >/dev/null
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
DOCKER_BUILDKIT=1 "$ENGINE" build -f Dockerfile.portable \
|
||||||
|
--output "type=local,dest=$OUT" .
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
chmod +x "$OUT/jackpotminer"
|
||||||
|
echo
|
||||||
|
echo "Built $OUT/jackpotminer"
|
||||||
|
command -v file >/dev/null 2>&1 && file "$OUT/jackpotminer" || true
|
||||||
|
echo "Minimum glibc / dynamic deps:"
|
||||||
|
{ objdump -T "$OUT/jackpotminer" 2>/dev/null | grep -oE 'GLIBC_[0-9.]+' | sort -V | tail -1; } || true
|
||||||
@@ -1,53 +0,0 @@
|
|||||||
//! Build script for the CUDA backend.
|
|
||||||
//!
|
|
||||||
//! The `cuda` feature links the CUDA driver API (`cuda`) and NVML (for
|
|
||||||
//! clock/power control + readout). The backend drives miniZ's embedded fatbin
|
|
||||||
//! (`src/miniz/equihash192_7.fatbin`) via the driver API, so no nvcc / kernel
|
|
||||||
//! compilation is needed at build time. (The default OpenCL backend needs no
|
|
||||||
//! build-script support — `ocl` links `OpenCL` itself, cross-platform.)
|
|
||||||
//!
|
|
||||||
//! Linking is target-aware so the `cuda` feature builds on both Linux and
|
|
||||||
//! Windows:
|
|
||||||
//! - Linux: `libcuda.so` + `libnvidia-ml.so` from the system / toolkit dirs.
|
|
||||||
//! - Windows: `cuda.lib` + `nvml.lib` from `%CUDA_PATH%\lib\x64`.
|
|
||||||
|
|
||||||
use std::path::Path;
|
|
||||||
|
|
||||||
fn main() {
|
|
||||||
println!("cargo:rerun-if-changed=build.rs");
|
|
||||||
|
|
||||||
if std::env::var("CARGO_FEATURE_CUDA").is_ok() {
|
|
||||||
// Use the *target* OS (correct for cross-compilation too), not the host.
|
|
||||||
let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
|
|
||||||
link_cuda_driver(&target_os);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Link the CUDA driver library plus NVML. The backend loads the embedded miniZ
|
|
||||||
/// fatbin at runtime, so there is nothing to compile here.
|
|
||||||
fn link_cuda_driver(target_os: &str) {
|
|
||||||
if target_os == "windows" {
|
|
||||||
// CUDA Toolkit import libraries (cuda.lib, nvml.lib).
|
|
||||||
println!("cargo:rerun-if-env-changed=CUDA_PATH");
|
|
||||||
if let Ok(cuda_path) = std::env::var("CUDA_PATH") {
|
|
||||||
println!("cargo:rustc-link-search=native={cuda_path}\\lib\\x64");
|
|
||||||
}
|
|
||||||
// Driver API import lib is `cuda.lib`; NVML is `nvml.lib` (nvml.dll
|
|
||||||
// ships with the NVIDIA driver).
|
|
||||||
println!("cargo:rustc-link-lib=dylib=cuda");
|
|
||||||
println!("cargo:rustc-link-lib=dylib=nvml");
|
|
||||||
} else {
|
|
||||||
for dir in ["/usr/lib64", "/usr/lib", "/opt/cuda/lib64"] {
|
|
||||||
if Path::new(dir).exists() {
|
|
||||||
println!("cargo:rustc-link-search=native={dir}");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// GNU ld: embed an rpath so libcuda is found at runtime (Linux only —
|
|
||||||
// MSVC's linker rejects `-Wl,...`).
|
|
||||||
if target_os == "linux" {
|
|
||||||
println!("cargo:rustc-link-arg=-Wl,-rpath,/opt/cuda/lib64");
|
|
||||||
}
|
|
||||||
println!("cargo:rustc-link-lib=dylib=cuda");
|
|
||||||
println!("cargo:rustc-link-lib=dylib=nvidia-ml");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
File diff suppressed because it is too large
Load Diff
+5
-3
@@ -23,9 +23,11 @@ user = "t1YourZClassicAddressHere.rig1" # payout address / worker login
|
|||||||
# no-jackpot = true # PPLNS
|
# no-jackpot = true # PPLNS
|
||||||
|
|
||||||
# ── GPU backend ───────────────────────────────────────────────────────────────
|
# ── GPU backend ───────────────────────────────────────────────────────────────
|
||||||
# backend = "cuda" # "cuda" or "opencl"
|
# backend = "mixed" # "mixed" (default: NVIDIA→CUDA, AMD/Intel→OpenCL),
|
||||||
# devices = "all" # "all", or a comma list e.g. "0,1"
|
# # "opencl" (every card via OpenCL), or "cuda"
|
||||||
# force-opencl = false # force OpenCL, disabling CUDA
|
# devices = "all" # "all", or a comma list e.g. "0,1" — in mixed mode
|
||||||
|
# # these index the combined list from --list-devices
|
||||||
|
# force-opencl = false # force every card onto OpenCL, disabling CUDA
|
||||||
|
|
||||||
# ── GPU tuning (clock/power changes need root) ────────────────────────────────
|
# ── GPU tuning (clock/power changes need root) ────────────────────────────────
|
||||||
# no-gpu-tune = false
|
# no-gpu-tune = false
|
||||||
|
|||||||
@@ -166,6 +166,18 @@ impl BatchHasher {
|
|||||||
Self { mid, tail }
|
Self { mid, tail }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The BLAKE2b chaining state after compressing the shared first 128-byte
|
||||||
|
/// header block (the eight 64-bit midstate words). This is exactly the
|
||||||
|
/// `hashState` (`ulong8`) the OpenCL `equihash192_7.cl` round-0 kernel
|
||||||
|
/// consumes: it injects the final 16-byte block (message word `m[1] =
|
||||||
|
/// (index << 32) | nonce_low`, `m[0] = 0`) itself, which requires the
|
||||||
|
/// header's bytes [128..136] to be zero (the same `cuda_compatible` rule the
|
||||||
|
/// CUDA backend relies on). Only used by the OpenCL (AMD) backend.
|
||||||
|
#[cfg_attr(not(feature = "gpu"), allow(dead_code))]
|
||||||
|
pub fn midstate(&self) -> [u64; 8] {
|
||||||
|
self.mid
|
||||||
|
}
|
||||||
|
|
||||||
/// Assemble the zero-padded final block for index `g`.
|
/// Assemble the zero-padded final block for index `g`.
|
||||||
#[inline]
|
#[inline]
|
||||||
fn final_block(&self, g: u32) -> [u8; 128] {
|
fn final_block(&self, g: u32) -> [u8; 128] {
|
||||||
|
|||||||
+19
-2
@@ -108,7 +108,16 @@ const CU_LAUNCH_PARAM_END: usize = 0x00;
|
|||||||
const CU_LAUNCH_PARAM_BUFFER_POINTER: usize = 0x01;
|
const CU_LAUNCH_PARAM_BUFFER_POINTER: usize = 0x01;
|
||||||
const CU_LAUNCH_PARAM_BUFFER_SIZE: usize = 0x02;
|
const CU_LAUNCH_PARAM_BUFFER_SIZE: usize = 0x02;
|
||||||
|
|
||||||
extern "C" {
|
// The CUDA driver API, loaded at runtime via dlopen (see `crate::dylib`) rather
|
||||||
|
// than linked at build time: the SONAME `libcuda.so.1` ships with the NVIDIA
|
||||||
|
// driver (`nvcuda.dll` on Windows) and is absent on driver-less / AMD-only
|
||||||
|
// hosts. `cuda_lib()` returns `None` when it can't be opened; the public entry
|
||||||
|
// points below turn that into a clear error / empty device list, so the binary
|
||||||
|
// still builds and starts everywhere.
|
||||||
|
crate::dylib::dynamic_library! {
|
||||||
|
lib_struct: CudaLib,
|
||||||
|
loader: cuda_lib,
|
||||||
|
names: ["libcuda.so.1", "libcuda.so", "nvcuda.dll"],
|
||||||
fn cuInit(flags: c_uint) -> CUresult;
|
fn cuInit(flags: c_uint) -> CUresult;
|
||||||
fn cuDeviceGetCount(count: *mut c_int) -> CUresult;
|
fn cuDeviceGetCount(count: *mut c_int) -> CUresult;
|
||||||
fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult;
|
fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult;
|
||||||
@@ -148,6 +157,11 @@ extern "C" {
|
|||||||
fn cuGetErrorName(error: CUresult, str: *mut *const c_char) -> CUresult;
|
fn cuGetErrorName(error: CUresult, str: *mut *const c_char) -> CUresult;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Error returned when the CUDA driver library isn't present on the host.
|
||||||
|
fn cuda_unavailable() -> anyhow::Error {
|
||||||
|
// Name both platform spellings: the loader also tries `nvcuda.dll` (Windows).
anyhow!("CUDA driver library (libcuda.so.1 / nvcuda.dll) not found — is the NVIDIA driver installed?")
|
||||||
|
}
|
||||||
|
|
||||||
/// Turn a non-success `CUresult` into an error with the driver's symbolic name.
|
/// Turn a non-success `CUresult` into an error with the driver's symbolic name.
|
||||||
fn check(code: CUresult, what: &str) -> Result<()> {
|
fn check(code: CUresult, what: &str) -> Result<()> {
|
||||||
if code == CUDA_SUCCESS {
|
if code == CUDA_SUCCESS {
|
||||||
@@ -164,8 +178,10 @@ fn check(code: CUresult, what: &str) -> Result<()> {
|
|||||||
Err(anyhow!("{what} failed: {name}"))
|
Err(anyhow!("{what} failed: {name}"))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Number of CUDA devices (initialises the driver as a side effect).
|
/// Number of CUDA devices (initialises the driver as a side effect). Returns an
|
||||||
|
/// error if the CUDA driver library isn't installed.
|
||||||
pub fn device_count() -> Result<usize> {
|
pub fn device_count() -> Result<usize> {
|
||||||
|
cuda_lib().ok_or_else(cuda_unavailable)?;
|
||||||
unsafe {
|
unsafe {
|
||||||
check(cuInit(0), "cuInit")?;
|
check(cuInit(0), "cuInit")?;
|
||||||
let mut n: c_int = 0;
|
let mut n: c_int = 0;
|
||||||
@@ -579,6 +595,7 @@ impl CudaSolver {
|
|||||||
/// fatbin, select the config that fits free VRAM, allocate its buffers, and
|
/// fatbin, select the config that fits free VRAM, allocate its buffers, and
|
||||||
/// rebase the recorded launch sequence.
|
/// rebase the recorded launch sequence.
|
||||||
pub fn new(device_index: usize) -> Result<Self> {
|
pub fn new(device_index: usize) -> Result<Self> {
|
||||||
|
cuda_lib().ok_or_else(cuda_unavailable)?;
|
||||||
unsafe {
|
unsafe {
|
||||||
check(cuInit(0), "cuInit")?;
|
check(cuInit(0), "cuInit")?;
|
||||||
let mut dev: CUdevice = 0;
|
let mut dev: CUdevice = 0;
|
||||||
|
|||||||
@@ -0,0 +1,85 @@
|
|||||||
|
//! Tiny runtime dynamic-library loader for the optional GPU vendor libraries.
|
||||||
|
//!
|
||||||
|
//! The CUDA driver (`libcuda`) and NVML (`libnvidia-ml`) are vendor components
|
||||||
|
//! that ship with the NVIDIA driver — they are not installable as ordinary
|
||||||
|
//! build dependencies and are absent on AMD-only / driver-less hosts. Linking
|
||||||
|
//! them at build time would (a) make the build fail without the NVIDIA libs
|
||||||
|
//! present and (b) make the resulting binary refuse to start anywhere they are
|
||||||
|
//! missing. Instead we `dlopen` them on first use: the binary has no build-time
|
||||||
|
//! or load-time dependency on them, and the CUDA backend simply reports "no
|
||||||
|
//! devices" when the driver isn't installed.
|
||||||
|
//!
|
||||||
|
//! [`dynamic_library!`] generates, for one such library, a function-pointer
|
||||||
|
//! table plus same-named wrapper `fn`s, so the call sites in [`crate::cuda`] /
|
||||||
|
//! [`crate::nvml`] are unchanged — only the `extern "C"` block is replaced.
|
||||||
|
|
||||||
|
/// Open the first of `names` that loads (e.g. the versioned SONAME first, then
|
||||||
|
/// the unversioned dev symlink). Returns the last error if none load.
///
/// Candidates are tried in slice order and the first one that opens is
/// returned immediately; only the error from the final failed attempt is kept.
///
/// # Panics
///
/// Panics if `names` is empty, because there is then no load error to return.
|
||||||
|
pub fn load_first(names: &[&str]) -> Result<libloading::Library, libloading::Error> {
|
||||||
|
// Remembers the most recent failure so it can be reported if nothing loads.
let mut last_err = None;
|
||||||
|
for name in names {
|
||||||
|
// NOTE(review): opening a shared library can run its initialisation code,
// which is why this call is `unsafe`; this presumably relies on callers
// passing only trusted vendor library names — confirm.
match unsafe { libloading::Library::new(name) } {
|
||||||
|
Ok(lib) => return Ok(lib),
|
||||||
|
Err(e) => last_err = Some(e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Reached only when every candidate failed (or the slice was empty).
Err(last_err.expect("load_first called with an empty name list"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generate a runtime-loaded binding for one shared library.
|
||||||
|
///
|
||||||
|
/// Produces a hidden fn-pointer struct, a `OnceLock`-cached loader (`$loader()`
|
||||||
|
/// returns `Option<&'static _>`, `None` when the library can't be loaded), and a
|
||||||
|
/// same-named `unsafe fn` wrapper for each declared function that dispatches
|
||||||
|
/// through the table. Public entry points must check `$loader().is_some()` (or
|
||||||
|
/// `?` on the `Option`) before invoking any wrapper; the wrappers themselves
|
||||||
|
/// panic if called with the library unloaded, which the entry-point guards
|
||||||
|
/// prevent.
|
||||||
|
macro_rules! dynamic_library {
|
||||||
|
(
|
||||||
|
lib_struct: $Lib:ident,
|
||||||
|
loader: $loader:ident,
|
||||||
|
names: [$($lname:expr),+ $(,)?],
|
||||||
|
$( fn $fname:ident($($an:ident: $at:ty),* $(,)?) -> $ret:ty; )*
|
||||||
|
) => {
|
||||||
|
#[allow(non_snake_case)]
|
||||||
|
struct $Lib {
|
||||||
|
$( $fname: unsafe extern "C" fn($($at),*) -> $ret, )*
|
||||||
|
// Keep the library mapped for the process lifetime; the fn pointers
|
||||||
|
// above point into it.
|
||||||
|
#[allow(dead_code)]
|
||||||
|
handle: libloading::Library,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl $Lib {
|
||||||
|
#[allow(non_snake_case)]
|
||||||
|
unsafe fn load() -> std::result::Result<Self, libloading::Error> {
|
||||||
|
let handle = $crate::dylib::load_first(&[$($lname),+])?;
|
||||||
|
$(
|
||||||
|
let $fname: unsafe extern "C" fn($($at),*) -> $ret =
|
||||||
|
*handle.get(concat!(stringify!($fname), "\0").as_bytes())?;
|
||||||
|
)*
|
||||||
|
Ok(Self { $($fname,)* handle })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static __DYLIB: std::sync::OnceLock<Option<$Lib>> = std::sync::OnceLock::new();
|
||||||
|
|
||||||
|
/// The loaded library, or `None` if it could not be opened.
|
||||||
|
fn $loader() -> Option<&'static $Lib> {
|
||||||
|
__DYLIB.get_or_init(|| unsafe { $Lib::load().ok() }).as_ref()
|
||||||
|
}
|
||||||
|
|
||||||
|
$(
|
||||||
|
#[inline]
|
||||||
|
#[allow(non_snake_case)]
|
||||||
|
unsafe fn $fname($($an: $at),*) -> $ret {
|
||||||
|
($loader()
|
||||||
|
.expect(concat!(stringify!($fname), ": ", stringify!($Lib), " not loaded"))
|
||||||
|
.$fname)($($an),*)
|
||||||
|
}
|
||||||
|
)*
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) use dynamic_library;
|
||||||
+119
-5
@@ -151,8 +151,9 @@ fn kernel_source(geom: &Geom) -> String {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A persistent OpenCL solver bound to one device.
|
/// The default (project-native) OpenCL solver, bound to one device. Wrapped by
|
||||||
pub struct GpuSolver {
|
/// [`GpuSolver`], which selects it for non-AMD devices.
|
||||||
|
struct LegacySolver {
|
||||||
pq: ProQue,
|
pq: ProQue,
|
||||||
header: Buffer<u8>,
|
header: Buffer<u8>,
|
||||||
/// Per-table back-reference arrays (1 u32/slot), kept resident for recovery.
|
/// Per-table back-reference arrays (1 u32/slot), kept resident for recovery.
|
||||||
@@ -167,15 +168,14 @@ pub struct GpuSolver {
|
|||||||
nr_rows: usize,
|
nr_rows: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl GpuSolver {
|
impl LegacySolver {
|
||||||
/// This device's product name (e.g. "NVIDIA GeForce RTX 5080"), if available.
|
/// This device's product name (e.g. "NVIDIA GeForce RTX 5080"), if available.
|
||||||
pub fn device_name(&self) -> Option<String> {
|
pub fn device_name(&self) -> Option<String> {
|
||||||
self.pq.device().name().ok()
|
self.pq.device().name().ok()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Initialise the solver and allocate all device buffers.
|
/// Initialise the solver and allocate all device buffers.
|
||||||
pub fn new(device_index: usize) -> Result<Self> {
|
pub fn new(platform: ocl::Platform, device: ocl::Device) -> Result<Self> {
|
||||||
let (platform, device) = pick_device(device_index)?;
|
|
||||||
let geom = pick_geom(&device);
|
let geom = pick_geom(&device);
|
||||||
// The device's platform must be set explicitly: ProQue otherwise builds
|
// The device's platform must be set explicitly: ProQue otherwise builds
|
||||||
// the context against `Platform::default()` (the first platform), which
|
// the context against `Platform::default()` (the first platform), which
|
||||||
@@ -406,6 +406,101 @@ impl GpuSolver {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// OpenCL solver for one device. Dispatches to the AMD-tuned kernel
|
||||||
|
/// (`equihash192_7.cl`) on AMD-vendor devices and the default project kernel
|
||||||
|
/// (`equihash.cl`) everywhere else. Forceable with `ZCL_OPENCL_KERNEL=amd|legacy`.
|
||||||
|
pub struct GpuSolver {
|
||||||
|
inner: SolverInner,
|
||||||
|
}
|
||||||
|
|
||||||
|
enum SolverInner {
|
||||||
|
Legacy(LegacySolver),
|
||||||
|
Amd(crate::gpu_amd::AmdSolver),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl GpuSolver {
|
||||||
|
/// Initialise the solver for a flat device index, choosing the kernel by
|
||||||
|
/// device vendor (AMD → `equihash192_7.cl`).
|
||||||
|
pub fn new(device_index: usize) -> Result<Self> {
|
||||||
|
let (platform, device) = pick_device(device_index)?;
|
||||||
|
let inner = if use_amd_kernel(&device) {
|
||||||
|
log::info!("OpenCL: AMD device — using the equihash192_7 kernel");
|
||||||
|
SolverInner::Amd(crate::gpu_amd::AmdSolver::new(platform, device)?)
|
||||||
|
} else {
|
||||||
|
SolverInner::Legacy(LegacySolver::new(platform, device)?)
|
||||||
|
};
|
||||||
|
Ok(Self { inner })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This device's product name, if available.
|
||||||
|
pub fn device_name(&self) -> Option<String> {
|
||||||
|
match &self.inner {
|
||||||
|
SolverInner::Legacy(s) => s.device_name(),
|
||||||
|
SolverInner::Amd(s) => s.device_name(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Solve the puzzle for `header` (140 bytes).
|
||||||
|
pub fn solve(&self, header: &[u8]) -> Result<Vec<Vec<u32>>> {
|
||||||
|
match &self.inner {
|
||||||
|
SolverInner::Legacy(s) => s.solve(header),
|
||||||
|
SolverInner::Amd(s) => s.solve(header),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Solve and also return the raw GPU candidate count (for diagnostics).
|
||||||
|
pub fn solve_with_stats(&self, header: &[u8]) -> Result<(usize, Vec<Vec<u32>>)> {
|
||||||
|
match &self.inner {
|
||||||
|
SolverInner::Legacy(s) => s.solve_with_stats(header),
|
||||||
|
SolverInner::Amd(s) => s.solve_with_stats(header),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Time each GPU stage individually.
|
||||||
|
pub fn profile(&self, header: &[u8]) -> Result<()> {
|
||||||
|
match &self.inner {
|
||||||
|
SolverInner::Legacy(s) => s.profile(header),
|
||||||
|
SolverInner::Amd(s) => s.profile(header),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether the per-index BLAKE2b probe ([`Self::hash_all`]) is available.
|
||||||
|
/// Only the default kernel exposes a linear digest layout; the AMD kernel
|
||||||
|
/// buckets in round 0, so the self-test skips the probe there.
|
||||||
|
pub fn supports_blake_probe(&self) -> bool {
|
||||||
|
matches!(self.inner, SolverInner::Legacy(_))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute every first-round BLAKE2b output (default kernel only).
|
||||||
|
pub fn hash_all(&self, header: &[u8]) -> Result<Vec<u8>> {
|
||||||
|
match &self.inner {
|
||||||
|
SolverInner::Legacy(s) => s.hash_all(header),
|
||||||
|
SolverInner::Amd(_) => {
|
||||||
|
Err(anyhow!("hash_all is not supported by the AMD kernel"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decide whether to drive `device` with the AMD `equihash192_7.cl` kernel.
|
||||||
|
/// `ZCL_OPENCL_KERNEL` forces the choice (`amd` or `legacy`); otherwise it's by
|
||||||
|
/// device vendor.
|
||||||
|
fn use_amd_kernel(device: &ocl::Device) -> bool {
|
||||||
|
use ocl::enums::{DeviceInfo, DeviceInfoResult};
|
||||||
|
match std::env::var("ZCL_OPENCL_KERNEL").ok().as_deref() {
|
||||||
|
Some(v) if v.eq_ignore_ascii_case("amd") => return true,
|
||||||
|
Some(v) if v.eq_ignore_ascii_case("legacy") => return false,
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
match device.info(DeviceInfo::Vendor) {
|
||||||
|
Ok(DeviceInfoResult::Vendor(v)) => {
|
||||||
|
let v = v.to_ascii_lowercase();
|
||||||
|
v.contains("advanced micro devices") || v.contains("amd")
|
||||||
|
}
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// List `(platform, device)` names so the user can choose `--devices`.
|
/// List `(platform, device)` names so the user can choose `--devices`.
|
||||||
pub fn list_devices() -> Result<Vec<String>> {
|
pub fn list_devices() -> Result<Vec<String>> {
|
||||||
use ocl::{Device, Platform};
|
use ocl::{Device, Platform};
|
||||||
@@ -422,6 +517,25 @@ pub fn list_devices() -> Result<Vec<String>> {
|
|||||||
Ok(names)
|
Ok(names)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// For each OpenCL device — in the same flat order as [`list_devices`] and
|
||||||
|
/// `--devices` — whether its vendor is NVIDIA. The mixed backend uses this to
|
||||||
|
/// hand NVIDIA cards to CUDA (and mine only the non-NVIDIA OpenCL devices).
|
||||||
|
pub fn device_is_nvidia() -> Vec<bool> {
|
||||||
|
use ocl::enums::{DeviceInfo, DeviceInfoResult};
|
||||||
|
use ocl::{Device, Platform};
|
||||||
|
let mut out = Vec::new();
|
||||||
|
for platform in Platform::list() {
|
||||||
|
for device in Device::list_all(platform).unwrap_or_default() {
|
||||||
|
let is_nv = matches!(
|
||||||
|
device.info(DeviceInfo::Vendor),
|
||||||
|
Ok(DeviceInfoResult::Vendor(v)) if v.to_ascii_lowercase().contains("nvidia")
|
||||||
|
);
|
||||||
|
out.push(is_nv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
/// The flat OpenCL device index of the first CPU-type device (e.g. PoCL), if any.
|
/// The flat OpenCL device index of the first CPU-type device (e.g. PoCL), if any.
|
||||||
/// Lets CPU mining run through the OpenCL backend on the CPU. The index matches
|
/// Lets CPU mining run through the OpenCL backend on the CPU. The index matches
|
||||||
/// [`list_devices`] / `--devices`.
|
/// [`list_devices`] / `--devices`.
|
||||||
|
|||||||
+286
@@ -0,0 +1,286 @@
|
|||||||
|
//! AMD OpenCL Equihash 192,7 solver (`kernels/equihash192_7.cl`).
|
||||||
|
//!
|
||||||
|
//! A second OpenCL backend, selected for AMD-vendor devices by
|
||||||
|
//! [`crate::gpu::GpuSolver`]. Where the default [`crate::gpu`] driver runs the
|
||||||
|
//! project's own `gen`/`round_collide`/`recover` kernel, this one drives a
|
||||||
|
//! self-contained, GCN-tuned kernel with a fixed table geometry and a different
|
||||||
|
//! host ABI: a `clearCounter` → `blake` → `round1..round7` → `combine`
|
||||||
|
//! pipeline.
|
||||||
|
//!
|
||||||
|
//! ## Geometry (hard-coded in the kernel, mirrored here)
|
||||||
|
//!
|
||||||
|
//! 2^25 initial entries are bucketed into `NR_ROWS = 8192` rows. Round 0 and the
|
||||||
|
//! early rounds keep `SLOTS_R0 = 4592` `uint8` slots per row (`buffer0`/
|
||||||
|
//! `buffer1` ping-pong, ~1.2 GB each); rounds 4–5 widen to `SLOTS_R45 = 8688`
|
||||||
|
//! `uint4` slots in `buffer2` (~1.1 GB). `buffer1` is additionally reinterpreted
|
||||||
|
//! as `uint4`/`uint2` for the late-round R46/R57 outputs at fixed offsets. A
|
||||||
|
//! flat counter array of 8 banks × 16384 tracks per-row occupancy; the
|
||||||
|
//! round-7/R5 survivor count lives at index `R5_COUNTER_IDX = 114688` and sizes
|
||||||
|
//! the `combine` launch.
|
||||||
|
//!
|
||||||
|
//! ## Hashing ABI
|
||||||
|
//!
|
||||||
|
//! `blake` takes the BLAKE2b first-block midstate as a by-value `ulong8`
|
||||||
|
//! (`hashState`, from [`BatchHasher::midstate`]) plus a `nonce` whose low 32
|
||||||
|
//! bits become message word `m[1]`'s low half (= header bytes [136..140]); the
|
||||||
|
//! kernel hard-codes `m[0] = 0`, so the header's bytes [128..136] must be zero
|
||||||
|
//! (the same `cuda_compatible` rule the CUDA backend uses). Each work item hashes
|
||||||
|
//! index `tId`, emitting the two leaf entries `2*tId` and `2*tId+1` — exactly the
|
||||||
|
//! canonical leaf-index/sub-block split [`crate::equihash`] verifies against.
|
||||||
|
//!
|
||||||
|
//! `combine` writes recovered solutions to the `output0`/`res` buffer:
|
||||||
|
//! `output0[0].s0` is the solution count and each solution is 32 `uint4`
|
||||||
|
//! (`SOLUTION_INDICES = 128` pre-sorted 25-bit leaf indices) at
|
||||||
|
//! `output0[1 + 32*i ..]`. Those flatten straight into
|
||||||
|
//! [`equihash::filter_candidates`], which canonicalises, verifies and
|
||||||
|
//! de-duplicates them — the same contract as the default driver.
|
||||||
|
|
||||||
|
use anyhow::{anyhow, Result};
|
||||||
|
use ocl::prm::Ulong8;
|
||||||
|
use ocl::{Buffer, MemFlags, ProQue};
|
||||||
|
|
||||||
|
use crate::blake::BatchHasher;
|
||||||
|
use crate::equihash;
|
||||||
|
use crate::params::{BLAKE_CALLS, HEADER_LEN, SOLUTION_INDICES};
|
||||||
|
|
||||||
|
/// Buckets ("rows") the 2^25 entries are hashed into (kernel: `& 0x1FFF`).
const NR_ROWS: usize = 8192;
/// `uint8` slots per row in the round-0/early `buffer0`/`buffer1` tables.
const SLOTS_R0: usize = 4592;
/// `uint4` slots per row in the round-4/5 `buffer2` table.
const SLOTS_R45: usize = 8688;
/// `uint8` entries in `buffer0`/`buffer1` (the kernel's `37617664` bound).
const BUF01_U8ENTRIES: usize = NR_ROWS * SLOTS_R0;
/// `uint4` entries in `buffer2`.
const BUF2_U4ENTRIES: usize = NR_ROWS * SLOTS_R45;
/// Flat counter array: 8 banks × 16384 (one bank consumed per round).
const COUNTERS_U32: usize = 8 * 16384;
/// Counter index holding the round-7 / R5 survivor count (sizes `combine`).
const R5_COUNTER_IDX: usize = 114688;
/// `combine` only emits solutions for `addr < this` (matches the kernel cap);
/// far above the ~2 solutions a 192,7 nonce yields. Reads are capped to match.
const MAX_WRITTEN_SOLS: usize = 16;
/// Solution buffer capacity in solutions (`output0` = 1 + 32*cap `uint4`).
const SOL_CAP: usize = 16;
/// `reqd_work_group_size` of `blake`/`combine`.
const WG_BLAKE: usize = 64;
/// `reqd_work_group_size` of `round1..round7`.
const WG_ROUND: usize = 256;
/// Input rows (buckets) each collision round reads. The table is keyed on 13
/// bits (8192) through round 4, then narrows to 12 bits (4096) — a round that
/// reads `b` rows must launch exactly `b * 4` work-groups (kernel:
/// `bucket = grp >> 2`), or it processes uninitialised rows and explodes.
const ROUND_BUCKETS: [usize; 7] = [8192, 8192, 8192, 8192, 4096, 4096, 4096];
/// Extra rows of slack appended to each big table. The kernel's per-row
/// `atomic_inc` writes are uncapped, so a row that overflows its slot count
/// spills into the next row and the top row spills past the nominal table end;
/// this slack absorbs that (mean occupancy ~4096 sits well under the 4592/8688
/// slot counts, so realistic overflow is a few rows at most).
const ROW_SLACK: usize = 64;

// The host reads back up to `MAX_WRITTEN_SOLS` solutions from a buffer sized for
// `SOL_CAP`; fail the build if the two ever drift so a read can never run past
// the end of the solution buffer.
const _: () = assert!(
    MAX_WRITTEN_SOLS <= SOL_CAP,
    "MAX_WRITTEN_SOLS must not exceed the solution buffer capacity (SOL_CAP)"
);
// The survivor counter must live inside the counter array it is read from.
const _: () = assert!(
    R5_COUNTER_IDX < COUNTERS_U32,
    "R5_COUNTER_IDX must index into the counter array"
);
|
||||||
|
|
||||||
|
const KERNEL_SRC: &str = include_str!("../kernels/equihash192_7.cl");
|
||||||
|
|
||||||
|
/// A persistent AMD OpenCL solver bound to one device.
|
||||||
|
pub struct AmdSolver {
|
||||||
|
pq: ProQue,
|
||||||
|
/// Round-0/early ping-pong tables (`uint8`), reinterpreted at narrower
|
||||||
|
/// widths in late rounds.
|
||||||
|
buffer0: Buffer<u32>,
|
||||||
|
buffer1: Buffer<u32>,
|
||||||
|
/// Round-4/5 wide table (`uint4`).
|
||||||
|
buffer2: Buffer<u32>,
|
||||||
|
/// Per-row occupancy counters (8 banks).
|
||||||
|
counters: Buffer<u32>,
|
||||||
|
/// `res` / `output0`: `[count, then 32 uint4 per solution]`.
|
||||||
|
sols: Buffer<u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe impl Send for AmdSolver {}
|
||||||
|
|
||||||
|
impl AmdSolver {
|
||||||
|
/// This device's product name, if available.
|
||||||
|
pub fn device_name(&self) -> Option<String> {
|
||||||
|
self.pq.device().name().ok()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build the solver on `(platform, device)` and allocate all device buffers
|
||||||
|
/// (~3.5 GB total).
|
||||||
|
pub fn new(platform: ocl::Platform, device: ocl::Device) -> Result<Self> {
|
||||||
|
let pq = ProQue::builder()
|
||||||
|
.src(KERNEL_SRC)
|
||||||
|
.platform(platform)
|
||||||
|
.device(device)
|
||||||
|
.dims(1) // placeholder; every launch sets its own work size
|
||||||
|
.build()
|
||||||
|
.map_err(|e| anyhow!("AMD OpenCL build failed: {e}"))?;
|
||||||
|
|
||||||
|
let alloc = |len: usize| -> Result<Buffer<u32>> {
|
||||||
|
Ok(Buffer::<u32>::builder()
|
||||||
|
.queue(pq.queue().clone())
|
||||||
|
.flags(MemFlags::new().read_write())
|
||||||
|
.len(len)
|
||||||
|
.build()?)
|
||||||
|
};
|
||||||
|
// uint8 entries → 8 u32 each; uint4 entries → 4 u32 each. Each table gets
|
||||||
|
// ROW_SLACK extra rows of write headroom (see ROW_SLACK). buffer1's
|
||||||
|
// nominal uint8 size (75.2M uint4) already covers the late-round R46/R57
|
||||||
|
// regions at fixed offsets 48496640 / 67305472.
|
||||||
|
let buffer0 = alloc((BUF01_U8ENTRIES + ROW_SLACK * SLOTS_R0) * 8)?;
|
||||||
|
let buffer1 = alloc((BUF01_U8ENTRIES + ROW_SLACK * SLOTS_R0) * 8)?;
|
||||||
|
let buffer2 = alloc((BUF2_U4ENTRIES + ROW_SLACK * SLOTS_R45) * 4)?;
|
||||||
|
let counters = alloc(COUNTERS_U32)?;
|
||||||
|
let sols = alloc((1 + 32 * SOL_CAP) * 4)?;
|
||||||
|
|
||||||
|
Ok(Self { pq, buffer0, buffer1, buffer2, counters, sols })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the eight by-value args shared by every kernel in the pipeline:
|
||||||
|
/// `(buffer0, buffer1, buffer2, counters, res, extra, hashState, nonce)`.
|
||||||
|
fn build_kernel(&self, name: &str, hash_state: Ulong8, nonce: u64) -> Result<ocl::Kernel> {
|
||||||
|
Ok(self
|
||||||
|
.pq
|
||||||
|
.kernel_builder(name)
|
||||||
|
.arg(&self.buffer0)
|
||||||
|
.arg(&self.buffer1)
|
||||||
|
.arg(&self.buffer2)
|
||||||
|
.arg(&self.counters)
|
||||||
|
.arg(&self.sols)
|
||||||
|
.arg(0u32) // extra (unused by the pipeline)
|
||||||
|
.arg(hash_state)
|
||||||
|
.arg(nonce)
|
||||||
|
.build()?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run the full pipeline for `header` and return the flat recovered leaf
|
||||||
|
/// indices (`n * SOLUTION_INDICES`), ready for [`equihash::filter_candidates`].
|
||||||
|
fn run_pipeline(&self, header: &[u8]) -> Result<Vec<u32>> {
|
||||||
|
let mid = BatchHasher::new(header).midstate();
|
||||||
|
let hash_state = Ulong8::new(
|
||||||
|
mid[0], mid[1], mid[2], mid[3], mid[4], mid[5], mid[6], mid[7],
|
||||||
|
);
|
||||||
|
// Kernel's gId = nonce & 0xFFFFFFFF = message word m[1] low = header[136..140].
|
||||||
|
let nonce = u32::from_le_bytes(header[136..140].try_into().unwrap()) as u64;
|
||||||
|
|
||||||
|
// Clear counters + solution header (global = counter uint4 count).
|
||||||
|
let clear = self.build_kernel("clearCounter", hash_state, nonce)?;
|
||||||
|
unsafe {
|
||||||
|
clear.cmd().global_work_size(COUNTERS_U32 / 4).enq()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Round 0: BLAKE2b + bucket. One work item per blake call (2^24); each
|
||||||
|
// emits two leaf entries.
|
||||||
|
let blake = self.build_kernel("blake", hash_state, nonce)?;
|
||||||
|
unsafe {
|
||||||
|
blake
|
||||||
|
.cmd()
|
||||||
|
.global_work_size(BLAKE_CALLS)
|
||||||
|
.local_work_size(WG_BLAKE)
|
||||||
|
.enq()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collision rounds 1..7 (4 groups per input row, 256 work items each).
|
||||||
|
for r in 1..=7 {
|
||||||
|
let k = self.build_kernel(&format!("round{r}"), hash_state, nonce)?;
|
||||||
|
unsafe {
|
||||||
|
k.cmd()
|
||||||
|
.global_work_size(ROUND_BUCKETS[r - 1] * 4 * WG_ROUND)
|
||||||
|
.local_work_size(WG_ROUND)
|
||||||
|
.enq()?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Size `combine` from the round-7 survivor count (one group per candidate).
|
||||||
|
let mut r5 = [0u32; 1];
|
||||||
|
self.counters
|
||||||
|
.read(&mut r5[..])
|
||||||
|
.offset(R5_COUNTER_IDX)
|
||||||
|
.enq()?;
|
||||||
|
let groups = r5[0] as usize;
|
||||||
|
if groups == 0 {
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
let combine = self.build_kernel("combine", hash_state, nonce)?;
|
||||||
|
unsafe {
|
||||||
|
combine
|
||||||
|
.cmd()
|
||||||
|
.global_work_size(groups * WG_BLAKE)
|
||||||
|
.local_work_size(WG_BLAKE)
|
||||||
|
.enq()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// output0[0].s0 = solution count; each solution is 128 u32 (32 uint4)
|
||||||
|
// starting at uint4 index 1 (= u32 offset 4).
|
||||||
|
let mut head = [0u32; 1];
|
||||||
|
self.sols.read(&mut head[..]).enq()?;
|
||||||
|
let nsols = (head[0] as usize).min(MAX_WRITTEN_SOLS);
|
||||||
|
if nsols == 0 {
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
let mut data = vec![0u32; nsols * SOLUTION_INDICES];
|
||||||
|
self.sols.read(&mut data[..]).offset(4).enq()?;
|
||||||
|
Ok(data)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Solve for `header` (140 bytes): returns valid, canonical, de-duplicated
|
||||||
|
/// solutions as leaf-index lists.
|
||||||
|
pub fn solve(&self, header: &[u8]) -> Result<Vec<Vec<u32>>> {
|
||||||
|
assert_eq!(header.len(), HEADER_LEN);
|
||||||
|
let base = crate::blake::base_state(header);
|
||||||
|
let out = self.run_pipeline(header)?;
|
||||||
|
Ok(equihash::filter_candidates(&base, &out))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Solve and also return the raw recovered-candidate count (for diagnostics).
|
||||||
|
pub fn solve_with_stats(&self, header: &[u8]) -> Result<(usize, Vec<Vec<u32>>)> {
|
||||||
|
assert_eq!(header.len(), HEADER_LEN);
|
||||||
|
let base = crate::blake::base_state(header);
|
||||||
|
let out = self.run_pipeline(header)?;
|
||||||
|
let raw = out.len() / SOLUTION_INDICES;
|
||||||
|
Ok((raw, equihash::filter_candidates(&base, &out)))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Time each pipeline stage individually (forces a sync between stages).
|
||||||
|
pub fn profile(&self, header: &[u8]) -> Result<()> {
|
||||||
|
use log::info;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
let mid = BatchHasher::new(header).midstate();
|
||||||
|
let hash_state = Ulong8::new(
|
||||||
|
mid[0], mid[1], mid[2], mid[3], mid[4], mid[5], mid[6], mid[7],
|
||||||
|
);
|
||||||
|
let nonce = u32::from_le_bytes(header[136..140].try_into().unwrap()) as u64;
|
||||||
|
let q = self.pq.queue();
|
||||||
|
let stage = |label: &str, t: Instant| -> Result<()> {
|
||||||
|
q.finish().map_err(|e| anyhow!("{label} failed: {e}"))?;
|
||||||
|
info!(" {label:14} {:6.1} ms", t.elapsed().as_secs_f64() * 1000.0);
|
||||||
|
Ok(())
|
||||||
|
};
|
||||||
|
|
||||||
|
let t = Instant::now();
|
||||||
|
let clear = self.build_kernel("clearCounter", hash_state, nonce)?;
|
||||||
|
unsafe {
|
||||||
|
clear.cmd().global_work_size(COUNTERS_U32 / 4).enq()?;
|
||||||
|
}
|
||||||
|
stage("clear", t)?;
|
||||||
|
|
||||||
|
let t = Instant::now();
|
||||||
|
let blake = self.build_kernel("blake", hash_state, nonce)?;
|
||||||
|
unsafe {
|
||||||
|
blake.cmd().global_work_size(BLAKE_CALLS).local_work_size(WG_BLAKE).enq()?;
|
||||||
|
}
|
||||||
|
stage("blake", t)?;
|
||||||
|
|
||||||
|
for r in 1..=7 {
|
||||||
|
let t = Instant::now();
|
||||||
|
let k = self.build_kernel(&format!("round{r}"), hash_state, nonce)?;
|
||||||
|
unsafe {
|
||||||
|
k.cmd()
|
||||||
|
.global_work_size(ROUND_BUCKETS[r - 1] * 4 * WG_ROUND)
|
||||||
|
.local_work_size(WG_ROUND)
|
||||||
|
.enq()?;
|
||||||
|
}
|
||||||
|
stage(&format!("round {r}"), t)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
+41
-17
@@ -1,11 +1,12 @@
|
|||||||
//! GPU device probing for the config tool (`jackpotminer-config` only — this is
|
//! GPU device probing for the config tool (`jackpotminer-config` only — this is
|
||||||
//! not compiled into the miner, so there is no duplicate FFI).
|
//! not compiled into the miner, so there is no duplicate FFI).
|
||||||
//!
|
//!
|
||||||
//! With the `gpu`/`cuda` features the OpenCL/CUDA SDKs are linked in (build.rs
|
//! With the `gpu`/`cuda` features the tool enumerates devices directly — handy
|
||||||
//! links `cuda`/`nvml`; the `ocl` crate links `OpenCL`), and the tool enumerates
|
//! on Windows where you may not want to shell out to the miner. OpenCL goes
|
||||||
//! devices directly — handy on Windows where you may not want to shell out to
|
//! through the `ocl` crate; CUDA is `dlopen`'d at runtime (so this binary, like
|
||||||
//! the miner. Without those features the functions return empty lists and the
|
//! the miner, has no build- or load-time dependency on libcuda). Without those
|
||||||
//! tool falls back to spawning `jackpotminer --devices-json`.
|
//! features the functions return empty lists and the tool falls back to spawning
|
||||||
|
//! `jackpotminer --devices-json`.
|
||||||
|
|
||||||
/// True when at least one GPU SDK is compiled in, so direct probing works.
|
/// True when at least one GPU SDK is compiled in, so direct probing works.
|
||||||
pub const HAS_SDK: bool = cfg!(feature = "gpu") || cfg!(feature = "cuda");
|
pub const HAS_SDK: bool = cfg!(feature = "gpu") || cfg!(feature = "cuda");
|
||||||
@@ -34,34 +35,57 @@ pub fn opencl() -> Vec<String> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// CUDA devices as `"[i] <name>"` via the driver API (empty without the SDK, no
|
/// CUDA devices as `"[i] <name>"` via the driver API (empty without the SDK, no
|
||||||
/// driver, or any error). Uses a tiny self-contained FFI subset.
|
/// driver, or any error). The CUDA driver is `dlopen`'d at runtime — a tiny
|
||||||
|
/// self-contained subset of the FFI in `src/cuda.rs` — so this binary needs no
|
||||||
|
/// link- or load-time libcuda.
|
||||||
#[cfg(feature = "cuda")]
|
#[cfg(feature = "cuda")]
|
||||||
pub fn cuda() -> Vec<String> {
|
pub fn cuda() -> Vec<String> {
|
||||||
use std::ffi::CStr;
|
use std::ffi::CStr;
|
||||||
use std::os::raw::{c_char, c_int, c_uint};
|
use std::os::raw::{c_char, c_int, c_uint};
|
||||||
|
|
||||||
// Linked via build.rs (`cuda`), matching src/cuda.rs's declarations.
|
type CuInit = unsafe extern "C" fn(c_uint) -> c_int;
|
||||||
extern "C" {
|
type CuCount = unsafe extern "C" fn(*mut c_int) -> c_int;
|
||||||
fn cuInit(flags: c_uint) -> c_int;
|
type CuGet = unsafe extern "C" fn(*mut c_int, c_int) -> c_int;
|
||||||
fn cuDeviceGetCount(count: *mut c_int) -> c_int;
|
type CuName = unsafe extern "C" fn(*mut c_char, c_int, c_int) -> c_int;
|
||||||
fn cuDeviceGet(device: *mut c_int, ordinal: c_int) -> c_int;
|
|
||||||
fn cuDeviceGetName(name: *mut c_char, len: c_int, dev: c_int) -> c_int;
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut out = Vec::new();
|
let mut out = Vec::new();
|
||||||
unsafe {
|
unsafe {
|
||||||
if cuInit(0) != 0 {
|
// libcuda.so.1 ships with the NVIDIA driver; absent on AMD-only hosts.
|
||||||
|
let lib = match ["libcuda.so.1", "libcuda.so", "nvcuda.dll"]
|
||||||
|
.iter()
|
||||||
|
.find_map(|n| libloading::Library::new(n).ok())
|
||||||
|
{
|
||||||
|
Some(l) => l,
|
||||||
|
None => return out,
|
||||||
|
};
|
||||||
|
let sym = |name: &[u8]| -> Option<*mut std::ffi::c_void> {
|
||||||
|
lib.get::<*mut std::ffi::c_void>(name).ok().map(|s| *s)
|
||||||
|
};
|
||||||
|
let (Some(init), Some(count), Some(get), Some(getname)) = (
|
||||||
|
sym(b"cuInit\0"),
|
||||||
|
sym(b"cuDeviceGetCount\0"),
|
||||||
|
sym(b"cuDeviceGet\0"),
|
||||||
|
sym(b"cuDeviceGetName\0"),
|
||||||
|
) else {
|
||||||
|
return out;
|
||||||
|
};
|
||||||
|
let cu_init: CuInit = std::mem::transmute(init);
|
||||||
|
let cu_count: CuCount = std::mem::transmute(count);
|
||||||
|
let cu_get: CuGet = std::mem::transmute(get);
|
||||||
|
let cu_name: CuName = std::mem::transmute(getname);
|
||||||
|
|
||||||
|
if cu_init(0) != 0 {
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
let mut n: c_int = 0;
|
let mut n: c_int = 0;
|
||||||
if cuDeviceGetCount(&mut n) != 0 {
|
if cu_count(&mut n) != 0 {
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
for i in 0..n {
|
for i in 0..n {
|
||||||
let mut dev: c_int = 0;
|
let mut dev: c_int = 0;
|
||||||
let name = if cuDeviceGet(&mut dev, i) == 0 {
|
let name = if cu_get(&mut dev, i) == 0 {
|
||||||
let mut buf = [0i8; 128];
|
let mut buf = [0i8; 128];
|
||||||
if cuDeviceGetName(buf.as_mut_ptr() as *mut c_char, 128, dev) == 0 {
|
if cu_name(buf.as_mut_ptr() as *mut c_char, 128, dev) == 0 {
|
||||||
CStr::from_ptr(buf.as_ptr() as *const c_char).to_string_lossy().into_owned()
|
CStr::from_ptr(buf.as_ptr() as *const c_char).to_string_lossy().into_owned()
|
||||||
} else {
|
} else {
|
||||||
format!("CUDA device {i}")
|
format!("CUDA device {i}")
|
||||||
|
|||||||
+116
-6
@@ -14,6 +14,14 @@ mod tui;
|
|||||||
#[cfg(feature = "gpu")]
|
#[cfg(feature = "gpu")]
|
||||||
mod gpu;
|
mod gpu;
|
||||||
|
|
||||||
|
// AMD-tuned OpenCL kernel driver (selected by GpuSolver for AMD-vendor devices).
|
||||||
|
#[cfg(feature = "gpu")]
|
||||||
|
mod gpu_amd;
|
||||||
|
|
||||||
|
// Runtime dynamic-library loader (dlopen) for the CUDA driver + NVML.
|
||||||
|
#[cfg(feature = "cuda")]
|
||||||
|
mod dylib;
|
||||||
|
|
||||||
#[cfg(feature = "cuda")]
|
#[cfg(feature = "cuda")]
|
||||||
mod cuda;
|
mod cuda;
|
||||||
|
|
||||||
@@ -79,8 +87,9 @@ struct Args {
|
|||||||
jackpot: Option<u32>,
|
jackpot: Option<u32>,
|
||||||
|
|
||||||
/// Pause mining if no new job arrives within this many seconds (stale work
|
/// Pause mining if no new job arrives within this many seconds (stale work
|
||||||
/// guard); resumes automatically when fresh work arrives. 0 disables.
|
/// guard); resumes automatically when fresh work arrives. Default 600 (10
|
||||||
#[arg(long, value_name = "SECS", default_value_t = 300)]
|
/// minutes). 0 disables.
|
||||||
|
#[arg(long, value_name = "SECS", default_value_t = 600)]
|
||||||
job_timeout: u64,
|
job_timeout: u64,
|
||||||
|
|
||||||
/// Open a local control server on 127.0.0.1:<PORT> so the GUI config tool can
|
/// Open a local control server on 127.0.0.1:<PORT> so the GUI config tool can
|
||||||
@@ -139,8 +148,11 @@ struct Args {
|
|||||||
#[arg(long, default_value = "all")]
|
#[arg(long, default_value = "all")]
|
||||||
devices: String,
|
devices: String,
|
||||||
|
|
||||||
/// GPU backend: "opencl" or "cuda" (for nvidia cards).
|
/// GPU backend: "mixed" (default — each card on its native backend: NVIDIA
|
||||||
#[arg(long, default_value = "cuda")]
|
/// on CUDA, AMD/Intel on OpenCL), "opencl" (every card via OpenCL), or
|
||||||
|
/// "cuda" (NVIDIA only). In mixed mode `--devices` indexes the combined list
|
||||||
|
/// shown by --list-devices.
|
||||||
|
#[arg(long, default_value = "mixed")]
|
||||||
backend: String,
|
backend: String,
|
||||||
|
|
||||||
/// Force the OpenCL backend, disabling CUDA (overrides --backend).
|
/// Force the OpenCL backend, disabling CUDA (overrides --backend).
|
||||||
@@ -610,6 +622,9 @@ fn main() -> Result<()> {
|
|||||||
/// Which GPU backend the user selected.
|
/// Which GPU backend the user selected.
|
||||||
enum BackendKind {
|
enum BackendKind {
|
||||||
Cpu,
|
Cpu,
|
||||||
|
/// Each physical card on its native backend (NVIDIA→CUDA, others→OpenCL).
|
||||||
|
#[cfg(any(feature = "gpu", feature = "cuda"))]
|
||||||
|
Mixed,
|
||||||
#[cfg(feature = "gpu")]
|
#[cfg(feature = "gpu")]
|
||||||
OpenCl,
|
OpenCl,
|
||||||
#[cfg(feature = "cuda")]
|
#[cfg(feature = "cuda")]
|
||||||
@@ -633,6 +648,16 @@ fn backend_kind(args: &Args) -> Result<BackendKind> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
match args.backend.to_ascii_lowercase().as_str() {
|
match args.backend.to_ascii_lowercase().as_str() {
|
||||||
|
"mixed" => {
|
||||||
|
// Each card on its native backend; falls back to whatever single GPU
|
||||||
|
// backend is compiled, or to CPU when none is.
|
||||||
|
#[cfg(any(feature = "gpu", feature = "cuda"))]
|
||||||
|
{
|
||||||
|
Ok(BackendKind::Mixed)
|
||||||
|
}
|
||||||
|
#[cfg(not(any(feature = "gpu", feature = "cuda")))]
|
||||||
|
Ok(BackendKind::Cpu)
|
||||||
|
}
|
||||||
"cuda" => {
|
"cuda" => {
|
||||||
#[cfg(feature = "cuda")]
|
#[cfg(feature = "cuda")]
|
||||||
{
|
{
|
||||||
@@ -649,7 +674,7 @@ fn backend_kind(args: &Args) -> Result<BackendKind> {
|
|||||||
#[cfg(not(feature = "gpu"))]
|
#[cfg(not(feature = "gpu"))]
|
||||||
Ok(BackendKind::Cpu)
|
Ok(BackendKind::Cpu)
|
||||||
}
|
}
|
||||||
other => Err(anyhow!("unknown --backend '{other}' (expected opencl or cuda)")),
|
other => Err(anyhow!("unknown --backend '{other}' (expected mixed, opencl, or cuda)")),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -707,6 +732,8 @@ fn backend_specs(args: &Args, gpu_devices: &[GpuDeviceCfg]) -> Result<Vec<Backen
|
|||||||
let clamp = (args.cpu_clamp != 0).then_some(args.cpu_clamp);
|
let clamp = (args.cpu_clamp != 0).then_some(args.cpu_clamp);
|
||||||
return Ok(vec![BackendSpec::Cpu(clamp)]);
|
return Ok(vec![BackendSpec::Cpu(clamp)]);
|
||||||
}
|
}
|
||||||
|
// Mixed builds its own unified list (each card on its native backend).
|
||||||
|
BackendKind::Mixed => return mixed_specs(args),
|
||||||
#[cfg(feature = "cuda")]
|
#[cfg(feature = "cuda")]
|
||||||
BackendKind::Cuda => (cuda::device_count()?, true),
|
BackendKind::Cuda => (cuda::device_count()?, true),
|
||||||
#[cfg(feature = "gpu")]
|
#[cfg(feature = "gpu")]
|
||||||
@@ -735,6 +762,71 @@ fn backend_specs(args: &Args, gpu_devices: &[GpuDeviceCfg]) -> Result<Vec<Backen
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Build the unified device list for the `mixed` backend as `(label, spec)`
/// pairs: every physical GPU appears once, on its native backend.
///
/// CUDA devices come first. OpenCL devices follow, except that NVIDIA-vendor
/// OpenCL devices are dropped whenever CUDA reported at least one device, so no
/// card is mined twice; when CUDA is unavailable they stay on OpenCL. Shared by
/// [`mixed_specs`] and [`list_devices`]; `--devices` indexes into this list.
#[cfg(any(feature = "gpu", feature = "cuda"))]
fn mixed_plan() -> Vec<(String, BackendSpec)> {
    /// Remove a leading `"[<n>] "` index prefix from a backend's device label so
    /// the mixed list can show its own single index instead of two.
    fn strip_index(label: &str) -> &str {
        match label.strip_prefix('[').and_then(|tail| tail.split_once("] ")) {
            Some((_, name)) => name,
            None => label,
        }
    }

    #[allow(unused_mut)]
    let mut plan: Vec<(String, BackendSpec)> = Vec::new();

    // NVIDIA cards via CUDA, when the backend is compiled and the driver loads.
    #[cfg(feature = "cuda")]
    let cuda_has_nvidia = {
        let cuda_labels = cuda::list_devices().unwrap_or_default();
        plan.extend(cuda_labels.iter().enumerate().map(|(ordinal, label)| {
            (
                format!("{} (CUDA)", strip_index(label)),
                BackendSpec::Cuda(ordinal),
            )
        }));
        !cuda_labels.is_empty()
    };
    #[cfg(not(feature = "cuda"))]
    let cuda_has_nvidia = false;

    // Remaining OpenCL cards via OpenCL; skip NVIDIA ones already on CUDA.
    #[cfg(feature = "gpu")]
    {
        let opencl_labels = gpu::list_devices().unwrap_or_default();
        let is_nvidia = gpu::device_is_nvidia();
        for (flat_idx, label) in opencl_labels.iter().enumerate() {
            let already_on_cuda =
                cuda_has_nvidia && is_nvidia.get(flat_idx).copied().unwrap_or(false);
            if !already_on_cuda {
                plan.push((
                    format!("{} (OpenCL)", strip_index(label)),
                    BackendSpec::Gpu(flat_idx),
                ));
            }
        }
    }
    // `cuda_has_nvidia` is only consumed by the OpenCL branch above.
    #[cfg(not(feature = "gpu"))]
    let _ = cuda_has_nvidia;

    plan
}
|
||||||
|
|
||||||
|
/// Build the worker list for `--backend mixed`, with each card on its native
/// backend.
///
/// `--devices` is parsed against the length of [`mixed_plan`]'s unified list and
/// the chosen entries' specs are returned in selection order.
///
/// # Errors
/// Fails when the plan is empty (no GPU detected through CUDA or OpenCL) or
/// when `--devices` cannot be parsed for a list of that length.
#[cfg(any(feature = "gpu", feature = "cuda"))]
fn mixed_specs(args: &Args) -> Result<Vec<BackendSpec>> {
    let plan = mixed_plan();
    if plan.is_empty() {
        return Err(anyhow!(
            "no GPUs found for the mixed backend — none detected via CUDA or OpenCL"
        ));
    }

    let mut specs = Vec::new();
    for idx in parse_devices(&args.devices, plan.len())? {
        specs.push(plan[idx].1);
    }
    Ok(specs)
}
|
||||||
|
|
||||||
/// Build a single GPU worker spec for `idx`, choosing CUDA or OpenCL, erroring if
|
/// Build a single GPU worker spec for `idx`, choosing CUDA or OpenCL, erroring if
|
||||||
/// the requested backend wasn't compiled in.
|
/// the requested backend wasn't compiled in.
|
||||||
#[cfg(any(feature = "gpu", feature = "cuda"))]
|
#[cfg(any(feature = "gpu", feature = "cuda"))]
|
||||||
@@ -821,6 +913,18 @@ fn list_devices() {
|
|||||||
Ok(_) => println!("no CUDA devices found"),
|
Ok(_) => println!("no CUDA devices found"),
|
||||||
Err(e) => println!("error listing CUDA devices: {e}"),
|
Err(e) => println!("error listing CUDA devices: {e}"),
|
||||||
}
|
}
|
||||||
|
// What the default `mixed` backend will mine, and the indices `--devices`
|
||||||
|
// selects from in that mode.
|
||||||
|
#[cfg(any(feature = "gpu", feature = "cuda"))]
|
||||||
|
{
|
||||||
|
let plan = mixed_plan();
|
||||||
|
if !plan.is_empty() {
|
||||||
|
println!("\nMixed backend (--backend mixed, the default) — `--devices` indexes this list:");
|
||||||
|
for (i, (label, _)) in plan.iter().enumerate() {
|
||||||
|
println!(" [{i}] {label}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
#[cfg(not(any(feature = "gpu", feature = "cuda")))]
|
#[cfg(not(any(feature = "gpu", feature = "cuda")))]
|
||||||
println!("built without GPU support (rebuild with the `gpu` or `cuda` feature)");
|
println!("built without GPU support (rebuild with the `gpu` or `cuda` feature)");
|
||||||
}
|
}
|
||||||
@@ -886,7 +990,10 @@ fn selftest(gpu_device: usize) -> Result<()> {
|
|||||||
let solver = gpu::GpuSolver::new(gpu_device)
|
let solver = gpu::GpuSolver::new(gpu_device)
|
||||||
.with_context(|| format!("init OpenCL device {gpu_device}"))?;
|
.with_context(|| format!("init OpenCL device {gpu_device}"))?;
|
||||||
|
|
||||||
// Spot-check the BLAKE2b kernel against the CPU reference.
|
// Spot-check the BLAKE2b kernel against the CPU reference. The AMD kernel
|
||||||
|
// buckets its round-0 output instead of exposing per-index digests, so
|
||||||
|
// the probe is skipped there (the solve-vs-CPU check below still runs).
|
||||||
|
if solver.supports_blake_probe() {
|
||||||
let outputs = solver.hash_all(&header)?;
|
let outputs = solver.hash_all(&header)?;
|
||||||
let step = params::BLAKE_CALLS / 64;
|
let step = params::BLAKE_CALLS / 64;
|
||||||
for k in 0..64 {
|
for k in 0..64 {
|
||||||
@@ -898,6 +1005,9 @@ fn selftest(gpu_device: usize) -> Result<()> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
info!("GPU BLAKE2b kernel matches CPU");
|
info!("GPU BLAKE2b kernel matches CPU");
|
||||||
|
} else {
|
||||||
|
info!("skipping BLAKE2b kernel probe (AMD kernel buckets round-0 output)");
|
||||||
|
}
|
||||||
|
|
||||||
let gpu_solutions = solver.solve(&header)?;
|
let gpu_solutions = solver.solve(&header)?;
|
||||||
info!("GPU found {} valid solution(s)", gpu_solutions.len());
|
info!("GPU found {} valid solution(s)", gpu_solutions.len());
|
||||||
|
|||||||
+10
-1
@@ -33,7 +33,15 @@ const NVML_CLOCK_MEM: c_int = 2;
|
|||||||
// nvmlTemperatureSensors_t
|
// nvmlTemperatureSensors_t
|
||||||
const NVML_TEMPERATURE_GPU: c_int = 0;
|
const NVML_TEMPERATURE_GPU: c_int = 0;
|
||||||
|
|
||||||
extern "C" {
|
// NVML, loaded at runtime via dlopen (see `crate::dylib`) rather than linked at
|
||||||
|
// build time — it ships with the NVIDIA driver (`libnvidia-ml.so.1`;
|
||||||
|
// `nvml.dll` on Windows) and is absent on driver-less / AMD-only hosts.
|
||||||
|
// `nvml_lib()` is `None` when it can't be opened; `open()` checks it first and
|
||||||
|
// returns `None` (no tuning) so the rest of the program is unaffected.
|
||||||
|
crate::dylib::dynamic_library! {
|
||||||
|
lib_struct: NvmlLib,
|
||||||
|
loader: nvml_lib,
|
||||||
|
names: ["libnvidia-ml.so.1", "libnvidia-ml.so", "nvml.dll"],
|
||||||
fn nvmlInit_v2() -> nvmlReturn_t;
|
fn nvmlInit_v2() -> nvmlReturn_t;
|
||||||
fn nvmlShutdown() -> nvmlReturn_t;
|
fn nvmlShutdown() -> nvmlReturn_t;
|
||||||
fn nvmlDeviceGetName(device: nvmlDevice_t, name: *mut c_char, length: c_uint) -> nvmlReturn_t;
|
fn nvmlDeviceGetName(device: nvmlDevice_t, name: *mut c_char, length: c_uint) -> nvmlReturn_t;
|
||||||
@@ -69,6 +77,7 @@ unsafe impl Send for NvmlTuner {}
|
|||||||
|
|
||||||
/// Open an NVML control handle for the GPU at `pci_bus_id` (e.g. "0000:01:00.0").
|
/// Open an NVML control handle for the GPU at `pci_bus_id` (e.g. "0000:01:00.0").
|
||||||
pub fn open(pci_bus_id: &str) -> Option<Box<dyn GpuTuner>> {
|
pub fn open(pci_bus_id: &str) -> Option<Box<dyn GpuTuner>> {
|
||||||
|
nvml_lib()?; // NVML not installed → no tuning
|
||||||
let cstr = CString::new(pci_bus_id).ok()?;
|
let cstr = CString::new(pci_bus_id).ok()?;
|
||||||
unsafe {
|
unsafe {
|
||||||
if nvmlInit_v2() != NVML_SUCCESS {
|
if nvmlInit_v2() != NVML_SUCCESS {
|
||||||
|
|||||||
Reference in New Issue
Block a user