diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..69d99ee --- /dev/null +++ b/.dockerignore @@ -0,0 +1,7 @@ +# Keep the build context small: the portable build only needs the sources. +/target +/dist +/.git +/pearl-dump +/alpha-miner +*.log diff --git a/.gitignore b/.gitignore index 7d7ae7c..9810fe2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ # Rust build artifacts /target +# Portable container build output (build-portable.sh) +/dist + # IDE / editor /.idea diff --git a/Cargo.lock b/Cargo.lock index 3213946..7d4fdf4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2236,6 +2236,7 @@ dependencies = [ "eframe", "env_logger", "hex", + "libloading 0.8.9", "log", "num_cpus", "ocl", diff --git a/Cargo.toml b/Cargo.toml index e85ab4e..9e74463 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,13 +23,18 @@ socket2 = "0.5" ocl = { version = "0.19", optional = true } ratatui = "0.30.0" eframe = { version = "0.28", optional = true } +# Runtime loader for the CUDA driver / NVML (dlopen'd, not link-time, so the +# binary has no build- or load-time dependency on libcuda / libnvidia-ml). +libloading = { version = "0.8", optional = true } [features] default = ["gpu", "cuda", "config-gui"] gpu = ["dep:ocl"] # CUDA backend: drives miniZ's embedded Equihash 192,7 fatbin via the CUDA driver -# API. build.rs only links libcuda (no nvcc / kernel compilation needed). -cuda = [] +# API. The driver (libcuda) and NVML are dlopen'd at runtime via libloading, so +# there is no build-time or load-time dependency on them — the binary builds and +# starts on hosts without an NVIDIA driver and simply reports no CUDA devices. +cuda = ["dep:libloading"] # Optional native GUI config editor (the `jackpotminer-config` binary). Off by # default so the miner never pulls in the GUI toolkit. config-gui = ["dep:eframe"] diff --git a/Dockerfile.portable b/Dockerfile.portable new file mode 100644 index 0000000..f2396a6 --- /dev/null +++ b/Dockerfile.portable @@ -0,0 +1,40 @@ +# Portable jackpotminer build. +# +# Links against Debian bullseye's glibc 2.31 (released 2020), so the resulting +# binary runs on essentially any Linux from the last several years instead of +# requiring the build host's (much newer) glibc. This is the real fix for the +# "version `GLIBC_2.39' not found" class of errors — static linking can't solve +# it for a GPU build, because the GPU driver libraries are glibc-only and load +# at runtime. +# +# The CUDA driver and NVML are dlopen'd at runtime (see src/dylib.rs), so this +# build needs NO NVIDIA toolkit — only the OpenCL ICD loader (to link libOpenCL). +# The result is one binary that drives AMD cards (OpenCL) and NVIDIA cards (CUDA, +# loaded if libcuda.so.1 is present at runtime). +# +# Build: DOCKER_BUILDKIT=1 docker build -f Dockerfile.portable \ +# --output type=local,dest=dist . +# or just: ./build-portable.sh +# Output: dist/jackpotminer + +FROM debian:bullseye-slim AS build +ENV DEBIAN_FRONTEND=noninteractive +# gcc/g++ for the linker; ocl-icd-opencl-dev provides libOpenCL.so for linking +# (the runtime host supplies its own libOpenCL.so.1 via its GPU driver). +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates curl gcc g++ make pkg-config ocl-icd-opencl-dev \ + && rm -rf /var/lib/apt/lists/* +# Minimal stable Rust toolchain. +RUN curl -fsSL https://sh.rustup.rs | sh -s -- -y --profile minimal --default-toolchain stable +ENV PATH="/root/.cargo/bin:${PATH}" + +WORKDIR /src +COPY . . +# Miner only (no GUI config tool, to avoid pulling X11/Wayland/GL into the +# build): AMD OpenCL + dlopen'd CUDA. `--locked` keeps it reproducible. +RUN cargo build --release --locked --no-default-features --features gpu,cuda \ + && strip target/release/jackpotminer + +# Export just the binary to the build output directory. +FROM scratch AS export +COPY --from=build /src/target/release/jackpotminer /jackpotminer diff --git a/README.md b/README.md index edc2ee5..1d8f62e 100644 --- a/README.md +++ b/README.md @@ -98,16 +98,45 @@ reverse-engineered Equihash 192,7 solver — see "CUDA backend" below. ## Build -Requirements: a Rust toolchain and an OpenCL runtime (the NVIDIA driver ships -`libOpenCL`). The CUDA backend only needs `libcuda` (the NVIDIA driver) — the -fatbin and launch trace it drives are embedded in the binary, so no CUDA toolkit -or `nvcc` is required. +Requirements: a Rust toolchain and, for the OpenCL backend, the OpenCL ICD +loader (`libOpenCL` — e.g. `ocl-icd-opencl-dev` on Debian/Ubuntu; the NVIDIA and +AMD drivers also ship it). The CUDA driver and NVML are **`dlopen`'d at runtime** +(see `src/dylib.rs`), so the `cuda` feature needs no NVIDIA toolkit or libs to +build, and a `cuda`-enabled binary still builds and starts on hosts without an +NVIDIA driver — it simply reports no CUDA devices. The fatbin and launch trace +the CUDA backend drives are embedded, so no `nvcc` is required either. ```bash -cargo build --release # OpenCL backend (default) -cargo build --release --features cuda # OpenCL + CUDA backends -cargo build --release --no-default-features --features cuda # CUDA only -cargo build --release --no-default-features # CPU-only (no GPU) +cargo build --release # default: OpenCL + CUDA + GUI config tool +cargo build --release --no-default-features --features gpu,cuda # miner only, both GPU backends +cargo build --release --no-default-features --features gpu # OpenCL only (AMD/Intel/NVIDIA) +cargo build --release --no-default-features --features cuda # CUDA only +cargo build --release --no-default-features # CPU-only (no GPU) +``` + +### Portable / distributable builds + +The miner's only runtime dependencies are the C library and the OpenCL ICD loader +(`libOpenCL.so.1`, present wherever a GPU driver is); CUDA/NVML are loaded on +demand. So the main compatibility risk when shipping a Linux binary is the +**glibc version** it was built against — not the GPU libraries. To build one that +runs on older distros, compile against an old glibc in a container: + +```bash +./build-portable.sh # → dist/jackpotminer (Docker, or ENGINE=podman) +``` + +This links against Debian bullseye's glibc 2.31 (runs on most Linux from ~2020 +on) and yields a single miner that drives both AMD (OpenCL) and NVIDIA (CUDA) +cards. See `Dockerfile.portable`. + +A fully *static* GPU binary isn't possible: the OpenCL/CUDA driver libraries are +glibc-only and must load at runtime. For a zero-dependency binary that runs +anywhere, build the **CPU-only** miner against musl: + +```bash +rustup target add x86_64-unknown-linux-musl +cargo build --release --target x86_64-unknown-linux-musl --no-default-features ``` ### CUDA backend (miniZ fatbin replay) @@ -181,7 +210,7 @@ on clean shutdown**. The per-card stats line shows live `Sol/s`, board `W`, and ## Usage ```bash -# List OpenCL devices +# List devices (and the default "mixed" backend's combined index list) ./target/release/jackpotminer --list-devices # Mine on one GPU @@ -195,8 +224,11 @@ on clean shutdown**. The per-card stats line shows live `Sol/s`, board `W`, and ./target/release/jackpotminer --url ... --user ... --devices 0,1 ./target/release/jackpotminer --url ... --user ... --devices all -# Use the CUDA backend instead of OpenCL (needs a --features cuda build) -./target/release/jackpotminer --url ... --user ... --backend cuda --devices all +# Default backend is "mixed": NVIDIA cards run on CUDA, AMD/Intel on OpenCL — +# so an AMD + NVIDIA rig just works. --devices indexes the combined list from +# --list-devices. Pin a single backend for every card with: +./target/release/jackpotminer --url ... --user ... --backend opencl # all via OpenCL +./target/release/jackpotminer --url ... --user ... --backend cuda # NVIDIA only # Force the CPU backend ./target/release/jackpotminer --url ... --user ... --cpu diff --git a/build-portable.sh b/build-portable.sh new file mode 100755 index 0000000..c73c7a5 --- /dev/null +++ b/build-portable.sh @@ -0,0 +1,36 @@ +#!/bin/sh +# Build a portable jackpotminer binary inside an old-glibc (Debian bullseye, +# glibc 2.31) container, so it runs on essentially any recent Linux regardless of +# the build host's glibc. CUDA is dlopen'd at runtime, so no NVIDIA toolkit is +# needed to build; the binary drives both AMD (OpenCL) and NVIDIA (CUDA) cards. +# +# Output: dist/jackpotminer +# +# Works with Docker (BuildKit) or Podman. Override the engine with ENGINE=podman. +set -eu + +ENGINE="${ENGINE:-docker}" +OUT="${OUT:-dist}" + +mkdir -p "$OUT" + +case "$ENGINE" in + podman) + # Podman builds the final `scratch` stage; extract the binary from it. + podman build -f Dockerfile.portable -t jackpotminer-portable . + cid=$(podman create jackpotminer-portable) + podman cp "$cid:/jackpotminer" "$OUT/jackpotminer" + podman rm "$cid" >/dev/null + ;; + *) + DOCKER_BUILDKIT=1 "$ENGINE" build -f Dockerfile.portable \ + --output "type=local,dest=$OUT" . + ;; +esac + +chmod +x "$OUT/jackpotminer" +echo +echo "Built $OUT/jackpotminer" +command -v file >/dev/null 2>&1 && file "$OUT/jackpotminer" || true +echo "Minimum glibc / dynamic deps:" +{ objdump -T "$OUT/jackpotminer" 2>/dev/null | grep -oE 'GLIBC_[0-9.]+' | sort -V | tail -1; } || true diff --git a/build.rs b/build.rs deleted file mode 100644 index 4fdd6a0..0000000 --- a/build.rs +++ /dev/null @@ -1,53 +0,0 @@ -//! Build script for the CUDA backend. -//! -//! The `cuda` feature links the CUDA driver API (`cuda`) and NVML (for -//! clock/power control + readout). The backend drives miniZ's embedded fatbin -//! (`src/miniz/equihash192_7.fatbin`) via the driver API, so no nvcc / kernel -//! compilation is needed at build time. (The default OpenCL backend needs no -//! build-script support — `ocl` links `OpenCL` itself, cross-platform.) -//! -//! Linking is target-aware so the `cuda` feature builds on both Linux and -//! Windows: -//! - Linux: `libcuda.so` + `libnvidia-ml.so` from the system / toolkit dirs. -//! - Windows: `cuda.lib` + `nvml.lib` from `%CUDA_PATH%\lib\x64`. - -use std::path::Path; - -fn main() { - println!("cargo:rerun-if-changed=build.rs"); - - if std::env::var("CARGO_FEATURE_CUDA").is_ok() { - // Use the *target* OS (correct for cross-compilation too), not the host. - let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); - link_cuda_driver(&target_os); - } -} - -/// Link the CUDA driver library plus NVML. The backend loads the embedded miniZ -/// fatbin at runtime, so there is nothing to compile here. -fn link_cuda_driver(target_os: &str) { - if target_os == "windows" { - // CUDA Toolkit import libraries (cuda.lib, nvml.lib). - println!("cargo:rerun-if-env-changed=CUDA_PATH"); - if let Ok(cuda_path) = std::env::var("CUDA_PATH") { - println!("cargo:rustc-link-search=native={cuda_path}\\lib\\x64"); - } - // Driver API import lib is `cuda.lib`; NVML is `nvml.lib` (nvml.dll - // ships with the NVIDIA driver). - println!("cargo:rustc-link-lib=dylib=cuda"); - println!("cargo:rustc-link-lib=dylib=nvml"); - } else { - for dir in ["/usr/lib64", "/usr/lib", "/opt/cuda/lib64"] { - if Path::new(dir).exists() { - println!("cargo:rustc-link-search=native={dir}"); - } - } - // GNU ld: embed an rpath so libcuda is found at runtime (Linux only — - // MSVC's linker rejects `-Wl,...`). - if target_os == "linux" { - println!("cargo:rustc-link-arg=-Wl,-rpath,/opt/cuda/lib64"); - } - println!("cargo:rustc-link-lib=dylib=cuda"); - println!("cargo:rustc-link-lib=dylib=nvidia-ml"); - } -} diff --git a/kernels/equihash192_7.cl b/kernels/equihash192_7.cl new file mode 100644 index 0000000..6db2719 --- /dev/null +++ b/kernels/equihash192_7.cl @@ -0,0 +1,2110 @@ + + +//#define PRINT 1 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +__constant ulong blake_iv[] = +{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +inline static uint2 ror64(const uint2 x, const uint y) +{ + return (uint2)(((x).x>>y)^((x).y<<(32-y)),((x).y>>y)^((x).x<<(32-y))); +} +inline static uint2 ror64_2(const uint2 x, const uint y) +{ + return (uint2)(((x).y>>(y-32))^((x).x<<(64-y)),((x).x>>(y-32))^((x).y<<(64-y))); +} + +#define gFunc(va, vb, vc, vd, x, y) \ +va = (va + vb + x); \ +((uint2*)&vd)[0] = ((uint2*)&vd)[0].yx ^ ((uint2*)&va)[0].yx; \ +vc = (vc + vd); \ +((uint2*)&vb)[0] = ror64( ((uint2*)&vb)[0] ^ ((uint2*)&vc)[0], 24U); \ +va = (va + vb + y); \ +((uint2*)&vd)[0] = ror64( ((uint2*)&vd)[0] ^ ((uint2*)&va)[0], 16U); \ +vc = (vc + vd); \ +((uint2*)&vb)[0] = ror64_2( ((uint2*)&vb)[0] ^ ((uint2*)&vc)[0], 63U); + +#define gFunc0(va, vb, vc, vd) \ +va = (va + vb); \ +((uint2*)&vd)[0] = ((uint2*)&vd)[0].yx ^ ((uint2*)&va)[0].yx; \ +vc = (vc + vd); \ +((uint2*)&vb)[0] = ror64( ((uint2*)&vb)[0] ^ ((uint2*)&vc)[0], 24U); \ +va = (va + vb); \ +((uint2*)&vd)[0] = ror64( ((uint2*)&vd)[0] ^ ((uint2*)&va)[0], 16U); \ +vc = (vc + vd); \ +((uint2*)&vb)[0] = ror64_2( ((uint2*)&vb)[0] ^ ((uint2*)&vc)[0], 63U); + + +inline uint swapByteEndian(uint input) { + uint tmp0 = input & 0x0F0F0F0F; + uint tmp1 = input & 0xF0F0F0F0; + + tmp0 = tmp0 << 4; + tmp1 = tmp1 >> 4; + + uint tmpIn = tmp0 | tmp1; + + tmp0 = tmpIn & 0x33333333; + tmp1 = tmpIn & 0xCCCCCCCC; + + tmp0 = tmp0 << 2; + tmp1 = tmp1 >> 2; + + tmpIn = tmp0 | tmp1; + + tmp0 = tmpIn & 0x55555555; + tmp1 = tmpIn & 0xAAAAAAAA; + + tmp0 = tmp0 << 1; + tmp1 = tmp1 >> 1; + + return tmp0 | tmp1; +} + + +uint8 shr_7(uint8 input, uint sh0, uint sh1) { + uint8 tmp = (input >> sh0); + uint8 tmp2 = (input << 32-sh0); + + tmp.s0 = input.s0 >> sh1; + + tmp.s0123 |= tmp2.s1234; + tmp.s45 |= tmp2.s56; + + tmp.s7 = input.s7; + return tmp; +} + + +void round0(ulong8 blake_state, __global uint8 *resultsHi, __global uint2 *resultsLo , __global uint *counters, uint tId, uint gId) { + ulong v[16]; + + ulong word1 = ((ulong)tId << 32) | gId; + // init vector v + v[0] = blake_state.s0; + v[1] = blake_state.s1; + v[2] = blake_state.s2; + v[3] = blake_state.s3; + v[4] = blake_state.s4; + v[5] = blake_state.s5; + v[6] = blake_state.s6; + v[7] = blake_state.s7; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4]; + v[13] = blake_iv[5]; + v[14] = blake_iv[6]; + v[15] = blake_iv[7]; + // gFunc in length of data + v[12] ^= 144 /* length of "i" */; + // last block + v[14] ^= (ulong)-1; + + // round 1 + gFunc(v[0], v[4], v[8], v[12], 0, word1); + gFunc0(v[1], v[5], v[9], v[13]); + gFunc0(v[2], v[6], v[10], v[14]); + gFunc0(v[3], v[7], v[11], v[15]); + gFunc0(v[0], v[5], v[10], v[15]); + gFunc0(v[1], v[6], v[11], v[12]); + gFunc0(v[2], v[7], v[8], v[13]); + gFunc0(v[3], v[4], v[9], v[14]); + // round 2 + gFunc0(v[0], v[4], v[8], v[12]); + gFunc0(v[1], v[5], v[9], v[13]); + gFunc0(v[2], v[6], v[10], v[14]); + gFunc0(v[3], v[7], v[11], v[15]); + gFunc(v[0], v[5], v[10], v[15], word1, 0); + gFunc0(v[1], v[6], v[11], v[12]); + gFunc0(v[2], v[7], v[8], v[13]); + gFunc0(v[3], v[4], v[9], v[14]); + // round 3 + gFunc0(v[0], v[4], v[8], v[12]); + gFunc0(v[1], v[5], v[9], v[13]); + gFunc0(v[2], v[6], v[10], v[14]); + gFunc0(v[3], v[7], v[11], v[15]); + gFunc0(v[0], v[5], v[10], v[15]); + gFunc0(v[1], v[6], v[11], v[12]); + gFunc(v[2], v[7], v[8], v[13], 0, word1); + gFunc0(v[3], v[4], v[9], v[14]); + // round 4 + gFunc0(v[0], v[4], v[8], v[12]); + gFunc(v[1], v[5], v[9], v[13], 0, word1); + gFunc0(v[2], v[6], v[10], v[14]); + gFunc0(v[3], v[7], v[11], v[15]); + gFunc0(v[0], v[5], v[10], v[15]); + gFunc0(v[1], v[6], v[11], v[12]); + gFunc0(v[2], v[7], v[8], v[13]); + gFunc0(v[3], v[4], v[9], v[14]); + // round 5 + gFunc0(v[0], v[4], v[8], v[12]); + gFunc0(v[1], v[5], v[9], v[13]); + gFunc0(v[2], v[6], v[10], v[14]); + gFunc0(v[3], v[7], v[11], v[15]); + gFunc(v[0], v[5], v[10], v[15], 0, word1); + gFunc0(v[1], v[6], v[11], v[12]); + gFunc0(v[2], v[7], v[8], v[13]); + gFunc0(v[3], v[4], v[9], v[14]); + // round 6 + gFunc0(v[0], v[4], v[8], v[12]); + gFunc0(v[1], v[5], v[9], v[13]); + gFunc0(v[2], v[6], v[10], v[14]); + gFunc0(v[3], v[7], v[11], v[15]); + gFunc0(v[0], v[5], v[10], v[15]); + gFunc0(v[1], v[6], v[11], v[12]); + gFunc0(v[2], v[7], v[8], v[13]); + gFunc(v[3], v[4], v[9], v[14], word1, 0); + // round 7 + gFunc0(v[0], v[4], v[8], v[12]); + gFunc(v[1], v[5], v[9], v[13], word1, 0); + gFunc0(v[2], v[6], v[10], v[14]); + gFunc0(v[3], v[7], v[11], v[15]); + gFunc0(v[0], v[5], v[10], v[15]); + gFunc0(v[1], v[6], v[11], v[12]); + gFunc0(v[2], v[7], v[8], v[13]); + gFunc0(v[3], v[4], v[9], v[14]); + // round 8 + gFunc0(v[0], v[4], v[8], v[12]); + gFunc0(v[1], v[5], v[9], v[13]); + gFunc(v[2], v[6], v[10], v[14], 0, word1); + gFunc0(v[3], v[7], v[11], v[15]); + gFunc0(v[0], v[5], v[10], v[15]); + gFunc0(v[1], v[6], v[11], v[12]); + gFunc0(v[2], v[7], v[8], v[13]); + gFunc0(v[3], v[4], v[9], v[14]); + // round 9 + gFunc0(v[0], v[4], v[8], v[12]); + gFunc0(v[1], v[5], v[9], v[13]); + gFunc0(v[2], v[6], v[10], v[14]); + gFunc0(v[3], v[7], v[11], v[15]); + gFunc0(v[0], v[5], v[10], v[15]); + gFunc0(v[1], v[6], v[11], v[12]); + gFunc(v[2], v[7], v[8], v[13], word1, 0); + gFunc0(v[3], v[4], v[9], v[14]); + // round 10 + gFunc0(v[0], v[4], v[8], v[12]); + gFunc0(v[1], v[5], v[9], v[13]); + gFunc0(v[2], v[6], v[10], v[14]); + gFunc(v[3], v[7], v[11], v[15], word1, 0); + gFunc0(v[0], v[5], v[10], v[15]); + gFunc0(v[1], v[6], v[11], v[12]); + gFunc0(v[2], v[7], v[8], v[13]); + gFunc0(v[3], v[4], v[9], v[14]); + // round 11 + gFunc(v[0], v[4], v[8], v[12], 0, word1); + gFunc0(v[1], v[5], v[9], v[13]); + gFunc0(v[2], v[6], v[10], v[14]); + gFunc0(v[3], v[7], v[11], v[15]); + gFunc0(v[0], v[5], v[10], v[15]); + gFunc0(v[1], v[6], v[11], v[12]); + gFunc0(v[2], v[7], v[8], v[13]); + gFunc0(v[3], v[4], v[9], v[14]); + // round 12 + gFunc0(v[0], v[4], v[8], v[12]); + gFunc0(v[1], v[5], v[9], v[13]); + gFunc0(v[2], v[6], v[10], v[14]); + gFunc0(v[3], v[7], v[11], v[15]); + gFunc(v[0], v[5], v[10], v[15], word1, 0); + gFunc0(v[1], v[6], v[11], v[12]); + gFunc0(v[2], v[7], v[8], v[13]); + gFunc0(v[3], v[4], v[9], v[14]); + + v[0] = v[0] ^ blake_state.s0 ^ v[8]; + v[1] = v[1] ^ blake_state.s1 ^ v[9]; + v[2] = v[2] ^ blake_state.s2 ^ v[10]; + v[3] = v[3] ^ blake_state.s3 ^ v[11]; + v[4] = v[4] ^ blake_state.s4 ^ v[12]; + v[5] = v[5] ^ blake_state.s5 ^ v[13]; + v[6] = v[6] ^ blake_state.s6 ^ v[14]; + v[7] = v[7] ^ blake_state.s7 ^ v[15]; + + uint8 output0, output1, output2; + int addme,shift, bucket; + + output0.s0 = v[0] & 0xFFFFFFFF; + output0.s1 = v[0] >> 32; + output0.s2 = v[1] & 0xFFFFFFFF; + output0.s3 = v[1] >> 32; + output0.s4 = v[2] & 0xFFFFFFFF; + output0.s5 = v[2] >> 32; + output0.s7 = 0; + output0.s6 = 2*tId; + + output1.s0 = v[3] & 0xFFFFFFFF; + output1.s1 = v[3] >> 32; + output1.s2 = v[4] & 0xFFFFFFFF; + output1.s3 = v[4] >> 32; + output1.s4 = v[5] & 0xFFFFFFFF; + output1.s5 = v[5] >> 32; + output1.s7 = 0; + output1.s6 = 2*tId+1; + + uint2 addr; + addr.s0 = atomic_inc(&counters[output0.s0 & 0x1FFF]); + addr.s0 += 4592 * (output0.s0 & 0x1FFF); + resultsHi[addr.s0] = shr_7(output0,13,13); + + addr.s1 = atomic_inc(&counters[output1.s0 & 0x1FFF]); + addr.s1 += 4592 * (output1.s0 & 0x1FFF); + resultsHi[addr.s1] = shr_7(output1,13,13); +} + + +__kernel void clearCounter (__global uint8 * buffer0, + __global uint8 * buffer1, + __global uint8 * buffer2, + __global uint4 * counters, + __global uint4 * res, + const uint extra, + const ulong8 hashState, + const ulong nonce) { + + uint gId = get_global_id(0); + counters[gId] = (uint4) 0; + + if (gId == 0) { + res[0] = (uint4) 0; + } +} + +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void blake ( + __global uint8 * output0, + __global uint2 * buffer1, + __global uint8 * buffer2, + __global uint * counters, + __global uint4 * res, + const uint extra, + const ulong8 hashState, + const ulong nonce) { + + uint gId = get_global_id(0); + + round0(hashState, output0, buffer1, counters, gId, (uint) (nonce & 0xFFFFFFFF)); +} + + +inline int masking8_7(uint8 input, __local uint* scratch, __local uint* cnt, uint mask, uint check) { + if ((input.s0 & check) == mask) { + uint pos = atomic_inc(&cnt[0]); + + if (pos < 1166) { + uint value = atomic_xchg(&scratch[654 + ((input.s0 >> 2) & 0x1FF)], pos); + uint high = value >> 12; + + value &= 0x7FF; + value |= (input.s0 & 0xFFFFF800); + + scratch[1166+pos] = value; + scratch[2332+pos] = input.s1; + scratch[3498+pos] = input.s2; + scratch[4664+pos] = input.s3; + scratch[5830+pos] = input.s4; + scratch[6996+pos] = input.s5; + + if (pos < 654) { + scratch[pos] = input.s6 << 12; + } else { + atomic_or(&scratch[pos], input.s6 << 12); + } + + if (high != 0) { + atomic_or(&scratch[654 + ((input.s0 >> 2) & 0x1FF)], high << 12); + } + + } + + return pos; + } + + return -1; +} + + + +inline void masking4_4b(uint4 input, uint idx, __local uint* scratch, __local uint* cnt, uint mask, uint check) { + if ((input.s0 & check) == mask) { + uint pos = atomic_inc(&cnt[0]); + + uint value = atomic_xchg(&scratch[(input.s0 >> 5) & 0x1FF], pos); + scratch[1792+pos] = input.s1; + scratch[3072+pos] = input.s2; + scratch[4352+pos] = input.s3; + scratch[5632+pos] = idx; + + value |= (input.s0 & 0xFFFFE000); + scratch[512+pos] = value; + } +} + +inline void masking4_4bt(uint4 input, uint idx, __local uint* scratch, __local uint* cnt, uint mask, uint check) { + if ((input.s0 & check) == mask) { + uint pos = atomic_inc(&cnt[0]); + if (pos < 1280) { + uint value = atomic_xchg(&scratch[(input.s0 >> 5) & 0x1FF], pos); + scratch[1792+pos] = input.s1; + scratch[3072+pos] = input.s2; + scratch[4352+pos] = input.s3; + scratch[5632+pos] = idx; + + value |= (input.s0 & 0xFFFFE000); + scratch[512+pos] = value; + } + } +} + +void masking4_4(uint4 input, __local uint* scratch, __local uint* cnt, uint mask, uint check) { + if ((input.s0 & check) == mask) { + uint pos = atomic_inc(&cnt[0]); + + uint value = atomic_xchg(&scratch[(input.s0 >> 2) & 0x1FF], pos); + value |= (input.s0 & 0xFFFFF800); + scratch[512+pos] = value; + scratch[1728+pos] = input.s1; + scratch[2944+pos] = input.s2; + scratch[4160+pos] = input.s3; + + + } +} + +void masking4_4t(uint4 input, __local uint* scratch, __local uint* cnt, uint mask, uint check) { + if ((input.s0 & check) == mask) { + uint pos = atomic_inc(&cnt[0]); + if (pos < 1216) { + uint value = atomic_xchg(&scratch[(input.s0 >> 2) & 0x1FF], pos); + value |= (input.s0 & 0xFFFFF800); + scratch[512+pos] = value; + scratch[1728+pos] = input.s1; + scratch[2944+pos] = input.s2; + scratch[4160+pos] = input.s3; + } + } +} + + +void masking4_3b(uint4 input, uint idx, __local uint* scratch, __local uint* cnt, uint mask, uint check) { + if ((input.s0 & check) == mask) { + uint pos = atomic_inc(&cnt[0]); + + uint value = atomic_xchg(&scratch[(input.s0 >> 2) & 0x3FF], pos); + value |= (input.s0 & 0xFFFFF000); + scratch[1024+pos] = value; + scratch[3328+pos] = input.s1; + scratch[5632+pos] = (input.s2 & 0x3FFF) | (idx << 14); + //scratch[7936+pos] = idx; + } +} + + +void masking4_3bt(uint4 input, uint idx, __local uint* scratch, __local uint* cnt, uint mask, uint check) { + if ((input.s0 & check) == mask) { + uint pos = atomic_inc(&cnt[0]); + if (pos < 2304) { + uint value = atomic_xchg(&scratch[(input.s0 >> 2) & 0x3FF], pos); + value |= (input.s0 & 0xFFFFF000); + scratch[1024+pos] = value; + scratch[3328+pos] = input.s1; + scratch[5632+pos] = (input.s2 & 0x3FFF) | (idx << 14); + //scratch[7936+pos] = idx; + } + } +} + + +void masking2_2b(uint2 input, uint idx, __local uint* scratch, __local uint* cnt, uint mask, uint check) { + if ((input.s0 & check) == mask) { + uint pos = atomic_inc(&cnt[0]); + + uint value = atomic_xchg(&scratch[(input.s0 >> 2) & 0x3FF], pos); + value |= (input.s0 & 0xFFFFF000); + scratch[1024+pos] = value; + scratch[3328+pos] = (input.s1 & 0x3FFFF) | (idx << 18); + } +} + +void masking2_2bt(uint2 input, uint idx, __local uint* scratch, __local uint* cnt, uint mask, uint check) { + if ((input.s0 & check) == mask) { + uint pos = atomic_inc(&cnt[0]); + if (pos < 2304) { + uint value = atomic_xchg(&scratch[(input.s0 >> 2) & 0x3FF], pos); + value |= (input.s0 & 0xFFFFF000); + scratch[1024+pos] = value; + scratch[3328+pos] = (input.s1 & 0x3FFFF) | (idx << 18); + } + } +} + +void masking2_2(uint2 input, __local uint* scratch, __local uint* cnt, uint mask, uint check) { + if ((input.s0 & check) == mask) { + uint pos = atomic_inc(&cnt[0]); + if (pos < 2304) { + uint value = atomic_xchg(&scratch[(input.s0 >> 2) & 0x3FF], pos); + value |= (input.s0 & 0xFFFFF000); + scratch[1024+pos] = value; + scratch[3328+pos] = input.s1; + } + } +} + + +uint2 compress(uint in0, uint in1) { + uint hi, lo; + uint hi2, lo2; + + if (in0 < in1) { + hi = in1 >> 12; + hi2 = in1 & 0xFFF; + lo = in0 >> 12; + lo2 = in0 & 0xFFF; + } else { + hi = in0 >> 12; + hi2 = in0 & 0xFFF; + lo = in1 >> 12; + lo2 = in1 & 0xFFF; + } + + hi++; + + uint2 tmp; + tmp.s0 = mul24(hi,(hi-1)); + tmp.s0 = tmp.s0 >> 1; + tmp.s0 += lo; + + tmp.s0 = tmp.s0 << 6; + tmp.s0 |= (hi2 & 0x3F); + + tmp.s1 = hi2 >> 6; + tmp.s1 |= (lo2 << 6); + + return tmp; +} + +uint compress2(uint in0, uint in1) { + uint hi, lo; + + if (in0 < in1) { + hi = in1; + lo = in0; + } else { + hi = in0; + lo = in1; + } + + uint tmp; + tmp = mul24(hi,(hi-1)); + tmp = tmp >> 1; + tmp += lo; + + return tmp; +} + +uint2 decompress(uint2 in) { + double inFl = (double) (in.s0 >> 6); + + inFl *= 2.0; + inFl += 1.0; + + uint2 res; + res.s0 = (uint) round(sqrt(inFl)); + + + uint tmp = res.s0 * (res.s0-1); + tmp = tmp >> 1; + + res.s1 = (uint) ((in.s0 >> 6) - tmp); + res.s0--; + + res.s0 = res.s0 << 12; + res.s1 = res.s1 << 12; + + res.s0 |= (in.s0 & 0x3F); + res.s0 |= ((in.s1 & 0x3F) << 6); + + res.s1 |= (in.s1 >> 6); + + return res; +} + +uint2 decompress2(uint in) { + double inFl = (double) in; + + inFl *= 2.0; + inFl += 1.0; + + uint2 res; + res.s0 = (uint) round(sqrt(inFl)); + + + uint tmp = res.s0 * (res.s0-1); + tmp = tmp >> 1; + + res.s1 = (uint) (in - tmp); + + return res; +} + + +__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round1 ( // Round 1 + __global uint8 * buffer0, + __global uint8 * buffer1, + __global uint8 * buffer2, + __global uint * counters, + __global uint4 * res, + const uint extra, + const ulong8 hashState, + const ulong nonce) { + + uint lId = get_local_id(0); + uint grp = get_group_id(0); + + uint bucket = grp >> 2; + uint mask = (grp & 3); + + __global uint8 * output = buffer1; + __global uint8 * input = &buffer0[bucket*4592]; + + __local uint scratch[8162]; + __local uint * ht = &scratch[654]; + __local uint * scratch0 = &scratch[1166]; + __local uint * scratch1 = &scratch[2332]; + __local uint * scratch2 = &scratch[3498]; + __local uint * scratch3 = &scratch[4664]; + __local uint * scratch4 = &scratch[5830]; + __local uint * scratch5 = &scratch[6996]; + __local uint * scratch6 = &scratch[0]; + __local uint iCNT[2]; + + __global uint * inCounter = &counters[0]; + __global uint * outCounter = &counters[16384]; + + #ifdef PRINT + if (get_global_id(0) == 0) { + uint sum=0; + for (uint i=0; i<16384; i++) { + sum += inCounter[i]; + } + printf("R0: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]); + } + #endif + + if (lId == 0) { + iCNT[1] = 0; + iCNT[0] = min(inCounter[bucket],(uint) 4592); + } + + ht[lId] = 0x7FF; + ht[lId+256] = 0x7FF; + + barrier(CLK_LOCAL_MEM_FENCE); + + uint8 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5; + + iScr0 = input[lId]; + iScr1 = input[lId + 256]; + iScr2 = input[lId + 512]; + + iScr3 = input[lId + 768]; + iScr4 = input[lId + 1024]; + iScr5 = input[lId + 1280]; + + masking8_7(iScr0, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr1, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr2, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 1536]; + iScr1 = input[lId + 1792]; + iScr2 = input[lId + 2048]; + + masking8_7(iScr3, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr4, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr5, &scratch[0], &iCNT[1], mask, 0x3); + + iScr3 = input[lId + 2304]; + iScr4 = input[lId + 2560]; + iScr5 = input[lId + 2816]; + + masking8_7(iScr0, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr1, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr2, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 3072]; + iScr1 = input[lId + 3328]; + if ((lId + 3584) < iCNT[0]) iScr2 = input[lId + 3584]; + + masking8_7(iScr3, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr4, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr5, &scratch[0], &iCNT[1], mask, 0x3); + + if ((lId + 3840) < iCNT[0]) iScr3 = input[lId + 3840]; + if ((lId + 4096) < iCNT[0]) iScr4 = input[lId + 4096]; + if ((lId + 4352) < iCNT[0]) iScr5 = input[lId + 4352]; + + masking8_7(iScr0, &scratch[0], &iCNT[1], mask, 0x3); + int pos = masking8_7(iScr1, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 3584) < iCNT[0]) masking8_7(iScr2, &scratch[0], &iCNT[1], mask, 0x3); + + if ((lId + 3840) < iCNT[0]) masking8_7(iScr3, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 4096) < iCNT[0]) masking8_7(iScr4, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 4352) < iCNT[0]) masking8_7(iScr5, &scratch[0], &iCNT[1], mask, 0x3); + + barrier(CLK_LOCAL_MEM_FENCE); + uint inlim = min(iCNT[1], (uint) 1166); + if (lId == 0) iCNT[0] = inlim-1; + barrier(CLK_LOCAL_MEM_FENCE); + + int ownPos = atomic_dec(&iCNT[0]); + uint own = scratch0[ownPos]; + uint othPos = own & 0x7FF; + + while ((othPos == 0x7FF) && (ownPos >= 0)) { + ownPos = atomic_dec(&iCNT[0]); + if (ownPos >= 0) own = scratch0[ownPos]; + othPos = (ownPos >= 0) ? own & 0x7FF : 0x7FF; + } + + othPos = (ownPos < inlim) ? othPos : 0x7FF; + + uint addr, elem, val; + uint2 el0, el1; + uint8 oScrT; + + while (othPos < inlim) { + elem = scratch0[othPos]; + oScrT.s0 = (own ^ elem) >> 11; + + if (oScrT.s0 != 0) { + val = oScrT.s0 & 0x1FFF; + addr = atomic_inc(&outCounter[val]); + + oScrT.s1 = scratch1[ownPos] ^ scratch1[othPos]; + oScrT.s2 = scratch2[ownPos] ^ scratch2[othPos]; + oScrT.s3 = scratch3[ownPos] ^ scratch3[othPos]; + oScrT.s4 = scratch4[ownPos] ^ scratch4[othPos]; + + el0.s0 = scratch5[ownPos]; + el0.s1 = scratch6[ownPos]; + el1.s0 = scratch5[othPos]; + el1.s1 = scratch6[othPos]; + + oScrT.s5 = (el0.s0 ^ el1.s0) & 0x7FFFF; + + el0.s0 = el0.s0 >> 19; + el1.s0 = el1.s0 >> 19; + el0.s1 = (el0.s1 >> 12) << 13; + el1.s1 = (el1.s1 >> 12) << 13; + + el0.s1 |= el0.s0; + el1.s1 |= el1.s0; + + oScrT.s0 = (oScrT.s0 >> 13) | (oScrT.s1 << 8); + oScrT.s1 = (oScrT.s1 >> 24) | (oScrT.s2 << 8); + oScrT.s2 = (oScrT.s2 >> 24) | (oScrT.s3 << 8); + oScrT.s3 = (oScrT.s3 >> 24) | (oScrT.s4 << 8); + oScrT.s4 = (oScrT.s4 >> 24) | (oScrT.s5 << 8); + + //if (get_global_id(0) == 0) printf("%d %d \n ", el0, el1); + + oScrT.s5 = el0.s1 | (el1.s1 << 25); + oScrT.s6 = el1.s1 >> 7; + oScrT.s7 = 0; + addr += 4592*val; + + output[addr] = oScrT; + } + + othPos = elem & 0x7FF; + + while ((othPos == 0x7FF) && (ownPos >= 0)) { + ownPos = atomic_dec(&iCNT[0]); + if (ownPos >= 0) own = scratch0[ownPos]; + othPos = (ownPos >= 0) ? own & 0x7FF : 0x7FF; + } + } +} + + +__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round2 ( // Round 2 + __global uint8 * buffer0, + __global uint8 * buffer1, + __global uint4 * buffer2, + __global uint * counters, + __global uint4 * res, + const uint extra, + const ulong8 hashState, + const ulong nonce) { + uint lId = get_local_id(0); + uint grp = get_group_id(0); + + uint bucket = grp >> 2; //0x3FFF; + uint mask = grp & 3; + + __global uint8 * output = buffer0; + __global uint8 * input = &buffer1[bucket*4592]; + + __local uint scratch[8162]; + __local uint * ht = &scratch[654]; + __local uint * scratch0 = &scratch[1166]; + __local uint * scratch1 = &scratch[2332]; + __local uint * scratch2 = &scratch[3498]; + __local uint * scratch3 = &scratch[4664]; + __local uint * scratch4 = &scratch[5830]; + __local uint * scratch5 = &scratch[6996]; + __local uint * scratch6 = &scratch[0]; + __local uint iCNT[2]; + + __global uint * inCounter = &counters[16384]; + __global uint * outCounter = &counters[32768]; + + if (lId == 0) { + iCNT[0] = min(inCounter[bucket],(uint) 4592); + iCNT[1] = 0; + } + + #ifdef PRINT + if (get_global_id(0) == 0) { + uint sum=0; + for (uint i=0; i<16384; i++) { + sum += inCounter[i]; + } + printf("R1: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]); + } + #endif + + ht[lId] = 0x7FF; + ht[lId+256] = 0x7FF; + + barrier(CLK_LOCAL_MEM_FENCE); + + uint8 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5; + + iScr0 = input[lId]; + iScr1 = input[lId + 256]; + iScr2 = input[lId + 512]; + + iScr3 = input[lId + 768]; + iScr4 = input[lId + 1024]; + iScr5 = input[lId + 1280]; + + masking8_7(iScr0, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr1, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr2, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 1536]; + iScr1 = input[lId + 1792]; + iScr2 = input[lId + 2048]; + + masking8_7(iScr3, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr4, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr5, &scratch[0], &iCNT[1], mask, 0x3); + + iScr3 = input[lId + 2304]; + iScr4 = input[lId + 2560]; + iScr5 = input[lId + 2816]; + + masking8_7(iScr0, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr1, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr2, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 3072]; + iScr1 = input[lId + 3328]; + if ((lId + 3584) < iCNT[0]) iScr2 = input[lId + 3584]; + + masking8_7(iScr3, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr4, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr5, &scratch[0], &iCNT[1], mask, 0x3); + + if ((lId + 3840) < iCNT[0]) iScr3 = input[lId + 3840]; + if ((lId + 4096) < iCNT[0]) iScr4 = input[lId + 4096]; + if ((lId + 4352) < iCNT[0]) iScr5 = input[lId + 4352]; + + masking8_7(iScr0, &scratch[0], &iCNT[1], mask, 0x3); + masking8_7(iScr1, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 3584) < iCNT[0]) masking8_7(iScr2, &scratch[0], &iCNT[1], mask, 0x3); + + if ((lId + 3840) < iCNT[0]) masking8_7(iScr3, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 4096) < iCNT[0]) masking8_7(iScr4, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 4352) < iCNT[0]) masking8_7(iScr5, &scratch[0], &iCNT[1], mask, 0x3); + + barrier(CLK_LOCAL_MEM_FENCE); + + barrier(CLK_LOCAL_MEM_FENCE); + uint inlim = min(iCNT[1], (uint) 1166); + if (lId == 0) iCNT[0] = inlim-1; + barrier(CLK_LOCAL_MEM_FENCE); + + int ownPos = atomic_dec(&iCNT[0]); + uint own = scratch0[ownPos]; + uint othPos = own & 0x7FF; + + while ((othPos == 0x7FF) && (ownPos >= 0)) { + ownPos = atomic_dec(&iCNT[0]); + if (ownPos >= 0) own = scratch0[ownPos]; + othPos = (ownPos >= 0) ? own & 0x7FF : 0x7FF; + } + + othPos = (ownPos < inlim) ? othPos : 0x7FF; + + uint addr, elem, val; + uint el0; + uint8 oScrT; + + while (othPos < inlim) { + elem = scratch0[othPos]; + oScrT.s0 = (own ^ elem) >> 11; + + if (oScrT.s0 != 0) { + val = oScrT.s0 & 0x1FFF; + addr = atomic_inc(&outCounter[val]); + + oScrT.s1 = scratch1[ownPos] ^ scratch1[othPos]; + oScrT.s2 = scratch2[ownPos] ^ scratch2[othPos]; + oScrT.s3 = scratch3[ownPos] ^ scratch3[othPos]; + oScrT.s4 = scratch4[ownPos] ^ scratch4[othPos]; + + oScrT.s7 = scratch5[othPos]; + oScrT.s6 = scratch5[ownPos]; + oScrT.s5 = (scratch6[othPos] & 0xFFFFF000); + + oScrT.s0 = (oScrT.s0 >> 10) | (oScrT.s1 << 11); + oScrT.s1 = (oScrT.s1 >> 21) | (oScrT.s2 << 11); + oScrT.s2 = (oScrT.s2 >> 21) | (oScrT.s3 << 11); + oScrT.s3 = (oScrT.s3 >> 21) | (oScrT.s4 << 11); + oScrT.s4 = (oScrT.s4 >> 21); + oScrT.s4 |= (scratch6[ownPos] & 0xFFFFF000); + + addr += 4592*val; + + output[addr] = oScrT; + + //if (get_global_id(0) == 0) printf("%x %x %x %x \n", oScrT.lo); + } + + othPos = elem & 0x7FF; + + while ((othPos == 0x7FF) && (ownPos >= 0)) { + ownPos = atomic_dec(&iCNT[0]); + if (ownPos >= 0) own = scratch0[ownPos]; + othPos = (ownPos >= 0) ? own & 0x7FF : 0x7FF; + } + } +} + + +__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round3 ( // Round 1 / 3 + __global uint8 * buffer0, + __global uint4 * buffer1, + __global uint * buffer2, + __global uint * counters, + __global uint4 * res, + const uint extra, + const ulong8 hashState, + const ulong nonce) { + uint lId = get_local_id(0); + uint grp = get_group_id(0); + + uint bucket = grp >> 2; + uint mask = (grp & 3) << 3; + + __global uint4 * output = buffer1; + __global uint8 * input = &buffer0[bucket*4592]; + + __global uint * sideLoadR13 = (__global uint *) &buffer1[37748736 + grp*328]; + + __local uint scratch[6912]; + __local uint * ht = &scratch[0]; + __local uint * scratch0 = &scratch[512]; + __local uint * scratch1 = &scratch[1792]; + __local uint * scratch2 = &scratch[3072]; + __local uint * scratch3 = &scratch[4352]; + __local uint * scratch4 = &scratch[5632]; + __local uint iCNT[2]; + __local uint pCNT[1]; + + __global uint * inCounter = &counters[32768]; + __global uint * outCounter = &counters[49152]; + + #ifdef PRINT + if (get_global_id(0) == 0) { + uint sum=0; + for (uint i=0; i<16384; i++) { + sum += inCounter[i]; + } + printf("R2: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]); + } + #endif + + if (lId == 0) { + iCNT[1] = 0; + iCNT[0] = min(inCounter[bucket],(uint) 4592); + pCNT[0] = 0; + } + + ht[lId] = 0xFFF; + ht[lId+256] = 0xFFF; + + barrier(CLK_LOCAL_MEM_FENCE); + + uint8 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5; + + iScr0 = input[lId]; + iScr1 = input[lId + 256]; + iScr2 = input[lId + 512]; + + iScr3 = input[lId + 768]; + iScr4 = input[lId + 1024]; + iScr5 = input[lId + 1280]; + + masking4_4b(iScr0.lo, lId + 0, &scratch[0], &iCNT[1], mask, 0x18); + masking4_4b(iScr1.lo, lId + 256, &scratch[0], &iCNT[1], mask, 0x18); + masking4_4b(iScr2.lo, lId + 512, &scratch[0], &iCNT[1], mask, 0x18); + + iScr0 = input[lId + 1536]; + iScr1 = input[lId + 1792]; + iScr2 = input[lId + 2048]; + + masking4_4b(iScr3.lo, lId + 768, &scratch[0], &iCNT[1], mask, 0x18); + masking4_4b(iScr4.lo, lId + 1024, &scratch[0], &iCNT[1], mask, 0x18); + masking4_4b(iScr5.lo, lId + 1280, &scratch[0], &iCNT[1], mask, 0x18); + + iScr3 = input[lId + 2304]; + iScr4 = input[lId + 2560]; + iScr5 = input[lId + 2816]; + + masking4_4b(iScr0.lo, lId + 1536, &scratch[0], &iCNT[1], mask, 0x18); + masking4_4b(iScr1.lo, lId + 1792, &scratch[0], &iCNT[1], mask, 0x18); + masking4_4b(iScr2.lo, lId + 2048, &scratch[0], &iCNT[1], mask, 0x18); + + iScr0 = input[lId + 3072]; + iScr1 = input[lId + 3328]; + if ((lId + 3584) < iCNT[0])iScr2 = input[lId + 3584]; + + masking4_4b(iScr3.lo, lId + 2304, &scratch[0], &iCNT[1], mask, 0x18); + masking4_4b(iScr4.lo, lId + 2560, &scratch[0], &iCNT[1], mask, 0x18); + masking4_4b(iScr5.lo, lId + 2816, &scratch[0], &iCNT[1], mask, 0x18); + + if ((lId + 3840) < iCNT[0])iScr3 = input[lId + 3840]; + if ((lId + 4096) < iCNT[0])iScr4 = input[lId + 4096]; + if ((lId + 4352) < iCNT[0])iScr5 = input[lId + 4352]; + + masking4_4bt(iScr0.lo, lId + 3072, &scratch[0], &iCNT[1], mask, 0x18); + masking4_4bt(iScr1.lo, lId + 3328, &scratch[0], &iCNT[1], mask, 0x18); + if ((lId + 3584) < iCNT[0])masking4_4bt(iScr2.lo, lId + 3584, &scratch[0], &iCNT[1], mask, 0x18); + + if ((lId + 3840) < iCNT[0])masking4_4bt(iScr3.lo, lId + 3840, &scratch[0], &iCNT[1], mask, 0x18); + if ((lId + 4096) < iCNT[0])masking4_4bt(iScr4.lo, lId + 4096, &scratch[0], &iCNT[1], mask, 0x18); + if ((lId + 4352) < iCNT[0])masking4_4bt(iScr5.lo, lId + 4352, &scratch[0], &iCNT[1], mask, 0x18); + + barrier(CLK_LOCAL_MEM_FENCE); + //if (lId == 0) printf("%d \n", iCNT[1]); + uint inlim = min(iCNT[1], (uint) 1280); + if (lId == 0) iCNT[0] = inlim-1; + barrier(CLK_LOCAL_MEM_FENCE); + + int ownPos = atomic_dec(&iCNT[0]); + uint own = scratch0[ownPos]; + uint othPos = own & 0xFFF; + + while ((othPos == 0xFFF) && (ownPos >= 0)) { + ownPos = atomic_dec(&iCNT[0]); + if (ownPos >= 0) own = scratch0[ownPos]; + othPos = (ownPos >= 0) ? own & 0xFFF : 0xFFF; + } + + othPos = (ownPos < inlim) ? othPos : 0xFFF; + + uint addr, elem, val; + uint el0, el1; + int pcnt = 0; + uint4 oScrT; + + while (othPos < inlim) { + elem = scratch0[othPos]; + oScrT.s0 = (own ^ elem) >> 14; + + uint sideAddr = atomic_inc(&pCNT[0]); + if (sideAddr < 1312) { + val = oScrT.s0 & 0x1FFF; + addr = atomic_inc(&outCounter[val]); + + sideLoadR13[sideAddr] = scratch4[ownPos] | (scratch4[othPos] << 16); + + oScrT.s1 = scratch1[ownPos] ^ scratch1[othPos]; + oScrT.s2 = scratch2[ownPos] ^ scratch2[othPos]; + oScrT.s3 = scratch3[ownPos] ^ scratch3[othPos]; + + oScrT.s0 = (oScrT.s0 >> 13) | (oScrT.s1 << 5); + oScrT.s1 = (oScrT.s1 >> 27) | (oScrT.s2 << 5); + oScrT.s2 = (oScrT.s2 >> 27) | (oScrT.s3 << 5); + oScrT.s3 = (oScrT.s3 >> 27); + + addr += 4592*val; + oScrT.s3 |= ((sideAddr + 1312*grp) << 6); + + output[addr] = oScrT; + } + + othPos = elem & 0xFFF; + + while ((othPos == 0xFFF) && (ownPos >= 0)) { + ownPos = atomic_dec(&iCNT[0]); + if (ownPos >= 0) own = scratch0[ownPos]; + othPos = (ownPos >= 0) ? own & 0xFFF : 0xFFF; + } + + } +} + + + +__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round4 ( // Round 2 / 4 + __global uint4 * buffer0, + __global uint4 * buffer1, + __global uint4 * buffer2, + __global uint * counters, + __global uint4 * res, + const uint extra, + const ulong8 hashState, + const ulong nonce) { + + uint lId = get_local_id(0); + uint grp = get_group_id(0); + + uint bucket = grp >> 2; + uint mask = grp & 3; + + __global uint4 * output = buffer2; + __global uint4 * input = &buffer1[bucket*4592]; + + __local uint scratch[5376]; + __local uint * ht = &scratch[0]; + __local uint * scratch0 = &scratch[512]; + __local uint * scratch1 = &scratch[1728]; + __local uint * scratch2 = &scratch[2944]; + __local uint * scratch3 = &scratch[4160]; + __local uint iCNT[2]; + + __global uint * inCounter = &counters[49152]; + __global uint * outCounter = &counters[65536]; + + #ifdef PRINT + if (get_global_id(0) == 0) { + uint sum=0; + for (uint i=0; i<16384; i++) { + sum += inCounter[i]; + } + printf("R3: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]); + } + #endif + + if (lId == 0) { + iCNT[1] = 0; + iCNT[0] = min(inCounter[bucket],(uint) 4592); + } + + ht[lId] = 0x7FF; + ht[lId+256] = 0x7FF; + + barrier(CLK_LOCAL_MEM_FENCE); + + uint4 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5; + + iScr0 = input[lId]; + iScr1 = input[lId + 256]; + iScr2 = input[lId + 512]; + + iScr3 = input[lId + 768]; + iScr4 = input[lId + 1024]; + iScr5 = input[lId + 1280]; + + masking4_4(iScr0, &scratch[0], &iCNT[1], mask, 0x3); + masking4_4(iScr1, &scratch[0], &iCNT[1], mask, 0x3); + masking4_4(iScr2, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 1536]; + iScr1 = input[lId + 1792]; + iScr2 = input[lId + 2048]; + + masking4_4(iScr3, &scratch[0], &iCNT[1], mask, 0x3); + masking4_4(iScr4, &scratch[0], &iCNT[1], mask, 0x3); + masking4_4(iScr5, &scratch[0], &iCNT[1], mask, 0x3); + + iScr3 = input[lId + 2304]; + iScr4 = input[lId + 2560]; + iScr5 = input[lId + 2816]; + + masking4_4(iScr0, &scratch[0], &iCNT[1], mask, 0x3); + masking4_4(iScr1, &scratch[0], &iCNT[1], mask, 0x3); + masking4_4(iScr2, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 3072]; + iScr1 = input[lId + 3328]; + if ((lId + 3584) < iCNT[0]) iScr2 = input[lId + 3584]; + + masking4_4t(iScr3, &scratch[0], &iCNT[1], mask, 0x3); + masking4_4t(iScr4, &scratch[0], &iCNT[1], mask, 0x3); + masking4_4t(iScr5, &scratch[0], &iCNT[1], mask, 0x3); + + if ((lId + 3840) < iCNT[0]) iScr3 = input[lId + 3840]; + if ((lId + 4096) < iCNT[0]) iScr4 = input[lId + 4096]; + if ((lId + 4352) < iCNT[0]) iScr5 = input[lId + 4352]; + + masking4_4t(iScr0, &scratch[0], &iCNT[1], mask, 0x3); + masking4_4t(iScr1, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 3584) < iCNT[0]) masking4_4t(iScr2, &scratch[0], &iCNT[1], mask, 0x3); + + if ((lId + 3840) < iCNT[0]) masking4_4t(iScr3, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 4096) < iCNT[0]) masking4_4t(iScr4, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 4352) < iCNT[0]) masking4_4t(iScr5, &scratch[0], &iCNT[1], mask, 0x3); + + barrier(CLK_LOCAL_MEM_FENCE); + uint inlim = min(iCNT[1], (uint) 1216); + if (lId == 0) iCNT[0] = inlim-1; + barrier(CLK_LOCAL_MEM_FENCE); + + int ownPos = atomic_dec(&iCNT[0]); + uint own = scratch0[ownPos]; + uint othPos = own & 0x7FF; + + while ((othPos == 0x7FF) && (ownPos >= 0)) { + ownPos = atomic_dec(&iCNT[0]); + if (ownPos >= 0) own = scratch0[ownPos]; + othPos = (ownPos >= 0) ? own & 0x7FF : 0x7FF; + } + + othPos = (ownPos < inlim) ? othPos : 0x7FF; + + uint addr, elem, val; + uint el0, el1; + uint4 oScrT; + + while (othPos < inlim) { + + + elem = scratch0[othPos]; + oScrT.s0 = (own ^ elem) >> 11; + + if (oScrT.s0 != 0) { + val = oScrT.s0 & 0xFFF; + addr = atomic_inc(&outCounter[val]); + + oScrT.s1 = scratch1[ownPos] ^ scratch1[othPos]; + oScrT.s2 = scratch2[ownPos] ^ scratch2[othPos]; + el0 = scratch3[ownPos]; + el1 = scratch3[othPos]; + + oScrT.s3 = (el0 ^ el1) & 0x1F; + oScrT.s0 = (oScrT.s0 >> 12) | (oScrT.s1 << 9); + oScrT.s1 = (oScrT.s1 >> 23) | (oScrT.s2 << 9); + oScrT.s2 = (oScrT.s2 >> 23) | (oScrT.s3 << 9); + + uint2 tmp = compress(el0 >> 6,el1 >> 6); + addr += 8688*val; + + oScrT.s3 = tmp.s0; + oScrT.s2 |= (tmp.s1 << 14); + output[addr] = oScrT; + } + + othPos = elem & 0x7FF; + + while ((othPos == 0x7FF) && (ownPos >= 0)) { + ownPos = atomic_dec(&iCNT[0]); + if (ownPos >= 0) own = scratch0[ownPos]; + othPos = (ownPos >= 0) ? own & 0x7FF : 0x7FF; + } + } + +} + + +__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round5 ( // Round 3 / 5 + __global uint4 * buffer0, + __global uint4 * buffer1, + __global uint4 * buffer2, + __global uint * counters, + __global uint4 * res, + const uint extra, + const ulong8 hashState, + const ulong nonce) { + + uint lId = get_local_id(0); + uint grp = get_group_id(0); + + uint bucket = grp >> 2; + uint mask = grp & 3; + + __global uint4 * output = buffer1; + __global uint4 * input = &buffer2[bucket*8688]; + + __local uint scratch[7936]; + __local uint * ht = &scratch[0]; + __local uint * scratch0 = &scratch[1024]; + __local uint * scratch1 = &scratch[3328]; + __local uint * scratch2 = &scratch[5632]; + __local uint iCNT[2]; + + __global uint * inCounter = &counters[65536]; + __global uint * outCounter = &counters[81920]; + + #ifdef PRINT + if (get_global_id(0) == 0) { + uint sum=0; + for (uint i=0; i<16384; i++) { + sum += inCounter[i]; + } + printf("R4: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]); + } + #endif + + if (lId == 0) { + iCNT[1] = 0; + iCNT[0] = min(inCounter[bucket],(uint) 8688); + } + + ht[lId] = 0xFFF; + ht[lId+256] = 0xFFF; + ht[lId+512] = 0xFFF; + ht[lId+768] = 0xFFF; + + barrier(CLK_LOCAL_MEM_FENCE); + + uint4 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5; + + iScr0 = input[lId]; + iScr1 = input[lId + 256]; + iScr2 = input[lId + 512]; + + iScr3 = input[lId + 768]; + iScr4 = input[lId + 1024]; + iScr5 = input[lId + 1280]; + + masking4_3b(iScr0, lId + 0, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr1, lId + 256, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr2, lId + 512, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 1536]; + iScr1 = input[lId + 1792]; + iScr2 = input[lId + 2048]; + + masking4_3b(iScr3, lId + 768, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr4, lId + 1024, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr5, lId + 1280, &scratch[0], &iCNT[1], mask, 0x3); + + iScr3 = input[lId + 2304]; + iScr4 = input[lId + 2560]; + iScr5 = input[lId + 2816]; + + masking4_3b(iScr0, lId + 1536, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr1, lId + 1792, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr2, lId + 2048, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 3072]; + iScr1 = input[lId + 3328]; + iScr2 = input[lId + 3584]; + + masking4_3b(iScr3, lId + 2304, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr4, lId + 2560, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr5, lId + 2816, &scratch[0], &iCNT[1], mask, 0x3); + + iScr3 = input[lId + 3840]; + iScr4 = input[lId + 4096]; + iScr5 = input[lId + 4352]; + + masking4_3b(iScr0, lId + 3072, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr1, lId + 3328, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr2, lId + 3584, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 4608]; + iScr1 = input[lId + 4864]; + iScr2 = input[lId + 5120]; + + masking4_3b(iScr3, lId + 3840, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr4, lId + 4096, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr5, lId + 4352, &scratch[0], &iCNT[1], mask, 0x3); + + iScr3 = input[lId + 5376]; + iScr4 = input[lId + 5632]; + iScr5 = input[lId + 5888]; + + masking4_3b(iScr0, lId + 4608, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr1, lId + 4864, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr2, lId + 5120, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 6144]; + iScr1 = input[lId + 6400]; + iScr2 = input[lId + 6656]; + + masking4_3b(iScr3, lId + 5376, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr4, lId + 5632, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr5, lId + 5888, &scratch[0], &iCNT[1], mask, 0x3); + + iScr3 = input[lId + 6912]; + iScr4 = input[lId + 7168]; + iScr5 = input[lId + 7424]; + + masking4_3b(iScr0, lId + 6144, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr1, lId + 6400, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3b(iScr2, lId + 6656, &scratch[0], &iCNT[1], mask, 0x3); + + if ((lId + 7680) < iCNT[0]) iScr0 = input[lId + 7680]; + if ((lId + 7936) < iCNT[0]) iScr1 = input[lId + 7936]; + if ((lId + 8192) < iCNT[0]) iScr2 = input[lId + 8192]; + + masking4_3bt(iScr3, lId + 6912, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3bt(iScr4, lId + 7168, &scratch[0], &iCNT[1], mask, 0x3); + masking4_3bt(iScr5, lId + 7424, &scratch[0], &iCNT[1], mask, 0x3); + + if ((lId + 8448) < iCNT[0]) iScr3 = input[lId + 8448]; + + if ((lId + 7680) < iCNT[0]) masking4_3bt(iScr0, lId + 7680, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 7936) < iCNT[0]) masking4_3bt(iScr1, lId + 7936, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 8192) < iCNT[0]) masking4_3bt(iScr2, lId + 8192, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 8448) < iCNT[0]) masking4_3bt(iScr3, lId + 8448, &scratch[0], &iCNT[1], mask, 0x3); + + barrier(CLK_LOCAL_MEM_FENCE); + uint inlim = min(iCNT[1], (uint) 2304); + if (lId == 0) iCNT[0] = inlim-1; + barrier(CLK_LOCAL_MEM_FENCE); + + int ownPos = atomic_dec(&iCNT[0]); + uint own = scratch0[ownPos]; + uint othPos = own & 0xFFF; + + while ((othPos == 0xFFF) && (ownPos >= 0)) { + ownPos = atomic_dec(&iCNT[0]); + if (ownPos >= 0) own = scratch0[ownPos]; + othPos = (ownPos >= 0) ? own & 0xFFF : 0xFFF; + } + + othPos = (ownPos < inlim) ? othPos : 0xFFF; + + uint addr, elem, val; + uint el0, el1; + int pcnt = 0; + uint4 oScrT; + + while (othPos < inlim) { + + + elem = scratch0[othPos]; + oScrT.s0 = (own ^ elem) >> 12; + + if (oScrT.s0 != 0) { + val = oScrT.s0 & 0xFFF; + addr = atomic_inc(&outCounter[val]); + + oScrT.s1 = scratch1[ownPos] ^ scratch1[othPos]; + el0 = scratch2[ownPos]; + el1 = scratch2[othPos]; + + oScrT.s2 = (el0 ^ el1) & 0x3FFF; + oScrT.s3 = 0; + + oScrT.s0 = (oScrT.s0 >> 12) | (oScrT.s1 << 8); + oScrT.s1 = (oScrT.s1 >> 24) | (oScrT.s2 << 8); + oScrT.s2 = bucket; + oScrT.s3 = (el0 >> 14) | ((el1 >> 14) << 16); + + addr += 8688*(val & 0xFFF); + output[addr] = oScrT; + } + + othPos = elem & 0xFFF; + + while ((othPos == 0xFFF) && (ownPos >= 0)) { + ownPos = atomic_dec(&iCNT[0]); + if (ownPos >= 0) own = scratch0[ownPos]; + othPos = (ownPos >= 0) ? own & 0xFFF : 0xFFF; + } + } +} + + +__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round6 ( // Round 4 / 6 + __global uint4 * buffer0, + __global uint4 * buffer1, + __global uint4 * buffer2, + __global uint * counters, + __global uint4 * res, + const uint extra, + const ulong8 hashState, + const ulong nonce) { + uint lId = get_local_id(0); + uint grp = get_group_id(0); + + uint bucket = grp >> 2; + uint mask = grp & 3; + + __global uint2 * output = (__global uint2 *) &buffer1[48496640]; + __global uint4 * input = &buffer1[bucket*8688]; + + + __local uint scratch[5632]; + __local uint * ht = &scratch[0]; + __local uint * scratch0 = &scratch[1024]; + __local uint * scratch1 = &scratch[3328]; + __local uint iCNT[2]; + + __global uint * inCounter = &counters[81920]; + __global uint * outCounter = &counters[98304]; + + #ifdef PRINT + if (get_global_id(0) == 0) { + uint sum=0; + for (uint i=0; i<16384; i++) { + sum += inCounter[i]; + } + printf("R5: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]); + } + #endif + + if (lId == 0) { + iCNT[1] = 0; + iCNT[0] = min(inCounter[bucket],(uint) 8688); + } + + ht[lId] = 0xFFF; + ht[lId+256] = 0xFFF; + ht[lId+512] = 0xFFF; + ht[lId+768] = 0xFFF; + + barrier(CLK_LOCAL_MEM_FENCE); + + uint4 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5; + + iScr0 = input[lId]; + iScr1 = input[lId + 256]; + iScr2 = input[lId + 512]; + + iScr3 = input[lId + 768]; + iScr4 = input[lId + 1024]; + iScr5 = input[lId + 1280]; + + masking2_2b(iScr0.lo, lId + 0, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr1.lo, lId + 256, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr2.lo, lId + 512, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 1536]; + iScr1 = input[lId + 1792]; + iScr2 = input[lId + 2048]; + + masking2_2b(iScr3.lo, lId + 768, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr4.lo, lId + 1024, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr5.lo, lId + 1280, &scratch[0], &iCNT[1], mask, 0x3); + + iScr3 = input[lId + 2304]; + iScr4 = input[lId + 2560]; + iScr5 = input[lId + 2816]; + + masking2_2b(iScr0.lo, lId + 1536, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr1.lo, lId + 1792, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr2.lo, lId + 2048, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 3072]; + iScr1 = input[lId + 3328]; + iScr2 = input[lId + 3584]; + + masking2_2b(iScr3.lo, lId + 2304, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr4.lo, lId + 2560, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr5.lo, lId + 2816, &scratch[0], &iCNT[1], mask, 0x3); + + iScr3 = input[lId + 3840]; + iScr4 = input[lId + 4096]; + iScr5 = input[lId + 4352]; + + masking2_2b(iScr0.lo, lId + 3072, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr1.lo, lId + 3328, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr2.lo, lId + 3584, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 4608]; + iScr1 = input[lId + 4864]; + iScr2 = input[lId + 5120]; + + masking2_2b(iScr3.lo, lId + 3840, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr4.lo, lId + 4096, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr5.lo, lId + 4352, &scratch[0], &iCNT[1], mask, 0x3); + + iScr3 = input[lId + 5376]; + iScr4 = input[lId + 5632]; + iScr5 = input[lId + 5888]; + + masking2_2b(iScr0.lo, lId + 4608, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr1.lo, lId + 4864, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr2.lo, lId + 5120, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 6144]; + iScr1 = input[lId + 6400]; + iScr2 = input[lId + 6656]; + + masking2_2b(iScr3.lo, lId + 5376, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr4.lo, lId + 5632, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr5.lo, lId + 5888, &scratch[0], &iCNT[1], mask, 0x3); + + iScr3 = input[lId + 6912]; + iScr4 = input[lId + 7168]; + iScr5 = input[lId + 7424]; + + masking2_2b(iScr0.lo, lId + 6144, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr1.lo, lId + 6400, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2b(iScr2.lo, lId + 6656, &scratch[0], &iCNT[1], mask, 0x3); + + if ((lId + 7680) < iCNT[0]) iScr0 = input[lId + 7680]; + if ((lId + 7936) < iCNT[0]) iScr1 = input[lId + 7936]; + if ((lId + 8192) < iCNT[0]) iScr2 = input[lId + 8192]; + + masking2_2bt(iScr3.lo, lId + 6912, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2bt(iScr4.lo, lId + 7168, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2bt(iScr5.lo, lId + 7424, &scratch[0], &iCNT[1], mask, 0x3); + + if ((lId + 8448) < iCNT[0]) iScr3 = input[lId + 8448]; + + if ((lId + 7680) < iCNT[0]) masking2_2bt(iScr0.lo, lId + 7680, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 7936) < iCNT[0]) masking2_2bt(iScr1.lo, lId + 7936, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 8192) < iCNT[0]) masking2_2bt(iScr2.lo, lId + 8192, &scratch[0], &iCNT[1], mask, 0x3); + if ((lId + 8448) < iCNT[0]) masking2_2bt(iScr3.lo, lId + 8448, &scratch[0], &iCNT[1], mask, 0x3); + + barrier(CLK_LOCAL_MEM_FENCE); + uint inlim = min(iCNT[1], (uint) 2304); + if (lId == 0) iCNT[0] = inlim-1; + barrier(CLK_LOCAL_MEM_FENCE); + + int ownPos = atomic_dec(&iCNT[0]); + uint own = scratch0[ownPos]; + uint othPos = own & 0xFFF; + + while ((othPos == 0xFFF) && (ownPos >= 0)) { + ownPos = atomic_dec(&iCNT[0]); + if (ownPos >= 0) own = scratch0[ownPos]; + othPos = (ownPos >= 0) ? own & 0xFFF : 0xFFF; + } + + othPos = (ownPos < inlim) ? othPos : 0xFFF; + + uint addr, elem, val; + uint el0, el1; + int pcnt = 0; + uint2 oScrT; + + while (othPos < inlim) { + + + elem = scratch0[othPos]; + oScrT.s0 = (own ^ elem) >> 12; + + if (oScrT.s0 != 0) { + val = oScrT.s0 & 0xFFF; + addr = atomic_inc(&outCounter[val]); + + uint el0 = scratch1[ownPos]; + uint el1 = scratch1[othPos]; + + oScrT.s1 = (el0 ^ el1) & 0x3FFFF; + el0 = el0 >> 18; + el1 = el1 >> 18; + + oScrT.s0 = (oScrT.s0 >> 12) | (oScrT.s1 << 8); + oScrT.s1 = compress2(el0,el1); + + addr += 8688*(val & 0xFFF); + + oScrT.s1 |= (bucket << 26); + oScrT.s0 |= ((bucket >> 6) << 26); + + output[addr] = oScrT; + } + + othPos = elem & 0xFFF; + + while ((othPos == 0xFFF) && (ownPos >= 0)) { + ownPos = atomic_dec(&iCNT[0]); + if (ownPos >= 0) own = scratch0[ownPos]; + othPos = (ownPos >= 0) ? own & 0xFFF : 0xFFF; + } + } +} + + +__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round7 ( // Round 5 / 7 + __global uint4 * buffer0, + __global uint4 * buffer1, + __global uint4 * buffer2, + __global uint * counters, + __global uint4 * res, + const uint extra, + const ulong8 hashState, + const ulong nonce) { + uint lId = get_local_id(0); + uint grp = get_group_id(0); + + uint bucket = grp >> 2; + uint mask = grp & 3; + + __global uint4 * output = &buffer1[67305472]; + __global uint2 * outputR46 = (__global uint2 *) &buffer1[48496640]; + __global uint4 * input = &buffer1[48496640 + bucket*4344]; + + __local uint scratch[5632]; + __local uint * ht = &scratch[0]; + __local uint * scratch0 = &scratch[1024]; + __local uint * scratch1 = &scratch[3328]; + __local uint iCNT[2]; + + __global uint * inCounter = &counters[98304]; + __global uint * outCounter = &counters[114688]; + + #ifdef PRINT + if (get_global_id(0) == 0) { + uint sum=0; + for (uint i=0; i<16384; i++) { + sum += inCounter[i]; + } + printf("R6: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]); + } + #endif + + if (lId == 0) { + iCNT[1] = 0; + iCNT[0] = min(inCounter[bucket],(uint) 8688); + } + + ht[lId] = 0xFFF; + ht[lId+256] = 0xFFF; + ht[lId+512] = 0xFFF; + ht[lId+768] = 0xFFF; + + barrier(CLK_LOCAL_MEM_FENCE); + + uint4 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5; + + iScr0 = input[lId]; + iScr1 = input[lId + 256]; + iScr2 = input[lId + 512]; + + iScr3 = input[lId + 768]; + iScr4 = input[lId + 1024]; + iScr5 = input[lId + 1280]; + + masking2_2(iScr0.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr0.hi, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr1.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr1.hi, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr2.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr2.hi, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 1536]; + iScr1 = input[lId + 1792]; + iScr2 = input[lId + 2048]; + + masking2_2(iScr3.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr3.hi, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr4.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr4.hi, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr5.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr5.hi, &scratch[0], &iCNT[1], mask, 0x3); + + iScr3 = input[lId + 2304]; + iScr4 = input[lId + 2560]; + iScr5 = input[lId + 2816]; + + masking2_2(iScr0.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr0.hi, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr1.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr1.hi, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr2.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr2.hi, &scratch[0], &iCNT[1], mask, 0x3); + + iScr0 = input[lId + 3072]; + iScr1 = input[lId + 3328]; + if (2*(lId+3584) < iCNT[0]) iScr2 = input[lId + 3584]; + + masking2_2(iScr3.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr3.hi, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr4.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr4.hi, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr5.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr5.hi, &scratch[0], &iCNT[1], mask, 0x3); + + if (2*(lId+3840) < iCNT[0]) iScr3 = input[lId + 3840]; + if (2*(lId+4096) < iCNT[0]) iScr4 = input[lId + 4096]; + if (2*(lId+4352) < iCNT[0]) iScr5 = input[lId + 4352]; + + masking2_2(iScr0.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr0.hi, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr1.lo, &scratch[0], &iCNT[1], mask, 0x3); + masking2_2(iScr1.hi, &scratch[0], &iCNT[1], mask, 0x3); + if (2*(lId+3584) < iCNT[0]) masking2_2(iScr2.lo, &scratch[0], &iCNT[1], mask, 0x3); + if (2*(lId+3584)+1 < iCNT[0])masking2_2(iScr2.hi, &scratch[0], &iCNT[1], mask, 0x3); + + if (2*(lId+3840)+0 < iCNT[0]) masking2_2(iScr3.lo, &scratch[0], &iCNT[1], mask, 0x3); + if (2*(lId+3840)+1 < iCNT[0]) masking2_2(iScr3.hi, &scratch[0], &iCNT[1], mask, 0x3); + if (2*(lId+4096)+0 < iCNT[0]) masking2_2(iScr4.lo, &scratch[0], &iCNT[1], mask, 0x3); + if (2*(lId+4096)+1 < iCNT[0]) masking2_2(iScr4.hi, &scratch[0], &iCNT[1], mask, 0x3); + if (2*(lId+4352)+0 < iCNT[0]) masking2_2(iScr5.lo, &scratch[0], &iCNT[1], mask, 0x3); + if (2*(lId+4352)+1 < iCNT[0]) masking2_2(iScr5.hi, &scratch[0], &iCNT[1], mask, 0x3); + + barrier(CLK_LOCAL_MEM_FENCE); + if (lId == 0) iCNT[1] = min(iCNT[1], (uint) 2304); + barrier(CLK_LOCAL_MEM_FENCE); + + uint ownPos = lId; + uint own = scratch0[ownPos]; + uint othPos = own & 0xFFF; + + uint ownEl; + + if ((own & 0xFFF) != 0xFFF) { + ownEl = scratch1[ownPos]; + } + + uint2 oScrT; + + while (ownPos < iCNT[1]) { + uint addr = (othPos < iCNT[1]) ? othPos : ownPos+256; + uint elem = scratch0[addr]; + + if (othPos < iCNT[1]) { + uint oth = elem; + if (((own ^ oth) & 0x03FFF000) == 0) { + uint4 elem; + elem.s0 = own; + elem.s1 = ownEl; + + elem.s2 = oth; + elem.s3 = scratch1[othPos]; + + uint4 naddr; + + naddr.s01 = decompress2(elem.s1 & 0x3FFFFFF); // Unpack R4 / 6 + naddr.s23 = decompress2(elem.s3 & 0x3FFFFFF); + + elem.s0 = elem.s0 >> 26; + elem.s1 = elem.s1 >> 26; + elem.s0 = elem.s0 << 6; + elem.s0 |= elem.s1; + + naddr.s0 += 8688*elem.s0; + naddr.s1 += 8688*elem.s0; + + elem.s2 = elem.s2 >> 26; + elem.s3 = elem.s3 >> 26; + elem.s2 = elem.s2 << 6; + elem.s2 |= elem.s3; + + naddr.s2 += 8688*elem.s2; + naddr.s3 += 8688*elem.s2; + + bool ok = true; + + ok = ok && (naddr.s0 != naddr.s1) && (naddr.s0 != naddr.s2) && (naddr.s0 != naddr.s3); + ok = ok && (naddr.s1 != naddr.s2) && (naddr.s1 != naddr.s3) && (naddr.s2 != naddr.s3); + + if (ok) { + addr = atomic_inc(&outCounter[oScrT.s0 & 0xFFF]); + if (addr < 4096) { + output[addr] = naddr; + } + } + } + } else { + own = elem; + ownPos += 256; + if (((own & 0xFFF) != 0xFFF) && (ownPos < iCNT[1])) { + ownEl = scratch1[ownPos]; + } + } + + othPos = elem & 0xFFF; + } +} + + +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void combine ( + __global uint8 * buffer0, + __global uint4 * buffer1, + __global uint4 * buffer2, + __global uint * counters, + __global uint4 * output0, + const uint extra, + const ulong8 hashState, + const ulong nonce) { + + uint gId = get_group_id(0); + uint lId = get_local_id(0); + + __global uint * outCounters = (__global uint*) &output0[0]; + + __global uint * sideLoadR13 = (__global uint *) &buffer1[37748736]; + + __global uint2 * R46Out = (__global uint2 *) &buffer1[48496640]; + __global uint4 * R57Out = &buffer1[67305472]; + + __local uint scratch[256]; + __local uint ok[1]; + + uint2 tmps; + + #ifdef PRINT + if (get_global_id(0) == 0) { + printf("R5: %d \n", counters[114688]); + } + #endif + + if (gId < counters[114688]) { + if (lId == 0) { + uint4 tmp; + tmp = R57Out[gId]; + + scratch[128 + 4*lId+0] = tmp.s0; + scratch[128 + 4*lId+1] = tmp.s1; + scratch[128 + 4*lId+2] = tmp.s2; + scratch[128 + 4*lId+3] = tmp.s3; + } + + bool check = true; + if (lId == 0) ok[0] = 0; + barrier(CLK_LOCAL_MEM_FENCE); + + if (lId < 4) { // Unpack R3 / 5 + uint addr = scratch[128 + lId]; + if (addr < 37617664) { + uint4 tmp = buffer1[addr]; + + atomic_xor(&ok[0], tmp.s1 >> 18); + + tmp.s0 = tmp.s3 & 0xFFFF; + tmp.s1 = tmp.s3 >> 16; + + tmp.s0 += 8688*tmp.s2; + tmp.s1 += 8688*tmp.s2; + + scratch[2*lId] = tmp.s0; + scratch[2*lId+1] = tmp.s1; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + check = (ok[0] == 0); + barrier(CLK_LOCAL_MEM_FENCE); + + if ((lId < 8) && (check)) { // Unpack R2 / 4 + uint addr = scratch[lId]; + if (addr < 37617664) { + uint4 tmp = buffer2[addr]; + + tmp.s2 = tmp.s2 >> 14; + tmp.s01 = decompress(tmp.s32); + + scratch[128+2*lId] = tmp.s0; + scratch[128+2*lId+1] = tmp.s1; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if ((lId < 16) && (check)) { // Unpack R1 / 3 + uint addr = scratch[128+lId]; + if (addr < 42991616) { + uint2 tmp; + + tmp.s0 = sideLoadR13[addr]; + tmp.s1 = tmp.s0 & 0xFFFF; + tmp.s0 = tmp.s0 >> 16; + + tmp.s0 += 4592*(addr / 5248); + tmp.s1 += 4592*(addr / 5248); + + scratch[144+2*lId] = tmp.s0; + scratch[144+2*lId+1] = tmp.s1; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + + if ((lId < 32) && (check)) { // Unpack R2 + uint addr = scratch[144+lId]; + if (addr < 37617664) { + uint8 tmp; + + tmp = buffer0[addr]; + + atomic_xor(&ok[0], tmp.s4 & 0x3F); + + tmp.s0 = (tmp.s4 >> 12) << 7; + tmp.s1 = tmp.s6; + + tmp.s2 = (tmp.s5 >> 12) << 7; + tmp.s3 = tmp.s7; + + tmp.s0 |= (tmp.s1 >> 25); + tmp.s2 |= (tmp.s3 >> 25); + + tmp.s1 &= 0x1FFFFFF; + tmp.s3 &= 0x1FFFFFF; + + scratch[4*lId] = tmp.s0; + scratch[4*lId+1] = tmp.s1; + scratch[4*lId+2] = tmp.s2; + scratch[4*lId+3] = tmp.s3; + } + } + + + barrier(CLK_LOCAL_MEM_FENCE); + + if ((ok[0] == 0) && (check)) { + + scratch[128 + 2*lId] = 0xFFF; + scratch[128 + 2*lId+1] = 0xFFF; + + barrier(CLK_LOCAL_MEM_FENCE); + + uint2 listEntry ; + + uint elem = scratch[lId] & 0x3F; + listEntry.s0 = atomic_xchg(&scratch[128 + elem], lId); + + elem = scratch[64+lId] & 0x3F; + listEntry.s1 = atomic_xchg(&scratch[128 + elem], lId+64); + + + barrier(CLK_LOCAL_MEM_FENCE); + + + scratch[128 + lId] = listEntry.s0; + scratch[128 + 64 + lId] = listEntry.s1; + + + barrier(CLK_LOCAL_MEM_FENCE); + + + int next = scratch[128 + lId]; + while (next < 128) { + if (scratch[lId] == scratch[next]) { + atomic_inc(&ok[0]); + } + next = scratch[128 + next]; + } + + next = scratch[128 + 64 + lId]; + while (next < 128) { + if (scratch[64 + lId] == scratch[next]) { + atomic_inc(&ok[0]); + } + next = scratch[128 + next]; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if ((ok[0] == 0) && (check)) { + uint addr; + if (lId == 0) addr = atomic_inc(&outCounters[0]); + + if (lId < 64) { + uint2 elem; + elem.s0 = scratch[2*lId]; + elem.s1 = scratch[2*lId+1]; + + if (elem.s0 > elem.s1) elem.s01 = elem.s10; + + scratch[128+2*lId] = elem.s0; + scratch[128+2*lId+1] = elem.s1; // Elements sorted by 2 Elem + } + + barrier(CLK_LOCAL_MEM_FENCE); + + uint2 tmp2; + + if (lId < 64) { + tmp2.s0 = lId >> 1; + tmp2.s1 = (scratch[128+4*tmp2.s0+0] > scratch[128+4*tmp2.s0+2]) ? (lId ^ 0x1) : lId; + + scratch[2*lId] = scratch[128+2*tmp2.s1]; + scratch[2*lId+1] = scratch[128+2*tmp2.s1+1]; // Elements sorted by 4 Elem + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (lId < 64) { + tmp2.s0 = lId >> 2; + tmp2.s1 = (scratch[8*tmp2.s0+0] > scratch[8*tmp2.s0+4]) ? (lId ^ 0x2) : lId; + + scratch[128+2*lId+0] = scratch[2*tmp2.s1+0]; // Elements sorted by 8 Elem + scratch[128+2*lId+1] = scratch[2*tmp2.s1+1]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (lId < 64) { + tmp2.s0 = lId >> 3; + tmp2.s1 = (scratch[128+16*tmp2.s0+0] > scratch[128+16*tmp2.s0+8]) ? (lId ^ 0x4) : lId; + + scratch[2*lId+0] = scratch[128+2*tmp2.s1+0]; // Elements sorted by 16 Elem + scratch[2*lId+1] = scratch[128+2*tmp2.s1+1]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (lId < 64) { + tmp2.s0 = lId >> 4; + tmp2.s1 = (scratch[32*tmp2.s0+0] > scratch[32*tmp2.s0+16]) ? (lId ^ 0x8) : lId; + + scratch[128+2*lId+0] = scratch[2*tmp2.s1+0]; // Elements sorted by 32 Elem + scratch[128+2*lId+1] = scratch[2*tmp2.s1+1]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (lId < 64) { + tmp2.s0 = lId >> 5; + tmp2.s1 = (scratch[128+64*tmp2.s0+0] > scratch[128+64*tmp2.s0+32]) ? (lId ^ 0x10) : lId; + + scratch[2*lId+0] = scratch[128+2*tmp2.s1+0]; // Elements sorted by 64 Elem + scratch[2*lId+1] = scratch[128+2*tmp2.s1+1]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (lId < 64) { + tmp2.s0 = lId >> 6; + tmp2.s1 = (scratch[128*tmp2.s0+0] > scratch[128*tmp2.s0+64]) ? (lId ^ 0x20) : lId; + + scratch[128+2*lId+0] = scratch[2*tmp2.s1+0]; // Elements sorted by 128 Elem + scratch[128+2*lId+1] = scratch[2*tmp2.s1+1]; + } + + barrier(CLK_LOCAL_MEM_FENCE); // All Elements sorted + + if (lId == 0) scratch[0] = addr; + + barrier(CLK_LOCAL_MEM_FENCE); + + addr = scratch[0]; + + if ((addr < 16) && (lId < 32)) { + uint4 tmp; + tmp.s0 = scratch[128+4*lId]; + tmp.s1 = scratch[128+4*lId+1]; + tmp.s2 = scratch[128+4*lId+2]; + tmp.s3 = scratch[128+4*lId+3]; + + output0[1 + 32*addr + lId] = tmp; + } + + } + } + + //if (get_global_id(0) == 0) outCounters[0] = 2; + +} + diff --git a/mine.example.toml b/mine.example.toml index b9a4e2d..15c76a8 100644 --- a/mine.example.toml +++ b/mine.example.toml @@ -23,9 +23,11 @@ user = "t1YourZClassicAddressHere.rig1" # payout address / worker login # no-jackpot = true # PPLNS # ── GPU backend ─────────────────────────────────────────────────────────────── -# backend = "cuda" # "cuda" or "opencl" -# devices = "all" # "all", or a comma list e.g. "0,1" -# force-opencl = false # force OpenCL, disabling CUDA +# backend = "mixed" # "mixed" (default: NVIDIA→CUDA, AMD/Intel→OpenCL), +# # "opencl" (every card via OpenCL), or "cuda" +# devices = "all" # "all", or a comma list e.g. "0,1" — in mixed mode +# # these index the combined list from --list-devices +# force-opencl = false # force every card onto OpenCL, disabling CUDA # ── GPU tuning (clock/power changes need root) ──────────────────────────────── # no-gpu-tune = false diff --git a/src/blake.rs b/src/blake.rs index 1d6b2cd..798f250 100644 --- a/src/blake.rs +++ b/src/blake.rs @@ -166,6 +166,18 @@ impl BatchHasher { Self { mid, tail } } + /// The BLAKE2b chaining state after compressing the shared first 128-byte + /// header block (the eight 64-bit midstate words). This is exactly the + /// `hashState` (`ulong8`) the OpenCL `equihash192_7.cl` round-0 kernel + /// consumes: it injects the final 16-byte block (message word `m[1] = + /// (index << 32) | nonce_low`, `m[0] = 0`) itself, which requires the + /// header's bytes [128..136] to be zero (the same `cuda_compatible` rule the + /// CUDA backend relies on). Only used by the OpenCL (AMD) backend. + #[cfg_attr(not(feature = "gpu"), allow(dead_code))] + pub fn midstate(&self) -> [u64; 8] { + self.mid + } + /// Assemble the zero-padded final block for index `g`. #[inline] fn final_block(&self, g: u32) -> [u8; 128] { diff --git a/src/cuda.rs b/src/cuda.rs index 3d3d8b6..4f6eaaa 100644 --- a/src/cuda.rs +++ b/src/cuda.rs @@ -108,7 +108,16 @@ const CU_LAUNCH_PARAM_END: usize = 0x00; const CU_LAUNCH_PARAM_BUFFER_POINTER: usize = 0x01; const CU_LAUNCH_PARAM_BUFFER_SIZE: usize = 0x02; -extern "C" { +// The CUDA driver API, loaded at runtime via dlopen (see `crate::dylib`) rather +// than linked at build time: the SONAME `libcuda.so.1` ships with the NVIDIA +// driver (`nvcuda.dll` on Windows) and is absent on driver-less / AMD-only +// hosts. `cuda_lib()` returns `None` when it can't be opened; the public entry +// points below turn that into a clear error / empty device list, so the binary +// still builds and starts everywhere. +crate::dylib::dynamic_library! { + lib_struct: CudaLib, + loader: cuda_lib, + names: ["libcuda.so.1", "libcuda.so", "nvcuda.dll"], fn cuInit(flags: c_uint) -> CUresult; fn cuDeviceGetCount(count: *mut c_int) -> CUresult; fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult; @@ -148,6 +157,11 @@ extern "C" { fn cuGetErrorName(error: CUresult, str: *mut *const c_char) -> CUresult; } +/// Error returned when the CUDA driver library isn't present on the host. +fn cuda_unavailable() -> anyhow::Error { + anyhow!("CUDA driver library (libcuda.so.1) not found — is the NVIDIA driver installed?") +} + /// Turn a non-success `CUresult` into an error with the driver's symbolic name. fn check(code: CUresult, what: &str) -> Result<()> { if code == CUDA_SUCCESS { @@ -164,8 +178,10 @@ fn check(code: CUresult, what: &str) -> Result<()> { Err(anyhow!("{what} failed: {name}")) } -/// Number of CUDA devices (initialises the driver as a side effect). +/// Number of CUDA devices (initialises the driver as a side effect). Returns an +/// error if the CUDA driver library isn't installed. pub fn device_count() -> Result { + cuda_lib().ok_or_else(cuda_unavailable)?; unsafe { check(cuInit(0), "cuInit")?; let mut n: c_int = 0; @@ -579,6 +595,7 @@ impl CudaSolver { /// fatbin, select the config that fits free VRAM, allocate its buffers, and /// rebase the recorded launch sequence. pub fn new(device_index: usize) -> Result { + cuda_lib().ok_or_else(cuda_unavailable)?; unsafe { check(cuInit(0), "cuInit")?; let mut dev: CUdevice = 0; diff --git a/src/dylib.rs b/src/dylib.rs new file mode 100644 index 0000000..0e5b062 --- /dev/null +++ b/src/dylib.rs @@ -0,0 +1,85 @@ +//! Tiny runtime dynamic-library loader for the optional GPU vendor libraries. +//! +//! The CUDA driver (`libcuda`) and NVML (`libnvidia-ml`) are vendor components +//! that ship with the NVIDIA driver — they are not installable as ordinary +//! build dependencies and are absent on AMD-only / driver-less hosts. Linking +//! them at build time would (a) make the build fail without the NVIDIA libs +//! present and (b) make the resulting binary refuse to start anywhere they are +//! missing. Instead we `dlopen` them on first use: the binary has no build-time +//! or load-time dependency on them, and the CUDA backend simply reports "no +//! devices" when the driver isn't installed. +//! +//! [`dynamic_library!`] generates, for one such library, a function-pointer +//! table plus same-named wrapper `fn`s, so the call sites in [`crate::cuda`] / +//! [`crate::nvml`] are unchanged — only the `extern "C"` block is replaced. + +/// Open the first of `names` that loads (e.g. the versioned SONAME first, then +/// the unversioned dev symlink). Returns the last error if none load. +pub fn load_first(names: &[&str]) -> Result { + let mut last_err = None; + for name in names { + match unsafe { libloading::Library::new(name) } { + Ok(lib) => return Ok(lib), + Err(e) => last_err = Some(e), + } + } + Err(last_err.expect("load_first called with an empty name list")) +} + +/// Generate a runtime-loaded binding for one shared library. +/// +/// Produces a hidden fn-pointer struct, a `OnceLock`-cached loader (`$loader()` +/// returns `Option<&'static _>`, `None` when the library can't be loaded), and a +/// same-named `unsafe fn` wrapper for each declared function that dispatches +/// through the table. Public entry points must check `$loader().is_some()` (or +/// `?` on the `Option`) before invoking any wrapper; the wrappers themselves +/// panic if called with the library unloaded, which the entry-point guards +/// prevent. +macro_rules! dynamic_library { + ( + lib_struct: $Lib:ident, + loader: $loader:ident, + names: [$($lname:expr),+ $(,)?], + $( fn $fname:ident($($an:ident: $at:ty),* $(,)?) -> $ret:ty; )* + ) => { + #[allow(non_snake_case)] + struct $Lib { + $( $fname: unsafe extern "C" fn($($at),*) -> $ret, )* + // Keep the library mapped for the process lifetime; the fn pointers + // above point into it. + #[allow(dead_code)] + handle: libloading::Library, + } + + impl $Lib { + #[allow(non_snake_case)] + unsafe fn load() -> std::result::Result { + let handle = $crate::dylib::load_first(&[$($lname),+])?; + $( + let $fname: unsafe extern "C" fn($($at),*) -> $ret = + *handle.get(concat!(stringify!($fname), "\0").as_bytes())?; + )* + Ok(Self { $($fname,)* handle }) + } + } + + static __DYLIB: std::sync::OnceLock> = std::sync::OnceLock::new(); + + /// The loaded library, or `None` if it could not be opened. + fn $loader() -> Option<&'static $Lib> { + __DYLIB.get_or_init(|| unsafe { $Lib::load().ok() }).as_ref() + } + + $( + #[inline] + #[allow(non_snake_case)] + unsafe fn $fname($($an: $at),*) -> $ret { + ($loader() + .expect(concat!(stringify!($fname), ": ", stringify!($Lib), " not loaded")) + .$fname)($($an),*) + } + )* + }; +} + +pub(crate) use dynamic_library; diff --git a/src/gpu.rs b/src/gpu.rs index 84a36f1..4134544 100644 --- a/src/gpu.rs +++ b/src/gpu.rs @@ -151,8 +151,9 @@ fn kernel_source(geom: &Geom) -> String { ) } -/// A persistent OpenCL solver bound to one device. -pub struct GpuSolver { +/// The default (project-native) OpenCL solver, bound to one device. Wrapped by +/// [`GpuSolver`], which selects it for non-AMD devices. +struct LegacySolver { pq: ProQue, header: Buffer, /// Per-table back-reference arrays (1 u32/slot), kept resident for recovery. @@ -167,15 +168,14 @@ pub struct GpuSolver { nr_rows: usize, } -impl GpuSolver { +impl LegacySolver { /// This device's product name (e.g. "NVIDIA GeForce RTX 5080"), if available. pub fn device_name(&self) -> Option { self.pq.device().name().ok() } /// Initialise the solver and allocate all device buffers. - pub fn new(device_index: usize) -> Result { - let (platform, device) = pick_device(device_index)?; + pub fn new(platform: ocl::Platform, device: ocl::Device) -> Result { let geom = pick_geom(&device); // The device's platform must be set explicitly: ProQue otherwise builds // the context against `Platform::default()` (the first platform), which @@ -406,6 +406,101 @@ impl GpuSolver { } } +/// OpenCL solver for one device. Dispatches to the AMD-tuned kernel +/// (`equihash192_7.cl`) on AMD-vendor devices and the default project kernel +/// (`equihash.cl`) everywhere else. Forceable with `ZCL_OPENCL_KERNEL=amd|legacy`. +pub struct GpuSolver { + inner: SolverInner, +} + +enum SolverInner { + Legacy(LegacySolver), + Amd(crate::gpu_amd::AmdSolver), +} + +impl GpuSolver { + /// Initialise the solver for a flat device index, choosing the kernel by + /// device vendor (AMD → `equihash192_7.cl`). + pub fn new(device_index: usize) -> Result { + let (platform, device) = pick_device(device_index)?; + let inner = if use_amd_kernel(&device) { + log::info!("OpenCL: AMD device — using the equihash192_7 kernel"); + SolverInner::Amd(crate::gpu_amd::AmdSolver::new(platform, device)?) + } else { + SolverInner::Legacy(LegacySolver::new(platform, device)?) + }; + Ok(Self { inner }) + } + + /// This device's product name, if available. + pub fn device_name(&self) -> Option { + match &self.inner { + SolverInner::Legacy(s) => s.device_name(), + SolverInner::Amd(s) => s.device_name(), + } + } + + /// Solve the puzzle for `header` (140 bytes). + pub fn solve(&self, header: &[u8]) -> Result>> { + match &self.inner { + SolverInner::Legacy(s) => s.solve(header), + SolverInner::Amd(s) => s.solve(header), + } + } + + /// Solve and also return the raw GPU candidate count (for diagnostics). + pub fn solve_with_stats(&self, header: &[u8]) -> Result<(usize, Vec>)> { + match &self.inner { + SolverInner::Legacy(s) => s.solve_with_stats(header), + SolverInner::Amd(s) => s.solve_with_stats(header), + } + } + + /// Time each GPU stage individually. + pub fn profile(&self, header: &[u8]) -> Result<()> { + match &self.inner { + SolverInner::Legacy(s) => s.profile(header), + SolverInner::Amd(s) => s.profile(header), + } + } + + /// Whether the per-index BLAKE2b probe ([`Self::hash_all`]) is available. + /// Only the default kernel exposes a linear digest layout; the AMD kernel + /// buckets in round 0, so the self-test skips the probe there. + pub fn supports_blake_probe(&self) -> bool { + matches!(self.inner, SolverInner::Legacy(_)) + } + + /// Compute every first-round BLAKE2b output (default kernel only). + pub fn hash_all(&self, header: &[u8]) -> Result> { + match &self.inner { + SolverInner::Legacy(s) => s.hash_all(header), + SolverInner::Amd(_) => { + Err(anyhow!("hash_all is not supported by the AMD kernel")) + } + } + } +} + +/// Decide whether to drive `device` with the AMD `equihash192_7.cl` kernel. +/// `ZCL_OPENCL_KERNEL` forces the choice (`amd` or `legacy`); otherwise it's by +/// device vendor. +fn use_amd_kernel(device: &ocl::Device) -> bool { + use ocl::enums::{DeviceInfo, DeviceInfoResult}; + match std::env::var("ZCL_OPENCL_KERNEL").ok().as_deref() { + Some(v) if v.eq_ignore_ascii_case("amd") => return true, + Some(v) if v.eq_ignore_ascii_case("legacy") => return false, + _ => {} + } + match device.info(DeviceInfo::Vendor) { + Ok(DeviceInfoResult::Vendor(v)) => { + let v = v.to_ascii_lowercase(); + v.contains("advanced micro devices") || v.contains("amd") + } + _ => false, + } +} + /// List `(platform, device)` names so the user can choose `--device`. pub fn list_devices() -> Result> { use ocl::{Device, Platform}; @@ -422,6 +517,25 @@ pub fn list_devices() -> Result> { Ok(names) } +/// For each OpenCL device — in the same flat order as [`list_devices`] and +/// `--devices` — whether its vendor is NVIDIA. The mixed backend uses this to +/// hand NVIDIA cards to CUDA (and mine only the non-NVIDIA OpenCL devices). +pub fn device_is_nvidia() -> Vec { + use ocl::enums::{DeviceInfo, DeviceInfoResult}; + use ocl::{Device, Platform}; + let mut out = Vec::new(); + for platform in Platform::list() { + for device in Device::list_all(platform).unwrap_or_default() { + let is_nv = matches!( + device.info(DeviceInfo::Vendor), + Ok(DeviceInfoResult::Vendor(v)) if v.to_ascii_lowercase().contains("nvidia") + ); + out.push(is_nv); + } + } + out +} + /// The flat OpenCL device index of the first CPU-type device (e.g. PoCL), if any. /// Lets CPU mining run through the OpenCL backend on the CPU. The index matches /// [`list_devices`] / `--devices`. diff --git a/src/gpu_amd.rs b/src/gpu_amd.rs new file mode 100644 index 0000000..f1c7c90 --- /dev/null +++ b/src/gpu_amd.rs @@ -0,0 +1,286 @@ +//! AMD OpenCL Equihash 192,7 solver (`kernels/equihash192_7.cl`). +//! +//! A second OpenCL backend, selected for AMD-vendor devices by +//! [`crate::gpu::GpuSolver`]. Where the default [`crate::gpu`] driver runs the +//! project's own `gen`/`round_collide`/`recover` kernel, this one drives a +//! self-contained, GCN-tuned kernel with a fixed table geometry and a different +//! host ABI: a `clearCounter` → `blake` → `round1..round7` → `combine` +//! pipeline. +//! +//! ## Geometry (hard-coded in the kernel, mirrored here) +//! +//! 2^25 initial entries are bucketed into `NR_ROWS = 8192` rows. Round 0 and the +//! early rounds keep `SLOTS_R0 = 4592` `uint8` slots per row (`buffer0`/ +//! `buffer1` ping-pong, ~1.2 GB each); rounds 4–5 widen to `SLOTS_R45 = 8688` +//! `uint4` slots in `buffer2` (~1.1 GB). `buffer1` is additionally reinterpreted +//! as `uint4`/`uint2` for the late-round R46/R57 outputs at fixed offsets. A +//! flat counter array of 8 banks × 16384 tracks per-row occupancy; the +//! round-7/R5 survivor count lives at index `R5_COUNTER_IDX = 114688` and sizes +//! the `combine` launch. +//! +//! ## Hashing ABI +//! +//! `blake` takes the BLAKE2b first-block midstate as a by-value `ulong8` +//! (`hashState`, from [`BatchHasher::midstate`]) plus a `nonce` whose low 32 +//! bits become message word `m[1]`'s low half (= header bytes [136..140]); the +//! kernel hard-codes `m[0] = 0`, so the header's bytes [128..136] must be zero +//! (the same `cuda_compatible` rule the CUDA backend uses). Each work item hashes +//! index `tId`, emitting the two leaf entries `2*tId` and `2*tId+1` — exactly the +//! canonical leaf-index/sub-block split [`crate::equihash`] verifies against. +//! +//! `combine` writes recovered solutions to the `output0`/`res` buffer: +//! `output0[0].s0` is the solution count and each solution is 32 `uint4` +//! (`SOLUTION_INDICES = 128` pre-sorted 25-bit leaf indices) at +//! `output0[1 + 32*i ..]`. Those flatten straight into +//! [`equihash::filter_candidates`], which canonicalises, verifies and +//! de-duplicates them — the same contract as the default driver. + +use anyhow::{anyhow, Result}; +use ocl::prm::Ulong8; +use ocl::{Buffer, MemFlags, ProQue}; + +use crate::blake::BatchHasher; +use crate::equihash; +use crate::params::{BLAKE_CALLS, HEADER_LEN, SOLUTION_INDICES}; + +/// Buckets ("rows") the 2^25 entries are hashed into (kernel: `& 0x1FFF`). +const NR_ROWS: usize = 8192; +/// `uint8` slots per row in the round-0/early `buffer0`/`buffer1` tables. +const SLOTS_R0: usize = 4592; +/// `uint4` slots per row in the round-4/5 `buffer2` table. +const SLOTS_R45: usize = 8688; +/// `uint8` entries in `buffer0`/`buffer1` (the kernel's `37617664` bound). +const BUF01_U8ENTRIES: usize = NR_ROWS * SLOTS_R0; +/// `uint4` entries in `buffer2`. +const BUF2_U4ENTRIES: usize = NR_ROWS * SLOTS_R45; +/// Flat counter array: 8 banks × 16384 (one bank consumed per round). +const COUNTERS_U32: usize = 8 * 16384; +/// Counter index holding the round-7 / R5 survivor count (sizes `combine`). +const R5_COUNTER_IDX: usize = 114688; +/// `combine` only emits solutions for `addr < this` (matches the kernel cap); +/// far above the ~2 solutions a 192,7 nonce yields. Reads are capped to match. +const MAX_WRITTEN_SOLS: usize = 16; +/// Solution buffer capacity in solutions (`output0` = 1 + 32*cap `uint4`). +const SOL_CAP: usize = 16; +/// `reqd_work_group_size` of `blake`/`combine`. +const WG_BLAKE: usize = 64; +/// `reqd_work_group_size` of `round1..round7`. +const WG_ROUND: usize = 256; +/// Input rows (buckets) each collision round reads. The table is keyed on 13 +/// bits (8192) through round 4, then narrows to 12 bits (4096) — a round that +/// reads `b` rows must launch exactly `b * 4` work-groups (kernel: +/// `bucket = grp >> 2`), or it processes uninitialised rows and explodes. +const ROUND_BUCKETS: [usize; 7] = [8192, 8192, 8192, 8192, 4096, 4096, 4096]; +/// Extra rows of slack appended to each big table. The kernel's per-row +/// `atomic_inc` writes are uncapped, so a row that overflows its slot count +/// spills into the next row and the top row spills past the nominal table end; +/// this slack absorbs that (mean occupancy ~4096 sits well under the 4592/8688 +/// slot counts, so realistic overflow is a few rows at most). +const ROW_SLACK: usize = 64; + +const KERNEL_SRC: &str = include_str!("../kernels/equihash192_7.cl"); + +/// A persistent AMD OpenCL solver bound to one device. +pub struct AmdSolver { + pq: ProQue, + /// Round-0/early ping-pong tables (`uint8`), reinterpreted at narrower + /// widths in late rounds. + buffer0: Buffer, + buffer1: Buffer, + /// Round-4/5 wide table (`uint4`). + buffer2: Buffer, + /// Per-row occupancy counters (8 banks). + counters: Buffer, + /// `res` / `output0`: `[count, then 32 uint4 per solution]`. + sols: Buffer, +} + +unsafe impl Send for AmdSolver {} + +impl AmdSolver { + /// This device's product name, if available. + pub fn device_name(&self) -> Option { + self.pq.device().name().ok() + } + + /// Build the solver on `(platform, device)` and allocate all device buffers + /// (~3.5 GB total). + pub fn new(platform: ocl::Platform, device: ocl::Device) -> Result { + let pq = ProQue::builder() + .src(KERNEL_SRC) + .platform(platform) + .device(device) + .dims(1) // placeholder; every launch sets its own work size + .build() + .map_err(|e| anyhow!("AMD OpenCL build failed: {e}"))?; + + let alloc = |len: usize| -> Result> { + Ok(Buffer::::builder() + .queue(pq.queue().clone()) + .flags(MemFlags::new().read_write()) + .len(len) + .build()?) + }; + // uint8 entries → 8 u32 each; uint4 entries → 4 u32 each. Each table gets + // ROW_SLACK extra rows of write headroom (see ROW_SLACK). buffer1's + // nominal uint8 size (75.2M uint4) already covers the late-round R46/R57 + // regions at fixed offsets 48496640 / 67305472. + let buffer0 = alloc((BUF01_U8ENTRIES + ROW_SLACK * SLOTS_R0) * 8)?; + let buffer1 = alloc((BUF01_U8ENTRIES + ROW_SLACK * SLOTS_R0) * 8)?; + let buffer2 = alloc((BUF2_U4ENTRIES + ROW_SLACK * SLOTS_R45) * 4)?; + let counters = alloc(COUNTERS_U32)?; + let sols = alloc((1 + 32 * SOL_CAP) * 4)?; + + Ok(Self { pq, buffer0, buffer1, buffer2, counters, sols }) + } + + /// Set the eight by-value args shared by every kernel in the pipeline: + /// `(buffer0, buffer1, buffer2, counters, res, extra, hashState, nonce)`. + fn build_kernel(&self, name: &str, hash_state: Ulong8, nonce: u64) -> Result { + Ok(self + .pq + .kernel_builder(name) + .arg(&self.buffer0) + .arg(&self.buffer1) + .arg(&self.buffer2) + .arg(&self.counters) + .arg(&self.sols) + .arg(0u32) // extra (unused by the pipeline) + .arg(hash_state) + .arg(nonce) + .build()?) + } + + /// Run the full pipeline for `header` and return the flat recovered leaf + /// indices (`n * SOLUTION_INDICES`), ready for [`equihash::filter_candidates`]. + fn run_pipeline(&self, header: &[u8]) -> Result> { + let mid = BatchHasher::new(header).midstate(); + let hash_state = Ulong8::new( + mid[0], mid[1], mid[2], mid[3], mid[4], mid[5], mid[6], mid[7], + ); + // Kernel's gId = nonce & 0xFFFFFFFF = message word m[1] low = header[136..140]. + let nonce = u32::from_le_bytes(header[136..140].try_into().unwrap()) as u64; + + // Clear counters + solution header (global = counter uint4 count). + let clear = self.build_kernel("clearCounter", hash_state, nonce)?; + unsafe { + clear.cmd().global_work_size(COUNTERS_U32 / 4).enq()?; + } + + // Round 0: BLAKE2b + bucket. One work item per blake call (2^24); each + // emits two leaf entries. + let blake = self.build_kernel("blake", hash_state, nonce)?; + unsafe { + blake + .cmd() + .global_work_size(BLAKE_CALLS) + .local_work_size(WG_BLAKE) + .enq()?; + } + + // Collision rounds 1..7 (4 groups per input row, 256 work items each). + for r in 1..=7 { + let k = self.build_kernel(&format!("round{r}"), hash_state, nonce)?; + unsafe { + k.cmd() + .global_work_size(ROUND_BUCKETS[r - 1] * 4 * WG_ROUND) + .local_work_size(WG_ROUND) + .enq()?; + } + } + + // Size `combine` from the round-7 survivor count (one group per candidate). + let mut r5 = [0u32; 1]; + self.counters + .read(&mut r5[..]) + .offset(R5_COUNTER_IDX) + .enq()?; + let groups = r5[0] as usize; + if groups == 0 { + return Ok(Vec::new()); + } + let combine = self.build_kernel("combine", hash_state, nonce)?; + unsafe { + combine + .cmd() + .global_work_size(groups * WG_BLAKE) + .local_work_size(WG_BLAKE) + .enq()?; + } + + // output0[0].s0 = solution count; each solution is 128 u32 (32 uint4) + // starting at uint4 index 1 (= u32 offset 4). + let mut head = [0u32; 1]; + self.sols.read(&mut head[..]).enq()?; + let nsols = (head[0] as usize).min(MAX_WRITTEN_SOLS); + if nsols == 0 { + return Ok(Vec::new()); + } + let mut data = vec![0u32; nsols * SOLUTION_INDICES]; + self.sols.read(&mut data[..]).offset(4).enq()?; + Ok(data) + } + + /// Solve for `header` (140 bytes): returns valid, canonical, de-duplicated + /// solutions as leaf-index lists. + pub fn solve(&self, header: &[u8]) -> Result>> { + assert_eq!(header.len(), HEADER_LEN); + let base = crate::blake::base_state(header); + let out = self.run_pipeline(header)?; + Ok(equihash::filter_candidates(&base, &out)) + } + + /// Solve and also return the raw recovered-candidate count (for diagnostics). + pub fn solve_with_stats(&self, header: &[u8]) -> Result<(usize, Vec>)> { + assert_eq!(header.len(), HEADER_LEN); + let base = crate::blake::base_state(header); + let out = self.run_pipeline(header)?; + let raw = out.len() / SOLUTION_INDICES; + Ok((raw, equihash::filter_candidates(&base, &out))) + } + + /// Time each pipeline stage individually (forces a sync between stages). + pub fn profile(&self, header: &[u8]) -> Result<()> { + use log::info; + use std::time::Instant; + + let mid = BatchHasher::new(header).midstate(); + let hash_state = Ulong8::new( + mid[0], mid[1], mid[2], mid[3], mid[4], mid[5], mid[6], mid[7], + ); + let nonce = u32::from_le_bytes(header[136..140].try_into().unwrap()) as u64; + let q = self.pq.queue(); + let stage = |label: &str, t: Instant| -> Result<()> { + q.finish().map_err(|e| anyhow!("{label} failed: {e}"))?; + info!(" {label:14} {:6.1} ms", t.elapsed().as_secs_f64() * 1000.0); + Ok(()) + }; + + let t = Instant::now(); + let clear = self.build_kernel("clearCounter", hash_state, nonce)?; + unsafe { + clear.cmd().global_work_size(COUNTERS_U32 / 4).enq()?; + } + stage("clear", t)?; + + let t = Instant::now(); + let blake = self.build_kernel("blake", hash_state, nonce)?; + unsafe { + blake.cmd().global_work_size(BLAKE_CALLS).local_work_size(WG_BLAKE).enq()?; + } + stage("blake", t)?; + + for r in 1..=7 { + let t = Instant::now(); + let k = self.build_kernel(&format!("round{r}"), hash_state, nonce)?; + unsafe { + k.cmd() + .global_work_size(ROUND_BUCKETS[r - 1] * 4 * WG_ROUND) + .local_work_size(WG_ROUND) + .enq()?; + } + stage(&format!("round {r}"), t)?; + } + Ok(()) + } +} diff --git a/src/gpu_probe.rs b/src/gpu_probe.rs index 54a5a48..697ed59 100644 --- a/src/gpu_probe.rs +++ b/src/gpu_probe.rs @@ -1,11 +1,12 @@ //! GPU device probing for the config tool (`jackpotminer-config` only — this is //! not compiled into the miner, so there is no duplicate FFI). //! -//! With the `gpu`/`cuda` features the OpenCL/CUDA SDKs are linked in (build.rs -//! links `cuda`/`nvml`; the `ocl` crate links `OpenCL`), and the tool enumerates -//! devices directly — handy on Windows where you may not want to shell out to -//! the miner. Without those features the functions return empty lists and the -//! tool falls back to spawning `jackpotminer --devices-json`. +//! With the `gpu`/`cuda` features the tool enumerates devices directly — handy +//! on Windows where you may not want to shell out to the miner. OpenCL goes +//! through the `ocl` crate; CUDA is `dlopen`'d at runtime (so this binary, like +//! the miner, has no build- or load-time dependency on libcuda). Without those +//! features the functions return empty lists and the tool falls back to spawning +//! `jackpotminer --devices-json`. /// True when at least one GPU SDK is compiled in, so direct probing works. pub const HAS_SDK: bool = cfg!(feature = "gpu") || cfg!(feature = "cuda"); @@ -34,34 +35,57 @@ pub fn opencl() -> Vec { } /// CUDA devices as `"[i] "` via the driver API (empty without the SDK, no -/// driver, or any error). Uses a tiny self-contained FFI subset. +/// driver, or any error). The CUDA driver is `dlopen`'d at runtime — a tiny +/// self-contained subset of the FFI in `src/cuda.rs` — so this binary needs no +/// link- or load-time libcuda. #[cfg(feature = "cuda")] pub fn cuda() -> Vec { use std::ffi::CStr; use std::os::raw::{c_char, c_int, c_uint}; - // Linked via build.rs (`cuda`), matching src/cuda.rs's declarations. - extern "C" { - fn cuInit(flags: c_uint) -> c_int; - fn cuDeviceGetCount(count: *mut c_int) -> c_int; - fn cuDeviceGet(device: *mut c_int, ordinal: c_int) -> c_int; - fn cuDeviceGetName(name: *mut c_char, len: c_int, dev: c_int) -> c_int; - } + type CuInit = unsafe extern "C" fn(c_uint) -> c_int; + type CuCount = unsafe extern "C" fn(*mut c_int) -> c_int; + type CuGet = unsafe extern "C" fn(*mut c_int, c_int) -> c_int; + type CuName = unsafe extern "C" fn(*mut c_char, c_int, c_int) -> c_int; let mut out = Vec::new(); unsafe { - if cuInit(0) != 0 { + // libcuda.so.1 ships with the NVIDIA driver; absent on AMD-only hosts. + let lib = match ["libcuda.so.1", "libcuda.so", "nvcuda.dll"] + .iter() + .find_map(|n| libloading::Library::new(n).ok()) + { + Some(l) => l, + None => return out, + }; + let sym = |name: &[u8]| -> Option<*mut std::ffi::c_void> { + lib.get::<*mut std::ffi::c_void>(name).ok().map(|s| *s) + }; + let (Some(init), Some(count), Some(get), Some(getname)) = ( + sym(b"cuInit\0"), + sym(b"cuDeviceGetCount\0"), + sym(b"cuDeviceGet\0"), + sym(b"cuDeviceGetName\0"), + ) else { + return out; + }; + let cu_init: CuInit = std::mem::transmute(init); + let cu_count: CuCount = std::mem::transmute(count); + let cu_get: CuGet = std::mem::transmute(get); + let cu_name: CuName = std::mem::transmute(getname); + + if cu_init(0) != 0 { return out; } let mut n: c_int = 0; - if cuDeviceGetCount(&mut n) != 0 { + if cu_count(&mut n) != 0 { return out; } for i in 0..n { let mut dev: c_int = 0; - let name = if cuDeviceGet(&mut dev, i) == 0 { + let name = if cu_get(&mut dev, i) == 0 { let mut buf = [0i8; 128]; - if cuDeviceGetName(buf.as_mut_ptr() as *mut c_char, 128, dev) == 0 { + if cu_name(buf.as_mut_ptr() as *mut c_char, 128, dev) == 0 { CStr::from_ptr(buf.as_ptr() as *const c_char).to_string_lossy().into_owned() } else { format!("CUDA device {i}") diff --git a/src/main.rs b/src/main.rs index 4c1be8c..9ed4ea9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -14,6 +14,14 @@ mod tui; #[cfg(feature = "gpu")] mod gpu; +// AMD-tuned OpenCL kernel driver (selected by GpuSolver for AMD-vendor devices). +#[cfg(feature = "gpu")] +mod gpu_amd; + +// Runtime dynamic-library loader (dlopen) for the CUDA driver + NVML. +#[cfg(feature = "cuda")] +mod dylib; + #[cfg(feature = "cuda")] mod cuda; @@ -79,8 +87,9 @@ struct Args { jackpot: Option, /// Pause mining if no new job arrives within this many seconds (stale work - /// guard); resumes automatically when fresh work arrives. 0 disables. - #[arg(long, value_name = "SECS", default_value_t = 300)] + /// guard); resumes automatically when fresh work arrives. Default 600 (10 + /// minutes). 0 disables. + #[arg(long, value_name = "SECS", default_value_t = 600)] job_timeout: u64, /// Open a local control server on 127.0.0.1: so the GUI config tool can @@ -139,8 +148,11 @@ struct Args { #[arg(long, default_value = "all")] devices: String, - /// GPU backend: "opencl" or "cuda" (for nvidia cards). - #[arg(long, default_value = "cuda")] + /// GPU backend: "mixed" (default — each card on its native backend: NVIDIA + /// on CUDA, AMD/Intel on OpenCL), "opencl" (every card via OpenCL), or + /// "cuda" (NVIDIA only). In mixed mode `--devices` indexes the combined list + /// shown by --list-devices. + #[arg(long, default_value = "mixed")] backend: String, /// Force the OpenCL backend, disabling CUDA (overrides --backend). @@ -610,6 +622,9 @@ fn main() -> Result<()> { /// Which GPU backend the user selected. enum BackendKind { Cpu, + /// Each physical card on its native backend (NVIDIA→CUDA, others→OpenCL). + #[cfg(any(feature = "gpu", feature = "cuda"))] + Mixed, #[cfg(feature = "gpu")] OpenCl, #[cfg(feature = "cuda")] @@ -633,6 +648,16 @@ fn backend_kind(args: &Args) -> Result { } } match args.backend.to_ascii_lowercase().as_str() { + "mixed" => { + // Each card on its native backend; falls back to whatever single GPU + // backend is compiled, or to CPU when none is. + #[cfg(any(feature = "gpu", feature = "cuda"))] + { + Ok(BackendKind::Mixed) + } + #[cfg(not(any(feature = "gpu", feature = "cuda")))] + Ok(BackendKind::Cpu) + } "cuda" => { #[cfg(feature = "cuda")] { @@ -649,7 +674,7 @@ fn backend_kind(args: &Args) -> Result { #[cfg(not(feature = "gpu"))] Ok(BackendKind::Cpu) } - other => Err(anyhow!("unknown --backend '{other}' (expected opencl or cuda)")), + other => Err(anyhow!("unknown --backend '{other}' (expected mixed, opencl, or cuda)")), } } @@ -707,6 +732,8 @@ fn backend_specs(args: &Args, gpu_devices: &[GpuDeviceCfg]) -> Result return mixed_specs(args), #[cfg(feature = "cuda")] BackendKind::Cuda => (cuda::device_count()?, true), #[cfg(feature = "gpu")] @@ -735,6 +762,71 @@ fn backend_specs(args: &Args, gpu_devices: &[GpuDeviceCfg]) -> Result Vec<(String, BackendSpec)> { + /// Drop a leading `"[] "` index prefix from a backend's device label, so + /// the mixed list shows its own single index instead of two. + fn strip_index(label: &str) -> &str { + label + .strip_prefix('[') + .and_then(|s| s.split_once("] ")) + .map(|(_, rest)| rest) + .unwrap_or(label) + } + + #[allow(unused_mut)] + let mut plan: Vec<(String, BackendSpec)> = Vec::new(); + + // NVIDIA cards via CUDA, when the backend is compiled and the driver loads. + #[cfg(feature = "cuda")] + let cuda_has_nvidia = { + let names = cuda::list_devices().unwrap_or_default(); + for (i, label) in names.iter().enumerate() { + plan.push((format!("{} (CUDA)", strip_index(label)), BackendSpec::Cuda(i))); + } + !names.is_empty() + }; + #[cfg(not(feature = "cuda"))] + let cuda_has_nvidia = false; + + // Remaining OpenCL cards via OpenCL; skip NVIDIA ones already on CUDA. + #[cfg(feature = "gpu")] + { + let names = gpu::list_devices().unwrap_or_default(); + let nvidia = gpu::device_is_nvidia(); + for (j, label) in names.iter().enumerate() { + if nvidia.get(j).copied().unwrap_or(false) && cuda_has_nvidia { + continue; + } + plan.push((format!("{} (OpenCL)", strip_index(label)), BackendSpec::Gpu(j))); + } + } + // `cuda_has_nvidia` is only consumed by the OpenCL branch above. + #[cfg(not(feature = "gpu"))] + let _ = cuda_has_nvidia; + + plan +} + +/// Build the worker list for `--backend mixed`: each card on its native backend. +/// `--devices` selects into [`mixed_plan`]'s unified list. +#[cfg(any(feature = "gpu", feature = "cuda"))] +fn mixed_specs(args: &Args) -> Result> { + let plan = mixed_plan(); + if plan.is_empty() { + return Err(anyhow!( + "no GPUs found for the mixed backend — none detected via CUDA or OpenCL" + )); + } + let selected = parse_devices(&args.devices, plan.len())?; + Ok(selected.into_iter().map(|i| plan[i].1).collect()) +} + /// Build a single GPU worker spec for `idx`, choosing CUDA or OpenCL, erroring if /// the requested backend wasn't compiled in. #[cfg(any(feature = "gpu", feature = "cuda"))] @@ -821,6 +913,18 @@ fn list_devices() { Ok(_) => println!("no CUDA devices found"), Err(e) => println!("error listing CUDA devices: {e}"), } + // What the default `mixed` backend will mine, and the indices `--devices` + // selects from in that mode. + #[cfg(any(feature = "gpu", feature = "cuda"))] + { + let plan = mixed_plan(); + if !plan.is_empty() { + println!("\nMixed backend (--backend mixed, the default) — `--devices` indexes this list:"); + for (i, (label, _)) in plan.iter().enumerate() { + println!(" [{i}] {label}"); + } + } + } #[cfg(not(any(feature = "gpu", feature = "cuda")))] println!("built without GPU support (rebuild with the `gpu` or `cuda` feature)"); } @@ -886,18 +990,24 @@ fn selftest(gpu_device: usize) -> Result<()> { let solver = gpu::GpuSolver::new(gpu_device) .with_context(|| format!("init OpenCL device {gpu_device}"))?; - // Spot-check the BLAKE2b kernel against the CPU reference. - let outputs = solver.hash_all(&header)?; - let step = params::BLAKE_CALLS / 64; - for k in 0..64 { - let g = (k * step) as u32; - let cpu = blake::generate_hash(&base, g); - let off = g as usize * params::HASH_OUTPUT; - if cpu != outputs[off..off + params::HASH_OUTPUT] { - return Err(anyhow!("GPU BLAKE2b mismatch at g={g}")); + // Spot-check the BLAKE2b kernel against the CPU reference. The AMD kernel + // buckets its round-0 output instead of exposing per-index digests, so + // the probe is skipped there (the solve-vs-CPU check below still runs). + if solver.supports_blake_probe() { + let outputs = solver.hash_all(&header)?; + let step = params::BLAKE_CALLS / 64; + for k in 0..64 { + let g = (k * step) as u32; + let cpu = blake::generate_hash(&base, g); + let off = g as usize * params::HASH_OUTPUT; + if cpu != outputs[off..off + params::HASH_OUTPUT] { + return Err(anyhow!("GPU BLAKE2b mismatch at g={g}")); + } } + info!("GPU BLAKE2b kernel matches CPU"); + } else { + info!("skipping BLAKE2b kernel probe (AMD kernel buckets round-0 output)"); } - info!("GPU BLAKE2b kernel matches CPU"); let gpu_solutions = solver.solve(&header)?; info!("GPU found {} valid solution(s)", gpu_solutions.len()); diff --git a/src/nvml.rs b/src/nvml.rs index e73ad63..05e9bba 100644 --- a/src/nvml.rs +++ b/src/nvml.rs @@ -33,7 +33,15 @@ const NVML_CLOCK_MEM: c_int = 2; // nvmlTemperatureSensors_t const NVML_TEMPERATURE_GPU: c_int = 0; -extern "C" { +// NVML, loaded at runtime via dlopen (see `crate::dylib`) rather than linked at +// build time — it ships with the NVIDIA driver (`libnvidia-ml.so.1`; +// `nvml.dll` on Windows) and is absent on driver-less / AMD-only hosts. +// `nvml_lib()` is `None` when it can't be opened; `open()` checks it first and +// returns `None` (no tuning) so the rest of the program is unaffected. +crate::dylib::dynamic_library! { + lib_struct: NvmlLib, + loader: nvml_lib, + names: ["libnvidia-ml.so.1", "libnvidia-ml.so", "nvml.dll"], fn nvmlInit_v2() -> nvmlReturn_t; fn nvmlShutdown() -> nvmlReturn_t; fn nvmlDeviceGetName(device: nvmlDevice_t, name: *mut c_char, length: c_uint) -> nvmlReturn_t; @@ -69,6 +77,7 @@ unsafe impl Send for NvmlTuner {} /// Open an NVML control handle for the GPU at `pci_bus_id` (e.g. "0000:01:00.0"). pub fn open(pci_bus_id: &str) -> Option> { + nvml_lib()?; // NVML not installed → no tuning let cstr = CString::new(pci_bus_id).ok()?; unsafe { if nvmlInit_v2() != NVML_SUCCESS {