diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..69d99ee
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,7 @@
+# Keep the build context small: the portable build only needs the sources.
+/target
+/dist
+/.git
+/pearl-dump
+/alpha-miner
+*.log
diff --git a/.gitignore b/.gitignore
index 7d7ae7c..9810fe2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
 # Rust build artifacts
 /target
 
+# Portable container build output (build-portable.sh)
+/dist
+
 # IDE / editor
 /.idea
 
diff --git a/Cargo.lock b/Cargo.lock
index 3213946..7d4fdf4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2236,6 +2236,7 @@ dependencies = [
  "eframe",
  "env_logger",
  "hex",
+ "libloading 0.8.9",
  "log",
  "num_cpus",
  "ocl",
diff --git a/Cargo.toml b/Cargo.toml
index e85ab4e..9e74463 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,13 +23,18 @@ socket2 = "0.5"
 ocl = { version = "0.19", optional = true }
 ratatui = "0.30.0"
 eframe = { version = "0.28", optional = true }
+# Runtime loader for the CUDA driver / NVML (dlopen'd, not link-time, so the
+# binary has no build- or load-time dependency on libcuda / libnvidia-ml).
+libloading = { version = "0.8", optional = true }
 
 [features]
 default = ["gpu", "cuda", "config-gui"]
 gpu = ["dep:ocl"]
 # CUDA backend: drives miniZ's embedded Equihash 192,7 fatbin via the CUDA driver
-# API. build.rs only links libcuda (no nvcc / kernel compilation needed).
-cuda = []
+# API. The driver (libcuda) and NVML are dlopen'd at runtime via libloading, so
+# there is no build-time or load-time dependency on them — the binary builds and
+# starts on hosts without an NVIDIA driver and simply reports no CUDA devices.
+cuda = ["dep:libloading"]
 # Optional native GUI config editor (the `jackpotminer-config` binary). Off by
 # default so the miner never pulls in the GUI toolkit.
 config-gui = ["dep:eframe"]
diff --git a/Dockerfile.portable b/Dockerfile.portable
new file mode 100644
index 0000000..f2396a6
--- /dev/null
+++ b/Dockerfile.portable
@@ -0,0 +1,40 @@
+# Portable jackpotminer build.
+#
+# Links against Debian bullseye's glibc 2.31 (released 2020), so the resulting
+# binary runs on essentially any Linux from the last several years instead of
+# requiring the build host's (much newer) glibc. This is the real fix for the
+# "version `GLIBC_2.39' not found" class of errors — static linking can't solve
+# it for a GPU build, because the GPU driver libraries are glibc-only and load
+# at runtime.
+#
+# The CUDA driver and NVML are dlopen'd at runtime (see src/dylib.rs), so this
+# build needs NO NVIDIA toolkit — only the OpenCL ICD loader (to link libOpenCL).
+# The result is one binary that drives AMD cards (OpenCL) and NVIDIA cards (CUDA,
+# loaded if libcuda.so.1 is present at runtime).
+#
+# Build:   DOCKER_BUILDKIT=1 docker build -f Dockerfile.portable \
+#            --output type=local,dest=dist .
+# or just: ./build-portable.sh
+# Output:  dist/jackpotminer
+
+FROM debian:bullseye-slim AS build
+ENV DEBIAN_FRONTEND=noninteractive
+# gcc/g++ for the linker; ocl-icd-opencl-dev provides libOpenCL.so for linking
+# (the runtime host supplies its own libOpenCL.so.1 via its GPU driver).
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates curl gcc g++ make pkg-config ocl-icd-opencl-dev \
+    && rm -rf /var/lib/apt/lists/*
+# Minimal stable Rust toolchain.
+RUN curl -fsSL https://sh.rustup.rs | sh -s -- -y --profile minimal --default-toolchain stable
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+WORKDIR /src
+COPY . .
+# Miner only (no GUI config tool, to avoid pulling X11/Wayland/GL into the
+# build): AMD OpenCL + dlopen'd CUDA. `--locked` keeps it reproducible.
+RUN cargo build --release --locked --no-default-features --features gpu,cuda \
+    && strip target/release/jackpotminer
+
+# Export just the binary to the build output directory.
+FROM scratch AS export
+COPY --from=build /src/target/release/jackpotminer /jackpotminer
diff --git a/README.md b/README.md
index edc2ee5..1d8f62e 100644
--- a/README.md
+++ b/README.md
@@ -98,16 +98,45 @@ reverse-engineered Equihash 192,7 solver — see "CUDA backend" below.
 
 ## Build
 
-Requirements: a Rust toolchain and an OpenCL runtime (the NVIDIA driver ships
-`libOpenCL`). The CUDA backend only needs `libcuda` (the NVIDIA driver) — the
-fatbin and launch trace it drives are embedded in the binary, so no CUDA toolkit
-or `nvcc` is required.
+Requirements: a Rust toolchain and, for the OpenCL backend, the OpenCL ICD
+loader (`libOpenCL` — e.g. `ocl-icd-opencl-dev` on Debian/Ubuntu; the NVIDIA and
+AMD drivers also ship it). The CUDA driver and NVML are **`dlopen`'d at runtime**
+(see `src/dylib.rs`), so the `cuda` feature needs no NVIDIA toolkit or libs to
+build, and a `cuda`-enabled binary still builds and starts on hosts without an
+NVIDIA driver — it simply reports no CUDA devices. The fatbin and launch trace
+the CUDA backend drives are embedded, so no `nvcc` is required either.
 
 ```bash
-cargo build --release                          # OpenCL backend (default)
-cargo build --release --features cuda           # OpenCL + CUDA backends
-cargo build --release --no-default-features --features cuda   # CUDA only
-cargo build --release --no-default-features      # CPU-only (no GPU)
+cargo build --release                          # default: OpenCL + CUDA + GUI config tool
+cargo build --release --no-default-features --features gpu,cuda  # miner only, both GPU backends
+cargo build --release --no-default-features --features gpu        # OpenCL only (AMD/Intel/NVIDIA)
+cargo build --release --no-default-features --features cuda       # CUDA only
+cargo build --release --no-default-features                       # CPU-only (no GPU)
+```
+
+### Portable / distributable builds
+
+The miner's only runtime dependencies are the C library and the OpenCL ICD loader
+(`libOpenCL.so.1`, present wherever a GPU driver is); CUDA/NVML are loaded on
+demand. So the main compatibility risk when shipping a Linux binary is the
+**glibc version** it was built against — not the GPU libraries. To build one that
+runs on older distros, compile against an old glibc in a container:
+
+```bash
+./build-portable.sh          # → dist/jackpotminer   (Docker, or ENGINE=podman)
+```
+
+This links against Debian bullseye's glibc 2.31 (runs on most Linux from ~2020
+on) and yields a single miner that drives both AMD (OpenCL) and NVIDIA (CUDA)
+cards. See `Dockerfile.portable`.
+
+A fully *static* GPU binary isn't possible: the OpenCL/CUDA driver libraries are
+glibc-only and must load at runtime. For a zero-dependency binary that runs
+anywhere, build the **CPU-only** miner against musl:
+
+```bash
+rustup target add x86_64-unknown-linux-musl
+cargo build --release --target x86_64-unknown-linux-musl --no-default-features
 ```
 
 ### CUDA backend (miniZ fatbin replay)
@@ -181,7 +210,7 @@ on clean shutdown**. The per-card stats line shows live `Sol/s`, board `W`, and
 ## Usage
 
 ```bash
-# List OpenCL devices
+# List devices (and the default "mixed" backend's combined index list)
 ./target/release/jackpotminer --list-devices
 
 # Mine on one GPU
@@ -195,8 +224,11 @@ on clean shutdown**. The per-card stats line shows live `Sol/s`, board `W`, and
 ./target/release/jackpotminer --url ... --user ... --devices 0,1
 ./target/release/jackpotminer --url ... --user ... --devices all
 
-# Use the CUDA backend instead of OpenCL (needs a --features cuda build)
-./target/release/jackpotminer --url ... --user ... --backend cuda --devices all
+# Default backend is "mixed": NVIDIA cards run on CUDA, AMD/Intel on OpenCL —
+# so an AMD + NVIDIA rig just works. --devices indexes the combined list from
+# --list-devices. Pin a single backend for every card with:
+./target/release/jackpotminer --url ... --user ... --backend opencl   # all via OpenCL
+./target/release/jackpotminer --url ... --user ... --backend cuda     # NVIDIA only
 
 # Force the CPU backend
 ./target/release/jackpotminer --url ... --user ... --cpu
diff --git a/build-portable.sh b/build-portable.sh
new file mode 100755
index 0000000..c73c7a5
--- /dev/null
+++ b/build-portable.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+# Build a portable jackpotminer binary inside an old-glibc (Debian bullseye,
+# glibc 2.31) container, so it runs on essentially any recent Linux regardless of
+# the build host's glibc. CUDA is dlopen'd at runtime, so no NVIDIA toolkit is
+# needed to build; the binary drives both AMD (OpenCL) and NVIDIA (CUDA) cards.
+#
+# Output: dist/jackpotminer
+#
+# Works with Docker (BuildKit) or Podman. Override the engine with ENGINE=podman.
+set -eu
+
+ENGINE="${ENGINE:-docker}"
+OUT="${OUT:-dist}"
+
+mkdir -p "$OUT"
+
+case "$ENGINE" in
+  podman)
+    # Podman builds the final `scratch` stage; extract the binary from it.
+    podman build -f Dockerfile.portable -t jackpotminer-portable .
+    cid=$(podman create jackpotminer-portable)
+    podman cp "$cid:/jackpotminer" "$OUT/jackpotminer"
+    podman rm "$cid" >/dev/null
+    ;;
+  *)
+    DOCKER_BUILDKIT=1 "$ENGINE" build -f Dockerfile.portable \
+      --output "type=local,dest=$OUT" .
+    ;;
+esac
+
+chmod +x "$OUT/jackpotminer"
+echo
+echo "Built $OUT/jackpotminer"
+command -v file >/dev/null 2>&1 && file "$OUT/jackpotminer" || true
+echo "Minimum glibc / dynamic deps:"
+{ objdump -T "$OUT/jackpotminer" 2>/dev/null | grep -oE 'GLIBC_[0-9.]+' | sort -V | tail -1; } || true
diff --git a/build.rs b/build.rs
deleted file mode 100644
index 4fdd6a0..0000000
--- a/build.rs
+++ /dev/null
@@ -1,53 +0,0 @@
-//! Build script for the CUDA backend.
-//!
-//! The `cuda` feature links the CUDA driver API (`cuda`) and NVML (for
-//! clock/power control + readout). The backend drives miniZ's embedded fatbin
-//! (`src/miniz/equihash192_7.fatbin`) via the driver API, so no nvcc / kernel
-//! compilation is needed at build time. (The default OpenCL backend needs no
-//! build-script support — `ocl` links `OpenCL` itself, cross-platform.)
-//!
-//! Linking is target-aware so the `cuda` feature builds on both Linux and
-//! Windows:
-//!   - Linux: `libcuda.so` + `libnvidia-ml.so` from the system / toolkit dirs.
-//!   - Windows: `cuda.lib` + `nvml.lib` from `%CUDA_PATH%\lib\x64`.
-
-use std::path::Path;
-
-fn main() {
-    println!("cargo:rerun-if-changed=build.rs");
-
-    if std::env::var("CARGO_FEATURE_CUDA").is_ok() {
-        // Use the *target* OS (correct for cross-compilation too), not the host.
-        let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
-        link_cuda_driver(&target_os);
-    }
-}
-
-/// Link the CUDA driver library plus NVML. The backend loads the embedded miniZ
-/// fatbin at runtime, so there is nothing to compile here.
-fn link_cuda_driver(target_os: &str) {
-    if target_os == "windows" {
-        // CUDA Toolkit import libraries (cuda.lib, nvml.lib).
-        println!("cargo:rerun-if-env-changed=CUDA_PATH");
-        if let Ok(cuda_path) = std::env::var("CUDA_PATH") {
-            println!("cargo:rustc-link-search=native={cuda_path}\\lib\\x64");
-        }
-        // Driver API import lib is `cuda.lib`; NVML is `nvml.lib` (nvml.dll
-        // ships with the NVIDIA driver).
-        println!("cargo:rustc-link-lib=dylib=cuda");
-        println!("cargo:rustc-link-lib=dylib=nvml");
-    } else {
-        for dir in ["/usr/lib64", "/usr/lib", "/opt/cuda/lib64"] {
-            if Path::new(dir).exists() {
-                println!("cargo:rustc-link-search=native={dir}");
-            }
-        }
-        // GNU ld: embed an rpath so libcuda is found at runtime (Linux only —
-        // MSVC's linker rejects `-Wl,...`).
-        if target_os == "linux" {
-            println!("cargo:rustc-link-arg=-Wl,-rpath,/opt/cuda/lib64");
-        }
-        println!("cargo:rustc-link-lib=dylib=cuda");
-        println!("cargo:rustc-link-lib=dylib=nvidia-ml");
-    }
-}
diff --git a/kernels/equihash192_7.cl b/kernels/equihash192_7.cl
new file mode 100644
index 0000000..6db2719
--- /dev/null
+++ b/kernels/equihash192_7.cl
@@ -0,0 +1,2110 @@
+
+
+//#define PRINT 1
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+__constant ulong blake_iv[] =
+{
+    0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
+    0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
+    0x510e527fade682d1, 0x9b05688c2b3e6c1f,
+    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
+};
+
+inline static uint2 ror64(const uint2 x, const uint y)
+{
+    return (uint2)(((x).x>>y)^((x).y<<(32-y)),((x).y>>y)^((x).x<<(32-y)));
+}
+inline static uint2 ror64_2(const uint2 x, const uint y)
+{
+    return (uint2)(((x).y>>(y-32))^((x).x<<(64-y)),((x).x>>(y-32))^((x).y<<(64-y)));
+}
+
+#define gFunc(va, vb, vc, vd, x, y) \
+va = (va + vb + x); \
+((uint2*)&vd)[0] = ((uint2*)&vd)[0].yx ^ ((uint2*)&va)[0].yx; \
+vc = (vc + vd); \
+((uint2*)&vb)[0] = ror64( ((uint2*)&vb)[0] ^ ((uint2*)&vc)[0], 24U); \
+va = (va + vb + y); \
+((uint2*)&vd)[0] = ror64( ((uint2*)&vd)[0] ^ ((uint2*)&va)[0], 16U); \
+vc = (vc + vd); \
+((uint2*)&vb)[0] = ror64_2( ((uint2*)&vb)[0] ^ ((uint2*)&vc)[0], 63U);
+
+#define gFunc0(va, vb, vc, vd) \
+va = (va + vb); \
+((uint2*)&vd)[0] = ((uint2*)&vd)[0].yx ^ ((uint2*)&va)[0].yx; \
+vc = (vc + vd); \
+((uint2*)&vb)[0] = ror64( ((uint2*)&vb)[0] ^ ((uint2*)&vc)[0], 24U); \
+va = (va + vb); \
+((uint2*)&vd)[0] = ror64( ((uint2*)&vd)[0] ^ ((uint2*)&va)[0], 16U); \
+vc = (vc + vd); \
+((uint2*)&vb)[0] = ror64_2( ((uint2*)&vb)[0] ^ ((uint2*)&vc)[0], 63U);
+
+
+inline uint swapByteEndian(uint input) {
+	uint tmp0 = input & 0x0F0F0F0F;
+	uint tmp1 = input & 0xF0F0F0F0;
+
+	tmp0 = tmp0 << 4;
+	tmp1 = tmp1 >> 4;
+
+	uint tmpIn = tmp0 | tmp1;
+	
+	tmp0 = tmpIn & 0x33333333;
+	tmp1 = tmpIn & 0xCCCCCCCC;
+
+	tmp0 = tmp0 << 2;
+	tmp1 = tmp1 >> 2;
+
+	tmpIn = tmp0 | tmp1;
+
+	tmp0 = tmpIn & 0x55555555;
+	tmp1 = tmpIn & 0xAAAAAAAA;
+
+	tmp0 = tmp0 << 1;
+	tmp1 = tmp1 >> 1;
+
+	return tmp0 | tmp1;
+}
+
+
+uint8 shr_7(uint8 input, uint sh0, uint sh1) {
+	uint8 tmp = (input >> sh0);
+	uint8 tmp2 = (input << 32-sh0);
+
+	tmp.s0 = input.s0 >> sh1;
+
+	tmp.s0123 |= tmp2.s1234;
+	tmp.s45 |= tmp2.s56;
+
+	tmp.s7 = input.s7;
+	return tmp;
+}
+
+
+void round0(ulong8 blake_state, __global uint8 *resultsHi, __global uint2 *resultsLo , __global uint *counters, uint tId, uint gId) {
+       	ulong v[16];
+   
+	ulong word1 = ((ulong)tId << 32) | gId;
+	// init vector v
+	v[0] = blake_state.s0;
+	v[1] = blake_state.s1;
+	v[2] = blake_state.s2;
+	v[3] = blake_state.s3;
+	v[4] = blake_state.s4;
+	v[5] = blake_state.s5;
+	v[6] = blake_state.s6;
+	v[7] = blake_state.s7;
+	v[8] =  blake_iv[0];
+	v[9] =  blake_iv[1];
+	v[10] = blake_iv[2];
+	v[11] = blake_iv[3];
+	v[12] = blake_iv[4];
+	v[13] = blake_iv[5];
+	v[14] = blake_iv[6];
+	v[15] = blake_iv[7];
+	// gFunc in length of data
+	v[12] ^= 144 /* length of "i" */;
+	// last block
+	v[14] ^= (ulong)-1;
+
+	// round 1
+	gFunc(v[0], v[4], v[8],  v[12], 0, word1);
+	gFunc0(v[1], v[5], v[9],  v[13]);
+	gFunc0(v[2], v[6], v[10], v[14]);
+	gFunc0(v[3], v[7], v[11], v[15]);
+	gFunc0(v[0], v[5], v[10], v[15]);
+	gFunc0(v[1], v[6], v[11], v[12]);
+	gFunc0(v[2], v[7], v[8],  v[13]);
+	gFunc0(v[3], v[4], v[9],  v[14]);
+	// round 2
+	gFunc0(v[0], v[4], v[8],  v[12]);
+	gFunc0(v[1], v[5], v[9],  v[13]);
+	gFunc0(v[2], v[6], v[10], v[14]);
+	gFunc0(v[3], v[7], v[11], v[15]);
+	gFunc(v[0], v[5], v[10], v[15], word1, 0);
+	gFunc0(v[1], v[6], v[11], v[12]);
+	gFunc0(v[2], v[7], v[8],  v[13]);
+	gFunc0(v[3], v[4], v[9],  v[14]);
+	// round 3
+	gFunc0(v[0], v[4], v[8],  v[12]);
+	gFunc0(v[1], v[5], v[9],  v[13]);
+	gFunc0(v[2], v[6], v[10], v[14]);
+	gFunc0(v[3], v[7], v[11], v[15]);
+	gFunc0(v[0], v[5], v[10], v[15]);
+	gFunc0(v[1], v[6], v[11], v[12]);
+	gFunc(v[2], v[7], v[8],  v[13], 0, word1);
+	gFunc0(v[3], v[4], v[9],  v[14]);
+	// round 4
+	gFunc0(v[0], v[4], v[8],  v[12]);
+	gFunc(v[1], v[5], v[9],  v[13], 0, word1);
+	gFunc0(v[2], v[6], v[10], v[14]);
+	gFunc0(v[3], v[7], v[11], v[15]);
+	gFunc0(v[0], v[5], v[10], v[15]);
+	gFunc0(v[1], v[6], v[11], v[12]);
+	gFunc0(v[2], v[7], v[8],  v[13]);
+	gFunc0(v[3], v[4], v[9],  v[14]);
+	// round 5
+	gFunc0(v[0], v[4], v[8],  v[12]);
+	gFunc0(v[1], v[5], v[9],  v[13]);
+	gFunc0(v[2], v[6], v[10], v[14]);
+	gFunc0(v[3], v[7], v[11], v[15]);
+	gFunc(v[0], v[5], v[10], v[15], 0, word1);
+	gFunc0(v[1], v[6], v[11], v[12]);
+	gFunc0(v[2], v[7], v[8],  v[13]);
+	gFunc0(v[3], v[4], v[9],  v[14]);
+	// round 6
+	gFunc0(v[0], v[4], v[8],  v[12]);
+	gFunc0(v[1], v[5], v[9],  v[13]);
+	gFunc0(v[2], v[6], v[10], v[14]);
+	gFunc0(v[3], v[7], v[11], v[15]);
+	gFunc0(v[0], v[5], v[10], v[15]);
+	gFunc0(v[1], v[6], v[11], v[12]);
+	gFunc0(v[2], v[7], v[8],  v[13]);
+	gFunc(v[3], v[4], v[9],  v[14], word1, 0);
+	// round 7
+	gFunc0(v[0], v[4], v[8],  v[12]);
+	gFunc(v[1], v[5], v[9],  v[13], word1, 0);
+	gFunc0(v[2], v[6], v[10], v[14]);
+	gFunc0(v[3], v[7], v[11], v[15]);
+	gFunc0(v[0], v[5], v[10], v[15]);
+	gFunc0(v[1], v[6], v[11], v[12]);
+	gFunc0(v[2], v[7], v[8],  v[13]);
+	gFunc0(v[3], v[4], v[9],  v[14]);
+	// round 8
+	gFunc0(v[0], v[4], v[8],  v[12]);
+	gFunc0(v[1], v[5], v[9],  v[13]);
+	gFunc(v[2], v[6], v[10], v[14], 0, word1);
+	gFunc0(v[3], v[7], v[11], v[15]);
+	gFunc0(v[0], v[5], v[10], v[15]);
+	gFunc0(v[1], v[6], v[11], v[12]);
+	gFunc0(v[2], v[7], v[8],  v[13]);
+	gFunc0(v[3], v[4], v[9],  v[14]);
+	// round 9
+	gFunc0(v[0], v[4], v[8],  v[12]);
+	gFunc0(v[1], v[5], v[9],  v[13]);
+	gFunc0(v[2], v[6], v[10], v[14]);
+	gFunc0(v[3], v[7], v[11], v[15]);
+	gFunc0(v[0], v[5], v[10], v[15]);
+	gFunc0(v[1], v[6], v[11], v[12]);
+	gFunc(v[2], v[7], v[8],  v[13], word1, 0);
+	gFunc0(v[3], v[4], v[9],  v[14]);
+	// round 10
+	gFunc0(v[0], v[4], v[8],  v[12]);
+	gFunc0(v[1], v[5], v[9],  v[13]);
+	gFunc0(v[2], v[6], v[10], v[14]);
+	gFunc(v[3], v[7], v[11], v[15], word1, 0);
+	gFunc0(v[0], v[5], v[10], v[15]);
+	gFunc0(v[1], v[6], v[11], v[12]);
+	gFunc0(v[2], v[7], v[8],  v[13]);
+	gFunc0(v[3], v[4], v[9],  v[14]);
+	// round 11
+	gFunc(v[0], v[4], v[8],  v[12], 0, word1);
+	gFunc0(v[1], v[5], v[9],  v[13]);
+	gFunc0(v[2], v[6], v[10], v[14]);
+	gFunc0(v[3], v[7], v[11], v[15]);
+	gFunc0(v[0], v[5], v[10], v[15]);
+	gFunc0(v[1], v[6], v[11], v[12]);
+	gFunc0(v[2], v[7], v[8],  v[13]);
+	gFunc0(v[3], v[4], v[9],  v[14]);
+	// round 12
+	gFunc0(v[0], v[4], v[8],  v[12]);
+	gFunc0(v[1], v[5], v[9],  v[13]);
+	gFunc0(v[2], v[6], v[10], v[14]);
+	gFunc0(v[3], v[7], v[11], v[15]);
+	gFunc(v[0], v[5], v[10], v[15], word1, 0);
+	gFunc0(v[1], v[6], v[11], v[12]);
+	gFunc0(v[2], v[7], v[8],  v[13]);
+	gFunc0(v[3], v[4], v[9], v[14]);	
+
+        v[0] = v[0] ^ blake_state.s0 ^ v[8];
+        v[1] = v[1] ^ blake_state.s1 ^ v[9];
+        v[2] = v[2] ^ blake_state.s2 ^ v[10];
+        v[3] = v[3] ^ blake_state.s3 ^ v[11];
+        v[4] = v[4] ^ blake_state.s4 ^ v[12];
+        v[5] = v[5] ^ blake_state.s5 ^ v[13];
+        v[6] = v[6] ^ blake_state.s6 ^ v[14];
+	v[7] = v[7] ^ blake_state.s7 ^ v[15]; 
+
+	uint8 output0, output1, output2;
+	int addme,shift, bucket;
+
+	output0.s0 = v[0] & 0xFFFFFFFF;
+	output0.s1 = v[0] >> 32; 
+	output0.s2 = v[1] & 0xFFFFFFFF;  
+	output0.s3 = v[1] >> 32;
+	output0.s4 = v[2] & 0xFFFFFFFF; 
+	output0.s5 = v[2] >> 32; 
+	output0.s7 = 0;
+	output0.s6 = 2*tId;
+
+	output1.s0 = v[3] & 0xFFFFFFFF;
+	output1.s1 = v[3] >> 32; 
+	output1.s2 = v[4] & 0xFFFFFFFF;  
+	output1.s3 = v[4] >> 32;
+	output1.s4 = v[5] & 0xFFFFFFFF; 
+	output1.s5 = v[5] >> 32; 
+	output1.s7 = 0;
+	output1.s6 = 2*tId+1;
+
+	uint2 addr;
+	addr.s0 = atomic_inc(&counters[output0.s0 & 0x1FFF]);
+	addr.s0 += 4592 * (output0.s0 & 0x1FFF);
+	resultsHi[addr.s0] = shr_7(output0,13,13);
+
+	addr.s1 = atomic_inc(&counters[output1.s0 & 0x1FFF]);
+	addr.s1 += 4592 * (output1.s0 & 0x1FFF);
+	resultsHi[addr.s1] = shr_7(output1,13,13);
+}
+
+
+__kernel void clearCounter (__global uint8 * buffer0,
+		__global uint8 * buffer1,
+		__global uint8 * buffer2,
+		__global uint4 * counters,
+		__global uint4 * res,
+		const uint extra,
+		const ulong8 hashState,
+		const ulong nonce) {
+
+	uint gId = get_global_id(0);
+	counters[gId] = (uint4) 0;
+
+	if (gId == 0) {
+		res[0] = (uint4) 0; 
+	}
+}
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void blake (
+		__global uint8 * output0,
+		__global uint2 * buffer1,
+		__global uint8 * buffer2,
+		__global uint * counters,
+		__global uint4 * res,
+		const uint extra,
+		const ulong8 hashState,
+		const ulong nonce) {
+	
+	uint gId = get_global_id(0);
+	
+	round0(hashState, output0, buffer1, counters, gId, (uint) (nonce & 0xFFFFFFFF));
+}
+
+
+inline int masking8_7(uint8 input, __local uint* scratch, __local uint* cnt, uint mask, uint check) {
+	if ((input.s0 & check) == mask) {
+		uint pos = atomic_inc(&cnt[0]);
+		
+		if (pos < 1166) {
+			uint value  = atomic_xchg(&scratch[654 + ((input.s0 >> 2) & 0x1FF)], pos); 
+			uint high = value >> 12;
+
+			value &= 0x7FF;
+			value |= (input.s0 & 0xFFFFF800);	
+			
+			scratch[1166+pos] = value;
+			scratch[2332+pos] = input.s1;
+			scratch[3498+pos] = input.s2;
+			scratch[4664+pos] = input.s3;
+			scratch[5830+pos] = input.s4;
+			scratch[6996+pos] = input.s5;
+	 
+			if (pos < 654) {
+				scratch[pos] = input.s6 << 12;
+			} else {
+				atomic_or(&scratch[pos], input.s6 << 12); 
+			}	
+
+			if (high != 0) {
+				atomic_or(&scratch[654 + ((input.s0 >> 2) & 0x1FF)], high << 12); 
+			}				
+
+		}
+
+		return pos;
+	}	
+	
+	return -1;
+}
+
+
+
+inline void masking4_4b(uint4 input, uint idx, __local uint* scratch, __local uint* cnt, uint mask, uint check) {
+	if ((input.s0 & check) == mask) {
+		uint pos = atomic_inc(&cnt[0]);
+		
+		uint value  = atomic_xchg(&scratch[(input.s0 >> 5) & 0x1FF], pos); 
+		scratch[1792+pos] = input.s1;
+		scratch[3072+pos] = input.s2;
+		scratch[4352+pos] = input.s3;
+		scratch[5632+pos] = idx;
+ 
+		value |= (input.s0 & 0xFFFFE000);	
+		scratch[512+pos] = value;
+	}
+}
+
+inline void masking4_4bt(uint4 input, uint idx, __local uint* scratch, __local uint* cnt, uint mask, uint check) {
+	if ((input.s0 & check) == mask) {
+		uint pos = atomic_inc(&cnt[0]);
+		if (pos < 1280) {
+			uint value  = atomic_xchg(&scratch[(input.s0 >> 5) & 0x1FF], pos); 
+			scratch[1792+pos] = input.s1;
+			scratch[3072+pos] = input.s2;
+			scratch[4352+pos] = input.s3;
+			scratch[5632+pos] = idx;
+	 
+			value |= (input.s0 & 0xFFFFE000);	
+			scratch[512+pos] = value;		
+		}
+	}
+}
+
+void masking4_4(uint4 input, __local uint* scratch, __local uint* cnt, uint mask, uint check) {
+	if ((input.s0 & check) == mask) {
+		uint pos = atomic_inc(&cnt[0]);
+
+		uint value  = atomic_xchg(&scratch[(input.s0 >> 2) & 0x1FF], pos);  
+		value |= (input.s0 & 0xFFFFF800);	
+		scratch[512+pos] = value;
+		scratch[1728+pos] = input.s1;
+		scratch[2944+pos] = input.s2;
+		scratch[4160+pos] = input.s3;
+
+		
+	}
+}
+
+void masking4_4t(uint4 input, __local uint* scratch, __local uint* cnt, uint mask, uint check) {
+	if ((input.s0 & check) == mask) {
+		uint pos = atomic_inc(&cnt[0]);
+		if (pos < 1216) {
+			uint value  = atomic_xchg(&scratch[(input.s0 >> 2) & 0x1FF], pos);  
+			value |= (input.s0 & 0xFFFFF800);	
+			scratch[512+pos] = value;
+			scratch[1728+pos] = input.s1;
+			scratch[2944+pos] = input.s2;
+			scratch[4160+pos] = input.s3;
+		}
+	}
+}
+
+
+void masking4_3b(uint4 input, uint idx, __local uint* scratch, __local uint* cnt, uint mask, uint check) {
+	if ((input.s0 & check) == mask) {
+		uint pos = atomic_inc(&cnt[0]);
+
+		uint value  = atomic_xchg(&scratch[(input.s0 >> 2) & 0x3FF], pos);  
+		value |= (input.s0 & 0xFFFFF000);	
+		scratch[1024+pos] = value;
+		scratch[3328+pos] = input.s1;
+		scratch[5632+pos] = (input.s2 & 0x3FFF) | (idx << 14);
+		//scratch[7936+pos] = idx;
+	}
+}
+
+
+void masking4_3bt(uint4 input, uint idx, __local uint* scratch, __local uint* cnt, uint mask, uint check) {
+	if ((input.s0 & check) == mask) {
+		uint pos = atomic_inc(&cnt[0]);
+		if (pos < 2304) {
+			uint value  = atomic_xchg(&scratch[(input.s0 >> 2) & 0x3FF], pos);  
+			value |= (input.s0 & 0xFFFFF000);	
+			scratch[1024+pos] = value;
+			scratch[3328+pos] = input.s1;
+			scratch[5632+pos] = (input.s2 & 0x3FFF) | (idx << 14);
+			//scratch[7936+pos] = idx;
+		}
+	}
+}
+
+
+void masking2_2b(uint2 input, uint idx, __local uint* scratch, __local uint* cnt, uint mask, uint check) {
+	if ((input.s0 & check) == mask) {
+		uint pos = atomic_inc(&cnt[0]);
+
+		uint value  = atomic_xchg(&scratch[(input.s0 >> 2) & 0x3FF], pos);  
+		value |= (input.s0 & 0xFFFFF000);	
+		scratch[1024+pos] = value;
+		scratch[3328+pos] = (input.s1 & 0x3FFFF) | (idx << 18);
+	}
+}
+
+void masking2_2bt(uint2 input, uint idx, __local uint* scratch, __local uint* cnt, uint mask, uint check) {
+	if ((input.s0 & check) == mask) {
+		uint pos = atomic_inc(&cnt[0]);
+		if (pos < 2304) {
+			uint value  = atomic_xchg(&scratch[(input.s0 >> 2) & 0x3FF], pos);  
+			value |= (input.s0 & 0xFFFFF000);	
+			scratch[1024+pos] = value;
+			scratch[3328+pos] = (input.s1 & 0x3FFFF) | (idx << 18);
+		}
+	}
+}
+
+void masking2_2(uint2 input, __local uint* scratch, __local uint* cnt, uint mask, uint check) {
+	if ((input.s0 & check) == mask) {
+		uint pos = atomic_inc(&cnt[0]);
+		if (pos < 2304) {
+			uint value  = atomic_xchg(&scratch[(input.s0 >> 2) & 0x3FF], pos);  
+			value |= (input.s0 & 0xFFFFF000);	
+			scratch[1024+pos] = value;
+			scratch[3328+pos] = input.s1;
+		}
+	}
+}
+
+
+uint2 compress(uint in0, uint in1) {
+	uint hi, lo;
+	uint hi2, lo2;
+
+	if (in0 < in1) {
+		hi = in1 >> 12;
+		hi2 = in1 & 0xFFF;
+		lo = in0 >> 12;
+		lo2 = in0 & 0xFFF;
+	} else {
+		hi = in0 >> 12;
+		hi2 = in0 & 0xFFF;
+		lo = in1 >> 12;
+		lo2 = in1 & 0xFFF;
+	}
+
+	hi++;
+
+	uint2 tmp;
+	tmp.s0 = mul24(hi,(hi-1));
+	tmp.s0 = tmp.s0 >> 1;
+	tmp.s0 += lo;
+	
+	tmp.s0 = tmp.s0 << 6;
+	tmp.s0 |= (hi2 & 0x3F);
+
+	tmp.s1 = hi2 >> 6;	
+	tmp.s1 |= (lo2 << 6);
+
+	return tmp;	
+}
+
+uint compress2(uint in0, uint in1) {
+	uint hi, lo;
+	
+	if (in0 < in1) {
+		hi = in1;
+		lo = in0;
+	} else {
+		hi = in0;
+		lo = in1;
+	}
+
+	uint tmp;
+	tmp = mul24(hi,(hi-1));
+	tmp = tmp >> 1;
+	tmp += lo;
+
+	return tmp;	
+}
+
+uint2 decompress(uint2 in) {
+	double inFl = (double) (in.s0 >> 6);
+	
+	inFl *= 2.0;
+	inFl += 1.0;
+
+	uint2 res; 
+	res.s0 = (uint) round(sqrt(inFl));
+
+
+	uint tmp = res.s0 * (res.s0-1);
+	tmp = tmp >> 1;	
+
+	res.s1 = (uint) ((in.s0 >> 6) - tmp);
+	res.s0--;
+
+	res.s0 = res.s0 << 12;
+	res.s1 = res.s1 << 12;
+
+	res.s0 |= (in.s0 & 0x3F);
+	res.s0 |= ((in.s1 & 0x3F) << 6);
+
+	res.s1 |= (in.s1 >> 6);
+	
+	return res;	
+}
+
+uint2 decompress2(uint in) {
+	double inFl = (double) in;
+	
+	inFl *= 2.0;
+	inFl += 1.0;
+
+	uint2 res; 
+	res.s0 = (uint) round(sqrt(inFl));
+
+
+	uint tmp = res.s0 * (res.s0-1);
+	tmp = tmp >> 1;	
+
+	res.s1 = (uint) (in - tmp);
+	
+	return res;	
+}
+
+
+__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round1 (				// Round 1
+		__global uint8 * buffer0,
+		__global uint8 * buffer1,
+		__global uint8  * buffer2,
+		__global uint * counters,
+		__global uint4 * res,
+		const uint extra,
+		const ulong8 hashState,
+		const ulong nonce) {
+
+	uint lId = get_local_id(0);
+	uint grp = get_group_id(0); 
+
+	uint bucket = grp >> 2;
+	uint mask = (grp & 3);
+
+	__global uint8 * output = buffer1;
+	__global uint8 * input = &buffer0[bucket*4592];
+
+	__local uint scratch[8162];
+	__local uint * ht = &scratch[654];
+	__local uint * scratch0 = &scratch[1166];
+	__local uint * scratch1 = &scratch[2332];
+	__local uint * scratch2 = &scratch[3498];
+	__local uint * scratch3 = &scratch[4664];
+	__local uint * scratch4 = &scratch[5830];
+	__local uint * scratch5 = &scratch[6996];
+	__local uint * scratch6 = &scratch[0];
+	__local uint iCNT[2];
+
+	__global uint * inCounter = &counters[0];
+	__global uint * outCounter = &counters[16384];
+
+	#ifdef PRINT
+	if (get_global_id(0) == 0) {
+		uint sum=0;
+		for (uint i=0; i<16384; i++) {
+			sum += inCounter[i];			
+		}
+		printf("R0: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]);
+	} 
+	#endif
+
+	if (lId == 0) {
+		iCNT[1] = 0;
+		iCNT[0] = min(inCounter[bucket],(uint) 4592);
+	} 
+
+	ht[lId] = 0x7FF;
+	ht[lId+256] = 0x7FF;
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+
+	uint8 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5;
+
+	iScr0 = input[lId];
+	iScr1 = input[lId + 256];
+	iScr2 = input[lId + 512];
+
+	iScr3 = input[lId + 768];
+	iScr4 = input[lId + 1024];
+	iScr5 = input[lId + 1280];
+	
+	masking8_7(iScr0, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr1, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr2, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 1536];
+	iScr1 = input[lId + 1792];
+	iScr2 = input[lId + 2048]; 
+
+	masking8_7(iScr3, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr4, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr5, &scratch[0], &iCNT[1], mask, 0x3);	
+
+	iScr3 = input[lId + 2304];
+	iScr4 = input[lId + 2560];
+	iScr5 = input[lId + 2816];
+
+	masking8_7(iScr0, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr1, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr2, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 3072];
+	iScr1 = input[lId + 3328];
+	if ((lId + 3584) < iCNT[0]) iScr2 = input[lId + 3584]; 
+
+	masking8_7(iScr3, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr4, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr5, &scratch[0], &iCNT[1], mask, 0x3);	
+
+	if ((lId + 3840) < iCNT[0]) iScr3 = input[lId + 3840];
+	if ((lId + 4096) < iCNT[0]) iScr4 = input[lId + 4096];
+	if ((lId + 4352) < iCNT[0]) iScr5 = input[lId + 4352];
+
+	masking8_7(iScr0, &scratch[0], &iCNT[1], mask, 0x3);
+	int pos = masking8_7(iScr1, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 3584) < iCNT[0]) masking8_7(iScr2, &scratch[0], &iCNT[1], mask, 0x3);
+
+	if ((lId + 3840) < iCNT[0]) masking8_7(iScr3, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 4096) < iCNT[0]) masking8_7(iScr4, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 4352) < iCNT[0]) masking8_7(iScr5, &scratch[0], &iCNT[1], mask, 0x3);
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+	uint inlim = min(iCNT[1], (uint) 1166);
+	if (lId == 0) iCNT[0] = inlim-1;
+	barrier(CLK_LOCAL_MEM_FENCE);	
+
+	int ownPos = atomic_dec(&iCNT[0]);
+	uint own = scratch0[ownPos];
+	uint othPos = own & 0x7FF;
+
+	while ((othPos == 0x7FF) && (ownPos >= 0)) {
+		ownPos = atomic_dec(&iCNT[0]);
+		if (ownPos >= 0) own = scratch0[ownPos];
+		othPos = (ownPos >= 0) ? own & 0x7FF : 0x7FF;
+	}
+
+	othPos = (ownPos < inlim) ? othPos : 0x7FF;
+
+	uint addr, elem, val;
+	uint2 el0, el1;
+	uint8 oScrT;
+
+	while (othPos < inlim) {
+		elem = scratch0[othPos];
+		oScrT.s0 = (own ^ elem) >> 11;
+	
+		if (oScrT.s0 != 0) {
+			val = oScrT.s0 & 0x1FFF;
+			addr = atomic_inc(&outCounter[val]);
+
+			oScrT.s1 = scratch1[ownPos] ^ scratch1[othPos];
+			oScrT.s2 = scratch2[ownPos] ^ scratch2[othPos];
+			oScrT.s3 = scratch3[ownPos] ^ scratch3[othPos];
+			oScrT.s4 = scratch4[ownPos] ^ scratch4[othPos];
+
+			el0.s0 = scratch5[ownPos];
+			el0.s1 = scratch6[ownPos];
+			el1.s0 = scratch5[othPos];
+			el1.s1 = scratch6[othPos];
+
+			oScrT.s5 = (el0.s0 ^ el1.s0) & 0x7FFFF;
+
+			el0.s0 = el0.s0 >> 19;
+			el1.s0 = el1.s0 >> 19;
+			el0.s1 = (el0.s1 >> 12) << 13;
+			el1.s1 = (el1.s1 >> 12) << 13;
+
+			el0.s1 |= el0.s0;
+			el1.s1 |= el1.s0;
+
+			oScrT.s0 = (oScrT.s0 >> 13) | (oScrT.s1 << 8);
+			oScrT.s1 = (oScrT.s1 >> 24) | (oScrT.s2 << 8);
+			oScrT.s2 = (oScrT.s2 >> 24) | (oScrT.s3 << 8);
+			oScrT.s3 = (oScrT.s3 >> 24) | (oScrT.s4 << 8);
+			oScrT.s4 = (oScrT.s4 >> 24) | (oScrT.s5 << 8);
+			
+			//if (get_global_id(0) == 0) printf("%d %d \n ", el0, el1);
+
+			oScrT.s5 = el0.s1 | (el1.s1 << 25);
+			oScrT.s6 = el1.s1 >> 7;
+			oScrT.s7 = 0;
+			addr += 4592*val;
+
+			output[addr] = oScrT; 
+		} 
+
+		othPos = elem & 0x7FF;
+
+		while ((othPos == 0x7FF) && (ownPos >= 0)) {
+			ownPos = atomic_dec(&iCNT[0]);
+			if (ownPos >= 0) own = scratch0[ownPos];
+			othPos = (ownPos >= 0) ? own & 0x7FF : 0x7FF;
+		}
+	} 	
+} 
+
+
+__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round2 (				// Round 2
+		__global uint8 * buffer0,
+		__global uint8 * buffer1,
+		__global uint4 * buffer2,
+		__global uint * counters,
+		__global uint4 * res,
+		const uint extra,
+		const ulong8 hashState,
+		const ulong nonce) {
+	uint lId = get_local_id(0);
+	uint grp = get_group_id(0); 
+			
+	uint bucket = grp >> 2; //0x3FFF;
+	uint mask = grp & 3;
+
+	__global uint8 * output = buffer0;
+	__global uint8 * input = &buffer1[bucket*4592];
+
+	__local uint scratch[8162];
+	__local uint * ht = &scratch[654];
+	__local uint * scratch0 = &scratch[1166];
+	__local uint * scratch1 = &scratch[2332];
+	__local uint * scratch2 = &scratch[3498];
+	__local uint * scratch3 = &scratch[4664];
+	__local uint * scratch4 = &scratch[5830];
+	__local uint * scratch5 = &scratch[6996];
+	__local uint * scratch6 = &scratch[0];
+	__local uint iCNT[2];
+
+	__global uint * inCounter = &counters[16384];
+	__global uint * outCounter = &counters[32768];
+
+	if (lId == 0) {
+		iCNT[0] = min(inCounter[bucket],(uint) 4592);
+		iCNT[1] = 0;
+	} 
+
+	#ifdef PRINT
+	if (get_global_id(0) == 0) {
+		uint sum=0;
+		for (uint i=0; i<16384; i++) {
+			sum += inCounter[i];			
+		}
+		printf("R1: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]);
+	} 
+	#endif
+
+	ht[lId] = 0x7FF;
+	ht[lId+256] = 0x7FF;
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+
+	uint8 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5;
+
+	iScr0 = input[lId];
+	iScr1 = input[lId + 256];
+	iScr2 = input[lId + 512];
+
+	iScr3 = input[lId + 768];
+	iScr4 = input[lId + 1024];
+	iScr5 = input[lId + 1280];
+	
+	masking8_7(iScr0, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr1, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr2, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 1536];
+	iScr1 = input[lId + 1792];
+	iScr2 = input[lId + 2048]; 
+
+	masking8_7(iScr3, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr4, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr5, &scratch[0], &iCNT[1], mask, 0x3);	
+
+	iScr3 = input[lId + 2304];
+	iScr4 = input[lId + 2560];
+	iScr5 = input[lId + 2816];
+
+	masking8_7(iScr0, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr1, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr2, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 3072];
+	iScr1 = input[lId + 3328];
+	if ((lId + 3584) < iCNT[0]) iScr2 = input[lId + 3584]; 
+
+	masking8_7(iScr3, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr4, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr5, &scratch[0], &iCNT[1], mask, 0x3);
+
+	if ((lId + 3840) < iCNT[0]) iScr3 = input[lId + 3840];
+	if ((lId + 4096) < iCNT[0]) iScr4 = input[lId + 4096];
+	if ((lId + 4352) < iCNT[0]) iScr5 = input[lId + 4352];
+
+	masking8_7(iScr0, &scratch[0], &iCNT[1], mask, 0x3);
+	masking8_7(iScr1, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 3584) < iCNT[0]) masking8_7(iScr2, &scratch[0], &iCNT[1], mask, 0x3);
+
+	if ((lId + 3840) < iCNT[0]) masking8_7(iScr3, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 4096) < iCNT[0]) masking8_7(iScr4, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 4352) < iCNT[0]) masking8_7(iScr5, &scratch[0], &iCNT[1], mask, 0x3);
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+	uint inlim = min(iCNT[1], (uint) 1166);
+	if (lId == 0) iCNT[0] = inlim-1;
+	barrier(CLK_LOCAL_MEM_FENCE);	
+
+	int ownPos = atomic_dec(&iCNT[0]);
+	uint own = scratch0[ownPos];
+	uint othPos = own & 0x7FF;
+
+	while ((othPos == 0x7FF) && (ownPos >= 0)) {
+		ownPos = atomic_dec(&iCNT[0]);
+		if (ownPos >= 0) own = scratch0[ownPos];
+		othPos = (ownPos >= 0) ? own & 0x7FF : 0x7FF;
+	}
+
+	othPos = (ownPos < inlim) ? othPos : 0x7FF;
+
+	uint addr, elem, val;
+	uint el0;
+	uint8 oScrT;
+
+	while (othPos < inlim) {
+		elem = scratch0[othPos];
+		oScrT.s0 = (own ^ elem) >> 11;
+	
+		if (oScrT.s0 != 0) {
+			val = oScrT.s0 & 0x1FFF;
+			addr = atomic_inc(&outCounter[val]);
+
+			oScrT.s1 = scratch1[ownPos] ^ scratch1[othPos];
+			oScrT.s2 = scratch2[ownPos] ^ scratch2[othPos];
+			oScrT.s3 = scratch3[ownPos] ^ scratch3[othPos];
+			oScrT.s4 = scratch4[ownPos] ^ scratch4[othPos];
+
+			oScrT.s7 = scratch5[othPos];
+			oScrT.s6 = scratch5[ownPos];
+			oScrT.s5 = (scratch6[othPos] & 0xFFFFF000);
+
+			oScrT.s0 = (oScrT.s0 >> 10) | (oScrT.s1 << 11);
+			oScrT.s1 = (oScrT.s1 >> 21) | (oScrT.s2 << 11);
+			oScrT.s2 = (oScrT.s2 >> 21) | (oScrT.s3 << 11);
+			oScrT.s3 = (oScrT.s3 >> 21) | (oScrT.s4 << 11);
+			oScrT.s4 = (oScrT.s4 >> 21);
+			oScrT.s4 |= (scratch6[ownPos] & 0xFFFFF000);
+
+			addr += 4592*val;
+
+			output[addr] = oScrT; 	
+
+			//if (get_global_id(0) == 0) printf("%x %x %x %x \n", oScrT.lo);
+		} 
+
+		othPos = elem & 0x7FF;
+
+		while ((othPos == 0x7FF) && (ownPos >= 0)) {
+			ownPos = atomic_dec(&iCNT[0]);
+			if (ownPos >= 0) own = scratch0[ownPos];
+			othPos = (ownPos >= 0) ? own & 0x7FF : 0x7FF;
+		}
+	}  	
+}
+
+
+__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round3 (				// Round 1 / 3
+		__global uint8 * buffer0,
+		__global uint4 * buffer1,
+		__global uint * buffer2,
+		__global uint * counters,
+		__global uint4 * res,
+		const uint extra,
+		const ulong8 hashState,
+		const ulong nonce) {
+	uint lId = get_local_id(0);
+	uint grp = get_group_id(0); 
+
+	uint bucket = grp >> 2;
+	uint mask = (grp & 3) << 3;
+
+	__global uint4 * output = buffer1;
+	__global uint8 * input = &buffer0[bucket*4592];
+
+	__global uint * sideLoadR13 = (__global uint *) &buffer1[37748736 + grp*328];
+
+	__local uint scratch[6912];
+	__local uint * ht = &scratch[0];
+	__local uint * scratch0 = &scratch[512];
+	__local uint * scratch1 = &scratch[1792];
+	__local uint * scratch2 = &scratch[3072];
+	__local uint * scratch3 = &scratch[4352];
+	__local uint * scratch4 = &scratch[5632];
+	__local uint iCNT[2];
+	__local uint pCNT[1];
+
+	__global uint * inCounter = &counters[32768];
+	__global uint * outCounter = &counters[49152];
+
+	#ifdef PRINT
+	if (get_global_id(0) == 0) {
+		uint sum=0;
+		for (uint i=0; i<16384; i++) {
+			sum += inCounter[i];			
+		}
+		printf("R2: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]);
+	} 
+	#endif
+
+	if (lId == 0) {
+		iCNT[1] = 0;
+		iCNT[0] = min(inCounter[bucket],(uint) 4592);
+		pCNT[0] = 0;
+	} 
+
+	ht[lId] = 0xFFF;
+	ht[lId+256] = 0xFFF;
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+
+	uint8 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5;
+
+	iScr0 = input[lId];
+	iScr1 = input[lId + 256];
+	iScr2 = input[lId + 512];
+
+	iScr3 = input[lId + 768];
+	iScr4 = input[lId + 1024];
+	iScr5 = input[lId + 1280];
+	
+	masking4_4b(iScr0.lo, lId + 0, &scratch[0], &iCNT[1], mask, 0x18);
+	masking4_4b(iScr1.lo, lId + 256, &scratch[0], &iCNT[1], mask, 0x18);
+	masking4_4b(iScr2.lo, lId + 512, &scratch[0], &iCNT[1], mask, 0x18);
+
+	iScr0 = input[lId + 1536];
+	iScr1 = input[lId + 1792];
+	iScr2 = input[lId + 2048]; 
+
+	masking4_4b(iScr3.lo, lId + 768, &scratch[0], &iCNT[1], mask, 0x18);
+	masking4_4b(iScr4.lo, lId + 1024, &scratch[0], &iCNT[1], mask, 0x18);
+	masking4_4b(iScr5.lo, lId + 1280, &scratch[0], &iCNT[1], mask, 0x18);	
+
+	iScr3 = input[lId + 2304];
+	iScr4 = input[lId + 2560];
+	iScr5 = input[lId + 2816];
+
+	masking4_4b(iScr0.lo, lId + 1536, &scratch[0], &iCNT[1], mask, 0x18);
+	masking4_4b(iScr1.lo, lId + 1792, &scratch[0], &iCNT[1], mask, 0x18);
+	masking4_4b(iScr2.lo, lId + 2048, &scratch[0], &iCNT[1], mask, 0x18);
+
+	iScr0 = input[lId + 3072];
+	iScr1 = input[lId + 3328];
+	if ((lId + 3584) < iCNT[0])iScr2 = input[lId + 3584]; 
+
+	masking4_4b(iScr3.lo, lId + 2304, &scratch[0], &iCNT[1], mask, 0x18);
+	masking4_4b(iScr4.lo, lId + 2560, &scratch[0], &iCNT[1], mask, 0x18);
+	masking4_4b(iScr5.lo, lId + 2816, &scratch[0], &iCNT[1], mask, 0x18);
+
+	if ((lId + 3840) < iCNT[0])iScr3 = input[lId + 3840];
+	if ((lId + 4096) < iCNT[0])iScr4 = input[lId + 4096];
+	if ((lId + 4352) < iCNT[0])iScr5 = input[lId + 4352];
+
+	masking4_4bt(iScr0.lo, lId + 3072, &scratch[0], &iCNT[1], mask, 0x18);
+	masking4_4bt(iScr1.lo, lId + 3328, &scratch[0], &iCNT[1], mask, 0x18);
+	if ((lId + 3584) < iCNT[0])masking4_4bt(iScr2.lo, lId + 3584, &scratch[0], &iCNT[1], mask, 0x18);
+
+	if ((lId + 3840) < iCNT[0])masking4_4bt(iScr3.lo, lId + 3840, &scratch[0], &iCNT[1], mask, 0x18);
+	if ((lId + 4096) < iCNT[0])masking4_4bt(iScr4.lo, lId + 4096, &scratch[0], &iCNT[1], mask, 0x18);
+	if ((lId + 4352) < iCNT[0])masking4_4bt(iScr5.lo, lId + 4352, &scratch[0], &iCNT[1], mask, 0x18);
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+	//if (lId == 0) printf("%d \n", iCNT[1]);
+	uint inlim = min(iCNT[1], (uint) 1280);
+	if (lId == 0) iCNT[0] = inlim-1;
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ownPos = atomic_dec(&iCNT[0]);
+	uint own = scratch0[ownPos];
+	uint othPos = own & 0xFFF;
+	
+	while ((othPos == 0xFFF) && (ownPos >= 0)) {
+		ownPos = atomic_dec(&iCNT[0]);
+		if (ownPos >= 0) own = scratch0[ownPos];
+		othPos = (ownPos >= 0) ? own & 0xFFF : 0xFFF;
+	}
+
+	othPos = (ownPos < inlim) ? othPos : 0xFFF;
+	
+	uint addr, elem, val;
+	uint el0, el1;
+	int pcnt = 0;
+	uint4 oScrT;
+
+	while (othPos < inlim) {
+		elem = scratch0[othPos];
+		oScrT.s0 = (own ^ elem) >> 14;
+
+		uint sideAddr = atomic_inc(&pCNT[0]);			
+		if (sideAddr < 1312) {	
+			val = oScrT.s0 & 0x1FFF;		
+			addr = atomic_inc(&outCounter[val]);
+
+			sideLoadR13[sideAddr] = scratch4[ownPos] | (scratch4[othPos] << 16);
+
+			oScrT.s1 = scratch1[ownPos] ^ scratch1[othPos];
+			oScrT.s2 = scratch2[ownPos] ^ scratch2[othPos];
+			oScrT.s3 = scratch3[ownPos] ^ scratch3[othPos];
+
+			oScrT.s0 = (oScrT.s0 >> 13) | (oScrT.s1 << 5);
+			oScrT.s1 = (oScrT.s1 >> 27) | (oScrT.s2 << 5);
+			oScrT.s2 = (oScrT.s2 >> 27) | (oScrT.s3 << 5);
+			oScrT.s3 = (oScrT.s3 >> 27);
+					
+			addr += 4592*val;
+			oScrT.s3 |= ((sideAddr + 1312*grp) << 6);  
+
+			output[addr] = oScrT;	
+		} 
+
+		othPos = elem & 0xFFF;
+	
+		while ((othPos == 0xFFF) && (ownPos >= 0)) {
+			ownPos = atomic_dec(&iCNT[0]);
+			if (ownPos >= 0) own = scratch0[ownPos];
+			othPos = (ownPos >= 0) ? own & 0xFFF : 0xFFF;
+		}
+
+	}  
+}
+
+
+
+__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round4 (				// Round 2 / 4
+		__global uint4 * buffer0,
+		__global uint4 * buffer1,
+		__global uint4 * buffer2,
+		__global uint * counters,
+		__global uint4 * res,
+		const uint extra,
+		const ulong8 hashState,
+		const ulong nonce) {
+	
+	uint lId = get_local_id(0);
+	uint grp = get_group_id(0); 
+
+	uint bucket = grp >> 2;
+	uint mask = grp & 3;
+
+	__global uint4 * output = buffer2;
+	__global uint4 * input = &buffer1[bucket*4592];
+
+	__local uint scratch[5376];
+	__local uint * ht = &scratch[0];
+	__local uint * scratch0 = &scratch[512];
+	__local uint * scratch1 = &scratch[1728];
+	__local uint * scratch2 = &scratch[2944];
+	__local uint * scratch3 = &scratch[4160];
+	__local uint iCNT[2];
+
+	__global uint * inCounter = &counters[49152];
+	__global uint * outCounter = &counters[65536];
+
+	#ifdef PRINT
+	if (get_global_id(0) == 0) {
+		uint sum=0;
+		for (uint i=0; i<16384; i++) {
+			sum += inCounter[i];			
+		}
+		printf("R3: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]);
+	} 
+	#endif	
+	
+	if (lId == 0) {
+		iCNT[1] = 0;
+		iCNT[0] = min(inCounter[bucket],(uint) 4592);
+	} 
+
+	ht[lId] = 0x7FF;
+	ht[lId+256] = 0x7FF;
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+
+	uint4 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5;
+
+	iScr0 = input[lId];
+	iScr1 = input[lId + 256];
+	iScr2 = input[lId + 512];
+
+	iScr3 = input[lId + 768];
+	iScr4 = input[lId + 1024];
+	iScr5 = input[lId + 1280];
+	
+	masking4_4(iScr0, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_4(iScr1, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_4(iScr2, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 1536];
+	iScr1 = input[lId + 1792];
+	iScr2 = input[lId + 2048]; 
+
+	masking4_4(iScr3, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_4(iScr4, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_4(iScr5, &scratch[0], &iCNT[1], mask, 0x3);	
+
+	iScr3 = input[lId + 2304];
+	iScr4 = input[lId + 2560];
+	iScr5 = input[lId + 2816];
+
+	masking4_4(iScr0, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_4(iScr1, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_4(iScr2, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 3072];
+	iScr1 = input[lId + 3328];
+	if ((lId + 3584) < iCNT[0]) iScr2 = input[lId + 3584]; 
+
+	masking4_4t(iScr3, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_4t(iScr4, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_4t(iScr5, &scratch[0], &iCNT[1], mask, 0x3);
+
+	if ((lId + 3840) < iCNT[0]) iScr3 = input[lId + 3840];
+	if ((lId + 4096) < iCNT[0]) iScr4 = input[lId + 4096];
+	if ((lId + 4352) < iCNT[0]) iScr5 = input[lId + 4352];
+
+	masking4_4t(iScr0, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_4t(iScr1, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 3584) < iCNT[0]) masking4_4t(iScr2, &scratch[0], &iCNT[1], mask, 0x3);
+
+	if ((lId + 3840) < iCNT[0]) masking4_4t(iScr3, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 4096) < iCNT[0]) masking4_4t(iScr4, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 4352) < iCNT[0]) masking4_4t(iScr5, &scratch[0], &iCNT[1], mask, 0x3);
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+	uint inlim = min(iCNT[1], (uint) 1216);
+	if (lId == 0) iCNT[0] = inlim-1;
+	barrier(CLK_LOCAL_MEM_FENCE);		 
+
+	int ownPos = atomic_dec(&iCNT[0]);
+	uint own = scratch0[ownPos];
+	uint othPos = own & 0x7FF;
+
+	while ((othPos == 0x7FF) && (ownPos >= 0)) {
+		ownPos = atomic_dec(&iCNT[0]);
+		if (ownPos >= 0) own = scratch0[ownPos];
+		othPos = (ownPos >= 0) ? own & 0x7FF : 0x7FF;
+	}
+
+	othPos = (ownPos < inlim) ? othPos : 0x7FF;
+
+	uint addr, elem, val;
+	uint el0, el1;
+	uint4 oScrT;
+
+	while (othPos < inlim) {
+
+	
+		elem = scratch0[othPos];
+		oScrT.s0 = (own ^ elem) >> 11;
+	
+		if (oScrT.s0 != 0) {
+			val = oScrT.s0 & 0xFFF;
+			addr = atomic_inc(&outCounter[val]);
+
+			oScrT.s1 = scratch1[ownPos] ^ scratch1[othPos];
+			oScrT.s2 = scratch2[ownPos] ^ scratch2[othPos];
+			el0 = scratch3[ownPos];
+			el1 = scratch3[othPos];
+
+			oScrT.s3 = (el0 ^ el1) & 0x1F;
+			oScrT.s0 = (oScrT.s0 >> 12) | (oScrT.s1 << 9);
+			oScrT.s1 = (oScrT.s1 >> 23) | (oScrT.s2 << 9);
+			oScrT.s2 = (oScrT.s2 >> 23) | (oScrT.s3 << 9);
+			
+			uint2 tmp = compress(el0 >> 6,el1 >> 6);
+			addr += 8688*val;
+
+			oScrT.s3 = tmp.s0;
+			oScrT.s2 |= (tmp.s1 << 14); 	
+			output[addr] = oScrT; 	
+		} 
+
+		othPos = elem & 0x7FF;
+
+		while ((othPos == 0x7FF) && (ownPos >= 0)) {
+			ownPos = atomic_dec(&iCNT[0]);
+			if (ownPos >= 0) own = scratch0[ownPos];
+			othPos = (ownPos >= 0) ? own & 0x7FF : 0x7FF;
+		}
+	}  
+	
+}
+
+
+__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round5 (				// Round 3 / 5
+		__global uint4 * buffer0,
+		__global uint4 * buffer1,
+		__global uint4 * buffer2,
+		__global uint * counters,
+		__global uint4 * res,
+		const uint extra,
+		const ulong8 hashState,
+		const ulong nonce) {
+	
+	uint lId = get_local_id(0);
+	uint grp = get_group_id(0); 
+
+	uint bucket = grp >> 2;
+	uint mask = grp & 3;
+
+	__global uint4 * output = buffer1;
+	__global uint4 * input = &buffer2[bucket*8688];
+
+	__local uint scratch[7936];
+	__local uint * ht = &scratch[0];
+	__local uint * scratch0 = &scratch[1024];
+	__local uint * scratch1 = &scratch[3328];
+	__local uint * scratch2 = &scratch[5632];
+	__local uint iCNT[2];
+
+	__global uint * inCounter = &counters[65536];
+	__global uint * outCounter = &counters[81920];
+
+	#ifdef PRINT
+	if (get_global_id(0) == 0) {
+		uint sum=0;
+		for (uint i=0; i<16384; i++) {
+			sum += inCounter[i];			
+		}
+		printf("R4: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]);
+	} 
+	#endif	
+	
+	if (lId == 0) {
+		iCNT[1] = 0;
+		iCNT[0] = min(inCounter[bucket],(uint) 8688);
+	} 
+
+	ht[lId] = 0xFFF;
+	ht[lId+256] = 0xFFF;
+	ht[lId+512] = 0xFFF;
+	ht[lId+768] = 0xFFF;
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+
+	uint4 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5; 
+
+	iScr0 = input[lId];
+	iScr1 = input[lId + 256];
+	iScr2 = input[lId + 512];
+
+	iScr3 = input[lId + 768];
+	iScr4 = input[lId + 1024];
+	iScr5 = input[lId + 1280];
+	
+	masking4_3b(iScr0, lId + 0, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr1, lId + 256, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr2, lId + 512, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 1536];
+	iScr1 = input[lId + 1792];
+	iScr2 = input[lId + 2048]; 
+
+	masking4_3b(iScr3, lId + 768, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr4, lId + 1024, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr5, lId + 1280, &scratch[0], &iCNT[1], mask, 0x3);	
+
+	iScr3 = input[lId + 2304];
+	iScr4 = input[lId + 2560];
+	iScr5 = input[lId + 2816];
+
+	masking4_3b(iScr0, lId + 1536, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr1, lId + 1792, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr2, lId + 2048, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 3072];
+	iScr1 = input[lId + 3328];
+	iScr2 = input[lId + 3584]; 
+
+	masking4_3b(iScr3, lId + 2304, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr4, lId + 2560, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr5, lId + 2816, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr3 = input[lId + 3840];
+	iScr4 = input[lId + 4096];
+	iScr5 = input[lId + 4352];
+
+	masking4_3b(iScr0, lId + 3072, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr1, lId + 3328, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr2, lId + 3584, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 4608];
+	iScr1 = input[lId + 4864];
+	iScr2 = input[lId + 5120]; 
+
+	masking4_3b(iScr3, lId + 3840, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr4, lId + 4096, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr5, lId + 4352, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr3 = input[lId + 5376];
+	iScr4 = input[lId + 5632];
+	iScr5 = input[lId + 5888];
+
+	masking4_3b(iScr0, lId + 4608, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr1, lId + 4864, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr2, lId + 5120, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 6144];
+	iScr1 = input[lId + 6400];
+	iScr2 = input[lId + 6656]; 
+
+	masking4_3b(iScr3, lId + 5376, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr4, lId + 5632, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr5, lId + 5888, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr3 = input[lId + 6912];
+	iScr4 = input[lId + 7168];
+	iScr5 = input[lId + 7424];
+
+	masking4_3b(iScr0, lId + 6144, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr1, lId + 6400, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3b(iScr2, lId + 6656, &scratch[0], &iCNT[1], mask, 0x3);
+
+	if ((lId + 7680) < iCNT[0]) iScr0 = input[lId + 7680];
+	if ((lId + 7936) < iCNT[0]) iScr1 = input[lId + 7936];
+	if ((lId + 8192) < iCNT[0]) iScr2 = input[lId + 8192]; 
+
+	masking4_3bt(iScr3, lId + 6912, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3bt(iScr4, lId + 7168, &scratch[0], &iCNT[1], mask, 0x3);
+	masking4_3bt(iScr5, lId + 7424, &scratch[0], &iCNT[1], mask, 0x3);
+
+	if ((lId + 8448) < iCNT[0]) iScr3 = input[lId + 8448];
+	
+	if ((lId + 7680) < iCNT[0]) masking4_3bt(iScr0, lId + 7680, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 7936) < iCNT[0]) masking4_3bt(iScr1, lId + 7936, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 8192) < iCNT[0]) masking4_3bt(iScr2, lId + 8192, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 8448) < iCNT[0]) masking4_3bt(iScr3, lId + 8448, &scratch[0], &iCNT[1], mask, 0x3); 	
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+	uint inlim = min(iCNT[1], (uint) 2304);
+	if (lId == 0) iCNT[0] = inlim-1;
+	barrier(CLK_LOCAL_MEM_FENCE);	
+
+	int ownPos = atomic_dec(&iCNT[0]);
+	uint own = scratch0[ownPos];
+	uint othPos = own & 0xFFF;
+
+	while ((othPos == 0xFFF) && (ownPos >= 0)) {
+		ownPos = atomic_dec(&iCNT[0]);
+		if (ownPos >= 0) own = scratch0[ownPos];
+		othPos = (ownPos >= 0) ? own & 0xFFF : 0xFFF;
+	}
+
+	othPos = (ownPos < inlim) ? othPos : 0xFFF;
+
+	uint addr, elem, val;
+	uint el0, el1;
+	int pcnt = 0;
+	uint4 oScrT;
+
+	while (othPos < inlim) {
+
+	
+		elem = scratch0[othPos];
+		oScrT.s0 = (own ^ elem) >> 12;
+	
+		if (oScrT.s0 != 0) {
+			val = oScrT.s0 & 0xFFF;
+			addr = atomic_inc(&outCounter[val]);
+
+			oScrT.s1 = scratch1[ownPos] ^ scratch1[othPos];
+			el0 = scratch2[ownPos];
+			el1 = scratch2[othPos];
+			
+			oScrT.s2 = (el0 ^ el1) & 0x3FFF;
+			oScrT.s3 = 0;
+
+			oScrT.s0 = (oScrT.s0 >> 12) | (oScrT.s1 << 8);
+			oScrT.s1 = (oScrT.s1 >> 24) | (oScrT.s2 << 8);
+			oScrT.s2 = bucket;
+			oScrT.s3 = (el0 >> 14) | ((el1 >> 14) << 16);
+		
+			addr += 8688*(val & 0xFFF);
+			output[addr] = oScrT; 	
+		} 
+
+		othPos = elem & 0xFFF;
+
+		while ((othPos == 0xFFF) && (ownPos >= 0)) {
+			ownPos = atomic_dec(&iCNT[0]);
+			if (ownPos >= 0) own = scratch0[ownPos];
+			othPos = (ownPos >= 0) ? own & 0xFFF : 0xFFF;
+		}
+	}  
+}
+
+
+__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round6 (				// Round 4 / 6
+		__global uint4 * buffer0,
+		__global uint4 * buffer1,
+		__global uint4 * buffer2,
+		__global uint * counters,
+		__global uint4 * res,
+		const uint extra,
+		const ulong8 hashState,
+		const ulong nonce) {
+	uint lId = get_local_id(0);
+	uint grp = get_group_id(0); 
+
+	uint bucket = grp >> 2;
+	uint mask = grp & 3;
+
+	__global uint2 * output = (__global uint2 *) &buffer1[48496640];
+	__global uint4 * input = &buffer1[bucket*8688];
+
+
+	__local uint scratch[5632];
+	__local uint * ht = &scratch[0];
+	__local uint * scratch0 = &scratch[1024];
+	__local uint * scratch1 = &scratch[3328];
+	__local uint iCNT[2];
+
+	__global uint * inCounter = &counters[81920];
+	__global uint * outCounter = &counters[98304];
+
+	#ifdef PRINT
+	if (get_global_id(0) == 0) {
+		uint sum=0;
+		for (uint i=0; i<16384; i++) {
+			sum += inCounter[i];			
+		}
+		printf("R5: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]);
+	} 
+	#endif	
+
+	if (lId == 0) {
+		iCNT[1] = 0;
+		iCNT[0] = min(inCounter[bucket],(uint) 8688);
+	}
+
+	ht[lId] = 0xFFF;
+	ht[lId+256] = 0xFFF;
+	ht[lId+512] = 0xFFF;
+	ht[lId+768] = 0xFFF;
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+
+	uint4 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5; 
+
+	iScr0 = input[lId];
+	iScr1 = input[lId + 256];
+	iScr2 = input[lId + 512];
+
+	iScr3 = input[lId + 768];
+	iScr4 = input[lId + 1024];
+	iScr5 = input[lId + 1280];
+	
+	masking2_2b(iScr0.lo, lId + 0, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr1.lo, lId + 256, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr2.lo, lId + 512, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 1536];
+	iScr1 = input[lId + 1792];
+	iScr2 = input[lId + 2048]; 
+
+	masking2_2b(iScr3.lo, lId + 768, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr4.lo, lId + 1024, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr5.lo, lId + 1280, &scratch[0], &iCNT[1], mask, 0x3);	
+
+	iScr3 = input[lId + 2304];
+	iScr4 = input[lId + 2560];
+	iScr5 = input[lId + 2816];
+
+	masking2_2b(iScr0.lo, lId + 1536, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr1.lo, lId + 1792, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr2.lo, lId + 2048, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 3072];
+	iScr1 = input[lId + 3328];
+	iScr2 = input[lId + 3584]; 
+
+	masking2_2b(iScr3.lo, lId + 2304, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr4.lo, lId + 2560, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr5.lo, lId + 2816, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr3 = input[lId + 3840];
+	iScr4 = input[lId + 4096];
+	iScr5 = input[lId + 4352];
+
+	masking2_2b(iScr0.lo, lId + 3072, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr1.lo, lId + 3328, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr2.lo, lId + 3584, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 4608];
+	iScr1 = input[lId + 4864];
+	iScr2 = input[lId + 5120]; 
+
+	masking2_2b(iScr3.lo, lId + 3840, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr4.lo, lId + 4096, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr5.lo, lId + 4352, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr3 = input[lId + 5376];
+	iScr4 = input[lId + 5632];
+	iScr5 = input[lId + 5888];
+
+	masking2_2b(iScr0.lo, lId + 4608, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr1.lo, lId + 4864, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr2.lo, lId + 5120, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 6144];
+	iScr1 = input[lId + 6400];
+	iScr2 = input[lId + 6656]; 
+
+	masking2_2b(iScr3.lo, lId + 5376, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr4.lo, lId + 5632, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr5.lo, lId + 5888, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr3 = input[lId + 6912];
+	iScr4 = input[lId + 7168];
+	iScr5 = input[lId + 7424];
+
+	masking2_2b(iScr0.lo, lId + 6144, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr1.lo, lId + 6400, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2b(iScr2.lo, lId + 6656, &scratch[0], &iCNT[1], mask, 0x3);
+
+	if ((lId + 7680) < iCNT[0]) iScr0 = input[lId + 7680];
+	if ((lId + 7936) < iCNT[0]) iScr1 = input[lId + 7936];
+	if ((lId + 8192) < iCNT[0]) iScr2 = input[lId + 8192]; 
+
+	masking2_2bt(iScr3.lo, lId + 6912, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2bt(iScr4.lo, lId + 7168, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2bt(iScr5.lo, lId + 7424, &scratch[0], &iCNT[1], mask, 0x3);
+
+	if ((lId + 8448) < iCNT[0]) iScr3 = input[lId + 8448];
+	
+	if ((lId + 7680) < iCNT[0]) masking2_2bt(iScr0.lo, lId + 7680, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 7936) < iCNT[0]) masking2_2bt(iScr1.lo, lId + 7936, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 8192) < iCNT[0]) masking2_2bt(iScr2.lo, lId + 8192, &scratch[0], &iCNT[1], mask, 0x3);
+	if ((lId + 8448) < iCNT[0]) masking2_2bt(iScr3.lo, lId + 8448, &scratch[0], &iCNT[1], mask, 0x3); 
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+	uint inlim = min(iCNT[1], (uint) 2304);
+	if (lId == 0) iCNT[0] = inlim-1;
+	barrier(CLK_LOCAL_MEM_FENCE);	
+
+		int ownPos = atomic_dec(&iCNT[0]);
+		uint own = scratch0[ownPos];
+		uint othPos = own & 0xFFF;
+	
+		while ((othPos == 0xFFF) && (ownPos >= 0)) {
+			ownPos = atomic_dec(&iCNT[0]);
+			if (ownPos >= 0) own = scratch0[ownPos];
+			othPos = (ownPos >= 0) ? own & 0xFFF : 0xFFF;
+		}
+
+		othPos = (ownPos < inlim) ? othPos : 0xFFF;
+	
+		uint addr, elem, val;
+		uint el0, el1;
+		int pcnt = 0;
+		uint2 oScrT;
+
+		while (othPos < inlim) {
+
+		
+			elem = scratch0[othPos];
+			oScrT.s0 = (own ^ elem) >> 12;
+		
+			if (oScrT.s0 != 0) {
+				val = oScrT.s0 & 0xFFF;
+				addr = atomic_inc(&outCounter[val]);
+
+				uint el0 = scratch1[ownPos];
+				uint el1 = scratch1[othPos];	
+
+				oScrT.s1 = (el0 ^ el1) & 0x3FFFF;
+				el0 = el0 >> 18;
+				el1 = el1 >> 18;	
+
+				oScrT.s0 = (oScrT.s0 >> 12) | (oScrT.s1 << 8);
+				oScrT.s1 = compress2(el0,el1);
+				
+				addr += 8688*(val & 0xFFF);
+
+				oScrT.s1 |= (bucket << 26);
+				oScrT.s0 |= ((bucket >> 6) << 26); 	
+			
+				output[addr] = oScrT; 	
+			} 
+
+			othPos = elem & 0xFFF;
+	
+			while ((othPos == 0xFFF) && (ownPos >= 0)) {
+				ownPos = atomic_dec(&iCNT[0]);
+				if (ownPos >= 0) own = scratch0[ownPos];
+				othPos = (ownPos >= 0) ? own & 0xFFF : 0xFFF;
+			}
+		}  
+}
+
+
+__kernel __attribute__((reqd_work_group_size(256, 1, 1))) void round7 (				// Round 5 / 7
+		__global uint4 * buffer0,
+		__global uint4 * buffer1,
+		__global uint4 * buffer2,
+		__global uint * counters,
+		__global uint4 * res,
+		const uint extra,
+		const ulong8 hashState,
+		const ulong nonce) {
+	uint lId = get_local_id(0);
+	uint grp = get_group_id(0); 
+
+	uint bucket = grp >> 2;
+	uint mask = grp & 3;
+
+	__global uint4 * output = &buffer1[67305472];
+	__global uint2 * outputR46 = (__global uint2 *) &buffer1[48496640];
+	__global uint4 * input = &buffer1[48496640 + bucket*4344];
+
+	__local uint scratch[5632];
+	__local uint * ht = &scratch[0];
+	__local uint * scratch0 = &scratch[1024];
+	__local uint * scratch1 = &scratch[3328];
+	__local uint iCNT[2];
+
+	__global uint * inCounter = &counters[98304];
+	__global uint * outCounter = &counters[114688];
+
+	#ifdef PRINT
+	if (get_global_id(0) == 0) {
+		uint sum=0;
+		for (uint i=0; i<16384; i++) {
+			sum += inCounter[i];			
+		}
+		printf("R6: %d %d %d %d \n", sum, inCounter[0], inCounter[1], inCounter[2]);
+	} 
+	#endif
+
+	if (lId == 0) {
+		iCNT[1] = 0;
+		iCNT[0] = min(inCounter[bucket],(uint) 8688);
+	}
+
+	ht[lId] = 0xFFF;
+	ht[lId+256] = 0xFFF;
+	ht[lId+512] = 0xFFF;
+	ht[lId+768] = 0xFFF;
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	uint4 iScr0,iScr1,iScr2,iScr3,iScr4,iScr5; 
+
+	iScr0 = input[lId];
+	iScr1 = input[lId + 256];
+	iScr2 = input[lId + 512];
+
+	iScr3 = input[lId + 768];
+	iScr4 = input[lId + 1024]; 
+	iScr5 = input[lId + 1280];
+
+	masking2_2(iScr0.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr0.hi, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr1.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr1.hi, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr2.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr2.hi, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 1536];
+	iScr1 = input[lId + 1792];
+	iScr2 = input[lId + 2048];
+
+	masking2_2(iScr3.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr3.hi, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr4.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr4.hi, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr5.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr5.hi, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr3 = input[lId + 2304];
+	iScr4 = input[lId + 2560]; 
+	iScr5 = input[lId + 2816];
+
+	masking2_2(iScr0.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr0.hi, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr1.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr1.hi, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr2.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr2.hi, &scratch[0], &iCNT[1], mask, 0x3);
+
+	iScr0 = input[lId + 3072];
+	iScr1 = input[lId + 3328];
+	if (2*(lId+3584) < iCNT[0]) iScr2 = input[lId + 3584];
+
+	masking2_2(iScr3.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr3.hi, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr4.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr4.hi, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr5.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr5.hi, &scratch[0], &iCNT[1], mask, 0x3);
+
+	if (2*(lId+3840) < iCNT[0]) iScr3 = input[lId + 3840];
+	if (2*(lId+4096) < iCNT[0]) iScr4 = input[lId + 4096]; 
+	if (2*(lId+4352) < iCNT[0]) iScr5 = input[lId + 4352];
+
+	masking2_2(iScr0.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr0.hi, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr1.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	masking2_2(iScr1.hi, &scratch[0], &iCNT[1], mask, 0x3);
+	if (2*(lId+3584) < iCNT[0])  masking2_2(iScr2.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	if (2*(lId+3584)+1 < iCNT[0])masking2_2(iScr2.hi, &scratch[0], &iCNT[1], mask, 0x3);
+
+	if (2*(lId+3840)+0 < iCNT[0]) masking2_2(iScr3.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	if (2*(lId+3840)+1 < iCNT[0]) masking2_2(iScr3.hi, &scratch[0], &iCNT[1], mask, 0x3);
+	if (2*(lId+4096)+0 < iCNT[0]) masking2_2(iScr4.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	if (2*(lId+4096)+1 < iCNT[0]) masking2_2(iScr4.hi, &scratch[0], &iCNT[1], mask, 0x3);
+	if (2*(lId+4352)+0 < iCNT[0]) masking2_2(iScr5.lo, &scratch[0], &iCNT[1], mask, 0x3);
+	if (2*(lId+4352)+1 < iCNT[0]) masking2_2(iScr5.hi, &scratch[0], &iCNT[1], mask, 0x3);
+
+	barrier(CLK_LOCAL_MEM_FENCE);	
+	if (lId == 0) iCNT[1] = min(iCNT[1], (uint) 2304);
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	uint ownPos = lId;
+	uint own = scratch0[ownPos];
+	uint othPos = own & 0xFFF;
+
+	uint ownEl;
+
+	if ((own & 0xFFF) != 0xFFF) {
+		ownEl = scratch1[ownPos];
+	}
+
+	uint2 oScrT;
+
+	while (ownPos < iCNT[1]) {
+		uint addr = (othPos < iCNT[1]) ? othPos : ownPos+256;
+		uint elem = scratch0[addr];
+		
+		if (othPos < iCNT[1]) {
+			uint oth = elem;
+			if (((own ^ oth) & 0x03FFF000) == 0) {
+				uint4 elem;
+				elem.s0 = own;
+				elem.s1 = ownEl;
+
+				elem.s2 = oth;
+				elem.s3 = scratch1[othPos];
+
+				uint4 naddr;
+
+				naddr.s01 = decompress2(elem.s1 & 0x3FFFFFF);			// Unpack R4 / 6
+				naddr.s23 = decompress2(elem.s3 & 0x3FFFFFF);
+				
+				elem.s0 = elem.s0 >> 26;
+				elem.s1 = elem.s1 >> 26;
+				elem.s0 = elem.s0 << 6;
+				elem.s0 |= elem.s1;
+				
+				naddr.s0 += 8688*elem.s0;
+				naddr.s1 += 8688*elem.s0;
+
+				elem.s2 = elem.s2 >> 26;
+				elem.s3 = elem.s3 >> 26;
+				elem.s2 = elem.s2 << 6;
+				elem.s2 |= elem.s3;
+				
+				naddr.s2 += 8688*elem.s2;
+				naddr.s3 += 8688*elem.s2;
+
+				bool ok = true;
+
+				ok = ok && (naddr.s0 != naddr.s1) && (naddr.s0 != naddr.s2) && (naddr.s0 != naddr.s3); 
+				ok = ok && (naddr.s1 != naddr.s2) && (naddr.s1 != naddr.s3) && (naddr.s2 != naddr.s3); 
+				
+				if (ok) {
+					addr = atomic_inc(&outCounter[oScrT.s0 & 0xFFF]);
+					if (addr < 4096) {
+						output[addr] = naddr;	
+					}
+				}
+			}		
+		} else { 
+			own = elem;
+			ownPos += 256;
+			if (((own & 0xFFF) != 0xFFF) && (ownPos < iCNT[1])) {
+				ownEl = scratch1[ownPos];
+			}
+		}
+
+		othPos = elem & 0xFFF;
+	}  	
+}
+
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void combine (
+		__global uint8 * buffer0,
+		__global uint4 * buffer1,
+		__global uint4 * buffer2,
+		__global uint * counters,
+		__global uint4 * output0,
+		const uint extra,
+		const ulong8 hashState,
+		const ulong nonce) {
+
+	 uint gId = get_group_id(0);
+	uint lId = get_local_id(0);
+
+	__global uint * outCounters = (__global uint*) &output0[0];
+
+	__global uint * sideLoadR13 = (__global uint *) &buffer1[37748736];
+
+	__global uint2 * R46Out = (__global uint2 *) &buffer1[48496640];
+	__global uint4 * R57Out = &buffer1[67305472];
+
+	__local uint scratch[256];
+	__local uint ok[1];
+
+	uint2 tmps;
+
+	#ifdef PRINT
+		if (get_global_id(0) == 0) {
+			 printf("R5: %d \n", counters[114688]);
+		}
+	#endif
+
+	if (gId < counters[114688]) {
+		if (lId == 0) {
+			uint4 tmp;
+			tmp = R57Out[gId];
+			
+			scratch[128 + 4*lId+0] = tmp.s0;
+			scratch[128 + 4*lId+1] = tmp.s1;
+			scratch[128 + 4*lId+2] = tmp.s2;
+			scratch[128 + 4*lId+3] = tmp.s3;
+		}
+
+		bool check = true;
+		if (lId == 0) ok[0] = 0;
+		barrier(CLK_LOCAL_MEM_FENCE); 
+
+		if (lId < 4) {							// Unpack R3 / 5
+			uint addr = scratch[128 + lId];
+			if (addr < 37617664) {
+				uint4 tmp = buffer1[addr];
+				
+				atomic_xor(&ok[0], tmp.s1 >> 18);
+
+				tmp.s0 = tmp.s3 & 0xFFFF;
+				tmp.s1 = tmp.s3 >> 16;
+
+				tmp.s0 += 8688*tmp.s2;
+				tmp.s1 += 8688*tmp.s2;
+
+				scratch[2*lId] = tmp.s0;	
+				scratch[2*lId+1] = tmp.s1;	
+			}
+		}			
+
+		barrier(CLK_LOCAL_MEM_FENCE); 
+		check = (ok[0] == 0);
+		barrier(CLK_LOCAL_MEM_FENCE); 
+
+		if ((lId < 8) && (check)) {							// Unpack R2 / 4
+			uint addr = scratch[lId];
+			if (addr < 37617664) {
+				uint4 tmp = buffer2[addr];
+				
+				tmp.s2 = tmp.s2 >> 14;
+				tmp.s01 = decompress(tmp.s32);
+
+				scratch[128+2*lId] = tmp.s0;	
+				scratch[128+2*lId+1] = tmp.s1;	
+			}
+		}
+			
+		barrier(CLK_LOCAL_MEM_FENCE); 
+		
+		if ((lId < 16) && (check)) {							// Unpack R1 / 3
+			uint addr = scratch[128+lId];
+			if (addr < 42991616) {
+				uint2 tmp;
+			
+				tmp.s0 = sideLoadR13[addr];
+				tmp.s1 = tmp.s0 & 0xFFFF;
+				tmp.s0 = tmp.s0 >> 16;
+
+				tmp.s0 += 4592*(addr / 5248);
+				tmp.s1 += 4592*(addr / 5248);
+
+				scratch[144+2*lId] = tmp.s0;
+				scratch[144+2*lId+1] = tmp.s1;
+			} 
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE); 
+
+		
+		if ((lId < 32) && (check)) {							// Unpack R2
+			uint addr = scratch[144+lId];
+			if (addr < 37617664) {
+				uint8 tmp;
+			
+				tmp = buffer0[addr];
+				
+				atomic_xor(&ok[0], tmp.s4 & 0x3F);
+				
+				tmp.s0 = (tmp.s4 >> 12) << 7;
+				tmp.s1 = tmp.s6;
+
+				tmp.s2 = (tmp.s5 >> 12) << 7;
+				tmp.s3 = tmp.s7;
+
+				tmp.s0 |= (tmp.s1 >> 25);
+				tmp.s2 |= (tmp.s3 >> 25);
+
+				tmp.s1 &= 0x1FFFFFF;
+				tmp.s3 &= 0x1FFFFFF;
+
+				scratch[4*lId] = tmp.s0;
+				scratch[4*lId+1] = tmp.s1;
+				scratch[4*lId+2] = tmp.s2;
+				scratch[4*lId+3] = tmp.s3;				
+			} 
+		} 
+		
+	
+		barrier(CLK_LOCAL_MEM_FENCE);	
+
+		if ((ok[0] == 0) && (check)) {
+			
+			scratch[128 + 2*lId] = 0xFFF;
+			scratch[128 + 2*lId+1] = 0xFFF;
+			
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			uint2 listEntry ;
+
+			uint elem = scratch[lId] & 0x3F;
+			listEntry.s0 = atomic_xchg(&scratch[128 + elem], lId);
+
+			elem = scratch[64+lId] & 0x3F;
+			listEntry.s1 = atomic_xchg(&scratch[128 + elem], lId+64);
+			
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+
+			scratch[128 + lId] = listEntry.s0;
+			scratch[128 + 64 + lId] = listEntry.s1;
+			
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+	
+
+			int next = scratch[128 + lId];
+			while (next < 128)  {	
+				if (scratch[lId] == scratch[next]) {
+					atomic_inc(&ok[0]);
+				} 	
+				next = scratch[128 + next];
+			} 
+
+			next = scratch[128 + 64 + lId];
+			while (next < 128)  {	
+				if (scratch[64 + lId] == scratch[next]) {
+					atomic_inc(&ok[0]);
+				} 	
+				next = scratch[128 + next];
+			} 	
+		} 
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		if ((ok[0] == 0) && (check)) {
+			uint addr;
+			if (lId == 0) addr = atomic_inc(&outCounters[0]);	
+
+			if (lId < 64) {
+				uint2 elem;
+				elem.s0 = scratch[2*lId];
+				elem.s1 = scratch[2*lId+1];
+
+				if (elem.s0 > elem.s1) elem.s01 = elem.s10;
+
+				scratch[128+2*lId] = elem.s0;
+				scratch[128+2*lId+1] = elem.s1;				// Elements sorted by 2 Elem
+			}
+
+			barrier(CLK_LOCAL_MEM_FENCE);	
+
+			uint2 tmp2;
+		
+			if (lId < 64) {
+				tmp2.s0 = lId >> 1;
+				tmp2.s1 = (scratch[128+4*tmp2.s0+0] > scratch[128+4*tmp2.s0+2]) ? (lId ^ 0x1) : lId;
+
+				scratch[2*lId] = scratch[128+2*tmp2.s1];
+				scratch[2*lId+1] = scratch[128+2*tmp2.s1+1];		// Elements sorted by 4 Elem
+			}
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			if (lId < 64) {
+				tmp2.s0 = lId >> 2;
+				tmp2.s1 = (scratch[8*tmp2.s0+0] > scratch[8*tmp2.s0+4]) ? (lId ^ 0x2) : lId;
+
+				scratch[128+2*lId+0] = scratch[2*tmp2.s1+0];		// Elements sorted by 8 Elem
+				scratch[128+2*lId+1] = scratch[2*tmp2.s1+1];
+			}
+		
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			if (lId < 64) {
+				tmp2.s0 = lId >> 3;
+				tmp2.s1 = (scratch[128+16*tmp2.s0+0] > scratch[128+16*tmp2.s0+8]) ? (lId ^ 0x4) : lId;
+
+				scratch[2*lId+0] = scratch[128+2*tmp2.s1+0];		// Elements sorted by 16 Elem
+				scratch[2*lId+1] = scratch[128+2*tmp2.s1+1];
+			}
+		
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			if (lId < 64) {
+				tmp2.s0 = lId >> 4;
+				tmp2.s1 = (scratch[32*tmp2.s0+0] > scratch[32*tmp2.s0+16]) ? (lId ^ 0x8) : lId;
+
+				scratch[128+2*lId+0] = scratch[2*tmp2.s1+0];		// Elements sorted by 32 Elem
+				scratch[128+2*lId+1] = scratch[2*tmp2.s1+1];
+			}
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			if (lId < 64) {
+				tmp2.s0 = lId >> 5;
+				tmp2.s1 = (scratch[128+64*tmp2.s0+0] > scratch[128+64*tmp2.s0+32]) ? (lId ^ 0x10) : lId;
+
+				scratch[2*lId+0] = scratch[128+2*tmp2.s1+0];		// Elements sorted by 64 Elem
+				scratch[2*lId+1] = scratch[128+2*tmp2.s1+1];
+			}
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			if (lId < 64) {
+				tmp2.s0 = lId >> 6;
+				tmp2.s1 = (scratch[128*tmp2.s0+0] > scratch[128*tmp2.s0+64]) ? (lId ^ 0x20) : lId;
+
+				scratch[128+2*lId+0] = scratch[2*tmp2.s1+0];		// Elements sorted by 128 Elem
+				scratch[128+2*lId+1] = scratch[2*tmp2.s1+1];
+			}
+
+			barrier(CLK_LOCAL_MEM_FENCE);					// All Elements sorted	
+			
+			if (lId == 0) scratch[0] = addr;
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			addr = scratch[0];
+
+			if ((addr < 16) && (lId < 32)) {
+				uint4 tmp;
+				tmp.s0 = scratch[128+4*lId];
+				tmp.s1 = scratch[128+4*lId+1];
+				tmp.s2 = scratch[128+4*lId+2];
+				tmp.s3 = scratch[128+4*lId+3];
+
+				output0[1 + 32*addr + lId] = tmp;
+			}   
+			
+		}
+	} 
+
+	//if (get_global_id(0) == 0) outCounters[0] = 2;
+
+}
+
diff --git a/mine.example.toml b/mine.example.toml
index b9a4e2d..15c76a8 100644
--- a/mine.example.toml
+++ b/mine.example.toml
@@ -23,9 +23,11 @@ user = "t1YourZClassicAddressHere.rig1"        # payout address / worker login
 # no-jackpot = true          # PPLNS
 
 # ── GPU backend ───────────────────────────────────────────────────────────────
-# backend      = "cuda"      # "cuda" or "opencl"
-# devices      = "all"       # "all", or a comma list e.g. "0,1"
-# force-opencl = false       # force OpenCL, disabling CUDA
+# backend      = "mixed"     # "mixed" (default: NVIDIA→CUDA, AMD/Intel→OpenCL),
+#                            # "opencl" (every card via OpenCL), or "cuda"
+# devices      = "all"       # "all", or a comma list e.g. "0,1" — in mixed mode
+#                            # these index the combined list from --list-devices
+# force-opencl = false       # force every card onto OpenCL, disabling CUDA
 
 # ── GPU tuning (clock/power changes need root) ────────────────────────────────
 # no-gpu-tune      = false
diff --git a/src/blake.rs b/src/blake.rs
index 1d6b2cd..798f250 100644
--- a/src/blake.rs
+++ b/src/blake.rs
@@ -166,6 +166,18 @@ impl BatchHasher {
         Self { mid, tail }
     }
 
+    /// The BLAKE2b chaining state after compressing the shared first 128-byte
+    /// header block (the eight 64-bit midstate words). This is exactly the
+    /// `hashState` (`ulong8`) the OpenCL `equihash192_7.cl` round-0 kernel
+    /// consumes: it injects the final 16-byte block (message word `m[1] =
+    /// (index << 32) | nonce_low`, `m[0] = 0`) itself, which requires the
+    /// header's bytes [128..136] to be zero (the same `cuda_compatible` rule the
+    /// CUDA backend relies on). Only used by the OpenCL (AMD) backend.
+    #[cfg_attr(not(feature = "gpu"), allow(dead_code))]
+    pub fn midstate(&self) -> [u64; 8] {
+        self.mid
+    }
+
     /// Assemble the zero-padded final block for index `g`.
     #[inline]
     fn final_block(&self, g: u32) -> [u8; 128] {
diff --git a/src/cuda.rs b/src/cuda.rs
index 3d3d8b6..4f6eaaa 100644
--- a/src/cuda.rs
+++ b/src/cuda.rs
@@ -108,7 +108,16 @@ const CU_LAUNCH_PARAM_END: usize = 0x00;
 const CU_LAUNCH_PARAM_BUFFER_POINTER: usize = 0x01;
 const CU_LAUNCH_PARAM_BUFFER_SIZE: usize = 0x02;
 
-extern "C" {
+// The CUDA driver API, loaded at runtime via dlopen (see `crate::dylib`) rather
+// than linked at build time: the SONAME `libcuda.so.1` ships with the NVIDIA
+// driver (`nvcuda.dll` on Windows) and is absent on driver-less / AMD-only
+// hosts. `cuda_lib()` returns `None` when it can't be opened; the public entry
+// points below turn that into a clear error / empty device list, so the binary
+// still builds and starts everywhere.
+crate::dylib::dynamic_library! {
+    lib_struct: CudaLib,
+    loader: cuda_lib,
+    names: ["libcuda.so.1", "libcuda.so", "nvcuda.dll"],
     fn cuInit(flags: c_uint) -> CUresult;
     fn cuDeviceGetCount(count: *mut c_int) -> CUresult;
     fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult;
@@ -148,6 +157,11 @@ extern "C" {
     fn cuGetErrorName(error: CUresult, str: *mut *const c_char) -> CUresult;
 }
 
+/// Error returned when the CUDA driver library isn't present on the host.
+fn cuda_unavailable() -> anyhow::Error {
+    anyhow!("CUDA driver library (libcuda.so.1) not found — is the NVIDIA driver installed?")
+}
+
 /// Turn a non-success `CUresult` into an error with the driver's symbolic name.
 fn check(code: CUresult, what: &str) -> Result<()> {
     if code == CUDA_SUCCESS {
@@ -164,8 +178,10 @@ fn check(code: CUresult, what: &str) -> Result<()> {
     Err(anyhow!("{what} failed: {name}"))
 }
 
-/// Number of CUDA devices (initialises the driver as a side effect).
+/// Number of CUDA devices (initialises the driver as a side effect). Returns an
+/// error if the CUDA driver library isn't installed.
 pub fn device_count() -> Result<usize> {
+    cuda_lib().ok_or_else(cuda_unavailable)?;
     unsafe {
         check(cuInit(0), "cuInit")?;
         let mut n: c_int = 0;
@@ -579,6 +595,7 @@ impl CudaSolver {
     /// fatbin, select the config that fits free VRAM, allocate its buffers, and
     /// rebase the recorded launch sequence.
     pub fn new(device_index: usize) -> Result<Self> {
+        cuda_lib().ok_or_else(cuda_unavailable)?;
         unsafe {
             check(cuInit(0), "cuInit")?;
             let mut dev: CUdevice = 0;
diff --git a/src/dylib.rs b/src/dylib.rs
new file mode 100644
index 0000000..0e5b062
--- /dev/null
+++ b/src/dylib.rs
@@ -0,0 +1,85 @@
+//! Tiny runtime dynamic-library loader for the optional GPU vendor libraries.
+//!
+//! The CUDA driver (`libcuda`) and NVML (`libnvidia-ml`) are vendor components
+//! that ship with the NVIDIA driver — they are not installable as ordinary
+//! build dependencies and are absent on AMD-only / driver-less hosts. Linking
+//! them at build time would (a) make the build fail without the NVIDIA libs
+//! present and (b) make the resulting binary refuse to start anywhere they are
+//! missing. Instead we `dlopen` them on first use: the binary has no build-time
+//! or load-time dependency on them, and the CUDA backend simply reports "no
+//! devices" when the driver isn't installed.
+//!
+//! [`dynamic_library!`] generates, for one such library, a function-pointer
+//! table plus same-named wrapper `fn`s, so the call sites in [`crate::cuda`] /
+//! [`crate::nvml`] are unchanged — only the `extern "C"` block is replaced.
+
+/// Open the first of `names` that loads (e.g. the versioned SONAME first, then
+/// the unversioned dev symlink). Returns the last error if none load.
+pub fn load_first(names: &[&str]) -> Result<libloading::Library, libloading::Error> {
+    let mut last_err = None;
+    for name in names {
+        match unsafe { libloading::Library::new(name) } {
+            Ok(lib) => return Ok(lib),
+            Err(e) => last_err = Some(e),
+        }
+    }
+    Err(last_err.expect("load_first called with an empty name list"))
+}
+
+/// Generate a runtime-loaded binding for one shared library.
+///
+/// Produces a hidden fn-pointer struct, a `OnceLock`-cached loader (`$loader()`
+/// returns `Option<&'static _>`, `None` when the library can't be loaded), and a
+/// same-named `unsafe fn` wrapper for each declared function that dispatches
+/// through the table. Public entry points must check `$loader().is_some()` (or
+/// `?` on the `Option`) before invoking any wrapper; the wrappers themselves
+/// panic if called with the library unloaded, which the entry-point guards
+/// prevent.
+macro_rules! dynamic_library {
+    (
+        lib_struct: $Lib:ident,
+        loader: $loader:ident,
+        names: [$($lname:expr),+ $(,)?],
+        $( fn $fname:ident($($an:ident: $at:ty),* $(,)?) -> $ret:ty; )*
+    ) => {
+        #[allow(non_snake_case)]
+        struct $Lib {
+            $( $fname: unsafe extern "C" fn($($at),*) -> $ret, )*
+            // Keep the library mapped for the process lifetime; the fn pointers
+            // above point into it.
+            #[allow(dead_code)]
+            handle: libloading::Library,
+        }
+
+        impl $Lib {
+            #[allow(non_snake_case)]
+            unsafe fn load() -> std::result::Result<Self, libloading::Error> {
+                let handle = $crate::dylib::load_first(&[$($lname),+])?;
+                $(
+                    let $fname: unsafe extern "C" fn($($at),*) -> $ret =
+                        *handle.get(concat!(stringify!($fname), "\0").as_bytes())?;
+                )*
+                Ok(Self { $($fname,)* handle })
+            }
+        }
+
+        static __DYLIB: std::sync::OnceLock<Option<$Lib>> = std::sync::OnceLock::new();
+
+        /// The loaded library, or `None` if it could not be opened.
+        fn $loader() -> Option<&'static $Lib> {
+            __DYLIB.get_or_init(|| unsafe { $Lib::load().ok() }).as_ref()
+        }
+
+        $(
+            #[inline]
+            #[allow(non_snake_case)]
+            unsafe fn $fname($($an: $at),*) -> $ret {
+                ($loader()
+                    .expect(concat!(stringify!($fname), ": ", stringify!($Lib), " not loaded"))
+                    .$fname)($($an),*)
+            }
+        )*
+    };
+}
+
+pub(crate) use dynamic_library;
diff --git a/src/gpu.rs b/src/gpu.rs
index 84a36f1..4134544 100644
--- a/src/gpu.rs
+++ b/src/gpu.rs
@@ -151,8 +151,9 @@ fn kernel_source(geom: &Geom) -> String {
     )
 }
 
-/// A persistent OpenCL solver bound to one device.
-pub struct GpuSolver {
+/// The default (project-native) OpenCL solver, bound to one device. Wrapped by
+/// [`GpuSolver`], which selects it for non-AMD devices.
+struct LegacySolver {
     pq: ProQue,
     header: Buffer<u8>,
     /// Per-table back-reference arrays (1 u32/slot), kept resident for recovery.
@@ -167,15 +168,14 @@ pub struct GpuSolver {
     nr_rows: usize,
 }
 
-impl GpuSolver {
+impl LegacySolver {
     /// This device's product name (e.g. "NVIDIA GeForce RTX 5080"), if available.
     pub fn device_name(&self) -> Option<String> {
         self.pq.device().name().ok()
     }
 
     /// Initialise the solver and allocate all device buffers.
-    pub fn new(device_index: usize) -> Result<Self> {
-        let (platform, device) = pick_device(device_index)?;
+    pub fn new(platform: ocl::Platform, device: ocl::Device) -> Result<Self> {
         let geom = pick_geom(&device);
         // The device's platform must be set explicitly: ProQue otherwise builds
         // the context against `Platform::default()` (the first platform), which
@@ -406,6 +406,101 @@ impl GpuSolver {
     }
 }
 
+/// OpenCL solver for one device. Dispatches to the AMD-tuned kernel
+/// (`equihash192_7.cl`) on AMD-vendor devices and the default project kernel
+/// (`equihash.cl`) everywhere else. Forceable with `ZCL_OPENCL_KERNEL=amd|legacy`.
+pub struct GpuSolver {
+    inner: SolverInner,
+}
+
+enum SolverInner {
+    Legacy(LegacySolver),
+    Amd(crate::gpu_amd::AmdSolver),
+}
+
+impl GpuSolver {
+    /// Initialise the solver for a flat device index, choosing the kernel by
+    /// device vendor (AMD → `equihash192_7.cl`).
+    pub fn new(device_index: usize) -> Result<Self> {
+        let (platform, device) = pick_device(device_index)?;
+        let inner = if use_amd_kernel(&device) {
+            log::info!("OpenCL: AMD device — using the equihash192_7 kernel");
+            SolverInner::Amd(crate::gpu_amd::AmdSolver::new(platform, device)?)
+        } else {
+            SolverInner::Legacy(LegacySolver::new(platform, device)?)
+        };
+        Ok(Self { inner })
+    }
+
+    /// This device's product name, if available.
+    pub fn device_name(&self) -> Option<String> {
+        match &self.inner {
+            SolverInner::Legacy(s) => s.device_name(),
+            SolverInner::Amd(s) => s.device_name(),
+        }
+    }
+
+    /// Solve the puzzle for `header` (140 bytes).
+    pub fn solve(&self, header: &[u8]) -> Result<Vec<Vec<u32>>> {
+        match &self.inner {
+            SolverInner::Legacy(s) => s.solve(header),
+            SolverInner::Amd(s) => s.solve(header),
+        }
+    }
+
+    /// Solve and also return the raw GPU candidate count (for diagnostics).
+    pub fn solve_with_stats(&self, header: &[u8]) -> Result<(usize, Vec<Vec<u32>>)> {
+        match &self.inner {
+            SolverInner::Legacy(s) => s.solve_with_stats(header),
+            SolverInner::Amd(s) => s.solve_with_stats(header),
+        }
+    }
+
+    /// Time each GPU stage individually.
+    pub fn profile(&self, header: &[u8]) -> Result<()> {
+        match &self.inner {
+            SolverInner::Legacy(s) => s.profile(header),
+            SolverInner::Amd(s) => s.profile(header),
+        }
+    }
+
+    /// Whether the per-index BLAKE2b probe ([`Self::hash_all`]) is available.
+    /// Only the default kernel exposes a linear digest layout; the AMD kernel
+    /// buckets in round 0, so the self-test skips the probe there.
+    pub fn supports_blake_probe(&self) -> bool {
+        matches!(self.inner, SolverInner::Legacy(_))
+    }
+
+    /// Compute every first-round BLAKE2b output (default kernel only).
+    pub fn hash_all(&self, header: &[u8]) -> Result<Vec<u8>> {
+        match &self.inner {
+            SolverInner::Legacy(s) => s.hash_all(header),
+            SolverInner::Amd(_) => {
+                Err(anyhow!("hash_all is not supported by the AMD kernel"))
+            }
+        }
+    }
+}
+
+/// Decide whether to drive `device` with the AMD `equihash192_7.cl` kernel.
+/// `ZCL_OPENCL_KERNEL` forces the choice (`amd` or `legacy`); otherwise it's by
+/// device vendor.
+fn use_amd_kernel(device: &ocl::Device) -> bool {
+    use ocl::enums::{DeviceInfo, DeviceInfoResult};
+    match std::env::var("ZCL_OPENCL_KERNEL").ok().as_deref() {
+        Some(v) if v.eq_ignore_ascii_case("amd") => return true,
+        Some(v) if v.eq_ignore_ascii_case("legacy") => return false,
+        _ => {}
+    }
+    match device.info(DeviceInfo::Vendor) {
+        Ok(DeviceInfoResult::Vendor(v)) => {
+            let v = v.to_ascii_lowercase();
+            v.contains("advanced micro devices") || v.contains("amd")
+        }
+        _ => false,
+    }
+}
+
 /// List `(platform, device)` names so the user can choose `--device`.
 pub fn list_devices() -> Result<Vec<String>> {
     use ocl::{Device, Platform};
@@ -422,6 +517,25 @@ pub fn list_devices() -> Result<Vec<String>> {
     Ok(names)
 }
 
+/// For each OpenCL device — in the same flat order as [`list_devices`] and
+/// `--devices` — whether its vendor is NVIDIA. The mixed backend uses this to
+/// hand NVIDIA cards to CUDA (and mine only the non-NVIDIA OpenCL devices).
+pub fn device_is_nvidia() -> Vec<bool> {
+    use ocl::enums::{DeviceInfo, DeviceInfoResult};
+    use ocl::{Device, Platform};
+    let mut out = Vec::new();
+    for platform in Platform::list() {
+        for device in Device::list_all(platform).unwrap_or_default() {
+            let is_nv = matches!(
+                device.info(DeviceInfo::Vendor),
+                Ok(DeviceInfoResult::Vendor(v)) if v.to_ascii_lowercase().contains("nvidia")
+            );
+            out.push(is_nv);
+        }
+    }
+    out
+}
+
 /// The flat OpenCL device index of the first CPU-type device (e.g. PoCL), if any.
 /// Lets CPU mining run through the OpenCL backend on the CPU. The index matches
 /// [`list_devices`] / `--devices`.
diff --git a/src/gpu_amd.rs b/src/gpu_amd.rs
new file mode 100644
index 0000000..f1c7c90
--- /dev/null
+++ b/src/gpu_amd.rs
@@ -0,0 +1,286 @@
+//! AMD OpenCL Equihash 192,7 solver (`kernels/equihash192_7.cl`).
+//!
+//! A second OpenCL backend, selected for AMD-vendor devices by
+//! [`crate::gpu::GpuSolver`]. Where the default [`crate::gpu`] driver runs the
+//! project's own `gen`/`round_collide`/`recover` kernel, this one drives a
+//! self-contained, GCN-tuned kernel with a fixed table geometry and a different
+//! host ABI: a `clearCounter` → `blake` → `round1..round7` → `combine`
+//! pipeline.
+//!
+//! ## Geometry (hard-coded in the kernel, mirrored here)
+//!
+//! 2^25 initial entries are bucketed into `NR_ROWS = 8192` rows. Round 0 and the
+//! early rounds keep `SLOTS_R0 = 4592` `uint8` slots per row (`buffer0`/
+//! `buffer1` ping-pong, ~1.2 GB each); rounds 4–5 widen to `SLOTS_R45 = 8688`
+//! `uint4` slots in `buffer2` (~1.1 GB). `buffer1` is additionally reinterpreted
+//! as `uint4`/`uint2` for the late-round R46/R57 outputs at fixed offsets. A
+//! flat counter array of 8 banks × 16384 tracks per-row occupancy; the
+//! round-7/R5 survivor count lives at index `R5_COUNTER_IDX = 114688` and sizes
+//! the `combine` launch.
+//!
+//! ## Hashing ABI
+//!
+//! `blake` takes the BLAKE2b first-block midstate as a by-value `ulong8`
+//! (`hashState`, from [`BatchHasher::midstate`]) plus a `nonce` whose low 32
+//! bits become message word `m[1]`'s low half (= header bytes [136..140]); the
+//! kernel hard-codes `m[0] = 0`, so the header's bytes [128..136] must be zero
+//! (the same `cuda_compatible` rule the CUDA backend uses). Each work item hashes
+//! index `tId`, emitting the two leaf entries `2*tId` and `2*tId+1` — exactly the
+//! canonical leaf-index/sub-block split [`crate::equihash`] verifies against.
+//!
+//! `combine` writes recovered solutions to the `output0`/`res` buffer:
+//! `output0[0].s0` is the solution count and each solution is 32 `uint4`
+//! (`SOLUTION_INDICES = 128` pre-sorted 25-bit leaf indices) at
+//! `output0[1 + 32*i ..]`. Those flatten straight into
+//! [`equihash::filter_candidates`], which canonicalises, verifies and
+//! de-duplicates them — the same contract as the default driver.
+
+use anyhow::{anyhow, Result};
+use ocl::prm::Ulong8;
+use ocl::{Buffer, MemFlags, ProQue};
+
+use crate::blake::BatchHasher;
+use crate::equihash;
+use crate::params::{BLAKE_CALLS, HEADER_LEN, SOLUTION_INDICES};
+
+/// Buckets ("rows") the 2^25 entries are hashed into (kernel: `& 0x1FFF`).
+const NR_ROWS: usize = 8192;
+/// `uint8` slots per row in the round-0/early `buffer0`/`buffer1` tables.
+const SLOTS_R0: usize = 4592;
+/// `uint4` slots per row in the round-4/5 `buffer2` table.
+const SLOTS_R45: usize = 8688;
+/// `uint8` entries in `buffer0`/`buffer1` (the kernel's `37617664` bound).
+const BUF01_U8ENTRIES: usize = NR_ROWS * SLOTS_R0;
+/// `uint4` entries in `buffer2`.
+const BUF2_U4ENTRIES: usize = NR_ROWS * SLOTS_R45;
+/// Flat counter array: 8 banks × 16384 (one bank consumed per round).
+const COUNTERS_U32: usize = 8 * 16384;
+/// Counter index holding the round-7 / R5 survivor count (sizes `combine`).
+const R5_COUNTER_IDX: usize = 114688;
+/// `combine` only emits solutions for `addr < this` (matches the kernel cap);
+/// far above the ~2 solutions a 192,7 nonce yields. Reads are capped to match.
+const MAX_WRITTEN_SOLS: usize = 16;
+/// Solution buffer capacity in solutions (`output0` = 1 + 32*cap `uint4`).
+const SOL_CAP: usize = 16;
+/// `reqd_work_group_size` of `blake`/`combine`.
+const WG_BLAKE: usize = 64;
+/// `reqd_work_group_size` of `round1..round7`.
+const WG_ROUND: usize = 256;
+/// Input rows (buckets) each collision round reads. The table is keyed on 13
+/// bits (8192) through round 4, then narrows to 12 bits (4096) — a round that
+/// reads `b` rows must launch exactly `b * 4` work-groups (kernel:
+/// `bucket = grp >> 2`), or it processes uninitialised rows and explodes.
+const ROUND_BUCKETS: [usize; 7] = [8192, 8192, 8192, 8192, 4096, 4096, 4096];
+/// Extra rows of slack appended to each big table. The kernel's per-row
+/// `atomic_inc` writes are uncapped, so a row that overflows its slot count
+/// spills into the next row and the top row spills past the nominal table end;
+/// this slack absorbs that (mean occupancy ~4096 sits well under the 4592/8688
+/// slot counts, so realistic overflow is a few rows at most).
+const ROW_SLACK: usize = 64;
+
+const KERNEL_SRC: &str = include_str!("../kernels/equihash192_7.cl");
+
+/// A persistent AMD OpenCL solver bound to one device.
+pub struct AmdSolver {
+    pq: ProQue,
+    /// Round-0/early ping-pong tables (`uint8`), reinterpreted at narrower
+    /// widths in late rounds.
+    buffer0: Buffer<u32>,
+    buffer1: Buffer<u32>,
+    /// Round-4/5 wide table (`uint4`).
+    buffer2: Buffer<u32>,
+    /// Per-row occupancy counters (8 banks).
+    counters: Buffer<u32>,
+    /// `res` / `output0`: `[count, then 32 uint4 per solution]`.
+    sols: Buffer<u32>,
+}
+
+unsafe impl Send for AmdSolver {}
+
+impl AmdSolver {
+    /// This device's product name, if available.
+    pub fn device_name(&self) -> Option<String> {
+        self.pq.device().name().ok()
+    }
+
+    /// Build the solver on `(platform, device)` and allocate all device buffers
+    /// (~3.5 GB total).
+    pub fn new(platform: ocl::Platform, device: ocl::Device) -> Result<Self> {
+        let pq = ProQue::builder()
+            .src(KERNEL_SRC)
+            .platform(platform)
+            .device(device)
+            .dims(1) // placeholder; every launch sets its own work size
+            .build()
+            .map_err(|e| anyhow!("AMD OpenCL build failed: {e}"))?;
+
+        let alloc = |len: usize| -> Result<Buffer<u32>> {
+            Ok(Buffer::<u32>::builder()
+                .queue(pq.queue().clone())
+                .flags(MemFlags::new().read_write())
+                .len(len)
+                .build()?)
+        };
+        // uint8 entries → 8 u32 each; uint4 entries → 4 u32 each. Each table gets
+        // ROW_SLACK extra rows of write headroom (see ROW_SLACK). buffer1's
+        // nominal uint8 size (75.2M uint4) already covers the late-round R46/R57
+        // regions at fixed offsets 48496640 / 67305472.
+        let buffer0 = alloc((BUF01_U8ENTRIES + ROW_SLACK * SLOTS_R0) * 8)?;
+        let buffer1 = alloc((BUF01_U8ENTRIES + ROW_SLACK * SLOTS_R0) * 8)?;
+        let buffer2 = alloc((BUF2_U4ENTRIES + ROW_SLACK * SLOTS_R45) * 4)?;
+        let counters = alloc(COUNTERS_U32)?;
+        let sols = alloc((1 + 32 * SOL_CAP) * 4)?;
+
+        Ok(Self { pq, buffer0, buffer1, buffer2, counters, sols })
+    }
+
+    /// Set the eight by-value args shared by every kernel in the pipeline:
+    /// `(buffer0, buffer1, buffer2, counters, res, extra, hashState, nonce)`.
+    fn build_kernel(&self, name: &str, hash_state: Ulong8, nonce: u64) -> Result<ocl::Kernel> {
+        Ok(self
+            .pq
+            .kernel_builder(name)
+            .arg(&self.buffer0)
+            .arg(&self.buffer1)
+            .arg(&self.buffer2)
+            .arg(&self.counters)
+            .arg(&self.sols)
+            .arg(0u32) // extra (unused by the pipeline)
+            .arg(hash_state)
+            .arg(nonce)
+            .build()?)
+    }
+
+    /// Run the full pipeline for `header` and return the flat recovered leaf
+    /// indices (`n * SOLUTION_INDICES`), ready for [`equihash::filter_candidates`].
+    fn run_pipeline(&self, header: &[u8]) -> Result<Vec<u32>> {
+        let mid = BatchHasher::new(header).midstate();
+        let hash_state = Ulong8::new(
+            mid[0], mid[1], mid[2], mid[3], mid[4], mid[5], mid[6], mid[7],
+        );
+        // Kernel's gId = nonce & 0xFFFFFFFF = message word m[1] low = header[136..140].
+        let nonce = u32::from_le_bytes(header[136..140].try_into().unwrap()) as u64;
+
+        // Clear counters + solution header (global = counter uint4 count).
+        let clear = self.build_kernel("clearCounter", hash_state, nonce)?;
+        unsafe {
+            clear.cmd().global_work_size(COUNTERS_U32 / 4).enq()?;
+        }
+
+        // Round 0: BLAKE2b + bucket. One work item per blake call (2^24); each
+        // emits two leaf entries.
+        let blake = self.build_kernel("blake", hash_state, nonce)?;
+        unsafe {
+            blake
+                .cmd()
+                .global_work_size(BLAKE_CALLS)
+                .local_work_size(WG_BLAKE)
+                .enq()?;
+        }
+
+        // Collision rounds 1..7 (4 groups per input row, 256 work items each).
+        for r in 1..=7 {
+            let k = self.build_kernel(&format!("round{r}"), hash_state, nonce)?;
+            unsafe {
+                k.cmd()
+                    .global_work_size(ROUND_BUCKETS[r - 1] * 4 * WG_ROUND)
+                    .local_work_size(WG_ROUND)
+                    .enq()?;
+            }
+        }
+
+        // Size `combine` from the round-7 survivor count (one group per candidate).
+        let mut r5 = [0u32; 1];
+        self.counters
+            .read(&mut r5[..])
+            .offset(R5_COUNTER_IDX)
+            .enq()?;
+        let groups = r5[0] as usize;
+        if groups == 0 {
+            return Ok(Vec::new());
+        }
+        let combine = self.build_kernel("combine", hash_state, nonce)?;
+        unsafe {
+            combine
+                .cmd()
+                .global_work_size(groups * WG_BLAKE)
+                .local_work_size(WG_BLAKE)
+                .enq()?;
+        }
+
+        // output0[0].s0 = solution count; each solution is 128 u32 (32 uint4)
+        // starting at uint4 index 1 (= u32 offset 4).
+        let mut head = [0u32; 1];
+        self.sols.read(&mut head[..]).enq()?;
+        let nsols = (head[0] as usize).min(MAX_WRITTEN_SOLS);
+        if nsols == 0 {
+            return Ok(Vec::new());
+        }
+        let mut data = vec![0u32; nsols * SOLUTION_INDICES];
+        self.sols.read(&mut data[..]).offset(4).enq()?;
+        Ok(data)
+    }
+
+    /// Solve for `header` (140 bytes): returns valid, canonical, de-duplicated
+    /// solutions as leaf-index lists.
+    pub fn solve(&self, header: &[u8]) -> Result<Vec<Vec<u32>>> {
+        assert_eq!(header.len(), HEADER_LEN);
+        let base = crate::blake::base_state(header);
+        let out = self.run_pipeline(header)?;
+        Ok(equihash::filter_candidates(&base, &out))
+    }
+
+    /// Solve and also return the raw recovered-candidate count (for diagnostics).
+    pub fn solve_with_stats(&self, header: &[u8]) -> Result<(usize, Vec<Vec<u32>>)> {
+        assert_eq!(header.len(), HEADER_LEN);
+        let base = crate::blake::base_state(header);
+        let out = self.run_pipeline(header)?;
+        let raw = out.len() / SOLUTION_INDICES;
+        Ok((raw, equihash::filter_candidates(&base, &out)))
+    }
+
+    /// Time each pipeline stage individually (forces a sync between stages).
+    pub fn profile(&self, header: &[u8]) -> Result<()> {
+        use log::info;
+        use std::time::Instant;
+
+        let mid = BatchHasher::new(header).midstate();
+        let hash_state = Ulong8::new(
+            mid[0], mid[1], mid[2], mid[3], mid[4], mid[5], mid[6], mid[7],
+        );
+        let nonce = u32::from_le_bytes(header[136..140].try_into().unwrap()) as u64;
+        let q = self.pq.queue();
+        let stage = |label: &str, t: Instant| -> Result<()> {
+            q.finish().map_err(|e| anyhow!("{label} failed: {e}"))?;
+            info!("  {label:14} {:6.1} ms", t.elapsed().as_secs_f64() * 1000.0);
+            Ok(())
+        };
+
+        let t = Instant::now();
+        let clear = self.build_kernel("clearCounter", hash_state, nonce)?;
+        unsafe {
+            clear.cmd().global_work_size(COUNTERS_U32 / 4).enq()?;
+        }
+        stage("clear", t)?;
+
+        let t = Instant::now();
+        let blake = self.build_kernel("blake", hash_state, nonce)?;
+        unsafe {
+            blake.cmd().global_work_size(BLAKE_CALLS).local_work_size(WG_BLAKE).enq()?;
+        }
+        stage("blake", t)?;
+
+        for r in 1..=7 {
+            let t = Instant::now();
+            let k = self.build_kernel(&format!("round{r}"), hash_state, nonce)?;
+            unsafe {
+                k.cmd()
+                    .global_work_size(ROUND_BUCKETS[r - 1] * 4 * WG_ROUND)
+                    .local_work_size(WG_ROUND)
+                    .enq()?;
+            }
+            stage(&format!("round {r}"), t)?;
+        }
+        Ok(())
+    }
+}
diff --git a/src/gpu_probe.rs b/src/gpu_probe.rs
index 54a5a48..697ed59 100644
--- a/src/gpu_probe.rs
+++ b/src/gpu_probe.rs
@@ -1,11 +1,12 @@
 //! GPU device probing for the config tool (`jackpotminer-config` only — this is
 //! not compiled into the miner, so there is no duplicate FFI).
 //!
-//! With the `gpu`/`cuda` features the OpenCL/CUDA SDKs are linked in (build.rs
-//! links `cuda`/`nvml`; the `ocl` crate links `OpenCL`), and the tool enumerates
-//! devices directly — handy on Windows where you may not want to shell out to
-//! the miner. Without those features the functions return empty lists and the
-//! tool falls back to spawning `jackpotminer --devices-json`.
+//! With the `gpu`/`cuda` features the tool enumerates devices directly — handy
+//! on Windows where you may not want to shell out to the miner. OpenCL goes
+//! through the `ocl` crate; CUDA is `dlopen`'d at runtime (so this binary, like
+//! the miner, has no build- or load-time dependency on libcuda). Without those
+//! features the functions return empty lists and the tool falls back to spawning
+//! `jackpotminer --devices-json`.
 
 /// True when at least one GPU SDK is compiled in, so direct probing works.
 pub const HAS_SDK: bool = cfg!(feature = "gpu") || cfg!(feature = "cuda");
@@ -34,34 +35,57 @@ pub fn opencl() -> Vec<String> {
 }
 
 /// CUDA devices as `"[i] <name>"` via the driver API (empty without the SDK, no
-/// driver, or any error). Uses a tiny self-contained FFI subset.
+/// driver, or any error). The CUDA driver is `dlopen`'d at runtime — a tiny
+/// self-contained subset of the FFI in `src/cuda.rs` — so this binary needs no
+/// link- or load-time libcuda.
 #[cfg(feature = "cuda")]
 pub fn cuda() -> Vec<String> {
     use std::ffi::CStr;
     use std::os::raw::{c_char, c_int, c_uint};
 
-    // Linked via build.rs (`cuda`), matching src/cuda.rs's declarations.
-    extern "C" {
-        fn cuInit(flags: c_uint) -> c_int;
-        fn cuDeviceGetCount(count: *mut c_int) -> c_int;
-        fn cuDeviceGet(device: *mut c_int, ordinal: c_int) -> c_int;
-        fn cuDeviceGetName(name: *mut c_char, len: c_int, dev: c_int) -> c_int;
-    }
+    type CuInit = unsafe extern "C" fn(c_uint) -> c_int;
+    type CuCount = unsafe extern "C" fn(*mut c_int) -> c_int;
+    type CuGet = unsafe extern "C" fn(*mut c_int, c_int) -> c_int;
+    type CuName = unsafe extern "C" fn(*mut c_char, c_int, c_int) -> c_int;
 
     let mut out = Vec::new();
     unsafe {
-        if cuInit(0) != 0 {
+        // libcuda.so.1 ships with the NVIDIA driver; absent on AMD-only hosts.
+        let lib = match ["libcuda.so.1", "libcuda.so", "nvcuda.dll"]
+            .iter()
+            .find_map(|n| libloading::Library::new(n).ok())
+        {
+            Some(l) => l,
+            None => return out,
+        };
+        let sym = |name: &[u8]| -> Option<*mut std::ffi::c_void> {
+            lib.get::<*mut std::ffi::c_void>(name).ok().map(|s| *s)
+        };
+        let (Some(init), Some(count), Some(get), Some(getname)) = (
+            sym(b"cuInit\0"),
+            sym(b"cuDeviceGetCount\0"),
+            sym(b"cuDeviceGet\0"),
+            sym(b"cuDeviceGetName\0"),
+        ) else {
+            return out;
+        };
+        let cu_init: CuInit = std::mem::transmute(init);
+        let cu_count: CuCount = std::mem::transmute(count);
+        let cu_get: CuGet = std::mem::transmute(get);
+        let cu_name: CuName = std::mem::transmute(getname);
+
+        if cu_init(0) != 0 {
             return out;
         }
         let mut n: c_int = 0;
-        if cuDeviceGetCount(&mut n) != 0 {
+        if cu_count(&mut n) != 0 {
             return out;
         }
         for i in 0..n {
             let mut dev: c_int = 0;
-            let name = if cuDeviceGet(&mut dev, i) == 0 {
+            let name = if cu_get(&mut dev, i) == 0 {
                 let mut buf = [0i8; 128];
-                if cuDeviceGetName(buf.as_mut_ptr() as *mut c_char, 128, dev) == 0 {
+                if cu_name(buf.as_mut_ptr() as *mut c_char, 128, dev) == 0 {
                     CStr::from_ptr(buf.as_ptr() as *const c_char).to_string_lossy().into_owned()
                 } else {
                     format!("CUDA device {i}")
diff --git a/src/main.rs b/src/main.rs
index 4c1be8c..9ed4ea9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -14,6 +14,14 @@ mod tui;
 #[cfg(feature = "gpu")]
 mod gpu;
 
+// AMD-tuned OpenCL kernel driver (selected by GpuSolver for AMD-vendor devices).
+#[cfg(feature = "gpu")]
+mod gpu_amd;
+
+// Runtime dynamic-library loader (dlopen) for the CUDA driver + NVML.
+#[cfg(feature = "cuda")]
+mod dylib;
+
 #[cfg(feature = "cuda")]
 mod cuda;
 
@@ -79,8 +87,9 @@ struct Args {
     jackpot: Option<u32>,
 
     /// Pause mining if no new job arrives within this many seconds (stale work
-    /// guard); resumes automatically when fresh work arrives. 0 disables.
-    #[arg(long, value_name = "SECS", default_value_t = 300)]
+    /// guard); resumes automatically when fresh work arrives. Default 600 (10
+    /// minutes). 0 disables.
+    #[arg(long, value_name = "SECS", default_value_t = 600)]
     job_timeout: u64,
 
     /// Open a local control server on 127.0.0.1:<PORT> so the GUI config tool can
@@ -139,8 +148,11 @@ struct Args {
     #[arg(long, default_value = "all")]
     devices: String,
 
-    /// GPU backend: "opencl" or "cuda" (for nvidia cards).
-    #[arg(long, default_value = "cuda")]
+    /// GPU backend: "mixed" (default — each card on its native backend: NVIDIA
+    /// on CUDA, AMD/Intel on OpenCL), "opencl" (every card via OpenCL), or
+    /// "cuda" (NVIDIA only). In mixed mode `--devices` indexes the combined list
+    /// shown by --list-devices.
+    #[arg(long, default_value = "mixed")]
     backend: String,
 
     /// Force the OpenCL backend, disabling CUDA (overrides --backend).
@@ -610,6 +622,9 @@ fn main() -> Result<()> {
 /// Which GPU backend the user selected.
 enum BackendKind {
     Cpu,
+    /// Each physical card on its native backend (NVIDIA→CUDA, others→OpenCL).
+    #[cfg(any(feature = "gpu", feature = "cuda"))]
+    Mixed,
     #[cfg(feature = "gpu")]
     OpenCl,
     #[cfg(feature = "cuda")]
@@ -633,6 +648,16 @@ fn backend_kind(args: &Args) -> Result<BackendKind> {
         }
     }
     match args.backend.to_ascii_lowercase().as_str() {
+        "mixed" => {
+            // Each card on its native backend; falls back to whatever single GPU
+            // backend is compiled, or to CPU when none is.
+            #[cfg(any(feature = "gpu", feature = "cuda"))]
+            {
+                Ok(BackendKind::Mixed)
+            }
+            #[cfg(not(any(feature = "gpu", feature = "cuda")))]
+            Ok(BackendKind::Cpu)
+        }
         "cuda" => {
             #[cfg(feature = "cuda")]
             {
@@ -649,7 +674,7 @@ fn backend_kind(args: &Args) -> Result<BackendKind> {
             #[cfg(not(feature = "gpu"))]
             Ok(BackendKind::Cpu)
         }
-        other => Err(anyhow!("unknown --backend '{other}' (expected opencl or cuda)")),
+        other => Err(anyhow!("unknown --backend '{other}' (expected mixed, opencl, or cuda)")),
     }
 }
 
@@ -707,6 +732,8 @@ fn backend_specs(args: &Args, gpu_devices: &[GpuDeviceCfg]) -> Result<Vec<Backen
                 let clamp = (args.cpu_clamp != 0).then_some(args.cpu_clamp);
                 return Ok(vec![BackendSpec::Cpu(clamp)]);
             }
+            // Mixed builds its own unified list (each card on its native backend).
+            BackendKind::Mixed => return mixed_specs(args),
             #[cfg(feature = "cuda")]
             BackendKind::Cuda => (cuda::device_count()?, true),
             #[cfg(feature = "gpu")]
@@ -735,6 +762,71 @@ fn backend_specs(args: &Args, gpu_devices: &[GpuDeviceCfg]) -> Result<Vec<Backen
     }
 }
 
+/// The unified device list for the `mixed` backend, as `(label, spec)`: each
+/// physical GPU on its native backend, with no card mined twice. NVIDIA cards go
+/// to CUDA (listed first); the remaining OpenCL devices (AMD/Intel, plus NVIDIA
+/// when CUDA is unavailable) go to OpenCL. Shared by [`mixed_specs`] and
+/// [`list_devices`]; `--devices` indexes into this list.
+#[cfg(any(feature = "gpu", feature = "cuda"))]
+fn mixed_plan() -> Vec<(String, BackendSpec)> {
+    /// Drop a leading `"[<n>] "` index prefix from a backend's device label, so
+    /// the mixed list shows its own single index instead of two.
+    fn strip_index(label: &str) -> &str {
+        label
+            .strip_prefix('[')
+            .and_then(|s| s.split_once("] "))
+            .map(|(_, rest)| rest)
+            .unwrap_or(label)
+    }
+
+    #[allow(unused_mut)]
+    let mut plan: Vec<(String, BackendSpec)> = Vec::new();
+
+    // NVIDIA cards via CUDA, when the backend is compiled and the driver loads.
+    #[cfg(feature = "cuda")]
+    let cuda_has_nvidia = {
+        let names = cuda::list_devices().unwrap_or_default();
+        for (i, label) in names.iter().enumerate() {
+            plan.push((format!("{} (CUDA)", strip_index(label)), BackendSpec::Cuda(i)));
+        }
+        !names.is_empty()
+    };
+    #[cfg(not(feature = "cuda"))]
+    let cuda_has_nvidia = false;
+
+    // Remaining OpenCL cards via OpenCL; skip NVIDIA ones already on CUDA.
+    #[cfg(feature = "gpu")]
+    {
+        let names = gpu::list_devices().unwrap_or_default();
+        let nvidia = gpu::device_is_nvidia();
+        for (j, label) in names.iter().enumerate() {
+            if nvidia.get(j).copied().unwrap_or(false) && cuda_has_nvidia {
+                continue;
+            }
+            plan.push((format!("{} (OpenCL)", strip_index(label)), BackendSpec::Gpu(j)));
+        }
+    }
+    // `cuda_has_nvidia` is only consumed by the OpenCL branch above.
+    #[cfg(not(feature = "gpu"))]
+    let _ = cuda_has_nvidia;
+
+    plan
+}
+
+/// Build the worker list for `--backend mixed`: each card on its native backend.
+/// `--devices` selects into [`mixed_plan`]'s unified list.
+#[cfg(any(feature = "gpu", feature = "cuda"))]
+fn mixed_specs(args: &Args) -> Result<Vec<BackendSpec>> {
+    let plan = mixed_plan();
+    if plan.is_empty() {
+        return Err(anyhow!(
+            "no GPUs found for the mixed backend — none detected via CUDA or OpenCL"
+        ));
+    }
+    let selected = parse_devices(&args.devices, plan.len())?;
+    Ok(selected.into_iter().map(|i| plan[i].1).collect())
+}
+
 /// Build a single GPU worker spec for `idx`, choosing CUDA or OpenCL, erroring if
 /// the requested backend wasn't compiled in.
 #[cfg(any(feature = "gpu", feature = "cuda"))]
@@ -821,6 +913,18 @@ fn list_devices() {
         Ok(_) => println!("no CUDA devices found"),
         Err(e) => println!("error listing CUDA devices: {e}"),
     }
+    // What the default `mixed` backend will mine, and the indices `--devices`
+    // selects from in that mode.
+    #[cfg(any(feature = "gpu", feature = "cuda"))]
+    {
+        let plan = mixed_plan();
+        if !plan.is_empty() {
+            println!("\nMixed backend (--backend mixed, the default) — `--devices` indexes this list:");
+            for (i, (label, _)) in plan.iter().enumerate() {
+                println!("  [{i}] {label}");
+            }
+        }
+    }
     #[cfg(not(any(feature = "gpu", feature = "cuda")))]
     println!("built without GPU support (rebuild with the `gpu` or `cuda` feature)");
 }
@@ -886,18 +990,24 @@ fn selftest(gpu_device: usize) -> Result<()> {
         let solver = gpu::GpuSolver::new(gpu_device)
             .with_context(|| format!("init OpenCL device {gpu_device}"))?;
 
-        // Spot-check the BLAKE2b kernel against the CPU reference.
-        let outputs = solver.hash_all(&header)?;
-        let step = params::BLAKE_CALLS / 64;
-        for k in 0..64 {
-            let g = (k * step) as u32;
-            let cpu = blake::generate_hash(&base, g);
-            let off = g as usize * params::HASH_OUTPUT;
-            if cpu != outputs[off..off + params::HASH_OUTPUT] {
-                return Err(anyhow!("GPU BLAKE2b mismatch at g={g}"));
+        // Spot-check the BLAKE2b kernel against the CPU reference. The AMD kernel
+        // buckets its round-0 output instead of exposing per-index digests, so
+        // the probe is skipped there (the solve-vs-CPU check below still runs).
+        if solver.supports_blake_probe() {
+            let outputs = solver.hash_all(&header)?;
+            let step = params::BLAKE_CALLS / 64;
+            for k in 0..64 {
+                let g = (k * step) as u32;
+                let cpu = blake::generate_hash(&base, g);
+                let off = g as usize * params::HASH_OUTPUT;
+                if cpu != outputs[off..off + params::HASH_OUTPUT] {
+                    return Err(anyhow!("GPU BLAKE2b mismatch at g={g}"));
+                }
             }
+            info!("GPU BLAKE2b kernel matches CPU");
+        } else {
+            info!("skipping BLAKE2b kernel probe (AMD kernel buckets round-0 output)");
         }
-        info!("GPU BLAKE2b kernel matches CPU");
 
         let gpu_solutions = solver.solve(&header)?;
         info!("GPU found {} valid solution(s)", gpu_solutions.len());
diff --git a/src/nvml.rs b/src/nvml.rs
index e73ad63..05e9bba 100644
--- a/src/nvml.rs
+++ b/src/nvml.rs
@@ -33,7 +33,15 @@ const NVML_CLOCK_MEM: c_int = 2;
 // nvmlTemperatureSensors_t
 const NVML_TEMPERATURE_GPU: c_int = 0;
 
-extern "C" {
+// NVML, loaded at runtime via dlopen (see `crate::dylib`) rather than linked at
+// build time — it ships with the NVIDIA driver (`libnvidia-ml.so.1`;
+// `nvml.dll` on Windows) and is absent on driver-less / AMD-only hosts.
+// `nvml_lib()` is `None` when it can't be opened; `open()` checks it first and
+// returns `None` (no tuning) so the rest of the program is unaffected.
+crate::dylib::dynamic_library! {
+    lib_struct: NvmlLib,
+    loader: nvml_lib,
+    names: ["libnvidia-ml.so.1", "libnvidia-ml.so", "nvml.dll"],
     fn nvmlInit_v2() -> nvmlReturn_t;
     fn nvmlShutdown() -> nvmlReturn_t;
     fn nvmlDeviceGetName(device: nvmlDevice_t, name: *mut c_char, length: c_uint) -> nvmlReturn_t;
@@ -69,6 +77,7 @@ unsafe impl Send for NvmlTuner {}
 
 /// Open an NVML control handle for the GPU at `pci_bus_id` (e.g. "0000:01:00.0").
 pub fn open(pci_bus_id: &str) -> Option<Box<dyn GpuTuner>> {
+    nvml_lib()?; // NVML not installed → no tuning
     let cstr = CString::new(pci_bus_id).ok()?;
     unsafe {
         if nvmlInit_v2() != NVML_SUCCESS {