Initial commit: jackpotminer Equihash 192,7 miner
GPU-accelerated Equihash 192,7 miner in Rust with three solver backends: - CPU: Wagner's algorithm, AVX2 packed slots (xenoncat-style) - OpenCL: full on-GPU solve (kernels/equihash.cl); runs on NVIDIA and AMD - CUDA: driver-API replay of miniZ's extracted fatbin (src/miniz/) Also includes a default-off pearlhash backend (src/pearl/, native CPU core + NVRTC int8-GEMM GPU kernels) and a WIP Ethash CUDA backend (src/ethash/). Reverse-engineering scratch (alpha-miner, pearl-dump/) and the active runtime config (mine.toml) are gitignored; mine.example.toml is the template. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,277 @@
|
||||
// GPU Equihash 192,7 solver.
|
||||
//
|
||||
// Pipeline (all stages on the GPU):
|
||||
// gen - BLAKE2b each index, bucket entries into table[0] by block 0
|
||||
// round_collide- for r=0..5: collide block r in table[r], write table[r+1]
|
||||
// final_round - collide blocks 6 & 7 in table[6], emit candidate solutions
|
||||
// recover - walk back-references to reconstruct 128 leaf indices/solution
|
||||
//
|
||||
// A "block" is one 24-bit (3-byte) chunk of the 192-bit hash. Entries are
|
||||
// bucketed by the high ROW_BITS bits of the current collision block; the full
|
||||
// block is re-checked within a bucket.
|
||||
//
|
||||
// Memory model (kept small so 8-12 GB cards work): each entry's data is split.
|
||||
// - back-reference (1 u32/slot): a leaf index in table[0], otherwise a packed
|
||||
// (row,slot,slot) parent reference. One persistent array per table (0..6),
|
||||
// because `recover` walks these — and ONLY these — to rebuild a solution.
|
||||
// - blocks (the remaining 24-bit hash words): needed only during the round
|
||||
// that consumes them, so they live in two ping-pong working buffers
|
||||
// (`src_blk`/`dst_blk`, MAXB words per slot) reused across all rounds.
|
||||
// This avoids keeping seven full block tables resident (~11 GB → ~6 GB).
|
||||
//
|
||||
// The host prepends a prelude of #defines (NR_ROWS, NR_SLOTS, ROW_BITS, CBL,
|
||||
// MAXB, MAX_SOLS), so those symbols are defined here.
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
|
||||
|
||||
typedef ulong u64;
|
||||
|
||||
__constant u64 BLAKE2B_IV[8] = {
|
||||
0x6a09e667f3bcc908UL, 0xbb67ae8584caa73bUL,
|
||||
0x3c6ef372fe94f82bUL, 0xa54ff53a5f1d36f1UL,
|
||||
0x510e527fade682d1UL, 0x9b05688c2b3e6c1fUL,
|
||||
0x1f83d9abfb41bd6bUL, 0x5be0cd19137e2179UL
|
||||
};
|
||||
|
||||
__constant uchar SIGMA[12][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
|
||||
{14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 },
|
||||
{11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13 },
|
||||
{ 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9 },
|
||||
{12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11 },
|
||||
{13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10 },
|
||||
{ 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5 },
|
||||
{10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0 },
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
|
||||
{14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 }
|
||||
};
|
||||
|
||||
static inline u64 rotr64(u64 x, uint n) {
|
||||
return (x >> n) | (x << (64 - n));
|
||||
}
|
||||
|
||||
#define G(r, i, a, b, c, d) \
|
||||
a = a + b + m[SIGMA[r][2*i + 0]]; \
|
||||
d = rotr64(d ^ a, 32); \
|
||||
c = c + d; \
|
||||
b = rotr64(b ^ c, 24); \
|
||||
a = a + b + m[SIGMA[r][2*i + 1]]; \
|
||||
d = rotr64(d ^ a, 16); \
|
||||
c = c + d; \
|
||||
b = rotr64(b ^ c, 63);
|
||||
|
||||
static void compress(u64 h[8], const u64 m[16], u64 t, int last) {
|
||||
u64 v[16];
|
||||
for (int i = 0; i < 8; i++) v[i] = h[i];
|
||||
for (int i = 0; i < 8; i++) v[8 + i] = BLAKE2B_IV[i];
|
||||
v[12] ^= t;
|
||||
if (last) v[14] = ~v[14];
|
||||
|
||||
for (int r = 0; r < 12; r++) {
|
||||
G(r, 0, v[0], v[4], v[ 8], v[12]);
|
||||
G(r, 1, v[1], v[5], v[ 9], v[13]);
|
||||
G(r, 2, v[2], v[6], v[10], v[14]);
|
||||
G(r, 3, v[3], v[7], v[11], v[15]);
|
||||
G(r, 4, v[0], v[5], v[10], v[15]);
|
||||
G(r, 5, v[1], v[6], v[11], v[12]);
|
||||
G(r, 6, v[2], v[7], v[ 8], v[13]);
|
||||
G(r, 7, v[3], v[4], v[ 9], v[14]);
|
||||
}
|
||||
for (int i = 0; i < 8; i++) h[i] ^= v[i] ^ v[8 + i];
|
||||
}
|
||||
|
||||
static inline u64 load64(const uchar *p) {
|
||||
return (u64)p[0] | ((u64)p[1] << 8) | ((u64)p[2] << 16) | ((u64)p[3] << 24) |
|
||||
((u64)p[4] << 32)| ((u64)p[5] << 40) | ((u64)p[6] << 48) | ((u64)p[7] << 56);
|
||||
}
|
||||
|
||||
// Personalised BLAKE2b of header[0..140] || LE32(g); writes the 48-byte digest.
|
||||
static void blake_hash(__global const uchar *header, uint g, uchar out[48]) {
|
||||
uchar msg[256];
|
||||
for (int i = 0; i < 140; i++) msg[i] = header[i];
|
||||
msg[140] = (uchar)(g & 0xff);
|
||||
msg[141] = (uchar)((g >> 8) & 0xff);
|
||||
msg[142] = (uchar)((g >> 16) & 0xff);
|
||||
msg[143] = (uchar)((g >> 24) & 0xff);
|
||||
for (int i = 144; i < 256; i++) msg[i] = 0;
|
||||
|
||||
u64 h[8];
|
||||
for (int i = 0; i < 8; i++) h[i] = BLAKE2B_IV[i];
|
||||
h[0] ^= 0x0000000001010030UL;
|
||||
h[6] ^= 0x576f50687361635aUL; // "ZcashPoW"
|
||||
h[7] ^= 0x00000007000000c0UL; // LE32(192) || LE32(7)
|
||||
|
||||
u64 m[16];
|
||||
for (int i = 0; i < 16; i++) m[i] = load64(msg + i * 8);
|
||||
compress(h, m, 128, 0);
|
||||
for (int i = 0; i < 16; i++) m[i] = load64(msg + 128 + i * 8);
|
||||
compress(h, m, 144, 1);
|
||||
|
||||
for (int w = 0; w < 6; w++) {
|
||||
u64 hv = h[w];
|
||||
for (int b = 0; b < 8; b++) out[w * 8 + b] = (uchar)(hv >> (8 * b));
|
||||
}
|
||||
}
|
||||
|
||||
// Offset of (row, slot) in a back-reference array (1 u32 per slot).
|
||||
static inline size_t bref_off(uint row, uint slot) {
|
||||
return (size_t)row * NR_SLOTS + slot;
|
||||
}
|
||||
// Offset of (row, slot) in a block working buffer (MAXB u32 blocks per slot).
|
||||
static inline size_t blk_off(uint row, uint slot) {
|
||||
return ((size_t)row * NR_SLOTS + slot) * MAXB;
|
||||
}
|
||||
|
||||
// ---- selftest helper: raw BLAKE2b output for every index ----
|
||||
__kernel void equihash_hash(__global const uchar *header, __global uchar *out) {
|
||||
uint g = get_global_id(0);
|
||||
uchar dig[48];
|
||||
blake_hash(header, g, dig);
|
||||
__global uchar *o = out + (size_t)g * 48;
|
||||
for (int i = 0; i < 48; i++) o[i] = dig[i];
|
||||
}
|
||||
|
||||
// ---- round 0: hash and bucket into table[0] by block 0 ----
|
||||
// Writes the leaf index to back-reference array `bref0` and all 8 blocks to the
|
||||
// block working buffer `blk0`.
|
||||
__kernel void gen(__global const uchar *header,
|
||||
__global uint *bref0,
|
||||
__global uint *blk0,
|
||||
__global uint *counts) {
|
||||
uint g = get_global_id(0);
|
||||
uchar dig[48];
|
||||
blake_hash(header, g, dig);
|
||||
|
||||
for (uint i = 0; i < 2; i++) {
|
||||
uchar *h = dig + i * 24; // one 192-bit hash (blocks 0..7)
|
||||
uint blocks[8];
|
||||
for (uint b = 0; b < 8; b++)
|
||||
blocks[b] = ((uint)h[3*b] << 16) | ((uint)h[3*b + 1] << 8) | h[3*b + 2];
|
||||
uint row = blocks[0] >> (CBL - ROW_BITS);
|
||||
uint slot = atomic_inc(&counts[row]);
|
||||
if (slot < NR_SLOTS) {
|
||||
bref0[bref_off(row, slot)] = g * 2 + i;
|
||||
__global uint *bb = blk0 + blk_off(row, slot);
|
||||
for (uint b = 0; b < 8; b++) bb[b] = blocks[b];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---- intermediate rounds r = 0..5: collide block r, write table[r+1] ----
|
||||
// Reads blocks from `src_blk` (table r), writes the parent ref to `dst_bref`
|
||||
// (table r+1's back-reference array) and the carried blocks to `dst_blk`.
|
||||
__kernel void round_collide(__global uint *src_blk,
|
||||
__global uint *dst_bref,
|
||||
__global uint *dst_blk,
|
||||
__global uint *counts,
|
||||
uint r) {
|
||||
uint row = get_global_id(0);
|
||||
uint carry = 7u - r; // blocks r+1..7 carried forward
|
||||
|
||||
uint cnt = counts[r * NR_ROWS + row];
|
||||
if (cnt > NR_SLOTS) cnt = NR_SLOTS;
|
||||
|
||||
for (uint i = 0; i < cnt; i++) {
|
||||
__global uint *xi = src_blk + blk_off(row, i);
|
||||
for (uint j = i + 1; j < cnt; j++) {
|
||||
__global uint *xj = src_blk + blk_off(row, j);
|
||||
// Full block r (slot block 0) must match.
|
||||
if (xi[0] == xj[0]) {
|
||||
uint nblock = xi[1] ^ xj[1];
|
||||
uint nrow = nblock >> (CBL - ROW_BITS);
|
||||
uint nslot = atomic_inc(&counts[(r + 1) * NR_ROWS + nrow]);
|
||||
if (nslot < NR_SLOTS) {
|
||||
dst_bref[bref_off(nrow, nslot)] = (row << (2 * SLOT_BITS)) | (i << SLOT_BITS) | j;
|
||||
__global uint *d = dst_blk + blk_off(nrow, nslot);
|
||||
for (uint b = 0; b < carry; b++) d[b] = xi[1 + b] ^ xj[1 + b];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---- final round: blocks 6 & 7 (table 6's two blocks) must match ----
|
||||
__kernel void final_round(__global uint *src_blk,
|
||||
__global uint *counts,
|
||||
__global uint *sols,
|
||||
__global uint *solcnt) {
|
||||
uint row = get_global_id(0);
|
||||
|
||||
uint cnt = counts[6 * NR_ROWS + row];
|
||||
if (cnt > NR_SLOTS) cnt = NR_SLOTS;
|
||||
|
||||
for (uint i = 0; i < cnt; i++) {
|
||||
__global uint *xi = src_blk + blk_off(row, i);
|
||||
for (uint j = i + 1; j < cnt; j++) {
|
||||
__global uint *xj = src_blk + blk_off(row, j);
|
||||
if (xi[0] == xj[0] && xi[1] == xj[1]) {
|
||||
uint idx = atomic_inc(solcnt);
|
||||
if (idx < MAX_SOLS) sols[idx] = (row << (2 * SLOT_BITS)) | (i << SLOT_BITS) | j;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pick back-reference array `lvl` (recover touches every level).
|
||||
static __global uint *pick(uint lvl, __global uint *b0, __global uint *b1,
|
||||
__global uint *b2, __global uint *b3,
|
||||
__global uint *b4, __global uint *b5,
|
||||
__global uint *b6) {
|
||||
switch (lvl) {
|
||||
case 0: return b0;
|
||||
case 1: return b1;
|
||||
case 2: return b2;
|
||||
case 3: return b3;
|
||||
case 4: return b4;
|
||||
case 5: return b5;
|
||||
default: return b6;
|
||||
}
|
||||
}
|
||||
|
||||
// ---- recover 128 leaf indices per candidate solution ----
|
||||
// Walks the per-table back-reference arrays (bN); the block buffers aren't
|
||||
// needed here.
|
||||
__kernel void recover(__global uint *b0, __global uint *b1, __global uint *b2,
|
||||
__global uint *b3, __global uint *b4, __global uint *b5,
|
||||
__global uint *b6,
|
||||
__global uint *sols, __global uint *out) {
|
||||
uint s = get_global_id(0);
|
||||
uint packed = sols[s];
|
||||
uint row = packed >> (2 * SLOT_BITS);
|
||||
uint sa = (packed >> SLOT_BITS) & SLOT_MASK;
|
||||
uint sb = packed & SLOT_MASK;
|
||||
|
||||
// refs hold (row << SLOT_BITS | slot) into the current table level.
|
||||
uint refs[128];
|
||||
uint tmp[128];
|
||||
refs[0] = (row << SLOT_BITS) | sa;
|
||||
refs[1] = (row << SLOT_BITS) | sb;
|
||||
uint cnt = 2;
|
||||
|
||||
// table[6] refs -> ... -> table[0] refs (six doublings: 2 -> 128).
|
||||
for (uint lvl = 6; lvl >= 1; lvl--) {
|
||||
__global uint *b = pick(lvl, b0, b1, b2, b3, b4, b5, b6);
|
||||
uint nc = 0;
|
||||
for (uint k = 0; k < cnt; k++) {
|
||||
uint rr = refs[k] >> SLOT_BITS;
|
||||
uint sl = refs[k] & SLOT_MASK;
|
||||
uint ref = b[bref_off(rr, sl)];
|
||||
uint prow = ref >> (2 * SLOT_BITS);
|
||||
uint pa = (ref >> SLOT_BITS) & SLOT_MASK;
|
||||
uint pb = ref & SLOT_MASK;
|
||||
tmp[nc++] = (prow << SLOT_BITS) | pa;
|
||||
tmp[nc++] = (prow << SLOT_BITS) | pb;
|
||||
}
|
||||
for (uint k = 0; k < nc; k++) refs[k] = tmp[k];
|
||||
cnt = nc;
|
||||
}
|
||||
|
||||
// refs now index table[0]; its back-reference array holds the leaf index.
|
||||
for (uint k = 0; k < 128; k++) {
|
||||
uint rr = refs[k] >> SLOT_BITS;
|
||||
uint sl = refs[k] & SLOT_MASK;
|
||||
out[s * 128 + k] = b0[bref_off(rr, sl)];
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user