AMD kernel: drop fp64 dependency (build on rusticl/Mesa OpenCL)
equihash192_7.cl's decompress/decompress2 used double + sqrt behind cl_khr_fp64, so the kernel failed to build on OpenCL stacks without fp64 (notably rusticl/Mesa) — those workers died with 'use of type double requires cl_khr_fp64'. Replace round(sqrt(2*x+1)) with an exact integer square root (single-precision estimate corrected to the integer floor, then rounded; inputs are triangular indices < ~2^26). No fp64, no behavior change on ROCm (verified bit-identical: 77 solutions/40 nonces, same as before), and rusticl devices now build and solve correctly. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+15
-14
@@ -2,8 +2,6 @@
|
|||||||
|
|
||||||
//#define PRINT 1
|
//#define PRINT 1
|
||||||
|
|
||||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
|
||||||
|
|
||||||
__constant ulong blake_iv[] =
|
__constant ulong blake_iv[] =
|
||||||
{
|
{
|
||||||
0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
|
0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
|
||||||
@@ -507,14 +505,22 @@ uint compress2(uint in0, uint in1) {
|
|||||||
return tmp;
|
return tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Exact round(sqrt(x)) without fp64: a single-precision estimate corrected to the
|
||||||
|
// exact integer floor, then rounded. Lets the kernel build on OpenCL stacks that
|
||||||
|
// lack cl_khr_fp64 (e.g. rusticl/Mesa) while staying bit-identical on ROCm. Inputs
|
||||||
|
// are triangular indices (< ~2^26), well within range for the float estimate +
|
||||||
|
// integer correction.
|
||||||
|
inline uint isqrt_round(ulong x) {
|
||||||
|
long m = (long) sqrt((float) x);
|
||||||
|
while (m > 0 && (ulong)(m * m) > x) m--; // correct down to floor(sqrt)
|
||||||
|
while ((ulong)((m + 1) * (m + 1)) <= x) m++; // correct up to floor(sqrt)
|
||||||
|
// round to nearest: round up iff the fractional part is >= 0.5.
|
||||||
|
return (uint)(((x - (ulong)(m * m)) > (ulong) m) ? (m + 1) : m);
|
||||||
|
}
|
||||||
|
|
||||||
uint2 decompress(uint2 in) {
|
uint2 decompress(uint2 in) {
|
||||||
double inFl = (double) (in.s0 >> 6);
|
|
||||||
|
|
||||||
inFl *= 2.0;
|
|
||||||
inFl += 1.0;
|
|
||||||
|
|
||||||
uint2 res;
|
uint2 res;
|
||||||
res.s0 = (uint) round(sqrt(inFl));
|
res.s0 = isqrt_round(2ul * (ulong)(in.s0 >> 6) + 1ul);
|
||||||
|
|
||||||
|
|
||||||
uint tmp = res.s0 * (res.s0-1);
|
uint tmp = res.s0 * (res.s0-1);
|
||||||
@@ -535,13 +541,8 @@ uint2 decompress(uint2 in) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
uint2 decompress2(uint in) {
|
uint2 decompress2(uint in) {
|
||||||
double inFl = (double) in;
|
|
||||||
|
|
||||||
inFl *= 2.0;
|
|
||||||
inFl += 1.0;
|
|
||||||
|
|
||||||
uint2 res;
|
uint2 res;
|
||||||
res.s0 = (uint) round(sqrt(inFl));
|
res.s0 = isqrt_round(2ul * (ulong) in + 1ul);
|
||||||
|
|
||||||
|
|
||||||
uint tmp = res.s0 * (res.s0-1);
|
uint tmp = res.s0 * (res.s0-1);
|
||||||
|
|||||||
Reference in New Issue
Block a user