AMD GPU telemetry + --target-temp governor
Brings AMD cards to parity with NVIDIA on the monitoring/control surface, which was NVML-only. New src/amd_smi.rs is a gpu_tune::GpuTuner backed by Linux amdgpu sysfs (power1_average, temp1_input edge, freq1_input sclk, pp_dpm_sclk/mclk), matched to the device by PCI bus id from OpenCL cl_khr_pci_bus_info. gpu_tune is un-gated to compile under the gpu feature; open() probes NVML then amd_smi. GpuSolver carries the tuner and Backend::Gpu dispatches power/temp/clocks, so the TUI and --benchmark now show power, temperature, clocks and Sol/W for AMD. Telemetry-only — setters are Unsupported (amdgpu control nodes are root-only). --target-temp <C> adds an opt-in software governor (miner::govern_cadence) that paces solve cadence to hold edge temperature, with no hardware writes and no root required. When thermal throttling is small, it won't beat running flat-out on raw Sol/s; it's a temperature/efficiency lever. Unit-tested controller; flag/plumbing verified live. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+58
-7
@@ -18,6 +18,10 @@ mod gpu;
|
||||
#[cfg(feature = "gpu")]
|
||||
mod gpu_amd;
|
||||
|
||||
// AMD GPU telemetry via Linux amdgpu sysfs (a `gpu_tune::GpuTuner` backend).
|
||||
#[cfg(feature = "gpu")]
|
||||
mod amd_smi;
|
||||
|
||||
// Runtime dynamic-library loader (dlopen) for the CUDA driver + NVML.
|
||||
#[cfg(feature = "cuda")]
|
||||
mod dylib;
|
||||
@@ -28,7 +32,9 @@ mod cuda;
|
||||
#[cfg(feature = "cuda")]
|
||||
mod nvml;
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
// Platform-agnostic GPU tuning/telemetry surface. The trait + policy compile for
|
||||
// either GPU backend; NVML (cuda) and amd_smi (gpu) are the implementations.
|
||||
#[cfg(any(feature = "cuda", feature = "gpu"))]
|
||||
mod gpu_tune;
|
||||
|
||||
use std::io::IsTerminal;
|
||||
@@ -204,6 +210,13 @@ struct Args {
|
||||
#[arg(long)]
|
||||
auto_tune: bool,
|
||||
|
||||
/// Sustained-Sol/s governor: hold each GPU at/below this edge temperature (°C)
|
||||
/// by pacing the solve cadence (no hardware writes, no root). Trades a little
|
||||
/// throughput for lower temp/power; off by default (runs flat-out). Needs a
|
||||
/// backend that reports temperature (AMD amdgpu / NVIDIA).
|
||||
#[arg(long, value_name = "CELSIUS")]
|
||||
target_temp: Option<u32>,
|
||||
|
||||
/// Efficiency: cap each GPU's power limit in watts (default: card max).
|
||||
/// Lower power trades a little hashrate for much better Sol/W.
|
||||
#[arg(long, value_name = "WATTS")]
|
||||
@@ -620,6 +633,8 @@ fn main() -> Result<()> {
|
||||
args.power_limit.unwrap_or(0),
|
||||
args.unlock_controls,
|
||||
);
|
||||
// Software temp governor target (paces solve cadence; no hardware writes).
|
||||
miner::set_target_temp(args.target_temp);
|
||||
miner::run(client, specs, running, job_timeout, tui, format!("{host}:{port}"), controls, cpu_mining, cpu_clamp, args.control_port)
|
||||
}
|
||||
|
||||
@@ -1077,10 +1092,21 @@ fn benchmark(specs: Vec<BackendSpec>, runs: usize) -> Result<()> {
|
||||
use std::time::Instant;
|
||||
info!("benchmarking {runs} solve(s) per worker across {} worker(s)", specs.len());
|
||||
|
||||
/// Per-worker benchmark result, including a steady-state telemetry snapshot
|
||||
/// (sampled right after the timed loop, while the card is warm).
|
||||
struct WorkerResult {
|
||||
sols: usize,
|
||||
dt: f64,
|
||||
watts: Option<f64>,
|
||||
temp_c: Option<u32>,
|
||||
core_mhz: Option<u32>,
|
||||
mem_mhz: Option<u32>,
|
||||
}
|
||||
|
||||
let start = Instant::now();
|
||||
let mut handles = Vec::new();
|
||||
for (id, spec) in specs.into_iter().enumerate() {
|
||||
handles.push(std::thread::spawn(move || -> Result<(usize, f64)> {
|
||||
handles.push(std::thread::spawn(move || -> Result<WorkerResult> {
|
||||
let backend = spec.build()?;
|
||||
backend.solve(&pseudo_header(id as u64))?; // warm up (excluded)
|
||||
let t = Instant::now();
|
||||
@@ -1090,7 +1116,17 @@ fn benchmark(specs: Vec<BackendSpec>, runs: usize) -> Result<()> {
|
||||
let seed = ((id as u64) << 40) | (i as u64 + 1);
|
||||
sols += backend.solve(&pseudo_header(seed))?.len();
|
||||
}
|
||||
Ok((sols, t.elapsed().as_secs_f64()))
|
||||
let dt = t.elapsed().as_secs_f64();
|
||||
// Snapshot telemetry while the card is still under load.
|
||||
let (core_mhz, mem_mhz) = backend.current_clocks_mhz();
|
||||
Ok(WorkerResult {
|
||||
sols,
|
||||
dt,
|
||||
watts: backend.power_watts(),
|
||||
temp_c: backend.temperature_c(),
|
||||
core_mhz,
|
||||
mem_mhz,
|
||||
})
|
||||
}));
|
||||
}
|
||||
|
||||
@@ -1099,11 +1135,26 @@ fn benchmark(specs: Vec<BackendSpec>, runs: usize) -> Result<()> {
|
||||
let mut workers = 0usize;
|
||||
for h in handles {
|
||||
match h.join().unwrap() {
|
||||
Ok((sols, dt)) => {
|
||||
let sol_s = sols as f64 / dt;
|
||||
Ok(r) => {
|
||||
let sol_s = r.sols as f64 / r.dt;
|
||||
// Optional telemetry tail: " | 142 W, 41.7 Sol/W, 68°C, 2700/2500 MHz".
|
||||
let mut tail = String::new();
|
||||
if let Some(w) = r.watts {
|
||||
tail.push_str(&format!(" | {w:.0} W"));
|
||||
if w > 0.0 {
|
||||
tail.push_str(&format!(", {:.2} Sol/W", sol_s / w));
|
||||
}
|
||||
}
|
||||
if let Some(t) = r.temp_c {
|
||||
tail.push_str(&format!(", {t}°C"));
|
||||
}
|
||||
if let (Some(c), m) = (r.core_mhz, r.mem_mhz) {
|
||||
tail.push_str(&format!(", {c}/{} MHz", m.map(|m| m.to_string()).unwrap_or_else(|| "?".into())));
|
||||
}
|
||||
info!(
|
||||
" worker {workers}: {sol_s:.2} Sol/s ({:.0} ms/solve), {sols} solutions",
|
||||
1000.0 * dt / runs as f64
|
||||
" worker {workers}: {sol_s:.2} Sol/s ({:.0} ms/solve), {} solutions{tail}",
|
||||
1000.0 * r.dt / runs as f64,
|
||||
r.sols
|
||||
);
|
||||
agg_sols += sol_s;
|
||||
workers += 1;
|
||||
|
||||
Reference in New Issue
Block a user