AMD GPU telemetry + --target-temp governor

Brings AMD cards to parity with NVIDIA on the monitoring/control surface, which was previously NVML-only. New src/amd_smi.rs is a gpu_tune::GpuTuner backed by Linux amdgpu sysfs (power1_average, temp1_input edge, freq1_input sclk, pp_dpm_sclk/mclk), matched to the device by PCI bus id from OpenCL cl_khr_pci_bus_info. gpu_tune is un-gated to compile under the gpu feature; open() probes NVML, then amd_smi. GpuSolver carries the tuner and Backend::Gpu dispatches power/temp/clocks, so the TUI and --benchmark now show power, temperature, clocks and Sol/W for AMD. Telemetry-only — setters are Unsupported (amdgpu control nodes are root-only). --target-temp <C> adds an opt-in software governor (miner::govern_cadence) that paces solve cadence to hold edge temperature, with no hardware writes and no root. Where thermal throttling is small, it won't beat running flat-out on raw Sol/s; it's a temperature/efficiency lever. Unit-tested controller; flag/plumbing verified live. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 20:17:59 -04:00
parent 0002e90451
commit 31aa85733e
5 changed files with 440 additions and 19 deletions
@@ -0,0 +1,204 @@
+//! AMD GPU telemetry for [`crate::gpu_tune::GpuTuner`] via the Linux amdgpu
+//! sysfs interface (`/sys/class/drm/cardN/device/...`).
+//!
+//! Telemetry only: board power, edge temperature, and core/memory clocks come
+//! from the world-readable hwmon + DPM nodes, so it works unprivileged. The
+//! control nodes (`pp_od_clk_voltage`, power cap, performance level) are
+//! root-write-only and carry GPU-hang risk, so every setter returns
+//! [`SetOutcome::Unsupported`] — this backend never writes. The handle is matched
+//! to the physical card by PCI bus id (e.g. from OpenCL's `cl_khr_pci_bus_info`),
+//! so it lines up with whichever device the solver actually opened.
+
+use std::path::{Path, PathBuf};
+
+use crate::gpu_tune::{GpuTuner, SetOutcome};
+
+/// One amdgpu card's sysfs telemetry handle.
+///
+/// Holds only resolved sysfs paths and a display name; each reading is fetched
+/// from those paths when the corresponding method is called.
+pub struct AmdTuner {
+    /// Canonical `/sys/.../<PCI BDF>` device directory (holds `pp_dpm_*`).
+    device_dir: PathBuf,
+    /// `device_dir/hwmon/hwmonM` (the index M is not stable — resolved by
+    /// scanning the `hwmon/` directory when the handle is opened).
+    hwmon_dir: PathBuf,
+    /// Display name: the card's `product_name` node when present and non-empty,
+    /// otherwise a generic label.
+    name: String,
+}
+
+// Only `PathBuf`/`String` — `AmdTuner` is `Send` automatically; no `unsafe impl`.
+
+/// Build a telemetry handle for the card whose PCI address is `pci_bus_id`
+/// (e.g. "0000:03:00.0").
+///
+/// Walks `/sys/class/drm`, keeps the bare `cardN` nodes, resolves each node's
+/// `device` symlink and compares the directory name with `pci_bus_id` (domain
+/// ignored on both sides). The first match that has an hwmon directory with at
+/// least one of `temp1_input` / `power1_average` wins. Yields `None` off Linux,
+/// when the bus id cannot be parsed, or when nothing matches.
+pub fn open(pci_bus_id: &str) -> Option<Box<dyn GpuTuner>> {
+    #[cfg(not(target_os = "linux"))]
+    {
+        let _ = pci_bus_id;
+        None
+    }
+    #[cfg(target_os = "linux")]
+    {
+        let wanted = bdf_tail(pci_bus_id)?;
+        std::fs::read_dir("/sys/class/drm")
+            .ok()?
+            .flatten()
+            // GPU nodes only ("card0", "card1", …): per-connector dirs such as
+            // "card1-DP-1" contain a dash, render nodes don't start with "card".
+            .filter(|entry| {
+                let node = entry.file_name();
+                let node = node.to_string_lossy();
+                node.starts_with("card") && !node.contains('-')
+            })
+            // `cardN/device` is a symlink to the PCI device dir `…/<BDF>`.
+            .filter_map(|entry| std::fs::canonicalize(entry.path().join("device")).ok())
+            .filter(|dir| {
+                dir.file_name()
+                    .and_then(|s| s.to_str())
+                    .and_then(bdf_tail)
+                    .map_or(false, |tail| tail == wanted)
+            })
+            .find_map(|device_dir| {
+                let hwmon_dir = find_hwmon(&device_dir)?;
+                // Insist on at least one sensor node so we never attach to a
+                // card without telemetry (e.g. some virtual/headless devices).
+                let has_sensor = hwmon_dir.join("temp1_input").exists()
+                    || hwmon_dir.join("power1_average").exists();
+                if !has_sensor {
+                    return None;
+                }
+                let name = read_name(&device_dir);
+                Some(Box::new(AmdTuner { device_dir, hwmon_dir, name }) as Box<dyn GpuTuner>)
+            })
+    }
+}
+
+/// Locate a `hwmon/hwmon*` subdirectory under `device_dir`.
+///
+/// The numeric suffix is assigned by the kernel and is not stable, so the
+/// directory is scanned and the first entry whose name starts with `hwmon` is
+/// returned. `None` if `hwmon/` is unreadable or has no such entry.
+fn find_hwmon(device_dir: &Path) -> Option<PathBuf> {
+    std::fs::read_dir(device_dir.join("hwmon"))
+        .ok()?
+        .flatten()
+        .find(|entry| entry.file_name().to_string_lossy().starts_with("hwmon"))
+        .map(|entry| entry.path())
+}
+
+/// Pick a display name for the card: the contents of `product_name` under
+/// `device_dir` when that node is readable and non-empty, otherwise the generic
+/// label "AMD GPU" (amdgpu sysfs rarely exposes a marketing name).
+fn read_name(device_dir: &Path) -> String {
+    match read_trim(&device_dir.join("product_name")) {
+        Some(label) if !label.is_empty() => label,
+        _ => "AMD GPU".to_string(),
+    }
+}
+
+/// Reduce a PCI address to its lowercase `bus:device.function` tail so that
+/// forms with and without a domain compare equal (e.g. "0000:03:00.0" and
+/// "03:00.0" both give "03:00.0").
+///
+/// Input is trimmed first. Returns `None` unless the text splits on `:` into
+/// exactly two or three fields.
+fn bdf_tail(bdf: &str) -> Option<String> {
+    let lowered = bdf.trim().to_ascii_lowercase();
+    let mut fields = lowered.split(':');
+    let first = fields.next()?;
+    let second = fields.next()?;
+    match (fields.next(), fields.next()) {
+        // Two fields: already `bus:devfunc`.
+        (None, _) => Some(format!("{first}:{second}")),
+        // Three fields: `first` is the domain, which is dropped.
+        (Some(third), None) => Some(format!("{second}:{third}")),
+        // Four or more fields is not a PCI address.
+        (Some(_), Some(_)) => None,
+    }
+}
+
+/// Read the file at `p` as UTF-8 and return it with surrounding whitespace
+/// removed; `None` on any read error.
+fn read_trim(p: &Path) -> Option<String> {
+    let raw = std::fs::read_to_string(p).ok()?;
+    Some(raw.trim().to_string())
+}
+
+/// Read the file at `p` and parse its trimmed contents as an unsigned decimal
+/// integer; `None` if the file is unreadable or is not a valid `u64`.
+fn read_u64(p: &Path) -> Option<u64> {
+    let text = read_trim(p)?;
+    text.parse::<u64>().ok()
+}
+
+/// Extract the MHz figure from one `pp_dpm_*` line such as `"2: 2700Mhz *"`.
+///
+/// Takes the text after the first `:`, then the leading run of ASCII digits of
+/// its first whitespace-separated token. `None` when there is no colon, no
+/// token, no leading digits, or the number does not fit in `u32`.
+fn parse_mhz(line: &str) -> Option<u32> {
+    let segment = line.split(':').skip(1).next()?;
+    let value = segment.split_whitespace().next()?;
+    // Cut the token at the first non-digit (the "Mhz" unit suffix).
+    let end = value
+        .find(|c: char| !c.is_ascii_digit())
+        .unwrap_or(value.len());
+    value[..end].parse().ok()
+}
+
+impl AmdTuner {
+    /// Clock in MHz of the DPM level currently in use, i.e. the first line of
+    /// `file` (relative to the device dir) that carries a `*` marker.
+    fn dpm_active_mhz(&self, file: &str) -> Option<u32> {
+        let table = read_trim(&self.device_dir.join(file))?;
+        let active = table.lines().find(|line| line.contains('*'))?;
+        parse_mhz(active)
+    }
+    /// Largest clock in MHz among all parseable DPM levels listed in `file`.
+    fn dpm_max_mhz(&self, file: &str) -> Option<u32> {
+        let table = read_trim(&self.device_dir.join(file))?;
+        table.lines().filter_map(parse_mhz).max()
+    }
+}
+
+impl GpuTuner for AmdTuner {
+    /// Display name captured when the handle was opened.
+    fn name(&self) -> String {
+        self.name.clone()
+    }
+
+    /// Board power in watts. Reads `power1_average` (µW) and, if that node is
+    /// missing or unparseable, the instantaneous `power1_input` instead.
+    fn watts(&self) -> Option<f64> {
+        let microwatts = match read_u64(&self.hwmon_dir.join("power1_average")) {
+            Some(avg) => avg,
+            None => read_u64(&self.hwmon_dir.join("power1_input"))?,
+        };
+        Some(microwatts as f64 / 1_000_000.0)
+    }
+
+    /// Edge temperature in °C: `temp1_input` (m°C) rounded to the nearest degree.
+    fn temperature_c(&self) -> Option<u32> {
+        read_u64(&self.hwmon_dir.join("temp1_input"))
+            .map(|millidegrees| ((millidegrees + 500) / 1000) as u32)
+    }
+
+    /// Current power cap in whole watts from `power1_cap` (µW, truncated).
+    fn current_power_limit_w(&self) -> Option<u32> {
+        // Absent on Navi 44 (RX 9060 XT); best-effort for cards that expose it.
+        let microwatts = read_u64(&self.hwmon_dir.join("power1_cap"))?;
+        Some((microwatts / 1_000_000) as u32)
+    }
+
+    /// Core clock in MHz: the live `freq1_input` (Hz) when it reads non-zero,
+    /// otherwise the active level from `pp_dpm_sclk`.
+    fn core_clock_mhz(&self) -> Option<u32> {
+        match read_u64(&self.hwmon_dir.join("freq1_input")) {
+            Some(hz) if hz > 0 => Some((hz / 1_000_000) as u32),
+            _ => self.dpm_active_mhz("pp_dpm_sclk"),
+        }
+    }
+
+    /// Memory clock in MHz: the active level from `pp_dpm_mclk`.
+    fn mem_clock_mhz(&self) -> Option<u32> {
+        self.dpm_active_mhz("pp_dpm_mclk")
+    }
+
+    /// Highest core clock level listed in `pp_dpm_sclk`.
+    fn max_core_clock_mhz(&self) -> Option<u32> {
+        self.dpm_max_mhz("pp_dpm_sclk")
+    }
+
+    /// Highest memory clock level listed in `pp_dpm_mclk`.
+    fn max_mem_clock_mhz(&self) -> Option<u32> {
+        self.dpm_max_mhz("pp_dpm_mclk")
+    }
+
+    /// (min, max) power cap in whole watts from `power1_cap_min` /
+    /// `power1_cap_max` (µW, truncated); `None` unless both are readable.
+    fn power_limit_range_w(&self) -> Option<(u32, u32)> {
+        let floor = read_u64(&self.hwmon_dir.join("power1_cap_min"))?;
+        let ceiling = read_u64(&self.hwmon_dir.join("power1_cap_max"))?;
+        let to_watts = |microwatts: u64| (microwatts / 1_000_000) as u32;
+        Some((to_watts(floor), to_watts(ceiling)))
+    }
+
+    // Telemetry-only backend: never writes the root-only control nodes, so
+    // every setter reports `Unsupported` and `reset` has nothing to undo.
+    fn set_persistence(&self, _on: bool) -> SetOutcome {
+        SetOutcome::Unsupported
+    }
+    fn lock_core_clock_mhz(&self, _mhz: u32) -> SetOutcome {
+        SetOutcome::Unsupported
+    }
+    fn lock_mem_clock_mhz(&self, _mhz: u32) -> SetOutcome {
+        SetOutcome::Unsupported
+    }
+    fn set_power_limit_w(&self, _watts: u32) -> SetOutcome {
+        SetOutcome::Unsupported
+    }
+    fn set_core_offset_mhz(&self, _mhz: i32) -> SetOutcome {
+        SetOutcome::Unsupported
+    }
+    fn set_mem_offset_mhz(&self, _mhz: i32) -> SetOutcome {
+        SetOutcome::Unsupported
+    }
+    fn reset(&self) {}
+}
@@ -411,6 +411,9 @@ impl LegacySolver {
 /// (`equihash.cl`) everywhere else. Forceable with `ZCL_OPENCL_KERNEL=amd|legacy`.
 pub struct GpuSolver {
    inner: SolverInner,
+    /// Per-card telemetry handle (AMD amdgpu sysfs / NVML), matched to the device
+    /// by PCI bus id. `None` when no telemetry backend matches.
+    tuner: Option<Box<dyn crate::gpu_tune::GpuTuner>>,
 }

 enum SolverInner {
@@ -423,13 +426,16 @@ impl GpuSolver {
    /// device vendor (AMD → `equihash192_7.cl`).
    pub fn new(device_index: usize) -> Result<Self> {
        let (platform, device) = pick_device(device_index)?;
+        // Resolve a telemetry handle (AMD sysfs / NVML) from the device's PCI bus
+        // before `device` is consumed by the inner solver.
+        let tuner = device_pci_bus_id(&device).and_then(|bus| crate::gpu_tune::open(&bus));
        let inner = if use_amd_kernel(&device) {
            log::info!("OpenCL: AMD device — using the equihash192_7 kernel");
            SolverInner::Amd(crate::gpu_amd::AmdSolver::new(platform, device)?)
        } else {
            SolverInner::Legacy(LegacySolver::new(platform, device)?)
        };
-        Ok(Self { inner })
+        Ok(Self { inner, tuner })
    }

    /// This device's product name, if available.
@@ -440,6 +446,34 @@ impl GpuSolver {
        }
    }

+    /// Board power draw in watts as reported by the telemetry handle; `None`
+    /// when no handle is attached or it has no power reading.
+    pub fn power_watts(&self) -> Option<f64> {
+        self.tuner.as_ref()?.watts()
+    }
+
+    /// GPU temperature in °C as reported by the telemetry handle; `None` when
+    /// no handle is attached or it has no temperature reading.
+    pub fn temperature_c(&self) -> Option<u32> {
+        self.tuner.as_ref()?.temperature_c()
+    }
+
+    /// Power limit currently in force, in watts, as reported by the telemetry
+    /// handle; `None` when no handle is attached or the limit is not exposed.
+    pub fn current_power_limit_w(&self) -> Option<u32> {
+        self.tuner.as_ref()?.current_power_limit_w()
+    }
+
+    /// (min, max) power limit bounds in watts as reported by the telemetry
+    /// handle; `None` when no handle is attached or the range is not exposed.
+    pub fn power_limit_range_w(&self) -> Option<(u32, u32)> {
+        self.tuner.as_ref()?.power_limit_range_w()
+    }
+
+    /// Live (core, memory) clocks in MHz. Each element is `None` when the
+    /// handle cannot supply it; both are `None` when no handle is attached.
+    pub fn current_clocks_mhz(&self) -> (Option<u32>, Option<u32>) {
+        self.tuner
+            .as_ref()
+            .map_or((None, None), |t| (t.core_clock_mhz(), t.mem_clock_mhz()))
+    }
+
    /// Solve the puzzle for `header` (140 bytes).
    pub fn solve(&self, header: &[u8]) -> Result<Vec<Vec<u32>>> {
        match &self.inner {
@@ -475,13 +509,34 @@ impl GpuSolver {
    pub fn hash_all(&self, header: &[u8]) -> Result<Vec<u8>> {
        match &self.inner {
            SolverInner::Legacy(s) => s.hash_all(header),
-            SolverInner::Amd(_) => {
-                Err(anyhow!("hash_all is not supported by the AMD kernel"))
-            }
+            SolverInner::Amd(_) => Err(anyhow!("hash_all is not supported by the AMD kernel")),
        }
    }
 }

+/// The device's PCI address as `"DDDD:BB:DD.F"` (lowercase hex), for matching
+/// the physical card to a telemetry backend.
+///
+/// Prefers `cl_khr_pci_bus_info`; falls back to `cl_device_topology_amd`, but
+/// only when that topology is tagged as PCIe — the bus/device/function bytes
+/// are meaningless for any other topology type. Returns `None` if the device
+/// exposes neither query or the returned blob is too short.
+fn device_pci_bus_id(device: &ocl::Device) -> Option<String> {
+    const CL_DEVICE_PCI_BUS_INFO_KHR: u32 = 0x10F2;
+    const CL_DEVICE_TOPOLOGY_AMD: u32 = 0x4037;
+    const CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD: u32 = 1;
+    // Native-endian u32 at byte offset `i` of a raw info blob (caller has
+    // already checked the length).
+    let rd = |b: &[u8], i: usize| u32::from_ne_bytes([b[i], b[i + 1], b[i + 2], b[i + 3]]);
+    // cl_device_pci_bus_info_khr = { u32 pci_domain, pci_bus, pci_device, pci_function }.
+    if let Ok(b) = device.info_raw(CL_DEVICE_PCI_BUS_INFO_KHR) {
+        if b.len() >= 16 {
+            return Some(format!(
+                "{:04x}:{:02x}:{:02x}.{:x}",
+                rd(&b, 0),
+                rd(&b, 4),
+                rd(&b, 8),
+                rd(&b, 12)
+            ));
+        }
+    }
+    // cl_device_topology_amd is a 24-byte union whose first u32 is a type tag.
+    // In the PCIe variant bus/device/function are the last three bytes; the
+    // domain isn't exposed (assume 0000).
+    if let Ok(b) = device.info_raw(CL_DEVICE_TOPOLOGY_AMD) {
+        if b.len() >= 24 && rd(&b, 0) == CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD {
+            return Some(format!("0000:{:02x}:{:02x}.{:x}", b[21], b[22], b[23]));
+        }
+    }
+    None
+}
+
 /// Decide whether to drive `device` with the AMD `equihash192_7.cl` kernel.
 /// `ZCL_OPENCL_KERNEL` forces the choice (`amd` or `legacy`); otherwise it's by
 /// device vendor.
@@ -561,7 +616,7 @@ pub fn cpu_device_index() -> Option<usize> {
 /// Resolve a flat device index across all platforms, returning the device along
 /// with the platform it belongs to (needed to build the context against the
 /// right platform).
-fn pick_device(index: usize) -> Result<(ocl::Platform, ocl::Device)> {
+pub(crate) fn pick_device(index: usize) -> Result<(ocl::Platform, ocl::Device)> {
    use ocl::{Device, Platform};
    let mut idx = 0;
    for platform in Platform::list() {
@@ -128,18 +128,24 @@ pub trait GpuTuner: Send {
 /// Open a control handle for the GPU at `pci_bus_id` (matches the physical card
 /// regardless of CUDA-vs-driver index ordering). `None` if unavailable.
 ///
-/// NVML is the backend on both Linux (`libnvidia-ml`) and Windows (`nvml.dll`);
-/// the C API is identical, so the same [`crate::nvml`] code serves both.
+/// Tries the NVIDIA backend first (NVML, `libnvidia-ml`/`nvml.dll`), then the AMD
+/// backend ([`crate::amd_smi`], Linux amdgpu sysfs). A non-matching bus id makes
+/// each backend return `None`, so probing both is safe on mixed-vendor hosts.
 pub fn open(pci_bus_id: &str) -> Option<Box<dyn GpuTuner>> {
-    #[cfg(any(unix, windows))]
+    #[cfg(feature = "cuda")]
    {
-        crate::nvml::open(pci_bus_id)
+        if let Some(t) = crate::nvml::open(pci_bus_id) {
+            return Some(t);
+        }
    }
-    #[cfg(not(any(unix, windows)))]
+    #[cfg(feature = "gpu")]
    {
-        let _ = pci_bus_id;
-        None
+        if let Some(t) = crate::amd_smi::open(pci_bus_id) {
+            return Some(t);
+        }
    }
+    let _ = pci_bus_id;
+    None
 }

 static WARNED_PRIVS: AtomicBool = AtomicBool::new(false);
@@ -18,6 +18,10 @@ mod gpu;
 #[cfg(feature = "gpu")]
 mod gpu_amd;

+// AMD GPU telemetry via Linux amdgpu sysfs (a `gpu_tune::GpuTuner` backend).
+#[cfg(feature = "gpu")]
+mod amd_smi;
+
 // Runtime dynamic-library loader (dlopen) for the CUDA driver + NVML.
 #[cfg(feature = "cuda")]
 mod dylib;
@@ -28,7 +32,9 @@ mod cuda;
 #[cfg(feature = "cuda")]
 mod nvml;

-#[cfg(feature = "cuda")]
+// Platform-agnostic GPU tuning/telemetry surface. The trait + policy compile for
+// either GPU backend; NVML (cuda) and amd_smi (gpu) are the implementations.
+#[cfg(any(feature = "cuda", feature = "gpu"))]
 mod gpu_tune;

 use std::io::IsTerminal;
@@ -204,6 +210,13 @@ struct Args {
    #[arg(long)]
    auto_tune: bool,

+    /// Sustained-Sol/s governor: hold each GPU at/below this edge temperature (°C)
+    /// by pacing the solve cadence (no hardware writes, no root). Trades a little
+    /// throughput for lower temp/power; off by default (runs flat-out). Needs a
+    /// backend that reports temperature (AMD amdgpu / NVIDIA).
+    #[arg(long, value_name = "CELSIUS")]
+    target_temp: Option<u32>,
+
    /// Efficiency: cap each GPU's power limit in watts (default: card max).
    /// Lower power trades a little hashrate for much better Sol/W.
    #[arg(long, value_name = "WATTS")]
@@ -620,6 +633,8 @@ fn main() -> Result<()> {
        args.power_limit.unwrap_or(0),
        args.unlock_controls,
    );
+    // Software temp governor target (paces solve cadence; no hardware writes).
+    miner::set_target_temp(args.target_temp);
    miner::run(client, specs, running, job_timeout, tui, format!("{host}:{port}"), controls, cpu_mining, cpu_clamp, args.control_port)
 }

@@ -1077,10 +1092,21 @@ fn benchmark(specs: Vec<BackendSpec>, runs: usize) -> Result<()> {
    use std::time::Instant;
    info!("benchmarking {runs} solve(s) per worker across {} worker(s)", specs.len());

+    /// Per-worker benchmark result, including a steady-state telemetry snapshot
+    /// (sampled right after the timed loop, while the card is warm).
+    struct WorkerResult {
+        sols: usize,
+        dt: f64,
+        watts: Option<f64>,
+        temp_c: Option<u32>,
+        core_mhz: Option<u32>,
+        mem_mhz: Option<u32>,
+    }
+
    let start = Instant::now();
    let mut handles = Vec::new();
    for (id, spec) in specs.into_iter().enumerate() {
-        handles.push(std::thread::spawn(move || -> Result<(usize, f64)> {
+        handles.push(std::thread::spawn(move || -> Result<WorkerResult> {
            let backend = spec.build()?;
            backend.solve(&pseudo_header(id as u64))?; // warm up (excluded)
            let t = Instant::now();
@@ -1090,7 +1116,17 @@ fn benchmark(specs: Vec<BackendSpec>, runs: usize) -> Result<()> {
                let seed = ((id as u64) << 40) | (i as u64 + 1);
                sols += backend.solve(&pseudo_header(seed))?.len();
            }
-            Ok((sols, t.elapsed().as_secs_f64()))
+            let dt = t.elapsed().as_secs_f64();
+            // Snapshot telemetry while the card is still under load.
+            let (core_mhz, mem_mhz) = backend.current_clocks_mhz();
+            Ok(WorkerResult {
+                sols,
+                dt,
+                watts: backend.power_watts(),
+                temp_c: backend.temperature_c(),
+                core_mhz,
+                mem_mhz,
+            })
        }));
    }

@@ -1099,11 +1135,26 @@ fn benchmark(specs: Vec<BackendSpec>, runs: usize) -> Result<()> {
    let mut workers = 0usize;
    for h in handles {
        match h.join().unwrap() {
-            Ok((sols, dt)) => {
-                let sol_s = sols as f64 / dt;
+            Ok(r) => {
+                let sol_s = r.sols as f64 / r.dt;
+                // Optional telemetry tail: " | 142 W, 41.7 Sol/W, 68°C, 2700/2500 MHz".
+                let mut tail = String::new();
+                if let Some(w) = r.watts {
+                    tail.push_str(&format!(" | {w:.0} W"));
+                    if w > 0.0 {
+                        tail.push_str(&format!(", {:.2} Sol/W", sol_s / w));
+                    }
+                }
+                if let Some(t) = r.temp_c {
+                    tail.push_str(&format!(", {t}°C"));
+                }
+                if let (Some(c), m) = (r.core_mhz, r.mem_mhz) {
+                    tail.push_str(&format!(", {c}/{} MHz", m.map(|m| m.to_string()).unwrap_or_else(|| "?".into())));
+                }
                info!(
-                    "  worker {workers}: {sol_s:.2} Sol/s ({:.0} ms/solve), {sols} solutions",
-                    1000.0 * dt / runs as f64
+                    "  worker {workers}: {sol_s:.2} Sol/s ({:.0} ms/solve), {} solutions{tail}",
+                    1000.0 * r.dt / runs as f64,
+                    r.sols
                );
                agg_sols += sol_s;
                workers += 1;
@@ -14,6 +14,22 @@ use crate::equihash;
 use crate::params::{HEADER_LEN, SOLUTION_BYTES};
 use crate::stratum::{StratumClient, Work};

+/// Process-wide target edge temperature (°C) for the software solve-cadence
+/// governor; `None` ⇒ run flat-out. Set once at startup from `--target-temp`.
+static TARGET_TEMP_C: OnceLock<Option<u32>> = OnceLock::new();
+
+/// Record the governor's target temperature for the whole process (call once,
+/// before workers start). Only the first call takes effect: the result of the
+/// `OnceLock` store is discarded, so a later call leaves the stored value as is.
+/// Logs a notice whenever a target is supplied.
+pub fn set_target_temp(c: Option<u32>) {
+    match c {
+        Some(t) => info!("temperature governor enabled: holding GPUs ≤{t}°C (paced cadence)"),
+        None => {}
+    }
+    let _ = TARGET_TEMP_C.set(c);
+}
+
+/// The installed governor target in °C; `None` if no target was installed or
+/// the installed value was itself `None`.
+fn target_temp_c() -> Option<u32> {
+    TARGET_TEMP_C.get().and_then(|stored| *stored)
+}
+
 /// Double SHA-256, as used for the Zcash/ZClassic block PoW hash.
 fn sha256d(data: &[u8]) -> [u8; 32] {
    let first = Sha256::digest(data);
@@ -136,6 +152,8 @@ impl Backend {
        match self {
            #[cfg(feature = "cuda")]
            Backend::Cuda(solver) => solver.power_watts(),
+            #[cfg(feature = "gpu")]
+            Backend::Gpu(solver) => solver.power_watts(),
            _ => None,
        }
    }
@@ -145,6 +163,8 @@ impl Backend {
        match self {
            #[cfg(feature = "cuda")]
            Backend::Cuda(solver) => solver.temperature_c(),
+            #[cfg(feature = "gpu")]
+            Backend::Gpu(solver) => solver.temperature_c(),
            _ => None,
        }
    }
@@ -154,6 +174,8 @@ impl Backend {
        match self {
            #[cfg(feature = "cuda")]
            Backend::Cuda(solver) => solver.current_power_limit_w(),
+            #[cfg(feature = "gpu")]
+            Backend::Gpu(solver) => solver.current_power_limit_w(),
            _ => None,
        }
    }
@@ -163,6 +185,8 @@ impl Backend {
        match self {
            #[cfg(feature = "cuda")]
            Backend::Cuda(solver) => solver.power_limit_range_w(),
+            #[cfg(feature = "gpu")]
+            Backend::Gpu(solver) => solver.power_limit_range_w(),
            _ => None,
        }
    }
@@ -190,6 +214,8 @@ impl Backend {
        match self {
            #[cfg(feature = "cuda")]
            Backend::Cuda(solver) => solver.current_clocks_mhz(),
+            #[cfg(feature = "gpu")]
+            Backend::Gpu(solver) => solver.current_clocks_mhz(),
            _ => (None, None),
        }
    }
@@ -513,6 +539,14 @@ fn worker(
    let mut last_job = Instant::now();
    let mut paused = false;
    let mut disabled_pause = false;
+    // Software temperature governor: pace the solve cadence to hold edge temp at
+    // or below `--target-temp` (no hardware writes). `gov_sleep` is the per-pass
+    // pause, nudged from the periodic temperature sample.
+    let gov_target = target_temp_c();
+    let mut gov_sleep = Duration::ZERO;
+    if let Some(target) = gov_target {
+        info!("worker {id}: temperature governor active — pacing cadence to hold ≤{target}°C");
+    }

    while running.load(Ordering::Relaxed) {
        if work_handle.epoch() != current.epoch {
@@ -583,6 +617,9 @@ fn worker(
            }
            if let Some(t) = backend.temperature_c() {
                stats.workers[id].temp_c.store(t, Ordering::Relaxed);
+                if let Some(target) = gov_target {
+                    gov_sleep = govern_cadence(gov_sleep, t, target);
+                }
            }
            let (core_mhz, mem_mhz) = backend.current_clocks_mhz();
            if let Some(c) = core_mhz {
@@ -614,12 +651,34 @@ fn worker(
            let ctx = inflight.pop_front().unwrap();
            process_results(id, &client, &ctx, &solutions, &stats);
        }
+
+        // Temperature governor: pace the cadence (held off entirely when flat-out).
+        if !gov_sleep.is_zero() {
+            std::thread::sleep(gov_sleep);
+        }
    }

    drain_pipeline(id, &mut backend, &mut inflight, &client, &stats)?;
    Ok(())
 }

+/// Compute the next per-pass governor pause from the current pause `cur`, the
+/// measured edge temperature `temp` and the `target` (both °C).
+///
+/// * Above target: add 400 µs per degree of overshoot, capped at 100 ms.
+/// * Two or more degrees below target: remove 1 ms (never below zero).
+/// * At target or one degree below: leave the pause unchanged.
+///
+/// Intended to be called from the periodic temperature sample (~1 Hz), which
+/// makes it a slow integral controller — fine for the card's thermal time
+/// constant.
+fn govern_cadence(cur: Duration, temp: u32, target: u32) -> Duration {
+    const MAX_SLEEP: Duration = Duration::from_millis(100);
+    const STEP_DOWN: Duration = Duration::from_millis(1);
+    // Signed error in °C; positive means the card is too hot.
+    let error = i64::from(temp) - i64::from(target);
+    if error > 0 {
+        let step_up = Duration::from_micros(400 * error as u64);
+        return cur.saturating_add(step_up).min(MAX_SLEEP);
+    }
+    if error < -1 {
+        return cur.saturating_sub(STEP_DOWN);
+    }
+    cur
+}
+
 /// Supervise CPU mining: spawn one worker per group for the current group size,
 /// and whenever the dashboard cycles the size, stop those workers (`gen_running`),
 /// rebuild the grouping, and respawn. Runs until `running` is cleared.
@@ -843,3 +902,49 @@ fn build_nonce(nonce1: &[u8], counter: u64) -> Result<[u8; 32]> {
    tail[..n].copy_from_slice(&counter.to_le_bytes()[..n]);
    Ok(nonce)
 }
+
+#[cfg(test)]
+mod governor_tests {
+    use super::*;
+
+    /// At or under the target the governor must not introduce a pause.
+    #[test]
+    fn governor_idles_below_target() {
+        for temp in [60, 70] {
+            assert_eq!(govern_cadence(Duration::ZERO, temp, 70), Duration::ZERO);
+        }
+    }
+
+    /// Over the target the pause grows with the overshoot and stops at the cap.
+    #[test]
+    fn governor_lengthens_over_target_and_clamps() {
+        // +5°C from a cold start → five 400 µs steps.
+        let grown = govern_cadence(Duration::ZERO, 75, 70);
+        assert_eq!(grown, Duration::from_micros(400 * 5));
+        // Far over target from just under the cap → pinned at the cap.
+        let pinned = govern_cadence(Duration::from_millis(99), 200, 70);
+        assert_eq!(pinned, Duration::from_millis(100));
+    }
+
+    /// Two or more degrees under target the pause shrinks, flooring at zero.
+    #[test]
+    fn governor_eases_off_when_cool() {
+        let eased = govern_cadence(Duration::from_millis(3), 67, 70);
+        assert_eq!(eased, Duration::from_millis(2));
+        // Already at zero: no underflow.
+        assert_eq!(govern_cadence(Duration::ZERO, 50, 70), Duration::ZERO);
+    }
+
+    /// Closed-loop check against a crude plant (more pause → cooler): the
+    /// controller should settle within ±2°C of the target without runaway.
+    #[test]
+    fn governor_converges_to_hold_target() {
+        let target = 70u32;
+        let mut pause = Duration::ZERO;
+        let mut temp = 85i32; // starts hot
+        for _ in 0..200 {
+            pause = govern_cadence(pause, temp as u32, target);
+            // Plant model: each whole ms of pause sheds 1.5°C off a 90°C
+            // flat-out temperature.
+            temp = (90.0 - 1.5 * pause.as_millis() as f64).round() as i32;
+        }
+        let band = (target as i32 - 2)..=(target as i32 + 2);
+        assert!(band.contains(&temp), "settled at {temp}°C, want ~{target}");
+    }
+}