Files
LACT/lact-daemon/src/server/gpu_controller/mod.rs

966 lines
36 KiB
Rust

pub mod fan_control;
use self::fan_control::FanCurve;
use super::vulkan::get_vulkan_info;
use crate::config::{self, ClocksConfiguration, FanControlSettings};
use amdgpu_sysfs::{
error::Error,
gpu_handle::{
fan_control::FanCurve as PmfwCurve,
overdrive::{ClocksTable, ClocksTableGen},
GpuHandle, PerformanceLevel, PowerLevelKind, PowerLevels,
},
hw_mon::{FanControlMethod, HwMon},
sysfs::SysFS,
};
use anyhow::{anyhow, Context};
use lact_schema::{
ClocksInfo, ClockspeedStats, DeviceInfo, DeviceStats, DrmInfo, FanStats, GpuPciInfo, LinkInfo,
PciInfo, PmfwInfo, PowerState, PowerStates, PowerStats, VoltageStats, VramStats,
};
use pciid_parser::Database;
use std::{
borrow::Cow,
cell::RefCell,
cmp,
path::{Path, PathBuf},
rc::Rc,
str::FromStr,
time::Duration,
};
use std::{collections::BTreeMap, fs, time::Instant};
use tokio::{
select,
sync::Notify,
task::JoinHandle,
time::{sleep, timeout},
};
use tracing::{debug, error, info, trace, warn};
#[cfg(feature = "libdrm_amdgpu_sys")]
use {
lact_schema::DrmMemoryInfo,
libdrm_amdgpu_sys::AMDGPU::{DeviceHandle as DrmHandle, MetricsInfo, GPU_INFO},
std::{fs::OpenOptions, os::fd::IntoRawFd},
};
type FanControlHandle = (Rc<Notify>, JoinHandle<()>);
const GPU_CLOCKDOWN_TIMEOUT_SECS: u64 = 3;
pub struct GpuController {
pub(super) handle: GpuHandle,
#[cfg(feature = "libdrm_amdgpu_sys")]
pub drm_handle: Option<DrmHandle>,
pub pci_info: Option<GpuPciInfo>,
pub fan_control_handle: RefCell<Option<FanControlHandle>>,
}
impl GpuController {
pub fn new_from_path(sysfs_path: PathBuf, pci_db: &Database) -> anyhow::Result<Self> {
let handle = GpuHandle::new_from_path(sysfs_path)
.map_err(|error| anyhow!("failed to initialize gpu handle: {error}"))?;
#[cfg(feature = "libdrm_amdgpu_sys")]
let drm_handle = match get_drm_handle(&handle) {
Ok(handle) => Some(handle),
Err(err) => {
warn!("Could not get DRM handle: {err}");
None
}
};
let mut device_pci_info = None;
let mut subsystem_pci_info = None;
if let Some((vendor_id, model_id)) = handle.get_pci_id() {
device_pci_info = Some(PciInfo {
vendor_id: vendor_id.to_owned(),
vendor: None,
model_id: model_id.to_owned(),
model: None,
});
if let Some((subsys_vendor_id, subsys_model_id)) = handle.get_pci_subsys_id() {
let pci_device_info =
pci_db.get_device_info(vendor_id, model_id, subsys_vendor_id, subsys_model_id);
device_pci_info = Some(PciInfo {
vendor_id: vendor_id.to_owned(),
vendor: pci_device_info.vendor_name.map(str::to_owned),
model_id: model_id.to_owned(),
model: pci_device_info.device_name.map(str::to_owned),
});
subsystem_pci_info = Some(PciInfo {
vendor_id: subsys_vendor_id.to_owned(),
vendor: pci_device_info.subvendor_name.map(str::to_owned),
model_id: subsys_model_id.to_owned(),
model: pci_device_info.subdevice_name.map(str::to_owned),
});
};
}
let pci_info = device_pci_info.and_then(|device_pci_info| {
Some(GpuPciInfo {
device_pci_info,
subsystem_pci_info: subsystem_pci_info?,
})
});
Ok(Self {
handle,
#[cfg(feature = "libdrm_amdgpu_sys")]
drm_handle,
pci_info,
fan_control_handle: RefCell::new(None),
})
}
pub fn get_id(&self) -> anyhow::Result<String> {
let handle = &self.handle;
let pci_id = handle.get_pci_id().context("Device has no vendor id")?;
let pci_subsys_id = handle
.get_pci_subsys_id()
.context("Device has no subsys id")?;
let pci_slot_name = handle
.get_pci_slot_name()
.context("Device has no pci slot")?;
Ok(format!(
"{}:{}-{}:{}-{}",
pci_id.0, pci_id.1, pci_subsys_id.0, pci_subsys_id.1, pci_slot_name
))
}
pub fn get_path(&self) -> &Path {
self.handle.get_path()
}
fn first_hw_mon(&self) -> anyhow::Result<&HwMon> {
self.handle
.hw_monitors
.first()
.context("GPU has no hardware monitor")
}
pub fn get_info(&self) -> DeviceInfo {
let vulkan_info = self.pci_info.as_ref().and_then(|pci_info| {
match get_vulkan_info(
&pci_info.device_pci_info.vendor_id,
&pci_info.device_pci_info.model_id,
) {
Ok(info) => Some(info),
Err(err) => {
warn!("could not load vulkan info: {err}");
None
}
}
});
let pci_info = self.pci_info.as_ref().map(Cow::Borrowed);
let driver = self.handle.get_driver();
let vbios_version = self.get_full_vbios_version();
let link_info = self.get_link_info();
let drm_info = self.get_drm_info();
DeviceInfo {
pci_info,
vulkan_info,
driver,
vbios_version,
link_info,
drm_info,
}
}
#[cfg(feature = "libdrm_amdgpu_sys")]
fn get_full_vbios_version(&self) -> Option<String> {
if let Some(drm_handle) = &self.drm_handle {
if let Ok(vbios_info) = drm_handle.get_vbios_info() {
return Some(format!("{} [{}]", vbios_info.ver, vbios_info.date));
}
}
self.handle.get_vbios_version().ok()
}
#[cfg(not(feature = "libdrm_amdgpu_sys"))]
fn get_full_vbios_version(&self) -> Option<String> {
self.handle.get_vbios_version().ok()
}
#[cfg(feature = "libdrm_amdgpu_sys")]
fn get_drm_info(&self) -> Option<DrmInfo> {
trace!("Reading DRM info");
let drm_handle = self.drm_handle.as_ref();
let drm_memory_info =
drm_handle
.and_then(|handle| handle.memory_info().ok())
.map(|memory_info| DrmMemoryInfo {
resizeable_bar: memory_info.check_resizable_bar(),
cpu_accessible_used: memory_info.cpu_accessible_vram.heap_usage,
cpu_accessible_total: memory_info.cpu_accessible_vram.total_heap_size,
});
match drm_handle {
Some(handle) => handle.device_info().ok().map(|drm_info| DrmInfo {
device_name: drm_info.find_device_name(),
pci_revision_id: Some(drm_info.pci_rev_id()),
family_name: drm_info.get_family_name().to_string(),
family_id: drm_info.family_id(),
asic_name: drm_info.get_asic_name().to_string(),
chip_class: drm_info.get_chip_class().to_string(),
compute_units: drm_info.cu_active_number,
vram_type: drm_info.get_vram_type().to_string(),
vram_bit_width: drm_info.vram_bit_width,
vram_max_bw: drm_info.peak_memory_bw_gb().to_string(),
l1_cache_per_cu: drm_info.get_l1_cache_size(),
l2_cache: drm_info.calc_l2_cache_size(),
l3_cache_mb: drm_info.calc_l3_cache_size_mb(),
memory_info: drm_memory_info,
}),
None => None,
}
}
#[cfg(not(feature = "libdrm_amdgpu_sys"))]
fn get_drm_info(&self) -> Option<DrmInfo> {
None
}
#[cfg(feature = "libdrm_amdgpu_sys")]
fn get_current_gfxclk(&self) -> Option<u16> {
self.drm_handle
.as_ref()
.and_then(|drm_handle| drm_handle.get_gpu_metrics().ok())
.and_then(|metrics| metrics.get_current_gfxclk())
}
#[cfg(not(feature = "libdrm_amdgpu_sys"))]
fn get_current_gfxclk(&self) -> Option<u16> {
None
}
fn get_link_info(&self) -> LinkInfo {
LinkInfo {
current_width: self.handle.get_current_link_width().ok(),
current_speed: self.handle.get_current_link_speed().ok(),
max_width: self.handle.get_max_link_width().ok(),
max_speed: self.handle.get_max_link_speed().ok(),
}
}
pub fn get_stats(&self, gpu_config: Option<&config::Gpu>) -> DeviceStats {
let fan_settings = gpu_config.and_then(|config| config.fan_control_settings.as_ref());
DeviceStats {
fan: FanStats {
control_enabled: gpu_config.is_some_and(|config| config.fan_control_enabled),
control_mode: fan_settings.map(|settings| settings.mode),
static_speed: fan_settings.map(|settings| settings.static_speed),
curve: fan_settings.map(|settings| settings.curve.0.clone()),
spindown_delay_ms: fan_settings.and_then(|settings| settings.spindown_delay_ms),
change_threshold: fan_settings.and_then(|settings| settings.change_threshold),
speed_current: self.hw_mon_and_then(HwMon::get_fan_current),
speed_max: self.hw_mon_and_then(HwMon::get_fan_max),
speed_min: self.hw_mon_and_then(HwMon::get_fan_min),
pwm_current: self.hw_mon_and_then(HwMon::get_fan_pwm),
pmfw_info: PmfwInfo {
acoustic_limit: self.handle.get_fan_acoustic_limit().ok(),
acoustic_target: self.handle.get_fan_acoustic_target().ok(),
target_temp: self.handle.get_fan_target_temperature().ok(),
minimum_pwm: self.handle.get_fan_minimum_pwm().ok(),
},
},
clockspeed: ClockspeedStats {
gpu_clockspeed: self.hw_mon_and_then(HwMon::get_gpu_clockspeed),
current_gfxclk: self.get_current_gfxclk(),
vram_clockspeed: self.hw_mon_and_then(HwMon::get_vram_clockspeed),
},
voltage: VoltageStats {
gpu: self.hw_mon_and_then(HwMon::get_gpu_voltage),
northbridge: self.hw_mon_and_then(HwMon::get_northbridge_voltage),
},
vram: VramStats {
total: self.handle.get_total_vram().ok(),
used: self.handle.get_used_vram().ok(),
},
power: PowerStats {
average: self.hw_mon_and_then(HwMon::get_power_average),
current: self.hw_mon_and_then(HwMon::get_power_input),
cap_current: self.hw_mon_and_then(HwMon::get_power_cap),
cap_max: self.hw_mon_and_then(HwMon::get_power_cap_max),
cap_min: self.hw_mon_and_then(HwMon::get_power_cap_min),
cap_default: self.hw_mon_and_then(HwMon::get_power_cap_default),
},
temps: self.hw_mon_map(HwMon::get_temps).unwrap_or_default(),
busy_percent: self.handle.get_busy_percent().ok(),
performance_level: self.handle.get_power_force_performance_level().ok(),
core_power_state: self
.handle
.get_core_clock_levels()
.ok()
.and_then(|levels| levels.active),
memory_power_state: self
.handle
.get_memory_clock_levels()
.ok()
.and_then(|levels| levels.active),
pcie_power_state: self
.handle
.get_pcie_clock_levels()
.ok()
.and_then(|levels| levels.active),
throttle_info: self.get_throttle_info(),
}
}
#[cfg(not(feature = "libdrm_amdgpu_sys"))]
fn get_throttle_info(&self) -> Option<BTreeMap<String, Vec<String>>> {
None
}
#[cfg(feature = "libdrm_amdgpu_sys")]
fn get_throttle_info(&self) -> Option<BTreeMap<String, Vec<String>>> {
use libdrm_amdgpu_sys::AMDGPU::ThrottlerType;
self.drm_handle
.as_ref()
.and_then(|drm_handle| drm_handle.get_gpu_metrics().ok())
.and_then(|metrics| metrics.get_throttle_status_info())
.map(|throttle| {
let mut result: BTreeMap<String, Vec<String>> = BTreeMap::new();
for bit in throttle.get_all_throttler() {
let throttle_type = ThrottlerType::from(bit);
result
.entry(throttle_type.to_string())
.or_default()
.push(bit.to_string());
}
result
})
}
pub fn get_clocks_info(&self) -> anyhow::Result<ClocksInfo> {
let clocks_table = self
.handle
.get_clocks_table()
.context("Clocks table not available")?;
Ok(clocks_table.into())
}
fn hw_mon_and_then<U>(&self, f: fn(&HwMon) -> Result<U, Error>) -> Option<U> {
self.handle.hw_monitors.first().and_then(|mon| f(mon).ok())
}
fn hw_mon_map<U>(&self, f: fn(&HwMon) -> U) -> Option<U> {
self.handle.hw_monitors.first().map(f)
}
async fn set_static_fan_control(&self, static_speed: f64) -> anyhow::Result<()> {
// Stop existing task to set static speed
self.stop_fan_control(false).await?;
// Use PMFW curve functionality for static speed when it is available
if let Ok(current_curve) = self.handle.get_fan_curve() {
let allowed_ranges = current_curve.allowed_ranges.clone().ok_or_else(|| {
anyhow!("The GPU does not allow setting custom fan values (is overdrive enabled?)")
})?;
let min_temperature = allowed_ranges.temperature_range.start();
let max_temperature = allowed_ranges.temperature_range.end();
#[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
let custom_pwm = (f64::from(*allowed_ranges.speed_range.end()) * static_speed) as u8;
let static_pwm = cmp::max(*allowed_ranges.speed_range.start(), custom_pwm);
let mut points = vec![(*min_temperature, static_pwm)];
for _ in 1..current_curve.points.len() {
points.push((*max_temperature, static_pwm));
}
let new_curve = PmfwCurve {
points: points.into_boxed_slice(),
allowed_ranges: Some(allowed_ranges),
};
if new_curve == current_curve {
debug!("fan curve unchanged");
} else {
debug!("setting static curve {new_curve:?}");
self.handle
.set_fan_curve(&new_curve)
.context("Could not set fan curve")?;
}
Ok(())
} else {
let hw_mon = self
.handle
.hw_monitors
.first()
.cloned()
.context("This GPU has no monitor")?;
hw_mon
.set_fan_control_method(FanControlMethod::Manual)
.context("Could not set fan control method")?;
#[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
let static_pwm = (f64::from(u8::MAX) * static_speed) as u8;
hw_mon
.set_fan_pwm(static_pwm)
.context("could not set fan speed")?;
debug!("set fan speed to {}", static_speed);
Ok(())
}
}
async fn start_curve_fan_control(
&self,
curve: FanCurve,
settings: FanControlSettings,
) -> anyhow::Result<()> {
// Use the PMFW curve functionality when it is available
// Otherwise, fall back to manual fan control via a task
match self.handle.get_fan_curve() {
Ok(current_curve) => {
let new_curve = curve
.into_pmfw_curve(current_curve.clone())
.context("Invalid fan curve")?;
if new_curve == current_curve {
debug!("fan curve unchanged");
} else {
debug!("setting pmfw curve {new_curve:?}");
self.handle
.set_fan_curve(&new_curve)
.context("Could not set fan curve")?;
}
Ok(())
}
Err(_) => self.start_curve_fan_control_task(curve, settings).await,
}
}
async fn start_curve_fan_control_task(
&self,
curve: FanCurve,
settings: FanControlSettings,
) -> anyhow::Result<()> {
// Stop existing task to re-apply new curve
self.stop_fan_control(false).await?;
let hw_mon = self
.handle
.hw_monitors
.first()
.cloned()
.context("This GPU has no monitor")?;
hw_mon
.set_fan_control_method(FanControlMethod::Manual)
.context("Could not set fan control method")?;
let mut notify_guard = self
.fan_control_handle
.try_borrow_mut()
.map_err(|err| anyhow!("Lock error: {err}"))?;
let notify = Rc::new(Notify::new());
let task_notify = notify.clone();
debug!("spawning new fan control task");
let handle = tokio::task::spawn_local(async move {
let mut last_pwm = (None, Instant::now());
let mut last_temp = 0.0;
// If the fan speed could was able to be set at least once
let mut control_available = false;
let temp_key = settings.temperature_key.clone();
let interval = Duration::from_millis(settings.interval_ms);
let spindown_delay = Duration::from_millis(settings.spindown_delay_ms.unwrap_or(0));
#[allow(clippy::cast_precision_loss)]
let change_threshold = settings.change_threshold.unwrap_or(0) as f32;
loop {
select! {
() = sleep(interval) => (),
() = task_notify.notified() => break,
}
let mut temps = hw_mon.get_temps();
let temp = temps
.remove(&temp_key)
.expect("Could not get temperature by given key");
let current_temp = temp.current.expect("Missing temp");
if (last_temp - current_temp).abs() < change_threshold {
trace!("temperature changed from {last_temp}°C to {current_temp}°C, which is less than the {change_threshold}°C threshold, skipping speed adjustment");
continue;
}
let target_pwm = curve.pwm_at_temp(temp);
let now = Instant::now();
if let (Some(previous_pwm), previous_timestamp) = last_pwm {
let diff = now - previous_timestamp;
if target_pwm < previous_pwm && diff < spindown_delay {
trace!(
"delaying fan spindown ({}ms left)",
(spindown_delay - diff).as_millis()
);
continue;
}
}
last_pwm = (Some(target_pwm), now);
last_temp = current_temp;
trace!("fan control tick: setting pwm to {target_pwm}");
match hw_mon.set_fan_pwm(target_pwm) {
Ok(()) => control_available = true,
Err(err) => {
error!("could not set fan speed: {err}");
if control_available {
info!("fan control was previously available, assuming the error is temporary");
} else {
info!("disabling fan control");
break;
}
}
}
}
debug!("exited fan control task");
});
*notify_guard = Some((notify, handle));
debug!(
"started fan control with interval {}ms",
settings.interval_ms
);
Ok(())
}
async fn stop_fan_control(&self, reset_mode: bool) -> anyhow::Result<()> {
let maybe_notify = self
.fan_control_handle
.try_borrow_mut()
.map_err(|err| anyhow!("Lock error: {err}"))?
.take();
if let Some((notify, handle)) = maybe_notify {
notify.notify_one();
handle.await?;
}
if reset_mode {
if self.handle.get_fan_curve().is_ok() {
if let Err(err) = self.handle.reset_fan_curve() {
warn!("could not reset fan curve: {err:#}");
}
}
if let Some(hw_mon) = self.handle.hw_monitors.first().cloned() {
if let Ok(current_control) = hw_mon.get_fan_control_method() {
if !matches!(current_control, FanControlMethod::Auto) {
hw_mon
.set_fan_control_method(FanControlMethod::Auto)
.context("Could not set fan control back to automatic")?;
}
}
}
}
Ok(())
}
pub fn get_power_states(&self, gpu_config: Option<&config::Gpu>) -> PowerStates {
let core = self.get_power_states_kind(gpu_config, PowerLevelKind::CoreClock);
let vram = self.get_power_states_kind(gpu_config, PowerLevelKind::MemoryClock);
PowerStates { core, vram }
}
fn get_power_states_kind<T>(
&self,
gpu_config: Option<&config::Gpu>,
kind: PowerLevelKind,
) -> Vec<PowerState<T>>
where
T: FromStr,
<T as std::str::FromStr>::Err: std::fmt::Display,
{
let enabled_states = gpu_config.and_then(|gpu| gpu.power_states.get(&kind));
let levels = self
.handle
.get_clock_levels::<T>(kind)
.unwrap_or_else(|_| PowerLevels {
levels: Vec::new(),
active: None,
})
.levels;
levels
.into_iter()
.enumerate()
.map(|(i, value)| {
let i = u8::try_from(i).unwrap();
let enabled = enabled_states.map_or(true, |enabled| enabled.contains(&i));
PowerState { enabled, value }
})
.collect()
}
pub fn reset_pmfw_settings(&self) {
let handle = &self.handle;
if self.handle.get_fan_target_temperature().is_ok() {
if let Err(err) = handle.reset_fan_target_temperature() {
warn!("Could not reset target temperature: {err:#}");
}
}
if self.handle.get_fan_acoustic_target().is_ok() {
if let Err(err) = handle.reset_fan_acoustic_target() {
warn!("Could not reset acoustic target: {err:#}");
}
}
if self.handle.get_fan_acoustic_limit().is_ok() {
if let Err(err) = handle.reset_fan_acoustic_limit() {
warn!("Could not reset acoustic limit: {err:#}");
}
}
if self.handle.get_fan_minimum_pwm().is_ok() {
if let Err(err) = handle.reset_fan_minimum_pwm() {
warn!("Could not reset minimum pwm: {err:#}");
}
}
}
pub fn vbios_dump(&self) -> anyhow::Result<Vec<u8>> {
let debugfs = self.debugfs_path().context("DebugFS not found")?;
fs::read(debugfs.join("amdgpu_vbios")).context("Could not read VBIOS file")
}
fn debugfs_path(&self) -> Option<PathBuf> {
let slot_id = self.handle.get_pci_slot_name()?;
let name_search_term = format!("dev={slot_id}");
for entry in fs::read_dir("/sys/kernel/debug/dri").ok()?.flatten() {
let debugfs_path = entry.path();
let name_file_path = debugfs_path.join("name");
if name_file_path.exists() {
if let Ok(contents) = fs::read_to_string(&name_file_path) {
if contents.contains(&name_search_term) {
return Some(debugfs_path);
}
}
}
}
None
}
#[allow(clippy::too_many_lines)]
pub async fn apply_config(&self, config: &config::Gpu) -> anyhow::Result<()> {
if let Some(cap) = config.power_cap {
let hw_mon = self.first_hw_mon()?;
let current_usage = hw_mon
.get_power_input()
.or_else(|_| hw_mon.get_power_average())
.context("Could not get current power usage")?;
// When applying a power limit that's lower than the current power consumption,
// try to downclock the GPU first by forcing it into the lowest performance level.
// Workaround for behaviour described in https://github.com/ilya-zlobintsev/LACT/issues/207
let mut original_performance_level = None;
if current_usage > cap {
if let Ok(performance_level) = self.handle.get_power_force_performance_level() {
if self
.handle
.set_power_force_performance_level(PerformanceLevel::Low)
.is_ok()
{
debug!(
"waiting for the GPU to clock down before applying a new power limit"
);
match timeout(
Duration::from_secs(GPU_CLOCKDOWN_TIMEOUT_SECS),
wait_until_lowest_clock_level(&self.handle),
)
.await
{
Ok(()) => {
debug!("GPU clocked down successfully");
}
Err(_) => {
warn!("GPU did not clock down after {GPU_CLOCKDOWN_TIMEOUT_SECS}");
}
}
original_performance_level = Some(performance_level);
}
}
}
// Due to possible driver bug, RX 7900 XTX really doesn't like when we set the same value again.
// But, also in general we want to avoid setting same value twice
if Ok(cap) != hw_mon.get_power_cap() {
hw_mon
.set_power_cap(cap)
.with_context(|| format!("Failed to set power cap: {cap}"))?;
}
// Reapply old power level
if let Some(level) = original_performance_level {
self.handle
.set_power_force_performance_level(level)
.context("Could not reapply original performance level")?;
}
} else if let Ok(hw_mon) = self.first_hw_mon() {
if let Ok(default_cap) = hw_mon.get_power_cap_default() {
// Due to possible driver bug, RX 7900 XTX really doesn't like when we set the same value again.
// But, also in general we want to avoid setting same value twice
if Ok(default_cap) != hw_mon.get_power_cap() {
hw_mon.set_power_cap(default_cap).with_context(|| {
format!("Failed to set power cap to default cap: {default_cap}")
})?;
}
}
}
// Reset the clocks table in case the settings get reverted back to not having a clocks value configured
self.handle.reset_clocks_table().ok();
// Reset performance level to work around some GPU quirks (found to be an issue on RDNA2)
self.handle
.set_power_force_performance_level(PerformanceLevel::Auto)
.ok();
if config.is_core_clocks_used() {
let mut table = self
.handle
.get_clocks_table()
.context("Failed to get clocks table")?;
config
.clocks_configuration
.apply_to_table(&mut table)
.context("Failed to apply clocks configuration to table")?;
debug!(
"writing clocks commands: {:#?}",
table
.get_commands()
.context("Failed to get table commands")?
);
self.handle
.set_clocks_table(&table)
.context("Could not write clocks table")
.with_context(|| format!("Clocks table commands: {:?}", table.get_commands()))?;
}
if let Some(level) = config.performance_level {
self.handle
.set_power_force_performance_level(level)
.context("Failed to set power performance level")?;
}
// Else is not needed, it was previously reset to auto already
if let Some(mode_index) = config.power_profile_mode_index {
if config.performance_level != Some(PerformanceLevel::Manual) {
return Err(anyhow!(
"Performance level has to be set to `manual` to use power profile modes"
));
}
self.handle
.set_active_power_profile_mode(mode_index)
.context("Failed to set active power profile mode")?;
}
for (kind, states) in &config.power_states {
if config.performance_level != Some(PerformanceLevel::Manual) {
return Err(anyhow!(
"Performance level has to be set to `manual` to configure power states"
));
}
self.handle
.set_enabled_power_levels(*kind, states)
.with_context(|| format!("Could not set {kind:?} power states"))?;
}
if config.fan_control_enabled {
if let Some(ref settings) = config.fan_control_settings {
match settings.mode {
lact_schema::FanControlMode::Static => {
self.set_static_fan_control(settings.static_speed)
.await
.context("Failed to set static fan control")?;
}
lact_schema::FanControlMode::Curve => {
if settings.curve.0.is_empty() {
return Err(anyhow!("Cannot use empty fan curve"));
}
self.start_curve_fan_control(settings.curve.clone(), settings.clone())
.await
.context("Failed to set curve fan control")?;
}
}
} else {
return Err(anyhow!(
"Trying to enable fan control with no settings provided"
));
}
} else {
self.stop_fan_control(true)
.await
.context("Failed to stop fan control")?;
let pmfw = &config.pmfw_options;
if let Some(acoustic_limit) = pmfw.acoustic_limit {
if self
.handle
.get_fan_acoustic_limit()
.context("Could not get acoustic limit")?
.current
!= acoustic_limit
{
self.handle
.set_fan_acoustic_limit(acoustic_limit)
.context("Could not set acoustic limit")?;
}
}
if let Some(acoustic_target) = pmfw.acoustic_target {
if self
.handle
.get_fan_acoustic_target()
.context("Could not get acoustic target")?
.current
!= acoustic_target
{
self.handle
.set_fan_acoustic_target(acoustic_target)
.context("Could not set acoustic target")?;
}
}
if let Some(target_temperature) = pmfw.target_temperature {
if self
.handle
.get_fan_target_temperature()
.context("Could not get target temperature")?
.current
!= target_temperature
{
self.handle
.set_fan_target_temperature(target_temperature)
.context("Could not set target temperature")?;
}
}
if let Some(minimum_pwm) = pmfw.minimum_pwm {
if self
.handle
.get_fan_minimum_pwm()
.context("Could not get minimum pwm")?
.current
!= minimum_pwm
{
self.handle
.set_fan_minimum_pwm(minimum_pwm)
.context("Could not set minimum pwm")?;
}
}
}
Ok(())
}
}
#[cfg(feature = "libdrm_amdgpu_sys")]
fn get_drm_handle(handle: &GpuHandle) -> anyhow::Result<DrmHandle> {
let slot_name = handle
.get_pci_slot_name()
.context("Device has no PCI slot name")?;
let path = format!("/dev/dri/by-path/pci-{slot_name}-render");
let drm_file = OpenOptions::new()
.read(true)
.write(true)
.open(&path)
.with_context(|| format!("Could not open drm file at {path}"))?;
let (handle, _, _) = DrmHandle::init(drm_file.into_raw_fd())
.map_err(|err| anyhow!("Could not open drm handle, error code {err}"))?;
Ok(handle)
}
impl ClocksConfiguration {
fn apply_to_table(&self, table: &mut ClocksTableGen) -> anyhow::Result<()> {
if let ClocksTableGen::Vega20(ref mut table) = table {
// Avoid writing settings to the clocks table except the user-specified ones
// There is an issue on some GPU models where the default values are actually outside of the allowed range
// See https://github.com/sibradzic/amdgpu-clocks/issues/32#issuecomment-829953519 (part 2) for an example
if table.vddc_curve.is_empty() {
table.clear();
}
// Normalize the VDDC curve - make sure all of the values are within the allowed range
table.normalize_vddc_curve();
match self.voltage_offset {
Some(offset) => table.set_voltage_offset(offset)?,
None => table.voltage_offset = None,
}
}
if let Some(min_clockspeed) = self.min_core_clock {
table.set_min_sclk(min_clockspeed)?;
}
if let Some(min_clockspeed) = self.min_memory_clock {
table.set_min_mclk(min_clockspeed)?;
}
if let Some(min_voltage) = self.min_voltage {
table.set_min_voltage(min_voltage)?;
}
if let Some(clockspeed) = self.max_core_clock {
table.set_max_sclk(clockspeed)?;
}
if let Some(clockspeed) = self.max_memory_clock {
table.set_max_mclk(clockspeed)?;
}
if let Some(voltage) = self.max_voltage {
table.set_max_voltage(voltage)?;
}
Ok(())
}
}
async fn wait_until_lowest_clock_level(handle: &GpuHandle) {
loop {
match handle.get_core_clock_levels() {
Ok(levels) => {
if levels.active == Some(0) {
break;
}
sleep(Duration::from_millis(250)).await;
}
Err(err) => {
warn!("could not get core clock levels: {err}");
break;
}
}
}
}