feat: config applying and saving rework

This commit is contained in:
Ilya Zlobintsev 2023-01-22 18:08:01 +02:00
parent 860178791c
commit a83c69466d
4 changed files with 148 additions and 131 deletions

View File

@ -1,5 +1,6 @@
use crate::server::gpu_controller::fan_control::FanCurve;
use anyhow::Context;
use lact_schema::PerformanceLevel;
use nix::unistd::getuid;
use serde::{Deserialize, Serialize};
use std::{collections::HashMap, env, fs, path::PathBuf};
@ -29,8 +30,12 @@ impl Default for DaemonConfig {
/// Per-GPU settings as stored in the daemon configuration.
///
/// `Default` yields fan control disabled and every optional setting unset.
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
pub struct GpuConfig {
/// Whether fan control should be running for this GPU.
pub fan_control_enabled: bool,
/// Fan control parameters; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub fan_control_settings: Option<FanControlSettings>,
/// Requested power cap; left out of the serialized output when `None`.
// NOTE(review): unit is not visible here — presumably whatever `HwMon::set_power_cap` expects; confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub power_cap: Option<f64>,
/// Requested performance level; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub performance_level: Option<PerformanceLevel>,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]

View File

@ -2,21 +2,22 @@ pub mod fan_control;
use self::fan_control::FanCurve;
use super::vulkan::get_vulkan_info;
use crate::fork::run_forked;
use crate::{config::GpuConfig, fork::run_forked};
use amdgpu_sysfs::{
error::Error,
gpu_handle::GpuHandle,
hw_mon::{FanControlMethod, HwMon},
sysfs::SysFS,
};
use anyhow::{anyhow, Context};
use lact_schema::{
ClocksInfo, ClockspeedStats, DeviceInfo, DeviceStats, FanStats, GpuPciInfo, LinkInfo, PciInfo,
PowerStats, VoltageStats, VramStats,
PerformanceLevel, PowerStats, VoltageStats, VramStats,
};
use pciid_parser::Database;
use std::{
borrow::Cow,
path::PathBuf,
path::{Path, PathBuf},
sync::{Arc, Mutex},
time::Duration,
};
@ -26,7 +27,7 @@ use tracing::{debug, error, info, trace, warn};
/// State stored for a fan control task: a `Notify` that is signalled to stop it, the
/// `JoinHandle` that is awaited afterwards, and a `FanCurve`.
type FanControlHandle = (Arc<Notify>, JoinHandle<()>, FanCurve);
/// Controller for a single GPU: the `amdgpu_sysfs` handle, optional PCI information and the
/// slot holding the fan control task state.
pub struct GpuController {
// NOTE(review): the next two lines are the before (`pub`) and after (private) forms of the
// same field as rendered by this commit view; a compilable file contains only one of them.
pub handle: GpuHandle,
handle: GpuHandle,
/// PCI information for the device, when present.
pub pci_info: Option<GpuPciInfo>,
/// Fan control task state behind a mutex; `None` when nothing is stored.
pub fan_control_handle: Mutex<Option<FanControlHandle>>,
}
@ -92,6 +93,33 @@ impl GpuController {
})
}
/// Builds the identifier string of this GPU.
///
/// The id is assembled from the two parts of the PCI id, the two parts of the PCI subsystem
/// id and the PCI slot name, formatted as `a:b-c:d-slot`.
///
/// # Errors
///
/// Fails if the handle cannot provide the PCI id, the subsystem id or the slot name.
pub fn get_id(&self) -> anyhow::Result<String> {
    let gpu = &self.handle;

    let ids = gpu.get_pci_id().context("Device has no vendor id")?;
    let subsys = gpu
        .get_pci_subsys_id()
        .context("Device has no subsys id")?;
    let slot = gpu
        .get_pci_slot_name()
        .context("Device has no pci slot")?;

    let id = format!(
        "{}:{}-{}:{}-{}",
        ids.0, ids.1, subsys.0, subsys.1, slot
    );
    Ok(id)
}
pub fn get_path(&self) -> &Path {
self.handle.get_path()
}
fn first_hw_mon(&self) -> anyhow::Result<&HwMon> {
self.handle
.hw_monitors
.first()
.context("GPU has no hardware monitor")
}
pub fn get_info(&self) -> DeviceInfo {
let vulkan_info = self.pci_info.as_ref().and_then(|pci_info| {
match get_vulkan_info(
@ -188,7 +216,7 @@ impl GpuController {
self.handle.hw_monitors.first().map(f)
}
pub async fn start_fan_control(
async fn start_fan_control(
&self,
curve: FanCurve,
temp_key: String,
@ -249,7 +277,7 @@ impl GpuController {
Ok(())
}
pub async fn stop_fan_control(&self, reset_mode: bool) -> anyhow::Result<()> {
async fn stop_fan_control(&self, reset_mode: bool) -> anyhow::Result<()> {
let maybe_notify = self
.fan_control_handle
.lock()
@ -258,19 +286,58 @@ impl GpuController {
if let Some((notify, handle, _)) = maybe_notify {
notify.notify_one();
handle.await?;
if reset_mode {
let hw_mon = self
.handle
.hw_monitors
.first()
.cloned()
.context("This GPU has no monitor")?;
hw_mon
.set_fan_control_method(FanControlMethod::Auto)
.context("Could not set fan control back to automatic")?;
}
}
if reset_mode {
let hw_mon = self
.handle
.hw_monitors
.first()
.cloned()
.context("This GPU has no monitor")?;
hw_mon
.set_fan_control_method(FanControlMethod::Auto)
.context("Could not set fan control back to automatic")?;
Ok(())
}
/// Applies every setting in `config` to this GPU.
///
/// * Fan control is started with the configured curve, temperature key and interval when
///   `fan_control_enabled` is set, and stopped (with the mode reset) otherwise.
/// * The power cap is set to `power_cap`; when unset, it is reset to the default cap if a
///   hardware monitor exists and reports one.
/// * The performance level is set to `performance_level`; when unset, it is reset to
///   `PerformanceLevel::Auto` if the current level can be read.
///
/// # Errors
///
/// Fails if fan control is enabled without settings, if fan control cannot be started or
/// stopped, if a power cap is requested but the GPU has no hardware monitor, or if writing
/// the power cap or performance level fails. Settings applied before the failing step are
/// not rolled back here.
pub async fn apply_config(&self, config: &GpuConfig) -> anyhow::Result<()> {
    if config.fan_control_enabled {
        let settings = config
            .fan_control_settings
            .as_ref()
            .context("Trying to enable fan control with no settings provided")?;
        let interval = Duration::from_millis(settings.interval_ms);
        self.start_fan_control(
            settings.curve.clone(),
            settings.temperature_key.clone(),
            interval,
        )
        .await?;
    } else {
        self.stop_fan_control(true).await?;
    }

    if let Some(cap) = config.power_cap {
        // An explicit cap requires a hardware monitor, so a missing one is an error here.
        self.first_hw_mon()?
            .set_power_cap(cap)
            .context("Could not set power cap")?;
    } else if let Ok(hw_mon) = self.first_hw_mon() {
        // Resetting is best-effort: skip silently when there is no monitor or no default.
        if let Ok(default_cap) = hw_mon.get_power_cap_default() {
            hw_mon
                .set_power_cap(default_cap)
                .context("Could not reset power cap to default")?;
        }
    }

    if let Some(level) = config.performance_level {
        self.handle
            .set_power_force_performance_level(level)
            .context("Could not set performance level")?;
    } else if self.handle.get_power_force_performance_level().is_ok() {
        // Only reset when the level is readable, i.e. the device appears to support it.
        self.handle
            .set_power_force_performance_level(PerformanceLevel::Auto)
            .context("Could not reset performance level to automatic")?;
    }

    Ok(())
}
}

View File

@ -1,6 +1,5 @@
use super::gpu_controller::{fan_control::FanCurve, GpuController};
use crate::config::{Config, FanControlSettings, GpuConfig};
use amdgpu_sysfs::{hw_mon::HwMon, sysfs::SysFS};
use anyhow::{anyhow, Context};
use lact_schema::{
ClocksInfo, DeviceInfo, DeviceListEntry, DeviceStats, FanCurveMap, PerformanceLevel,
@ -9,9 +8,8 @@ use std::{
collections::HashMap,
path::PathBuf,
sync::{Arc, RwLock},
time::Duration,
};
use tracing::{debug, info, trace, warn};
use tracing::{debug, error, info, trace, warn};
#[derive(Clone)]
pub struct Handler {
@ -39,29 +37,14 @@ impl<'a> Handler {
trace!("trying gpu controller at {:?}", entry.path());
let device_path = entry.path().join("device");
match GpuController::new_from_path(device_path) {
Ok(controller) => {
let handle = &controller.handle;
let pci_id = handle.get_pci_id().context("Device has no vendor id")?;
let pci_subsys_id = handle
.get_pci_subsys_id()
.context("Device has no subsys id")?;
let pci_slot_name = handle
.get_pci_slot_name()
.context("Device has no pci slot")?;
let id = format!(
"{}:{}-{}:{}-{}",
pci_id.0, pci_id.1, pci_subsys_id.0, pci_subsys_id.1, pci_slot_name
);
debug!(
"initialized GPU controller {} for path {:?}",
id,
handle.get_path()
);
controllers.insert(id, controller);
}
Ok(controller) => match controller.get_id() {
Ok(id) => {
let path = controller.get_path();
debug!("initialized GPU controller {id} for path {path:?}",);
controllers.insert(id, controller);
}
Err(err) => warn!("could not initialize controller: {err:#}"),
},
Err(error) => {
warn!(
"failed to initialize controller at {:?}, {error}",
@ -74,29 +57,7 @@ impl<'a> Handler {
for (id, gpu_config) in &config.gpus {
if let Some(controller) = controllers.get(id) {
if gpu_config.fan_control_enabled {
let settings = gpu_config.fan_control_settings.as_ref().context(
"Fan control is enabled but no settings are defined (invalid config?)",
)?;
let interval = Duration::from_millis(settings.interval_ms);
controller
.start_fan_control(
settings.curve.clone(),
settings.temperature_key.clone(),
interval,
)
.await?;
}
if let Some(power_cap) = gpu_config.power_cap {
controller
.handle
.hw_monitors
.first()
.context("GPU has power cap defined but has no hardware monitor")?
.set_power_cap(power_cap)
.context("Could not set power cap")?;
}
controller.apply_config(gpu_config).await?;
} else {
info!("could not find GPU with id {id} defined in configuration");
}
@ -108,18 +69,40 @@ impl<'a> Handler {
})
}
/// Runs `f` on the shared configuration while holding the write lock, then saves it.
///
/// # Errors
///
/// Fails if the lock cannot be acquired or if saving the configuration fails.
fn edit_config<F: FnOnce(&mut Config)>(&self, f: F) -> anyhow::Result<()> {
    match self.config.write() {
        Ok(mut guard) => {
            f(&mut guard);
            guard.save()?;
            Ok(())
        }
        Err(err) => Err(anyhow!("{err}")),
    }
}
/// Changes the configuration of GPU `id`, applies it, and saves it only if applying worked.
///
/// The stored config for `id` (or `GpuConfig::default()` when there is none) is cloned and
/// `f` mutates the clone. The result is passed to the controller's `apply_config`; on
/// success it is inserted into the shared config and saved. On failure the prior config is
/// re-applied and an error is returned.
///
/// # Errors
///
/// Fails if the config lock cannot be read, if no controller exists for `id`, if applying
/// the new config fails, or if saving fails.
async fn edit_gpu_config<F: FnOnce(&mut GpuConfig)>(
&self,
id: String,
f: F,
) -> anyhow::Result<()> {
let current_config = self
.config
.read()
.map_err(|err| anyhow!("{err}"))?
.gpus
.get(&id)
.cloned()
.unwrap_or_default();
// NOTE(review): the next five lines are the pre-change synchronous implementation as
// rendered by this commit view; they are not part of the async body around them.
fn edit_gpu_config<F: FnOnce(&mut GpuConfig)>(&self, id: String, f: F) -> anyhow::Result<()> {
self.edit_config(|config| {
let gpu_config = config.gpus.entry(id).or_default();
f(gpu_config);
})
let mut new_config = current_config.clone();
f(&mut new_config);
let controller = self.controller_by_id(&id)?;
match controller.apply_config(&new_config).await {
Ok(()) => {
// NOTE(review): `.unwrap()` panics on a poisoned lock, while the read above maps the
// same failure to an error — consider `map_err` here as well.
let mut config_guard = self.config.write().unwrap();
config_guard.gpus.insert(id, new_config);
config_guard.save()?;
Ok(())
}
Err(apply_err) => {
error!("Could not apply settings: {apply_err:#}");
// Roll back by re-applying the config that was in effect before `f` ran.
match controller.apply_config(&current_config).await {
Ok(()) => Err(apply_err.context("Could not apply settings")),
// NOTE(review): this error drops `apply_err` (it is only logged above), and "default
// settings" is the prior config, which is the default only when none was stored.
Err(err) => Err(anyhow!("Could not apply settings, and could not reset to default settings: {err:#}")),
}
}
}
}
fn controller_by_id(&self, id: &str) -> anyhow::Result<&GpuController> {
@ -130,14 +113,6 @@ impl<'a> Handler {
.context("No controller with such id")?)
}
/// Looks up the controller for `id` and returns its first hardware monitor.
///
/// # Errors
///
/// Fails if there is no controller for `id` or if that GPU has no hardware monitor.
fn hw_mon_by_id(&self, id: &str) -> anyhow::Result<&HwMon> {
    let controller = self.controller_by_id(id)?;
    let monitors = &controller.handle.hw_monitors;
    monitors.first().context("GPU has no hardware monitor")
}
pub fn list_devices(&'a self) -> Vec<DeviceListEntry<'a>> {
self.gpu_controllers
.iter()
@ -188,18 +163,8 @@ impl<'a> Handler {
}
}
};
let interval = Duration::from_millis(settings.interval_ms);
self.controller_by_id(id)?
.start_fan_control(
settings.curve.clone(),
settings.temperature_key.clone(),
interval,
)
.await?;
Some(settings)
} else {
self.controller_by_id(id)?.stop_fan_control(true).await?;
None
};
@ -207,51 +172,31 @@ impl<'a> Handler {
config.fan_control_enabled = enabled;
config.fan_control_settings = settings
})
.await
}
/// Sets the power cap of GPU `id`; `None` clears the stored cap.
///
/// The value is written into the GPU's config through `edit_gpu_config`.
// NOTE(review): this span interleaves the pre-change synchronous version (first signature
// and the direct `hw_mon` writes) with the post-change async version (second signature and
// the trailing `.await`), as rendered by this commit view.
pub fn set_power_cap(&'a self, id: &str, maybe_cap: Option<f64>) -> anyhow::Result<()> {
let hw_mon = self.hw_mon_by_id(id)?;
// `None` falls back to the default cap reported by the hardware monitor.
let cap = match maybe_cap {
Some(cap) => cap,
None => hw_mon.get_power_cap_default()?,
};
hw_mon.set_power_cap(cap)?;
pub async fn set_power_cap(&'a self, id: &str, maybe_cap: Option<f64>) -> anyhow::Result<()> {
self.edit_gpu_config(id.to_owned(), |gpu_config| {
gpu_config.power_cap = maybe_cap;
})
.await
}
/// Sets the performance level of GPU `id`.
// NOTE(review): this span interleaves the pre-change synchronous version, which wrote the
// level directly through the controller's handle, with the post-change async version, which
// stores `Some(level)` in the GPU's config through `edit_gpu_config`.
pub fn set_performance_level(&self, id: &str, level: PerformanceLevel) -> anyhow::Result<()> {
self.controller_by_id(id)?
.handle
.set_power_force_performance_level(level)
.context("Could not set performance level")
pub async fn set_performance_level(
&self,
id: &str,
level: PerformanceLevel,
) -> anyhow::Result<()> {
self.edit_gpu_config(id.to_owned(), |gpu_config| {
// NOTE(review): the level is always stored as `Some`, so this path never clears it.
gpu_config.performance_level = Some(level);
})
.await
}
/// Consumes the handler and resets GPU settings.
// NOTE(review): this span interleaves two versions as rendered by this commit view. The
// pre-change body walks `config.gpus`, stops fan control and restores the default power cap,
// panicking via `expect` on failure. The post-change body is the final `for` loop, which
// applies `GpuConfig::default()` to every controller and only logs failures.
pub async fn cleanup(self) {
let config = self.config.read().unwrap().clone();
for (id, gpu_config) in config.gpus {
if let Ok(controller) = self.controller_by_id(&id) {
if gpu_config.fan_control_enabled {
debug!("stopping fan control");
controller
.stop_fan_control(true)
.await
.expect("Could not stop fan control");
}
// Only reset the cap when one was configured and a hardware monitor exists.
if let (Some(_), Some(hw_mon)) =
(gpu_config.power_cap, controller.handle.hw_monitors.first())
{
if let Ok(default_cap) = hw_mon.get_power_cap_default() {
debug!("setting power limit to default");
hw_mon
.set_power_cap(default_cap)
.expect("Could not set power cap to default");
}
}
// Applying the default config resets each controller; errors are logged, not propagated.
for (id, controller) in self.gpu_controllers.iter() {
if let Err(err) = controller.apply_config(&GpuConfig::default()).await {
error!("Could not reset settings for controller {id}: {err:#}");
}
}
}

View File

@ -87,11 +87,11 @@ async fn handle_request<'a>(request: Request<'a>, handler: &'a Handler) -> anyho
Request::SetFanControl { id, enabled, curve } => {
ok_response(handler.set_fan_control(id, enabled, curve).await?)
}
Request::SetPowerCap { id, cap } => ok_response(handler.set_power_cap(id, cap)?),
Request::SetPowerCap { id, cap } => ok_response(handler.set_power_cap(id, cap).await?),
Request::SetPerformanceLevel {
id,
performance_level,
} => ok_response(handler.set_performance_level(id, performance_level)?),
} => ok_response(handler.set_performance_level(id, performance_level).await?),
}
}