mirror of
https://github.com/ilya-zlobintsev/LACT.git
synced 2025-02-25 18:55:26 -06:00
feat: config applying and saving rework
This commit is contained in:
parent
860178791c
commit
a83c69466d
@ -1,5 +1,6 @@
|
||||
use crate::server::gpu_controller::fan_control::FanCurve;
|
||||
use anyhow::Context;
|
||||
use lact_schema::PerformanceLevel;
|
||||
use nix::unistd::getuid;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{collections::HashMap, env, fs, path::PathBuf};
|
||||
@ -29,8 +30,12 @@ impl Default for DaemonConfig {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
|
||||
pub struct GpuConfig {
|
||||
pub fan_control_enabled: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub fan_control_settings: Option<FanControlSettings>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub power_cap: Option<f64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub performance_level: Option<PerformanceLevel>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
|
@ -2,21 +2,22 @@ pub mod fan_control;
|
||||
|
||||
use self::fan_control::FanCurve;
|
||||
use super::vulkan::get_vulkan_info;
|
||||
use crate::fork::run_forked;
|
||||
use crate::{config::GpuConfig, fork::run_forked};
|
||||
use amdgpu_sysfs::{
|
||||
error::Error,
|
||||
gpu_handle::GpuHandle,
|
||||
hw_mon::{FanControlMethod, HwMon},
|
||||
sysfs::SysFS,
|
||||
};
|
||||
use anyhow::{anyhow, Context};
|
||||
use lact_schema::{
|
||||
ClocksInfo, ClockspeedStats, DeviceInfo, DeviceStats, FanStats, GpuPciInfo, LinkInfo, PciInfo,
|
||||
PowerStats, VoltageStats, VramStats,
|
||||
PerformanceLevel, PowerStats, VoltageStats, VramStats,
|
||||
};
|
||||
use pciid_parser::Database;
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
path::PathBuf,
|
||||
path::{Path, PathBuf},
|
||||
sync::{Arc, Mutex},
|
||||
time::Duration,
|
||||
};
|
||||
@ -26,7 +27,7 @@ use tracing::{debug, error, info, trace, warn};
|
||||
type FanControlHandle = (Arc<Notify>, JoinHandle<()>, FanCurve);
|
||||
|
||||
pub struct GpuController {
|
||||
pub handle: GpuHandle,
|
||||
handle: GpuHandle,
|
||||
pub pci_info: Option<GpuPciInfo>,
|
||||
pub fan_control_handle: Mutex<Option<FanControlHandle>>,
|
||||
}
|
||||
@ -92,6 +93,33 @@ impl GpuController {
|
||||
})
|
||||
}
|
||||
|
||||
/// Builds the identifier string for this GPU.
///
/// The id is assembled from the two parts of the PCI id, the two parts of the PCI subsystem
/// id and the PCI slot name, in that order.
///
/// # Errors
/// Returns an error if the handle cannot provide the PCI id, the subsystem id or the slot
/// name.
pub fn get_id(&self) -> anyhow::Result<String> {
    let gpu = &self.handle;

    let ids = gpu.get_pci_id().context("Device has no vendor id")?;
    let subsys_ids = gpu
        .get_pci_subsys_id()
        .context("Device has no subsys id")?;
    let slot = gpu
        .get_pci_slot_name()
        .context("Device has no pci slot")?;

    let id = format!(
        "{}:{}-{}:{}-{}",
        ids.0, ids.1, subsys_ids.0, subsys_ids.1, slot
    );
    Ok(id)
}
|
||||
|
||||
pub fn get_path(&self) -> &Path {
|
||||
self.handle.get_path()
|
||||
}
|
||||
|
||||
fn first_hw_mon(&self) -> anyhow::Result<&HwMon> {
|
||||
self.handle
|
||||
.hw_monitors
|
||||
.first()
|
||||
.context("GPU has no hardware monitor")
|
||||
}
|
||||
|
||||
pub fn get_info(&self) -> DeviceInfo {
|
||||
let vulkan_info = self.pci_info.as_ref().and_then(|pci_info| {
|
||||
match get_vulkan_info(
|
||||
@ -188,7 +216,7 @@ impl GpuController {
|
||||
self.handle.hw_monitors.first().map(f)
|
||||
}
|
||||
|
||||
pub async fn start_fan_control(
|
||||
async fn start_fan_control(
|
||||
&self,
|
||||
curve: FanCurve,
|
||||
temp_key: String,
|
||||
@ -249,7 +277,7 @@ impl GpuController {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn stop_fan_control(&self, reset_mode: bool) -> anyhow::Result<()> {
|
||||
async fn stop_fan_control(&self, reset_mode: bool) -> anyhow::Result<()> {
|
||||
let maybe_notify = self
|
||||
.fan_control_handle
|
||||
.lock()
|
||||
@ -258,19 +286,58 @@ impl GpuController {
|
||||
if let Some((notify, handle, _)) = maybe_notify {
|
||||
notify.notify_one();
|
||||
handle.await?;
|
||||
|
||||
if reset_mode {
|
||||
let hw_mon = self
|
||||
.handle
|
||||
.hw_monitors
|
||||
.first()
|
||||
.cloned()
|
||||
.context("This GPU has no monitor")?;
|
||||
hw_mon
|
||||
.set_fan_control_method(FanControlMethod::Auto)
|
||||
.context("Could not set fan control back to automatic")?;
|
||||
}
|
||||
}
|
||||
|
||||
if reset_mode {
|
||||
let hw_mon = self
|
||||
.handle
|
||||
.hw_monitors
|
||||
.first()
|
||||
.cloned()
|
||||
.context("This GPU has no monitor")?;
|
||||
hw_mon
|
||||
.set_fan_control_method(FanControlMethod::Auto)
|
||||
.context("Could not set fan control back to automatic")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn apply_config(&self, config: &GpuConfig) -> anyhow::Result<()> {
|
||||
if config.fan_control_enabled {
|
||||
if let Some(ref settings) = config.fan_control_settings {
|
||||
let interval = Duration::from_millis(settings.interval_ms);
|
||||
self.start_fan_control(
|
||||
settings.curve.clone(),
|
||||
settings.temperature_key.clone(),
|
||||
interval,
|
||||
)
|
||||
.await?;
|
||||
} else {
|
||||
return Err(anyhow!(
|
||||
"Trying to enable fan control with no settings provided"
|
||||
));
|
||||
}
|
||||
} else {
|
||||
self.stop_fan_control(true).await?;
|
||||
}
|
||||
|
||||
if let Some(cap) = config.power_cap {
|
||||
let hw_mon = self.first_hw_mon()?;
|
||||
hw_mon.set_power_cap(cap)?;
|
||||
} else if let Ok(hw_mon) = self.first_hw_mon() {
|
||||
if let Ok(default_cap) = hw_mon.get_power_cap_default() {
|
||||
hw_mon.set_power_cap(default_cap)?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(level) = config.performance_level {
|
||||
self.handle.set_power_force_performance_level(level)?;
|
||||
} else if self.handle.get_power_force_performance_level().is_ok() {
|
||||
self.handle
|
||||
.set_power_force_performance_level(PerformanceLevel::Auto)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
use super::gpu_controller::{fan_control::FanCurve, GpuController};
|
||||
use crate::config::{Config, FanControlSettings, GpuConfig};
|
||||
use amdgpu_sysfs::{hw_mon::HwMon, sysfs::SysFS};
|
||||
use anyhow::{anyhow, Context};
|
||||
use lact_schema::{
|
||||
ClocksInfo, DeviceInfo, DeviceListEntry, DeviceStats, FanCurveMap, PerformanceLevel,
|
||||
@ -9,9 +8,8 @@ use std::{
|
||||
collections::HashMap,
|
||||
path::PathBuf,
|
||||
sync::{Arc, RwLock},
|
||||
time::Duration,
|
||||
};
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Handler {
|
||||
@ -39,29 +37,14 @@ impl<'a> Handler {
|
||||
trace!("trying gpu controller at {:?}", entry.path());
|
||||
let device_path = entry.path().join("device");
|
||||
match GpuController::new_from_path(device_path) {
|
||||
Ok(controller) => {
|
||||
let handle = &controller.handle;
|
||||
let pci_id = handle.get_pci_id().context("Device has no vendor id")?;
|
||||
let pci_subsys_id = handle
|
||||
.get_pci_subsys_id()
|
||||
.context("Device has no subsys id")?;
|
||||
let pci_slot_name = handle
|
||||
.get_pci_slot_name()
|
||||
.context("Device has no pci slot")?;
|
||||
|
||||
let id = format!(
|
||||
"{}:{}-{}:{}-{}",
|
||||
pci_id.0, pci_id.1, pci_subsys_id.0, pci_subsys_id.1, pci_slot_name
|
||||
);
|
||||
|
||||
debug!(
|
||||
"initialized GPU controller {} for path {:?}",
|
||||
id,
|
||||
handle.get_path()
|
||||
);
|
||||
|
||||
controllers.insert(id, controller);
|
||||
}
|
||||
Ok(controller) => match controller.get_id() {
|
||||
Ok(id) => {
|
||||
let path = controller.get_path();
|
||||
debug!("initialized GPU controller {id} for path {path:?}",);
|
||||
controllers.insert(id, controller);
|
||||
}
|
||||
Err(err) => warn!("could not initialize controller: {err:#}"),
|
||||
},
|
||||
Err(error) => {
|
||||
warn!(
|
||||
"failed to initialize controller at {:?}, {error}",
|
||||
@ -74,29 +57,7 @@ impl<'a> Handler {
|
||||
|
||||
for (id, gpu_config) in &config.gpus {
|
||||
if let Some(controller) = controllers.get(id) {
|
||||
if gpu_config.fan_control_enabled {
|
||||
let settings = gpu_config.fan_control_settings.as_ref().context(
|
||||
"Fan control is enabled but no settings are defined (invalid config?)",
|
||||
)?;
|
||||
let interval = Duration::from_millis(settings.interval_ms);
|
||||
controller
|
||||
.start_fan_control(
|
||||
settings.curve.clone(),
|
||||
settings.temperature_key.clone(),
|
||||
interval,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
if let Some(power_cap) = gpu_config.power_cap {
|
||||
controller
|
||||
.handle
|
||||
.hw_monitors
|
||||
.first()
|
||||
.context("GPU has power cap defined but has no hardware monitor")?
|
||||
.set_power_cap(power_cap)
|
||||
.context("Could not set power cap")?;
|
||||
}
|
||||
controller.apply_config(gpu_config).await?;
|
||||
} else {
|
||||
info!("could not find GPU with id {id} defined in configuration");
|
||||
}
|
||||
@ -108,18 +69,40 @@ impl<'a> Handler {
|
||||
})
|
||||
}
|
||||
|
||||
fn edit_config<F: FnOnce(&mut Config)>(&self, f: F) -> anyhow::Result<()> {
|
||||
let mut config_guard = self.config.write().map_err(|err| anyhow!("{err}"))?;
|
||||
f(&mut config_guard);
|
||||
config_guard.save()?;
|
||||
Ok(())
|
||||
}
|
||||
async fn edit_gpu_config<F: FnOnce(&mut GpuConfig)>(
|
||||
&self,
|
||||
id: String,
|
||||
f: F,
|
||||
) -> anyhow::Result<()> {
|
||||
let current_config = self
|
||||
.config
|
||||
.read()
|
||||
.map_err(|err| anyhow!("{err}"))?
|
||||
.gpus
|
||||
.get(&id)
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
fn edit_gpu_config<F: FnOnce(&mut GpuConfig)>(&self, id: String, f: F) -> anyhow::Result<()> {
|
||||
self.edit_config(|config| {
|
||||
let gpu_config = config.gpus.entry(id).or_default();
|
||||
f(gpu_config);
|
||||
})
|
||||
let mut new_config = current_config.clone();
|
||||
f(&mut new_config);
|
||||
|
||||
let controller = self.controller_by_id(&id)?;
|
||||
|
||||
match controller.apply_config(&new_config).await {
|
||||
Ok(()) => {
|
||||
let mut config_guard = self.config.write().unwrap();
|
||||
config_guard.gpus.insert(id, new_config);
|
||||
config_guard.save()?;
|
||||
Ok(())
|
||||
}
|
||||
Err(apply_err) => {
|
||||
error!("Could not apply settings: {apply_err:#}");
|
||||
match controller.apply_config(¤t_config).await {
|
||||
Ok(()) => Err(apply_err.context("Could not apply settings")),
|
||||
Err(err) => Err(anyhow!("Could not apply settings, and could not reset to default settings: {err:#}")),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn controller_by_id(&self, id: &str) -> anyhow::Result<&GpuController> {
|
||||
@ -130,14 +113,6 @@ impl<'a> Handler {
|
||||
.context("No controller with such id")?)
|
||||
}
|
||||
|
||||
fn hw_mon_by_id(&self, id: &str) -> anyhow::Result<&HwMon> {
|
||||
self.controller_by_id(id)?
|
||||
.handle
|
||||
.hw_monitors
|
||||
.first()
|
||||
.context("GPU has no hardware monitor")
|
||||
}
|
||||
|
||||
pub fn list_devices(&'a self) -> Vec<DeviceListEntry<'a>> {
|
||||
self.gpu_controllers
|
||||
.iter()
|
||||
@ -188,18 +163,8 @@ impl<'a> Handler {
|
||||
}
|
||||
}
|
||||
};
|
||||
let interval = Duration::from_millis(settings.interval_ms);
|
||||
|
||||
self.controller_by_id(id)?
|
||||
.start_fan_control(
|
||||
settings.curve.clone(),
|
||||
settings.temperature_key.clone(),
|
||||
interval,
|
||||
)
|
||||
.await?;
|
||||
Some(settings)
|
||||
} else {
|
||||
self.controller_by_id(id)?.stop_fan_control(true).await?;
|
||||
None
|
||||
};
|
||||
|
||||
@ -207,51 +172,31 @@ impl<'a> Handler {
|
||||
config.fan_control_enabled = enabled;
|
||||
config.fan_control_settings = settings
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn set_power_cap(&'a self, id: &str, maybe_cap: Option<f64>) -> anyhow::Result<()> {
|
||||
let hw_mon = self.hw_mon_by_id(id)?;
|
||||
|
||||
let cap = match maybe_cap {
|
||||
Some(cap) => cap,
|
||||
None => hw_mon.get_power_cap_default()?,
|
||||
};
|
||||
hw_mon.set_power_cap(cap)?;
|
||||
|
||||
pub async fn set_power_cap(&'a self, id: &str, maybe_cap: Option<f64>) -> anyhow::Result<()> {
|
||||
self.edit_gpu_config(id.to_owned(), |gpu_config| {
|
||||
gpu_config.power_cap = maybe_cap;
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn set_performance_level(&self, id: &str, level: PerformanceLevel) -> anyhow::Result<()> {
|
||||
self.controller_by_id(id)?
|
||||
.handle
|
||||
.set_power_force_performance_level(level)
|
||||
.context("Could not set performance level")
|
||||
pub async fn set_performance_level(
|
||||
&self,
|
||||
id: &str,
|
||||
level: PerformanceLevel,
|
||||
) -> anyhow::Result<()> {
|
||||
self.edit_gpu_config(id.to_owned(), |gpu_config| {
|
||||
gpu_config.performance_level = Some(level);
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn cleanup(self) {
|
||||
let config = self.config.read().unwrap().clone();
|
||||
for (id, gpu_config) in config.gpus {
|
||||
if let Ok(controller) = self.controller_by_id(&id) {
|
||||
if gpu_config.fan_control_enabled {
|
||||
debug!("stopping fan control");
|
||||
controller
|
||||
.stop_fan_control(true)
|
||||
.await
|
||||
.expect("Could not stop fan control");
|
||||
}
|
||||
|
||||
if let (Some(_), Some(hw_mon)) =
|
||||
(gpu_config.power_cap, controller.handle.hw_monitors.first())
|
||||
{
|
||||
if let Ok(default_cap) = hw_mon.get_power_cap_default() {
|
||||
debug!("setting power limit to default");
|
||||
hw_mon
|
||||
.set_power_cap(default_cap)
|
||||
.expect("Could not set power cap to default");
|
||||
}
|
||||
}
|
||||
for (id, controller) in self.gpu_controllers.iter() {
|
||||
if let Err(err) = controller.apply_config(&GpuConfig::default()).await {
|
||||
error!("Could not reset settings for controller {id}: {err:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -87,11 +87,11 @@ async fn handle_request<'a>(request: Request<'a>, handler: &'a Handler) -> anyho
|
||||
Request::SetFanControl { id, enabled, curve } => {
|
||||
ok_response(handler.set_fan_control(id, enabled, curve).await?)
|
||||
}
|
||||
Request::SetPowerCap { id, cap } => ok_response(handler.set_power_cap(id, cap)?),
|
||||
Request::SetPowerCap { id, cap } => ok_response(handler.set_power_cap(id, cap).await?),
|
||||
Request::SetPerformanceLevel {
|
||||
id,
|
||||
performance_level,
|
||||
} => ok_response(handler.set_performance_level(id, performance_level)?),
|
||||
} => ok_response(handler.set_performance_level(id, performance_level).await?),
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user