mirror of
https://github.com/ilya-zlobintsev/LACT.git
synced 2025-02-25 18:55:26 -06:00
refactor: GPU controller initialization
This commit is contained in:
parent
9c100b058d
commit
dfcfa8e10d
@ -1,4 +1,4 @@
|
||||
use super::{fan_control::FanCurve, FanControlHandle, GpuController};
|
||||
use super::{fan_control::FanCurve, CommonControllerInfo, FanControlHandle, GpuController};
|
||||
use crate::{
|
||||
config::{self, ClocksConfiguration, FanControlSettings},
|
||||
server::vulkan::get_vulkan_info,
|
||||
@ -12,22 +12,19 @@ use amdgpu_sysfs::{
|
||||
CommitHandle, GpuHandle, PerformanceLevel, PowerLevelKind, PowerLevels,
|
||||
},
|
||||
hw_mon::{FanControlMethod, HwMon},
|
||||
sysfs::SysFS,
|
||||
};
|
||||
use anyhow::{anyhow, Context};
|
||||
use futures::future::LocalBoxFuture;
|
||||
use lact_schema::{
|
||||
ClocksInfo, ClockspeedStats, DeviceInfo, DeviceStats, DrmInfo, FanStats, GpuPciInfo,
|
||||
IntelDrmInfo, LinkInfo, PciInfo, PmfwInfo, PowerState, PowerStates, PowerStats, VoltageStats,
|
||||
VramStats,
|
||||
ClocksInfo, ClockspeedStats, DeviceInfo, DeviceStats, DrmInfo, FanStats, IntelDrmInfo,
|
||||
LinkInfo, PmfwInfo, PowerState, PowerStates, PowerStats, VoltageStats, VramStats,
|
||||
};
|
||||
use libdrm_amdgpu_sys::AMDGPU::{ThrottleStatus, ThrottlerBit};
|
||||
use pciid_parser::Database;
|
||||
use std::{
|
||||
cell::RefCell,
|
||||
cmp,
|
||||
collections::{HashMap, HashSet},
|
||||
path::{Path, PathBuf},
|
||||
path::PathBuf,
|
||||
rc::Rc,
|
||||
time::Duration,
|
||||
};
|
||||
@ -52,13 +49,13 @@ const STEAM_DECK_IDS: [&str; 2] = ["163F", "1435"];
|
||||
pub struct AmdGpuController {
|
||||
handle: GpuHandle,
|
||||
drm_handle: Option<DrmHandle>,
|
||||
pci_info: Option<GpuPciInfo>,
|
||||
common: CommonControllerInfo,
|
||||
fan_control_handle: RefCell<Option<FanControlHandle>>,
|
||||
}
|
||||
|
||||
impl AmdGpuController {
|
||||
pub fn new_from_path(sysfs_path: PathBuf, pci_db: &Database) -> anyhow::Result<Self> {
|
||||
let handle = GpuHandle::new_from_path(sysfs_path)
|
||||
pub fn new_from_path(common: CommonControllerInfo) -> anyhow::Result<Self> {
|
||||
let handle = GpuHandle::new_from_path(common.sysfs_path.clone())
|
||||
.map_err(|error| anyhow!("failed to initialize gpu handle: {error}"))?;
|
||||
|
||||
#[allow(unused_mut)]
|
||||
@ -75,47 +72,10 @@ impl AmdGpuController {
|
||||
}
|
||||
}
|
||||
|
||||
let mut device_pci_info = None;
|
||||
let mut subsystem_pci_info = None;
|
||||
|
||||
if let Some((vendor_id, model_id)) = handle.get_pci_id() {
|
||||
device_pci_info = Some(PciInfo {
|
||||
vendor_id: vendor_id.to_owned(),
|
||||
vendor: None,
|
||||
model_id: model_id.to_owned(),
|
||||
model: None,
|
||||
});
|
||||
|
||||
if let Some((subsys_vendor_id, subsys_model_id)) = handle.get_pci_subsys_id() {
|
||||
let pci_device_info =
|
||||
pci_db.get_device_info(vendor_id, model_id, subsys_vendor_id, subsys_model_id);
|
||||
|
||||
device_pci_info = Some(PciInfo {
|
||||
vendor_id: vendor_id.to_owned(),
|
||||
vendor: pci_device_info.vendor_name.map(str::to_owned),
|
||||
model_id: model_id.to_owned(),
|
||||
model: pci_device_info.device_name.map(str::to_owned),
|
||||
});
|
||||
subsystem_pci_info = Some(PciInfo {
|
||||
vendor_id: subsys_vendor_id.to_owned(),
|
||||
vendor: pci_device_info.subvendor_name.map(str::to_owned),
|
||||
model_id: subsys_model_id.to_owned(),
|
||||
model: pci_device_info.subdevice_name.map(str::to_owned),
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
let pci_info = device_pci_info.and_then(|device_pci_info| {
|
||||
Some(GpuPciInfo {
|
||||
device_pci_info,
|
||||
subsystem_pci_info: subsystem_pci_info?,
|
||||
})
|
||||
});
|
||||
|
||||
Ok(Self {
|
||||
handle,
|
||||
drm_handle,
|
||||
pci_info,
|
||||
common,
|
||||
fan_control_handle: RefCell::new(None),
|
||||
})
|
||||
}
|
||||
@ -539,56 +499,25 @@ impl AmdGpuController {
|
||||
}
|
||||
|
||||
fn is_steam_deck(&self) -> bool {
|
||||
self.pci_info.as_ref().is_some_and(|info| {
|
||||
info.device_pci_info.vendor_id == VENDOR_AMD
|
||||
&& STEAM_DECK_IDS.contains(&info.device_pci_info.model_id.as_str())
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_driver(&self) -> &str {
|
||||
self.handle.get_driver()
|
||||
self.common.pci_info.device_pci_info.vendor_id == VENDOR_AMD
|
||||
&& STEAM_DECK_IDS.contains(&self.common.pci_info.device_pci_info.model_id.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
impl GpuController for AmdGpuController {
|
||||
fn get_id(&self) -> anyhow::Result<String> {
|
||||
let handle = &self.handle;
|
||||
let pci_id = handle.get_pci_id().context("Device has no vendor id")?;
|
||||
let pci_subsys_id = handle
|
||||
.get_pci_subsys_id()
|
||||
.context("Device has no subsys id")?;
|
||||
let pci_slot_name = handle
|
||||
.get_pci_slot_name()
|
||||
.context("Device has no pci slot")?;
|
||||
|
||||
Ok(format!(
|
||||
"{}:{}-{}:{}-{}",
|
||||
pci_id.0, pci_id.1, pci_subsys_id.0, pci_subsys_id.1, pci_slot_name
|
||||
))
|
||||
}
|
||||
|
||||
fn get_pci_info(&self) -> Option<&GpuPciInfo> {
|
||||
self.pci_info.as_ref()
|
||||
}
|
||||
|
||||
fn get_path(&self) -> &Path {
|
||||
self.handle.get_path()
|
||||
fn controller_info(&self) -> &CommonControllerInfo {
|
||||
&self.common
|
||||
}
|
||||
|
||||
fn get_info(&self) -> DeviceInfo {
|
||||
let vulkan_info = self.pci_info.as_ref().and_then(|pci_info| {
|
||||
match get_vulkan_info(
|
||||
&pci_info.device_pci_info.vendor_id,
|
||||
&pci_info.device_pci_info.model_id,
|
||||
) {
|
||||
Ok(info) => Some(info),
|
||||
Err(err) => {
|
||||
warn!("could not load vulkan info: {err}");
|
||||
None
|
||||
}
|
||||
let vulkan_info = match get_vulkan_info(&self.common.pci_info) {
|
||||
Ok(info) => Some(info),
|
||||
Err(err) => {
|
||||
warn!("could not load vulkan info: {err}");
|
||||
None
|
||||
}
|
||||
});
|
||||
let pci_info = self.pci_info.clone();
|
||||
};
|
||||
let pci_info = Some(self.common.pci_info.clone());
|
||||
let driver = self.handle.get_driver().to_owned();
|
||||
let vbios_version = self.get_full_vbios_version();
|
||||
let link_info = self.get_link_info();
|
||||
@ -604,10 +533,6 @@ impl GpuController for AmdGpuController {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_pci_slot_name(&self) -> Option<String> {
|
||||
self.handle.get_pci_slot_name().map(str::to_owned)
|
||||
}
|
||||
|
||||
fn get_stats(&self, gpu_config: Option<&config::Gpu>) -> DeviceStats {
|
||||
let fan_settings = gpu_config.and_then(|config| config.fan_control_settings.as_ref());
|
||||
DeviceStats {
|
||||
|
@ -1,14 +1,14 @@
|
||||
mod drm;
|
||||
|
||||
use super::GpuController;
|
||||
use super::{CommonControllerInfo, GpuController};
|
||||
use crate::{config, server::vulkan::get_vulkan_info};
|
||||
use amdgpu_sysfs::gpu_handle::power_profile_mode::PowerProfileModesTable;
|
||||
use anyhow::{anyhow, Context};
|
||||
use drm::{bindings, i915};
|
||||
use drm::bindings;
|
||||
use futures::future::LocalBoxFuture;
|
||||
use lact_schema::{
|
||||
ClocksInfo, ClocksTable, ClockspeedStats, DeviceInfo, DeviceStats, DrmInfo, GpuPciInfo,
|
||||
IntelClocksTable, IntelDrmInfo, LinkInfo, PowerStates, VramStats,
|
||||
ClocksInfo, ClocksTable, ClockspeedStats, DeviceInfo, DeviceStats, DrmInfo, IntelClocksTable,
|
||||
IntelDrmInfo, LinkInfo, PowerStates, VramStats,
|
||||
};
|
||||
use std::{
|
||||
cell::Cell,
|
||||
@ -28,24 +28,16 @@ enum DriverType {
|
||||
}
|
||||
|
||||
pub struct IntelGpuController {
|
||||
sysfs_path: PathBuf,
|
||||
driver: String,
|
||||
driver_type: DriverType,
|
||||
pci_slot_id: String,
|
||||
pci_info: GpuPciInfo,
|
||||
common: CommonControllerInfo,
|
||||
tile_gts: Vec<PathBuf>,
|
||||
drm_file: fs::File,
|
||||
last_gpu_busy: Cell<Option<(Instant, u64)>>,
|
||||
}
|
||||
|
||||
impl IntelGpuController {
|
||||
pub fn new(
|
||||
sysfs_path: PathBuf,
|
||||
driver: String,
|
||||
pci_slot_id: String,
|
||||
pci_info: GpuPciInfo,
|
||||
) -> anyhow::Result<Self> {
|
||||
let driver_type = match driver.as_str() {
|
||||
pub fn new(common: CommonControllerInfo) -> anyhow::Result<Self> {
|
||||
let driver_type = match common.driver.as_str() {
|
||||
"xe" => DriverType::Xe,
|
||||
"i915" => DriverType::I915,
|
||||
_ => unreachable!(),
|
||||
@ -53,7 +45,11 @@ impl IntelGpuController {
|
||||
|
||||
let mut tile_gts = vec![];
|
||||
|
||||
for entry in fs::read_dir(&sysfs_path).into_iter().flatten().flatten() {
|
||||
for entry in fs::read_dir(&common.sysfs_path)
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.flatten()
|
||||
{
|
||||
if let Some(name) = entry.file_name().to_str() {
|
||||
if name.starts_with("tile") {
|
||||
for gt_entry in fs::read_dir(entry.path()).into_iter().flatten().flatten() {
|
||||
@ -61,7 +57,7 @@ impl IntelGpuController {
|
||||
if gt_name.starts_with("gt") {
|
||||
let gt_path = gt_entry
|
||||
.path()
|
||||
.strip_prefix(&sysfs_path)
|
||||
.strip_prefix(&common.sysfs_path)
|
||||
.unwrap()
|
||||
.to_owned();
|
||||
debug!("initialized GT at '{}'", gt_path.display());
|
||||
@ -77,11 +73,11 @@ impl IntelGpuController {
|
||||
info!(
|
||||
"initialized {} gt at '{}'",
|
||||
tile_gts.len(),
|
||||
sysfs_path.display()
|
||||
common.sysfs_path.display()
|
||||
);
|
||||
}
|
||||
let drm_file = if cfg!(not(test)) {
|
||||
let drm_path = format!("/dev/dri/by-path/pci-{pci_slot_id}-render");
|
||||
let drm_path = format!("/dev/dri/by-path/pci-{}-render", common.pci_slot_name);
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
@ -92,11 +88,8 @@ impl IntelGpuController {
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
sysfs_path,
|
||||
driver,
|
||||
common,
|
||||
driver_type,
|
||||
pci_slot_id,
|
||||
pci_info,
|
||||
tile_gts,
|
||||
drm_file,
|
||||
last_gpu_busy: Cell::new(None),
|
||||
@ -105,35 +98,12 @@ impl IntelGpuController {
|
||||
}
|
||||
|
||||
impl GpuController for IntelGpuController {
|
||||
fn get_id(&self) -> anyhow::Result<String> {
|
||||
let GpuPciInfo {
|
||||
device_pci_info,
|
||||
subsystem_pci_info,
|
||||
} = &self.pci_info;
|
||||
|
||||
Ok(format!(
|
||||
"{}:{}-{}:{}-{}",
|
||||
device_pci_info.vendor_id,
|
||||
device_pci_info.model_id,
|
||||
subsystem_pci_info.vendor_id,
|
||||
subsystem_pci_info.model_id,
|
||||
self.pci_slot_id,
|
||||
))
|
||||
}
|
||||
|
||||
fn get_pci_info(&self) -> Option<&GpuPciInfo> {
|
||||
Some(&self.pci_info)
|
||||
}
|
||||
|
||||
fn get_path(&self) -> &Path {
|
||||
&self.sysfs_path
|
||||
fn controller_info(&self) -> &CommonControllerInfo {
|
||||
&self.common
|
||||
}
|
||||
|
||||
fn get_info(&self) -> DeviceInfo {
|
||||
let vulkan_info = match get_vulkan_info(
|
||||
&self.pci_info.device_pci_info.vendor_id,
|
||||
&self.pci_info.device_pci_info.model_id,
|
||||
) {
|
||||
let vulkan_info = match get_vulkan_info(&self.common.pci_info) {
|
||||
Ok(info) => Some(info),
|
||||
Err(err) => {
|
||||
warn!("could not load vulkan info: {err}");
|
||||
@ -151,19 +121,15 @@ impl GpuController for IntelGpuController {
|
||||
};
|
||||
|
||||
DeviceInfo {
|
||||
pci_info: Some(self.pci_info.clone()),
|
||||
pci_info: Some(self.common.pci_info.clone()),
|
||||
vulkan_info,
|
||||
driver: self.driver.clone(),
|
||||
driver: self.common.driver.clone(),
|
||||
vbios_version: None,
|
||||
link_info: LinkInfo::default(),
|
||||
drm_info: Some(drm_info),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_pci_slot_name(&self) -> Option<String> {
|
||||
Some(self.pci_slot_id.clone())
|
||||
}
|
||||
|
||||
fn apply_config<'a>(
|
||||
&'a self,
|
||||
config: &'a config::Gpu,
|
||||
@ -285,11 +251,13 @@ impl GpuController for IntelGpuController {
|
||||
}
|
||||
|
||||
impl IntelGpuController {
|
||||
#[allow(clippy::unused_self)]
|
||||
fn debugfs_path(&self) -> PathBuf {
|
||||
#[cfg(test)]
|
||||
return PathBuf::from("/dev/null");
|
||||
|
||||
Path::new("/sys/kernel/debug/dri").join(&self.pci_slot_id)
|
||||
#[cfg(not(test))]
|
||||
Path::new("/sys/kernel/debug/dri").join(&self.common.pci_slot_name)
|
||||
}
|
||||
|
||||
fn first_tile_gt(&self) -> Option<&Path> {
|
||||
@ -304,11 +272,12 @@ impl IntelGpuController {
|
||||
|
||||
match path.strip_prefix("../") {
|
||||
Ok(path_relative_to_parent) => self
|
||||
.get_path()
|
||||
.common
|
||||
.sysfs_path
|
||||
.parent()
|
||||
.expect("Device path has no parent")
|
||||
.join(path_relative_to_parent),
|
||||
Err(_) => self.get_path().join(path),
|
||||
Err(_) => self.common.sysfs_path.join(path),
|
||||
}
|
||||
}
|
||||
|
||||
@ -379,6 +348,7 @@ impl IntelGpuController {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::unused_self)]
|
||||
fn get_drm_info_xe(&self) -> IntelDrmInfo {
|
||||
IntelDrmInfo {
|
||||
execution_units: None,
|
||||
|
@ -4,30 +4,27 @@ pub mod fan_control;
|
||||
mod intel;
|
||||
mod nvidia;
|
||||
|
||||
pub use amd::AmdGpuController;
|
||||
pub use intel::IntelGpuController;
|
||||
pub use nvidia::NvidiaGpuController;
|
||||
use amd::AmdGpuController;
|
||||
use intel::IntelGpuController;
|
||||
use nvidia::NvidiaGpuController;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use crate::config::{self};
|
||||
use amdgpu_sysfs::gpu_handle::power_profile_mode::PowerProfileModesTable;
|
||||
use anyhow::Context;
|
||||
use futures::future::LocalBoxFuture;
|
||||
use lact_schema::{ClocksInfo, DeviceInfo, DeviceStats, GpuPciInfo, PowerStates};
|
||||
use std::{path::Path, rc::Rc};
|
||||
use lact_schema::{ClocksInfo, DeviceInfo, DeviceStats, GpuPciInfo, PciInfo, PowerStates};
|
||||
use nvml_wrapper::Nvml;
|
||||
use std::{cell::OnceCell, collections::HashMap, fs, path::PathBuf, rc::Rc};
|
||||
use tokio::{sync::Notify, task::JoinHandle};
|
||||
|
||||
type FanControlHandle = (Rc<Notify>, JoinHandle<()>);
|
||||
|
||||
pub trait GpuController {
|
||||
fn get_id(&self) -> anyhow::Result<String>;
|
||||
|
||||
fn get_pci_info(&self) -> Option<&GpuPciInfo>;
|
||||
|
||||
fn get_path(&self) -> &Path;
|
||||
fn controller_info(&self) -> &CommonControllerInfo;
|
||||
|
||||
fn get_info(&self) -> DeviceInfo;
|
||||
|
||||
fn get_pci_slot_name(&self) -> Option<String>;
|
||||
|
||||
fn apply_config<'a>(
|
||||
&'a self,
|
||||
config: &'a config::Gpu,
|
||||
@ -47,3 +44,146 @@ pub trait GpuController {
|
||||
|
||||
fn vbios_dump(&self) -> anyhow::Result<Vec<u8>>;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct CommonControllerInfo {
|
||||
pub sysfs_path: PathBuf,
|
||||
pub pci_info: GpuPciInfo,
|
||||
pub pci_slot_name: String,
|
||||
pub driver: String,
|
||||
}
|
||||
|
||||
impl CommonControllerInfo {
|
||||
pub fn build_id(&self) -> String {
|
||||
let GpuPciInfo {
|
||||
device_pci_info,
|
||||
subsystem_pci_info,
|
||||
} = &self.pci_info;
|
||||
|
||||
format!(
|
||||
"{}:{}-{}:{}-{}",
|
||||
device_pci_info.vendor_id,
|
||||
device_pci_info.model_id,
|
||||
subsystem_pci_info.vendor_id,
|
||||
subsystem_pci_info.model_id,
|
||||
self.pci_slot_name
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn init_controller(
|
||||
path: PathBuf,
|
||||
pci_db: &pciid_parser::Database,
|
||||
nvml: &OnceCell<Option<Rc<Nvml>>>,
|
||||
) -> anyhow::Result<Box<dyn GpuController>> {
|
||||
let uevent_path = path.join("uevent");
|
||||
let uevent = fs::read_to_string(uevent_path).context("Could not read 'uevent'")?;
|
||||
let mut uevent_map = parse_uevent(&uevent);
|
||||
|
||||
let driver = uevent_map
|
||||
.remove("DRIVER")
|
||||
.context("DRIVER entry missing in 'uevent'")?
|
||||
.to_owned();
|
||||
let pci_slot_name = uevent_map
|
||||
.remove("PCI_SLOT_NAME")
|
||||
.context("PCI_SLOT_NAME entry missing in 'uevent'")?
|
||||
.to_owned();
|
||||
|
||||
let (vendor_id, device_id) = uevent_map
|
||||
.get("PCI_ID")
|
||||
.and_then(|id_line| id_line.split_once(':'))
|
||||
.context("PCI_ID entry missing in 'uevent'")?;
|
||||
|
||||
let subsystem_entry = uevent_map
|
||||
.get("PCI_SUBSYS_ID")
|
||||
.and_then(|id_line| id_line.split_once(':'));
|
||||
|
||||
let (subsystem_vendor_id, subsystem_device_id) = subsystem_entry
|
||||
.map(|(vendor, device)| (Some(vendor), Some(device)))
|
||||
.unwrap_or_default();
|
||||
|
||||
let subsystem_info = subsystem_entry
|
||||
.map(|(subsys_vendor_id, subsys_device_id)| {
|
||||
pci_db.get_device_info(vendor_id, device_id, subsys_vendor_id, subsys_device_id)
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
let vendor_entry = pci_db.vendors.get_key_value(vendor_id);
|
||||
|
||||
let pci_info = GpuPciInfo {
|
||||
device_pci_info: PciInfo {
|
||||
vendor_id: vendor_id.to_owned(),
|
||||
vendor: vendor_entry.map(|(vendor_name, _)| vendor_name.clone()),
|
||||
model_id: device_id.to_owned(),
|
||||
model: vendor_entry.and_then(|(_, vendor)| {
|
||||
vendor
|
||||
.devices
|
||||
.get(device_id)
|
||||
.map(|device| device.name.clone())
|
||||
}),
|
||||
},
|
||||
subsystem_pci_info: PciInfo {
|
||||
vendor_id: subsystem_vendor_id.unwrap_or_default().to_owned(),
|
||||
vendor: subsystem_info.subvendor_name.map(str::to_owned),
|
||||
model_id: subsystem_device_id.unwrap_or_default().to_owned(),
|
||||
model: subsystem_info.subdevice_name.map(str::to_owned),
|
||||
},
|
||||
};
|
||||
|
||||
let common = CommonControllerInfo {
|
||||
sysfs_path: path,
|
||||
pci_info,
|
||||
pci_slot_name,
|
||||
driver,
|
||||
};
|
||||
|
||||
match common.driver.as_str() {
|
||||
"amdgpu" | "radeon" => match AmdGpuController::new_from_path(common.clone()) {
|
||||
Ok(controller) => return Ok(Box::new(controller)),
|
||||
Err(err) => error!("could not initialize AMD controller: {err:#}"),
|
||||
},
|
||||
"i915" | "xe" => match IntelGpuController::new(common.clone()) {
|
||||
Ok(controller) => return Ok(Box::new(controller)),
|
||||
Err(err) => error!("could not initialize Intel controller: {err:#}"),
|
||||
},
|
||||
"nvidia" => {
|
||||
let nvml = nvml.get_or_init(|| match Nvml::init() {
|
||||
Ok(nvml) => {
|
||||
info!("Nvidia management library loaded");
|
||||
Some(Rc::new(nvml))
|
||||
}
|
||||
Err(err) => {
|
||||
error!("could not load Nvidia management library: {err}, Nvidia controls will not be available");
|
||||
None
|
||||
}
|
||||
});
|
||||
if let Some(nvml) = nvml {
|
||||
match NvidiaGpuController::new(common.clone(), nvml.clone()) {
|
||||
Ok(controller) => {
|
||||
return Ok(Box::new(controller));
|
||||
}
|
||||
Err(err) => error!("could not initialize Nvidia controller: {err:#}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
warn!(
|
||||
"GPU at '{}' has unsupported driver '{}', functionality will be limited",
|
||||
common.sysfs_path.display(),
|
||||
common.driver,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// We use the AMD controller as the fallback even for non-AMD devices, it will at least
|
||||
// display basic device information from the SysFS
|
||||
Ok(Box::new(
|
||||
AmdGpuController::new_from_path(common).context("Could initialize fallback controller")?,
|
||||
))
|
||||
}
|
||||
|
||||
/// Parses the contents of a `uevent` file into a key/value map.
///
/// Every line is split at its first `=`; lines without a `=` are ignored.
/// When a key appears more than once, the last occurrence wins.
fn parse_uevent(data: &str) -> HashMap<&str, &str> {
    let mut entries = HashMap::new();

    for line in data.lines() {
        if let Some((key, value)) = line.split_once('=') {
            entries.insert(key, value);
        }
    }

    entries
}
|
||||
|
@ -3,14 +3,14 @@ use crate::{
|
||||
server::vulkan::get_vulkan_info,
|
||||
};
|
||||
|
||||
use super::{fan_control::FanCurve, FanControlHandle, GpuController};
|
||||
use super::{fan_control::FanCurve, CommonControllerInfo, FanControlHandle, GpuController};
|
||||
use amdgpu_sysfs::{gpu_handle::power_profile_mode::PowerProfileModesTable, hw_mon::Temperature};
|
||||
use anyhow::{anyhow, Context};
|
||||
use futures::future::LocalBoxFuture;
|
||||
use lact_schema::{
|
||||
ClocksInfo, ClocksTable, ClockspeedStats, DeviceInfo, DeviceStats, DrmInfo, DrmMemoryInfo,
|
||||
FanControlMode, FanStats, GpuPciInfo, IntelDrmInfo, LinkInfo, NvidiaClockInfo,
|
||||
NvidiaClocksTable, PmfwInfo, PowerState, PowerStates, PowerStats, VoltageStats, VramStats,
|
||||
FanControlMode, FanStats, IntelDrmInfo, LinkInfo, NvidiaClockInfo, NvidiaClocksTable, PmfwInfo,
|
||||
PowerState, PowerStates, PowerStats, VoltageStats, VramStats,
|
||||
};
|
||||
use nvml_wrapper::{
|
||||
bitmasks::device::ThrottleReasons,
|
||||
@ -21,7 +21,6 @@ use std::{
|
||||
cell::{Cell, RefCell},
|
||||
collections::HashMap,
|
||||
fmt::Write,
|
||||
path::{Path, PathBuf},
|
||||
rc::Rc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
@ -30,9 +29,7 @@ use tracing::{debug, error, trace, warn};
|
||||
|
||||
pub struct NvidiaGpuController {
|
||||
nvml: Rc<Nvml>,
|
||||
pci_slot_id: String,
|
||||
pci_info: GpuPciInfo,
|
||||
sysfs_path: PathBuf,
|
||||
common: CommonControllerInfo,
|
||||
fan_control_handle: RefCell<Option<FanControlHandle>>,
|
||||
|
||||
last_applied_gpc_offset: Cell<Option<i32>>,
|
||||
@ -40,26 +37,26 @@ pub struct NvidiaGpuController {
|
||||
}
|
||||
|
||||
impl NvidiaGpuController {
|
||||
pub fn new(
|
||||
nvml: Rc<Nvml>,
|
||||
pci_slot_id: String,
|
||||
pci_info: GpuPciInfo,
|
||||
sysfs_path: PathBuf,
|
||||
) -> Self {
|
||||
Self {
|
||||
pub fn new(common: CommonControllerInfo, nvml: Rc<Nvml>) -> anyhow::Result<Self> {
|
||||
nvml.device_by_pci_bus_id(common.pci_slot_name.as_str())
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Could not get PCI device '{}' from NVML",
|
||||
common.pci_slot_name
|
||||
)
|
||||
})?;
|
||||
Ok(Self {
|
||||
nvml,
|
||||
pci_slot_id,
|
||||
pci_info,
|
||||
sysfs_path,
|
||||
common,
|
||||
fan_control_handle: RefCell::new(None),
|
||||
last_applied_gpc_offset: Cell::new(None),
|
||||
last_applied_mem_offset: Cell::new(None),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn device(&self) -> Device<'_> {
|
||||
self.nvml
|
||||
.device_by_pci_bus_id(self.pci_slot_id.as_str())
|
||||
.device_by_pci_bus_id(self.common.pci_slot_name.as_str())
|
||||
.expect("Can no longer get device")
|
||||
}
|
||||
|
||||
@ -90,7 +87,7 @@ impl NvidiaGpuController {
|
||||
let task_notify = notify.clone();
|
||||
|
||||
let nvml = self.nvml.clone();
|
||||
let pci_slot_id = self.pci_slot_id.clone();
|
||||
let pci_slot_id = self.common.pci_slot_name.clone();
|
||||
debug!("spawning new fan control task");
|
||||
|
||||
let handle = tokio::task::spawn_local(async move {
|
||||
@ -248,37 +245,12 @@ impl NvidiaGpuController {
|
||||
}
|
||||
|
||||
impl GpuController for NvidiaGpuController {
|
||||
fn get_id(&self) -> anyhow::Result<String> {
|
||||
let GpuPciInfo {
|
||||
device_pci_info,
|
||||
subsystem_pci_info,
|
||||
} = &self.pci_info;
|
||||
|
||||
Ok(format!(
|
||||
"{}:{}-{}:{}-{}",
|
||||
device_pci_info.vendor_id,
|
||||
device_pci_info.model_id,
|
||||
subsystem_pci_info.vendor_id,
|
||||
subsystem_pci_info.model_id,
|
||||
self.pci_slot_id
|
||||
))
|
||||
}
|
||||
|
||||
fn get_pci_info(&self) -> Option<&GpuPciInfo> {
|
||||
Some(&self.pci_info)
|
||||
}
|
||||
|
||||
fn get_path(&self) -> &Path {
|
||||
&self.sysfs_path
|
||||
fn controller_info(&self) -> &CommonControllerInfo {
|
||||
&self.common
|
||||
}
|
||||
|
||||
fn get_info(&self) -> DeviceInfo {
|
||||
let device = self.device();
|
||||
|
||||
let vulkan_info = match get_vulkan_info(
|
||||
&self.pci_info.device_pci_info.vendor_id,
|
||||
&self.pci_info.device_pci_info.model_id,
|
||||
) {
|
||||
let vulkan_info = match get_vulkan_info(&self.common.pci_info) {
|
||||
Ok(info) => Some(info),
|
||||
Err(err) => {
|
||||
warn!("could not load vulkan info: {err}");
|
||||
@ -286,8 +258,10 @@ impl GpuController for NvidiaGpuController {
|
||||
}
|
||||
};
|
||||
|
||||
let device = self.device();
|
||||
|
||||
DeviceInfo {
|
||||
pci_info: Some(self.pci_info.clone()),
|
||||
pci_info: Some(self.common.pci_info.clone()),
|
||||
vulkan_info,
|
||||
driver: format!(
|
||||
"nvidia {}",
|
||||
@ -351,10 +325,6 @@ impl GpuController for NvidiaGpuController {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_pci_slot_name(&self) -> Option<String> {
|
||||
Some(self.pci_slot_id.clone())
|
||||
}
|
||||
|
||||
#[allow(
|
||||
clippy::cast_precision_loss,
|
||||
clippy::cast_possible_truncation,
|
||||
|
@ -5,10 +5,7 @@ use super::{
|
||||
};
|
||||
use crate::{
|
||||
config::{self, default_fan_static_speed, Config, FanControlSettings, Profile},
|
||||
server::{
|
||||
gpu_controller::{AmdGpuController, IntelGpuController, NvidiaGpuController},
|
||||
profiles,
|
||||
},
|
||||
server::{gpu_controller::init_controller, profiles},
|
||||
};
|
||||
use amdgpu_sysfs::gpu_handle::{
|
||||
power_profile_mode::PowerProfileModesTable, PerformanceLevel, PowerLevelKind,
|
||||
@ -22,12 +19,11 @@ use lact_schema::{
|
||||
};
|
||||
use libflate::gzip;
|
||||
use nix::libc;
|
||||
use nvml_wrapper::{error::NvmlError, Nvml};
|
||||
use os_release::OS_RELEASE;
|
||||
use pciid_parser::Database;
|
||||
use serde_json::json;
|
||||
use std::{
|
||||
cell::{Cell, RefCell},
|
||||
cell::{Cell, OnceCell, RefCell},
|
||||
collections::{BTreeMap, HashMap},
|
||||
env,
|
||||
fs::{self, File, Permissions},
|
||||
@ -125,7 +121,7 @@ impl<'a> Handler {
|
||||
.expect("pci file name should be valid unicode");
|
||||
|
||||
if controllers.values().any(|controller| {
|
||||
controller.get_pci_slot_name().as_ref() == Some(&slot_name)
|
||||
controller.controller_info().pci_slot_name == slot_name
|
||||
}) {
|
||||
debug!("found intialized drm entry for device {:?}", device.path());
|
||||
} else {
|
||||
@ -192,7 +188,7 @@ impl<'a> Handler {
|
||||
error!("could not apply existing config for gpu {id}: {err}");
|
||||
}
|
||||
} else {
|
||||
info!("could not find GPU with id {id} defined in configuration");
|
||||
warn!("could not find GPU with id {id} defined in configuration");
|
||||
}
|
||||
}
|
||||
|
||||
@ -337,8 +333,11 @@ impl<'a> Handler {
|
||||
.iter()
|
||||
.map(|(id, controller)| {
|
||||
let name = controller
|
||||
.get_pci_info()
|
||||
.and_then(|pci_info| pci_info.device_pci_info.model.clone());
|
||||
.controller_info()
|
||||
.pci_info
|
||||
.device_pci_info
|
||||
.model
|
||||
.clone();
|
||||
DeviceListEntry {
|
||||
id: id.to_owned(),
|
||||
name,
|
||||
@ -574,14 +573,14 @@ impl<'a> Handler {
|
||||
}
|
||||
|
||||
for controller in self.gpu_controllers.values() {
|
||||
let controller_path = controller.get_path();
|
||||
let controller_path = &controller.controller_info().sysfs_path;
|
||||
|
||||
for device_file in SNAPSHOT_DEVICE_FILES {
|
||||
let full_path = controller_path.join(device_file);
|
||||
add_path_to_archive(&mut archive, &full_path)?;
|
||||
}
|
||||
|
||||
let device_files = fs::read_dir(controller.get_path())
|
||||
let device_files = fs::read_dir(controller_path)
|
||||
.context("Could not read device dir")?
|
||||
.flatten();
|
||||
|
||||
@ -591,16 +590,12 @@ impl<'a> Handler {
|
||||
.iter()
|
||||
.any(|prefix| entry_name.starts_with(prefix))
|
||||
{
|
||||
add_path_recursively(
|
||||
&mut archive,
|
||||
&device_entry.path(),
|
||||
controller.get_path(),
|
||||
)?;
|
||||
add_path_recursively(&mut archive, &device_entry.path(), controller_path)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let card_path = controller.get_path().parent().unwrap();
|
||||
let card_path = controller_path.parent().unwrap();
|
||||
let card_files = fs::read_dir(card_path)
|
||||
.context("Could not read device dir")?
|
||||
.flatten();
|
||||
@ -702,7 +697,7 @@ impl<'a> Handler {
|
||||
.and_then(|config| config.gpus().ok()?.get(id));
|
||||
|
||||
let data = json!({
|
||||
"pci_info": controller.get_pci_info(),
|
||||
"pci_info": controller.controller_info().pci_info.clone(),
|
||||
"info": controller.get_info(),
|
||||
"stats": controller.get_stats(gpu_config),
|
||||
"clocks_info": controller.get_clocks_info().ok(),
|
||||
@ -924,19 +919,7 @@ fn load_controllers(base_path: &Path) -> anyhow::Result<BTreeMap<String, Box<dyn
|
||||
}
|
||||
});
|
||||
|
||||
#[cfg(test)]
|
||||
let nvml: Option<Rc<Nvml>> = None;
|
||||
#[cfg(not(test))]
|
||||
let nvml = match Nvml::init() {
|
||||
Ok(nvml) => {
|
||||
info!("NVML initialized");
|
||||
Some(Rc::new(nvml))
|
||||
}
|
||||
Err(err) => {
|
||||
info!("Nvidia support disabled, {err}");
|
||||
None
|
||||
}
|
||||
};
|
||||
let nvml = OnceCell::new();
|
||||
|
||||
for entry in base_path
|
||||
.read_dir()
|
||||
@ -951,92 +934,24 @@ fn load_controllers(base_path: &Path) -> anyhow::Result<BTreeMap<String, Box<dyn
|
||||
if name.starts_with("card") && !name.contains('-') {
|
||||
trace!("trying gpu controller at {:?}", entry.path());
|
||||
let device_path = entry.path().join("device");
|
||||
match AmdGpuController::new_from_path(device_path, &pci_db) {
|
||||
Ok(controller) => match controller.get_id() {
|
||||
Ok(id) => {
|
||||
let path = controller.get_path();
|
||||
|
||||
if matches!(controller.get_driver(), "xe" | "i915") {
|
||||
match controller
|
||||
.get_pci_info()
|
||||
.zip(controller.get_pci_slot_name())
|
||||
{
|
||||
Some((pci_info, pci_slot_id)) => {
|
||||
match IntelGpuController::new(
|
||||
path.to_owned(),
|
||||
controller.get_driver().to_owned(),
|
||||
pci_slot_id,
|
||||
pci_info.clone(),
|
||||
) {
|
||||
Ok(controller) => {
|
||||
let id = controller.get_id().unwrap();
|
||||
info!("initialized Intel controller {id} for path {path:?}");
|
||||
controllers.insert(
|
||||
id,
|
||||
Box::new(controller) as Box<dyn GpuController>,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"could not initialize Intel controller: {err:#}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
error!("could not get PCI info for Intel GPU at {path:?}",);
|
||||
}
|
||||
}
|
||||
}
|
||||
match init_controller(device_path.clone(), &pci_db, &nvml) {
|
||||
Ok(controller) => {
|
||||
let info = controller.controller_info();
|
||||
let id = info.build_id();
|
||||
|
||||
if let Some(nvml) = nvml.clone() {
|
||||
if let Some(pci_slot_id) = controller.get_pci_slot_name() {
|
||||
match nvml.device_by_pci_bus_id(pci_slot_id.as_str()) {
|
||||
Ok(_) => {
|
||||
let controller = NvidiaGpuController::new(
|
||||
nvml,
|
||||
pci_slot_id,
|
||||
controller.get_pci_info().expect(
|
||||
"Initialized NVML device without PCI info somehow",
|
||||
).clone(),
|
||||
path.to_owned(),
|
||||
);
|
||||
match controller.get_id() {
|
||||
Ok(id) => {
|
||||
info!("initialized Nvidia GPU controller {id} for path {path:?}");
|
||||
controllers.insert(
|
||||
id,
|
||||
Box::new(controller) as Box<dyn GpuController>,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
Err(err) => {
|
||||
error!("could not get Nvidia GPU id: {err}");
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(NvmlError::NotFound) => {
|
||||
debug!("PCI slot {pci_slot_id} not found in NVML");
|
||||
}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"could not initialize Nvidia GPU at {path:?}: {err}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
info!(
|
||||
"initialized {} controller for GPU {id} at '{}'",
|
||||
info.driver,
|
||||
info.sysfs_path.display()
|
||||
);
|
||||
|
||||
info!("initialized GPU controller {id} for path {path:?}");
|
||||
controllers.insert(id, Box::new(controller) as Box<dyn GpuController>);
|
||||
}
|
||||
Err(err) => warn!("could not initialize controller: {err:#}"),
|
||||
},
|
||||
Err(error) => {
|
||||
warn!(
|
||||
"failed to initialize controller at {:?}, {error}",
|
||||
entry.path()
|
||||
controllers.insert(id, controller);
|
||||
}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"could not initialize GPU controller at '{}': {err:#}",
|
||||
device_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use lact_schema::{VulkanDriverInfo, VulkanInfo};
|
||||
use lact_schema::{GpuPciInfo, VulkanDriverInfo, VulkanInfo};
|
||||
use std::borrow::Cow;
|
||||
use tracing::trace;
|
||||
use vulkano::{
|
||||
@ -8,13 +8,13 @@ use vulkano::{
|
||||
};
|
||||
|
||||
#[cfg_attr(test, allow(unreachable_code, unused_variables))]
|
||||
pub fn get_vulkan_info<'a>(vendor_id: &'a str, device_id: &'a str) -> anyhow::Result<VulkanInfo> {
|
||||
pub fn get_vulkan_info(pci_info: &GpuPciInfo) -> anyhow::Result<VulkanInfo> {
|
||||
#[cfg(test)]
|
||||
return Err(anyhow!("Not allowed in tests"));
|
||||
|
||||
trace!("Reading vulkan info");
|
||||
let vendor_id = u32::from_str_radix(vendor_id, 16)?;
|
||||
let device_id = u32::from_str_radix(device_id, 16)?;
|
||||
let vendor_id = u32::from_str_radix(&pci_info.device_pci_info.vendor_id, 16)?;
|
||||
let device_id = u32::from_str_radix(&pci_info.device_pci_info.model_id, 16)?;
|
||||
|
||||
let library = VulkanLibrary::new().context("Could not create vulkan library")?;
|
||||
let instance = Instance::new(library, InstanceCreateInfo::default())
|
||||
|
Loading…
Reference in New Issue
Block a user