refactor: GPU controller initialization

This commit is contained in:
Ilya Zlobintsev 2025-01-04 14:54:06 +02:00
parent 9c100b058d
commit dfcfa8e10d
6 changed files with 256 additions and 336 deletions

View File

@ -1,4 +1,4 @@
use super::{fan_control::FanCurve, FanControlHandle, GpuController};
use super::{fan_control::FanCurve, CommonControllerInfo, FanControlHandle, GpuController};
use crate::{
config::{self, ClocksConfiguration, FanControlSettings},
server::vulkan::get_vulkan_info,
@ -12,22 +12,19 @@ use amdgpu_sysfs::{
CommitHandle, GpuHandle, PerformanceLevel, PowerLevelKind, PowerLevels,
},
hw_mon::{FanControlMethod, HwMon},
sysfs::SysFS,
};
use anyhow::{anyhow, Context};
use futures::future::LocalBoxFuture;
use lact_schema::{
ClocksInfo, ClockspeedStats, DeviceInfo, DeviceStats, DrmInfo, FanStats, GpuPciInfo,
IntelDrmInfo, LinkInfo, PciInfo, PmfwInfo, PowerState, PowerStates, PowerStats, VoltageStats,
VramStats,
ClocksInfo, ClockspeedStats, DeviceInfo, DeviceStats, DrmInfo, FanStats, IntelDrmInfo,
LinkInfo, PmfwInfo, PowerState, PowerStates, PowerStats, VoltageStats, VramStats,
};
use libdrm_amdgpu_sys::AMDGPU::{ThrottleStatus, ThrottlerBit};
use pciid_parser::Database;
use std::{
cell::RefCell,
cmp,
collections::{HashMap, HashSet},
path::{Path, PathBuf},
path::PathBuf,
rc::Rc,
time::Duration,
};
@ -52,13 +49,13 @@ const STEAM_DECK_IDS: [&str; 2] = ["163F", "1435"];
pub struct AmdGpuController {
handle: GpuHandle,
drm_handle: Option<DrmHandle>,
pci_info: Option<GpuPciInfo>,
common: CommonControllerInfo,
fan_control_handle: RefCell<Option<FanControlHandle>>,
}
impl AmdGpuController {
pub fn new_from_path(sysfs_path: PathBuf, pci_db: &Database) -> anyhow::Result<Self> {
let handle = GpuHandle::new_from_path(sysfs_path)
pub fn new_from_path(common: CommonControllerInfo) -> anyhow::Result<Self> {
let handle = GpuHandle::new_from_path(common.sysfs_path.clone())
.map_err(|error| anyhow!("failed to initialize gpu handle: {error}"))?;
#[allow(unused_mut)]
@ -75,47 +72,10 @@ impl AmdGpuController {
}
}
let mut device_pci_info = None;
let mut subsystem_pci_info = None;
if let Some((vendor_id, model_id)) = handle.get_pci_id() {
device_pci_info = Some(PciInfo {
vendor_id: vendor_id.to_owned(),
vendor: None,
model_id: model_id.to_owned(),
model: None,
});
if let Some((subsys_vendor_id, subsys_model_id)) = handle.get_pci_subsys_id() {
let pci_device_info =
pci_db.get_device_info(vendor_id, model_id, subsys_vendor_id, subsys_model_id);
device_pci_info = Some(PciInfo {
vendor_id: vendor_id.to_owned(),
vendor: pci_device_info.vendor_name.map(str::to_owned),
model_id: model_id.to_owned(),
model: pci_device_info.device_name.map(str::to_owned),
});
subsystem_pci_info = Some(PciInfo {
vendor_id: subsys_vendor_id.to_owned(),
vendor: pci_device_info.subvendor_name.map(str::to_owned),
model_id: subsys_model_id.to_owned(),
model: pci_device_info.subdevice_name.map(str::to_owned),
});
};
}
let pci_info = device_pci_info.and_then(|device_pci_info| {
Some(GpuPciInfo {
device_pci_info,
subsystem_pci_info: subsystem_pci_info?,
})
});
Ok(Self {
handle,
drm_handle,
pci_info,
common,
fan_control_handle: RefCell::new(None),
})
}
@ -539,56 +499,25 @@ impl AmdGpuController {
}
fn is_steam_deck(&self) -> bool {
self.pci_info.as_ref().is_some_and(|info| {
info.device_pci_info.vendor_id == VENDOR_AMD
&& STEAM_DECK_IDS.contains(&info.device_pci_info.model_id.as_str())
})
}
pub fn get_driver(&self) -> &str {
self.handle.get_driver()
self.common.pci_info.device_pci_info.vendor_id == VENDOR_AMD
&& STEAM_DECK_IDS.contains(&self.common.pci_info.device_pci_info.model_id.as_str())
}
}
impl GpuController for AmdGpuController {
fn get_id(&self) -> anyhow::Result<String> {
let handle = &self.handle;
let pci_id = handle.get_pci_id().context("Device has no vendor id")?;
let pci_subsys_id = handle
.get_pci_subsys_id()
.context("Device has no subsys id")?;
let pci_slot_name = handle
.get_pci_slot_name()
.context("Device has no pci slot")?;
Ok(format!(
"{}:{}-{}:{}-{}",
pci_id.0, pci_id.1, pci_subsys_id.0, pci_subsys_id.1, pci_slot_name
))
}
fn get_pci_info(&self) -> Option<&GpuPciInfo> {
self.pci_info.as_ref()
}
fn get_path(&self) -> &Path {
self.handle.get_path()
fn controller_info(&self) -> &CommonControllerInfo {
&self.common
}
fn get_info(&self) -> DeviceInfo {
let vulkan_info = self.pci_info.as_ref().and_then(|pci_info| {
match get_vulkan_info(
&pci_info.device_pci_info.vendor_id,
&pci_info.device_pci_info.model_id,
) {
Ok(info) => Some(info),
Err(err) => {
warn!("could not load vulkan info: {err}");
None
}
let vulkan_info = match get_vulkan_info(&self.common.pci_info) {
Ok(info) => Some(info),
Err(err) => {
warn!("could not load vulkan info: {err}");
None
}
});
let pci_info = self.pci_info.clone();
};
let pci_info = Some(self.common.pci_info.clone());
let driver = self.handle.get_driver().to_owned();
let vbios_version = self.get_full_vbios_version();
let link_info = self.get_link_info();
@ -604,10 +533,6 @@ impl GpuController for AmdGpuController {
}
}
fn get_pci_slot_name(&self) -> Option<String> {
self.handle.get_pci_slot_name().map(str::to_owned)
}
fn get_stats(&self, gpu_config: Option<&config::Gpu>) -> DeviceStats {
let fan_settings = gpu_config.and_then(|config| config.fan_control_settings.as_ref());
DeviceStats {

View File

@ -1,14 +1,14 @@
mod drm;
use super::GpuController;
use super::{CommonControllerInfo, GpuController};
use crate::{config, server::vulkan::get_vulkan_info};
use amdgpu_sysfs::gpu_handle::power_profile_mode::PowerProfileModesTable;
use anyhow::{anyhow, Context};
use drm::{bindings, i915};
use drm::bindings;
use futures::future::LocalBoxFuture;
use lact_schema::{
ClocksInfo, ClocksTable, ClockspeedStats, DeviceInfo, DeviceStats, DrmInfo, GpuPciInfo,
IntelClocksTable, IntelDrmInfo, LinkInfo, PowerStates, VramStats,
ClocksInfo, ClocksTable, ClockspeedStats, DeviceInfo, DeviceStats, DrmInfo, IntelClocksTable,
IntelDrmInfo, LinkInfo, PowerStates, VramStats,
};
use std::{
cell::Cell,
@ -28,24 +28,16 @@ enum DriverType {
}
pub struct IntelGpuController {
sysfs_path: PathBuf,
driver: String,
driver_type: DriverType,
pci_slot_id: String,
pci_info: GpuPciInfo,
common: CommonControllerInfo,
tile_gts: Vec<PathBuf>,
drm_file: fs::File,
last_gpu_busy: Cell<Option<(Instant, u64)>>,
}
impl IntelGpuController {
pub fn new(
sysfs_path: PathBuf,
driver: String,
pci_slot_id: String,
pci_info: GpuPciInfo,
) -> anyhow::Result<Self> {
let driver_type = match driver.as_str() {
pub fn new(common: CommonControllerInfo) -> anyhow::Result<Self> {
let driver_type = match common.driver.as_str() {
"xe" => DriverType::Xe,
"i915" => DriverType::I915,
_ => unreachable!(),
@ -53,7 +45,11 @@ impl IntelGpuController {
let mut tile_gts = vec![];
for entry in fs::read_dir(&sysfs_path).into_iter().flatten().flatten() {
for entry in fs::read_dir(&common.sysfs_path)
.into_iter()
.flatten()
.flatten()
{
if let Some(name) = entry.file_name().to_str() {
if name.starts_with("tile") {
for gt_entry in fs::read_dir(entry.path()).into_iter().flatten().flatten() {
@ -61,7 +57,7 @@ impl IntelGpuController {
if gt_name.starts_with("gt") {
let gt_path = gt_entry
.path()
.strip_prefix(&sysfs_path)
.strip_prefix(&common.sysfs_path)
.unwrap()
.to_owned();
debug!("initialized GT at '{}'", gt_path.display());
@ -77,11 +73,11 @@ impl IntelGpuController {
info!(
"initialized {} gt at '{}'",
tile_gts.len(),
sysfs_path.display()
common.sysfs_path.display()
);
}
let drm_file = if cfg!(not(test)) {
let drm_path = format!("/dev/dri/by-path/pci-{pci_slot_id}-render");
let drm_path = format!("/dev/dri/by-path/pci-{}-render", common.pci_slot_name);
fs::OpenOptions::new()
.read(true)
.write(true)
@ -92,11 +88,8 @@ impl IntelGpuController {
};
Ok(Self {
sysfs_path,
driver,
common,
driver_type,
pci_slot_id,
pci_info,
tile_gts,
drm_file,
last_gpu_busy: Cell::new(None),
@ -105,35 +98,12 @@ impl IntelGpuController {
}
impl GpuController for IntelGpuController {
fn get_id(&self) -> anyhow::Result<String> {
let GpuPciInfo {
device_pci_info,
subsystem_pci_info,
} = &self.pci_info;
Ok(format!(
"{}:{}-{}:{}-{}",
device_pci_info.vendor_id,
device_pci_info.model_id,
subsystem_pci_info.vendor_id,
subsystem_pci_info.model_id,
self.pci_slot_id,
))
}
fn get_pci_info(&self) -> Option<&GpuPciInfo> {
Some(&self.pci_info)
}
fn get_path(&self) -> &Path {
&self.sysfs_path
fn controller_info(&self) -> &CommonControllerInfo {
&self.common
}
fn get_info(&self) -> DeviceInfo {
let vulkan_info = match get_vulkan_info(
&self.pci_info.device_pci_info.vendor_id,
&self.pci_info.device_pci_info.model_id,
) {
let vulkan_info = match get_vulkan_info(&self.common.pci_info) {
Ok(info) => Some(info),
Err(err) => {
warn!("could not load vulkan info: {err}");
@ -151,19 +121,15 @@ impl GpuController for IntelGpuController {
};
DeviceInfo {
pci_info: Some(self.pci_info.clone()),
pci_info: Some(self.common.pci_info.clone()),
vulkan_info,
driver: self.driver.clone(),
driver: self.common.driver.clone(),
vbios_version: None,
link_info: LinkInfo::default(),
drm_info: Some(drm_info),
}
}
fn get_pci_slot_name(&self) -> Option<String> {
Some(self.pci_slot_id.clone())
}
fn apply_config<'a>(
&'a self,
config: &'a config::Gpu,
@ -285,11 +251,13 @@ impl GpuController for IntelGpuController {
}
impl IntelGpuController {
#[allow(clippy::unused_self)]
fn debugfs_path(&self) -> PathBuf {
#[cfg(test)]
return PathBuf::from("/dev/null");
Path::new("/sys/kernel/debug/dri").join(&self.pci_slot_id)
#[cfg(not(test))]
Path::new("/sys/kernel/debug/dri").join(&self.common.pci_slot_name)
}
fn first_tile_gt(&self) -> Option<&Path> {
@ -304,11 +272,12 @@ impl IntelGpuController {
match path.strip_prefix("../") {
Ok(path_relative_to_parent) => self
.get_path()
.common
.sysfs_path
.parent()
.expect("Device path has no parent")
.join(path_relative_to_parent),
Err(_) => self.get_path().join(path),
Err(_) => self.common.sysfs_path.join(path),
}
}
@ -379,6 +348,7 @@ impl IntelGpuController {
}
}
#[allow(clippy::unused_self)]
fn get_drm_info_xe(&self) -> IntelDrmInfo {
IntelDrmInfo {
execution_units: None,

View File

@ -4,30 +4,27 @@ pub mod fan_control;
mod intel;
mod nvidia;
pub use amd::AmdGpuController;
pub use intel::IntelGpuController;
pub use nvidia::NvidiaGpuController;
use amd::AmdGpuController;
use intel::IntelGpuController;
use nvidia::NvidiaGpuController;
use tracing::{error, info, warn};
use crate::config::{self};
use amdgpu_sysfs::gpu_handle::power_profile_mode::PowerProfileModesTable;
use anyhow::Context;
use futures::future::LocalBoxFuture;
use lact_schema::{ClocksInfo, DeviceInfo, DeviceStats, GpuPciInfo, PowerStates};
use std::{path::Path, rc::Rc};
use lact_schema::{ClocksInfo, DeviceInfo, DeviceStats, GpuPciInfo, PciInfo, PowerStates};
use nvml_wrapper::Nvml;
use std::{cell::OnceCell, collections::HashMap, fs, path::PathBuf, rc::Rc};
use tokio::{sync::Notify, task::JoinHandle};
type FanControlHandle = (Rc<Notify>, JoinHandle<()>);
pub trait GpuController {
fn get_id(&self) -> anyhow::Result<String>;
fn get_pci_info(&self) -> Option<&GpuPciInfo>;
fn get_path(&self) -> &Path;
fn controller_info(&self) -> &CommonControllerInfo;
fn get_info(&self) -> DeviceInfo;
fn get_pci_slot_name(&self) -> Option<String>;
fn apply_config<'a>(
&'a self,
config: &'a config::Gpu,
@ -47,3 +44,146 @@ pub trait GpuController {
fn vbios_dump(&self) -> anyhow::Result<Vec<u8>>;
}
/// Device identification data that every GPU controller carries, regardless of vendor.
///
/// It is assembled once by `init_controller` from the device's `uevent` file and then
/// handed to the vendor-specific controller.
#[derive(Clone)]
pub(crate) struct CommonControllerInfo {
    /// Path of the device directory in sysfs (the directory containing `uevent`).
    pub sysfs_path: PathBuf,
    /// PCI ids of the device and its subsystem, with names where they could be resolved.
    pub pci_info: GpuPciInfo,
    /// Value of the `PCI_SLOT_NAME` entry in `uevent`.
    pub pci_slot_name: String,
    /// Value of the `DRIVER` entry in `uevent`.
    pub driver: String,
}
impl CommonControllerInfo {
pub fn build_id(&self) -> String {
let GpuPciInfo {
device_pci_info,
subsystem_pci_info,
} = &self.pci_info;
format!(
"{}:{}-{}:{}-{}",
device_pci_info.vendor_id,
device_pci_info.model_id,
subsystem_pci_info.vendor_id,
subsystem_pci_info.model_id,
self.pci_slot_name
)
}
}
/// Creates the controller for the GPU whose sysfs device directory is `path`.
///
/// The driver name, PCI slot name and PCI ids are read from the `uevent` file inside
/// `path`; device names are resolved through `pci_db`. The vendor-specific controller is
/// then chosen by driver name:
///
/// * `amdgpu` / `radeon` - AMD controller
/// * `i915` / `xe` - Intel controller
/// * `nvidia` - Nvidia controller; NVML is loaded on first use and cached in `nvml`
///
/// If the driver is not recognised, or the vendor-specific controller fails to
/// initialize (the failure is logged), the AMD controller is used as a fallback.
///
/// # Errors
///
/// Returns an error when `uevent` cannot be read, when the `DRIVER`, `PCI_SLOT_NAME` or
/// `PCI_ID` entries are missing (or `PCI_ID` contains no `:`), or when the fallback
/// controller cannot be initialized either.
pub(crate) fn init_controller(
    path: PathBuf,
    pci_db: &pciid_parser::Database,
    nvml: &OnceCell<Option<Rc<Nvml>>>,
) -> anyhow::Result<Box<dyn GpuController>> {
    let uevent_path = path.join("uevent");
    let uevent = fs::read_to_string(uevent_path).context("Could not read 'uevent'")?;
    let mut uevent_map = parse_uevent(&uevent);

    let driver = uevent_map
        .remove("DRIVER")
        .context("DRIVER entry missing in 'uevent'")?
        .to_owned();
    let pci_slot_name = uevent_map
        .remove("PCI_SLOT_NAME")
        .context("PCI_SLOT_NAME entry missing in 'uevent'")?
        .to_owned();

    let (vendor_id, device_id) = uevent_map
        .get("PCI_ID")
        .and_then(|id_line| id_line.split_once(':'))
        .context("PCI_ID entry missing in 'uevent'")?;

    // A missing or malformed PCI_SUBSYS_ID is tolerated: both subsystem ids stay empty.
    let (subsystem_vendor_id, subsystem_device_id) = uevent_map
        .get("PCI_SUBSYS_ID")
        .and_then(|id_line| id_line.split_once(':'))
        .unwrap_or_default();

    // Resolve every name through the database's own lookup. Taking the key of the
    // `vendors` map as the vendor name would only yield the vendor id again.
    // NOTE(review): this relies on `get_device_info` still resolving the vendor/device
    // names when the subsystem ids are empty - confirm against pciid_parser.
    let names = pci_db.get_device_info(
        vendor_id,
        device_id,
        subsystem_vendor_id,
        subsystem_device_id,
    );

    let pci_info = GpuPciInfo {
        device_pci_info: PciInfo {
            vendor_id: vendor_id.to_owned(),
            vendor: names.vendor_name.map(str::to_owned),
            model_id: device_id.to_owned(),
            model: names.device_name.map(str::to_owned),
        },
        subsystem_pci_info: PciInfo {
            vendor_id: subsystem_vendor_id.to_owned(),
            vendor: names.subvendor_name.map(str::to_owned),
            model_id: subsystem_device_id.to_owned(),
            model: names.subdevice_name.map(str::to_owned),
        },
    };

    let common = CommonControllerInfo {
        sysfs_path: path,
        pci_info,
        pci_slot_name,
        driver,
    };

    // Each arm either returns a working controller or logs the failure and falls
    // through to the fallback below; `common` is cloned so it is still available there.
    match common.driver.as_str() {
        "amdgpu" | "radeon" => match AmdGpuController::new_from_path(common.clone()) {
            Ok(controller) => return Ok(Box::new(controller)),
            Err(err) => error!("could not initialize AMD controller: {err:#}"),
        },
        "i915" | "xe" => match IntelGpuController::new(common.clone()) {
            Ok(controller) => return Ok(Box::new(controller)),
            Err(err) => error!("could not initialize Intel controller: {err:#}"),
        },
        "nvidia" => {
            // NVML is initialized at most once per `nvml` cell; a failed load is cached
            // as `None` so it is not retried for every Nvidia device.
            let nvml = nvml.get_or_init(|| match Nvml::init() {
                Ok(nvml) => {
                    info!("Nvidia management library loaded");
                    Some(Rc::new(nvml))
                }
                Err(err) => {
                    error!("could not load Nvidia management library: {err}, Nvidia controls will not be available");
                    None
                }
            });

            if let Some(nvml) = nvml {
                match NvidiaGpuController::new(common.clone(), nvml.clone()) {
                    Ok(controller) => {
                        return Ok(Box::new(controller));
                    }
                    Err(err) => error!("could not initialize Nvidia controller: {err:#}"),
                }
            }
        }
        _ => {
            warn!(
                "GPU at '{}' has unsupported driver '{}', functionality will be limited",
                common.sysfs_path.display(),
                common.driver,
            );
        }
    }

    // We use the AMD controller as the fallback even for non-AMD devices, it will at least
    // display basic device information from the SysFS
    Ok(Box::new(
        AmdGpuController::new_from_path(common)
            .context("Could not initialize fallback controller")?,
    ))
}
/// Parses the contents of a `uevent` file into a key/value map.
///
/// Every line is split at its first `=`; lines without `=` are ignored. If a key
/// occurs more than once, the value from the last occurrence is kept.
fn parse_uevent(data: &str) -> HashMap<&str, &str> {
    let mut entries = HashMap::new();
    for line in data.lines() {
        if let Some((key, value)) = line.split_once('=') {
            entries.insert(key, value);
        }
    }
    entries
}

View File

@ -3,14 +3,14 @@ use crate::{
server::vulkan::get_vulkan_info,
};
use super::{fan_control::FanCurve, FanControlHandle, GpuController};
use super::{fan_control::FanCurve, CommonControllerInfo, FanControlHandle, GpuController};
use amdgpu_sysfs::{gpu_handle::power_profile_mode::PowerProfileModesTable, hw_mon::Temperature};
use anyhow::{anyhow, Context};
use futures::future::LocalBoxFuture;
use lact_schema::{
ClocksInfo, ClocksTable, ClockspeedStats, DeviceInfo, DeviceStats, DrmInfo, DrmMemoryInfo,
FanControlMode, FanStats, GpuPciInfo, IntelDrmInfo, LinkInfo, NvidiaClockInfo,
NvidiaClocksTable, PmfwInfo, PowerState, PowerStates, PowerStats, VoltageStats, VramStats,
FanControlMode, FanStats, IntelDrmInfo, LinkInfo, NvidiaClockInfo, NvidiaClocksTable, PmfwInfo,
PowerState, PowerStates, PowerStats, VoltageStats, VramStats,
};
use nvml_wrapper::{
bitmasks::device::ThrottleReasons,
@ -21,7 +21,6 @@ use std::{
cell::{Cell, RefCell},
collections::HashMap,
fmt::Write,
path::{Path, PathBuf},
rc::Rc,
time::{Duration, Instant},
};
@ -30,9 +29,7 @@ use tracing::{debug, error, trace, warn};
pub struct NvidiaGpuController {
nvml: Rc<Nvml>,
pci_slot_id: String,
pci_info: GpuPciInfo,
sysfs_path: PathBuf,
common: CommonControllerInfo,
fan_control_handle: RefCell<Option<FanControlHandle>>,
last_applied_gpc_offset: Cell<Option<i32>>,
@ -40,26 +37,26 @@ pub struct NvidiaGpuController {
}
impl NvidiaGpuController {
pub fn new(
nvml: Rc<Nvml>,
pci_slot_id: String,
pci_info: GpuPciInfo,
sysfs_path: PathBuf,
) -> Self {
Self {
pub fn new(common: CommonControllerInfo, nvml: Rc<Nvml>) -> anyhow::Result<Self> {
nvml.device_by_pci_bus_id(common.pci_slot_name.as_str())
.with_context(|| {
format!(
"Could not get PCI device '{}' from NVML",
common.pci_slot_name
)
})?;
Ok(Self {
nvml,
pci_slot_id,
pci_info,
sysfs_path,
common,
fan_control_handle: RefCell::new(None),
last_applied_gpc_offset: Cell::new(None),
last_applied_mem_offset: Cell::new(None),
}
})
}
fn device(&self) -> Device<'_> {
self.nvml
.device_by_pci_bus_id(self.pci_slot_id.as_str())
.device_by_pci_bus_id(self.common.pci_slot_name.as_str())
.expect("Can no longer get device")
}
@ -90,7 +87,7 @@ impl NvidiaGpuController {
let task_notify = notify.clone();
let nvml = self.nvml.clone();
let pci_slot_id = self.pci_slot_id.clone();
let pci_slot_id = self.common.pci_slot_name.clone();
debug!("spawning new fan control task");
let handle = tokio::task::spawn_local(async move {
@ -248,37 +245,12 @@ impl NvidiaGpuController {
}
impl GpuController for NvidiaGpuController {
fn get_id(&self) -> anyhow::Result<String> {
let GpuPciInfo {
device_pci_info,
subsystem_pci_info,
} = &self.pci_info;
Ok(format!(
"{}:{}-{}:{}-{}",
device_pci_info.vendor_id,
device_pci_info.model_id,
subsystem_pci_info.vendor_id,
subsystem_pci_info.model_id,
self.pci_slot_id
))
}
fn get_pci_info(&self) -> Option<&GpuPciInfo> {
Some(&self.pci_info)
}
fn get_path(&self) -> &Path {
&self.sysfs_path
fn controller_info(&self) -> &CommonControllerInfo {
&self.common
}
fn get_info(&self) -> DeviceInfo {
let device = self.device();
let vulkan_info = match get_vulkan_info(
&self.pci_info.device_pci_info.vendor_id,
&self.pci_info.device_pci_info.model_id,
) {
let vulkan_info = match get_vulkan_info(&self.common.pci_info) {
Ok(info) => Some(info),
Err(err) => {
warn!("could not load vulkan info: {err}");
@ -286,8 +258,10 @@ impl GpuController for NvidiaGpuController {
}
};
let device = self.device();
DeviceInfo {
pci_info: Some(self.pci_info.clone()),
pci_info: Some(self.common.pci_info.clone()),
vulkan_info,
driver: format!(
"nvidia {}",
@ -351,10 +325,6 @@ impl GpuController for NvidiaGpuController {
}
}
fn get_pci_slot_name(&self) -> Option<String> {
Some(self.pci_slot_id.clone())
}
#[allow(
clippy::cast_precision_loss,
clippy::cast_possible_truncation,

View File

@ -5,10 +5,7 @@ use super::{
};
use crate::{
config::{self, default_fan_static_speed, Config, FanControlSettings, Profile},
server::{
gpu_controller::{AmdGpuController, IntelGpuController, NvidiaGpuController},
profiles,
},
server::{gpu_controller::init_controller, profiles},
};
use amdgpu_sysfs::gpu_handle::{
power_profile_mode::PowerProfileModesTable, PerformanceLevel, PowerLevelKind,
@ -22,12 +19,11 @@ use lact_schema::{
};
use libflate::gzip;
use nix::libc;
use nvml_wrapper::{error::NvmlError, Nvml};
use os_release::OS_RELEASE;
use pciid_parser::Database;
use serde_json::json;
use std::{
cell::{Cell, RefCell},
cell::{Cell, OnceCell, RefCell},
collections::{BTreeMap, HashMap},
env,
fs::{self, File, Permissions},
@ -125,7 +121,7 @@ impl<'a> Handler {
.expect("pci file name should be valid unicode");
if controllers.values().any(|controller| {
controller.get_pci_slot_name().as_ref() == Some(&slot_name)
controller.controller_info().pci_slot_name == slot_name
}) {
debug!("found intialized drm entry for device {:?}", device.path());
} else {
@ -192,7 +188,7 @@ impl<'a> Handler {
error!("could not apply existing config for gpu {id}: {err}");
}
} else {
info!("could not find GPU with id {id} defined in configuration");
warn!("could not find GPU with id {id} defined in configuration");
}
}
@ -337,8 +333,11 @@ impl<'a> Handler {
.iter()
.map(|(id, controller)| {
let name = controller
.get_pci_info()
.and_then(|pci_info| pci_info.device_pci_info.model.clone());
.controller_info()
.pci_info
.device_pci_info
.model
.clone();
DeviceListEntry {
id: id.to_owned(),
name,
@ -574,14 +573,14 @@ impl<'a> Handler {
}
for controller in self.gpu_controllers.values() {
let controller_path = controller.get_path();
let controller_path = &controller.controller_info().sysfs_path;
for device_file in SNAPSHOT_DEVICE_FILES {
let full_path = controller_path.join(device_file);
add_path_to_archive(&mut archive, &full_path)?;
}
let device_files = fs::read_dir(controller.get_path())
let device_files = fs::read_dir(controller_path)
.context("Could not read device dir")?
.flatten();
@ -591,16 +590,12 @@ impl<'a> Handler {
.iter()
.any(|prefix| entry_name.starts_with(prefix))
{
add_path_recursively(
&mut archive,
&device_entry.path(),
controller.get_path(),
)?;
add_path_recursively(&mut archive, &device_entry.path(), controller_path)?;
}
}
}
let card_path = controller.get_path().parent().unwrap();
let card_path = controller_path.parent().unwrap();
let card_files = fs::read_dir(card_path)
.context("Could not read device dir")?
.flatten();
@ -702,7 +697,7 @@ impl<'a> Handler {
.and_then(|config| config.gpus().ok()?.get(id));
let data = json!({
"pci_info": controller.get_pci_info(),
"pci_info": controller.controller_info().pci_info.clone(),
"info": controller.get_info(),
"stats": controller.get_stats(gpu_config),
"clocks_info": controller.get_clocks_info().ok(),
@ -924,19 +919,7 @@ fn load_controllers(base_path: &Path) -> anyhow::Result<BTreeMap<String, Box<dyn
}
});
#[cfg(test)]
let nvml: Option<Rc<Nvml>> = None;
#[cfg(not(test))]
let nvml = match Nvml::init() {
Ok(nvml) => {
info!("NVML initialized");
Some(Rc::new(nvml))
}
Err(err) => {
info!("Nvidia support disabled, {err}");
None
}
};
let nvml = OnceCell::new();
for entry in base_path
.read_dir()
@ -951,92 +934,24 @@ fn load_controllers(base_path: &Path) -> anyhow::Result<BTreeMap<String, Box<dyn
if name.starts_with("card") && !name.contains('-') {
trace!("trying gpu controller at {:?}", entry.path());
let device_path = entry.path().join("device");
match AmdGpuController::new_from_path(device_path, &pci_db) {
Ok(controller) => match controller.get_id() {
Ok(id) => {
let path = controller.get_path();
if matches!(controller.get_driver(), "xe" | "i915") {
match controller
.get_pci_info()
.zip(controller.get_pci_slot_name())
{
Some((pci_info, pci_slot_id)) => {
match IntelGpuController::new(
path.to_owned(),
controller.get_driver().to_owned(),
pci_slot_id,
pci_info.clone(),
) {
Ok(controller) => {
let id = controller.get_id().unwrap();
info!("initialized Intel controller {id} for path {path:?}");
controllers.insert(
id,
Box::new(controller) as Box<dyn GpuController>,
);
continue;
}
Err(err) => {
error!(
"could not initialize Intel controller: {err:#}"
);
}
}
}
None => {
error!("could not get PCI info for Intel GPU at {path:?}",);
}
}
}
match init_controller(device_path.clone(), &pci_db, &nvml) {
Ok(controller) => {
let info = controller.controller_info();
let id = info.build_id();
if let Some(nvml) = nvml.clone() {
if let Some(pci_slot_id) = controller.get_pci_slot_name() {
match nvml.device_by_pci_bus_id(pci_slot_id.as_str()) {
Ok(_) => {
let controller = NvidiaGpuController::new(
nvml,
pci_slot_id,
controller.get_pci_info().expect(
"Initialized NVML device without PCI info somehow",
).clone(),
path.to_owned(),
);
match controller.get_id() {
Ok(id) => {
info!("initialized Nvidia GPU controller {id} for path {path:?}");
controllers.insert(
id,
Box::new(controller) as Box<dyn GpuController>,
);
continue;
}
Err(err) => {
error!("could not get Nvidia GPU id: {err}");
}
}
}
Err(NvmlError::NotFound) => {
debug!("PCI slot {pci_slot_id} not found in NVML");
}
Err(err) => {
error!(
"could not initialize Nvidia GPU at {path:?}: {err}"
);
}
}
}
}
info!(
"initialized {} controller for GPU {id} at '{}'",
info.driver,
info.sysfs_path.display()
);
info!("initialized GPU controller {id} for path {path:?}");
controllers.insert(id, Box::new(controller) as Box<dyn GpuController>);
}
Err(err) => warn!("could not initialize controller: {err:#}"),
},
Err(error) => {
warn!(
"failed to initialize controller at {:?}, {error}",
entry.path()
controllers.insert(id, controller);
}
Err(err) => {
error!(
"could not initialize GPU controller at '{}': {err:#}",
device_path.display()
);
}
}

View File

@ -1,5 +1,5 @@
use anyhow::{anyhow, Context};
use lact_schema::{VulkanDriverInfo, VulkanInfo};
use lact_schema::{GpuPciInfo, VulkanDriverInfo, VulkanInfo};
use std::borrow::Cow;
use tracing::trace;
use vulkano::{
@ -8,13 +8,13 @@ use vulkano::{
};
#[cfg_attr(test, allow(unreachable_code, unused_variables))]
pub fn get_vulkan_info<'a>(vendor_id: &'a str, device_id: &'a str) -> anyhow::Result<VulkanInfo> {
pub fn get_vulkan_info(pci_info: &GpuPciInfo) -> anyhow::Result<VulkanInfo> {
#[cfg(test)]
return Err(anyhow!("Not allowed in tests"));
trace!("Reading vulkan info");
let vendor_id = u32::from_str_radix(vendor_id, 16)?;
let device_id = u32::from_str_radix(device_id, 16)?;
let vendor_id = u32::from_str_radix(&pci_info.device_pci_info.vendor_id, 16)?;
let device_id = u32::from_str_radix(&pci_info.device_pci_info.model_id, 16)?;
let library = VulkanLibrary::new().context("Could not create vulkan library")?;
let instance = Instance::new(library, InstanceCreateInfo::default())