feat: improved error reporting, RX 7900 Fixes (#279)

* refactor: Add bunch of .context calls to get more detailed error messages

* refactor: Use serde-error to serialize error with proper backtrace in client/server boundary

* fix: Workaround weird driver bug with RX 7900 XTX and power limit
This commit is contained in:
Alik Aslanyan 2024-03-03 23:47:25 +04:00 committed by GitHub
parent f55db8116a
commit 5a78c72711
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 107 additions and 26 deletions

11
Cargo.lock generated
View File

@ -1387,9 +1387,11 @@ name = "lact-schema"
version = "0.5.4"
dependencies = [
"amdgpu-sysfs",
"anyhow",
"clap",
"indexmap",
"serde",
"serde-error",
"serde_json",
"serde_with",
"vergen",
@ -2062,6 +2064,15 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "serde-error"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e988182713aeed6a619a88bca186f6d6407483485ffe44c869ee264f8eabd13f"
dependencies = [
"serde",
]
[[package]]
name = "serde_derive"
version = "1.0.195"

View File

@ -218,7 +218,10 @@ impl<'a, T: Deserialize<'a>> ResponseBuffer<T> {
.context("Could not deserialize response from daemon")?;
match response {
Response::Ok(data) => Ok(data),
Response::Error(err) => Err(anyhow!("Got error from daemon: {err}")),
Response::Error(err) => {
Err(anyhow::Error::new(err)
.context("Got error from daemon, end of client boundary"))
}
}
}
}

View File

@ -639,7 +639,13 @@ impl GpuController {
}
}
hw_mon.set_power_cap(cap)?;
// Due to possible driver bug, RX 7900 XTX really doesn't like when we set the same value again.
// But, also in general we want to avoid setting same value twice
if Ok(cap) != hw_mon.get_power_cap() {
hw_mon
.set_power_cap(cap)
.with_context(|| format!("Failed to set power cap: {cap}"))?;
}
// Reapply old power level
if let Some(level) = original_performance_level {
@ -649,15 +655,24 @@ impl GpuController {
}
} else if let Ok(hw_mon) = self.first_hw_mon() {
if let Ok(default_cap) = hw_mon.get_power_cap_default() {
hw_mon.set_power_cap(default_cap)?;
// Due to possible driver bug, RX 7900 XTX really doesn't like when we set the same value again.
// But, also in general we want to avoid setting same value twice
if Ok(default_cap) != hw_mon.get_power_cap() {
hw_mon.set_power_cap(default_cap).with_context(|| {
format!("Failed to set power cap to default cap: {default_cap}")
})?;
}
}
}
if let Some(level) = config.performance_level {
self.handle.set_power_force_performance_level(level)?;
self.handle
.set_power_force_performance_level(level)
.context("Failed to set power performance level")?;
} else if self.handle.get_power_force_performance_level().is_ok() {
self.handle
.set_power_force_performance_level(PerformanceLevel::Auto)?;
.set_power_force_performance_level(PerformanceLevel::Auto)
.context("Failed to set performance level to PerformanceLevel::Auto")?;
}
if let Some(mode_index) = config.power_profile_mode_index {
@ -667,17 +682,30 @@ impl GpuController {
));
}
self.handle.set_active_power_profile_mode(mode_index)?;
self.handle
.set_active_power_profile_mode(mode_index)
.context("Failed to set active power profile mode")?;
}
// Reset the clocks table in case the settings get reverted back to not having a clocks value configured
self.handle.reset_clocks_table().ok();
if config.is_core_clocks_used() {
let mut table = self.handle.get_clocks_table()?;
config.clocks_configuration.apply_to_table(&mut table)?;
let mut table = self
.handle
.get_clocks_table()
.context("Failed to get clocks table")?;
config
.clocks_configuration
.apply_to_table(&mut table)
.context("Failed to apply clocks configuration to table")?;
debug!("writing clocks commands: {:#?}", table.get_commands()?);
debug!(
"writing clocks commands: {:#?}",
table
.get_commands()
.context("Failed to get table commands")?
);
self.handle
.set_clocks_table(&table)
@ -701,7 +729,9 @@ impl GpuController {
if let Some(ref settings) = config.fan_control_settings {
match settings.mode {
lact_schema::FanControlMode::Static => {
self.set_static_fan_control(settings.static_speed).await?;
self.set_static_fan_control(settings.static_speed)
.await
.context("Failed to set static fan control")?;
}
lact_schema::FanControlMode::Curve => {
if settings.curve.0.is_empty() {
@ -714,7 +744,8 @@ impl GpuController {
settings.temperature_key.clone(),
interval,
)
.await?;
.await
.context("Failed to set curve fan control")?;
}
}
} else {
@ -723,7 +754,9 @@ impl GpuController {
));
}
} else {
self.stop_fan_control(true).await?;
self.stop_fan_control(true)
.await
.context("Failed to stop fan control")?;
let pmfw = &config.pmfw_options;
if let Some(acoustic_limit) = pmfw.acoustic_limit {

View File

@ -164,9 +164,11 @@ impl<'a> Handler {
Err(apply_err) => {
error!("could not apply settings: {apply_err:?}");
match controller.apply_config(&gpu_config).await {
Ok(()) => Err(apply_err.context("Could not apply settings")),
Err(err) => Err(anyhow!("Could not apply settings, and could not reset to default settings: {err:?}")),
}
Ok(()) => Err(apply_err.context("Could not apply settings")),
Err(err) => Err(apply_err.context(err.context(
"Could not apply settings, and could not reset to default settings",
))),
}
}
}
}
@ -339,6 +341,7 @@ impl<'a> Handler {
config.pmfw_options = pmfw;
})
.await
.context("Failed to edit GPU config")
}
pub async fn reset_pmfw(&self, id: &str) -> anyhow::Result<u64> {
@ -349,6 +352,7 @@ impl<'a> Handler {
config.pmfw_options = PmfwOptions::default();
})
.await
.context("Failed to edit GPU config and reset pmfw")
}
pub async fn set_power_cap(&'a self, id: &str, maybe_cap: Option<f64>) -> anyhow::Result<u64> {
@ -356,6 +360,7 @@ impl<'a> Handler {
gpu_config.power_cap = maybe_cap;
})
.await
.context("Failed to edit GPU config and set power cap")
}
pub fn get_power_states(&self, id: &str) -> anyhow::Result<PowerStates> {
@ -382,6 +387,7 @@ impl<'a> Handler {
}
})
.await
.context("Failed to edit GPU config and set performance level")
}
pub async fn set_clocks_value(
@ -397,6 +403,7 @@ impl<'a> Handler {
gpu_config.apply_clocks_command(&command);
})
.await
.context("Failed to edit GPU config and set clocks value")
}
pub async fn batch_set_clocks_value(
@ -410,6 +417,7 @@ impl<'a> Handler {
}
})
.await
.context("Failed to edit GPU config and batch set clocks")
}
pub fn get_power_profile_modes(&self, id: &str) -> anyhow::Result<PowerProfileModesTable> {
@ -429,6 +437,7 @@ impl<'a> Handler {
gpu_config.power_profile_mode_index = index;
})
.await
.context("Failed to edit GPU config and set power profile mode")
}
pub async fn set_enabled_power_states(
@ -441,6 +450,7 @@ impl<'a> Handler {
gpu.power_states.insert(kind, enabled_states);
})
.await
.context("Failed to edit GPU config and set enabled power states")
}
pub fn generate_snapshot(&self) -> anyhow::Result<String> {

View File

@ -58,11 +58,11 @@ pub async fn handle_stream(stream: UnixStream, handler: Handler) -> anyhow::Resu
let response = match maybe_request {
Ok(request) => match handle_request(request, &handler).await {
Ok(response) => response,
Err(error) => serde_json::to_vec(&Response::<()>::Error(format!("{error:?}")))?,
Err(error) => serde_json::to_vec(&Response::<()>::from(error))?,
},
Err(error) => serde_json::to_vec(&Response::<()>::Error(format!(
"Failed to deserialize request: {error}"
)))?,
Err(error) => serde_json::to_vec(&Response::<()>::from(
anyhow::Error::new(error).context("Failed to deserialize"),
))?,
};
stream.write_all(&response).await?;

View File

@ -146,8 +146,10 @@ impl App {
app.apply_revealer.connect_apply_button_clicked(
clone!(@strong app, @strong current_gpu_id => move || {
glib::idle_add_local_once(clone!(@strong app, @strong current_gpu_id => move || {
if let Err(err) = app.apply_settings(current_gpu_id.clone()) {
show_error(&app.window, err.context("Could not apply settings"));
if let Err(err) = app.apply_settings(current_gpu_id.clone())
.context("Could not apply settings (GUI)")
{
show_error(&app.window, err);
glib::idle_add_local_once(clone!(@strong app, @strong current_gpu_id => move || {
let gpu_id = current_gpu_id.borrow().clone();

View File

@ -10,6 +10,8 @@ args = ["clap"]
amdgpu-sysfs = { workspace = true }
serde = { workspace = true }
serde_with = { workspace = true }
serde-error = "=0.1.2"
anyhow = { workspace = true }
indexmap = { version = "*", features = ["serde"] }
clap = { version = "4.4.18", features = ["derive"], optional = true }

View File

@ -1,8 +1,14 @@
use serde::{Deserialize, Serialize};
#[derive(Serialize, Deserialize, Debug, Clone)]
#[derive(Serialize, Deserialize, Debug)]
#[serde(tag = "status", content = "data", rename_all = "snake_case")]
pub enum Response<T> {
Ok(T),
Error(String),
Error(serde_error::Error),
}
impl<T> From<anyhow::Error> for Response<T> {
fn from(value: anyhow::Error) -> Self {
Response::Error(serde_error::Error::new(&*value))
}
}

View File

@ -1,4 +1,5 @@
use crate::{Pong, Request, Response};
use anyhow::anyhow;
use serde_json::json;
#[test]
@ -35,11 +36,24 @@ fn controllers_response() {
#[test]
fn error_response() {
let expected_response = json!({
"status": "error",
"data": "my super error"
"data": {
"description": "third deeper context",
"source": {
"description": "second context",
"source": {
"description": "first error",
"source": null
}
}
},
"status": "error"
});
let response = Response::<()>::Error("my super error".to_owned());
let error = anyhow!("first error")
.context("second context")
.context(anyhow!("third deeper context"));
let response = Response::<()>::from(error);
assert_eq!(serde_json::to_value(response).unwrap(), expected_response);
}