diff --git a/Cargo.lock b/Cargo.lock
index c9d0833..b19e007 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -132,6 +132,12 @@ version = "1.0.95"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
 
+[[package]]
+name = "ascii"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16"
+
 [[package]]
 name = "ash"
 version = "0.37.3+1.3.251"
@@ -340,6 +346,12 @@ dependencies = [
  "windows-targets 0.52.6",
 ]
 
+[[package]]
+name = "chunked_transfer"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e4de3bc4ea267985becf712dc6d9eed8b04c953b3fcfb339ebc87acd9804901"
+
 [[package]]
 name = "clang-sys"
 version = "1.8.1"
@@ -1153,6 +1165,12 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "httpdate"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
+
 [[package]]
 name = "iana-time-zone"
 version = "0.1.61"
@@ -1338,11 +1356,13 @@ dependencies = [
  "os-release",
  "pciid-parser",
  "pretty_assertions",
+ "prometheus",
  "serde",
  "serde_json",
  "serde_with",
  "serde_yaml",
  "tar",
+ "tiny_http",
  "tokio",
  "tracing",
  "tracing-subscriber",
@@ -1910,6 +1930,27 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "prometheus"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
+dependencies = [
+ "cfg-if",
+ "fnv",
+ "lazy_static",
+ "memchr",
+ "parking_lot",
+ "protobuf",
+ "thiserror",
+]
+
+[[package]]
+name = "protobuf"
+version = "2.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
+
 [[package]]
 name = "quote"
 version = "1.0.38"
@@ -2437,6 +2478,18 @@ dependencies = [
  "time-core",
 ]
 
+[[package]]
+name = "tiny_http"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "389915df6413a2e74fb181895f933386023c71110878cd0825588928e64cdc82"
+dependencies = [
+ "ascii",
+ "chunked_transfer",
+ "httpdate",
+ "log",
+]
+
 [[package]]
 name = "tokio"
 version = "1.43.0"
diff --git a/lact-daemon/Cargo.toml b/lact-daemon/Cargo.toml
index fd7dad7..708c482 100644
--- a/lact-daemon/Cargo.toml
+++ b/lact-daemon/Cargo.toml
@@ -46,6 +46,8 @@ os-release = "0.1.0"
 notify = { version = "8.0.0", default-features = false }
 copes = { git = "https://gitlab.com/corectrl/copes" }
 libloading = "0.8.6"
+tiny_http = "0.12.0"
+prometheus = "0.13.4"
 
 [dev-dependencies]
 pretty_assertions = { workspace = true }
diff --git a/lact-daemon/src/config.rs b/lact-daemon/src/config.rs
index 9b1f4ab..90f9791 100644
--- a/lact-daemon/src/config.rs
+++ b/lact-daemon/src/config.rs
@@ -67,6 +67,7 @@ pub struct Daemon {
     #[serde(default)]
     pub disable_clocks_cleanup: bool,
     pub tcp_listen_address: Option<String>,
+    pub exporter_listen_address: Option<String>,
 }
 
 impl Default for Daemon {
@@ -76,6 +77,7 @@ impl Default for Daemon {
             admin_groups: DEFAULT_ADMIN_GROUPS.map(str::to_owned).to_vec(),
             disable_clocks_cleanup: false,
             tcp_listen_address: None,
+            exporter_listen_address: None,
         }
     }
 }
diff --git a/lact-daemon/src/server.rs b/lact-daemon/src/server.rs
index 22d097b..6796906 100644
--- a/lact-daemon/src/server.rs
+++ b/lact-daemon/src/server.rs
@@ -1,3 +1,4 @@
+mod exporter;
 pub mod gpu_controller;
 pub mod handler;
 mod profiles;
@@ -6,11 +7,11 @@ mod vulkan;
 
 use self::handler::Handler;
 use crate::{config::Config, socket};
-use anyhow::Context;
+use anyhow::{anyhow, Context};
 use futures::future::join_all;
 use lact_schema::{Pong, Request, Response};
 use serde::Serialize;
-use std::fmt::Debug;
+use std::{fmt::Debug, net::SocketAddr};
 use tokio::{
     io::{AsyncBufReadExt, AsyncRead, AsyncWrite, AsyncWriteExt, BufReader},
     net::{TcpListener, UnixListener},
@@ -38,8 +39,29 @@ impl Server {
             None
         };
 
+        let exporter_server = if let Some(exporter_address) = &config.daemon.exporter_listen_address
+        {
+            let addr: SocketAddr = exporter_address
+                .parse()
+                .context("Invalid exporter address")?;
+
+            let server = tiny_http::Server::http(addr)
+                .map_err(|err| anyhow!("Could not start metrics exporter: {err}"))?;
+            info!("Prometheus metrics exporter listening on {exporter_address}");
+
+            Some(server)
+        } else {
+            info!("Prometheus metrics exporter disabled");
+            None
+        };
+
         let handler = Handler::new(config).await?;
 
+        if let Some(server) = exporter_server {
+            let handler = handler.clone();
+            tokio::task::spawn_local(async move { exporter::run(server, &handler).await });
+        }
+
         Ok(Self {
             handler,
             unix_listener,
diff --git a/lact-daemon/src/server/exporter.rs b/lact-daemon/src/server/exporter.rs
new file mode 100644
index 0000000..d1a7a17
--- /dev/null
+++ b/lact-daemon/src/server/exporter.rs
@@ -0,0 +1,213 @@
+//! Prometheus metrics exporter for the daemon, served over a minimal HTTP server.
+
+use prometheus::{
+    register_gauge_with_registry, register_int_gauge_with_registry, Registry, TextEncoder,
+};
+use std::collections::HashMap;
+use tiny_http::Response;
+use tokio::sync::{mpsc, oneshot};
+use tracing::{error, warn};
+
+use super::handler::Handler;
+
+/// Answers every HTTP request received by `server` with the current metrics in text format.
+///
+/// Requests are accepted on a blocking task. For each request that task asks this future (via a
+/// channel) for a freshly collected snapshot and writes it back to the client.
+/// Returns once the listening task has stopped and dropped its end of the channel.
+pub async fn run(server: tiny_http::Server, handler: &Handler) {
+    let (tx, mut rx) = mpsc::channel(1);
+
+    // We listen to http requests in the background on a blocking task, and collect the metrics as needed from the main one
+    tokio::task::spawn_blocking(move || loop {
+        match server.recv() {
+            Ok(req) => {
+                let (response_tx, response_rx) = oneshot::channel();
+                if tx.blocking_send(response_tx).is_err() {
+                    // The collecting side is gone, so no request can be answered anymore.
+                    // Dropping `req` here lets tiny_http reply with an error on its own.
+                    error!("metrics collector stopped, shutting down the metrics exporter");
+                    break;
+                }
+
+                // A dropped sender means the snapshot was never produced; report that to the
+                // client instead of panicking the listener task.
+                let response = response_rx.blocking_recv().unwrap_or_else(|_| {
+                    Response::from_string("Could not collect metrics").with_status_code(500)
+                });
+                if let Err(err) = req.respond(response) {
+                    warn!("could not write metrics response: {err}");
+                }
+            }
+            Err(err) => {
+                error!("metrics exporter request error: {err}");
+            }
+        }
+    });
+
+    while let Some(response_tx) = rx.recv().await {
+        let registry = Registry::new();
+        collect_metrics(handler, &registry).await;
+        let metric_families = registry.gather();
+
+        let response = match TextEncoder::new().encode_to_string(&metric_families) {
+            Ok(output) => Response::from_string(output),
+            Err(err) => {
+                error!("could not encode metrics: {err}");
+                Response::from_string("Could not encode metrics").with_status_code(500)
+            }
+        };
+        // Sending only fails if the listener is no longer waiting, nothing more to do then
+        let _ = response_tx.send(response);
+    }
+}
+
+/// Registers one set of gauges per GPU controller in `registry`.
+///
+/// Every metric carries the `gpu_id` and `gpu_name` constant labels; values that the
+/// controller does not report are left out.
+async fn collect_metrics(handler: &Handler, registry: &Registry) {
+    let gpu_controllers = handler.gpu_controllers.read().await;
+    let config = handler.config.read().await;
+
+    for (id, controller) in gpu_controllers.iter() {
+        let gpu_config = config.gpus().ok().and_then(|gpus| gpus.get(id));
+
+        let info = controller.get_info();
+        let stats = controller.get_stats(gpu_config);
+
+        let mut device_name = String::new();
+        if let Some(pci_info) = &info.pci_info {
+            if let Some(vendor) = &pci_info.device_pci_info.vendor {
+                device_name.push_str(vendor);
+            }
+            if let Some(model) = &pci_info.device_pci_info.model {
+                if !device_name.is_empty() {
+                    device_name.push(' ');
+                }
+                device_name.push_str(model);
+            }
+        }
+
+        // Builds the metric options with the common GPU labels plus any extra `key => value` pairs
+        macro_rules! gpu_opts {
+            (
+                $name: expr,
+                $help: expr,
+                $($key:expr => $value:expr),* $(,)*
+            ) => {{
+                let opts = prometheus::Opts::new($name, $help);
+
+                let labels = HashMap::from_iter([
+                    ("gpu_id".to_owned(), id.clone()),
+                    ("gpu_name".to_owned(), device_name.clone()),
+                    $(
+                        ($key.to_string(), $value.to_string()),
+                    )*
+                ]);
+                opts.const_labels(labels)
+            }};
+        }
+
+        // Registers a float gauge for this GPU and sets it to `$value`
+        macro_rules! gpu_gauge {
+            (
+                $name: expr,
+                $help: expr,
+                $value: expr,
+                $($key:expr => $label:expr),* $(,)*
+            ) => {{
+                // Callers pass integer readings cast to f64; losing precision on huge values is acceptable here
+                #[allow(clippy::cast_precision_loss)]
+                register_gauge_with_registry!(
+                    gpu_opts! {
+                        $name,
+                        $help,
+                        // The separator has to be forwarded, otherwise multiple labels do not parse
+                        $($key => $label),*
+                    },
+                    registry
+                )
+                .unwrap()
+                .set($value.into());
+            }}
+        }
+
+        register_int_gauge_with_registry!(
+            gpu_opts! {
+                "lact_gpu_info",
+                "A static gauge containing basic GPU info in the labels",
+                "driver" => info.driver,
+                "family" => info.drm_info.and_then(|drm| drm.family_name).unwrap_or_default(),
+            },
+            registry
+        )
+        .unwrap();
+
+        if let Some(usage) = stats.busy_percent {
+            gpu_gauge! {
+                "lact_gpu_usage",
+                "GPU usage percentage",
+                usage,
+            };
+        }
+
+        if let Some(power_current) = stats
+            .power
+            .average
+            .filter(|value| *value != 0.0)
+            .or(stats.power.current.filter(|value| *value != 0.0))
+        {
+            gpu_gauge! {
+                "lact_gpu_power_current",
+                "Current power consumption",
+                power_current,
+            };
+        }
+
+        if let Some(power_cap) = stats.power.cap_current {
+            gpu_gauge! {
+                "lact_gpu_power_cap",
+                "Power consumption cap",
+                power_cap,
+            };
+        }
+
+        for (temp_name, temp) in stats.temps {
+            if let Some(value) = temp.current {
+                gpu_gauge!(
+                    "lact_gpu_temperature",
+                    "Current temperature",
+                    value,
+                    "sensor" => temp_name,
+                );
+            }
+        }
+
+        if let Some(gpu_clock) = stats.clockspeed.gpu_clockspeed {
+            gpu_gauge!(
+                "lact_gpu_frequency",
+                "Current frequency",
+                gpu_clock as f64,
+                "type" => "GPU",
+            );
+        }
+
+        if let Some(vram_clock) = stats.clockspeed.vram_clockspeed {
+            gpu_gauge!(
+                "lact_gpu_frequency",
+                "Current frequency",
+                vram_clock as f64,
+                "type" => "VRAM",
+            );
+        }
+
+        if let Some(fan_pwm) = stats.fan.pwm_current {
+            gpu_gauge!("lact_gpu_fan_pwm", "Fan speed (in PWM)", fan_pwm,);
+        }
+
+        if let Some(fan_rpm) = stats.fan.speed_current {
+            gpu_gauge!("lact_gpu_fan_rpm", "Fan speed (in RPM)", fan_rpm,);
+        }
+    }
+}