feat: add prometheus exporter

This commit is contained in:
Ilya Zlobintsev 2025-02-15 18:50:51 +02:00
parent d26e8bf869
commit bef89e2923
5 changed files with 269 additions and 2 deletions

53
Cargo.lock generated
View File

@ -132,6 +132,12 @@ version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
[[package]]
name = "ascii"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16"
[[package]]
name = "ash"
version = "0.37.3+1.3.251"
@ -340,6 +346,12 @@ dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "chunked_transfer"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e4de3bc4ea267985becf712dc6d9eed8b04c953b3fcfb339ebc87acd9804901"
[[package]]
name = "clang-sys"
version = "1.8.1"
@ -1153,6 +1165,12 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "httpdate"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
[[package]]
name = "iana-time-zone"
version = "0.1.61"
@ -1338,11 +1356,13 @@ dependencies = [
"os-release",
"pciid-parser",
"pretty_assertions",
"prometheus",
"serde",
"serde_json",
"serde_with",
"serde_yaml",
"tar",
"tiny_http",
"tokio",
"tracing",
"tracing-subscriber",
@ -1910,6 +1930,27 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "prometheus"
version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
dependencies = [
"cfg-if",
"fnv",
"lazy_static",
"memchr",
"parking_lot",
"protobuf",
"thiserror",
]
[[package]]
name = "protobuf"
version = "2.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
[[package]]
name = "quote"
version = "1.0.38"
@ -2437,6 +2478,18 @@ dependencies = [
"time-core",
]
[[package]]
name = "tiny_http"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "389915df6413a2e74fb181895f933386023c71110878cd0825588928e64cdc82"
dependencies = [
"ascii",
"chunked_transfer",
"httpdate",
"log",
]
[[package]]
name = "tokio"
version = "1.43.0"

View File

@ -46,6 +46,8 @@ os-release = "0.1.0"
notify = { version = "8.0.0", default-features = false }
copes = { git = "https://gitlab.com/corectrl/copes" }
libloading = "0.8.6"
tiny_http = "0.12.0"
prometheus = "0.13.4"
[dev-dependencies]
pretty_assertions = { workspace = true }

View File

@ -67,6 +67,7 @@ pub struct Daemon {
#[serde(default)]
pub disable_clocks_cleanup: bool,
pub tcp_listen_address: Option<String>,
pub exporter_listen_address: Option<String>,
}
impl Default for Daemon {
@ -76,6 +77,7 @@ impl Default for Daemon {
admin_groups: DEFAULT_ADMIN_GROUPS.map(str::to_owned).to_vec(),
disable_clocks_cleanup: false,
tcp_listen_address: None,
exporter_listen_address: None,
}
}
}

View File

@ -1,3 +1,4 @@
mod exporter;
pub mod gpu_controller;
pub mod handler;
mod profiles;
@ -6,11 +7,11 @@ mod vulkan;
use self::handler::Handler;
use crate::{config::Config, socket};
use anyhow::Context;
use anyhow::{anyhow, Context};
use futures::future::join_all;
use lact_schema::{Pong, Request, Response};
use serde::Serialize;
use std::fmt::Debug;
use std::{fmt::Debug, net::SocketAddr};
use tokio::{
io::{AsyncBufReadExt, AsyncRead, AsyncWrite, AsyncWriteExt, BufReader},
net::{TcpListener, UnixListener},
@ -38,8 +39,29 @@ impl Server {
None
};
let exporter_server = if let Some(exporter_address) = &config.daemon.exporter_listen_address
{
let addr: SocketAddr = exporter_address
.parse()
.context("Invalid exporter address")?;
let server = tiny_http::Server::http(addr)
.map_err(|err| anyhow!("Could not start metrics exporter: {err}"))?;
info!("Prometheus metrics exporter listening on {exporter_address}");
Some(server)
} else {
info!("Prometheus metrics exporter disabled");
None
};
let handler = Handler::new(config).await?;
if let Some(server) = exporter_server {
let handler = handler.clone();
tokio::task::spawn_local(async move { exporter::run(server, &handler).await });
}
Ok(Self {
handler,
unix_listener,

View File

@ -0,0 +1,188 @@
use prometheus::{
register_gauge_with_registry, register_int_gauge_with_registry, Registry, TextEncoder,
};
use std::collections::HashMap;
use tiny_http::Response;
use tokio::sync::{mpsc, oneshot};
use tracing::{error, warn};
use super::handler::Handler;
pub async fn run(server: tiny_http::Server, handler: &Handler) {
let (tx, mut rx) = mpsc::channel(1);
// We listen to http requests in the background on a blocking task, and collect the metrics as needed from the main one
tokio::task::spawn_blocking(move || loop {
match server.recv() {
Ok(req) => {
let (response_tx, response_rx) = oneshot::channel();
tx.blocking_send(response_tx).unwrap();
let response = response_rx.blocking_recv().unwrap();
if let Err(err) = req.respond(response) {
warn!("could not write metrics response: {err}");
}
}
Err(err) => {
error!("metrics exporter request error: {err}");
}
}
});
while let Some(response_tx) = rx.recv().await {
let registry = Registry::new();
collect_metrics(handler, &registry).await;
let metric_families = registry.gather();
let encoder = TextEncoder::new();
let output = encoder
.encode_to_string(&metric_families)
.expect("Failed to encode metrics");
let response = Response::from_string(output);
let _ = response_tx.send(response);
}
}
async fn collect_metrics(handler: &Handler, registry: &Registry) {
let gpu_controllers = handler.gpu_controllers.read().await;
let config = handler.config.read().await;
for (id, controller) in gpu_controllers.iter() {
let gpu_config = config.gpus().ok().and_then(|gpus| gpus.get(id));
let info = controller.get_info();
let stats = controller.get_stats(gpu_config);
let mut device_name = String::new();
if let Some(pci_info) = &info.pci_info {
if let Some(vendor) = &pci_info.device_pci_info.vendor {
device_name.push_str(vendor);
}
if let Some(model) = &pci_info.device_pci_info.model {
if !device_name.is_empty() {
device_name.push(' ');
}
device_name.push_str(model);
}
}
macro_rules! gpu_opts {
(
$name: expr,
$help: expr,
$($key:expr => $value:expr),* $(,)*
) => {{
let opts = prometheus::Opts::new($name, $help);
let labels = HashMap::from_iter([
("gpu_id".to_owned(), id.clone()),
("gpu_name".to_owned(), device_name.clone()),
$(
($key.to_string(), $value.to_string()),
)*
]);
opts.const_labels(labels)
}};
}
macro_rules! gpu_gauge {
(
$name: expr,
$help: expr,
$value: expr,
$($key:expr => $label:expr),* $(,)*
) => {{
#[allow(clippy::cast_precision_loss)]
register_gauge_with_registry!(
gpu_opts! {
$name,
$help,
$($key => $label)*
},
registry
)
.unwrap()
.set($value.into());
}}
}
register_int_gauge_with_registry!(
gpu_opts! {
"lact_gpu_info",
"A static gauge containing basic GPU info in the labels",
"driver" => info.driver,
"family" => info.drm_info.and_then(|drm| drm.family_name).unwrap_or_default(),
},
registry
)
.unwrap();
if let Some(usage) = stats.busy_percent {
gpu_gauge! {
"lact_gpu_usage",
"GPU usage percentage",
usage,
};
}
if let Some(power_current) = stats
.power
.average
.filter(|value| *value != 0.0)
.or(stats.power.current.filter(|value| *value != 0.0))
{
gpu_gauge! {
"lact_gpu_power_current",
"Current power consumption",
power_current,
};
}
if let Some(power_cap) = stats.power.cap_current {
gpu_gauge! {
"lact_gpu_power_cap",
"Power consumption cap",
power_cap,
};
}
for (temp_name, temp) in stats.temps {
if let Some(value) = temp.current {
gpu_gauge!(
"lact_gpu_temperature",
"Current temperature",
value,
"sensor" => temp_name,
);
}
}
if let Some(gpu_clock) = stats.clockspeed.gpu_clockspeed {
gpu_gauge!(
"lact_gpu_frequency",
"Current frequency",
gpu_clock as f64,
"type" => "GPU",
);
}
if let Some(vram_clock) = stats.clockspeed.vram_clockspeed {
gpu_gauge!(
"lact_gpu_frequency",
"Current frequency",
vram_clock as f64,
"type" => "VRAM",
);
}
if let Some(fan_pwm) = stats.fan.pwm_current {
gpu_gauge!("lact_gpu_fan_pwm", "Fan speed (in PWM)", fan_pwm,);
}
if let Some(fan_rpm) = stats.fan.speed_current {
gpu_gauge!("lact_gpu_fan_rpm", "Fan speed (in RPM)", fan_rpm,);
}
}
}