mirror of
https://github.com/ilya-zlobintsev/LACT.git
synced 2025-02-25 18:55:26 -06:00
feat: add prometheus exporter
This commit is contained in:
parent
d26e8bf869
commit
bef89e2923
53
Cargo.lock
generated
53
Cargo.lock
generated
@ -132,6 +132,12 @@ version = "1.0.95"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
|
||||
|
||||
[[package]]
|
||||
name = "ascii"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16"
|
||||
|
||||
[[package]]
|
||||
name = "ash"
|
||||
version = "0.37.3+1.3.251"
|
||||
@ -340,6 +346,12 @@ dependencies = [
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chunked_transfer"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e4de3bc4ea267985becf712dc6d9eed8b04c953b3fcfb339ebc87acd9804901"
|
||||
|
||||
[[package]]
|
||||
name = "clang-sys"
|
||||
version = "1.8.1"
|
||||
@ -1153,6 +1165,12 @@ dependencies = [
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "httpdate"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone"
|
||||
version = "0.1.61"
|
||||
@ -1338,11 +1356,13 @@ dependencies = [
|
||||
"os-release",
|
||||
"pciid-parser",
|
||||
"pretty_assertions",
|
||||
"prometheus",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"serde_yaml",
|
||||
"tar",
|
||||
"tiny_http",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
@ -1910,6 +1930,27 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prometheus"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"fnv",
|
||||
"lazy_static",
|
||||
"memchr",
|
||||
"parking_lot",
|
||||
"protobuf",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf"
|
||||
version = "2.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.38"
|
||||
@ -2437,6 +2478,18 @@ dependencies = [
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tiny_http"
|
||||
version = "0.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "389915df6413a2e74fb181895f933386023c71110878cd0825588928e64cdc82"
|
||||
dependencies = [
|
||||
"ascii",
|
||||
"chunked_transfer",
|
||||
"httpdate",
|
||||
"log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.43.0"
|
||||
|
@ -46,6 +46,8 @@ os-release = "0.1.0"
|
||||
notify = { version = "8.0.0", default-features = false }
|
||||
copes = { git = "https://gitlab.com/corectrl/copes" }
|
||||
libloading = "0.8.6"
|
||||
tiny_http = "0.12.0"
|
||||
prometheus = "0.13.4"
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions = { workspace = true }
|
||||
|
@ -67,6 +67,7 @@ pub struct Daemon {
|
||||
#[serde(default)]
|
||||
pub disable_clocks_cleanup: bool,
|
||||
pub tcp_listen_address: Option<String>,
|
||||
pub exporter_listen_address: Option<String>,
|
||||
}
|
||||
|
||||
impl Default for Daemon {
|
||||
@ -76,6 +77,7 @@ impl Default for Daemon {
|
||||
admin_groups: DEFAULT_ADMIN_GROUPS.map(str::to_owned).to_vec(),
|
||||
disable_clocks_cleanup: false,
|
||||
tcp_listen_address: None,
|
||||
exporter_listen_address: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
mod exporter;
|
||||
pub mod gpu_controller;
|
||||
pub mod handler;
|
||||
mod profiles;
|
||||
@ -6,11 +7,11 @@ mod vulkan;
|
||||
|
||||
use self::handler::Handler;
|
||||
use crate::{config::Config, socket};
|
||||
use anyhow::Context;
|
||||
use anyhow::{anyhow, Context};
|
||||
use futures::future::join_all;
|
||||
use lact_schema::{Pong, Request, Response};
|
||||
use serde::Serialize;
|
||||
use std::fmt::Debug;
|
||||
use std::{fmt::Debug, net::SocketAddr};
|
||||
use tokio::{
|
||||
io::{AsyncBufReadExt, AsyncRead, AsyncWrite, AsyncWriteExt, BufReader},
|
||||
net::{TcpListener, UnixListener},
|
||||
@ -38,8 +39,29 @@ impl Server {
|
||||
None
|
||||
};
|
||||
|
||||
let exporter_server = if let Some(exporter_address) = &config.daemon.exporter_listen_address
|
||||
{
|
||||
let addr: SocketAddr = exporter_address
|
||||
.parse()
|
||||
.context("Invalid exporter address")?;
|
||||
|
||||
let server = tiny_http::Server::http(addr)
|
||||
.map_err(|err| anyhow!("Could not start metrics exporter: {err}"))?;
|
||||
info!("Prometheus metrics exporter listening on {exporter_address}");
|
||||
|
||||
Some(server)
|
||||
} else {
|
||||
info!("Prometheus metrics exporter disabled");
|
||||
None
|
||||
};
|
||||
|
||||
let handler = Handler::new(config).await?;
|
||||
|
||||
if let Some(server) = exporter_server {
|
||||
let handler = handler.clone();
|
||||
tokio::task::spawn_local(async move { exporter::run(server, &handler).await });
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
handler,
|
||||
unix_listener,
|
||||
|
188
lact-daemon/src/server/exporter.rs
Normal file
188
lact-daemon/src/server/exporter.rs
Normal file
@ -0,0 +1,188 @@
|
||||
use prometheus::{
|
||||
register_gauge_with_registry, register_int_gauge_with_registry, Registry, TextEncoder,
|
||||
};
|
||||
use std::collections::HashMap;
|
||||
use tiny_http::Response;
|
||||
use tokio::sync::{mpsc, oneshot};
|
||||
use tracing::{error, warn};
|
||||
|
||||
use super::handler::Handler;
|
||||
|
||||
pub async fn run(server: tiny_http::Server, handler: &Handler) {
|
||||
let (tx, mut rx) = mpsc::channel(1);
|
||||
|
||||
// We listen to http requests in the background on a blocking task, and collect the metrics as needed from the main one
|
||||
tokio::task::spawn_blocking(move || loop {
|
||||
match server.recv() {
|
||||
Ok(req) => {
|
||||
let (response_tx, response_rx) = oneshot::channel();
|
||||
tx.blocking_send(response_tx).unwrap();
|
||||
|
||||
let response = response_rx.blocking_recv().unwrap();
|
||||
if let Err(err) = req.respond(response) {
|
||||
warn!("could not write metrics response: {err}");
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
error!("metrics exporter request error: {err}");
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
while let Some(response_tx) = rx.recv().await {
|
||||
let registry = Registry::new();
|
||||
collect_metrics(handler, ®istry).await;
|
||||
let metric_families = registry.gather();
|
||||
|
||||
let encoder = TextEncoder::new();
|
||||
let output = encoder
|
||||
.encode_to_string(&metric_families)
|
||||
.expect("Failed to encode metrics");
|
||||
|
||||
let response = Response::from_string(output);
|
||||
let _ = response_tx.send(response);
|
||||
}
|
||||
}
|
||||
|
||||
async fn collect_metrics(handler: &Handler, registry: &Registry) {
|
||||
let gpu_controllers = handler.gpu_controllers.read().await;
|
||||
let config = handler.config.read().await;
|
||||
|
||||
for (id, controller) in gpu_controllers.iter() {
|
||||
let gpu_config = config.gpus().ok().and_then(|gpus| gpus.get(id));
|
||||
|
||||
let info = controller.get_info();
|
||||
let stats = controller.get_stats(gpu_config);
|
||||
|
||||
let mut device_name = String::new();
|
||||
if let Some(pci_info) = &info.pci_info {
|
||||
if let Some(vendor) = &pci_info.device_pci_info.vendor {
|
||||
device_name.push_str(vendor);
|
||||
}
|
||||
if let Some(model) = &pci_info.device_pci_info.model {
|
||||
if !device_name.is_empty() {
|
||||
device_name.push(' ');
|
||||
}
|
||||
device_name.push_str(model);
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! gpu_opts {
|
||||
(
|
||||
$name: expr,
|
||||
$help: expr,
|
||||
$($key:expr => $value:expr),* $(,)*
|
||||
) => {{
|
||||
let opts = prometheus::Opts::new($name, $help);
|
||||
|
||||
let labels = HashMap::from_iter([
|
||||
("gpu_id".to_owned(), id.clone()),
|
||||
("gpu_name".to_owned(), device_name.clone()),
|
||||
$(
|
||||
($key.to_string(), $value.to_string()),
|
||||
)*
|
||||
]);
|
||||
opts.const_labels(labels)
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! gpu_gauge {
|
||||
(
|
||||
$name: expr,
|
||||
$help: expr,
|
||||
$value: expr,
|
||||
$($key:expr => $label:expr),* $(,)*
|
||||
) => {{
|
||||
#[allow(clippy::cast_precision_loss)]
|
||||
register_gauge_with_registry!(
|
||||
gpu_opts! {
|
||||
$name,
|
||||
$help,
|
||||
$($key => $label)*
|
||||
},
|
||||
registry
|
||||
)
|
||||
.unwrap()
|
||||
.set($value.into());
|
||||
}}
|
||||
|
||||
}
|
||||
|
||||
register_int_gauge_with_registry!(
|
||||
gpu_opts! {
|
||||
"lact_gpu_info",
|
||||
"A static gauge containing basic GPU info in the labels",
|
||||
"driver" => info.driver,
|
||||
"family" => info.drm_info.and_then(|drm| drm.family_name).unwrap_or_default(),
|
||||
},
|
||||
registry
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
if let Some(usage) = stats.busy_percent {
|
||||
gpu_gauge! {
|
||||
"lact_gpu_usage",
|
||||
"GPU usage percentage",
|
||||
usage,
|
||||
};
|
||||
}
|
||||
|
||||
if let Some(power_current) = stats
|
||||
.power
|
||||
.average
|
||||
.filter(|value| *value != 0.0)
|
||||
.or(stats.power.current.filter(|value| *value != 0.0))
|
||||
{
|
||||
gpu_gauge! {
|
||||
"lact_gpu_power_current",
|
||||
"Current power consumption",
|
||||
power_current,
|
||||
};
|
||||
}
|
||||
|
||||
if let Some(power_cap) = stats.power.cap_current {
|
||||
gpu_gauge! {
|
||||
"lact_gpu_power_cap",
|
||||
"Power consumption cap",
|
||||
power_cap,
|
||||
};
|
||||
}
|
||||
|
||||
for (temp_name, temp) in stats.temps {
|
||||
if let Some(value) = temp.current {
|
||||
gpu_gauge!(
|
||||
"lact_gpu_temperature",
|
||||
"Current temperature",
|
||||
value,
|
||||
"sensor" => temp_name,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(gpu_clock) = stats.clockspeed.gpu_clockspeed {
|
||||
gpu_gauge!(
|
||||
"lact_gpu_frequency",
|
||||
"Current frequency",
|
||||
gpu_clock as f64,
|
||||
"type" => "GPU",
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(vram_clock) = stats.clockspeed.vram_clockspeed {
|
||||
gpu_gauge!(
|
||||
"lact_gpu_frequency",
|
||||
"Current frequency",
|
||||
vram_clock as f64,
|
||||
"type" => "VRAM",
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(fan_pwm) = stats.fan.pwm_current {
|
||||
gpu_gauge!("lact_gpu_fan_pwm", "Fan speed (in PWM)", fan_pwm,);
|
||||
}
|
||||
|
||||
if let Some(fan_rpm) = stats.fan.speed_current {
|
||||
gpu_gauge!("lact_gpu_fan_rpm", "Fan speed (in RPM)", fan_rpm,);
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user