mirror of
https://github.com/ilya-zlobintsev/LACT.git
synced 2025-02-25 18:55:26 -06:00
feat: add prometheus exporter
This commit is contained in:
parent
d26e8bf869
commit
bef89e2923
53
Cargo.lock
generated
53
Cargo.lock
generated
@ -132,6 +132,12 @@ version = "1.0.95"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
|
checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ascii"
|
||||||
|
version = "1.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ash"
|
name = "ash"
|
||||||
version = "0.37.3+1.3.251"
|
version = "0.37.3+1.3.251"
|
||||||
@ -340,6 +346,12 @@ dependencies = [
|
|||||||
"windows-targets 0.52.6",
|
"windows-targets 0.52.6",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chunked_transfer"
|
||||||
|
version = "1.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6e4de3bc4ea267985becf712dc6d9eed8b04c953b3fcfb339ebc87acd9804901"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "clang-sys"
|
name = "clang-sys"
|
||||||
version = "1.8.1"
|
version = "1.8.1"
|
||||||
@ -1153,6 +1165,12 @@ dependencies = [
|
|||||||
"windows-sys 0.59.0",
|
"windows-sys 0.59.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "httpdate"
|
||||||
|
version = "1.0.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "iana-time-zone"
|
name = "iana-time-zone"
|
||||||
version = "0.1.61"
|
version = "0.1.61"
|
||||||
@ -1338,11 +1356,13 @@ dependencies = [
|
|||||||
"os-release",
|
"os-release",
|
||||||
"pciid-parser",
|
"pciid-parser",
|
||||||
"pretty_assertions",
|
"pretty_assertions",
|
||||||
|
"prometheus",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"serde_with",
|
"serde_with",
|
||||||
"serde_yaml",
|
"serde_yaml",
|
||||||
"tar",
|
"tar",
|
||||||
|
"tiny_http",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
@ -1910,6 +1930,27 @@ dependencies = [
|
|||||||
"unicode-ident",
|
"unicode-ident",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "prometheus"
|
||||||
|
version = "0.13.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"fnv",
|
||||||
|
"lazy_static",
|
||||||
|
"memchr",
|
||||||
|
"parking_lot",
|
||||||
|
"protobuf",
|
||||||
|
"thiserror",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "protobuf"
|
||||||
|
version = "2.28.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "quote"
|
name = "quote"
|
||||||
version = "1.0.38"
|
version = "1.0.38"
|
||||||
@ -2437,6 +2478,18 @@ dependencies = [
|
|||||||
"time-core",
|
"time-core",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tiny_http"
|
||||||
|
version = "0.12.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "389915df6413a2e74fb181895f933386023c71110878cd0825588928e64cdc82"
|
||||||
|
dependencies = [
|
||||||
|
"ascii",
|
||||||
|
"chunked_transfer",
|
||||||
|
"httpdate",
|
||||||
|
"log",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio"
|
name = "tokio"
|
||||||
version = "1.43.0"
|
version = "1.43.0"
|
||||||
|
@ -46,6 +46,8 @@ os-release = "0.1.0"
|
|||||||
notify = { version = "8.0.0", default-features = false }
|
notify = { version = "8.0.0", default-features = false }
|
||||||
copes = { git = "https://gitlab.com/corectrl/copes" }
|
copes = { git = "https://gitlab.com/corectrl/copes" }
|
||||||
libloading = "0.8.6"
|
libloading = "0.8.6"
|
||||||
|
tiny_http = "0.12.0"
|
||||||
|
prometheus = "0.13.4"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
pretty_assertions = { workspace = true }
|
pretty_assertions = { workspace = true }
|
||||||
|
@ -67,6 +67,7 @@ pub struct Daemon {
|
|||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub disable_clocks_cleanup: bool,
|
pub disable_clocks_cleanup: bool,
|
||||||
pub tcp_listen_address: Option<String>,
|
pub tcp_listen_address: Option<String>,
|
||||||
|
pub exporter_listen_address: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for Daemon {
|
impl Default for Daemon {
|
||||||
@ -76,6 +77,7 @@ impl Default for Daemon {
|
|||||||
admin_groups: DEFAULT_ADMIN_GROUPS.map(str::to_owned).to_vec(),
|
admin_groups: DEFAULT_ADMIN_GROUPS.map(str::to_owned).to_vec(),
|
||||||
disable_clocks_cleanup: false,
|
disable_clocks_cleanup: false,
|
||||||
tcp_listen_address: None,
|
tcp_listen_address: None,
|
||||||
|
exporter_listen_address: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
mod exporter;
|
||||||
pub mod gpu_controller;
|
pub mod gpu_controller;
|
||||||
pub mod handler;
|
pub mod handler;
|
||||||
mod profiles;
|
mod profiles;
|
||||||
@ -6,11 +7,11 @@ mod vulkan;
|
|||||||
|
|
||||||
use self::handler::Handler;
|
use self::handler::Handler;
|
||||||
use crate::{config::Config, socket};
|
use crate::{config::Config, socket};
|
||||||
use anyhow::Context;
|
use anyhow::{anyhow, Context};
|
||||||
use futures::future::join_all;
|
use futures::future::join_all;
|
||||||
use lact_schema::{Pong, Request, Response};
|
use lact_schema::{Pong, Request, Response};
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use std::fmt::Debug;
|
use std::{fmt::Debug, net::SocketAddr};
|
||||||
use tokio::{
|
use tokio::{
|
||||||
io::{AsyncBufReadExt, AsyncRead, AsyncWrite, AsyncWriteExt, BufReader},
|
io::{AsyncBufReadExt, AsyncRead, AsyncWrite, AsyncWriteExt, BufReader},
|
||||||
net::{TcpListener, UnixListener},
|
net::{TcpListener, UnixListener},
|
||||||
@ -38,8 +39,29 @@ impl Server {
|
|||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let exporter_server = if let Some(exporter_address) = &config.daemon.exporter_listen_address
|
||||||
|
{
|
||||||
|
let addr: SocketAddr = exporter_address
|
||||||
|
.parse()
|
||||||
|
.context("Invalid exporter address")?;
|
||||||
|
|
||||||
|
let server = tiny_http::Server::http(addr)
|
||||||
|
.map_err(|err| anyhow!("Could not start metrics exporter: {err}"))?;
|
||||||
|
info!("Prometheus metrics exporter listening on {exporter_address}");
|
||||||
|
|
||||||
|
Some(server)
|
||||||
|
} else {
|
||||||
|
info!("Prometheus metrics exporter disabled");
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
let handler = Handler::new(config).await?;
|
let handler = Handler::new(config).await?;
|
||||||
|
|
||||||
|
if let Some(server) = exporter_server {
|
||||||
|
let handler = handler.clone();
|
||||||
|
tokio::task::spawn_local(async move { exporter::run(server, &handler).await });
|
||||||
|
}
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
handler,
|
handler,
|
||||||
unix_listener,
|
unix_listener,
|
||||||
|
188
lact-daemon/src/server/exporter.rs
Normal file
188
lact-daemon/src/server/exporter.rs
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
use prometheus::{
|
||||||
|
register_gauge_with_registry, register_int_gauge_with_registry, Registry, TextEncoder,
|
||||||
|
};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use tiny_http::Response;
|
||||||
|
use tokio::sync::{mpsc, oneshot};
|
||||||
|
use tracing::{error, warn};
|
||||||
|
|
||||||
|
use super::handler::Handler;
|
||||||
|
|
||||||
|
pub async fn run(server: tiny_http::Server, handler: &Handler) {
|
||||||
|
let (tx, mut rx) = mpsc::channel(1);
|
||||||
|
|
||||||
|
// We listen to http requests in the background on a blocking task, and collect the metrics as needed from the main one
|
||||||
|
tokio::task::spawn_blocking(move || loop {
|
||||||
|
match server.recv() {
|
||||||
|
Ok(req) => {
|
||||||
|
let (response_tx, response_rx) = oneshot::channel();
|
||||||
|
tx.blocking_send(response_tx).unwrap();
|
||||||
|
|
||||||
|
let response = response_rx.blocking_recv().unwrap();
|
||||||
|
if let Err(err) = req.respond(response) {
|
||||||
|
warn!("could not write metrics response: {err}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
error!("metrics exporter request error: {err}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
while let Some(response_tx) = rx.recv().await {
|
||||||
|
let registry = Registry::new();
|
||||||
|
collect_metrics(handler, ®istry).await;
|
||||||
|
let metric_families = registry.gather();
|
||||||
|
|
||||||
|
let encoder = TextEncoder::new();
|
||||||
|
let output = encoder
|
||||||
|
.encode_to_string(&metric_families)
|
||||||
|
.expect("Failed to encode metrics");
|
||||||
|
|
||||||
|
let response = Response::from_string(output);
|
||||||
|
let _ = response_tx.send(response);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn collect_metrics(handler: &Handler, registry: &Registry) {
|
||||||
|
let gpu_controllers = handler.gpu_controllers.read().await;
|
||||||
|
let config = handler.config.read().await;
|
||||||
|
|
||||||
|
for (id, controller) in gpu_controllers.iter() {
|
||||||
|
let gpu_config = config.gpus().ok().and_then(|gpus| gpus.get(id));
|
||||||
|
|
||||||
|
let info = controller.get_info();
|
||||||
|
let stats = controller.get_stats(gpu_config);
|
||||||
|
|
||||||
|
let mut device_name = String::new();
|
||||||
|
if let Some(pci_info) = &info.pci_info {
|
||||||
|
if let Some(vendor) = &pci_info.device_pci_info.vendor {
|
||||||
|
device_name.push_str(vendor);
|
||||||
|
}
|
||||||
|
if let Some(model) = &pci_info.device_pci_info.model {
|
||||||
|
if !device_name.is_empty() {
|
||||||
|
device_name.push(' ');
|
||||||
|
}
|
||||||
|
device_name.push_str(model);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
macro_rules! gpu_opts {
|
||||||
|
(
|
||||||
|
$name: expr,
|
||||||
|
$help: expr,
|
||||||
|
$($key:expr => $value:expr),* $(,)*
|
||||||
|
) => {{
|
||||||
|
let opts = prometheus::Opts::new($name, $help);
|
||||||
|
|
||||||
|
let labels = HashMap::from_iter([
|
||||||
|
("gpu_id".to_owned(), id.clone()),
|
||||||
|
("gpu_name".to_owned(), device_name.clone()),
|
||||||
|
$(
|
||||||
|
($key.to_string(), $value.to_string()),
|
||||||
|
)*
|
||||||
|
]);
|
||||||
|
opts.const_labels(labels)
|
||||||
|
}};
|
||||||
|
}
|
||||||
|
|
||||||
|
macro_rules! gpu_gauge {
|
||||||
|
(
|
||||||
|
$name: expr,
|
||||||
|
$help: expr,
|
||||||
|
$value: expr,
|
||||||
|
$($key:expr => $label:expr),* $(,)*
|
||||||
|
) => {{
|
||||||
|
#[allow(clippy::cast_precision_loss)]
|
||||||
|
register_gauge_with_registry!(
|
||||||
|
gpu_opts! {
|
||||||
|
$name,
|
||||||
|
$help,
|
||||||
|
$($key => $label)*
|
||||||
|
},
|
||||||
|
registry
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
.set($value.into());
|
||||||
|
}}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
register_int_gauge_with_registry!(
|
||||||
|
gpu_opts! {
|
||||||
|
"lact_gpu_info",
|
||||||
|
"A static gauge containing basic GPU info in the labels",
|
||||||
|
"driver" => info.driver,
|
||||||
|
"family" => info.drm_info.and_then(|drm| drm.family_name).unwrap_or_default(),
|
||||||
|
},
|
||||||
|
registry
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
if let Some(usage) = stats.busy_percent {
|
||||||
|
gpu_gauge! {
|
||||||
|
"lact_gpu_usage",
|
||||||
|
"GPU usage percentage",
|
||||||
|
usage,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(power_current) = stats
|
||||||
|
.power
|
||||||
|
.average
|
||||||
|
.filter(|value| *value != 0.0)
|
||||||
|
.or(stats.power.current.filter(|value| *value != 0.0))
|
||||||
|
{
|
||||||
|
gpu_gauge! {
|
||||||
|
"lact_gpu_power_current",
|
||||||
|
"Current power consumption",
|
||||||
|
power_current,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(power_cap) = stats.power.cap_current {
|
||||||
|
gpu_gauge! {
|
||||||
|
"lact_gpu_power_cap",
|
||||||
|
"Power consumption cap",
|
||||||
|
power_cap,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
for (temp_name, temp) in stats.temps {
|
||||||
|
if let Some(value) = temp.current {
|
||||||
|
gpu_gauge!(
|
||||||
|
"lact_gpu_temperature",
|
||||||
|
"Current temperature",
|
||||||
|
value,
|
||||||
|
"sensor" => temp_name,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(gpu_clock) = stats.clockspeed.gpu_clockspeed {
|
||||||
|
gpu_gauge!(
|
||||||
|
"lact_gpu_frequency",
|
||||||
|
"Current frequency",
|
||||||
|
gpu_clock as f64,
|
||||||
|
"type" => "GPU",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(vram_clock) = stats.clockspeed.vram_clockspeed {
|
||||||
|
gpu_gauge!(
|
||||||
|
"lact_gpu_frequency",
|
||||||
|
"Current frequency",
|
||||||
|
vram_clock as f64,
|
||||||
|
"type" => "VRAM",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(fan_pwm) = stats.fan.pwm_current {
|
||||||
|
gpu_gauge!("lact_gpu_fan_pwm", "Fan speed (in PWM)", fan_pwm,);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(fan_rpm) = stats.fan.speed_current {
|
||||||
|
gpu_gauge!("lact_gpu_fan_rpm", "Fan speed (in RPM)", fan_rpm,);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user