mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Introduce Metric Aggregation starting with Silences (#62512)
* Alerting: Introduce Metric Aggregation starting with Silences --------- Co-authored-by: Alexander Weaver <weaver.alex.d@gmail.com>
This commit is contained in:
parent
138575cbe9
commit
55e7cf1aed
3
go.mod
3
go.mod
@ -420,3 +420,6 @@ replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-aler
|
|||||||
replace google.golang.org/grpc => google.golang.org/grpc v1.45.0
|
replace google.golang.org/grpc => google.golang.org/grpc v1.45.0
|
||||||
|
|
||||||
replace google.golang.org/genproto => google.golang.org/genproto v0.0.0-20220421151946-72621c1f0bd3
|
replace google.golang.org/genproto => google.golang.org/genproto v0.0.0-20220421151946-72621c1f0bd3
|
||||||
|
|
||||||
|
// Remove this once https://github.com/grafana/dskit/pull/258 is merged.
|
||||||
|
replace github.com/grafana/dskit => github.com/gotjosh/dskit v0.0.0-20230131123646-8dda768daa27
|
||||||
|
4
go.sum
4
go.sum
@ -1247,14 +1247,14 @@ github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/ad
|
|||||||
github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||||
github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc=
|
github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc=
|
||||||
github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||||
|
github.com/gotjosh/dskit v0.0.0-20230131123646-8dda768daa27 h1:rWMt8wsjGjzT/6AX6/Ie0JTA0CNZzzbDfup34lSJnTw=
|
||||||
|
github.com/gotjosh/dskit v0.0.0-20230131123646-8dda768daa27/go.mod h1:ulYLLoSd71AWIjxgifLO86Lndx82Yj+IcV+fFnh8tkI=
|
||||||
github.com/grafana/alerting v0.0.0-20230125210216-facc6b27b9e0 h1:BzkQNnj+eevX30EMqJiUS1w3CPoGc8kp7pDf/ari/4Y=
|
github.com/grafana/alerting v0.0.0-20230125210216-facc6b27b9e0 h1:BzkQNnj+eevX30EMqJiUS1w3CPoGc8kp7pDf/ari/4Y=
|
||||||
github.com/grafana/alerting v0.0.0-20230125210216-facc6b27b9e0/go.mod h1:NoSLbfmUwE+omWFReFrLtbtOItmvTbuQERJ6XFYp9ME=
|
github.com/grafana/alerting v0.0.0-20230125210216-facc6b27b9e0/go.mod h1:NoSLbfmUwE+omWFReFrLtbtOItmvTbuQERJ6XFYp9ME=
|
||||||
github.com/grafana/codejen v0.0.3 h1:tAWxoTUuhgmEqxJPOLtJoxlPBbMULFwKFOcRsPRPXDw=
|
github.com/grafana/codejen v0.0.3 h1:tAWxoTUuhgmEqxJPOLtJoxlPBbMULFwKFOcRsPRPXDw=
|
||||||
github.com/grafana/codejen v0.0.3/go.mod h1:zmwwM/DRyQB7pfuBjTWII3CWtxcXh8LTwAYGfDfpR6s=
|
github.com/grafana/codejen v0.0.3/go.mod h1:zmwwM/DRyQB7pfuBjTWII3CWtxcXh8LTwAYGfDfpR6s=
|
||||||
github.com/grafana/cuetsy v0.1.5 h1:mnFwAXdbqCsyL8r7kkdUMJ4kOAR26cxIPmrZj7JzTeY=
|
github.com/grafana/cuetsy v0.1.5 h1:mnFwAXdbqCsyL8r7kkdUMJ4kOAR26cxIPmrZj7JzTeY=
|
||||||
github.com/grafana/cuetsy v0.1.5/go.mod h1:4KWkUOslwvRTpEv7wdQG0jDFTuJmU+0L9x0h4kWxa2A=
|
github.com/grafana/cuetsy v0.1.5/go.mod h1:4KWkUOslwvRTpEv7wdQG0jDFTuJmU+0L9x0h4kWxa2A=
|
||||||
github.com/grafana/dskit v0.0.0-20230126115530-71478074eab8 h1:5nqLvzKugVUb9sCQkKuOPecRshawSrbHsXyGxBkTBus=
|
|
||||||
github.com/grafana/dskit v0.0.0-20230126115530-71478074eab8/go.mod h1:zj+5BNZAVmQafV583uLTAOzRr963KPdEm4d6NPmtbwg=
|
|
||||||
github.com/grafana/go-mssqldb v0.0.0-20210326084033-d0ce3c521036 h1:GplhUk6Xes5JIhUUrggPcPBhOn+eT8+WsHiebvq7GgA=
|
github.com/grafana/go-mssqldb v0.0.0-20210326084033-d0ce3c521036 h1:GplhUk6Xes5JIhUUrggPcPBhOn+eT8+WsHiebvq7GgA=
|
||||||
github.com/grafana/go-mssqldb v0.0.0-20210326084033-d0ce3c521036/go.mod h1:xbL0rPBG9cCiLr28tMa8zpbdarY27NDyej4t/EjAShU=
|
github.com/grafana/go-mssqldb v0.0.0-20210326084033-d0ce3c521036/go.mod h1:xbL0rPBG9cCiLr28tMa8zpbdarY27NDyej4t/EjAShU=
|
||||||
github.com/grafana/grafana-aws-sdk v0.12.0 h1:eUjFdFZeZE+nyu/RMRz+qFxTBew69ToLBrbRhTbjkfM=
|
github.com/grafana/grafana-aws-sdk v0.12.0 h1:eUjFdFZeZE+nyu/RMRz+qFxTBew69ToLBrbRhTbjkfM=
|
||||||
|
@ -1,21 +1,31 @@
|
|||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"github.com/grafana/grafana/pkg/infra/log"
|
||||||
|
|
||||||
|
"github.com/grafana/dskit/metrics"
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||||
)
|
)
|
||||||
|
|
||||||
type MultiOrgAlertmanager struct {
|
type MultiOrgAlertmanager struct {
|
||||||
Registerer prometheus.Registerer
|
Registerer prometheus.Registerer
|
||||||
|
registries *metrics.TenantRegistries
|
||||||
|
|
||||||
ActiveConfigurations prometheus.Gauge
|
ActiveConfigurations prometheus.Gauge
|
||||||
DiscoveredConfigurations prometheus.Gauge
|
DiscoveredConfigurations prometheus.Gauge
|
||||||
registries *OrgRegistries
|
|
||||||
|
aggregatedMetrics *AlertmanagerAggregatedMetrics
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager {
|
func NewMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager {
|
||||||
return &MultiOrgAlertmanager{
|
registries := metrics.NewTenantRegistries(log.New("ngalert.multiorg.alertmanager.metrics")) //TODO: Should this be here? Probably not.
|
||||||
|
moa := &MultiOrgAlertmanager{
|
||||||
Registerer: r,
|
Registerer: r,
|
||||||
registries: NewOrgRegistries(),
|
registries: registries,
|
||||||
DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
||||||
Namespace: Namespace,
|
Namespace: Namespace,
|
||||||
Subsystem: Subsystem,
|
Subsystem: Subsystem,
|
||||||
@ -28,5 +38,112 @@ func NewMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanag
|
|||||||
Name: "active_configurations",
|
Name: "active_configurations",
|
||||||
Help: "The number of active Alertmanager configurations.",
|
Help: "The number of active Alertmanager configurations.",
|
||||||
}),
|
}),
|
||||||
|
aggregatedMetrics: NewAlertmanagerAggregatedMetrics(registries),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// These metrics use a different registration method as the struct itself represents a custom collector.
|
||||||
|
// There's no way to "auto-register" a collector.
|
||||||
|
r.MustRegister(moa.aggregatedMetrics)
|
||||||
|
|
||||||
|
return moa
|
||||||
|
}
|
||||||
|
|
||||||
|
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
|
||||||
|
func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) {
|
||||||
|
moa.registries.RemoveTenantRegistry(strconv.FormatInt(id, 10), false)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
|
||||||
|
func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer {
|
||||||
|
sid := strconv.FormatInt(id, 10)
|
||||||
|
reg := moa.registries.GetRegistryForTenant(sid)
|
||||||
|
if reg != nil {
|
||||||
|
return reg
|
||||||
|
}
|
||||||
|
|
||||||
|
result := prometheus.NewRegistry()
|
||||||
|
moa.registries.AddTenantRegistry(sid, result)
|
||||||
|
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
// AlertmanagerAggregatedMetrics are metrics collected directly from the registry.
|
||||||
|
// Unlike metrics.Alertmanager they are not called within this codebase hence the need for direct collection.
|
||||||
|
type AlertmanagerAggregatedMetrics struct {
|
||||||
|
registries *metrics.TenantRegistries
|
||||||
|
|
||||||
|
// exported metrics, gathered from Alertmanager Silences
|
||||||
|
silencesGCDuration *prometheus.Desc
|
||||||
|
silencesSnapshotDuration *prometheus.Desc
|
||||||
|
silencesSnapshotSize *prometheus.Desc
|
||||||
|
silencesQueriesTotal *prometheus.Desc
|
||||||
|
silencesQueryErrorsTotal *prometheus.Desc
|
||||||
|
silencesQueryDuration *prometheus.Desc
|
||||||
|
silences *prometheus.Desc
|
||||||
|
silencesPropagatedMessagesTotal *prometheus.Desc
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics {
|
||||||
|
aggregatedMetrics := &AlertmanagerAggregatedMetrics{
|
||||||
|
registries: registries,
|
||||||
|
|
||||||
|
silencesGCDuration: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem),
|
||||||
|
"Duration of the last silence garbage collection cycle.",
|
||||||
|
nil, nil),
|
||||||
|
silencesSnapshotDuration: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_silences_snapshot_duration_seconds", Namespace, Subsystem),
|
||||||
|
"Duration of the last silence snapshot.",
|
||||||
|
nil, nil),
|
||||||
|
silencesSnapshotSize: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_silences_snapshot_size_bytes", Namespace, Subsystem),
|
||||||
|
"Size of the last silence snapshot in bytes.",
|
||||||
|
nil, nil),
|
||||||
|
silencesQueriesTotal: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_silences_queries_total", Namespace, Subsystem),
|
||||||
|
"How many silence queries were received.",
|
||||||
|
nil, nil),
|
||||||
|
silencesQueryErrorsTotal: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_silences_query_errors_total", Namespace, Subsystem),
|
||||||
|
"How many silence received queries did not succeed.",
|
||||||
|
nil, nil),
|
||||||
|
silencesQueryDuration: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_silences_query_duration_seconds", Namespace, Subsystem),
|
||||||
|
"Duration of silence query evaluation.",
|
||||||
|
nil, nil),
|
||||||
|
silencesPropagatedMessagesTotal: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_silences_gossip_messages_propagated_total", Namespace, Subsystem),
|
||||||
|
"Number of received gossip messages that have been further gossiped.",
|
||||||
|
nil, nil),
|
||||||
|
silences: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_silences", Namespace, Subsystem),
|
||||||
|
"How many silences by state.",
|
||||||
|
[]string{"org", "state"}, nil),
|
||||||
|
}
|
||||||
|
|
||||||
|
return aggregatedMetrics
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
|
||||||
|
out <- a.silencesGCDuration
|
||||||
|
out <- a.silencesSnapshotDuration
|
||||||
|
out <- a.silencesSnapshotSize
|
||||||
|
out <- a.silencesQueriesTotal
|
||||||
|
out <- a.silencesQueryErrorsTotal
|
||||||
|
out <- a.silencesQueryDuration
|
||||||
|
out <- a.silencesPropagatedMessagesTotal
|
||||||
|
out <- a.silences
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
|
||||||
|
data := a.registries.BuildMetricFamiliesPerTenant()
|
||||||
|
|
||||||
|
data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
|
||||||
|
data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
|
||||||
|
data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
|
||||||
|
data.SendSumOfCounters(out, a.silencesQueriesTotal, "alertmanager_silences_queries_total")
|
||||||
|
data.SendSumOfCounters(out, a.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total")
|
||||||
|
data.SendSumOfHistograms(out, a.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")
|
||||||
|
data.SendSumOfCounters(out, a.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")
|
||||||
|
data.SendSumOfGaugesPerTenantWithLabels(out, a.silences, "alertmanager_silences", "state")
|
||||||
}
|
}
|
||||||
|
@ -57,13 +57,3 @@ func (ng *NGAlert) GetAPIMetrics() *API {
|
|||||||
func (ng *NGAlert) GetMultiOrgAlertmanagerMetrics() *MultiOrgAlertmanager {
|
func (ng *NGAlert) GetMultiOrgAlertmanagerMetrics() *MultiOrgAlertmanager {
|
||||||
return ng.multiOrgAlertmanagerMetrics
|
return ng.multiOrgAlertmanagerMetrics
|
||||||
}
|
}
|
||||||
|
|
||||||
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
|
|
||||||
func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) {
|
|
||||||
moa.registries.RemoveOrgRegistry(id)
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
|
|
||||||
func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer {
|
|
||||||
return moa.registries.GetOrCreateOrgRegistry(id)
|
|
||||||
}
|
|
||||||
|
Loading…
Reference in New Issue
Block a user