Alerting/ruler metrics (#34144)
* adds active configurations metric
* rule evaluation metrics
* ruler metrics
* pr feedback
commit 1367f7171e
parent eb74994b8b
@@ -29,8 +29,13 @@ type Metrics struct {
 	*metrics.Alerts
 	AlertState *prometheus.GaugeVec
 	// Registerer is for use by subcomponents which register their own metrics.
-	Registerer      prometheus.Registerer
-	RequestDuration *prometheus.HistogramVec
+	Registerer           prometheus.Registerer
+	RequestDuration      *prometheus.HistogramVec
+	ActiveConfigurations prometheus.Gauge
+	EvalTotal            *prometheus.CounterVec
+	EvalFailures         *prometheus.CounterVec
+	EvalDuration         *prometheus.SummaryVec
+	GroupRules           *prometheus.GaugeVec
 }

 func init() {
@@ -68,6 +73,54 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
 			},
 			[]string{"method", "route", "status_code", "backend"},
 		),
+		ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
+			Namespace: "grafana",
+			Subsystem: "alerting",
+			Name:      "active_configurations",
+			Help:      "The number of active, non default alertmanager configurations for grafana managed alerts",
+		}),
+		// TODO: once rule groups support multiple rules, consider partitioning
+		// on rule group as well as tenant, similar to loki|cortex.
+		EvalTotal: prometheus.NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: "grafana",
+				Subsystem: "alerting",
+				Name:      "rule_evaluations_total",
+				Help:      "The total number of rule evaluations.",
+			},
+			[]string{"user"},
+		),
+		// TODO: once rule groups support multiple rules, consider partitioning
+		// on rule group as well as tenant, similar to loki|cortex.
+		EvalFailures: prometheus.NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: "grafana",
+				Subsystem: "alerting",
+				Name:      "rule_evaluation_failures_total",
+				Help:      "The total number of rule evaluation failures.",
+			},
+			[]string{"user"},
+		),
+		EvalDuration: prometheus.NewSummaryVec(
+			prometheus.SummaryOpts{
+				Namespace:  "grafana",
+				Subsystem:  "alerting",
+				Help:       "The duration for a rule to execute.",
+				Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
+			},
+			[]string{"user"},
+		),
+		// TODO: once rule groups support multiple rules, consider partitioning
+		// on rule group as well as tenant, similar to loki|cortex.
+		GroupRules: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: "grafana",
+				Subsystem: "alerting",
+				Name:      "rule_group_rules",
+				Help:      "The number of rules.",
+			},
+			[]string{"user"},
+		),
 	}
 }
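For context, the sketch below is a minimal standalone program (not Grafana code) that registers a comparable gauge and counter vector on a fresh registry and serves them; the promhttp wiring is just one common way to expose a custom registry. Note that, as rendered in this diff, the EvalDuration SummaryOpts carries no Name field.

package main

import (
	"fmt"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	reg := prometheus.NewRegistry()

	// Counterpart of ActiveConfigurations above: a plain 0/1 gauge.
	active := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "active_configurations",
		Help:      "The number of active, non default alertmanager configurations for grafana managed alerts",
	})

	// Counterpart of EvalTotal: partitioned by tenant via the "user" label.
	evalTotal := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_evaluations_total",
		Help:      "The total number of rule evaluations.",
	}, []string{"user"})

	active.Set(1)
	evalTotal.WithLabelValues("1").Inc() // the label value is the org (tenant) ID

	// Exposes grafana_alerting_active_configurations and
	// grafana_alerting_rule_evaluations_total{user="1"} on :2112/metrics.
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	fmt.Println(http.ListenAndServe(":2112", nil))
}

The exported series name is always namespace_subsystem_name, so the fields above produce the grafana_alerting_* family.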
@@ -65,7 +65,11 @@ func (ng *AlertNG) Init() error {
 	ng.stateManager = state.NewManager(ng.Log, ng.Metrics)
 	baseInterval := baseIntervalSeconds * time.Second

-	store := &store.DBstore{BaseInterval: baseInterval, DefaultIntervalSeconds: defaultIntervalSeconds, SQLStore: ng.SQLStore}
+	store := &store.DBstore{
+		BaseInterval:           baseInterval,
+		DefaultIntervalSeconds: defaultIntervalSeconds,
+		SQLStore:               ng.SQLStore,
+	}

 	var err error
 	ng.Alertmanager, err = notifier.New(ng.Cfg, store, ng.Metrics)
@@ -82,6 +86,7 @@ func (ng *AlertNG) Init() error {
 		InstanceStore: store,
 		RuleStore:     store,
 		Notifier:      ng.Alertmanager,
+		Metrics:       ng.Metrics,
 	}
 	ng.schedule = schedule.NewScheduler(schedCfg, ng.DataService)
@@ -212,6 +212,7 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er
 	if err != nil {
 		return err
 	}
+	am.Metrics.ActiveConfigurations.Set(1)

 	return nil
 }
@@ -253,6 +254,12 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error {
 		return fmt.Errorf("unable to reload configuration: %w", err)
 	}

+	if q.Result.Default {
+		am.Metrics.ActiveConfigurations.Set(0)
+	} else {
+		am.Metrics.ActiveConfigurations.Set(1)
+	}
+
 	return nil
 }
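The gauge is used as a boolean here: 1 when a custom Alertmanager configuration is saved or applied, 0 when the configuration loaded from the database is the default one. The sketch below is a hypothetical test, not part of this commit, showing how such a 0/1 gauge can be asserted with client_golang's testutil helpers:

package notifier

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestActiveConfigurationsGauge(t *testing.T) {
	reg := prometheus.NewRegistry()
	active := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "active_configurations",
	})

	// Default configuration loaded from the database: gauge reads 0.
	active.Set(0)
	if got := testutil.ToFloat64(active); got != 0 {
		t.Fatalf("expected 0, got %v", got)
	}

	// Custom configuration saved and applied: gauge flips to 1.
	active.Set(1)
	if got := testutil.ToFloat64(active); got != 1 {
		t.Fatalf("expected 1, got %v", got)
	}
}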
@@ -35,6 +35,7 @@ func setupAMTest(t *testing.T) *Alertmanager {
 		DataPath: dir,
 	}

+	m := metrics.NewMetrics(prometheus.NewRegistry())
 	sqlStore := sqlstore.InitTestDB(t)
 	store := &store.DBstore{
 		BaseInterval: 10 * time.Second,
@@ -42,7 +43,7 @@ func setupAMTest(t *testing.T) *Alertmanager {
 		SQLStore: sqlStore,
 	}

-	am, err := New(cfg, store, metrics.NewMetrics(prometheus.NewRegistry()))
+	am, err := New(cfg, store, m)
 	require.NoError(t, err)
 	return am
 }
@@ -8,6 +8,7 @@ import (
 	"github.com/benbjohnson/clock"
 	apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
+	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	"golang.org/x/sync/errgroup"

 	"github.com/grafana/grafana/pkg/infra/log"
@@ -39,7 +40,6 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
 	sch.log.Debug("alert rule routine started", "key", key)

 	evalRunning := false
-	var start, end time.Time
 	var attempt int64
 	var alertRule *models.AlertRule
 	for {
@@ -50,7 +50,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
 		}

 		evaluate := func(attempt int64) error {
-			start = timeNow()
+			start := timeNow()

 			// fetch latest alert rule version
 			if alertRule == nil || alertRule.Version < ctx.version {
@@ -70,8 +70,16 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
 				Data:      alertRule.Data,
 			}
 			results, err := sch.evaluator.ConditionEval(&condition, ctx.now, sch.dataService)
-			end = timeNow()
+			var (
+				end    = timeNow()
+				tenant = fmt.Sprint(alertRule.OrgID)
+				dur    = end.Sub(start).Seconds()
+			)
+
+			sch.metrics.EvalTotal.WithLabelValues(tenant).Inc()
+			sch.metrics.EvalDuration.WithLabelValues(tenant).Observe(dur)
 			if err != nil {
+				sch.metrics.EvalFailures.WithLabelValues(tenant).Inc()
 				// consider saving alert instance on error
 				sch.log.Error("failed to evaluate alert rule", "title", alertRule.Title,
 					"key", key, "attempt", attempt, "now", ctx.now, "duration", end.Sub(start), "error", err)
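The instrumentation pattern in this hunk increments the total counter and observes the duration on every attempt, and increments a separate failure counter only on error, which is what lets a dashboard derive a per-tenant failure ratio (failures / total). A self-contained sketch of the same pattern follows; rulerMetrics and instrumentEval are illustrative stand-ins, not Grafana API:

package main

import (
	"errors"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// rulerMetrics is a minimal stand-in for the Metrics fields the scheduler uses.
type rulerMetrics struct {
	EvalTotal    *prometheus.CounterVec
	EvalFailures *prometheus.CounterVec
	EvalDuration *prometheus.SummaryVec
}

// instrumentEval wraps one rule evaluation the way ruleRoutine does above:
// always count and time the attempt, and additionally count failures.
func instrumentEval(m *rulerMetrics, tenant string, eval func() error) error {
	start := time.Now()
	err := eval()

	m.EvalTotal.WithLabelValues(tenant).Inc()
	m.EvalDuration.WithLabelValues(tenant).Observe(time.Since(start).Seconds())
	if err != nil {
		m.EvalFailures.WithLabelValues(tenant).Inc()
	}
	return err
}

func main() {
	m := &rulerMetrics{
		EvalTotal:    prometheus.NewCounterVec(prometheus.CounterOpts{Name: "rule_evaluations_total"}, []string{"user"}),
		EvalFailures: prometheus.NewCounterVec(prometheus.CounterOpts{Name: "rule_evaluation_failures_total"}, []string{"user"}),
		EvalDuration: prometheus.NewSummaryVec(prometheus.SummaryOpts{Name: "rule_evaluation_duration_seconds"}, []string{"user"}),
	}
	_ = instrumentEval(m, "1", func() error { return errors.New("boom") })
	fmt.Println("evaluation instrumented")
}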
@@ -153,6 +161,7 @@ type schedule struct {
 	dataService *tsdb.Service

 	notifier Notifier
+	metrics  *metrics.Metrics
 }

 // SchedulerCfg is the scheduler configuration.
@@ -167,6 +176,7 @@ type SchedulerCfg struct {
 	RuleStore     store.RuleStore
 	InstanceStore store.InstanceStore
 	Notifier      Notifier
+	Metrics       *metrics.Metrics
 }

 // NewScheduler returns a new schedule.
@@ -186,6 +196,7 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service) *schedule {
 		instanceStore: cfg.InstanceStore,
 		dataService:   dataService,
 		notifier:      cfg.Notifier,
+		metrics:       cfg.Metrics,
 	}
 	return &sch
 }
@@ -14,6 +14,7 @@ import (
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	"github.com/grafana/grafana/pkg/services/ngalert/tests"
+	"github.com/prometheus/client_golang/prometheus"

 	"github.com/grafana/grafana/pkg/services/ngalert/state"

@@ -104,6 +105,7 @@ func TestWarmStateCache(t *testing.T) {

 		RuleStore:     dbstore,
 		InstanceStore: dbstore,
+		Metrics:       metrics.NewMetrics(prometheus.NewRegistry()),
 	}
 	sched := schedule.NewScheduler(schedCfg, nil)
 	st := state.NewManager(schedCfg.Logger, nilMetrics)
@@ -151,6 +153,7 @@ func TestAlertingTicker(t *testing.T) {
 		RuleStore:     dbstore,
 		InstanceStore: dbstore,
 		Logger:        log.New("ngalert schedule test"),
+		Metrics:       metrics.NewMetrics(prometheus.NewRegistry()),
 	}
 	sched := schedule.NewScheduler(schedCfg, nil)
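Both tests build their SchedulerCfg with metrics.NewMetrics(prometheus.NewRegistry()) rather than sharing one instance; a collector can be registered only once per registry, so giving each test its own registry avoids duplicate-registration failures. A minimal demonstration of that behavior:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	g := prometheus.NewGauge(prometheus.GaugeOpts{Name: "demo_gauge"})

	r1 := prometheus.NewRegistry()
	fmt.Println(r1.Register(g)) // <nil>
	fmt.Println(r1.Register(g)) // duplicate metrics collector registration attempted

	// A fresh registry accepts the same collector again.
	r2 := prometheus.NewRegistry()
	fmt.Println(r2.Register(g)) // <nil>
}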
@@ -149,8 +149,9 @@ func (c *cache) trim() {
 		eval.Error:    0,
 	}

-	for _, org := range c.states {
-		for _, rule := range org {
+	for org, orgMap := range c.states {
+		c.metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(len(orgMap)))
+		for _, rule := range orgMap {
 			for _, state := range rule {
 				if len(state.Results) > 100 {
 					newResults := make([]Evaluation, 100)
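Because trim already walks every org's rule map, it doubles as the collection point for the per-org rule-count gauge. One caveat worth noting: an org that disappears from c.states keeps its last reported series. The sketch below mirrors the loop with an illustrative map shape and adds a Reset() call, a hedged suggestion that this diff does not make, to clear stale series first:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	groupRules := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{Namespace: "grafana", Subsystem: "alerting", Name: "rule_group_rules", Help: "The number of rules."},
		[]string{"user"},
	)

	// states mirrors the cache shape used above: org ID -> rule key -> state.
	states := map[int64]map[string]struct{}{
		1: {"ruleA": {}, "ruleB": {}},
		2: {"ruleC": {}},
	}

	// Reset so orgs removed since the last pass do not keep stale series.
	groupRules.Reset()
	for org, orgMap := range states {
		groupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(len(orgMap)))
	}
}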
@@ -13,29 +13,21 @@ var (
 	ErrNoAlertmanagerConfiguration = fmt.Errorf("could not find an Alertmanager configuration")
 )

-func getLatestAlertmanagerConfiguration(sess *sqlstore.DBSession) (*models.AlertConfiguration, error) {
-	c := &models.AlertConfiguration{}
-	// The ID is already an auto incremental column, using the ID as an order should guarantee the latest.
-	ok, err := sess.Desc("id").Limit(1).Get(c)
-	if err != nil {
-		return nil, err
-	}
-
-	if !ok {
-		return nil, ErrNoAlertmanagerConfiguration
-	}
-
-	return c, nil
-}
-
 // GetLatestAlertmanagerConfiguration returns the lastest version of the alertmanager configuration.
 // It returns ErrNoAlertmanagerConfiguration if no configuration is found.
 func (st *DBstore) GetLatestAlertmanagerConfiguration(query *models.GetLatestAlertmanagerConfigurationQuery) error {
 	return st.SQLStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
-		c, err := getLatestAlertmanagerConfiguration(sess)
+		c := &models.AlertConfiguration{}
+		// The ID is already an auto incremental column, using the ID as an order should guarantee the latest.
+		ok, err := sess.Desc("id").Limit(1).Get(c)
 		if err != nil {
 			return err
 		}
+
+		if !ok {
+			return ErrNoAlertmanagerConfiguration
+		}
+
 		query.Result = c
 		return nil
 	})
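The lookup leans on the auto-increment primary key as a proxy for insertion order: ORDER BY id DESC LIMIT 1 yields the most recently saved configuration. Below is a hedged standalone sketch of the same query against a plain xorm session (grafana's DBSession wraps one); the AlertConfiguration stand-in type and its fields are illustrative, not the full model:

package store

import (
	"errors"

	"xorm.io/xorm"
)

// AlertConfiguration is an illustrative stand-in for the model in the diff.
type AlertConfiguration struct {
	ID                        int64 `xorm:"pk autoincr 'id'"`
	AlertmanagerConfiguration string
	Default                   bool
}

// latestConfiguration fetches the newest row by ordering on the
// auto-increment ID, i.e. SELECT ... ORDER BY id DESC LIMIT 1.
func latestConfiguration(sess *xorm.Session) (*AlertConfiguration, error) {
	c := &AlertConfiguration{}
	ok, err := sess.Desc("id").Limit(1).Get(c)
	if err != nil {
		return nil, err
	}
	if !ok {
		return nil, errors.New("no alertmanager configuration present")
	}
	return c, nil
}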
@@ -27,5 +27,5 @@ type DBstore struct {
 	BaseInterval time.Duration
 	// default alert definiiton interval
 	DefaultIntervalSeconds int64
-	SQLStore *sqlstore.SQLStore `inject:""`
+	SQLStore *sqlstore.SQLStore
 }