mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Refactor metrics/ngalert.go into separate files (#62362)
* Alerting: Refactor metrics/ngalert.go into separate files
This commit is contained in:
parent
9256a520a4
commit
3c616da83f
21
pkg/services/ngalert/metrics/alertmanager.go
Normal file
21
pkg/services/ngalert/metrics/alertmanager.go
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/prometheus/alertmanager/api/metrics"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Alertmanager struct {
|
||||||
|
Registerer prometheus.Registerer
|
||||||
|
*metrics.Alerts
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewAlertmanagerMetrics creates a set of metrics for the Alertmanager of each organization.
|
||||||
|
func NewAlertmanagerMetrics(r prometheus.Registerer) *Alertmanager {
|
||||||
|
return &Alertmanager{
|
||||||
|
Registerer: r,
|
||||||
|
Alerts: metrics.NewAlerts("grafana", prometheus.WrapRegistererWithPrefix(fmt.Sprintf("%s_%s_", Namespace, Subsystem), r)),
|
||||||
|
}
|
||||||
|
}
|
25
pkg/services/ngalert/metrics/api.go
Normal file
25
pkg/services/ngalert/metrics/api.go
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||||
|
)
|
||||||
|
|
||||||
|
type API struct {
|
||||||
|
RequestDuration *prometheus.HistogramVec
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewAPIMetrics(r prometheus.Registerer) *API {
|
||||||
|
return &API{
|
||||||
|
RequestDuration: promauto.With(r).NewHistogramVec(
|
||||||
|
prometheus.HistogramOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "request_duration_seconds",
|
||||||
|
Help: "Histogram of requests to the Alerting API",
|
||||||
|
Buckets: prometheus.DefBuckets,
|
||||||
|
},
|
||||||
|
[]string{"method", "route", "status_code", "backend"},
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
32
pkg/services/ngalert/metrics/multi_org_alertmanager.go
Normal file
32
pkg/services/ngalert/metrics/multi_org_alertmanager.go
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||||
|
)
|
||||||
|
|
||||||
|
type MultiOrgAlertmanager struct {
|
||||||
|
Registerer prometheus.Registerer
|
||||||
|
ActiveConfigurations prometheus.Gauge
|
||||||
|
DiscoveredConfigurations prometheus.Gauge
|
||||||
|
registries *OrgRegistries
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager {
|
||||||
|
return &MultiOrgAlertmanager{
|
||||||
|
Registerer: r,
|
||||||
|
registries: NewOrgRegistries(),
|
||||||
|
DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "discovered_configurations",
|
||||||
|
Help: "The number of organizations we've discovered that require an Alertmanager configuration.",
|
||||||
|
}),
|
||||||
|
ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "active_configurations",
|
||||||
|
Help: "The number of active Alertmanager configurations.",
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
@ -1,22 +1,7 @@
|
|||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"regexp"
|
|
||||||
"strings"
|
|
||||||
"sync"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/prometheus/alertmanager/api/metrics"
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
||||||
|
|
||||||
"github.com/grafana/grafana/pkg/api/response"
|
|
||||||
contextmodel "github.com/grafana/grafana/pkg/services/contexthandler/model"
|
|
||||||
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
|
||||||
"github.com/grafana/grafana/pkg/util/ticker"
|
|
||||||
|
|
||||||
"github.com/grafana/grafana/pkg/web"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -37,47 +22,24 @@ func ProvideServiceForTest() *NGAlert {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type NGAlert struct {
|
type NGAlert struct {
|
||||||
// Registerer is for use by subcomponents which register their own metrics.
|
// Registerer is used by subcomponents which register their own metrics.
|
||||||
Registerer prometheus.Registerer
|
Registerer prometheus.Registerer
|
||||||
|
|
||||||
schedulerMetrics *Scheduler
|
schedulerMetrics *Scheduler
|
||||||
stateMetrics *State
|
stateMetrics *State
|
||||||
multiOrgAlertmanagerMetrics *MultiOrgAlertmanager
|
multiOrgAlertmanagerMetrics *MultiOrgAlertmanager
|
||||||
apiMetrics *API
|
apiMetrics *API
|
||||||
}
|
}
|
||||||
|
|
||||||
type Scheduler struct {
|
// NewNGAlert manages the metrics of all the alerting components.
|
||||||
Registerer prometheus.Registerer
|
func NewNGAlert(r prometheus.Registerer) *NGAlert {
|
||||||
BehindSeconds prometheus.Gauge
|
return &NGAlert{
|
||||||
EvalTotal *prometheus.CounterVec
|
Registerer: r,
|
||||||
EvalFailures *prometheus.CounterVec
|
schedulerMetrics: NewSchedulerMetrics(r),
|
||||||
EvalDuration *prometheus.HistogramVec
|
stateMetrics: NewStateMetrics(r),
|
||||||
SchedulePeriodicDuration prometheus.Histogram
|
multiOrgAlertmanagerMetrics: NewMultiOrgAlertmanagerMetrics(r),
|
||||||
SchedulableAlertRules prometheus.Gauge
|
apiMetrics: NewAPIMetrics(r),
|
||||||
SchedulableAlertRulesHash prometheus.Gauge
|
}
|
||||||
UpdateSchedulableAlertRulesDuration prometheus.Histogram
|
|
||||||
Ticker *ticker.Metrics
|
|
||||||
EvaluationMissed *prometheus.CounterVec
|
|
||||||
}
|
|
||||||
|
|
||||||
type MultiOrgAlertmanager struct {
|
|
||||||
Registerer prometheus.Registerer
|
|
||||||
ActiveConfigurations prometheus.Gauge
|
|
||||||
DiscoveredConfigurations prometheus.Gauge
|
|
||||||
registries *OrgRegistries
|
|
||||||
}
|
|
||||||
|
|
||||||
type API struct {
|
|
||||||
RequestDuration *prometheus.HistogramVec
|
|
||||||
}
|
|
||||||
|
|
||||||
type Alertmanager struct {
|
|
||||||
Registerer prometheus.Registerer
|
|
||||||
*metrics.Alerts
|
|
||||||
}
|
|
||||||
|
|
||||||
type State struct {
|
|
||||||
GroupRules *prometheus.GaugeVec
|
|
||||||
AlertState *prometheus.GaugeVec
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ng *NGAlert) GetSchedulerMetrics() *Scheduler {
|
func (ng *NGAlert) GetSchedulerMetrics() *Scheduler {
|
||||||
@ -96,25 +58,6 @@ func (ng *NGAlert) GetMultiOrgAlertmanagerMetrics() *MultiOrgAlertmanager {
|
|||||||
return ng.multiOrgAlertmanagerMetrics
|
return ng.multiOrgAlertmanagerMetrics
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewNGAlert manages the metrics of all the alerting components.
|
|
||||||
func NewNGAlert(r prometheus.Registerer) *NGAlert {
|
|
||||||
return &NGAlert{
|
|
||||||
Registerer: r,
|
|
||||||
schedulerMetrics: NewSchedulerMetrics(r),
|
|
||||||
stateMetrics: newStateMetrics(r),
|
|
||||||
multiOrgAlertmanagerMetrics: newMultiOrgAlertmanagerMetrics(r),
|
|
||||||
apiMetrics: newAPIMetrics(r),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewAlertmanagerMetrics creates a set of metrics for the Alertmanager of each organization.
|
|
||||||
func NewAlertmanagerMetrics(r prometheus.Registerer) *Alertmanager {
|
|
||||||
return &Alertmanager{
|
|
||||||
Registerer: r,
|
|
||||||
Alerts: metrics.NewAlerts("grafana", prometheus.WrapRegistererWithPrefix(fmt.Sprintf("%s_%s_", Namespace, Subsystem), r)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
|
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
|
||||||
func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) {
|
func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) {
|
||||||
moa.registries.RemoveOrgRegistry(id)
|
moa.registries.RemoveOrgRegistry(id)
|
||||||
@ -124,232 +67,3 @@ func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) {
|
|||||||
func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer {
|
func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer {
|
||||||
return moa.registries.GetOrCreateOrgRegistry(id)
|
return moa.registries.GetOrCreateOrgRegistry(id)
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
|
|
||||||
return &Scheduler{
|
|
||||||
Registerer: r,
|
|
||||||
BehindSeconds: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "scheduler_behind_seconds",
|
|
||||||
Help: "The total number of seconds the scheduler is behind.",
|
|
||||||
}),
|
|
||||||
// TODO: once rule groups support multiple rules, consider partitioning
|
|
||||||
// on rule group as well as tenant, similar to loki|cortex.
|
|
||||||
EvalTotal: promauto.With(r).NewCounterVec(
|
|
||||||
prometheus.CounterOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "rule_evaluations_total",
|
|
||||||
Help: "The total number of rule evaluations.",
|
|
||||||
},
|
|
||||||
[]string{"org"},
|
|
||||||
),
|
|
||||||
// TODO: once rule groups support multiple rules, consider partitioning
|
|
||||||
// on rule group as well as tenant, similar to loki|cortex.
|
|
||||||
EvalFailures: promauto.With(r).NewCounterVec(
|
|
||||||
prometheus.CounterOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "rule_evaluation_failures_total",
|
|
||||||
Help: "The total number of rule evaluation failures.",
|
|
||||||
},
|
|
||||||
[]string{"org"},
|
|
||||||
),
|
|
||||||
EvalDuration: promauto.With(r).NewHistogramVec(
|
|
||||||
prometheus.HistogramOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "rule_evaluation_duration_seconds",
|
|
||||||
Help: "The duration for a rule to execute.",
|
|
||||||
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
|
|
||||||
},
|
|
||||||
[]string{"org"},
|
|
||||||
),
|
|
||||||
SchedulePeriodicDuration: promauto.With(r).NewHistogram(
|
|
||||||
prometheus.HistogramOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "schedule_periodic_duration_seconds",
|
|
||||||
Help: "The time taken to run the scheduler.",
|
|
||||||
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
|
|
||||||
},
|
|
||||||
),
|
|
||||||
SchedulableAlertRules: promauto.With(r).NewGauge(
|
|
||||||
prometheus.GaugeOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "schedule_alert_rules",
|
|
||||||
Help: "The number of alert rules that could be considered for evaluation at the next tick.",
|
|
||||||
},
|
|
||||||
),
|
|
||||||
SchedulableAlertRulesHash: promauto.With(r).NewGauge(
|
|
||||||
prometheus.GaugeOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "schedule_alert_rules_hash",
|
|
||||||
Help: "A hash of the alert rules that could be considered for evaluation at the next tick.",
|
|
||||||
}),
|
|
||||||
UpdateSchedulableAlertRulesDuration: promauto.With(r).NewHistogram(
|
|
||||||
prometheus.HistogramOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "schedule_query_alert_rules_duration_seconds",
|
|
||||||
Help: "The time taken to fetch alert rules from the database.",
|
|
||||||
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
|
|
||||||
},
|
|
||||||
),
|
|
||||||
Ticker: ticker.NewMetrics(r, "alerting"),
|
|
||||||
EvaluationMissed: promauto.With(r).NewCounterVec(
|
|
||||||
prometheus.CounterOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "schedule_rule_evaluations_missed_total",
|
|
||||||
Help: "The total number of rule evaluations missed due to a slow rule evaluation.",
|
|
||||||
},
|
|
||||||
[]string{"org", "name"},
|
|
||||||
),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func newStateMetrics(r prometheus.Registerer) *State {
|
|
||||||
return &State{
|
|
||||||
// TODO: once rule groups support multiple rules, consider partitioning
|
|
||||||
// on rule group as well as tenant, similar to loki|cortex.
|
|
||||||
GroupRules: promauto.With(r).NewGaugeVec(
|
|
||||||
prometheus.GaugeOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "rule_group_rules",
|
|
||||||
Help: "The number of rules.",
|
|
||||||
},
|
|
||||||
[]string{"org"},
|
|
||||||
),
|
|
||||||
AlertState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "alerts",
|
|
||||||
Help: "How many alerts by state.",
|
|
||||||
}, []string{"state"}),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func newMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager {
|
|
||||||
return &MultiOrgAlertmanager{
|
|
||||||
Registerer: r,
|
|
||||||
registries: NewOrgRegistries(),
|
|
||||||
DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "discovered_configurations",
|
|
||||||
Help: "The number of organizations we've discovered that require an Alertmanager configuration.",
|
|
||||||
}),
|
|
||||||
ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "active_configurations",
|
|
||||||
Help: "The number of active Alertmanager configurations.",
|
|
||||||
}),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func newAPIMetrics(r prometheus.Registerer) *API {
|
|
||||||
return &API{
|
|
||||||
RequestDuration: promauto.With(r).NewHistogramVec(
|
|
||||||
prometheus.HistogramOpts{
|
|
||||||
Namespace: Namespace,
|
|
||||||
Subsystem: Subsystem,
|
|
||||||
Name: "request_duration_seconds",
|
|
||||||
Help: "Histogram of requests to the Alerting API",
|
|
||||||
Buckets: prometheus.DefBuckets,
|
|
||||||
},
|
|
||||||
[]string{"method", "route", "status_code", "backend"},
|
|
||||||
),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// OrgRegistries represents a map of registries per org.
|
|
||||||
type OrgRegistries struct {
|
|
||||||
regsMu sync.Mutex
|
|
||||||
regs map[int64]prometheus.Registerer
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewOrgRegistries() *OrgRegistries {
|
|
||||||
return &OrgRegistries{
|
|
||||||
regs: make(map[int64]prometheus.Registerer),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
|
|
||||||
func (m *OrgRegistries) GetOrCreateOrgRegistry(orgID int64) prometheus.Registerer {
|
|
||||||
m.regsMu.Lock()
|
|
||||||
defer m.regsMu.Unlock()
|
|
||||||
|
|
||||||
orgRegistry, ok := m.regs[orgID]
|
|
||||||
if !ok {
|
|
||||||
reg := prometheus.NewRegistry()
|
|
||||||
m.regs[orgID] = reg
|
|
||||||
return reg
|
|
||||||
}
|
|
||||||
return orgRegistry
|
|
||||||
}
|
|
||||||
|
|
||||||
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
|
|
||||||
func (m *OrgRegistries) RemoveOrgRegistry(org int64) {
|
|
||||||
m.regsMu.Lock()
|
|
||||||
defer m.regsMu.Unlock()
|
|
||||||
delete(m.regs, org)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Instrument wraps a middleware, instrumenting the request latencies.
|
|
||||||
func Instrument(
|
|
||||||
method,
|
|
||||||
path string,
|
|
||||||
action func(*contextmodel.ReqContext) response.Response,
|
|
||||||
metrics *API,
|
|
||||||
) web.Handler {
|
|
||||||
normalizedPath := MakeLabelValue(path)
|
|
||||||
|
|
||||||
return func(c *contextmodel.ReqContext) {
|
|
||||||
start := time.Now()
|
|
||||||
res := action(c)
|
|
||||||
|
|
||||||
// TODO: We could look up the datasource type via our datasource service
|
|
||||||
var backend string
|
|
||||||
datasourceID := web.Params(c.Req)[":DatasourceID"]
|
|
||||||
if datasourceID == apimodels.GrafanaBackend.String() || datasourceID == "" {
|
|
||||||
backend = GrafanaBackend
|
|
||||||
} else {
|
|
||||||
backend = ProxyBackend
|
|
||||||
}
|
|
||||||
|
|
||||||
ls := prometheus.Labels{
|
|
||||||
"method": method,
|
|
||||||
"route": normalizedPath,
|
|
||||||
"status_code": fmt.Sprint(res.Status()),
|
|
||||||
"backend": backend,
|
|
||||||
}
|
|
||||||
res.WriteTo(c)
|
|
||||||
metrics.RequestDuration.With(ls).Observe(time.Since(start).Seconds())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var invalidChars = regexp.MustCompile(`[^a-zA-Z0-9]+`)
|
|
||||||
|
|
||||||
// MakeLabelValue normalizes a path template
|
|
||||||
func MakeLabelValue(path string) string {
|
|
||||||
// Convert non-alnums to underscores.
|
|
||||||
result := invalidChars.ReplaceAllString(path, "_")
|
|
||||||
|
|
||||||
// Trim leading and trailing underscores.
|
|
||||||
result = strings.Trim(result, "_")
|
|
||||||
|
|
||||||
// Make it all lowercase
|
|
||||||
result = strings.ToLower(result)
|
|
||||||
|
|
||||||
// Special case.
|
|
||||||
if result == "" {
|
|
||||||
result = "root"
|
|
||||||
}
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
108
pkg/services/ngalert/metrics/scheduler.go
Normal file
108
pkg/services/ngalert/metrics/scheduler.go
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/grafana/grafana/pkg/util/ticker"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Scheduler struct {
|
||||||
|
Registerer prometheus.Registerer
|
||||||
|
BehindSeconds prometheus.Gauge
|
||||||
|
EvalTotal *prometheus.CounterVec
|
||||||
|
EvalFailures *prometheus.CounterVec
|
||||||
|
EvalDuration *prometheus.HistogramVec
|
||||||
|
SchedulePeriodicDuration prometheus.Histogram
|
||||||
|
SchedulableAlertRules prometheus.Gauge
|
||||||
|
SchedulableAlertRulesHash prometheus.Gauge
|
||||||
|
UpdateSchedulableAlertRulesDuration prometheus.Histogram
|
||||||
|
Ticker *ticker.Metrics
|
||||||
|
EvaluationMissed *prometheus.CounterVec
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
|
||||||
|
return &Scheduler{
|
||||||
|
Registerer: r,
|
||||||
|
BehindSeconds: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "scheduler_behind_seconds",
|
||||||
|
Help: "The total number of seconds the scheduler is behind.",
|
||||||
|
}),
|
||||||
|
// TODO: once rule groups support multiple rules, consider partitioning
|
||||||
|
// on rule group as well as tenant, similar to loki|cortex.
|
||||||
|
EvalTotal: promauto.With(r).NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "rule_evaluations_total",
|
||||||
|
Help: "The total number of rule evaluations.",
|
||||||
|
},
|
||||||
|
[]string{"org"},
|
||||||
|
),
|
||||||
|
// TODO: once rule groups support multiple rules, consider partitioning
|
||||||
|
// on rule group as well as tenant, similar to loki|cortex.
|
||||||
|
EvalFailures: promauto.With(r).NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "rule_evaluation_failures_total",
|
||||||
|
Help: "The total number of rule evaluation failures.",
|
||||||
|
},
|
||||||
|
[]string{"org"},
|
||||||
|
),
|
||||||
|
EvalDuration: promauto.With(r).NewHistogramVec(
|
||||||
|
prometheus.HistogramOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "rule_evaluation_duration_seconds",
|
||||||
|
Help: "The duration for a rule to execute.",
|
||||||
|
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
|
||||||
|
},
|
||||||
|
[]string{"org"},
|
||||||
|
),
|
||||||
|
SchedulePeriodicDuration: promauto.With(r).NewHistogram(
|
||||||
|
prometheus.HistogramOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "schedule_periodic_duration_seconds",
|
||||||
|
Help: "The time taken to run the scheduler.",
|
||||||
|
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
|
||||||
|
},
|
||||||
|
),
|
||||||
|
SchedulableAlertRules: promauto.With(r).NewGauge(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "schedule_alert_rules",
|
||||||
|
Help: "The number of alert rules that could be considered for evaluation at the next tick.",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
SchedulableAlertRulesHash: promauto.With(r).NewGauge(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "schedule_alert_rules_hash",
|
||||||
|
Help: "A hash of the alert rules that could be considered for evaluation at the next tick.",
|
||||||
|
}),
|
||||||
|
UpdateSchedulableAlertRulesDuration: promauto.With(r).NewHistogram(
|
||||||
|
prometheus.HistogramOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "schedule_query_alert_rules_duration_seconds",
|
||||||
|
Help: "The time taken to fetch alert rules from the database.",
|
||||||
|
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
|
||||||
|
},
|
||||||
|
),
|
||||||
|
Ticker: ticker.NewMetrics(r, "alerting"),
|
||||||
|
EvaluationMissed: promauto.With(r).NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "schedule_rule_evaluations_missed_total",
|
||||||
|
Help: "The total number of rule evaluations missed due to a slow rule evaluation.",
|
||||||
|
},
|
||||||
|
[]string{"org", "name"},
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
33
pkg/services/ngalert/metrics/state.go
Normal file
33
pkg/services/ngalert/metrics/state.go
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||||
|
)
|
||||||
|
|
||||||
|
type State struct {
|
||||||
|
GroupRules *prometheus.GaugeVec
|
||||||
|
AlertState *prometheus.GaugeVec
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewStateMetrics(r prometheus.Registerer) *State {
|
||||||
|
return &State{
|
||||||
|
// TODO: once rule groups support multiple rules, consider partitioning
|
||||||
|
// on rule group as well as tenant, similar to loki|cortex.
|
||||||
|
GroupRules: promauto.With(r).NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "rule_group_rules",
|
||||||
|
Help: "The number of rules.",
|
||||||
|
},
|
||||||
|
[]string{"org"},
|
||||||
|
),
|
||||||
|
AlertState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "alerts",
|
||||||
|
Help: "How many alerts by state.",
|
||||||
|
}, []string{"state"}),
|
||||||
|
}
|
||||||
|
}
|
104
pkg/services/ngalert/metrics/util.go
Normal file
104
pkg/services/ngalert/metrics/util.go
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/grafana/grafana/pkg/web"
|
||||||
|
|
||||||
|
"github.com/grafana/grafana/pkg/api/response"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
|
||||||
|
contextmodel "github.com/grafana/grafana/pkg/services/contexthandler/model"
|
||||||
|
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
||||||
|
)
|
||||||
|
|
||||||
|
// OrgRegistries represents a map of registries per org.
|
||||||
|
type OrgRegistries struct {
|
||||||
|
regsMu sync.Mutex
|
||||||
|
regs map[int64]prometheus.Registerer
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewOrgRegistries() *OrgRegistries {
|
||||||
|
return &OrgRegistries{
|
||||||
|
regs: make(map[int64]prometheus.Registerer),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
|
||||||
|
func (m *OrgRegistries) GetOrCreateOrgRegistry(orgID int64) prometheus.Registerer {
|
||||||
|
m.regsMu.Lock()
|
||||||
|
defer m.regsMu.Unlock()
|
||||||
|
|
||||||
|
orgRegistry, ok := m.regs[orgID]
|
||||||
|
if !ok {
|
||||||
|
reg := prometheus.NewRegistry()
|
||||||
|
m.regs[orgID] = reg
|
||||||
|
return reg
|
||||||
|
}
|
||||||
|
return orgRegistry
|
||||||
|
}
|
||||||
|
|
||||||
|
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
|
||||||
|
func (m *OrgRegistries) RemoveOrgRegistry(org int64) {
|
||||||
|
m.regsMu.Lock()
|
||||||
|
defer m.regsMu.Unlock()
|
||||||
|
delete(m.regs, org)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Instrument wraps a middleware, instrumenting the request latencies.
|
||||||
|
func Instrument(
|
||||||
|
method,
|
||||||
|
path string,
|
||||||
|
action func(*contextmodel.ReqContext) response.Response,
|
||||||
|
metrics *API,
|
||||||
|
) web.Handler {
|
||||||
|
normalizedPath := MakeLabelValue(path)
|
||||||
|
|
||||||
|
return func(c *contextmodel.ReqContext) {
|
||||||
|
start := time.Now()
|
||||||
|
res := action(c)
|
||||||
|
|
||||||
|
// TODO: We could look up the datasource type via our datasource service
|
||||||
|
var backend string
|
||||||
|
datasourceID := web.Params(c.Req)[":DatasourceID"]
|
||||||
|
if datasourceID == apimodels.GrafanaBackend.String() || datasourceID == "" {
|
||||||
|
backend = GrafanaBackend
|
||||||
|
} else {
|
||||||
|
backend = ProxyBackend
|
||||||
|
}
|
||||||
|
|
||||||
|
ls := prometheus.Labels{
|
||||||
|
"method": method,
|
||||||
|
"route": normalizedPath,
|
||||||
|
"status_code": fmt.Sprint(res.Status()),
|
||||||
|
"backend": backend,
|
||||||
|
}
|
||||||
|
res.WriteTo(c)
|
||||||
|
metrics.RequestDuration.With(ls).Observe(time.Since(start).Seconds())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// invalidChars matches every run of characters that are not ASCII letters or
// digits.
var invalidChars = regexp.MustCompile(`[^a-zA-Z0-9]+`)

// MakeLabelValue normalizes a path template into a metric label value.
//
// Each run of non-alphanumeric characters is collapsed into one underscore,
// leading and trailing underscores are removed, and the result is lowercased.
// A path that normalizes to the empty string yields "root".
func MakeLabelValue(path string) string {
	// Order matters: replace first, then trim, then lowercase.
	normalized := strings.ToLower(strings.Trim(invalidChars.ReplaceAllString(path, "_"), "_"))
	if normalized != "" {
		return normalized
	}

	// Special case: nothing but separators (or nothing at all) was given.
	return "root"
}
|
Loading…
Reference in New Issue
Block a user