Alerting/metrics (#33547)

* moves alerting metrics to their own pkg

* adds grafana_alerting_alerts (by state) metric

* alerts_received_{total,invalid}

* embed the Alertmanager alerts struct in ngalert metrics & remove duplicated notification metrics (the Alertmanager notifier metrics are already embedded)

* use silence metrics from alertmanager lib

* fix - manager has metrics

* updates ngalert tests

* comment lint
Signed-off-by: Owen Diehl <ow.diehl@gmail.com>

* cleaner prom registry code

* removes ngalert global metrics

* use a fresh registry in all tests

* ngalert metrics implemented as a service; hack testinfra code to prevent duplicate metric registrations

* nilmetrics unexported
Owen Diehl 2021-04-30 12:28:06 -04:00 committed by GitHub
parent b45120b999
commit 5e48b54549
15 changed files with 165 additions and 137 deletions
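
For illustration only (not part of the diff below), a minimal Go sketch of the two patterns the updated tests rely on to avoid duplicate-registration panics; it assumes only the metrics package API introduced in this commit (NewMetrics, GlobalMetrics, SwapRegisterer):

package example

import (
	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)

// isolatedMetrics is a hypothetical helper mirroring the updated unit tests:
// each test builds its own metric suite over a fresh registry, so repeated
// construction never collides on the default registerer.
func isolatedMetrics() *metrics.Metrics {
	return metrics.NewMetrics(prometheus.NewRegistry())
}

// resetGlobalMetrics is a hypothetical helper mirroring the test-infra
// workaround: the globally registered service keeps its identity, but its
// collectors are re-created against a throwaway registry between startups.
func resetGlobalMetrics() {
	metrics.GlobalMetrics.SwapRegisterer(prometheus.NewRegistry())
}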

View File

@ -3,8 +3,8 @@ package api
import (
"time"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/prometheus/client_golang/prometheus"
"github.com/go-macaron/binding"
@ -24,11 +24,6 @@ import (
// timeNow makes it possible to test usage of time
var timeNow = time.Now
// metrics are a globally registered metric suite for alerting.
// TODO: refactor testware to allow these to be created without
// panicking on duplicate registration, thus enabling non-global vars.
var metrics = NewMetrics(prometheus.DefaultRegisterer)
type Alertmanager interface {
// Configuration
SaveAndApplyConfig(config *apimodels.PostableUserConfig) error
@ -60,7 +55,7 @@ type API struct {
}
// RegisterAPIEndpoints registers API handlers
func (api *API) RegisterAPIEndpoints() {
func (api *API) RegisterAPIEndpoints(m *metrics.Metrics) {
logger := log.New("ngalert.api")
proxy := &AlertingProxy{
DataProxy: api.DataProxy,
@ -71,26 +66,26 @@ func (api *API) RegisterAPIEndpoints() {
api.DatasourceCache,
NewLotexAM(proxy, logger),
AlertmanagerSrv{store: api.AlertingStore, am: api.Alertmanager, log: logger},
), metrics)
), m)
// Register endpoints for proxing to Prometheus-compatible backends.
api.RegisterPrometheusApiEndpoints(NewForkedProm(
api.DatasourceCache,
NewLotexProm(proxy, logger),
PrometheusSrv{log: logger, manager: api.StateManager, store: api.RuleStore},
), metrics)
), m)
// Register endpoints for proxing to Cortex Ruler-compatible backends.
api.RegisterRulerApiEndpoints(NewForkedRuler(
api.DatasourceCache,
NewLotexRuler(proxy, logger),
RulerSrv{DatasourceCache: api.DatasourceCache, store: api.RuleStore, log: logger},
), metrics)
), m)
api.RegisterTestingApiEndpoints(TestingApiSrv{
AlertingProxy: proxy,
Cfg: api.Cfg,
DataService: api.DataService,
DatasourceCache: api.DatasourceCache,
log: logger,
}, metrics)
}, m)
// Legacy routes; they will be removed in v8
api.RouteRegister.Group("/api/alert-definitions", func(alertDefinitions routing.RouteRegister) {

View File

@ -17,6 +17,7 @@ import (
"github.com/grafana/grafana/pkg/middleware"
"github.com/grafana/grafana/pkg/models"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)
type AlertmanagerApiService interface {
@ -32,99 +33,99 @@ type AlertmanagerApiService interface {
RoutePostAlertingConfig(*models.ReqContext, apimodels.PostableUserConfig) response.Response
}
func (api *API) RegisterAlertmanagerApiEndpoints(srv AlertmanagerApiService, metrics *Metrics) {
func (api *API) RegisterAlertmanagerApiEndpoints(srv AlertmanagerApiService, m *metrics.Metrics) {
api.RouteRegister.Group("", func(group routing.RouteRegister) {
group.Post(
toMacaronPath("/api/alertmanager/{Recipient}/api/v2/silences"),
binding.Bind(apimodels.PostableSilence{}),
Instrument(
metrics.Instrument(
http.MethodPost,
"/api/alertmanager/{Recipient}/api/v2/silences",
srv.RouteCreateSilence,
metrics,
m,
),
)
group.Delete(
toMacaronPath("/api/alertmanager/{Recipient}/config/api/v1/alerts"),
Instrument(
metrics.Instrument(
http.MethodDelete,
"/api/alertmanager/{Recipient}/config/api/v1/alerts",
srv.RouteDeleteAlertingConfig,
metrics,
m,
),
)
group.Delete(
toMacaronPath("/api/alertmanager/{Recipient}/api/v2/silence/{SilenceId}"),
Instrument(
metrics.Instrument(
http.MethodDelete,
"/api/alertmanager/{Recipient}/api/v2/silence/{SilenceId}",
srv.RouteDeleteSilence,
metrics,
m,
),
)
group.Get(
toMacaronPath("/api/alertmanager/{Recipient}/api/v2/alerts/groups"),
Instrument(
metrics.Instrument(
http.MethodGet,
"/api/alertmanager/{Recipient}/api/v2/alerts/groups",
srv.RouteGetAMAlertGroups,
metrics,
m,
),
)
group.Get(
toMacaronPath("/api/alertmanager/{Recipient}/api/v2/alerts"),
Instrument(
metrics.Instrument(
http.MethodGet,
"/api/alertmanager/{Recipient}/api/v2/alerts",
srv.RouteGetAMAlerts,
metrics,
m,
),
)
group.Get(
toMacaronPath("/api/alertmanager/{Recipient}/config/api/v1/alerts"),
Instrument(
metrics.Instrument(
http.MethodGet,
"/api/alertmanager/{Recipient}/config/api/v1/alerts",
srv.RouteGetAlertingConfig,
metrics,
m,
),
)
group.Get(
toMacaronPath("/api/alertmanager/{Recipient}/api/v2/silence/{SilenceId}"),
Instrument(
metrics.Instrument(
http.MethodGet,
"/api/alertmanager/{Recipient}/api/v2/silence/{SilenceId}",
srv.RouteGetSilence,
metrics,
m,
),
)
group.Get(
toMacaronPath("/api/alertmanager/{Recipient}/api/v2/silences"),
Instrument(
metrics.Instrument(
http.MethodGet,
"/api/alertmanager/{Recipient}/api/v2/silences",
srv.RouteGetSilences,
metrics,
m,
),
)
group.Post(
toMacaronPath("/api/alertmanager/{Recipient}/api/v2/alerts"),
binding.Bind(apimodels.PostableAlerts{}),
Instrument(
metrics.Instrument(
http.MethodPost,
"/api/alertmanager/{Recipient}/api/v2/alerts",
srv.RoutePostAMAlerts,
metrics,
m,
),
)
group.Post(
toMacaronPath("/api/alertmanager/{Recipient}/config/api/v1/alerts"),
binding.Bind(apimodels.PostableUserConfig{}),
Instrument(
metrics.Instrument(
http.MethodPost,
"/api/alertmanager/{Recipient}/config/api/v1/alerts",
srv.RoutePostAlertingConfig,
metrics,
m,
),
)
}, middleware.ReqSignedIn)

View File

@ -14,6 +14,7 @@ import (
"github.com/grafana/grafana/pkg/api/routing"
"github.com/grafana/grafana/pkg/middleware"
"github.com/grafana/grafana/pkg/models"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)
type PrometheusApiService interface {
@ -21,24 +22,24 @@ type PrometheusApiService interface {
RouteGetRuleStatuses(*models.ReqContext) response.Response
}
func (api *API) RegisterPrometheusApiEndpoints(srv PrometheusApiService, metrics *Metrics) {
func (api *API) RegisterPrometheusApiEndpoints(srv PrometheusApiService, m *metrics.Metrics) {
api.RouteRegister.Group("", func(group routing.RouteRegister) {
group.Get(
toMacaronPath("/api/prometheus/{Recipient}/api/v1/alerts"),
Instrument(
metrics.Instrument(
http.MethodGet,
"/api/prometheus/{Recipient}/api/v1/alerts",
srv.RouteGetAlertStatuses,
metrics,
m,
),
)
group.Get(
toMacaronPath("/api/prometheus/{Recipient}/api/v1/rules"),
Instrument(
metrics.Instrument(
http.MethodGet,
"/api/prometheus/{Recipient}/api/v1/rules",
srv.RouteGetRuleStatuses,
metrics,
m,
),
)
}, middleware.ReqSignedIn)

View File

@ -17,6 +17,7 @@ import (
"github.com/grafana/grafana/pkg/middleware"
"github.com/grafana/grafana/pkg/models"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)
type RulerApiService interface {
@ -28,61 +29,61 @@ type RulerApiService interface {
RoutePostNameRulesConfig(*models.ReqContext, apimodels.PostableRuleGroupConfig) response.Response
}
func (api *API) RegisterRulerApiEndpoints(srv RulerApiService, metrics *Metrics) {
func (api *API) RegisterRulerApiEndpoints(srv RulerApiService, m *metrics.Metrics) {
api.RouteRegister.Group("", func(group routing.RouteRegister) {
group.Delete(
toMacaronPath("/api/ruler/{Recipient}/api/v1/rules/{Namespace}"),
Instrument(
metrics.Instrument(
http.MethodDelete,
"/api/ruler/{Recipient}/api/v1/rules/{Namespace}",
srv.RouteDeleteNamespaceRulesConfig,
metrics,
m,
),
)
group.Delete(
toMacaronPath("/api/ruler/{Recipient}/api/v1/rules/{Namespace}/{Groupname}"),
Instrument(
metrics.Instrument(
http.MethodDelete,
"/api/ruler/{Recipient}/api/v1/rules/{Namespace}/{Groupname}",
srv.RouteDeleteRuleGroupConfig,
metrics,
m,
),
)
group.Get(
toMacaronPath("/api/ruler/{Recipient}/api/v1/rules/{Namespace}"),
Instrument(
metrics.Instrument(
http.MethodGet,
"/api/ruler/{Recipient}/api/v1/rules/{Namespace}",
srv.RouteGetNamespaceRulesConfig,
metrics,
m,
),
)
group.Get(
toMacaronPath("/api/ruler/{Recipient}/api/v1/rules/{Namespace}/{Groupname}"),
Instrument(
metrics.Instrument(
http.MethodGet,
"/api/ruler/{Recipient}/api/v1/rules/{Namespace}/{Groupname}",
srv.RouteGetRulegGroupConfig,
metrics,
m,
),
)
group.Get(
toMacaronPath("/api/ruler/{Recipient}/api/v1/rules"),
Instrument(
metrics.Instrument(
http.MethodGet,
"/api/ruler/{Recipient}/api/v1/rules",
srv.RouteGetRulesConfig,
metrics,
m,
),
)
group.Post(
toMacaronPath("/api/ruler/{Recipient}/api/v1/rules/{Namespace}"),
binding.Bind(apimodels.PostableRuleGroupConfig{}),
Instrument(
metrics.Instrument(
http.MethodPost,
"/api/ruler/{Recipient}/api/v1/rules/{Namespace}",
srv.RoutePostNameRulesConfig,
metrics,
m,
),
)
}, middleware.ReqSignedIn)

View File

@ -17,6 +17,7 @@ import (
"github.com/grafana/grafana/pkg/middleware"
"github.com/grafana/grafana/pkg/models"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)
type TestingApiService interface {
@ -25,36 +26,36 @@ type TestingApiService interface {
RouteTestRuleConfig(*models.ReqContext, apimodels.TestRulePayload) response.Response
}
func (api *API) RegisterTestingApiEndpoints(srv TestingApiService, metrics *Metrics) {
func (api *API) RegisterTestingApiEndpoints(srv TestingApiService, m *metrics.Metrics) {
api.RouteRegister.Group("", func(group routing.RouteRegister) {
group.Post(
toMacaronPath("/api/v1/eval"),
binding.Bind(apimodels.EvalQueriesPayload{}),
Instrument(
metrics.Instrument(
http.MethodPost,
"/api/v1/eval",
srv.RouteEvalQueries,
metrics,
m,
),
)
group.Post(
toMacaronPath("/api/v1/receiver/test/{Recipient}"),
binding.Bind(apimodels.ExtendedReceiver{}),
Instrument(
metrics.Instrument(
http.MethodPost,
"/api/v1/receiver/test/{Recipient}",
srv.RouteTestReceiverConfig,
metrics,
m,
),
)
group.Post(
toMacaronPath("/api/v1/rule/test/{Recipient}"),
binding.Bind(apimodels.TestRulePayload{}),
Instrument(
metrics.Instrument(
http.MethodPost,
"/api/v1/rule/test/{Recipient}",
srv.RouteTestRuleConfig,
metrics,
m,
),
)
}, middleware.ReqSignedIn)

View File

@ -9,6 +9,7 @@ import (
"github.com/grafana/grafana/pkg/api/response"
"github.com/grafana/grafana/pkg/models"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/middleware"
)
@ -16,16 +17,16 @@ type {{classname}}Service interface { {{#operation}}
{{nickname}}(*models.ReqContext{{#bodyParams}}, apimodels.{{dataType}}{{/bodyParams}}) response.Response{{/operation}}
}
func (api *API) Register{{classname}}Endpoints(srv {{classname}}Service, metrics *Metrics) {
func (api *API) Register{{classname}}Endpoints(srv {{classname}}Service, m *metrics.Metrics) {
api.RouteRegister.Group("", func(group routing.RouteRegister){ {{#operations}}{{#operation}}
group.{{httpMethod}}(
toMacaronPath("{{{path}}}"){{#bodyParams}},
binding.Bind(apimodels.{{dataType}}{}){{/bodyParams}},
Instrument(
metrics.Instrument(
http.Method{{httpMethod}},
"{{{path}}}",
srv.{{nickname}},
metrics,
m,
),
){{/operation}}{{/operations}}
}, middleware.ReqSignedIn)

View File

@ -1,4 +1,4 @@
package api
package metrics
import (
"fmt"
@ -6,9 +6,12 @@ import (
"strings"
"time"
"github.com/prometheus/alertmanager/api/metrics"
"github.com/grafana/grafana/pkg/api/response"
"github.com/grafana/grafana/pkg/api/routing"
"github.com/grafana/grafana/pkg/models"
"github.com/grafana/grafana/pkg/registry"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
@ -20,57 +23,42 @@ const (
ProxyBackend = "proxy"
)
var GlobalMetrics = NewMetrics(prometheus.DefaultRegisterer)
type Metrics struct {
alerts *prometheus.GaugeVec
alertsInvalid prometheus.Counter
alertsReceived prometheus.Counter
notificationLatency prometheus.Histogram
notifications *prometheus.CounterVec
notificationsFailed *prometheus.CounterVec
requestDuration *prometheus.HistogramVec
silences *prometheus.GaugeVec
*metrics.Alerts
AlertState *prometheus.GaugeVec
// Registerer is for use by subcomponents which register their own metrics.
Registerer prometheus.Registerer
RequestDuration *prometheus.HistogramVec
}
func init() {
registry.RegisterService(GlobalMetrics)
}
func (m *Metrics) Init() error {
return nil
}
// SwapRegisterer overwrites the prometheus register used by a *Metrics in place.
// It's used by tests to prevent duplicate registration errors
func (m *Metrics) SwapRegisterer(r prometheus.Registerer) {
next := NewMetrics(r)
*m = *next
}
func NewMetrics(r prometheus.Registerer) *Metrics {
return &Metrics{
alerts: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
Alerts: metrics.NewAlerts("v2", r),
AlertState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "alerts",
Help: "How many alerts by state.",
}, []string{"state"}),
alertsInvalid: promauto.With(r).NewCounter(prometheus.CounterOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "alerts_invalid_total",
Help: "The total number of invalid received alerts.",
}),
alertsReceived: promauto.With(r).NewCounter(prometheus.CounterOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "alerts_received_total",
Help: "The total number of received alerts.",
}),
notificationLatency: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "notification_latency_seconds",
Help: "Histogram of notification deliveries",
Buckets: prometheus.DefBuckets,
}),
notifications: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "notifications_total",
Help: "The total number of attempted notfications by integration.",
}, []string{"integration"}),
notificationsFailed: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "notifications_failed_total",
Help: "The total number of failed notfications by integration.",
}, []string{"integration"}),
requestDuration: promauto.With(r).NewHistogramVec(
Registerer: r,
RequestDuration: promauto.With(r).NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "grafana",
Subsystem: "alerting",
@ -80,12 +68,6 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
},
[]string{"method", "route", "status_code", "backend"},
),
silences: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "silences",
Help: "The total number of silences by state.",
}, []string{"state"}),
}
}
@ -124,7 +106,7 @@ func Instrument(
"backend": backend,
}
res.WriteTo(c)
metrics.requestDuration.With(ls).Observe(time.Since(start).Seconds())
metrics.RequestDuration.With(ls).Observe(time.Since(start).Seconds())
}
}
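
A hypothetical test (not in this commit) exercising the new grafana_alerting_alerts gauge; it assumes nothing beyond the exported AlertState field and NewMetrics shown above, plus the client_golang testutil helpers:

package example

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"

	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)

// TestAlertStateGauge is a sketch only: it sets one label of the
// grafana_alerting_alerts gauge and reads it back. The state cache's trim()
// (see the diff further below) performs the same Set() per state.
func TestAlertStateGauge(t *testing.T) {
	m := metrics.NewMetrics(prometheus.NewRegistry())
	m.AlertState.WithLabelValues("alerting").Set(3)
	require.Equal(t, 3.0, testutil.ToFloat64(m.AlertState.WithLabelValues("alerting")))
}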

View File

@ -4,6 +4,7 @@ import (
"context"
"time"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/benbjohnson/clock"
@ -45,6 +46,7 @@ type AlertNG struct {
DataService *tsdb.Service `inject:""`
Alertmanager *notifier.Alertmanager `inject:""`
DataProxy *datasourceproxy.DatasourceProxyService `inject:""`
Metrics *metrics.Metrics `inject:""`
Log log.Logger
schedule schedule.ScheduleService
stateManager *state.Manager
@ -57,7 +59,7 @@ func init() {
// Init initializes the AlertingService.
func (ng *AlertNG) Init() error {
ng.Log = log.New("ngalert")
ng.stateManager = state.NewManager(ng.Log)
ng.stateManager = state.NewManager(ng.Log, ng.Metrics)
baseInterval := baseIntervalSeconds * time.Second
store := store.DBstore{BaseInterval: baseInterval, DefaultIntervalSeconds: defaultIntervalSeconds, SQLStore: ng.SQLStore}
@ -87,7 +89,7 @@ func (ng *AlertNG) Init() error {
Alertmanager: ng.Alertmanager,
StateManager: ng.stateManager,
}
api.RegisterAPIEndpoints()
api.RegisterAPIEndpoints(ng.Metrics)
return nil
}

View File

@ -22,7 +22,6 @@ import (
"github.com/prometheus/alertmanager/silence"
"github.com/prometheus/alertmanager/template"
"github.com/prometheus/alertmanager/types"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"github.com/grafana/grafana/pkg/components/securejsondata"
@ -31,6 +30,7 @@ import (
"github.com/grafana/grafana/pkg/registry"
"github.com/grafana/grafana/pkg/services/alerting"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/notifier/channels"
"github.com/grafana/grafana/pkg/services/ngalert/store"
@ -81,6 +81,7 @@ type Alertmanager struct {
Settings *setting.Cfg `inject:""`
SQLStore *sqlstore.SQLStore `inject:""`
Store store.AlertingStore
Metrics *metrics.Metrics `inject:""`
notificationLog *nflog.Log
marker types.Marker
@ -116,13 +117,19 @@ func (am *Alertmanager) IsDisabled() bool {
return !am.Settings.IsNgAlertEnabled()
}
func (am *Alertmanager) Init() (err error) {
func (am *Alertmanager) Init() error {
return am.InitWithMetrics(am.Metrics)
}
// InitWithMetrics uses the supplied metrics for instantiation and
// allows testware to circumvent duplicate registration errors.
func (am *Alertmanager) InitWithMetrics(m *metrics.Metrics) (err error) {
am.stopc = make(chan struct{})
am.logger = log.New("alertmanager")
r := prometheus.NewRegistry()
am.marker = types.NewMarker(r)
am.stageMetrics = notify.NewMetrics(r)
am.dispatcherMetrics = dispatch.NewDispatcherMetrics(r)
am.marker = types.NewMarker(m.Registerer)
am.stageMetrics = notify.NewMetrics(m.Registerer)
am.dispatcherMetrics = dispatch.NewDispatcherMetrics(m.Registerer)
am.Metrics = m
am.Store = store.DBstore{SQLStore: am.SQLStore}
// Initialize the notification log
@ -137,6 +144,7 @@ func (am *Alertmanager) Init() (err error) {
}
// Initialize silences
am.silences, err = silence.New(silence.Options{
Metrics: m.Registerer,
SnapshotFile: filepath.Join(am.WorkingDirPath(), "silences"),
Retention: retentionNotificationsAndSilences,
})
@ -456,12 +464,19 @@ func (am *Alertmanager) PutAlerts(postableAlerts apimodels.PostableAlerts) error
alert.EndsAt = now.Add(defaultResolveTimeout)
}
if alert.EndsAt.After(now) {
am.Metrics.Firing().Inc()
} else {
am.Metrics.Resolved().Inc()
}
if err := alert.Validate(); err != nil {
if validationErr == nil {
validationErr = &AlertValidationError{}
}
validationErr.Alerts = append(validationErr.Alerts, a)
validationErr.Errors = append(validationErr.Errors, err)
am.Metrics.Invalid().Inc()
continue
}
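
Similarly, a hypothetical test (not part of this change) poking the received/invalid counters that PutAlerts increments above; Firing, Resolved and Invalid are promoted from the embedded Alertmanager Alerts struct:

package example

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"

	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)

// TestReceivedAlertCounters is a sketch only: it increments the counters
// directly rather than going through PutAlerts, which would need a fully
// initialized Alertmanager; it just demonstrates where the counters live.
func TestReceivedAlertCounters(t *testing.T) {
	m := metrics.NewMetrics(prometheus.NewRegistry())
	m.Firing().Inc()
	m.Firing().Inc()
	m.Invalid().Inc()
	require.Equal(t, 2.0, testutil.ToFloat64(m.Firing()))
	require.Equal(t, 1.0, testutil.ToFloat64(m.Invalid()))
}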

View File

@ -19,6 +19,7 @@ import (
"github.com/stretchr/testify/require"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/sqlstore"
"github.com/grafana/grafana/pkg/setting"
)
@ -27,7 +28,7 @@ func TestAlertmanager_ShouldUseDefaultConfigurationWhenNoConfiguration(t *testin
am := &Alertmanager{}
am.Settings = &setting.Cfg{}
am.SQLStore = sqlstore.InitTestDB(t)
require.NoError(t, am.Init())
require.NoError(t, am.InitWithMetrics(metrics.NewMetrics(prometheus.NewRegistry())))
require.NoError(t, am.SyncAndApplyConfigFromDatabase())
require.NotNil(t, am.config)
}
@ -44,7 +45,7 @@ func TestPutAlert(t *testing.T) {
DataPath: dir,
}
require.NoError(t, am.Init())
require.NoError(t, am.InitWithMetrics(metrics.NewMetrics(prometheus.NewRegistry())))
startTime := time.Now()
endTime := startTime.Add(2 * time.Hour)

View File

@ -2,12 +2,14 @@ package state
import (
"fmt"
"strings"
"sync"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
prometheusModel "github.com/prometheus/common/model"
)
@ -16,12 +18,14 @@ type cache struct {
states map[string]*State
mtxStates sync.RWMutex
log log.Logger
metrics *metrics.Metrics
}
func newCache(logger log.Logger) *cache {
func newCache(logger log.Logger, metrics *metrics.Metrics) *cache {
return &cache{
states: make(map[string]*State),
log: logger,
states: make(map[string]*State),
log: logger,
metrics: metrics,
}
}
@ -118,13 +122,23 @@ func (c *cache) reset() {
func (c *cache) trim() {
c.mtxStates.Lock()
defer c.mtxStates.Unlock()
ct := make(map[eval.State]int)
for _, v := range c.states {
if len(v.Results) > 100 {
newResults := make([]Evaluation, 100)
copy(newResults, v.Results[100:])
// Keep last 100 results
copy(newResults, v.Results[len(v.Results)-100:])
v.Results = newResults
c.set(v)
}
n := ct[v.State]
ct[v.State] = n + 1
}
for k, n := range ct {
c.metrics.AlertState.WithLabelValues(strings.ToLower(k.String())).Set(float64(n))
}
}

View File

@ -1,25 +1,29 @@
package state
import (
"fmt"
"time"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
)
type Manager struct {
cache *cache
quit chan struct{}
Log log.Logger
cache *cache
quit chan struct{}
Log log.Logger
metrics *metrics.Metrics
}
func NewManager(logger log.Logger) *Manager {
func NewManager(logger log.Logger, metrics *metrics.Metrics) *Manager {
manager := &Manager{
cache: newCache(logger),
quit: make(chan struct{}),
Log: logger,
cache: newCache(logger, metrics),
quit: make(chan struct{}),
Log: logger,
metrics: metrics,
}
go manager.cleanUp()
return manager
@ -95,8 +99,11 @@ func (st *Manager) GetStatesByRuleUID() map[string][]*State {
}
func (st *Manager) cleanUp() {
ticker := time.NewTicker(time.Duration(60) * time.Minute)
st.Log.Debug("starting cleanup process", "intervalMinutes", 60)
// TODO: parameterize?
// Setting to a reasonable default scrape interval for Prometheus.
dur := time.Duration(15) * time.Second
ticker := time.NewTicker(dur)
st.Log.Debug("starting cleanup process", "dur", fmt.Sprint(dur))
for {
select {
case <-ticker.C:

View File

@ -6,6 +6,7 @@ import (
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/infra/log"
@ -16,6 +17,8 @@ import (
"github.com/stretchr/testify/assert"
)
var nilMetrics = metrics.NewMetrics(nil)
func TestProcessEvalResults(t *testing.T) {
evaluationTime, err := time.Parse("2006-01-02", "2021-03-25")
if err != nil {
@ -775,7 +778,7 @@ func TestProcessEvalResults(t *testing.T) {
}
for _, tc := range testCases {
st := state.NewManager(log.New("test_state_manager"))
st := state.NewManager(log.New("test_state_manager"), nilMetrics)
t.Run(tc.desc, func(t *testing.T) {
for _, res := range tc.evalResults {
_ = st.ProcessEvalResults(tc.alertRule, res)

View File

@ -94,7 +94,7 @@ func TestWarmStateCache(t *testing.T) {
Store: dbstore,
}
sched := schedule.NewScheduler(schedCfg, nil)
st := state.NewManager(schedCfg.Logger)
st := state.NewManager(schedCfg.Logger, nilMetrics)
sched.WarmStateCache(st)
t.Run("instance cache has expected entries", func(t *testing.T) {
@ -140,7 +140,7 @@ func TestAlertingTicker(t *testing.T) {
ctx := context.Background()
st := state.NewManager(schedCfg.Logger)
st := state.NewManager(schedCfg.Logger, nilMetrics)
go func() {
err := sched.Ticker(ctx, st)
require.NoError(t, err)

View File

@ -16,7 +16,9 @@ import (
"github.com/grafana/grafana/pkg/models"
"github.com/grafana/grafana/pkg/registry"
"github.com/grafana/grafana/pkg/server"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/sqlstore"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"gopkg.in/ini.v1"
@ -26,8 +28,10 @@ import (
// The server address is returned.
func StartGrafana(t *testing.T, grafDir, cfgPath string, sqlStore *sqlstore.SQLStore) string {
t.Helper()
ctx := context.Background()
// Prevent duplicate registration errors between tests by replacing
// the registry used.
metrics.GlobalMetrics.SwapRegisterer(prometheus.NewRegistry())
origSQLStore := registry.GetService(sqlstore.ServiceName)
t.Cleanup(func() {