mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Refactor & fix unified alerting metrics structure (#39151)
* Alerting: Refactor & fix unified alerting metrics structure Fixes and refactors the metrics structure we have for the ngalert service. Now, each component has its own metric struct that includes just the metrics it uses. Additionally, I have fixed the configuration metrics and added new metrics to determine if we have discovered and started all the necessary configurations of an instance. This allows us to alert on `grafana_alerting_discovered_configurations - grafana_alerting_active_configurations != 0` to know whether an alertmanager instance did not start successfully.
This commit is contained in:
@@ -66,7 +66,7 @@ type API struct {
|
||||
}
|
||||
|
||||
// RegisterAPIEndpoints registers API handlers
|
||||
func (api *API) RegisterAPIEndpoints(m *metrics.Metrics) {
|
||||
func (api *API) RegisterAPIEndpoints(m *metrics.API) {
|
||||
logger := log.New("ngalert.api")
|
||||
proxy := &AlertingProxy{
|
||||
DataProxy: api.DataProxy,
|
||||
|
||||
@@ -34,7 +34,7 @@ type AlertmanagerApiService interface {
|
||||
RoutePostTestReceivers(*models.ReqContext, apimodels.TestReceiversConfigParams) response.Response
|
||||
}
|
||||
|
||||
func (api *API) RegisterAlertmanagerApiEndpoints(srv AlertmanagerApiService, m *metrics.Metrics) {
|
||||
func (api *API) RegisterAlertmanagerApiEndpoints(srv AlertmanagerApiService, m *metrics.API) {
|
||||
api.RouteRegister.Group("", func(group routing.RouteRegister) {
|
||||
group.Post(
|
||||
toMacaronPath("/api/alertmanager/{Recipient}/api/v2/silences"),
|
||||
|
||||
@@ -26,7 +26,7 @@ type ConfigurationApiService interface {
|
||||
RoutePostNGalertConfig(*models.ReqContext, apimodels.PostableNGalertConfig) response.Response
|
||||
}
|
||||
|
||||
func (api *API) RegisterConfigurationApiEndpoints(srv ConfigurationApiService, m *metrics.Metrics) {
|
||||
func (api *API) RegisterConfigurationApiEndpoints(srv ConfigurationApiService, m *metrics.API) {
|
||||
api.RouteRegister.Group("", func(group routing.RouteRegister) {
|
||||
group.Delete(
|
||||
toMacaronPath("/api/v1/ngalert/admin_config"),
|
||||
|
||||
@@ -21,7 +21,7 @@ type PrometheusApiService interface {
|
||||
RouteGetRuleStatuses(*models.ReqContext) response.Response
|
||||
}
|
||||
|
||||
func (api *API) RegisterPrometheusApiEndpoints(srv PrometheusApiService, m *metrics.Metrics) {
|
||||
func (api *API) RegisterPrometheusApiEndpoints(srv PrometheusApiService, m *metrics.API) {
|
||||
api.RouteRegister.Group("", func(group routing.RouteRegister) {
|
||||
group.Get(
|
||||
toMacaronPath("/api/prometheus/{Recipient}/api/v1/alerts"),
|
||||
|
||||
@@ -28,7 +28,7 @@ type RulerApiService interface {
|
||||
RoutePostNameRulesConfig(*models.ReqContext, apimodels.PostableRuleGroupConfig) response.Response
|
||||
}
|
||||
|
||||
func (api *API) RegisterRulerApiEndpoints(srv RulerApiService, m *metrics.Metrics) {
|
||||
func (api *API) RegisterRulerApiEndpoints(srv RulerApiService, m *metrics.API) {
|
||||
api.RouteRegister.Group("", func(group routing.RouteRegister) {
|
||||
group.Delete(
|
||||
toMacaronPath("/api/ruler/{Recipient}/api/v1/rules/{Namespace}"),
|
||||
|
||||
@@ -24,7 +24,7 @@ type TestingApiService interface {
|
||||
RouteTestRuleConfig(*models.ReqContext, apimodels.TestRulePayload) response.Response
|
||||
}
|
||||
|
||||
func (api *API) RegisterTestingApiEndpoints(srv TestingApiService, m *metrics.Metrics) {
|
||||
func (api *API) RegisterTestingApiEndpoints(srv TestingApiService, m *metrics.API) {
|
||||
api.RouteRegister.Group("", func(group routing.RouteRegister) {
|
||||
group.Post(
|
||||
toMacaronPath("/api/v1/eval"),
|
||||
|
||||
@@ -17,7 +17,7 @@ type {{classname}}Service interface { {{#operation}}
|
||||
{{nickname}}(*models.ReqContext{{#bodyParams}}, apimodels.{{dataType}}{{/bodyParams}}) response.Response{{/operation}}
|
||||
}
|
||||
|
||||
func (api *API) Register{{classname}}Endpoints(srv {{classname}}Service, m *metrics.Metrics) {
|
||||
func (api *API) Register{{classname}}Endpoints(srv {{classname}}Service, m *metrics.API) {
|
||||
api.RouteRegister.Group("", func(group routing.RouteRegister){ {{#operations}}{{#operation}}
|
||||
group.{{httpMethod}}(
|
||||
toMacaronPath("{{{path}}}"){{#bodyParams}},
|
||||
|
||||
@@ -7,12 +7,12 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/alertmanager/api/metrics"
|
||||
|
||||
"github.com/grafana/grafana/pkg/api/response"
|
||||
"github.com/grafana/grafana/pkg/api/routing"
|
||||
"github.com/grafana/grafana/pkg/models"
|
||||
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
||||
|
||||
"github.com/prometheus/alertmanager/api/metrics"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
"gopkg.in/macaron.v1"
|
||||
@@ -21,63 +21,110 @@ import (
|
||||
const (
|
||||
GrafanaBackend = "grafana"
|
||||
ProxyBackend = "proxy"
|
||||
Namespace = "grafana"
|
||||
Subsystem = "alerting"
|
||||
)
|
||||
|
||||
// ProvideService is a Metrics factory.
|
||||
func ProvideService() *Metrics {
|
||||
return NewMetrics(prometheus.DefaultRegisterer)
|
||||
func ProvideService() *NGAlert {
|
||||
return NewNGAlert(prometheus.DefaultRegisterer)
|
||||
}
|
||||
|
||||
// ProvideServiceForTest is a Metrics factory used for test.
|
||||
func ProvideServiceForTest() *Metrics {
|
||||
return NewMetrics(prometheus.NewRegistry())
|
||||
func ProvideServiceForTest() *NGAlert {
|
||||
return NewNGAlert(prometheus.NewRegistry())
|
||||
}
|
||||
|
||||
type Metrics struct {
|
||||
*metrics.Alerts
|
||||
type NGAlert struct {
|
||||
// Registerer is for use by subcomponents which register their own metrics.
|
||||
Registerer prometheus.Registerer
|
||||
AlertState *prometheus.GaugeVec
|
||||
RequestDuration *prometheus.HistogramVec
|
||||
ActiveConfigurations prometheus.Gauge
|
||||
schedulerMetrics *Scheduler
|
||||
stateMetrics *State
|
||||
multiOrgAlertmanagerMetrics *MultiOrgAlertmanager
|
||||
apiMetrics *API
|
||||
}
|
||||
|
||||
type Scheduler struct {
|
||||
Registerer prometheus.Registerer
|
||||
EvalTotal *prometheus.CounterVec
|
||||
EvalFailures *prometheus.CounterVec
|
||||
EvalDuration *prometheus.SummaryVec
|
||||
GroupRules *prometheus.GaugeVec
|
||||
}
|
||||
|
||||
func NewMetrics(r prometheus.Registerer) *Metrics {
|
||||
return &Metrics{
|
||||
type MultiOrgAlertmanager struct {
|
||||
ActiveConfigurations prometheus.Gauge
|
||||
DiscoveredConfigurations prometheus.Gauge
|
||||
registries *OrgRegistries
|
||||
}
|
||||
|
||||
type API struct {
|
||||
RequestDuration *prometheus.HistogramVec
|
||||
}
|
||||
|
||||
type Alertmanager struct {
|
||||
Registerer prometheus.Registerer
|
||||
*metrics.Alerts
|
||||
}
|
||||
|
||||
type State struct {
|
||||
GroupRules *prometheus.GaugeVec
|
||||
AlertState *prometheus.GaugeVec
|
||||
}
|
||||
|
||||
func (ng *NGAlert) GetSchedulerMetrics() *Scheduler {
|
||||
return ng.schedulerMetrics
|
||||
}
|
||||
|
||||
func (ng *NGAlert) GetStateMetrics() *State {
|
||||
return ng.stateMetrics
|
||||
}
|
||||
|
||||
func (ng *NGAlert) GetAPIMetrics() *API {
|
||||
return ng.apiMetrics
|
||||
}
|
||||
|
||||
func (ng *NGAlert) GetMultiOrgAlertmanagerMetrics() *MultiOrgAlertmanager {
|
||||
return ng.multiOrgAlertmanagerMetrics
|
||||
}
|
||||
|
||||
// NewNGAlert manages the metrics of all the alerting components.
|
||||
func NewNGAlert(r prometheus.Registerer) *NGAlert {
|
||||
return &NGAlert{
|
||||
Registerer: r,
|
||||
schedulerMetrics: newSchedulerMetrics(r),
|
||||
stateMetrics: newStateMetrics(r),
|
||||
multiOrgAlertmanagerMetrics: newMultiOrgAlertmanagerMetrics(r),
|
||||
apiMetrics: newAPIMetrics(r),
|
||||
}
|
||||
}
|
||||
|
||||
// NewAlertmanagerMetrics creates a set of metrics for the Alertmanager of each organization.
|
||||
func NewAlertmanagerMetrics(r prometheus.Registerer) *Alertmanager {
|
||||
return &Alertmanager{
|
||||
Registerer: r,
|
||||
Alerts: metrics.NewAlerts("grafana", prometheus.WrapRegistererWithPrefix(fmt.Sprintf("%s_%s_", Namespace, Subsystem), r)),
|
||||
}
|
||||
}
|
||||
|
||||
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
|
||||
func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) {
|
||||
moa.registries.RemoveOrgRegistry(id)
|
||||
}
|
||||
|
||||
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
|
||||
func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer {
|
||||
return moa.registries.GetOrCreateOrgRegistry(id)
|
||||
}
|
||||
|
||||
func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
|
||||
return &Scheduler{
|
||||
Registerer: r,
|
||||
Alerts: metrics.NewAlerts("v2", r),
|
||||
AlertState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: "grafana",
|
||||
Subsystem: "alerting",
|
||||
Name: "alerts",
|
||||
Help: "How many alerts by state.",
|
||||
}, []string{"state"}),
|
||||
RequestDuration: promauto.With(r).NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: "grafana",
|
||||
Subsystem: "alerting",
|
||||
Name: "request_duration_seconds",
|
||||
Help: "Histogram of requests to the Alerting API",
|
||||
Buckets: prometheus.DefBuckets,
|
||||
},
|
||||
[]string{"method", "route", "status_code", "backend"},
|
||||
),
|
||||
ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
||||
Namespace: "grafana",
|
||||
Subsystem: "alerting",
|
||||
Name: "active_configurations",
|
||||
Help: "The number of active, non default alertmanager configurations for grafana managed alerts",
|
||||
}),
|
||||
// TODO: once rule groups support multiple rules, consider partitioning
|
||||
// on rule group as well as tenant, similar to loki|cortex.
|
||||
EvalTotal: promauto.With(r).NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "grafana",
|
||||
Subsystem: "alerting",
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "rule_evaluations_total",
|
||||
Help: "The total number of rule evaluations.",
|
||||
},
|
||||
@@ -87,8 +134,8 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
|
||||
// on rule group as well as tenant, similar to loki|cortex.
|
||||
EvalFailures: promauto.With(r).NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "grafana",
|
||||
Subsystem: "alerting",
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "rule_evaluation_failures_total",
|
||||
Help: "The total number of rule evaluation failures.",
|
||||
},
|
||||
@@ -96,29 +143,73 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
|
||||
),
|
||||
EvalDuration: promauto.With(r).NewSummaryVec(
|
||||
prometheus.SummaryOpts{
|
||||
Namespace: "grafana",
|
||||
Subsystem: "alerting",
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "rule_evaluation_duration_seconds",
|
||||
Help: "The duration for a rule to execute.",
|
||||
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
|
||||
},
|
||||
[]string{"user"},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
func newStateMetrics(r prometheus.Registerer) *State {
|
||||
return &State{
|
||||
// TODO: once rule groups support multiple rules, consider partitioning
|
||||
// on rule group as well as tenant, similar to loki|cortex.
|
||||
GroupRules: promauto.With(r).NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: "grafana",
|
||||
Subsystem: "alerting",
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "rule_group_rules",
|
||||
Help: "The number of rules.",
|
||||
},
|
||||
[]string{"user"},
|
||||
),
|
||||
AlertState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "alerts",
|
||||
Help: "How many alerts by state.",
|
||||
}, []string{"state"}),
|
||||
}
|
||||
}
|
||||
|
||||
// multi-thread safety and stable ordering of prometheus registries.
|
||||
func newMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager {
|
||||
return &MultiOrgAlertmanager{
|
||||
registries: NewOrgRegistries(),
|
||||
DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "discovered_configurations",
|
||||
Help: "The number of organizations we've discovered that require an Alertmanager configuration.",
|
||||
}),
|
||||
ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "active_configurations",
|
||||
Help: "The number of active Alertmanager configurations.",
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
func newAPIMetrics(r prometheus.Registerer) *API {
|
||||
return &API{
|
||||
RequestDuration: promauto.With(r).NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "request_duration_seconds",
|
||||
Help: "Histogram of requests to the Alerting API",
|
||||
Buckets: prometheus.DefBuckets,
|
||||
},
|
||||
[]string{"method", "route", "status_code", "backend"},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
// OrgRegistries represents a map of registries per org.
|
||||
type OrgRegistries struct {
|
||||
regsMu sync.Mutex
|
||||
regs map[int64]prometheus.Registerer
|
||||
@@ -130,6 +221,7 @@ func NewOrgRegistries() *OrgRegistries {
|
||||
}
|
||||
}
|
||||
|
||||
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
|
||||
func (m *OrgRegistries) GetOrCreateOrgRegistry(orgID int64) prometheus.Registerer {
|
||||
m.regsMu.Lock()
|
||||
defer m.regsMu.Unlock()
|
||||
@@ -143,6 +235,7 @@ func (m *OrgRegistries) GetOrCreateOrgRegistry(orgID int64) prometheus.Registere
|
||||
return orgRegistry
|
||||
}
|
||||
|
||||
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
|
||||
func (m *OrgRegistries) RemoveOrgRegistry(org int64) {
|
||||
m.regsMu.Lock()
|
||||
defer m.regsMu.Unlock()
|
||||
@@ -154,7 +247,7 @@ func Instrument(
|
||||
method,
|
||||
path string,
|
||||
action interface{},
|
||||
metrics *Metrics,
|
||||
metrics *API,
|
||||
) macaron.Handler {
|
||||
normalizedPath := MakeLabelValue(path)
|
||||
|
||||
@@ -39,7 +39,7 @@ const (
|
||||
|
||||
func ProvideService(cfg *setting.Cfg, dataSourceCache datasources.CacheService, routeRegister routing.RouteRegister,
|
||||
sqlStore *sqlstore.SQLStore, kvStore kvstore.KVStore, dataService *tsdb.Service, dataProxy *datasourceproxy.DataSourceProxyService,
|
||||
quotaService *quota.QuotaService, m *metrics.Metrics) (*AlertNG, error) {
|
||||
quotaService *quota.QuotaService, m *metrics.NGAlert) (*AlertNG, error) {
|
||||
ng := &AlertNG{
|
||||
Cfg: cfg,
|
||||
DataSourceCache: dataSourceCache,
|
||||
@@ -74,7 +74,7 @@ type AlertNG struct {
|
||||
DataService *tsdb.Service
|
||||
DataProxy *datasourceproxy.DataSourceProxyService
|
||||
QuotaService *quota.QuotaService
|
||||
Metrics *metrics.Metrics
|
||||
Metrics *metrics.NGAlert
|
||||
Log log.Logger
|
||||
schedule schedule.ScheduleService
|
||||
stateManager *state.Manager
|
||||
@@ -97,7 +97,7 @@ func (ng *AlertNG) init() error {
|
||||
Logger: ng.Log,
|
||||
}
|
||||
|
||||
ng.MultiOrgAlertmanager = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store, ng.KVStore)
|
||||
ng.MultiOrgAlertmanager = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store, ng.KVStore, ng.Metrics.GetMultiOrgAlertmanagerMetrics())
|
||||
|
||||
// Let's make sure we're able to complete an initial sync of Alertmanagers before we start the alerting components.
|
||||
if err := ng.MultiOrgAlertmanager.LoadAndSyncAlertmanagersForOrgs(context.Background()); err != nil {
|
||||
@@ -115,10 +115,10 @@ func (ng *AlertNG) init() error {
|
||||
AdminConfigStore: store,
|
||||
OrgStore: store,
|
||||
MultiOrgNotifier: ng.MultiOrgAlertmanager,
|
||||
Metrics: ng.Metrics,
|
||||
Metrics: ng.Metrics.GetSchedulerMetrics(),
|
||||
AdminConfigPollInterval: ng.Cfg.AdminConfigPollInterval,
|
||||
}
|
||||
stateManager := state.NewManager(ng.Log, ng.Metrics, store, store)
|
||||
stateManager := state.NewManager(ng.Log, ng.Metrics.GetStateMetrics(), store, store)
|
||||
schedule := schedule.NewScheduler(schedCfg, ng.DataService, ng.Cfg.AppURL, stateManager)
|
||||
|
||||
ng.stateManager = stateManager
|
||||
@@ -139,7 +139,7 @@ func (ng *AlertNG) init() error {
|
||||
MultiOrgAlertmanager: ng.MultiOrgAlertmanager,
|
||||
StateManager: ng.stateManager,
|
||||
}
|
||||
api.RegisterAPIEndpoints(ng.Metrics)
|
||||
api.RegisterAPIEndpoints(ng.Metrics.GetAPIMetrics())
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -84,7 +84,7 @@ type Alertmanager struct {
|
||||
Settings *setting.Cfg
|
||||
Store store.AlertingStore
|
||||
fileStore *FileStore
|
||||
Metrics *metrics.Metrics
|
||||
Metrics *metrics.Alertmanager
|
||||
|
||||
notificationLog *nflog.Log
|
||||
marker types.Marker
|
||||
@@ -111,7 +111,7 @@ type Alertmanager struct {
|
||||
orgID int64
|
||||
}
|
||||
|
||||
func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, m *metrics.Metrics) (*Alertmanager, error) {
|
||||
func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, m *metrics.Alertmanager) (*Alertmanager, error) {
|
||||
am := &Alertmanager{
|
||||
Settings: cfg,
|
||||
stopc: make(chan struct{}),
|
||||
@@ -232,7 +232,6 @@ func (am *Alertmanager) SaveAndApplyDefaultConfig() error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
am.Metrics.ActiveConfigurations.Set(1)
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -263,7 +262,6 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
am.Metrics.ActiveConfigurations.Set(1)
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -306,12 +304,6 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error {
|
||||
return fmt.Errorf("unable to reload configuration: %w", err)
|
||||
}
|
||||
|
||||
if q.Result.Default {
|
||||
am.Metrics.ActiveConfigurations.Set(0)
|
||||
} else {
|
||||
am.Metrics.ActiveConfigurations.Set(1)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -38,9 +38,9 @@ func setupAMTest(t *testing.T) *Alertmanager {
|
||||
DataPath: dir,
|
||||
}
|
||||
|
||||
m := metrics.NewMetrics(prometheus.NewRegistry())
|
||||
m := metrics.NewAlertmanagerMetrics(prometheus.NewRegistry())
|
||||
sqlStore := sqlstore.InitTestDB(t)
|
||||
store := &store.DBstore{
|
||||
s := &store.DBstore{
|
||||
BaseInterval: 10 * time.Second,
|
||||
DefaultIntervalSeconds: 60,
|
||||
SQLStore: sqlStore,
|
||||
@@ -48,7 +48,7 @@ func setupAMTest(t *testing.T) *Alertmanager {
|
||||
}
|
||||
|
||||
kvStore := newFakeKVStore(t)
|
||||
am, err := newAlertmanager(1, cfg, store, kvStore, m)
|
||||
am, err := newAlertmanager(1, cfg, s, kvStore, m)
|
||||
require.NoError(t, err)
|
||||
return am
|
||||
}
|
||||
|
||||
@@ -30,10 +30,10 @@ type MultiOrgAlertmanager struct {
|
||||
orgStore store.OrgStore
|
||||
kvStore kvstore.KVStore
|
||||
|
||||
orgRegistry *metrics.OrgRegistries
|
||||
metrics *metrics.MultiOrgAlertmanager
|
||||
}
|
||||
|
||||
func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore, kvStore kvstore.KVStore) *MultiOrgAlertmanager {
|
||||
func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore, kvStore kvstore.KVStore, m *metrics.MultiOrgAlertmanager) *MultiOrgAlertmanager {
|
||||
return &MultiOrgAlertmanager{
|
||||
settings: cfg,
|
||||
logger: log.New("multiorg.alertmanager"),
|
||||
@@ -41,7 +41,7 @@ func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore,
|
||||
configStore: configStore,
|
||||
orgStore: orgStore,
|
||||
kvStore: kvStore,
|
||||
orgRegistry: metrics.NewOrgRegistries(),
|
||||
metrics: m,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,6 +70,7 @@ func (moa *MultiOrgAlertmanager) LoadAndSyncAlertmanagersForOrgs(ctx context.Con
|
||||
}
|
||||
|
||||
// Then, sync them by creating or deleting Alertmanagers as necessary.
|
||||
moa.metrics.DiscoveredConfigurations.Set(float64(len(orgIDs)))
|
||||
moa.SyncAlertmanagersForOrgs(orgIDs)
|
||||
|
||||
moa.logger.Debug("done synchronizing Alertmanagers for orgs")
|
||||
@@ -85,8 +86,11 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(orgIDs []int64) {
|
||||
|
||||
existing, found := moa.alertmanagers[orgID]
|
||||
if !found {
|
||||
reg := moa.orgRegistry.GetOrCreateOrgRegistry(orgID)
|
||||
am, err := newAlertmanager(orgID, moa.settings, moa.configStore, moa.kvStore, metrics.NewMetrics(reg))
|
||||
// These metrics are not exported by Grafana and are mostly a placeholder.
|
||||
// To export them, we need to translate the metrics from each individual registry and,
|
||||
// then aggregate them on the main registry.
|
||||
m := metrics.NewAlertmanagerMetrics(moa.metrics.GetOrCreateOrgRegistry(orgID))
|
||||
am, err := newAlertmanager(orgID, moa.settings, moa.configStore, moa.kvStore, m)
|
||||
if err != nil {
|
||||
moa.logger.Error("unable to create Alertmanager for org", "org", orgID, "err", err)
|
||||
}
|
||||
@@ -105,9 +109,10 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(orgIDs []int64) {
|
||||
if _, exists := orgsFound[orgId]; !exists {
|
||||
amsToStop[orgId] = am
|
||||
delete(moa.alertmanagers, orgId)
|
||||
moa.orgRegistry.RemoveOrgRegistry(orgId)
|
||||
moa.metrics.RemoveOrgRegistry(orgId)
|
||||
}
|
||||
}
|
||||
moa.metrics.ActiveConfigurations.Set(float64(len(moa.alertmanagers)))
|
||||
moa.alertmanagersMtx.Unlock()
|
||||
|
||||
// Now, we can stop the Alertmanagers without having to hold a lock.
|
||||
|
||||
@@ -1,13 +1,17 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
@@ -21,25 +25,51 @@ func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {
|
||||
}
|
||||
SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
|
||||
kvStore := newFakeKVStore(t)
|
||||
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore, kvStore)
|
||||
reg := prometheus.NewPedanticRegistry()
|
||||
m := metrics.NewNGAlert(reg)
|
||||
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics())
|
||||
ctx := context.Background()
|
||||
|
||||
// Ensure that one Alertmanager is created per org.
|
||||
{
|
||||
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
||||
require.Len(t, mam.alertmanagers, 3)
|
||||
require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
|
||||
# HELP grafana_alerting_active_configurations The number of active Alertmanager configurations.
|
||||
# TYPE grafana_alerting_active_configurations gauge
|
||||
grafana_alerting_active_configurations 3
|
||||
# HELP grafana_alerting_discovered_configurations The number of organizations we've discovered that require an Alertmanager configuration.
|
||||
# TYPE grafana_alerting_discovered_configurations gauge
|
||||
grafana_alerting_discovered_configurations 3
|
||||
`), "grafana_alerting_discovered_configurations", "grafana_alerting_active_configurations"))
|
||||
}
|
||||
// When an org is removed, it should detect it.
|
||||
{
|
||||
orgStore.orgs = []int64{1, 3}
|
||||
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
||||
require.Len(t, mam.alertmanagers, 2)
|
||||
require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
|
||||
# HELP grafana_alerting_active_configurations The number of active Alertmanager configurations.
|
||||
# TYPE grafana_alerting_active_configurations gauge
|
||||
grafana_alerting_active_configurations 2
|
||||
# HELP grafana_alerting_discovered_configurations The number of organizations we've discovered that require an Alertmanager configuration.
|
||||
# TYPE grafana_alerting_discovered_configurations gauge
|
||||
grafana_alerting_discovered_configurations 2
|
||||
`), "grafana_alerting_discovered_configurations", "grafana_alerting_active_configurations"))
|
||||
}
|
||||
// if the org comes back, it should detect it.
|
||||
{
|
||||
orgStore.orgs = []int64{1, 2, 3, 4}
|
||||
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
||||
require.Len(t, mam.alertmanagers, 4)
|
||||
require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
|
||||
# HELP grafana_alerting_active_configurations The number of active Alertmanager configurations.
|
||||
# TYPE grafana_alerting_active_configurations gauge
|
||||
grafana_alerting_active_configurations 4
|
||||
# HELP grafana_alerting_discovered_configurations The number of organizations we've discovered that require an Alertmanager configuration.
|
||||
# TYPE grafana_alerting_discovered_configurations gauge
|
||||
grafana_alerting_discovered_configurations 4
|
||||
`), "grafana_alerting_discovered_configurations", "grafana_alerting_active_configurations"))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -54,7 +84,9 @@ func TestMultiOrgAlertmanager_AlertmanagerFor(t *testing.T) {
|
||||
|
||||
SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
|
||||
kvStore := newFakeKVStore(t)
|
||||
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore, kvStore)
|
||||
reg := prometheus.NewPedanticRegistry()
|
||||
m := metrics.NewNGAlert(reg)
|
||||
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics())
|
||||
ctx := context.Background()
|
||||
|
||||
// Ensure that one Alertmanagers is created per org.
|
||||
|
||||
@@ -77,7 +77,7 @@ type schedule struct {
|
||||
appURL string
|
||||
|
||||
multiOrgNotifier *notifier.MultiOrgAlertmanager
|
||||
metrics *metrics.Metrics
|
||||
metrics *metrics.Scheduler
|
||||
|
||||
// Senders help us send alerts to external Alertmanagers.
|
||||
sendersMtx sync.RWMutex
|
||||
@@ -100,7 +100,7 @@ type SchedulerCfg struct {
|
||||
InstanceStore store.InstanceStore
|
||||
AdminConfigStore store.AdminConfigurationStore
|
||||
MultiOrgNotifier *notifier.MultiOrgAlertmanager
|
||||
Metrics *metrics.Metrics
|
||||
Metrics *metrics.Scheduler
|
||||
AdminConfigPollInterval time.Duration
|
||||
}
|
||||
|
||||
|
||||
@@ -8,9 +8,6 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/benbjohnson/clock"
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/google/go-cmp/cmp/cmpopts"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data"
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
@@ -19,12 +16,16 @@ import (
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/schedule"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/state"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/tests"
|
||||
|
||||
"github.com/benbjohnson/clock"
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/google/go-cmp/cmp/cmpopts"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
var nilMetrics = metrics.NewMetrics(nil)
|
||||
var testMetrics = metrics.NewNGAlert(prometheus.NewPedanticRegistry())
|
||||
|
||||
type evalAppliedInfo struct {
|
||||
alertDefKey models.AlertRuleKey
|
||||
@@ -98,10 +99,10 @@ func TestWarmStateCache(t *testing.T) {
|
||||
|
||||
RuleStore: dbstore,
|
||||
InstanceStore: dbstore,
|
||||
Metrics: metrics.NewMetrics(prometheus.NewRegistry()),
|
||||
Metrics: testMetrics.GetSchedulerMetrics(),
|
||||
AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
|
||||
}
|
||||
st := state.NewManager(schedCfg.Logger, nilMetrics, dbstore, dbstore)
|
||||
st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), dbstore, dbstore)
|
||||
st.Warm()
|
||||
|
||||
t.Run("instance cache has expected entries", func(t *testing.T) {
|
||||
@@ -143,10 +144,10 @@ func TestAlertingTicker(t *testing.T) {
|
||||
RuleStore: dbstore,
|
||||
InstanceStore: dbstore,
|
||||
Logger: log.New("ngalert schedule test"),
|
||||
Metrics: metrics.NewMetrics(prometheus.NewRegistry()),
|
||||
Metrics: testMetrics.GetSchedulerMetrics(),
|
||||
AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
|
||||
}
|
||||
st := state.NewManager(schedCfg.Logger, nilMetrics, dbstore, dbstore)
|
||||
st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), dbstore, dbstore)
|
||||
sched := schedule.NewScheduler(schedCfg, nil, "http://localhost", st)
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
@@ -8,7 +8,6 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/benbjohnson/clock"
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
@@ -18,6 +17,8 @@ import (
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/state"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/store"
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
|
||||
"github.com/benbjohnson/clock"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/stretchr/testify/require"
|
||||
@@ -229,7 +230,7 @@ func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, ac
|
||||
|
||||
mockedClock := clock.NewMock()
|
||||
logger := log.New("ngalert schedule test")
|
||||
nilMetrics := metrics.NewMetrics(nil)
|
||||
m := metrics.NewNGAlert(prometheus.NewPedanticRegistry())
|
||||
schedCfg := SchedulerCfg{
|
||||
C: mockedClock,
|
||||
BaseInterval: time.Second,
|
||||
@@ -238,12 +239,12 @@ func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, ac
|
||||
RuleStore: rs,
|
||||
InstanceStore: is,
|
||||
AdminConfigStore: acs,
|
||||
MultiOrgNotifier: notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, ¬ifier.FakeConfigStore{}, ¬ifier.FakeOrgStore{}, ¬ifier.FakeKVStore{}),
|
||||
MultiOrgNotifier: notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, ¬ifier.FakeConfigStore{}, ¬ifier.FakeOrgStore{}, ¬ifier.FakeKVStore{}, nil),
|
||||
Logger: logger,
|
||||
Metrics: metrics.NewMetrics(prometheus.NewRegistry()),
|
||||
Metrics: m.GetSchedulerMetrics(),
|
||||
AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
|
||||
}
|
||||
st := state.NewManager(schedCfg.Logger, nilMetrics, rs, is)
|
||||
st := state.NewManager(schedCfg.Logger, m.GetStateMetrics(), rs, is)
|
||||
return NewScheduler(schedCfg, nil, "http://localhost", st), mockedClock
|
||||
}
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ type Sender struct {
|
||||
sdManager *discovery.Manager
|
||||
}
|
||||
|
||||
func New(metrics *metrics.Metrics) (*Sender, error) {
|
||||
func New(_ *metrics.Scheduler) (*Sender, error) {
|
||||
l := log.New("sender")
|
||||
sdCtx, sdCancel := context.WithCancel(context.Background())
|
||||
s := &Sender{
|
||||
@@ -51,6 +51,8 @@ func New(metrics *metrics.Metrics) (*Sender, error) {
|
||||
}
|
||||
|
||||
s.manager = notifier.NewManager(
|
||||
// Injecting a new registry here means these metrics are not exported.
|
||||
// Once we fix the individual Alertmanager metrics we should fix this scenario too.
|
||||
¬ifier.Options{QueueCapacity: defaultMaxQueueCapacity, Registerer: prometheus.NewRegistry()},
|
||||
s.gokitLogger,
|
||||
)
|
||||
|
||||
@@ -22,10 +22,10 @@ type cache struct {
|
||||
states map[int64]map[string]map[string]*State // orgID > alertRuleUID > stateID > state
|
||||
mtxStates sync.RWMutex
|
||||
log log.Logger
|
||||
metrics *metrics.Metrics
|
||||
metrics *metrics.State
|
||||
}
|
||||
|
||||
func newCache(logger log.Logger, metrics *metrics.Metrics) *cache {
|
||||
func newCache(logger log.Logger, metrics *metrics.State) *cache {
|
||||
return &cache{
|
||||
states: make(map[int64]map[string]map[string]*State),
|
||||
log: logger,
|
||||
|
||||
@@ -21,7 +21,7 @@ var ResendDelay = 30 * time.Second
|
||||
|
||||
type Manager struct {
|
||||
log log.Logger
|
||||
metrics *metrics.Metrics
|
||||
metrics *metrics.State
|
||||
|
||||
cache *cache
|
||||
quit chan struct{}
|
||||
@@ -31,7 +31,7 @@ type Manager struct {
|
||||
instanceStore store.InstanceStore
|
||||
}
|
||||
|
||||
func NewManager(logger log.Logger, metrics *metrics.Metrics, ruleStore store.RuleStore, instanceStore store.InstanceStore) *Manager {
|
||||
func NewManager(logger log.Logger, metrics *metrics.State, ruleStore store.RuleStore, instanceStore store.InstanceStore) *Manager {
|
||||
manager := &Manager{
|
||||
cache: newCache(logger, metrics),
|
||||
quit: make(chan struct{}),
|
||||
|
||||
@@ -4,22 +4,20 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data"
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/state"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/tests"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/state"
|
||||
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
var nilMetrics = metrics.NewMetrics(nil)
|
||||
var testMetrics = metrics.NewNGAlert(prometheus.NewPedanticRegistry())
|
||||
|
||||
func TestProcessEvalResults(t *testing.T) {
|
||||
evaluationTime, err := time.Parse("2006-01-02", "2021-03-25")
|
||||
@@ -853,7 +851,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
st := state.NewManager(log.New("test_state_manager"), nilMetrics, nil, nil)
|
||||
st := state.NewManager(log.New("test_state_manager"), testMetrics.GetStateMetrics(), nil, nil)
|
||||
t.Run(tc.desc, func(t *testing.T) {
|
||||
for _, res := range tc.evalResults {
|
||||
_ = st.ProcessEvalResults(tc.alertRule, res)
|
||||
@@ -948,7 +946,7 @@ func TestStaleResultsHandler(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
st := state.NewManager(log.New("test_stale_results_handler"), nilMetrics, dbstore, dbstore)
|
||||
st := state.NewManager(log.New("test_stale_results_handler"), testMetrics.GetStateMetrics(), dbstore, dbstore)
|
||||
st.Warm()
|
||||
existingStatesForRule := st.GetStatesForRuleUID(rule.OrgID, rule.UID)
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@ func SetupTestEnv(t *testing.T, baseInterval time.Duration) (*ngalert.AlertNG, *
|
||||
// its database migrations run and the relative database tables are created
|
||||
cfg.FeatureToggles = map[string]bool{"ngalert": true}
|
||||
|
||||
m := metrics.NewMetrics(prometheus.NewRegistry())
|
||||
m := metrics.NewNGAlert(prometheus.NewRegistry())
|
||||
ng, err := ngalert.ProvideService(cfg, nil, routing.NewRouteRegister(), sqlstore.InitTestDB(t), nil, nil, nil, nil, m)
|
||||
require.NoError(t, err)
|
||||
return ng, &store.DBstore{
|
||||
|
||||
Reference in New Issue
Block a user