Alerting: Refactor & fix unified alerting metrics structure (#39151)

* Alerting: Refactor & fix unified alerting metrics structure

Fixes and refactors the metrics structure we have for the ngalert service. Now, each component has its own metrics struct that includes just the metrics it uses. Additionally, I have fixed the configuration metrics and added new metrics that let us determine whether an instance has discovered and started all the Alertmanager configurations it needs.

This allows us to alert on `grafana_alerting_discovered_configurations - grafana_alerting_active_configurations != 0` to detect when an Alertmanager instance did not start successfully.
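As a quick illustration of that invariant, here is a minimal, standalone Go sketch (not the ngalert wiring itself): it registers two gauges named after the new metrics and checks the same difference the alert expression evaluates.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// Gauges named after the metrics introduced in this commit; this is a
	// standalone illustration, not the actual MultiOrgAlertmanager code.
	discovered := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "discovered_configurations",
		Help:      "The number of organizations we've discovered that require an Alertmanager configuration.",
	})
	active := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "active_configurations",
		Help:      "The number of active Alertmanager configurations.",
	})

	reg := prometheus.NewRegistry()
	reg.MustRegister(discovered, active)

	// Simulate a sync where 3 orgs need an Alertmanager but only 2 started.
	discovered.Set(3)
	active.Set(2)

	// The alert condition from the commit message: a non-zero difference
	// means at least one Alertmanager did not start successfully.
	diff := testutil.ToFloat64(discovered) - testutil.ToFloat64(active)
	if diff != 0 {
		fmt.Printf("ALERT: %v Alertmanager configuration(s) failed to start\n", diff)
	}
}
```

In Grafana itself the same expression would be evaluated as a Prometheus alert rule over the instance's scraped `/metrics` endpoint.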
gotjosh
2021-09-14 12:55:01 +01:00
committed by GitHub
parent 1edd415ddf
commit a2f4344bf2
21 changed files with 243 additions and 119 deletions

View File

@@ -66,7 +66,7 @@ type API struct {
}
// RegisterAPIEndpoints registers API handlers
func (api *API) RegisterAPIEndpoints(m *metrics.Metrics) {
func (api *API) RegisterAPIEndpoints(m *metrics.API) {
logger := log.New("ngalert.api")
proxy := &AlertingProxy{
DataProxy: api.DataProxy,

View File

@@ -34,7 +34,7 @@ type AlertmanagerApiService interface {
RoutePostTestReceivers(*models.ReqContext, apimodels.TestReceiversConfigParams) response.Response
}
func (api *API) RegisterAlertmanagerApiEndpoints(srv AlertmanagerApiService, m *metrics.Metrics) {
func (api *API) RegisterAlertmanagerApiEndpoints(srv AlertmanagerApiService, m *metrics.API) {
api.RouteRegister.Group("", func(group routing.RouteRegister) {
group.Post(
toMacaronPath("/api/alertmanager/{Recipient}/api/v2/silences"),

View File

@@ -26,7 +26,7 @@ type ConfigurationApiService interface {
RoutePostNGalertConfig(*models.ReqContext, apimodels.PostableNGalertConfig) response.Response
}
func (api *API) RegisterConfigurationApiEndpoints(srv ConfigurationApiService, m *metrics.Metrics) {
func (api *API) RegisterConfigurationApiEndpoints(srv ConfigurationApiService, m *metrics.API) {
api.RouteRegister.Group("", func(group routing.RouteRegister) {
group.Delete(
toMacaronPath("/api/v1/ngalert/admin_config"),

View File

@@ -21,7 +21,7 @@ type PrometheusApiService interface {
RouteGetRuleStatuses(*models.ReqContext) response.Response
}
func (api *API) RegisterPrometheusApiEndpoints(srv PrometheusApiService, m *metrics.Metrics) {
func (api *API) RegisterPrometheusApiEndpoints(srv PrometheusApiService, m *metrics.API) {
api.RouteRegister.Group("", func(group routing.RouteRegister) {
group.Get(
toMacaronPath("/api/prometheus/{Recipient}/api/v1/alerts"),

View File

@@ -28,7 +28,7 @@ type RulerApiService interface {
RoutePostNameRulesConfig(*models.ReqContext, apimodels.PostableRuleGroupConfig) response.Response
}
func (api *API) RegisterRulerApiEndpoints(srv RulerApiService, m *metrics.Metrics) {
func (api *API) RegisterRulerApiEndpoints(srv RulerApiService, m *metrics.API) {
api.RouteRegister.Group("", func(group routing.RouteRegister) {
group.Delete(
toMacaronPath("/api/ruler/{Recipient}/api/v1/rules/{Namespace}"),

View File

@@ -24,7 +24,7 @@ type TestingApiService interface {
RouteTestRuleConfig(*models.ReqContext, apimodels.TestRulePayload) response.Response
}
func (api *API) RegisterTestingApiEndpoints(srv TestingApiService, m *metrics.Metrics) {
func (api *API) RegisterTestingApiEndpoints(srv TestingApiService, m *metrics.API) {
api.RouteRegister.Group("", func(group routing.RouteRegister) {
group.Post(
toMacaronPath("/api/v1/eval"),

View File

@@ -17,7 +17,7 @@ type {{classname}}Service interface { {{#operation}}
{{nickname}}(*models.ReqContext{{#bodyParams}}, apimodels.{{dataType}}{{/bodyParams}}) response.Response{{/operation}}
}
func (api *API) Register{{classname}}Endpoints(srv {{classname}}Service, m *metrics.Metrics) {
func (api *API) Register{{classname}}Endpoints(srv {{classname}}Service, m *metrics.API) {
api.RouteRegister.Group("", func(group routing.RouteRegister){ {{#operations}}{{#operation}}
group.{{httpMethod}}(
toMacaronPath("{{{path}}}"){{#bodyParams}},

View File

@@ -7,12 +7,12 @@ import (
"sync"
"time"
"github.com/prometheus/alertmanager/api/metrics"
"github.com/grafana/grafana/pkg/api/response"
"github.com/grafana/grafana/pkg/api/routing"
"github.com/grafana/grafana/pkg/models"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/prometheus/alertmanager/api/metrics"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"gopkg.in/macaron.v1"
@@ -21,63 +21,110 @@ import (
const (
GrafanaBackend = "grafana"
ProxyBackend = "proxy"
Namespace = "grafana"
Subsystem = "alerting"
)
// ProvideService is a Metrics factory.
func ProvideService() *Metrics {
return NewMetrics(prometheus.DefaultRegisterer)
func ProvideService() *NGAlert {
return NewNGAlert(prometheus.DefaultRegisterer)
}
// ProvideServiceForTest is a Metrics factory used for test.
func ProvideServiceForTest() *Metrics {
return NewMetrics(prometheus.NewRegistry())
func ProvideServiceForTest() *NGAlert {
return NewNGAlert(prometheus.NewRegistry())
}
type Metrics struct {
*metrics.Alerts
type NGAlert struct {
// Registerer is for use by subcomponents which register their own metrics.
Registerer prometheus.Registerer
AlertState *prometheus.GaugeVec
RequestDuration *prometheus.HistogramVec
ActiveConfigurations prometheus.Gauge
schedulerMetrics *Scheduler
stateMetrics *State
multiOrgAlertmanagerMetrics *MultiOrgAlertmanager
apiMetrics *API
}
type Scheduler struct {
Registerer prometheus.Registerer
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.SummaryVec
GroupRules *prometheus.GaugeVec
}
func NewMetrics(r prometheus.Registerer) *Metrics {
return &Metrics{
type MultiOrgAlertmanager struct {
ActiveConfigurations prometheus.Gauge
DiscoveredConfigurations prometheus.Gauge
registries *OrgRegistries
}
type API struct {
RequestDuration *prometheus.HistogramVec
}
type Alertmanager struct {
Registerer prometheus.Registerer
*metrics.Alerts
}
type State struct {
GroupRules *prometheus.GaugeVec
AlertState *prometheus.GaugeVec
}
func (ng *NGAlert) GetSchedulerMetrics() *Scheduler {
return ng.schedulerMetrics
}
func (ng *NGAlert) GetStateMetrics() *State {
return ng.stateMetrics
}
func (ng *NGAlert) GetAPIMetrics() *API {
return ng.apiMetrics
}
func (ng *NGAlert) GetMultiOrgAlertmanagerMetrics() *MultiOrgAlertmanager {
return ng.multiOrgAlertmanagerMetrics
}
// NewNGAlert manages the metrics of all the alerting components.
func NewNGAlert(r prometheus.Registerer) *NGAlert {
return &NGAlert{
Registerer: r,
schedulerMetrics: newSchedulerMetrics(r),
stateMetrics: newStateMetrics(r),
multiOrgAlertmanagerMetrics: newMultiOrgAlertmanagerMetrics(r),
apiMetrics: newAPIMetrics(r),
}
}
// NewAlertmanagerMetrics creates a set of metrics for the Alertmanager of each organization.
func NewAlertmanagerMetrics(r prometheus.Registerer) *Alertmanager {
return &Alertmanager{
Registerer: r,
Alerts: metrics.NewAlerts("grafana", prometheus.WrapRegistererWithPrefix(fmt.Sprintf("%s_%s_", Namespace, Subsystem), r)),
}
}
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) {
moa.registries.RemoveOrgRegistry(id)
}
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer {
return moa.registries.GetOrCreateOrgRegistry(id)
}
func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
return &Scheduler{
Registerer: r,
Alerts: metrics.NewAlerts("v2", r),
AlertState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "alerts",
Help: "How many alerts by state.",
}, []string{"state"}),
RequestDuration: promauto.With(r).NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "request_duration_seconds",
Help: "Histogram of requests to the Alerting API",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "route", "status_code", "backend"},
),
ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
Namespace: "grafana",
Subsystem: "alerting",
Name: "active_configurations",
Help: "The number of active, non default alertmanager configurations for grafana managed alerts",
}),
// TODO: once rule groups support multiple rules, consider partitioning
// on rule group as well as tenant, similar to loki|cortex.
EvalTotal: promauto.With(r).NewCounterVec(
prometheus.CounterOpts{
Namespace: "grafana",
Subsystem: "alerting",
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_evaluations_total",
Help: "The total number of rule evaluations.",
},
@@ -87,8 +134,8 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
// on rule group as well as tenant, similar to loki|cortex.
EvalFailures: promauto.With(r).NewCounterVec(
prometheus.CounterOpts{
Namespace: "grafana",
Subsystem: "alerting",
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_evaluation_failures_total",
Help: "The total number of rule evaluation failures.",
},
@@ -96,29 +143,73 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
),
EvalDuration: promauto.With(r).NewSummaryVec(
prometheus.SummaryOpts{
Namespace: "grafana",
Subsystem: "alerting",
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_evaluation_duration_seconds",
Help: "The duration for a rule to execute.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
[]string{"user"},
),
}
}
func newStateMetrics(r prometheus.Registerer) *State {
return &State{
// TODO: once rule groups support multiple rules, consider partitioning
// on rule group as well as tenant, similar to loki|cortex.
GroupRules: promauto.With(r).NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "grafana",
Subsystem: "alerting",
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_group_rules",
Help: "The number of rules.",
},
[]string{"user"},
),
AlertState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "alerts",
Help: "How many alerts by state.",
}, []string{"state"}),
}
}
// multi-thread safety and stable ordering of prometheus registries.
func newMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager {
return &MultiOrgAlertmanager{
registries: NewOrgRegistries(),
DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "discovered_configurations",
Help: "The number of organizations we've discovered that require an Alertmanager configuration.",
}),
ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "active_configurations",
Help: "The number of active Alertmanager configurations.",
}),
}
}
func newAPIMetrics(r prometheus.Registerer) *API {
return &API{
RequestDuration: promauto.With(r).NewHistogramVec(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "request_duration_seconds",
Help: "Histogram of requests to the Alerting API",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "route", "status_code", "backend"},
),
}
}
// OrgRegistries represents a map of registries per org.
type OrgRegistries struct {
regsMu sync.Mutex
regs map[int64]prometheus.Registerer
@@ -130,6 +221,7 @@ func NewOrgRegistries() *OrgRegistries {
}
}
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
func (m *OrgRegistries) GetOrCreateOrgRegistry(orgID int64) prometheus.Registerer {
m.regsMu.Lock()
defer m.regsMu.Unlock()
@@ -143,6 +235,7 @@ func (m *OrgRegistries) GetOrCreateOrgRegistry(orgID int64) prometheus.Registere
return orgRegistry
}
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
func (m *OrgRegistries) RemoveOrgRegistry(org int64) {
m.regsMu.Lock()
defer m.regsMu.Unlock()
@@ -154,7 +247,7 @@ func Instrument(
method,
path string,
action interface{},
metrics *Metrics,
metrics *API,
) macaron.Handler {
normalizedPath := MakeLabelValue(path)

View File

@@ -39,7 +39,7 @@ const (
func ProvideService(cfg *setting.Cfg, dataSourceCache datasources.CacheService, routeRegister routing.RouteRegister,
sqlStore *sqlstore.SQLStore, kvStore kvstore.KVStore, dataService *tsdb.Service, dataProxy *datasourceproxy.DataSourceProxyService,
quotaService *quota.QuotaService, m *metrics.Metrics) (*AlertNG, error) {
quotaService *quota.QuotaService, m *metrics.NGAlert) (*AlertNG, error) {
ng := &AlertNG{
Cfg: cfg,
DataSourceCache: dataSourceCache,
@@ -74,7 +74,7 @@ type AlertNG struct {
DataService *tsdb.Service
DataProxy *datasourceproxy.DataSourceProxyService
QuotaService *quota.QuotaService
Metrics *metrics.Metrics
Metrics *metrics.NGAlert
Log log.Logger
schedule schedule.ScheduleService
stateManager *state.Manager
@@ -97,7 +97,7 @@ func (ng *AlertNG) init() error {
Logger: ng.Log,
}
ng.MultiOrgAlertmanager = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store, ng.KVStore)
ng.MultiOrgAlertmanager = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store, ng.KVStore, ng.Metrics.GetMultiOrgAlertmanagerMetrics())
// Let's make sure we're able to complete an initial sync of Alertmanagers before we start the alerting components.
if err := ng.MultiOrgAlertmanager.LoadAndSyncAlertmanagersForOrgs(context.Background()); err != nil {
@@ -115,10 +115,10 @@ func (ng *AlertNG) init() error {
AdminConfigStore: store,
OrgStore: store,
MultiOrgNotifier: ng.MultiOrgAlertmanager,
Metrics: ng.Metrics,
Metrics: ng.Metrics.GetSchedulerMetrics(),
AdminConfigPollInterval: ng.Cfg.AdminConfigPollInterval,
}
stateManager := state.NewManager(ng.Log, ng.Metrics, store, store)
stateManager := state.NewManager(ng.Log, ng.Metrics.GetStateMetrics(), store, store)
schedule := schedule.NewScheduler(schedCfg, ng.DataService, ng.Cfg.AppURL, stateManager)
ng.stateManager = stateManager
@@ -139,7 +139,7 @@ func (ng *AlertNG) init() error {
MultiOrgAlertmanager: ng.MultiOrgAlertmanager,
StateManager: ng.stateManager,
}
api.RegisterAPIEndpoints(ng.Metrics)
api.RegisterAPIEndpoints(ng.Metrics.GetAPIMetrics())
return nil
}

View File

@@ -84,7 +84,7 @@ type Alertmanager struct {
Settings *setting.Cfg
Store store.AlertingStore
fileStore *FileStore
Metrics *metrics.Metrics
Metrics *metrics.Alertmanager
notificationLog *nflog.Log
marker types.Marker
@@ -111,7 +111,7 @@ type Alertmanager struct {
orgID int64
}
func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, m *metrics.Metrics) (*Alertmanager, error) {
func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, m *metrics.Alertmanager) (*Alertmanager, error) {
am := &Alertmanager{
Settings: cfg,
stopc: make(chan struct{}),
@@ -232,7 +232,6 @@ func (am *Alertmanager) SaveAndApplyDefaultConfig() error {
if err != nil {
return err
}
am.Metrics.ActiveConfigurations.Set(1)
return nil
}
@@ -263,7 +262,6 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er
if err != nil {
return err
}
am.Metrics.ActiveConfigurations.Set(1)
return nil
}
@@ -306,12 +304,6 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error {
return fmt.Errorf("unable to reload configuration: %w", err)
}
if q.Result.Default {
am.Metrics.ActiveConfigurations.Set(0)
} else {
am.Metrics.ActiveConfigurations.Set(1)
}
return nil
}

View File

@@ -38,9 +38,9 @@ func setupAMTest(t *testing.T) *Alertmanager {
DataPath: dir,
}
m := metrics.NewMetrics(prometheus.NewRegistry())
m := metrics.NewAlertmanagerMetrics(prometheus.NewRegistry())
sqlStore := sqlstore.InitTestDB(t)
store := &store.DBstore{
s := &store.DBstore{
BaseInterval: 10 * time.Second,
DefaultIntervalSeconds: 60,
SQLStore: sqlStore,
@@ -48,7 +48,7 @@ func setupAMTest(t *testing.T) *Alertmanager {
}
kvStore := newFakeKVStore(t)
am, err := newAlertmanager(1, cfg, store, kvStore, m)
am, err := newAlertmanager(1, cfg, s, kvStore, m)
require.NoError(t, err)
return am
}

View File

@@ -30,10 +30,10 @@ type MultiOrgAlertmanager struct {
orgStore store.OrgStore
kvStore kvstore.KVStore
orgRegistry *metrics.OrgRegistries
metrics *metrics.MultiOrgAlertmanager
}
func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore, kvStore kvstore.KVStore) *MultiOrgAlertmanager {
func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore, kvStore kvstore.KVStore, m *metrics.MultiOrgAlertmanager) *MultiOrgAlertmanager {
return &MultiOrgAlertmanager{
settings: cfg,
logger: log.New("multiorg.alertmanager"),
@@ -41,7 +41,7 @@ func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore,
configStore: configStore,
orgStore: orgStore,
kvStore: kvStore,
orgRegistry: metrics.NewOrgRegistries(),
metrics: m,
}
}
@@ -70,6 +70,7 @@ func (moa *MultiOrgAlertmanager) LoadAndSyncAlertmanagersForOrgs(ctx context.Con
}
// Then, sync them by creating or deleting Alertmanagers as necessary.
moa.metrics.DiscoveredConfigurations.Set(float64(len(orgIDs)))
moa.SyncAlertmanagersForOrgs(orgIDs)
moa.logger.Debug("done synchronizing Alertmanagers for orgs")
@@ -85,8 +86,11 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(orgIDs []int64) {
existing, found := moa.alertmanagers[orgID]
if !found {
reg := moa.orgRegistry.GetOrCreateOrgRegistry(orgID)
am, err := newAlertmanager(orgID, moa.settings, moa.configStore, moa.kvStore, metrics.NewMetrics(reg))
// These metrics are not exported by Grafana and are mostly a placeholder.
// To export them, we need to translate the metrics from each individual registry and,
// then aggregate them on the main registry.
m := metrics.NewAlertmanagerMetrics(moa.metrics.GetOrCreateOrgRegistry(orgID))
am, err := newAlertmanager(orgID, moa.settings, moa.configStore, moa.kvStore, m)
if err != nil {
moa.logger.Error("unable to create Alertmanager for org", "org", orgID, "err", err)
}
@@ -105,9 +109,10 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(orgIDs []int64) {
if _, exists := orgsFound[orgId]; !exists {
amsToStop[orgId] = am
delete(moa.alertmanagers, orgId)
moa.orgRegistry.RemoveOrgRegistry(orgId)
moa.metrics.RemoveOrgRegistry(orgId)
}
}
moa.metrics.ActiveConfigurations.Set(float64(len(moa.alertmanagers)))
moa.alertmanagersMtx.Unlock()
// Now, we can stop the Alertmanagers without having to hold a lock.

View File

@@ -1,13 +1,17 @@
package notifier
import (
"bytes"
"context"
"testing"
"time"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/setting"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
)
@@ -21,25 +25,51 @@ func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {
}
SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
kvStore := newFakeKVStore(t)
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore, kvStore)
reg := prometheus.NewPedanticRegistry()
m := metrics.NewNGAlert(reg)
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics())
ctx := context.Background()
// Ensure that one Alertmanager is created per org.
{
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
require.Len(t, mam.alertmanagers, 3)
require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP grafana_alerting_active_configurations The number of active Alertmanager configurations.
# TYPE grafana_alerting_active_configurations gauge
grafana_alerting_active_configurations 3
# HELP grafana_alerting_discovered_configurations The number of organizations we've discovered that require an Alertmanager configuration.
# TYPE grafana_alerting_discovered_configurations gauge
grafana_alerting_discovered_configurations 3
`), "grafana_alerting_discovered_configurations", "grafana_alerting_active_configurations"))
}
// When an org is removed, it should detect it.
{
orgStore.orgs = []int64{1, 3}
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
require.Len(t, mam.alertmanagers, 2)
require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP grafana_alerting_active_configurations The number of active Alertmanager configurations.
# TYPE grafana_alerting_active_configurations gauge
grafana_alerting_active_configurations 2
# HELP grafana_alerting_discovered_configurations The number of organizations we've discovered that require an Alertmanager configuration.
# TYPE grafana_alerting_discovered_configurations gauge
grafana_alerting_discovered_configurations 2
`), "grafana_alerting_discovered_configurations", "grafana_alerting_active_configurations"))
}
// if the org comes back, it should detect it.
{
orgStore.orgs = []int64{1, 2, 3, 4}
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
require.Len(t, mam.alertmanagers, 4)
require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP grafana_alerting_active_configurations The number of active Alertmanager configurations.
# TYPE grafana_alerting_active_configurations gauge
grafana_alerting_active_configurations 4
# HELP grafana_alerting_discovered_configurations The number of organizations we've discovered that require an Alertmanager configuration.
# TYPE grafana_alerting_discovered_configurations gauge
grafana_alerting_discovered_configurations 4
`), "grafana_alerting_discovered_configurations", "grafana_alerting_active_configurations"))
}
}
@@ -54,7 +84,9 @@ func TestMultiOrgAlertmanager_AlertmanagerFor(t *testing.T) {
SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
kvStore := newFakeKVStore(t)
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore, kvStore)
reg := prometheus.NewPedanticRegistry()
m := metrics.NewNGAlert(reg)
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics())
ctx := context.Background()
// Ensure that one Alertmanagers is created per org.

View File

@@ -77,7 +77,7 @@ type schedule struct {
appURL string
multiOrgNotifier *notifier.MultiOrgAlertmanager
metrics *metrics.Metrics
metrics *metrics.Scheduler
// Senders help us send alerts to external Alertmanagers.
sendersMtx sync.RWMutex
@@ -100,7 +100,7 @@ type SchedulerCfg struct {
InstanceStore store.InstanceStore
AdminConfigStore store.AdminConfigurationStore
MultiOrgNotifier *notifier.MultiOrgAlertmanager
Metrics *metrics.Metrics
Metrics *metrics.Scheduler
AdminConfigPollInterval time.Duration
}

View File

@@ -8,9 +8,6 @@ import (
"testing"
"time"
"github.com/benbjohnson/clock"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
@@ -19,12 +16,16 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert/schedule"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/services/ngalert/tests"
"github.com/benbjohnson/clock"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
var nilMetrics = metrics.NewMetrics(nil)
var testMetrics = metrics.NewNGAlert(prometheus.NewPedanticRegistry())
type evalAppliedInfo struct {
alertDefKey models.AlertRuleKey
@@ -98,10 +99,10 @@ func TestWarmStateCache(t *testing.T) {
RuleStore: dbstore,
InstanceStore: dbstore,
Metrics: metrics.NewMetrics(prometheus.NewRegistry()),
Metrics: testMetrics.GetSchedulerMetrics(),
AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
}
st := state.NewManager(schedCfg.Logger, nilMetrics, dbstore, dbstore)
st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), dbstore, dbstore)
st.Warm()
t.Run("instance cache has expected entries", func(t *testing.T) {
@@ -143,10 +144,10 @@ func TestAlertingTicker(t *testing.T) {
RuleStore: dbstore,
InstanceStore: dbstore,
Logger: log.New("ngalert schedule test"),
Metrics: metrics.NewMetrics(prometheus.NewRegistry()),
Metrics: testMetrics.GetSchedulerMetrics(),
AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
}
st := state.NewManager(schedCfg.Logger, nilMetrics, dbstore, dbstore)
st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), dbstore, dbstore)
sched := schedule.NewScheduler(schedCfg, nil, "http://localhost", st)
ctx := context.Background()

View File

@@ -8,7 +8,6 @@ import (
"testing"
"time"
"github.com/benbjohnson/clock"
"github.com/grafana/grafana/pkg/infra/log"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
@@ -18,6 +17,8 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/services/ngalert/store"
"github.com/grafana/grafana/pkg/setting"
"github.com/benbjohnson/clock"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
@@ -229,7 +230,7 @@ func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, ac
mockedClock := clock.NewMock()
logger := log.New("ngalert schedule test")
nilMetrics := metrics.NewMetrics(nil)
m := metrics.NewNGAlert(prometheus.NewPedanticRegistry())
schedCfg := SchedulerCfg{
C: mockedClock,
BaseInterval: time.Second,
@@ -238,12 +239,12 @@ func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, ac
RuleStore: rs,
InstanceStore: is,
AdminConfigStore: acs,
MultiOrgNotifier: notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, &notifier.FakeConfigStore{}, &notifier.FakeOrgStore{}, &notifier.FakeKVStore{}),
MultiOrgNotifier: notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, &notifier.FakeConfigStore{}, &notifier.FakeOrgStore{}, &notifier.FakeKVStore{}, nil),
Logger: logger,
Metrics: metrics.NewMetrics(prometheus.NewRegistry()),
Metrics: m.GetSchedulerMetrics(),
AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
}
st := state.NewManager(schedCfg.Logger, nilMetrics, rs, is)
st := state.NewManager(schedCfg.Logger, m.GetStateMetrics(), rs, is)
return NewScheduler(schedCfg, nil, "http://localhost", st), mockedClock
}

View File

@@ -41,7 +41,7 @@ type Sender struct {
sdManager *discovery.Manager
}
func New(metrics *metrics.Metrics) (*Sender, error) {
func New(_ *metrics.Scheduler) (*Sender, error) {
l := log.New("sender")
sdCtx, sdCancel := context.WithCancel(context.Background())
s := &Sender{
@@ -51,6 +51,8 @@ func New(metrics *metrics.Metrics) (*Sender, error) {
}
s.manager = notifier.NewManager(
// Injecting a new registry here means these metrics are not exported.
// Once we fix the individual Alertmanager metrics we should fix this scenario too.
&notifier.Options{QueueCapacity: defaultMaxQueueCapacity, Registerer: prometheus.NewRegistry()},
s.gokitLogger,
)

View File

@@ -22,10 +22,10 @@ type cache struct {
states map[int64]map[string]map[string]*State // orgID > alertRuleUID > stateID > state
mtxStates sync.RWMutex
log log.Logger
metrics *metrics.Metrics
metrics *metrics.State
}
func newCache(logger log.Logger, metrics *metrics.Metrics) *cache {
func newCache(logger log.Logger, metrics *metrics.State) *cache {
return &cache{
states: make(map[int64]map[string]map[string]*State),
log: logger,

View File

@@ -21,7 +21,7 @@ var ResendDelay = 30 * time.Second
type Manager struct {
log log.Logger
metrics *metrics.Metrics
metrics *metrics.State
cache *cache
quit chan struct{}
@@ -31,7 +31,7 @@ type Manager struct {
instanceStore store.InstanceStore
}
func NewManager(logger log.Logger, metrics *metrics.Metrics, ruleStore store.RuleStore, instanceStore store.InstanceStore) *Manager {
func NewManager(logger log.Logger, metrics *metrics.State, ruleStore store.RuleStore, instanceStore store.InstanceStore) *Manager {
manager := &Manager{
cache: newCache(logger, metrics),
quit: make(chan struct{}),

View File

@@ -4,22 +4,20 @@ import (
"testing"
"time"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/services/ngalert/tests"
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
var nilMetrics = metrics.NewMetrics(nil)
var testMetrics = metrics.NewNGAlert(prometheus.NewPedanticRegistry())
func TestProcessEvalResults(t *testing.T) {
evaluationTime, err := time.Parse("2006-01-02", "2021-03-25")
@@ -853,7 +851,7 @@ func TestProcessEvalResults(t *testing.T) {
}
for _, tc := range testCases {
st := state.NewManager(log.New("test_state_manager"), nilMetrics, nil, nil)
st := state.NewManager(log.New("test_state_manager"), testMetrics.GetStateMetrics(), nil, nil)
t.Run(tc.desc, func(t *testing.T) {
for _, res := range tc.evalResults {
_ = st.ProcessEvalResults(tc.alertRule, res)
@@ -948,7 +946,7 @@ func TestStaleResultsHandler(t *testing.T) {
}
for _, tc := range testCases {
st := state.NewManager(log.New("test_stale_results_handler"), nilMetrics, dbstore, dbstore)
st := state.NewManager(log.New("test_stale_results_handler"), testMetrics.GetStateMetrics(), dbstore, dbstore)
st.Warm()
existingStatesForRule := st.GetStatesForRuleUID(rule.OrgID, rule.UID)

View File

@@ -34,7 +34,7 @@ func SetupTestEnv(t *testing.T, baseInterval time.Duration) (*ngalert.AlertNG, *
// its database migrations run and the relative database tables are created
cfg.FeatureToggles = map[string]bool{"ngalert": true}
m := metrics.NewMetrics(prometheus.NewRegistry())
m := metrics.NewNGAlert(prometheus.NewRegistry())
ng, err := ngalert.ProvideService(cfg, nil, routing.NewRouteRegister(), sqlstore.InitTestDB(t), nil, nil, nil, nil, m)
require.NoError(t, err)
return ng, &store.DBstore{