Alerting: refactor scheduler and separate notification logic (#48144)

* Introduce AlertsRouter in the sender package, and move all fields and methods related to notifications out of the scheduler to this router.
* Introduce a new interface AlertsSender in the schedule package and replace calls to the anonymous function `notify` inside ruleRoutine with calls to methods of that interface.
* Rename interface Scheduler in api package to ExternalAlertmanagerProvider, and replace scheduler with AlertsRouter as the struct that implements the interface.
This commit is contained in:
Yuriy Tseretyan
2022-07-12 15:13:04 -04:00
committed by GitHub
parent ededf1dd6f
commit a6b1090879
11 changed files with 538 additions and 462 deletions

View File

@@ -0,0 +1,255 @@
package sender
import (
"context"
"errors"
"net/url"
"sync"
"time"
"github.com/benbjohnson/clock"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/notifier"
"github.com/grafana/grafana/pkg/services/ngalert/store"
)
// AlertsRouter handles alerts generated during alert rule evaluation.
// Based on the rule's orgID and the configuration for that organization,
// it determines whether an alert needs to be sent to an external Alertmanager and/or the internal notifier.Alertmanager.
//
// After creating an AlertsRouter, you must call Run to keep the AlertsRouter's
// state synchronized with the alerting configuration.
type AlertsRouter struct {
logger log.Logger
// clock is injected through NewAlertsRouter.
// NOTE(review): the methods shown alongside this type do not read it — confirm whether it is still needed.
clock clock.Clock
// adminConfigStore is where SyncAndApplyConfigFromDatabase loads the admin configurations from.
adminConfigStore store.AdminConfigurationStore
// AdminConfigMtx is held while SendAlertsTo, Senders and SendersCfgHash are modified
// by SyncAndApplyConfigFromDatabase and Run, and while Senders is looked up.
AdminConfigMtx sync.RWMutex
// SendAlertsTo holds, per organization ID, the Alertmanagers choice taken from the admin configuration.
SendAlertsTo map[int64]models.AlertmanagersChoice
// Senders help us send alerts to external Alertmanagers. The map is keyed by organization ID.
Senders map[int64]*Sender
// SendersCfgHash holds, per organization ID, the AsSHA256 value of the configuration
// that was last applied successfully to that organization's sender.
SendersCfgHash map[int64]string
// MultiOrgNotifier is used by Send to look up the internal Alertmanager of an organization.
MultiOrgNotifier *notifier.MultiOrgAlertmanager
// appURL is stored by NewAlertsRouter.
// NOTE(review): the methods shown alongside this type do not read it — confirm its purpose.
appURL *url.URL
// disabledOrgs lists the organization IDs that SyncAndApplyConfigFromDatabase skips.
disabledOrgs map[int64]struct{}
// adminConfigPollInterval is the delay Run waits between two configuration syncs.
adminConfigPollInterval time.Duration
}
// NewAlertsRouter builds an AlertsRouter with empty per-organization state.
// No sender is created here: senders appear once SyncAndApplyConfigFromDatabase
// has been called, either directly or periodically through Run.
func NewAlertsRouter(multiOrgNotifier *notifier.MultiOrgAlertmanager, store store.AdminConfigurationStore, clk clock.Clock, appURL *url.URL, disabledOrgs map[int64]struct{}, configPollInterval time.Duration) *AlertsRouter {
	return &AlertsRouter{
		logger:           log.New("alerts-router"),
		clock:            clk,
		adminConfigStore: store,

		// Per-organization state, filled in by the configuration sync.
		AdminConfigMtx: sync.RWMutex{},
		SendAlertsTo:   make(map[int64]models.AlertmanagersChoice),
		Senders:        make(map[int64]*Sender),
		SendersCfgHash: make(map[int64]string),

		MultiOrgNotifier: multiOrgNotifier,

		appURL:                  appURL,
		disabledOrgs:            disabledOrgs,
		adminConfigPollInterval: configPollInterval,
	}
}
// SyncAndApplyConfigFromDatabase loads every admin configuration from the store and
// reconciles the per-organization senders with it:
//   - organizations listed in disabledOrgs are skipped entirely;
//   - a running sender whose configuration hash changed gets the new configuration applied;
//   - a sender is created when an organization has external Alertmanager(s) configured
//     and its alerts are not handled only internally;
//   - senders of organizations that no longer have a configuration, or no longer have
//     any Alertmanager configured, are removed and stopped.
//
// It returns an error only when the configurations cannot be loaded; failures that
// concern a single organization are logged and the sync moves on to the next one.
func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error {
	d.logger.Debug("start of admin configuration sync")
	configs, err := d.adminConfigStore.GetAdminConfigurations()
	if err != nil {
		return err
	}
	d.logger.Debug("found admin configurations", "count", len(configs))

	// Organizations whose sender (if any) must survive this sync.
	keep := make(map[int64]struct{}, len(configs))

	d.AdminConfigMtx.Lock()
	for _, cfg := range configs {
		if _, disabled := d.disabledOrgs[cfg.OrgID]; disabled {
			d.logger.Debug("skipping starting sender for disabled org", "org", cfg.OrgID)
			continue
		}

		// Record the organization's Alertmanagers choice and mark it as seen.
		d.SendAlertsTo[cfg.OrgID] = cfg.SendAlertsTo
		keep[cfg.OrgID] = struct{}{}

		running, hasSender := d.Senders[cfg.OrgID]
		noExternalAMs := len(cfg.Alertmanagers) == 0

		if hasSender {
			// Nothing left to send to: un-mark the org so the sender is stopped below.
			if noExternalAMs {
				d.logger.Debug("no external alertmanager(s) configured, sender will be stopped", "org", cfg.OrgID)
				delete(keep, cfg.OrgID)
				continue
			}

			// Only re-apply the configuration when it actually changed.
			if d.SendersCfgHash[cfg.OrgID] == cfg.AsSHA256() {
				d.logger.Debug("sender configuration is the same as the one running, no-op", "org", cfg.OrgID, "alertmanagers", cfg.Alertmanagers)
				continue
			}

			d.logger.Debug("applying new configuration to sender", "org", cfg.OrgID, "alertmanagers", cfg.Alertmanagers)
			if applyErr := running.ApplyConfig(cfg); applyErr != nil {
				d.logger.Error("failed to apply configuration", "err", applyErr, "org", cfg.OrgID)
				continue
			}
			d.SendersCfgHash[cfg.OrgID] = cfg.AsSHA256()
			continue
		}

		// From here on there is no running sender for the organization.
		if noExternalAMs {
			d.logger.Debug("no external alertmanagers configured", "org", cfg.OrgID)
			continue
		}
		if cfg.SendAlertsTo == models.InternalAlertmanager {
			d.logger.Debug("alerts are handled internally", "org", cfg.OrgID)
			continue
		}

		// External Alertmanager(s) are configured and wanted: start a sender.
		d.logger.Info("creating new sender for the external alertmanagers", "org", cfg.OrgID, "alertmanagers", cfg.Alertmanagers)
		sender, newErr := New()
		if newErr != nil {
			d.logger.Error("unable to start the sender", "err", newErr, "org", cfg.OrgID)
			continue
		}
		d.Senders[cfg.OrgID] = sender
		sender.Run()

		// The hash is stored only after a successful apply, so a failed apply
		// is retried on the next sync.
		if applyErr := sender.ApplyConfig(cfg); applyErr != nil {
			d.logger.Error("failed to apply configuration", "err", applyErr, "org", cfg.OrgID)
			continue
		}
		d.SendersCfgHash[cfg.OrgID] = cfg.AsSHA256()
	}

	// Detach every sender that is no longer wanted while the lock is still held.
	toStop := map[int64]*Sender{}
	for orgID, sender := range d.Senders {
		if _, wanted := keep[orgID]; wanted {
			continue
		}
		toStop[orgID] = sender
		delete(d.Senders, orgID)
		delete(d.SendersCfgHash, orgID)
	}
	d.AdminConfigMtx.Unlock()

	// The detached senders are stopped after the lock has been released.
	for orgID, sender := range toStop {
		d.logger.Info("stopping sender", "org", orgID)
		sender.Stop()
		d.logger.Info("stopped sender", "org", orgID)
	}

	d.logger.Debug("finish of admin configuration sync")
	return nil
}
// Send delivers the alerts produced by the rule identified by key.
//
// The alerts go to the organization's internal Alertmanager unless the organization is
// configured to use external Alertmanagers only and at least one external Alertmanager
// has been discovered. They additionally go to the organization's sender (external
// Alertmanagers) when one exists and alerts are not handled only internally.
// Failures are logged; nothing is returned to the caller.
func (d *AlertsRouter) Send(key models.AlertRuleKey, alerts definitions.PostableAlerts) {
	logger := d.logger.New("rule_uid", key.UID, "org", key.OrgID)
	if len(alerts.PostableAlerts) == 0 {
		logger.Debug("no alerts to notify about")
		return
	}

	// SendAlertsTo is written by SyncAndApplyConfigFromDatabase under AdminConfigMtx, so
	// it has to be read under the lock as well. The lock is released right away because
	// AlertmanagersFor acquires the read lock itself and read locks must not be nested.
	d.AdminConfigMtx.RLock()
	sendAlertsTo := d.SendAlertsTo[key.OrgID]
	d.AdminConfigMtx.RUnlock()

	// Send alerts to local notifier if they need to be handled internally
	// or if no external AMs have been discovered yet.
	var localNotifierExist, externalNotifierExist bool
	if sendAlertsTo == models.ExternalAlertmanagers && len(d.AlertmanagersFor(key.OrgID)) > 0 {
		logger.Debug("no alerts to put in the notifier")
	} else {
		logger.Debug("sending alerts to local notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts)
		n, err := d.MultiOrgNotifier.AlertmanagerFor(key.OrgID)
		switch {
		case err == nil:
			localNotifierExist = true
			if err := n.PutAlerts(alerts); err != nil {
				logger.Error("failed to put alerts in the local notifier", "count", len(alerts.PostableAlerts), "err", err)
			}
		case errors.Is(err, notifier.ErrNoAlertmanagerForOrg):
			logger.Debug("local notifier was not found")
		default:
			logger.Error("local notifier is not available", "err", err)
		}
	}

	// Send alerts to external Alertmanager(s) if we have a sender for this organization
	// and alerts are not being handled just internally. The read lock is kept while the
	// alerts are handed to the sender.
	d.AdminConfigMtx.RLock()
	defer d.AdminConfigMtx.RUnlock()
	s, ok := d.Senders[key.OrgID]
	if ok && d.SendAlertsTo[key.OrgID] != models.InternalAlertmanager {
		logger.Debug("sending alerts to external notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts)
		s.SendAlerts(alerts)
		externalNotifierExist = true
	}

	if !localNotifierExist && !externalNotifierExist {
		// The logger takes key/value pairs, not a format string, so the number of
		// undelivered alerts is reported as a "count" field.
		logger.Error("no external or internal notifier - alerts not delivered", "count", len(alerts.PostableAlerts))
	}
}
// AlertmanagersFor reports the Alertmanager URLs discovered by the sender of the given
// organization. When the organization has no sender, the result is an empty, non-nil slice.
func (d *AlertsRouter) AlertmanagersFor(orgID int64) []*url.URL {
	d.AdminConfigMtx.RLock()
	defer d.AdminConfigMtx.RUnlock()

	if sender, found := d.Senders[orgID]; found {
		return sender.Alertmanagers()
	}
	return make([]*url.URL, 0)
}
// DroppedAlertmanagersFor reports the Alertmanager URLs dropped by the sender of the given
// organization. When the organization has no sender, the result is an empty, non-nil slice.
func (d *AlertsRouter) DroppedAlertmanagersFor(orgID int64) []*url.URL {
	d.AdminConfigMtx.RLock()
	defer d.AdminConfigMtx.RUnlock()

	if sender, found := d.Senders[orgID]; found {
		return sender.DroppedAlertmanagers()
	}
	return make([]*url.URL, 0)
}
// Run re-synchronizes the router with the admin configuration every
// adminConfigPollInterval until ctx is cancelled. Sync errors are logged and do not
// end the loop. On cancellation every sender is removed and stopped, and Run returns nil.
func (d *AlertsRouter) Run(ctx context.Context) error {
	for {
		select {
		case <-ctx.Done():
			// Stop sending alerts to all external Alertmanager(s).
			d.AdminConfigMtx.Lock()
			for orgID, sender := range d.Senders {
				// Remove the sender first so that no more alerts are accepted for it, then stop it.
				delete(d.Senders, orgID)
				sender.Stop()
			}
			d.AdminConfigMtx.Unlock()
			return nil
		case <-time.After(d.adminConfigPollInterval):
			err := d.SyncAndApplyConfigFromDatabase()
			if err != nil {
				d.logger.Error("unable to sync admin configuration", "err", err)
			}
		}
	}
}

View File

@@ -9,7 +9,6 @@ import (
"github.com/grafana/grafana/pkg/infra/log"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/prometheus/alertmanager/api/v2/models"
@@ -38,7 +37,7 @@ type Sender struct {
sdManager *discovery.Manager
}
func New(_ *metrics.Scheduler) (*Sender, error) {
func New() (*Sender, error) {
l := log.New("sender")
sdCtx, sdCancel := context.WithCancel(context.Background())
s := &Sender{