Alerting: Improve operational logs in sender package (#57134)

* Audit logs in sender package

* Fix casing and touch up a few key names

* Avoid logging entire alert struct

* Log configuration ID being applied

* Revert change to errorf rather than log

* Tune levels further and remove some redundancies

* Adjust logger naming and standardize log context

* Adjust logger naming in router

* Move log and get rid of dead error handling code
This commit is contained in:
Alexander Weaver 2022-10-20 14:19:04 -05:00 committed by GitHub
parent f3c219a980
commit 5ee4744d62
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 43 additions and 46 deletions

View File

@ -51,7 +51,7 @@ func NewAlertsRouter(multiOrgNotifier *notifier.MultiOrgAlertmanager, store stor
clk clock.Clock, appURL *url.URL, disabledOrgs map[int64]struct{}, configPollInterval time.Duration,
datasourceService datasources.DataSourceService, secretService secrets.Service) *AlertsRouter {
d := &AlertsRouter{
logger: log.New("alerts-router"),
logger: log.New("ngalert.sender.router"),
clock: clk,
adminConfigStore: store,
@ -75,20 +75,18 @@ func NewAlertsRouter(multiOrgNotifier *notifier.MultiOrgAlertmanager, store stor
// SyncAndApplyConfigFromDatabase looks for the admin configuration in the database
// and adjusts the sender(s) and alert handling mechanism accordingly.
func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error {
d.logger.Debug("start of admin configuration sync")
cfgs, err := d.adminConfigStore.GetAdminConfigurations()
if err != nil {
return err
}
d.logger.Debug("found admin configurations", "count", len(cfgs))
d.logger.Debug("Attempting to sync admin configs", "count", len(cfgs))
orgsFound := make(map[int64]struct{}, len(cfgs))
d.adminConfigMtx.Lock()
for _, cfg := range cfgs {
_, isDisabledOrg := d.disabledOrgs[cfg.OrgID]
if isDisabledOrg {
d.logger.Debug("skipping starting sender for disabled org", "org", cfg.OrgID)
continue
}
@ -101,13 +99,13 @@ func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error {
// We have no running sender and alerts are handled internally, no-op.
if !ok && cfg.SendAlertsTo == models.InternalAlertmanager {
d.logger.Debug("alerts are handled internally", "org", cfg.OrgID)
d.logger.Debug("Grafana is configured to send alerts to the internal alertmanager only. Skipping synchronization with external alertmanager", "org", cfg.OrgID)
continue
}
externalAlertmanagers, err := d.alertmanagersFromDatasources(cfg.OrgID)
if err != nil {
d.logger.Error("failed to get alertmanagers from datasources",
d.logger.Error("Failed to get alertmanagers from datasources",
"org", cfg.OrgID,
"error", err)
continue
@ -116,13 +114,13 @@ func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error {
// We have no running sender and no Alertmanager(s) configured, no-op.
if !ok && len(cfg.Alertmanagers) == 0 {
d.logger.Debug("no external alertmanagers configured", "org", cfg.OrgID)
d.logger.Debug("No external alertmanagers configured", "org", cfg.OrgID)
continue
}
// We have a running sender but no Alertmanager(s) configured, shut it down.
if ok && len(cfg.Alertmanagers) == 0 {
d.logger.Debug("no external alertmanager(s) configured, sender will be stopped", "org", cfg.OrgID)
d.logger.Info("No external alertmanager(s) configured, sender will be stopped", "org", cfg.OrgID)
delete(orgsFound, cfg.OrgID)
continue
}
@ -132,7 +130,7 @@ func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error {
for _, am := range cfg.Alertmanagers {
parsedAM, err := url.Parse(am)
if err != nil {
d.logger.Error("failed to parse alertmanager string",
d.logger.Error("Failed to parse alertmanager string",
"org", cfg.OrgID,
"error", err)
continue
@ -140,20 +138,20 @@ func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error {
redactedAMs = append(redactedAMs, parsedAM.Redacted())
}
d.logger.Debug("alertmanagers found in the configuration", "alertmanagers", redactedAMs)
d.logger.Debug("Alertmanagers found in the configuration", "alertmanagers", redactedAMs)
// We have a running sender, check if we need to apply a new config.
amHash := cfg.AsSHA256()
if ok {
if d.externalAlertmanagersCfgHash[cfg.OrgID] == amHash {
d.logger.Debug("sender configuration is the same as the one running, no-op", "org", cfg.OrgID, "alertmanagers", redactedAMs)
d.logger.Debug("Sender configuration is the same as the one running, no-op", "org", cfg.OrgID, "alertmanagers", redactedAMs)
continue
}
d.logger.Debug("applying new configuration to sender", "org", cfg.OrgID, "alertmanagers", redactedAMs)
d.logger.Info("Applying new configuration to sender", "org", cfg.OrgID, "alertmanagers", redactedAMs, "cfg", cfg.ID)
err := existing.ApplyConfig(cfg)
if err != nil {
d.logger.Error("failed to apply configuration", "error", err, "org", cfg.OrgID)
d.logger.Error("Failed to apply configuration", "error", err, "org", cfg.OrgID)
continue
}
d.externalAlertmanagersCfgHash[cfg.OrgID] = amHash
@ -161,19 +159,14 @@ func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error {
}
// No sender and have Alertmanager(s) to send to - start a new one.
d.logger.Info("creating new sender for the external alertmanagers", "org", cfg.OrgID, "alertmanagers", redactedAMs)
s, err := NewExternalAlertmanagerSender()
if err != nil {
d.logger.Error("unable to start the sender", "error", err, "org", cfg.OrgID)
continue
}
d.logger.Info("Creating new sender for the external alertmanagers", "org", cfg.OrgID, "alertmanagers", redactedAMs)
s := NewExternalAlertmanagerSender()
d.externalAlertmanagers[cfg.OrgID] = s
s.Run()
err = s.ApplyConfig(cfg)
if err != nil {
d.logger.Error("failed to apply configuration", "error", err, "org", cfg.OrgID)
d.logger.Error("Failed to apply configuration", "error", err, "org", cfg.OrgID)
continue
}
@ -193,12 +186,12 @@ func (d *AlertsRouter) SyncAndApplyConfigFromDatabase() error {
// We can now stop these externalAlertmanagers w/o having to hold a lock.
for orgID, s := range sendersToStop {
d.logger.Info("stopping sender", "org", orgID)
d.logger.Info("Stopping sender", "org", orgID)
s.Stop()
d.logger.Info("stopped sender", "org", orgID)
d.logger.Info("Stopped sender", "org", orgID)
}
d.logger.Debug("finish of admin configuration sync")
d.logger.Debug("Finish of admin configuration sync")
return nil
}
@ -223,7 +216,7 @@ func (d *AlertsRouter) alertmanagersFromDatasources(orgID int64) ([]string, erro
}
amURL, err := d.buildExternalURL(ds)
if err != nil {
d.logger.Error("failed to build external alertmanager URL",
d.logger.Error("Failed to build external alertmanager URL",
"org", ds.OrgId,
"uid", ds.Uid,
"error", err)
@ -255,29 +248,29 @@ func (d *AlertsRouter) buildExternalURL(ds *datasources.DataSource) (string, err
}
func (d *AlertsRouter) Send(key models.AlertRuleKey, alerts definitions.PostableAlerts) {
logger := d.logger.New("rule_uid", key.UID, "org", key.OrgID)
logger := d.logger.New(key.LogContext()...)
if len(alerts.PostableAlerts) == 0 {
logger.Debug("no alerts to notify about")
logger.Info("No alerts to notify about")
return
}
// Send alerts to local notifier if they need to be handled internally
// or if no external AMs have been discovered yet.
var localNotifierExist, externalNotifierExist bool
if d.sendAlertsTo[key.OrgID] == models.ExternalAlertmanagers && len(d.AlertmanagersFor(key.OrgID)) > 0 {
logger.Debug("no alerts to put in the notifier")
logger.Debug("All alerts for the given org should be routed to external notifiers only. skipping the internal notifier.")
} else {
logger.Debug("sending alerts to local notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts)
logger.Info("Sending alerts to local notifier", "count", len(alerts.PostableAlerts))
n, err := d.multiOrgNotifier.AlertmanagerFor(key.OrgID)
if err == nil {
localNotifierExist = true
if err := n.PutAlerts(alerts); err != nil {
logger.Error("failed to put alerts in the local notifier", "count", len(alerts.PostableAlerts), "error", err)
logger.Error("Failed to put alerts in the local notifier", "count", len(alerts.PostableAlerts), "error", err)
}
} else {
if errors.Is(err, notifier.ErrNoAlertmanagerForOrg) {
logger.Debug("local notifier was not found")
logger.Debug("Local notifier was not found")
} else {
logger.Error("local notifier is not available", "error", err)
logger.Error("Local notifier is not available", "error", err)
}
}
}
@ -288,13 +281,13 @@ func (d *AlertsRouter) Send(key models.AlertRuleKey, alerts definitions.Postable
defer d.adminConfigMtx.RUnlock()
s, ok := d.externalAlertmanagers[key.OrgID]
if ok && d.sendAlertsTo[key.OrgID] != models.InternalAlertmanager {
logger.Debug("sending alerts to external notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts)
logger.Info("Sending alerts to external notifier", "count", len(alerts.PostableAlerts))
s.SendAlerts(alerts)
externalNotifierExist = true
}
if !localNotifierExist && !externalNotifierExist {
logger.Error("no external or internal notifier - [%d] alerts not delivered", len(alerts.PostableAlerts))
logger.Error("No external or internal notifier - alerts not delivered", "count", len(alerts.PostableAlerts))
}
}
@ -327,7 +320,7 @@ func (d *AlertsRouter) Run(ctx context.Context) error {
select {
case <-time.After(d.adminConfigPollInterval):
if err := d.SyncAndApplyConfigFromDatabase(); err != nil {
d.logger.Error("unable to sync admin configuration", "error", err)
d.logger.Error("Unable to sync admin configuration", "error", err)
}
case <-ctx.Done():
// Stop sending alerts to all external Alertmanager(s).

View File

@ -42,8 +42,8 @@ type ExternalAlertmanager struct {
sdManager *discovery.Manager
}
func NewExternalAlertmanagerSender() (*ExternalAlertmanager, error) {
l := log.New("sender")
func NewExternalAlertmanagerSender() *ExternalAlertmanager {
l := log.New("ngalert.sender.external-alertmanager")
sdCtx, sdCancel := context.WithCancel(context.Background())
s := &ExternalAlertmanager{
logger: l,
@ -59,7 +59,7 @@ func NewExternalAlertmanagerSender() (*ExternalAlertmanager, error) {
s.sdManager = discovery.NewManager(sdCtx, s.logger)
return s, nil
return s
}
// ApplyConfig syncs a configuration with the sender.
@ -69,6 +69,9 @@ func (s *ExternalAlertmanager) ApplyConfig(cfg *ngmodels.AdminConfiguration) err
return err
}
s.logger = s.logger.New("org", cfg.OrgID, "cfg", cfg.ID)
s.logger.Info("Synchronizing config with external Alertmanager group")
if err := s.manager.ApplyConfig(notifierCfg); err != nil {
return err
}
@ -85,8 +88,10 @@ func (s *ExternalAlertmanager) Run() {
s.wg.Add(2)
go func() {
s.logger.Info("Initiating communication with a group of external Alertmanagers")
if err := s.sdManager.Run(); err != nil {
s.logger.Error("failed to start the sender service discovery manager", "error", err)
s.logger.Error("Failed to start the sender service discovery manager", "error", err)
}
s.wg.Done()
}()
@ -100,7 +105,6 @@ func (s *ExternalAlertmanager) Run() {
// SendAlerts sends a set of alerts to the configured Alertmanager(s).
func (s *ExternalAlertmanager) SendAlerts(alerts apimodels.PostableAlerts) {
if len(alerts.PostableAlerts) == 0 {
s.logger.Debug("no alerts to send to external Alertmanager(s)")
return
}
as := make([]*notifier.Alert, 0, len(alerts.PostableAlerts))
@ -109,12 +113,12 @@ func (s *ExternalAlertmanager) SendAlerts(alerts apimodels.PostableAlerts) {
as = append(as, na)
}
s.logger.Debug("sending alerts to the external Alertmanager(s)", "am_count", len(s.manager.Alertmanagers()), "alert_count", len(as))
s.manager.Send(as...)
}
// Stop shuts down the sender.
func (s *ExternalAlertmanager) Stop() {
s.logger.Info("Shutting down communication with the external Alertmanager group")
s.sdCancel()
s.manager.Stop()
s.wg.Wait()
@ -197,14 +201,14 @@ func (s *ExternalAlertmanager) sanitizeLabelSet(lbls models.LabelSet) labels.Lab
for _, k := range sortedKeys(lbls) {
sanitizedLabelName, err := s.sanitizeLabelName(k)
if err != nil {
s.logger.Error("alert sending to external Alertmanager(s) contains an invalid label/annotation name that failed to sanitize, skipping", "name", k, "error", err)
s.logger.Error("Alert sending to external Alertmanager(s) contains an invalid label/annotation name that failed to sanitize, skipping", "name", k, "error", err)
continue
}
// There can be label name collisions after we sanitize. We check for this and attempt to make the name unique again using a short hash of the original name.
if _, ok := set[sanitizedLabelName]; ok {
sanitizedLabelName = sanitizedLabelName + fmt.Sprintf("_%.3x", md5.Sum([]byte(k)))
s.logger.Warn("alert contains duplicate label/annotation name after sanitization, appending unique suffix", "name", k, "new_name", sanitizedLabelName, "error", err)
s.logger.Warn("Alert contains duplicate label/annotation name after sanitization, appending unique suffix", "name", k, "newName", sanitizedLabelName, "error", err)
}
set[sanitizedLabelName] = struct{}{}
@ -227,7 +231,7 @@ func (s *ExternalAlertmanager) sanitizeLabelName(name string) (string, error) {
return name, nil
}
s.logger.Warn("alert sending to external Alertmanager(s) contains label/annotation name with invalid characters", "name", name)
s.logger.Warn("Alert sending to external Alertmanager(s) contains label/annotation name with invalid characters", "name", name)
// Remove spaces. We do this instead of replacing with underscore for backwards compatibility as this existed before the rest of this function.
sanitized := strings.Join(strings.Fields(name), "")

View File

@ -48,7 +48,7 @@ func TestSanitizeLabelName(t *testing.T) {
}
for _, tc := range cases {
am, _ := NewExternalAlertmanagerSender()
am := NewExternalAlertmanagerSender()
t.Run(tc.desc, func(t *testing.T) {
res, err := am.sanitizeLabelName(tc.labelName)
@ -95,7 +95,7 @@ func TestSanitizeLabelSet(t *testing.T) {
}
for _, tc := range cases {
am, _ := NewExternalAlertmanagerSender()
am := NewExternalAlertmanagerSender()
t.Run(tc.desc, func(t *testing.T) {
require.Equal(t, tc.expectedResult, am.sanitizeLabelSet(tc.labelset))
})