2022-07-12 14:13:04 -05:00
package sender
import (
"context"
2022-11-10 09:34:13 -06:00
"crypto/sha256"
2022-07-12 14:13:04 -05:00
"errors"
2022-07-20 09:50:49 -05:00
"fmt"
2022-07-12 14:13:04 -05:00
"net/url"
2022-11-10 09:34:13 -06:00
"sort"
2022-07-12 14:13:04 -05:00
"sync"
"time"
"github.com/benbjohnson/clock"
2022-08-01 03:20:43 -05:00
"github.com/grafana/grafana/pkg/api/datasource"
2022-07-12 14:13:04 -05:00
"github.com/grafana/grafana/pkg/infra/log"
2022-07-20 09:50:49 -05:00
"github.com/grafana/grafana/pkg/services/datasources"
2022-07-12 14:13:04 -05:00
"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/notifier"
"github.com/grafana/grafana/pkg/services/ngalert/store"
2022-07-20 09:50:49 -05:00
"github.com/grafana/grafana/pkg/services/secrets"
2022-07-12 14:13:04 -05:00
)
// AlertsRouter handles alerts generated during alert rule evaluation.
// Based on rule's orgID and the configuration for that organization,
// it determines whether an alert needs to be sent to an external Alertmanager and\or internal notifier.Alertmanager
//
// After creating a AlertsRouter, you must call Run to keep the AlertsRouter's
// state synchronized with the alerting configuration.
type AlertsRouter struct {
logger log . Logger
clock clock . Clock
adminConfigStore store . AdminConfigurationStore
2022-07-19 13:04:48 -05:00
// externalAlertmanagers help us send alerts to external Alertmanagers.
adminConfigMtx sync . RWMutex
sendAlertsTo map [ int64 ] models . AlertmanagersChoice
externalAlertmanagers map [ int64 ] * ExternalAlertmanager
externalAlertmanagersCfgHash map [ int64 ] string
2022-07-12 14:13:04 -05:00
2022-07-19 08:32:54 -05:00
multiOrgNotifier * notifier . MultiOrgAlertmanager
2022-07-12 14:13:04 -05:00
appURL * url . URL
disabledOrgs map [ int64 ] struct { }
adminConfigPollInterval time . Duration
2022-07-20 09:50:49 -05:00
datasourceService datasources . DataSourceService
secretService secrets . Service
2022-07-12 14:13:04 -05:00
}
2022-07-20 09:50:49 -05:00
func NewAlertsRouter ( multiOrgNotifier * notifier . MultiOrgAlertmanager , store store . AdminConfigurationStore ,
clk clock . Clock , appURL * url . URL , disabledOrgs map [ int64 ] struct { } , configPollInterval time . Duration ,
datasourceService datasources . DataSourceService , secretService secrets . Service ) * AlertsRouter {
2022-07-12 14:13:04 -05:00
d := & AlertsRouter {
2022-10-20 14:19:04 -05:00
logger : log . New ( "ngalert.sender.router" ) ,
2022-07-12 14:13:04 -05:00
clock : clk ,
adminConfigStore : store ,
2022-07-19 13:04:48 -05:00
adminConfigMtx : sync . RWMutex { } ,
externalAlertmanagers : map [ int64 ] * ExternalAlertmanager { } ,
externalAlertmanagersCfgHash : map [ int64 ] string { } ,
sendAlertsTo : map [ int64 ] models . AlertmanagersChoice { } ,
2022-07-12 14:13:04 -05:00
2022-07-19 08:32:54 -05:00
multiOrgNotifier : multiOrgNotifier ,
2022-07-12 14:13:04 -05:00
appURL : appURL ,
disabledOrgs : disabledOrgs ,
adminConfigPollInterval : configPollInterval ,
2022-07-20 09:50:49 -05:00
datasourceService : datasourceService ,
secretService : secretService ,
2022-07-12 14:13:04 -05:00
}
return d
}
// SyncAndApplyConfigFromDatabase looks for the admin configuration in the database
// and adjusts the sender(s) and alert handling mechanism accordingly.
func ( d * AlertsRouter ) SyncAndApplyConfigFromDatabase ( ) error {
cfgs , err := d . adminConfigStore . GetAdminConfigurations ( )
if err != nil {
return err
}
2022-10-20 14:19:04 -05:00
d . logger . Debug ( "Attempting to sync admin configs" , "count" , len ( cfgs ) )
2022-07-12 14:13:04 -05:00
orgsFound := make ( map [ int64 ] struct { } , len ( cfgs ) )
2022-07-19 08:32:54 -05:00
d . adminConfigMtx . Lock ( )
2022-07-12 14:13:04 -05:00
for _ , cfg := range cfgs {
_ , isDisabledOrg := d . disabledOrgs [ cfg . OrgID ]
if isDisabledOrg {
continue
}
// Update the Alertmanagers choice for the organization.
2022-07-19 08:32:54 -05:00
d . sendAlertsTo [ cfg . OrgID ] = cfg . SendAlertsTo
2022-07-12 14:13:04 -05:00
2022-07-19 13:04:48 -05:00
orgsFound [ cfg . OrgID ] = struct { } { } // keep track of the which externalAlertmanagers we need to keep.
2022-07-12 14:13:04 -05:00
2022-07-19 13:04:48 -05:00
existing , ok := d . externalAlertmanagers [ cfg . OrgID ]
2022-07-12 14:13:04 -05:00
// We have no running sender and alerts are handled internally, no-op.
if ! ok && cfg . SendAlertsTo == models . InternalAlertmanager {
2022-10-20 14:19:04 -05:00
d . logger . Debug ( "Grafana is configured to send alerts to the internal alertmanager only. Skipping synchronization with external alertmanager" , "org" , cfg . OrgID )
2022-07-12 14:13:04 -05:00
continue
}
2022-11-10 09:34:13 -06:00
alertmanagers , err := d . alertmanagersFromDatasources ( cfg . OrgID )
2022-07-20 09:50:49 -05:00
if err != nil {
2022-11-10 09:34:13 -06:00
d . logger . Error ( "Failed to get alertmanagers from datasources" , "org" , cfg . OrgID , "error" , err )
2022-07-20 09:50:49 -05:00
continue
}
// We have no running sender and no Alertmanager(s) configured, no-op.
2022-11-10 09:34:13 -06:00
if ! ok && len ( alertmanagers ) == 0 {
2022-10-20 14:19:04 -05:00
d . logger . Debug ( "No external alertmanagers configured" , "org" , cfg . OrgID )
2022-07-20 09:50:49 -05:00
continue
}
2022-07-12 14:13:04 -05:00
// We have a running sender but no Alertmanager(s) configured, shut it down.
2022-11-10 09:34:13 -06:00
if ok && len ( alertmanagers ) == 0 {
2022-10-20 14:19:04 -05:00
d . logger . Info ( "No external alertmanager(s) configured, sender will be stopped" , "org" , cfg . OrgID )
2022-07-12 14:13:04 -05:00
delete ( orgsFound , cfg . OrgID )
continue
}
2022-08-24 12:52:39 -05:00
// Avoid logging sensitive data
2022-11-10 09:34:13 -06:00
redactedAMs := buildRedactedAMs ( d . logger , alertmanagers , cfg . OrgID )
2022-10-20 14:19:04 -05:00
d . logger . Debug ( "Alertmanagers found in the configuration" , "alertmanagers" , redactedAMs )
2022-08-01 03:20:43 -05:00
2023-04-21 09:16:27 -05:00
var hashes [ ] string
for _ , cfg := range alertmanagers {
hashes = append ( hashes , cfg . SHA256 ( ) )
}
2022-07-12 14:13:04 -05:00
// We have a running sender, check if we need to apply a new config.
2023-04-21 09:16:27 -05:00
amHash := asSHA256 ( hashes )
2022-07-12 14:13:04 -05:00
if ok {
2022-08-24 12:52:39 -05:00
if d . externalAlertmanagersCfgHash [ cfg . OrgID ] == amHash {
2022-10-20 14:19:04 -05:00
d . logger . Debug ( "Sender configuration is the same as the one running, no-op" , "org" , cfg . OrgID , "alertmanagers" , redactedAMs )
2022-07-12 14:13:04 -05:00
continue
}
2022-10-20 14:19:04 -05:00
d . logger . Info ( "Applying new configuration to sender" , "org" , cfg . OrgID , "alertmanagers" , redactedAMs , "cfg" , cfg . ID )
2022-11-10 09:34:13 -06:00
err := existing . ApplyConfig ( cfg . OrgID , cfg . ID , alertmanagers )
2022-07-12 14:13:04 -05:00
if err != nil {
2022-10-20 14:19:04 -05:00
d . logger . Error ( "Failed to apply configuration" , "error" , err , "org" , cfg . OrgID )
2022-07-12 14:13:04 -05:00
continue
}
2022-08-24 12:52:39 -05:00
d . externalAlertmanagersCfgHash [ cfg . OrgID ] = amHash
2022-07-12 14:13:04 -05:00
continue
}
// No sender and have Alertmanager(s) to send to - start a new one.
2022-10-20 14:19:04 -05:00
d . logger . Info ( "Creating new sender for the external alertmanagers" , "org" , cfg . OrgID , "alertmanagers" , redactedAMs )
s := NewExternalAlertmanagerSender ( )
2022-07-19 13:04:48 -05:00
d . externalAlertmanagers [ cfg . OrgID ] = s
2022-07-12 14:13:04 -05:00
s . Run ( )
2022-11-10 09:34:13 -06:00
err = s . ApplyConfig ( cfg . OrgID , cfg . ID , alertmanagers )
2022-07-12 14:13:04 -05:00
if err != nil {
2022-10-20 14:19:04 -05:00
d . logger . Error ( "Failed to apply configuration" , "error" , err , "org" , cfg . OrgID )
2022-07-12 14:13:04 -05:00
continue
}
2022-08-24 12:52:39 -05:00
d . externalAlertmanagersCfgHash [ cfg . OrgID ] = amHash
2022-07-12 14:13:04 -05:00
}
2022-07-19 13:04:48 -05:00
sendersToStop := map [ int64 ] * ExternalAlertmanager { }
2022-07-12 14:13:04 -05:00
2022-07-19 13:04:48 -05:00
for orgID , s := range d . externalAlertmanagers {
2022-07-12 14:13:04 -05:00
if _ , exists := orgsFound [ orgID ] ; ! exists {
sendersToStop [ orgID ] = s
2022-07-19 13:04:48 -05:00
delete ( d . externalAlertmanagers , orgID )
delete ( d . externalAlertmanagersCfgHash , orgID )
2022-07-12 14:13:04 -05:00
}
}
2022-07-19 08:32:54 -05:00
d . adminConfigMtx . Unlock ( )
2022-07-12 14:13:04 -05:00
2022-11-10 09:34:13 -06:00
// We can now stop these external Alertmanagers w/o having to hold a lock.
2022-07-12 14:13:04 -05:00
for orgID , s := range sendersToStop {
2022-10-20 14:19:04 -05:00
d . logger . Info ( "Stopping sender" , "org" , orgID )
2022-07-12 14:13:04 -05:00
s . Stop ( )
2022-10-20 14:19:04 -05:00
d . logger . Info ( "Stopped sender" , "org" , orgID )
2022-07-12 14:13:04 -05:00
}
2022-10-20 14:19:04 -05:00
d . logger . Debug ( "Finish of admin configuration sync" )
2022-07-12 14:13:04 -05:00
return nil
}
2023-04-21 09:16:27 -05:00
func buildRedactedAMs ( l log . Logger , alertmanagers [ ] externalAMcfg , ordId int64 ) [ ] string {
2022-11-10 09:34:13 -06:00
var redactedAMs [ ] string
for _ , am := range alertmanagers {
2023-04-21 09:16:27 -05:00
parsedAM , err := url . Parse ( am . amURL )
2022-11-10 09:34:13 -06:00
if err != nil {
l . Error ( "Failed to parse alertmanager string" , "org" , ordId , "error" , err )
continue
}
redactedAMs = append ( redactedAMs , parsedAM . Redacted ( ) )
}
return redactedAMs
}
// asSHA256 returns the hex-encoded SHA-256 digest of the given strings.
// The strings are sorted before hashing, so the result does not depend on
// their order. Sorting is done on a copy: the caller's slice is not modified.
func asSHA256(values []string) string {
	sorted := make([]string, len(values))
	copy(sorted, values)
	sort.Strings(sorted)

	h := sha256.New()
	// Writing to a hash.Hash never returns an error.
	_, _ = fmt.Fprintf(h, "%v", sorted)
	return fmt.Sprintf("%x", h.Sum(nil))
}
2023-04-21 09:16:27 -05:00
func ( d * AlertsRouter ) alertmanagersFromDatasources ( orgID int64 ) ( [ ] externalAMcfg , error ) {
var (
alertmanagers [ ] externalAMcfg
)
2022-07-20 09:50:49 -05:00
// We might have alertmanager datasources that are acting as external
// alertmanager, let's fetch them.
query := & datasources . GetDataSourcesByTypeQuery {
2023-02-02 10:22:43 -06:00
OrgID : orgID ,
2022-07-20 09:50:49 -05:00
Type : datasources . DS_ALERTMANAGER ,
}
ctx , cancel := context . WithTimeout ( context . Background ( ) , time . Second * 5 )
defer cancel ( )
2023-02-09 08:49:44 -06:00
dataSources , err := d . datasourceService . GetDataSourcesByType ( ctx , query )
2022-07-20 09:50:49 -05:00
if err != nil {
return nil , fmt . Errorf ( "failed to fetch datasources for org: %w" , err )
}
2023-02-09 08:49:44 -06:00
for _ , ds := range dataSources {
2022-07-20 09:50:49 -05:00
if ! ds . JsonData . Get ( definitions . HandleGrafanaManagedAlerts ) . MustBool ( false ) {
continue
}
amURL , err := d . buildExternalURL ( ds )
if err != nil {
2022-10-20 14:19:04 -05:00
d . logger . Error ( "Failed to build external alertmanager URL" ,
2023-02-02 10:22:43 -06:00
"org" , ds . OrgID ,
"uid" , ds . UID ,
2022-10-19 16:36:54 -05:00
"error" , err )
2022-07-20 09:50:49 -05:00
continue
}
2023-04-21 09:16:27 -05:00
ctx , cancel := context . WithTimeout ( context . Background ( ) , time . Second * 10 )
headers , err := d . datasourceService . CustomHeaders ( ctx , ds )
cancel ( )
if err != nil {
d . logger . Error ( "Failed to get headers for external alertmanager" ,
"org" , ds . OrgID ,
"uid" , ds . UID ,
"error" , err )
continue
}
alertmanagers = append ( alertmanagers , externalAMcfg {
amURL : amURL ,
headers : headers ,
} )
2022-07-20 09:50:49 -05:00
}
return alertmanagers , nil
}
func ( d * AlertsRouter ) buildExternalURL ( ds * datasources . DataSource ) ( string , error ) {
2022-08-01 03:20:43 -05:00
// We re-use the same parsing logic as the datasource to make sure it matches whatever output the user received
// when doing the healthcheck.
2023-02-02 10:22:43 -06:00
parsed , err := datasource . ValidateURL ( datasources . DS_ALERTMANAGER , ds . URL )
2022-08-01 03:20:43 -05:00
if err != nil {
return "" , fmt . Errorf ( "failed to parse alertmanager datasource url: %w" , err )
}
2023-02-23 12:55:26 -06:00
// If this is a Mimir or Cortex implementation, the Alert API is under a different path than config API
if ds . JsonData != nil {
impl := ds . JsonData . Get ( "implementation" ) . MustString ( "" )
switch impl {
case "mimir" , "cortex" :
if parsed . Path == "" {
parsed . Path = "/"
}
parsed = parsed . JoinPath ( "/alertmanager" )
default :
}
}
2022-07-20 09:50:49 -05:00
// if basic auth is enabled we need to build the url with basic auth baked in
if ! ds . BasicAuth {
2022-08-01 03:20:43 -05:00
return parsed . String ( ) , nil
2022-07-20 09:50:49 -05:00
}
password := d . secretService . GetDecryptedValue ( context . Background ( ) , ds . SecureJsonData , "basicAuthPassword" , "" )
if password == "" {
return "" , fmt . Errorf ( "basic auth enabled but no password set" )
}
return fmt . Sprintf ( "%s://%s:%s@%s%s%s" , parsed . Scheme , ds . BasicAuthUser ,
password , parsed . Host , parsed . Path , parsed . RawQuery ) , nil
}
2022-07-12 14:13:04 -05:00
func ( d * AlertsRouter ) Send ( key models . AlertRuleKey , alerts definitions . PostableAlerts ) {
2022-10-20 14:19:04 -05:00
logger := d . logger . New ( key . LogContext ( ) ... )
2022-07-12 14:13:04 -05:00
if len ( alerts . PostableAlerts ) == 0 {
2022-10-20 14:19:04 -05:00
logger . Info ( "No alerts to notify about" )
2022-07-12 14:13:04 -05:00
return
}
// Send alerts to local notifier if they need to be handled internally
// or if no external AMs have been discovered yet.
var localNotifierExist , externalNotifierExist bool
2022-07-19 08:32:54 -05:00
if d . sendAlertsTo [ key . OrgID ] == models . ExternalAlertmanagers && len ( d . AlertmanagersFor ( key . OrgID ) ) > 0 {
2022-10-20 14:19:04 -05:00
logger . Debug ( "All alerts for the given org should be routed to external notifiers only. skipping the internal notifier." )
2022-07-12 14:13:04 -05:00
} else {
2022-10-20 14:19:04 -05:00
logger . Info ( "Sending alerts to local notifier" , "count" , len ( alerts . PostableAlerts ) )
2022-07-19 08:32:54 -05:00
n , err := d . multiOrgNotifier . AlertmanagerFor ( key . OrgID )
2022-07-12 14:13:04 -05:00
if err == nil {
localNotifierExist = true
if err := n . PutAlerts ( alerts ) ; err != nil {
2022-10-20 14:19:04 -05:00
logger . Error ( "Failed to put alerts in the local notifier" , "count" , len ( alerts . PostableAlerts ) , "error" , err )
2022-07-12 14:13:04 -05:00
}
} else {
if errors . Is ( err , notifier . ErrNoAlertmanagerForOrg ) {
2022-10-20 14:19:04 -05:00
logger . Debug ( "Local notifier was not found" )
2022-07-12 14:13:04 -05:00
} else {
2022-10-20 14:19:04 -05:00
logger . Error ( "Local notifier is not available" , "error" , err )
2022-07-12 14:13:04 -05:00
}
}
}
// Send alerts to external Alertmanager(s) if we have a sender for this organization
// and alerts are not being handled just internally.
2022-07-19 08:32:54 -05:00
d . adminConfigMtx . RLock ( )
defer d . adminConfigMtx . RUnlock ( )
2022-07-19 13:04:48 -05:00
s , ok := d . externalAlertmanagers [ key . OrgID ]
2022-07-19 08:32:54 -05:00
if ok && d . sendAlertsTo [ key . OrgID ] != models . InternalAlertmanager {
2022-10-20 14:19:04 -05:00
logger . Info ( "Sending alerts to external notifier" , "count" , len ( alerts . PostableAlerts ) )
2022-07-12 14:13:04 -05:00
s . SendAlerts ( alerts )
externalNotifierExist = true
}
if ! localNotifierExist && ! externalNotifierExist {
2022-10-20 14:19:04 -05:00
logger . Error ( "No external or internal notifier - alerts not delivered" , "count" , len ( alerts . PostableAlerts ) )
2022-07-12 14:13:04 -05:00
}
}
// AlertmanagersFor returns all the discovered Alertmanager(s) for a particular organization.
func ( d * AlertsRouter ) AlertmanagersFor ( orgID int64 ) [ ] * url . URL {
2022-07-19 08:32:54 -05:00
d . adminConfigMtx . RLock ( )
defer d . adminConfigMtx . RUnlock ( )
2022-07-19 13:04:48 -05:00
s , ok := d . externalAlertmanagers [ orgID ]
2022-07-12 14:13:04 -05:00
if ! ok {
return [ ] * url . URL { }
}
return s . Alertmanagers ( )
}
// DroppedAlertmanagersFor returns all the dropped Alertmanager(s) for a particular organization.
func ( d * AlertsRouter ) DroppedAlertmanagersFor ( orgID int64 ) [ ] * url . URL {
2022-07-19 08:32:54 -05:00
d . adminConfigMtx . RLock ( )
defer d . adminConfigMtx . RUnlock ( )
2022-07-19 13:04:48 -05:00
s , ok := d . externalAlertmanagers [ orgID ]
2022-07-12 14:13:04 -05:00
if ! ok {
return [ ] * url . URL { }
}
return s . DroppedAlertmanagers ( )
}
// Run starts regular updates of the configuration.
func ( d * AlertsRouter ) Run ( ctx context . Context ) error {
for {
select {
case <- time . After ( d . adminConfigPollInterval ) :
if err := d . SyncAndApplyConfigFromDatabase ( ) ; err != nil {
2022-10-20 14:19:04 -05:00
d . logger . Error ( "Unable to sync admin configuration" , "error" , err )
2022-07-12 14:13:04 -05:00
}
case <- ctx . Done ( ) :
// Stop sending alerts to all external Alertmanager(s).
2022-07-19 08:32:54 -05:00
d . adminConfigMtx . Lock ( )
2022-07-19 13:04:48 -05:00
for orgID , s := range d . externalAlertmanagers {
delete ( d . externalAlertmanagers , orgID ) // delete before we stop to make sure we don't accept any more alerts.
2022-07-12 14:13:04 -05:00
s . Stop ( )
}
2022-07-19 08:32:54 -05:00
d . adminConfigMtx . Unlock ( )
2022-07-12 14:13:04 -05:00
return nil
}
}
}