package notifier

import (
	"context"
	"fmt"
	"sync"
	"time"

	gokit_log "github.com/go-kit/kit/log"
	"github.com/prometheus/alertmanager/cluster"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/grafana/pkg/infra/kvstore"
	"github.com/grafana/grafana/pkg/infra/log"
	"github.com/grafana/grafana/pkg/services/ngalert/logging"
	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
	"github.com/grafana/grafana/pkg/services/ngalert/models"
	"github.com/grafana/grafana/pkg/services/ngalert/store"
	"github.com/grafana/grafana/pkg/setting"
)

var (
	ErrNoAlertmanagerForOrg = fmt.Errorf("Alertmanager does not exist for this organization")
	ErrAlertmanagerNotReady = fmt.Errorf("Alertmanager is not ready yet")
)
type MultiOrgAlertmanager struct {
	alertmanagersMtx sync.RWMutex
	alertmanagers    map[int64]*Alertmanager

	settings *setting.Cfg
	logger   log.Logger

	// peer represents the clustering peer of the Alertmanagers between Grafana instances.
	peer         ClusterPeer
	settleCancel context.CancelFunc

	configStore store.AlertingStore
	orgStore    store.OrgStore
	kvStore     kvstore.KVStore

	metrics *metrics.MultiOrgAlertmanager
}
func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore, kvStore kvstore.KVStore, m *metrics.MultiOrgAlertmanager, l log.Logger) (*MultiOrgAlertmanager, error) {
	moa := &MultiOrgAlertmanager{
		logger:        l,
		settings:      cfg,
		alertmanagers: map[int64]*Alertmanager{},
		configStore:   configStore,
		orgStore:      orgStore,
		kvStore:       kvStore,
		metrics:       m,
	}

	clusterLogger := gokit_log.With(gokit_log.NewLogfmtLogger(logging.NewWrapper(l)), "component", "cluster")
	moa.peer = &NilPeer{}
	if len(cfg.UnifiedAlerting.HAPeers) > 0 {
		peer, err := cluster.Create(
			clusterLogger,
			m.Registerer,
			cfg.UnifiedAlerting.HAListenAddr,
			cfg.UnifiedAlerting.HAAdvertiseAddr,
			cfg.UnifiedAlerting.HAPeers, // peers
			true,
			cfg.UnifiedAlerting.HAPushPullInterval,
			cfg.UnifiedAlerting.HAGossipInterval,
			cluster.DefaultTcpTimeout,
			cluster.DefaultProbeTimeout,
			cluster.DefaultProbeInterval,
			nil,
		)
		if err != nil {
			return nil, fmt.Errorf("unable to initialize gossip mesh: %w", err)
		}
		err = peer.Join(cluster.DefaultReconnectInterval, cluster.DefaultReconnectTimeout)
		if err != nil {
			l.Error("unable to join gossip mesh while initializing cluster for high availability mode", "err", err)
		}

		// Attempt to verify the number of peers for 30s, every 2s. The risk here is that we send a notification "too soon",
		// which should _never_ happen given we share the notification log via the database, so the risk of a double notification is very low.
		var ctx context.Context
		ctx, moa.settleCancel = context.WithTimeout(context.Background(), 30*time.Second)
		go peer.Settle(ctx, cluster.DefaultGossipInterval*10)
		moa.peer = peer
	}

	return moa, nil
}
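
// exampleRunMultiOrgAlertmanager is an illustrative sketch (not part of the original file) of how a caller
// might wire up and run the MultiOrgAlertmanager. The function name is hypothetical and the dependency
// values (cfg, stores, metrics, logger) are assumed to be provided by the caller.
func exampleRunMultiOrgAlertmanager(ctx context.Context, cfg *setting.Cfg, configStore store.AlertingStore,
	orgStore store.OrgStore, kvStore kvstore.KVStore, m *metrics.MultiOrgAlertmanager, l log.Logger) error {
	moa, err := NewMultiOrgAlertmanager(cfg, configStore, orgStore, kvStore, m, l)
	if err != nil {
		return err
	}
	// Perform an initial sync so per-org Alertmanagers exist before the poll loop starts.
	if err := moa.LoadAndSyncAlertmanagersForOrgs(ctx); err != nil {
		return err
	}
	// Run blocks, re-syncing every AlertmanagerConfigPollInterval, until ctx is cancelled.
	return moa.Run(ctx)
}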
func (moa *MultiOrgAlertmanager) Run(ctx context.Context) error {
	moa.logger.Info("starting MultiOrg Alertmanager")

	for {
		select {
		case <-ctx.Done():
			moa.StopAndWait()
			return nil
		case <-time.After(moa.settings.UnifiedAlerting.AlertmanagerConfigPollInterval):
			if err := moa.LoadAndSyncAlertmanagersForOrgs(ctx); err != nil {
				moa.logger.Error("error while synchronizing Alertmanager orgs", "err", err)
			}
		}
	}
}
func (moa *MultiOrgAlertmanager) LoadAndSyncAlertmanagersForOrgs(ctx context.Context) error {
	moa.logger.Debug("synchronizing Alertmanagers for orgs")
	// First, load all the organizations from the database.
	orgIDs, err := moa.orgStore.GetOrgs(ctx)
	if err != nil {
		return err
	}

	// Then, sync them by creating or deleting Alertmanagers as necessary.
	moa.metrics.DiscoveredConfigurations.Set(float64(len(orgIDs)))
	moa.SyncAlertmanagersForOrgs(ctx, orgIDs)

	moa.logger.Debug("done synchronizing Alertmanagers for orgs")

	return nil
}
// getLatestConfigs retrieves the latest Alertmanager configuration for every organization.
// It returns a map where the key is the ID of each organization and the value is the configuration.
func (moa *MultiOrgAlertmanager) getLatestConfigs(ctx context.Context) (map[int64]*models.AlertConfiguration, error) {
	configs, err := moa.configStore.GetAllLatestAlertmanagerConfiguration(ctx)
	if err != nil {
		return nil, err
	}

	result := make(map[int64]*models.AlertConfiguration, len(configs))
	for _, config := range configs {
		result[config.OrgID] = config
	}

	return result, nil
}
// SyncAlertmanagersForOrgs syncs configuration of the Alertmanager required by each organization.
func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(ctx context.Context, orgIDs []int64) {
	orgsFound := make(map[int64]struct{}, len(orgIDs))
	dbConfigs, err := moa.getLatestConfigs(ctx)
	if err != nil {
		moa.logger.Error("failed to load Alertmanager configurations", "err", err)
		return
	}

	moa.alertmanagersMtx.Lock()
	for _, orgID := range orgIDs {
		if _, isDisabledOrg := moa.settings.UnifiedAlerting.DisabledOrgs[orgID]; isDisabledOrg {
			moa.logger.Debug("skipping syncing Alertmanager for disabled org", "org", orgID)
			continue
		}
		orgsFound[orgID] = struct{}{}

		alertmanager, found := moa.alertmanagers[orgID]

		if !found {
			// These metrics are not exported by Grafana and are mostly a placeholder.
			// To export them, we need to translate the metrics from each individual registry
			// and then aggregate them on the main registry.
			m := metrics.NewAlertmanagerMetrics(moa.metrics.GetOrCreateOrgRegistry(orgID))
			am, err := newAlertmanager(orgID, moa.settings, moa.configStore, moa.kvStore, moa.peer, m)
			if err != nil {
				moa.logger.Error("unable to create Alertmanager for org", "org", orgID, "err", err)
				continue
			}
			alertmanager = am
		}
		dbConfig, cfgFound := dbConfigs[orgID]
		if !cfgFound {
			if found {
				// This means that the configuration is gone but the organization, as well as the Alertmanager, exists.
				moa.logger.Warn("Alertmanager exists for org but the configuration is gone. Applying the default configuration", "org", orgID)
			}
			err := alertmanager.SaveAndApplyDefaultConfig()
			if err != nil {
				moa.logger.Error("failed to apply the default Alertmanager configuration", "org", orgID, "err", err)
				continue
			}
			moa.alertmanagers[orgID] = alertmanager
			continue
		}

		err := alertmanager.ApplyConfig(dbConfig)
		if err != nil {
			moa.logger.Error("failed to apply Alertmanager config for org", "org", orgID, "id", dbConfig.ID, "err", err)
			continue
		}
		moa.alertmanagers[orgID] = alertmanager
	}
	amsToStop := map[int64]*Alertmanager{}
	for orgId, am := range moa.alertmanagers {
		if _, exists := orgsFound[orgId]; !exists {
			amsToStop[orgId] = am
			delete(moa.alertmanagers, orgId)
			moa.metrics.RemoveOrgRegistry(orgId)
		}
	}
	moa.metrics.ActiveConfigurations.Set(float64(len(moa.alertmanagers)))
	moa.alertmanagersMtx.Unlock()

	// Now, we can stop the Alertmanagers without having to hold a lock.
	for orgID, am := range amsToStop {
		moa.logger.Info("stopping Alertmanager", "org", orgID)
		am.StopAndWait()
		moa.logger.Info("stopped Alertmanager", "org", orgID)
	}
}
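
// exampleSyncKnownOrgs is an illustrative sketch (not part of the original file) of calling the sync
// directly for a known set of orgs instead of waiting for the next poll tick. The function name and the
// org IDs are hypothetical; LoadAndSyncAlertmanagersForOrgs normally loads the IDs from orgStore.
func exampleSyncKnownOrgs(ctx context.Context, moa *MultiOrgAlertmanager) {
	orgIDs := []int64{1, 2, 3}
	moa.metrics.DiscoveredConfigurations.Set(float64(len(orgIDs)))
	moa.SyncAlertmanagersForOrgs(ctx, orgIDs)
}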
func (moa *MultiOrgAlertmanager) StopAndWait() {
	moa.alertmanagersMtx.Lock()
	defer moa.alertmanagersMtx.Unlock()

	for _, am := range moa.alertmanagers {
		am.StopAndWait()
	}

	p, ok := moa.peer.(*cluster.Peer)
	if ok {
		moa.settleCancel()
		if err := p.Leave(10 * time.Second); err != nil {
			moa.logger.Warn("unable to leave the gossip mesh", "err", err)
		}
	}
}
// AlertmanagerFor returns the Alertmanager instance for the organization provided.
// When the organization does not have an active Alertmanager, it returns an ErrNoAlertmanagerForOrg.
// When the Alertmanager of the organization is not ready, it returns an ErrAlertmanagerNotReady.
func (moa *MultiOrgAlertmanager) AlertmanagerFor(orgID int64) (*Alertmanager, error) {
	moa.alertmanagersMtx.RLock()
	defer moa.alertmanagersMtx.RUnlock()

	orgAM, existing := moa.alertmanagers[orgID]
	if !existing {
		return nil, ErrNoAlertmanagerForOrg
	}

	if !orgAM.Ready() {
		return nil, ErrAlertmanagerNotReady
	}

	return orgAM, nil
}
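
// exampleAlertmanagerLookup is an illustrative sketch (not part of the original file) of how a caller,
// such as a hypothetical API handler, might distinguish the two sentinel errors returned by AlertmanagerFor.
func exampleAlertmanagerLookup(moa *MultiOrgAlertmanager, orgID int64) error {
	am, err := moa.AlertmanagerFor(orgID)
	switch err {
	case nil:
		_ = am // use the per-org Alertmanager, e.g. to create silences or list alerts.
		return nil
	case ErrNoAlertmanagerForOrg:
		// The org has no active Alertmanager; a handler could surface this as "not found".
		return fmt.Errorf("org %d: %w", orgID, err)
	case ErrAlertmanagerNotReady:
		// The Alertmanager exists but has not finished starting; a handler could ask the client to retry.
		return fmt.Errorf("org %d: %w", orgID, err)
	default:
		return err
	}
}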
// NilPeer and NilChannel implement the Alertmanager clustering interface.
type NilPeer struct{}

func (p *NilPeer) Position() int                   { return 0 }
func (p *NilPeer) WaitReady(context.Context) error { return nil }
func (p *NilPeer) AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel {
	return &NilChannel{}
}

type NilChannel struct{}

func (c *NilChannel) Broadcast([]byte) {}