Alerting: Use a default configuration and periodically poll for new ones (#32851)

* Alerting: Use a default configuration and periodically poll for new ones

Use a default configuration to make sure the Grafana instance can always
start. Then, regularly poll for new configurations.

I've also made sure that failures to apply a configuration do not stop the
Grafana server; instead, it keeps polling until a configuration is applied successfully.
This commit is contained in:
gotjosh 2021-04-13 13:02:44 +01:00 committed by GitHub
parent 4178ebc0a1
commit 528ca9134b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 137 additions and 166 deletions

2
go.mod
View File

@ -41,7 +41,7 @@ require (
github.com/google/go-cmp v0.5.5
github.com/google/uuid v1.2.0
github.com/gosimple/slug v1.9.0
github.com/grafana/alerting-api v0.0.0-20210409134845-c36ac1eae41b
github.com/grafana/alerting-api v0.0.0-20210412090350-fcb11bfbb6a4
github.com/grafana/grafana-aws-sdk v0.4.0
github.com/grafana/grafana-live-sdk v0.0.4
github.com/grafana/grafana-plugin-model v0.0.0-20190930120109-1fc953a61fb4

4
go.sum
View File

@ -818,8 +818,8 @@ github.com/gorilla/websocket v1.4.2 h1:+/TMaTYc4QFitKJxsQ7Yye35DkWvkdLcvGKqM+x0U
github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/gosimple/slug v1.9.0 h1:r5vDcYrFz9BmfIAMC829un9hq7hKM4cHUrsv36LbEqs=
github.com/gosimple/slug v1.9.0/go.mod h1:AMZ+sOVe65uByN3kgEyf9WEBKBCSS+dJjMX9x4vDJbg=
github.com/grafana/alerting-api v0.0.0-20210409134845-c36ac1eae41b h1:QG52Et3EVCxPoYZifm91bRPVknccfjQURcpi7zXVut8=
github.com/grafana/alerting-api v0.0.0-20210409134845-c36ac1eae41b/go.mod h1:Ce2PwraBlFMa+P0ArBzubfB/BXZV35mfYWQjM8C/BSE=
github.com/grafana/alerting-api v0.0.0-20210412090350-fcb11bfbb6a4 h1:S4nnWhH40AIWCkk3F7pUYVr67rqqangwm8a8cskYGyc=
github.com/grafana/alerting-api v0.0.0-20210412090350-fcb11bfbb6a4/go.mod h1:Ce2PwraBlFMa+P0ArBzubfB/BXZV35mfYWQjM8C/BSE=
github.com/grafana/go-mssqldb v0.0.0-20210326084033-d0ce3c521036 h1:GplhUk6Xes5JIhUUrggPcPBhOn+eT8+WsHiebvq7GgA=
github.com/grafana/go-mssqldb v0.0.0-20210326084033-d0ce3c521036/go.mod h1:xbL0rPBG9cCiLr28tMa8zpbdarY27NDyej4t/EjAShU=
github.com/grafana/grafana v1.9.2-0.20210308201921-4ce0a49eac03/go.mod h1:AHRRvd4utJGY25J5nW8aL7wZzn/LcJ0z2za9oOp14j4=

View File

@ -25,7 +25,7 @@ var timeNow = time.Now
type Alertmanager interface {
// Configuration
ApplyConfig(config *apimodels.PostableUserConfig) error
SaveAndApplyConfig(config *apimodels.PostableUserConfig) error
// Silences
CreateSilence(ps *apimodels.PostableSilence) (string, error)

View File

@ -2,11 +2,8 @@ package api
import (
"errors"
"fmt"
"net/http"
"gopkg.in/yaml.v3"
apimodels "github.com/grafana/alerting-api/pkg/api"
"github.com/grafana/grafana/pkg/api/response"
"github.com/grafana/grafana/pkg/infra/log"
@ -64,8 +61,8 @@ func (srv AlertmanagerSrv) RouteGetAlertingConfig(c *models.ReqContext) response
return response.Error(http.StatusInternalServerError, "failed to get latest configuration", err)
}
cfg := apimodels.PostableUserConfig{}
if err := yaml.Unmarshal([]byte(query.Result.AlertmanagerConfiguration), &cfg); err != nil {
cfg, err := notifier.Load([]byte(query.Result.AlertmanagerConfiguration))
if err != nil {
return response.Error(http.StatusInternalServerError, "failed to unmarshal alertmanager configuration", err)
}
@ -178,21 +175,8 @@ func (srv AlertmanagerSrv) RouteGetSilences(c *models.ReqContext) response.Respo
}
func (srv AlertmanagerSrv) RoutePostAlertingConfig(c *models.ReqContext, body apimodels.PostableUserConfig) response.Response {
config, err := yaml.Marshal(&body)
if err != nil {
return response.Error(http.StatusInternalServerError, "failed to serialize to the Alertmanager configuration", err)
}
cmd := ngmodels.SaveAlertmanagerConfigurationCmd{
AlertmanagerConfiguration: string(config),
ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion),
}
if err := srv.store.SaveAlertmanagerConfiguration(&cmd); err != nil {
return response.Error(http.StatusInternalServerError, "failed to save Alertmanager configuration", err)
}
if err := srv.am.ApplyConfig(&body); err != nil {
return response.Error(http.StatusInternalServerError, "failed to apply Alertmanager configuration", err)
if err := srv.am.SaveAndApplyConfig(&body); err != nil {
return response.Error(http.StatusInternalServerError, "failed to save and apply Alertmanager configuration", err)
}
return response.JSON(http.StatusAccepted, util.DynMap{"message": "configuration created"})

View File

@ -2,13 +2,16 @@ package notifier
import (
"context"
"crypto/md5"
"encoding/json"
"fmt"
"net/url"
"path/filepath"
"sync"
"time"
gokit_log "github.com/go-kit/kit/log"
"github.com/grafana/alerting-api/pkg/api"
apimodels "github.com/grafana/alerting-api/pkg/api"
"github.com/pkg/errors"
"github.com/prometheus/alertmanager/dispatch"
"github.com/prometheus/alertmanager/nflog"
@ -33,9 +36,33 @@ import (
)
const (
workingDir = "alerting"
pollInterval = 1 * time.Minute
workingDir = "alerting"
// How long should we keep silences and notification entries on-disk after they've served their purpose.
retentionNotificationsAndSilences = 5 * 24 * time.Hour
// To start, the alertmanager needs at least one route defined.
// TODO: we should move this to Grafana settings and define this as the default.
alertmanagerDefaultConfiguration = `
{
"alertmanager_config": {
"route": {
"receiver": "grafana-default-email"
},
"receivers": [{
"name": "grafana-default-email",
"grafana_managed_receiver_configs": [{
"uid": "",
"name": "email receiver",
"type": "email",
"isDefault": true,
"settings": {
"addresses": "<example@email.com>"
}
}]
}]
}
}
`
)
type Alertmanager struct {
@ -59,6 +86,7 @@ type Alertmanager struct {
dispatcherMetrics *dispatch.DispatcherMetrics
reloadConfigMtx sync.RWMutex
config []byte
}
func init() {
@ -105,8 +133,8 @@ func (am *Alertmanager) Init() (err error) {
func (am *Alertmanager) Run(ctx context.Context) error {
// Make sure dispatcher starts. We can tolerate future reload failures.
if err := am.SyncAndApplyConfigFromDatabase(); err != nil && !errors.Is(err, store.ErrNoAlertmanagerConfiguration) {
return err
if err := am.SyncAndApplyConfigFromDatabase(); err != nil {
am.logger.Error(errors.Wrap(err, "unable to sync configuration").Error())
}
for {
@ -114,14 +142,10 @@ func (am *Alertmanager) Run(ctx context.Context) error {
case <-ctx.Done():
am.StopAndWait()
return nil
case <-time.After(1 * time.Minute):
// TODO: once we have a check to skip reload on same config, uncomment this.
//if err := am.SyncAndApplyConfigFromDatabase(); err != nil {
// if err == store.ErrNoAlertmanagerConfiguration {
// am.logger.Warn(errors.Wrap(err, "unable to sync configuration").Error())
// }
// am.logger.Error(errors.Wrap(err, "unable to sync configuration").Error())
//}
case <-time.After(pollInterval):
if err := am.SyncAndApplyConfigFromDatabase(); err != nil {
am.logger.Error(errors.Wrap(err, "unable to sync configuration").Error())
}
}
}
}
@ -138,33 +162,54 @@ func (am *Alertmanager) StopAndWait() {
am.dispatcherWG.Wait()
}
// SaveAndApplyConfig saves the given configuration to the database and then applies it
// through applyConfig.
//
// The configuration is serialized as JSON and stored together with the current configuration
// version. An error is returned if cfg is nil, if it cannot be serialized, if the store fails
// to save it, or if it cannot be applied. The save happens before the apply, so a configuration
// that fails to apply has still been stored.
func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) error {
	// A nil configuration would be serialized as "null", saved, and then dereferenced by
	// applyConfig. Reject it before anything is persisted.
	if cfg == nil {
		return errors.New("no Alertmanager configuration provided")
	}

	// cfg is already a pointer, so marshal it directly instead of taking its address again.
	rawConfig, err := json.Marshal(cfg)
	if err != nil {
		return errors.Wrap(err, "failed to serialize to the Alertmanager configuration")
	}

	// Hold the reload lock across both the save and the apply.
	am.reloadConfigMtx.Lock()
	defer am.reloadConfigMtx.Unlock()

	cmd := &ngmodels.SaveAlertmanagerConfigurationCmd{
		AlertmanagerConfiguration: string(rawConfig),
		ConfigurationVersion:      fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion),
	}
	if err := am.Store.SaveAlertmanagerConfiguration(cmd); err != nil {
		return errors.Wrap(err, "failed to save Alertmanager configuration")
	}

	if err := am.applyConfig(cfg); err != nil {
		return errors.Wrap(err, "unable to reload configuration")
	}
	return nil
}
// SyncAndApplyConfigFromDatabase picks the latest config from database and restarts
// the components with the new config.
func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error {
am.reloadConfigMtx.Lock()
defer am.reloadConfigMtx.Unlock()
// TODO: check if config is same as before using hashes and skip reload in case they are same.
cfg, err := am.getConfigFromDatabase()
if err != nil {
return errors.Wrap(err, "get config from database")
}
return errors.Wrap(am.applyConfig(cfg), "reload from config")
}
func (am *Alertmanager) getConfigFromDatabase() (*api.PostableUserConfig, error) {
// First, let's get the configuration we need from the database.
q := &ngmodels.GetLatestAlertmanagerConfigurationQuery{}
if err := am.Store.GetLatestAlertmanagerConfiguration(q); err != nil {
return nil, err
// If there's no configuration in the database, let's use the default configuration.
if errors.Is(err, store.ErrNoAlertmanagerConfiguration) {
q.Result = &ngmodels.AlertConfiguration{AlertmanagerConfiguration: alertmanagerDefaultConfiguration}
} else {
return errors.Wrap(err, "unable to get Alertmanager configuration from the database")
}
}
// Then, let's parse and return the alertmanager configuration.
return Load(q.Result.AlertmanagerConfiguration)
cfg, err := Load([]byte(q.Result.AlertmanagerConfiguration))
if err != nil {
return err
}
return errors.Wrap(am.applyConfig(cfg), "unable to reload configuration")
}
// ApplyConfig applies a new configuration by re-initializing all components using the configuration provided.
func (am *Alertmanager) ApplyConfig(cfg *api.PostableUserConfig) error {
func (am *Alertmanager) ApplyConfig(cfg *apimodels.PostableUserConfig) error {
am.reloadConfigMtx.Lock()
defer am.reloadConfigMtx.Unlock()
@ -175,13 +220,30 @@ const defaultTemplate = "templates/default.tmpl"
// applyConfig applies a new configuration by re-initializing all components using the configuration provided.
// It is not safe to call concurrently.
func (am *Alertmanager) applyConfig(cfg *api.PostableUserConfig) error {
// First, we need to make sure we persist the templates to disk.
paths, _, err := PersistTemplates(cfg, am.WorkingDirPath())
func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig) error {
// First, let's make sure this config is not already loaded
var configChanged bool
rawConfig, err := json.Marshal(cfg.AlertmanagerConfig)
if err != nil {
// In theory, this should never happen.
return err
}
if md5.Sum(am.config) != md5.Sum(rawConfig) {
configChanged = true
}
// next, we need to make sure we persist the templates to disk.
paths, templatesChanged, err := PersistTemplates(cfg, am.WorkingDirPath())
if err != nil {
return err
}
// If neither the configuration nor templates have changed, we've got nothing to do.
if !configChanged && !templatesChanged {
am.logger.Debug("neither config nor template have changed, skipping configuration sync.")
return nil
}
paths = append([]string{defaultTemplate}, paths...)
// With the templates persisted, create the template list using the paths.
@ -217,6 +279,7 @@ func (am *Alertmanager) applyConfig(cfg *api.PostableUserConfig) error {
am.dispatcher.Run()
}()
am.config = rawConfig
return nil
}
@ -225,7 +288,7 @@ func (am *Alertmanager) WorkingDirPath() string {
}
// buildIntegrationsMap builds a map of name to the list of Grafana integration notifiers off of a list of receiver config.
func (am *Alertmanager) buildIntegrationsMap(receivers []*api.PostableApiReceiver, templates *template.Template) (map[string][]notify.Integration, error) {
func (am *Alertmanager) buildIntegrationsMap(receivers []*apimodels.PostableApiReceiver, templates *template.Template) (map[string][]notify.Integration, error) {
integrationsMap := make(map[string][]notify.Integration, len(receivers))
for _, receiver := range receivers {
integrations, err := am.buildReceiverIntegrations(receiver, templates)
@ -244,7 +307,7 @@ type NotificationChannel interface {
}
// buildReceiverIntegrations builds a list of integration notifiers off of a receiver config.
func (am *Alertmanager) buildReceiverIntegrations(receiver *api.PostableApiReceiver, tmpl *template.Template) ([]notify.Integration, error) {
func (am *Alertmanager) buildReceiverIntegrations(receiver *apimodels.PostableApiReceiver, tmpl *template.Template) ([]notify.Integration, error) {
var integrations []notify.Integration
for i, r := range receiver.GrafanaManagedReceivers {

View File

@ -4,10 +4,17 @@ import (
"testing"
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/services/sqlstore"
"github.com/grafana/grafana/pkg/setting"
)
func TestAlertmanager(t *testing.T) {
t.SkipNow()
am := &Alertmanager{}
func TestAlertmanager_ShouldUseDefaultConfigurationWhenNoConfiguration(t *testing.T) {
am := &Alertmanager{
Settings: &setting.Cfg{},
SQLStore: sqlstore.InitTestDB(t),
}
require.NoError(t, am.Init())
require.NoError(t, am.SyncAndApplyConfigFromDatabase())
require.NotNil(t, am.config)
}

View File

@ -1,6 +1,7 @@
package notifier
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
@ -10,7 +11,6 @@ import (
"github.com/grafana/grafana/pkg/infra/log"
"github.com/pkg/errors"
"gopkg.in/yaml.v3"
)
func PersistTemplates(cfg *api.PostableUserConfig, path string) ([]string, bool, error) {
@ -54,7 +54,7 @@ func PersistTemplates(cfg *api.PostableUserConfig, path string) ([]string, bool,
// Now that we have the list of _actual_ templates, let's remove the ones that we don't need.
existingFiles, err := ioutil.ReadDir(path)
if err != nil {
log.Error("unable to read directory for deleting alertmanager templates", "err", err, "path", path)
log.Error("unable to read directory for deleting Alertmanager templates", "err", err, "path", path)
}
for _, existingFile := range existingFiles {
p := filepath.Join(path, existingFile.Name())
@ -75,25 +75,12 @@ func PersistTemplates(cfg *api.PostableUserConfig, path string) ([]string, bool,
return paths, templatesChanged, nil
}
func Load(rawConfig string) (*api.PostableUserConfig, error) {
func Load(rawConfig []byte) (*api.PostableUserConfig, error) {
cfg := &api.PostableUserConfig{}
if err := yaml.Unmarshal([]byte(rawConfig), cfg); err != nil {
if err := json.Unmarshal(rawConfig, cfg); err != nil {
return nil, errors.Wrap(err, "unable to parse Alertmanager configuration")
}
// Taken from https://github.com/prometheus/alertmanager/blob/master/config/config.go#L170-L191
// Check if we have a root route. We cannot check for it in the
// UnmarshalYAML method because it won't be called if the input is empty
// (e.g. the config file is empty or only contains whitespace).
if cfg.AlertmanagerConfig.Route == nil {
return nil, errors.New("no route provided in config")
}
// Check if continue in root route.
if cfg.AlertmanagerConfig.Route.Continue {
return nil, errors.New("cannot have continue in root route")
}
return cfg, nil
}

View File

@ -118,27 +118,37 @@ func TestLoad(t *testing.T) {
{
name: "with a valid config and template",
rawConfig: `
alertmanager_config:
global:
smtp_from: noreply@grafana.net
route:
receiver: email
receivers:
template_files:
'email.template': something with a pretty good content
{
"alertmanager_config": {
"global": {
"smtp_from": "noreply@grafana.net"
},
"route": {
"receiver": "email"
},
"receivers": [
{
"name": "email"
}
]
},
"template_files": {
"email.template": "something with a pretty good content"
}
}
`,
expectedTemplates: map[string]string{"email.template": "something with a pretty good content"},
},
{
name: "with an empty configuration, it is not valid.",
rawConfig: "",
expectedError: errors.New("no route provided in config"),
rawConfig: "{}",
expectedError: errors.New("unable to parse Alertmanager configuration: no route provided in config"),
},
}
for _, tt := range tc {
t.Run(tt.name, func(t *testing.T) {
c, err := Load(tt.rawConfig)
c, err := Load([]byte(tt.rawConfig))
if tt.expectedError != nil {
assert.Nil(t, c)

View File

@ -1,15 +1,10 @@
package alerting
import (
"context"
"fmt"
"io/ioutil"
"net/http"
"testing"
"time"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/sqlstore"
"github.com/grafana/grafana/pkg/tests/testinfra"
"github.com/stretchr/testify/require"
@ -19,7 +14,7 @@ func TestAlertAndGroupsQuery(t *testing.T) {
dir, path := testinfra.CreateGrafDir(t, testinfra.GrafanaOpts{
EnableFeatureToggles: []string{"ngalert"},
})
store := setupDB(t, dir)
store := testinfra.SetUpDatabase(t, dir)
grafanaListedAddr := testinfra.StartGrafana(t, dir, path, store)
// When there are no alerts available, it returns an empty list.
@ -55,78 +50,3 @@ func TestAlertAndGroupsQuery(t *testing.T) {
require.JSONEq(t, "[]", string(b))
}
}
// setupDB creates the test database under dir and seeds it with a single alert configuration
// row (AMConfigFixture, version "v1"). The test fails if the row cannot be inserted.
func setupDB(t *testing.T, dir string) *sqlstore.SQLStore {
	db := testinfra.SetUpDatabase(t, dir)

	// Seed a default configuration from which we can start.
	seedConfig := func(sess *sqlstore.DBSession) error {
		row := &models.AlertConfiguration{
			ID:                        1,
			AlertmanagerConfiguration: AMConfigFixture,
			ConfigurationVersion:      "v1",
			CreatedAt:                 time.Now(),
		}
		if _, err := sess.Insert(row); err != nil {
			return err
		}
		return nil
	}
	require.NoError(t, db.WithDbSession(context.Background(), seedConfig))

	return db
}
// AMConfigFixture is a JSON-encoded Alertmanager configuration that setupDB inserts into the
// test database. It has no template files, a set of global SMTP and integration URL settings,
// a root route pointing at the "example-email" receiver, and that receiver with a single
// email config.
var AMConfigFixture = `
{
"template_files": {},
"alertmanager_config": {
"global": {
"resolve_timeout": "4m",
"http_config": {
"BasicAuth": null,
"Authorization": null,
"BearerToken": "",
"BearerTokenFile": "",
"ProxyURL": {},
"TLSConfig": {
"CAFile": "",
"CertFile": "",
"KeyFile": "",
"ServerName": "",
"InsecureSkipVerify": false
},
"FollowRedirects": true
},
"smtp_from": "youraddress@example.org",
"smtp_hello": "localhost",
"smtp_smarthost": "localhost:25",
"smtp_require_tls": true,
"pagerduty_url": "https://events.pagerduty.com/v2/enqueue",
"opsgenie_api_url": "https://api.opsgenie.com/",
"wechat_api_url": "https://qyapi.weixin.qq.com/cgi-bin/",
"victorops_api_url": "https://alert.victorops.com/integrations/generic/20131114/alert/"
},
"route": {
"receiver": "example-email"
},
"templates": [],
"receivers": [
{
"name": "example-email",
"email_configs": [
{
"send_resolved": false,
"to": "youraddress@example.org",
"smarthost": "",
"html": "{{ template \"email.default.html\" . }}",
"tls_config": {
"CAFile": "",
"CertFile": "",
"KeyFile": "",
"ServerName": "",
"InsecureSkipVerify": false
}
}
]
}
]
}
}
`