Mirror of https://github.com/grafana/grafana.git (synced 2025-02-25 18:55:37 -06:00)
Alerting: create wrapper for Alertmanager to enable org level isolation (#37320)
Introduces org-level isolation for the Alertmanager and its components. Silences, Alerts and Contact points are now separated by organization and are no longer shared between them. Co-authored with @davidmparrott and @papagian
This commit is contained in: parent 7ebf4027a7, commit 7fbeefc090
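Before the diff itself, here is a minimal, self-contained sketch of the pattern this change introduces: a wrapper that owns one Alertmanager per organization and that callers must go through to reach an org's instance. The real implementation is in pkg/services/ngalert/notifier/multiorg_alertmanager.go further down; only the error values below mirror the diff, the struct fields and helper names are simplified stand-ins.

// Sketch of the per-org wrapper pattern; not the actual Grafana implementation.
package main

import (
	"errors"
	"fmt"
	"sync"
)

// Alertmanager stands in for the org-scoped notifier.Alertmanager.
type Alertmanager struct {
	orgID int64
	ready bool
}

var (
	ErrNoAlertmanagerForOrg = errors.New("Alertmanager does not exist for this organization")
	ErrAlertmanagerNotReady = errors.New("Alertmanager is not ready yet")
)

// MultiOrgAlertmanager keeps one Alertmanager per organization.
type MultiOrgAlertmanager struct {
	mtx           sync.RWMutex
	alertmanagers map[int64]*Alertmanager
}

// SyncForOrgs creates instances for new orgs and drops instances whose org disappeared.
func (moa *MultiOrgAlertmanager) SyncForOrgs(orgIDs []int64) {
	moa.mtx.Lock()
	defer moa.mtx.Unlock()
	seen := map[int64]struct{}{}
	for _, id := range orgIDs {
		seen[id] = struct{}{}
		if _, ok := moa.alertmanagers[id]; !ok {
			moa.alertmanagers[id] = &Alertmanager{orgID: id, ready: true}
		}
	}
	for id := range moa.alertmanagers {
		if _, ok := seen[id]; !ok {
			delete(moa.alertmanagers, id)
		}
	}
}

// AlertmanagerFor resolves the instance callers must use for a given org.
func (moa *MultiOrgAlertmanager) AlertmanagerFor(orgID int64) (*Alertmanager, error) {
	moa.mtx.RLock()
	defer moa.mtx.RUnlock()
	am, ok := moa.alertmanagers[orgID]
	if !ok {
		return nil, ErrNoAlertmanagerForOrg
	}
	if !am.ready {
		return nil, ErrAlertmanagerNotReady
	}
	return am, nil
}

func main() {
	moa := &MultiOrgAlertmanager{alertmanagers: map[int64]*Alertmanager{}}
	moa.SyncForOrgs([]int64{1, 2})
	am, err := moa.AlertmanagerFor(2)
	fmt.Println(am.orgID, err) // 2 <nil>
}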
@@ -385,7 +385,7 @@ func (hs *HTTPServer) registerRoutes() {
		})

		apiRoute.Get("/alert-notifiers", reqEditorRole, routing.Wrap(
			GetAlertNotifiers(hs.Alertmanager != nil && hs.Cfg.IsNgAlertEnabled())),
			GetAlertNotifiers(hs.MultiOrgAlertmanager != nil && hs.Cfg.IsNgAlertEnabled())),
		)

		apiRoute.Group("/alert-notifications", func(alertNotifications routing.RouteRegister) {
@@ -56,7 +56,7 @@ import (
	"github.com/grafana/grafana/pkg/util/errutil"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	macaron "gopkg.in/macaron.v1"
	"gopkg.in/macaron.v1"
)

func init() {
@@ -104,7 +104,7 @@ type HTTPServer struct {
	PluginDashboardService *plugindashboards.Service          `inject:""`
	AlertEngine            *alerting.AlertEngine              `inject:""`
	LoadSchemaService      *schemaloader.SchemaLoaderService  `inject:""`
	Alertmanager           *notifier.Alertmanager             `inject:""`
	MultiOrgAlertmanager   *notifier.MultiOrgAlertmanager     `inject:""`
	LibraryPanelService    librarypanels.Service              `inject:""`
	LibraryElementService  libraryelements.Service            `inject:""`
	SocialService          social.Service                     `inject:""`
@@ -30,10 +30,8 @@ type Scheduler interface {

type Alertmanager interface {
	// Configuration
	// temporary add orgID parameter; this will move to the Alertmanager wrapper when it will be available
	SaveAndApplyConfig(orgID int64, config *apimodels.PostableUserConfig) error
	// temporary add orgID parameter; this will move to the Alertmanager wrapper when it will be available
	SaveAndApplyDefaultConfig(orgID int64) error
	SaveAndApplyConfig(config *apimodels.PostableUserConfig) error
	SaveAndApplyDefaultConfig() error
	GetStatus() apimodels.GettableStatus

	// Silences
@@ -52,19 +50,19 @@ type Alertmanager interface {

// API handlers.
type API struct {
	Cfg              *setting.Cfg
	DatasourceCache  datasources.CacheService
	RouteRegister    routing.RouteRegister
	DataService      *tsdb.Service
	QuotaService     *quota.QuotaService
	Schedule         schedule.ScheduleService
	RuleStore        store.RuleStore
	InstanceStore    store.InstanceStore
	AlertingStore    store.AlertingStore
	AdminConfigStore store.AdminConfigurationStore
	DataProxy        *datasourceproxy.DatasourceProxyService
	Alertmanager     Alertmanager
	StateManager     *state.Manager
	Cfg                  *setting.Cfg
	DatasourceCache      datasources.CacheService
	RouteRegister        routing.RouteRegister
	DataService          *tsdb.Service
	QuotaService         *quota.QuotaService
	Schedule             schedule.ScheduleService
	RuleStore            store.RuleStore
	InstanceStore        store.InstanceStore
	AlertingStore        store.AlertingStore
	AdminConfigStore     store.AdminConfigurationStore
	DataProxy            *datasourceproxy.DatasourceProxyService
	MultiOrgAlertmanager *notifier.MultiOrgAlertmanager
	StateManager         *state.Manager
}

// RegisterAPIEndpoints registers API handlers
@@ -78,7 +76,7 @@ func (api *API) RegisterAPIEndpoints(m *metrics.Metrics) {
	api.RegisterAlertmanagerApiEndpoints(NewForkedAM(
		api.DatasourceCache,
		NewLotexAM(proxy, logger),
		AlertmanagerSrv{store: api.AlertingStore, am: api.Alertmanager, log: logger},
		AlertmanagerSrv{store: api.AlertingStore, mam: api.MultiOrgAlertmanager, log: logger},
	), m)
	// Register endpoints for proxying to Prometheus-compatible backends.
	api.RegisterPrometheusApiEndpoints(NewForkedProm(
@@ -25,7 +25,7 @@ const (
)

type AlertmanagerSrv struct {
	am    Alertmanager
	mam   *notifier.MultiOrgAlertmanager
	store store.AlertingStore
	log   log.Logger
}
@@ -93,14 +93,25 @@ func (srv AlertmanagerSrv) loadSecureSettings(orgId int64, receivers []*apimodel
}

func (srv AlertmanagerSrv) RouteGetAMStatus(c *models.ReqContext) response.Response {
	return response.JSON(http.StatusOK, srv.am.GetStatus())
	am, errResp := srv.AlertmanagerFor(c.OrgId)
	if errResp != nil {
		return errResp
	}

	return response.JSON(http.StatusOK, am.GetStatus())
}
func (srv AlertmanagerSrv) RouteCreateSilence(c *models.ReqContext, postableSilence apimodels.PostableSilence) response.Response {
|
||||
if !c.HasUserRole(models.ROLE_EDITOR) {
|
||||
return ErrResp(http.StatusForbidden, errors.New("permission denied"), "")
|
||||
}
|
||||
silenceID, err := srv.am.CreateSilence(&postableSilence)
|
||||
|
||||
am, errResp := srv.AlertmanagerFor(c.OrgId)
|
||||
if errResp != nil {
|
||||
return errResp
|
||||
}
|
||||
|
||||
silenceID, err := am.CreateSilence(&postableSilence)
|
||||
if err != nil {
|
||||
if errors.Is(err, notifier.ErrSilenceNotFound) {
|
||||
return ErrResp(http.StatusNotFound, err, "")
|
||||
@ -119,7 +130,13 @@ func (srv AlertmanagerSrv) RouteDeleteAlertingConfig(c *models.ReqContext) respo
|
||||
if !c.HasUserRole(models.ROLE_EDITOR) {
|
||||
return ErrResp(http.StatusForbidden, errors.New("permission denied"), "")
|
||||
}
|
||||
if err := srv.am.SaveAndApplyDefaultConfig(c.OrgId); err != nil {
|
||||
|
||||
am, errResp := srv.AlertmanagerFor(c.OrgId)
|
||||
if errResp != nil {
|
||||
return errResp
|
||||
}
|
||||
|
||||
if err := am.SaveAndApplyDefaultConfig(); err != nil {
|
||||
srv.log.Error("unable to save and apply default alertmanager configuration", "err", err)
|
||||
return ErrResp(http.StatusInternalServerError, err, "failed to save and apply default Alertmanager configuration")
|
||||
}
|
||||
@ -131,8 +148,14 @@ func (srv AlertmanagerSrv) RouteDeleteSilence(c *models.ReqContext) response.Res
|
||||
if !c.HasUserRole(models.ROLE_EDITOR) {
|
||||
return ErrResp(http.StatusForbidden, errors.New("permission denied"), "")
|
||||
}
|
||||
|
||||
am, errResp := srv.AlertmanagerFor(c.OrgId)
|
||||
if errResp != nil {
|
||||
return errResp
|
||||
}
|
||||
|
||||
silenceID := c.Params(":SilenceId")
|
||||
if err := srv.am.DeleteSilence(silenceID); err != nil {
|
||||
if err := am.DeleteSilence(silenceID); err != nil {
|
||||
if errors.Is(err, notifier.ErrSilenceNotFound) {
|
||||
return ErrResp(http.StatusNotFound, err, "")
|
||||
}
|
||||
@ -202,7 +225,12 @@ func (srv AlertmanagerSrv) RouteGetAlertingConfig(c *models.ReqContext) response
|
||||
}
|
||||
|
||||
func (srv AlertmanagerSrv) RouteGetAMAlertGroups(c *models.ReqContext) response.Response {
|
||||
groups, err := srv.am.GetAlertGroups(
|
||||
am, errResp := srv.AlertmanagerFor(c.OrgId)
|
||||
if errResp != nil {
|
||||
return errResp
|
||||
}
|
||||
|
||||
groups, err := am.GetAlertGroups(
|
||||
c.QueryBoolWithDefault("active", true),
|
||||
c.QueryBoolWithDefault("silenced", true),
|
||||
c.QueryBoolWithDefault("inhibited", true),
|
||||
@ -221,7 +249,12 @@ func (srv AlertmanagerSrv) RouteGetAMAlertGroups(c *models.ReqContext) response.
|
||||
}
|
||||
|
||||
func (srv AlertmanagerSrv) RouteGetAMAlerts(c *models.ReqContext) response.Response {
|
||||
alerts, err := srv.am.GetAlerts(
|
||||
am, errResp := srv.AlertmanagerFor(c.OrgId)
|
||||
if errResp != nil {
|
||||
return errResp
|
||||
}
|
||||
|
||||
alerts, err := am.GetAlerts(
|
||||
c.QueryBoolWithDefault("active", true),
|
||||
c.QueryBoolWithDefault("silenced", true),
|
||||
c.QueryBoolWithDefault("inhibited", true),
|
||||
@ -243,8 +276,13 @@ func (srv AlertmanagerSrv) RouteGetAMAlerts(c *models.ReqContext) response.Respo
|
||||
}
|
||||
|
||||
func (srv AlertmanagerSrv) RouteGetSilence(c *models.ReqContext) response.Response {
|
||||
am, errResp := srv.AlertmanagerFor(c.OrgId)
|
||||
if errResp != nil {
|
||||
return errResp
|
||||
}
|
||||
|
||||
silenceID := c.Params(":SilenceId")
|
||||
gettableSilence, err := srv.am.GetSilence(silenceID)
|
||||
gettableSilence, err := am.GetSilence(silenceID)
|
||||
if err != nil {
|
||||
if errors.Is(err, notifier.ErrSilenceNotFound) {
|
||||
return ErrResp(http.StatusNotFound, err, "")
|
||||
@ -256,7 +294,12 @@ func (srv AlertmanagerSrv) RouteGetSilence(c *models.ReqContext) response.Respon
|
||||
}
|
||||
|
||||
func (srv AlertmanagerSrv) RouteGetSilences(c *models.ReqContext) response.Response {
|
||||
gettableSilences, err := srv.am.ListSilences(c.QueryStrings("filter"))
|
||||
am, errResp := srv.AlertmanagerFor(c.OrgId)
|
||||
if errResp != nil {
|
||||
return errResp
|
||||
}
|
||||
|
||||
gettableSilences, err := am.ListSilences(c.QueryStrings("filter"))
|
||||
if err != nil {
|
||||
if errors.Is(err, notifier.ErrListSilencesBadPayload) {
|
||||
return ErrResp(http.StatusBadRequest, err, "")
|
||||
@ -293,7 +336,12 @@ func (srv AlertmanagerSrv) RoutePostAlertingConfig(c *models.ReqContext, body ap
|
||||
return ErrResp(http.StatusInternalServerError, err, "failed to post process Alertmanager configuration")
|
||||
}
|
||||
|
||||
if err := srv.am.SaveAndApplyConfig(c.OrgId, &body); err != nil {
|
||||
am, errResp := srv.AlertmanagerFor(c.OrgId)
|
||||
if errResp != nil {
|
||||
return errResp
|
||||
}
|
||||
|
||||
if err := am.SaveAndApplyConfig(&body); err != nil {
|
||||
srv.log.Error("unable to save and apply alertmanager configuration", "err", err)
|
||||
return ErrResp(http.StatusBadRequest, err, "failed to save and apply Alertmanager configuration")
|
||||
}
|
||||
@ -301,7 +349,7 @@ func (srv AlertmanagerSrv) RoutePostAlertingConfig(c *models.ReqContext, body ap
|
||||
return response.JSON(http.StatusAccepted, util.DynMap{"message": "configuration created"})
|
||||
}
|
||||
|
||||
func (srv AlertmanagerSrv) RoutePostAMAlerts(c *models.ReqContext, body apimodels.PostableAlerts) response.Response {
|
||||
func (srv AlertmanagerSrv) RoutePostAMAlerts(_ *models.ReqContext, _ apimodels.PostableAlerts) response.Response {
|
||||
return NotImplementedResp
|
||||
}
|
||||
|
||||
@ -332,7 +380,12 @@ func (srv AlertmanagerSrv) RoutePostTestReceivers(c *models.ReqContext, body api
|
||||
}
|
||||
defer cancelFunc()
|
||||
|
||||
result, err := srv.am.TestReceivers(ctx, body)
|
||||
am, errResp := srv.AlertmanagerFor(c.OrgId)
|
||||
if errResp != nil {
|
||||
return errResp
|
||||
}
|
||||
|
||||
result, err := am.TestReceivers(ctx, body)
|
||||
if err != nil {
|
||||
if errors.Is(err, notifier.ErrNoReceivers) {
|
||||
return response.Error(http.StatusBadRequest, "", err)
|
||||
@@ -429,3 +482,21 @@ func statusForTestReceivers(v []notifier.TestReceiverResult) int {
		return http.StatusOK
	}
}

func (srv AlertmanagerSrv) AlertmanagerFor(orgID int64) (Alertmanager, *response.NormalResponse) {
	am, err := srv.mam.AlertmanagerFor(orgID)
	if err == nil {
		return am, nil
	}

	if errors.Is(err, notifier.ErrNoAlertmanagerForOrg) {
		return nil, response.Error(http.StatusNotFound, err.Error(), nil)
	}

	if errors.Is(err, notifier.ErrAlertmanagerNotReady) {
		return nil, response.Error(http.StatusConflict, err.Error(), nil)
	}

	srv.log.Error("unable to obtain the org's Alertmanager", "err", err)
	return nil, response.Error(http.StatusInternalServerError, "unable to obtain org's Alertmanager", err)
}
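Every handler in this file now starts the same way: resolve the caller's org-scoped instance through srv.AlertmanagerFor(c.OrgId) and return early if that fails. The helper above maps lookup errors to HTTP statuses (404 for an unknown org, 409 while the instance is still starting, 500 otherwise). A self-contained sketch of that mapping, using stand-in error values rather than the real notifier package:

// Illustrative only; mirrors the status mapping of AlertmanagerFor above.
package main

import (
	"errors"
	"fmt"
	"net/http"
)

var (
	errNoAlertmanagerForOrg = errors.New("Alertmanager does not exist for this organization")
	errAlertmanagerNotReady = errors.New("Alertmanager is not ready yet")
)

// httpStatusForOrgLookup: unknown org -> 404, not ready -> 409, anything else -> 500.
func httpStatusForOrgLookup(err error) int {
	switch {
	case err == nil:
		return http.StatusOK
	case errors.Is(err, errNoAlertmanagerForOrg):
		return http.StatusNotFound
	case errors.Is(err, errAlertmanagerNotReady):
		return http.StatusConflict
	default:
		return http.StatusInternalServerError
	}
}

func main() {
	fmt.Println(httpStatusForOrgLookup(errNoAlertmanagerForOrg)) // 404
	fmt.Println(httpStatusForOrgLookup(errAlertmanagerNotReady)) // 409
	fmt.Println(httpStatusForOrgLookup(nil))                     // 200
}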
@ -2247,6 +2247,76 @@
|
||||
"type": "object",
|
||||
"x-go-package": "github.com/prometheus/common/config"
|
||||
},
|
||||
"TestReceiverConfigResult": {
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"x-go-name": "Error"
|
||||
},
|
||||
"name": {
|
||||
"type": "string",
|
||||
"x-go-name": "Name"
|
||||
},
|
||||
"status": {
|
||||
"type": "string",
|
||||
"x-go-name": "Status"
|
||||
},
|
||||
"uid": {
|
||||
"type": "string",
|
||||
"x-go-name": "UID"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"x-go-package": "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
||||
},
|
||||
"TestReceiverResult": {
|
||||
"properties": {
|
||||
"grafana_managed_receiver_configs": {
|
||||
"items": {
|
||||
"$ref": "#/definitions/TestReceiverConfigResult"
|
||||
},
|
||||
"type": "array",
|
||||
"x-go-name": "Configs"
|
||||
},
|
||||
"name": {
|
||||
"type": "string",
|
||||
"x-go-name": "Name"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"x-go-package": "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
||||
},
|
||||
"TestReceiversConfig": {
|
||||
"properties": {
|
||||
"receivers": {
|
||||
"items": {
|
||||
"$ref": "#/definitions/PostableApiReceiver"
|
||||
},
|
||||
"type": "array",
|
||||
"x-go-name": "Receivers"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"x-go-package": "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
||||
},
|
||||
"TestReceiversResult": {
|
||||
"properties": {
|
||||
"notified_at": {
|
||||
"format": "date-time",
|
||||
"type": "string",
|
||||
"x-go-name": "NotifedAt"
|
||||
},
|
||||
"receivers": {
|
||||
"items": {
|
||||
"$ref": "#/definitions/TestReceiverResult"
|
||||
},
|
||||
"type": "array",
|
||||
"x-go-name": "Receivers"
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"x-go-package": "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
||||
},
|
||||
"TestRulePayload": {
|
||||
"properties": {
|
||||
"expr": {
|
||||
@ -2274,6 +2344,7 @@
|
||||
"x-go-package": "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
||||
},
|
||||
"URL": {
|
||||
"description": "The general form represented is:\n\n[scheme:][//[userinfo@]host][/]path[?query][#fragment]\n\nURLs that do not start with a slash after the scheme are interpreted as:\n\nscheme:opaque[?query][#fragment]\n\nNote that the Path field is stored in decoded form: /%47%6f%2f becomes /Go/.\nA consequence is that it is impossible to tell which slashes in the Path were\nslashes in the raw URL and which were %2f. This distinction is rarely important,\nbut when it is, the code should use RawPath, an optional field which only gets\nset if the default encoding is different from Path.\n\nURL's String method uses the EscapedPath method to obtain the path. See the\nEscapedPath method for more details.",
|
||||
"properties": {
|
||||
"ForceQuery": {
|
||||
"type": "boolean"
|
||||
@ -2306,9 +2377,9 @@
|
||||
"$ref": "#/definitions/Userinfo"
|
||||
}
|
||||
},
|
||||
"title": "URL is a custom URL type that allows validation at configuration load time.",
|
||||
"title": "A URL represents a parsed URL (technically, a URI reference).",
|
||||
"type": "object",
|
||||
"x-go-package": "github.com/prometheus/common/config"
|
||||
"x-go-package": "net/url"
|
||||
},
|
||||
"Userinfo": {
|
||||
"description": "The Userinfo type is an immutable encapsulation of username and\npassword details for a URL. An existing Userinfo value is guaranteed\nto have a username set (potentially empty, as allowed by RFC 2396),\nand optionally a password.",
|
||||
@ -2475,7 +2546,6 @@
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models"
|
||||
},
|
||||
"alertGroup": {
|
||||
"description": "AlertGroup alert group",
|
||||
"properties": {
|
||||
"alerts": {
|
||||
"description": "alerts",
|
||||
@ -2497,14 +2567,17 @@
|
||||
"labels",
|
||||
"receiver"
|
||||
],
|
||||
"type": "object"
|
||||
"type": "object",
|
||||
"x-go-name": "AlertGroup",
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models"
|
||||
},
|
||||
"alertGroups": {
|
||||
"description": "AlertGroups alert groups",
|
||||
"items": {
|
||||
"$ref": "#/definitions/alertGroup"
|
||||
},
|
||||
"type": "array"
|
||||
"type": "array",
|
||||
"x-go-name": "AlertGroups",
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models"
|
||||
},
|
||||
"alertStatus": {
|
||||
"description": "AlertStatus alert status",
|
||||
@ -2624,6 +2697,7 @@
|
||||
"$ref": "#/definitions/Duration"
|
||||
},
|
||||
"gettableAlert": {
|
||||
"description": "GettableAlert gettable alert",
|
||||
"properties": {
|
||||
"annotations": {
|
||||
"$ref": "#/definitions/labelSet"
|
||||
@ -2682,9 +2756,7 @@
|
||||
"status",
|
||||
"updatedAt"
|
||||
],
|
||||
"type": "object",
|
||||
"x-go-name": "GettableAlert",
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models"
|
||||
"type": "object"
|
||||
},
|
||||
"gettableAlerts": {
|
||||
"items": {
|
||||
@ -2928,6 +3000,7 @@
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models"
|
||||
},
|
||||
"receiver": {
|
||||
"description": "Receiver receiver",
|
||||
"properties": {
|
||||
"name": {
|
||||
"description": "name",
|
||||
@ -2938,9 +3011,7 @@
|
||||
"required": [
|
||||
"name"
|
||||
],
|
||||
"type": "object",
|
||||
"x-go-name": "Receiver",
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models"
|
||||
"type": "object"
|
||||
},
|
||||
"silence": {
|
||||
"description": "Silence silence",
|
||||
@ -3523,6 +3594,49 @@
|
||||
]
|
||||
}
|
||||
},
|
||||
"/api/alertmanager/{Recipient}/config/api/v1/receivers/test": {
|
||||
"post": {
|
||||
"operationId": "RoutePostTestReceivers",
|
||||
"parameters": [
|
||||
{
|
||||
"in": "query",
|
||||
"items": {
|
||||
"$ref": "#/definitions/PostableApiReceiver"
|
||||
},
|
||||
"name": "receivers",
|
||||
"type": "array",
|
||||
"x-go-name": "Receivers"
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Ack",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/Ack"
|
||||
}
|
||||
},
|
||||
"207": {
|
||||
"$ref": "#/responses/MultiStatus"
|
||||
},
|
||||
"400": {
|
||||
"description": "ValidationError",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/ValidationError"
|
||||
}
|
||||
},
|
||||
"408": {
|
||||
"description": "Failure",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/Failure"
|
||||
}
|
||||
}
|
||||
},
|
||||
"summary": "Test Grafana managed receivers without saving them.",
|
||||
"tags": [
|
||||
"alertmanager"
|
||||
]
|
||||
}
|
||||
},
|
||||
"/api/prometheus/{Recipient}/api/v1/alerts": {
|
||||
"get": {
|
||||
"description": "gets the current alerts",
|
||||
|
@ -1750,7 +1750,6 @@
|
||||
"enum": [
|
||||
"Alerting"
|
||||
],
|
||||
"x-go-enum-desc": "Alerting AlertingErrState",
|
||||
"x-go-name": "ExecErrState"
|
||||
},
|
||||
"id": {
|
||||
@ -1779,7 +1778,6 @@
|
||||
"NoData",
|
||||
"OK"
|
||||
],
|
||||
"x-go-enum-desc": "Alerting Alerting\nNoData NoData\nOK OK",
|
||||
"x-go-name": "NoDataState"
|
||||
},
|
||||
"orgId": {
|
||||
@ -2592,7 +2590,6 @@
|
||||
"enum": [
|
||||
"Alerting"
|
||||
],
|
||||
"x-go-enum-desc": "Alerting AlertingErrState",
|
||||
"x-go-name": "ExecErrState"
|
||||
},
|
||||
"no_data_state": {
|
||||
@ -2602,7 +2599,6 @@
|
||||
"NoData",
|
||||
"OK"
|
||||
],
|
||||
"x-go-enum-desc": "Alerting Alerting\nNoData NoData\nOK OK",
|
||||
"x-go-name": "NoDataState"
|
||||
},
|
||||
"title": {
|
||||
@ -3373,8 +3369,9 @@
|
||||
"x-go-package": "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
||||
},
|
||||
"URL": {
|
||||
"description": "The general form represented is:\n\n[scheme:][//[userinfo@]host][/]path[?query][#fragment]\n\nURLs that do not start with a slash after the scheme are interpreted as:\n\nscheme:opaque[?query][#fragment]\n\nNote that the Path field is stored in decoded form: /%47%6f%2f becomes /Go/.\nA consequence is that it is impossible to tell which slashes in the Path were\nslashes in the raw URL and which were %2f. This distinction is rarely important,\nbut when it is, the code should use RawPath, an optional field which only gets\nset if the default encoding is different from Path.\n\nURL's String method uses the EscapedPath method to obtain the path. See the\nEscapedPath method for more details.",
|
||||
"type": "object",
|
||||
"title": "URL is a custom URL type that allows validation at configuration load time.",
|
||||
"title": "A URL represents a parsed URL (technically, a URI reference).",
|
||||
"properties": {
|
||||
"ForceQuery": {
|
||||
"type": "boolean"
|
||||
@ -3407,7 +3404,7 @@
|
||||
"$ref": "#/definitions/Userinfo"
|
||||
}
|
||||
},
|
||||
"x-go-package": "github.com/prometheus/common/config"
|
||||
"x-go-package": "net/url"
|
||||
},
|
||||
"Userinfo": {
|
||||
"description": "The Userinfo type is an immutable encapsulation of username and\npassword details for a URL. An existing Userinfo value is guaranteed\nto have a username set (potentially empty, as allowed by RFC 2396),\nand optionally a password.",
|
||||
@ -3574,7 +3571,6 @@
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models"
|
||||
},
|
||||
"alertGroup": {
|
||||
"description": "AlertGroup alert group",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"alerts",
|
||||
@ -3597,6 +3593,8 @@
|
||||
"$ref": "#/definitions/receiver"
|
||||
}
|
||||
},
|
||||
"x-go-name": "AlertGroup",
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models",
|
||||
"$ref": "#/definitions/alertGroup"
|
||||
},
|
||||
"alertGroups": {
|
||||
@ -3726,6 +3724,7 @@
|
||||
"$ref": "#/definitions/Duration"
|
||||
},
|
||||
"gettableAlert": {
|
||||
"description": "GettableAlert gettable alert",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"labels",
|
||||
@ -3785,19 +3784,19 @@
|
||||
"x-go-name": "UpdatedAt"
|
||||
}
|
||||
},
|
||||
"x-go-name": "GettableAlert",
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models",
|
||||
"$ref": "#/definitions/gettableAlert"
|
||||
},
|
||||
"gettableAlerts": {
|
||||
"description": "GettableAlerts gettable alerts",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/gettableAlert"
|
||||
},
|
||||
"x-go-name": "GettableAlerts",
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models",
|
||||
"$ref": "#/definitions/gettableAlerts"
|
||||
},
|
||||
"gettableSilence": {
|
||||
"description": "GettableSilence gettable silence",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"comment",
|
||||
@ -3850,8 +3849,6 @@
|
||||
"x-go-name": "UpdatedAt"
|
||||
}
|
||||
},
|
||||
"x-go-name": "GettableSilence",
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models",
|
||||
"$ref": "#/definitions/gettableSilence"
|
||||
},
|
||||
"gettableSilences": {
|
||||
@ -3990,7 +3987,6 @@
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models"
|
||||
},
|
||||
"postableSilence": {
|
||||
"description": "PostableSilence postable silence",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"comment",
|
||||
@ -4031,9 +4027,12 @@
|
||||
"x-go-name": "StartsAt"
|
||||
}
|
||||
},
|
||||
"x-go-name": "PostableSilence",
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models",
|
||||
"$ref": "#/definitions/postableSilence"
|
||||
},
|
||||
"receiver": {
|
||||
"description": "Receiver receiver",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"name"
|
||||
@ -4045,8 +4044,6 @@
|
||||
"x-go-name": "Name"
|
||||
}
|
||||
},
|
||||
"x-go-name": "Receiver",
|
||||
"x-go-package": "github.com/prometheus/alertmanager/api/v2/models",
|
||||
"$ref": "#/definitions/receiver"
|
||||
},
|
||||
"silence": {
|
||||
|
@@ -4,6 +4,7 @@ import (
	"fmt"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/prometheus/alertmanager/api/metrics"
@@ -27,9 +28,9 @@ var GlobalMetrics = NewMetrics(prometheus.DefaultRegisterer)

type Metrics struct {
	*metrics.Alerts
	AlertState *prometheus.GaugeVec
	// Registerer is for use by subcomponents which register their own metrics.
	Registerer           prometheus.Registerer
	AlertState           *prometheus.GaugeVec
	RequestDuration      *prometheus.HistogramVec
	ActiveConfigurations prometheus.Gauge
	EvalTotal            *prometheus.CounterVec
@@ -55,14 +56,14 @@ func (m *Metrics) SwapRegisterer(r prometheus.Registerer) {

func NewMetrics(r prometheus.Registerer) *Metrics {
	return &Metrics{
		Alerts: metrics.NewAlerts("v2", r),
		Registerer: r,
		Alerts:     metrics.NewAlerts("v2", r),
		AlertState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
			Namespace: "grafana",
			Subsystem: "alerting",
			Name:      "alerts",
			Help:      "How many alerts by state.",
		}, []string{"state"}),
		Registerer: r,
		RequestDuration: promauto.With(r).NewHistogramVec(
			prometheus.HistogramOpts{
				Namespace: "grafana",
@@ -125,6 +126,37 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
	}
}

// multi-thread safety and stable ordering of prometheus registries.
type OrgRegistries struct {
	regsMu sync.Mutex
	regs   map[int64]prometheus.Registerer
}

func NewOrgRegistries() *OrgRegistries {
	return &OrgRegistries{
		regs: make(map[int64]prometheus.Registerer),
	}
}

func (m *OrgRegistries) GetOrCreateOrgRegistry(orgID int64) prometheus.Registerer {
	m.regsMu.Lock()
	defer m.regsMu.Unlock()

	orgRegistry, ok := m.regs[orgID]
	if !ok {
		reg := prometheus.NewRegistry()
		m.regs[orgID] = reg
		return reg
	}
	return orgRegistry
}

func (m *OrgRegistries) RemoveOrgRegistry(org int64) {
	m.regsMu.Lock()
	defer m.regsMu.Unlock()
	delete(m.regs, org)
}

// Instrument wraps a middleware, instrumenting the request latencies.
func Instrument(
	method,
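The wrapper gives every organization its own Prometheus registry so identical metric names from per-org Alertmanagers never collide. A hedged sketch of that idea using only the Prometheus client library; the real code instead wires GetOrCreateOrgRegistry (above) together with NewMetrics when an org's Alertmanager is created, and the counter name and org IDs below are illustrative:

// Per-org metric isolation sketch, assuming only github.com/prometheus/client_golang.
package main

import (
	"fmt"
	"sync"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

type orgRegistries struct {
	mu   sync.Mutex
	regs map[int64]*prometheus.Registry
}

func (o *orgRegistries) getOrCreate(orgID int64) *prometheus.Registry {
	o.mu.Lock()
	defer o.mu.Unlock()
	if reg, ok := o.regs[orgID]; ok {
		return reg
	}
	reg := prometheus.NewRegistry()
	o.regs[orgID] = reg
	return reg
}

func main() {
	registries := &orgRegistries{regs: map[int64]*prometheus.Registry{}}

	// Each org gets its own registry, so the same metric name can be registered twice.
	for _, orgID := range []int64{1, 2} {
		reg := registries.getOrCreate(orgID)
		c := promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Namespace: "grafana",
			Subsystem: "alerting",
			Name:      "example_total",
			Help:      "Illustrative per-org counter.",
		})
		c.Add(float64(orgID))
	}

	// Gather per org; a real exporter would merge these under an "org" label.
	for _, orgID := range []int64{1, 2} {
		mfs, err := registries.getOrCreate(orgID).Gather()
		fmt.Println(orgID, len(mfs), err)
	}
}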
@@ -55,7 +55,7 @@ type AlertNG struct {
	stateManager *state.Manager

	// Alerting notification services
	Alertmanager *notifier.Alertmanager
	MultiOrgAlertmanager *notifier.MultiOrgAlertmanager
}

func init() {
@@ -74,9 +74,10 @@ func (ng *AlertNG) Init() error {
		Logger: ng.Log,
	}

	var err error
	ng.Alertmanager, err = notifier.New(ng.Cfg, store, ng.Metrics)
	if err != nil {
	ng.MultiOrgAlertmanager = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store)

	// Let's make sure we're able to complete an initial sync of Alertmanagers before we start the alerting components.
	if err := ng.MultiOrgAlertmanager.LoadAndSyncAlertmanagersForOrgs(context.Background()); err != nil {
		return err
	}

@@ -89,7 +90,8 @@ func (ng *AlertNG) Init() error {
		InstanceStore:           store,
		RuleStore:               store,
		AdminConfigStore:        store,
		Notifier:                ng.Alertmanager,
		OrgStore:                store,
		MultiOrgNotifier:        ng.MultiOrgAlertmanager,
		Metrics:                 ng.Metrics,
		AdminConfigPollInterval: ng.Cfg.AdminConfigPollInterval,
	}
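The ordering above matters: Init performs one blocking sync of all org Alertmanagers, and only then are the scheduler and API wired up, with the periodic re-sync running later from the service's Run method (via an errgroup, as elsewhere in this file). A self-contained illustration of that startup ordering; the sync and loop functions are placeholders, not the Grafana APIs:

// Startup-ordering sketch: sync once up front, then poll in the background.
package main

import (
	"context"
	"fmt"
	"time"

	"golang.org/x/sync/errgroup"
)

func initialSync(ctx context.Context) error {
	fmt.Println("initial sync of Alertmanagers for all orgs")
	return nil
}

func runLoop(ctx context.Context) error {
	for {
		select {
		case <-ctx.Done():
			return nil
		case <-time.After(20 * time.Millisecond):
			fmt.Println("periodic re-sync")
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 70*time.Millisecond)
	defer cancel()

	// Fail fast if the first sync cannot complete; only then start the background loop.
	if err := initialSync(ctx); err != nil {
		panic(err)
	}

	children, subCtx := errgroup.WithContext(ctx)
	children.Go(func() error { return runLoop(subCtx) })
	if err := children.Wait(); err != nil {
		fmt.Println("run error:", err)
	}
}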
@ -98,19 +100,19 @@ func (ng *AlertNG) Init() error {
|
||||
ng.schedule = schedule.NewScheduler(schedCfg, ng.DataService, ng.Cfg.AppURL, ng.stateManager)
|
||||
|
||||
api := api.API{
|
||||
Cfg: ng.Cfg,
|
||||
DatasourceCache: ng.DatasourceCache,
|
||||
RouteRegister: ng.RouteRegister,
|
||||
DataService: ng.DataService,
|
||||
Schedule: ng.schedule,
|
||||
DataProxy: ng.DataProxy,
|
||||
QuotaService: ng.QuotaService,
|
||||
InstanceStore: store,
|
||||
RuleStore: store,
|
||||
AlertingStore: store,
|
||||
AdminConfigStore: store,
|
||||
Alertmanager: ng.Alertmanager,
|
||||
StateManager: ng.stateManager,
|
||||
Cfg: ng.Cfg,
|
||||
DatasourceCache: ng.DatasourceCache,
|
||||
RouteRegister: ng.RouteRegister,
|
||||
DataService: ng.DataService,
|
||||
Schedule: ng.schedule,
|
||||
DataProxy: ng.DataProxy,
|
||||
QuotaService: ng.QuotaService,
|
||||
InstanceStore: store,
|
||||
RuleStore: store,
|
||||
AlertingStore: store,
|
||||
AdminConfigStore: store,
|
||||
MultiOrgAlertmanager: ng.MultiOrgAlertmanager,
|
||||
StateManager: ng.stateManager,
|
||||
}
|
||||
api.RegisterAPIEndpoints(ng.Metrics)
|
||||
|
||||
@ -127,7 +129,7 @@ func (ng *AlertNG) Run(ctx context.Context) error {
|
||||
return ng.schedule.Run(subCtx)
|
||||
})
|
||||
children.Go(func() error {
|
||||
return ng.Alertmanager.Run(subCtx)
|
||||
return ng.MultiOrgAlertmanager.Run(subCtx)
|
||||
})
|
||||
return children.Wait()
|
||||
}
|
||||
|
@ -34,13 +34,11 @@ import (
|
||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/notifier/channels"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/store"
|
||||
"github.com/grafana/grafana/pkg/services/sqlstore"
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
)
|
||||
|
||||
const (
|
||||
pollInterval = 1 * time.Minute
|
||||
workingDir = "alerting"
|
||||
workingDir = "alerting"
|
||||
// How long should we keep silences and notification entries on-disk after they've served their purpose.
|
||||
retentionNotificationsAndSilences = 5 * 24 * time.Hour
|
||||
// maintenanceNotificationAndSilences how often should we flush and gargabe collect notifications and silences
|
||||
@ -73,18 +71,15 @@ const (
|
||||
}
|
||||
}
|
||||
`
|
||||
//TODO: temporary until fix org isolation
|
||||
mainOrgID = 1
|
||||
)
|
||||
|
||||
type Alertmanager struct {
|
||||
logger log.Logger
|
||||
gokitLogger gokit_log.Logger
|
||||
|
||||
Settings *setting.Cfg `inject:""`
|
||||
SQLStore *sqlstore.SQLStore `inject:""`
|
||||
Settings *setting.Cfg
|
||||
Store store.AlertingStore
|
||||
Metrics *metrics.Metrics `inject:""`
|
||||
Metrics *metrics.Metrics
|
||||
|
||||
notificationLog *nflog.Log
|
||||
marker types.Marker
|
||||
@ -108,18 +103,20 @@ type Alertmanager struct {
|
||||
reloadConfigMtx sync.RWMutex
|
||||
config *apimodels.PostableUserConfig
|
||||
configHash [16]byte
|
||||
orgID int64
|
||||
}
|
||||
|
||||
func New(cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Alertmanager, error) {
|
||||
func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Alertmanager, error) {
|
||||
am := &Alertmanager{
|
||||
Settings: cfg,
|
||||
stopc: make(chan struct{}),
|
||||
logger: log.New("alertmanager"),
|
||||
logger: log.New("alertmanager", "org", orgID),
|
||||
marker: types.NewMarker(m.Registerer),
|
||||
stageMetrics: notify.NewMetrics(m.Registerer),
|
||||
dispatcherMetrics: dispatch.NewDispatcherMetrics(m.Registerer),
|
||||
Store: store,
|
||||
Metrics: m,
|
||||
orgID: orgID,
|
||||
}
|
||||
|
||||
am.gokitLogger = gokit_log.NewLogfmtLogger(logging.NewWrapper(am.logger))
|
||||
@ -174,25 +171,7 @@ func (am *Alertmanager) ready() bool {
|
||||
return am.config != nil
|
||||
}
|
||||
|
||||
func (am *Alertmanager) Run(ctx context.Context) error {
|
||||
// Make sure dispatcher starts. We can tolerate future reload failures.
|
||||
if err := am.SyncAndApplyConfigFromDatabase(mainOrgID); err != nil {
|
||||
am.logger.Error("unable to sync configuration", "err", err)
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return am.StopAndWait()
|
||||
case <-time.After(pollInterval):
|
||||
if err := am.SyncAndApplyConfigFromDatabase(mainOrgID); err != nil {
|
||||
am.logger.Error("unable to sync configuration", "err", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (am *Alertmanager) StopAndWait() error {
|
||||
func (am *Alertmanager) StopAndWait() {
|
||||
if am.dispatcher != nil {
|
||||
am.dispatcher.Stop()
|
||||
}
|
||||
@ -206,12 +185,11 @@ func (am *Alertmanager) StopAndWait() error {
|
||||
close(am.stopc)
|
||||
|
||||
am.wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
// SaveAndApplyDefaultConfig saves the default configuration the database and applies the configuration to the Alertmanager.
|
||||
// It rollbacks the save if we fail to apply the configuration.
|
||||
func (am *Alertmanager) SaveAndApplyDefaultConfig(orgID int64) error {
|
||||
func (am *Alertmanager) SaveAndApplyDefaultConfig() error {
|
||||
am.reloadConfigMtx.Lock()
|
||||
defer am.reloadConfigMtx.Unlock()
|
||||
|
||||
@ -219,7 +197,7 @@ func (am *Alertmanager) SaveAndApplyDefaultConfig(orgID int64) error {
|
||||
AlertmanagerConfiguration: alertmanagerDefaultConfiguration,
|
||||
Default: true,
|
||||
ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion),
|
||||
OrgID: orgID,
|
||||
OrgID: am.orgID,
|
||||
}
|
||||
|
||||
cfg, err := Load([]byte(alertmanagerDefaultConfiguration))
|
||||
@ -243,7 +221,7 @@ func (am *Alertmanager) SaveAndApplyDefaultConfig(orgID int64) error {
|
||||
|
||||
// SaveAndApplyConfig saves the configuration the database and applies the configuration to the Alertmanager.
|
||||
// It rollbacks the save if we fail to apply the configuration.
|
||||
func (am *Alertmanager) SaveAndApplyConfig(orgID int64, cfg *apimodels.PostableUserConfig) error {
|
||||
func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) error {
|
||||
rawConfig, err := json.Marshal(&cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to serialize to the Alertmanager configuration: %w", err)
|
||||
@ -255,7 +233,7 @@ func (am *Alertmanager) SaveAndApplyConfig(orgID int64, cfg *apimodels.PostableU
|
||||
cmd := &ngmodels.SaveAlertmanagerConfigurationCmd{
|
||||
AlertmanagerConfiguration: string(rawConfig),
|
||||
ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion),
|
||||
OrgID: orgID,
|
||||
OrgID: am.orgID,
|
||||
}
|
||||
|
||||
err = am.Store.SaveAlertmanagerConfigurationWithCallback(cmd, func() error {
|
||||
@ -274,12 +252,12 @@ func (am *Alertmanager) SaveAndApplyConfig(orgID int64, cfg *apimodels.PostableU
|
||||
|
||||
// SyncAndApplyConfigFromDatabase picks the latest config from database and restarts
|
||||
// the components with the new config.
|
||||
func (am *Alertmanager) SyncAndApplyConfigFromDatabase(orgID int64) error {
|
||||
func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error {
|
||||
am.reloadConfigMtx.Lock()
|
||||
defer am.reloadConfigMtx.Unlock()
|
||||
|
||||
// First, let's get the configuration we need from the database.
|
||||
q := &ngmodels.GetLatestAlertmanagerConfigurationQuery{OrgID: mainOrgID}
|
||||
q := &ngmodels.GetLatestAlertmanagerConfigurationQuery{OrgID: am.orgID}
|
||||
if err := am.Store.GetLatestAlertmanagerConfiguration(q); err != nil {
|
||||
// If there's no configuration in the database, let's use the default configuration.
|
||||
if errors.Is(err, store.ErrNoAlertmanagerConfiguration) {
|
||||
@ -289,7 +267,7 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase(orgID int64) error {
|
||||
AlertmanagerConfiguration: alertmanagerDefaultConfiguration,
|
||||
Default: true,
|
||||
ConfigurationVersion: fmt.Sprintf("v%d", ngmodels.AlertConfigurationVersion),
|
||||
OrgID: orgID,
|
||||
OrgID: am.orgID,
|
||||
}
|
||||
if err := am.Store.SaveAlertmanagerConfiguration(savecmd); err != nil {
|
||||
return err
|
||||
@ -389,7 +367,7 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig
|
||||
// Finally, build the integrations map using the receiver configuration and templates.
|
||||
integrationsMap, err := am.buildIntegrationsMap(cfg.AlertmanagerConfig.Receivers, tmpl)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("failed to build integration map: %w", err)
|
||||
}
|
||||
// Now, let's put together our notification pipeline
|
||||
routingStage := make(notify.RoutingStage, len(integrationsMap))
|
||||
@ -433,7 +411,7 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig
|
||||
}
|
||||
|
||||
func (am *Alertmanager) WorkingDirPath() string {
|
||||
return filepath.Join(am.Settings.DataPath, workingDir, strconv.Itoa(mainOrgID))
|
||||
return filepath.Join(am.Settings.DataPath, workingDir, strconv.Itoa(int(am.orgID)))
|
||||
}
|
||||
|
||||
// buildIntegrationsMap builds a map of name to the list of Grafana integration notifiers off of a list of receiver config.
|
||||
|
@ -47,14 +47,14 @@ func setupAMTest(t *testing.T) *Alertmanager {
|
||||
Logger: log.New("alertmanager-test"),
|
||||
}
|
||||
|
||||
am, err := New(cfg, store, m)
|
||||
am, err := newAlertmanager(1, cfg, store, m)
|
||||
require.NoError(t, err)
|
||||
return am
|
||||
}
|
||||
|
||||
func TestAlertmanager_ShouldUseDefaultConfigurationWhenNoConfiguration(t *testing.T) {
|
||||
am := setupAMTest(t)
|
||||
require.NoError(t, am.SyncAndApplyConfigFromDatabase(mainOrgID))
|
||||
require.NoError(t, am.SyncAndApplyConfigFromDatabase())
|
||||
require.NotNil(t, am.config)
|
||||
}
|
||||
|
||||
|
pkg/services/ngalert/notifier/multiorg_alertmanager.go (new file, 147 lines)
@@ -0,0 +1,147 @@
package notifier

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/grafana/grafana/pkg/infra/log"
	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
	"github.com/grafana/grafana/pkg/services/ngalert/store"
	"github.com/grafana/grafana/pkg/setting"
)

var (
	SyncOrgsPollInterval = 1 * time.Minute
)

var (
	ErrNoAlertmanagerForOrg = fmt.Errorf("Alertmanager does not exist for this organization")
	ErrAlertmanagerNotReady = fmt.Errorf("Alertmanager is not ready yet")
)

type MultiOrgAlertmanager struct {
	alertmanagersMtx sync.RWMutex
	alertmanagers    map[int64]*Alertmanager

	settings *setting.Cfg
	logger   log.Logger

	configStore store.AlertingStore
	orgStore    store.OrgStore

	orgRegistry *metrics.OrgRegistries
}

func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore) *MultiOrgAlertmanager {
	return &MultiOrgAlertmanager{
		settings:      cfg,
		logger:        log.New("multiorg.alertmanager"),
		alertmanagers: map[int64]*Alertmanager{},
		configStore:   configStore,
		orgStore:      orgStore,
		orgRegistry:   metrics.NewOrgRegistries(),
	}
}

func (moa *MultiOrgAlertmanager) Run(ctx context.Context) error {
	moa.logger.Info("starting MultiOrg Alertmanager")

	for {
		select {
		case <-ctx.Done():
			moa.StopAndWait()
			return nil
		case <-time.After(SyncOrgsPollInterval):
			if err := moa.LoadAndSyncAlertmanagersForOrgs(ctx); err != nil {
				moa.logger.Error("error while synchronizing Alertmanager orgs", "err", err)
			}
		}
	}
}

func (moa *MultiOrgAlertmanager) LoadAndSyncAlertmanagersForOrgs(ctx context.Context) error {
	moa.logger.Debug("synchronizing Alertmanagers for orgs")
	// First, load all the organizations from the database.
	orgIDs, err := moa.orgStore.GetOrgs(ctx)
	if err != nil {
		return err
	}

	// Then, sync them by creating or deleting Alertmanagers as necessary.
	moa.SyncAlertmanagersForOrgs(orgIDs)

	moa.logger.Debug("done synchronizing Alertmanagers for orgs")

	return nil
}

func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(orgIDs []int64) {
	orgsFound := make(map[int64]struct{}, len(orgIDs))
	moa.alertmanagersMtx.Lock()
	for _, orgID := range orgIDs {
		orgsFound[orgID] = struct{}{}

		existing, found := moa.alertmanagers[orgID]
		if !found {
			reg := moa.orgRegistry.GetOrCreateOrgRegistry(orgID)
			am, err := newAlertmanager(orgID, moa.settings, moa.configStore, metrics.NewMetrics(reg))
			if err != nil {
				moa.logger.Error("unable to create Alertmanager for org", "org", orgID, "err", err)
			}
			moa.alertmanagers[orgID] = am
			existing = am
		}

		//TODO: This will create an N+1 query
		if err := existing.SyncAndApplyConfigFromDatabase(); err != nil {
			moa.logger.Error("failed to apply Alertmanager config for org", "org", orgID, "err", err)
		}
	}

	amsToStop := map[int64]*Alertmanager{}
	for orgId, am := range moa.alertmanagers {
		if _, exists := orgsFound[orgId]; !exists {
			amsToStop[orgId] = am
			delete(moa.alertmanagers, orgId)
			moa.orgRegistry.RemoveOrgRegistry(orgId)
		}
	}
	moa.alertmanagersMtx.Unlock()

	// Now, we can stop the Alertmanagers without having to hold a lock.
	for orgID, am := range amsToStop {
		moa.logger.Info("stopping Alertmanager", "org", orgID)
		am.StopAndWait()
		moa.logger.Info("stopped Alertmanager", "org", orgID)
	}
}

func (moa *MultiOrgAlertmanager) StopAndWait() {
	moa.alertmanagersMtx.Lock()
	defer moa.alertmanagersMtx.Unlock()

	for _, am := range moa.alertmanagers {
		am.StopAndWait()
	}
}

// AlertmanagerFor returns the Alertmanager instance for the organization provided.
// When the organization does not have an active Alertmanager, it returns a ErrNoAlertmanagerForOrg.
// When the Alertmanager of the organization is not ready, it returns a ErrAlertmanagerNotReady.
func (moa *MultiOrgAlertmanager) AlertmanagerFor(orgID int64) (*Alertmanager, error) {
	moa.alertmanagersMtx.RLock()
	defer moa.alertmanagersMtx.RUnlock()

	orgAM, existing := moa.alertmanagers[orgID]
	if !existing {
		return nil, ErrNoAlertmanagerForOrg
	}

	if !orgAM.Ready() {
		return nil, ErrAlertmanagerNotReady
	}

	return orgAM, nil
}
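The scheduler becomes a consumer of this wrapper: as the pkg/services/ngalert/schedule hunk further down shows, it resolves the rule's org Alertmanager via AlertmanagerFor and only then calls PutAlerts, dropping the alerts with an error log if no local notifier exists for that org. A hedged, self-contained sketch of that delivery path; the lookup, notifier and alert types here are simplified stand-ins, not the Grafana types:

// Per-org alert delivery sketch; illustrative only.
package main

import (
	"errors"
	"fmt"
)

type notifierForOrg interface {
	PutAlerts(alerts []string) error
}

type fakeNotifier struct{ org int64 }

func (f *fakeNotifier) PutAlerts(alerts []string) error {
	fmt.Printf("org %d received %d alerts\n", f.org, len(alerts))
	return nil
}

var errNoAlertmanagerForOrg = errors.New("Alertmanager does not exist for this organization")

func alertmanagerFor(orgID int64) (notifierForOrg, error) {
	if orgID == 2 {
		return &fakeNotifier{org: orgID}, nil
	}
	return nil, errNoAlertmanagerForOrg
}

func deliver(orgID int64, alerts []string) {
	n, err := alertmanagerFor(orgID)
	if err != nil {
		// Mirrors the scheduler: log and drop, alerts are never delivered cross-org.
		fmt.Printf("unable to lookup local notifier for org %d: %v\n", orgID, err)
		return
	}
	if err := n.PutAlerts(alerts); err != nil {
		fmt.Printf("failed to put alerts for org %d: %v\n", orgID, err)
	}
}

func main() {
	deliver(2, []string{"HighCPU", "DiskFull"}) // delivered to org 2's Alertmanager
	deliver(7, []string{"HighCPU"})             // no Alertmanager for org 7, dropped
}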
pkg/services/ngalert/notifier/multiorg_alertmanager_test.go (new file, 92 lines)
@ -0,0 +1,92 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {
|
||||
configStore := &FakeConfigStore{
|
||||
configs: map[int64]*models.AlertConfiguration{},
|
||||
}
|
||||
orgStore := &FakeOrgStore{
|
||||
orgs: []int64{1, 2, 3},
|
||||
}
|
||||
SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
|
||||
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore)
|
||||
ctx := context.Background()
|
||||
|
||||
// Ensure that one Alertmanager is created per org.
|
||||
{
|
||||
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
||||
require.Len(t, mam.alertmanagers, 3)
|
||||
}
|
||||
// When an org is removed, it should detect it.
|
||||
{
|
||||
orgStore.orgs = []int64{1, 3}
|
||||
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
||||
require.Len(t, mam.alertmanagers, 2)
|
||||
}
|
||||
// if the org comes back, it should detect it.
|
||||
{
|
||||
orgStore.orgs = []int64{1, 2, 3, 4}
|
||||
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
||||
require.Len(t, mam.alertmanagers, 4)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMultiOrgAlertmanager_AlertmanagerFor(t *testing.T) {
|
||||
configStore := &FakeConfigStore{
|
||||
configs: map[int64]*models.AlertConfiguration{},
|
||||
}
|
||||
orgStore := &FakeOrgStore{
|
||||
orgs: []int64{1, 2, 3},
|
||||
}
|
||||
|
||||
SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
|
||||
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore)
|
||||
ctx := context.Background()
|
||||
|
||||
// Ensure that one Alertmanagers is created per org.
|
||||
{
|
||||
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
||||
require.Len(t, mam.alertmanagers, 3)
|
||||
}
|
||||
|
||||
// First, let's try to request an Alertmanager from an org that doesn't exist.
|
||||
{
|
||||
_, err := mam.AlertmanagerFor(5)
|
||||
require.EqualError(t, err, ErrNoAlertmanagerForOrg.Error())
|
||||
}
|
||||
|
||||
// Now, let's try to request an Alertmanager that is not ready.
|
||||
{
|
||||
// let's delete its "running config" to make it non-ready
|
||||
mam.alertmanagers[1].config = nil
|
||||
_, err := mam.AlertmanagerFor(1)
|
||||
require.EqualError(t, err, ErrAlertmanagerNotReady.Error())
|
||||
}
|
||||
|
||||
// With an Alertmanager that exists, it responds correctly.
|
||||
{
|
||||
am, err := mam.AlertmanagerFor(2)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, *am.GetStatus().VersionInfo.Version, "N/A")
|
||||
require.Equal(t, am.orgID, int64(2))
|
||||
require.NotNil(t, am.config)
|
||||
}
|
||||
|
||||
// Let's now remove the previous queried organization.
|
||||
orgStore.orgs = []int64{1, 3}
|
||||
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
||||
{
|
||||
_, err := mam.AlertmanagerFor(2)
|
||||
require.EqualError(t, err, ErrNoAlertmanagerForOrg.Error())
|
||||
}
|
||||
}
|
@ -1 +0,0 @@
|
||||
package notifier
|
pkg/services/ngalert/notifier/testing.go (new file, 56 lines)
@ -0,0 +1,56 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/store"
|
||||
)
|
||||
|
||||
type FakeConfigStore struct {
|
||||
configs map[int64]*models.AlertConfiguration
|
||||
}
|
||||
|
||||
func (f *FakeConfigStore) GetLatestAlertmanagerConfiguration(query *models.GetLatestAlertmanagerConfigurationQuery) error {
|
||||
var ok bool
|
||||
query.Result, ok = f.configs[query.OrgID]
|
||||
if !ok {
|
||||
return store.ErrNoAlertmanagerConfiguration
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *FakeConfigStore) SaveAlertmanagerConfiguration(cmd *models.SaveAlertmanagerConfigurationCmd) error {
|
||||
f.configs[cmd.OrgID] = &models.AlertConfiguration{
|
||||
AlertmanagerConfiguration: cmd.AlertmanagerConfiguration,
|
||||
OrgID: cmd.OrgID,
|
||||
ConfigurationVersion: "v1",
|
||||
Default: cmd.Default,
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *FakeConfigStore) SaveAlertmanagerConfigurationWithCallback(cmd *models.SaveAlertmanagerConfigurationCmd, callback store.SaveCallback) error {
|
||||
f.configs[cmd.OrgID] = &models.AlertConfiguration{
|
||||
AlertmanagerConfiguration: cmd.AlertmanagerConfiguration,
|
||||
OrgID: cmd.OrgID,
|
||||
ConfigurationVersion: "v1",
|
||||
Default: cmd.Default,
|
||||
}
|
||||
|
||||
if err := callback(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
type FakeOrgStore struct {
|
||||
orgs []int64
|
||||
}
|
||||
|
||||
func (f *FakeOrgStore) GetOrgs(_ context.Context) ([]int64, error) {
|
||||
return f.orgs, nil
|
||||
}
|
@ -9,10 +9,10 @@ import (
|
||||
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/services/alerting"
|
||||
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/notifier"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/sender"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/state"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/store"
|
||||
@ -39,11 +39,6 @@ type ScheduleService interface {
|
||||
overrideCfg(cfg SchedulerCfg)
|
||||
}
|
||||
|
||||
// Notifier handles the delivery of alert notifications to the end user
|
||||
type Notifier interface {
|
||||
PutAlerts(alerts apimodels.PostableAlerts) error
|
||||
}
|
||||
|
||||
type schedule struct {
|
||||
// base tick rate (fastest possible configured check)
|
||||
baseInterval time.Duration
|
||||
@ -74,14 +69,15 @@ type schedule struct {
|
||||
ruleStore store.RuleStore
|
||||
instanceStore store.InstanceStore
|
||||
adminConfigStore store.AdminConfigurationStore
|
||||
orgStore store.OrgStore
|
||||
dataService *tsdb.Service
|
||||
|
||||
stateManager *state.Manager
|
||||
|
||||
appURL string
|
||||
|
||||
notifier Notifier
|
||||
metrics *metrics.Metrics
|
||||
multiOrgNotifier *notifier.MultiOrgAlertmanager
|
||||
metrics *metrics.Metrics
|
||||
|
||||
// Senders help us send alerts to external Alertmanagers.
|
||||
sendersMtx sync.RWMutex
|
||||
@ -100,9 +96,10 @@ type SchedulerCfg struct {
|
||||
StopAppliedFunc func(models.AlertRuleKey)
|
||||
Evaluator eval.Evaluator
|
||||
RuleStore store.RuleStore
|
||||
OrgStore store.OrgStore
|
||||
InstanceStore store.InstanceStore
|
||||
AdminConfigStore store.AdminConfigurationStore
|
||||
Notifier Notifier
|
||||
MultiOrgNotifier *notifier.MultiOrgAlertmanager
|
||||
Metrics *metrics.Metrics
|
||||
AdminConfigPollInterval time.Duration
|
||||
}
|
||||
@ -122,9 +119,10 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service, appURL string, st
|
||||
evaluator: cfg.Evaluator,
|
||||
ruleStore: cfg.RuleStore,
|
||||
instanceStore: cfg.InstanceStore,
|
||||
orgStore: cfg.OrgStore,
|
||||
dataService: dataService,
|
||||
adminConfigStore: cfg.AdminConfigStore,
|
||||
notifier: cfg.Notifier,
|
||||
multiOrgNotifier: cfg.MultiOrgNotifier,
|
||||
metrics: cfg.Metrics,
|
||||
appURL: appURL,
|
||||
stateManager: stateManager,
|
||||
@ -454,10 +452,15 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
|
||||
processedStates := sch.stateManager.ProcessEvalResults(alertRule, results)
|
||||
sch.saveAlertStates(processedStates)
|
||||
alerts := FromAlertStateToPostableAlerts(sch.log, processedStates, sch.stateManager, sch.appURL)
|
||||
sch.log.Debug("sending alerts to notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts)
|
||||
err = sch.notifier.PutAlerts(alerts)
|
||||
if err != nil {
|
||||
sch.log.Error("failed to put alerts in the notifier", "count", len(alerts.PostableAlerts), "err", err)
|
||||
|
||||
sch.log.Debug("sending alerts to notifier", "count", len(alerts.PostableAlerts), "alerts", alerts.PostableAlerts, "org", alertRule.OrgID)
|
||||
n, err := sch.multiOrgNotifier.AlertmanagerFor(alertRule.OrgID)
|
||||
if err == nil {
|
||||
if err := n.PutAlerts(alerts); err != nil {
|
||||
sch.log.Error("failed to put alerts in the notifier", "count", len(alerts.PostableAlerts), "err", err)
|
||||
}
|
||||
} else {
|
||||
sch.log.Error("unable to lookup local notifier for this org - alerts not delivered", "org", alertRule.OrgID, "count", len(alerts.PostableAlerts), "err", err)
|
||||
}
|
||||
|
||||
// Send alerts to external Alertmanager(s) if we have a sender for this organization.
|
||||
|
@ -14,6 +14,7 @@ import (
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/notifier"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/state"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/store"
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
@ -243,7 +244,7 @@ func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, ac
|
||||
RuleStore: rs,
|
||||
InstanceStore: is,
|
||||
AdminConfigStore: acs,
|
||||
Notifier: &fakeNotifier{},
|
||||
MultiOrgNotifier: notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, ¬ifier.FakeConfigStore{}, ¬ifier.FakeOrgStore{}),
|
||||
Logger: logger,
|
||||
Metrics: metrics.NewMetrics(prometheus.NewRegistry()),
|
||||
AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
|
||||
|
@ -10,7 +10,6 @@ import (
|
||||
"time"
|
||||
|
||||
models2 "github.com/grafana/grafana/pkg/models"
|
||||
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/store"
|
||||
"github.com/grafana/grafana/pkg/util"
|
||||
@ -230,13 +229,6 @@ func (f *fakeAdminConfigStore) UpdateAdminConfiguration(cmd store.UpdateAdminCon
|
||||
return nil
|
||||
}
|
||||
|
||||
// fakeNotifier represents a fake internal Alertmanager.
|
||||
type fakeNotifier struct{}
|
||||
|
||||
func (n *fakeNotifier) PutAlerts(alerts apimodels.PostableAlerts) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
type FakeExternalAlertmanager struct {
|
||||
t *testing.T
|
||||
mtx sync.Mutex
|
||||
|
pkg/services/ngalert/store/org.go (new file, 26 lines)
@@ -0,0 +1,26 @@
package store

import (
	"context"

	"github.com/grafana/grafana/pkg/services/sqlstore"
)

type OrgStore interface {
	GetOrgs(ctx context.Context) ([]int64, error)
}

func (st DBstore) GetOrgs(ctx context.Context) ([]int64, error) {
	orgs := make([]int64, 0)
	err := st.SQLStore.WithDbSession(ctx, func(sess *sqlstore.DBSession) error {
		q := "SELECT id FROM org"
		if err := sess.SQL(q).Find(&orgs); err != nil {
			return err
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	return orgs, nil
}
@ -6,6 +6,9 @@ import (
|
||||
"net/http"
|
||||
"regexp"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/notifier"
|
||||
|
||||
"github.com/grafana/grafana/pkg/bus"
|
||||
"github.com/grafana/grafana/pkg/models"
|
||||
@ -16,6 +19,13 @@ import (
|
||||
)
|
||||
|
||||
func TestAlertmanagerConfigurationIsTransactional(t *testing.T) {
|
||||
// TODO: We need a reliable way to ensure Alertmanagers have synced correctly.
|
||||
// For now, make them sync quicker.
|
||||
p := notifier.SyncOrgsPollInterval
|
||||
notifier.SyncOrgsPollInterval = 2 * time.Second
|
||||
t.Cleanup(func() {
|
||||
notifier.SyncOrgsPollInterval = p
|
||||
})
|
||||
dir, path := testinfra.CreateGrafDir(t, testinfra.GrafanaOpts{
|
||||
EnableFeatureToggles: []string{"ngalert"},
|
||||
DisableAnonymous: true,
|
||||
@ -85,7 +95,7 @@ func TestAlertmanagerConfigurationIsTransactional(t *testing.T) {
|
||||
}
|
||||
`
|
||||
resp := postRequest(t, alertConfigURL, payload, http.StatusBadRequest) // nolint
|
||||
require.JSONEq(t, `{"message":"failed to save and apply Alertmanager configuration: the receiver is invalid: failed to validate receiver \"slack.receiver\" of type \"slack\": token must be specified when using the Slack chat API"}`, getBody(t, resp.Body))
|
||||
require.JSONEq(t, `{"message":"failed to save and apply Alertmanager configuration: failed to build integration map: the receiver is invalid: failed to validate receiver \"slack.receiver\" of type \"slack\": token must be specified when using the Slack chat API"}`, getBody(t, resp.Body))
|
||||
|
||||
resp = getRequest(t, alertConfigURL, http.StatusOK) // nolint
|
||||
require.JSONEq(t, defaultAlertmanagerConfigJSON, getBody(t, resp.Body))
|
||||
@ -94,6 +104,13 @@ func TestAlertmanagerConfigurationIsTransactional(t *testing.T) {
|
||||
// editor42 from organisation 42 posts configuration
|
||||
alertConfigURL = fmt.Sprintf("http://editor-42:editor-42@%s/api/alertmanager/grafana/config/api/v1/alerts", grafanaListedAddr)
|
||||
|
||||
// Before we start operating, make sure we've synced this org.
|
||||
require.Eventually(t, func() bool {
|
||||
resp, err := http.Get(alertConfigURL) // nolint
|
||||
require.NoError(t, err)
|
||||
return resp.StatusCode == http.StatusOK
|
||||
}, 10*time.Second, 2*time.Second)
|
||||
|
||||
// Post the alertmanager config.
|
||||
{
|
||||
mockChannel := newMockNotificationChannel(t, grafanaListedAddr)
|
||||
|