Alerting: Persist notification log and silences to the database (#39005)

* Alerting: Persist notification log and silences to the database

This removes the dependency on a persistent disk to run Grafana alerting. Instead of regularly flushing the notification log and silences to disk, we now flush the binary content of those files to the database, encoded as a base64 string.
This commit is contained in:
gotjosh 2021-09-09 17:25:22 +01:00 committed by GitHub
parent 3ee861f57e
commit 39a3bb8a1c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 358 additions and 59 deletions

View File

@ -47,14 +47,14 @@ Alerting rules can only query backend data sources with alerting enabled:
The alerting engine publishes some internal metrics about itself. You can read more about how Grafana publishes [internal metrics]({{< relref "../../administration/view-server/internal-metrics.md" >}}).
| Metric Name | Type | Description |
| ------------------------------------------- | --------- | ---------------------------------------------------------------------------------------- |
| `alerting.alerts` | gauge | How many alerts by state |
| `alerting.request_duration_seconds` | histogram | Histogram of requests to the Alerting API |
| `alerting.active_configurations` | gauge | The number of active, non default Alertmanager configurations for grafana managed alerts |
| `alerting.rule_evaluations_total` | counter | The total number of rule evaluations |
| `alerting.rule_evaluation_failures_total` | counter | The total number of rule evaluation failures |
| `alerting.rule_evaluation_duration_seconds` | summary | The duration for a rule to execute |
| `alerting.rule_group_rules` | gauge | The number of rules |
| Metric Name | Type | Description |
| ------------------------------------------------- | --------- | ---------------------------------------------------------------------------------------- |
| `grafana_alerting_alerts` | gauge | How many alerts by state |
| `grafana_alerting_request_duration` | histogram | Histogram of requests to the Alerting API |
| `grafana_alerting_active_configurations` | gauge | The number of active, non default Alertmanager configurations for grafana managed alerts |
| `grafana_alerting_rule_evaluations_total` | counter | The total number of rule evaluations |
| `grafana_alerting_rule_evaluation_failures_total` | counter | The total number of rule evaluation failures |
| `grafana_alerting_rule_evaluation_duration` | summary | The duration for a rule to execute |
| `grafana_alerting_rule_group_rules` | gauge | The number of rules |
- [View alert rules and their current state]({{< relref "alerting-rules/rule-list.md" >}})

View File

@ -12,7 +12,9 @@ Setting the `ngalert` feature toggle enables the new Grafana 8 alerting system.
At startup, when [the feature toggle is enabled]({{< relref "../../administration/configuration.md">}}#feature_toggles), the legacy Grafana dashboard alerting is disabled and existing dashboard alerts are migrated into a format that is compatible with the Grafana 8 alerting system. You can view these migrated rules, alongside any new alerts you create after the migration, from the Alerting page of your Grafana instance.
> **Note:** Since the new system stores the notification log and silences on disk, we require the use of persistent disks for using Grafana 8 alerts. Otherwise, the silences and notification log will get lost on a restart, and you might get unwanted or duplicate notifications.
> **Note - v8.2 or earlier:** Since the new system stores the notification log and silences on disk, we require the use of persistent disks for using Grafana 8 alerts. Otherwise, the silences and notification log will get lost on a restart, and you might get unwanted or duplicate notifications.
> **Note - v8.3+:** We have removed the need for a persistent disk. The notification log and silences are now stored in the database. If you used the file-based approach, we'll read those files and eventually (every 15 minutes) persist them to the database.
Read and write access to dashboard alerts in Grafana versions 7 and earlier were governed by the dashboard and folder permissions under which the alerts were stored. In Grafana 8, alerts are stored in folders and inherit the permissions of those folders. During the migration, dashboard alert permissions are matched to the new rules permissions as follows:

14
go.mod
View File

@ -19,7 +19,7 @@ require (
github.com/BurntSushi/toml v0.3.1
github.com/Masterminds/semver v1.5.0
github.com/VividCortex/mysqlerr v0.0.0-20170204212430-6c6b55f8796f
github.com/aws/aws-sdk-go v1.38.68
github.com/aws/aws-sdk-go v1.40.11
github.com/beevik/etree v1.1.0
github.com/benbjohnson/clock v1.1.0
github.com/bradfitz/gomemcache v0.0.0-20190913173617-a41fca850d0b
@ -75,10 +75,10 @@ require (
github.com/patrickmn/go-cache v2.1.0+incompatible
github.com/pkg/browser v0.0.0-20210115035449-ce105d075bb4 // indirect
github.com/pkg/errors v0.9.1
github.com/prometheus/alertmanager v0.22.2
github.com/prometheus/alertmanager v0.23.0-rc.0.0.20210906104939-8da517524a87
github.com/prometheus/client_golang v1.11.0
github.com/prometheus/client_model v0.2.0
github.com/prometheus/common v0.29.0
github.com/prometheus/common v0.30.0
github.com/prometheus/prometheus v1.8.2-0.20210621150501-ff58416a0b02
github.com/robfig/cron v0.0.0-20180505203441-b41be1df6967
github.com/robfig/cron/v3 v3.0.1
@ -98,7 +98,7 @@ require (
go.opentelemetry.io/collector/model v0.31.0
golang.org/x/crypto v0.0.0-20210616213533-5ff15b29337e
golang.org/x/exp v0.0.0-20210220032938-85be41e4509f // indirect
golang.org/x/net v0.0.0-20210614182718-04defd469f4e
golang.org/x/net v0.0.0-20210726213435-c6fcb2dbf985
golang.org/x/oauth2 v0.0.0-20210805134026-6f1e6394065a
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c
golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac
@ -154,7 +154,7 @@ require (
github.com/go-openapi/jsonpointer v0.19.5 // indirect
github.com/go-openapi/jsonreference v0.19.5 // indirect
github.com/go-openapi/loads v0.20.2 // indirect
github.com/go-openapi/runtime v0.19.28 // indirect
github.com/go-openapi/runtime v0.19.29 // indirect
github.com/go-openapi/spec v0.20.3 // indirect
github.com/go-openapi/swag v0.19.15 // indirect
github.com/go-openapi/validate v0.20.2 // indirect
@ -176,7 +176,7 @@ require (
github.com/hashicorp/go-multierror v1.1.0 // indirect
github.com/hashicorp/go-sockaddr v1.0.2 // indirect
github.com/hashicorp/golang-lru v0.5.4 // indirect
github.com/hashicorp/memberlist v0.2.3 // indirect
github.com/hashicorp/memberlist v0.2.4 // indirect
github.com/hashicorp/yamux v0.0.0-20200609203250-aecfd211c9ce // indirect
github.com/igm/sockjs-go/v3 v3.0.0 // indirect
github.com/jessevdk/go-flags v1.5.0 // indirect
@ -204,6 +204,8 @@ require (
github.com/opentracing-contrib/go-grpc v0.0.0-20210225150812-73cb765af46e // indirect
github.com/opentracing-contrib/go-stdlib v1.0.0 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/common/sigv4 v0.1.0 // indirect
github.com/prometheus/exporter-toolkit v0.6.1 // indirect
github.com/prometheus/node_exporter v1.0.0-rc.0.0.20200428091818-01054558c289 // indirect
github.com/prometheus/procfs v0.6.0 // indirect
github.com/rainycape/unidecode v0.0.0-20150907023854-cb7f23ec59be // indirect

23
go.sum
View File

@ -283,9 +283,11 @@ github.com/aws/aws-sdk-go v1.35.30/go.mod h1:tlPOdRjfxPBpNIwqDj61rmsnA85v9jc0Ps9
github.com/aws/aws-sdk-go v1.35.31/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro=
github.com/aws/aws-sdk-go v1.37.8/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro=
github.com/aws/aws-sdk-go v1.38.3/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro=
github.com/aws/aws-sdk-go v1.38.35/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro=
github.com/aws/aws-sdk-go v1.38.60/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro=
github.com/aws/aws-sdk-go v1.38.68 h1:aOG8geU4SohNp659eKBHRBgbqSrZ6jNZlfimIuJAwL8=
github.com/aws/aws-sdk-go v1.38.68/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro=
github.com/aws/aws-sdk-go v1.40.11 h1:iIRx5w2FbiaEKnCFcai+NSnYa9zKFe6Lzt6aLLUh61A=
github.com/aws/aws-sdk-go v1.40.11/go.mod h1:585smgzpB/KqRA+K3y/NL/oYRqQvpNJYvLm+LY1U59Q=
github.com/aws/aws-sdk-go-v2 v0.18.0/go.mod h1:JWVYvqSMppoMJC0x5wdwiImzgXTI9FuZwxzkQq9wy+g=
github.com/aws/aws-sdk-go-v2 v1.7.0/go.mod h1:tb9wi5s61kTDA5qCkcDbt3KRVV74GGslQkl/DRdX/P4=
github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.5.0/go.mod h1:acH3+MQoiMzozT/ivU+DbRg7Ooo2298RdRaWcOv+4vM=
@ -699,8 +701,9 @@ github.com/go-openapi/runtime v0.19.15/go.mod h1:dhGWCTKRXlAfGnQG0ONViOZpjfg0m2g
github.com/go-openapi/runtime v0.19.16/go.mod h1:5P9104EJgYcizotuXhEuUrzVc+j1RiSjahULvYmlv98=
github.com/go-openapi/runtime v0.19.24/go.mod h1:Lm9YGCeecBnUUkFTxPC4s1+lwrkJ0pthx8YvyjCfkgk=
github.com/go-openapi/runtime v0.19.26/go.mod h1:BvrQtn6iVb2QmiVXRsFAm6ZCAZBpbVKFfN6QWCp582M=
github.com/go-openapi/runtime v0.19.28 h1:9lYu6axek8LJrVkMVViVirRcpoaCxXX7+sSvmizGVnA=
github.com/go-openapi/runtime v0.19.28/go.mod h1:BvrQtn6iVb2QmiVXRsFAm6ZCAZBpbVKFfN6QWCp582M=
github.com/go-openapi/runtime v0.19.29 h1:5IIvCaIDbxetN674vX9eOxvoZ9mYGQ16fV1Q0VSG+NA=
github.com/go-openapi/runtime v0.19.29/go.mod h1:BvrQtn6iVb2QmiVXRsFAm6ZCAZBpbVKFfN6QWCp582M=
github.com/go-openapi/spec v0.0.0-20160808142527-6aced65f8501/go.mod h1:J8+jY1nAiCcj+friV/PDoE1/3eeccG9LYBs0tYvLOWc=
github.com/go-openapi/spec v0.17.0/go.mod h1:XkF/MOi14NmjsfZ8VtAKf8pIlbZzyoTvZsdfssdxcBI=
github.com/go-openapi/spec v0.17.2/go.mod h1:XkF/MOi14NmjsfZ8VtAKf8pIlbZzyoTvZsdfssdxcBI=
@ -1142,8 +1145,9 @@ github.com/hashicorp/memberlist v0.1.4/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2p
github.com/hashicorp/memberlist v0.1.5/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I=
github.com/hashicorp/memberlist v0.2.0/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE=
github.com/hashicorp/memberlist v0.2.2/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE=
github.com/hashicorp/memberlist v0.2.3 h1:BwZa5IjREr75J0am7nblP+X5i95Rmp8EEbMI5vkUWdA=
github.com/hashicorp/memberlist v0.2.3/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE=
github.com/hashicorp/memberlist v0.2.4 h1:OOhYzSvFnkFQXm1ysE8RjXTHsqSRDyP4emusC9K7DYg=
github.com/hashicorp/memberlist v0.2.4/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE=
github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69/go.mod h1:/z+jUGRBlwVpUZfjute9jWaF6/HuhjuFQuL1YXzVD1Q=
github.com/hashicorp/raft v1.1.1/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
@ -1685,8 +1689,9 @@ github.com/prometheus/alertmanager v0.21.1-0.20200911160112-1fdff6b3f939/go.mod
github.com/prometheus/alertmanager v0.21.1-0.20201106142418-c39b78780054/go.mod h1:imXRHOP6QTsE0fFsIsAV/cXimS32m7gVZOiUj11m6Ig=
github.com/prometheus/alertmanager v0.21.1-0.20210310093010-0f9cab6991e6/go.mod h1:MTqVn+vIupE0dzdgo+sMcNCp37SCAi8vPrvKTTnTz9g=
github.com/prometheus/alertmanager v0.21.1-0.20210422101724-8176f78a70e1/go.mod h1:gsEqwD5BHHW9RNKvCuPOrrTMiP5I+faJUyLXvnivHik=
github.com/prometheus/alertmanager v0.22.2 h1:JrDZalSEMb2/2bqGAhls6ZnvOxbC5jMIu29JV+uWTC0=
github.com/prometheus/alertmanager v0.22.2/go.mod h1:rYinOWxFuCnNssc3iOjn2oMTlhLaPcUuqV5yk5JKUAE=
github.com/prometheus/alertmanager v0.23.0-rc.0.0.20210906104939-8da517524a87 h1:98dGpT8+lYi0ADl1oN/JHPbr5pyzSA9M4id68zFkh3Y=
github.com/prometheus/alertmanager v0.23.0-rc.0.0.20210906104939-8da517524a87/go.mod h1:U7pGu+z7A9ZKhK8lq1MvIOp5GdVlZjwOYk+S0h3LSbA=
github.com/prometheus/client_golang v0.8.0/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v0.9.2/go.mod h1:OsXs2jCmiKlQ1lTBmv21f2mNfw4xf/QclQDMrYNZzcM=
@ -1736,10 +1741,15 @@ github.com/prometheus/common v0.21.0/go.mod h1:U+gB1OBLb1lF3O42bTCL+FK18tX9Oar16
github.com/prometheus/common v0.23.0/go.mod h1:H6QK/N6XVT42whUeIdI3dp36w49c+/iMDk7UAI2qm7Q=
github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc=
github.com/prometheus/common v0.28.0/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls=
github.com/prometheus/common v0.29.0 h1:3jqPBvKT4OHAbje2Ql7KeaaSicDBCxMYwEJU1zRJceE=
github.com/prometheus/common v0.29.0/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls=
github.com/prometheus/common v0.30.0 h1:JEkYlQnpzrzQFxi6gnukFPdQ+ac82oRhzMcIduJu/Ug=
github.com/prometheus/common v0.30.0/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls=
github.com/prometheus/common/sigv4 v0.1.0 h1:qoVebwtwwEhS85Czm2dSROY5fTo2PAPEVdDeppTwGX4=
github.com/prometheus/common/sigv4 v0.1.0/go.mod h1:2Jkxxk9yYvCkE5G1sQT7GuEXm57JrvHu9k5YwTjsNtI=
github.com/prometheus/exporter-toolkit v0.5.0/go.mod h1:OCkM4805mmisBhLmVFw858QYi3v0wKdY6/UxrT0pZVg=
github.com/prometheus/exporter-toolkit v0.5.1/go.mod h1:OCkM4805mmisBhLmVFw858QYi3v0wKdY6/UxrT0pZVg=
github.com/prometheus/exporter-toolkit v0.6.1 h1:Aqk75wQD92N9CqmTlZwjKwq6272nOGrWIbc8Z7+xQO0=
github.com/prometheus/exporter-toolkit v0.6.1/go.mod h1:ZUBIj498ePooX9t/2xtDjeQYwvRpiPP2lh5u4iblj2g=
github.com/prometheus/node_exporter v1.0.0-rc.0.0.20200428091818-01054558c289 h1:dTUS1vaLWq+Y6XKOTnrFpoVsQKLCbCp1OLj24TDi7oM=
github.com/prometheus/node_exporter v1.0.0-rc.0.0.20200428091818-01054558c289/go.mod h1:FGbBv5OPKjch+jNUJmEQpMZytIdyW0NdBtWFcfSKusc=
github.com/prometheus/procfs v0.0.0-20180612222113-7d6f385de8be/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
@ -2351,8 +2361,9 @@ golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qx
golang.org/x/net v0.0.0-20210520170846-37e1c6afe023/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20210610132358-84b48f89b13b/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e h1:XpT3nA5TvE525Ne3hInMh6+GETgn27Zfm9dxsThnX2Q=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20210726213435-c6fcb2dbf985 h1:4CSI6oo7cOjJKajidEljs9h+uP0rRZBPPPhcCbj5mw8=
golang.org/x/net v0.0.0-20210726213435-c6fcb2dbf985/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20181106182150-f42d05182288/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=

View File

@ -4,25 +4,25 @@ import (
"context"
"time"
"github.com/benbjohnson/clock"
"github.com/grafana/grafana/pkg/services/quota"
"golang.org/x/sync/errgroup"
"github.com/grafana/grafana/pkg/services/ngalert/api"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/services/ngalert/store"
"github.com/grafana/grafana/pkg/api/routing"
"github.com/grafana/grafana/pkg/infra/kvstore"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/datasourceproxy"
"github.com/grafana/grafana/pkg/services/datasources"
"github.com/grafana/grafana/pkg/services/ngalert/api"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/notifier"
"github.com/grafana/grafana/pkg/services/ngalert/schedule"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/services/ngalert/store"
"github.com/grafana/grafana/pkg/services/quota"
"github.com/grafana/grafana/pkg/services/sqlstore"
"github.com/grafana/grafana/pkg/setting"
"github.com/grafana/grafana/pkg/tsdb"
"github.com/benbjohnson/clock"
"golang.org/x/sync/errgroup"
)
const (
@ -38,13 +38,14 @@ const (
)
func ProvideService(cfg *setting.Cfg, dataSourceCache datasources.CacheService, routeRegister routing.RouteRegister,
sqlStore *sqlstore.SQLStore, dataService *tsdb.Service, dataProxy *datasourceproxy.DataSourceProxyService,
sqlStore *sqlstore.SQLStore, kvStore kvstore.KVStore, dataService *tsdb.Service, dataProxy *datasourceproxy.DataSourceProxyService,
quotaService *quota.QuotaService, m *metrics.Metrics) (*AlertNG, error) {
ng := &AlertNG{
Cfg: cfg,
DataSourceCache: dataSourceCache,
RouteRegister: routeRegister,
SQLStore: sqlStore,
KVStore: kvStore,
DataService: dataService,
DataProxy: dataProxy,
QuotaService: quotaService,
@ -69,6 +70,7 @@ type AlertNG struct {
DataSourceCache datasources.CacheService
RouteRegister routing.RouteRegister
SQLStore *sqlstore.SQLStore
KVStore kvstore.KVStore
DataService *tsdb.Service
DataProxy *datasourceproxy.DataSourceProxyService
QuotaService *quota.QuotaService
@ -95,7 +97,7 @@ func (ng *AlertNG) init() error {
Logger: ng.Log,
}
ng.MultiOrgAlertmanager = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store)
ng.MultiOrgAlertmanager = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store, ng.KVStore)
// Let's make sure we're able to complete an initial sync of Alertmanagers before we start the alerting components.
if err := ng.MultiOrgAlertmanager.LoadAndSyncAlertmanagersForOrgs(context.Background()); err != nil {

View File

@ -27,6 +27,7 @@ import (
"github.com/prometheus/common/model"
"github.com/grafana/grafana/pkg/components/securejsondata"
"github.com/grafana/grafana/pkg/infra/kvstore"
"github.com/grafana/grafana/pkg/infra/log"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/logging"
@ -38,6 +39,9 @@ import (
)
const (
notificationLogFilename = "notifications"
silencesFilename = "silences"
workingDir = "alerting"
// How long should we keep silences and notification entries on-disk after they've served their purpose.
retentionNotificationsAndSilences = 5 * 24 * time.Hour
@ -77,9 +81,10 @@ type Alertmanager struct {
logger log.Logger
gokitLogger gokit_log.Logger
Settings *setting.Cfg
Store store.AlertingStore
Metrics *metrics.Metrics
Settings *setting.Cfg
Store store.AlertingStore
fileStore *FileStore
Metrics *metrics.Metrics
notificationLog *nflog.Log
marker types.Marker
@ -106,28 +111,39 @@ type Alertmanager struct {
orgID int64
}
func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, m *metrics.Metrics) (*Alertmanager, error) {
func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, m *metrics.Metrics) (*Alertmanager, error) {
am := &Alertmanager{
Settings: cfg,
stopc: make(chan struct{}),
logger: log.New("alertmanager", "org", orgID),
marker: types.NewMarker(m.Registerer),
stageMetrics: notify.NewMetrics(m.Registerer),
dispatcherMetrics: dispatch.NewDispatcherMetrics(m.Registerer),
dispatcherMetrics: dispatch.NewDispatcherMetrics(false, m.Registerer),
Store: store,
Metrics: m,
orgID: orgID,
}
am.gokitLogger = gokit_log.NewLogfmtLogger(logging.NewWrapper(am.logger))
am.fileStore = NewFileStore(am.orgID, kvStore, am.WorkingDirPath())
nflogFilepath, err := am.fileStore.FilepathFor(context.TODO(), notificationLogFilename)
if err != nil {
return nil, err
}
silencesFilePath, err := am.fileStore.FilepathFor(context.TODO(), silencesFilename)
if err != nil {
return nil, err
}
// Initialize the notification log
am.wg.Add(1)
var err error
am.notificationLog, err = nflog.New(
nflog.WithRetention(retentionNotificationsAndSilences),
nflog.WithSnapshot(filepath.Join(am.WorkingDirPath(), "notifications")),
nflog.WithMaintenance(maintenanceNotificationAndSilences, am.stopc, am.wg.Done),
nflog.WithSnapshot(nflogFilepath),
nflog.WithMaintenance(maintenanceNotificationAndSilences, am.stopc, am.wg.Done, func() (int64, error) {
return am.fileStore.Persist(context.TODO(), notificationLogFilename, am.notificationLog)
}),
)
if err != nil {
return nil, fmt.Errorf("unable to initialize the notification log component of alerting: %w", err)
@ -135,7 +151,7 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, m
// Initialize silences
am.silences, err = silence.New(silence.Options{
Metrics: m.Registerer,
SnapshotFile: filepath.Join(am.WorkingDirPath(), "silences"),
SnapshotFile: silencesFilePath,
Retention: retentionNotificationsAndSilences,
})
if err != nil {
@ -144,12 +160,14 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, m
am.wg.Add(1)
go func() {
am.silences.Maintenance(15*time.Minute, filepath.Join(am.WorkingDirPath(), "silences"), am.stopc)
am.silences.Maintenance(15*time.Minute, silencesFilePath, am.stopc, func() (int64, error) {
return am.fileStore.Persist(context.TODO(), silencesFilename, am.silences)
})
am.wg.Done()
}()
// Initialize in-memory alerts
am.alerts, err = mem.NewAlerts(context.Background(), am.marker, memoryAlertsGCInterval, am.gokitLogger)
am.alerts, err = mem.NewAlerts(context.Background(), am.marker, memoryAlertsGCInterval, nil, am.gokitLogger)
if err != nil {
return nil, fmt.Errorf("unable to initialize the alert provider component of alerting: %w", err)
}
@ -390,7 +408,7 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig
}
am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil)
am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, am.gokitLogger, am.dispatcherMetrics)
am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, &nilLimits{}, am.gokitLogger, am.dispatcherMetrics)
am.wg.Add(1)
go func() {
@ -707,3 +725,7 @@ func timeoutFunc(d time.Duration) time.Duration {
}
return d + waitFunc()
}
// nilLimits is the limits value handed to dispatch.NewDispatcher when the dispatcher is built.
// It carries no state.
type nilLimits struct{}
// MaxNumberOfAggregationGroups always returns 0.
// NOTE(review): 0 presumably means "no limit" to the dispatcher — confirm against the Alertmanager dispatch package.
func (n nilLimits) MaxNumberOfAggregationGroups() int { return 0 }

View File

@ -47,7 +47,8 @@ func setupAMTest(t *testing.T) *Alertmanager {
Logger: log.New("alertmanager-test"),
}
am, err := newAlertmanager(1, cfg, store, m)
kvStore := newFakeKVStore(t)
am, err := newAlertmanager(1, cfg, store, kvStore, m)
require.NoError(t, err)
return am
}
@ -310,7 +311,7 @@ func TestPutAlert(t *testing.T) {
t.Run(c.title, func(t *testing.T) {
r := prometheus.NewRegistry()
am.marker = types.NewMarker(r)
am.alerts, err = mem.NewAlerts(context.Background(), am.marker, 15*time.Minute, gokit_log.NewLogfmtLogger(logging.NewWrapper(am.logger)))
am.alerts, err = mem.NewAlerts(context.Background(), am.marker, 15*time.Minute, nil, gokit_log.NewLogfmtLogger(logging.NewWrapper(am.logger)))
require.NoError(t, err)
alerts := []*types.Alert{}

View File

@ -0,0 +1,109 @@
package notifier
import (
"context"
"encoding/base64"
"fmt"
"os"
"path/filepath"
"github.com/grafana/grafana/pkg/infra/kvstore"
)
// KVNamespace is the key-value store namespace that FileStore uses for the Alertmanager files.
const KVNamespace = "alertmanager"
// State represents either of the two 'states' of the Alertmanager: the notification log or the silences.
// It is the only thing FileStore.Persist needs from them.
type State interface {
// MarshalBinary returns the binary representation of the internal state (based on the protobuf),
// which Persist base64-encodes before writing it to the database.
MarshalBinary() ([]byte, error)
}
// FileStore is in charge of persisting the Alertmanager files to the database.
// It uses the KVStore table and encodes the files as a base64 string.
// It also restores a stored file into the working directory on disk (see FilepathFor).
type FileStore struct {
// kv is the key-value store scoped to the organization and to KVNamespace.
kv *kvstore.NamespacedKVStore
// orgID is the organization this store was created for.
orgID int64
// workingDirPath is the directory in which the Alertmanager files are read and written.
workingDirPath string
}
// NewFileStore builds a FileStore for the given organization.
// The provided key-value store is wrapped so that every read and write is scoped
// to orgID and KVNamespace; workingDirPath is where files are placed on disk.
func NewFileStore(orgID int64, store kvstore.KVStore, workingDirPath string) *FileStore {
	fs := new(FileStore)
	fs.orgID = orgID
	fs.workingDirPath = workingDirPath
	fs.kv = kvstore.WithNamespace(store, orgID, KVNamespace)
	return fs
}
// FilepathFor returns the path inside the working directory for the named Alertmanager file,
// restoring the file from the database first when that is needed.
//
// The steps are, in order:
//   - if IsExists reports the file as present on disk, the path is returned untouched;
//   - otherwise the database is queried; with no stored entry the path is returned and
//     nothing is written, leaving it to the Alertmanager to create the file;
//   - with a stored entry, its base64 content is decoded and written to disk before the
//     path is returned.
//
// An error is returned when the database read, the decoding or the write to disk fails.
func (fs *FileStore) FilepathFor(ctx context.Context, filename string) (string, error) {
	target := fs.pathFor(filename)

	// A copy on disk takes precedence; it is saved to the database later on.
	if fs.IsExists(filename) {
		return target, nil
	}

	encoded, found, err := fs.kv.Get(ctx, filename)
	switch {
	case err != nil:
		return "", fmt.Errorf("error reading file '%s' from database: %w", filename, err)
	case !found:
		// Nothing stored yet: no-op.
		return target, nil
	}

	// Materialize the stored content on disk so it can be loaded from there.
	raw, err := decode(encoded)
	if err != nil {
		return "", fmt.Errorf("error decoding file '%s': %w", filename, err)
	}
	if err = fs.WriteFileToDisk(filename, raw); err != nil {
		return "", fmt.Errorf("error writing file %s: %w", filename, err)
	}

	return target, nil
}
// Persist marshals st to its binary form and stores it in the database under filename,
// encoded as a base64 string.
// It returns the size in bytes of the binary (pre-encoding) representation, or 0 and the
// error when either the marshalling or the database write fails.
func (fs *FileStore) Persist(ctx context.Context, filename string, st State) (int64, error) {
	raw, err := st.MarshalBinary()
	if err != nil {
		return 0, err
	}

	if err := fs.kv.Set(ctx, filename, encode(raw)); err != nil {
		return 0, err
	}

	return int64(len(raw)), nil
}
// IsExists reports whether the named file is present in the Alertmanager working directory.
// Any error from os.Stat (missing file, permission problem, ...) is treated as "not present".
func (fs *FileStore) IsExists(fn string) bool {
	// os.Stat returns a nil error when the file exists, and os.IsExist(nil) is always
	// false, so presence has to be detected through the absence of an error.
	_, err := os.Stat(fs.pathFor(fn))
	return err == nil
}
// WriteFileToDisk writes a file with the provided name and contents to the Alertmanager working
// directory with the default grafana permission.
// The destination directory is created first when it is missing, so that a file restored from
// the database can be written on an instance that has no pre-existing working directory.
// It returns an error when the directory cannot be created or the file cannot be written.
func (fs *FileStore) WriteFileToDisk(fn string, content []byte) error {
	target := fs.pathFor(fn)

	// filepath.Dir yields "." for a bare filename, which keeps an empty working directory working.
	dir := filepath.Dir(target)
	if err := os.MkdirAll(dir, 0750); err != nil {
		return fmt.Errorf("unable to create the working directory %q: %w", dir, err)
	}

	return os.WriteFile(target, content, 0644)
}
// pathFor returns the location of the named file inside the working directory.
func (fs *FileStore) pathFor(fn string) string {
	elems := []string{fs.workingDirPath, fn}
	return filepath.Join(elems...)
}
// decode turns a standard base64 string back into the bytes it encodes.
func decode(s string) ([]byte, error) {
	enc := base64.StdEncoding
	return enc.DecodeString(s)
}
// encode returns the standard base64 string representation of b.
func encode(b []byte) string {
	enc := base64.StdEncoding
	return enc.EncodeToString(b)
}

View File

@ -0,0 +1,74 @@
package notifier
import (
"context"
"io/ioutil"
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/require"
)
// TestFileStore_FilepathFor exercises FilepathFor against the three situations it can meet:
// a file already on disk, a file only in the database, and no file anywhere.
func TestFileStore_FilepathFor(t *testing.T) {
	const key = "silences"

	ctx := context.Background()
	kv := newFakeKVStore(t)
	dir := t.TempDir()
	fileStore := NewFileStore(1, kv, dir)
	expectedPath := filepath.Join(dir, key)

	// File on disk: the path is returned and the file content is left as it was.
	{
		require.NoError(t, os.WriteFile(expectedPath, []byte("silence1,silence2"), 0644))

		got, err := fileStore.FilepathFor(ctx, key)
		require.NoError(t, err)
		require.Equal(t, expectedPath, got)

		content, err := ioutil.ReadFile(filepath.Clean(expectedPath))
		require.NoError(t, err)
		require.Equal(t, "silence1,silence2", string(content))

		require.NoError(t, os.Remove(expectedPath))
	}

	// File in the database only: it is written to disk and the path is returned.
	{
		require.NoError(t, kv.Set(ctx, 1, KVNamespace, key, encode([]byte("silence1,silence3"))))

		got, err := fileStore.FilepathFor(ctx, key)
		require.NoError(t, err)
		require.Equal(t, expectedPath, got)

		content, err := ioutil.ReadFile(filepath.Clean(expectedPath))
		require.NoError(t, err)
		require.Equal(t, "silence1,silence3", string(content))

		// Leave both the disk and the database clean for the next scenario.
		require.NoError(t, os.Remove(expectedPath))
		require.NoError(t, kv.Del(ctx, 1, KVNamespace, key))
	}

	// Nothing on disk nor in the database: the path is returned and no file is created.
	{
		got, err := fileStore.FilepathFor(ctx, key)
		require.NoError(t, err)
		require.Equal(t, expectedPath, got)

		_, err = ioutil.ReadFile(filepath.Clean(expectedPath))
		require.Error(t, err)
	}
}
// TestFileStore_Persist checks that Persist reports the size of the marshalled state and
// stores it, base64-encoded, as the single entry of the key-value store.
func TestFileStore_Persist(t *testing.T) {
	const key = "silences"

	ctx := context.Background()
	kv := newFakeKVStore(t)
	fileStore := NewFileStore(1, kv, t.TempDir())
	st := &fakeState{data: "something to marshal"}

	written, err := fileStore.Persist(ctx, key, st)
	require.NoError(t, err)
	require.Equal(t, int64(20), written)

	// The fake's map is inspected directly, hence the explicit locking.
	kv.mtx.Lock()
	require.Len(t, kv.store, 1)
	kv.mtx.Unlock()

	stored, found, err := kv.Get(ctx, 1, KVNamespace, key)
	require.NoError(t, err)
	require.True(t, found)

	decoded, err := decode(stored)
	require.NoError(t, err)
	require.Equal(t, "something to marshal", string(decoded))
}

View File

@ -6,6 +6,7 @@ import (
"sync"
"time"
"github.com/grafana/grafana/pkg/infra/kvstore"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/store"
@ -13,10 +14,7 @@ import (
)
var (
SyncOrgsPollInterval = 1 * time.Minute
)
var (
SyncOrgsPollInterval = 1 * time.Minute
ErrNoAlertmanagerForOrg = fmt.Errorf("Alertmanager does not exist for this organization")
ErrAlertmanagerNotReady = fmt.Errorf("Alertmanager is not ready yet")
)
@ -30,17 +28,19 @@ type MultiOrgAlertmanager struct {
configStore store.AlertingStore
orgStore store.OrgStore
kvStore kvstore.KVStore
orgRegistry *metrics.OrgRegistries
}
func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore) *MultiOrgAlertmanager {
func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore, kvStore kvstore.KVStore) *MultiOrgAlertmanager {
return &MultiOrgAlertmanager{
settings: cfg,
logger: log.New("multiorg.alertmanager"),
alertmanagers: map[int64]*Alertmanager{},
configStore: configStore,
orgStore: orgStore,
kvStore: kvStore,
orgRegistry: metrics.NewOrgRegistries(),
}
}
@ -86,7 +86,7 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(orgIDs []int64) {
existing, found := moa.alertmanagers[orgID]
if !found {
reg := moa.orgRegistry.GetOrCreateOrgRegistry(orgID)
am, err := newAlertmanager(orgID, moa.settings, moa.configStore, metrics.NewMetrics(reg))
am, err := newAlertmanager(orgID, moa.settings, moa.configStore, moa.kvStore, metrics.NewMetrics(reg))
if err != nil {
moa.logger.Error("unable to create Alertmanager for org", "org", orgID, "err", err)
}

View File

@ -19,7 +19,8 @@ func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {
orgs: []int64{1, 2, 3},
}
SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore)
kvStore := newFakeKVStore(t)
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore, kvStore)
ctx := context.Background()
// Ensure that one Alertmanager is created per org.
@ -50,7 +51,8 @@ func TestMultiOrgAlertmanager_AlertmanagerFor(t *testing.T) {
}
SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore)
kvStore := newFakeKVStore(t)
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore, kvStore)
ctx := context.Background()
// Ensure that one Alertmanagers is created per org.

View File

@ -2,6 +2,8 @@ package notifier
import (
"context"
"sync"
"testing"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/store"
@ -54,3 +56,76 @@ type FakeOrgStore struct {
func (f *FakeOrgStore) GetOrgs(_ context.Context) ([]int64, error) {
return f.orgs, nil
}
// FakeKVStore is an in-memory key-value store for tests.
// Values are indexed by organization ID, then namespace, then key, and every access is
// serialized through mtx. The zero value is ready to use.
type FakeKVStore struct {
	mtx   sync.Mutex
	store map[int64]map[string]map[string]string
}

// newFakeKVStore returns an empty FakeKVStore.
func newFakeKVStore(t *testing.T) *FakeKVStore {
	t.Helper()

	return &FakeKVStore{
		store: map[int64]map[string]map[string]string{},
	}
}

// Get returns the value stored for the organization, namespace and key, and whether it was found.
// It never returns an error.
func (fkv *FakeKVStore) Get(_ context.Context, orgId int64, namespace string, key string) (string, bool, error) {
	fkv.mtx.Lock()
	defer fkv.mtx.Unlock()

	// Indexing a nil map yields the zero value, so a missing organization or namespace
	// simply results in ("", false).
	v, ok := fkv.store[orgId][namespace][key]
	return v, ok, nil
}

// Set stores value for the organization, namespace and key, creating any missing level on the way.
// It never returns an error.
func (fkv *FakeKVStore) Set(_ context.Context, orgId int64, namespace string, key string, value string) error {
	fkv.mtx.Lock()
	defer fkv.mtx.Unlock()

	// A FakeKVStore built as a plain struct literal has a nil map; writing to it would panic.
	if fkv.store == nil {
		fkv.store = map[int64]map[string]map[string]string{}
	}

	namespaces, ok := fkv.store[orgId]
	if !ok {
		namespaces = map[string]map[string]string{}
		fkv.store[orgId] = namespaces
	}

	values, ok := namespaces[namespace]
	if !ok {
		values = map[string]string{}
		namespaces[namespace] = values
	}

	values[key] = value
	return nil
}

// Del removes the value stored for the organization, namespace and key.
// Deleting something that is not stored is a no-op. It never returns an error.
func (fkv *FakeKVStore) Del(_ context.Context, orgId int64, namespace string, key string) error {
	fkv.mtx.Lock()
	defer fkv.mtx.Unlock()

	// delete on a nil map is a no-op, which covers a missing organization or namespace.
	delete(fkv.store[orgId][namespace], key)
	return nil
}
// fakeState is a test stand-in for an Alertmanager state whose binary form is
// simply the bytes of its data string.
type fakeState struct {
	data string
}

// MarshalBinary returns the bytes of the data string and never fails.
func (fs *fakeState) MarshalBinary() ([]byte, error) {
	raw := []byte(fs.data)
	return raw, nil
}

View File

@ -238,7 +238,7 @@ func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, ac
RuleStore: rs,
InstanceStore: is,
AdminConfigStore: acs,
MultiOrgNotifier: notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, &notifier.FakeConfigStore{}, &notifier.FakeOrgStore{}),
MultiOrgNotifier: notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, &notifier.FakeConfigStore{}, &notifier.FakeOrgStore{}, &notifier.FakeKVStore{}),
Logger: logger,
Metrics: metrics.NewMetrics(prometheus.NewRegistry()),
AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.

View File

@ -35,8 +35,7 @@ func SetupTestEnv(t *testing.T, baseInterval time.Duration) (*ngalert.AlertNG, *
cfg.FeatureToggles = map[string]bool{"ngalert": true}
m := metrics.NewMetrics(prometheus.NewRegistry())
ng, err := ngalert.ProvideService(cfg, nil, routing.NewRouteRegister(), sqlstore.InitTestDB(t), nil, nil, nil,
m)
ng, err := ngalert.ProvideService(cfg, nil, routing.NewRouteRegister(), sqlstore.InitTestDB(t), nil, nil, nil, nil, m)
require.NoError(t, err)
return ng, &store.DBstore{
SQLStore: ng.SQLStore,