Mirror of https://github.com/grafana/grafana.git (synced 2025-02-25 18:55:37 -06:00)

Alerting: Support Unified Alerting with Grafana HA (#37920)

* Alerting: Support Unified Alerting in Grafana's HA mode.
@@ -211,7 +211,7 @@ rudderstack_data_plane_url =
# Application Insights connection string. Specify a URL string to enable this feature.
application_insights_connection_string =

# Optional. Specifies an Application Insights endpoint URL where the endpoint string is wrapped in backticks ``.
application_insights_endpoint_url =

#################################### Security ############################
@@ -732,7 +732,37 @@ global_alert_rule = -1
#################################### Unified Alerting ####################
[unified_alerting]
# Specify the frequency of polling for admin config changes.
admin_config_poll_interval_seconds = 60
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
admin_config_poll_interval = 60s

# Specify the frequency of polling for Alertmanager config changes.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
alertmanager_config_poll_interval = 60s

# Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port.
ha_listen_address = "0.0.0.0:9094"

# Explicit address/hostname and port to advertise to other Grafana instances. The port is used for both TCP and UDP.
ha_advertise_address = ""

# Comma-separated list of initial instances (in the format host:port) that will form the HA cluster. Configuring this setting will enable High Availability mode for alerting.
ha_peers = ""

# Time to wait for an instance to send a notification via the Alertmanager. In HA, each Grafana instance will
# be assigned a position (e.g. 0, 1). We then multiply this position with the timeout to indicate how long each
# instance should wait before sending the notification, to take replication lag into account.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_peer_timeout = 15s

# The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated
# across the cluster more quickly at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_gossip_interval = 200ms

# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
# across larger clusters at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_push_pull_interval = 60s

#################################### Alerting ############################
[alerting]
@@ -709,7 +709,38 @@
#################################### Unified Alerting ####################
[unified_alerting]
# Specify the frequency of polling for admin config changes.
;admin_config_poll_interval_seconds = 60
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;admin_config_poll_interval = 60s

# Specify the frequency of polling for Alertmanager config changes.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;alertmanager_config_poll_interval = 60s

# Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port. The default value is `0.0.0.0:9094`.
;ha_listen_address = "0.0.0.0:9094"

# Explicit address/hostname and port to advertise to other Grafana instances. The port is used for both TCP and UDP.
;ha_advertise_address = ""

# Comma-separated list of initial instances (in the format host:port) that will form the HA cluster. Configuring this setting will enable High Availability mode for alerting.
;ha_peers = ""

# Time to wait for an instance to send a notification via the Alertmanager. In HA, each Grafana instance will
# be assigned a position (e.g. 0, 1). We then multiply this position with the timeout to indicate how long each
# instance should wait before sending the notification, to take replication lag into account.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;ha_peer_timeout = "15s"

# The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated
# across the cluster more quickly at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;ha_gossip_interval = "200ms"

# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
# across larger clusters at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;ha_push_pull_interval = "60s"


#################################### Alerting ############################
[alerting]
devenv/docker/ha-test-unified-alerting/.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
grafana/provisioning/dashboards/alerts/alert-*
devenv/docker/ha-test-unified-alerting/README.md (new file, 66 lines)
@@ -0,0 +1,66 @@
# Grafana Unified Alerting High Availability (HA) test setup

A set of docker compose services which together create a Grafana HA test setup for unified alerting.

Included services:

- Grafana
- Mysql - Grafana configuration database, exporter for metrics and session storage
- Prometheus - Monitoring of Grafana and used as data source
- Nginx - Reverse proxy for Grafana and Prometheus. Enables browsing the Grafana/Prometheus UI using a hostname

## Prerequisites

### Build Grafana docker container

Build a Grafana docker container from the current branch and commit, and tag it as grafana/grafana:dev.

```bash
$ cd <grafana repo>
$ make build-docker-full
```

### Virtual host names

#### Alternative 1 - Use dnsmasq

```bash
$ sudo apt-get install dnsmasq
$ echo 'address=/loc/127.0.0.1' | sudo tee /etc/dnsmasq.d/dnsmasq-loc.conf > /dev/null
$ sudo /etc/init.d/dnsmasq restart
$ ping whatever.loc
PING whatever.loc (127.0.0.1) 56(84) bytes of data.
64 bytes from localhost (127.0.0.1): icmp_seq=1 ttl=64 time=0.076 ms

--- whatever.loc ping statistics ---
1 packet transmitted, 1 received, 0% packet loss, time 1998ms
```

#### Alternative 2 - Manually update /etc/hosts

Update your `/etc/hosts` to be able to access Grafana and/or Prometheus UI using a hostname.

```bash
$ cat /etc/hosts
127.0.0.1       grafana.loc
127.0.0.1       prometheus.loc
```

## Start services

```bash
$ docker-compose up -d
```

Browse:

- http://grafana.loc/
- http://prometheus.loc/

## Test alerting

### Create contact points

TBD

### Create alerts

TBD

### Create silences

TBD
devenv/docker/ha-test-unified-alerting/docker-compose.yaml (new file, 90 lines)
@@ -0,0 +1,90 @@
version: "2.1"
|
||||
|
||||
services:
|
||||
db:
|
||||
image: mysql:5.6
|
||||
platform: linux/x86_64
|
||||
environment:
|
||||
MYSQL_ROOT_PASSWORD: rootpass
|
||||
MYSQL_DATABASE: grafana
|
||||
MYSQL_USER: grafana
|
||||
MYSQL_PASSWORD: password
|
||||
command: [mysqld, --character-set-server=utf8mb4, --collation-server=utf8mb4_unicode_ci, --innodb_monitor_enable=all, --max-connections=1001]
|
||||
ports:
|
||||
- 3306
|
||||
healthcheck:
|
||||
test: ["CMD", "mysqladmin" ,"ping", "-h", "localhost"]
|
||||
timeout: 10s
|
||||
retries: 10
|
||||
mysqld-exporter:
|
||||
image: prom/mysqld-exporter
|
||||
environment:
|
||||
- DATA_SOURCE_NAME=root:rootpass@(db:3306)/
|
||||
ports:
|
||||
- 9104
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.4.2
|
||||
volumes:
|
||||
- ./prometheus/:/etc/prometheus/
|
||||
environment:
|
||||
- VIRTUAL_HOST=prometheus.loc
|
||||
ports:
|
||||
- 909
|
||||
nginx-proxy:
|
||||
image: jwilder/nginx-proxy
|
||||
ports:
|
||||
- "80:80"
|
||||
volumes:
|
||||
- /var/run/docker.sock:/tmp/docker.sock:ro
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
grafana1:
|
||||
image: grafana/grafana:dev
|
||||
volumes:
|
||||
- ./grafana/provisioning/:/etc/grafana/provisioning/
|
||||
environment:
|
||||
- VIRTUAL_HOST=grafana.loc
|
||||
- GF_FEATURE_TOGGLES_ENABLE=ngalert
|
||||
- GF_UNIFIED_ALERTING_HA_PEERS=ha-test-unified-alerting_grafana2_1:9094,ha-test-unified-alerting_grafana1_1:9094
|
||||
- GF_SERVER_ROOT_URL=http://grafana.loc
|
||||
- GF_DATABASE_NAME=grafana
|
||||
- GF_DATABASE_USER=grafana
|
||||
- GF_DATABASE_PASSWORD=password
|
||||
- GF_DATABASE_TYPE=mysql
|
||||
- GF_DATABASE_HOST=db:3306
|
||||
- GF_DATABASE_MAX_OPEN_CONN=300
|
||||
- GF_SESSION_PROVIDER=mysql
|
||||
- GF_SESSION_PROVIDER_CONFIG=grafana:password@tcp(db:3306)/grafana?allowNativePasswords=true
|
||||
ports:
|
||||
- 3010:3000
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
|
||||
grafana2:
|
||||
image: grafana/grafana:dev
|
||||
volumes:
|
||||
- ./grafana/provisioning/:/etc/grafana/provisioning/
|
||||
environment:
|
||||
- VIRTUAL_HOST=grafana.loc
|
||||
- GF_FEATURE_TOGGLES_ENABLE=ngalert
|
||||
- GF_UNIFIED_ALERTING_HA_PEERS=ha-test-unified-alerting_grafana2_1:9094,ha-test-unified-alerting_grafana1_1:9094
|
||||
- GF_SERVER_ROOT_URL=http://grafana.loc
|
||||
- GF_DATABASE_NAME=grafana
|
||||
- GF_DATABASE_USER=grafana
|
||||
- GF_DATABASE_PASSWORD=password
|
||||
- GF_DATABASE_TYPE=mysql
|
||||
- GF_DATABASE_HOST=db:3306
|
||||
- GF_DATABASE_MAX_OPEN_CONN=300
|
||||
- GF_SESSION_PROVIDER=mysql
|
||||
- GF_SESSION_PROVIDER_CONFIG=grafana:password@tcp(db:3306)/grafana?allowNativePasswords=true
|
||||
ports:
|
||||
- 3020:3000
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
|
||||
@@ -0,0 +1,203 @@
local numAlerts = std.extVar('alerts');
local condition = std.extVar('condition');
local arr = std.range(1, numAlerts);

local alertDashboardTemplate = {
  "editable": true,
  "gnetId": null,
  "graphTooltip": 0,
  "id": null,
  "links": [],
  "panels": [
    {
      "alert": {
        "conditions": [
          {
            "evaluator": {
              "params": [
                65
              ],
              "type": "gt"
            },
            "operator": {
              "type": "and"
            },
            "query": {
              "params": [
                "A",
                "5m",
                "now"
              ]
            },
            "reducer": {
              "params": [],
              "type": "avg"
            },
            "type": "query"
          }
        ],
        "executionErrorState": "alerting",
        "frequency": "10s",
        "handler": 1,
        "for": "1m",
        "name": "bulk alerting",
        "noDataState": "no_data",
        "notifications": [
          {
            "id": 2
          }
        ]
      },
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": "Prometheus",
      "fill": 1,
      "gridPos": {
        "h": 9,
        "w": 12,
        "x": 0,
        "y": 0
      },
      "id": 2,
      "legend": {
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "show": true,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 1,
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "$$hashKey": "object:117",
          "expr": "go_goroutines",
          "format": "time_series",
          "intervalFactor": 1,
          "refId": "A"
        }
      ],
      "thresholds": [
        {
          "colorMode": "critical",
          "fill": true,
          "line": true,
          "op": "gt",
          "value": 50
        }
      ],
      "timeFrom": null,
      "timeShift": null,
      "title": "Panel Title",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ]
    }
  ],
  "schemaVersion": 16,
  "style": "dark",
  "tags": [],
  "templating": {
    "list": []
  },
  "time": {
    "from": "now-6h",
    "to": "now"
  },
  "timepicker": {
    "refresh_intervals": [
      "5s",
      "10s",
      "30s",
      "1m",
      "5m",
      "15m",
      "30m",
      "1h",
      "2h",
      "1d"
    ],
    "time_options": [
      "5m",
      "15m",
      "1h",
      "6h",
      "12h",
      "24h",
      "2d",
      "7d",
      "30d"
    ]
  },
  "timezone": "",
  "title": "New dashboard",
  "uid": null,
  "version": 0
};

{
  ['alert-' + std.toString(x) + '.json']:
    alertDashboardTemplate + {
      panels: [
        alertDashboardTemplate.panels[0] +
        {
          alert+: {
            name: 'Alert rule ' + x,
            conditions: [
              alertDashboardTemplate.panels[0].alert.conditions[0] +
              {
                evaluator+: {
                  params: [condition]
                }
              },
            ],
          },
        },
      ],
      uid: 'alert-' + x,
      title: 'Alert ' + x
    },
  for x in arr
}
@@ -0,0 +1,172 @@
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": "-- Grafana --",
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "gnetId": null,
  "graphTooltip": 0,
  "links": [],
  "panels": [
    {
      "aliasColors": {
        "Active alerts": "#bf1b00"
      },
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": "Prometheus",
      "fill": 1,
      "gridPos": {
        "h": 12,
        "w": 24,
        "x": 0,
        "y": 0
      },
      "id": 2,
      "interval": "",
      "legend": {
        "alignAsTable": true,
        "avg": false,
        "current": true,
        "max": false,
        "min": false,
        "rightSide": true,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 2,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [
        {
          "alias": "Active grafana instances",
          "dashes": true,
          "fill": 0
        }
      ],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "sum(increase(grafana_alerting_notification_sent_total[1m])) by(job)",
          "format": "time_series",
          "instant": false,
          "interval": "1m",
          "intervalFactor": 1,
          "legendFormat": "Notifications sent",
          "refId": "A"
        },
        {
          "expr": "min(grafana_alerting_active_alerts) without(instance)",
          "format": "time_series",
          "interval": "1m",
          "intervalFactor": 1,
          "legendFormat": "Active alerts",
          "refId": "B"
        },
        {
          "expr": "count(up{job=\"grafana\"})",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Active grafana instances",
          "refId": "C"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeShift": null,
      "title": "Notifications sent vs active alerts",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": "0",
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": 3
      }
    }
  ],
  "schemaVersion": 16,
  "style": "dark",
  "tags": [],
  "templating": {
    "list": []
  },
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "timepicker": {
    "refresh_intervals": [
      "5s",
      "10s",
      "30s",
      "1m",
      "5m",
      "15m",
      "30m",
      "1h",
      "2h",
      "1d"
    ],
    "time_options": [
      "5m",
      "15m",
      "1h",
      "6h",
      "12h",
      "24h",
      "2d",
      "7d",
      "30d"
    ]
  },
  "timezone": "",
  "title": "Overview",
  "uid": "xHy7-hAik",
  "version": 6
}
@@ -0,0 +1,14 @@
apiVersion: 1

providers:
  - name: 'Alerts'
    folder: 'Alerts'
    type: file
    options:
      path: /etc/grafana/provisioning/dashboards/alerts

  - name: 'MySQL'
    folder: 'MySQL'
    type: file
    options:
      path: /etc/grafana/provisioning/dashboards/mysql
(File diff suppressed because it is too large.)
@@ -0,0 +1,16 @@
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    jsonData:
      timeInterval: 10s
      queryTimeout: 30s
      httpMethod: POST

  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
@@ -0,0 +1,47 @@
# my global config
global:
  scrape_interval: 10s # Scrape targets every 10 seconds.
  evaluation_interval: 10s # Evaluate rules every 10 seconds.
  # scrape_timeout is set to the global default (10s).

# Load and evaluate rules in this file every 'evaluation_interval' seconds.
#rule_files:
#  - "alert.rules"
#  - "first.rules"
#  - "second.rules"

# alerting:
#   alertmanagers:
#     - scheme: http
#       static_configs:
#         - targets:
#             - "127.0.0.1:9093"

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'grafana'
    dns_sd_configs:
      - names:
          - 'grafana'
        type: 'A'
        port: 3000
        refresh_interval: 10s

  - job_name: 'mysql'
    dns_sd_configs:
      - names:
          - 'mysqld-exporter'
        type: 'A'
        port: 9104
        refresh_interval: 10s

  - job_name: 'loki'
    dns_sd_configs:
      - names:
          - 'loki'
        type: 'A'
        port: 3100
        refresh_interval: 10s
@@ -1119,9 +1119,51 @@ Sets a global limit on number of alert rules that can be created. Default is -1

For more information about the Grafana 8 alerts, refer to [Unified Alerting]({{< relref "../alerting/unified-alerting/_index.md" >}}).

### admin_config_poll_interval_seconds
### admin_config_poll_interval

Specify the frequency of polling for admin config changes. The default value is `60`.
Specify the frequency of polling for admin config changes. The default value is `60s`.

The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
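As an aside (not part of this diff): these interval strings are parsed with Grafana's `gtime.ParseDuration` helper, which this commit wires up in `pkg/setting/setting_unified_alerting.go` further below. Unlike the standard library's `time.ParseDuration`, it accepts the `d` suffix. A minimal sketch, assuming the helper keeps the signature `ParseDuration(string) (time.Duration, error)`:

```go
package main

import (
	"fmt"

	"github.com/grafana/grafana/pkg/components/gtime"
)

func main() {
	// "1d" is not a unit the standard library accepts; gtime handles it.
	for _, s := range []string{"200ms", "30s", "1m", "1d"} {
		d, err := gtime.ParseDuration(s)
		if err != nil {
			fmt.Println(s, "->", err)
			continue
		}
		fmt.Println(s, "->", d)
	}
}
```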
### alertmanager_config_poll_interval

Specify the frequency of polling for Alertmanager config changes. The default value is `60s`.

The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

### ha_listen_address

Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port. The default value is `0.0.0.0:9094`.

### ha_advertise_address

Explicit address/hostname and port to advertise to other Grafana instances. The port is used for both TCP and UDP.

### ha_peers

Comma-separated list of initial instances (in the format host:port) that will form the HA cluster. Configuring this setting will enable High Availability mode for alerting.

### ha_peer_timeout

Time to wait for an instance to send a notification via the Alertmanager. In HA, each Grafana instance will
be assigned a position (e.g. 0, 1). We then multiply this position with the timeout to indicate how long each
instance should wait before sending the notification, to take replication lag into account. The default value is `15s`.

The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
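To make the position-based wait concrete, here is a small illustrative sketch (not part of the commit; `15s` is just the default `ha_peer_timeout`):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	peerTimeout := 15 * time.Second // ha_peer_timeout
	// Each Grafana instance is assigned a position in the cluster.
	for position := 0; position < 3; position++ {
		wait := time.Duration(position) * peerTimeout
		fmt.Printf("instance at position %d waits %s before sending notifications\n", position, wait)
	}
}
```

The instance at position 0 notifies immediately; by the time a higher-positioned instance would fire, the notification log has normally propagated the delivery, so the duplicate send is suppressed.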
### ha_gossip_interval

The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated
across the cluster more quickly at the expense of increased bandwidth usage. The default value is `200ms`.

The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

### ha_push_pull_interval

The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
across larger clusters at the expense of increased bandwidth usage. The default value is `60s`.

The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

<hr>
@@ -52,6 +52,7 @@ type Scheduler struct {
}

type MultiOrgAlertmanager struct {
	Registerer               prometheus.Registerer
	ActiveConfigurations     prometheus.Gauge
	DiscoveredConfigurations prometheus.Gauge
	registries               *OrgRegistries
@@ -178,6 +179,7 @@ func newStateMetrics(r prometheus.Registerer) *State {

func newMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager {
	return &MultiOrgAlertmanager{
		Registerer: r,
		registries: NewOrgRegistries(),
		DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Namespace: Namespace,
@@ -84,6 +84,8 @@ type AlertNG struct {
}

func (ng *AlertNG) init() error {
	var err error

	baseInterval := ng.Cfg.AlertingBaseInterval
	if baseInterval <= 0 {
		baseInterval = defaultBaseIntervalSeconds
@@ -97,7 +99,11 @@ func (ng *AlertNG) init() error {
		Logger: ng.Log,
	}

	ng.MultiOrgAlertmanager = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store, ng.KVStore, ng.Metrics.GetMultiOrgAlertmanagerMetrics())
	multiOrgMetrics := ng.Metrics.GetMultiOrgAlertmanagerMetrics()
	ng.MultiOrgAlertmanager, err = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store, ng.KVStore, multiOrgMetrics, log.New("ngalert.multiorg.alertmanager"))
	if err != nil {
		return err
	}

	// Let's make sure we're able to complete an initial sync of Alertmanagers before we start the alerting components.
	if err := ng.MultiOrgAlertmanager.LoadAndSyncAlertmanagersForOrgs(context.Background()); err != nil {
@@ -15,6 +15,7 @@ import (

	gokit_log "github.com/go-kit/kit/log"
	amv2 "github.com/prometheus/alertmanager/api/v2/models"
	"github.com/prometheus/alertmanager/cluster"
	"github.com/prometheus/alertmanager/dispatch"
	"github.com/prometheus/alertmanager/inhibit"
	"github.com/prometheus/alertmanager/nflog"
@@ -24,6 +25,7 @@ import (
	"github.com/prometheus/alertmanager/silence"
	"github.com/prometheus/alertmanager/template"
	"github.com/prometheus/alertmanager/types"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/model"

	"github.com/grafana/grafana/pkg/components/securejsondata"
@@ -77,9 +79,16 @@ const (
`
)

type ClusterPeer interface {
	AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel
	Position() int
	WaitReady(context.Context) error
}

type Alertmanager struct {
	logger      log.Logger
	gokitLogger gokit_log.Logger
	OrgID       int64

	Settings *setting.Cfg
	Store    store.AlertingStore
@@ -90,6 +99,8 @@ type Alertmanager struct {
	marker      types.Marker
	alerts      *mem.Alerts
	route       *dispatch.Route
	peer        ClusterPeer
	peerTimeout time.Duration

	dispatcher *dispatch.Dispatcher
	inhibitor  *inhibit.Inhibitor
@@ -111,7 +122,7 @@ type Alertmanager struct {
	orgID int64
}

func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, m *metrics.Alertmanager) (*Alertmanager, error) {
func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, peer ClusterPeer, m *metrics.Alertmanager) (*Alertmanager, error) {
	am := &Alertmanager{
		Settings: cfg,
		stopc:    make(chan struct{}),
@@ -120,6 +131,8 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k
		stageMetrics:      notify.NewMetrics(m.Registerer),
		dispatcherMetrics: dispatch.NewDispatcherMetrics(false, m.Registerer),
		Store:             store,
		peer:              peer,
		peerTimeout:       cfg.HAPeerTimeout,
		Metrics:           m,
		orgID:             orgID,
	}
@@ -148,6 +161,9 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k
	if err != nil {
		return nil, fmt.Errorf("unable to initialize the notification log component of alerting: %w", err)
	}
	c := am.peer.AddState(fmt.Sprintf("notificationlog:%d", am.OrgID), am.notificationLog, m.Registerer)
	am.notificationLog.SetBroadcast(c.Broadcast)

	// Initialize silences
	am.silences, err = silence.New(silence.Options{
		Metrics: m.Registerer,
@@ -158,6 +174,9 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k
		return nil, fmt.Errorf("unable to initialize the silencing component of alerting: %w", err)
	}

	c = am.peer.AddState(fmt.Sprintf("silences:%d", am.OrgID), am.silences, m.Registerer)
	am.silences.SetBroadcast(c.Broadcast)

	am.wg.Add(1)
	go func() {
		am.silences.Maintenance(15*time.Minute, silencesFilePath, am.stopc, func() (int64, error) {
@@ -392,15 +411,16 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig
	am.inhibitor = inhibit.NewInhibitor(am.alerts, cfg.AlertmanagerConfig.InhibitRules, am.marker, am.gokitLogger)
	am.silencer = silence.NewSilencer(am.silences, am.marker, am.gokitLogger)

	meshStage := notify.NewGossipSettleStage(am.peer)
	inhibitionStage := notify.NewMuteStage(am.inhibitor)
	silencingStage := notify.NewMuteStage(am.silencer)
	for name := range integrationsMap {
		stage := am.createReceiverStage(name, integrationsMap[name], waitFunc, am.notificationLog)
		routingStage[name] = notify.MultiStage{silencingStage, inhibitionStage, stage}
		stage := am.createReceiverStage(name, integrationsMap[name], am.waitFunc, am.notificationLog)
		routingStage[name] = notify.MultiStage{meshStage, silencingStage, inhibitionStage, stage}
	}

	am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil)
	am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, &nilLimits{}, am.gokitLogger, am.dispatcherMetrics)
	am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, am.timeoutFunc, &nilLimits{}, am.gokitLogger, am.dispatcherMetrics)

	am.wg.Add(1)
	go func() {
@@ -701,21 +721,17 @@ func (am *Alertmanager) createReceiverStage(name string, integrations []notify.I
	return fs
}

func waitFunc() time.Duration {
	// When it's a single instance, we don't need additional wait. The routing policies will have their own group wait.
	// We need >0 wait here in case we have peers to sync the notification state with. 0 wait in that case can result
	// in duplicate notifications being sent.
	// TODO: we have setting.AlertingNotificationTimeout in legacy settings. Either use that or separate set of config
	// for clustering with intuitive name, like "PeerTimeout".
	return 0
func (am *Alertmanager) waitFunc() time.Duration {
	return time.Duration(am.peer.Position()) * am.peerTimeout
}

func timeoutFunc(d time.Duration) time.Duration {
	//TODO: What does MinTimeout means here?
func (am *Alertmanager) timeoutFunc(d time.Duration) time.Duration {
	// time.Duration d relates to the receiver's group_interval. Even with a group interval of 1s,
	// we need to make sure (non-position-0) peers in the cluster wait before flushing the notifications.
	if d < notify.MinTimeout {
		d = notify.MinTimeout
	}
	return d + waitFunc()
	return d + am.waitFunc()
}

type nilLimits struct{}
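Taken together, `waitFunc` and `timeoutFunc` above give each peer an effective flush timeout of roughly max(group_interval, MinTimeout) + position × peer timeout. A self-contained sketch of that arithmetic (not part of the commit; the 10s floor mirrors `notify.MinTimeout` and is an assumption here, as is the helper name `effectiveTimeout`):

```go
package main

import (
	"fmt"
	"time"
)

// minTimeout stands in for notify.MinTimeout from the Alertmanager
// notify package (assumed to be 10s here, for illustration only).
const minTimeout = 10 * time.Second

// effectiveTimeout reproduces the shape of am.timeoutFunc: floor the
// receiver's group_interval at minTimeout, then add the peer wait.
func effectiveTimeout(groupInterval time.Duration, position int, peerTimeout time.Duration) time.Duration {
	d := groupInterval
	if d < minTimeout {
		d = minTimeout
	}
	return d + time.Duration(position)*peerTimeout
}

func main() {
	fmt.Println(effectiveTimeout(1*time.Second, 0, 15*time.Second))  // 10s
	fmt.Println(effectiveTimeout(1*time.Second, 1, 15*time.Second))  // 25s
	fmt.Println(effectiveTimeout(30*time.Second, 1, 15*time.Second)) // 45s
}
```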
@@ -48,7 +48,7 @@ func setupAMTest(t *testing.T) *Alertmanager {
	}

	kvStore := newFakeKVStore(t)
	am, err := newAlertmanager(1, cfg, s, kvStore, m)
	am, err := newAlertmanager(1, cfg, s, kvStore, &NilPeer{}, m)
	require.NoError(t, err)
	return am
}
@@ -6,6 +6,12 @@ import (
	"sync"
	"time"

	"github.com/grafana/grafana/pkg/services/ngalert/logging"

	gokit_log "github.com/go-kit/kit/log"
	"github.com/prometheus/alertmanager/cluster"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/grafana/pkg/infra/kvstore"
	"github.com/grafana/grafana/pkg/infra/log"
	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
@@ -14,7 +20,6 @@ import (
)

var (
	SyncOrgsPollInterval    = 1 * time.Minute
	ErrNoAlertmanagerForOrg = fmt.Errorf("Alertmanager does not exist for this organization")
	ErrAlertmanagerNotReady = fmt.Errorf("Alertmanager is not ready yet")
)
@@ -26,6 +31,10 @@ type MultiOrgAlertmanager struct {
	settings *setting.Cfg
	logger   log.Logger

	// clusterPeer represents the clustering peers of Alertmanagers between Grafana instances.
	peer         ClusterPeer
	settleCancel context.CancelFunc

	configStore store.AlertingStore
	orgStore    store.OrgStore
	kvStore     kvstore.KVStore
@@ -33,16 +42,52 @@ type MultiOrgAlertmanager struct {
	metrics *metrics.MultiOrgAlertmanager
}

func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore, kvStore kvstore.KVStore, m *metrics.MultiOrgAlertmanager) *MultiOrgAlertmanager {
	return &MultiOrgAlertmanager{
func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore, kvStore kvstore.KVStore, m *metrics.MultiOrgAlertmanager, l log.Logger) (*MultiOrgAlertmanager, error) {
	moa := &MultiOrgAlertmanager{
		logger:        l,
		settings:      cfg,
		logger:        log.New("multiorg.alertmanager"),
		alertmanagers: map[int64]*Alertmanager{},
		configStore:   configStore,
		orgStore:      orgStore,
		kvStore:       kvStore,
		metrics:       m,
	}

	clusterLogger := gokit_log.With(gokit_log.NewLogfmtLogger(logging.NewWrapper(l)), "component", "cluster")
	moa.peer = &NilPeer{}
	if len(cfg.HAPeers) > 0 {
		peer, err := cluster.Create(
			clusterLogger,
			m.Registerer,
			cfg.HAListenAddr,
			cfg.HAAdvertiseAddr,
			cfg.HAPeers, // peers
			true,
			cfg.HAPushPullInterval,
			cfg.HAGossipInterval,
			cluster.DefaultTcpTimeout,
			cluster.DefaultProbeTimeout,
			cluster.DefaultProbeInterval,
			nil,
		)

		if err != nil {
			return nil, fmt.Errorf("unable to initialize gossip mesh: %w", err)
		}

		err = peer.Join(cluster.DefaultReconnectInterval, cluster.DefaultReconnectTimeout)
		if err != nil {
			l.Error("msg", "unable to join gossip mesh while initializing cluster for high availability mode", "err", err)
		}
		// Attempt to verify the number of peers for 30s every 2s. The risk here is that we send a notification "too soon".
		// This should _never_ happen given we share the notification log via the database, so the risk of double notification is very low.
		var ctx context.Context
		ctx, moa.settleCancel = context.WithTimeout(context.Background(), 30*time.Second)
		go peer.Settle(ctx, cluster.DefaultGossipInterval*10)
		moa.peer = peer
	}

	return moa, nil
}

func (moa *MultiOrgAlertmanager) Run(ctx context.Context) error {
@@ -53,7 +98,7 @@ func (moa *MultiOrgAlertmanager) Run(ctx context.Context) error {
		case <-ctx.Done():
			moa.StopAndWait()
			return nil
		case <-time.After(SyncOrgsPollInterval):
		case <-time.After(moa.settings.AlertmanagerConfigPollInterval):
			if err := moa.LoadAndSyncAlertmanagersForOrgs(ctx); err != nil {
				moa.logger.Error("error while synchronizing Alertmanager orgs", "err", err)
			}
@@ -90,7 +135,7 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(orgIDs []int64) {
			// To export them, we need to translate the metrics from each individual registry and
			// then aggregate them on the main registry.
			m := metrics.NewAlertmanagerMetrics(moa.metrics.GetOrCreateOrgRegistry(orgID))
			am, err := newAlertmanager(orgID, moa.settings, moa.configStore, moa.kvStore, m)
			am, err := newAlertmanager(orgID, moa.settings, moa.configStore, moa.kvStore, moa.peer, m)
			if err != nil {
				moa.logger.Error("unable to create Alertmanager for org", "org", orgID, "err", err)
			}
@@ -130,6 +175,14 @@ func (moa *MultiOrgAlertmanager) StopAndWait() {
	for _, am := range moa.alertmanagers {
		am.StopAndWait()
	}

	p, ok := moa.peer.(*cluster.Peer)
	if ok {
		moa.settleCancel()
		if err := p.Leave(10 * time.Second); err != nil {
			moa.logger.Warn("unable to leave the gossip mesh", "err", err)
		}
	}
}

// AlertmanagerFor returns the Alertmanager instance for the organization provided.
@@ -150,3 +203,16 @@ func (moa *MultiOrgAlertmanager) AlertmanagerFor(orgID int64) (*Alertmanager, er

	return orgAM, nil
}

// NilPeer and NilChannel implement the Alertmanager clustering interface.
type NilPeer struct{}

func (p *NilPeer) Position() int                   { return 0 }
func (p *NilPeer) WaitReady(context.Context) error { return nil }
func (p *NilPeer) AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel {
	return &NilChannel{}
}

type NilChannel struct{}

func (c *NilChannel) Broadcast([]byte) {}
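A small illustrative addition (not in the commit): because `ClusterPeer` is deliberately shaped after the method set of the Alertmanager library's `*cluster.Peer`, compile-time assertions can document that both the real peer and the no-op `NilPeer` satisfy it — assuming the vendored Alertmanager version exposes `Position`, `WaitReady(context.Context)`, and `AddState` with exactly these signatures:

```go
// Compile-time checks (illustrative only): both peers implement ClusterPeer.
var (
	_ ClusterPeer = &NilPeer{}
	_ ClusterPeer = &cluster.Peer{}
)
```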
@@ -8,6 +8,7 @@ import (
	"testing"
	"time"

	"github.com/grafana/grafana/pkg/infra/log"
	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
	"github.com/grafana/grafana/pkg/services/ngalert/models"
	"github.com/grafana/grafana/pkg/setting"
@@ -18,7 +19,6 @@ import (
)

func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {
	t.Skipf("Skipping multiorg alertmanager tests for now")
	configStore := &FakeConfigStore{
		configs: map[int64]*models.AlertConfiguration{},
	}
@@ -28,12 +28,15 @@ func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {

	tmpDir, err := ioutil.TempDir("", "test")
	require.NoError(t, err)

	SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
	kvStore := newFakeKVStore(t)
	reg := prometheus.NewPedanticRegistry()
	m := metrics.NewNGAlert(reg)
	mam := NewMultiOrgAlertmanager(&setting.Cfg{DataPath: tmpDir}, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics())
	cfg := &setting.Cfg{
		DataPath:                       tmpDir,
		AlertmanagerConfigPollInterval: 3 * time.Minute, // do not poll in tests
	}
	mam, err := NewMultiOrgAlertmanager(cfg, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics(), log.New("testlogger"))
	require.NoError(t, err)
	ctx := context.Background()

	t.Cleanup(cleanOrgDirectories(tmpDir, t))
@@ -82,22 +85,23 @@ grafana_alerting_discovered_configurations 4
}

func TestMultiOrgAlertmanager_AlertmanagerFor(t *testing.T) {
	t.Skipf("Skipping multiorg alertmanager tests for now")
	configStore := &FakeConfigStore{
		configs: map[int64]*models.AlertConfiguration{},
	}
	orgStore := &FakeOrgStore{
		orgs: []int64{1, 2, 3},
	}

	tmpDir, err := ioutil.TempDir("", "test")
	require.NoError(t, err)

	SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
	cfg := &setting.Cfg{
		DataPath:                       tmpDir,
		AlertmanagerConfigPollInterval: 3 * time.Minute, // do not poll in tests
	}
	kvStore := newFakeKVStore(t)
	reg := prometheus.NewPedanticRegistry()
	m := metrics.NewNGAlert(reg)
	mam := NewMultiOrgAlertmanager(&setting.Cfg{DataPath: tmpDir}, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics())
	mam, err := NewMultiOrgAlertmanager(cfg, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics(), log.New("testlogger"))
	require.NoError(t, err)
	ctx := context.Background()

	t.Cleanup(cleanOrgDirectories(tmpDir, t))
@@ -231,6 +231,8 @@ func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, ac
	mockedClock := clock.NewMock()
	logger := log.New("ngalert schedule test")
	m := metrics.NewNGAlert(prometheus.NewPedanticRegistry())
	moa, err := notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, &notifier.FakeConfigStore{}, &notifier.FakeOrgStore{}, &notifier.FakeKVStore{}, nil, log.New("testlogger"))
	require.NoError(t, err)
	schedCfg := SchedulerCfg{
		C:            mockedClock,
		BaseInterval: time.Second,
@@ -239,7 +241,7 @@ func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, ac
		RuleStore:        rs,
		InstanceStore:    is,
		AdminConfigStore: acs,
		MultiOrgNotifier: notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, &notifier.FakeConfigStore{}, &notifier.FakeOrgStore{}, &notifier.FakeKVStore{}, nil),
		MultiOrgNotifier: moa,
		Logger:           logger,
		Metrics:          m.GetSchedulerMetrics(),
		AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
@@ -18,15 +18,14 @@ import (
	"strings"
	"time"

	"github.com/gobwas/glob"

	"github.com/prometheus/common/model"
	"gopkg.in/ini.v1"

	"github.com/grafana/grafana-aws-sdk/pkg/awsds"
	"github.com/grafana/grafana/pkg/components/gtime"
	"github.com/grafana/grafana/pkg/infra/log"
	"github.com/grafana/grafana/pkg/util"

	"github.com/gobwas/glob"
	"github.com/prometheus/common/model"
	"gopkg.in/ini.v1"
)

type Scheme string
@@ -420,7 +419,14 @@ type Cfg struct {
	GeomapEnableCustomBaseLayers bool

	// Unified Alerting
	AdminConfigPollInterval        time.Duration
	AlertmanagerConfigPollInterval time.Duration
	HAListenAddr                   string
	HAAdvertiseAddr                string
	HAPeers                        []string
	HAPeerTimeout                  time.Duration
	HAGossipInterval               time.Duration
	HAPushPullInterval             time.Duration
}

// IsLiveConfigEnabled returns true if live should be able to save configs to SQL tables
@@ -916,8 +922,7 @@ func (cfg *Cfg) Load(args CommandLineArgs) error {
	if err := readAlertingSettings(iniFile); err != nil {
		return err
	}

	if err := cfg.readUnifiedAlertingSettings(iniFile); err != nil {
	if err := cfg.ReadUnifiedAlertingSettings(iniFile); err != nil {
		return err
	}

@@ -1374,13 +1379,6 @@ func (cfg *Cfg) readRenderingSettings(iniFile *ini.File) error {
	return nil
}

func (cfg *Cfg) readUnifiedAlertingSettings(iniFile *ini.File) error {
	ua := iniFile.Section("unified_alerting")
	s := ua.Key("admin_config_poll_interval_seconds").MustInt(60)
	cfg.AdminConfigPollInterval = time.Second * time.Duration(s)
	return nil
}

func readAlertingSettings(iniFile *ini.File) error {
	alerting := iniFile.Section("alerting")
	AlertingEnabled = alerting.Key("enabled").MustBool(true)
pkg/setting/setting_unified_alerting.go (new file, 57 lines)
@@ -0,0 +1,57 @@
package setting

import (
	"strings"
	"time"

	"github.com/grafana/grafana/pkg/components/gtime"

	"github.com/prometheus/alertmanager/cluster"
	"gopkg.in/ini.v1"
)

const (
	AlertmanagerDefaultClusterAddr          = "0.0.0.0:9094"
	AlertmanagerDefaultPeerTimeout          = 15 * time.Second
	AlertmanagerDefaultGossipInterval       = cluster.DefaultGossipInterval
	AlertmanagerDefaultPushPullInterval     = cluster.DefaultPushPullInterval
	SchedulerDefaultAdminConfigPollInterval = 60 * time.Second
	AlertmanagerDefaultConfigPollInterval   = 60 * time.Second
)

func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
	ua := iniFile.Section("unified_alerting")
	var err error
	cfg.AdminConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "admin_config_poll_interval", (SchedulerDefaultAdminConfigPollInterval).String()))
	if err != nil {
		return err
	}
	cfg.AlertmanagerConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "alertmanager_config_poll_interval", (AlertmanagerDefaultConfigPollInterval).String()))
	if err != nil {
		return err
	}
	cfg.HAPeerTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_peer_timeout", (AlertmanagerDefaultPeerTimeout).String()))
	if err != nil {
		return err
	}
	cfg.HAGossipInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_gossip_interval", (AlertmanagerDefaultGossipInterval).String()))
	if err != nil {
		return err
	}
	cfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (AlertmanagerDefaultPushPullInterval).String()))
	if err != nil {
		return err
	}
	cfg.HAListenAddr = ua.Key("ha_listen_address").MustString(AlertmanagerDefaultClusterAddr)
	cfg.HAAdvertiseAddr = ua.Key("ha_advertise_address").MustString("")
	peers := ua.Key("ha_peers").MustString("")
	cfg.HAPeers = make([]string, 0)
	if peers != "" {
		for _, peer := range strings.Split(peers, ",") {
			peer = strings.TrimSpace(peer)
			cfg.HAPeers = append(cfg.HAPeers, peer)
		}
	}

	return nil
}
pkg/setting/setting_unified_alerting_test.go (new file, 39 lines)
@@ -0,0 +1,39 @@
package setting

import (
	"testing"
	"time"

	"github.com/stretchr/testify/require"
)

func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) {
	cfg := NewCfg()
	err := cfg.Load(CommandLineArgs{HomePath: "../../", Config: "../../conf/defaults.ini"})
	require.NoError(t, err)

	// It sets the correct defaults.
	{
		require.Equal(t, 60*time.Second, cfg.AdminConfigPollInterval)
		require.Equal(t, 60*time.Second, cfg.AlertmanagerConfigPollInterval)
		require.Equal(t, 15*time.Second, cfg.HAPeerTimeout)
		require.Equal(t, "0.0.0.0:9094", cfg.HAListenAddr)
		require.Equal(t, "", cfg.HAAdvertiseAddr)
		require.Len(t, cfg.HAPeers, 0)
		require.Equal(t, 200*time.Millisecond, cfg.HAGossipInterval)
		require.Equal(t, 60*time.Second, cfg.HAPushPullInterval)
	}

	// With peers set, it correctly parses them.
	{
		require.Len(t, cfg.HAPeers, 0)
		s, err := cfg.Raw.NewSection("unified_alerting")
		require.NoError(t, err)
		_, err = s.NewKey("ha_peers", "hostname1:9090,hostname2:9090,hostname3:9090")
		require.NoError(t, err)

		require.NoError(t, cfg.ReadUnifiedAlertingSettings(cfg.Raw))
		require.Len(t, cfg.HAPeers, 3)
		require.ElementsMatch(t, []string{"hostname1:9090", "hostname2:9090", "hostname3:9090"}, cfg.HAPeers)
	}
}
@@ -21,9 +21,9 @@ import (

func TestAdminConfiguration_SendingToExternalAlertmanagers(t *testing.T) {
	dir, path := testinfra.CreateGrafDir(t, testinfra.GrafanaOpts{
		EnableFeatureToggles:              []string{"ngalert"},
		DisableAnonymous:                  true,
		NGAlertAdminConfigIntervalSeconds: 2,
		EnableFeatureToggles:           []string{"ngalert"},
		DisableAnonymous:               true,
		NGAlertAdminConfigPollInterval: 2 * time.Second,
	})

	grafanaListedAddr, s := testinfra.StartGrafana(t, dir, path)
@@ -8,8 +8,6 @@ import (
	"testing"
	"time"

	"github.com/grafana/grafana/pkg/services/ngalert/notifier"

	"github.com/grafana/grafana/pkg/bus"
	"github.com/grafana/grafana/pkg/models"
	"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
@@ -19,16 +17,10 @@ import (
)

func TestAlertmanagerConfigurationIsTransactional(t *testing.T) {
	// TODO: We need a reliable way to ensure Alertmanagers have synced correctly.
	// For now, make them sync quicker.
	p := notifier.SyncOrgsPollInterval
	notifier.SyncOrgsPollInterval = 2 * time.Second
	t.Cleanup(func() {
		notifier.SyncOrgsPollInterval = p
	})
	dir, path := testinfra.CreateGrafDir(t, testinfra.GrafanaOpts{
		EnableFeatureToggles: []string{"ngalert"},
		DisableAnonymous:     true,
		EnableFeatureToggles:                  []string{"ngalert"},
		NGAlertAlertmanagerConfigPollInterval: 2 * time.Second,
		DisableAnonymous:                      true,
	})

	grafanaListedAddr, store := testinfra.StartGrafana(t, dir, path)
@@ -10,6 +10,7 @@ import (
	"path/filepath"
	"strings"
	"testing"
	"time"

	"github.com/grafana/grafana/pkg/api"
	"github.com/grafana/grafana/pkg/infra/fs"
@@ -204,13 +205,18 @@ func CreateGrafDir(t *testing.T, opts ...GrafanaOpts) (string, string) {
		_, err = featureSection.NewKey("enable", strings.Join(o.EnableFeatureToggles, " "))
		require.NoError(t, err)
	}
	if o.NGAlertAdminConfigIntervalSeconds != 0 {
		ngalertingSection, err := cfg.NewSection("ngalerting")
	if o.NGAlertAdminConfigPollInterval != 0 {
		ngalertingSection, err := cfg.NewSection("unified_alerting")
		require.NoError(t, err)
		_, err = ngalertingSection.NewKey("admin_config_poll_interval_seconds", fmt.Sprintf("%d", o.NGAlertAdminConfigIntervalSeconds))
		_, err = ngalertingSection.NewKey("admin_config_poll_interval", o.NGAlertAdminConfigPollInterval.String())
		require.NoError(t, err)
	}
	if o.NGAlertAlertmanagerConfigPollInterval != 0 {
		ngalertingSection, err := cfg.NewSection("unified_alerting")
		require.NoError(t, err)
		_, err = ngalertingSection.NewKey("alertmanager_config_poll_interval", o.NGAlertAlertmanagerConfigPollInterval.String())
		require.NoError(t, err)
	}

	if o.AnonymousUserRole != "" {
		_, err = anonSect.NewKey("org_role", string(o.AnonymousUserRole))
		require.NoError(t, err)
@@ -252,13 +258,14 @@ func CreateGrafDir(t *testing.T, opts ...GrafanaOpts) (string, string) {
}

type GrafanaOpts struct {
	EnableCSP                         bool
	EnableFeatureToggles              []string
	NGAlertAdminConfigIntervalSeconds int
	AnonymousUserRole                 models.RoleType
	EnableQuota                       bool
	DisableAnonymous                  bool
	CatalogAppEnabled                 bool
	ViewersCanEdit                    bool
	PluginAdminEnabled                bool
	EnableCSP                             bool
	EnableFeatureToggles                  []string
	NGAlertAdminConfigPollInterval        time.Duration
	NGAlertAlertmanagerConfigPollInterval time.Duration
	AnonymousUserRole                     models.RoleType
	EnableQuota                           bool
	DisableAnonymous                      bool
	CatalogAppEnabled                     bool
	ViewersCanEdit                        bool
	PluginAdminEnabled                    bool
}