mirror of
https://github.com/grafana/grafana.git
synced 2025-02-14 09:33:34 -06:00
* Alerting: Refactor & fix unified alerting metrics structure Fixes and refactors the metrics structure we have for the ngalert service. Now, each component has its own metric struct that includes JUST the metrics it uses. Additionally, I have fixed the configuration metrics and added new metrics to determine if we have discovered and started all the necessary configurations of an instance. This allows us to alert on `grafana_alerting_discovered_configurations - grafana_alerting_active_configurations != 0` to know whether an alertmanager instance did not start successfully.
129 lines
4.9 KiB
Go
129 lines
4.9 KiB
Go
package notifier
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
|
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
|
"github.com/grafana/grafana/pkg/setting"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/testutil"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {
|
|
t.Skipf("Skipping multiorg alertmanager tests for now")
|
|
configStore := &FakeConfigStore{
|
|
configs: map[int64]*models.AlertConfiguration{},
|
|
}
|
|
orgStore := &FakeOrgStore{
|
|
orgs: []int64{1, 2, 3},
|
|
}
|
|
SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
|
|
kvStore := newFakeKVStore(t)
|
|
reg := prometheus.NewPedanticRegistry()
|
|
m := metrics.NewNGAlert(reg)
|
|
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics())
|
|
ctx := context.Background()
|
|
|
|
// Ensure that one Alertmanager is created per org.
|
|
{
|
|
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
|
require.Len(t, mam.alertmanagers, 3)
|
|
require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
|
|
# HELP grafana_alerting_active_configurations The number of active Alertmanager configurations.
|
|
# TYPE grafana_alerting_active_configurations gauge
|
|
grafana_alerting_active_configurations 3
|
|
# HELP grafana_alerting_discovered_configurations The number of organizations we've discovered that require an Alertmanager configuration.
|
|
# TYPE grafana_alerting_discovered_configurations gauge
|
|
grafana_alerting_discovered_configurations 3
|
|
`), "grafana_alerting_discovered_configurations", "grafana_alerting_active_configurations"))
|
|
}
|
|
// When an org is removed, it should detect it.
|
|
{
|
|
orgStore.orgs = []int64{1, 3}
|
|
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
|
require.Len(t, mam.alertmanagers, 2)
|
|
require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
|
|
# HELP grafana_alerting_active_configurations The number of active Alertmanager configurations.
|
|
# TYPE grafana_alerting_active_configurations gauge
|
|
grafana_alerting_active_configurations 2
|
|
# HELP grafana_alerting_discovered_configurations The number of organizations we've discovered that require an Alertmanager configuration.
|
|
# TYPE grafana_alerting_discovered_configurations gauge
|
|
grafana_alerting_discovered_configurations 2
|
|
`), "grafana_alerting_discovered_configurations", "grafana_alerting_active_configurations"))
|
|
}
|
|
// if the org comes back, it should detect it.
|
|
{
|
|
orgStore.orgs = []int64{1, 2, 3, 4}
|
|
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
|
require.Len(t, mam.alertmanagers, 4)
|
|
require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
|
|
# HELP grafana_alerting_active_configurations The number of active Alertmanager configurations.
|
|
# TYPE grafana_alerting_active_configurations gauge
|
|
grafana_alerting_active_configurations 4
|
|
# HELP grafana_alerting_discovered_configurations The number of organizations we've discovered that require an Alertmanager configuration.
|
|
# TYPE grafana_alerting_discovered_configurations gauge
|
|
grafana_alerting_discovered_configurations 4
|
|
`), "grafana_alerting_discovered_configurations", "grafana_alerting_active_configurations"))
|
|
}
|
|
}
|
|
|
|
func TestMultiOrgAlertmanager_AlertmanagerFor(t *testing.T) {
|
|
t.Skipf("Skipping multiorg alertmanager tests for now")
|
|
configStore := &FakeConfigStore{
|
|
configs: map[int64]*models.AlertConfiguration{},
|
|
}
|
|
orgStore := &FakeOrgStore{
|
|
orgs: []int64{1, 2, 3},
|
|
}
|
|
|
|
SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
|
|
kvStore := newFakeKVStore(t)
|
|
reg := prometheus.NewPedanticRegistry()
|
|
m := metrics.NewNGAlert(reg)
|
|
mam := NewMultiOrgAlertmanager(&setting.Cfg{}, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics())
|
|
ctx := context.Background()
|
|
|
|
// Ensure that one Alertmanagers is created per org.
|
|
{
|
|
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
|
require.Len(t, mam.alertmanagers, 3)
|
|
}
|
|
|
|
// First, let's try to request an Alertmanager from an org that doesn't exist.
|
|
{
|
|
_, err := mam.AlertmanagerFor(5)
|
|
require.EqualError(t, err, ErrNoAlertmanagerForOrg.Error())
|
|
}
|
|
|
|
// Now, let's try to request an Alertmanager that is not ready.
|
|
{
|
|
// let's delete its "running config" to make it non-ready
|
|
mam.alertmanagers[1].config = nil
|
|
_, err := mam.AlertmanagerFor(1)
|
|
require.EqualError(t, err, ErrAlertmanagerNotReady.Error())
|
|
}
|
|
|
|
// With an Alertmanager that exists, it responds correctly.
|
|
{
|
|
am, err := mam.AlertmanagerFor(2)
|
|
require.NoError(t, err)
|
|
require.Equal(t, *am.GetStatus().VersionInfo.Version, "N/A")
|
|
require.Equal(t, am.orgID, int64(2))
|
|
require.NotNil(t, am.config)
|
|
}
|
|
|
|
// Let's now remove the previous queried organization.
|
|
orgStore.orgs = []int64{1, 3}
|
|
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
|
|
{
|
|
_, err := mam.AlertmanagerFor(2)
|
|
require.EqualError(t, err, ErrNoAlertmanagerForOrg.Error())
|
|
}
|
|
}
|