Alerting: Introduce a Mimir client as part of the Remote Alertmanager (#78357)

* Alerting: Introduce a Mimir client as part of the Remote Alertmanager

This is our first attempt at making Grafana communicate use Mimir as a backend - it uses a new set of APIs that we've developed on the Mimir side to upload the grafana configuration and alertmanager state so that it can then be ported over.

Codewise, we've introduced a couple of things:

A client to isolate in its own package all the communication that happens with Mimir
A few changes to the remote/alertmanager to include uploading the configuration and state when it starts
A few refactors that align a bit better with the design approach that we're thinking
An integration tests again these newly developed APIs using a custom image

---------

Signed-off-by: gotjosh <josue.abreu@gmail.com>
Co-authored-by: Santiago <santiagohernandez.1997@gmail.com>
This commit is contained in:
gotjosh 2023-11-23 16:59:36 +00:00 committed by GitHub
parent eedc19f9f0
commit 23fe8f4e9c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 547 additions and 92 deletions

View File

@ -802,7 +802,7 @@ services:
- commands: - commands:
- /bin/mimir -target=backend - /bin/mimir -target=backend
environment: {} environment: {}
image: grafana/mimir:latest image: us.gcr.io/kubernetes-dev/mimir:gotjosh-state-config-grafana-663a0ae78
name: mimir_backend name: mimir_backend
- environment: {} - environment: {}
image: redis:6.2.11-alpine image: redis:6.2.11-alpine
@ -980,7 +980,6 @@ steps:
- wire-install - wire-install
- wait-for-remote-alertmanager - wait-for-remote-alertmanager
environment: environment:
AM_PASSWORD: test
AM_TENANT_ID: test AM_TENANT_ID: test
AM_URL: http://mimir_backend:8080 AM_URL: http://mimir_backend:8080
image: golang:1.21.3-alpine image: golang:1.21.3-alpine
@ -1246,7 +1245,7 @@ services:
- commands: - commands:
- /bin/mimir -target=backend - /bin/mimir -target=backend
environment: {} environment: {}
image: grafana/mimir:latest image: us.gcr.io/kubernetes-dev/mimir:gotjosh-state-config-grafana-663a0ae78
name: mimir_backend name: mimir_backend
- environment: {} - environment: {}
image: redis:6.2.11-alpine image: redis:6.2.11-alpine
@ -2174,7 +2173,7 @@ services:
- commands: - commands:
- /bin/mimir -target=backend - /bin/mimir -target=backend
environment: {} environment: {}
image: grafana/mimir:latest image: us.gcr.io/kubernetes-dev/mimir:gotjosh-state-config-grafana-663a0ae78
name: mimir_backend name: mimir_backend
- environment: {} - environment: {}
image: redis:6.2.11-alpine image: redis:6.2.11-alpine
@ -2331,7 +2330,6 @@ steps:
- wire-install - wire-install
- wait-for-remote-alertmanager - wait-for-remote-alertmanager
environment: environment:
AM_PASSWORD: test
AM_TENANT_ID: test AM_TENANT_ID: test
AM_URL: http://mimir_backend:8080 AM_URL: http://mimir_backend:8080
image: golang:1.21.3-alpine image: golang:1.21.3-alpine
@ -3844,7 +3842,7 @@ services:
- commands: - commands:
- /bin/mimir -target=backend - /bin/mimir -target=backend
environment: {} environment: {}
image: grafana/mimir:latest image: us.gcr.io/kubernetes-dev/mimir:gotjosh-state-config-grafana-663a0ae78
name: mimir_backend name: mimir_backend
- environment: {} - environment: {}
image: redis:6.2.11-alpine image: redis:6.2.11-alpine
@ -3994,7 +3992,6 @@ steps:
- wire-install - wire-install
- wait-for-remote-alertmanager - wait-for-remote-alertmanager
environment: environment:
AM_PASSWORD: test
AM_TENANT_ID: test AM_TENANT_ID: test
AM_URL: http://mimir_backend:8080 AM_URL: http://mimir_backend:8080
image: golang:1.21.3-alpine image: golang:1.21.3-alpine
@ -4413,7 +4410,7 @@ steps:
- trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM plugins/slack - trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM plugins/slack
- trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM python:3.8 - trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM python:3.8
- trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM postgres:12.3-alpine - trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM postgres:12.3-alpine
- trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM grafana/mimir:latest - trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM us.gcr.io/kubernetes-dev/mimir:gotjosh-state-config-grafana-663a0ae78
- trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM mysql:5.7.39 - trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM mysql:5.7.39
- trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM mysql:8.0.32 - trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM mysql:8.0.32
- trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM redis:6.2.11-alpine - trivy --exit-code 0 --severity UNKNOWN,LOW,MEDIUM redis:6.2.11-alpine
@ -4447,7 +4444,7 @@ steps:
- trivy --exit-code 1 --severity HIGH,CRITICAL plugins/slack - trivy --exit-code 1 --severity HIGH,CRITICAL plugins/slack
- trivy --exit-code 1 --severity HIGH,CRITICAL python:3.8 - trivy --exit-code 1 --severity HIGH,CRITICAL python:3.8
- trivy --exit-code 1 --severity HIGH,CRITICAL postgres:12.3-alpine - trivy --exit-code 1 --severity HIGH,CRITICAL postgres:12.3-alpine
- trivy --exit-code 1 --severity HIGH,CRITICAL grafana/mimir:latest - trivy --exit-code 1 --severity HIGH,CRITICAL us.gcr.io/kubernetes-dev/mimir:gotjosh-state-config-grafana-663a0ae78
- trivy --exit-code 1 --severity HIGH,CRITICAL mysql:5.7.39 - trivy --exit-code 1 --severity HIGH,CRITICAL mysql:5.7.39
- trivy --exit-code 1 --severity HIGH,CRITICAL mysql:8.0.32 - trivy --exit-code 1 --severity HIGH,CRITICAL mysql:8.0.32
- trivy --exit-code 1 --severity HIGH,CRITICAL redis:6.2.11-alpine - trivy --exit-code 1 --severity HIGH,CRITICAL redis:6.2.11-alpine
@ -4685,6 +4682,6 @@ kind: secret
name: gcr_credentials name: gcr_credentials
--- ---
kind: signature kind: signature
hmac: 64302d9316abab775d7ec1132f26ea4f1829558fa0bfd85812597182c1abe61a hmac: 0e9f67184e414d3afbda81c86dfa58b3c2cf7c1a668be5313c851ff5f42de44d
... ...

View File

@ -184,7 +184,7 @@ test-go-integration: ## Run integration tests for backend with flags.
test-go-integration-alertmanager: ## Run integration tests for the remote alertmanager (config taken from the mimir_backend block). test-go-integration-alertmanager: ## Run integration tests for the remote alertmanager (config taken from the mimir_backend block).
@echo "test remote alertmanager integration tests" @echo "test remote alertmanager integration tests"
$(GO) clean -testcache $(GO) clean -testcache
AM_URL=http://localhost:8080 AM_TENANT_ID=test AM_PASSWORD=test \ AM_URL=http://localhost:8080 AM_TENANT_ID=test \
$(GO) test -count=1 -run "^TestIntegrationRemoteAlertmanager" -covermode=atomic -timeout=5m ./pkg/services/ngalert/... $(GO) test -count=1 -run "^TestIntegrationRemoteAlertmanager" -covermode=atomic -timeout=5m ./pkg/services/ngalert/...
.PHONY: test-go-integration-postgres .PHONY: test-go-integration-postgres

View File

@ -2,8 +2,8 @@ package remote
import ( import (
"context" "context"
"crypto/md5"
"fmt" "fmt"
"io"
"net/http" "net/http"
"net/url" "net/url"
"strings" "strings"
@ -15,6 +15,7 @@ import (
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/notifier" "github.com/grafana/grafana/pkg/services/ngalert/notifier"
mimirClient "github.com/grafana/grafana/pkg/services/ngalert/remote/client"
"github.com/grafana/grafana/pkg/services/ngalert/sender" "github.com/grafana/grafana/pkg/services/ngalert/sender"
amclient "github.com/prometheus/alertmanager/api/v2/client" amclient "github.com/prometheus/alertmanager/api/v2/client"
amalert "github.com/prometheus/alertmanager/api/v2/client/alert" amalert "github.com/prometheus/alertmanager/api/v2/client/alert"
@ -32,6 +33,7 @@ type Alertmanager struct {
url string url string
amClient *amclient.AlertmanagerAPI amClient *amclient.AlertmanagerAPI
mimirClient mimirClient.MimirClient
httpClient *http.Client httpClient *http.Client
ready bool ready bool
sender *sender.ExternalAlertmanager sender *sender.ExternalAlertmanager
@ -45,10 +47,10 @@ type AlertmanagerConfig struct {
func NewAlertmanager(cfg AlertmanagerConfig, orgID int64) (*Alertmanager, error) { func NewAlertmanager(cfg AlertmanagerConfig, orgID int64) (*Alertmanager, error) {
client := http.Client{ client := http.Client{
Transport: &roundTripper{ Transport: &mimirClient.MimirAuthRoundTripper{
tenantID: cfg.TenantID, TenantID: cfg.TenantID,
basicAuthPassword: cfg.BasicAuthPassword, Password: cfg.BasicAuthPassword,
next: http.DefaultTransport, Next: http.DefaultTransport,
}, },
} }
@ -56,12 +58,26 @@ func NewAlertmanager(cfg AlertmanagerConfig, orgID int64) (*Alertmanager, error)
return nil, fmt.Errorf("empty URL for tenant %s", cfg.TenantID) return nil, fmt.Errorf("empty URL for tenant %s", cfg.TenantID)
} }
logger := log.New("ngalert.remote.alertmanager")
mcCfg := &mimirClient.Config{
Address: cfg.URL,
TenantID: cfg.TenantID,
Password: cfg.BasicAuthPassword,
Logger: logger,
}
mc, err := mimirClient.New(mcCfg)
if err != nil {
return nil, err
}
u, err := url.Parse(cfg.URL) u, err := url.Parse(cfg.URL)
if err != nil { if err != nil {
return nil, err return nil, err
} }
u = u.JoinPath(amclient.DefaultBasePath) u = u.JoinPath("/alertmanager", amclient.DefaultBasePath)
transport := httptransport.NewWithClient(u.Host, u.Path, []string{u.Scheme}, &client) transport := httptransport.NewWithClient(u.Host, u.Path, []string{u.Scheme}, &client)
// Using our client with custom headers and basic auth credentials. // Using our client with custom headers and basic auth credentials.
@ -72,16 +88,17 @@ func NewAlertmanager(cfg AlertmanagerConfig, orgID int64) (*Alertmanager, error)
s.Run() s.Run()
err = s.ApplyConfig(orgID, 0, []sender.ExternalAMcfg{{ err = s.ApplyConfig(orgID, 0, []sender.ExternalAMcfg{{
URL: cfg.URL, URL: cfg.URL + "/alertmanager",
}}) }})
if err != nil { if err != nil {
return nil, err return nil, err
} }
return &Alertmanager{ return &Alertmanager{
log: logger,
mimirClient: mc,
amClient: amclient.New(transport, nil), amClient: amclient.New(transport, nil),
httpClient: &client, httpClient: &client,
log: log.New("ngalert.remote.alertmanager"),
sender: s, sender: s,
orgID: orgID, orgID: orgID,
tenantID: cfg.TenantID, tenantID: cfg.TenantID,
@ -89,16 +106,49 @@ func NewAlertmanager(cfg AlertmanagerConfig, orgID int64) (*Alertmanager, error)
}, nil }, nil
} }
// ApplyConfig is called everytime we've determined we need to apply an existing configuration to the Alertmanager,
// including the first time the Alertmanager is started. In the context of a "remote Alertmanager" it's as good of a heuristic,
// for "a function that gets called when the Alertmanager starts". As a result we do two things:
// 1. Execute a readiness check to make sure the remote Alertmanager we're about to communicate with is up and ready.
// 2. Upload the configuration and state we currently hold.
func (am *Alertmanager) ApplyConfig(ctx context.Context, config *models.AlertConfiguration) error { func (am *Alertmanager) ApplyConfig(ctx context.Context, config *models.AlertConfiguration) error {
if am.ready { if am.ready {
am.log.Debug("Alertmanager previously marked as ready, skipping readiness check")
return nil return nil
} }
return am.checkReadiness(ctx) // First, execute a readiness check to make sure the remote Alertmanager is ready.
am.log.Debug("Start readiness check for remote Alertmanager", "url", am.url)
if err := am.checkReadiness(ctx); err != nil {
am.log.Error("unable to pass the readiness check", "err", err)
return err
}
am.log.Debug("Completed readiness check for remote Alertmanager", "url", am.url)
am.log.Debug("Start configuration upload to remote Alertmanager", "url", am.url)
if ok := am.compareRemoteConfig(ctx, config); !ok {
err := am.mimirClient.CreateGrafanaAlertmanagerConfig(ctx, config.AlertmanagerConfiguration, config.ConfigurationHash, config.ID, config.CreatedAt, config.Default)
if err != nil {
am.log.Error("Unable to upload the configuration to the remote Alertmanager", "err", err)
} else {
am.log.Debug("Completed configuration upload to remote Alertmanager", "url", am.url)
}
}
am.log.Debug("Start state upload to remote Alertmanager", "url", am.url)
if ok := am.compareRemoteState(ctx, ""); !ok {
if err := am.mimirClient.CreateGrafanaAlertmanagerState(ctx, ""); err != nil {
am.log.Error("Unable to upload the state to the remote Alertmanager", "err", err)
}
}
am.log.Debug("Completed state upload to remote Alertmanager", "url", am.url)
// upload the state
return nil
} }
func (am *Alertmanager) checkReadiness(ctx context.Context) error { func (am *Alertmanager) checkReadiness(ctx context.Context) error {
readyURL := strings.TrimSuffix(am.url, "/") + readyPath readyURL := strings.TrimSuffix(am.url, "/") + "/alertmanager" + readyPath
req, err := http.NewRequestWithContext(ctx, http.MethodGet, readyURL, nil) req, err := http.NewRequestWithContext(ctx, http.MethodGet, readyURL, nil)
if err != nil { if err != nil {
return fmt.Errorf("error creating readiness request: %w", err) return fmt.Errorf("error creating readiness request: %w", err)
@ -286,64 +336,41 @@ func (am *Alertmanager) TestTemplate(ctx context.Context, c apimodels.TestTempla
return &notifier.TestTemplatesResults{}, nil return &notifier.TestTemplatesResults{}, nil
} }
// StopAndWait is called when the grafana server is instructed to shut down or an org is deleted.
// In the context of a "remote Alertmanager" it is a good heuristic for Grafana is about to shut down or we no longer need you.
func (am *Alertmanager) StopAndWait() { func (am *Alertmanager) StopAndWait() {
am.sender.Stop() am.sender.Stop()
// Upload the configuration and state
} }
func (am *Alertmanager) Ready() bool { func (am *Alertmanager) Ready() bool {
return am.ready return am.ready
} }
// We don't have files on disk, no-op. // CleanUp does not have an equivalent in a "remote Alertmanager" context, we don't have files on disk, no-op.
func (am *Alertmanager) CleanUp() {} func (am *Alertmanager) CleanUp() {}
type roundTripper struct { // compareRemoteConfig gets the remote Alertmanager config and compares it to the existing configuration.
tenantID string func (am *Alertmanager) compareRemoteConfig(ctx context.Context, config *models.AlertConfiguration) bool {
basicAuthPassword string rc, err := am.mimirClient.GetGrafanaAlertmanagerConfig(ctx)
next http.RoundTripper
}
// RoundTrip implements the http.RoundTripper interface
// while adding the `X-Scope-OrgID` header and basic auth credentials.
func (r *roundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
req.Header.Set("X-Scope-OrgID", r.tenantID)
if r.tenantID != "" && r.basicAuthPassword != "" {
req.SetBasicAuth(r.tenantID, r.basicAuthPassword)
}
return r.next.RoundTrip(req)
}
// TODO: change implementation, this is only useful for testing other methods.
func (am *Alertmanager) postConfig(ctx context.Context, rawConfig string) error {
alertsURL := strings.TrimSuffix(am.url, "/alertmanager") + "/api/v1/alerts"
req, err := http.NewRequestWithContext(ctx, http.MethodPost, alertsURL, strings.NewReader(rawConfig))
if err != nil { if err != nil {
return fmt.Errorf("error creating request: %v", err) // If we get an error trying to compare log it and return false so that we try to upload it anyway.
am.log.Error("Unable to get the remote Alertmanager Configuration for comparison", "err", err)
return false
} }
res, err := am.httpClient.Do(req) return md5.Sum([]byte(rc.GrafanaAlertmanagerConfig)) == md5.Sum([]byte(config.AlertmanagerConfiguration))
}
// compareRemoteState gets the remote Alertmanager state and compares it to the existing state.
func (am *Alertmanager) compareRemoteState(ctx context.Context, state string) bool {
rs, err := am.mimirClient.GetGrafanaAlertmanagerState(ctx)
if err != nil { if err != nil {
return err // If we get an error trying to compare log it and return false so that we try to upload it anyway.
am.log.Error("Unable to get the remote Alertmanager state for comparison", "err", err)
return false
} }
if res.StatusCode == http.StatusNotFound { return rs.State == state
return fmt.Errorf("config not found")
}
defer func() {
if err := res.Body.Close(); err != nil {
am.log.Warn("Error while closing body", "err", err)
}
}()
_, err = io.ReadAll(res.Body)
if err != nil {
return fmt.Errorf("error reading request response: %w", err)
}
if res.StatusCode != http.StatusCreated {
return fmt.Errorf("setting config failed with status code %d", res.StatusCode)
}
return nil
} }

View File

@ -2,20 +2,25 @@ package remote
import ( import (
"context" "context"
"crypto/md5"
"fmt"
"math/rand" "math/rand"
"net/http"
"net/http/httptest"
"os" "os"
"testing" "testing"
"time" "time"
"github.com/go-openapi/strfmt" "github.com/go-openapi/strfmt"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/util" "github.com/grafana/grafana/pkg/util"
amv2 "github.com/prometheus/alertmanager/api/v2/models" amv2 "github.com/prometheus/alertmanager/api/v2/models"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
// Valid config for Cloud AM, no `grafana_managed_receievers` field. // Valid Grafana Alertmanager configuration.
const upstreamConfig = `{"template_files": {}, "alertmanager_config": "{\"global\": {\"smtp_from\": \"test@test.com\"}, \"route\": {\"receiver\": \"discord\"}, \"receivers\": [{\"name\": \"discord\", \"discord_configs\": [{\"webhook_url\": \"http://localhost:1234\"}]}]}"}` const testGrafanaConfig = `{"template_files":{},"alertmanager_config":{"route":{"receiver":"grafana-default-email","group_by":["grafana_folder","alertname"]},"templates":null,"receivers":[{"name":"grafana-default-email","grafana_managed_receiver_configs":[{"uid":"","name":"some other name","type":"email","disableResolveMessage":false,"settings":{"addresses":"\u003cexample@email.com\u003e"},"secureSettings":null}]}]}}`
func TestNewAlertmanager(t *testing.T) { func TestNewAlertmanager(t *testing.T) {
tests := []struct { tests := []struct {
@ -66,6 +71,124 @@ func TestNewAlertmanager(t *testing.T) {
} }
} }
func TestApplyConfig(t *testing.T) {
errorHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
})
okHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
})
// ApplyConfig performs a readiness check at startup.
// A non-200 response should result in an error.
server := httptest.NewServer(errorHandler)
cfg := AlertmanagerConfig{
URL: server.URL,
}
am, err := NewAlertmanager(cfg, 1)
require.NoError(t, err)
config := &ngmodels.AlertConfiguration{}
ctx := context.Background()
require.Error(t, am.ApplyConfig(ctx, config))
require.False(t, am.Ready())
// A 200 status code response should make the check succeed.
server.Config.Handler = okHandler
require.NoError(t, am.ApplyConfig(ctx, config))
require.True(t, am.Ready())
// If we already got a 200 status code response, we shouldn't make the HTTP request again.
server.Config.Handler = errorHandler
require.NoError(t, am.ApplyConfig(ctx, config))
require.True(t, am.Ready())
}
func TestIntegrationRemoteAlertmanagerApplyConfigOnlyUploadsOnce(t *testing.T) {
if testing.Short() {
t.Skip("skipping integration test")
}
amURL, ok := os.LookupEnv("AM_URL")
if !ok {
t.Skip("No Alertmanager URL provided")
}
tenantID := os.Getenv("AM_TENANT_ID")
password := os.Getenv("AM_PASSWORD")
// ApplyConfig performs a readiness check.
cfg := AlertmanagerConfig{
URL: amURL,
TenantID: tenantID,
BasicAuthPassword: password,
}
fakeConfigHash := fmt.Sprintf("%x", md5.Sum([]byte(testGrafanaConfig)))
fakeConfigCreatedAt := time.Date(2020, 6, 5, 12, 6, 0, 0, time.UTC).Unix()
fakeConfig := &ngmodels.AlertConfiguration{
ID: 100,
AlertmanagerConfiguration: testGrafanaConfig,
ConfigurationHash: fakeConfigHash,
ConfigurationVersion: "v2",
CreatedAt: fakeConfigCreatedAt,
Default: true,
OrgID: 1,
}
ctx := context.Background()
am, err := NewAlertmanager(cfg, 1)
require.NoError(t, err)
// We should have no configuration at first.
{
_, err = am.mimirClient.GetGrafanaAlertmanagerConfig(ctx)
require.Error(t, err)
require.Equal(t, "Error response from the Mimir API: alertmanager storage object not found", err.Error())
}
// Using `ApplyConfig` as a heuristic of a function that gets called when the Alertmanager starts
// We call it as if the Alertmanager were starting.
{
require.NoError(t, am.ApplyConfig(ctx, fakeConfig))
// First, we need to verify that the readiness check passes.
require.True(t, am.Ready())
// Next, we need to verify that Mimir received the configuration.
config, err := am.mimirClient.GetGrafanaAlertmanagerConfig(ctx)
require.NoError(t, err)
require.Equal(t, int64(100), config.ID)
require.Equal(t, testGrafanaConfig, config.GrafanaAlertmanagerConfig)
require.Equal(t, fakeConfigHash, config.Hash)
require.Equal(t, fakeConfigCreatedAt, config.CreatedAt)
require.Equal(t, true, config.Default)
// TODO: Check that the state was uploaded.
}
// Calling `ApplyConfig` again with a changed configuration yields no effect.
{
fakeConfig.ID = 30000000000000000
require.NoError(t, am.ApplyConfig(ctx, fakeConfig))
// The remote Alertmanager continues to be ready.
require.True(t, am.Ready())
// Next, we need to verify that the config that was uploaded remains the same.
config, err := am.mimirClient.GetGrafanaAlertmanagerConfig(ctx)
require.NoError(t, err)
require.Equal(t, int64(100), config.ID)
require.Equal(t, testGrafanaConfig, config.GrafanaAlertmanagerConfig)
require.Equal(t, fakeConfigHash, config.Hash)
require.Equal(t, fakeConfigCreatedAt, config.CreatedAt)
require.Equal(t, true, config.Default)
}
// TODO: Now, shutdown the Alertmanager and we expect the latest configuration to be uploaded.
{
}
}
func TestIntegrationRemoteAlertmanagerSilences(t *testing.T) { func TestIntegrationRemoteAlertmanagerSilences(t *testing.T) {
if testing.Short() { if testing.Short() {
t.Skip("skipping integration test") t.Skip("skipping integration test")
@ -79,7 +202,7 @@ func TestIntegrationRemoteAlertmanagerSilences(t *testing.T) {
password := os.Getenv("AM_PASSWORD") password := os.Getenv("AM_PASSWORD")
cfg := AlertmanagerConfig{ cfg := AlertmanagerConfig{
URL: amURL + "/alertmanager", URL: amURL,
TenantID: tenantID, TenantID: tenantID,
BasicAuthPassword: password, BasicAuthPassword: password,
} }
@ -158,7 +281,7 @@ func TestIntegrationRemoteAlertmanagerAlerts(t *testing.T) {
password := os.Getenv("AM_PASSWORD") password := os.Getenv("AM_PASSWORD")
cfg := AlertmanagerConfig{ cfg := AlertmanagerConfig{
URL: amURL + "/alertmanager", URL: amURL,
TenantID: tenantID, TenantID: tenantID,
BasicAuthPassword: password, BasicAuthPassword: password,
} }
@ -219,7 +342,7 @@ func TestIntegrationRemoteAlertmanagerReceivers(t *testing.T) {
password := os.Getenv("AM_PASSWORD") password := os.Getenv("AM_PASSWORD")
cfg := AlertmanagerConfig{ cfg := AlertmanagerConfig{
URL: amURL + "/alertmanager", URL: amURL,
TenantID: tenantID, TenantID: tenantID,
BasicAuthPassword: password, BasicAuthPassword: password,
} }
@ -231,14 +354,6 @@ func TestIntegrationRemoteAlertmanagerReceivers(t *testing.T) {
rcvs, err := am.GetReceivers(context.Background()) rcvs, err := am.GetReceivers(context.Background())
require.NoError(t, err) require.NoError(t, err)
require.Equal(t, "empty-receiver", *rcvs[0].Name) require.Equal(t, "empty-receiver", *rcvs[0].Name)
// After changing the configuration, we should have a new `discord` receiver.
require.NoError(t, am.postConfig(context.Background(), upstreamConfig))
require.Eventually(t, func() bool {
rcvs, err = am.GetReceivers(context.Background())
require.NoError(t, err)
return *rcvs[0].Name == "discord"
}, 16*time.Second, 1*time.Second)
} }
func genSilence(createdBy string) apimodels.PostableSilence { func genSilence(createdBy string) apimodels.PostableSilence {

View File

@ -0,0 +1,59 @@
package client
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
)
const (
grafanaAlertmanagerConfigPath = "/api/v1/grafana/config"
)
type UserGrafanaConfig struct {
ID int64 `json:"id"`
GrafanaAlertmanagerConfig string `json:"configuration"`
Hash string `json:"configuration_hash"`
CreatedAt int64 `json:"created"`
Default bool `json:"default"`
}
func (mc *Mimir) GetGrafanaAlertmanagerConfig(ctx context.Context) (*UserGrafanaConfig, error) {
gc := &UserGrafanaConfig{}
response := successResponse{
Data: gc,
}
// nolint:bodyclose
// closed within `do`
_, err := mc.do(ctx, grafanaAlertmanagerConfigPath, http.MethodGet, nil, &response)
if err != nil {
return nil, err
}
if response.Status != "success" {
return nil, fmt.Errorf("returned non-success `status` from the MimirAPI: %s", response.Status)
}
return gc, nil
}
func (mc *Mimir) CreateGrafanaAlertmanagerConfig(ctx context.Context, c, hash string, id, created int64, d bool) error {
payload, err := json.Marshal(&UserGrafanaConfig{
ID: id,
GrafanaAlertmanagerConfig: c,
Hash: hash,
CreatedAt: created,
Default: d,
})
if err != nil {
return err
}
return mc.doOK(ctx, grafanaAlertmanagerConfigPath, http.MethodPost, bytes.NewBuffer(payload))
}
func (mc *Mimir) DeleteGrafanaAlertmanagerConfig(ctx context.Context) error {
return mc.doOK(ctx, grafanaAlertmanagerConfigPath, http.MethodDelete, nil)
}

View File

@ -0,0 +1,51 @@
package client
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
)
const (
grafanaAlertmanagerStatePath = "/grafana/state"
)
type UserGrafanaState struct {
State string `json:"state"`
}
func (mc *Mimir) GetGrafanaAlertmanagerState(ctx context.Context) (*UserGrafanaState, error) {
gs := &UserGrafanaState{}
response := successResponse{
Data: gs,
}
// nolint:bodyclose
// closed within `do`
_, err := mc.do(ctx, grafanaAlertmanagerStatePath, http.MethodGet, nil, &response)
if err != nil {
return nil, err
}
if response.Status != "success" {
return nil, fmt.Errorf("returned non-success `status` from the MimirAPI: %s", response.Status)
}
return gs, nil
}
func (mc *Mimir) CreateGrafanaAlertmanagerState(ctx context.Context, state string) error {
payload, err := json.Marshal(&UserGrafanaState{
State: state,
})
if err != nil {
return err
}
return mc.doOK(ctx, grafanaAlertmanagerStatePath, http.MethodPost, bytes.NewBuffer(payload))
}
func (mc *Mimir) DeleteGrafanaAlertmanagerState(ctx context.Context) error {
return mc.doOK(ctx, grafanaAlertmanagerStatePath, http.MethodDelete, nil)
}

View File

@ -0,0 +1,179 @@
package client
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"path"
"strings"
"github.com/grafana/grafana/pkg/infra/log"
)
// MimirClient contains all the methods to query the migration critical endpoints of Mimir instance, it's an interface to allow multiple implementations.
type MimirClient interface {
GetGrafanaAlertmanagerState(ctx context.Context) (*UserGrafanaState, error)
CreateGrafanaAlertmanagerState(ctx context.Context, s string) error
DeleteGrafanaAlertmanagerState(ctx context.Context) error
GetGrafanaAlertmanagerConfig(ctx context.Context) (*UserGrafanaConfig, error)
CreateGrafanaAlertmanagerConfig(ctx context.Context, configuration string, hash string, id int64, at int64, d bool) error
DeleteGrafanaAlertmanagerConfig(ctx context.Context) error
}
type Mimir struct {
endpoint *url.URL
client http.Client
logger log.Logger
}
type Config struct {
Address string
TenantID string
Password string
Logger log.Logger
}
// successResponse represents a successful response from the Mimir API.
type successResponse struct {
Status string `json:"status"`
Data any `json:"data"`
}
// errorResponse represents an error from the Mimir API.
type errorResponse struct {
Status string `json:"status"`
Error1 string `json:"error"`
Error2 string `json:"Error"`
}
func (e *errorResponse) Error() string {
if e.Error1 != "" {
return e.Error1
}
return e.Error2
}
func New(cfg *Config) (*Mimir, error) {
endpoint, err := url.Parse(cfg.Address)
if err != nil {
return nil, err
}
rt := &MimirAuthRoundTripper{
TenantID: cfg.TenantID,
Password: cfg.Password,
Next: http.DefaultTransport,
}
c := http.Client{
Transport: rt,
}
return &Mimir{
endpoint: endpoint,
client: c,
logger: cfg.Logger,
}, nil
}
// do execute an HTTP requests against the specified path and method using the specified payload.
// It returns the HTTP response.
func (mc *Mimir) do(ctx context.Context, p, method string, payload io.Reader, out any) (*http.Response, error) {
pathURL, err := url.Parse(p)
if err != nil {
return nil, err
}
endpoint := *mc.endpoint
endpoint.Path = path.Join(endpoint.Path, pathURL.Path)
r, err := http.NewRequestWithContext(ctx, method, endpoint.String(), payload)
if err != nil {
return nil, err
}
r.Header.Set("Accept", "application/json")
r.Header.Set("Content-Type", "application/json")
resp, err := mc.client.Do(r)
if err != nil {
msg := "Unable to fulfill request to the Mimir API"
mc.logger.Error(msg, "err", err, "url", r.URL.String(), "method", r.Method)
return nil, fmt.Errorf("%s: %w", msg, err)
}
defer func() {
if err := resp.Body.Close(); err != nil {
mc.logger.Error("Error closing HTTP body", "err", err, "url", r.URL.String(), "method", r.Method)
}
}()
ct := resp.Header.Get("Content-Type")
if !strings.HasPrefix(ct, "application/json") {
msg := "Response content-type is not application/json"
mc.logger.Error(msg, "content-type", "url", r.URL.String(), "method", r.Method, ct, "status", resp.StatusCode)
return nil, fmt.Errorf("%s: %s", msg, ct)
}
if out == nil {
return resp, nil
}
body, err := io.ReadAll(resp.Body)
if err != nil {
msg := "Failed to read the request body"
mc.logger.Error(msg, "err", err, "url", r.URL.String(), "method", r.Method, "status", resp.StatusCode)
return nil, fmt.Errorf("%s: %w", msg, err)
}
if resp.StatusCode/100 != 2 {
errResponse := &errorResponse{}
err = json.Unmarshal(body, errResponse)
if err == nil && errResponse.Error() != "" {
msg := "Error response from the Mimir API"
mc.logger.Error(msg, "err", errResponse, "url", r.URL.String(), "method", r.Method, "status", resp.StatusCode)
return nil, fmt.Errorf("%s: %w", msg, errResponse)
}
msg := "Failed to decode non-2xx JSON response"
mc.logger.Error(msg, "err", err, "url", r.URL.String(), "method", r.Method, "status", resp.StatusCode)
return nil, fmt.Errorf("%s: %w", msg, err)
}
if err = json.Unmarshal(body, out); err != nil {
msg := "Failed to decode 2xx JSON response"
mc.logger.Error(msg, "err", err, "url", r.URL.String(), "method", r.Method, "status", resp.StatusCode)
return nil, fmt.Errorf("%s: %w", msg, err)
}
return resp, nil
}
func (mc *Mimir) doOK(ctx context.Context, p, method string, payload io.Reader) error {
var sr successResponse
resp, err := mc.do(ctx, p, method, payload, &sr)
if err != nil {
return err
}
defer func() {
if err := resp.Body.Close(); err != nil {
mc.logger.Error("Error closing HTTP body", "err", err)
}
}()
switch sr.Status {
case "success":
return nil
case "error":
return errors.New("received an 2xx status code but the request body reflected an error")
default:
return fmt.Errorf("received an unknown status from the request body: %s", sr.Status)
}
}

View File

@ -0,0 +1,28 @@
package client
import (
"net/http"
)
const mimirTenantHeader = "X-Scope-OrgID"
type MimirAuthRoundTripper struct {
TenantID string
Password string
Next http.RoundTripper
}
// RoundTrip implements the http.RoundTripper interface
// It adds an `X-Scope-OrgID` header with the TenantID if only provided with a tenantID or sets HTTP Basic Authentication if both
// a tenantID and a password are provided.
func (r *MimirAuthRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
if r.TenantID != "" && r.Password == "" {
req.Header.Set(mimirTenantHeader, r.TenantID)
}
if r.TenantID != "" && r.Password != "" {
req.SetBasicAuth(r.TenantID, r.Password)
}
return r.Next.RoundTrip(req)
}

View File

@ -971,7 +971,6 @@ def remote_alertmanager_integration_tests_steps():
environment = { environment = {
"AM_TENANT_ID": "test", "AM_TENANT_ID": "test",
"AM_PASSWORD": "test",
"AM_URL": "http://mimir_backend:8080", "AM_URL": "http://mimir_backend:8080",
} }

View File

@ -20,7 +20,7 @@ images = {
"plugins_slack": "plugins/slack", "plugins_slack": "plugins/slack",
"python": "python:3.8", "python": "python:3.8",
"postgres_alpine": "postgres:12.3-alpine", "postgres_alpine": "postgres:12.3-alpine",
"mimir": "grafana/mimir:latest", "mimir": "us.gcr.io/kubernetes-dev/mimir:gotjosh-state-config-grafana-663a0ae78",
"mysql5": "mysql:5.7.39", "mysql5": "mysql:5.7.39",
"mysql8": "mysql:8.0.32", "mysql8": "mysql:8.0.32",
"redis_alpine": "redis:6.2.11-alpine", "redis_alpine": "redis:6.2.11-alpine",