mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Expose Prometheus metrics for persisting state history (#63157)
* Create historian metrics and dependency inject * Record counter for total number of state transitions logged * Track write failures * Track current number of active write goroutines * Record histogram of how long it takes to write history data * Don't copy the registerer * Adjust naming of write failures metric * Introduce WritesTotal to complement WritesFailedTotal * Measure TransitionsFailedTotal to complement TransitionsTotal * Rename all to state_history * Remove redundant Total suffix * Increment totals all the time, not just on success * Drop ActiveWriteGoroutines * Drop PersistDuration in favor of WriteDuration * Drop unused gauge * Make writes and writesFailed per org * Add metric indicating backend and a spot for future metadata * Drop _batch_ from names and update help * Add metric for bytes written * Better pairing of total + failure metric updates * Few tweaks to wording and naming * Record info metric during composition * Create fakeRequester and simple happy path test using it * Blocking test for the full historian and test for happy path metrics * Add tests for failure case metrics * Smoke test for full annotation persistence * Create test for metrics on annotation persistence, both happy and failing paths * Address linter complaints * More linter complaints * Remove unnecessary whitespace * Consistency improvements to help texts * Update tests to match new descs
This commit is contained in:
@@ -7,17 +7,59 @@ import (
|
||||
)
|
||||
|
||||
type Historian struct {
|
||||
Info *prometheus.GaugeVec
|
||||
TransitionsTotal *prometheus.CounterVec
|
||||
TransitionsFailed *prometheus.CounterVec
|
||||
WritesTotal *prometheus.CounterVec
|
||||
WritesFailed *prometheus.CounterVec
|
||||
WriteDuration *instrument.HistogramCollector
|
||||
BytesWritten prometheus.Counter
|
||||
}
|
||||
|
||||
func NewHistorianMetrics(r prometheus.Registerer) *Historian {
|
||||
return &Historian{
|
||||
Info: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "state_history_info",
|
||||
Help: "Information about the state history store.",
|
||||
}, []string{"backend"}),
|
||||
TransitionsTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "state_history_transitions_total",
|
||||
Help: "The total number of state transitions processed.",
|
||||
}, []string{"org"}),
|
||||
TransitionsFailed: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "state_history_transitions_failed_total",
|
||||
Help: "The total number of state transitions that failed to be written - they are not retried.",
|
||||
}, []string{"org"}),
|
||||
WritesTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "state_history_writes_total",
|
||||
Help: "The total number of state history batches that were attempted to be written.",
|
||||
}, []string{"org"}),
|
||||
WritesFailed: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "state_history_writes_failed_total",
|
||||
Help: "The total number of failed writes of state history batches.",
|
||||
}, []string{"org"}),
|
||||
WriteDuration: instrument.NewHistogramCollector(promauto.With(r).NewHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "state_history_request_duration_seconds",
|
||||
Help: "Histogram of request durations to the state history store.",
|
||||
Help: "Histogram of request durations to the state history store. Only valid when using external stores.",
|
||||
Buckets: instrument.DefBuckets,
|
||||
}, instrument.HistogramCollectorBuckets)),
|
||||
BytesWritten: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "state_history_writes_bytes_total",
|
||||
Help: "The total number of bytes sent within a batch to the state history store. Only valid when using the Loki store.",
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -391,11 +391,13 @@ type Historian interface {
|
||||
|
||||
func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingStateHistorySettings, ar annotations.Repository, ds dashboards.DashboardService, rs historian.RuleStore, met *metrics.Historian) (Historian, error) {
|
||||
if !cfg.Enabled {
|
||||
met.Info.WithLabelValues("noop").Set(0)
|
||||
return historian.NewNopHistorian(), nil
|
||||
}
|
||||
|
||||
met.Info.WithLabelValues(cfg.Backend).Set(1)
|
||||
if cfg.Backend == "annotations" {
|
||||
return historian.NewAnnotationBackend(ar, ds, rs), nil
|
||||
return historian.NewAnnotationBackend(ar, ds, rs, met), nil
|
||||
}
|
||||
if cfg.Backend == "loki" {
|
||||
lcfg, err := historian.NewLokiConfig(cfg)
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/benbjohnson/clock"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data"
|
||||
|
||||
"github.com/grafana/grafana/pkg/components/simplejson"
|
||||
@@ -16,6 +17,7 @@ import (
|
||||
"github.com/grafana/grafana/pkg/services/annotations"
|
||||
"github.com/grafana/grafana/pkg/services/dashboards"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/state"
|
||||
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
|
||||
@@ -23,9 +25,11 @@ import (
|
||||
|
||||
// AnnotationBackend is an implementation of state.Historian that uses Grafana Annotations as the backing datastore.
|
||||
type AnnotationBackend struct {
|
||||
annotations annotations.Repository
|
||||
annotations AnnotationStore
|
||||
dashboards *dashboardResolver
|
||||
rules RuleStore
|
||||
clock clock.Clock
|
||||
metrics *metrics.Historian
|
||||
log log.Logger
|
||||
}
|
||||
|
||||
@@ -33,12 +37,20 @@ type RuleStore interface {
|
||||
GetAlertRuleByUID(ctx context.Context, query *ngmodels.GetAlertRuleByUIDQuery) error
|
||||
}
|
||||
|
||||
func NewAnnotationBackend(annotations annotations.Repository, dashboards dashboards.DashboardService, rules RuleStore) *AnnotationBackend {
|
||||
type AnnotationStore interface {
|
||||
Find(ctx context.Context, query *annotations.ItemQuery) ([]*annotations.ItemDTO, error)
|
||||
SaveMany(ctx context.Context, items []annotations.Item) error
|
||||
}
|
||||
|
||||
func NewAnnotationBackend(annotations AnnotationStore, dashboards dashboards.DashboardService, rules RuleStore, metrics *metrics.Historian) *AnnotationBackend {
|
||||
logger := log.New("ngalert.state.historian", "backend", "annotations")
|
||||
return &AnnotationBackend{
|
||||
annotations: annotations,
|
||||
dashboards: newDashboardResolver(dashboards, defaultDashboardCacheExpiry),
|
||||
rules: rules,
|
||||
log: log.New("ngalert.state.historian"),
|
||||
clock: clock.New(),
|
||||
metrics: metrics,
|
||||
log: logger,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,10 +60,11 @@ func (h *AnnotationBackend) RecordStatesAsync(ctx context.Context, rule history_
|
||||
// Build annotations before starting goroutine, to make sure all data is copied and won't mutate underneath us.
|
||||
annotations := buildAnnotations(rule, states, logger)
|
||||
panel := parsePanelKey(rule, logger)
|
||||
|
||||
errCh := make(chan error, 1)
|
||||
go func() {
|
||||
defer close(errCh)
|
||||
errCh <- h.recordAnnotationsSync(ctx, panel, annotations, logger)
|
||||
errCh <- h.recordAnnotations(ctx, panel, annotations, rule.OrgID, logger)
|
||||
}()
|
||||
return errCh
|
||||
}
|
||||
@@ -165,7 +178,11 @@ func buildAnnotations(rule history_model.RuleMeta, states []state.StateTransitio
|
||||
return items
|
||||
}
|
||||
|
||||
func (h *AnnotationBackend) recordAnnotationsSync(ctx context.Context, panel *panelKey, annotations []annotations.Item, logger log.Logger) error {
|
||||
func (h *AnnotationBackend) recordAnnotations(ctx context.Context, panel *panelKey, annotations []annotations.Item, orgID int64, logger log.Logger) error {
|
||||
if len(annotations) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
if panel != nil {
|
||||
dashID, err := h.dashboards.getID(ctx, panel.orgID, panel.dashUID)
|
||||
if err != nil {
|
||||
@@ -179,8 +196,13 @@ func (h *AnnotationBackend) recordAnnotationsSync(ctx context.Context, panel *pa
|
||||
}
|
||||
}
|
||||
|
||||
org := fmt.Sprint(orgID)
|
||||
h.metrics.WritesTotal.WithLabelValues(org).Inc()
|
||||
h.metrics.TransitionsTotal.WithLabelValues(org).Add(float64(len(annotations)))
|
||||
if err := h.annotations.SaveMany(ctx, annotations); err != nil {
|
||||
logger.Error("Error saving alert annotation batch", "error", err)
|
||||
h.metrics.WritesFailed.WithLabelValues(org).Inc()
|
||||
h.metrics.TransitionsFailed.WithLabelValues(org).Add(float64(len(annotations)))
|
||||
return fmt.Errorf("error saving alert annotation batch: %w", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -1,31 +1,37 @@
|
||||
package historian
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"math"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/testutil"
|
||||
"github.com/stretchr/testify/mock"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data"
|
||||
"github.com/grafana/grafana/pkg/components/simplejson"
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/services/annotations"
|
||||
"github.com/grafana/grafana/pkg/services/annotations/annotationstest"
|
||||
"github.com/grafana/grafana/pkg/services/dashboards"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/state"
|
||||
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/tests/fakes"
|
||||
)
|
||||
|
||||
func TestAnnotationHistorian_Integration(t *testing.T) {
|
||||
func TestAnnotationHistorian(t *testing.T) {
|
||||
t.Run("alert annotations are queryable", func(t *testing.T) {
|
||||
anns := createTestAnnotationBackendSut(t)
|
||||
items := []annotations.Item{createAnnotation()}
|
||||
require.NoError(t, anns.recordAnnotationsSync(context.Background(), nil, items, log.NewNopLogger()))
|
||||
require.NoError(t, anns.recordAnnotations(context.Background(), nil, items, 1, log.NewNopLogger()))
|
||||
|
||||
q := models.HistoryQuery{
|
||||
RuleUID: "my-rule",
|
||||
@@ -40,16 +46,85 @@ func TestAnnotationHistorian_Integration(t *testing.T) {
|
||||
require.Equal(t, frame.Fields[i].Len(), 1)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("writing state transitions as annotations succeeds", func(t *testing.T) {
|
||||
anns := createTestAnnotationBackendSut(t)
|
||||
rule := createTestRule()
|
||||
states := singleFromNormal(&state.State{
|
||||
State: eval.Alerting,
|
||||
Labels: data.Labels{"a": "b"},
|
||||
})
|
||||
|
||||
err := <-anns.RecordStatesAsync(context.Background(), rule, states)
|
||||
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("emits expected write metrics", func(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
met := metrics.NewHistorianMetrics(reg)
|
||||
anns := createTestAnnotationBackendSutWithMetrics(t, met)
|
||||
errAnns := createFailingAnnotationSut(t, met)
|
||||
rule := createTestRule()
|
||||
states := singleFromNormal(&state.State{
|
||||
State: eval.Alerting,
|
||||
Labels: data.Labels{"a": "b"},
|
||||
})
|
||||
|
||||
<-anns.RecordStatesAsync(context.Background(), rule, states)
|
||||
<-errAnns.RecordStatesAsync(context.Background(), rule, states)
|
||||
|
||||
exp := bytes.NewBufferString(`
|
||||
# HELP grafana_alerting_state_history_transitions_failed_total The total number of state transitions that failed to be written - they are not retried.
|
||||
# TYPE grafana_alerting_state_history_transitions_failed_total counter
|
||||
grafana_alerting_state_history_transitions_failed_total{org="1"} 1
|
||||
# HELP grafana_alerting_state_history_transitions_total The total number of state transitions processed.
|
||||
# TYPE grafana_alerting_state_history_transitions_total counter
|
||||
grafana_alerting_state_history_transitions_total{org="1"} 2
|
||||
# HELP grafana_alerting_state_history_writes_failed_total The total number of failed writes of state history batches.
|
||||
# TYPE grafana_alerting_state_history_writes_failed_total counter
|
||||
grafana_alerting_state_history_writes_failed_total{org="1"} 1
|
||||
# HELP grafana_alerting_state_history_writes_total The total number of state history batches that were attempted to be written.
|
||||
# TYPE grafana_alerting_state_history_writes_total counter
|
||||
grafana_alerting_state_history_writes_total{org="1"} 2
|
||||
`)
|
||||
err := testutil.GatherAndCompare(reg, exp,
|
||||
"grafana_alerting_state_history_transitions_total",
|
||||
"grafana_alerting_state_history_transitions_failed_total",
|
||||
"grafana_alerting_state_history_writes_total",
|
||||
"grafana_alerting_state_history_writes_failed_total",
|
||||
)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.NoError(t, err)
|
||||
})
|
||||
}
|
||||
|
||||
func createTestAnnotationBackendSut(t *testing.T) *AnnotationBackend {
|
||||
return createTestAnnotationBackendSutWithMetrics(t, metrics.NewHistorianMetrics(prometheus.NewRegistry()))
|
||||
}
|
||||
|
||||
func createTestAnnotationBackendSutWithMetrics(t *testing.T, met *metrics.Historian) *AnnotationBackend {
|
||||
t.Helper()
|
||||
fakeAnnoRepo := annotationstest.NewFakeAnnotationsRepo()
|
||||
rules := fakes.NewRuleStore(t)
|
||||
rules.Rules[1] = []*models.AlertRule{
|
||||
models.AlertRuleGen(withOrgID(1), withUID("my-rule"))(),
|
||||
}
|
||||
return NewAnnotationBackend(fakeAnnoRepo, &dashboards.FakeDashboardService{}, rules)
|
||||
dbs := &dashboards.FakeDashboardService{}
|
||||
dbs.On("GetDashboard", mock.Anything, mock.Anything).Return(&dashboards.Dashboard{}, nil)
|
||||
return NewAnnotationBackend(fakeAnnoRepo, dbs, rules, met)
|
||||
}
|
||||
|
||||
func createFailingAnnotationSut(t *testing.T, met *metrics.Historian) *AnnotationBackend {
|
||||
fakeAnnoRepo := &failingAnnotationRepo{}
|
||||
rules := fakes.NewRuleStore(t)
|
||||
rules.Rules[1] = []*models.AlertRule{
|
||||
models.AlertRuleGen(withOrgID(1), withUID("my-rule"))(),
|
||||
}
|
||||
dbs := &dashboards.FakeDashboardService{}
|
||||
dbs.On("GetDashboard", mock.Anything, mock.Anything).Return(&dashboards.Dashboard{}, nil)
|
||||
return NewAnnotationBackend(fakeAnnoRepo, dbs, rules, met)
|
||||
}
|
||||
|
||||
func createAnnotation() annotations.Item {
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/benbjohnson/clock"
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data"
|
||||
"github.com/weaveworks/common/http/client"
|
||||
|
||||
@@ -46,6 +47,7 @@ type remoteLokiClient interface {
|
||||
type RemoteLokiBackend struct {
|
||||
client remoteLokiClient
|
||||
externalLabels map[string]string
|
||||
clock clock.Clock
|
||||
metrics *metrics.Historian
|
||||
log log.Logger
|
||||
}
|
||||
@@ -55,6 +57,7 @@ func NewRemoteLokiBackend(cfg LokiConfig, req client.Requester, metrics *metrics
|
||||
return &RemoteLokiBackend{
|
||||
client: newLokiClient(cfg, req, metrics, logger),
|
||||
externalLabels: cfg.ExternalLabels,
|
||||
clock: clock.New(),
|
||||
metrics: metrics,
|
||||
log: logger,
|
||||
}
|
||||
@@ -70,8 +73,19 @@ func (h *RemoteLokiBackend) RecordStatesAsync(ctx context.Context, rule history_
|
||||
errCh := make(chan error, 1)
|
||||
go func() {
|
||||
defer close(errCh)
|
||||
|
||||
org := fmt.Sprint(rule.OrgID)
|
||||
h.metrics.WritesTotal.WithLabelValues(org).Inc()
|
||||
samples := 0
|
||||
for _, s := range streams {
|
||||
samples += len(s.Values)
|
||||
}
|
||||
h.metrics.TransitionsTotal.WithLabelValues(org).Add(float64(samples))
|
||||
|
||||
if err := h.recordStreams(ctx, streams, logger); err != nil {
|
||||
logger.Error("Failed to save alert state history batch", "error", err)
|
||||
h.metrics.WritesFailed.WithLabelValues(org).Inc()
|
||||
h.metrics.TransitionsFailed.WithLabelValues(org).Add(float64(samples))
|
||||
errCh <- fmt.Errorf("failed to save alert state history batch: %w", err)
|
||||
}
|
||||
}()
|
||||
@@ -278,6 +292,7 @@ func (h *RemoteLokiBackend) recordStreams(ctx context.Context, streams []stream,
|
||||
if err := h.client.push(ctx, streams); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
logger.Debug("Done saving alert state history batch")
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -64,6 +64,7 @@ func NewLokiConfig(cfg setting.UnifiedAlertingStateHistorySettings) (LokiConfig,
|
||||
type httpLokiClient struct {
|
||||
client client.Requester
|
||||
cfg LokiConfig
|
||||
metrics *metrics.Historian
|
||||
log log.Logger
|
||||
}
|
||||
|
||||
@@ -94,6 +95,7 @@ func newLokiClient(cfg LokiConfig, req client.Requester, metrics *metrics.Histor
|
||||
return &httpLokiClient{
|
||||
client: tc,
|
||||
cfg: cfg,
|
||||
metrics: metrics,
|
||||
log: logger.New("protocol", "http"),
|
||||
}
|
||||
}
|
||||
@@ -177,6 +179,7 @@ func (c *httpLokiClient) push(ctx context.Context, s []stream) error {
|
||||
c.setAuthAndTenantHeaders(req)
|
||||
req.Header.Add("content-type", "application/json")
|
||||
|
||||
c.metrics.BytesWritten.Add(float64(len(enc)))
|
||||
req = req.WithContext(ctx)
|
||||
resp, err := c.client.Do(req)
|
||||
if resp != nil {
|
||||
|
||||
@@ -3,6 +3,9 @@ package historian
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -11,6 +14,7 @@ import (
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/stretchr/testify/require"
|
||||
"github.com/weaveworks/common/http/client"
|
||||
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
)
|
||||
@@ -75,8 +79,35 @@ func TestLokiConfig(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// This function can be used for local testing, just remove the skip call.
|
||||
func TestLokiHTTPClient(t *testing.T) {
|
||||
t.Run("push formats expected data", func(t *testing.T) {
|
||||
req := NewFakeRequester()
|
||||
client := createTestLokiClient(req)
|
||||
now := time.Now().UTC()
|
||||
data := []stream{
|
||||
{
|
||||
Stream: map[string]string{},
|
||||
Values: []sample{
|
||||
{
|
||||
T: now,
|
||||
V: "some line",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
err := client.push(context.Background(), data)
|
||||
|
||||
require.NoError(t, err)
|
||||
require.Contains(t, "/loki/api/v1/push", req.lastRequest.URL.Path)
|
||||
sent := reqBody(t, req.lastRequest)
|
||||
exp := fmt.Sprintf(`{"streams": [{"stream": {}, "values": [["%d", "some line"]]}]}`, now.UnixNano())
|
||||
require.JSONEq(t, exp, sent)
|
||||
})
|
||||
}
|
||||
|
||||
// This function can be used for local testing, just remove the skip call.
|
||||
func TestLokiHTTPClient_Manual(t *testing.T) {
|
||||
t.Skip()
|
||||
|
||||
t.Run("smoke test pinging Loki", func(t *testing.T) {
|
||||
@@ -222,3 +253,24 @@ func TestStream(t *testing.T) {
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
func createTestLokiClient(req client.Requester) *httpLokiClient {
|
||||
url, _ := url.Parse("http://some.url")
|
||||
cfg := LokiConfig{
|
||||
WritePathURL: url,
|
||||
ReadPathURL: url,
|
||||
}
|
||||
met := metrics.NewHistorianMetrics(prometheus.NewRegistry())
|
||||
return newLokiClient(cfg, req, met, log.NewNopLogger())
|
||||
}
|
||||
|
||||
func reqBody(t *testing.T, req *http.Request) string {
|
||||
t.Helper()
|
||||
|
||||
defer func() {
|
||||
_ = req.Body.Close()
|
||||
}()
|
||||
byt, err := io.ReadAll(req.Body)
|
||||
require.NoError(t, err)
|
||||
return string(byt)
|
||||
}
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
package historian
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"sort"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -10,9 +15,13 @@ import (
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data"
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/state"
|
||||
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
"github.com/weaveworks/common/http/client"
|
||||
)
|
||||
|
||||
func TestRemoteLokiBackend(t *testing.T) {
|
||||
@@ -250,6 +259,69 @@ func TestMerge(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecordStates(t *testing.T) {
|
||||
t.Run("writes state transitions to loki", func(t *testing.T) {
|
||||
req := NewFakeRequester()
|
||||
loki := createTestLokiBackend(req, metrics.NewHistorianMetrics(prometheus.NewRegistry()))
|
||||
rule := createTestRule()
|
||||
states := singleFromNormal(&state.State{
|
||||
State: eval.Alerting,
|
||||
Labels: data.Labels{"a": "b"},
|
||||
})
|
||||
|
||||
err := <-loki.RecordStatesAsync(context.Background(), rule, states)
|
||||
|
||||
require.NoError(t, err)
|
||||
require.Contains(t, "/loki/api/v1/push", req.lastRequest.URL.Path)
|
||||
})
|
||||
|
||||
t.Run("emits expected write metrics", func(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
met := metrics.NewHistorianMetrics(reg)
|
||||
loki := createTestLokiBackend(NewFakeRequester(), met)
|
||||
errLoki := createTestLokiBackend(NewFakeRequester().WithResponse(badResponse()), met) //nolint:bodyclose
|
||||
rule := createTestRule()
|
||||
states := singleFromNormal(&state.State{
|
||||
State: eval.Alerting,
|
||||
Labels: data.Labels{"a": "b"},
|
||||
})
|
||||
|
||||
<-loki.RecordStatesAsync(context.Background(), rule, states)
|
||||
<-errLoki.RecordStatesAsync(context.Background(), rule, states)
|
||||
|
||||
exp := bytes.NewBufferString(`
|
||||
# HELP grafana_alerting_state_history_transitions_failed_total The total number of state transitions that failed to be written - they are not retried.
|
||||
# TYPE grafana_alerting_state_history_transitions_failed_total counter
|
||||
grafana_alerting_state_history_transitions_failed_total{org="1"} 1
|
||||
# HELP grafana_alerting_state_history_transitions_total The total number of state transitions processed.
|
||||
# TYPE grafana_alerting_state_history_transitions_total counter
|
||||
grafana_alerting_state_history_transitions_total{org="1"} 2
|
||||
# HELP grafana_alerting_state_history_writes_failed_total The total number of failed writes of state history batches.
|
||||
# TYPE grafana_alerting_state_history_writes_failed_total counter
|
||||
grafana_alerting_state_history_writes_failed_total{org="1"} 1
|
||||
# HELP grafana_alerting_state_history_writes_total The total number of state history batches that were attempted to be written.
|
||||
# TYPE grafana_alerting_state_history_writes_total counter
|
||||
grafana_alerting_state_history_writes_total{org="1"} 2
|
||||
`)
|
||||
err := testutil.GatherAndCompare(reg, exp,
|
||||
"grafana_alerting_state_history_transitions_total",
|
||||
"grafana_alerting_state_history_transitions_failed_total",
|
||||
"grafana_alerting_state_history_writes_total",
|
||||
"grafana_alerting_state_history_writes_failed_total",
|
||||
)
|
||||
require.NoError(t, err)
|
||||
})
|
||||
}
|
||||
|
||||
func createTestLokiBackend(req client.Requester, met *metrics.Historian) *RemoteLokiBackend {
|
||||
url, _ := url.Parse("http://some.url")
|
||||
cfg := LokiConfig{
|
||||
WritePathURL: url,
|
||||
ReadPathURL: url,
|
||||
}
|
||||
return NewRemoteLokiBackend(cfg, req, met)
|
||||
}
|
||||
|
||||
func singleFromNormal(st *state.State) []state.StateTransition {
|
||||
return []state.StateTransition{
|
||||
{
|
||||
@@ -284,3 +356,13 @@ func requireEntry(t *testing.T, row sample) lokiEntry {
|
||||
require.NoError(t, err)
|
||||
return entry
|
||||
}
|
||||
|
||||
func badResponse() *http.Response {
|
||||
return &http.Response{
|
||||
Status: "400 Bad Request",
|
||||
StatusCode: http.StatusBadRequest,
|
||||
Body: io.NopCloser(bytes.NewBufferString("")),
|
||||
ContentLength: int64(0),
|
||||
Header: make(http.Header, 0),
|
||||
}
|
||||
}
|
||||
|
||||
49
pkg/services/ngalert/state/historian/testing.go
Normal file
49
pkg/services/ngalert/state/historian/testing.go
Normal file
@@ -0,0 +1,49 @@
|
||||
package historian
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
|
||||
"github.com/grafana/grafana/pkg/services/annotations"
|
||||
)
|
||||
|
||||
type fakeRequester struct {
|
||||
lastRequest *http.Request
|
||||
resp *http.Response
|
||||
}
|
||||
|
||||
func NewFakeRequester() *fakeRequester {
|
||||
return &fakeRequester{
|
||||
resp: &http.Response{
|
||||
Status: "200 OK",
|
||||
StatusCode: 200,
|
||||
Body: io.NopCloser(bytes.NewBufferString("")),
|
||||
ContentLength: int64(0),
|
||||
Header: make(http.Header, 0),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (f *fakeRequester) WithResponse(resp *http.Response) *fakeRequester {
|
||||
f.resp = resp
|
||||
return f
|
||||
}
|
||||
|
||||
func (f *fakeRequester) Do(req *http.Request) (*http.Response, error) {
|
||||
f.lastRequest = req
|
||||
f.resp.Request = req // Not concurrency-safe!
|
||||
return f.resp, nil
|
||||
}
|
||||
|
||||
type failingAnnotationRepo struct{}
|
||||
|
||||
func (f *failingAnnotationRepo) SaveMany(_ context.Context, _ []annotations.Item) error {
|
||||
return fmt.Errorf("failed to save annotations")
|
||||
}
|
||||
|
||||
func (f *failingAnnotationRepo) Find(_ context.Context, _ *annotations.ItemQuery) ([]*annotations.ItemDTO, error) {
|
||||
return nil, fmt.Errorf("failed to query annotations")
|
||||
}
|
||||
@@ -8,16 +8,19 @@ import (
|
||||
|
||||
"github.com/grafana/grafana/pkg/services/annotations"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/state"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/state/historian"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/stretchr/testify/mock"
|
||||
)
|
||||
|
||||
func BenchmarkProcessEvalResults(b *testing.B) {
|
||||
as := annotations.FakeAnnotationsRepo{}
|
||||
as.On("SaveMany", mock.Anything, mock.Anything).Return(nil)
|
||||
hist := historian.NewAnnotationBackend(&as, nil, nil)
|
||||
metrics := metrics.NewHistorianMetrics(prometheus.NewRegistry())
|
||||
hist := historian.NewAnnotationBackend(&as, nil, nil, metrics)
|
||||
cfg := state.ManagerCfg{
|
||||
Historian: hist,
|
||||
}
|
||||
|
||||
@@ -220,7 +220,8 @@ func TestDashboardAnnotations(t *testing.T) {
|
||||
_, dbstore := tests.SetupTestEnv(t, 1)
|
||||
|
||||
fakeAnnoRepo := annotationstest.NewFakeAnnotationsRepo()
|
||||
hist := historian.NewAnnotationBackend(fakeAnnoRepo, &dashboards.FakeDashboardService{}, nil)
|
||||
metrics := metrics.NewHistorianMetrics(prometheus.NewRegistry())
|
||||
hist := historian.NewAnnotationBackend(fakeAnnoRepo, &dashboards.FakeDashboardService{}, nil, metrics)
|
||||
cfg := state.ManagerCfg{
|
||||
Metrics: testMetrics.GetStateMetrics(),
|
||||
ExternalURL: nil,
|
||||
@@ -2209,7 +2210,8 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
|
||||
for _, tc := range testCases {
|
||||
fakeAnnoRepo := annotationstest.NewFakeAnnotationsRepo()
|
||||
hist := historian.NewAnnotationBackend(fakeAnnoRepo, &dashboards.FakeDashboardService{}, nil)
|
||||
metrics := metrics.NewHistorianMetrics(prometheus.NewRegistry())
|
||||
hist := historian.NewAnnotationBackend(fakeAnnoRepo, &dashboards.FakeDashboardService{}, nil, metrics)
|
||||
cfg := state.ManagerCfg{
|
||||
Metrics: testMetrics.GetStateMetrics(),
|
||||
ExternalURL: nil,
|
||||
|
||||
Reference in New Issue
Block a user