Alerting: State Manager takes screenshots. (#49338)

The State Manager will now take screenshots when an alert instance
switches to an Alerting or Resolved state.

Signed-off-by: Joe Blubaugh <joe.blubaugh@grafana.com>
This commit is contained in:
Joe Blubaugh
2022-05-23 10:53:41 +08:00
committed by GitHub
parent 687e79538b
commit 1d724810de
8 changed files with 190 additions and 17 deletions

View File

@@ -33,7 +33,7 @@ var (
const ( const (
screenshotTimeout = 10 * time.Second screenshotTimeout = 10 * time.Second
screenshotCacheTTL = 15 * time.Second screenshotCacheTTL = 60 * time.Second
) )
// ScreenshotImageService takes screenshots of the panel for an alert rule and // ScreenshotImageService takes screenshots of the panel for an alert rule and
@@ -111,6 +111,12 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *ngmodels.Alert
return &v, nil return &v, nil
} }
// NotAvailableImageService is an image service whose NewImage never produces
// an image.
type NotAvailableImageService struct{}

// NewImage ignores its arguments and always returns a nil image together with
// screenshot.ErrScreenshotsUnavailable.
func (*NotAvailableImageService) NewImage(_ context.Context, _ *ngmodels.AlertRule) (*store.Image, error) {
	return nil, screenshot.ErrScreenshotsUnavailable
}
type NoopImageService struct{} type NoopImageService struct{}
func (s *NoopImageService) NewImage(ctx context.Context, r *ngmodels.AlertRule) (*store.Image, error) { func (s *NoopImageService) NewImage(ctx context.Context, r *ngmodels.AlertRule) (*store.Image, error) {

View File

@@ -81,6 +81,10 @@ const (
// Annotations are actually a set of labels, so technically this is the label name of an annotation. // Annotations are actually a set of labels, so technically this is the label name of an annotation.
DashboardUIDAnnotation = "__dashboardUid__" DashboardUIDAnnotation = "__dashboardUid__"
PanelIDAnnotation = "__panelId__" PanelIDAnnotation = "__panelId__"
// This isn't a hard-coded secret token, hence the nolint.
//nolint:gosec
ScreenshotTokenAnnotation = "__alertScreenshotToken__"
) )
var ( var (
@@ -89,7 +93,11 @@ var (
RuleUIDLabel: {}, RuleUIDLabel: {},
NamespaceUIDLabel: {}, NamespaceUIDLabel: {},
} }
InternalAnnotationNameSet = map[string]struct{}{} InternalAnnotationNameSet = map[string]struct{}{
DashboardUIDAnnotation: {},
PanelIDAnnotation: {},
ScreenshotTokenAnnotation: {},
}
) )
// AlertRule is the model for alert rules in unified alerting. // AlertRule is the model for alert rules in unified alerting.

View File

@@ -39,6 +39,10 @@ func stateToPostableAlert(alertState *state.State, appURL *url.URL) *models.Post
nA["__value_string__"] = alertState.LastEvaluationString nA["__value_string__"] = alertState.LastEvaluationString
} }
if alertState.Image != nil {
nA[ngModels.ScreenshotTokenAnnotation] = alertState.Image.Token
}
var urlStr string var urlStr string
if uid := nL[ngModels.RuleUIDLabel]; len(uid) > 0 && appURL != nil { if uid := nL[ngModels.RuleUIDLabel]; len(uid) > 0 && appURL != nil {
u := *appURL u := *appURL

View File

@@ -16,6 +16,7 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert/eval" "github.com/grafana/grafana/pkg/services/ngalert/eval"
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models" ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/state" "github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/services/ngalert/store"
"github.com/grafana/grafana/pkg/util" "github.com/grafana/grafana/pkg/util"
) )
@@ -117,6 +118,22 @@ func Test_stateToPostableAlert(t *testing.T) {
result = stateToPostableAlert(alertState, appURL) result = stateToPostableAlert(alertState, appURL)
require.Equal(t, expected, result.Annotations) require.Equal(t, expected, result.Annotations)
}) })
t.Run("add __alertScreenshotToken__ if there is an image token", func(t *testing.T) {
alertState := randomState(tc.state)
alertState.Annotations = randomMapOfStrings()
alertState.Image = &store.Image{Token: "test_token"}
result := stateToPostableAlert(alertState, appURL)
expected := make(models.LabelSet, len(alertState.Annotations)+1)
for k, v := range alertState.Annotations {
expected[k] = v
}
expected["__alertScreenshotToken__"] = alertState.Image.Token
require.Equal(t, expected, result.Annotations)
})
}) })
switch tc.state { switch tc.state {

View File

@@ -170,19 +170,34 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, alertRule *ngModels.A
return states return states
} }
//nolint:unused // Maybe take a screenshot. Do it if:
func (st *Manager) newImage(ctx context.Context, alertRule *ngModels.AlertRule, state *State) error { // 1. The alert state is transitioning into the "Alerting" state from something else.
if state.Image == nil { // 2. The alert state has just transitioned to the resolved state.
image, err := st.imageService.NewImage(ctx, alertRule) // 3. The state is alerting and there is no screenshot annotation on the alert state.
if errors.Is(err, screenshot.ErrScreenshotsUnavailable) { func (st *Manager) maybeTakeScreenshot(
// It's not an error if screenshots are disabled. ctx context.Context,
return nil alertRule *ngModels.AlertRule,
} else if err != nil { state *State,
st.log.Error("failed to create image", "error", err) oldState eval.State,
return err ) error {
} shouldScreenshot := state.Resolved ||
state.Image = image state.State == eval.Alerting && oldState != eval.Alerting ||
state.State == eval.Alerting && state.Image == nil
if !shouldScreenshot {
return nil
} }
img, err := st.imageService.NewImage(ctx, alertRule)
if err != nil &&
errors.Is(err, screenshot.ErrScreenshotsUnavailable) ||
errors.Is(err, image.ErrNoDashboard) ||
errors.Is(err, image.ErrNoPanel) {
// It's not an error if screenshots are disabled, or our rule isn't allowed to generate screenshots.
return nil
} else if err != nil {
return err
}
state.Image = img
return nil return nil
} }
@@ -219,6 +234,14 @@ func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRu
// to Alertmanager. // to Alertmanager.
currentState.Resolved = oldState == eval.Alerting && currentState.State == eval.Normal currentState.Resolved = oldState == eval.Alerting && currentState.State == eval.Normal
err := st.maybeTakeScreenshot(ctx, alertRule, currentState, oldState)
if err != nil {
st.log.Warn("Error generating a screenshot for an alert instance.",
"alert_rule", alertRule.UID,
"dashboard", alertRule.DashboardUID,
"panel", alertRule.PanelID)
}
st.set(currentState) st.set(currentState)
if oldState != currentState.State { if oldState != currentState.State {
go st.annotateState(ctx, alertRule, currentState.Labels, result.EvaluatedAt, currentState.State, oldState) go st.annotateState(ctx, alertRule, currentState.Labels, result.EvaluatedAt, currentState.State, oldState)

View File

@@ -0,0 +1,106 @@
package state
import (
"context"
"fmt"
"math/rand"
"testing"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/dashboards"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/store"
"github.com/grafana/grafana/pkg/services/sqlstore/mockstore"
"github.com/stretchr/testify/require"
)
// CountingImageService is a test double that records how many times NewImage
// has been invoked. The counter is a plain int with no synchronization, so it
// is not for parallel tests.
type CountingImageService struct {
	Called int
}

// NewImage bumps the call counter and returns a fresh image carrying a random
// token; it never returns an error.
func (c *CountingImageService) NewImage(_ context.Context, _ *ngmodels.AlertRule) (*store.Image, error) {
	c.Called++
	img := &store.Image{Token: fmt.Sprint(rand.Int())}
	return img, nil
}
// Test_maybeNewImage checks when Manager.maybeTakeScreenshot asks the image
// service for a screenshot, and that the image attached to the state is
// replaced only when a screenshot is actually taken.
func Test_maybeNewImage(t *testing.T) {
	tests := []struct {
		description      string
		shouldScreenshot bool
		state            *State
		oldState         eval.State
	}{
		{
			description:      "Take a screenshot when we change to an alerting state",
			shouldScreenshot: true,
			state: &State{
				State: eval.Alerting,
				Image: &store.Image{
					Token: "erase me",
				},
			},
			oldState: eval.Normal,
		},
		{
			description:      "Take a screenshot if we're already alerting with no image",
			shouldScreenshot: true,
			state: &State{
				State: eval.Alerting,
			},
			oldState: eval.Alerting,
		},
		{
			description:      "Take a screenshot if we're resolved.",
			shouldScreenshot: true,
			state: &State{
				Resolved: true,
				State:    eval.Normal,
				Image: &store.Image{
					Token: "abcd",
				},
			},
			oldState: eval.Alerting,
		},
		{
			description:      "Don't take a screenshot if we already have one.",
			shouldScreenshot: false,
			state: &State{
				State: eval.Alerting,
				Image: &store.Image{
					Token: "already set",
				},
			},
			oldState: eval.Alerting,
		},
		{
			description:      "Don't take a screenshot if we're pending.",
			shouldScreenshot: false,
			state: &State{
				State: eval.Pending,
			},
			oldState: eval.Normal,
		},
	}

	for _, test := range tests {
		t.Run(test.description, func(t *testing.T) {
			imageService := &CountingImageService{}
			mgr := NewManager(log.NewNopLogger(), &metrics.State{}, nil,
				&store.FakeRuleStore{}, &store.FakeInstanceStore{}, mockstore.NewSQLStoreMock(),
				&dashboards.FakeDashboardService{}, imageService)

			// Remember what was attached before the call: several cases start
			// with a pre-populated image, so a bare NotNil check afterwards
			// would pass even if the stale image were never replaced.
			imageBefore := test.state.Image
			var tokenBefore string
			if imageBefore != nil {
				tokenBefore = imageBefore.Token
			}

			err := mgr.maybeTakeScreenshot(context.Background(), &ngmodels.AlertRule{}, test.state, test.oldState)
			require.NoError(t, err)

			if !test.shouldScreenshot {
				require.Equal(t, 0, imageService.Called)
				// No screenshot means the state's image must be left as it was.
				require.Equal(t, imageBefore, test.state.Image)
				return
			}

			require.Equal(t, 1, imageService.Called)
			require.NotNil(t, test.state.Image)
			// CountingImageService hands out numeric tokens, so a changed
			// token proves the new image replaced whatever was there before.
			require.NotEqual(t, tokenBefore, test.state.Image.Token)
		})
	}
}

View File

@@ -1770,7 +1770,7 @@ func TestProcessEvalResults(t *testing.T) {
for _, tc := range testCases { for _, tc := range testCases {
ss := mockstore.NewSQLStoreMock() ss := mockstore.NewSQLStoreMock()
st := state.NewManager(log.New("test_state_manager"), testMetrics.GetStateMetrics(), nil, nil, &store.FakeInstanceStore{}, ss, &dashboards.FakeDashboardService{}, &image.NoopImageService{}) st := state.NewManager(log.New("test_state_manager"), testMetrics.GetStateMetrics(), nil, nil, &store.FakeInstanceStore{}, ss, &dashboards.FakeDashboardService{}, &image.NotAvailableImageService{})
t.Run(tc.desc, func(t *testing.T) { t.Run(tc.desc, func(t *testing.T) {
fakeAnnoRepo := store.NewFakeAnnotationsRepo() fakeAnnoRepo := store.NewFakeAnnotationsRepo()
annotations.SetRepository(fakeAnnoRepo) annotations.SetRepository(fakeAnnoRepo)

View File

@@ -109,17 +109,26 @@ func TestDeleteExpiredImages(t *testing.T) {
require.NoError(t, err) require.NoError(t, err)
} }
// Images are available
img, err := dbstore.GetImage(ctx, imgs[0].Token)
require.NoError(t, err)
require.NotNil(t, img)
img, err = dbstore.GetImage(ctx, imgs[1].Token)
require.NoError(t, err)
require.NotNil(t, img)
// Wait until timeout. // Wait until timeout.
for i := 0; i < 120; i++ { for i := 0; i < 120; i++ {
store.TimeNow() store.TimeNow()
} }
// Call expired // Call expired
err := dbstore.DeleteExpiredImages(ctx) err = dbstore.DeleteExpiredImages(ctx)
require.NoError(t, err) require.NoError(t, err)
// All images are gone. // All images are gone.
img, err := dbstore.GetImage(ctx, imgs[0].Token) img, err = dbstore.GetImage(ctx, imgs[0].Token)
require.Nil(t, img) require.Nil(t, img)
require.Error(t, err) require.Error(t, err)