Alerting: Fix screenshots were not cached (#58493)

This commit is contained in:
George Robinson 2022-11-09 01:52:16 +00:00 committed by GitHub
parent 238a3f820c
commit c646ff0ce3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 244 additions and 7 deletions

View File

@ -0,0 +1,76 @@
package image
import (
"context"
"time"
gocache "github.com/patrickmn/go-cache"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
const (
namespace = "grafana"
subsystem = "alerting"
)
// CacheService caches images.
//
//go:generate mockgen -destination=cache_mock.go -package=image github.com/grafana/grafana/pkg/services/ngalert/image CacheService
type CacheService interface {
// Get returns the screenshot for the options or false if a screenshot with these
// options does not exist.
Get(ctx context.Context, k string) (models.Image, bool)
// Set the screenshot for the options. If another screenshot exists with these
// options then it will be replaced.
Set(ctx context.Context, k string, image models.Image) error
}
// InmemCacheService is an in-mem screenshot cache.
type InmemCacheService struct {
cache *gocache.Cache
cacheHits prometheus.Counter
cacheMisses prometheus.Counter
}
func NewInmemCacheService(expiration time.Duration, r prometheus.Registerer) CacheService {
return &InmemCacheService{
cache: gocache.New(expiration, time.Minute),
cacheHits: promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "image_cache_hits_total",
Namespace: namespace,
Subsystem: subsystem,
}),
cacheMisses: promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "image_cache_misses_total",
Namespace: namespace,
Subsystem: subsystem,
}),
}
}
func (s *InmemCacheService) Get(_ context.Context, k string) (models.Image, bool) {
if v, ok := s.cache.Get(k); ok {
defer s.cacheHits.Inc()
return v.(models.Image), true
}
defer s.cacheMisses.Inc()
return models.Image{}, false
}
func (s *InmemCacheService) Set(_ context.Context, k string, screenshot models.Image) error {
s.cache.Set(k, screenshot, 0)
return nil
}
type NoOpCacheService struct{}
func (s *NoOpCacheService) Get(_ context.Context, _ string) (models.Image, bool) {
return models.Image{}, false
}
func (s *NoOpCacheService) Set(_ context.Context, _ string, _ models.Image) error {
return nil
}

View File

@ -0,0 +1,65 @@
// Code generated by MockGen. DO NOT EDIT.
// Source: github.com/grafana/grafana/pkg/services/ngalert/image (interfaces: CacheService)
// Package image is a generated GoMock package.
package image
import (
context "context"
reflect "reflect"
gomock "github.com/golang/mock/gomock"
models "github.com/grafana/grafana/pkg/services/ngalert/models"
)
// MockCacheService is a mock of CacheService interface.
type MockCacheService struct {
ctrl *gomock.Controller
recorder *MockCacheServiceMockRecorder
}
// MockCacheServiceMockRecorder is the mock recorder for MockCacheService.
type MockCacheServiceMockRecorder struct {
mock *MockCacheService
}
// NewMockCacheService creates a new mock instance.
func NewMockCacheService(ctrl *gomock.Controller) *MockCacheService {
mock := &MockCacheService{ctrl: ctrl}
mock.recorder = &MockCacheServiceMockRecorder{mock}
return mock
}
// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockCacheService) EXPECT() *MockCacheServiceMockRecorder {
return m.recorder
}
// Get mocks base method.
func (m *MockCacheService) Get(arg0 context.Context, arg1 string) (models.Image, bool) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Get", arg0, arg1)
ret0, _ := ret[0].(models.Image)
ret1, _ := ret[1].(bool)
return ret0, ret1
}
// Get indicates an expected call of Get.
func (mr *MockCacheServiceMockRecorder) Get(arg0, arg1 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Get", reflect.TypeOf((*MockCacheService)(nil).Get), arg0, arg1)
}
// Set mocks base method.
func (m *MockCacheService) Set(arg0 context.Context, arg1 string, arg2 models.Image) error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Set", arg0, arg1, arg2)
ret0, _ := ret[0].(error)
return ret0
}
// Set indicates an expected call of Set.
func (mr *MockCacheServiceMockRecorder) Set(arg0, arg1, arg2 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Set", reflect.TypeOf((*MockCacheService)(nil).Set), arg0, arg1, arg2)
}

View File

@ -0,0 +1,38 @@
package image
import (
"context"
"testing"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
func TestInmemCacheService(t *testing.T) {
s := NewInmemCacheService(time.Second, prometheus.DefaultRegisterer)
ctx := context.Background()
// should be a miss
actual, ok := s.Get(ctx, "test")
assert.False(t, ok)
assert.Equal(t, models.Image{}, actual)
// should be a hit
expected := models.Image{Path: "test.png"}
require.NoError(t, s.Set(ctx, "test", expected))
actual, ok = s.Get(ctx, "test")
assert.True(t, ok)
assert.Equal(t, expected, actual)
// wait 1s and the cached image should have expired
<-time.After(time.Second)
// should be a miss
actual, ok = s.Get(ctx, "test")
assert.False(t, ok)
assert.Equal(t, models.Image{}, actual)
}

View File

@ -21,8 +21,8 @@ import (
) )
const ( const (
screenshotTimeout = 10 * time.Second
screenshotCacheTTL = 60 * time.Second screenshotCacheTTL = 60 * time.Second
screenshotTimeout = 10 * time.Second
) )
var ( var (
@ -59,6 +59,7 @@ type ImageService interface {
// as an annotation or label to the Alertmanager. This service cannot take // as an annotation or label to the Alertmanager. This service cannot take
// screenshots of alert rules that are not associated with a dashboard panel. // screenshots of alert rules that are not associated with a dashboard panel.
type ScreenshotImageService struct { type ScreenshotImageService struct {
cache CacheService
limiter screenshot.RateLimiter limiter screenshot.RateLimiter
logger log.Logger logger log.Logger
screenshots screenshot.ScreenshotService screenshots screenshot.ScreenshotService
@ -69,12 +70,14 @@ type ScreenshotImageService struct {
// NewScreenshotImageService returns a new ScreenshotImageService. // NewScreenshotImageService returns a new ScreenshotImageService.
func NewScreenshotImageService( func NewScreenshotImageService(
cache CacheService,
limiter screenshot.RateLimiter, limiter screenshot.RateLimiter,
logger log.Logger, logger log.Logger,
screenshots screenshot.ScreenshotService, screenshots screenshot.ScreenshotService,
store store.ImageStore, store store.ImageStore,
uploads *UploadingService) ImageService { uploads *UploadingService) ImageService {
return &ScreenshotImageService{ return &ScreenshotImageService{
cache: cache,
limiter: limiter, limiter: limiter,
logger: logger, logger: logger,
screenshots: screenshots, screenshots: screenshots,
@ -88,6 +91,7 @@ func NewScreenshotImageService(
func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds dashboards.DashboardService, func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds dashboards.DashboardService,
rs rendering.Service, r prometheus.Registerer) (ImageService, error) { rs rendering.Service, r prometheus.Registerer) (ImageService, error) {
var ( var (
cache CacheService = &NoOpCacheService{}
limiter screenshot.RateLimiter = &screenshot.NoOpRateLimiter{} limiter screenshot.RateLimiter = &screenshot.NoOpRateLimiter{}
screenshots screenshot.ScreenshotService = &screenshot.ScreenshotUnavailableService{} screenshots screenshot.ScreenshotService = &screenshot.ScreenshotUnavailableService{}
uploads *UploadingService = nil uploads *UploadingService = nil
@ -95,6 +99,7 @@ func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds da
// If screenshots are enabled // If screenshots are enabled
if cfg.UnifiedAlerting.Screenshots.Capture { if cfg.UnifiedAlerting.Screenshots.Capture {
cache = NewInmemCacheService(screenshotCacheTTL, r)
limiter = screenshot.NewTokenRateLimiter(cfg.UnifiedAlerting.Screenshots.MaxConcurrentScreenshots) limiter = screenshot.NewTokenRateLimiter(cfg.UnifiedAlerting.Screenshots.MaxConcurrentScreenshots)
screenshots = screenshot.NewHeadlessScreenshotService(ds, rs, r) screenshots = screenshot.NewHeadlessScreenshotService(ds, rs, r)
@ -108,7 +113,7 @@ func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds da
} }
} }
return NewScreenshotImageService(limiter, cfg.Logger, screenshots, db, uploads), nil return NewScreenshotImageService(cache, limiter, log.New("ngalert.image"), screenshots, db, uploads), nil
} }
// NewImage returns a screenshot of the alert rule or an error. // NewImage returns a screenshot of the alert rule or an error.
@ -119,7 +124,7 @@ func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds da
// alert rule has a Dashboard UID and the dashboard exists, but does not have a // alert rule has a Dashboard UID and the dashboard exists, but does not have a
// Panel ID in its annotations then an ErrNoPanel error is returned. // Panel ID in its annotations then an ErrNoPanel error is returned.
func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRule) (*models.Image, error) { func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRule) (*models.Image, error) {
if r.DashboardUID == nil { if r.DashboardUID == nil || *r.DashboardUID == "" {
return nil, ErrNoDashboard return nil, ErrNoDashboard
} }
@ -127,6 +132,17 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRu
return nil, ErrNoPanel return nil, ErrNoPanel
} }
// If there is an image is in the cache return it instead of taking another screenshot
if image, ok := s.cache.Get(ctx, r.GetKey().String()); ok {
s.logger.Debug("Found cached image", "token", image.Token)
return &image, nil
}
// We create both a context with timeout and set a timeout in ScreenshotOptions. The timeout
// in the context is used for both database queries and the request to the rendering service,
// while the timeout in ScreenshotOptions is passed to the rendering service where it is used as
// a client timeout. It is not recommended to pass a context without a deadline and the context
// deadline should be at least as long as the timeout in ScreenshotOptions.
ctx, cancelFunc := context.WithTimeout(ctx, screenshotTimeout) ctx, cancelFunc := context.WithTimeout(ctx, screenshotTimeout)
defer cancelFunc() defer cancelFunc()
@ -136,8 +152,11 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRu
Timeout: screenshotTimeout, Timeout: screenshotTimeout,
} }
// To prevent concurrent screenshots of the same dashboard panel we use singleflight,
// deduplicated on a base64 hash of the screenshot options.
optsHash := base64.StdEncoding.EncodeToString(opts.Hash()) optsHash := base64.StdEncoding.EncodeToString(opts.Hash())
result, err, _ := s.singleflight.Do(optsHash, func() (interface{}, error) { result, err, _ := s.singleflight.Do(optsHash, func() (interface{}, error) {
// Once deduplicated concurrent screenshots are then rate-limited
screenshot, err := s.limiter.Do(ctx, opts, s.screenshots.Take) screenshot, err := s.limiter.Do(ctx, opts, s.screenshots.Take)
if err != nil { if err != nil {
if errors.Is(err, dashboards.ErrDashboardNotFound) { if errors.Is(err, dashboards.ErrDashboardNotFound) {
@ -145,15 +164,20 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRu
} }
return nil, err return nil, err
} }
image := models.Image{Path: screenshot.Path} image := models.Image{Path: screenshot.Path}
// Uploading images is optional
if s.uploads != nil { if s.uploads != nil {
if image, err = s.uploads.Upload(ctx, image); err != nil { if image, err = s.uploads.Upload(ctx, image); err != nil {
s.logger.Warn("failed to upload image", "path", image.Path, "error", err) s.logger.Warn("Failed to upload image", "path", image.Path, "error", err)
} }
} }
if err := s.store.SaveImage(ctx, &image); err != nil { if err := s.store.SaveImage(ctx, &image); err != nil {
return nil, fmt.Errorf("failed to save image: %w", err) return nil, fmt.Errorf("failed to save image: %w", err)
} }
s.logger.Debug("Saved new image", "token", image.Token)
return image, nil return image, nil
}) })
if err != nil { if err != nil {
@ -161,6 +185,10 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRu
} }
image := result.(models.Image) image := result.(models.Image)
if err = s.cache.Set(ctx, r.GetKey().String(), image); err != nil {
s.logger.Warn("Failed to cache image", "token", image.Token, "error", err)
}
return &image, nil return &image, nil
} }

View File

@ -23,17 +23,21 @@ func TestScreenshotImageService(t *testing.T) {
defer ctrl.Finish() defer ctrl.Finish()
var ( var (
cache = NewMockCacheService(ctrl)
images = store.NewFakeImageStore(t) images = store.NewFakeImageStore(t)
limiter = screenshot.NoOpRateLimiter{} limiter = screenshot.NoOpRateLimiter{}
screenshots = screenshot.NewMockScreenshotService(ctrl) screenshots = screenshot.NewMockScreenshotService(ctrl)
uploads = imguploader.NewMockImageUploader(ctrl) uploads = imguploader.NewMockImageUploader(ctrl)
) )
s := NewScreenshotImageService(&limiter, log.NewNopLogger(), screenshots, images, s := NewScreenshotImageService(cache, &limiter, log.NewNopLogger(), screenshots, images,
NewUploadingService(uploads, prometheus.NewRegistry())) NewUploadingService(uploads, prometheus.NewRegistry()))
ctx := context.Background() ctx := context.Background()
// assert that the cache is checked for an existing image
cache.EXPECT().Get(gomock.Any(), "{orgID: 1, UID: foo}").Return(models.Image{}, false)
// assert that a screenshot is taken // assert that a screenshot is taken
screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{ screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{
DashboardUID: "foo", DashboardUID: "foo",
@ -43,11 +47,11 @@ func TestScreenshotImageService(t *testing.T) {
Path: "foo.png", Path: "foo.png",
}, nil) }, nil)
// the screenshot is made into an image and uploaded // assert that the screenshot is made into an image and uploaded
uploads.EXPECT().Upload(gomock.Any(), "foo.png"). uploads.EXPECT().Upload(gomock.Any(), "foo.png").
Return("https://example.com/foo.png", nil) Return("https://example.com/foo.png", nil)
// and then saved into the database // assert that the image is saved into the database
expected := models.Image{ expected := models.Image{
ID: 1, ID: 1,
Token: "foo", Token: "foo",
@ -55,12 +59,20 @@ func TestScreenshotImageService(t *testing.T) {
URL: "https://example.com/foo.png", URL: "https://example.com/foo.png",
} }
// assert that the image is saved into the cache
cache.EXPECT().Set(gomock.Any(), "{orgID: 1, UID: foo}", expected).Return(nil)
image, err := s.NewImage(ctx, &models.AlertRule{ image, err := s.NewImage(ctx, &models.AlertRule{
OrgID: 1,
UID: "foo",
DashboardUID: pointer.String("foo"), DashboardUID: pointer.String("foo"),
PanelID: pointer.Int64(1)}) PanelID: pointer.Int64(1)})
require.NoError(t, err) require.NoError(t, err)
assert.Equal(t, expected, *image) assert.Equal(t, expected, *image)
// assert that the cache is checked for an existing image
cache.EXPECT().Get(gomock.Any(), "{orgID: 1, UID: bar}").Return(models.Image{}, false)
// assert that a screenshot is taken // assert that a screenshot is taken
screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{ screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{
DashboardUID: "bar", DashboardUID: "bar",
@ -81,9 +93,27 @@ func TestScreenshotImageService(t *testing.T) {
Path: "bar.png", Path: "bar.png",
} }
// assert that the image is saved into the cache, but without a URL
cache.EXPECT().Set(gomock.Any(), "{orgID: 1, UID: bar}", expected).Return(nil)
image, err = s.NewImage(ctx, &models.AlertRule{ image, err = s.NewImage(ctx, &models.AlertRule{
OrgID: 1,
UID: "bar",
DashboardUID: pointer.String("bar"), DashboardUID: pointer.String("bar"),
PanelID: pointer.Int64(1)}) PanelID: pointer.Int64(1)})
require.NoError(t, err) require.NoError(t, err)
assert.Equal(t, expected, *image) assert.Equal(t, expected, *image)
expected = models.Image{Path: "baz.png", URL: "https://example.com/baz.png"}
// assert that the cache is checked for an existing image and it is returned
cache.EXPECT().Get(gomock.Any(), "{orgID: 1, UID: baz}").Return(expected, true)
image, err = s.NewImage(ctx, &models.AlertRule{
OrgID: 1,
UID: "baz",
DashboardUID: pointer.String("baz"),
PanelID: pointer.Int64(1)})
require.NoError(t, err)
assert.Equal(t, expected, *image)
} }