diff --git a/pkg/services/ngalert/image/cache.go b/pkg/services/ngalert/image/cache.go new file mode 100644 index 00000000000..4de259d8bd1 --- /dev/null +++ b/pkg/services/ngalert/image/cache.go @@ -0,0 +1,76 @@ +package image + +import ( + "context" + "time" + + gocache "github.com/patrickmn/go-cache" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + + "github.com/grafana/grafana/pkg/services/ngalert/models" +) + +const ( + namespace = "grafana" + subsystem = "alerting" +) + +// CacheService caches images. +// +//go:generate mockgen -destination=cache_mock.go -package=image github.com/grafana/grafana/pkg/services/ngalert/image CacheService +type CacheService interface { + // Get returns the screenshot for the options or false if a screenshot with these + // options does not exist. + Get(ctx context.Context, k string) (models.Image, bool) + // Set the screenshot for the options. If another screenshot exists with these + // options then it will be replaced. + Set(ctx context.Context, k string, image models.Image) error +} + +// InmemCacheService is an in-mem screenshot cache. +type InmemCacheService struct { + cache *gocache.Cache + cacheHits prometheus.Counter + cacheMisses prometheus.Counter +} + +func NewInmemCacheService(expiration time.Duration, r prometheus.Registerer) CacheService { + return &InmemCacheService{ + cache: gocache.New(expiration, time.Minute), + cacheHits: promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "image_cache_hits_total", + Namespace: namespace, + Subsystem: subsystem, + }), + cacheMisses: promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "image_cache_misses_total", + Namespace: namespace, + Subsystem: subsystem, + }), + } +} + +func (s *InmemCacheService) Get(_ context.Context, k string) (models.Image, bool) { + if v, ok := s.cache.Get(k); ok { + defer s.cacheHits.Inc() + return v.(models.Image), true + } + defer s.cacheMisses.Inc() + return models.Image{}, false +} + +func (s *InmemCacheService) Set(_ context.Context, k string, screenshot models.Image) error { + s.cache.Set(k, screenshot, 0) + return nil +} + +type NoOpCacheService struct{} + +func (s *NoOpCacheService) Get(_ context.Context, _ string) (models.Image, bool) { + return models.Image{}, false +} + +func (s *NoOpCacheService) Set(_ context.Context, _ string, _ models.Image) error { + return nil +} diff --git a/pkg/services/ngalert/image/cache_mock.go b/pkg/services/ngalert/image/cache_mock.go new file mode 100644 index 00000000000..655e43fe241 --- /dev/null +++ b/pkg/services/ngalert/image/cache_mock.go @@ -0,0 +1,65 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/grafana/grafana/pkg/services/ngalert/image (interfaces: CacheService) + +// Package image is a generated GoMock package. +package image + +import ( + context "context" + reflect "reflect" + + gomock "github.com/golang/mock/gomock" + models "github.com/grafana/grafana/pkg/services/ngalert/models" +) + +// MockCacheService is a mock of CacheService interface. +type MockCacheService struct { + ctrl *gomock.Controller + recorder *MockCacheServiceMockRecorder +} + +// MockCacheServiceMockRecorder is the mock recorder for MockCacheService. +type MockCacheServiceMockRecorder struct { + mock *MockCacheService +} + +// NewMockCacheService creates a new mock instance. +func NewMockCacheService(ctrl *gomock.Controller) *MockCacheService { + mock := &MockCacheService{ctrl: ctrl} + mock.recorder = &MockCacheServiceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockCacheService) EXPECT() *MockCacheServiceMockRecorder { + return m.recorder +} + +// Get mocks base method. +func (m *MockCacheService) Get(arg0 context.Context, arg1 string) (models.Image, bool) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Get", arg0, arg1) + ret0, _ := ret[0].(models.Image) + ret1, _ := ret[1].(bool) + return ret0, ret1 +} + +// Get indicates an expected call of Get. +func (mr *MockCacheServiceMockRecorder) Get(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Get", reflect.TypeOf((*MockCacheService)(nil).Get), arg0, arg1) +} + +// Set mocks base method. +func (m *MockCacheService) Set(arg0 context.Context, arg1 string, arg2 models.Image) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Set", arg0, arg1, arg2) + ret0, _ := ret[0].(error) + return ret0 +} + +// Set indicates an expected call of Set. +func (mr *MockCacheServiceMockRecorder) Set(arg0, arg1, arg2 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Set", reflect.TypeOf((*MockCacheService)(nil).Set), arg0, arg1, arg2) +} diff --git a/pkg/services/ngalert/image/cache_test.go b/pkg/services/ngalert/image/cache_test.go new file mode 100644 index 00000000000..a5a889dd8e2 --- /dev/null +++ b/pkg/services/ngalert/image/cache_test.go @@ -0,0 +1,38 @@ +package image + +import ( + "context" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/grafana/grafana/pkg/services/ngalert/models" +) + +func TestInmemCacheService(t *testing.T) { + s := NewInmemCacheService(time.Second, prometheus.DefaultRegisterer) + ctx := context.Background() + + // should be a miss + actual, ok := s.Get(ctx, "test") + assert.False(t, ok) + assert.Equal(t, models.Image{}, actual) + + // should be a hit + expected := models.Image{Path: "test.png"} + require.NoError(t, s.Set(ctx, "test", expected)) + actual, ok = s.Get(ctx, "test") + assert.True(t, ok) + assert.Equal(t, expected, actual) + + // wait 1s and the cached image should have expired + <-time.After(time.Second) + + // should be a miss + actual, ok = s.Get(ctx, "test") + assert.False(t, ok) + assert.Equal(t, models.Image{}, actual) +} diff --git a/pkg/services/ngalert/image/service.go b/pkg/services/ngalert/image/service.go index 0c6a9d280e3..75076d8d2f0 100644 --- a/pkg/services/ngalert/image/service.go +++ b/pkg/services/ngalert/image/service.go @@ -21,8 +21,8 @@ import ( ) const ( - screenshotTimeout = 10 * time.Second screenshotCacheTTL = 60 * time.Second + screenshotTimeout = 10 * time.Second ) var ( @@ -59,6 +59,7 @@ type ImageService interface { // as an annotation or label to the Alertmanager. This service cannot take // screenshots of alert rules that are not associated with a dashboard panel. type ScreenshotImageService struct { + cache CacheService limiter screenshot.RateLimiter logger log.Logger screenshots screenshot.ScreenshotService @@ -69,12 +70,14 @@ type ScreenshotImageService struct { // NewScreenshotImageService returns a new ScreenshotImageService. func NewScreenshotImageService( + cache CacheService, limiter screenshot.RateLimiter, logger log.Logger, screenshots screenshot.ScreenshotService, store store.ImageStore, uploads *UploadingService) ImageService { return &ScreenshotImageService{ + cache: cache, limiter: limiter, logger: logger, screenshots: screenshots, @@ -88,6 +91,7 @@ func NewScreenshotImageService( func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds dashboards.DashboardService, rs rendering.Service, r prometheus.Registerer) (ImageService, error) { var ( + cache CacheService = &NoOpCacheService{} limiter screenshot.RateLimiter = &screenshot.NoOpRateLimiter{} screenshots screenshot.ScreenshotService = &screenshot.ScreenshotUnavailableService{} uploads *UploadingService = nil @@ -95,6 +99,7 @@ func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds da // If screenshots are enabled if cfg.UnifiedAlerting.Screenshots.Capture { + cache = NewInmemCacheService(screenshotCacheTTL, r) limiter = screenshot.NewTokenRateLimiter(cfg.UnifiedAlerting.Screenshots.MaxConcurrentScreenshots) screenshots = screenshot.NewHeadlessScreenshotService(ds, rs, r) @@ -108,7 +113,7 @@ func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds da } } - return NewScreenshotImageService(limiter, cfg.Logger, screenshots, db, uploads), nil + return NewScreenshotImageService(cache, limiter, log.New("ngalert.image"), screenshots, db, uploads), nil } // NewImage returns a screenshot of the alert rule or an error. @@ -119,7 +124,7 @@ func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds da // alert rule has a Dashboard UID and the dashboard exists, but does not have a // Panel ID in its annotations then an ErrNoPanel error is returned. func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRule) (*models.Image, error) { - if r.DashboardUID == nil { + if r.DashboardUID == nil || *r.DashboardUID == "" { return nil, ErrNoDashboard } @@ -127,6 +132,17 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRu return nil, ErrNoPanel } + // If there is an image is in the cache return it instead of taking another screenshot + if image, ok := s.cache.Get(ctx, r.GetKey().String()); ok { + s.logger.Debug("Found cached image", "token", image.Token) + return &image, nil + } + + // We create both a context with timeout and set a timeout in ScreenshotOptions. The timeout + // in the context is used for both database queries and the request to the rendering service, + // while the timeout in ScreenshotOptions is passed to the rendering service where it is used as + // a client timeout. It is not recommended to pass a context without a deadline and the context + // deadline should be at least as long as the timeout in ScreenshotOptions. ctx, cancelFunc := context.WithTimeout(ctx, screenshotTimeout) defer cancelFunc() @@ -136,8 +152,11 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRu Timeout: screenshotTimeout, } + // To prevent concurrent screenshots of the same dashboard panel we use singleflight, + // deduplicated on a base64 hash of the screenshot options. optsHash := base64.StdEncoding.EncodeToString(opts.Hash()) result, err, _ := s.singleflight.Do(optsHash, func() (interface{}, error) { + // Once deduplicated concurrent screenshots are then rate-limited screenshot, err := s.limiter.Do(ctx, opts, s.screenshots.Take) if err != nil { if errors.Is(err, dashboards.ErrDashboardNotFound) { @@ -145,15 +164,20 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRu } return nil, err } + image := models.Image{Path: screenshot.Path} + + // Uploading images is optional if s.uploads != nil { if image, err = s.uploads.Upload(ctx, image); err != nil { - s.logger.Warn("failed to upload image", "path", image.Path, "error", err) + s.logger.Warn("Failed to upload image", "path", image.Path, "error", err) } } + if err := s.store.SaveImage(ctx, &image); err != nil { return nil, fmt.Errorf("failed to save image: %w", err) } + s.logger.Debug("Saved new image", "token", image.Token) return image, nil }) if err != nil { @@ -161,6 +185,10 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRu } image := result.(models.Image) + if err = s.cache.Set(ctx, r.GetKey().String(), image); err != nil { + s.logger.Warn("Failed to cache image", "token", image.Token, "error", err) + } + return &image, nil } diff --git a/pkg/services/ngalert/image/service_test.go b/pkg/services/ngalert/image/service_test.go index 62bd5eabd6a..57c0d1aeb78 100644 --- a/pkg/services/ngalert/image/service_test.go +++ b/pkg/services/ngalert/image/service_test.go @@ -23,17 +23,21 @@ func TestScreenshotImageService(t *testing.T) { defer ctrl.Finish() var ( + cache = NewMockCacheService(ctrl) images = store.NewFakeImageStore(t) limiter = screenshot.NoOpRateLimiter{} screenshots = screenshot.NewMockScreenshotService(ctrl) uploads = imguploader.NewMockImageUploader(ctrl) ) - s := NewScreenshotImageService(&limiter, log.NewNopLogger(), screenshots, images, + s := NewScreenshotImageService(cache, &limiter, log.NewNopLogger(), screenshots, images, NewUploadingService(uploads, prometheus.NewRegistry())) ctx := context.Background() + // assert that the cache is checked for an existing image + cache.EXPECT().Get(gomock.Any(), "{orgID: 1, UID: foo}").Return(models.Image{}, false) + // assert that a screenshot is taken screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{ DashboardUID: "foo", @@ -43,11 +47,11 @@ func TestScreenshotImageService(t *testing.T) { Path: "foo.png", }, nil) - // the screenshot is made into an image and uploaded + // assert that the screenshot is made into an image and uploaded uploads.EXPECT().Upload(gomock.Any(), "foo.png"). Return("https://example.com/foo.png", nil) - // and then saved into the database + // assert that the image is saved into the database expected := models.Image{ ID: 1, Token: "foo", @@ -55,12 +59,20 @@ func TestScreenshotImageService(t *testing.T) { URL: "https://example.com/foo.png", } + // assert that the image is saved into the cache + cache.EXPECT().Set(gomock.Any(), "{orgID: 1, UID: foo}", expected).Return(nil) + image, err := s.NewImage(ctx, &models.AlertRule{ + OrgID: 1, + UID: "foo", DashboardUID: pointer.String("foo"), PanelID: pointer.Int64(1)}) require.NoError(t, err) assert.Equal(t, expected, *image) + // assert that the cache is checked for an existing image + cache.EXPECT().Get(gomock.Any(), "{orgID: 1, UID: bar}").Return(models.Image{}, false) + // assert that a screenshot is taken screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{ DashboardUID: "bar", @@ -81,9 +93,27 @@ func TestScreenshotImageService(t *testing.T) { Path: "bar.png", } + // assert that the image is saved into the cache, but without a URL + cache.EXPECT().Set(gomock.Any(), "{orgID: 1, UID: bar}", expected).Return(nil) + image, err = s.NewImage(ctx, &models.AlertRule{ + OrgID: 1, + UID: "bar", DashboardUID: pointer.String("bar"), PanelID: pointer.Int64(1)}) require.NoError(t, err) assert.Equal(t, expected, *image) + + expected = models.Image{Path: "baz.png", URL: "https://example.com/baz.png"} + + // assert that the cache is checked for an existing image and it is returned + cache.EXPECT().Get(gomock.Any(), "{orgID: 1, UID: baz}").Return(expected, true) + + image, err = s.NewImage(ctx, &models.AlertRule{ + OrgID: 1, + UID: "baz", + DashboardUID: pointer.String("baz"), + PanelID: pointer.Int64(1)}) + require.NoError(t, err) + assert.Equal(t, expected, *image) }