Alerting: Support customizable timeout for screenshots (#60981)

This commit adds a customizable timeout for screenshots called
capture_timeout. The default value is 10 seconds, and the maximum
value is 30 seconds. This timeout should be less than the minimum
Interval of all Evaluation Groups to avoid back pressure on alert
rule evaluation.
This commit is contained in:
George Robinson 2023-01-05 16:07:46 +00:00 committed by GitHub
parent bd9cce2866
commit 9af7adef76
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 148 additions and 100 deletions

View File

@ -942,6 +942,12 @@ min_interval = 10s
# For more information on configuration options, refer to [rendering].
capture = false
# The timeout for capturing screenshots. If a screenshot cannot be captured within the timeout then
# the notification is sent without a screenshot. The maximum duration is 30 seconds. This timeout
# should be less than the minimum Interval of all Evaluation Groups to avoid back pressure on alert
# rule evaluation.
capture_timeout = 10s
# The maximum number of screenshots that can be taken at the same time. This option is different from
# concurrent_render_request_limit as max_concurrent_screenshots sets the number of concurrent screenshots
# that can be taken at the same time for all firing alerts where as concurrent_render_request_limit sets
@ -1388,4 +1394,4 @@ client_key =
client_cert =
server_name =
# The address of the socks5 proxy datasources should connect to
proxy_address =
proxy_address =

View File

@ -21,7 +21,6 @@ import (
)
const (
screenshotTimeout = 10 * time.Second
screenshotCacheTTL = time.Minute
)
@ -48,13 +47,14 @@ type ImageService interface {
// as an annotation or label to the Alertmanager. This service cannot take
// screenshots of alert rules that are not associated with a dashboard panel.
type ScreenshotImageService struct {
cache CacheService
limiter screenshot.RateLimiter
logger log.Logger
screenshots screenshot.ScreenshotService
singleflight singleflight.Group
store store.ImageStore
uploads *UploadingService
cache CacheService
limiter screenshot.RateLimiter
logger log.Logger
screenshots screenshot.ScreenshotService
screenshotTimeout time.Duration
singleflight singleflight.Group
store store.ImageStore
uploads *UploadingService
}
// NewScreenshotImageService returns a new ScreenshotImageService.
@ -63,15 +63,17 @@ func NewScreenshotImageService(
limiter screenshot.RateLimiter,
logger log.Logger,
screenshots screenshot.ScreenshotService,
screenshotTimeout time.Duration,
store store.ImageStore,
uploads *UploadingService) ImageService {
return &ScreenshotImageService{
cache: cache,
limiter: limiter,
logger: logger,
screenshots: screenshots,
store: store,
uploads: uploads,
cache: cache,
limiter: limiter,
logger: logger,
screenshots: screenshots,
screenshotTimeout: screenshotTimeout,
store: store,
uploads: uploads,
}
}
@ -80,10 +82,11 @@ func NewScreenshotImageService(
func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds dashboards.DashboardService,
rs rendering.Service, r prometheus.Registerer) (ImageService, error) {
var (
cache CacheService = &NoOpCacheService{}
limiter screenshot.RateLimiter = &screenshot.NoOpRateLimiter{}
screenshots screenshot.ScreenshotService = &screenshot.ScreenshotUnavailableService{}
uploads *UploadingService = nil
cache CacheService = &NoOpCacheService{}
limiter screenshot.RateLimiter = &screenshot.NoOpRateLimiter{}
screenshots screenshot.ScreenshotService = &screenshot.ScreenshotUnavailableService{}
screenshotTimeout time.Duration = 0
uploads *UploadingService = nil
)
// If screenshots are enabled
@ -91,6 +94,7 @@ func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds da
cache = NewInmemCacheService(screenshotCacheTTL, r)
limiter = screenshot.NewTokenRateLimiter(cfg.UnifiedAlerting.Screenshots.MaxConcurrentScreenshots)
screenshots = screenshot.NewHeadlessScreenshotService(ds, rs, r)
screenshotTimeout = cfg.UnifiedAlerting.Screenshots.CaptureTimeout
// Image uploading is an optional feature
if cfg.UnifiedAlerting.Screenshots.UploadExternalImageStorage {
@ -102,16 +106,17 @@ func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds da
}
}
return NewScreenshotImageService(cache, limiter, log.New("ngalert.image"), screenshots, db, uploads), nil
return NewScreenshotImageService(cache, limiter, log.New("ngalert.image"),
screenshots, screenshotTimeout, db, uploads), nil
}
// NewImage returns a screenshot of the alert rule or an error.
//
// The alert rule must be associated with a dashboard panel for a screenshot to be
// taken. If the alert rule does not have a Dashboard UID in its annotations,
// or the dashboard does not exist, an models.ErrNoDashboard error is returned. If the
// or the dashboard does not exist, a models.ErrNoDashboard error is returned. If the
// alert rule has a Dashboard UID and the dashboard exists, but does not have a
// Panel ID in its annotations then an models.ErrNoPanel error is returned.
// Panel ID in its annotations then a models.ErrNoPanel error is returned.
func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRule) (*models.Image, error) {
logger := s.logger.FromContext(ctx)
@ -132,7 +137,7 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRu
opts := screenshot.ScreenshotOptions{
DashboardUID: dashboardUID,
PanelID: panelID,
Timeout: screenshotTimeout,
Timeout: s.screenshotTimeout,
}
// To prevent concurrent screenshots of the same dashboard panel we use singleflight,
@ -145,19 +150,19 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRu
return &image, nil
}
// We create both a context with timeout and set a timeout in ScreenshotOptions. The timeout
// in the context is used for both database queries and the request to the rendering service,
// while the timeout in ScreenshotOptions is passed to the rendering service where it is used as
// a client timeout. It is not recommended to pass a context without a deadline and the context
// deadline should be at least as long as the timeout in ScreenshotOptions.
ctx, cancelFunc := context.WithTimeout(ctx, screenshotTimeout)
defer cancelFunc()
logger.Debug("Requesting screenshot")
result, err, _ := s.singleflight.Do(optsHash, func() (interface{}, error) {
// We create both a context with timeout and set a timeout in ScreenshotOptions. The timeout
// in the context is used for both database queries and the request to the rendering service,
// while the timeout in ScreenshotOptions is passed to the rendering service where it is used as
// a client timeout. It is not recommended to pass a context without a deadline and the context
// deadline should be at least as long as the timeout in ScreenshotOptions.
screenshotCtx, cancelFunc := context.WithTimeout(ctx, s.screenshotTimeout)
defer cancelFunc()
// Once deduplicated concurrent screenshots are then rate-limited
screenshot, err := s.limiter.Do(ctx, opts, s.screenshots.Take)
screenshot, err := s.limiter.Do(screenshotCtx, opts, s.screenshots.Take)
if err != nil {
if errors.Is(err, dashboards.ErrDashboardNotFound) {
return nil, models.ErrNoDashboard

View File

@ -4,6 +4,7 @@ import (
"context"
"errors"
"testing"
"time"
"github.com/golang/mock/gomock"
"github.com/prometheus/client_golang/prometheus"
@ -30,90 +31,116 @@ func TestScreenshotImageService(t *testing.T) {
uploads = imguploader.NewMockImageUploader(ctrl)
)
s := NewScreenshotImageService(cache, &limiter, log.NewNopLogger(), screenshots, images,
s := NewScreenshotImageService(cache, &limiter, log.NewNopLogger(), screenshots, 5*time.Second, images,
NewUploadingService(uploads, prometheus.NewRegistry()))
ctx := context.Background()
// assert that the cache is checked for an existing image
cache.EXPECT().Get(gomock.Any(), "M2DGZaRLXtg=").Return(models.Image{}, false)
t.Run("image is taken, uploaded, saved to database and cached", func(t *testing.T) {
// assert that the cache is checked for an existing image
cache.EXPECT().Get(gomock.Any(), "M2DGZaRLXtg=").Return(models.Image{}, false)
// assert that a screenshot is taken
screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{
DashboardUID: "foo",
PanelID: 1,
Timeout: screenshotTimeout,
}).Return(&screenshot.Screenshot{
Path: "foo.png",
}, nil)
// assert that a screenshot is taken
screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{
DashboardUID: "foo",
PanelID: 1,
Timeout: 5 * time.Second,
}).Return(&screenshot.Screenshot{
Path: "foo.png",
}, nil)
// assert that the screenshot is made into an image and uploaded
uploads.EXPECT().Upload(gomock.Any(), "foo.png").
Return("https://example.com/foo.png", nil)
// assert that the screenshot is made into an image and uploaded
uploads.EXPECT().Upload(gomock.Any(), "foo.png").
Return("https://example.com/foo.png", nil)
// assert that the image is saved into the database
expected := models.Image{
ID: 1,
Token: "foo",
Path: "foo.png",
URL: "https://example.com/foo.png",
}
// assert that the image is saved into the database
expected := models.Image{
ID: 1,
Token: "foo",
Path: "foo.png",
URL: "https://example.com/foo.png",
}
// assert that the image is saved into the cache
cache.EXPECT().Set(gomock.Any(), "M2DGZaRLXtg=", expected).Return(nil)
// assert that the image is saved into the cache
cache.EXPECT().Set(gomock.Any(), "M2DGZaRLXtg=", expected).Return(nil)
image, err := s.NewImage(ctx, &models.AlertRule{
OrgID: 1,
UID: "foo",
DashboardUID: pointer.String("foo"),
PanelID: pointer.Int64(1)})
require.NoError(t, err)
assert.Equal(t, expected, *image)
image, err := s.NewImage(ctx, &models.AlertRule{
OrgID: 1,
UID: "foo",
DashboardUID: pointer.String("foo"),
PanelID: pointer.Int64(1)})
require.NoError(t, err)
assert.Equal(t, expected, *image)
})
// assert that the cache is checked for an existing image
cache.EXPECT().Get(gomock.Any(), "rTOWVcbRidk=").Return(models.Image{}, false)
t.Run("image is taken, upload return error, saved to database without URL and cached", func(t *testing.T) {
// assert that the cache is checked for an existing image
cache.EXPECT().Get(gomock.Any(), "rTOWVcbRidk=").Return(models.Image{}, false)
// assert that a screenshot is taken
screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{
DashboardUID: "bar",
PanelID: 1,
Timeout: screenshotTimeout,
}).Return(&screenshot.Screenshot{
Path: "bar.png",
}, nil)
// assert that a screenshot is taken
screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{
DashboardUID: "bar",
PanelID: 1,
Timeout: 5 * time.Second,
}).Return(&screenshot.Screenshot{
Path: "bar.png",
}, nil)
// the screenshot is made into an image and uploaded, but the upload returns an error
uploads.EXPECT().Upload(gomock.Any(), "bar.png").
Return("", errors.New("failed to upload bar.png"))
// the screenshot is made into an image and uploaded, but the upload returns an error
uploads.EXPECT().Upload(gomock.Any(), "bar.png").
Return("", errors.New("failed to upload bar.png"))
// and then saved into the database, but without a URL
expected = models.Image{
ID: 2,
Token: "bar",
Path: "bar.png",
}
// and then saved into the database, but without a URL
expected := models.Image{
ID: 2,
Token: "bar",
Path: "bar.png",
}
// assert that the image is saved into the cache, but without a URL
cache.EXPECT().Set(gomock.Any(), "rTOWVcbRidk=", expected).Return(nil)
// assert that the image is saved into the cache, but without a URL
cache.EXPECT().Set(gomock.Any(), "rTOWVcbRidk=", expected).Return(nil)
image, err = s.NewImage(ctx, &models.AlertRule{
OrgID: 1,
UID: "bar",
DashboardUID: pointer.String("bar"),
PanelID: pointer.Int64(1)})
require.NoError(t, err)
assert.Equal(t, expected, *image)
image, err := s.NewImage(ctx, &models.AlertRule{
OrgID: 1,
UID: "bar",
DashboardUID: pointer.String("bar"),
PanelID: pointer.Int64(1)})
require.NoError(t, err)
assert.Equal(t, expected, *image)
})
expected = models.Image{Path: "baz.png", URL: "https://example.com/baz.png"}
t.Run("image is returned from cache", func(t *testing.T) {
expected := models.Image{Path: "baz.png", URL: "https://example.com/baz.png"}
// assert that the cache is checked for an existing image and it is returned
cache.EXPECT().Get(gomock.Any(), "8hJuVe20rVE=").Return(expected, true)
// assert that the cache is checked for an existing image and it is returned
cache.EXPECT().Get(gomock.Any(), "8hJuVe20rVE=").Return(expected, true)
image, err = s.NewImage(ctx, &models.AlertRule{
OrgID: 1,
UID: "baz",
DashboardUID: pointer.String("baz"),
PanelID: pointer.Int64(1)})
require.NoError(t, err)
assert.Equal(t, expected, *image)
image, err := s.NewImage(ctx, &models.AlertRule{
OrgID: 1,
UID: "baz",
DashboardUID: pointer.String("baz"),
PanelID: pointer.Int64(1)})
require.NoError(t, err)
assert.Equal(t, expected, *image)
})
t.Run("error is returned when timeout is exceeded", func(t *testing.T) {
// assert that the cache is checked for an existing image
cache.EXPECT().Get(gomock.Any(), "jtThkFaZLA4=").Return(models.Image{}, false)
// assert that when the timeout is exceeded an error is returned
screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{
DashboardUID: "qux",
PanelID: 1,
Timeout: 5 * time.Second,
}).Return(nil, context.DeadlineExceeded)
image, err := s.NewImage(ctx, &models.AlertRule{
OrgID: 1,
UID: "qux",
DashboardUID: pointer.String("qux"),
PanelID: pointer.Int64(1)})
assert.EqualError(t, err, "context deadline exceeded")
assert.Nil(t, image)
})
}

View File

@ -50,6 +50,8 @@ const (
schedulerDefaultMaxAttempts = 3
schedulerDefaultLegacyMinInterval = 1
screenshotsDefaultCapture = false
screenshotsDefaultCaptureTimeout = 10 * time.Second
screenshotsMaxCaptureTimeout = 30 * time.Second
screenshotsDefaultMaxConcurrent = 5
screenshotsDefaultUploadImageStorage = false
// SchedulerBaseInterval base interval of the scheduler. Controls how often the scheduler fetches database for new changes as well as schedules evaluation of a rule
@ -87,6 +89,7 @@ type UnifiedAlertingSettings struct {
type UnifiedAlertingScreenshotSettings struct {
Capture bool
CaptureTimeout time.Duration
MaxConcurrentScreenshots int64
UploadExternalImageStorage bool
}
@ -281,6 +284,13 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
uaCfgScreenshots := uaCfg.Screenshots
uaCfgScreenshots.Capture = screenshots.Key("capture").MustBool(screenshotsDefaultCapture)
captureTimeout := screenshots.Key("capture_timeout").MustDuration(screenshotsDefaultCaptureTimeout)
if captureTimeout > screenshotsMaxCaptureTimeout {
return fmt.Errorf("value of setting 'capture_timeout' cannot exceed %s", screenshotsMaxCaptureTimeout)
}
uaCfgScreenshots.CaptureTimeout = captureTimeout
uaCfgScreenshots.MaxConcurrentScreenshots = screenshots.Key("max_concurrent_screenshots").MustInt64(screenshotsDefaultMaxConcurrent)
uaCfgScreenshots.UploadExternalImageStorage = screenshots.Key("upload_external_image_storage").MustBool(screenshotsDefaultUploadImageStorage)
uaCfg.Screenshots = uaCfgScreenshots