mirror of
https://github.com/grafana/grafana.git
synced 2024-12-28 01:41:24 -06:00
Alerting: Support customizable timeout for screenshots (#60981)
This commit adds a customizable timeout for screenshots called capture_timeout. The default value is 10 seconds, and the maximum value is 30 seconds. This timeout should be less than the minimum Interval of all Evaluation Groups to avoid back pressure on alert rule evaluation.
This commit is contained in:
parent
bd9cce2866
commit
9af7adef76
@ -942,6 +942,12 @@ min_interval = 10s
|
||||
# For more information on configuration options, refer to [rendering].
|
||||
capture = false
|
||||
|
||||
# The timeout for capturing screenshots. If a screenshot cannot be captured within the timeout then
|
||||
# the notification is sent without a screenshot. The maximum duration is 30 seconds. This timeout
|
||||
# should be less than the minimum Interval of all Evaluation Groups to avoid back pressure on alert
|
||||
# rule evaluation.
|
||||
capture_timeout = 10s
|
||||
|
||||
# The maximum number of screenshots that can be taken at the same time. This option is different from
|
||||
# concurrent_render_request_limit as max_concurrent_screenshots sets the number of concurrent screenshots
|
||||
# that can be taken at the same time for all firing alerts where as concurrent_render_request_limit sets
|
||||
@ -1388,4 +1394,4 @@ client_key =
|
||||
client_cert =
|
||||
server_name =
|
||||
# The address of the socks5 proxy datasources should connect to
|
||||
proxy_address =
|
||||
proxy_address =
|
||||
|
@ -21,7 +21,6 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
screenshotTimeout = 10 * time.Second
|
||||
screenshotCacheTTL = time.Minute
|
||||
)
|
||||
|
||||
@ -48,13 +47,14 @@ type ImageService interface {
|
||||
// as an annotation or label to the Alertmanager. This service cannot take
|
||||
// screenshots of alert rules that are not associated with a dashboard panel.
|
||||
type ScreenshotImageService struct {
|
||||
cache CacheService
|
||||
limiter screenshot.RateLimiter
|
||||
logger log.Logger
|
||||
screenshots screenshot.ScreenshotService
|
||||
singleflight singleflight.Group
|
||||
store store.ImageStore
|
||||
uploads *UploadingService
|
||||
cache CacheService
|
||||
limiter screenshot.RateLimiter
|
||||
logger log.Logger
|
||||
screenshots screenshot.ScreenshotService
|
||||
screenshotTimeout time.Duration
|
||||
singleflight singleflight.Group
|
||||
store store.ImageStore
|
||||
uploads *UploadingService
|
||||
}
|
||||
|
||||
// NewScreenshotImageService returns a new ScreenshotImageService.
|
||||
@ -63,15 +63,17 @@ func NewScreenshotImageService(
|
||||
limiter screenshot.RateLimiter,
|
||||
logger log.Logger,
|
||||
screenshots screenshot.ScreenshotService,
|
||||
screenshotTimeout time.Duration,
|
||||
store store.ImageStore,
|
||||
uploads *UploadingService) ImageService {
|
||||
return &ScreenshotImageService{
|
||||
cache: cache,
|
||||
limiter: limiter,
|
||||
logger: logger,
|
||||
screenshots: screenshots,
|
||||
store: store,
|
||||
uploads: uploads,
|
||||
cache: cache,
|
||||
limiter: limiter,
|
||||
logger: logger,
|
||||
screenshots: screenshots,
|
||||
screenshotTimeout: screenshotTimeout,
|
||||
store: store,
|
||||
uploads: uploads,
|
||||
}
|
||||
}
|
||||
|
||||
@ -80,10 +82,11 @@ func NewScreenshotImageService(
|
||||
func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds dashboards.DashboardService,
|
||||
rs rendering.Service, r prometheus.Registerer) (ImageService, error) {
|
||||
var (
|
||||
cache CacheService = &NoOpCacheService{}
|
||||
limiter screenshot.RateLimiter = &screenshot.NoOpRateLimiter{}
|
||||
screenshots screenshot.ScreenshotService = &screenshot.ScreenshotUnavailableService{}
|
||||
uploads *UploadingService = nil
|
||||
cache CacheService = &NoOpCacheService{}
|
||||
limiter screenshot.RateLimiter = &screenshot.NoOpRateLimiter{}
|
||||
screenshots screenshot.ScreenshotService = &screenshot.ScreenshotUnavailableService{}
|
||||
screenshotTimeout time.Duration = 0
|
||||
uploads *UploadingService = nil
|
||||
)
|
||||
|
||||
// If screenshots are enabled
|
||||
@ -91,6 +94,7 @@ func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds da
|
||||
cache = NewInmemCacheService(screenshotCacheTTL, r)
|
||||
limiter = screenshot.NewTokenRateLimiter(cfg.UnifiedAlerting.Screenshots.MaxConcurrentScreenshots)
|
||||
screenshots = screenshot.NewHeadlessScreenshotService(ds, rs, r)
|
||||
screenshotTimeout = cfg.UnifiedAlerting.Screenshots.CaptureTimeout
|
||||
|
||||
// Image uploading is an optional feature
|
||||
if cfg.UnifiedAlerting.Screenshots.UploadExternalImageStorage {
|
||||
@ -102,16 +106,17 @@ func NewScreenshotImageServiceFromCfg(cfg *setting.Cfg, db *store.DBstore, ds da
|
||||
}
|
||||
}
|
||||
|
||||
return NewScreenshotImageService(cache, limiter, log.New("ngalert.image"), screenshots, db, uploads), nil
|
||||
return NewScreenshotImageService(cache, limiter, log.New("ngalert.image"),
|
||||
screenshots, screenshotTimeout, db, uploads), nil
|
||||
}
|
||||
|
||||
// NewImage returns a screenshot of the alert rule or an error.
|
||||
//
|
||||
// The alert rule must be associated with a dashboard panel for a screenshot to be
|
||||
// taken. If the alert rule does not have a Dashboard UID in its annotations,
|
||||
// or the dashboard does not exist, an models.ErrNoDashboard error is returned. If the
|
||||
// or the dashboard does not exist, a models.ErrNoDashboard error is returned. If the
|
||||
// alert rule has a Dashboard UID and the dashboard exists, but does not have a
|
||||
// Panel ID in its annotations then an models.ErrNoPanel error is returned.
|
||||
// Panel ID in its annotations then a models.ErrNoPanel error is returned.
|
||||
func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRule) (*models.Image, error) {
|
||||
logger := s.logger.FromContext(ctx)
|
||||
|
||||
@ -132,7 +137,7 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRu
|
||||
opts := screenshot.ScreenshotOptions{
|
||||
DashboardUID: dashboardUID,
|
||||
PanelID: panelID,
|
||||
Timeout: screenshotTimeout,
|
||||
Timeout: s.screenshotTimeout,
|
||||
}
|
||||
|
||||
// To prevent concurrent screenshots of the same dashboard panel we use singleflight,
|
||||
@ -145,19 +150,19 @@ func (s *ScreenshotImageService) NewImage(ctx context.Context, r *models.AlertRu
|
||||
return &image, nil
|
||||
}
|
||||
|
||||
// We create both a context with timeout and set a timeout in ScreenshotOptions. The timeout
|
||||
// in the context is used for both database queries and the request to the rendering service,
|
||||
// while the timeout in ScreenshotOptions is passed to the rendering service where it is used as
|
||||
// a client timeout. It is not recommended to pass a context without a deadline and the context
|
||||
// deadline should be at least as long as the timeout in ScreenshotOptions.
|
||||
ctx, cancelFunc := context.WithTimeout(ctx, screenshotTimeout)
|
||||
defer cancelFunc()
|
||||
|
||||
logger.Debug("Requesting screenshot")
|
||||
|
||||
result, err, _ := s.singleflight.Do(optsHash, func() (interface{}, error) {
|
||||
// We create both a context with timeout and set a timeout in ScreenshotOptions. The timeout
|
||||
// in the context is used for both database queries and the request to the rendering service,
|
||||
// while the timeout in ScreenshotOptions is passed to the rendering service where it is used as
|
||||
// a client timeout. It is not recommended to pass a context without a deadline and the context
|
||||
// deadline should be at least as long as the timeout in ScreenshotOptions.
|
||||
screenshotCtx, cancelFunc := context.WithTimeout(ctx, s.screenshotTimeout)
|
||||
defer cancelFunc()
|
||||
|
||||
// Once deduplicated concurrent screenshots are then rate-limited
|
||||
screenshot, err := s.limiter.Do(ctx, opts, s.screenshots.Take)
|
||||
screenshot, err := s.limiter.Do(screenshotCtx, opts, s.screenshots.Take)
|
||||
if err != nil {
|
||||
if errors.Is(err, dashboards.ErrDashboardNotFound) {
|
||||
return nil, models.ErrNoDashboard
|
||||
|
@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/golang/mock/gomock"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
@ -30,90 +31,116 @@ func TestScreenshotImageService(t *testing.T) {
|
||||
uploads = imguploader.NewMockImageUploader(ctrl)
|
||||
)
|
||||
|
||||
s := NewScreenshotImageService(cache, &limiter, log.NewNopLogger(), screenshots, images,
|
||||
s := NewScreenshotImageService(cache, &limiter, log.NewNopLogger(), screenshots, 5*time.Second, images,
|
||||
NewUploadingService(uploads, prometheus.NewRegistry()))
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// assert that the cache is checked for an existing image
|
||||
cache.EXPECT().Get(gomock.Any(), "M2DGZaRLXtg=").Return(models.Image{}, false)
|
||||
t.Run("image is taken, uploaded, saved to database and cached", func(t *testing.T) {
|
||||
// assert that the cache is checked for an existing image
|
||||
cache.EXPECT().Get(gomock.Any(), "M2DGZaRLXtg=").Return(models.Image{}, false)
|
||||
|
||||
// assert that a screenshot is taken
|
||||
screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{
|
||||
DashboardUID: "foo",
|
||||
PanelID: 1,
|
||||
Timeout: screenshotTimeout,
|
||||
}).Return(&screenshot.Screenshot{
|
||||
Path: "foo.png",
|
||||
}, nil)
|
||||
// assert that a screenshot is taken
|
||||
screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{
|
||||
DashboardUID: "foo",
|
||||
PanelID: 1,
|
||||
Timeout: 5 * time.Second,
|
||||
}).Return(&screenshot.Screenshot{
|
||||
Path: "foo.png",
|
||||
}, nil)
|
||||
|
||||
// assert that the screenshot is made into an image and uploaded
|
||||
uploads.EXPECT().Upload(gomock.Any(), "foo.png").
|
||||
Return("https://example.com/foo.png", nil)
|
||||
// assert that the screenshot is made into an image and uploaded
|
||||
uploads.EXPECT().Upload(gomock.Any(), "foo.png").
|
||||
Return("https://example.com/foo.png", nil)
|
||||
|
||||
// assert that the image is saved into the database
|
||||
expected := models.Image{
|
||||
ID: 1,
|
||||
Token: "foo",
|
||||
Path: "foo.png",
|
||||
URL: "https://example.com/foo.png",
|
||||
}
|
||||
// assert that the image is saved into the database
|
||||
expected := models.Image{
|
||||
ID: 1,
|
||||
Token: "foo",
|
||||
Path: "foo.png",
|
||||
URL: "https://example.com/foo.png",
|
||||
}
|
||||
|
||||
// assert that the image is saved into the cache
|
||||
cache.EXPECT().Set(gomock.Any(), "M2DGZaRLXtg=", expected).Return(nil)
|
||||
// assert that the image is saved into the cache
|
||||
cache.EXPECT().Set(gomock.Any(), "M2DGZaRLXtg=", expected).Return(nil)
|
||||
|
||||
image, err := s.NewImage(ctx, &models.AlertRule{
|
||||
OrgID: 1,
|
||||
UID: "foo",
|
||||
DashboardUID: pointer.String("foo"),
|
||||
PanelID: pointer.Int64(1)})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, expected, *image)
|
||||
image, err := s.NewImage(ctx, &models.AlertRule{
|
||||
OrgID: 1,
|
||||
UID: "foo",
|
||||
DashboardUID: pointer.String("foo"),
|
||||
PanelID: pointer.Int64(1)})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, expected, *image)
|
||||
})
|
||||
|
||||
// assert that the cache is checked for an existing image
|
||||
cache.EXPECT().Get(gomock.Any(), "rTOWVcbRidk=").Return(models.Image{}, false)
|
||||
t.Run("image is taken, upload return error, saved to database without URL and cached", func(t *testing.T) {
|
||||
// assert that the cache is checked for an existing image
|
||||
cache.EXPECT().Get(gomock.Any(), "rTOWVcbRidk=").Return(models.Image{}, false)
|
||||
|
||||
// assert that a screenshot is taken
|
||||
screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{
|
||||
DashboardUID: "bar",
|
||||
PanelID: 1,
|
||||
Timeout: screenshotTimeout,
|
||||
}).Return(&screenshot.Screenshot{
|
||||
Path: "bar.png",
|
||||
}, nil)
|
||||
// assert that a screenshot is taken
|
||||
screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{
|
||||
DashboardUID: "bar",
|
||||
PanelID: 1,
|
||||
Timeout: 5 * time.Second,
|
||||
}).Return(&screenshot.Screenshot{
|
||||
Path: "bar.png",
|
||||
}, nil)
|
||||
|
||||
// the screenshot is made into an image and uploaded, but the upload returns an error
|
||||
uploads.EXPECT().Upload(gomock.Any(), "bar.png").
|
||||
Return("", errors.New("failed to upload bar.png"))
|
||||
// the screenshot is made into an image and uploaded, but the upload returns an error
|
||||
uploads.EXPECT().Upload(gomock.Any(), "bar.png").
|
||||
Return("", errors.New("failed to upload bar.png"))
|
||||
|
||||
// and then saved into the database, but without a URL
|
||||
expected = models.Image{
|
||||
ID: 2,
|
||||
Token: "bar",
|
||||
Path: "bar.png",
|
||||
}
|
||||
// and then saved into the database, but without a URL
|
||||
expected := models.Image{
|
||||
ID: 2,
|
||||
Token: "bar",
|
||||
Path: "bar.png",
|
||||
}
|
||||
|
||||
// assert that the image is saved into the cache, but without a URL
|
||||
cache.EXPECT().Set(gomock.Any(), "rTOWVcbRidk=", expected).Return(nil)
|
||||
// assert that the image is saved into the cache, but without a URL
|
||||
cache.EXPECT().Set(gomock.Any(), "rTOWVcbRidk=", expected).Return(nil)
|
||||
|
||||
image, err = s.NewImage(ctx, &models.AlertRule{
|
||||
OrgID: 1,
|
||||
UID: "bar",
|
||||
DashboardUID: pointer.String("bar"),
|
||||
PanelID: pointer.Int64(1)})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, expected, *image)
|
||||
image, err := s.NewImage(ctx, &models.AlertRule{
|
||||
OrgID: 1,
|
||||
UID: "bar",
|
||||
DashboardUID: pointer.String("bar"),
|
||||
PanelID: pointer.Int64(1)})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, expected, *image)
|
||||
})
|
||||
|
||||
expected = models.Image{Path: "baz.png", URL: "https://example.com/baz.png"}
|
||||
t.Run("image is returned from cache", func(t *testing.T) {
|
||||
expected := models.Image{Path: "baz.png", URL: "https://example.com/baz.png"}
|
||||
|
||||
// assert that the cache is checked for an existing image and it is returned
|
||||
cache.EXPECT().Get(gomock.Any(), "8hJuVe20rVE=").Return(expected, true)
|
||||
// assert that the cache is checked for an existing image and it is returned
|
||||
cache.EXPECT().Get(gomock.Any(), "8hJuVe20rVE=").Return(expected, true)
|
||||
|
||||
image, err = s.NewImage(ctx, &models.AlertRule{
|
||||
OrgID: 1,
|
||||
UID: "baz",
|
||||
DashboardUID: pointer.String("baz"),
|
||||
PanelID: pointer.Int64(1)})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, expected, *image)
|
||||
image, err := s.NewImage(ctx, &models.AlertRule{
|
||||
OrgID: 1,
|
||||
UID: "baz",
|
||||
DashboardUID: pointer.String("baz"),
|
||||
PanelID: pointer.Int64(1)})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, expected, *image)
|
||||
})
|
||||
|
||||
t.Run("error is returned when timeout is exceeded", func(t *testing.T) {
|
||||
// assert that the cache is checked for an existing image
|
||||
cache.EXPECT().Get(gomock.Any(), "jtThkFaZLA4=").Return(models.Image{}, false)
|
||||
|
||||
// assert that when the timeout is exceeded an error is returned
|
||||
screenshots.EXPECT().Take(gomock.Any(), screenshot.ScreenshotOptions{
|
||||
DashboardUID: "qux",
|
||||
PanelID: 1,
|
||||
Timeout: 5 * time.Second,
|
||||
}).Return(nil, context.DeadlineExceeded)
|
||||
|
||||
image, err := s.NewImage(ctx, &models.AlertRule{
|
||||
OrgID: 1,
|
||||
UID: "qux",
|
||||
DashboardUID: pointer.String("qux"),
|
||||
PanelID: pointer.Int64(1)})
|
||||
assert.EqualError(t, err, "context deadline exceeded")
|
||||
assert.Nil(t, image)
|
||||
})
|
||||
}
|
||||
|
@ -50,6 +50,8 @@ const (
|
||||
schedulerDefaultMaxAttempts = 3
|
||||
schedulerDefaultLegacyMinInterval = 1
|
||||
screenshotsDefaultCapture = false
|
||||
screenshotsDefaultCaptureTimeout = 10 * time.Second
|
||||
screenshotsMaxCaptureTimeout = 30 * time.Second
|
||||
screenshotsDefaultMaxConcurrent = 5
|
||||
screenshotsDefaultUploadImageStorage = false
|
||||
// SchedulerBaseInterval base interval of the scheduler. Controls how often the scheduler fetches database for new changes as well as schedules evaluation of a rule
|
||||
@ -87,6 +89,7 @@ type UnifiedAlertingSettings struct {
|
||||
|
||||
type UnifiedAlertingScreenshotSettings struct {
|
||||
Capture bool
|
||||
CaptureTimeout time.Duration
|
||||
MaxConcurrentScreenshots int64
|
||||
UploadExternalImageStorage bool
|
||||
}
|
||||
@ -281,6 +284,13 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
|
||||
uaCfgScreenshots := uaCfg.Screenshots
|
||||
|
||||
uaCfgScreenshots.Capture = screenshots.Key("capture").MustBool(screenshotsDefaultCapture)
|
||||
|
||||
captureTimeout := screenshots.Key("capture_timeout").MustDuration(screenshotsDefaultCaptureTimeout)
|
||||
if captureTimeout > screenshotsMaxCaptureTimeout {
|
||||
return fmt.Errorf("value of setting 'capture_timeout' cannot exceed %s", screenshotsMaxCaptureTimeout)
|
||||
}
|
||||
uaCfgScreenshots.CaptureTimeout = captureTimeout
|
||||
|
||||
uaCfgScreenshots.MaxConcurrentScreenshots = screenshots.Key("max_concurrent_screenshots").MustInt64(screenshotsDefaultMaxConcurrent)
|
||||
uaCfgScreenshots.UploadExternalImageStorage = screenshots.Key("upload_external_image_storage").MustBool(screenshotsDefaultUploadImageStorage)
|
||||
uaCfg.Screenshots = uaCfgScreenshots
|
||||
|
Loading…
Reference in New Issue
Block a user