Usage stats: Tune collector execution startup and interval (#72688)

* Do not update statistics at service collector startup

* Configurable collector interval

* Introduce initial random delay

* Prevent reporting metrics until the stats have been collected

* Apply suggestion from code review
This commit is contained in:
Sofia Papagiannaki 2023-08-03 11:01:44 +03:00 committed by GitHub
parent ba723c911a
commit afb59af79b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 43 additions and 2 deletions

View File

@ -1293,6 +1293,8 @@ enabled = true
interval_seconds = 10 interval_seconds = 10
# Disable total stats (stat_totals_*) metrics to be generated # Disable total stats (stat_totals_*) metrics to be generated
disable_total_stats = false disable_total_stats = false
# Interval, in seconds, at which the total stats collector updates the stats. Must be greater than 0. Default is 1800 seconds (30 minutes).
total_stats_collector_interval_seconds = 1800
#If both are set, basic auth will be required for the metrics endpoints. #If both are set, basic auth will be required for the metrics endpoints.
basic_auth_username = basic_auth_username =

View File

@ -1216,6 +1216,8 @@
;interval_seconds = 10 ;interval_seconds = 10
# Disable total stats (stat_totals_*) metrics to be generated # Disable total stats (stat_totals_*) metrics to be generated
;disable_total_stats = false ;disable_total_stats = false
# Interval, in seconds, at which the total stats collector updates the stats. Must be greater than 0. Default is 1800 seconds (30 minutes).
;total_stats_collector_interval_seconds = 1800
#If both are set, basic auth will be required for the metrics endpoints. #If both are set, basic auth will be required for the metrics endpoints.
; basic_auth_username = ; basic_auth_username =

View File

@ -1711,6 +1711,10 @@ Flush/write interval when sending metrics to external TSDB. Defaults to `10`.
If set to `true`, then total stats generation (`stat_totals_*` metrics) is disabled. Default is `false`. If set to `true`, then total stats generation (`stat_totals_*` metrics) is disabled. Default is `false`.
### total_stats_collector_interval_seconds
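Sets how often, in seconds, the total stats collector updates the stats. The value must be greater than 0. The default is 1800 seconds (30 minutes). The stats are no longer collected immediately at startup: the first collection runs after a random delay of 30 to 120 seconds, and subsequent collections follow this interval.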
### basic_auth_username and basic_auth_password ### basic_auth_username and basic_auth_password
If both are set, then basic authentication is required to access the metrics endpoint. If both are set, then basic authentication is required to access the metrics endpoint.

View File

@ -30,3 +30,5 @@ func (usm *UsageStatsMock) GetUsageReport(ctx context.Context) (Report, error) {
} }
func (usm *UsageStatsMock) RegisterSendReportCallback(_ SendReportCallbackFunc) {} func (usm *UsageStatsMock) RegisterSendReportCallback(_ SendReportCallbackFunc) {}
// SetReadyToReport is a no-op in the mock; the context argument is ignored.
func (usm *UsageStatsMock) SetReadyToReport(_ context.Context) {}

View File

@ -23,4 +23,5 @@ type Service interface {
GetUsageReport(context.Context) (Report, error) GetUsageReport(context.Context) (Report, error)
RegisterMetricsFunc(MetricsFunc) RegisterMetricsFunc(MetricsFunc)
RegisterSendReportCallback(SendReportCallbackFunc) RegisterSendReportCallback(SendReportCallbackFunc)
SetReadyToReport(context.Context)
} }

View File

@ -26,6 +26,8 @@ type UsageStats struct {
externalMetrics []usagestats.MetricsFunc externalMetrics []usagestats.MetricsFunc
sendReportCallbacks []usagestats.SendReportCallbackFunc sendReportCallbacks []usagestats.SendReportCallbackFunc
readyToReport bool
} }
func ProvideService(cfg *setting.Cfg, func ProvideService(cfg *setting.Cfg,
@ -82,6 +84,12 @@ func (uss *UsageStats) Run(ctx context.Context) error {
for { for {
select { select {
case <-sendReportTicker.C: case <-sendReportTicker.C:
if !uss.readyToReport {
nextSendInterval = time.Minute
sendReportTicker.Reset(nextSendInterval)
continue
}
if traceID, err := uss.sendUsageStats(ctx); err != nil { if traceID, err := uss.sendUsageStats(ctx); err != nil {
uss.log.Warn("Failed to send usage stats", "error", err, "traceID", traceID) uss.log.Warn("Failed to send usage stats", "error", err, "traceID", traceID)
} }
@ -109,6 +117,11 @@ func (uss *UsageStats) RegisterSendReportCallback(c usagestats.SendReportCallbac
uss.sendReportCallbacks = append(uss.sendReportCallbacks, c) uss.sendReportCallbacks = append(uss.sendReportCallbacks, c)
} }
// SetReadyToReport logs that usage stats are ready and sets the readyToReport
// flag. While the flag is false, Run does not send a report when its ticker
// fires; it resets the ticker to one minute and checks again.
// The context argument is unused.
//
// NOTE(review): readyToReport is written here and read in Run with no
// synchronization visible in this file — confirm both happen on the same
// goroutine, or guard the field.
func (uss *UsageStats) SetReadyToReport(context.Context) {
uss.log.Info("Usage stats are ready to report")
uss.readyToReport = true
}
func (uss *UsageStats) supportBundleCollector() supportbundles.Collector { func (uss *UsageStats) supportBundleCollector() supportbundles.Collector {
return supportbundles.Collector{ return supportbundles.Collector{
UID: "usage-stats", UID: "usage-stats",

View File

@ -2,6 +2,7 @@ package statscollector
import ( import (
"context" "context"
"math/rand"
"strings" "strings"
"time" "time"
@ -22,6 +23,11 @@ import (
"github.com/grafana/grafana/pkg/setting" "github.com/grafana/grafana/pkg/setting"
) )
// Bounds, in seconds, of the random delay that Run waits before the first
// stats collection. Run picks rand.Intn(MAX_DELAY-MIN_DELAY)+MIN_DELAY, so
// the delay falls in the range [MIN_DELAY, MAX_DELAY).
const (
// MIN_DELAY is the smallest possible initial delay, in seconds (inclusive).
MIN_DELAY = 30
// MAX_DELAY is the upper bound of the initial delay, in seconds (exclusive).
MAX_DELAY = 120
)
type Service struct { type Service struct {
cfg *setting.Cfg cfg *setting.Cfg
sqlstore db.DB sqlstore db.DB
@ -91,14 +97,21 @@ func (s *Service) RegisterProviders(usageStatProviders []registry.ProvidesUsageS
} }
func (s *Service) Run(ctx context.Context) error { func (s *Service) Run(ctx context.Context) error {
s.updateTotalStats(ctx) sendInterval := time.Second * time.Duration(s.cfg.MetricsTotalStatsIntervalSeconds)
updateStatsTicker := time.NewTicker(time.Minute * 30) nextSendInterval := time.Duration(rand.Intn(MAX_DELAY-MIN_DELAY)+MIN_DELAY) * time.Second
s.log.Debug("usage stats collector started", "sendInterval", sendInterval, "nextSendInterval", nextSendInterval)
updateStatsTicker := time.NewTicker(nextSendInterval)
defer updateStatsTicker.Stop() defer updateStatsTicker.Stop()
for { for {
select { select {
case <-updateStatsTicker.C: case <-updateStatsTicker.C:
s.updateTotalStats(ctx) s.updateTotalStats(ctx)
if nextSendInterval != sendInterval {
nextSendInterval = sendInterval
updateStatsTicker.Reset(nextSendInterval)
}
case <-ctx.Done(): case <-ctx.Done():
return ctx.Err() return ctx.Err()
} }
@ -325,6 +338,8 @@ func (s *Service) updateTotalStats(ctx context.Context) bool {
metrics.MStatTotalCorrelations.Set(float64(statsResult.Correlations)) metrics.MStatTotalCorrelations.Set(float64(statsResult.Correlations))
s.usageStats.SetReadyToReport(ctx)
dsResult, err := s.statsService.GetDataSourceStats(ctx, &stats.GetDataSourceStatsQuery{}) dsResult, err := s.statsService.GetDataSourceStats(ctx, &stats.GetDataSourceStatsQuery{})
if err != nil { if err != nil {
s.log.Error("Failed to get datasource stats", "error", err) s.log.Error("Failed to get datasource stats", "error", err)

View File

@ -256,6 +256,7 @@ type Cfg struct {
MetricsEndpointBasicAuthUsername string MetricsEndpointBasicAuthUsername string
MetricsEndpointBasicAuthPassword string MetricsEndpointBasicAuthPassword string
MetricsEndpointDisableTotalStats bool MetricsEndpointDisableTotalStats bool
MetricsTotalStatsIntervalSeconds int
MetricsGrafanaEnvironmentInfo map[string]string MetricsGrafanaEnvironmentInfo map[string]string
// Dashboards // Dashboards
@ -1085,6 +1086,7 @@ func (cfg *Cfg) Load(args CommandLineArgs) error {
cfg.MetricsEndpointBasicAuthUsername = valueAsString(iniFile.Section("metrics"), "basic_auth_username", "") cfg.MetricsEndpointBasicAuthUsername = valueAsString(iniFile.Section("metrics"), "basic_auth_username", "")
cfg.MetricsEndpointBasicAuthPassword = valueAsString(iniFile.Section("metrics"), "basic_auth_password", "") cfg.MetricsEndpointBasicAuthPassword = valueAsString(iniFile.Section("metrics"), "basic_auth_password", "")
cfg.MetricsEndpointDisableTotalStats = iniFile.Section("metrics").Key("disable_total_stats").MustBool(false) cfg.MetricsEndpointDisableTotalStats = iniFile.Section("metrics").Key("disable_total_stats").MustBool(false)
cfg.MetricsTotalStatsIntervalSeconds = iniFile.Section("metrics").Key("total_stats_collector_interval_seconds").MustInt(1800)
analytics := iniFile.Section("analytics") analytics := iniFile.Section("analytics")
cfg.CheckForGrafanaUpdates = analytics.Key("check_for_updates").MustBool(true) cfg.CheckForGrafanaUpdates = analytics.Key("check_for_updates").MustBool(true)