Usage stats: Tune collector execution startup and interval (#72688)

* Do not update statistics at service collector startup

* Configurable collector interval

* Introduce initial random delay

* Prevent reporting metrics until the stats have been collected

* Apply suggestion from code review
This commit is contained in:
Sofia Papagiannaki 2023-08-03 11:01:44 +03:00 committed by GitHub
parent ba723c911a
commit afb59af79b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 43 additions and 2 deletions

View File

@ -1293,6 +1293,8 @@ enabled = true
interval_seconds = 10
# Disable generation of total stats (stat_totals_*) metrics
disable_total_stats = false
# The interval at which the total stats collector will update the stats. Default is 1800 seconds.
total_stats_collector_interval_seconds = 1800
# If both are set, basic auth will be required for the metrics endpoints.
basic_auth_username =

View File

@ -1216,6 +1216,8 @@
;interval_seconds = 10
# Disable generation of total stats (stat_totals_*) metrics
;disable_total_stats = false
# The interval at which the total stats collector will update the stats. Default is 1800 seconds.
;total_stats_collector_interval_seconds = 1800
# If both are set, basic auth will be required for the metrics endpoints.
; basic_auth_username =

View File

@ -1711,6 +1711,10 @@ Flush/write interval when sending metrics to external TSDB. Defaults to `10`.
If set to `true`, then total stats generation (`stat_totals_*` metrics) is disabled. Default is `false`.
### total_stats_collector_interval_seconds
Sets the interval, in seconds, at which the total stats collector updates the stats. The default is `1800` seconds (30 minutes).
### basic_auth_username and basic_auth_password
If both are set, then basic authentication is required to access the metrics endpoint.

View File

@ -30,3 +30,5 @@ func (usm *UsageStatsMock) GetUsageReport(ctx context.Context) (Report, error) {
}
// RegisterSendReportCallback is a no-op on the mock: the supplied callback is
// discarded.
func (usm *UsageStatsMock) RegisterSendReportCallback(SendReportCallbackFunc) {
	// Intentionally empty.
}
// SetReadyToReport is a no-op on the mock: the context is ignored and no
// state is changed.
func (usm *UsageStatsMock) SetReadyToReport(context.Context) {
	// Intentionally empty.
}

View File

@ -23,4 +23,5 @@ type Service interface {
GetUsageReport(context.Context) (Report, error)
RegisterMetricsFunc(MetricsFunc)
RegisterSendReportCallback(SendReportCallbackFunc)
SetReadyToReport(context.Context)
}

View File

@ -26,6 +26,8 @@ type UsageStats struct {
externalMetrics []usagestats.MetricsFunc
sendReportCallbacks []usagestats.SendReportCallbackFunc
readyToReport bool
}
func ProvideService(cfg *setting.Cfg,
@ -82,6 +84,12 @@ func (uss *UsageStats) Run(ctx context.Context) error {
for {
select {
case <-sendReportTicker.C:
if !uss.readyToReport {
nextSendInterval = time.Minute
sendReportTicker.Reset(nextSendInterval)
continue
}
if traceID, err := uss.sendUsageStats(ctx); err != nil {
uss.log.Warn("Failed to send usage stats", "error", err, "traceID", traceID)
}
@ -109,6 +117,11 @@ func (uss *UsageStats) RegisterSendReportCallback(c usagestats.SendReportCallbac
uss.sendReportCallbacks = append(uss.sendReportCallbacks, c)
}
// SetReadyToReport logs, at info level, that usage stats are ready and then
// sets the readyToReport flag. The context argument is not used.
//
// NOTE(review): readyToReport is a plain bool that Run checks on each report
// tick. If this method is invoked from a different goroutine than Run, the
// access is unsynchronized — confirm against the callers and consider an
// atomic.Bool or a mutex.
func (uss *UsageStats) SetReadyToReport(_ context.Context) {
	uss.log.Info("Usage stats are ready to report")
	uss.readyToReport = true
}
func (uss *UsageStats) supportBundleCollector() supportbundles.Collector {
return supportbundles.Collector{
UID: "usage-stats",

View File

@ -2,6 +2,7 @@ package statscollector
import (
"context"
"math/rand"
"strings"
"time"
@ -22,6 +23,11 @@ import (
"github.com/grafana/grafana/pkg/setting"
)
// Bounds, in seconds, of the randomized delay the collector waits before its
// first stats update.
const (
	// MIN_DELAY is the smallest initial delay, in seconds (inclusive).
	MIN_DELAY = 30
	// MAX_DELAY is the upper limit of the initial delay, in seconds. Run picks
	// the delay with rand.Intn(MAX_DELAY-MIN_DELAY)+MIN_DELAY, so this bound
	// itself is never selected.
	MAX_DELAY = 120
)
type Service struct {
cfg *setting.Cfg
sqlstore db.DB
@ -91,14 +97,21 @@ func (s *Service) RegisterProviders(usageStatProviders []registry.ProvidesUsageS
}
func (s *Service) Run(ctx context.Context) error {
s.updateTotalStats(ctx)
updateStatsTicker := time.NewTicker(time.Minute * 30)
sendInterval := time.Second * time.Duration(s.cfg.MetricsTotalStatsIntervalSeconds)
nextSendInterval := time.Duration(rand.Intn(MAX_DELAY-MIN_DELAY)+MIN_DELAY) * time.Second
s.log.Debug("usage stats collector started", "sendInterval", sendInterval, "nextSendInterval", nextSendInterval)
updateStatsTicker := time.NewTicker(nextSendInterval)
defer updateStatsTicker.Stop()
for {
select {
case <-updateStatsTicker.C:
s.updateTotalStats(ctx)
if nextSendInterval != sendInterval {
nextSendInterval = sendInterval
updateStatsTicker.Reset(nextSendInterval)
}
case <-ctx.Done():
return ctx.Err()
}
@ -325,6 +338,8 @@ func (s *Service) updateTotalStats(ctx context.Context) bool {
metrics.MStatTotalCorrelations.Set(float64(statsResult.Correlations))
s.usageStats.SetReadyToReport(ctx)
dsResult, err := s.statsService.GetDataSourceStats(ctx, &stats.GetDataSourceStatsQuery{})
if err != nil {
s.log.Error("Failed to get datasource stats", "error", err)

View File

@ -256,6 +256,7 @@ type Cfg struct {
MetricsEndpointBasicAuthUsername string
MetricsEndpointBasicAuthPassword string
MetricsEndpointDisableTotalStats bool
MetricsTotalStatsIntervalSeconds int
MetricsGrafanaEnvironmentInfo map[string]string
// Dashboards
@ -1085,6 +1086,7 @@ func (cfg *Cfg) Load(args CommandLineArgs) error {
cfg.MetricsEndpointBasicAuthUsername = valueAsString(iniFile.Section("metrics"), "basic_auth_username", "")
cfg.MetricsEndpointBasicAuthPassword = valueAsString(iniFile.Section("metrics"), "basic_auth_password", "")
cfg.MetricsEndpointDisableTotalStats = iniFile.Section("metrics").Key("disable_total_stats").MustBool(false)
cfg.MetricsTotalStatsIntervalSeconds = iniFile.Section("metrics").Key("total_stats_collector_interval_seconds").MustInt(1800)
analytics := iniFile.Section("analytics")
cfg.CheckForGrafanaUpdates = analytics.Key("check_for_updates").MustBool(true)