Usage stats: Divide collection into multiple functions to isolate failures (#49928)

This commit is contained in:
Emil Tullstedt 2022-06-01 12:27:06 +02:00 committed by GitHub
parent dcccac6657
commit d3ffb9e245
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 210 additions and 135 deletions

View File

@ -48,10 +48,13 @@ func (uss *UsageStats) GetUsageReport(ctx context.Context) (usagestats.Report, e
}
func (uss *UsageStats) gatherMetrics(ctx context.Context, metrics map[string]interface{}) {
totC, errC := 0, 0
for _, fn := range uss.externalMetrics {
fnMetrics, err := fn(ctx)
totC++
if err != nil {
uss.log.Error("Failed to fetch external metrics", "error", err)
errC++
continue
}
@ -59,6 +62,8 @@ func (uss *UsageStats) gatherMetrics(ctx context.Context, metrics map[string]int
metrics[name] = value
}
}
metrics["stats.usagestats.debug.collect.total.count"] = totC
metrics["stats.usagestats.debug.collect.error.count"] = errC
}
func (uss *UsageStats) RegisterMetricsFunc(fn usagestats.MetricsFunc) {

View File

@ -178,6 +178,7 @@ func TestRegisterMetrics(t *testing.T) {
uss.gatherMetrics(context.Background(), metrics)
assert.Equal(t, 1, metrics[goodMetricName])
metricsCount := len(metrics)
t.Run("do not add metrics that return an error when fetched", func(t *testing.T) {
const badMetricName = "stats.test_external_metric_error.count"
@ -192,7 +193,8 @@ func TestRegisterMetrics(t *testing.T) {
require.Nil(t, extErrorMetric, "Invalid metric should not be added")
assert.Equal(t, 1, extMetric)
assert.Len(t, metrics, 3, "Expected only one available metric")
assert.Len(t, metrics, metricsCount, "Expected same number of metrics before and after collecting bad metric")
assert.EqualValues(t, 1, metrics["stats.usagestats.debug.collect.error.count"])
})
}

View File

@ -56,3 +56,24 @@ FROM (select count(1) as tokens from user_auth_token group by user_id) uat;`
s.concurrentUserStatsCache.memoized = time.Now()
return s.concurrentUserStatsCache.stats, nil
}
// collectConcurrentUsers publishes the concurrent-users histogram as
// auth-token-per-user metrics.
//
// The bucket values are read from s.concurrentUsers. When that call fails the
// error is logged and handed back to the caller together with a nil map.
func (s *Service) collectConcurrentUsers(ctx context.Context) (map[string]interface{}, error) {
	buckets, err := s.concurrentUsers(ctx)
	if err != nil {
		s.log.Error("Failed to get concurrent users stats", "error", err)
		return nil, err
	}

	// The histogram is cumulative. Every metric name ends in le_<bound>, where
	// <bound> is the inclusive upper limit of that bucket.
	return map[string]interface{}{
		"stats.auth_token_per_user_le_3":   buckets.BucketLE3,
		"stats.auth_token_per_user_le_6":   buckets.BucketLE6,
		"stats.auth_token_per_user_le_9":   buckets.BucketLE9,
		"stats.auth_token_per_user_le_12":  buckets.BucketLE12,
		"stats.auth_token_per_user_le_15":  buckets.BucketLE15,
		"stats.auth_token_per_user_le_inf": buckets.BucketLEInf,
	}, nil
}

View File

@ -23,7 +23,7 @@ func TestConcurrentUsersMetrics(t *testing.T) {
createConcurrentTokens(t, sqlStore)
stats, err := s.collect(context.Background())
stats, err := s.collectConcurrentUsers(context.Background())
require.NoError(t, err)
assert.Equal(t, int32(1), stats["stats.auth_token_per_user_le_3"])

View File

@ -18,6 +18,19 @@ type memoPrometheusFlavor struct {
memoized time.Time
}
// collectPrometheusFlavors converts the per-variant counts returned by
// detectPrometheusVariants into usage-stats metrics, one entry per variant
// named "stats.ds.prometheus.flavor.<variant>.count".
//
// An error from detectPrometheusVariants is returned unchanged with a nil map.
func (s *Service) collectPrometheusFlavors(ctx context.Context) (map[string]interface{}, error) {
	flavors, err := s.detectPrometheusVariants(ctx)
	if err != nil {
		return nil, err
	}

	out := make(map[string]interface{}, len(flavors))
	for name, n := range flavors {
		key := "stats.ds.prometheus.flavor." + name + ".count"
		out[key] = n
	}
	return out, nil
}
func (s *Service) detectPrometheusVariants(ctx context.Context) (map[string]int64, error) {
if s.promFlavorCache.memoized.Add(promFlavorCacheLifetime).After(time.Now()) &&
s.promFlavorCache.variants != nil {
@ -77,7 +90,7 @@ func (s *Service) detectPrometheusVariant(ctx context.Context, ds *models.DataSo
// Possibly configuration error, the risk of a false positive is
// too high.
s.log.Debug("Failed to send Prometheus build info request", "error", err)
return "", nil
return "unreachable", nil
}
defer func() {
err := resp.Body.Close()

View File

@ -6,6 +6,8 @@ import (
"strings"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/grafana/grafana/pkg/infra/httpclient"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/metrics"
@ -18,7 +20,6 @@ import (
"github.com/grafana/grafana/pkg/services/featuremgmt"
"github.com/grafana/grafana/pkg/services/sqlstore"
"github.com/grafana/grafana/pkg/setting"
"github.com/prometheus/client_golang/prometheus"
)
type Service struct {
@ -40,7 +41,7 @@ type Service struct {
}
func ProvideService(
usagestats usagestats.Service,
us usagestats.Service,
cfg *setting.Cfg,
store sqlstore.Store,
social social.Service,
@ -54,7 +55,7 @@ func ProvideService(
sqlstore: store,
plugins: plugins,
social: social,
usageStats: usagestats,
usageStats: us,
features: features,
datasources: datasourceService,
httpClientProvider: httpClientProvider,
@ -63,7 +64,19 @@ func ProvideService(
log: log.New("infra.usagestats.collector"),
}
usagestats.RegisterMetricsFunc(s.collect)
collectors := []usagestats.MetricsFunc{
s.collectSystemStats,
s.collectConcurrentUsers,
s.collectDatasourceStats,
s.collectDatasourceAccess,
s.collectElasticStats,
s.collectAlertNotifierStats,
s.collectPrometheusFlavors,
s.collectAdditionalMetrics,
}
for _, c := range collectors {
us.RegisterMetricsFunc(c)
}
return s
}
@ -89,7 +102,7 @@ func (s *Service) Run(ctx context.Context) error {
}
}
func (s *Service) collect(ctx context.Context) (map[string]interface{}, error) {
func (s *Service) collectSystemStats(ctx context.Context) (map[string]interface{}, error) {
m := map[string]interface{}{}
statsQuery := models.GetSystemStatsQuery{}
@ -158,7 +171,66 @@ func (s *Service) collect(ctx context.Context) (map[string]interface{}, error) {
}
m["stats.avg_auth_token_per_user.count"] = avgAuthTokensPerUser
m["stats.packaging."+s.cfg.Packaging+".count"] = 1
m["stats.distributor."+s.cfg.ReportingDistributor+".count"] = 1
// Add stats about auth configuration
authTypes := map[string]bool{}
authTypes["anonymous"] = s.cfg.AnonymousEnabled
authTypes["basic_auth"] = s.cfg.BasicAuthEnabled
authTypes["ldap"] = s.cfg.LDAPEnabled
authTypes["auth_proxy"] = s.cfg.AuthProxyEnabled
for provider, enabled := range s.social.GetOAuthProviders() {
authTypes["oauth_"+provider] = enabled
}
for authType, enabled := range authTypes {
enabledValue := 0
if enabled {
enabledValue = 1
}
m["stats.auth_enabled."+authType+".count"] = enabledValue
}
m["stats.uptime"] = int64(time.Since(s.startTime).Seconds())
featureUsageStats := s.features.GetUsageStats(ctx)
for k, v := range featureUsageStats {
m[k] = v
}
return m, nil
}
// collectAdditionalMetrics merges the usage stats reported by every provider
// in s.usageStatProviders into a single map.
//
// When two providers report the same key, the one visited later overwrites the
// earlier value. The returned error is always nil.
func (s *Service) collectAdditionalMetrics(ctx context.Context) (map[string]interface{}, error) {
	merged := map[string]interface{}{}

	for _, provider := range s.usageStatProviders {
		for name, value := range provider.GetUsageStats(ctx) {
			merged[name] = value
		}
	}

	return merged, nil
}
// collectAlertNotifierStats reports alert notifier usage, one metric per
// entry in the query result, named "stats.alert_notifiers.<type>.count".
//
// The data is loaded through s.sqlstore.GetAlertNotifiersUsageStats. If the
// query fails, the error is logged and returned together with a nil map.
func (s *Service) collectAlertNotifierStats(ctx context.Context) (map[string]interface{}, error) {
	query := models.GetAlertNotifierUsageStatsQuery{}
	err := s.sqlstore.GetAlertNotifiersUsageStats(ctx, &query)
	if err != nil {
		s.log.Error("Failed to get alert notification stats", "error", err)
		return nil, err
	}

	out := map[string]interface{}{}
	for _, row := range query.Result {
		key := "stats.alert_notifiers." + row.Type + ".count"
		out[key] = row.Count
	}
	return out, nil
}
func (s *Service) collectDatasourceStats(ctx context.Context) (map[string]interface{}, error) {
m := map[string]interface{}{}
dsStats := models.GetDataSourceStatsQuery{}
if err := s.sqlstore.GetDataSourceStats(ctx, &dsStats); err != nil {
s.log.Error("Failed to get datasource stats", "error", err)
@ -178,6 +250,11 @@ func (s *Service) collect(ctx context.Context) (map[string]interface{}, error) {
}
m["stats.ds.other.count"] = dsOtherCount
return m, nil
}
func (s *Service) collectElasticStats(ctx context.Context) (map[string]interface{}, error) {
m := map[string]interface{}{}
esDataSourcesQuery := models.GetDataSourcesByTypeQuery{Type: models.DS_ES}
if err := s.sqlstore.GetDataSourcesByType(ctx, &esDataSourcesQuery); err != nil {
s.log.Error("Failed to get elasticsearch json data", "error", err)
@ -196,9 +273,11 @@ func (s *Service) collect(ctx context.Context) (map[string]interface{}, error) {
m[statName] = count + 1
}
return m, nil
}
m["stats.packaging."+s.cfg.Packaging+".count"] = 1
m["stats.distributor."+s.cfg.ReportingDistributor+".count"] = 1
func (s *Service) collectDatasourceAccess(ctx context.Context) (map[string]interface{}, error) {
m := map[string]interface{}{}
// fetch datasource access stats
dsAccessStats := models.GetDataSourceAccessStatsQuery{}
@ -207,15 +286,6 @@ func (s *Service) collect(ctx context.Context) (map[string]interface{}, error) {
return nil, err
}
variants, err := s.detectPrometheusVariants(ctx)
if err != nil {
return nil, err
}
for variant, count := range variants {
m["stats.ds.prometheus.flavor."+variant+".count"] = count
}
// send access counters for each data source
// but ignore any custom data sources
// as sending that name could be sensitive information
@ -238,66 +308,6 @@ func (s *Service) collect(ctx context.Context) (map[string]interface{}, error) {
for access, count := range dsAccessOtherCount {
m["stats.ds_access.other."+access+".count"] = count
}
// get stats about alert notifier usage
anStats := models.GetAlertNotifierUsageStatsQuery{}
if err := s.sqlstore.GetAlertNotifiersUsageStats(ctx, &anStats); err != nil {
s.log.Error("Failed to get alert notification stats", "error", err)
return nil, err
}
for _, stats := range anStats.Result {
m["stats.alert_notifiers."+stats.Type+".count"] = stats.Count
}
// Add stats about auth configuration
authTypes := map[string]bool{}
authTypes["anonymous"] = s.cfg.AnonymousEnabled
authTypes["basic_auth"] = s.cfg.BasicAuthEnabled
authTypes["ldap"] = s.cfg.LDAPEnabled
authTypes["auth_proxy"] = s.cfg.AuthProxyEnabled
for provider, enabled := range s.social.GetOAuthProviders() {
authTypes["oauth_"+provider] = enabled
}
for authType, enabled := range authTypes {
enabledValue := 0
if enabled {
enabledValue = 1
}
m["stats.auth_enabled."+authType+".count"] = enabledValue
}
// Get concurrent users stats as histogram
concurrentUsersStats, err := s.concurrentUsers(ctx)
if err != nil {
s.log.Error("Failed to get concurrent users stats", "error", err)
return nil, err
}
// Histogram is cumulative and metric name has a postfix of le_"<upper inclusive bound>"
m["stats.auth_token_per_user_le_3"] = concurrentUsersStats.BucketLE3
m["stats.auth_token_per_user_le_6"] = concurrentUsersStats.BucketLE6
m["stats.auth_token_per_user_le_9"] = concurrentUsersStats.BucketLE9
m["stats.auth_token_per_user_le_12"] = concurrentUsersStats.BucketLE12
m["stats.auth_token_per_user_le_15"] = concurrentUsersStats.BucketLE15
m["stats.auth_token_per_user_le_inf"] = concurrentUsersStats.BucketLEInf
m["stats.uptime"] = int64(time.Since(s.startTime).Seconds())
featureUsageStats := s.features.GetUsageStats(ctx)
for k, v := range featureUsageStats {
m[k] = v
}
for _, usageStatProvider := range s.usageStatProviders {
stats := usageStatProvider.GetUsageStats(ctx)
for k, v := range stats {
m[k] = v
}
}
return m, nil
}

View File

@ -7,12 +7,13 @@ import (
"testing"
"time"
"github.com/grafana/grafana/pkg/components/simplejson"
sdkhttpclient "github.com/grafana/grafana-plugin-sdk-go/backend/httpclient"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/components/simplejson"
"github.com/grafana/grafana/pkg/infra/httpclient"
"github.com/grafana/grafana/pkg/infra/usagestats"
"github.com/grafana/grafana/pkg/login/social"
@ -97,7 +98,7 @@ func TestUsageStatsProviders(t *testing.T) {
s := createService(t, setting.NewCfg(), store)
s.RegisterProviders([]registry.ProvidesUsageStats{provider1, provider2})
m, err := s.collect(context.Background())
m, err := s.collectAdditionalMetrics(context.Background())
require.NoError(t, err, "Expected no error")
assert.Equal(t, "val1", m["my_stat_1"])
@ -111,7 +112,7 @@ func TestFeatureUsageStats(t *testing.T) {
mockSystemStats(store)
s := createService(t, setting.NewCfg(), store)
m, err := s.collect(context.Background())
m, err := s.collectSystemStats(context.Background())
require.NoError(t, err, "Expected no error")
assert.Equal(t, 1, m["stats.features.feature_1.count"])
@ -134,6 +135,56 @@ func TestCollectingUsageStats(t *testing.T) {
s.startTime = time.Now().Add(-1 * time.Minute)
mockSystemStats(sqlStore)
createConcurrentTokens(t, sqlStore)
s.social = &mockSocial{
OAuthProviders: map[string]bool{
"github": true,
"gitlab": true,
"azuread": true,
"google": true,
"generic_oauth": true,
"grafana_com": true,
},
}
metrics, err := s.collectSystemStats(context.Background())
require.NoError(t, err)
assert.EqualValues(t, 15, metrics["stats.total_auth_token.count"])
assert.EqualValues(t, 2, metrics["stats.api_keys.count"])
assert.EqualValues(t, 5, metrics["stats.avg_auth_token_per_user.count"])
assert.EqualValues(t, 16, metrics["stats.dashboard_versions.count"])
assert.EqualValues(t, 17, metrics["stats.annotations.count"])
assert.EqualValues(t, 18, metrics["stats.alert_rules.count"])
assert.EqualValues(t, 19, metrics["stats.library_panels.count"])
assert.EqualValues(t, 20, metrics["stats.library_variables.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.anonymous.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.basic_auth.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.ldap.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.auth_proxy.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.oauth_github.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.oauth_gitlab.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.oauth_google.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.oauth_azuread.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.oauth_generic_oauth.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.oauth_grafana_com.count"])
assert.EqualValues(t, 1, metrics["stats.packaging.deb.count"])
assert.EqualValues(t, 1, metrics["stats.distributor.hosted-grafana.count"])
assert.EqualValues(t, 11, metrics["stats.data_keys.count"])
assert.EqualValues(t, 3, metrics["stats.active_data_keys.count"])
assert.InDelta(t, int64(65), metrics["stats.uptime"], 6)
}
func TestDatasourceStats(t *testing.T) {
sqlStore := mockstore.NewSQLStoreMock()
s := createService(t, &setting.Cfg{}, sqlStore)
setupSomeDataSourcePlugins(t, s)
sqlStore.ExpectedDataSourceStats = []*models.DataSourceStats{
@ -216,6 +267,31 @@ func TestCollectingUsageStats(t *testing.T) {
},
}
{
db, err := s.collectDatasourceStats(context.Background())
require.NoError(t, err)
assert.EqualValues(t, 9, db["stats.ds."+models.DS_ES+".count"])
assert.EqualValues(t, 10, db["stats.ds."+models.DS_PROMETHEUS+".count"])
assert.EqualValues(t, 11+12, db["stats.ds.other.count"])
}
{
dba, err := s.collectDatasourceAccess(context.Background())
require.NoError(t, err)
assert.EqualValues(t, 1, dba["stats.ds_access."+models.DS_ES+".direct.count"])
assert.EqualValues(t, 2, dba["stats.ds_access."+models.DS_ES+".proxy.count"])
assert.EqualValues(t, 3, dba["stats.ds_access."+models.DS_PROMETHEUS+".proxy.count"])
assert.EqualValues(t, 6+7, dba["stats.ds_access.other.direct.count"])
assert.EqualValues(t, 4+8, dba["stats.ds_access.other.proxy.count"])
}
}
func TestAlertNotifiersStats(t *testing.T) {
sqlStore := mockstore.NewSQLStoreMock()
s := createService(t, &setting.Cfg{}, sqlStore)
sqlStore.ExpectedNotifierUsageStats = []*models.NotifierUsageStats{
{
Type: "slack",
@ -227,63 +303,11 @@ func TestCollectingUsageStats(t *testing.T) {
},
}
createConcurrentTokens(t, sqlStore)
s.social = &mockSocial{
OAuthProviders: map[string]bool{
"github": true,
"gitlab": true,
"azuread": true,
"google": true,
"generic_oauth": true,
"grafana_com": true,
},
}
metrics, err := s.collect(context.Background())
metrics, err := s.collectAlertNotifierStats(context.Background())
require.NoError(t, err)
assert.EqualValues(t, 15, metrics["stats.total_auth_token.count"])
assert.EqualValues(t, 2, metrics["stats.api_keys.count"])
assert.EqualValues(t, 5, metrics["stats.avg_auth_token_per_user.count"])
assert.EqualValues(t, 16, metrics["stats.dashboard_versions.count"])
assert.EqualValues(t, 17, metrics["stats.annotations.count"])
assert.EqualValues(t, 18, metrics["stats.alert_rules.count"])
assert.EqualValues(t, 19, metrics["stats.library_panels.count"])
assert.EqualValues(t, 20, metrics["stats.library_variables.count"])
assert.EqualValues(t, 9, metrics["stats.ds."+models.DS_ES+".count"])
assert.EqualValues(t, 10, metrics["stats.ds."+models.DS_PROMETHEUS+".count"])
assert.EqualValues(t, 11+12, metrics["stats.ds.other.count"])
assert.EqualValues(t, 1, metrics["stats.ds_access."+models.DS_ES+".direct.count"])
assert.EqualValues(t, 2, metrics["stats.ds_access."+models.DS_ES+".proxy.count"])
assert.EqualValues(t, 3, metrics["stats.ds_access."+models.DS_PROMETHEUS+".proxy.count"])
assert.EqualValues(t, 6+7, metrics["stats.ds_access.other.direct.count"])
assert.EqualValues(t, 4+8, metrics["stats.ds_access.other.proxy.count"])
assert.EqualValues(t, 1, metrics["stats.alert_notifiers.slack.count"])
assert.EqualValues(t, 2, metrics["stats.alert_notifiers.webhook.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.anonymous.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.basic_auth.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.ldap.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.auth_proxy.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.oauth_github.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.oauth_gitlab.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.oauth_google.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.oauth_azuread.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.oauth_generic_oauth.count"])
assert.EqualValues(t, 1, metrics["stats.auth_enabled.oauth_grafana_com.count"])
assert.EqualValues(t, 1, metrics["stats.packaging.deb.count"])
assert.EqualValues(t, 1, metrics["stats.distributor.hosted-grafana.count"])
assert.EqualValues(t, 11, metrics["stats.data_keys.count"])
assert.EqualValues(t, 3, metrics["stats.active_data_keys.count"])
assert.InDelta(t, int64(65), metrics["stats.uptime"], 6)
}
func mockSystemStats(sqlStore *mockstore.SQLStoreMock) {