grafana/pkg/services/cleanup/cleanup.go
2023-10-03 14:54:20 +02:00

282 lines
9.3 KiB
Go

package cleanup
import (
"context"
"errors"
"fmt"
"io/fs"
"os"
"path"
"strconv"
"time"
"go.opentelemetry.io/otel/attribute"
"github.com/grafana/grafana/pkg/infra/db"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/serverlock"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/annotations"
"github.com/grafana/grafana/pkg/services/dashboardsnapshots"
dashver "github.com/grafana/grafana/pkg/services/dashboardversion"
"github.com/grafana/grafana/pkg/services/ngalert/image"
"github.com/grafana/grafana/pkg/services/queryhistory"
"github.com/grafana/grafana/pkg/services/shorturls"
tempuser "github.com/grafana/grafana/pkg/services/temp_user"
"github.com/grafana/grafana/pkg/setting"
)
// ProvideService builds a CleanUpService from its dependencies. Every argument
// is stored on the returned service as-is; the service logger is created here
// under the name "cleanup".
func ProvideService(cfg *setting.Cfg, serverLockService *serverlock.ServerLockService,
	shortURLService shorturls.Service, sqlstore db.DB, queryHistoryService queryhistory.Service,
	dashboardVersionService dashver.Service, dashSnapSvc dashboardsnapshots.Service, deleteExpiredImageService *image.DeleteExpiredService,
	tempUserService tempuser.Service, tracer tracing.Tracer, annotationCleaner annotations.Cleaner) *CleanUpService {
	return &CleanUpService{
		// Infrastructure.
		log:    log.New("cleanup"),
		tracer: tracer,
		store:  sqlstore,

		// Exported dependencies.
		Cfg:                 cfg,
		ServerLockService:   serverLockService,
		ShortURLService:     shortURLService,
		QueryHistoryService: queryHistoryService,

		// Services backing the individual cleanup jobs.
		dashboardVersionService:   dashboardVersionService,
		dashboardSnapshotService:  dashSnapSvc,
		deleteExpiredImageService: deleteExpiredImageService,
		tempUserService:           tempUserService,
		annotationCleaner:         annotationCleaner,
	}
}
// CleanUpService periodically removes stale data: temporary files, expired
// snapshots, dashboard versions, alerting images, annotations, user invites,
// short URLs and query history.
type CleanUpService struct {
	// log is the base logger; jobs derive a contextual logger from it.
	log log.Logger
	// tracer opens a span around the whole cleanup run and around each job.
	tracer tracing.Tracer
	// store is the database handle supplied to ProvideService.
	store db.DB

	// Cfg supplies the directories, lifetimes and feature toggles read by the jobs.
	Cfg *setting.Cfg
	// ServerLockService is supplied by ProvideService.
	ServerLockService *serverlock.ServerLockService
	// ShortURLService deletes stale short URLs.
	ShortURLService shorturls.Service
	// QueryHistoryService deletes stale query history and enforces row limits.
	QueryHistoryService queryhistory.Service

	// dashboardVersionService deletes expired dashboard versions.
	dashboardVersionService dashver.Service
	// dashboardSnapshotService deletes expired dashboard snapshots.
	dashboardSnapshotService dashboardsnapshots.Service
	// deleteExpiredImageService deletes expired alerting images.
	deleteExpiredImageService *image.DeleteExpiredService
	// tempUserService expires old user invites.
	tempUserService tempuser.Service
	// annotationCleaner removes old annotations and annotation tags.
	annotationCleaner annotations.Cleaner
}
// cleanUpJob pairs a human-readable job name with the function that performs
// the job.
type cleanUpJob struct {
	// name identifies the job in logs and is used as its tracing span name.
	name string
	// fn executes the job with the context it is given.
	fn func(context.Context)
}

// String returns the job name as a double-quoted Go string literal, so that a
// slice of jobs prints as a readable list of quoted names.
func (j cleanUpJob) String() string {
	quotedName := strconv.Quote(j.name)
	return quotedName
}
// Run cleans temporary files once immediately and then runs the full set of
// cleanup jobs every ten minutes until ctx is done.
//
// It blocks for the lifetime of ctx and always returns ctx.Err() once the
// context is cancelled or its deadline passes.
func (srv *CleanUpService) Run(ctx context.Context) error {
	srv.cleanUpTmpFiles(ctx)

	ticker := time.NewTicker(time.Minute * 10)
	// Release the ticker's resources when Run returns.
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			srv.clean(ctx)
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
// clean runs every cleanup job once, sequentially and in a fixed order, under
// a shared nine-minute timeout. The whole run is wrapped in one tracing span
// and each job gets its own child span. If the context is already done before
// a job starts, the remaining jobs are skipped and the cancellation is logged.
func (srv *CleanUpService) clean(ctx context.Context) {
	const timeout = 9 * time.Minute

	startedAt := time.Now()

	ctx, runSpan := srv.tracer.Start(ctx, "cleanup background job")
	defer runSpan.End()

	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	jobs := []cleanUpJob{
		{name: "clean up temporary files", fn: srv.cleanUpTmpFiles},
		{name: "delete expired snapshots", fn: srv.deleteExpiredSnapshots},
		{name: "delete expired dashboard versions", fn: srv.deleteExpiredDashboardVersions},
		{name: "delete expired images", fn: srv.deleteExpiredImages},
		{name: "cleanup old annotations", fn: srv.cleanUpOldAnnotations},
		{name: "expire old user invites", fn: srv.expireOldUserInvites},
		{name: "delete stale short URLs", fn: srv.deleteStaleShortURLs},
		{name: "delete stale query history", fn: srv.deleteStaleQueryHistory},
	}

	logger := srv.log.FromContext(ctx)
	logger.Debug("Starting cleanup jobs", "jobs", fmt.Sprintf("%v", jobs))

	for i := range jobs {
		// Stop before starting another job once the run's context is done.
		if err := ctx.Err(); err != nil {
			logger.Error("Cancelled cleanup job", "error", err, "duration", time.Since(startedAt))
			return
		}

		job := jobs[i]
		// Each job span is a child of the run context, not of the previous job.
		jobCtx, jobSpan := srv.tracer.Start(ctx, job.name)
		job.fn(jobCtx)
		jobSpan.End()
	}

	logger.Info("Completed cleanup jobs", "duration", time.Since(startedAt))
}
// cleanUpOldAnnotations runs the annotation cleaner with the service
// configuration and logs the outcome. A context.DeadlineExceeded error is not
// reported as a failure: the affected counts are logged at debug level, just
// as on success.
func (srv *CleanUpService) cleanUpOldAnnotations(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	affected, affectedTags, err := srv.annotationCleaner.Run(ctx, srv.Cfg)
	if err == nil || errors.Is(err, context.DeadlineExceeded) {
		logger.Debug("Deleted excess annotations", "annotations affected", affected, "annotation tags affected", affectedTags)
		return
	}

	logger.Error("failed to clean up old annotations", "error", err)
}
// cleanUpTmpFiles removes stale files from the configured images and CSVs
// directories, in that order. Each directory is processed inside its own
// tracing span, tagged with the directory path.
func (srv *CleanUpService) cleanUpTmpFiles(ctx context.Context) {
	tmpDirs := []string{srv.Cfg.ImagesDir, srv.Cfg.CSVsDir}

	for i := range tmpDirs {
		dir := tmpDirs[i]

		// Every directory span starts from the caller's context.
		dirCtx, dirSpan := srv.tracer.Start(ctx, "delete stale files in temporary directory")
		dirSpan.SetAttributes(attribute.String("directory", dir))
		srv.cleanUpTmpFolder(dirCtx, dir)
		dirSpan.End()
	}
}
// cleanUpTmpFolder deletes every entry directly inside folder whose
// modification time is older than the configured temp-data lifetime.
//
// A folder that does not exist is silently skipped. Failures to read the
// directory, to stat an entry, or to remove an entry are logged and never
// returned; a failure on one entry does not stop the others being processed.
// The final debug log reports how many entries were actually removed and how
// many remain.
func (srv *CleanUpService) cleanUpTmpFolder(ctx context.Context, folder string) {
	logger := srv.log.FromContext(ctx)

	// Only "does not exist" is handled here; with any other Stat error the
	// ReadDir below is still attempted.
	if _, err := os.Stat(folder); os.IsNotExist(err) {
		return
	}

	files, err := os.ReadDir(folder)
	if err != nil {
		logger.Error("Problem reading dir", "folder", folder, "error", err)
		return
	}

	var toDelete []fs.DirEntry
	// One timestamp for the whole pass so every entry is judged against the same instant.
	now := time.Now()

	for _, file := range files {
		info, err := file.Info()
		if err != nil {
			// Log the entry name rather than the DirEntry value itself.
			logger.Error("Problem reading file", "folder", folder, "file", file.Name(), "error", err)
			continue
		}

		if srv.shouldCleanupTempFile(info.ModTime(), now) {
			toDelete = append(toDelete, file)
		}
	}

	// Count successful removals only, so the summary reflects what really happened.
	deleted := 0
	for _, file := range toDelete {
		// NOTE(review): path.Join always joins with forward slashes;
		// filepath.Join would be the OS-aware alternative — confirm whether it matters here.
		fullPath := path.Join(folder, file.Name())
		if err := os.Remove(fullPath); err != nil {
			logger.Error("Failed to delete temp file", "file", file.Name(), "error", err)
			continue
		}
		deleted++
	}

	// "kept" is what remains in the folder, not the total number of entries seen.
	logger.Debug("Found old rendered file to delete", "folder", folder, "deleted", deleted, "kept", len(files)-deleted)
}
// shouldCleanupTempFile reports whether a file last modified at filemtime has
// outlived the configured temp-data lifetime as of now. A lifetime of zero
// disables cleanup, so the result is always false in that case.
func (srv *CleanUpService) shouldCleanupTempFile(filemtime time.Time, now time.Time) bool {
	lifetime := srv.Cfg.TempDataLifetime
	if lifetime == 0 {
		return false
	}

	expiresAt := filemtime.Add(lifetime)
	return expiresAt.Before(now)
}
// deleteExpiredSnapshots asks the dashboard snapshot service to delete expired
// snapshots and logs either the error or the number of deleted rows.
func (srv *CleanUpService) deleteExpiredSnapshots(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	var cmd dashboardsnapshots.DeleteExpiredSnapshotsCommand
	err := srv.dashboardSnapshotService.DeleteExpiredSnapshots(ctx, &cmd)
	if err != nil {
		logger.Error("Failed to delete expired snapshots", "error", err.Error())
		return
	}

	logger.Debug("Deleted expired snapshots", "rows affected", cmd.DeletedRows)
}
// deleteExpiredDashboardVersions asks the dashboard version service to delete
// expired versions and logs either the error or the number of deleted rows.
func (srv *CleanUpService) deleteExpiredDashboardVersions(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	var cmd dashver.DeleteExpiredVersionsCommand
	err := srv.dashboardVersionService.DeleteExpired(ctx, &cmd)
	if err != nil {
		logger.Error("Failed to delete expired dashboard versions", "error", err.Error())
		return
	}

	logger.Debug("Deleted old/expired dashboard versions", "rows affected", cmd.DeletedRows)
}
// deleteExpiredImages deletes expired alerting images and logs either the
// error or the number of affected rows. It does nothing when unified alerting
// is disabled in the configuration.
func (srv *CleanUpService) deleteExpiredImages(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	alertingEnabled := srv.Cfg.UnifiedAlerting.IsEnabled()
	if !alertingEnabled {
		return
	}

	rowsAffected, err := srv.deleteExpiredImageService.DeleteExpired(ctx)
	if err != nil {
		logger.Error("Failed to delete expired images", "error", err.Error())
		return
	}

	logger.Debug("Deleted expired images", "rows affected", rowsAffected)
}
// expireOldUserInvites expires user invites older than the configured maximum
// invite lifetime and logs either the error or the number of expired invites.
func (srv *CleanUpService) expireOldUserInvites(ctx context.Context) {
	logger := srv.log.FromContext(ctx)

	// Invites created before this instant have exceeded the maximum lifetime.
	cutoff := time.Now().Add(-srv.Cfg.UserInviteMaxLifetime)
	cmd := tempuser.ExpireTempUsersCommand{OlderThan: cutoff}

	err := srv.tempUserService.ExpireOldUserInvites(ctx, &cmd)
	if err != nil {
		logger.Error("Problem expiring user invites", "error", err.Error())
		return
	}

	logger.Debug("Expired user invites", "rows affected", cmd.NumExpired)
}
// deleteStaleShortURLs asks the short URL service to delete stale short URLs,
// using a cutoff of seven days ago, and logs either the error or the number of
// deleted rows.
func (srv *CleanUpService) deleteStaleShortURLs(ctx context.Context) {
	const staleAfter = 7 * 24 * time.Hour

	logger := srv.log.FromContext(ctx)

	cutoff := time.Now().Add(-staleAfter)
	cmd := shorturls.DeleteShortUrlCommand{OlderThan: cutoff}

	err := srv.ShortURLService.DeleteStaleShortURLs(ctx, &cmd)
	if err != nil {
		logger.Error("Problem deleting stale short urls", "error", err.Error())
		return
	}

	logger.Debug("Deleted short urls", "rows affected", cmd.NumDeleted)
}
// deleteStaleQueryHistory trims the query history in three independent steps:
// delete entries older than 14 days, enforce the row limit for query_history,
// and enforce the row limit for query_history_star. Each step logs its own
// outcome, and a failure in one step does not prevent the next from running.
func (srv *CleanUpService) deleteStaleQueryHistory(ctx context.Context) {
	const (
		// Delete query history from 14+ days ago with exception of starred queries.
		maxQueryHistoryLifetime = 14 * 24 * time.Hour
		// Enforce 200k limit for query_history table.
		queryHistoryLimit = 200000
		// Enforce 150k limit for query_history_star table.
		queryHistoryStarLimit = 150000
	)

	logger := srv.log.FromContext(ctx)
	history := srv.QueryHistoryService

	cutoff := time.Now().Add(-maxQueryHistoryLifetime).Unix()
	if deleted, err := history.DeleteStaleQueriesInQueryHistory(ctx, cutoff); err != nil {
		logger.Error("Problem deleting stale query history", "error", err.Error())
	} else {
		logger.Debug("Deleted stale query history", "rows affected", deleted)
	}

	if trimmed, err := history.EnforceRowLimitInQueryHistory(ctx, queryHistoryLimit, false); err != nil {
		logger.Error("Problem with enforcing row limit for query_history", "error", err.Error())
	} else {
		logger.Debug("Enforced row limit for query_history", "rows affected", trimmed)
	}

	if trimmed, err := history.EnforceRowLimitInQueryHistory(ctx, queryHistoryStarLimit, true); err != nil {
		logger.Error("Problem with enforcing row limit for query_history_star", "error", err.Error())
	} else {
		logger.Debug("Enforced row limit for query_history_star", "rows affected", trimmed)
	}
}