2021-04-05 17:05:39 -05:00
package api
import (
2024-04-23 07:50:26 -05:00
"context"
2021-04-27 15:15:00 -05:00
"encoding/json"
2021-10-04 10:33:55 -05:00
"errors"
2021-04-13 16:38:09 -05:00
"fmt"
2024-04-19 05:52:01 -05:00
"net/url"
2022-04-05 13:36:42 -05:00
"sort"
2021-10-04 10:33:55 -05:00
"strconv"
"strings"
2021-04-13 16:38:09 -05:00
"time"
2023-04-17 11:45:06 -05:00
"github.com/prometheus/alertmanager/pkg/labels"
2023-01-30 02:55:35 -06:00
apiv1 "github.com/prometheus/client_golang/api/prometheus/v1"
2021-04-05 17:05:39 -05:00
"github.com/grafana/grafana/pkg/api/response"
"github.com/grafana/grafana/pkg/infra/log"
2023-01-27 01:50:36 -06:00
contextmodel "github.com/grafana/grafana/pkg/services/contexthandler/model"
2021-04-19 13:26:04 -05:00
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
2022-03-16 11:04:19 -05:00
"github.com/grafana/grafana/pkg/services/ngalert/eval"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
2021-04-05 17:05:39 -05:00
"github.com/grafana/grafana/pkg/services/ngalert/state"
2023-04-17 11:45:06 -05:00
"github.com/grafana/grafana/pkg/util"
2021-04-05 17:05:39 -05:00
)
type PrometheusSrv struct {
2021-04-23 14:32:25 -05:00
log log . Logger
2022-03-09 12:20:29 -06:00
manager state . AlertInstanceManager
2022-09-27 08:56:30 -05:00
store RuleStore
2023-11-15 10:54:54 -06:00
authz RuleAccessControlService
2021-04-05 17:05:39 -05:00
}
2022-03-16 11:04:19 -05:00
// queryIncludeInternalLabels is the name of the boolean query parameter read
// by the status APIs. Unless it evaluates to true, the handlers apply
// ngmodels.WithoutInternalLabels() when fetching labels.
const queryIncludeInternalLabels = "includeInternalLabels"
2024-04-19 05:52:01 -05:00
// getBoolWithDefault returns the boolean value of the query parameter field.
//
// The default d is returned when the parameter is absent or empty, and also
// when its value cannot be parsed by strconv.ParseBool. The latter mirrors
// getInt64WithDefault, so that a malformed value never silently overrides a
// default of true with false.
func getBoolWithDefault(vals url.Values, field string, d bool) bool {
	f := vals.Get(field)
	if f == "" {
		return d
	}

	v, err := strconv.ParseBool(f)
	if err != nil {
		// Treat an unparsable value the same way as a missing one.
		return d
	}
	return v
}
// getInt64WithDefault returns the base-10 int64 value of the query parameter
// field. The default d is returned when the parameter is absent, empty, or
// not a valid 64-bit integer.
func getInt64WithDefault(vals url.Values, field string, d int64) int64 {
	raw := vals.Get(field)
	if raw != "" {
		if parsed, err := strconv.ParseInt(raw, 10, 64); err == nil {
			return parsed
		}
	}
	return d
}
2023-01-27 01:50:36 -06:00
func ( srv PrometheusSrv ) RouteGetAlertStatuses ( c * contextmodel . ReqContext ) response . Response {
2024-04-19 05:52:01 -05:00
// As we are using req.Form directly, this triggers a call to ParseForm() if needed.
c . Query ( "" )
2024-04-23 07:50:26 -05:00
resp := PrepareAlertStatuses ( srv . manager , AlertStatusesOptions {
OrgID : c . SignedInUser . GetOrgID ( ) ,
Query : c . Req . Form ,
} )
return response . JSON ( resp . HTTPStatusCode ( ) , resp )
}
// AlertStatusesOptions carries the inputs of PrepareAlertStatuses.
type AlertStatusesOptions struct {
	// OrgID selects the organization whose alert states are returned.
	OrgID int64
	// Query holds the request's query parameters; PrepareAlertStatuses reads
	// the includeInternalLabels parameter from it.
	Query url.Values
}
func PrepareAlertStatuses ( manager state . AlertInstanceManager , opts AlertStatusesOptions ) apimodels . AlertResponse {
2021-04-05 17:05:39 -05:00
alertResponse := apimodels . AlertResponse {
DiscoveryBase : apimodels . DiscoveryBase {
Status : "success" ,
} ,
Data : apimodels . AlertDiscovery {
Alerts : [ ] * apimodels . Alert { } ,
} ,
}
2022-03-16 11:04:19 -05:00
var labelOptions [ ] ngmodels . LabelOption
2024-04-23 07:50:26 -05:00
if ! getBoolWithDefault ( opts . Query , queryIncludeInternalLabels , false ) {
2022-03-16 11:04:19 -05:00
labelOptions = append ( labelOptions , ngmodels . WithoutInternalLabels ( ) )
}
2024-04-23 07:50:26 -05:00
for _ , alertState := range manager . GetAll ( opts . OrgID ) {
2021-04-05 17:05:39 -05:00
startsAt := alertState . StartsAt
2021-05-18 08:12:39 -05:00
valString := ""
2022-04-05 13:36:42 -05:00
if alertState . State == eval . Alerting || alertState . State == eval . Pending {
valString = formatValues ( alertState )
2021-05-18 08:12:39 -05:00
}
2022-03-09 12:20:29 -06:00
2021-04-05 17:05:39 -05:00
alertResponse . Data . Alerts = append ( alertResponse . Data . Alerts , & apimodels . Alert {
2022-03-16 11:04:19 -05:00
Labels : alertState . GetLabels ( labelOptions ... ) ,
2022-03-09 12:20:29 -06:00
Annotations : alertState . Annotations ,
2022-05-23 03:49:49 -05:00
// TODO: or should we make this two fields? Using one field lets the
// frontend use the same logic for parsing text on annotations and this.
2022-11-04 10:39:26 -05:00
State : state . FormatStateAndReason ( alertState . State , alertState . StateReason ) ,
2022-05-23 03:49:49 -05:00
ActiveAt : & startsAt ,
Value : valString ,
2021-04-05 17:05:39 -05:00
} )
}
2022-03-09 12:20:29 -06:00
2024-04-23 07:50:26 -05:00
return alertResponse
2021-04-05 17:05:39 -05:00
}
2022-04-05 13:36:42 -05:00
// formatValues renders the last evaluation values of the alert state's
// condition as a string.
//
//   - With no values, the state's LastEvaluationString is returned.
//   - With exactly one value, only the number is returned.
//   - Otherwise every entry is rendered as "key: value" and the entries are
//     joined with ", ".
//
// Numbers use the 'e' format with the smallest precision that represents the
// float64 exactly.
func formatValues(alertState *state.State) string {
	values := alertState.GetLastEvaluationValuesForCondition()

	switch len(values) {
	case 0:
		return alertState.LastEvaluationString
	case 1:
		var single string
		for _, v := range values {
			single = strconv.FormatFloat(v, 'e', -1, 64)
		}
		return single
	}

	parts := make([]string, 0, len(values))
	for name, v := range values {
		parts = append(parts, fmt.Sprintf("%s: %s", name, strconv.FormatFloat(v, 'e', -1, 64)))
	}
	// Map iteration order is random, so sort the formatted entries to get a
	// stable result. The order is lexicographic, e.g. A0, A1, A10, A11, A3.
	sort.Strings(parts)
	return strings.Join(parts, ", ")
}
2024-04-23 07:50:26 -05:00
// getPanelIDFromQuery reads the "panel_id" query parameter as a base-10 int64.
// Surrounding whitespace is ignored. It returns 0 and a nil error when the
// parameter is missing or blank, and the strconv.ParseInt error when the
// value is not a valid integer.
func getPanelIDFromQuery(v url.Values) (int64, error) {
	raw := strings.TrimSpace(v.Get("panel_id"))
	if raw == "" {
		return 0, nil
	}
	return strconv.ParseInt(raw, 10, 64)
}
2024-04-23 07:50:26 -05:00
func getMatchersFromQuery ( v url . Values ) ( labels . Matchers , error ) {
2023-04-17 11:45:06 -05:00
var matchers labels . Matchers
2024-04-23 07:50:26 -05:00
for _ , s := range v [ "matcher" ] {
2023-04-17 11:45:06 -05:00
var m labels . Matcher
if err := json . Unmarshal ( [ ] byte ( s ) , & m ) ; err != nil {
return nil , err
}
if len ( m . Name ) == 0 {
return nil , errors . New ( "bad matcher: the name cannot be blank" )
}
matchers = append ( matchers , & m )
}
return matchers , nil
}
2024-04-23 07:50:26 -05:00
func getStatesFromQuery ( v url . Values ) ( [ ] eval . State , error ) {
2023-04-17 11:45:06 -05:00
var states [ ] eval . State
2024-04-23 07:50:26 -05:00
for _ , s := range v [ "state" ] {
2023-04-17 11:45:06 -05:00
s = strings . ToLower ( s )
switch s {
case "normal" , "inactive" :
states = append ( states , eval . Normal )
case "alerting" , "firing" :
states = append ( states , eval . Alerting )
case "pending" :
states = append ( states , eval . Pending )
case "nodata" :
states = append ( states , eval . NoData )
// nolint:goconst
case "error" :
states = append ( states , eval . Error )
default :
return states , fmt . Errorf ( "unknown state '%s'" , s )
}
}
return states , nil
}
2024-04-23 07:50:26 -05:00
// RuleGroupStatusesOptions carries the inputs of PrepareRuleGroupStatuses.
type RuleGroupStatusesOptions struct {
	// Ctx is the context passed to the rule store when listing alert rules.
	Ctx context.Context
	// OrgID selects the organization whose rules are listed.
	OrgID int64
	// Query holds the request's query parameters (filters and limits).
	Query url.Values
	// Namespaces maps the UID of each namespace the caller can see to the
	// string reported as the rule group's file.
	Namespaces map[string]string
	// AuthorizeRuleGroup reports whether the caller may see the group formed
	// by rules. A non-nil error aborts the whole request.
	AuthorizeRuleGroup func(rules []*ngmodels.AlertRule) (bool, error)
}
// ListAlertRulesStore is the storage dependency of PrepareRuleGroupStatuses:
// anything able to list the alert rules matching a query.
type ListAlertRulesStore interface {
	// ListAlertRules returns the alert rules selected by query.
	ListAlertRules(ctx context.Context, query *ngmodels.ListAlertRulesQuery) (ngmodels.RulesGroup, error)
}
2023-01-27 01:50:36 -06:00
func ( srv PrometheusSrv ) RouteGetRuleStatuses ( c * contextmodel . ReqContext ) response . Response {
2024-04-19 05:52:01 -05:00
// As we are using req.Form directly, this triggers a call to ParseForm() if needed.
c . Query ( "" )
2024-04-19 14:03:20 -05:00
ruleResponse := apimodels . RuleResponse {
DiscoveryBase : apimodels . DiscoveryBase {
Status : "success" ,
} ,
Data : apimodels . RuleDiscovery {
RuleGroups : [ ] apimodels . RuleGroup { } ,
} ,
}
2024-04-23 07:50:26 -05:00
namespaceMap , err := srv . store . GetUserVisibleNamespaces ( c . Req . Context ( ) , c . SignedInUser . GetOrgID ( ) , c . SignedInUser )
if err != nil {
ruleResponse . DiscoveryBase . Status = "error"
ruleResponse . DiscoveryBase . Error = fmt . Sprintf ( "failed to get namespaces visible to the user: %s" , err . Error ( ) )
ruleResponse . DiscoveryBase . ErrorType = apiv1 . ErrServer
return response . JSON ( ruleResponse . HTTPStatusCode ( ) , ruleResponse )
}
namespaces := map [ string ] string { }
for namespaceUID , folder := range namespaceMap {
namespaces [ namespaceUID ] = folder . Fullpath
}
ruleResponse = PrepareRuleGroupStatuses ( srv . log , srv . manager , srv . store , RuleGroupStatusesOptions {
Ctx : c . Req . Context ( ) ,
OrgID : c . OrgID ,
Query : c . Req . Form ,
Namespaces : namespaces ,
AuthorizeRuleGroup : func ( rules [ ] * ngmodels . AlertRule ) ( bool , error ) {
return srv . authz . HasAccessToRuleGroup ( c . Req . Context ( ) , c . SignedInUser , rules )
} ,
} )
return response . JSON ( ruleResponse . HTTPStatusCode ( ) , ruleResponse )
}
2024-06-04 04:57:55 -05:00
// TODO: Refactor this function to reduce the cylomatic complexity
//
//nolint:gocyclo
2024-04-23 07:50:26 -05:00
func PrepareRuleGroupStatuses ( log log . Logger , manager state . AlertInstanceManager , store ListAlertRulesStore , opts RuleGroupStatusesOptions ) apimodels . RuleResponse {
ruleResponse := apimodels . RuleResponse {
DiscoveryBase : apimodels . DiscoveryBase {
Status : "success" ,
} ,
Data : apimodels . RuleDiscovery {
RuleGroups : [ ] apimodels . RuleGroup { } ,
} ,
}
dashboardUID := opts . Query . Get ( "dashboard_uid" )
panelID , err := getPanelIDFromQuery ( opts . Query )
2022-04-11 09:54:29 -05:00
if err != nil {
2024-04-19 14:03:20 -05:00
ruleResponse . DiscoveryBase . Status = "error"
ruleResponse . DiscoveryBase . Error = fmt . Sprintf ( "invalid panel_id: %s" , err . Error ( ) )
ruleResponse . DiscoveryBase . ErrorType = apiv1 . ErrBadData
2024-04-23 07:50:26 -05:00
return ruleResponse
2022-04-11 09:54:29 -05:00
}
if dashboardUID == "" && panelID != 0 {
2024-04-19 14:03:20 -05:00
ruleResponse . DiscoveryBase . Status = "error"
ruleResponse . DiscoveryBase . Error = "panel_id must be set with dashboard_uid"
ruleResponse . DiscoveryBase . ErrorType = apiv1 . ErrBadData
2024-04-23 07:50:26 -05:00
return ruleResponse
2022-04-11 09:54:29 -05:00
}
2024-04-23 07:50:26 -05:00
limitGroups := getInt64WithDefault ( opts . Query , "limit" , - 1 )
limitRulesPerGroup := getInt64WithDefault ( opts . Query , "limit_rules" , - 1 )
limitAlertsPerRule := getInt64WithDefault ( opts . Query , "limit_alerts" , - 1 )
matchers , err := getMatchersFromQuery ( opts . Query )
2023-04-17 11:45:06 -05:00
if err != nil {
2024-04-19 14:03:20 -05:00
ruleResponse . DiscoveryBase . Status = "error"
ruleResponse . DiscoveryBase . Error = err . Error ( )
ruleResponse . DiscoveryBase . ErrorType = apiv1 . ErrBadData
2024-04-23 07:50:26 -05:00
return ruleResponse
2023-04-17 11:45:06 -05:00
}
2024-04-23 07:50:26 -05:00
withStates , err := getStatesFromQuery ( opts . Query )
2023-04-17 11:45:06 -05:00
if err != nil {
2024-04-19 14:03:20 -05:00
ruleResponse . DiscoveryBase . Status = "error"
ruleResponse . DiscoveryBase . Error = err . Error ( )
ruleResponse . DiscoveryBase . ErrorType = apiv1 . ErrBadData
2024-04-23 07:50:26 -05:00
return ruleResponse
2023-04-17 11:45:06 -05:00
}
withStatesFast := make ( map [ eval . State ] struct { } )
for _ , state := range withStates {
withStatesFast [ state ] = struct { } { }
}
2022-03-16 11:04:19 -05:00
var labelOptions [ ] ngmodels . LabelOption
2024-04-23 07:50:26 -05:00
if ! getBoolWithDefault ( opts . Query , queryIncludeInternalLabels , false ) {
2022-03-16 11:04:19 -05:00
labelOptions = append ( labelOptions , ngmodels . WithoutInternalLabels ( ) )
}
2024-04-23 07:50:26 -05:00
if len ( opts . Namespaces ) == 0 {
log . Debug ( "User does not have access to any namespaces" )
return ruleResponse
2021-07-22 01:53:14 -05:00
}
2024-06-04 04:57:55 -05:00
namespaceUIDs := make ( [ ] string , 0 , len ( opts . Namespaces ) )
folderUID := opts . Query . Get ( "folder_uid" )
_ , exists := opts . Namespaces [ folderUID ]
if folderUID != "" && exists {
namespaceUIDs = append ( namespaceUIDs , folderUID )
} else {
for k := range opts . Namespaces {
namespaceUIDs = append ( namespaceUIDs , k )
}
2021-07-22 01:53:14 -05:00
}
2024-06-04 04:57:55 -05:00
ruleGroups := opts . Query [ "rule_group" ]
2022-04-11 09:54:29 -05:00
alertRuleQuery := ngmodels . ListAlertRulesQuery {
2024-04-23 07:50:26 -05:00
OrgID : opts . OrgID ,
2021-07-22 01:53:14 -05:00
NamespaceUIDs : namespaceUIDs ,
2021-10-04 10:33:55 -05:00
DashboardUID : dashboardUID ,
PanelID : panelID ,
2024-06-04 04:57:55 -05:00
RuleGroups : ruleGroups ,
2021-04-13 16:38:09 -05:00
}
2024-04-23 07:50:26 -05:00
ruleList , err := store . ListAlertRules ( opts . Ctx , & alertRuleQuery )
2023-03-28 03:34:35 -05:00
if err != nil {
2021-10-01 08:39:04 -05:00
ruleResponse . DiscoveryBase . Status = "error"
ruleResponse . DiscoveryBase . Error = fmt . Sprintf ( "failure getting rules: %s" , err . Error ( ) )
ruleResponse . DiscoveryBase . ErrorType = apiv1 . ErrServer
2024-04-23 07:50:26 -05:00
return ruleResponse
2021-10-01 08:39:04 -05:00
}
2024-06-04 04:57:55 -05:00
ruleNames := opts . Query [ "rule_name" ]
ruleNamesSet := make ( map [ string ] struct { } , len ( ruleNames ) )
for _ , rn := range ruleNames {
ruleNamesSet [ rn ] = struct { } { }
}
2023-04-17 11:45:06 -05:00
// Group rules together by Namespace and Rule Group. Rules are also grouped by Org ID,
2024-06-04 04:57:55 -05:00
// but in this API all rules belong to the same organization. Also filter by rule name if
// it was provided as a query param.
2022-05-16 14:45:45 -05:00
groupedRules := make ( map [ ngmodels . AlertRuleGroupKey ] [ ] * ngmodels . AlertRule )
2023-03-28 03:34:35 -05:00
for _ , rule := range ruleList {
2024-06-04 04:57:55 -05:00
if len ( ruleNamesSet ) > 0 {
if _ , exists := ruleNamesSet [ rule . Title ] ; ! exists {
continue
}
}
2023-04-17 11:45:06 -05:00
groupKey := rule . GetGroupKey ( )
ruleGroup := groupedRules [ groupKey ]
ruleGroup = append ( ruleGroup , rule )
groupedRules [ groupKey ] = ruleGroup
}
// Sort the rules in each rule group by index. We do this at the end instead of
// after each append to avoid having to sort each group multiple times.
for _ , groupRules := range groupedRules {
ngmodels . AlertRulesBy ( ngmodels . AlertRulesByIndex ) . Sort ( groupRules )
2022-05-16 14:45:45 -05:00
}
2023-04-17 11:45:06 -05:00
rulesTotals := make ( map [ string ] int64 , len ( groupedRules ) )
2022-05-16 14:45:45 -05:00
for groupKey , rules := range groupedRules {
2024-04-23 07:50:26 -05:00
folder , ok := opts . Namespaces [ groupKey . NamespaceUID ]
if ! ok {
log . Warn ( "Query returned rules that belong to folder the user does not have access to. All rules that belong to that namespace will not be added to the response" , "folder_uid" , groupKey . NamespaceUID )
2022-05-16 14:45:45 -05:00
continue
2021-10-01 08:39:04 -05:00
}
2024-04-23 07:50:26 -05:00
ok , err := opts . AuthorizeRuleGroup ( rules )
2023-12-01 17:42:11 -06:00
if err != nil {
2024-04-19 14:03:20 -05:00
ruleResponse . DiscoveryBase . Status = "error"
ruleResponse . DiscoveryBase . Error = fmt . Sprintf ( "cannot authorize access to rule group: %s" , err . Error ( ) )
ruleResponse . DiscoveryBase . ErrorType = apiv1 . ErrServer
2024-04-23 07:50:26 -05:00
return ruleResponse
2023-12-01 17:42:11 -06:00
}
if ! ok {
2022-06-01 09:23:54 -05:00
continue
}
2024-06-04 04:57:55 -05:00
2024-04-23 07:50:26 -05:00
ruleGroup , totals := toRuleGroup ( log , manager , groupKey , folder , rules , limitAlertsPerRule , withStatesFast , matchers , labelOptions )
2023-04-17 11:45:06 -05:00
ruleGroup . Totals = totals
for k , v := range totals {
rulesTotals [ k ] += v
}
if len ( withStates ) > 0 {
// Filtering is weird but firing, pending, and normal filters also need to be
// applied to the rule. Others such as nodata and error should have no effect.
// This is to match the current behavior in the UI.
filteredRules := make ( [ ] apimodels . AlertingRule , 0 , len ( ruleGroup . Rules ) )
for _ , rule := range ruleGroup . Rules {
var state * eval . State
switch rule . State {
case "normal" , "inactive" :
state = util . Pointer ( eval . Normal )
case "alerting" , "firing" :
state = util . Pointer ( eval . Alerting )
case "pending" :
state = util . Pointer ( eval . Pending )
}
if state != nil {
if _ , ok := withStatesFast [ * state ] ; ok {
filteredRules = append ( filteredRules , rule )
}
}
}
ruleGroup . Rules = filteredRules
}
if limitRulesPerGroup > - 1 && int64 ( len ( ruleGroup . Rules ) ) > limitRulesPerGroup {
ruleGroup . Rules = ruleGroup . Rules [ 0 : limitRulesPerGroup ]
}
ruleResponse . Data . RuleGroups = append ( ruleResponse . Data . RuleGroups , * ruleGroup )
2022-05-16 14:45:45 -05:00
}
2023-04-17 11:45:06 -05:00
ruleResponse . Data . Totals = rulesTotals
// Sort Rule Groups before checking limits
apimodels . RuleGroupsBy ( apimodels . RuleGroupsByFileAndName ) . Sort ( ruleResponse . Data . RuleGroups )
if limitGroups > - 1 && int64 ( len ( ruleResponse . Data . RuleGroups ) ) >= limitGroups {
ruleResponse . Data . RuleGroups = ruleResponse . Data . RuleGroups [ 0 : limitGroups ]
}
2024-04-23 07:50:26 -05:00
return ruleResponse
2022-05-16 14:45:45 -05:00
}
2023-04-17 11:45:06 -05:00
// This is the same as matchers.Matches but avoids the need to create a LabelSet
// matchersMatch reports whether every matcher accepts the value stored in
// labels under the matcher's name. A label that is absent is matched against
// the empty string, and an empty matcher list matches everything.
func matchersMatch(matchers []*labels.Matcher, labels map[string]string) bool {
	for i := range matchers {
		matcher := matchers[i]
		if value := labels[matcher.Name]; !matcher.Matches(value) {
			return false
		}
	}
	return true
}
2024-04-23 07:50:26 -05:00
func toRuleGroup ( log log . Logger , manager state . AlertInstanceManager , groupKey ngmodels . AlertRuleGroupKey , folderFullPath string , rules [ ] * ngmodels . AlertRule , limitAlerts int64 , withStates map [ eval . State ] struct { } , matchers labels . Matchers , labelOptions [ ] ngmodels . LabelOption ) ( * apimodels . RuleGroup , map [ string ] int64 ) {
2022-05-16 14:45:45 -05:00
newGroup := & apimodels . RuleGroup {
2023-04-17 11:45:06 -05:00
Name : groupKey . RuleGroup ,
// file is what Prometheus uses for provisioning, we replace it with namespace which is the folder in Grafana.
2024-04-23 07:50:26 -05:00
File : folderFullPath ,
2022-05-16 14:45:45 -05:00
}
2023-04-17 11:45:06 -05:00
rulesTotals := make ( map [ string ] int64 , len ( rules ) )
2022-06-22 09:52:46 -05:00
ngmodels . RulesGroup ( rules ) . SortByGroupIndex ( )
2022-05-16 14:45:45 -05:00
for _ , rule := range rules {
2021-10-01 08:39:04 -05:00
alertingRule := apimodels . AlertingRule {
State : "inactive" ,
Name : rule . Title ,
2024-04-23 07:50:26 -05:00
Query : ruleToQuery ( log , rule ) ,
2021-10-01 08:39:04 -05:00
Duration : rule . For . Seconds ( ) ,
Annotations : rule . Annotations ,
}
newRule := apimodels . Rule {
Name : rule . Title ,
2022-03-16 11:04:19 -05:00
Labels : rule . GetLabels ( labelOptions ... ) ,
2021-10-01 08:39:04 -05:00
Health : "ok" ,
2024-06-07 11:24:06 -05:00
Type : rule . Type ( ) . String ( ) ,
2021-10-01 08:39:04 -05:00
LastEvaluation : time . Time { } ,
}
2024-04-23 07:50:26 -05:00
states := manager . GetStatesForRuleUID ( rule . OrgID , rule . UID )
2023-04-17 11:45:06 -05:00
totals := make ( map [ string ] int64 )
2023-04-21 03:35:12 -05:00
totalsFiltered := make ( map [ string ] int64 )
2023-04-17 11:45:06 -05:00
for _ , alertState := range states {
2021-10-01 08:39:04 -05:00
activeAt := alertState . StartsAt
valString := ""
2022-04-05 13:36:42 -05:00
if alertState . State == eval . Alerting || alertState . State == eval . Pending {
valString = formatValues ( alertState )
2021-04-27 15:15:00 -05:00
}
2023-04-21 03:35:12 -05:00
stateKey := strings . ToLower ( alertState . State . String ( ) )
totals [ stateKey ] += 1
2023-04-17 11:45:06 -05:00
// Do not add error twice when execution error state is Error
if alertState . Error != nil && rule . ExecErrState != ngmodels . ErrorErrState {
totals [ "error" ] += 1
}
alert := apimodels . Alert {
2022-03-16 11:04:19 -05:00
Labels : alertState . GetLabels ( labelOptions ... ) ,
2021-10-01 08:39:04 -05:00
Annotations : alertState . Annotations ,
2022-05-23 03:49:49 -05:00
// TODO: or should we make this two fields? Using one field lets the
// frontend use the same logic for parsing text on annotations and this.
2022-11-04 10:39:26 -05:00
State : state . FormatStateAndReason ( alertState . State , alertState . StateReason ) ,
2022-05-23 03:49:49 -05:00
ActiveAt : & activeAt ,
Value : valString ,
2021-04-13 16:38:09 -05:00
}
2021-10-01 08:39:04 -05:00
if alertState . LastEvaluationTime . After ( newRule . LastEvaluation ) {
newRule . LastEvaluation = alertState . LastEvaluationTime
2021-04-13 16:38:09 -05:00
}
2021-04-21 11:30:03 -05:00
2021-10-01 08:39:04 -05:00
newRule . EvaluationTime = alertState . EvaluationDuration . Seconds ( )
2021-05-04 12:08:12 -05:00
2021-10-01 08:39:04 -05:00
switch alertState . State {
case eval . Normal :
case eval . Pending :
if alertingRule . State == "inactive" {
alertingRule . State = "pending"
2021-05-04 12:08:12 -05:00
}
2021-10-01 08:39:04 -05:00
case eval . Alerting :
2023-04-17 11:45:06 -05:00
if alertingRule . ActiveAt == nil || alertingRule . ActiveAt . After ( activeAt ) {
alertingRule . ActiveAt = & activeAt
}
2021-10-01 08:39:04 -05:00
alertingRule . State = "firing"
case eval . Error :
newRule . Health = "error"
case eval . NoData :
newRule . Health = "nodata"
2021-04-13 16:38:09 -05:00
}
2021-04-21 11:30:03 -05:00
2021-10-01 08:39:04 -05:00
if alertState . Error != nil {
newRule . LastError = alertState . Error . Error ( )
newRule . Health = "error"
}
2022-03-14 05:39:20 -05:00
2023-04-17 11:45:06 -05:00
if len ( withStates ) > 0 {
if _ , ok := withStates [ alertState . State ] ; ! ok {
continue
}
}
if ! matchersMatch ( matchers , alertState . Labels ) {
continue
}
2023-04-21 03:35:12 -05:00
totalsFiltered [ stateKey ] += 1
// Do not add error twice when execution error state is Error
if alertState . Error != nil && rule . ExecErrState != ngmodels . ErrorErrState {
totalsFiltered [ "error" ] += 1
}
2021-10-01 08:39:04 -05:00
alertingRule . Alerts = append ( alertingRule . Alerts , alert )
2021-04-13 16:38:09 -05:00
}
2021-10-01 08:39:04 -05:00
2023-04-17 11:45:06 -05:00
if alertingRule . State != "" {
rulesTotals [ alertingRule . State ] += 1
}
if newRule . Health == "error" || newRule . Health == "nodata" {
rulesTotals [ newRule . Health ] += 1
}
Alerting: Optimize rule status gathering APIs when a limit is applied. (#86568)
* Alerting: Optimize rule status gathering APIs when a limit is applied.
The frontend very commonly calls the `/rules` API with `limit_alerts=16`. When
there are a very large number of alert instances present, this API is quite
slow to respond, and profiling suggests that a big part of the problem is
sorting the alerts by importance, in order to select the first 16.
This changes the application of the limit to use a more efficient heap-based
top-k algorithm. This maintains a slice of only the highest ranked items whilst
iterating the full set of alert instances, which substantially reduces the
number of comparisons needed. This is particularly effective, as the
`AlertsByImportance` comparison is quite complex.
I've included a benchmark to compare the new TopK function to the existing
Sort/limit strategy. It shows that for small limits, the new approach is
much faster, especially at high numbers of alerts, e.g.
100K alerts / limit 16: 1.91s vs 0.02s (-99%)
For situations where there is no effective limit, sorting is marginally faster,
therefore in the API implementation, if there is either a) no limit or b) no
effective limit, then we just sort the alerts as before. There is also a space
overhead using a heap which would matter for large limits.
* Remove commented test cases
* Make linter happy
2024-04-19 04:51:22 -05:00
alertsBy := apimodels . AlertsBy ( apimodels . AlertsByImportance )
2023-04-17 11:45:06 -05:00
if limitAlerts > - 1 && int64 ( len ( alertingRule . Alerts ) ) > limitAlerts {
Alerting: Optimize rule status gathering APIs when a limit is applied. (#86568)
* Alerting: Optimize rule status gathering APIs when a limit is applied.
The frontend very commonly calls the `/rules` API with `limit_alerts=16`. When
there are a very large number of alert instances present, this API is quite
slow to respond, and profiling suggests that a big part of the problem is
sorting the alerts by importance, in order to select the first 16.
This changes the application of the limit to use a more efficient heap-based
top-k algorithm. This maintains a slice of only the highest ranked items whilst
iterating the full set of alert instances, which substantially reduces the
number of comparisons needed. This is particularly effective, as the
`AlertsByImportance` comparison is quite complex.
I've included a benchmark to compare the new TopK function to the existing
Sort/limit strategy. It shows that for small limits, the new approach is
much faster, especially at high numbers of alerts, e.g.
100K alerts / limit 16: 1.91s vs 0.02s (-99%)
For situations where there is no effective limit, sorting is marginally faster,
therefore in the API implementation, if there is either a) no limit or b) no
effective limit, then we just sort the alerts as before. There is also a space
overhead using a heap which would matter for large limits.
* Remove commented test cases
* Make linter happy
2024-04-19 04:51:22 -05:00
alertingRule . Alerts = alertsBy . TopK ( alertingRule . Alerts , int ( limitAlerts ) )
} else {
// If there is no effective limit, then just sort the alerts.
// For large numbers of alerts, this can be faster.
alertsBy . Sort ( alertingRule . Alerts )
2023-04-17 11:45:06 -05:00
}
2021-10-01 08:39:04 -05:00
alertingRule . Rule = newRule
2023-04-17 11:45:06 -05:00
alertingRule . Totals = totals
2023-04-21 03:35:12 -05:00
alertingRule . TotalsFiltered = totalsFiltered
2021-10-01 08:39:04 -05:00
newGroup . Rules = append ( newGroup . Rules , alertingRule )
newGroup . Interval = float64 ( rule . IntervalSeconds )
2022-05-16 14:45:45 -05:00
// TODO yuri. Change that when scheduler will process alerts in groups
2022-04-11 09:54:29 -05:00
newGroup . EvaluationTime = newRule . EvaluationTime
newGroup . LastEvaluation = newRule . LastEvaluation
2021-04-13 16:38:09 -05:00
}
2023-04-17 11:45:06 -05:00
return newGroup , rulesTotals
2021-04-13 16:38:09 -05:00
}
2022-03-14 05:39:20 -05:00
// ruleToQuery attempts to extract the datasource queries from the alert query model.
// Returns the whole JSON model as a string if it fails to extract a minimum of 1 query.
func ruleToQuery ( logger log . Logger , rule * ngmodels . AlertRule ) string {
var queryErr error
2024-06-14 13:16:36 -05:00
queries := make ( [ ] string , 0 , len ( rule . Data ) )
2022-03-14 05:39:20 -05:00
for _ , q := range rule . Data {
q , err := q . GetQuery ( )
if err != nil {
// If we can't find the query simply omit it, and try the rest.
// Even single query alerts would have 2 `AlertQuery`, one for the query and one for the condition.
if errors . Is ( err , ngmodels . ErrNoQuery ) {
continue
}
// For any other type of error, it is unexpected abort and return the whole JSON.
2023-09-04 11:46:34 -05:00
logger . Debug ( "Failed to parse a query" , "error" , err )
2022-03-14 05:39:20 -05:00
queryErr = err
break
}
queries = append ( queries , q )
}
// If we were able to extract at least one query without failure use it.
if queryErr == nil && len ( queries ) > 0 {
return strings . Join ( queries , " | " )
}
return encodedQueriesOrError ( rule . Data )
}
// encodedQueriesOrError returns the JSON encoding of the rule query data, or
// the text of the encoding error when marshalling fails.
func encodedQueriesOrError(rules []ngmodels.AlertQuery) string {
	encoded, err := json.Marshal(rules)
	if err != nil {
		return err.Error()
	}
	return string(encoded)
}