mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Create DatasourceError alert if evaluation returns error (#41869)
* Alerting: Create DatasourceError alert if evaluation returns error * Alerting: Add docs for DatasourceError alert * Alerting: Fix DatasourceError alert does not have dashboard_uid label * Alerting: Add break when datasource_uid found * Alerting: Update TestProcessEvalResults
This commit is contained in:
parent
1e5b0e64ac
commit
1b26d4d88e
docs/sources/alerting/unified-alerting/alerting-rules
pkg/services
ngalert
sqlstore/migrations/ualert
public/app
features/alerting/unified/components/rule-editor
types
@ -69,7 +69,8 @@ Configure alerting behavior in the absence of data using information in the foll
|
||||
| Alerting | Set alert rule state to `Alerting`. |
|
||||
| Ok | Set alert rule state to `Normal`. |
|
||||
|
||||
| Error or timeout option | Description |
|
||||
| ----------------------- | ---------------------------------- |
|
||||
| Alerting | Set alert rule state to `Alerting` |
|
||||
| OK | Set alert rule state to `Normal` |
|
||||
| Error or timeout option | Description                                                                                                                                   |
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
| Alerting                | Set alert rule state to `Alerting`                                                                                                            |
| OK                      | Set alert rule state to `Normal`                                                                                                              |
| Error                   | Create a new alert `DatasourceError` with the name and UID of the alert rule, and the UID of the datasource that returned the error, as labels. |
|
@ -41,6 +41,7 @@ func (executionErrorState ExecutionErrorState) String() string {
|
||||
|
||||
const (
|
||||
AlertingErrState ExecutionErrorState = "Alerting"
|
||||
ErrorErrState ExecutionErrorState = "Error"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@ -19,6 +19,7 @@ import (
|
||||
|
||||
const (
|
||||
NoDataAlertName = "DatasourceNoData"
|
||||
ErrorAlertName = "DatasourceError"
|
||||
|
||||
Rulename = "rulename"
|
||||
)
|
||||
@ -52,6 +53,10 @@ func stateToPostableAlert(alertState *state.State, appURL *url.URL) *models.Post
|
||||
return noDataAlert(nL, nA, alertState, urlStr)
|
||||
}
|
||||
|
||||
if alertState.State == eval.Error {
|
||||
return errorAlert(nL, nA, alertState, urlStr)
|
||||
}
|
||||
|
||||
return &models.PostableAlert{
|
||||
Annotations: models.LabelSet(nA),
|
||||
StartsAt: strfmt.DateTime(alertState.StartsAt),
|
||||
@ -84,6 +89,25 @@ func noDataAlert(labels data.Labels, annotations data.Labels, alertState *state.
|
||||
}
|
||||
}
|
||||
|
||||
// errorAlert is a special alert sent when evaluation of an alert rule failed due to an error. Like noDataAlert, it
// replaces the old behaviour of "Keep Last State" creating a separate alert called DatasourceError.
//
// NOTE: this mutates the labels map passed in by the caller (it adds/overwrites
// the alertname and "rulename" entries).
func errorAlert(labels, annotations data.Labels, alertState *state.State, urlStr string) *models.PostableAlert {
	// Preserve the original rule name under the "rulename" label so the
	// synthetic alert can still be traced back to the rule that produced it.
	if name, ok := labels[model.AlertNameLabel]; ok {
		labels[Rulename] = name
	}
	// All evaluation-error alerts share the fixed name "DatasourceError".
	labels[model.AlertNameLabel] = ErrorAlertName

	return &models.PostableAlert{
		Annotations: models.LabelSet(annotations),
		StartsAt:    strfmt.DateTime(alertState.StartsAt),
		EndsAt:      strfmt.DateTime(alertState.EndsAt),
		Alert: models.Alert{
			Labels:       models.LabelSet(labels),
			GeneratorURL: strfmt.URI(urlStr),
		},
	}
}
|
||||
|
||||
func FromAlertStateToPostableAlerts(firingStates []*state.State, stateManager *state.Manager, appURL *url.URL) apimodels.PostableAlerts {
|
||||
alerts := apimodels.PostableAlerts{PostableAlerts: make([]models.PostableAlert, 0, len(firingStates))}
|
||||
var sentAlerts []*state.State
|
||||
|
@ -152,6 +152,35 @@ func Test_stateToPostableAlert(t *testing.T) {
|
||||
require.NotContains(t, result.Labels[model.AlertNameLabel], Rulename)
|
||||
})
|
||||
})
|
||||
case eval.Error:
|
||||
t.Run("should keep existing labels and change name", func(t *testing.T) {
|
||||
alertState := randomState(tc.state)
|
||||
alertState.Labels = randomMapOfStrings()
|
||||
alertName := util.GenerateShortUID()
|
||||
alertState.Labels[model.AlertNameLabel] = alertName
|
||||
|
||||
result := stateToPostableAlert(alertState, appURL)
|
||||
|
||||
expected := make(models.LabelSet, len(alertState.Labels)+1)
|
||||
for k, v := range alertState.Labels {
|
||||
expected[k] = v
|
||||
}
|
||||
expected[model.AlertNameLabel] = ErrorAlertName
|
||||
expected[Rulename] = alertName
|
||||
|
||||
require.Equal(t, expected, result.Labels)
|
||||
|
||||
t.Run("should not backup original alert name if it does not exist", func(t *testing.T) {
|
||||
alertState := randomState(tc.state)
|
||||
alertState.Labels = randomMapOfStrings()
|
||||
delete(alertState.Labels, model.AlertNameLabel)
|
||||
|
||||
result := stateToPostableAlert(alertState, appURL)
|
||||
|
||||
require.Equal(t, ErrorAlertName, result.Labels[model.AlertNameLabel])
|
||||
require.NotContains(t, result.Labels[model.AlertNameLabel], Rulename)
|
||||
})
|
||||
})
|
||||
default:
|
||||
t.Run("should copy labels as is", func(t *testing.T) {
|
||||
alertState := randomState(tc.state)
|
||||
|
@ -2,11 +2,13 @@ package state_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data"
|
||||
|
||||
"github.com/grafana/grafana/pkg/expr"
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||
@ -1175,6 +1177,83 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "normal -> error when result is Error and ExecErrState is Error",
|
||||
alertRule: &models.AlertRule{
|
||||
OrgID: 1,
|
||||
Title: "test_title",
|
||||
UID: "test_alert_rule_uid_2",
|
||||
NamespaceUID: "test_namespace_uid",
|
||||
Data: []models.AlertQuery{{
|
||||
RefID: "A",
|
||||
DatasourceUID: "datasource_uid_1",
|
||||
}},
|
||||
Annotations: map[string]string{"annotation": "test"},
|
||||
Labels: map[string]string{"label": "test"},
|
||||
IntervalSeconds: 10,
|
||||
For: 1 * time.Minute,
|
||||
ExecErrState: models.ErrorErrState,
|
||||
},
|
||||
evalResults: []eval.Results{
|
||||
{
|
||||
eval.Result{
|
||||
Instance: data.Labels{"instance_label": "test"},
|
||||
State: eval.Normal,
|
||||
EvaluatedAt: evaluationTime,
|
||||
EvaluationDuration: evaluationDuration,
|
||||
},
|
||||
},
|
||||
{
|
||||
eval.Result{
|
||||
Instance: data.Labels{"instance_label": "test"},
|
||||
Error: expr.QueryError{
|
||||
RefID: "A",
|
||||
Err: errors.New("this is an error"),
|
||||
},
|
||||
State: eval.Error,
|
||||
EvaluatedAt: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationDuration: evaluationDuration,
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedStates: map[string]*state.State{
|
||||
`[["__alert_rule_namespace_uid__","test_namespace_uid"],["__alert_rule_uid__","test_alert_rule_uid_2"],["alertname","test_title"],["instance_label","test"],["label","test"]]`: {
|
||||
AlertRuleUID: "test_alert_rule_uid_2",
|
||||
OrgID: 1,
|
||||
CacheId: `[["__alert_rule_namespace_uid__","test_namespace_uid"],["__alert_rule_uid__","test_alert_rule_uid_2"],["alertname","test_title"],["instance_label","test"],["label","test"]]`,
|
||||
Labels: data.Labels{
|
||||
"__alert_rule_namespace_uid__": "test_namespace_uid",
|
||||
"__alert_rule_uid__": "test_alert_rule_uid_2",
|
||||
"alertname": "test_title",
|
||||
"label": "test",
|
||||
"instance_label": "test",
|
||||
"datasource_uid": "datasource_uid_1",
|
||||
},
|
||||
State: eval.Error,
|
||||
Error: expr.QueryError{
|
||||
RefID: "A",
|
||||
Err: errors.New("this is an error"),
|
||||
},
|
||||
Results: []state.Evaluation{
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.Error,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationDuration: evaluationDuration,
|
||||
Annotations: map[string]string{"annotation": "test", "Error": "failed to execute query A: this is an error"},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "template is correctly expanded",
|
||||
alertRule: &models.AlertRule{
|
||||
|
@ -1,10 +1,12 @@
|
||||
package state
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data"
|
||||
|
||||
"github.com/grafana/grafana/pkg/expr"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
)
|
||||
@ -87,6 +89,7 @@ func (a *State) resultAlerting(alertRule *ngModels.AlertRule, result eval.Result
|
||||
|
||||
func (a *State) resultError(alertRule *ngModels.AlertRule, result eval.Result) {
|
||||
a.Error = result.Error
|
||||
|
||||
if a.StartsAt.IsZero() {
|
||||
a.StartsAt = result.EvaluatedAt
|
||||
}
|
||||
@ -94,6 +97,23 @@ func (a *State) resultError(alertRule *ngModels.AlertRule, result eval.Result) {
|
||||
|
||||
if alertRule.ExecErrState == ngModels.AlertingErrState {
|
||||
a.State = eval.Alerting
|
||||
} else if alertRule.ExecErrState == ngModels.ErrorErrState {
|
||||
a.State = eval.Error
|
||||
|
||||
// If the evaluation failed because a query returned an error then
|
||||
// update the state with the Datasource UID as a label and the error
|
||||
// message as an annotation so other code can use this metadata to
|
||||
// add context to alerts
|
||||
var queryError expr.QueryError
|
||||
if errors.As(a.Error, &queryError) {
|
||||
for _, next := range alertRule.Data {
|
||||
if next.RefID == queryError.RefID {
|
||||
a.Labels["datasource_uid"] = next.DatasourceUID
|
||||
break
|
||||
}
|
||||
}
|
||||
a.Annotations["Error"] = queryError.Error()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -114,7 +134,7 @@ func (a *State) resultNoData(alertRule *ngModels.AlertRule, result eval.Result)
|
||||
}
|
||||
|
||||
func (a *State) NeedsSending(resendDelay time.Duration) bool {
|
||||
if a.State == eval.Pending || a.State == eval.Error || a.State == eval.Normal && !a.Resolved {
|
||||
if a.State == eval.Pending || a.State == eval.Normal && !a.Resolved {
|
||||
return false
|
||||
}
|
||||
// if LastSentAt is before or equal to LastEvaluationTime + resendDelay, send again
|
||||
|
@ -122,7 +122,7 @@ func TestNeedsSending(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "state: error, needs to be re-sent",
|
||||
expected: false,
|
||||
expected: true,
|
||||
resendDelay: 1 * time.Minute,
|
||||
testState: &State{
|
||||
State: eval.Error,
|
||||
|
@ -131,6 +131,10 @@ func (m *migration) makeAlertRule(cond condition, da dashAlert, folderUID string
|
||||
m.mg.Logger.Error("alert migration error: failed to create silence", "rule_name", ar.Title, "err", err)
|
||||
}
|
||||
|
||||
if err := m.addErrorSilence(da, ar); err != nil {
|
||||
m.mg.Logger.Error("alert migration error: failed to create silence for Error", "rule_name", ar.Title, "err", err)
|
||||
}
|
||||
|
||||
if err := m.addNoDataSilence(da, ar); err != nil {
|
||||
m.mg.Logger.Error("alert migration error: failed to create silence for NoData", "rule_name", ar.Title, "err", err)
|
||||
}
|
||||
@ -215,7 +219,9 @@ func transExecErr(s string) (string, error) {
|
||||
case "", "alerting":
|
||||
return "Alerting", nil
|
||||
case "keep_state":
|
||||
return "Alerting", nil
|
||||
// Keep last state is translated to error as we now emit a
|
||||
// DatasourceError alert when the state is error
|
||||
return "Error", nil
|
||||
}
|
||||
return "", fmt.Errorf("unrecognized Execution Error setting %v", s)
|
||||
}
|
||||
|
@ -22,6 +22,8 @@ import (
|
||||
const (
|
||||
// Should be the same as 'NoDataAlertName' in pkg/services/schedule/compat.go.
|
||||
NoDataAlertName = "DatasourceNoData"
|
||||
|
||||
ErrorAlertName = "DatasourceError"
|
||||
)
|
||||
|
||||
func (m *migration) addSilence(da dashAlert, rule *alertRule) error {
|
||||
@ -61,6 +63,45 @@ func (m *migration) addSilence(da dashAlert, rule *alertRule) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// addErrorSilence records a one-year silence for the synthetic DatasourceError
// alert of the given migrated rule. It is a no-op unless the legacy alert used
// "Keep Last State" for its execution-error handling; in that case the silence
// preserves the old behaviour (no notification on evaluation error) after the
// migration to unified alerting.
func (m *migration) addErrorSilence(da dashAlert, rule *alertRule) error {
	// Only rules that previously kept their last state on error need a silence.
	if da.ParsedSettings.ExecutionErrorState != "keep_state" {
		return nil
	}

	uid, err := uuid.NewV4()
	if err != nil {
		return errors.New("failed to create uuid for silence")
	}

	s := &pb.MeshSilence{
		Silence: &pb.Silence{
			Id: uid.String(),
			Matchers: []*pb.Matcher{
				// Match the synthetic DatasourceError alert by name...
				{
					Type:    pb.Matcher_EQUAL,
					Name:    model.AlertNameLabel,
					Pattern: ErrorAlertName,
				},
				// ...but only for this specific migrated rule.
				{
					Type:    pb.Matcher_EQUAL,
					Name:    "rule_uid",
					Pattern: rule.UID,
				},
			},
			StartsAt:  time.Now(),
			EndsAt:    time.Now().AddDate(1, 0, 0), // 1 year
			CreatedBy: "Grafana Migration",
			Comment:   fmt.Sprintf("Created during migration to unified alerting to silence Error state for alert rule ID '%s' and Title '%s' because the option 'Keep Last State' was selected for Error state", rule.UID, rule.Title),
		},
		ExpiresAt: time.Now().AddDate(1, 0, 0), // 1 year
	}
	// Silences are accumulated per organization and flushed later by the
	// migration; lazily create the org's slice on first use.
	if _, ok := m.silences[da.OrgId]; !ok {
		m.silences[da.OrgId] = make([]*pb.MeshSilence, 0)
	}
	m.silences[da.OrgId] = append(m.silences[da.OrgId], s)
	return nil
}
|
||||
|
||||
func (m *migration) addNoDataSilence(da dashAlert, rule *alertRule) error {
|
||||
if da.ParsedSettings.NoDataState != "keep_state" {
|
||||
return nil
|
||||
|
@ -6,20 +6,25 @@ import React, { FC, useMemo } from 'react';
|
||||
|
||||
type Props = Omit<SelectBaseProps<GrafanaAlertStateDecision>, 'options'> & {
|
||||
includeNoData: boolean;
|
||||
includeError: boolean;
|
||||
};
|
||||
|
||||
const options: SelectableValue[] = [
|
||||
{ value: GrafanaAlertStateDecision.Alerting, label: 'Alerting' },
|
||||
{ value: GrafanaAlertStateDecision.NoData, label: 'No Data' },
|
||||
{ value: GrafanaAlertStateDecision.OK, label: 'OK' },
|
||||
{ value: GrafanaAlertStateDecision.Error, label: 'Error' },
|
||||
];
|
||||
|
||||
export const GrafanaAlertStatePicker: FC<Props> = ({ includeNoData, ...props }) => {
|
||||
export const GrafanaAlertStatePicker: FC<Props> = ({ includeNoData, includeError, ...props }) => {
|
||||
const opts = useMemo(() => {
|
||||
if (includeNoData) {
|
||||
return options;
|
||||
if (!includeNoData) {
|
||||
return options.filter((opt) => opt.value !== GrafanaAlertStateDecision.NoData);
|
||||
}
|
||||
return options.filter((opt) => opt.value !== GrafanaAlertStateDecision.NoData);
|
||||
}, [includeNoData]);
|
||||
if (!includeError) {
|
||||
return options.filter((opt) => opt.value !== GrafanaAlertStateDecision.Error);
|
||||
}
|
||||
return options;
|
||||
}, [includeNoData, includeError]);
|
||||
return <Select menuShouldPortal options={opts} {...props} />;
|
||||
};
|
||||
|
@ -108,6 +108,7 @@ export const GrafanaConditionsStep: FC = () => {
|
||||
inputId="no-data-state-input"
|
||||
width={42}
|
||||
includeNoData={true}
|
||||
includeError={false}
|
||||
onChange={(value) => onChange(value?.value)}
|
||||
/>
|
||||
)}
|
||||
@ -122,6 +123,7 @@ export const GrafanaConditionsStep: FC = () => {
|
||||
inputId="exec-err-state-input"
|
||||
width={42}
|
||||
includeNoData={false}
|
||||
includeError={true}
|
||||
onChange={(value) => onChange(value?.value)}
|
||||
/>
|
||||
)}
|
||||
|
@ -99,6 +99,7 @@ export enum GrafanaAlertStateDecision {
|
||||
NoData = 'NoData',
|
||||
KeepLastState = 'KeepLastState',
|
||||
OK = 'OK',
|
||||
Error = 'Error',
|
||||
}
|
||||
|
||||
interface AlertDataQuery extends DataQuery {
|
||||
|
Loading…
Reference in New Issue
Block a user