From 4d8d916434d2acbd35d17f20bd58fb904d955e1f Mon Sep 17 00:00:00 2001 From: Alexander Weaver Date: Mon, 14 Oct 2024 15:31:36 -0500 Subject: [PATCH] Alerting: Separate write errors from Prometheus/Mimir into 400/500 categories (#94699) * Separate errors from Prometheus/Mimir into categories * Drop duplicate * tests --- pkg/services/ngalert/writer/prom.go | 36 ++++++++++++++++++------ pkg/services/ngalert/writer/prom_test.go | 17 +++++++++++ 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/pkg/services/ngalert/writer/prom.go b/pkg/services/ngalert/writer/prom.go index ea46b816125..5bfa688c10b 100644 --- a/pkg/services/ngalert/writer/prom.go +++ b/pkg/services/ngalert/writer/prom.go @@ -25,14 +25,18 @@ const backendType = "prometheus" const ( // Fixed error messages MimirDuplicateTimestampError = "err-mimir-sample-duplicate-timestamp" + MimirInvalidLabelError = "err-mimir-label-invalid" // Best effort error messages PrometheusDuplicateTimestampError = "duplicate sample for timestamp" ) var ( - ErrWriteFailure = errors.New("failed to write time series") - ErrBadFrame = errors.New("failed to read dataframe") + // Unexpected, 500-like write errors. + ErrUnexpectedWriteFailure = errors.New("failed to write time series") + // Expected, user-level write errors like trying to write an invalid series. + ErrRejectedWrite = errors.New("series was rejected") + ErrBadFrame = errors.New("failed to read dataframe") ) var DuplicateTimestampErrors = [...]string{ @@ -213,10 +217,12 @@ func (w PrometheusWriter) Write(ctx context.Context, name string, t time.Time, f lvs = append(lvs, fmt.Sprint(res.StatusCode)) w.metrics.WritesTotal.WithLabelValues(lvs...).Inc() - if err, ignored := checkWriteError(writeErr); err != nil { - return errors.Join(ErrWriteFailure, err) - } else if ignored { - l.Debug("Ignored write error", "error", err, "status_code", res.StatusCode) + if writeErr != nil { + if err, ignored := checkWriteError(writeErr); err != nil { + return err + } else if ignored { + l.Debug("Ignored write error", "error", err, "status_code", res.StatusCode) + } } return nil @@ -242,7 +248,12 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) { return nil, false } - // special case for 400 status code + // All 500-range statuses are automatically unexpected and not the fault of the data. + if writeErr.StatusCode()/100 == 5 { + return errors.Join(ErrUnexpectedWriteFailure, writeErr), false + } + + // Special case for 400 status code. 400s may be ignorable in the event of HA writers, or the fault of the written data. if writeErr.StatusCode() == 400 { msg := writeErr.Error() // HA may potentially write different values for the same timestamp, so we ignore this error @@ -252,7 +263,16 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) { return nil, true } } + + if strings.Contains(msg, MimirInvalidLabelError) { + return errors.Join(ErrRejectedWrite, writeErr), false + } + + // For now, all 400s that are not previously known are considered unexpected. + // TODO: Consider blanket-converting all 400s to be known errors. This should only be done once we are confident this is not a problem with this client. + return errors.Join(ErrUnexpectedWriteFailure, writeErr), false } - return writeErr, false + // All other errors which do not fit into the above categories are also unexpected. + return errors.Join(ErrUnexpectedWriteFailure, writeErr), false } diff --git a/pkg/services/ngalert/writer/prom_test.go b/pkg/services/ngalert/writer/prom_test.go index 3aa49d85831..43297948354 100644 --- a/pkg/services/ngalert/writer/prom_test.go +++ b/pkg/services/ngalert/writer/prom_test.go @@ -166,6 +166,7 @@ func TestPrometheusWriter_Write(t *testing.T) { err := writer.Write(ctx, "test", now, frames, 1, map[string]string{}) require.Error(t, err) require.ErrorIs(t, err, clientErr) + require.ErrorIs(t, err, ErrUnexpectedWriteFailure) }) t.Run("writes expected points", func(t *testing.T) { @@ -204,6 +205,22 @@ func TestPrometheusWriter_Write(t *testing.T) { }) } }) + + t.Run("bad labels fit under the client error category", func(t *testing.T) { + msg := MimirInvalidLabelError + clientErr := testClientWriteError{ + statusCode: http.StatusBadRequest, + msg: &msg, + } + client.writeSeriesFunc = func(ctx context.Context, ts promremote.TSList, opts promremote.WriteOptions) (promremote.WriteResult, promremote.WriteError) { + return promremote.WriteResult{}, clientErr + } + + err := writer.Write(ctx, "test", now, frames, 1, map[string]string{"extra": "label"}) + + require.Error(t, err) + require.ErrorIs(t, err, ErrRejectedWrite) + }) } func extractValue(t *testing.T, frames data.Frames, labels map[string]string, frameType data.FrameType) float64 {