mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Separate write errors from Prometheus/Mimir into 400/500 categories (#94699)
* Separate errors from Prometheus/Mimir into categories * Drop duplicate * tests
This commit is contained in:
parent
a8c1c15235
commit
4d8d916434
@ -25,14 +25,18 @@ const backendType = "prometheus"
|
||||
const (
|
||||
// Fixed error messages
|
||||
MimirDuplicateTimestampError = "err-mimir-sample-duplicate-timestamp"
|
||||
MimirInvalidLabelError = "err-mimir-label-invalid"
|
||||
|
||||
// Best effort error messages
|
||||
PrometheusDuplicateTimestampError = "duplicate sample for timestamp"
|
||||
)
|
||||
|
||||
var (
|
||||
ErrWriteFailure = errors.New("failed to write time series")
|
||||
ErrBadFrame = errors.New("failed to read dataframe")
|
||||
// Unexpected, 500-like write errors.
|
||||
ErrUnexpectedWriteFailure = errors.New("failed to write time series")
|
||||
// Expected, user-level write errors like trying to write an invalid series.
|
||||
ErrRejectedWrite = errors.New("series was rejected")
|
||||
ErrBadFrame = errors.New("failed to read dataframe")
|
||||
)
|
||||
|
||||
var DuplicateTimestampErrors = [...]string{
|
||||
@ -213,10 +217,12 @@ func (w PrometheusWriter) Write(ctx context.Context, name string, t time.Time, f
|
||||
lvs = append(lvs, fmt.Sprint(res.StatusCode))
|
||||
w.metrics.WritesTotal.WithLabelValues(lvs...).Inc()
|
||||
|
||||
if err, ignored := checkWriteError(writeErr); err != nil {
|
||||
return errors.Join(ErrWriteFailure, err)
|
||||
} else if ignored {
|
||||
l.Debug("Ignored write error", "error", err, "status_code", res.StatusCode)
|
||||
if writeErr != nil {
|
||||
if err, ignored := checkWriteError(writeErr); err != nil {
|
||||
return err
|
||||
} else if ignored {
|
||||
l.Debug("Ignored write error", "error", err, "status_code", res.StatusCode)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
@ -242,7 +248,12 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// special case for 400 status code
|
||||
// All 500-range statuses are automatically unexpected and not the fault of the data.
|
||||
if writeErr.StatusCode()/100 == 5 {
|
||||
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
|
||||
}
|
||||
|
||||
// Special case for 400 status code. 400s may be ignorable in the event of HA writers, or the fault of the written data.
|
||||
if writeErr.StatusCode() == 400 {
|
||||
msg := writeErr.Error()
|
||||
// HA may potentially write different values for the same timestamp, so we ignore this error
|
||||
@ -252,7 +263,16 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
|
||||
return nil, true
|
||||
}
|
||||
}
|
||||
|
||||
if strings.Contains(msg, MimirInvalidLabelError) {
|
||||
return errors.Join(ErrRejectedWrite, writeErr), false
|
||||
}
|
||||
|
||||
// For now, all 400s that are not previously known are considered unexpected.
|
||||
// TODO: Consider blanket-converting all 400s to be known errors. This should only be done once we are confident this is not a problem with this client.
|
||||
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
|
||||
}
|
||||
|
||||
return writeErr, false
|
||||
// All other errors which do not fit into the above categories are also unexpected.
|
||||
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
|
||||
}
|
||||
|
@ -166,6 +166,7 @@ func TestPrometheusWriter_Write(t *testing.T) {
|
||||
err := writer.Write(ctx, "test", now, frames, 1, map[string]string{})
|
||||
require.Error(t, err)
|
||||
require.ErrorIs(t, err, clientErr)
|
||||
require.ErrorIs(t, err, ErrUnexpectedWriteFailure)
|
||||
})
|
||||
|
||||
t.Run("writes expected points", func(t *testing.T) {
|
||||
@ -204,6 +205,22 @@ func TestPrometheusWriter_Write(t *testing.T) {
|
||||
})
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("bad labels fit under the client error category", func(t *testing.T) {
|
||||
msg := MimirInvalidLabelError
|
||||
clientErr := testClientWriteError{
|
||||
statusCode: http.StatusBadRequest,
|
||||
msg: &msg,
|
||||
}
|
||||
client.writeSeriesFunc = func(ctx context.Context, ts promremote.TSList, opts promremote.WriteOptions) (promremote.WriteResult, promremote.WriteError) {
|
||||
return promremote.WriteResult{}, clientErr
|
||||
}
|
||||
|
||||
err := writer.Write(ctx, "test", now, frames, 1, map[string]string{"extra": "label"})
|
||||
|
||||
require.Error(t, err)
|
||||
require.ErrorIs(t, err, ErrRejectedWrite)
|
||||
})
|
||||
}
|
||||
|
||||
func extractValue(t *testing.T, frames data.Frames, labels map[string]string, frameType data.FrameType) float64 {
|
||||
|
Loading…
Reference in New Issue
Block a user