mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Separate write errors from Prometheus/Mimir into 400/500 categories (#94699)
* Separate errors from Prometheus/Mimir into categories * Drop duplicate * tests
This commit is contained in:
parent
a8c1c15235
commit
4d8d916434
@ -25,14 +25,18 @@ const backendType = "prometheus"
|
|||||||
const (
|
const (
|
||||||
// Fixed error messages
|
// Fixed error messages
|
||||||
MimirDuplicateTimestampError = "err-mimir-sample-duplicate-timestamp"
|
MimirDuplicateTimestampError = "err-mimir-sample-duplicate-timestamp"
|
||||||
|
MimirInvalidLabelError = "err-mimir-label-invalid"
|
||||||
|
|
||||||
// Best effort error messages
|
// Best effort error messages
|
||||||
PrometheusDuplicateTimestampError = "duplicate sample for timestamp"
|
PrometheusDuplicateTimestampError = "duplicate sample for timestamp"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
ErrWriteFailure = errors.New("failed to write time series")
|
// Unexpected, 500-like write errors.
|
||||||
ErrBadFrame = errors.New("failed to read dataframe")
|
ErrUnexpectedWriteFailure = errors.New("failed to write time series")
|
||||||
|
// Expected, user-level write errors like trying to write an invalid series.
|
||||||
|
ErrRejectedWrite = errors.New("series was rejected")
|
||||||
|
ErrBadFrame = errors.New("failed to read dataframe")
|
||||||
)
|
)
|
||||||
|
|
||||||
var DuplicateTimestampErrors = [...]string{
|
var DuplicateTimestampErrors = [...]string{
|
||||||
@ -213,10 +217,12 @@ func (w PrometheusWriter) Write(ctx context.Context, name string, t time.Time, f
|
|||||||
lvs = append(lvs, fmt.Sprint(res.StatusCode))
|
lvs = append(lvs, fmt.Sprint(res.StatusCode))
|
||||||
w.metrics.WritesTotal.WithLabelValues(lvs...).Inc()
|
w.metrics.WritesTotal.WithLabelValues(lvs...).Inc()
|
||||||
|
|
||||||
if err, ignored := checkWriteError(writeErr); err != nil {
|
if writeErr != nil {
|
||||||
return errors.Join(ErrWriteFailure, err)
|
if err, ignored := checkWriteError(writeErr); err != nil {
|
||||||
} else if ignored {
|
return err
|
||||||
l.Debug("Ignored write error", "error", err, "status_code", res.StatusCode)
|
} else if ignored {
|
||||||
|
l.Debug("Ignored write error", "error", err, "status_code", res.StatusCode)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
@ -242,7 +248,12 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
|
|||||||
return nil, false
|
return nil, false
|
||||||
}
|
}
|
||||||
|
|
||||||
// special case for 400 status code
|
// All 500-range statuses are automatically unexpected and not the fault of the data.
|
||||||
|
if writeErr.StatusCode()/100 == 5 {
|
||||||
|
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
|
||||||
|
}
|
||||||
|
|
||||||
|
// Special case for 400 status code. 400s may be ignorable in the event of HA writers, or the fault of the written data.
|
||||||
if writeErr.StatusCode() == 400 {
|
if writeErr.StatusCode() == 400 {
|
||||||
msg := writeErr.Error()
|
msg := writeErr.Error()
|
||||||
// HA may potentially write different values for the same timestamp, so we ignore this error
|
// HA may potentially write different values for the same timestamp, so we ignore this error
|
||||||
@ -252,7 +263,16 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
|
|||||||
return nil, true
|
return nil, true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if strings.Contains(msg, MimirInvalidLabelError) {
|
||||||
|
return errors.Join(ErrRejectedWrite, writeErr), false
|
||||||
|
}
|
||||||
|
|
||||||
|
// For now, all 400s that are not previously known are considered unexpected.
|
||||||
|
// TODO: Consider blanket-converting all 400s to be known errors. This should only be done once we are confident this is not a problem with this client.
|
||||||
|
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
|
||||||
}
|
}
|
||||||
|
|
||||||
return writeErr, false
|
// All other errors which do not fit into the above categories are also unexpected.
|
||||||
|
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
|
||||||
}
|
}
|
||||||
|
@ -166,6 +166,7 @@ func TestPrometheusWriter_Write(t *testing.T) {
|
|||||||
err := writer.Write(ctx, "test", now, frames, 1, map[string]string{})
|
err := writer.Write(ctx, "test", now, frames, 1, map[string]string{})
|
||||||
require.Error(t, err)
|
require.Error(t, err)
|
||||||
require.ErrorIs(t, err, clientErr)
|
require.ErrorIs(t, err, clientErr)
|
||||||
|
require.ErrorIs(t, err, ErrUnexpectedWriteFailure)
|
||||||
})
|
})
|
||||||
|
|
||||||
t.Run("writes expected points", func(t *testing.T) {
|
t.Run("writes expected points", func(t *testing.T) {
|
||||||
@ -204,6 +205,22 @@ func TestPrometheusWriter_Write(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("bad labels fit under the client error category", func(t *testing.T) {
|
||||||
|
msg := MimirInvalidLabelError
|
||||||
|
clientErr := testClientWriteError{
|
||||||
|
statusCode: http.StatusBadRequest,
|
||||||
|
msg: &msg,
|
||||||
|
}
|
||||||
|
client.writeSeriesFunc = func(ctx context.Context, ts promremote.TSList, opts promremote.WriteOptions) (promremote.WriteResult, promremote.WriteError) {
|
||||||
|
return promremote.WriteResult{}, clientErr
|
||||||
|
}
|
||||||
|
|
||||||
|
err := writer.Write(ctx, "test", now, frames, 1, map[string]string{"extra": "label"})
|
||||||
|
|
||||||
|
require.Error(t, err)
|
||||||
|
require.ErrorIs(t, err, ErrRejectedWrite)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractValue(t *testing.T, frames data.Frames, labels map[string]string, frameType data.FrameType) float64 {
|
func extractValue(t *testing.T, frames data.Frames, labels map[string]string, frameType data.FrameType) float64 {
|
||||||
|
Loading…
Reference in New Issue
Block a user