Alerting: Expose Prometheus metrics for persisting state history (#63157)

* Create historian metrics and inject them as a dependency

* Record counter for total number of state transitions logged

* Track write failures

* Track current number of active write goroutines

* Record histogram of how long it takes to write history data

* Don't copy the registerer

* Adjust naming of write failures metric

* Introduce WritesTotal to complement WritesFailedTotal

* Measure TransitionsFailedTotal to complement TransitionsTotal

* Rename all to state_history

* Remove redundant Total suffix

* Increment totals all the time, not just on success

* Drop ActiveWriteGoroutines

* Drop PersistDuration in favor of WriteDuration

* Drop unused gauge

* Make writes and writesFailed per org

* Add metric indicating backend and a spot for future metadata

* Drop _batch_ from names and update help

* Add metric for bytes written

* Better pairing of total + failure metric updates

* Few tweaks to wording and naming

* Record info metric during composition

* Create fakeRequester and simple happy path test using it

* Blocking test for the full historian and test for happy path metrics

* Add tests for failure case metrics

* Smoke test for full annotation persistence

* Create test for metrics on annotation persistence, both happy and failing paths

* Address linter complaints

* More linter complaints

* Remove unnecessary whitespace

* Consistency improvements to help texts

* Update tests to match new metric descriptions
This commit is contained in:
Alexander Weaver
2023-03-06 10:40:37 -06:00
committed by GitHub
parent 5422f7cf56
commit 19d01dff91
11 changed files with 368 additions and 21 deletions

View File

@@ -7,17 +7,59 @@ import (
)
type Historian struct {
WriteDuration *instrument.HistogramCollector
Info *prometheus.GaugeVec
TransitionsTotal *prometheus.CounterVec
TransitionsFailed *prometheus.CounterVec
WritesTotal *prometheus.CounterVec
WritesFailed *prometheus.CounterVec
WriteDuration *instrument.HistogramCollector
BytesWritten prometheus.Counter
}
// NewHistorianMetrics creates the state history collectors and registers them
// with r through promauto. Every metric shares the package-level Namespace and
// Subsystem and uses the "state_history_" name prefix.
func NewHistorianMetrics(r prometheus.Registerer) *Historian {
	// Build the promauto factory once rather than once per metric.
	factory := promauto.With(r)

	return &Historian{
		Info: factory.NewGaugeVec(prometheus.GaugeOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "state_history_info",
			Help:      "Information about the state history store.",
		}, []string{"backend"}),
		TransitionsTotal: factory.NewCounterVec(prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "state_history_transitions_total",
			Help:      "The total number of state transitions processed.",
		}, []string{"org"}),
		TransitionsFailed: factory.NewCounterVec(prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "state_history_transitions_failed_total",
			Help:      "The total number of state transitions that failed to be written - they are not retried.",
		}, []string{"org"}),
		WritesTotal: factory.NewCounterVec(prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "state_history_writes_total",
			Help:      "The total number of state history batches that were attempted to be written.",
		}, []string{"org"}),
		WritesFailed: factory.NewCounterVec(prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "state_history_writes_failed_total",
			Help:      "The total number of failed writes of state history batches.",
		}, []string{"org"}),
		WriteDuration: instrument.NewHistogramCollector(factory.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "state_history_request_duration_seconds",
			// Help is set exactly once; a repeated key in a composite literal does not compile.
			Help:    "Histogram of request durations to the state history store. Only valid when using external stores.",
			Buckets: instrument.DefBuckets,
		}, instrument.HistogramCollectorBuckets)),
		BytesWritten: factory.NewCounter(prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "state_history_writes_bytes_total",
			Help:      "The total number of bytes sent within a batch to the state history store. Only valid when using the Loki store.",
		}),
	}
}