Alerting: Optimize rule status gathering APIs when a limit is applied. (#86568)

* Alerting: Optimize rule status gathering APIs when a limit is applied.

The frontend very commonly calls the `/rules` API with `limit_alerts=16`. When
there are a very large number of alert instances present, this API is quite
slow to respond, and profiling suggests that a big part of the problem is
sorting the alerts by importance, in order to select the first 16.

This changes the application of the limit to use a more efficient heap-based
top-k algorithm. This maintains a slice of only the highest ranked items whilst
iterating the full set of alert instances, which substantially reduces the
number of comparisons needed. This is particularly effective, as the
`AlertsByImportance` comparison is quite complex.

I've included a benchmark to compare the new TopK function to the existing
Sort/limit strategy. It shows that for small limits, the new approach is
much faster, especially at high numbers of alerts, e.g.

100K alerts / limit 16: 1.91s vs 0.02s (-99%)

For situations where there is no effective limit, sorting is marginally faster,
therefore in the API implementation, if there is either a) no limit or b) no
effective limit, then we just sort the alerts as before. There is also a space
overhead using a heap which would matter for large limits.

* Remove commented test cases

* Make linter happy
This commit is contained in:
Steve Simpson 2024-04-19 11:51:22 +02:00 committed by GitHub
parent 5a8384a245
commit 73873f5a8a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 212 additions and 2 deletions

View File

@ -408,10 +408,14 @@ func (srv PrometheusSrv) toRuleGroup(groupKey ngmodels.AlertRuleGroupKey, folder
rulesTotals[newRule.Health] += 1
}
apimodels.AlertsBy(apimodels.AlertsByImportance).Sort(alertingRule.Alerts)
alertsBy := apimodels.AlertsBy(apimodels.AlertsByImportance)
if limitAlerts > -1 && int64(len(alertingRule.Alerts)) > limitAlerts {
alertingRule.Alerts = alertingRule.Alerts[0:limitAlerts]
alertingRule.Alerts = alertsBy.TopK(alertingRule.Alerts, int(limitAlerts))
} else {
// If there is no effective limit, then just sort the alerts.
// For large numbers of alerts, this can be faster.
alertsBy.Sort(alertingRule.Alerts)
}
alertingRule.Rule = newRule

View File

@ -1,6 +1,7 @@
package definitions
import (
"container/heap"
"fmt"
"sort"
"strings"
@ -206,6 +207,71 @@ func (by AlertsBy) Sort(alerts []Alert) {
sort.Sort(AlertsSorter{alerts: alerts, by: by})
}
// AlertsHeap extends AlertsSorter for use with container/heap functions.
// It embeds AlertsSorter, which is already usable with sort.Sort, and adds
// the Push and Pop methods on *AlertsHeap that operate on the end of the
// embedded alerts slice.
type AlertsHeap struct {
AlertsSorter
}
// Push appends x to the end of the underlying alerts slice. x must hold an
// Alert value; any other dynamic type makes the type assertion panic.
func (h *AlertsHeap) Push(x any) {
	alert := x.(Alert)
	h.alerts = append(h.alerts, alert)
}
// Pop removes the last element of the underlying alerts slice and returns it.
// The slice is shortened by one; its backing array is kept.
func (h *AlertsHeap) Pop() any {
	last := len(h.alerts) - 1
	popped := h.alerts[last]
	h.alerts = h.alerts[:last]
	return popped
}
// TopK returns the k highest-ranked elements of alerts according to by,
// ordered from highest to lowest. It does not modify the input.
//
// If k is less than one, an empty (non-nil) slice is returned. If k is at
// least len(alerts), every alert is returned in ranked order.
func (by AlertsBy) TopK(alerts []Alert, k int) []Alert {
	// Nothing can be selected for a non-positive k. Without this guard the
	// loop below would index the root of an empty heap (k == 0) or make
	// would be asked for a negative capacity (k < 0); both panic.
	// The result is non-nil so that it still encodes as an empty list.
	if k < 1 {
		return []Alert{}
	}

	// Concept is that instead of sorting the whole list and taking the number
	// of items we need, maintain a heap of the top k elements, and update it
	// for each element. This vastly reduces the number of comparisons needed,
	// which is important for sorting alerts, as the comparison function is
	// very expensive.

	// The heap never holds more than min(k, len(alerts)) elements, so do not
	// reserve more than that when k is larger than the input.
	capacity := k
	if len(alerts) < capacity {
		capacity = len(alerts)
	}

	// The heap must be in ascending order, so that the root of the heap is
	// the current smallest element.
	byAscending := func(a1, a2 *Alert) bool { return by(a2, a1) }
	h := AlertsHeap{
		AlertsSorter: AlertsSorter{
			alerts: make([]Alert, 0, capacity),
			by:     byAscending,
		},
	}

	// Go version of this algorithm taken from Prometheus (promql/engine.go)
	for i := range alerts {
		// We build a heap of up to k elements, with the smallest element at heap[0].
		switch {
		case len(h.alerts) < k:
			heap.Push(&h, alerts[i])
		case h.by(&h.alerts[0], &alerts[i]):
			// This new element is bigger than the previous smallest element - overwrite that.
			h.alerts[0] = alerts[i]
			// Maintain the heap invariant. A single-element heap is trivially valid.
			if k > 1 {
				heap.Fix(&h, 0)
			}
		}
	}

	// The heap keeps the lowest value on top, so reverse it.
	if len(h.alerts) > 1 {
		sort.Sort(sort.Reverse(&h))
	}
	return h.alerts
}
// AlertsByImportance orders alerts by importance. An alert is more important
// than another alert if its status has higher importance. For example, "alerting"
// is more important than "normal". If two alerts have the same importance

View File

@ -0,0 +1,5 @@
#!/bin/bash
# Runs BenchmarkSortAlertsByImportance once with the "sort" strategy and once
# with the "heap" strategy, saves each run's output, and compares the two with
# benchstat.

# Stop at the first failure. pipefail is needed because each go test run is
# piped through tee, which would otherwise hide a failing benchmark and let
# benchstat compare incomplete results.
set -euo pipefail

# -run=^# is a pattern that is not expected to match any test name, so only
# the benchmark is executed.
go test -v -run=^# -bench BenchmarkSortAlertsByImportance -count 5 -topk sort | tee before.txt
go test -v -run=^# -bench BenchmarkSortAlertsByImportance -count 5 -topk heap | tee after.txt
benchstat before.txt after.txt

View File

@ -0,0 +1,87 @@
package definitions
import (
"flag"
"fmt"
"math/rand"
"testing"
)
// Command-line flags controlling the top-k benchmark.
var (
	// topkStrategy selects which implementation is benchmarked.
	topkStrategy = flag.String("topk", "heap", "topk strategy to benchmark. choices: sort, heap")
	// showComparisons enables printing how many comparisons each run made.
	showComparisons = flag.Bool("show-comparisons", false, "whether to show the number of comparisons made")
)
// makeAlerts builds amount synthetic alerts for benchmarking and returns them
// shuffled with a fixed seed, so every call yields the same ordering.
func makeAlerts(amount int) []Alert {
	const (
		// A typical distribution of alert states is that most are Normal
		// and a few are Alerting, so we assume 99% Normal and 1% Alerting.
		alertingPercent = 1
		// Series will commonly have many labels.
		labelCount = 10
	)

	out := make([]Alert, amount)
	for idx := range out {
		// Position within each run of 100 alerts; drives both the label
		// values and which alerts are marked as alerting.
		bucket := idx % 100

		labels := make(map[string]string)
		for n := 0; n < labelCount; n++ {
			labels[fmt.Sprintf("label_%d", n)] = fmt.Sprintf("label_%d_value_%d", n, bucket)
		}
		out[idx].Labels = labels

		out[idx].State = "normal"
		if bucket < alertingPercent {
			// Should populate ActiveAt because this prevents needing label comparison
			out[idx].State = "alerting"
		}
	}

	// Shuffle in a repeatable order to avoid any bias from the initial ordering.
	shuffler := rand.New(rand.NewSource(1))
	shuffler.Shuffle(len(out), func(a, b int) { out[a], out[b] = out[b], out[a] })
	return out
}
// BenchmarkSortAlertsByImportance measures selecting the top k alerts by
// importance for several input sizes (n) and limits (k). The implementation
// under test is chosen with the -topk flag: "sort" sorts the whole slice and
// then slices off the first k, "heap" calls AlertsBy.TopK.
//
// Only the selection itself is timed; generating the alerts and optionally
// printing the comparison count happen while the timer is stopped.
func BenchmarkSortAlertsByImportance(b *testing.B) {
	var topkFunc func(AlertsBy, []Alert, int)

	switch *topkStrategy {
	case "sort":
		topkFunc = func(by AlertsBy, alerts []Alert, limit int) {
			by.Sort(alerts)
			if len(alerts) > limit {
				_ = alerts[0:limit]
			}
		}
	case "heap":
		topkFunc = func(by AlertsBy, alerts []Alert, limit int) {
			_ = by.TopK(alerts, limit)
		}
	default:
		// Fail with a clear message instead of calling a nil function below.
		b.Fatalf("unknown topk strategy %q, choices: sort, heap", *topkStrategy)
	}

	for _, n := range []int{1000, 10000, 100000} {
		for _, k := range []int{16, 100, 1000, 100000} {
			b.Run(fmt.Sprintf("n_%d_k_%d", n, k), func(b *testing.B) {
				b.StopTimer()
				for bi := 0; bi < b.N; bi++ {
					// Fresh input every iteration: the "sort" strategy
					// sorts the slice it is given in place.
					alerts := makeAlerts(n)

					// Wrap the comparison so the number of calls can be reported.
					comparisons := 0
					by := func(a1, a2 *Alert) bool {
						comparisons++
						return AlertsByImportance(a1, a2)
					}

					b.StartTimer()
					topkFunc(by, alerts, k)
					b.StopTimer()

					if *showComparisons {
						fmt.Printf("Number of comparisons (strategy: %s): %d\n", *topkStrategy, comparisons)
					}
				}
			})
		}
	}
}

View File

@ -64,3 +64,51 @@ func TestSortAlertsByImportance(t *testing.T) {
})
}
}
// TestTopKAlertsByImportance checks that TopK with AlertsByImportance returns
// at most k alerts, most important first, for k from 1 up to one more than
// the number of input alerts.
func TestTopKAlertsByImportance(t *testing.T) {
	// withStates builds one alert per given state, keeping the argument order.
	withStates := func(states ...string) []Alert {
		built := make([]Alert, 0, len(states))
		for _, state := range states {
			built = append(built, Alert{State: state})
		}
		return built
	}

	type testCase struct {
		name     string
		k        int
		input    []Alert
		expected []Alert
	}

	// Every case uses the same five states, listed in the reverse of the
	// expected output order.
	cases := []testCase{
		{
			name:     "alerts are ordered in expected importance (k=1)",
			k:        1,
			input:    withStates("normal", "nodata", "error", "pending", "alerting"),
			expected: withStates("alerting"),
		},
		{
			name:     "alerts are ordered in expected importance (k=2)",
			k:        2,
			input:    withStates("normal", "nodata", "error", "pending", "alerting"),
			expected: withStates("alerting", "pending"),
		},
		{
			name:     "alerts are ordered in expected importance (k=3)",
			k:        3,
			input:    withStates("normal", "nodata", "error", "pending", "alerting"),
			expected: withStates("alerting", "pending", "error"),
		},
		{
			name:     "alerts are ordered in expected importance (k=4)",
			k:        4,
			input:    withStates("normal", "nodata", "error", "pending", "alerting"),
			expected: withStates("alerting", "pending", "error", "nodata"),
		},
		{
			name:     "alerts are ordered in expected importance (k=5)",
			k:        5,
			input:    withStates("normal", "nodata", "error", "pending", "alerting"),
			expected: withStates("alerting", "pending", "error", "nodata", "normal"),
		},
		{
			// k larger than the input: everything is returned.
			name:     "alerts are ordered in expected importance (k=6)",
			k:        6,
			input:    withStates("normal", "nodata", "error", "pending", "alerting"),
			expected: withStates("alerting", "pending", "error", "nodata", "normal"),
		},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			got := AlertsBy(AlertsByImportance).TopK(c.input, c.k)
			assert.EqualValues(t, c.expected, got)
		})
	}
}