diff --git a/pkg/services/ngalert/api/api_prometheus.go b/pkg/services/ngalert/api/api_prometheus.go index 883dcf57724..5c1211f659d 100644 --- a/pkg/services/ngalert/api/api_prometheus.go +++ b/pkg/services/ngalert/api/api_prometheus.go @@ -408,10 +408,14 @@ func (srv PrometheusSrv) toRuleGroup(groupKey ngmodels.AlertRuleGroupKey, folder rulesTotals[newRule.Health] += 1 } - apimodels.AlertsBy(apimodels.AlertsByImportance).Sort(alertingRule.Alerts) + alertsBy := apimodels.AlertsBy(apimodels.AlertsByImportance) if limitAlerts > -1 && int64(len(alertingRule.Alerts)) > limitAlerts { - alertingRule.Alerts = alertingRule.Alerts[0:limitAlerts] + alertingRule.Alerts = alertsBy.TopK(alertingRule.Alerts, int(limitAlerts)) + } else { + // If there is no effective limit, then just sort the alerts. + // For large numbers of alerts, this can be faster. + alertsBy.Sort(alertingRule.Alerts) } alertingRule.Rule = newRule diff --git a/pkg/services/ngalert/api/tooling/definitions/prom.go b/pkg/services/ngalert/api/tooling/definitions/prom.go index 43ae592d329..c97d94d770d 100644 --- a/pkg/services/ngalert/api/tooling/definitions/prom.go +++ b/pkg/services/ngalert/api/tooling/definitions/prom.go @@ -1,6 +1,7 @@ package definitions import ( + "container/heap" "fmt" "sort" "strings" @@ -206,6 +207,71 @@ func (by AlertsBy) Sort(alerts []Alert) { sort.Sort(AlertsSorter{alerts: alerts, by: by}) } +// AlertsHeap extends AlertsSorter for use with container/heap functions. +// It embeds AlertsSorter (the type Sort passes to sort.Sort) and adds the +// Push and Pop methods that container/heap additionally requires. +type AlertsHeap struct { + AlertsSorter +} + +// Push appends x to the end of the alerts slice. x must be an Alert; any +// other dynamic type makes the type assertion panic. +func (h *AlertsHeap) Push(x any) { + h.alerts = append(h.alerts, x.(Alert)) +} + +// Pop removes the last element of the alerts slice and returns it. It indexes +// old[n-1] without a length check, so the slice must be non-empty. +func (h *AlertsHeap) Pop() any { + old := h.alerts + n := len(old) + x := old[n-1] + h.alerts = old[0 : n-1] + return x +} + +// TopK returns the highest k elements. It does not modify the input. 
+func (by AlertsBy) TopK(alerts []Alert, k int) []Alert { + // Concept is that instead of sorting the whole list and taking the number + // of items we need, maintain a heap of the top k elements, and update it + // for each element. This vastly reduces the number of comparisons needed, + // which is important for sorting alerts, as the comparison function is + // very expensive. + + // The heap must be in ascending order, so that the root of the heap is + // the current smallest element. + byAscending := func(a1, a2 *Alert) bool { return by(a2, a1) } + + h := AlertsHeap{ + AlertsSorter: AlertsSorter{ + alerts: make([]Alert, 0, k), + by: byAscending, + }, + } + + // Go version of this algorithm taken from Prometheus (promql/engine.go) + + heap.Init(&h) + for i := 0; i < len(alerts); i++ { + a := alerts[i] + + // We build a heap of up to k elements, with the smallest element at heap[0]. + switch { + case len(h.alerts) < k: + heap.Push(&h, a) + + case h.by(&h.alerts[0], &a): + // This new element is bigger than the previous smallest element - overwrite that. + h.alerts[0] = a + // Maintain the heap invariant. + if k > 1 { + heap.Fix(&h, 0) + } + } + } + + // The heap keeps the lowest value on top, so reverse it. + if len(h.alerts) > 1 { + sort.Sort(sort.Reverse(&h)) + } + + return h.alerts +} + // AlertsByImportance orders alerts by importance. An alert is more important // than another alert if its status has higher importance. For example, "alerting" // is more important than "normal". 
If two alerts have the same importance diff --git a/pkg/services/ngalert/api/tooling/definitions/prom_bench.sh b/pkg/services/ngalert/api/tooling/definitions/prom_bench.sh new file mode 100755 index 00000000000..fadf371a29a --- /dev/null +++ b/pkg/services/ngalert/api/tooling/definitions/prom_bench.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +go test -v -run=^# -bench BenchmarkSortAlertsByImportance -count 5 -topk sort | tee before.txt +go test -v -run=^# -bench BenchmarkSortAlertsByImportance -count 5 -topk heap | tee after.txt +benchstat before.txt after.txt diff --git a/pkg/services/ngalert/api/tooling/definitions/prom_bench_test.go b/pkg/services/ngalert/api/tooling/definitions/prom_bench_test.go new file mode 100644 index 00000000000..64a051751cc --- /dev/null +++ b/pkg/services/ngalert/api/tooling/definitions/prom_bench_test.go @@ -0,0 +1,87 @@ +package definitions + +import ( + "flag" + "fmt" + "math/rand" + "testing" +) + +var topkStrategy = flag.String("topk", "heap", "topk strategy to benchmark. choices: sort, heap") +var showComparisons = flag.Bool("show-comparisons", false, "whether to show the number of comparisons made") + +func makeAlerts(amount int) []Alert { + // A typical distribution of alert states is that most are Normal + // and a few are Alerting, so we assume 99% Normal and 1% Alerting. + percentAlerting := 1 + + // Series will commonly have many labels. + numLabels := 10 + + alerts := make([]Alert, amount) + + for i := 0; i < len(alerts); i++ { + alerts[i].Labels = make(map[string]string) + // Label values depend on i%100, so they repeat every 100 alerts. + for label := 0; label < numLabels; label++ { + alerts[i].Labels[fmt.Sprintf("label_%d", label)] = fmt.Sprintf("label_%d_value_%d", label, i%100) + } + + if i%100 < percentAlerting { + alerts[i].State = "alerting" + // TODO: populate ActiveAt here; per the original note this prevents needing label comparison (NOTE(review): confirm against AlertsByImportance, which is not visible here). + } else { + alerts[i].State = "normal" + } + } + + // Shuffle with a fixed seed, so the order is repeatable, to avoid any bias from the initial ordering. 
+ r := rand.New(rand.NewSource(1)) + r.Shuffle(len(alerts), func(i, j int) { alerts[i], alerts[j] = alerts[j], alerts[i] }) + + return alerts +} + +func BenchmarkSortAlertsByImportance(b *testing.B) { + var topkFunc func(AlertsBy, []Alert, int) + + switch *topkStrategy { + case "sort": + topkFunc = func(by AlertsBy, alerts []Alert, limit int) { + by.Sort(alerts) + if len(alerts) > limit { + _ = alerts[0:limit] + } + } + + case "heap": + topkFunc = func(by AlertsBy, alerts []Alert, limit int) { + _ = by.TopK(alerts, limit) + } + } + + for _, n := range []int{1000, 10000, 100000} { + for _, k := range []int{16, 100, 1000, 100000} { + b.Run(fmt.Sprintf("n_%d_k_%d", n, k), func(b *testing.B) { + b.StopTimer() + + for bi := 0; bi < b.N; bi++ { + alerts := makeAlerts(n) + + comparisons := 0 + by := func(a1, a2 *Alert) bool { + comparisons++ + return AlertsByImportance(a1, a2) + } + + b.StartTimer() + topkFunc(by, alerts, k) + b.StopTimer() + + if *showComparisons { + fmt.Printf("Number of comparisons (strategy: %s): %d\n", *topkStrategy, comparisons) + } + } + }) + } + } +} diff --git a/pkg/services/ngalert/api/tooling/definitions/prom_test.go b/pkg/services/ngalert/api/tooling/definitions/prom_test.go index 107eedcb5c2..5546cd374f1 100644 --- a/pkg/services/ngalert/api/tooling/definitions/prom_test.go +++ b/pkg/services/ngalert/api/tooling/definitions/prom_test.go @@ -64,3 +64,51 @@ func TestSortAlertsByImportance(t *testing.T) { }) } } + +func TestTopKAlertsByImportance(t *testing.T) { + // tm1, tm2 := time.Now(), time.Now().Add(time.Second) + // NOTE(review): the line above is commented-out code; tm1/tm2 are not used by any case in this table. + // Every case feeds the same five-state input and varies only k; k=6 exceeds the input length. + tc := []struct { + name string + k int + input []Alert + expected []Alert + }{{ + name: "alerts are ordered in expected importance (k=1)", + k: 1, + input: []Alert{{State: "normal"}, {State: "nodata"}, {State: "error"}, {State: "pending"}, {State: "alerting"}}, + expected: []Alert{{State: "alerting"}}, + }, { + name: "alerts are ordered in expected importance (k=2)", + k: 2, + input: []Alert{{State: "normal"}, {State: "nodata"}, 
{State: "error"}, {State: "pending"}, {State: "alerting"}}, + expected: []Alert{{State: "alerting"}, {State: "pending"}}, + }, { + name: "alerts are ordered in expected importance (k=3)", + k: 3, + input: []Alert{{State: "normal"}, {State: "nodata"}, {State: "error"}, {State: "pending"}, {State: "alerting"}}, + expected: []Alert{{State: "alerting"}, {State: "pending"}, {State: "error"}}, + }, { + name: "alerts are ordered in expected importance (k=4)", + k: 4, + input: []Alert{{State: "normal"}, {State: "nodata"}, {State: "error"}, {State: "pending"}, {State: "alerting"}}, + expected: []Alert{{State: "alerting"}, {State: "pending"}, {State: "error"}, {State: "nodata"}}, + }, { + name: "alerts are ordered in expected importance (k=5)", + k: 5, + input: []Alert{{State: "normal"}, {State: "nodata"}, {State: "error"}, {State: "pending"}, {State: "alerting"}}, + expected: []Alert{{State: "alerting"}, {State: "pending"}, {State: "error"}, {State: "nodata"}, {State: "normal"}}, + }, { + name: "alerts are ordered in expected importance (k=6)", + k: 6, + input: []Alert{{State: "normal"}, {State: "nodata"}, {State: "error"}, {State: "pending"}, {State: "alerting"}}, + expected: []Alert{{State: "alerting"}, {State: "pending"}, {State: "error"}, {State: "nodata"}, {State: "normal"}}, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + result := AlertsBy(AlertsByImportance).TopK(tt.input, tt.k) + assert.EqualValues(t, tt.expected, result) + }) + } +}