Alerting: Optimize rule status gathering APIs when a limit is applied. (#86568)

* Alerting: Optimize rule status gathering APIs when a limit is applied. The frontend very commonly calls the `/rules` API with `limit_alerts=16`. When there are a very large number of alert instances present, this API is quite slow to respond, and profiling suggests that a big part of the problem is sorting the alerts by importance in order to select the first 16. This changes the application of the limit to use a more efficient heap-based top-k algorithm. This maintains a slice of only the highest ranked items whilst iterating the full set of alert instances, which substantially reduces the number of comparisons needed. This is particularly effective, as the `AlertsByImportance` comparison is quite complex. I've included a benchmark to compare the new TopK function to the existing Sort/limit strategy. It shows that for small limits, the new approach is much faster, especially at high numbers of alerts, e.g. 100K alerts / limit 16: 1.91s vs 0.02s (-99%). For situations where there is no effective limit, sorting is marginally faster; therefore, in the API implementation, if there is either a) no limit or b) no effective limit, we just sort the alerts as before. There is also a space overhead using a heap which would matter for large limits. * Remove commented test cases * Make linter happy
2025-02-25 18:55:37 -06:00 · 2024-04-19 11:51:22 +02:00 · 2024-04-19 11:51:22 +02:00 · 73873f5a8a
commit 73873f5a8a
parent 5a8384a245
5 changed files with 212 additions and 2 deletions
--- a/pkg/services/ngalert/api/api_prometheus.go
+++ b/pkg/services/ngalert/api/api_prometheus.go
@ -408,10 +408,14 @@ func (srv PrometheusSrv) toRuleGroup(groupKey ngmodels.AlertRuleGroupKey, folder
 			rulesTotals[newRule.Health] += 1
 		}
-		apimodels.AlertsBy(apimodels.AlertsByImportance).Sort(alertingRule.Alerts)
+		alertsBy := apimodels.AlertsBy(apimodels.AlertsByImportance)
 		if limitAlerts > -1 && int64(len(alertingRule.Alerts)) > limitAlerts {
-			alertingRule.Alerts = alertingRule.Alerts[0:limitAlerts]
+			alertingRule.Alerts = alertsBy.TopK(alertingRule.Alerts, int(limitAlerts))
 		} else {
 			// If there is no effective limit, then just sort the alerts.
 			// For large numbers of alerts, this can be faster.
 			alertsBy.Sort(alertingRule.Alerts)
 		}
 		alertingRule.Rule = newRule
--- a/pkg/services/ngalert/api/tooling/definitions/prom.go
+++ b/pkg/services/ngalert/api/tooling/definitions/prom.go
@ -1,6 +1,7 @@
 package definitions
 import (
 	"container/heap"
 	"fmt"
 	"sort"
 	"strings"
@ -206,6 +207,71 @@ func (by AlertsBy) Sort(alerts []Alert) {
 	sort.Sort(AlertsSorter{alerts: alerts, by: by})
 }
 // AlertsHeap wraps AlertsSorter and adds the Push and Pop methods that the
 // container/heap functions require on top of the sorting methods.
 type AlertsHeap struct {
 	AlertsSorter
 }
 // Push appends x to the end of the underlying alerts slice. x must be an
 // Alert; any other dynamic type makes the type assertion panic.
 func (h *AlertsHeap) Push(x any) {
 	alert := x.(Alert)
 	h.alerts = append(h.alerts, alert)
 }
 // Pop removes the last element of the underlying alerts slice and returns it.
 func (h *AlertsHeap) Pop() any {
 	last := len(h.alerts) - 1
 	popped := h.alerts[last]
 	h.alerts = h.alerts[:last]
 	return popped
 }
 // TopK returns the k highest-ranked alerts according to by, ordered from
 // highest to lowest rank. It does not modify the input.
 //
 // If k is less than 1 an empty, non-nil slice is returned. If k is greater
 // than len(alerts), every alert is returned in ranked order.
 func (by AlertsBy) TopK(alerts []Alert, k int) []Alert {
 	// Without this guard, k == 0 would index the empty heap in the loop below
 	// and a negative k would be used as a slice capacity; both panic. A
 	// non-nil empty slice is returned so that the result has the same shape as
 	// slicing a sorted list with [0:0].
 	if k < 1 {
 		return []Alert{}
 	}
 	// Concept is that instead of sorting the whole list and taking the number
 	// of items we need, maintain a heap of the top k elements, and update it
 	// for each element. This vastly reduces the number of comparisons needed,
 	// which is important for sorting alerts, as the comparison function is
 	// very expensive.
 	// The heap can never hold more than len(alerts) elements, so do not
 	// reserve space for more than that when k is larger.
 	capacity := k
 	if capacity > len(alerts) {
 		capacity = len(alerts)
 	}
 	// The heap must be in ascending order, so that the root of the heap is
 	// the current smallest element.
 	byAscending := func(a1, a2 *Alert) bool { return by(a2, a1) }
 	h := AlertsHeap{
 		AlertsSorter: AlertsSorter{
 			alerts: make([]Alert, 0, capacity),
 			by:     byAscending,
 		},
 	}
 	// Go version of this algorithm taken from Prometheus (promql/engine.go)
 	for i := range alerts {
 		// Compare through a pointer to the input element; the element is only
 		// copied when it is actually stored in the heap.
 		candidate := &alerts[i]
 		// We build a heap of up to k elements, with the smallest element at heap[0].
 		switch {
 		case len(h.alerts) < k:
 			heap.Push(&h, *candidate)
 		case h.by(&h.alerts[0], candidate):
 			// This new element is bigger than the previous smallest element - overwrite that.
 			h.alerts[0] = *candidate
 			// Maintain the heap invariant. A single-element heap is always valid.
 			if k > 1 {
 				heap.Fix(&h, 0)
 			}
 		}
 	}
 	// The heap keeps the lowest value on top, so reverse it.
 	if len(h.alerts) > 1 {
 		sort.Sort(sort.Reverse(&h))
 	}
 	return h.alerts
 }
 // AlertsByImportance orders alerts by importance. An alert is more important
 // than another alert if its status has higher importance. For example, "alerting"
 // is more important than "normal". If two alerts have the same importance
--- a/pkg/services/ngalert/api/tooling/definitions/prom_bench.sh
+++ b/pkg/services/ngalert/api/tooling/definitions/prom_bench.sh
@ -0,0 +1,5 @@
 #!/bin/bash
 # Runs BenchmarkSortAlertsByImportance once per top-k strategy ("sort", then
 # "heap"), saves each run's output, and compares the two with benchstat.
 # Abort on the first failure. pipefail is needed because each go test run is
 # piped through tee, which would otherwise hide a failing benchmark and let
 # benchstat compare incomplete output.
 set -euo pipefail
 # The -run pattern is not expected to match any test, so only benchmarks run.
 go test -v -run='^#' -bench BenchmarkSortAlertsByImportance -count 5 -topk sort | tee before.txt
 go test -v -run='^#' -bench BenchmarkSortAlertsByImportance -count 5 -topk heap | tee after.txt
 benchstat before.txt after.txt
--- a/pkg/services/ngalert/api/tooling/definitions/prom_bench_test.go
+++ b/pkg/services/ngalert/api/tooling/definitions/prom_bench_test.go
@ -0,0 +1,87 @@
 package definitions
 import (
 	"flag"
 	"fmt"
 	"math/rand"
 	"testing"
 )
 // Command-line flags that configure the benchmark in this file.
 var (
 	// topkStrategy names the top-k implementation to benchmark.
 	topkStrategy = flag.String("topk", "heap", "topk strategy to benchmark. choices: sort, heap")
 	// showComparisons toggles printing of the comparison count per iteration.
 	showComparisons = flag.Bool("show-comparisons", false, "whether to show the number of comparisons made")
 )
 // makeAlerts builds amount synthetic alerts for benchmarking and returns them
 // in a shuffled order that is identical on every call with the same amount.
 func makeAlerts(amount int) []Alert {
 	const (
 		// A typical distribution of alert states is that most are Normal
 		// and a few are Alerting, so we assume 99% Normal and 1% Alerting.
 		alertingPercent = 1
 		// Series will commonly have many labels.
 		labelsPerAlert = 10
 	)
 	result := make([]Alert, amount)
 	for idx := range result {
 		// Both the label values and the state are derived from the position
 		// modulo 100.
 		bucket := idx % 100
 		result[idx].Labels = make(map[string]string)
 		for l := 0; l < labelsPerAlert; l++ {
 			name := fmt.Sprintf("label_%d", l)
 			result[idx].Labels[name] = fmt.Sprintf("label_%d_value_%d", l, bucket)
 		}
 		if bucket >= alertingPercent {
 			result[idx].State = "normal"
 		} else {
 			result[idx].State = "alerting"
 			// TODO: consider populating ActiveAt here; the original note says
 			// this prevents needing label comparison.
 		}
 	}
 	// Shuffle with a fixed seed so the order is repeatable while avoiding any
 	// bias from the initial ordering.
 	shuffler := rand.New(rand.NewSource(1))
 	shuffler.Shuffle(len(result), func(a, b int) {
 		result[a], result[b] = result[b], result[a]
 	})
 	return result
 }
 // BenchmarkSortAlertsByImportance measures how long it takes to select the k
 // most important alerts out of n, using the strategy chosen with the -topk
 // flag: "sort" sorts everything and then slices, "heap" uses AlertsBy.TopK.
 // Only the selection itself is timed; generating the alerts is not.
 func BenchmarkSortAlertsByImportance(b *testing.B) {
 	var topkFunc func(AlertsBy, []Alert, int)
 	switch *topkStrategy {
 	case "sort":
 		topkFunc = func(by AlertsBy, alerts []Alert, limit int) {
 			by.Sort(alerts)
 			if len(alerts) > limit {
 				_ = alerts[0:limit]
 			}
 		}
 	case "heap":
 		topkFunc = func(by AlertsBy, alerts []Alert, limit int) {
 			_ = by.TopK(alerts, limit)
 		}
 	default:
 		// Fail with a clear message instead of calling a nil function below.
 		b.Fatalf("unknown topk strategy %q. choices: sort, heap", *topkStrategy)
 	}
 	for _, n := range []int{1000, 10000, 100000} {
 		for _, k := range []int{16, 100, 1000, 100000} {
 			b.Run(fmt.Sprintf("n_%d_k_%d", n, k), func(b *testing.B) {
 				// Keep the timer stopped except around the call under test.
 				b.StopTimer()
 				for bi := 0; bi < b.N; bi++ {
 					// Fresh alerts every iteration, because the "sort"
 					// strategy sorts the slice in place.
 					alerts := makeAlerts(n)
 					comparisons := 0
 					// Wrap the comparison so the number of calls can be reported.
 					by := func(a1, a2 *Alert) bool {
 						comparisons++
 						return AlertsByImportance(a1, a2)
 					}
 					b.StartTimer()
 					topkFunc(by, alerts, k)
 					b.StopTimer()
 					if *showComparisons {
 						fmt.Printf("Number of comparisons (strategy: %s): %d\n", *topkStrategy, comparisons)
 					}
 				}
 			})
 		}
 	}
 }
--- a/pkg/services/ngalert/api/tooling/definitions/prom_test.go
+++ b/pkg/services/ngalert/api/tooling/definitions/prom_test.go
@ -64,3 +64,51 @@ func TestSortAlertsByImportance(t *testing.T) {
 		})
 	}
 }
 // TestTopKAlertsByImportance checks that TopK with AlertsByImportance returns
 // the k most important alerts in order of importance for k from 1 up to one
 // more than the number of inputs, and that it leaves its input untouched.
 func TestTopKAlertsByImportance(t *testing.T) {
 	// newInput returns a fresh copy of the unsorted alerts shared by every
 	// case, so each subtest can check that TopK did not reorder its input.
 	newInput := func() []Alert {
 		return []Alert{{State: "normal"}, {State: "nodata"}, {State: "error"}, {State: "pending"}, {State: "alerting"}}
 	}
 	tc := []struct {
 		name     string
 		k        int
 		expected []Alert
 	}{{
 		name:     "alerts are ordered in expected importance (k=1)",
 		k:        1,
 		expected: []Alert{{State: "alerting"}},
 	}, {
 		name:     "alerts are ordered in expected importance (k=2)",
 		k:        2,
 		expected: []Alert{{State: "alerting"}, {State: "pending"}},
 	}, {
 		name:     "alerts are ordered in expected importance (k=3)",
 		k:        3,
 		expected: []Alert{{State: "alerting"}, {State: "pending"}, {State: "error"}},
 	}, {
 		name:     "alerts are ordered in expected importance (k=4)",
 		k:        4,
 		expected: []Alert{{State: "alerting"}, {State: "pending"}, {State: "error"}, {State: "nodata"}},
 	}, {
 		name:     "alerts are ordered in expected importance (k=5)",
 		k:        5,
 		expected: []Alert{{State: "alerting"}, {State: "pending"}, {State: "error"}, {State: "nodata"}, {State: "normal"}},
 	}, {
 		// k larger than the input returns every alert.
 		name:     "alerts are ordered in expected importance (k=6)",
 		k:        6,
 		expected: []Alert{{State: "alerting"}, {State: "pending"}, {State: "error"}, {State: "nodata"}, {State: "normal"}},
 	},
 	}
 	for _, tt := range tc {
 		t.Run(tt.name, func(t *testing.T) {
 			input := newInput()
 			result := AlertsBy(AlertsByImportance).TopK(input, tt.k)
 			assert.EqualValues(t, tt.expected, result)
 			// TopK is documented as not modifying its input.
 			assert.EqualValues(t, newInput(), input)
 		})
 	}
 }