From 7edbe724832ef9b744b793d2de82f50208c7a2e2 Mon Sep 17 00:00:00 2001 From: George Robinson Date: Fri, 23 Jun 2023 11:36:07 +0100 Subject: [PATCH] Alerting: Support concurrent queries for saving alert instances (#70525) This commit adds support for concurrent queries when saving alert instances to the database. This is an experimental feature in response to some customers experiencing delays between rule evaluation and sending alerts to Alertmanager, resulting in flapping. It is disabled by default. --- conf/defaults.ini | 5 + go.mod | 18 +-- go.sum | 33 ++++-- pkg/services/ngalert/api/api_testing.go | 13 ++- pkg/services/ngalert/backtesting/engine.go | 13 ++- pkg/services/ngalert/ngalert.go | 15 +-- .../ngalert/schedule/schedule_unit_test.go | 26 +++-- pkg/services/ngalert/state/manager.go | 42 ++++--- .../ngalert/state/manager_bench_test.go | 3 +- .../ngalert/state/manager_private_test.go | 4 +- pkg/services/ngalert/state/manager_test.go | 104 ++++++++++-------- pkg/setting/setting_unified_alerting.go | 4 + 12 files changed, 161 insertions(+), 119 deletions(-) diff --git a/conf/defaults.ini b/conf/defaults.ini index 3ef1c89601f..081d07cdcf6 100644 --- a/conf/defaults.ini +++ b/conf/defaults.ini @@ -1086,6 +1086,11 @@ max_attempts = 3 # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. min_interval = 10s +# This is an experimental option to add parallelization to saving alert states in the database. +# It configures the maximum number of concurrent queries per rule evaluated. The default value is 1 +# (concurrent queries per rule disabled). +max_state_save_concurrency = 1 + [unified_alerting.screenshots] # Enable screenshots in notifications. You must have either installed the Grafana image rendering # plugin, or set up Grafana to use a remote rendering service. diff --git a/go.mod b/go.mod index 0b4579b991a..36adb6b1331 100644 --- a/go.mod +++ b/go.mod @@ -86,8 +86,8 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/alertmanager v0.25.0 github.com/prometheus/client_golang v1.15.1 - github.com/prometheus/client_model v0.3.0 - github.com/prometheus/common v0.42.0 + github.com/prometheus/client_model v0.4.0 + github.com/prometheus/common v0.43.0 github.com/prometheus/prometheus v1.8.2-0.20210621150501-ff58416a0b02 github.com/robfig/cron/v3 v3.0.1 github.com/russellhaering/goxmldsig v1.2.0 @@ -107,15 +107,15 @@ require ( go.opentelemetry.io/otel/sdk v1.14.0 go.opentelemetry.io/otel/trace v1.14.0 golang.org/x/crypto v0.7.0 - golang.org/x/exp v0.0.0-20230307190834-24139beb5833 + golang.org/x/exp v0.0.0-20230321023759-10a507213a29 golang.org/x/net v0.9.0 - golang.org/x/oauth2 v0.6.0 + golang.org/x/oauth2 v0.7.0 golang.org/x/sync v0.3.0 golang.org/x/time v0.3.0 golang.org/x/tools v0.7.0 gonum.org/v1/gonum v0.11.0 google.golang.org/api v0.111.0 - google.golang.org/grpc v1.54.0 + google.golang.org/grpc v1.55.0 google.golang.org/protobuf v1.30.0 gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect gopkg.in/ini.v1 v1.67.0 @@ -217,7 +217,7 @@ require ( go.opencensus.io v0.24.0 // indirect go.uber.org/atomic v1.10.0 go.uber.org/goleak v1.2.1 // indirect - golang.org/x/sys v0.7.0 // indirect + golang.org/x/sys v0.8.0 // indirect golang.org/x/text v0.9.0 golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.7 // indirect @@ -240,7 +240,7 @@ require ( github.com/golang-migrate/migrate/v4 v4.7.0 github.com/google/go-github/v45 v45.2.0 github.com/grafana/codejen v0.0.3 - github.com/grafana/dskit v0.0.0-20230202092222-880a7f8141cc + github.com/grafana/dskit v0.0.0-20230620150242-3dc2113b720d github.com/grafana/phlare/api v0.1.4-0.20230426005640-f90edba05413 github.com/huandu/xstrings v1.3.1 github.com/jmoiron/sqlx v1.3.5 @@ -267,7 +267,7 @@ require ( github.com/grafana/thema v0.0.0-20230615161902-b6e21996aef8 github.com/ory/fosite v0.44.1-0.20230317114349-45a6785cc54f github.com/redis/go-redis/v9 v9.0.2 - github.com/weaveworks/common v0.0.0-20230208133027-16871410fca4 + github.com/weaveworks/common v0.0.0-20230511094633-334485600903 github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f go.opentelemetry.io/contrib/samplers/jaegerremote v0.9.0 gopkg.in/square/go-jose.v2 v2.5.2-0.20210529014059-a5c7eec3c614 @@ -339,7 +339,7 @@ require ( github.com/pelletier/go-toml v1.9.5 // indirect github.com/perimeterx/marshmallow v1.1.4 // indirect github.com/rivo/uniseg v0.3.4 // indirect - github.com/rogpeppe/go-internal v1.9.0 // indirect + github.com/rogpeppe/go-internal v1.10.0 // indirect github.com/rueian/rueidis v0.0.100-go1.18 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/segmentio/asm v1.2.0 // indirect diff --git a/go.sum b/go.sum index 9d78ba91b79..419e42242c2 100644 --- a/go.sum +++ b/go.sum @@ -1365,8 +1365,8 @@ github.com/grafana/dataplane/examples v0.0.0-20230404174214-4d6fd58a18ad h1:bROI github.com/grafana/dataplane/examples v0.0.0-20230404174214-4d6fd58a18ad/go.mod h1:h5YwY8s407/17XF5/dS8XrUtsTVV2RnuW8+m1Mp46mg= github.com/grafana/dataplane/sdata v0.0.6 h1:Ejlj8d1Hvy/uDLeI4sOvL34Y8WLlVDd9iN270F+8aTw= github.com/grafana/dataplane/sdata v0.0.6/go.mod h1:Jvs5ddpGmn6vcxT7tCTWAZ1mgi4sbcdFt9utQx5uMAU= -github.com/grafana/dskit v0.0.0-20230202092222-880a7f8141cc h1:lQFgXpsZNDdi0whUROW15r/akzLIdXAn6xr5vqlZucI= -github.com/grafana/dskit v0.0.0-20230202092222-880a7f8141cc/go.mod h1:ulYLLoSd71AWIjxgifLO86Lndx82Yj+IcV+fFnh8tkI= +github.com/grafana/dskit v0.0.0-20230620150242-3dc2113b720d h1:1w9c1/z4JypJUatX+rPIiK3eTAV7+OF094UkDIPDTIs= +github.com/grafana/dskit v0.0.0-20230620150242-3dc2113b720d/go.mod h1:M03k2fzuQ2n9TVE1xfVKTESibxsXdw0wYfWT3+9Owp4= github.com/grafana/go-mssqldb v0.9.1 h1:3CqteWF0CadwXV9f3FxoI+i3uSW3azjTlQipyOJtWtQ= github.com/grafana/go-mssqldb v0.9.1/go.mod h1:HTCsUqZdb7oIO7jc37YauiSB5C3P/13AnpctVWBhlus= github.com/grafana/go-mssqldb v0.9.2 h1:FkyRJR4ywsT07iMtpFMHStrl8uuNkGIwp253Fee06z8= @@ -2133,8 +2133,9 @@ github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1: github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.1.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= +github.com/prometheus/client_model v0.4.0 h1:5lQXD3cAg1OXBf4Wq03gTrXHeaV0TQvGfUooCfx1yqY= +github.com/prometheus/client_model v0.4.0/go.mod h1:oMQmHW1/JoDwqLtg57MGgP/Fb1CJEYF2imWWhWtMkYU= github.com/prometheus/common v0.0.0-20181113130724-41aa239b4cce/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= github.com/prometheus/common v0.0.0-20181126121408-4724e9255275/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= github.com/prometheus/common v0.2.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= @@ -2153,8 +2154,9 @@ github.com/prometheus/common v0.30.0/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+ github.com/prometheus/common v0.32.1/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= github.com/prometheus/common v0.41.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= -github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM= github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= +github.com/prometheus/common v0.43.0 h1:iq+BVjvYLei5f27wiuNiB1DN6DYQkp1c8Bx0Vykh5us= +github.com/prometheus/common v0.43.0/go.mod h1:NCvr5cQIh3Y/gy73/RdVtC9r8xxrxwJnB+2lB3BxrFc= github.com/prometheus/common/assets v0.2.0/go.mod h1:D17UVUE12bHbim7HzwUvtqm6gwBEaDQ0F+hIGbFbccI= github.com/prometheus/common/sigv4 v0.1.0 h1:qoVebwtwwEhS85Czm2dSROY5fTo2PAPEVdDeppTwGX4= github.com/prometheus/common/sigv4 v0.1.0/go.mod h1:2Jkxxk9yYvCkE5G1sQT7GuEXm57JrvHu9k5YwTjsNtI= @@ -2211,8 +2213,9 @@ github.com/rogpeppe/go-internal v1.5.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTE github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE= github.com/rogpeppe/go-internal v1.8.1/go.mod h1:JeRgkft04UBgHMgCIwADu4Pn6Mtm5d4nPKWu0nJ5d+o= -github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/rs/cors v1.6.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= github.com/rs/cors v1.8.0/go.mod h1:EBwu+T5AvHOcXwvZIkQFjUN6s8Czyqw12GL/Y0tUyRM= github.com/rs/cors v1.9.0 h1:l9HGsTsHJcvW14Nk7J9KFz8bzeAWXn3CG6bgt7LsrAE= @@ -2260,7 +2263,7 @@ github.com/segmentio/encoding v0.3.6 h1:E6lVLyDPseWEulBmCmAKPanDd3jiyGDo5gMcugCR github.com/segmentio/encoding v0.3.6/go.mod h1:n0JeuIqEQrQoPDGsjo8UNd1iA0U8d8+oHAA4E3G3OxM= github.com/segmentio/go-snakecase v1.1.0/go.mod h1:jk1miR5MS7Na32PZUykG89Arm+1BUSYhuGR6b7+hJto= github.com/segmentio/objconv v1.0.1/go.mod h1:auayaH5k3137Cl4SoXTgrzQcuQDmvuVtZgS0fb1Ahys= -github.com/sercand/kuberesolver v2.4.0+incompatible/go.mod h1:lWF3GL0xptCB/vCiJPl/ZshwPsX/n4Y7u0CW9E7aQIQ= +github.com/sercand/kuberesolver/v4 v4.0.0/go.mod h1:F4RGyuRmMAjeXHKL+w4P7AwUnPceEAPAhxUgXZjKgvM= github.com/serenize/snaker v0.0.0-20171204205717-a683aaf2d516/go.mod h1:Yow6lPLSAXx2ifx470yD/nUe22Dv5vBvxK/UK9UUTVs= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= @@ -2457,8 +2460,8 @@ github.com/vektra/mockery v0.0.0-20181123154057-e78b021dcbb5/go.mod h1:ppEjwdhyy github.com/vultr/govultr/v2 v2.17.2 h1:gej/rwr91Puc/tgh+j33p/BLR16UrIPnSr+AIwYWZQs= github.com/vultr/govultr/v2 v2.17.2/go.mod h1:ZFOKGWmgjytfyjeyAdhQlSWwTjh2ig+X49cAp50dzXI= github.com/wadey/gocovmerge v0.0.0-20160331181800-b5bfa59ec0ad/go.mod h1:Hy8o65+MXnS6EwGElrSRjUzQDLXreJlzYLlWiHtt8hM= -github.com/weaveworks/common v0.0.0-20230208133027-16871410fca4 h1:8eoXaryYVOWJZCnCzULYXtxiHHLrJpvoD7p283ogmo8= -github.com/weaveworks/common v0.0.0-20230208133027-16871410fca4/go.mod h1:KoQ+3z63GUJzQ7AhU0AWQNU+LPda2EwL/cx1PlbDzVQ= +github.com/weaveworks/common v0.0.0-20230511094633-334485600903 h1:ph7R2CS/0o1gBzpzK/CioUKJVsXNVXfDGR8FZ9rMZIw= +github.com/weaveworks/common v0.0.0-20230511094633-334485600903/go.mod h1:rgbeLfJUtEr+G74cwFPR1k/4N0kDeaeSv/qhUNE4hm8= github.com/weaveworks/promrus v1.2.0 h1:jOLf6pe6/vss4qGHjXmGz4oDJQA+AOCqEL3FvvZGz7M= github.com/weaveworks/promrus v1.2.0/go.mod h1:SaE82+OJ91yqjrE1rsvBWVzNZKcHYFtMUyS1+Ogs/KA= github.com/wk8/go-ordered-map v1.0.0 h1:BV7z+2PaK8LTSd/mWgY12HyMAo5CEgkHqbkVq2thqr8= @@ -2738,8 +2741,9 @@ golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= golang.org/x/exp v0.0.0-20230108222341-4b8118a2686a/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= -golang.org/x/exp v0.0.0-20230307190834-24139beb5833 h1:SChBja7BCQewoTAU7IgvucQKMIXrEpFxNMs0spT3/5s= golang.org/x/exp v0.0.0-20230307190834-24139beb5833/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= +golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug= +golang.org/x/exp v0.0.0-20230321023759-10a507213a29/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= @@ -2882,6 +2886,7 @@ golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= +golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= @@ -2919,8 +2924,9 @@ golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri golang.org/x/oauth2 v0.0.0-20221006150949-b44042a4b9c1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.5.0/go.mod h1:9/XBHVqLaWO3/BRHs5jbpYCnOZVjj5V0ndyaAM7KB4I= -golang.org/x/oauth2 v0.6.0 h1:Lh8GPgSKBfWSwFvtuWOfeI3aAAnbXTSutYxJiOJFgIw= golang.org/x/oauth2 v0.6.0/go.mod h1:ycmewcwgD4Rpr3eZJLSB4Kyyljb3qDh40vJ8STE5HKw= +golang.org/x/oauth2 v0.7.0 h1:qe6s0zUXlPX80/dITx3440hWZ7GwMwgDDyrSGTPJG/g= +golang.org/x/oauth2 v0.7.0/go.mod h1:hPLQkd9LyjfXTiRohC/41GhcFqxisoUQ99sCUOHO9x4= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -3107,10 +3113,11 @@ golang.org/x/sys v0.0.0-20220919091848-fb04ddd9f9c8/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU= -golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -3118,6 +3125,7 @@ golang.org/x/term v0.0.0-20220526004731-065cf7ba2467/go.mod h1:jbD1KX2456YbFQfuX golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= golang.org/x/term v0.3.0/go.mod h1:q750SLmJuPmVoN1blW3UFBPREJfb1KmY3vwxfr+nFDA= +golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.7.0 h1:BEvjmm5fURWqcfbSKTdpkDXYBrUS1c0m8agp14W48vQ= @@ -3133,6 +3141,7 @@ golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= diff --git a/pkg/services/ngalert/api/api_testing.go b/pkg/services/ngalert/api/api_testing.go index e82506a52f6..df0364948ed 100644 --- a/pkg/services/ngalert/api/api_testing.go +++ b/pkg/services/ngalert/api/api_testing.go @@ -79,12 +79,13 @@ func (srv TestingApiSrv) RouteTestGrafanaRuleConfig(c *contextmodel.ReqContext, } cfg := state.ManagerCfg{ - Metrics: nil, - ExternalURL: srv.appUrl, - InstanceStore: nil, - Images: &backtesting.NoopImageService{}, - Clock: clock.New(), - Historian: nil, + Metrics: nil, + ExternalURL: srv.appUrl, + InstanceStore: nil, + Images: &backtesting.NoopImageService{}, + Clock: clock.New(), + Historian: nil, + MaxStateSaveConcurrency: 1, } manager := state.NewManager(cfg) includeFolder := !srv.cfg.ReservedLabels.IsReservedLabelDisabled(models.FolderTitleLabel) diff --git a/pkg/services/ngalert/backtesting/engine.go b/pkg/services/ngalert/backtesting/engine.go index e91c9845f60..e80d2d1c5fb 100644 --- a/pkg/services/ngalert/backtesting/engine.go +++ b/pkg/services/ngalert/backtesting/engine.go @@ -45,12 +45,13 @@ func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory) *Engine { evalFactory: evalFactory, createStateManager: func() stateManager { cfg := state.ManagerCfg{ - Metrics: nil, - ExternalURL: appUrl, - InstanceStore: nil, - Images: &NoopImageService{}, - Clock: clock.New(), - Historian: nil, + Metrics: nil, + ExternalURL: appUrl, + InstanceStore: nil, + Images: &NoopImageService{}, + Clock: clock.New(), + Historian: nil, + MaxStateSaveConcurrency: 1, } return state.NewManager(cfg) }, diff --git a/pkg/services/ngalert/ngalert.go b/pkg/services/ngalert/ngalert.go index e35205a3f17..eaaba0f37ee 100644 --- a/pkg/services/ngalert/ngalert.go +++ b/pkg/services/ngalert/ngalert.go @@ -212,13 +212,14 @@ func (ng *AlertNG) init() error { return err } cfg := state.ManagerCfg{ - Metrics: ng.Metrics.GetStateMetrics(), - ExternalURL: appUrl, - InstanceStore: ng.store, - Images: ng.imageService, - Clock: clk, - Historian: history, - DoNotSaveNormalState: ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoNormalState), + Metrics: ng.Metrics.GetStateMetrics(), + ExternalURL: appUrl, + InstanceStore: ng.store, + Images: ng.imageService, + Clock: clk, + Historian: history, + DoNotSaveNormalState: ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoNormalState), + MaxStateSaveConcurrency: ng.Cfg.UnifiedAlerting.MaxStateSaveConcurrency, } stateManager := state.NewManager(cfg) scheduler := schedule.NewScheduler(schedCfg, stateManager) diff --git a/pkg/services/ngalert/schedule/schedule_unit_test.go b/pkg/services/ngalert/schedule/schedule_unit_test.go index 53280768f91..465d0dd7c78 100644 --- a/pkg/services/ngalert/schedule/schedule_unit_test.go +++ b/pkg/services/ngalert/schedule/schedule_unit_test.go @@ -75,12 +75,13 @@ func TestProcessTicks(t *testing.T) { Tracer: testTracer, } managerCfg := state.ManagerCfg{ - Metrics: testMetrics.GetStateMetrics(), - ExternalURL: nil, - InstanceStore: nil, - Images: &state.NoopImageService{}, - Clock: mockedClock, - Historian: &state.FakeHistorian{}, + Metrics: testMetrics.GetStateMetrics(), + ExternalURL: nil, + InstanceStore: nil, + Images: &state.NoopImageService{}, + Clock: mockedClock, + Historian: &state.FakeHistorian{}, + MaxStateSaveConcurrency: 1, } st := state.NewManager(managerCfg) @@ -818,12 +819,13 @@ func setupScheduler(t *testing.T, rs *fakeRulesStore, is *state.FakeInstanceStor Tracer: testTracer, } managerCfg := state.ManagerCfg{ - Metrics: m.GetStateMetrics(), - ExternalURL: nil, - InstanceStore: is, - Images: &state.NoopImageService{}, - Clock: mockedClock, - Historian: &state.FakeHistorian{}, + Metrics: m.GetStateMetrics(), + ExternalURL: nil, + InstanceStore: is, + Images: &state.NoopImageService{}, + Clock: mockedClock, + Historian: &state.FakeHistorian{}, + MaxStateSaveConcurrency: 1, } st := state.NewManager(managerCfg) diff --git a/pkg/services/ngalert/state/manager.go b/pkg/services/ngalert/state/manager.go index 01372b1b54b..e107ae18abb 100644 --- a/pkg/services/ngalert/state/manager.go +++ b/pkg/services/ngalert/state/manager.go @@ -6,6 +6,7 @@ import ( "time" "github.com/benbjohnson/clock" + "github.com/grafana/dskit/concurrency" "github.com/grafana/grafana-plugin-sdk-go/data" "github.com/grafana/grafana/pkg/infra/log" @@ -39,7 +40,8 @@ type Manager struct { historian Historian externalURL *url.URL - doNotSaveNormalState bool + doNotSaveNormalState bool + maxStateSaveConcurrency int } type ManagerCfg struct { @@ -51,20 +53,23 @@ type ManagerCfg struct { Historian Historian // DoNotSaveNormalState controls whether eval.Normal state is persisted to the database and returned by get methods DoNotSaveNormalState bool + // MaxStateSaveConcurrency controls the number of goroutines (per rule) that can save alert state in parallel. + MaxStateSaveConcurrency int } func NewManager(cfg ManagerCfg) *Manager { return &Manager{ - cache: newCache(), - ResendDelay: ResendDelay, // TODO: make this configurable - log: log.New("ngalert.state.manager"), - metrics: cfg.Metrics, - instanceStore: cfg.InstanceStore, - images: cfg.Images, - historian: cfg.Historian, - clock: cfg.Clock, - externalURL: cfg.ExternalURL, - doNotSaveNormalState: cfg.DoNotSaveNormalState, + cache: newCache(), + ResendDelay: ResendDelay, // TODO: make this configurable + log: log.New("ngalert.state.manager"), + metrics: cfg.Metrics, + instanceStore: cfg.InstanceStore, + images: cfg.Images, + historian: cfg.Historian, + clock: cfg.Clock, + externalURL: cfg.ExternalURL, + doNotSaveNormalState: cfg.DoNotSaveNormalState, + maxStateSaveConcurrency: cfg.MaxStateSaveConcurrency, } } @@ -350,17 +355,17 @@ func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, state return } - logger.Debug("Saving alert states", "count", len(states)) - for _, s := range states { + saveState := func(ctx context.Context, idx int) error { + s := states[idx] // Do not save normal state to database and remove transition to Normal state but keep mapped states if st.doNotSaveNormalState && IsNormalStateWithNoReason(s.State) && !s.Changed() { - continue + return nil } key, err := s.GetAlertInstanceKey() if err != nil { logger.Error("Failed to create a key for alert state to save it to database. The state will be ignored ", "cacheID", s.CacheID, "error", err, "labels", s.Labels.String()) - continue + return nil } instance := ngModels.AlertInstance{ AlertInstanceKey: key, @@ -375,9 +380,14 @@ func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, state err = st.instanceStore.SaveAlertInstance(ctx, instance) if err != nil { logger.Error("Failed to save alert state", "labels", s.Labels.String(), "state", s.State, "error", err) + return nil } + return nil } - logger.Debug("Saving alert states done", "count", len(states)) + + logger.Debug("Saving alert states", "count", len(states), "max_state_save_concurrency", st.maxStateSaveConcurrency) + _ = concurrency.ForEachJob(ctx, len(states), st.maxStateSaveConcurrency, saveState) + logger.Debug("Saving alert states done", "count", len(states), "max_state_save_concurrency", st.maxStateSaveConcurrency) } func (st *Manager) deleteAlertStates(ctx context.Context, logger log.Logger, states []StateTransition) { diff --git a/pkg/services/ngalert/state/manager_bench_test.go b/pkg/services/ngalert/state/manager_bench_test.go index b68f33b5c2b..6906230cf6e 100644 --- a/pkg/services/ngalert/state/manager_bench_test.go +++ b/pkg/services/ngalert/state/manager_bench_test.go @@ -22,7 +22,8 @@ func BenchmarkProcessEvalResults(b *testing.B) { metrics := metrics.NewHistorianMetrics(prometheus.NewRegistry()) hist := historian.NewAnnotationBackend(&as, nil, nil, metrics) cfg := state.ManagerCfg{ - Historian: hist, + Historian: hist, + MaxStateSaveConcurrency: 1, } sut := state.NewManager(cfg) now := time.Now().UTC() diff --git a/pkg/services/ngalert/state/manager_private_test.go b/pkg/services/ngalert/state/manager_private_test.go index 44cc72043b5..e4ddb5ff6b4 100644 --- a/pkg/services/ngalert/state/manager_private_test.go +++ b/pkg/services/ngalert/state/manager_private_test.go @@ -114,7 +114,7 @@ func TestManager_saveAlertStates(t *testing.T) { t.Run("should save all transitions if doNotSaveNormalState is false", func(t *testing.T) { st := &FakeInstanceStore{} - m := Manager{instanceStore: st, doNotSaveNormalState: false} + m := Manager{instanceStore: st, doNotSaveNormalState: false, maxStateSaveConcurrency: 1} m.saveAlertStates(context.Background(), &logtest.Fake{}, transitions...) savedKeys := map[ngmodels.AlertInstanceKey]ngmodels.AlertInstance{} @@ -131,7 +131,7 @@ func TestManager_saveAlertStates(t *testing.T) { t.Run("should not save Normal->Normal if doNotSaveNormalState is true", func(t *testing.T) { st := &FakeInstanceStore{} - m := Manager{instanceStore: st, doNotSaveNormalState: true} + m := Manager{instanceStore: st, doNotSaveNormalState: true, maxStateSaveConcurrency: 1} m.saveAlertStates(context.Background(), &logtest.Fake{}, transitions...) savedKeys := map[ngmodels.AlertInstanceKey]ngmodels.AlertInstance{} diff --git a/pkg/services/ngalert/state/manager_test.go b/pkg/services/ngalert/state/manager_test.go index b4abfee70f1..b8b53ecdf5d 100644 --- a/pkg/services/ngalert/state/manager_test.go +++ b/pkg/services/ngalert/state/manager_test.go @@ -194,12 +194,13 @@ func TestWarmStateCache(t *testing.T) { } cfg := state.ManagerCfg{ - Metrics: testMetrics.GetStateMetrics(), - ExternalURL: nil, - InstanceStore: dbstore, - Images: &state.NoopImageService{}, - Clock: clock.NewMock(), - Historian: &state.FakeHistorian{}, + Metrics: testMetrics.GetStateMetrics(), + ExternalURL: nil, + InstanceStore: dbstore, + Images: &state.NoopImageService{}, + Clock: clock.NewMock(), + Historian: &state.FakeHistorian{}, + MaxStateSaveConcurrency: 1, } st := state.NewManager(cfg) st.Warm(ctx, dbstore) @@ -227,12 +228,13 @@ func TestDashboardAnnotations(t *testing.T) { metrics := metrics.NewHistorianMetrics(prometheus.NewRegistry()) hist := historian.NewAnnotationBackend(fakeAnnoRepo, &dashboards.FakeDashboardService{}, nil, metrics) cfg := state.ManagerCfg{ - Metrics: testMetrics.GetStateMetrics(), - ExternalURL: nil, - InstanceStore: dbstore, - Images: &state.NoopImageService{}, - Clock: clock.New(), - Historian: hist, + Metrics: testMetrics.GetStateMetrics(), + ExternalURL: nil, + InstanceStore: dbstore, + Images: &state.NoopImageService{}, + Clock: clock.New(), + Historian: hist, + MaxStateSaveConcurrency: 1, } st := state.NewManager(cfg) @@ -2256,12 +2258,13 @@ func TestProcessEvalResults(t *testing.T) { metrics := metrics.NewHistorianMetrics(prometheus.NewRegistry()) hist := historian.NewAnnotationBackend(fakeAnnoRepo, &dashboards.FakeDashboardService{}, nil, metrics) cfg := state.ManagerCfg{ - Metrics: testMetrics.GetStateMetrics(), - ExternalURL: nil, - InstanceStore: &state.FakeInstanceStore{}, - Images: &state.NotAvailableImageService{}, - Clock: clock.New(), - Historian: hist, + Metrics: testMetrics.GetStateMetrics(), + ExternalURL: nil, + InstanceStore: &state.FakeInstanceStore{}, + Images: &state.NotAvailableImageService{}, + Clock: clock.New(), + Historian: hist, + MaxStateSaveConcurrency: 1, } st := state.NewManager(cfg) t.Run(tc.desc, func(t *testing.T) { @@ -2291,12 +2294,13 @@ func TestProcessEvalResults(t *testing.T) { instanceStore := &state.FakeInstanceStore{} clk := clock.New() cfg := state.ManagerCfg{ - Metrics: testMetrics.GetStateMetrics(), - ExternalURL: nil, - InstanceStore: instanceStore, - Images: &state.NotAvailableImageService{}, - Clock: clk, - Historian: &state.FakeHistorian{}, + Metrics: testMetrics.GetStateMetrics(), + ExternalURL: nil, + InstanceStore: instanceStore, + Images: &state.NotAvailableImageService{}, + Clock: clk, + Historian: &state.FakeHistorian{}, + MaxStateSaveConcurrency: 1, } st := state.NewManager(cfg) rule := models.AlertRuleGen()() @@ -2432,12 +2436,13 @@ func TestStaleResultsHandler(t *testing.T) { for _, tc := range testCases { ctx := context.Background() cfg := state.ManagerCfg{ - Metrics: testMetrics.GetStateMetrics(), - ExternalURL: nil, - InstanceStore: dbstore, - Images: &state.NoopImageService{}, - Clock: clock.New(), - Historian: &state.FakeHistorian{}, + Metrics: testMetrics.GetStateMetrics(), + ExternalURL: nil, + InstanceStore: dbstore, + Images: &state.NoopImageService{}, + Clock: clock.New(), + Historian: &state.FakeHistorian{}, + MaxStateSaveConcurrency: 1, } st := state.NewManager(cfg) st.Warm(ctx, dbstore) @@ -2511,12 +2516,13 @@ func TestStaleResults(t *testing.T) { store := &state.FakeInstanceStore{} cfg := state.ManagerCfg{ - Metrics: testMetrics.GetStateMetrics(), - ExternalURL: nil, - InstanceStore: store, - Images: &state.NoopImageService{}, - Clock: clk, - Historian: &state.FakeHistorian{}, + Metrics: testMetrics.GetStateMetrics(), + ExternalURL: nil, + InstanceStore: store, + Images: &state.NoopImageService{}, + Clock: clk, + Historian: &state.FakeHistorian{}, + MaxStateSaveConcurrency: 1, } st := state.NewManager(cfg) @@ -2678,12 +2684,13 @@ func TestDeleteStateByRuleUID(t *testing.T) { clk := clock.NewMock() clk.Set(time.Now()) cfg := state.ManagerCfg{ - Metrics: testMetrics.GetStateMetrics(), - ExternalURL: nil, - InstanceStore: dbstore, - Images: &state.NoopImageService{}, - Clock: clk, - Historian: &state.FakeHistorian{}, + Metrics: testMetrics.GetStateMetrics(), + ExternalURL: nil, + InstanceStore: dbstore, + Images: &state.NoopImageService{}, + Clock: clk, + Historian: &state.FakeHistorian{}, + MaxStateSaveConcurrency: 1, } st := state.NewManager(cfg) st.Warm(ctx, dbstore) @@ -2817,12 +2824,13 @@ func TestResetStateByRuleUID(t *testing.T) { clk := clock.NewMock() clk.Set(time.Now()) cfg := state.ManagerCfg{ - Metrics: testMetrics.GetStateMetrics(), - ExternalURL: nil, - InstanceStore: dbstore, - Images: &state.NoopImageService{}, - Clock: clk, - Historian: fakeHistorian, + Metrics: testMetrics.GetStateMetrics(), + ExternalURL: nil, + InstanceStore: dbstore, + Images: &state.NoopImageService{}, + Clock: clk, + Historian: fakeHistorian, + MaxStateSaveConcurrency: 1, } st := state.NewManager(cfg) st.Warm(ctx, dbstore) diff --git a/pkg/setting/setting_unified_alerting.go b/pkg/setting/setting_unified_alerting.go index 88d3dd63674..c594a76315a 100644 --- a/pkg/setting/setting_unified_alerting.go +++ b/pkg/setting/setting_unified_alerting.go @@ -93,6 +93,8 @@ type UnifiedAlertingSettings struct { Screenshots UnifiedAlertingScreenshotSettings ReservedLabels UnifiedAlertingReservedLabelSettings StateHistory UnifiedAlertingStateHistorySettings + // MaxStateSaveConcurrency controls the number of goroutines (per rule) that can save alert state in parallel. + MaxStateSaveConcurrency int } type UnifiedAlertingScreenshotSettings struct { @@ -352,6 +354,8 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error { } uaCfg.StateHistory = uaCfgStateHistory + uaCfg.MaxStateSaveConcurrency = ua.Key("max_state_save_concurrency").MustInt(1) + cfg.UnifiedAlerting = uaCfg return nil }