package notifier

import (
	"context"
	"slices"
	"sort"
	"strconv"
	"sync"
	"time"

	"github.com/gogo/protobuf/proto"
	"github.com/google/uuid"
	alertingCluster "github.com/grafana/alerting/cluster"
	alertingClusterPB "github.com/grafana/alerting/cluster/clusterpb"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/redis/go-redis/v9"

	"github.com/grafana/grafana/pkg/infra/log"
)

type redisConfig struct {
	addr     string
	username string
	password string
	db       int
	name     string
	prefix   string
	maxConns int
}

const (
	peerPattern             = "*"
	fullState               = "full_state"
	fullStateChannel        = fullState
	fullStateChannelReq     = fullStateChannel + ":request"
	update                  = "update"
	redisServerLabel        = "redis-server"
	networkRetryIntervalMin = time.Millisecond * 100
	networkRetryIntervalMax = time.Second * 10
	membersSyncInterval     = time.Second * 5
	waitForMsgIdle          = time.Millisecond * 100
	reasonBufferOverflow    = "buffer_overflow"
	reasonRedisIssue        = "redis_issue"
	heartbeatInterval       = time.Second * 5
	heartbeatTimeout        = time.Minute
	defaultPoolSize         = 5
	// The duration for which we keep returning the last known members if the network is down.
	membersValidFor = time.Minute
)

type redisPeer struct {
	name      string
	redis     *redis.Client
	prefix    string
	logger    log.Logger
	states    map[string]alertingCluster.State
	subs      map[string]*redis.PubSub
	statesMtx sync.RWMutex

	readyc    chan struct{}
	shutdownc chan struct{}

	pushPullInterval time.Duration

	messagesReceived        *prometheus.CounterVec
	messagesReceivedSize    *prometheus.CounterVec
	messagesSent            *prometheus.CounterVec
	messagesSentSize        *prometheus.CounterVec
	messagesPublishFailures *prometheus.CounterVec
	nodePingDuration        *prometheus.HistogramVec
	nodePingFailures        prometheus.Counter

	// List of active members of the cluster. Should be accessed through the Members function.
	members    []string
	membersMtx sync.Mutex
	// The last time the members were successfully fetched from redis.
	membersFetchedAt time.Time
}

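// newRedisPeer creates a redis-backed cluster peer: it pings redis, subscribes
// to the full state channels, and starts the background heartbeat and sync loops.
//
// A minimal usage sketch (the concrete values below are illustrative assumptions,
// not taken from this file):
//
//	cfg := redisConfig{addr: "localhost:6379", prefix: "alertmanager"}
//	peer, err := newRedisPeer(cfg, logger, prometheus.NewRegistry(), time.Minute)
//	if err != nil {
//		return err
//	}
//	go peer.Settle(ctx, time.Second)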
func newRedisPeer(cfg redisConfig, logger log.Logger, reg prometheus.Registerer,
	pushPullInterval time.Duration) (*redisPeer, error) {
	name := "peer-" + uuid.New().String()
	// If a specific name is provided, overwrite the default one.
	if cfg.name != "" {
		name = cfg.name
	}

	// Allow zero through, since it'll fall back to go-redis's default.
	poolSize := defaultPoolSize
	if cfg.maxConns >= 0 {
		poolSize = cfg.maxConns
	}

	rdb := redis.NewClient(&redis.Options{
		Addr:     cfg.addr,
		Username: cfg.username,
		Password: cfg.password,
		DB:       cfg.db,
		PoolSize: poolSize,
	})
	cmd := rdb.Ping(context.Background())
	if cmd.Err() != nil {
		logger.Error("Failed to ping redis - redis-based alertmanager clustering may not be available", "err", cmd.Err())
	}

	// Make sure that the prefix uses a colon at the end as delimiter.
	if cfg.prefix != "" && cfg.prefix[len(cfg.prefix)-1] != ':' {
		cfg.prefix = cfg.prefix + ":"
	}

	p := &redisPeer{
		name:             name,
		redis:            rdb,
		logger:           logger,
		states:           map[string]alertingCluster.State{},
		subs:             map[string]*redis.PubSub{},
		pushPullInterval: pushPullInterval,
		readyc:           make(chan struct{}),
		shutdownc:        make(chan struct{}),
		prefix:           cfg.prefix,
		members:          make([]string, 0),
	}

	// The metrics for the redis peer are exactly the same as for the official
	// upstream Memberlist implementation. Three metrics that don't make sense
	// for redis are not available: messagesPruned, messagesQueued, nodeAlive.
	messagesReceived := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "alertmanager_cluster_messages_received_total",
		Help: "Total number of cluster messages received.",
	}, []string{"msg_type"})
	messagesReceivedSize := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "alertmanager_cluster_messages_received_size_total",
		Help: "Total size of cluster messages received.",
	}, []string{"msg_type"})
	messagesSent := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "alertmanager_cluster_messages_sent_total",
		Help: "Total number of cluster messages sent.",
	}, []string{"msg_type"})
	messagesSentSize := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "alertmanager_cluster_messages_sent_size_total",
		Help: "Total size of cluster messages sent.",
	}, []string{"msg_type"})
	messagesPublishFailures := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "alertmanager_cluster_messages_publish_failures_total",
		Help: "Total number of messages that failed to be published.",
	}, []string{"msg_type", "reason"})
	gossipClusterMembers := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
		Name: "alertmanager_cluster_members",
		Help: "Number indicating current number of members in cluster.",
	}, func() float64 {
		return float64(p.ClusterSize())
	})
	peerPosition := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
		Name: "alertmanager_peer_position",
		Help: "Position the Alertmanager instance believes it's in. The position determines a peer's behavior in the cluster.",
	}, func() float64 {
		return float64(p.Position())
	})
	healthScore := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
		Name: "alertmanager_cluster_health_score",
		Help: "Health score of the cluster. Lower values are better and zero means 'totally healthy'.",
	}, func() float64 {
		return float64(p.GetHealthScore())
	})
	nodePingDuration := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "alertmanager_cluster_pings_seconds",
		Help:    "Histogram of latencies for ping messages.",
		Buckets: []float64{.005, .01, .025, .05, .1, .25, .5},
	}, []string{"peer"},
	)
	nodePingFailures := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "alertmanager_cluster_pings_failures_total",
		Help: "Total number of failed pings.",
	})

	messagesReceived.WithLabelValues(fullState)
	messagesReceivedSize.WithLabelValues(fullState)
	messagesReceived.WithLabelValues(update)
	messagesReceivedSize.WithLabelValues(update)
	messagesSent.WithLabelValues(fullState)
	messagesSentSize.WithLabelValues(fullState)
	messagesSent.WithLabelValues(update)
	messagesSentSize.WithLabelValues(update)
	messagesPublishFailures.WithLabelValues(fullState, reasonRedisIssue)
	messagesPublishFailures.WithLabelValues(update, reasonRedisIssue)
	messagesPublishFailures.WithLabelValues(update, reasonBufferOverflow)

	reg.MustRegister(messagesReceived, messagesReceivedSize, messagesSent, messagesSentSize,
		gossipClusterMembers, peerPosition, healthScore, nodePingDuration, nodePingFailures,
		messagesPublishFailures,
	)

	p.messagesReceived = messagesReceived
	p.messagesReceivedSize = messagesReceivedSize
	p.messagesSent = messagesSent
	p.messagesSentSize = messagesSentSize
	p.messagesPublishFailures = messagesPublishFailures
	p.nodePingDuration = nodePingDuration
	p.nodePingFailures = nodePingFailures

	p.subs[fullStateChannel] = p.redis.Subscribe(context.Background(), p.withPrefix(fullStateChannel))
	p.subs[fullStateChannelReq] = p.redis.Subscribe(context.Background(), p.withPrefix(fullStateChannelReq))

	go p.heartbeatLoop()
	go p.membersSyncLoop()
	go p.fullStateSyncPublishLoop()
	go p.fullStateSyncReceiveLoop()
	go p.fullStateReqReceiveLoop()

	return p, nil
}

func (p *redisPeer) withPrefix(str string) string {
	return p.prefix + str
}

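// heartbeatLoop periodically writes this peer's heartbeat key to redis with a
// five-minute expiry so other peers can discover it, and records ping metrics.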
func (p *redisPeer) heartbeatLoop() {
	ticker := time.NewTicker(heartbeatInterval)
	for {
		select {
		case <-ticker.C:
			startTime := time.Now()
			cmd := p.redis.Set(context.Background(), p.withPrefix(p.name), time.Now().Unix(), time.Minute*5)
			reqDur := time.Since(startTime)
			if cmd.Err() != nil {
				p.nodePingFailures.Inc()
				p.logger.Error("Error setting the heartbeat key", "err", cmd.Err(), "peer", p.withPrefix(p.name))
				continue
			}
			p.nodePingDuration.WithLabelValues(redisServerLabel).Observe(reqDur.Seconds())
		case <-p.shutdownc:
			ticker.Stop()
			return
		}
	}
}

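// membersSyncLoop refreshes the cached member list from redis every membersSyncInterval.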
func (p *redisPeer) membersSyncLoop() {
	ticker := time.NewTicker(membersSyncInterval)
	for {
		select {
		case <-ticker.C:
			p.membersSync()
		case <-p.shutdownc:
			ticker.Stop()
			return
		}
	}
}

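// membersSync scans redis for peer keys, drops peers whose heartbeat is older
// than heartbeatTimeout, and stores the sorted, de-duplicated result in p.members.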
func (p *redisPeer) membersSync() {
	startTime := time.Now()
	members, err := p.membersScan()
	if err != nil {
		p.logger.Error("Error getting keys from redis", "err", err, "pattern", p.withPrefix(peerPattern))
		// To prevent a spike of duplicate messages, we keep returning the last
		// known members for the duration of membersValidFor and only empty the
		// list if we do not recover in time.
		if p.membersFetchedAt.Before(time.Now().Add(-membersValidFor)) {
			p.membersMtx.Lock()
			p.members = []string{}
			p.membersMtx.Unlock()
			return
		}
		p.logger.Warn("Fetching members from redis failed, falling back to last known members", "last_known", p.members)
		return
	}
	// This might happen on startup, when no value is in the store yet.
	if len(members) == 0 {
		p.membersMtx.Lock()
		p.members = []string{}
		p.membersMtx.Unlock()
		return
	}
	values := p.redis.MGet(context.Background(), members...)
	if values.Err() != nil {
		p.logger.Error("Error getting values from redis", "err", values.Err(), "keys", members)
	}
	// After getting the list of possible members from redis, we filter out
	// those that have failed to send a heartbeat within the heartbeatTimeout.
	peers := p.filterUnhealthyMembers(members, values.Val())
	sort.Strings(peers)

	// Redis SCAN may return duplicate elements. We filter duplicates with
	// Compact after sorting to prevent inconsistencies when calculating Position.
	peers = slices.Compact(peers)

	dur := time.Since(startTime)
	p.logger.Debug("Membership sync done", "duration_ms", dur.Milliseconds())
	p.membersMtx.Lock()
	p.members = peers
	p.membersMtx.Unlock()
	p.membersFetchedAt = time.Now()
}

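// membersScan iterates the SCAN cursor until it returns to 0, collecting every
// key that matches the peer pattern under the configured prefix.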
func (p *redisPeer) membersScan() ([]string, error) {
	var (
		cursor  uint64
		err     error
		members = []string{}
		keys    []string
	)
	// The 100 is a hint for the server for how many records there might be for
	// the provided pattern. A single call _might_ only return the first 100
	// records, which should be more than enough for our use case.
	// More here: https://redis.io/commands/scan/
	for {
		keys, cursor, err = p.redis.Scan(context.Background(), cursor, p.withPrefix(peerPattern), 100).Result()
		if err != nil {
			return []string{}, err
		}
		members = append(members, keys...)
		if cursor == 0 {
			break
		}
	}
	return members, nil
}

// filterUnhealthyMembers will filter out the members that have failed to send
// a heartbeat within the heartbeatTimeout.
func (p *redisPeer) filterUnhealthyMembers(members []string, values []any) []string {
	peers := []string{}
	for i, peer := range members {
		val := values[i]
		if val == nil {
			continue
		}
		ts, err := strconv.ParseInt(val.(string), 10, 64)
		if err != nil {
			p.logger.Error("Error parsing timestamp value", "err", err, "peer", peer, "val", val)
			continue
		}
		tm := time.Unix(ts, 0)
		if tm.Before(time.Now().Add(-heartbeatTimeout)) {
			continue
		}
		peers = append(peers, peer)
	}
	return peers
}

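// Position returns this peer's index in the sorted member list; the position
// determines a peer's behavior in the cluster. It falls back to 0 if the peer
// cannot find itself among the members.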
func (p *redisPeer) Position() int {
	for i, peer := range p.Members() {
		if peer == p.withPrefix(p.name) {
			p.logger.Debug("Cluster position found", "name", p.name, "position", i)
			return i
		}
	}
	p.logger.Warn("Failed to look up position, falling back to position 0")
	return 0
}

// ClusterSize returns the known size of the cluster. This also includes dead
// nodes that haven't timed out yet.
func (p *redisPeer) ClusterSize() int {
	members, err := p.membersScan()
	if err != nil {
		p.logger.Error("Error getting keys from redis", "err", err, "pattern", p.withPrefix(peerPattern))
		return 0
	}
	return len(members)
}

// GetHealthScore returns 0 if the cluster is healthy, otherwise the number of
// unhealthy nodes.
func (p *redisPeer) GetHealthScore() int {
	size := p.ClusterSize()
	members := len(p.Members())
	if size > members {
		return size - members
	}
	return 0
}

// Members returns a list of active cluster members.
func (p *redisPeer) Members() []string {
	p.membersMtx.Lock()
	defer p.membersMtx.Unlock()
	return p.members
}

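// WaitReady blocks until the peer has settled (Settle closes readyc) or the
// context is cancelled.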
func (p *redisPeer) WaitReady(ctx context.Context) error {
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-p.readyc:
		return nil
	}
}

// Settle is mostly copied from upstream.
// Ref: https://github.com/prometheus/alertmanager/blob/2888649b473970400c0bd375fdd563486dc80481/cluster/cluster.go#L674-L712
func (p *redisPeer) Settle(ctx context.Context, interval time.Duration) {
	const NumOkayRequired = 3
	p.logger.Info("Waiting for gossip to settle...", "interval", interval)
	start := time.Now()
	nPeers := 0
	nOkay := 0
	totalPolls := 0
	for {
		select {
		case <-ctx.Done():
			elapsed := time.Since(start)
			p.logger.Info("Gossip not settled but continuing anyway", "polls", totalPolls, "elapsed", elapsed)
			close(p.readyc)
			return
		case <-time.After(interval):
		}
		elapsed := time.Since(start)
		n := len(p.Members())
		if nOkay >= NumOkayRequired {
			p.logger.Info("Gossip settled; proceeding", "elapsed", elapsed)
			break
		}
		if n == nPeers {
			nOkay++
			p.logger.Debug("Gossip looks settled", "elapsed", elapsed)
		} else {
			nOkay = 0
			p.logger.Info("Gossip not settled", "polls", totalPolls, "before", nPeers, "now", n, "elapsed", elapsed)
		}
		nPeers = n
		totalPolls++
	}
	p.requestFullState()
	close(p.readyc)
}

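// AddState registers a state under the given key, subscribes to that key's
// channel so updates from other peers are merged, and returns a ClusterChannel
// for broadcasting local updates.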
func (p *redisPeer) AddState(key string, state alertingCluster.State, _ prometheus.Registerer) alertingCluster.ClusterChannel {
	p.statesMtx.Lock()
	defer p.statesMtx.Unlock()
	p.states[key] = state
	// As we also want to get the state from other nodes, we subscribe to the key.
	sub := p.redis.Subscribe(context.Background(), p.withPrefix(key))
	go p.receiveLoop(key, sub)
	p.subs[key] = sub
	return newRedisChannel(p, key, p.withPrefix(key), update)
}

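// receiveLoop merges partial state updates received on the given pub/sub
// channel until the peer shuts down.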
func (p *redisPeer) receiveLoop(name string, channel *redis.PubSub) {
	for {
		select {
		case <-p.shutdownc:
			return
		case data := <-channel.Channel():
			p.mergePartialState([]byte(data.Payload))
		default:
			time.Sleep(waitForMsgIdle)
		}
	}
}

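// mergePartialState decodes a single Part message and merges it into the
// matching registered state, if one exists.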
func (p *redisPeer) mergePartialState(buf []byte) {
	p.messagesReceived.WithLabelValues(update).Inc()
	p.messagesReceivedSize.WithLabelValues(update).Add(float64(len(buf)))

	var part alertingClusterPB.Part
	if err := proto.Unmarshal(buf, &part); err != nil {
		p.logger.Warn("Error decoding the received broadcast message", "err", err)
		return
	}

	p.statesMtx.RLock()
	s, ok := p.states[part.Key]
	p.statesMtx.RUnlock()

	if !ok {
		return
	}
	if err := s.Merge(part.Data); err != nil {
		p.logger.Warn("Error merging the received broadcast message", "err", err, "key", part.Key)
		return
	}
	p.logger.Debug("Partial state was successfully merged", "key", part.Key)
}

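// fullStateReqReceiveLoop answers full state requests from other peers by
// publishing this peer's full state, ignoring requests we sent ourselves.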
func (p *redisPeer) fullStateReqReceiveLoop() {
	for {
		select {
		case <-p.shutdownc:
			return
		case data := <-p.subs[fullStateChannelReq].Channel():
			// The payload of a full state request is the name of the peer that is
			// requesting the full state. In case we received our own request, we
			// can just ignore it. Redis pub/sub fans out to all subscribers,
			// regardless of whether a client was also the publisher.
			if data.Payload == p.name {
				continue
			}
			p.fullStateSyncPublish()
		default:
			time.Sleep(waitForMsgIdle)
		}
	}
}

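// fullStateSyncReceiveLoop merges full state snapshots published by other peers
// until the peer shuts down.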
func (p *redisPeer) fullStateSyncReceiveLoop() {
	for {
		select {
		case <-p.shutdownc:
			return
		case data := <-p.subs[fullStateChannel].Channel():
			p.mergeFullState([]byte(data.Payload))
		default:
			time.Sleep(waitForMsgIdle)
		}
	}
}

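// mergeFullState decodes a FullState message and merges every contained part
// into the matching registered states.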
func (p *redisPeer) mergeFullState(buf []byte) {
	p.messagesReceived.WithLabelValues(fullState).Inc()
	p.messagesReceivedSize.WithLabelValues(fullState).Add(float64(len(buf)))

	var fs alertingClusterPB.FullState
	if err := proto.Unmarshal(buf, &fs); err != nil {
		p.logger.Warn("Error unmarshaling the received remote state", "err", err)
		return
	}

	p.statesMtx.RLock()
	defer p.statesMtx.RUnlock()
	for _, part := range fs.Parts {
		s, ok := p.states[part.Key]
		if !ok {
			p.logger.Warn("Received unknown state key", "len", len(buf), "key", part.Key)
			continue
		}
		if err := s.Merge(part.Data); err != nil {
			p.logger.Warn("Error merging the received remote state", "err", err, "key", part.Key)
			return
		}
	}
	p.logger.Debug("Full state was successfully merged")
}

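// fullStateSyncPublish publishes the serialized local full state on the full
// state channel.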
func (p *redisPeer) fullStateSyncPublish() {
	pub := p.redis.Publish(context.Background(), p.withPrefix(fullStateChannel), p.LocalState())
	if pub.Err() != nil {
		p.messagesPublishFailures.WithLabelValues(fullState, reasonRedisIssue).Inc()
		p.logger.Error("Error publishing a message to redis", "err", pub.Err(), "channel", p.withPrefix(fullStateChannel))
	}
}

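// fullStateSyncPublishLoop publishes the full local state every
// pushPullInterval until the peer shuts down.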
func (p *redisPeer) fullStateSyncPublishLoop() {
	ticker := time.NewTicker(p.pushPullInterval)
	for {
		select {
		case <-ticker.C:
			p.fullStateSyncPublish()
		case <-p.shutdownc:
			ticker.Stop()
			return
		}
	}
}

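// requestFullState asks the other peers to publish their full state by sending
// this peer's name on the request channel.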
func (p *redisPeer) requestFullState() {
	pub := p.redis.Publish(context.Background(), p.withPrefix(fullStateChannelReq), p.name)
	if pub.Err() != nil {
		p.messagesPublishFailures.WithLabelValues(fullState, reasonRedisIssue).Inc()
		p.logger.Error("Error publishing a message to redis", "err", pub.Err(), "channel", p.withPrefix(fullStateChannelReq))
	}
}

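// LocalState marshals all registered states into a single FullState protobuf
// message and returns its serialized form.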
func (p *redisPeer) LocalState() []byte {
	p.statesMtx.RLock()
	defer p.statesMtx.RUnlock()
	all := &alertingClusterPB.FullState{
		Parts: make([]alertingClusterPB.Part, 0, len(p.states)),
	}

	for key, s := range p.states {
		b, err := s.MarshalBinary()
		if err != nil {
			p.logger.Warn("Error encoding the local state", "err", err, "key", key)
		}
		all.Parts = append(all.Parts, alertingClusterPB.Part{Key: key, Data: b})
	}
	b, err := proto.Marshal(all)
	if err != nil {
		p.logger.Warn("Error encoding the local state to proto", "err", err)
	}
	p.messagesSent.WithLabelValues(fullState).Inc()
	p.messagesSentSize.WithLabelValues(fullState).Add(float64(len(b)))
	return b
}

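// Shutdown stops the background loops, publishes the local state one last time,
// and removes this peer's heartbeat key from redis.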
func (p *redisPeer) Shutdown() {
	p.logger.Info("Stopping redis peer...")
	close(p.shutdownc)
	p.fullStateSyncPublish()
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
	defer cancel()
	del := p.redis.Del(ctx, p.withPrefix(p.name))
	if del.Err() != nil {
		p.logger.Error("Error deleting the redis key on shutdown", "err", del.Err(), "key", p.withPrefix(p.name))
	}
}