Chore: capitalise messages for alerting (#74335)

Author: Serge Zaitsev
Date: 2023-09-04 18:46:34 +02:00
Committed by: GitHub
Parent: bd12ce0cbc
Commit: 58f6648505
26 changed files with 110 additions and 110 deletions
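The change is mechanical: every structured log message now starts with a capital letter, while the key/value context keys ("error", "org", "key", ...) stay lowercase. Below is a minimal sketch of the convention, using the standard library's log/slog purely for illustration; the hunks that follow use Grafana's own logger, which takes the same message-plus-key/value form.

```go
package main

import (
	"errors"
	"log/slog"
)

func main() {
	err := errors.New("connection refused")
	orgID := int64(1)

	// Before this change: the message itself starts lowercase.
	slog.Error("unable to save and apply alertmanager configuration", "error", err, "org", orgID)

	// After this change: the message is capitalised; the context keys stay lowercase.
	slog.Error("Unable to save and apply alertmanager configuration", "error", err, "org", orgID)
}
```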

View File

@@ -66,10 +66,10 @@ func (moa *MultiOrgAlertmanager) ActivateHistoricalConfiguration(ctx context.Con
}
if err := am.SaveAndApplyConfig(ctx, cfg); err != nil {
moa.logger.Error("unable to save and apply historical alertmanager configuration", "error", err, "org", orgId, "id", id)
moa.logger.Error("Unable to save and apply historical alertmanager configuration", "error", err, "org", orgId, "id", id)
return AlertmanagerConfigRejectedError{err}
}
moa.logger.Info("applied historical alertmanager configuration", "org", orgId, "id", id)
moa.logger.Info("Applied historical alertmanager configuration", "org", orgId, "id", id)
return nil
}
@@ -184,7 +184,7 @@ func (moa *MultiOrgAlertmanager) ApplyAlertmanagerConfiguration(ctx context.Cont
}
if err := am.SaveAndApplyConfig(ctx, &config); err != nil {
moa.logger.Error("unable to save and apply alertmanager configuration", "error", err)
moa.logger.Error("Unable to save and apply alertmanager configuration", "error", err)
return AlertmanagerConfigRejectedError{err}
}

View File

@@ -129,7 +129,7 @@ func (moa *MultiOrgAlertmanager) setupClustering(cfg *setting.Cfg) error {
err = peer.Join(cluster.DefaultReconnectInterval, cluster.DefaultReconnectTimeout)
if err != nil {
moa.logger.Error("msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err)
moa.logger.Error("Msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err)
}
// Attempt to verify the number of peers for 30s every 2s. The risk here is that we send a notification "too soon".
// Which should _never_ happen given we share the notification log via the database so the risk of double notification is very low.
@@ -152,14 +152,14 @@ func (moa *MultiOrgAlertmanager) Run(ctx context.Context) error {
return nil
case <-time.After(moa.settings.UnifiedAlerting.AlertmanagerConfigPollInterval):
if err := moa.LoadAndSyncAlertmanagersForOrgs(ctx); err != nil {
moa.logger.Error("error while synchronizing Alertmanager orgs", "error", err)
moa.logger.Error("Error while synchronizing Alertmanager orgs", "error", err)
}
}
}
}
func (moa *MultiOrgAlertmanager) LoadAndSyncAlertmanagersForOrgs(ctx context.Context) error {
moa.logger.Debug("synchronizing Alertmanagers for orgs")
moa.logger.Debug("Synchronizing Alertmanagers for orgs")
// First, load all the organizations from the database.
orgIDs, err := moa.orgStore.GetOrgs(ctx)
if err != nil {
@@ -170,7 +170,7 @@ func (moa *MultiOrgAlertmanager) LoadAndSyncAlertmanagersForOrgs(ctx context.Con
moa.metrics.DiscoveredConfigurations.Set(float64(len(orgIDs)))
moa.SyncAlertmanagersForOrgs(ctx, orgIDs)
moa.logger.Debug("done synchronizing Alertmanagers for orgs")
moa.logger.Debug("Done synchronizing Alertmanagers for orgs")
return nil
}
@@ -195,13 +195,13 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(ctx context.Context, o
orgsFound := make(map[int64]struct{}, len(orgIDs))
dbConfigs, err := moa.getLatestConfigs(ctx)
if err != nil {
moa.logger.Error("failed to load Alertmanager configurations", "error", err)
moa.logger.Error("Failed to load Alertmanager configurations", "error", err)
return
}
moa.alertmanagersMtx.Lock()
for _, orgID := range orgIDs {
if _, isDisabledOrg := moa.settings.UnifiedAlerting.DisabledOrgs[orgID]; isDisabledOrg {
moa.logger.Debug("skipping syncing Alertmanager for disabled org", "org", orgID)
moa.logger.Debug("Skipping syncing Alertmanager for disabled org", "org", orgID)
continue
}
orgsFound[orgID] = struct{}{}
@@ -215,7 +215,7 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(ctx context.Context, o
m := metrics.NewAlertmanagerMetrics(moa.metrics.GetOrCreateOrgRegistry(orgID))
am, err := newAlertmanager(ctx, orgID, moa.settings, moa.configStore, moa.kvStore, moa.peer, moa.decryptFn, moa.ns, m)
if err != nil {
moa.logger.Error("unable to create Alertmanager for org", "org", orgID, "error", err)
moa.logger.Error("Unable to create Alertmanager for org", "org", orgID, "error", err)
}
moa.alertmanagers[orgID] = am
alertmanager = am
@@ -229,7 +229,7 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(ctx context.Context, o
}
err := alertmanager.SaveAndApplyDefaultConfig(ctx)
if err != nil {
moa.logger.Error("failed to apply the default Alertmanager configuration", "org", orgID)
moa.logger.Error("Failed to apply the default Alertmanager configuration", "org", orgID)
continue
}
moa.alertmanagers[orgID] = alertmanager
@@ -238,7 +238,7 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(ctx context.Context, o
err := alertmanager.ApplyConfig(ctx, dbConfig)
if err != nil {
moa.logger.Error("failed to apply Alertmanager config for org", "org", orgID, "id", dbConfig.ID, "error", err)
moa.logger.Error("Failed to apply Alertmanager config for org", "org", orgID, "id", dbConfig.ID, "error", err)
continue
}
moa.alertmanagers[orgID] = alertmanager
@@ -257,9 +257,9 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(ctx context.Context, o
// Now, we can stop the Alertmanagers without having to hold a lock.
for orgID, am := range amsToStop {
moa.logger.Info("stopping Alertmanager", "org", orgID)
moa.logger.Info("Stopping Alertmanager", "org", orgID)
am.StopAndWait()
moa.logger.Info("stopped Alertmanager", "org", orgID)
moa.logger.Info("Stopped Alertmanager", "org", orgID)
// Cleanup all the remaining resources from this alertmanager.
am.fileStore.CleanUp()
}
@@ -278,22 +278,22 @@ func (moa *MultiOrgAlertmanager) cleanupOrphanLocalOrgState(ctx context.Context,
dataDir := filepath.Join(moa.settings.DataPath, workingDir)
files, err := os.ReadDir(dataDir)
if err != nil {
moa.logger.Error("failed to list local working directory", "dir", dataDir, "error", err)
moa.logger.Error("Failed to list local working directory", "dir", dataDir, "error", err)
return
}
for _, file := range files {
if !file.IsDir() {
moa.logger.Warn("ignoring unexpected file while scanning local working directory", "filename", filepath.Join(dataDir, file.Name()))
moa.logger.Warn("Ignoring unexpected file while scanning local working directory", "filename", filepath.Join(dataDir, file.Name()))
continue
}
orgID, err := strconv.ParseInt(file.Name(), 10, 64)
if err != nil {
moa.logger.Error("unable to parse orgID from directory name", "name", file.Name(), "error", err)
moa.logger.Error("Unable to parse orgID from directory name", "name", file.Name(), "error", err)
continue
}
_, exists := activeOrganizations[orgID]
if !exists {
moa.logger.Info("found orphan organization directory", "orgID", orgID)
moa.logger.Info("Found orphan organization directory", "orgID", orgID)
workingDirPath := filepath.Join(dataDir, strconv.FormatInt(orgID, 10))
fileStore := NewFileStore(orgID, moa.kvStore, workingDirPath)
// Cleanup all the remaining resources from this alertmanager.
@@ -307,7 +307,7 @@ func (moa *MultiOrgAlertmanager) cleanupOrphanLocalOrgState(ctx context.Context,
for _, fileName := range storedFiles {
keys, err := moa.kvStore.Keys(ctx, kvstore.AllOrganizations, KVNamespace, fileName)
if err != nil {
moa.logger.Error("failed to fetch items from kvstore", "error", err,
moa.logger.Error("Failed to fetch items from kvstore", "error", err,
"namespace", KVNamespace, "key", fileName)
}
for _, key := range keys {
@@ -316,7 +316,7 @@ func (moa *MultiOrgAlertmanager) cleanupOrphanLocalOrgState(ctx context.Context,
}
err = moa.kvStore.Del(ctx, key.OrgId, key.Namespace, key.Key)
if err != nil {
moa.logger.Error("failed to delete item from kvstore", "error", err,
moa.logger.Error("Failed to delete item from kvstore", "error", err,
"orgID", key.OrgId, "namespace", KVNamespace, "key", key.Key)
}
}
@@ -335,7 +335,7 @@ func (moa *MultiOrgAlertmanager) StopAndWait() {
if ok {
moa.settleCancel()
if err := p.Leave(10 * time.Second); err != nil {
moa.logger.Warn("unable to leave the gossip mesh", "error", err)
moa.logger.Warn("Unable to leave the gossip mesh", "error", err)
}
}
r, ok := moa.peer.(*redisPeer)

View File

@@ -40,7 +40,7 @@ func (c *RedisChannel) handleMessages() {
// The state will eventually be propagated to other members by the full sync.
if pub.Err() != nil {
c.p.messagesPublishFailures.WithLabelValues(c.msgType, reasonRedisIssue).Inc()
c.p.logger.Error("error publishing a message to redis", "err", pub.Err(), "channel", c.channel)
c.p.logger.Error("Error publishing a message to redis", "err", pub.Err(), "channel", c.channel)
continue
}
c.p.messagesSent.WithLabelValues(c.msgType).Inc()
@@ -52,7 +52,7 @@ func (c *RedisChannel) handleMessages() {
func (c *RedisChannel) Broadcast(b []byte) {
b, err := proto.Marshal(&clusterpb.Part{Key: c.key, Data: b})
if err != nil {
c.p.logger.Error("error marshalling broadcast into proto", "err", err, "channel", c.channel)
c.p.logger.Error("Error marshalling broadcast into proto", "err", err, "channel", c.channel)
return
}
select {
@@ -60,6 +60,6 @@ func (c *RedisChannel) Broadcast(b []byte) {
default:
// This is not the end of the world, we will catch up when we do a full state sync.
c.p.messagesPublishFailures.WithLabelValues(c.msgType, reasonBufferOverflow).Inc()
c.p.logger.Warn("buffer full, droping message", "channel", c.channel)
c.p.logger.Warn("Buffer full, droping message", "channel", c.channel)
}
}

View File

@@ -222,7 +222,7 @@ func (p *redisPeer) heartbeatLoop() {
reqDur := time.Since(startTime)
if cmd.Err() != nil {
p.nodePingFailures.Inc()
p.logger.Error("error setting the heartbeat key", "err", cmd.Err(), "peer", p.withPrefix(p.name))
p.logger.Error("Error setting the heartbeat key", "err", cmd.Err(), "peer", p.withPrefix(p.name))
continue
}
p.nodePingDuration.WithLabelValues(redisServerLabel).Observe(reqDur.Seconds())
@@ -250,7 +250,7 @@ func (p *redisPeer) membersSync() {
startTime := time.Now()
members, err := p.membersScan()
if err != nil {
p.logger.Error("error getting keys from redis", "err", err, "pattern", p.withPrefix(peerPattern))
p.logger.Error("Error getting keys from redis", "err", err, "pattern", p.withPrefix(peerPattern))
// To prevent a spike of duplicate messages, we return for the duration of
// membersValidFor the last known members and only empty the list if we do
// not eventually recover.
@@ -260,7 +260,7 @@ func (p *redisPeer) membersSync() {
p.membersMtx.Unlock()
return
}
p.logger.Warn("fetching members from redis failed, falling back to last known members", "last_known", p.members)
p.logger.Warn("Fetching members from redis failed, falling back to last known members", "last_known", p.members)
return
}
// This might happen on startup, when no value is in the store yet.
@@ -272,7 +272,7 @@ func (p *redisPeer) membersSync() {
}
values := p.redis.MGet(context.Background(), members...)
if values.Err() != nil {
p.logger.Error("error getting values from redis", "err", values.Err(), "keys", members)
p.logger.Error("Error getting values from redis", "err", values.Err(), "keys", members)
}
// After getting the list of possible members from redis, we filter
// those out that have failed to send a heartbeat during the heartbeatTimeout.
@@ -284,7 +284,7 @@ func (p *redisPeer) membersSync() {
peers = slices.Compact(peers)
dur := time.Since(startTime)
p.logger.Debug("membership sync done", "duration_ms", dur.Milliseconds())
p.logger.Debug("Membership sync done", "duration_ms", dur.Milliseconds())
p.membersMtx.Lock()
p.members = peers
p.membersMtx.Unlock()
@@ -326,7 +326,7 @@ func (p *redisPeer) filterUnhealthyMembers(members []string, values []any) []str
}
ts, err := strconv.ParseInt(val.(string), 10, 64)
if err != nil {
p.logger.Error("error parsing timestamp value", "err", err, "peer", peer, "val", val)
p.logger.Error("Error parsing timestamp value", "err", err, "peer", peer, "val", val)
continue
}
tm := time.Unix(ts, 0)
@@ -341,11 +341,11 @@ func (p *redisPeer) filterUnhealthyMembers(members []string, values []any) []str
func (p *redisPeer) Position() int {
for i, peer := range p.Members() {
if peer == p.withPrefix(p.name) {
p.logger.Debug("cluster position found", "name", p.name, "position", i)
p.logger.Debug("Cluster position found", "name", p.name, "position", i)
return i
}
}
p.logger.Warn("failed to look up position, falling back to position 0")
p.logger.Warn("Failed to look up position, falling back to position 0")
return 0
}
@@ -354,7 +354,7 @@ func (p *redisPeer) Position() int {
func (p *redisPeer) ClusterSize() int {
members, err := p.membersScan()
if err != nil {
p.logger.Error("error getting keys from redis", "err", err, "pattern", p.withPrefix(peerPattern))
p.logger.Error("Error getting keys from redis", "err", err, "pattern", p.withPrefix(peerPattern))
return 0
}
return len(members)
@@ -400,7 +400,7 @@ func (p *redisPeer) Settle(ctx context.Context, interval time.Duration) {
select {
case <-ctx.Done():
elapsed := time.Since(start)
p.logger.Info("gossip not settled but continuing anyway", "polls", totalPolls, "elapsed", elapsed)
p.logger.Info("Gossip not settled but continuing anyway", "polls", totalPolls, "elapsed", elapsed)
close(p.readyc)
return
case <-time.After(interval):
@@ -408,15 +408,15 @@ func (p *redisPeer) Settle(ctx context.Context, interval time.Duration) {
elapsed := time.Since(start)
n := len(p.Members())
if nOkay >= NumOkayRequired {
p.logger.Info("gossip settled; proceeding", "elapsed", elapsed)
p.logger.Info("Gossip settled; proceeding", "elapsed", elapsed)
break
}
if n == nPeers {
nOkay++
p.logger.Debug("gossip looks settled", "elapsed", elapsed)
p.logger.Debug("Gossip looks settled", "elapsed", elapsed)
} else {
nOkay = 0
p.logger.Info("gossip not settled", "polls", totalPolls, "before", nPeers, "now", n, "elapsed", elapsed)
p.logger.Info("Gossip not settled", "polls", totalPolls, "before", nPeers, "now", n, "elapsed", elapsed)
}
nPeers = n
totalPolls++
@@ -455,7 +455,7 @@ func (p *redisPeer) mergePartialState(buf []byte) {
var part clusterpb.Part
if err := proto.Unmarshal(buf, &part); err != nil {
p.logger.Warn("error decoding the received broadcast message", "err", err)
p.logger.Warn("Error decoding the received broadcast message", "err", err)
return
}
@@ -467,10 +467,10 @@ func (p *redisPeer) mergePartialState(buf []byte) {
return
}
if err := s.Merge(part.Data); err != nil {
p.logger.Warn("error merging the received broadcast message", "err", err, "key", part.Key)
p.logger.Warn("Error merging the received broadcast message", "err", err, "key", part.Key)
return
}
p.logger.Debug("partial state was successfully merged", "key", part.Key)
p.logger.Debug("Partial state was successfully merged", "key", part.Key)
}
func (p *redisPeer) fullStateReqReceiveLoop() {
@@ -512,7 +512,7 @@ func (p *redisPeer) mergeFullState(buf []byte) {
var fs clusterpb.FullState
if err := proto.Unmarshal(buf, &fs); err != nil {
p.logger.Warn("error unmarshaling the received remote state", "err", err)
p.logger.Warn("Error unmarshaling the received remote state", "err", err)
return
}
@@ -521,22 +521,22 @@ func (p *redisPeer) mergeFullState(buf []byte) {
for _, part := range fs.Parts {
s, ok := p.states[part.Key]
if !ok {
p.logger.Warn("received", "unknown state key", "len", len(buf), "key", part.Key)
p.logger.Warn("Received", "unknown state key", "len", len(buf), "key", part.Key)
continue
}
if err := s.Merge(part.Data); err != nil {
p.logger.Warn("error merging the received remote state", "err", err, "key", part.Key)
p.logger.Warn("Error merging the received remote state", "err", err, "key", part.Key)
return
}
}
p.logger.Debug("full state was successfully merged")
p.logger.Debug("Full state was successfully merged")
}
func (p *redisPeer) fullStateSyncPublish() {
pub := p.redis.Publish(context.Background(), p.withPrefix(fullStateChannel), p.LocalState())
if pub.Err() != nil {
p.messagesPublishFailures.WithLabelValues(fullState, reasonRedisIssue).Inc()
p.logger.Error("error publishing a message to redis", "err", pub.Err(), "channel", p.withPrefix(fullStateChannel))
p.logger.Error("Error publishing a message to redis", "err", pub.Err(), "channel", p.withPrefix(fullStateChannel))
}
}
@@ -557,7 +557,7 @@ func (p *redisPeer) requestFullState() {
pub := p.redis.Publish(context.Background(), p.withPrefix(fullStateChannelReq), p.name)
if pub.Err() != nil {
p.messagesPublishFailures.WithLabelValues(fullState, reasonRedisIssue).Inc()
p.logger.Error("error publishing a message to redis", "err", pub.Err(), "channel", p.withPrefix(fullStateChannelReq))
p.logger.Error("Error publishing a message to redis", "err", pub.Err(), "channel", p.withPrefix(fullStateChannelReq))
}
}
@@ -571,13 +571,13 @@ func (p *redisPeer) LocalState() []byte {
for key, s := range p.states {
b, err := s.MarshalBinary()
if err != nil {
p.logger.Warn("error encoding the local state", "err", err, "key", key)
p.logger.Warn("Error encoding the local state", "err", err, "key", key)
}
all.Parts = append(all.Parts, clusterpb.Part{Key: key, Data: b})
}
b, err := proto.Marshal(all)
if err != nil {
p.logger.Warn("error encoding the local state to proto", "err", err)
p.logger.Warn("Error encoding the local state to proto", "err", err)
}
p.messagesSent.WithLabelValues(fullState).Inc()
p.messagesSentSize.WithLabelValues(fullState).Add(float64(len(b)))
@@ -592,6 +592,6 @@ func (p *redisPeer) Shutdown() {
defer cancel()
del := p.redis.Del(ctx, p.withPrefix(p.name))
if del.Err() != nil {
p.logger.Error("error deleting the redis key on shutdown", "err", del.Err(), "key", p.withPrefix(p.name))
p.logger.Error("Error deleting the redis key on shutdown", "err", del.Err(), "key", p.withPrefix(p.name))
}
}

View File

@@ -15,7 +15,7 @@ func (am *Alertmanager) GetStatus() apimodels.GettableStatus {
}
if err := json.Unmarshal(status, config); err != nil {
am.logger.Error("unable to unmarshall alertmanager config", "Err", err)
am.logger.Error("Unable to unmarshall alertmanager config", "Err", err)
}
return *apimodels.NewGettableStatus(&config.AlertmanagerConfig)
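
The commit itself adds no check that keeps future log messages capitalised. A hypothetical guard, sketched below, greps Go sources for logger calls whose message literal starts with a lowercase letter; the scanned directory, the method set, and the regex are assumptions for illustration and are not part of this change.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strings"
)

// Matches calls like logger.Error("unable to ...") whose message literal
// starts with a lowercase letter. Method names here are an assumption.
var lowercaseMsg = regexp.MustCompile(`\.(Debug|Info|Warn|Error)\("[a-z]`)

func main() {
	root := "pkg/services/ngalert" // hypothetical directory to scan
	_ = filepath.WalkDir(root, func(path string, d os.DirEntry, err error) error {
		if err != nil || d.IsDir() || !strings.HasSuffix(path, ".go") {
			return nil
		}
		src, readErr := os.ReadFile(path)
		if readErr != nil {
			return nil
		}
		// Report every line whose log message starts lowercase.
		for i, line := range strings.Split(string(src), "\n") {
			if lowercaseMsg.MatchString(line) {
				fmt.Printf("%s:%d: log message should start with a capital letter\n", path, i+1)
			}
		}
		return nil
	})
}
```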