DEV: Add support for custom retries for scheduled admin checks (#24224)

We updated scheduled admin checks to run concurrently in their own jobs. The main reason for this was so that we can implement re-check functionality for especially flaky checks (e.g. group e-mail credentials check.)

This works in the following way:

1. The check declares its retry policy using class methods.
2. A block can be yielded to if there are problems, but before they are committed to Redis.
3. The job uses this block to either a) schedule a retry if there are any remaining or b) do nothing and let the check commit.
This commit is contained in:
Ted Johansson 2023-11-06 08:57:02 +08:00 committed by GitHub
parent d5e4b85e82
commit c3708c4276
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 73 additions and 10 deletions

View File

@ -1,15 +1,29 @@
# frozen_string_literal: true
module Jobs
class RetrySignal < Exception
end
# This job runs a singular scheduled admin check. It is scheduled by
# the ProblemChecks (plural) scheduled job.
class ProblemCheck < ::Jobs::Base
sidekiq_options retry: false
def execute(args)
identifier = args[:check_identifier]
retry_count = args[:retry_count].to_i
identifier = args[:check_identifier].to_sym
AdminDashboardData.execute_scheduled_check(identifier.to_sym)
check = AdminDashboardData.problem_scheduled_check_klasses[identifier]
AdminDashboardData.execute_scheduled_check(identifier) do |problems|
raise RetrySignal if retry_count < check.max_retries
end
rescue RetrySignal
Jobs.enqueue_in(
check.retry_wait,
:problem_check,
args.merge(retry_count: retry_count + 1).stringify_keys,
)
end
end
end

View File

@ -3,7 +3,11 @@
class AdminDashboardData
include StatsCacheable
cattr_reader :problem_syms, :problem_blocks, :problem_messages, :problem_scheduled_check_blocks
cattr_reader :problem_syms,
:problem_blocks,
:problem_messages,
:problem_scheduled_check_blocks,
:problem_scheduled_check_klasses
class Problem
VALID_PRIORITIES = %w[low high].freeze
@ -80,7 +84,8 @@ class AdminDashboardData
@@problem_blocks << blk if blk
end
def self.add_scheduled_problem_check(check_identifier, &blk)
def self.add_scheduled_problem_check(check_identifier, klass = nil, &blk)
@@problem_scheduled_check_klasses[check_identifier] = klass
@@problem_scheduled_check_blocks[check_identifier] = blk
end
@ -125,7 +130,7 @@ class AdminDashboardData
end
def self.register_default_scheduled_problem_checks
add_scheduled_problem_check(:group_smtp_credentials) do
add_scheduled_problem_check(:group_smtp_credentials, GroupEmailCredentialsCheck) do
problems = GroupEmailCredentialsCheck.run
problems.map do |p|
problem_message =
@ -158,6 +163,8 @@ class AdminDashboardData
problems = instance_exec(&check)
yield(problems) if block_given? && problems.present?
Array
.wrap(problems)
.compact
@ -186,6 +193,7 @@ class AdminDashboardData
@@problem_syms = []
@@problem_blocks = []
@@problem_scheduled_check_blocks = {}
@@problem_scheduled_check_klasses = {}
@@problem_messages = %w[
dashboard.bad_favicon_url

View File

@ -7,6 +7,9 @@
# problem checks, and if any credentials have issues they will show up on
# the admin dashboard as a high priority issue.
class GroupEmailCredentialsCheck
def self.max_retries = 2
def self.retry_wait = 30.seconds
def self.run
errors = []

View File

@ -6,8 +6,13 @@ RSpec.describe Jobs::ProblemCheck do
AdminDashboardData.reset_problem_checks
end
class TestCheck
def self.max_retries = 0
def self.retry_wait = 30.seconds
end
it "runs the scheduled problem check that has been added and adds the messages to the load_found_scheduled_check_problems array" do
AdminDashboardData.add_scheduled_problem_check(:test_identifier) do
AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
AdminDashboardData::Problem.new("big problem")
end
@ -19,7 +24,7 @@ RSpec.describe Jobs::ProblemCheck do
end
it "can handle the problem check returning multiple problems" do
AdminDashboardData.add_scheduled_problem_check(:test_identifier) do
AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
[
AdminDashboardData::Problem.new("big problem"),
AdminDashboardData::Problem.new(
@ -36,7 +41,7 @@ RSpec.describe Jobs::ProblemCheck do
end
it "does not add the same problem twice if the identifier already exists" do
AdminDashboardData.add_scheduled_problem_check(:test_identifier) do
AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
[
AdminDashboardData::Problem.new(
"yuge problem",
@ -56,8 +61,36 @@ RSpec.describe Jobs::ProblemCheck do
expect(problems.map(&:to_s)).to match_array(["yuge problem"])
end
it "schedules a retry if there are attempts remaining" do
AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
AdminDashboardData::Problem.new("big problem")
end
TestCheck.stubs(:max_retries).returns(1)
expect_enqueued_with(
job: :problem_check,
args: {
check_identifier: :test_identifier,
retry_count: 1,
},
) { described_class.new.execute(check_identifier: :test_identifier) }
end
it "does not schedule a retry if there are no more attempts remaining" do
AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
AdminDashboardData::Problem.new("big problem")
end
TestCheck.stubs(:max_retries).returns(1)
expect_not_enqueued_with(job: :problem_check) do
described_class.new.execute(check_identifier: :test_identifier, retry_count: 1)
end
end
it "handles errors from a troublesome check" do
AdminDashboardData.add_scheduled_problem_check(:test_identifier) do
AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
raise StandardError.new("something went wrong")
AdminDashboardData::Problem.new("polling issue")
end

View File

@ -8,10 +8,15 @@ RSpec.describe Jobs::ProblemChecks do
AdminDashboardData.reset_problem_checks
end
class TestCheck
def self.max_retries = 0
def self.retry_wait = 0.seconds
end
it "starts with a blank slate every time the checks are run to avoid duplicate problems and to clear no longer firing problems" do
problem_should_fire = true
AdminDashboardData.reset_problem_checks
AdminDashboardData.add_scheduled_problem_check(:test_identifier) do
AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
if problem_should_fire
problem_should_fire = false
AdminDashboardData::Problem.new("yuge problem", priority: "high")