DEV: Add support for custom retries for scheduled admin checks (#24224)

We updated scheduled admin checks to run concurrently in their own jobs. The main reason for this was so that we can implement re-check functionality for especially flaky checks (e.g. group e-mail credentials check.)

This works in the following way:

1. The check declares its retry policy using class methods.
2. A block can be yielded to if there are problems, but before they are committed to Redis.
3. The job uses this block to either a) schedule a retry if there are any remaining or b) do nothing and let the check commit.
This commit is contained in:
Ted Johansson 2023-11-06 08:57:02 +08:00 committed by GitHub
parent d5e4b85e82
commit c3708c4276
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 73 additions and 10 deletions

View File

@ -1,15 +1,29 @@
# frozen_string_literal: true # frozen_string_literal: true
module Jobs module Jobs
class RetrySignal < Exception
end
# This job runs a singular scheduled admin check. It is scheduled by # This job runs a singular scheduled admin check. It is scheduled by
# the ProblemChecks (plural) scheduled job. # the ProblemChecks (plural) scheduled job.
class ProblemCheck < ::Jobs::Base class ProblemCheck < ::Jobs::Base
sidekiq_options retry: false sidekiq_options retry: false
def execute(args) def execute(args)
identifier = args[:check_identifier] retry_count = args[:retry_count].to_i
identifier = args[:check_identifier].to_sym
AdminDashboardData.execute_scheduled_check(identifier.to_sym) check = AdminDashboardData.problem_scheduled_check_klasses[identifier]
AdminDashboardData.execute_scheduled_check(identifier) do |problems|
raise RetrySignal if retry_count < check.max_retries
end
rescue RetrySignal
Jobs.enqueue_in(
check.retry_wait,
:problem_check,
args.merge(retry_count: retry_count + 1).stringify_keys,
)
end end
end end
end end

View File

@ -3,7 +3,11 @@
class AdminDashboardData class AdminDashboardData
include StatsCacheable include StatsCacheable
cattr_reader :problem_syms, :problem_blocks, :problem_messages, :problem_scheduled_check_blocks cattr_reader :problem_syms,
:problem_blocks,
:problem_messages,
:problem_scheduled_check_blocks,
:problem_scheduled_check_klasses
class Problem class Problem
VALID_PRIORITIES = %w[low high].freeze VALID_PRIORITIES = %w[low high].freeze
@ -80,7 +84,8 @@ class AdminDashboardData
@@problem_blocks << blk if blk @@problem_blocks << blk if blk
end end
def self.add_scheduled_problem_check(check_identifier, &blk) def self.add_scheduled_problem_check(check_identifier, klass = nil, &blk)
@@problem_scheduled_check_klasses[check_identifier] = klass
@@problem_scheduled_check_blocks[check_identifier] = blk @@problem_scheduled_check_blocks[check_identifier] = blk
end end
@ -125,7 +130,7 @@ class AdminDashboardData
end end
def self.register_default_scheduled_problem_checks def self.register_default_scheduled_problem_checks
add_scheduled_problem_check(:group_smtp_credentials) do add_scheduled_problem_check(:group_smtp_credentials, GroupEmailCredentialsCheck) do
problems = GroupEmailCredentialsCheck.run problems = GroupEmailCredentialsCheck.run
problems.map do |p| problems.map do |p|
problem_message = problem_message =
@ -158,6 +163,8 @@ class AdminDashboardData
problems = instance_exec(&check) problems = instance_exec(&check)
yield(problems) if block_given? && problems.present?
Array Array
.wrap(problems) .wrap(problems)
.compact .compact
@ -186,6 +193,7 @@ class AdminDashboardData
@@problem_syms = [] @@problem_syms = []
@@problem_blocks = [] @@problem_blocks = []
@@problem_scheduled_check_blocks = {} @@problem_scheduled_check_blocks = {}
@@problem_scheduled_check_klasses = {}
@@problem_messages = %w[ @@problem_messages = %w[
dashboard.bad_favicon_url dashboard.bad_favicon_url

View File

@ -7,6 +7,9 @@
# problem checks, and if any credentials have issues they will show up on # problem checks, and if any credentials have issues they will show up on
# the admin dashboard as a high priority issue. # the admin dashboard as a high priority issue.
class GroupEmailCredentialsCheck class GroupEmailCredentialsCheck
def self.max_retries = 2
def self.retry_wait = 30.seconds
def self.run def self.run
errors = [] errors = []

View File

@ -6,8 +6,13 @@ RSpec.describe Jobs::ProblemCheck do
AdminDashboardData.reset_problem_checks AdminDashboardData.reset_problem_checks
end end
class TestCheck
def self.max_retries = 0
def self.retry_wait = 30.seconds
end
it "runs the scheduled problem check that has been added and adds the messages to the load_found_scheduled_check_problems array" do it "runs the scheduled problem check that has been added and adds the messages to the load_found_scheduled_check_problems array" do
AdminDashboardData.add_scheduled_problem_check(:test_identifier) do AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
AdminDashboardData::Problem.new("big problem") AdminDashboardData::Problem.new("big problem")
end end
@ -19,7 +24,7 @@ RSpec.describe Jobs::ProblemCheck do
end end
it "can handle the problem check returning multiple problems" do it "can handle the problem check returning multiple problems" do
AdminDashboardData.add_scheduled_problem_check(:test_identifier) do AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
[ [
AdminDashboardData::Problem.new("big problem"), AdminDashboardData::Problem.new("big problem"),
AdminDashboardData::Problem.new( AdminDashboardData::Problem.new(
@ -36,7 +41,7 @@ RSpec.describe Jobs::ProblemCheck do
end end
it "does not add the same problem twice if the identifier already exists" do it "does not add the same problem twice if the identifier already exists" do
AdminDashboardData.add_scheduled_problem_check(:test_identifier) do AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
[ [
AdminDashboardData::Problem.new( AdminDashboardData::Problem.new(
"yuge problem", "yuge problem",
@ -56,8 +61,36 @@ RSpec.describe Jobs::ProblemCheck do
expect(problems.map(&:to_s)).to match_array(["yuge problem"]) expect(problems.map(&:to_s)).to match_array(["yuge problem"])
end end
it "schedules a retry if there are attempts remaining" do
AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
AdminDashboardData::Problem.new("big problem")
end
TestCheck.stubs(:max_retries).returns(1)
expect_enqueued_with(
job: :problem_check,
args: {
check_identifier: :test_identifier,
retry_count: 1,
},
) { described_class.new.execute(check_identifier: :test_identifier) }
end
it "does not schedule a retry if there are no more attempts remaining" do
AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
AdminDashboardData::Problem.new("big problem")
end
TestCheck.stubs(:max_retries).returns(1)
expect_not_enqueued_with(job: :problem_check) do
described_class.new.execute(check_identifier: :test_identifier, retry_count: 1)
end
end
it "handles errors from a troublesome check" do it "handles errors from a troublesome check" do
AdminDashboardData.add_scheduled_problem_check(:test_identifier) do AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
raise StandardError.new("something went wrong") raise StandardError.new("something went wrong")
AdminDashboardData::Problem.new("polling issue") AdminDashboardData::Problem.new("polling issue")
end end

View File

@ -8,10 +8,15 @@ RSpec.describe Jobs::ProblemChecks do
AdminDashboardData.reset_problem_checks AdminDashboardData.reset_problem_checks
end end
class TestCheck
def self.max_retries = 0
def self.retry_wait = 0.seconds
end
it "starts with a blank slate every time the checks are run to avoid duplicate problems and to clear no longer firing problems" do it "starts with a blank slate every time the checks are run to avoid duplicate problems and to clear no longer firing problems" do
problem_should_fire = true problem_should_fire = true
AdminDashboardData.reset_problem_checks AdminDashboardData.reset_problem_checks
AdminDashboardData.add_scheduled_problem_check(:test_identifier) do AdminDashboardData.add_scheduled_problem_check(:test_identifier, TestCheck) do
if problem_should_fire if problem_should_fire
problem_should_fire = false problem_should_fire = false
AdminDashboardData::Problem.new("yuge problem", priority: "high") AdminDashboardData::Problem.new("yuge problem", priority: "high")