mirror of
https://github.com/discourse/discourse.git
synced 2025-02-25 18:55:32 -06:00
FEATURE: automatic orphan recovery
BUGFIX: improve scheduler robustness, in case redis is disconnected during operation If sidekiq is terminated while task is running, it will be picked up and ran again New owner on tasks to help debugging better #stop semantics for tests
This commit is contained in:
parent
2e5413437d
commit
71a38542a4
@ -12,32 +12,78 @@ module Scheduler
|
|||||||
|
|
||||||
class Runner
|
class Runner
|
||||||
def initialize(manager)
|
def initialize(manager)
|
||||||
|
@mutex = Mutex.new
|
||||||
@queue = Queue.new
|
@queue = Queue.new
|
||||||
@manager = manager
|
@manager = manager
|
||||||
|
@reschedule_orphans_thread = Thread.new do
|
||||||
|
while true
|
||||||
|
sleep 1.minute
|
||||||
|
@mutex.synchronize do
|
||||||
|
reschedule_orphans
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
@keep_alive_thread = Thread.new do
|
||||||
|
while true
|
||||||
|
@mutex.synchronize do
|
||||||
|
keep_alive
|
||||||
|
end
|
||||||
|
sleep (@manager.keep_alive_duration / 2)
|
||||||
|
end
|
||||||
|
end
|
||||||
@thread = Thread.new do
|
@thread = Thread.new do
|
||||||
while true
|
while true
|
||||||
klass = @queue.deq
|
process_queue
|
||||||
failed = false
|
|
||||||
start = Time.now.to_f
|
|
||||||
info = @manager.schedule_info(klass)
|
|
||||||
begin
|
|
||||||
info.prev_result = "RUNNING"
|
|
||||||
info.write!
|
|
||||||
klass.new.perform
|
|
||||||
rescue => e
|
|
||||||
Scheduler::Manager.handle_exception(e)
|
|
||||||
failed = true
|
|
||||||
end
|
|
||||||
duration = ((Time.now.to_f - start) * 1000).to_i
|
|
||||||
info.prev_duration = duration
|
|
||||||
info.prev_result = failed ? "FAILED" : "OK"
|
|
||||||
info.write!
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def keep_alive
|
||||||
|
@manager.keep_alive
|
||||||
|
rescue => ex
|
||||||
|
Scheduler::Manager.handle_exception(ex)
|
||||||
|
end
|
||||||
|
|
||||||
|
def reschedule_orphans
|
||||||
|
@manager.reschedule_orphans!
|
||||||
|
rescue => ex
|
||||||
|
Scheduler::Manager.handle_exception(ex)
|
||||||
|
end
|
||||||
|
|
||||||
|
def process_queue
|
||||||
|
klass = @queue.deq
|
||||||
|
# hack alert, I need to both deq and set @running atomically.
|
||||||
|
@running = true
|
||||||
|
failed = false
|
||||||
|
start = Time.now.to_f
|
||||||
|
info = @mutex.synchronize { @manager.schedule_info(klass) }
|
||||||
|
begin
|
||||||
|
info.prev_result = "RUNNING"
|
||||||
|
@mutex.synchronize { info.write! }
|
||||||
|
klass.new.perform
|
||||||
|
rescue => e
|
||||||
|
Scheduler::Manager.handle_exception(e)
|
||||||
|
failed = true
|
||||||
|
end
|
||||||
|
duration = ((Time.now.to_f - start) * 1000).to_i
|
||||||
|
info.prev_duration = duration
|
||||||
|
info.prev_result = failed ? "FAILED" : "OK"
|
||||||
|
info.current_owner = nil
|
||||||
|
attempts(3) do
|
||||||
|
@mutex.synchronize { info.write! }
|
||||||
|
end
|
||||||
|
rescue => ex
|
||||||
|
Scheduler::Manager.handle_exception(ex)
|
||||||
|
ensure
|
||||||
|
@running = false
|
||||||
|
end
|
||||||
|
|
||||||
def stop!
|
def stop!
|
||||||
@thread.kill
|
@mutex.synchronize do
|
||||||
|
@thread.kill
|
||||||
|
@keep_alive_thread.kill
|
||||||
|
@reschedule_orphans_thread.kill
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def enq(klass)
|
def enq(klass)
|
||||||
@ -48,7 +94,23 @@ module Scheduler
|
|||||||
while !@queue.empty? && !(@queue.num_waiting > 0)
|
while !@queue.empty? && !(@queue.num_waiting > 0)
|
||||||
sleep 0.001
|
sleep 0.001
|
||||||
end
|
end
|
||||||
|
# this is a hack, but is only used for test anyway
|
||||||
|
sleep 0.001
|
||||||
|
while @running
|
||||||
|
sleep 0.001
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def attempts(n)
|
||||||
|
n.times {
|
||||||
|
begin
|
||||||
|
yield; break
|
||||||
|
rescue
|
||||||
|
sleep Random.rand
|
||||||
|
end
|
||||||
|
}
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.without_runner(redis=nil)
|
def self.without_runner(redis=nil)
|
||||||
@ -94,22 +156,42 @@ module Scheduler
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def reschedule_orphans!
|
||||||
|
lock do
|
||||||
|
redis.zrange(Manager.queue_key, 0, -1).each do |key|
|
||||||
|
klass = get_klass(key)
|
||||||
|
next unless klass
|
||||||
|
info = schedule_info(klass)
|
||||||
|
|
||||||
|
if ['QUEUED', 'RUNNING'].include?(info.prev_result) &&
|
||||||
|
(info.current_owner.blank? || !redis.get(info.current_owner))
|
||||||
|
info.prev_result = 'ORPHAN'
|
||||||
|
info.next_run = Time.now.to_i
|
||||||
|
info.write!
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def get_klass(name)
|
||||||
|
name.constantize
|
||||||
|
rescue NameError
|
||||||
|
nil
|
||||||
|
end
|
||||||
|
|
||||||
def tick
|
def tick
|
||||||
lock do
|
lock do
|
||||||
(key, due), _ = redis.zrange Manager.queue_key, 0, 0, withscores: true
|
(key, due), _ = redis.zrange Manager.queue_key, 0, 0, withscores: true
|
||||||
return unless key
|
return unless key
|
||||||
if due.to_i <= Time.now.to_i
|
if due.to_i <= Time.now.to_i
|
||||||
klass = begin
|
klass = get_klass(key)
|
||||||
key.constantize
|
|
||||||
rescue NameError
|
|
||||||
nil
|
|
||||||
end
|
|
||||||
return unless klass
|
return unless klass
|
||||||
info = schedule_info(klass)
|
info = schedule_info(klass)
|
||||||
info.prev_run = Time.now.to_i
|
info.prev_run = Time.now.to_i
|
||||||
info.prev_result = "QUEUED"
|
info.prev_result = "QUEUED"
|
||||||
info.prev_duration = -1
|
info.prev_duration = -1
|
||||||
info.next_run = nil
|
info.next_run = nil
|
||||||
|
info.current_owner = identity_key
|
||||||
info.schedule!
|
info.schedule!
|
||||||
@runner.enq(klass)
|
@runner.enq(klass)
|
||||||
end
|
end
|
||||||
@ -126,6 +208,13 @@ module Scheduler
|
|||||||
self.class.current = nil
|
self.class.current = nil
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def keep_alive_duration
|
||||||
|
60
|
||||||
|
end
|
||||||
|
|
||||||
|
def keep_alive
|
||||||
|
redis.setex identity_key, keep_alive_duration, ""
|
||||||
|
end
|
||||||
|
|
||||||
def lock
|
def lock
|
||||||
got_lock = false
|
got_lock = false
|
||||||
@ -157,6 +246,7 @@ module Scheduler
|
|||||||
redis.del Manager.lock_key
|
redis.del Manager.lock_key
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
def self.discover_schedules
|
def self.discover_schedules
|
||||||
schedules = []
|
schedules = []
|
||||||
ObjectSpace.each_object(Scheduler::Schedule) do |schedule|
|
ObjectSpace.each_object(Scheduler::Schedule) do |schedule|
|
||||||
@ -165,6 +255,18 @@ module Scheduler
|
|||||||
schedules
|
schedules
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@mutex = Mutex.new
|
||||||
|
def self.seq
|
||||||
|
@mutex.synchronize do
|
||||||
|
@i ||= 0
|
||||||
|
@i += 1
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def identity_key
|
||||||
|
@identity_key ||= "_scheduler_#{`hostname`}:#{Process.pid}:#{self.class.seq}"
|
||||||
|
end
|
||||||
|
|
||||||
def self.lock_key
|
def self.lock_key
|
||||||
"_scheduler_lock_"
|
"_scheduler_lock_"
|
||||||
end
|
end
|
||||||
|
@ -3,7 +3,8 @@ module Scheduler
|
|||||||
attr_accessor :next_run,
|
attr_accessor :next_run,
|
||||||
:prev_run,
|
:prev_run,
|
||||||
:prev_duration,
|
:prev_duration,
|
||||||
:prev_result
|
:prev_result,
|
||||||
|
:current_owner
|
||||||
|
|
||||||
def initialize(klass, manager)
|
def initialize(klass, manager)
|
||||||
@klass = klass
|
@klass = klass
|
||||||
@ -21,10 +22,11 @@ module Scheduler
|
|||||||
@prev_run = data["prev_run"]
|
@prev_run = data["prev_run"]
|
||||||
@prev_result = data["prev_result"]
|
@prev_result = data["prev_result"]
|
||||||
@prev_duration = data["prev_duration"]
|
@prev_duration = data["prev_duration"]
|
||||||
|
@current_owner = data["current_owner"]
|
||||||
end
|
end
|
||||||
rescue
|
rescue
|
||||||
# corrupt redis
|
# corrupt redis
|
||||||
@next_run = @prev_run = @prev_result = @prev_duration = nil
|
@next_run = @prev_run = @prev_result = @prev_duration = @current_owner = nil
|
||||||
end
|
end
|
||||||
|
|
||||||
def valid?
|
def valid?
|
||||||
@ -57,14 +59,15 @@ module Scheduler
|
|||||||
next_run: @next_run,
|
next_run: @next_run,
|
||||||
prev_run: @prev_run,
|
prev_run: @prev_run,
|
||||||
prev_duration: @prev_duration,
|
prev_duration: @prev_duration,
|
||||||
prev_result: @prev_result
|
prev_result: @prev_result,
|
||||||
|
current_owner: @current_owner
|
||||||
}.to_json
|
}.to_json
|
||||||
redis.zadd Manager.queue_key, @next_run , @klass
|
redis.zadd Manager.queue_key, @next_run , @klass
|
||||||
end
|
end
|
||||||
|
|
||||||
def del!
|
def del!
|
||||||
clear!
|
clear!
|
||||||
@next_run = @prev_run = @prev_result = @prev_duration = nil
|
@next_run = @prev_run = @prev_result = @prev_duration = @current_owner = nil
|
||||||
end
|
end
|
||||||
|
|
||||||
def key
|
def key
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
<th style="width: 15%">Last Run</th>
|
<th style="width: 15%">Last Run</th>
|
||||||
<th style="width: 15%">Last Result</th>
|
<th style="width: 15%">Last Result</th>
|
||||||
<th style="width: 15%">Last Duration</th>
|
<th style="width: 15%">Last Duration</th>
|
||||||
|
<th style="width: 15%">Last Owner</th>
|
||||||
<th style="width: 15%">Next Run Due</th>
|
<th style="width: 15%">Next Run Due</th>
|
||||||
<th style="width: 10%">Actions</th>
|
<th style="width: 10%">Actions</th>
|
||||||
</thead>
|
</thead>
|
||||||
@ -37,6 +38,9 @@
|
|||||||
<td>
|
<td>
|
||||||
<%= @info.prev_duration %>
|
<%= @info.prev_duration %>
|
||||||
</td>
|
</td>
|
||||||
|
<td>
|
||||||
|
<%= @info.current_owner %>
|
||||||
|
</td>
|
||||||
<td>
|
<td>
|
||||||
<% next_run = @info.next_run %>
|
<% next_run = @info.next_run %>
|
||||||
<% if next_run.nil? %>
|
<% if next_run.nil? %>
|
||||||
|
@ -23,19 +23,60 @@ describe Scheduler::Manager do
|
|||||||
sleep 0.001
|
sleep 0.001
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
class SuperLongJob
|
||||||
|
extend ::Scheduler::Schedule
|
||||||
|
|
||||||
|
every 10.minutes
|
||||||
|
|
||||||
|
def perform
|
||||||
|
sleep 1000
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
let(:manager) { Scheduler::Manager.new(DiscourseRedis.new) }
|
let(:manager) { Scheduler::Manager.new(DiscourseRedis.new) }
|
||||||
|
|
||||||
before do
|
before do
|
||||||
|
$redis.del manager.class.lock_key
|
||||||
$redis.del manager.class.queue_key
|
$redis.del manager.class.queue_key
|
||||||
|
manager.remove(Testing::RandomJob)
|
||||||
|
manager.remove(Testing::SuperLongJob)
|
||||||
end
|
end
|
||||||
|
|
||||||
after do
|
after do
|
||||||
manager.stop!
|
manager.stop!
|
||||||
|
manager.remove(Testing::RandomJob)
|
||||||
|
manager.remove(Testing::SuperLongJob)
|
||||||
|
end
|
||||||
|
|
||||||
|
describe '#sync' do
|
||||||
|
|
||||||
|
it 'increases' do
|
||||||
|
Scheduler::Manager.seq.should == Scheduler::Manager.seq - 1
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
describe '#tick' do
|
describe '#tick' do
|
||||||
|
|
||||||
|
it 'should recover from crashed manager' do
|
||||||
|
|
||||||
|
info = manager.schedule_info(Testing::SuperLongJob)
|
||||||
|
info.next_run = Time.now.to_i - 1
|
||||||
|
info.write!
|
||||||
|
|
||||||
|
manager.tick
|
||||||
|
manager.stop!
|
||||||
|
|
||||||
|
$redis.del manager.identity_key
|
||||||
|
|
||||||
|
manager = Scheduler::Manager.new(DiscourseRedis.new)
|
||||||
|
manager.reschedule_orphans!
|
||||||
|
|
||||||
|
info = manager.schedule_info(Testing::SuperLongJob)
|
||||||
|
info.next_run.should <= Time.now.to_i
|
||||||
|
end
|
||||||
|
|
||||||
it 'should only run pending job once' do
|
it 'should only run pending job once' do
|
||||||
|
|
||||||
Testing::RandomJob.runs = 0
|
Testing::RandomJob.runs = 0
|
||||||
@ -48,6 +89,7 @@ describe Scheduler::Manager do
|
|||||||
Thread.new do
|
Thread.new do
|
||||||
manager = Scheduler::Manager.new(DiscourseRedis.new)
|
manager = Scheduler::Manager.new(DiscourseRedis.new)
|
||||||
manager.blocking_tick
|
manager.blocking_tick
|
||||||
|
manager.stop!
|
||||||
end
|
end
|
||||||
end.map(&:join)
|
end.map(&:join)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user