PERF: optimise backfilling of topic_id (#13545)

Relying on large offsets can have uneven performance on huge table, new
implementation recovers more cleanly and avoids double updates
This commit is contained in:
Sam 2021-06-28 16:16:22 +10:00 committed by GitHub
parent a6b9289477
commit 14a0247301
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -5,27 +5,22 @@ class BackfillEmailLogTopicId < ActiveRecord::Migration[6.1]
BATCH_SIZE = 30_000 BATCH_SIZE = 30_000
def up def up
offset = 0
email_log_count = DB.query_single("SELECT COUNT(*) FROM email_logs").first
loop do loop do
DB.exec(<<~SQL, offset: offset, batch_size: BATCH_SIZE) count = DB.exec(<<~SQL, batch_size: BATCH_SIZE)
WITH cte AS ( WITH cte AS (
SELECT post_id SELECT l1.id, p1.topic_id
FROM email_logs FROM email_logs l1
ORDER BY id INNER JOIN posts p1 ON p1.id = l1.post_id
WHERE l1.topic_id IS NULL AND p1.topic_id IS NOT NULL
LIMIT :batch_size LIMIT :batch_size
OFFSET :offset
) )
UPDATE email_logs UPDATE email_logs
SET topic_id = posts.topic_id SET topic_id = cte.topic_id
FROM cte FROM cte
INNER JOIN posts ON posts.id = cte.post_id WHERE email_logs.id = cte.id
WHERE email_logs.post_id = cte.post_id
SQL SQL
offset += BATCH_SIZE break if count == 0
break if offset > (email_log_count + BATCH_SIZE * 2)
end end
end end