From 14a0247301510a5703d1d4b7f62141c13935cc97 Mon Sep 17 00:00:00 2001 From: Sam Date: Mon, 28 Jun 2021 16:16:22 +1000 Subject: [PATCH] PERF: optimise backfilling of topic_id (#13545) Relying on large offsets can have uneven performance on huge table, new implementation recovers more cleanly and avoids double updates --- ...10621234939_backfill_email_log_topic_id.rb | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/db/post_migrate/20210621234939_backfill_email_log_topic_id.rb b/db/post_migrate/20210621234939_backfill_email_log_topic_id.rb index 5e2770e167e..232f9e27184 100644 --- a/db/post_migrate/20210621234939_backfill_email_log_topic_id.rb +++ b/db/post_migrate/20210621234939_backfill_email_log_topic_id.rb @@ -5,27 +5,22 @@ class BackfillEmailLogTopicId < ActiveRecord::Migration[6.1] BATCH_SIZE = 30_000 def up - offset = 0 - email_log_count = DB.query_single("SELECT COUNT(*) FROM email_logs").first - loop do - DB.exec(<<~SQL, offset: offset, batch_size: BATCH_SIZE) + count = DB.exec(<<~SQL, batch_size: BATCH_SIZE) WITH cte AS ( - SELECT post_id - FROM email_logs - ORDER BY id + SELECT l1.id, p1.topic_id + FROM email_logs l1 + INNER JOIN posts p1 ON p1.id = l1.post_id + WHERE l1.topic_id IS NULL AND p1.topic_id IS NOT NULL LIMIT :batch_size - OFFSET :offset ) UPDATE email_logs - SET topic_id = posts.topic_id + SET topic_id = cte.topic_id FROM cte - INNER JOIN posts ON posts.id = cte.post_id - WHERE email_logs.post_id = cte.post_id + WHERE email_logs.id = cte.id SQL - offset += BATCH_SIZE - break if offset > (email_log_count + BATCH_SIZE * 2) + break if count == 0 end end