FEATURE: Pull hotlinked images immediately after posting

Previously, with the default `editing_grace_period`, hotlinked images were pulled 5 minutes after a post is created. This delay was added to reduce the chance of automated edits clashing with user edits.

This commit refactors things so that we can pull hotlinked images immediately. URLs are immediately updated in the post's `cooked` HTML. The post's raw markdown is updated later, after the `editing_grace_period`.

This involves a number of behind-the-scenes changes including:

- Schedule Jobs::PullHotlinkedImages immediately after Jobs::ProcessPost. Move scheduling to after the `update_column` call to avoid race conditions

- Move raw changes into a separate job, which is delayed until after the ninja-edit window

- Move disable_if_low_on_disk_space logic into the `pull_hotlinked_images` job

- Move raw-parsing/replacing logic into `InlineUpload` so it can be easily be shared between `UpdateHotlinkedRaw` and `PullUserProfileHotlinkedImages`
This commit is contained in:
David Taylor 2022-05-16 17:56:00 +01:00
parent 0baabafa9d
commit bf6f8299a7
9 changed files with 193 additions and 199 deletions

View File

@ -40,6 +40,8 @@ module Jobs
end end
end end
enqueue_pull_hotlinked_images(post) unless args[:skip_pull_hotlinked_images]
if !post.user&.staff? && !post.user&.staged? if !post.user&.staff? && !post.user&.staged?
s = post.raw s = post.raw
s << " #{post.topic.title}" if post.post_number == 1 s << " #{post.topic.title}" if post.post_number == 1
@ -60,6 +62,11 @@ module Jobs
TopicLink.extract_from(post) TopicLink.extract_from(post)
QuotedPost.extract_from(post) QuotedPost.extract_from(post)
end end
def enqueue_pull_hotlinked_images(post)
Jobs.cancel_scheduled_job(:pull_hotlinked_images, post_id: post.id)
Jobs.enqueue(:pull_hotlinked_images, post_id: post.id)
end
end end
end end

View File

@ -10,16 +10,13 @@ module Jobs
end end
def execute(args) def execute(args)
disable_if_low_on_disk_space
@post_id = args[:post_id] @post_id = args[:post_id]
raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank? raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank?
post = Post.find_by(id: @post_id) post = Post.find_by(id: @post_id)
return if post.blank? return if post.nil? || post.topic.nil?
return if post.topic.blank?
return if post.cook_method == Post.cook_methods[:raw_html]
raw = post.raw.dup
start_raw = raw.dup
hotlinked_map = post.post_hotlinked_media.map { |r| [r.url, r] }.to_h hotlinked_map = post.post_hotlinked_media.map { |r| [r.url, r] }.to_h
@ -55,30 +52,23 @@ module Jobs
changed_hotlink_records = true changed_hotlink_records = true
hotlink_record.save! hotlink_record.save!
end end
# have we successfully downloaded that file?
if upload = hotlink_record&.upload
raw = replace_in_raw(original_src: original_src, upload: upload, raw: raw)
end
rescue => e rescue => e
raise e if Rails.env.test? raise e if Rails.env.test?
log(:error, "Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n" + e.message + "\n" + e.backtrace.join("\n")) log(:error, "Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
end end
# If post changed while we were downloading images, never apply edits if changed_hotlink_records
post.reload
post_changed_elsewhere = (start_raw != post.raw)
raw_changed_here = (raw != post.raw)
if !post_changed_elsewhere && raw_changed_here
changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true)
elsif changed_hotlink_records
post.trigger_post_process( post.trigger_post_process(
bypass_bump: true, bypass_bump: true,
skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling
) )
end end
if hotlinked_map.size > 0
Jobs.cancel_scheduled_job(:update_hotlinked_raw, post_id: post.id)
update_raw_delay = SiteSetting.editing_grace_period + 1
Jobs.enqueue_in(update_raw_delay, :update_hotlinked_raw, post_id: post.id)
end
end end
def download(src) def download(src)
@ -137,45 +127,6 @@ module Jobs
end end
end end
def replace_in_raw(original_src:, raw:, upload:)
raw = raw.dup
escaped_src = Regexp.escape(original_src)
replace_raw = ->(match, match_src, replacement, _index) {
if normalize_src(original_src) == normalize_src(match_src)
replacement =
if replacement.include?(InlineUploads::PLACEHOLDER)
replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url)
elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER)
replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path)
end
raw = raw.gsub(
match,
replacement
)
end
}
# there are 6 ways to insert an image in a post
# HTML tag - <img src="http://...">
InlineUploads.match_img(raw, external_src: true, &replace_raw)
# BBCode tag - [img]http://...[/img]
InlineUploads.match_bbcode_img(raw, external_src: true, &replace_raw)
# Markdown linked image - [![alt](http://...)](http://...)
# Markdown inline - ![alt](http://...)
# Markdown inline - ![](http://... "image title")
# Markdown inline - ![alt](http://... "image title")
InlineUploads.match_md_inline_img(raw, external_src: true, &replace_raw)
# Direct link
raw.gsub!(/^#{escaped_src}(\s?)$/) { "![](#{upload.short_url})#{$1}" }
raw
end
def extract_images_from(html) def extract_images_from(html)
doc = Nokogiri::HTML5::fragment(html) doc = Nokogiri::HTML5::fragment(html)
@ -239,6 +190,30 @@ module Jobs
def normalize_src(src) def normalize_src(src)
PostHotlinkedMedia.normalize_src(src) PostHotlinkedMedia.normalize_src(src)
end end
def disable_if_low_on_disk_space
return if Discourse.store.external?
return if !SiteSetting.download_remote_images_to_local
return if available_disk_space >= SiteSetting.download_remote_images_threshold
SiteSetting.download_remote_images_to_local = false
# log the site setting change
reason = I18n.t("disable_remote_images_download_reason")
staff_action_logger = StaffActionLogger.new(Discourse.system_user)
staff_action_logger.log_site_setting_change("download_remote_images_to_local", true, false, details: reason)
# also send a private message to the site contact user notify_about_low_disk_space
notify_about_low_disk_space
end
def notify_about_low_disk_space
SystemMessage.create_from_system_user(Discourse.site_contact_user, :download_remote_images_disabled)
end
def available_disk_space
100 - DiskSpace.percent_free("#{Rails.root}/public/uploads")
end
end end
end end

View File

@ -30,16 +30,16 @@ module Jobs
rescue ImageBrokenError rescue ImageBrokenError
broken_image_urls << normalized_src broken_image_urls << normalized_src
end end
# have we successfully downloaded that file?
if upload = downloaded_images[normalized_src]
user_profile.bio_raw = replace_in_raw(original_src: original_src, upload: upload, raw: user_profile.bio_raw)
end
rescue => e rescue => e
raise e if Rails.env.test? raise e if Rails.env.test?
log(:error, "Failed to pull hotlinked image (#{download_src}) user: #{@user_id}\n" + e.message + "\n" + e.backtrace.join("\n")) log(:error, "Failed to pull hotlinked image (#{download_src}) user: #{@user_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
end end
user_profile.bio_raw = InlineUploads.replace_hotlinked_image_urls(raw: user_profile.bio_raw) do |match_src|
normalized_match_src = PostHotlinkedMedia.normalize_src(match_src)
downloaded_images[normalized_match_src]
end
user_profile.skip_pull_hotlinked_image = true user_profile.skip_pull_hotlinked_image = true
user_profile.save! user_profile.save!
end end

View File

@ -0,0 +1,28 @@
# frozen_string_literal: true
module Jobs
class UpdateHotlinkedRaw < ::Jobs::Base
sidekiq_options queue: 'low'
def execute(args)
@post_id = args[:post_id]
raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank?
post = Post.find_by(id: @post_id)
return if post.nil?
return if post.cook_method == Post.cook_methods[:raw_html]
hotlinked_map = post.post_hotlinked_media.preload(:upload).map { |r| [r.url, r] }.to_h
raw = InlineUploads.replace_hotlinked_image_urls(raw: post.raw) do |match_src|
normalized_match_src = PostHotlinkedMedia.normalize_src(match_src)
hotlinked_map[normalized_match_src]&.upload
end
if post.raw != raw
changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true)
end
end
end
end

View File

@ -214,6 +214,40 @@ class InlineUploads
end end
end end
def self.replace_hotlinked_image_urls(raw:, &blk)
replace = Proc.new do |match, match_src, replacement, _index|
upload = blk.call(match_src)
next if !upload
replacement =
if replacement.include?(InlineUploads::PLACEHOLDER)
replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url)
elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER)
replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path)
end
raw = raw.gsub(
match,
replacement
)
end
# there are 6 ways to insert an image in a post
# HTML tag - <img src="http://...">
InlineUploads.match_img(raw, external_src: true, &replace)
# BBCode tag - [img]http://...[/img]
InlineUploads.match_bbcode_img(raw, external_src: true, &replace)
# Markdown linked image - [![alt](http://...)](http://...)
# Markdown inline - ![alt](http://...)
# Markdown inline - ![](http://... "image title")
# Markdown inline - ![alt](http://... "image title")
InlineUploads.match_md_inline_img(raw, external_src: true, &replace)
raw
end
def self.matched_uploads(node) def self.matched_uploads(node)
upload_path = Discourse.store.upload_path upload_path = Discourse.store.upload_path
base_url = Discourse.base_url.sub(/https?:\/\//, "(https?://)") base_url = Discourse.base_url.sub(/https?:\/\//, "(https?://)")

View File

@ -47,7 +47,6 @@ class CookedPostProcessor
remove_user_ids remove_user_ids
update_post_image update_post_image
enforce_nofollow enforce_nofollow
pull_hotlinked_images
grant_badges grant_badges
@post.link_post_uploads(fragments: @doc) @post.link_post_uploads(fragments: @doc)
DiscourseEvent.trigger(:post_process_cooked, @doc, @post) DiscourseEvent.trigger(:post_process_cooked, @doc, @post)
@ -361,41 +360,6 @@ class CookedPostProcessor
PrettyText.add_rel_attributes_to_user_content(@doc, add_nofollow) PrettyText.add_rel_attributes_to_user_content(@doc, add_nofollow)
end end
def pull_hotlinked_images
return if @opts[:skip_pull_hotlinked_images]
# have we enough disk space?
disable_if_low_on_disk_space # But still enqueue the job
# make sure no other job is scheduled
Jobs.cancel_scheduled_job(:pull_hotlinked_images, post_id: @post.id)
# schedule the job
delay = SiteSetting.editing_grace_period + 1
Jobs.enqueue_in(delay.seconds.to_i, :pull_hotlinked_images, post_id: @post.id)
end
def disable_if_low_on_disk_space
return if Discourse.store.external?
return if !SiteSetting.download_remote_images_to_local
return if available_disk_space >= SiteSetting.download_remote_images_threshold
SiteSetting.download_remote_images_to_local = false
# log the site setting change
reason = I18n.t("disable_remote_images_download_reason")
staff_action_logger = StaffActionLogger.new(Discourse.system_user)
staff_action_logger.log_site_setting_change("download_remote_images_to_local", true, false, details: reason)
# also send a private message to the site contact user notify_about_low_disk_space
notify_about_low_disk_space
end
def notify_about_low_disk_space
SystemMessage.create_from_system_user(Discourse.site_contact_user, :download_remote_images_disabled)
end
def available_disk_space
100 - DiskSpace.percent_free("#{Rails.root}/public/uploads")
end
private private
def post_process_images def post_process_images

View File

@ -88,4 +88,35 @@ describe Jobs::ProcessPost do
expect(post.topic.reload.excerpt).to eq("Some OP content") expect(post.topic.reload.excerpt).to eq("Some OP content")
end end
end end
context "#enqueue_pull_hotlinked_images" do
fab!(:post) { Fabricate(:post, created_at: 20.days.ago) }
let(:job) { Jobs::ProcessPost.new }
it "runs even when download_remote_images_to_local is disabled" do
# We want to run it to pull hotlinked optimized images
SiteSetting.download_remote_images_to_local = false
expect_enqueued_with(job: :pull_hotlinked_images, args: { post_id: post.id }) do
job.execute({ post_id: post.id })
end
end
context "when download_remote_images_to_local? is enabled" do
before do
SiteSetting.download_remote_images_to_local = true
end
it "enqueues" do
expect_enqueued_with(job: :pull_hotlinked_images, args: { post_id: post.id }) do
job.execute({ post_id: post.id })
end
end
it "does not run when requested to skip" do
job.execute({ post_id: post.id, skip_pull_hotlinked_images: true })
expect(Jobs::PullHotlinkedImages.jobs.size).to eq(0)
end
end
end
end end

View File

@ -67,6 +67,21 @@ describe Jobs::PullHotlinkedImages do
expect(post.reload.raw).to eq("<img src=\"#{Upload.last.short_url}\">") expect(post.reload.raw).to eq("<img src=\"#{Upload.last.short_url}\">")
end end
it 'enqueues raw replacement job with a delay' do
Jobs.run_later!
post = Fabricate(:post, raw: "<img src='#{image_url}'>")
stub_image_size
freeze_time
Jobs.expects(:cancel_scheduled_job).with(:update_hotlinked_raw, post_id: post.id).once
delay = SiteSetting.editing_grace_period + 1
expect_enqueued_with(job: :update_hotlinked_raw, args: { post_id: post.id }, at: Time.zone.now + delay.seconds) do
Jobs::PullHotlinkedImages.new.execute(post_id: post.id)
end
end
it 'removes downloaded images when they are no longer needed' do it 'removes downloaded images when they are no longer needed' do
post = Fabricate(:post, raw: "<img src='#{image_url}'>") post = Fabricate(:post, raw: "<img src='#{image_url}'>")
stub_image_size stub_image_size
@ -196,14 +211,14 @@ describe Jobs::PullHotlinkedImages do
expect(post.uploads).to contain_exactly(upload) expect(post.uploads).to contain_exactly(upload)
end end
it "skips raw_html posts" do it "skips editing raw for raw_html posts" do
raw = "<img src=\"#{image_url}\">" raw = "<img src=\"#{image_url}\">"
post = Fabricate(:post, raw: raw, cook_method: Post.cook_methods[:raw_html]) post = Fabricate(:post, raw: raw, cook_method: Post.cook_methods[:raw_html])
stub_image_size stub_image_size
expect do expect do
post.rebake! post.rebake!
post.reload post.reload
end.not_to change { Upload.count } end.to change { Upload.count }.by(1)
expect(post.raw).to eq(raw) expect(post.raw).to eq(raw)
end end
@ -556,6 +571,43 @@ describe Jobs::PullHotlinkedImages do
end end
end end
context "#disable_if_low_on_disk_space" do
fab!(:post) { Fabricate(:post, created_at: 20.days.ago) }
let(:job) { Jobs::PullHotlinkedImages.new }
before do
SiteSetting.download_remote_images_to_local = true
SiteSetting.download_remote_images_threshold = 20
job.stubs(:available_disk_space).returns(50)
end
it "does nothing when there's enough disk space" do
SiteSetting.expects(:download_remote_images_to_local=).never
job.execute({ post_id: post.id })
end
context "when there's not enough disk space" do
before { SiteSetting.download_remote_images_threshold = 75 }
it "disables download_remote_images_threshold and send a notification to the admin" do
StaffActionLogger.any_instance.expects(:log_site_setting_change).once
SystemMessage.expects(:create_from_system_user).with(Discourse.site_contact_user, :download_remote_images_disabled).once
job.execute({ post_id: post.id })
expect(SiteSetting.download_remote_images_to_local).to eq(false)
end
it "doesn't disable download_remote_images_to_local if site uses S3" do
setup_s3
job.execute({ post_id: post.id })
expect(SiteSetting.download_remote_images_to_local).to eq(true)
end
end
end
def stub_s3(upload) def stub_s3(upload)
stub_upload(upload) stub_upload(upload)
stub_request(:get, "https:" + upload.url).to_return(status: 200, body: file_from_fixtures("smallest.png")) stub_request(:get, "https:" + upload.url).to_return(status: 200, body: file_from_fixtures("smallest.png"))

View File

@ -21,7 +21,6 @@ describe CookedPostProcessor do
cpp.expects(:post_process_oneboxes).in_sequence(post_process) cpp.expects(:post_process_oneboxes).in_sequence(post_process)
cpp.expects(:post_process_images).in_sequence(post_process) cpp.expects(:post_process_images).in_sequence(post_process)
cpp.expects(:optimize_urls).in_sequence(post_process) cpp.expects(:optimize_urls).in_sequence(post_process)
cpp.expects(:pull_hotlinked_images).in_sequence(post_process)
cpp.post_process cpp.post_process
expect(PostUpload.exists?(post: post, upload: upload)).to eq(true) expect(PostUpload.exists?(post: post, upload: upload)).to eq(true)
@ -1543,102 +1542,6 @@ describe CookedPostProcessor do
end end
end end
context "#pull_hotlinked_images" do
let(:post) { build(:post, created_at: 20.days.ago) }
let(:cpp) { CookedPostProcessor.new(post) }
before { cpp.stubs(:available_disk_space).returns(90) }
it "runs even when download_remote_images_to_local is disabled" do
# We want to run it to pull hotlinked optimized images
SiteSetting.download_remote_images_to_local = false
expect { cpp.pull_hotlinked_images }.
to change { Jobs::PullHotlinkedImages.jobs.count }.by 1
end
context "when download_remote_images_to_local? is enabled" do
before do
SiteSetting.download_remote_images_to_local = true
end
it "disables download_remote_images if there is not enough disk space" do
cpp.expects(:available_disk_space).returns(5)
cpp.pull_hotlinked_images
expect(SiteSetting.download_remote_images_to_local).to eq(false)
end
it "does not run when requested to skip" do
CookedPostProcessor.new(post, skip_pull_hotlinked_images: true).pull_hotlinked_images
expect(Jobs::PullHotlinkedImages.jobs.size).to eq(0)
end
context "and there is enough disk space" do
before { cpp.expects(:disable_if_low_on_disk_space).at_least_once }
context "and the post has been updated by an actual user" do
before { post.id = 42 }
it "ensures only one job is scheduled right after the editing_grace_period" do
freeze_time
Jobs.expects(:cancel_scheduled_job).with(:pull_hotlinked_images, post_id: post.id).once
delay = SiteSetting.editing_grace_period + 1
expect_enqueued_with(job: :pull_hotlinked_images, args: { post_id: post.id }, at: Time.zone.now + delay.seconds) do
cpp.pull_hotlinked_images
end
end
end
end
end
end
context "#disable_if_low_on_disk_space" do
let(:post) { build(:post, created_at: 20.days.ago) }
let(:cpp) { CookedPostProcessor.new(post) }
before do
SiteSetting.download_remote_images_to_local = true
SiteSetting.download_remote_images_threshold = 20
cpp.stubs(:available_disk_space).returns(50)
end
it "does nothing when there's enough disk space" do
SiteSetting.expects(:download_remote_images_to_local=).never
cpp.disable_if_low_on_disk_space
end
context "when there's not enough disk space" do
before { SiteSetting.download_remote_images_threshold = 75 }
it "disables download_remote_images_threshold and send a notification to the admin" do
StaffActionLogger.any_instance.expects(:log_site_setting_change).once
SystemMessage.expects(:create_from_system_user).with(Discourse.site_contact_user, :download_remote_images_disabled).once
cpp.disable_if_low_on_disk_space
expect(SiteSetting.download_remote_images_to_local).to eq(false)
end
it "doesn't disable download_remote_images_to_local if site uses S3" do
setup_s3
cpp.disable_if_low_on_disk_space
expect(SiteSetting.download_remote_images_to_local).to eq(true)
end
end
end
context "#is_a_hyperlink?" do context "#is_a_hyperlink?" do
let(:post) { build(:post) } let(:post) { build(:post) }