# Reconstructed from patch 15de4ac8904fd8de9499a1884f1c84369ab2a003
# ("add a rake task to pull hotlinked images", Régis Hanol, Thu, 10 Oct 2013),
# which appends the code below to lib/tasks/images.rake, right after the
# "images:clean_orphans" task.
#
# NOTE(review): `download` relies on open-uri being loaded (the original code
# did too, through `Kernel#open` on a URL) - confirm the environment requires it.

desc "download all hotlinked images"
task "images:pull_hotlinked" => :environment do
  RailsMultisite::ConnectionManagement.each_connection do |db|
    # currently only works when using the local storage
    next if Discourse.store.external?

    puts "Pulling hotlinked images for: #{db}"

    # shorthand to the asset host
    asset_host = Rails.configuration.action_controller.asset_host
    # maximum size of the file in bytes
    max_size = SiteSetting.max_image_size_kb * 1024
    # maps a hotlinked src to the url of its local upload; a nil value records
    # a failed or oversized download so the same src is not fetched again
    upload_urls = {}

    Post.find_each do |post|
      has_changed = false

      extract_images_from(post.cooked).each do |image|
        src = image['src']

        # only images that are NOT already served by this site are hotlinked
        next if src.blank?
        next if src =~ %r{\A/[^/]}
        next if src.start_with?(Discourse.base_url_no_prefix)
        next if asset_host.present? && src.start_with?(asset_host)

        # stays nil when the src was already handled, so `ensure` has nothing to clean up
        hotlinked = nil

        begin
          # have we already downloaded that file?
          unless upload_urls.key?(src)
            # initialize
            upload_urls[src] = nil
            # download the file
            hotlinked = download(src, max_size)
            # if the hotlinked image is OK
            if hotlinked.size <= max_size
              file = ActionDispatch::Http::UploadedFile.new(tempfile: hotlinked, filename: File.basename(URI.parse(src).path))
              upload_urls[src] = Upload.create_for(post.user_id, file, hotlinked.size).url
            else
              puts "\nFailed to pull: #{src} for post ##{post.id} - too large\n"
            end
          end

          # if we have downloaded a file
          if upload_urls[src].present?
            rewrite_hotlinked_src_in(post.raw, src, upload_urls[src])
            # mark the post as changed
            has_changed = true
          end
        rescue => e
          puts "\nFailed to pull: #{src} for post ##{post.id} - #{e}\n"
        ensure
          # close & delete the temporary file
          hotlinked.close! if hotlinked
        end
      end

      if has_changed
        # since the raw has changed, we cook the post once again
        post.cooked = post.cook(post.raw, topic_id: post.topic_id, invalidate_oneboxes: true)
        # update both raw & cooked version of the post
        Post.exec_sql('update posts set cooked = ?, raw = ? where id = ?', post.cooked, post.raw, post.id)
        # trigger the post processing
        post.trigger_post_process
        putc "#"
      else
        putc "."
      end
    end
  end
  puts "\ndone."
end

# Rewrites, in place, every reference to the hotlinked image `src` found in
# `raw` so that it points at `upload_url` instead.
#
# raw        - the raw text of a post (mutated)
# src        - the url of the hotlinked image, matched literally
# upload_url - the url of the local copy of the image
#
# Returns `raw`.
def rewrite_hotlinked_src_in(raw, src, upload_url)
  # the url is user supplied: escape every regexp metacharacter, not only "?", "." and "+"
  escaped_src = Regexp.escape(src)

  # The block form of gsub! is used throughout so that a backslash in
  # `upload_url` is never interpreted as a back-reference.
  #
  # there are 4 ways to insert an image in a post
  # HTML tag - <img src="http://...">
  raw.gsub!(/src=["']#{escaped_src}["']/i) { "src='#{upload_url}'" }
  # BBCode tag - [img]http://...[/img]
  raw.gsub!(/\[img\]#{escaped_src}\[\/img\]/i) { "[img]#{upload_url}[/img]" }
  # Markdown - ![alt](http://...)
  raw.gsub!(/!\[([^\]]*)\]\(#{escaped_src}\)/) { "![#{$1}](#{upload_url})" }
  # Direct link - whatever is left is a bare url: turn it into an image tag
  # rather than removing it from the post
  raw.gsub!(src) { "<img src='#{upload_url}'>" }

  raw
end

# Returns the <img> nodes found in the `html` fragment, leaving out the images
# located inside a ".onebox-result" element and the images with the "avatar" class.
def extract_images_from(html)
  doc = Nokogiri::HTML.fragment(html)
  doc.css("img") - doc.css(".onebox-result img") - doc.css("img.avatar")
end

# Downloads `url` into a temporary file.
#
# url      - an absolute http(s) url
# max_size - size in bytes after which the download stops; the returned file
#            is then larger than `max_size`, which is how callers detect
#            oversized images
#
# Returns the Tempfile, rewound to its beginning. The caller is responsible
# for closing and deleting it.
# Raises ArgumentError when `url` is not an http(s) url, and whatever the
# parsing or the download raises otherwise (the temporary file is removed first).
def download(url, max_size)
  uri = URI.parse(url)
  # Kernel#open would treat anything that is not a remote url as a local path
  unless %w[http https].include?(uri.scheme.to_s.downcase)
    raise ArgumentError, "not an http(s) url: #{url}"
  end

  # a fixed chunk size keeps the loop finite even when max_size is 0
  chunk_size = 16 * 1024

  # create a temporary file
  temp_file = Tempfile.new(["discourse-hotlinked", File.extname(uri.path)])
  temp_file.binmode

  begin
    # download the hotlinked image; the block form closes the remote stream for us
    uri.open("rb", read_timeout: 5) do |remote|
      while temp_file.size <= max_size && (chunk = remote.read(chunk_size))
        temp_file.write(chunk)
      end
    end
    temp_file.rewind
  rescue StandardError
    # do not leak the temporary file when the download fails
    temp_file.close!
    raise
  end

  temp_file
end