REFACTOR: Restoring of backups and migration of uploads to S3

This commit is contained in:
Gerhard Schlager
2020-01-13 00:12:27 +01:00
parent f10078eab4
commit e474cda321
37 changed files with 2454 additions and 1029 deletions

View File

@@ -224,59 +224,19 @@ def migrate_to_s3_all_sites
end
end
def migration_successful?(db, should_raise = false)
success = true
failure_message = "S3 migration failed for db '#{db}'."
prefix = ENV["MIGRATE_TO_MULTISITE"] ? "uploads/#{db}/original/" : "original/"
base_url = File.join(SiteSetting.Upload.s3_base_url, prefix)
count = Upload.by_users.where("url NOT LIKE '#{base_url}%'").count
error_message = "#{count} of #{Upload.count} uploads are not migrated to S3. #{failure_message}"
raise error_message if count > 0 && should_raise
success &&= count == 0
puts error_message if count > 0
cdn_path = SiteSetting.cdn_path("/uploads/#{db}/original").sub(/https?:/, "")
count = Post.where("cooked LIKE '%#{cdn_path}%'").count
error_message = "#{count} posts are not remapped to new S3 upload URL. #{failure_message}"
raise error_message if count > 0 && should_raise
success &&= count == 0
puts error_message if count > 0
Rake::Task['posts:missing_uploads'].invoke('single_site')
count = PostCustomField.where(name: Post::MISSING_UPLOADS).count
error_message = "rake posts:missing_uploads identified #{count} issues. #{failure_message}"
raise error_message if count > 0 && should_raise
success &&= count == 0
puts error_message if count > 0
count = Post.where('baked_version <> ? OR baked_version IS NULL', Post::BAKED_VERSION).count
if count > 0
puts "#{count} posts still require rebaking and will be rebaked during regular job"
if count > 100
puts "To speed up migrations of posts we recommend you run 'rake posts:rebake_uncooked_posts'"
end
success = false
else
puts "No posts require rebaking"
end
success
def migrate_to_s3
FileStore::ToS3Migration.new(
s3_options: FileStore::ToS3Migration.s3_options_from_env,
dry_run: !!ENV["DRY_RUN"],
migrate_to_multisite: !!ENV["MIGRATE_TO_MULTISITE"],
skip_etag_verify: !!ENV["SKIP_ETAG_VERIFY"]
).migrate
end
task "uploads:s3_migration_status" => :environment do
success = true
RailsMultisite::ConnectionManagement.each_connection do
db = RailsMultisite::ConnectionManagement.current_db
success &&= migration_successful?(db)
success &&= FileStore::ToS3Migration.new.migration_successful?
end
queued_jobs = Sidekiq::Stats.new.queues.sum { |_ , x| x }
@@ -293,266 +253,6 @@ task "uploads:s3_migration_status" => :environment do
puts "All sites appear to have uploads in order!"
end
def migrate_to_s3
# we don't want have migrated state, ensure we run all jobs here
Jobs.run_immediately!
db = RailsMultisite::ConnectionManagement.current_db
dry_run = !!ENV["DRY_RUN"]
puts "Checking if #{db} already migrated..."
return puts "Already migrated #{db}!" if migration_successful?(db)
puts "*" * 30 + " DRY RUN " + "*" * 30 if dry_run
puts "Migrating uploads to S3 for '#{db}'..."
if Upload.by_users.where("url NOT LIKE '//%' AND url NOT LIKE '#{GlobalSetting.relative_url_root}/uploads/#{db}/original/_X/%'").exists?
puts <<~TEXT
Some uploads were not migrated to the new scheme. Please run these commands in the rails console
SiteSetting.migrate_to_new_scheme = true
Jobs::MigrateUploadScheme.new.execute(nil)
TEXT
exit 1
end
unless ENV["DISCOURSE_S3_BUCKET"].present? &&
ENV["DISCOURSE_S3_REGION"].present? &&
(
(
ENV["DISCOURSE_S3_ACCESS_KEY_ID"].present? &&
ENV["DISCOURSE_S3_SECRET_ACCESS_KEY"].present?
) ||
ENV["DISCOURSE_S3_USE_IAM_PROFILE"].present?
)
puts <<~TEXT
Please provide the following environment variables
- DISCOURSE_S3_BUCKET
- DISCOURSE_S3_REGION
and either
- DISCOURSE_S3_ACCESS_KEY_ID
- DISCOURSE_S3_SECRET_ACCESS_KEY
or
- DISCOURSE_S3_USE_IAM_PROFILE
TEXT
exit 2
end
if SiteSetting.Upload.s3_cdn_url.blank?
puts "Please provide the 'DISCOURSE_S3_CDN_URL' environment variable"
exit 3
end
bucket_has_folder_path = true if ENV["DISCOURSE_S3_BUCKET"].include? "/"
public_directory = Rails.root.join("public").to_s
opts = {
region: ENV["DISCOURSE_S3_REGION"],
access_key_id: ENV["DISCOURSE_S3_ACCESS_KEY_ID"],
secret_access_key: ENV["DISCOURSE_S3_SECRET_ACCESS_KEY"]
}
# S3::Client ignores the `region` option when an `endpoint` is provided.
# Without `region`, non-default region bucket creation will break for S3, so we can only
# define endpoint when not using S3 i.e. when SiteSetting.s3_endpoint is provided.
opts[:endpoint] = SiteSetting.s3_endpoint if SiteSetting.s3_endpoint.present?
s3 = Aws::S3::Client.new(opts)
if bucket_has_folder_path
bucket, folder = S3Helper.get_bucket_and_folder_path(ENV["DISCOURSE_S3_BUCKET"])
folder = File.join(folder, "/")
else
bucket, folder = ENV["DISCOURSE_S3_BUCKET"], ""
end
puts "Uploading files to S3..."
print " - Listing local files"
local_files = []
IO.popen("cd #{public_directory} && find uploads/#{db}/original -type f").each do |file|
local_files << file.chomp
putc "." if local_files.size % 1000 == 0
end
puts " => #{local_files.size} files"
print " - Listing S3 files"
s3_objects = []
prefix = ENV["MIGRATE_TO_MULTISITE"] ? "uploads/#{db}/original/" : "original/"
options = { bucket: bucket, prefix: folder + prefix }
loop do
response = s3.list_objects_v2(options)
s3_objects.concat(response.contents)
putc "."
break if response.next_continuation_token.blank?
options[:continuation_token] = response.next_continuation_token
end
puts " => #{s3_objects.size} files"
puts " - Syncing files to S3"
synced = 0
failed = []
skip_etag_verify = ENV["SKIP_ETAG_VERIFY"].present?
local_files.each do |file|
path = File.join(public_directory, file)
name = File.basename(path)
etag = Digest::MD5.file(path).hexdigest unless skip_etag_verify
key = file[file.index(prefix)..-1]
key.prepend(folder) if bucket_has_folder_path
original_path = file.sub("uploads/#{db}", "")
if s3_object = s3_objects.find { |obj| obj.key.ends_with?(original_path) }
next if File.size(path) == s3_object.size && (skip_etag_verify || s3_object.etag[etag])
end
options = {
acl: "public-read",
body: File.open(path, "rb"),
bucket: bucket,
content_type: MiniMime.lookup_by_filename(name)&.content_type,
key: key,
}
if !FileHelper.is_supported_image?(name)
upload = Upload.find_by(url: "/#{file}")
if upload&.original_filename
options[:content_disposition] =
%Q{attachment; filename="#{upload.original_filename}"}
end
if upload&.secure
options[:acl] = "private"
end
end
etag ||= Digest::MD5.file(path).hexdigest
if dry_run
puts "#{file} => #{options[:key]}"
synced += 1
elsif s3.put_object(options).etag[etag]
putc "."
synced += 1
else
putc "X"
failed << path
end
end
puts
failure_message = "S3 migration failed for db '#{db}'."
if failed.size > 0
puts "Failed to upload #{failed.size} files"
puts failed.join("\n")
raise failure_message
elsif s3_objects.size + synced >= local_files.size
puts "Updating the URLs in the database..."
from = "/uploads/#{db}/original/"
to = "#{SiteSetting.Upload.s3_base_url}/#{prefix}"
if dry_run
puts "REPLACING '#{from}' WITH '#{to}'"
else
DbHelper.remap(from, to, anchor_left: true)
end
[
[
"src=\"/uploads/#{db}/original/(\\dX/(?:[a-f0-9]/)*[a-f0-9]{40}[a-z0-9\\.]*)",
"src=\"#{SiteSetting.Upload.s3_base_url}/#{prefix}\\1"
],
[
"src='/uploads/#{db}/original/(\\dX/(?:[a-f0-9]/)*[a-f0-9]{40}[a-z0-9\\.]*)",
"src='#{SiteSetting.Upload.s3_base_url}/#{prefix}\\1"
],
[
"href=\"/uploads/#{db}/original/(\\dX/(?:[a-f0-9]/)*[a-f0-9]{40}[a-z0-9\\.]*)",
"href=\"#{SiteSetting.Upload.s3_base_url}/#{prefix}\\1"
],
[
"href='/uploads/#{db}/original/(\\dX/(?:[a-f0-9]/)*[a-f0-9]{40}[a-z0-9\\.]*)",
"href='#{SiteSetting.Upload.s3_base_url}/#{prefix}\\1"
],
[
"\\[img\\]/uploads/#{db}/original/(\\dX/(?:[a-f0-9]/)*[a-f0-9]{40}[a-z0-9\\.]*)\\[/img\\]",
"[img]#{SiteSetting.Upload.s3_base_url}/#{prefix}\\1[/img]"
]
].each do |from_url, to_url|
if dry_run
puts "REPLACING '#{from_url}' WITH '#{to_url}'"
else
DbHelper.regexp_replace(from_url, to_url)
end
end
unless dry_run
# Legacy inline image format
Post.where("raw LIKE '%![](/uploads/default/original/%)%'").each do |post|
regexp = /!\[\](\/uploads\/#{db}\/original\/(\dX\/(?:[a-f0-9]\/)*[a-f0-9]{40}[a-z0-9\.]*))/
post.raw.scan(regexp).each do |upload_url, _|
upload = Upload.get_from_url(upload_url)
post.raw = post.raw.gsub("![](#{upload_url})", "![](#{upload.short_url})")
end
post.save!(validate: false)
end
end
if Discourse.asset_host.present?
# Uploads that were on local CDN will now be on S3 CDN
from = "#{Discourse.asset_host}/uploads/#{db}/original/"
to = "#{SiteSetting.Upload.s3_cdn_url}/#{prefix}"
if dry_run
puts "REMAPPING '#{from}' TO '#{to}'"
else
DbHelper.remap(from, to)
end
end
# Uploads that were on base hostname will now be on S3 CDN
from = "#{Discourse.base_url}/uploads/#{db}/original/"
to = "#{SiteSetting.Upload.s3_cdn_url}/#{prefix}"
if dry_run
puts "REMAPPING '#{from}' TO '#{to}'"
else
DbHelper.remap(from, to)
end
unless dry_run
puts "Removing old optimized images..."
OptimizedImage
.joins("LEFT JOIN uploads u ON optimized_images.upload_id = u.id")
.where("u.id IS NOT NULL AND u.url LIKE '//%' AND optimized_images.url NOT LIKE '//%'")
.delete_all
puts "Flagging all posts containing lightboxes for rebake..."
count = Post.where("cooked LIKE '%class=\"lightbox\"%'").update_all(baked_version: nil)
puts "#{count} posts were flagged for a rebake"
end
end
migration_successful?(db, true)
puts "Done!"
end
################################################################################
# clean_up #
################################################################################