From 8cf4ed5f884a9462c810774304b7d4d13ff1fa1d Mon Sep 17 00:00:00 2001 From: Alan Guo Xiang Tan Date: Mon, 10 Jun 2024 13:16:00 +0800 Subject: [PATCH] DEV: Introduce hidden `s3_inventory_bucket` site setting (#27304) This commit introduces a hidden `s3_inventory_bucket` site setting which replaces the `enable_s3_inventory` and `s3_configure_inventory_policy` site setting. The reason `enable_s3_inventory` and `s3_configure_inventory_policy` site settings are removed is because this feature has technically been broken since it was introduced. When the `enable_s3_inventory` feature is turned on, the app will because configure a daily inventory policy for the `s3_upload_bucket` bucket and store the inventories under a prefix in the bucket. The problem here is that once the inventories are created, there is nothing cleaning up all these inventories so whoever that has enabled this feature would have been paying the cost of storing a whole bunch of inventory files which are never used. Given that we have not received any complains about inventory files inflating S3 storage costs, we think that it is very likely that this feature is no longer being used and we are looking to drop support for this feature in the not too distance future. For now, we will still support a hidden `s3_inventory_bucket` site setting which site administrators can configure via the `DISCOURSE_S3_INVENTORY_BUCKET` env. --- app/jobs/regular/update_s3_inventory.rb | 21 ---- .../scheduled/ensure_s3_uploads_existence.rb | 11 +- .../initializers/014-track-setting-changes.rb | 2 - config/locales/server.en.yml | 1 - config/site_settings.yml | 9 +- lib/file_store/s3_store.rb | 10 +- lib/s3_helper.rb | 13 ++- lib/s3_inventory.rb | 103 +++--------------- lib/site_settings/validations.rb | 6 - spec/jobs/ensure_s3_uploads_existence_spec.rb | 11 +- spec/jobs/update_s3_inventory_spec.rb | 63 ----------- spec/lib/s3_inventory_multisite_spec.rb | 10 +- spec/lib/s3_inventory_spec.rb | 62 +++-------- 13 files changed, 62 insertions(+), 260 deletions(-) delete mode 100644 app/jobs/regular/update_s3_inventory.rb delete mode 100644 spec/jobs/update_s3_inventory_spec.rb diff --git a/app/jobs/regular/update_s3_inventory.rb b/app/jobs/regular/update_s3_inventory.rb deleted file mode 100644 index 699db498860..00000000000 --- a/app/jobs/regular/update_s3_inventory.rb +++ /dev/null @@ -1,21 +0,0 @@ -# frozen_string_literal: true - -require "s3_inventory" - -module Jobs - # if upload bucket changes or inventory bucket changes we want to update s3 bucket policy and inventory configuration - class UpdateS3Inventory < ::Jobs::Base - def execute(args) - unless SiteSetting.enable_s3_inventory? && SiteSetting.Upload.enable_s3_uploads && - SiteSetting.s3_configure_inventory_policy - return - end - - %i[upload optimized].each do |type| - s3_inventory = S3Inventory.new(Discourse.store.s3_helper, type) - s3_inventory.update_bucket_policy if type == :upload - s3_inventory.update_bucket_inventory_configuration - end - end - end -end diff --git a/app/jobs/scheduled/ensure_s3_uploads_existence.rb b/app/jobs/scheduled/ensure_s3_uploads_existence.rb index 29338784433..c7a80e16a73 100644 --- a/app/jobs/scheduled/ensure_s3_uploads_existence.rb +++ b/app/jobs/scheduled/ensure_s3_uploads_existence.rb @@ -15,10 +15,6 @@ module Jobs end end - def s3_helper - Discourse.store.s3_helper - end - def prepare_for_all_sites inventory = S3Inventory.new(s3_helper, :upload) @db_inventories = inventory.prepare_for_all_sites @@ -26,8 +22,7 @@ module Jobs end def execute(args) - return if !SiteSetting.enable_s3_inventory - require "s3_inventory" + return if (s3_inventory_bucket = SiteSetting.s3_inventory_bucket).blank? if !@db_inventories && Rails.configuration.multisite && GlobalSetting.use_s3? prepare_for_all_sites @@ -37,13 +32,13 @@ module Jobs preloaded_inventory_file = @db_inventories[RailsMultisite::ConnectionManagement.current_db] S3Inventory.new( - s3_helper, :upload, + s3_inventory_bucket:, preloaded_inventory_file: preloaded_inventory_file, preloaded_inventory_date: @inventory_date, ).backfill_etags_and_list_missing else - S3Inventory.new(s3_helper, :upload).backfill_etags_and_list_missing + S3Inventory.new(:upload, s3_inventory_bucket:).backfill_etags_and_list_missing end end end diff --git a/config/initializers/014-track-setting-changes.rb b/config/initializers/014-track-setting-changes.rb index d6b3d750338..99347537d60 100644 --- a/config/initializers/014-track-setting-changes.rb +++ b/config/initializers/014-track-setting-changes.rb @@ -53,8 +53,6 @@ DiscourseEvent.on(:site_setting_changed) do |name, old_value, new_value| Scheduler::Defer.later("Null topic slug") { Topic.update_all(slug: nil) } end - Jobs.enqueue(:update_s3_inventory) if %i[enable_s3_inventory s3_upload_bucket].include?(name) - SvgSprite.expire_cache if name.to_s.include?("_icon") SiteIconManager.ensure_optimized! if SiteIconManager::WATCHED_SETTINGS.include?(name) diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 72b9a2d003d..7a17ca17631 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -1973,7 +1973,6 @@ en: s3_endpoint: "The endpoint can be modified to backup to an S3 compatible service like DigitalOcean Spaces or Minio. WARNING: Leave blank if using AWS S3." s3_configure_tombstone_policy: "Enable automatic deletion policy for tombstone uploads. IMPORTANT: If disabled, no space will be reclaimed after uploads are deleted." s3_disable_cleanup: "Prevent removal of old backups from S3 when there are more backups than the maximum allowed." - enable_s3_inventory: "Generate reports and verify uploads using Amazon S3 inventory. IMPORTANT: requires valid S3 credentials (both access key id & secret access key)." s3_use_acls: "AWS recommends not using ACLs on S3 buckets; if you are following this advice, uncheck this option. This must be enabled if you are using secure uploads." backup_time_of_day: "Time of day UTC when the backup should occur." backup_with_uploads: "Include uploads in scheduled backups. Disabling this will only backup the database." diff --git a/config/site_settings.yml b/config/site_settings.yml index 20c72785120..c540d295d78 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -1581,11 +1581,6 @@ files: default: true s3_use_acls: default: true - enable_s3_inventory: - default: false - s3_configure_inventory_policy: - default: true - hidden: true s3_presigned_get_url_expires_after_seconds: default: 300 hidden: true @@ -2487,6 +2482,10 @@ backups: s3_backup_bucket: default: "" regex: '^[a-z0-9\-\/]+$' # can't use '.' when using HTTPS + s3_inventory_bucket: + hidden: true + default: "" + regex: '^[a-z0-9\-\/_]+$' # can't use '.' when using HTTPS s3_disable_cleanup: default: false backup_time_of_day: diff --git a/lib/file_store/s3_store.rb b/lib/file_store/s3_store.rb index 55f6668f156..e363a1dea33 100644 --- a/lib/file_store/s3_store.rb +++ b/lib/file_store/s3_store.rb @@ -297,10 +297,12 @@ module FileStore end def list_missing_uploads(skip_optimized: false) - if SiteSetting.enable_s3_inventory - require "s3_inventory" - S3Inventory.new(s3_helper, :upload).backfill_etags_and_list_missing - S3Inventory.new(s3_helper, :optimized).backfill_etags_and_list_missing unless skip_optimized + if s3_inventory_bucket = SiteSetting.s3_inventory_bucket + S3Inventory.new(:upload, s3_inventory_bucket:).backfill_etags_and_list_missing + + unless skip_optimized + S3Inventory.new(:optimized, s3_inventory_bucket:).backfill_etags_and_list_missing + end else list_missing(Upload.by_users, "original/") list_missing(OptimizedImage, "optimized/") unless skip_optimized diff --git a/lib/s3_helper.rb b/lib/s3_helper.rb index e8d5a05e9e0..60b832f5aff 100644 --- a/lib/s3_helper.rb +++ b/lib/s3_helper.rb @@ -282,7 +282,12 @@ class S3Helper end def s3_client - @s3_client ||= Aws::S3::Client.new(@s3_options) + @s3_client ||= init_aws_s3_client + end + + def stub_client_responses! + raise "This method is only allowed to be used in the testing environment" if !Rails.env.test? + @s3_client = init_aws_s3_client(stub_responses: true) end def s3_inventory_path(path = "inventory") @@ -381,6 +386,12 @@ class S3Helper private + def init_aws_s3_client(stub_responses: false) + options = @s3_options + options = options.merge(stub_responses: true) if stub_responses + Aws::S3::Client.new(options) + end + def fetch_bucket_cors_rules begin s3_resource.client.get_bucket_cors(bucket: @s3_bucket_name).cors_rules&.map(&:to_h) || [] diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb index 73fb7ab46a4..eec2b1c4fe3 100644 --- a/lib/s3_inventory.rb +++ b/lib/s3_inventory.rb @@ -4,17 +4,21 @@ require "aws-sdk-s3" require "csv" class S3Inventory - attr_reader :type, :inventory_date + attr_reader :type, :inventory_date, :s3_helper CSV_KEY_INDEX = 1 CSV_ETAG_INDEX = 2 INVENTORY_PREFIX = "inventory" - INVENTORY_VERSION = "1" INVENTORY_LAG = 2.days WAIT_AFTER_RESTORE_DAYS = 2 - def initialize(s3_helper, type, preloaded_inventory_file: nil, preloaded_inventory_date: nil) - @s3_helper = s3_helper + def initialize( + type, + s3_inventory_bucket:, + preloaded_inventory_file: nil, + preloaded_inventory_date: nil + ) + @s3_helper = S3Helper.new(s3_inventory_bucket) if preloaded_inventory_file && preloaded_inventory_date # Data preloaded, so we don't need to fetch it again @@ -189,43 +193,6 @@ class S3Inventory ) end - def update_bucket_policy - @s3_helper.s3_client.put_bucket_policy( - bucket: bucket_name, - policy: { - Version: "2012-10-17", - Statement: [ - { - Sid: "InventoryAndAnalyticsPolicy", - Effect: "Allow", - Principal: { - Service: "s3.amazonaws.com", - }, - Action: ["s3:PutObject"], - Resource: ["#{inventory_path_arn}/*"], - Condition: { - ArnLike: { - "aws:SourceArn": bucket_arn, - }, - StringEquals: { - "s3:x-amz-acl": "bucket-owner-full-control", - }, - }, - }, - ], - }.to_json, - ) - end - - def update_bucket_inventory_configuration - @s3_helper.s3_client.put_bucket_inventory_configuration( - bucket: bucket_name, - id: inventory_id, - inventory_configuration: inventory_configuration, - use_accelerate_endpoint: false, - ) - end - def prepare_for_all_sites db_names = RailsMultisite::ConnectionManagement.all_dbs db_files = {} @@ -248,6 +215,10 @@ class S3Inventory cleanup! end + def s3_client + @s3_helper.s3_client + end + private def cleanup! @@ -318,31 +289,6 @@ class S3Inventory end end - def inventory_configuration - filter_prefix = type - filter_prefix = bucket_folder_path if bucket_folder_path.present? - - { - destination: { - s3_bucket_destination: { - bucket: bucket_arn, - prefix: inventory_path, - format: "CSV", - }, - }, - filter: { - prefix: filter_prefix, - }, - is_enabled: SiteSetting.enable_s3_inventory, - id: inventory_id, - included_object_versions: "Current", - optional_fields: ["ETag"], - schedule: { - frequency: "Daily", - }, - } - end - def bucket_name @s3_helper.s3_bucket_name end @@ -353,8 +299,7 @@ class S3Inventory def unsorted_files objects = [] - - hive_path = File.join(inventory_path, bucket_name, inventory_id, "hive") + hive_path = File.join(bucket_folder_path, "hive") @s3_helper.list(hive_path).each { |obj| objects << obj if obj.key.match?(/symlink\.txt\z/i) } objects @@ -363,28 +308,6 @@ class S3Inventory [] end - def inventory_id - @inventory_id ||= - begin - id = Rails.configuration.multisite ? "original" : type # TODO: rename multisite path to "uploads" - bucket_folder_path.present? ? "#{bucket_folder_path}-#{id}" : id - end - end - - def inventory_path_arn - File.join(bucket_arn, inventory_path) - end - - def inventory_path - path = File.join(INVENTORY_PREFIX, INVENTORY_VERSION) - path = File.join(bucket_folder_path, path) if bucket_folder_path.present? - path - end - - def bucket_arn - "arn:aws:s3:::#{bucket_name}" - end - def log(message, ex = nil) puts(message) Rails.logger.error("#{ex}\n" + (ex.backtrace || []).join("\n")) if ex diff --git a/lib/site_settings/validations.rb b/lib/site_settings/validations.rb index 4a25bc9eda4..828537c5439 100644 --- a/lib/site_settings/validations.rb +++ b/lib/site_settings/validations.rb @@ -187,12 +187,6 @@ module SiteSettings::Validations end end - def validate_enable_s3_inventory(new_val) - if new_val == "t" && !SiteSetting.Upload.enable_s3_uploads - validate_error :enable_s3_uploads_is_required - end - end - def validate_backup_location(new_val) return unless new_val == BackupLocationSiteSetting::S3 if SiteSetting.s3_backup_bucket.blank? diff --git a/spec/jobs/ensure_s3_uploads_existence_spec.rb b/spec/jobs/ensure_s3_uploads_existence_spec.rb index e7c8d267030..c4916548515 100644 --- a/spec/jobs/ensure_s3_uploads_existence_spec.rb +++ b/spec/jobs/ensure_s3_uploads_existence_spec.rb @@ -3,11 +3,8 @@ RSpec.describe Jobs::EnsureS3UploadsExistence do subject(:job) { described_class.new } - context "with S3 inventory enabled" do - before do - setup_s3 - SiteSetting.enable_s3_inventory = true - end + context "when `s3_inventory_bucket` has been set" do + before { SiteSetting.s3_inventory_bucket = "some-bucket-name" } it "works" do S3Inventory.any_instance.expects(:backfill_etags_and_list_missing).once @@ -15,8 +12,8 @@ RSpec.describe Jobs::EnsureS3UploadsExistence do end end - context "with S3 inventory disabled" do - before { SiteSetting.enable_s3_inventory = false } + context "when `s3_inventory_bucket` has not been set" do + before { SiteSetting.s3_inventory_bucket = nil } it "doesn't execute" do S3Inventory.any_instance.expects(:backfill_etags_and_list_missing).never diff --git a/spec/jobs/update_s3_inventory_spec.rb b/spec/jobs/update_s3_inventory_spec.rb deleted file mode 100644 index ab43aa4060b..00000000000 --- a/spec/jobs/update_s3_inventory_spec.rb +++ /dev/null @@ -1,63 +0,0 @@ -# frozen_string_literal: true - -require "file_store/s3_store" - -RSpec.describe Jobs::UpdateS3Inventory do - before do - setup_s3 - SiteSetting.s3_upload_bucket = "special-bucket" - SiteSetting.enable_s3_inventory = true - - store = FileStore::S3Store.new - @client = Aws::S3::Client.new(stub_responses: true) - store.s3_helper.stubs(:s3_client).returns(@client) - Discourse.stubs(:store).returns(store) - end - - it "updates the bucket policy and inventory configuration in S3" do - id = "original" - path = File.join(S3Inventory::INVENTORY_PREFIX, S3Inventory::INVENTORY_VERSION) - - @client.expects(:put_bucket_policy).with( - bucket: "special-bucket", - policy: - %Q|{"Version":"2012-10-17","Statement":[{"Sid":"InventoryAndAnalyticsPolicy","Effect":"Allow","Principal":{"Service":"s3.amazonaws.com"},"Action":["s3:PutObject"],"Resource":["arn:aws:s3:::special-bucket/#{path}/*"],"Condition":{"ArnLike":{"aws:SourceArn":"arn:aws:s3:::special-bucket"},"StringEquals":{"s3:x-amz-acl":"bucket-owner-full-control"}}}]}|, - ) - @client.expects(:put_bucket_inventory_configuration) - @client.expects(:put_bucket_inventory_configuration).with( - bucket: "special-bucket", - id: id, - inventory_configuration: { - destination: { - s3_bucket_destination: { - bucket: "arn:aws:s3:::special-bucket", - prefix: path, - format: "CSV", - }, - }, - filter: { - prefix: id, - }, - is_enabled: true, - id: id, - included_object_versions: "Current", - optional_fields: ["ETag"], - schedule: { - frequency: "Daily", - }, - }, - use_accelerate_endpoint: false, - ) - - described_class.new.execute(nil) - end - - it "doesn't update the policy with s3_configure_inventory_policy disabled" do - SiteSetting.s3_configure_inventory_policy = false - - @client.expects(:put_bucket_policy).never - @client.expects(:put_bucket_inventory_configuration).never - - described_class.new.execute(nil) - end -end diff --git a/spec/lib/s3_inventory_multisite_spec.rb b/spec/lib/s3_inventory_multisite_spec.rb index 19cb36313c8..9fab4407274 100644 --- a/spec/lib/s3_inventory_multisite_spec.rb +++ b/spec/lib/s3_inventory_multisite_spec.rb @@ -5,15 +5,16 @@ require "s3_inventory" require "file_store/s3_store" RSpec.describe "S3Inventory", type: :multisite do - let(:client) { Aws::S3::Client.new(stub_responses: true) } - let(:helper) { S3Helper.new(SiteSetting.Upload.s3_upload_bucket.downcase, "", client: client) } - let(:inventory) { S3Inventory.new(helper, :upload) } + let(:inventory) do + S3Inventory.new(:upload, s3_inventory_bucket: "some-inventory-bucket/some/prefix") + end + let(:csv_filename) { "#{Rails.root}/spec/fixtures/csv/s3_inventory.csv" } it "can create per-site files" do freeze_time setup_s3 - SiteSetting.enable_s3_inventory = true + inventory.stubs(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]) inventory.stubs(:cleanup!) @@ -23,6 +24,7 @@ RSpec.describe "S3Inventory", type: :multisite do expect(db1.lines.count).to eq(3) expect(db2.lines.count).to eq(1) + files.values.each do |f| f.close f.unlink diff --git a/spec/lib/s3_inventory_spec.rb b/spec/lib/s3_inventory_spec.rb index c1b07b743eb..6021fb11bdc 100644 --- a/spec/lib/s3_inventory_spec.rb +++ b/spec/lib/s3_inventory_spec.rb @@ -1,25 +1,19 @@ # frozen_string_literal: true -require "s3_helper" -require "s3_inventory" -require "file_store/s3_store" - RSpec.describe S3Inventory do - let(:client) { Aws::S3::Client.new(stub_responses: true) } - let(:resource) { Aws::S3::Resource.new(client: client) } - let(:bucket) { resource.bucket(SiteSetting.Upload.s3_upload_bucket.downcase) } - let(:helper) { S3Helper.new(bucket.name, "", client: client, bucket: bucket) } - let(:inventory) { S3Inventory.new(helper, :upload) } + let(:inventory) do + S3Inventory.new(:upload, s3_inventory_bucket: "some-inventory-bucket/inventoried-bucket/prefix") + end + let(:csv_filename) { "#{Rails.root}/spec/fixtures/csv/s3_inventory.csv" } before do - setup_s3 - SiteSetting.enable_s3_inventory = true + inventory.s3_helper.stub_client_responses! inventory.stubs(:cleanup!) end it "should raise error if an inventory file is not found" do - client.stub_responses(:list_objects, contents: []) + inventory.s3_client.stub_responses(:list_objects, contents: []) output = capture_stdout { inventory.backfill_etags_and_list_missing } expect(output).to eq("Failed to list inventory from S3\n") end @@ -141,7 +135,7 @@ RSpec.describe S3Inventory do end it "should run if inventory files are at least #{described_class::WAIT_AFTER_RESTORE_DAYS.days} days older than the last restore date" do - client.stub_responses( + inventory.s3_client.stub_responses( :list_objects_v2, { contents: [ @@ -155,16 +149,13 @@ RSpec.describe S3Inventory do }, ) - client.expects(:get_object).once + inventory.s3_client.expects(:get_object).once - capture_stdout do - inventory = described_class.new(helper, :upload) - inventory.backfill_etags_and_list_missing - end + capture_stdout { inventory.backfill_etags_and_list_missing } end it "should not run if inventory files are not at least #{described_class::WAIT_AFTER_RESTORE_DAYS.days} days older than the last restore date" do - client.stub_responses( + inventory.s3_client.stub_responses( :list_objects_v2, { contents: [ @@ -177,12 +168,9 @@ RSpec.describe S3Inventory do }, ) - client.expects(:get_object).never + inventory.s3_client.expects(:get_object).never - capture_stdout do - inventory = described_class.new(helper, :upload) - inventory.backfill_etags_and_list_missing - end + capture_stdout { inventory.backfill_etags_and_list_missing } end end @@ -203,11 +191,12 @@ RSpec.describe S3Inventory do File.open(csv_filename) do |f| preloaded_inventory = S3Inventory.new( - helper, :upload, + s3_inventory_bucket: "some-inventory-bucket", preloaded_inventory_file: f, preloaded_inventory_date: Time.now, ) + preloaded_inventory.backfill_etags_and_list_missing end end @@ -215,27 +204,4 @@ RSpec.describe S3Inventory do expect(output).to eq("#{upload.url}\n#{no_etag.url}\n2 of 5 uploads are missing\n") expect(Discourse.stats.get("missing_s3_uploads")).to eq(2) end - - describe "s3 inventory configuration" do - let(:bucket_name) { "s3-upload-bucket" } - let(:subfolder_path) { "subfolder" } - before { SiteSetting.s3_upload_bucket = "#{bucket_name}/#{subfolder_path}" } - - it "is formatted correctly for subfolders" do - s3_helper = S3Helper.new(SiteSetting.Upload.s3_upload_bucket.downcase, "", client: client) - config = S3Inventory.new(s3_helper, :upload).send(:inventory_configuration) - - expect(config[:destination][:s3_bucket_destination][:bucket]).to eq( - "arn:aws:s3:::#{bucket_name}", - ) - expect(config[:destination][:s3_bucket_destination][:prefix]).to eq( - "#{subfolder_path}/inventory/1", - ) - expect(config[:id]).to eq("#{subfolder_path}-original") - expect(config[:schedule][:frequency]).to eq("Daily") - expect(config[:included_object_versions]).to eq("Current") - expect(config[:optional_fields]).to eq(["ETag"]) - expect(config[:filter][:prefix]).to eq(subfolder_path) - end - end end