From bd0a7553c4ea75b493b4f0a003cef659bc520f26 Mon Sep 17 00:00:00 2001 From: David Taylor Date: Thu, 13 Aug 2020 00:30:28 +0100 Subject: [PATCH] DEV: Detect when s3 inventory failure is caused by etag difference (#10427) --- lib/s3_inventory.rb | 14 +++++++++++++- spec/components/s3_inventory_spec.rb | 24 +++++++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb index 5b2fbffa471..e561d5cd306 100644 --- a/lib/s3_inventory.rb +++ b/lib/s3_inventory.rb @@ -67,6 +67,10 @@ class S3Inventory .joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag") .where("#{table_name}.etag IS NULL") + exists_with_different_etag = missing_uploads + .joins("LEFT JOIN #{table_name} inventory2 ON inventory2.url = #{model.table_name}.url") + .where("inventory2.etag IS NOT NULL").pluck(:id) + # marking as verified/not verified id_threshold_clause = model == Upload ? " AND model_table.id > #{model::SEEDED_ID_THRESHOLD}" : "" DB.exec(<<~SQL, inventory_date @@ -86,10 +90,18 @@ class S3Inventory if (missing_count = missing_uploads.count) > 0 missing_uploads.select(:id, :url).find_each do |upload| - log upload.url + if exists_with_different_etag.include?(upload.id) + log "#{upload.url} has different etag" + else + log upload.url + end end log "#{missing_count} of #{uploads.count} #{model.name.underscore.pluralize} are missing" + if exists_with_different_etag.present? + log "#{exists_with_different_etag.count} of these are caused by differing etags" + log "Null the etag column and re-run for automatic backfill" + end end Discourse.stats.set("missing_s3_#{model.table_name}", missing_count) diff --git a/spec/components/s3_inventory_spec.rb b/spec/components/s3_inventory_spec.rb index ea042131cec..2f4122d0145 100644 --- a/spec/components/s3_inventory_spec.rb +++ b/spec/components/s3_inventory_spec.rb @@ -64,7 +64,10 @@ describe "S3Inventory" do CSV.foreach(csv_filename, headers: false) do |row| next unless row[S3Inventory::CSV_KEY_INDEX].include?("default") - Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago) + Fabricate(:upload, + etag: row[S3Inventory::CSV_ETAG_INDEX], + url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]), + updated_at: 2.days.ago) end @upload1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago) @@ -84,6 +87,25 @@ describe "S3Inventory" do expect(Discourse.stats.get("missing_s3_uploads")).to eq(2) end + it "should detect when a url match exists with a different etag" do + differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0") + differing_etag.update_columns(etag: "somethingelse") + + output = capture_stdout do + inventory.backfill_etags_and_list_missing + end + + expect(output).to eq(<<~TEXT) + #{differing_etag.url} has different etag + #{@upload1.url} + #{@no_etag.url} + 3 of 5 uploads are missing + 1 of these are caused by differing etags + Null the etag column and re-run for automatic backfill + TEXT + expect(Discourse.stats.get("missing_s3_uploads")).to eq(3) + end + it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do expect(Upload.where(verified: nil).count).to eq(12) output = capture_stdout do