Revert "FEATURE: Mark bad uploads with :invalid_url (#29640)" (#29657)

This reverts commit 5a00a041f1.

Implementation is currently not correct. Multiple uploads can share the
same etag but have different paths in the S3 bucket.
This commit is contained in:
Alan Guo Xiang Tan 2024-11-08 13:04:52 +08:00 committed by GitHub
parent 4bc030f76f
commit af642d0d69
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 7 additions and 67 deletions

View File

@ -66,25 +66,13 @@ class Upload < ActiveRecord::Base
scope :with_invalid_etag_verification_status,
-> { where(verification_status: Upload.verification_statuses[:invalid_etag]) }
scope :with_invalid_url_verification_status,
-> { where(verification_status: Upload.verification_statuses[:invalid_url]) }
def self.verification_statuses
@verification_statuses ||=
Enum.new(
unchecked: 1,
verified: 2,
# Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to
# the ETag value of the inventory file. A upload with invalid ETag is equivalent to "missing
# upload file"
invalid_etag: 3,
# Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a
# file in the S3 file store
s3_file_missing_confirmed: 4,
# Used by S3Inventory to mark S3 Upload records that have an invalid url value compared to
# the url value of the inventory file. A upload with invalid URL is equivalent to "file
# exists (same ETag), but with a different URL"
invalid_url: 5,
invalid_etag: 3, # Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to the ETag value of the inventory file
s3_file_missing_confirmed: 4, # Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a file in the S3 file store
)
end

View File

@ -85,9 +85,7 @@ class S3Inventory
missing_uploads =
uploads.joins(
"LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag",
).where(
"#{tmp_table_name}.etag IS NULL OR #{tmp_table_name}.url != #{table_name}.url",
)
).where("#{tmp_table_name}.etag IS NULL")
exists_with_different_etag =
missing_uploads
@ -97,20 +95,11 @@ class S3Inventory
.where("inventory2.etag IS NOT NULL")
.pluck(:id)
exists_with_different_url =
missing_uploads
.joins(
"LEFT JOIN #{tmp_table_name} inventory3 ON inventory3.etag = #{table_name}.etag",
)
.where("inventory3.url != #{table_name}.url")
.pluck(:id)
# marking as verified/not verified
if @model == Upload
sql_params = {
inventory_date: inventory_date,
invalid_etag: Upload.verification_statuses[:invalid_etag],
invalid_url: Upload.verification_statuses[:invalid_url],
s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed],
verified: Upload.verification_statuses[:verified],
seeded_id_threshold: @model::SEEDED_ID_THRESHOLD,
@ -146,22 +135,6 @@ class S3Inventory
WHERE #{tmp_table_name}.etag = #{table_name}.etag
)
SQL
DB.exec(<<~SQL, sql_params)
UPDATE #{table_name}
SET verification_status = :invalid_url
WHERE verification_status <> :invalid_url
AND verification_status <> :invalid_etag
AND verification_status <> :s3_file_missing_confirmed
AND updated_at < :inventory_date
AND id > :seeded_id_threshold
AND NOT EXISTS
(
SELECT 1
FROM #{tmp_table_name}
WHERE #{tmp_table_name}.url = #{table_name}.url
)
SQL
end
if (missing_count = missing_uploads.count) > 0
@ -170,8 +143,6 @@ class S3Inventory
.find_each do |upload|
if exists_with_different_etag.include?(upload.id)
log "#{upload.url} has different etag"
elsif exists_with_different_url.include?(upload.id)
log "#{upload.url} has different url"
else
log upload.url
end
@ -182,10 +153,6 @@ class S3Inventory
log "#{exists_with_different_etag.count} of these are caused by differing etags"
log "Null the etag column and re-run for automatic backfill"
end
if exists_with_different_url.present?
log "#{exists_with_different_url.count} of these are caused by differing urls"
log "Empty the url column and re-run for automatic backfill"
end
end
set_missing_s3_discourse_stats(missing_count)

View File

@ -74,29 +74,20 @@ RSpec.describe S3Inventory do
differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0")
differing_etag.update_columns(etag: "somethingelse")
differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a")
differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault"))
output = capture_stdout { inventory.backfill_etags_and_list_missing }
expect(output).to eq(<<~TEXT)
#{differing_etag.url} has different etag
#{differing_url.url} has different url
#{@upload_1.url}
#{@no_etag.url}
4 of 5 uploads are missing
3 of 5 uploads are missing
1 of these are caused by differing etags
Null the etag column and re-run for automatic backfill
1 of these are caused by differing urls
Empty the url column and re-run for automatic backfill
TEXT
expect(Discourse.stats.get("missing_s3_uploads")).to eq(4)
expect(Discourse.stats.get("missing_s3_uploads")).to eq(3)
end
it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do
differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a")
differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault"))
expect(
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
).to eq(12)
@ -105,10 +96,9 @@ RSpec.describe S3Inventory do
verification_status = Upload.pluck(:verification_status)
expect(
Upload.where(verification_status: Upload.verification_statuses[:verified]).count,
).to eq(2)
).to eq(3)
expect(Upload.with_invalid_etag_verification_status.count).to eq(2)
expect(Upload.with_invalid_url_verification_status.count).to eq(1)
expect(
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
@ -208,12 +198,7 @@ RSpec.describe S3Inventory do
CSV.foreach(csv_filename, headers: false) do |row|
next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default")
Fabricate(
:upload,
url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]),
etag: row[S3Inventory::CSV_ETAG_INDEX],
updated_at: 2.days.ago,
)
Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago)
end
upload = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)