mirror of
https://github.com/discourse/discourse.git
synced 2024-11-22 08:57:10 -06:00
FIX: Apply crawler rate limits to cached requests (#27174)
This commit moves the logic for crawler rate limits out of the application controller and into the request tracker middleware. The reason for this move is to apply rate limits to all crawler requests instead of just the requests that make it to the application controller. Some requests are served early from the middleware stack without reaching the Rails app for performance reasons (e.g. `AnonymousCache`) which results in crawlers getting 200 responses even though they've reached their limits and should be getting 429 responses. Internal topic: t/128810.
This commit is contained in:
parent
7992d7a65a
commit
361992bb74
@ -29,7 +29,6 @@ class ApplicationController < ActionController::Base
|
||||
end
|
||||
end
|
||||
|
||||
before_action :rate_limit_crawlers
|
||||
before_action :check_readonly_mode
|
||||
before_action :handle_theme
|
||||
before_action :set_current_user_for_logs
|
||||
@ -1053,28 +1052,6 @@ class ApplicationController < ActionController::Base
|
||||
Theme.where(id: ids).pluck(:id, :name).to_h.to_json
|
||||
end
|
||||
|
||||
# Before-action: throttles anonymous requests whose User-Agent matches one of
# the patterns in the slow_down_crawler_user_agents site setting.
#
# Only the first matching pattern is rate-limited (iteration stops there), and
# matching is case-insensitive. Logged-in users are exempt. When the limiter
# trips, RateLimiter::LimitExceeded propagates to the global rescue handler.
def rate_limit_crawlers
  return if current_user.present?

  throttled_patterns = SiteSetting.slow_down_crawler_user_agents
  return if throttled_patterns.blank?

  agent = request.user_agent&.downcase
  return if agent.blank?

  throttled_patterns
    .downcase
    .split("|")
    .each do |pattern|
      next unless agent.include?(pattern)

      # One bucket per configured pattern; the error_code surfaces in the 429
      # response so the crawler operator can identify which limit was hit.
      bucket = "#{pattern}_crawler_rate_limit"
      RateLimiter.new(nil, bucket, 1, SiteSetting.slow_down_crawler_rate, error_code: bucket).performed!
      break
    end
end
|
||||
|
||||
def run_second_factor!(action_class, action_data: nil, target_user: current_user)
|
||||
if current_user && target_user != current_user
|
||||
# Anon can run 2fa against another target, but logged-in users should not.
|
||||
|
@ -267,6 +267,20 @@ class Middleware::RequestTracker
|
||||
end
|
||||
return 429, headers, [message]
|
||||
end
|
||||
|
||||
if !cookie
|
||||
if error_details = check_crawler_limits(env)
|
||||
available_in, error_code = error_details
|
||||
message = "Too many crawling requests. Error code: #{error_code}."
|
||||
headers = {
|
||||
"Content-Type" => "text/plain",
|
||||
"Retry-After" => available_in.to_s,
|
||||
"Discourse-Rate-Limit-Error-Code" => error_code,
|
||||
}
|
||||
return 429, headers, [message]
|
||||
end
|
||||
end
|
||||
|
||||
env["discourse.request_tracker"] = self
|
||||
|
||||
MethodProfiler.start
|
||||
@ -443,4 +457,30 @@ class Middleware::RequestTracker
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Checks whether a crawler request has exceeded its configured rate limit.
#
# Reads the raw Rack env (this runs in middleware, before the Rails request
# object exists). Matching against slow_down_crawler_user_agents patterns is
# case-insensitive, and only the first matching pattern is counted.
#
# @param env [Hash] the Rack environment
# @return [Array(Integer, String), nil] [seconds until retry, error code] when
#   the limit is exceeded; nil when the request may proceed (no configured
#   patterns, blank/non-crawler UA, no match, or limit not yet reached).
def check_crawler_limits(env)
  throttled_patterns = SiteSetting.slow_down_crawler_user_agents
  return if throttled_patterns.blank?

  agent = env["HTTP_USER_AGENT"]&.downcase
  return if agent.blank?
  return unless CrawlerDetection.crawler?(agent)

  matched = throttled_patterns.downcase.split("|").find { |pattern| agent.include?(pattern) }
  return nil if matched.nil?

  bucket = "#{matched}_crawler_rate_limit"
  RateLimiter.new(nil, bucket, 1, SiteSetting.slow_down_crawler_rate, error_code: bucket).performed!
  nil
rescue RateLimiter::LimitExceeded => e
  # Caller turns this pair into a 429 with Retry-After and an error-code header.
  [e.available_in, e.error_code]
end
|
||||
end
|
||||
|
@ -369,6 +369,31 @@ RSpec.describe Middleware::RequestTracker do
|
||||
end
|
||||
end
|
||||
|
||||
# Middleware-level crawler throttling (moved here from ApplicationController so
# that requests served early — e.g. from the anonymous cache — are still limited).
describe "crawler rate limits" do
  context "when there are multiple matching crawlers" do
    before { SiteSetting.slow_down_crawler_user_agents = "badcrawler2|badcrawler22" }

    it "only checks limits for the first match" do
      # NOTE(review): this UA contains neither configured pattern, so no limiter
      # bucket is consulted at all — confirm the fixture matches the intent of
      # "first match only" (a UA matching both patterns would exercise it better).
      env = env("HTTP_USER_AGENT" => "badcrawler")

      status, _ = middleware.call(env)
      expect(status).to eq(200)
    end
  end

  it "compares user agents in a case-insensitive manner" do
    # Setting and UA differ only in letter case; both requests must hit the
    # same limiter bucket for the second one to be rejected.
    SiteSetting.slow_down_crawler_user_agents = "BaDCRawLer"
    env1 = env("HTTP_USER_AGENT" => "bADcrAWLer")
    env2 = env("HTTP_USER_AGENT" => "bADcrAWLer")

    # First request consumes the single allowed slot.
    status, _ = middleware.call(env1)
    expect(status).to eq(200)

    # Second identical request within the window is throttled.
    status, _ = middleware.call(env2)
    expect(status).to eq(429)
  end
end
|
||||
|
||||
describe "register_ip_skipper" do
|
||||
before do
|
||||
Middleware::RequestTracker.register_ip_skipper { |ip| ip == "1.1.1.2" }
|
||||
|
@ -1103,13 +1103,17 @@ RSpec.describe ApplicationController do
|
||||
end
|
||||
|
||||
describe "crawlers in slow_down_crawler_user_agents site setting" do
|
||||
before { RateLimiter.enable }
|
||||
before do
|
||||
Fabricate(:admin) # to prevent redirect to the wizard
|
||||
RateLimiter.enable
|
||||
|
||||
SiteSetting.slow_down_crawler_rate = 128
|
||||
SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler"
|
||||
end
|
||||
|
||||
use_redis_snapshotting
|
||||
|
||||
it "are rate limited" do
|
||||
SiteSetting.slow_down_crawler_rate = 128
|
||||
SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler"
|
||||
now = Time.zone.now
|
||||
freeze_time now
|
||||
|
||||
@ -1141,6 +1145,21 @@ RSpec.describe ApplicationController do
|
||||
get "/", headers: { "HTTP_USER_AGENT" => "iam problematiccrawler" }
|
||||
expect(response.status).to eq(200)
|
||||
end
|
||||
|
||||
# Regression coverage for the motivating bug: with the anonymous cache enabled,
# cached responses used to be served from middleware before the controller's
# rate-limit check ran, so crawlers over their limit still got 200s.
context "with anonymous caching" do
  before do
    # Threshold of 1 makes the very first anonymous request cacheable, so the
    # second request would be served straight from the cache.
    global_setting :anon_cache_store_threshold, 1
    Middleware::AnonymousCache.enable_anon_cache
  end

  it "don't bypass crawler rate limits" do
    # First crawler request: allowed, and primes the anonymous cache.
    get "/", headers: { "HTTP_USER_AGENT" => "iam badcrawler" }
    expect(response.status).to eq(200)

    # Second request must be throttled even though a cached copy exists.
    get "/", headers: { "HTTP_USER_AGENT" => "iam badcrawler" }
    expect(response.status).to eq(429)
  end
end
|
||||
end
|
||||
|
||||
describe "#banner_json" do
|
||||
|
Loading…
Reference in New Issue
Block a user