mirror of
https://github.com/discourse/discourse.git
synced 2025-02-25 18:55:32 -06:00
Merge pull request #5700 from discourse/crawl-block
FEATURE: control web crawlers access with white/blacklist
This commit is contained in:
@@ -28,4 +28,25 @@ module CrawlerDetection
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
# Given a user_agent that returns true from crawler?, should its request be allowed?
|
||||
def self.allow_crawler?(user_agent)
|
||||
return true if SiteSetting.whitelisted_crawler_user_agents.blank? &&
|
||||
SiteSetting.blacklisted_crawler_user_agents.blank?
|
||||
|
||||
@whitelisted_matchers ||= {}
|
||||
@blacklisted_matchers ||= {}
|
||||
|
||||
if SiteSetting.whitelisted_crawler_user_agents.present?
|
||||
whitelisted = @whitelisted_matchers[SiteSetting.whitelisted_crawler_user_agents] ||= to_matcher(SiteSetting.whitelisted_crawler_user_agents)
|
||||
!user_agent.nil? && user_agent.match?(whitelisted)
|
||||
else
|
||||
blacklisted = @blacklisted_matchers[SiteSetting.blacklisted_crawler_user_agents] ||= to_matcher(SiteSetting.blacklisted_crawler_user_agents)
|
||||
user_agent.nil? || !user_agent.match?(blacklisted)
|
||||
end
|
||||
end
|
||||
|
||||
def self.is_blocked_crawler?(user_agent)
|
||||
crawler?(user_agent) && !allow_crawler?(user_agent)
|
||||
end
|
||||
end
|
||||
|
||||
@@ -83,6 +83,7 @@ class Middleware::RequestTracker
|
||||
if track_view
|
||||
if data[:is_crawler]
|
||||
ApplicationRequest.increment!(:page_view_crawler)
|
||||
WebCrawlerRequest.increment!(data[:user_agent])
|
||||
elsif data[:has_auth_cookie]
|
||||
ApplicationRequest.increment!(:page_view_logged_in)
|
||||
ApplicationRequest.increment!(:page_view_logged_in_mobile) if data[:is_mobile]
|
||||
@@ -129,8 +130,9 @@ class Middleware::RequestTracker
|
||||
is_mobile: helper.is_mobile?,
|
||||
track_view: track_view,
|
||||
timing: timing
|
||||
}
|
||||
|
||||
}.tap do |h|
|
||||
h[:user_agent] = env['HTTP_USER_AGENT'] if h[:is_crawler]
|
||||
end
|
||||
end
|
||||
|
||||
def log_request_info(env, result, info)
|
||||
@@ -155,12 +157,20 @@ class Middleware::RequestTracker
|
||||
|
||||
def call(env)
|
||||
result = nil
|
||||
log_request = true
|
||||
request = Rack::Request.new(env)
|
||||
|
||||
if rate_limit(env)
|
||||
if rate_limit(request)
|
||||
result = [429, {}, ["Slow down, too Many Requests from this IP Address"]]
|
||||
return result
|
||||
end
|
||||
|
||||
if block_crawler(request)
|
||||
log_request = false
|
||||
result = [403, { 'Content-Type' => 'text/plain' }, ['Crawler is not allowed']]
|
||||
return result
|
||||
end
|
||||
|
||||
env["discourse.request_tracker"] = self
|
||||
MethodProfiler.start
|
||||
result = @app.call(env)
|
||||
@@ -186,7 +196,7 @@ class Middleware::RequestTracker
|
||||
end
|
||||
end
|
||||
end
|
||||
log_request_info(env, result, info) unless env["discourse.request_tracker.skip"]
|
||||
log_request_info(env, result, info) unless !log_request || env["discourse.request_tracker.skip"]
|
||||
end
|
||||
|
||||
PRIVATE_IP ||= /^(127\.)|(192\.168\.)|(10\.)|(172\.1[6-9]\.)|(172\.2[0-9]\.)|(172\.3[0-1]\.)|(::1$)|([fF][cCdD])/
|
||||
@@ -196,7 +206,7 @@ class Middleware::RequestTracker
|
||||
!!(ip && ip.to_s.match?(PRIVATE_IP))
|
||||
end
|
||||
|
||||
def rate_limit(env)
|
||||
def rate_limit(request)
|
||||
|
||||
if (
|
||||
GlobalSetting.max_reqs_per_ip_mode == "block" ||
|
||||
@@ -204,7 +214,7 @@ class Middleware::RequestTracker
|
||||
GlobalSetting.max_reqs_per_ip_mode == "warn+block"
|
||||
)
|
||||
|
||||
ip = Rack::Request.new(env).ip
|
||||
ip = request.ip
|
||||
|
||||
if !GlobalSetting.max_reqs_rate_limit_on_private
|
||||
return false if is_private_ip?(ip)
|
||||
@@ -236,15 +246,15 @@ class Middleware::RequestTracker
|
||||
global: true
|
||||
)
|
||||
|
||||
env['DISCOURSE_RATE_LIMITERS'] = [limiter10, limiter60]
|
||||
env['DISCOURSE_ASSET_RATE_LIMITERS'] = [limiter_assets10]
|
||||
request.env['DISCOURSE_RATE_LIMITERS'] = [limiter10, limiter60]
|
||||
request.env['DISCOURSE_ASSET_RATE_LIMITERS'] = [limiter_assets10]
|
||||
|
||||
warn = GlobalSetting.max_reqs_per_ip_mode == "warn" ||
|
||||
GlobalSetting.max_reqs_per_ip_mode == "warn+block"
|
||||
|
||||
if !limiter_assets10.can_perform?
|
||||
if warn
|
||||
Rails.logger.warn("Global asset IP rate limit exceeded for #{ip}: 10 second rate limit, uri: #{env["REQUEST_URI"]}")
|
||||
Rails.logger.warn("Global asset IP rate limit exceeded for #{ip}: 10 second rate limit, uri: #{request.env["REQUEST_URI"]}")
|
||||
end
|
||||
|
||||
return !(GlobalSetting.max_reqs_per_ip_mode == "warn")
|
||||
@@ -257,7 +267,7 @@ class Middleware::RequestTracker
|
||||
limiter60.performed!
|
||||
rescue RateLimiter::LimitExceeded
|
||||
if warn
|
||||
Rails.logger.warn("Global IP rate limit exceeded for #{ip}: #{type} second rate limit, uri: #{env["REQUEST_URI"]}")
|
||||
Rails.logger.warn("Global IP rate limit exceeded for #{ip}: #{type} second rate limit, uri: #{request.env["REQUEST_URI"]}")
|
||||
!(GlobalSetting.max_reqs_per_ip_mode == "warn")
|
||||
else
|
||||
true
|
||||
@@ -266,6 +276,14 @@ class Middleware::RequestTracker
|
||||
end
|
||||
end
|
||||
|
||||
def block_crawler(request)
|
||||
request.get? &&
|
||||
!request.xhr? &&
|
||||
request.env['HTTP_ACCEPT'] =~ /text\/html/ &&
|
||||
!request.path.ends_with?('robots.txt') &&
|
||||
CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
|
||||
end
|
||||
|
||||
def log_later(data, host)
|
||||
Scheduler::Defer.later("Track view", _db = nil) do
|
||||
self.class.log_request_on_site(data, host)
|
||||
|
||||
Reference in New Issue
Block a user