Merge pull request #5700 from discourse/crawl-block

FEATURE: control web crawlers access with white/blacklist
This commit is contained in:
Neil Lalonde
2018-03-27 15:06:03 -04:00
committed by GitHub
22 changed files with 734 additions and 97 deletions

View File

@@ -28,4 +28,25 @@ module CrawlerDetection
end
end
# Given a user_agent that returns true from crawler?, should its request be allowed?
def self.allow_crawler?(user_agent)
return true if SiteSetting.whitelisted_crawler_user_agents.blank? &&
SiteSetting.blacklisted_crawler_user_agents.blank?
@whitelisted_matchers ||= {}
@blacklisted_matchers ||= {}
if SiteSetting.whitelisted_crawler_user_agents.present?
whitelisted = @whitelisted_matchers[SiteSetting.whitelisted_crawler_user_agents] ||= to_matcher(SiteSetting.whitelisted_crawler_user_agents)
!user_agent.nil? && user_agent.match?(whitelisted)
else
blacklisted = @blacklisted_matchers[SiteSetting.blacklisted_crawler_user_agents] ||= to_matcher(SiteSetting.blacklisted_crawler_user_agents)
user_agent.nil? || !user_agent.match?(blacklisted)
end
end
def self.is_blocked_crawler?(user_agent)
crawler?(user_agent) && !allow_crawler?(user_agent)
end
end

View File

@@ -83,6 +83,7 @@ class Middleware::RequestTracker
if track_view
if data[:is_crawler]
ApplicationRequest.increment!(:page_view_crawler)
WebCrawlerRequest.increment!(data[:user_agent])
elsif data[:has_auth_cookie]
ApplicationRequest.increment!(:page_view_logged_in)
ApplicationRequest.increment!(:page_view_logged_in_mobile) if data[:is_mobile]
@@ -129,8 +130,9 @@ class Middleware::RequestTracker
is_mobile: helper.is_mobile?,
track_view: track_view,
timing: timing
}
}.tap do |h|
h[:user_agent] = env['HTTP_USER_AGENT'] if h[:is_crawler]
end
end
def log_request_info(env, result, info)
@@ -155,12 +157,20 @@ class Middleware::RequestTracker
def call(env)
result = nil
log_request = true
request = Rack::Request.new(env)
if rate_limit(env)
if rate_limit(request)
result = [429, {}, ["Slow down, too Many Requests from this IP Address"]]
return result
end
if block_crawler(request)
log_request = false
result = [403, { 'Content-Type' => 'text/plain' }, ['Crawler is not allowed']]
return result
end
env["discourse.request_tracker"] = self
MethodProfiler.start
result = @app.call(env)
@@ -186,7 +196,7 @@ class Middleware::RequestTracker
end
end
end
log_request_info(env, result, info) unless env["discourse.request_tracker.skip"]
log_request_info(env, result, info) unless !log_request || env["discourse.request_tracker.skip"]
end
PRIVATE_IP ||= /^(127\.)|(192\.168\.)|(10\.)|(172\.1[6-9]\.)|(172\.2[0-9]\.)|(172\.3[0-1]\.)|(::1$)|([fF][cCdD])/
@@ -196,7 +206,7 @@ class Middleware::RequestTracker
!!(ip && ip.to_s.match?(PRIVATE_IP))
end
def rate_limit(env)
def rate_limit(request)
if (
GlobalSetting.max_reqs_per_ip_mode == "block" ||
@@ -204,7 +214,7 @@ class Middleware::RequestTracker
GlobalSetting.max_reqs_per_ip_mode == "warn+block"
)
ip = Rack::Request.new(env).ip
ip = request.ip
if !GlobalSetting.max_reqs_rate_limit_on_private
return false if is_private_ip?(ip)
@@ -236,15 +246,15 @@ class Middleware::RequestTracker
global: true
)
env['DISCOURSE_RATE_LIMITERS'] = [limiter10, limiter60]
env['DISCOURSE_ASSET_RATE_LIMITERS'] = [limiter_assets10]
request.env['DISCOURSE_RATE_LIMITERS'] = [limiter10, limiter60]
request.env['DISCOURSE_ASSET_RATE_LIMITERS'] = [limiter_assets10]
warn = GlobalSetting.max_reqs_per_ip_mode == "warn" ||
GlobalSetting.max_reqs_per_ip_mode == "warn+block"
if !limiter_assets10.can_perform?
if warn
Rails.logger.warn("Global asset IP rate limit exceeded for #{ip}: 10 second rate limit, uri: #{env["REQUEST_URI"]}")
Rails.logger.warn("Global asset IP rate limit exceeded for #{ip}: 10 second rate limit, uri: #{request.env["REQUEST_URI"]}")
end
return !(GlobalSetting.max_reqs_per_ip_mode == "warn")
@@ -257,7 +267,7 @@ class Middleware::RequestTracker
limiter60.performed!
rescue RateLimiter::LimitExceeded
if warn
Rails.logger.warn("Global IP rate limit exceeded for #{ip}: #{type} second rate limit, uri: #{env["REQUEST_URI"]}")
Rails.logger.warn("Global IP rate limit exceeded for #{ip}: #{type} second rate limit, uri: #{request.env["REQUEST_URI"]}")
!(GlobalSetting.max_reqs_per_ip_mode == "warn")
else
true
@@ -266,6 +276,14 @@ class Middleware::RequestTracker
end
end
def block_crawler(request)
request.get? &&
!request.xhr? &&
request.env['HTTP_ACCEPT'] =~ /text\/html/ &&
!request.path.ends_with?('robots.txt') &&
CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
end
def log_later(data, host)
Scheduler::Defer.later("Track view", _db = nil) do
self.class.log_request_on_site(data, host)