mirror of
https://github.com/discourse/discourse.git
synced 2024-11-25 02:11:08 -06:00
7b562d2f46
- Phase one: does it match 'trident|webkit|gecko|chrome|safari|msie|opera'? If yes, it is possibly a browser. - Phase two: does it match 'rss|bot|spider|crawler|facebook|archive|wayback|ping|monitor'? If so, it is probably a crawler. Based on: https://gist.github.com/SamSaffron/6cfad7ea3e6df321ffb7a84f93720a53
24 lines
682 B
Ruby
24 lines
682 B
Ruby
# Classifies HTTP user agent strings as crawlers or (probable) real browsers.
#
# Detection runs in two phases:
#   1. If the user agent contains none of the tokens listed in
#      SiteSetting.non_crawler_user_agents, it does not look like a browser
#      and is reported as a crawler.
#   2. Otherwise it is reported as a crawler only when it also contains one of
#      the tokens listed in SiteSetting.crawler_user_agents.
module CrawlerDetection
  # Builds a case-insensitive regexp that matches any of the given tokens.
  #
  # @param string [String] tokens separated by '|'; every token is matched
  #   literally because regexp metacharacters are escaped
  # @return [Regexp]
  def self.to_matcher(string)
    escaped = string.split('|').map { |agent| Regexp.escape(agent) }.join('|')
    Regexp.new(escaped, Regexp::IGNORECASE)
  end

  # Decides whether a request's user agent belongs to a crawler.
  #
  # @param user_agent [String, nil] raw User-Agent value; nil when absent
  # @return [Boolean] true when the agent is considered a crawler
  def self.crawler?(user_agent)
    # A missing user agent cannot match any browser token, so classify it the
    # same way as any other non-browser agent instead of raising on nil.
    return true if user_agent.nil?

    # Compiled regexps are cached per setting value to avoid regenerating them
    # on every call; a changed setting simply produces a new cache entry.
    @non_crawler_matchers ||= {}
    @matchers ||= {}

    browser_setting = SiteSetting.non_crawler_user_agents
    possibly_real = (@non_crawler_matchers[browser_setting] ||= to_matcher(browser_setting))

    # Phase one: anything that does not even look like a browser is a crawler.
    return true unless user_agent.match?(possibly_real)

    # Phase two: browser-like agents are crawlers only if they carry a known
    # crawler token.
    crawler_setting = SiteSetting.crawler_user_agents
    known_bots = (@matchers[crawler_setting] ||= to_matcher(crawler_setting))
    user_agent.match?(known_bots)
  end
end
|