mirror of https://github.com/discourse/discourse.git
synced 2024-12-02 05:29:17 -06:00
8ecf313a81
This corrects an issue where we were hitting Gravatar for 404s over and over. Also ensures file downloads properly report errors.
230 lines
5.4 KiB
Ruby
require "socket"
|
|
require "ipaddr"
|
|
require 'excon'
|
|
require 'rate_limiter'
|
|
|
|
# Determine the final endpoint for a Web URI, following redirects
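#
# A minimal usage sketch (hypothetical, assuming a booted Discourse app):
#
#   fd = FinalDestination.new("https://example.com/some/path")
#   final_uri = fd.resolve # => URI of the final destination, or nil
#   fd.status              # => :resolved, :too_many_redirects, :invalid_address or :failure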
class FinalDestination

  attr_reader :status, :cookie, :status_code

  def initialize(url, opts = nil)
    @url = url
    @uri =
      begin
        URI(escape_url) if @url
      rescue URI::InvalidURIError
      end

    @opts = opts || {}
    @force_get_hosts = @opts[:force_get_hosts] || []
    @opts[:max_redirects] ||= 5
    @opts[:lookup_ip] ||= lambda do |host|
      begin
        IPSocket::getaddress(host)
      rescue SocketError
        nil
      end
    end
    @ignored = [Discourse.base_url_no_prefix] + (@opts[:ignore_redirects] || [])
    @limit = @opts[:max_redirects]
    @status = :ready
    @http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head
    @cookie = nil
  end

  def self.connection_timeout
    20
  end

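  # @limit counts down from max_redirects as redirects are followed,
  # so any decrement means at least one redirect happened.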
  def redirected?
    @limit < @opts[:max_redirects]
  end

  def request_headers
    result = {
      "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
      "Accept" => "text/html",
      "Host" => @uri.hostname
    }

    result['cookie'] = @cookie if @cookie

    result
  end

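  # Fetch a response via GET without reading the whole body up front:
  # Net::HTTP#request yields the response as soon as the headers arrive,
  # and we return it from inside the block before the body is consumed.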
  def small_get(headers)
    Net::HTTP.start(@uri.host, @uri.port, use_ssl: @uri.is_a?(URI::HTTPS)) do |http|
      http.open_timeout = FinalDestination.connection_timeout
      http.read_timeout = FinalDestination.connection_timeout

      request = Net::HTTP::Get.new(@uri.request_uri, headers)
      http.request(request) do |response|
        return response
      end
    end
  end

  def resolve
    if @limit < 0
      @status = :too_many_redirects
      return nil
    end

    @ignored.each do |host|
      if hostname_matches?(host)
        @status = :resolved
        return @uri
      end
    end

    return nil unless validate_uri
    headers = request_headers
    response = Excon.public_send(@http_verb,
      @uri.to_s,
      read_timeout: FinalDestination.connection_timeout,
      headers: headers
    )

    location = nil
    headers = nil

    response_status = response.status.to_i

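    # A HEAD request is attempted first unless the host is in
    # force_get_hosts; hosts that reject HEAD (405/409/501) are retried
    # below with a small GET.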
    case response.status
    when 200
      @status = :resolved
      return @uri
    when 405, 409, 501
      # `headers` was reset to nil above, so rebuild the request headers
      # for the GET retry
      get_response = small_get(request_headers)

      response_status = get_response.code.to_i
      if response_status == 200
        @status = :resolved
        return @uri
      end

      headers = {}
      if cookie_val = get_response.get_fields('set-cookie')
        headers['set-cookie'] = cookie_val.join
      end

      # TODO: it is confusing that we grab the location header here even
      # for statuses outside the 300-399 range
      if location_val = get_response.get_fields('location')
        headers['location'] = location_val.join
      end
    end

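    # Normalize the Excon response header names to lowercase so that
    # "location" and "set-cookie" can be looked up consistently below.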
    unless headers
      headers = {}
      response.headers.each do |k, v|
        headers[k.to_s.downcase] = v
      end
    end

    if (300..399).include?(response_status)
      location = headers["location"]
    end

    if set_cookie = headers["set-cookie"]
      @cookie = set_cookie
    end

    if location
      location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
      @uri = URI(location) rescue nil
      @limit -= 1
      return resolve
    end

    # this is weird, an exception seems better
    @status = :failure
    @status_code = response.status

    nil
  rescue Excon::Errors::Timeout
    nil
  end

  def validate_uri
    validate_uri_format && is_dest_valid?
  end

  def validate_uri_format
    return false unless @uri
    return false unless ['https', 'http'].include?(@uri.scheme)
    return false if @uri.scheme == 'http' && @uri.port != 80
    return false if @uri.scheme == 'https' && @uri.port != 443

    # Disallow IP based crawling
    (IPAddr.new(@uri.hostname) rescue nil).nil?
  end

  def hostname_matches?(url)
    @uri && url.present? && @uri.hostname == (URI(url) rescue nil)&.hostname
  end

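  # Reject destinations that resolve to private address space so that
  # crawling cannot be pointed at internal services (SSRF protection).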
  def is_dest_valid?
    return false unless @uri && @uri.host

    # Whitelisted hosts
    return true if hostname_matches?(SiteSetting.s3_cdn_url) ||
      hostname_matches?(GlobalSetting.try(:cdn_url)) ||
      hostname_matches?(Discourse.base_url_no_prefix)

    if SiteSetting.whitelist_internal_hosts.present?
      SiteSetting.whitelist_internal_hosts.split('|').each do |h|
        return true if @uri.hostname.downcase == h.downcase
      end
    end

    address_s = @opts[:lookup_ip].call(@uri.hostname)
    return false unless address_s

    address = IPAddr.new(address_s)

    if private_ranges.any? { |r| r === address }
      @status = :invalid_address
      return false
    end

    # Rate limit how often this IP can be crawled
    unless @opts[:skip_rate_limit]
      RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 100, 1.hour).performed!
    end

    true
  rescue RateLimiter::LimitExceeded
    false
  end

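  # HTML-unescape the raw URL, then percent-escape any character outside
  # the URI reserved/unreserved sets (keeping "#" so fragments survive).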
  def escape_url
    TopicEmbed.escape_uri(
      CGI.unescapeHTML(@url),
      Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}#]")
    )
  end

  def private_ranges
    FinalDestination.standard_private_ranges +
      SiteSetting.blacklist_ip_blocks.split('|').map { |r| IPAddr.new(r) rescue nil }.compact
  end

  def self.standard_private_ranges
    @private_ranges ||= [
      IPAddr.new('127.0.0.1'),
      IPAddr.new('172.16.0.0/12'),
      IPAddr.new('192.168.0.0/16'),
      IPAddr.new('10.0.0.0/8'),
      IPAddr.new('fc00::/7')
    ]
  end

  def self.lookup_ip(host)
    IPSocket::getaddress(host)
  end

end