mirror of
https://github.com/discourse/discourse.git
synced 2024-11-30 04:34:13 -06:00
cf42466dea
This commit adds some system specs to test uploads with direct to S3 single and multipart uploads via uppy. This is done with minio as a local S3 replacement. We are doing this to catch regressions when uppy dependencies need to be upgraded or we change uppy upload code, since before this there was no way to know outside manual testing whether these changes would cause regressions. Minio's server lifecycle and the installed binaries are managed by the https://github.com/discourse/minio_runner gem, though the binaries are already installed on the discourse_test image we run GitHub CI from. These tests will only run in CI unless you specifically use the CI=1 or RUN_S3_SYSTEM_SPECS=1 env vars. For a history of experimentation here see https://github.com/discourse/discourse/pull/22381 Related PRs: * https://github.com/discourse/minio_runner/pull/1 * https://github.com/discourse/minio_runner/pull/2 * https://github.com/discourse/minio_runner/pull/3
591 lines
16 KiB
Ruby
591 lines
16 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require "socket"
|
|
require "ipaddr"
|
|
require "excon"
|
|
require "rate_limiter"
|
|
require "url_helper"
|
|
|
|
# Determine the final endpoint for a Web URI, following redirects
|
|
class FinalDestination
  # SocketError subclass. It is not raised anywhere inside this class;
  # presumably it is raised by the HTTP/SSRF helpers -- verify against callers.
  class SSRFError < SocketError
  end

  # Upper bounds enforced by the streaming validator in #resolve.
  MAX_REQUEST_TIME_SECONDS = 10
  MAX_REQUEST_SIZE_BYTES = 5_242_880 # 1024 * 1024 * 5

  # Removes the "this domain is https" marker for +domain+ from redis.
  def self.clear_https_cache!(domain)
    key = redis_https_key(domain)
    Discourse.redis.without_namespace.del(key)
  end

  # Marks +domain+ as https-capable for one day (un-namespaced redis key).
  def self.cache_https_domain(domain)
    key = redis_https_key(domain)
    Discourse.redis.without_namespace.setex(key, 1.day.to_i, "1")
  end

  # True when +domain+ currently has an https marker in redis.
  def self.is_https_domain?(domain)
    key = redis_https_key(domain)
    Discourse.redis.without_namespace.get(key).present?
  end

  # Redis key under which the https marker for +domain+ is stored.
  def self.redis_https_key(domain)
    "HTTPS_DOMAIN_#{domain}"
  end

  DEFAULT_USER_AGENT =
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"

  attr_reader :status, :cookie, :status_code, :content_type, :ignored

  # @param url [String, nil] URL to resolve; normalized and parsed into @uri
  #   when present.
  # @param opts [Hash, nil] options read here: :force_get_hosts,
  #   :preserve_fragment_url_hosts, :force_custom_user_agent_hosts,
  #   :default_user_agent, :max_redirects (default 5, written back into opts),
  #   :initial_https_redirect_ignore_limit, :include_port_in_host_header,
  #   :ignore_redirects, :follow_canonical, :http_verb, :verbose, :timeout,
  #   :validate_uri (default true), :stop_at_blocked_pages.
  def initialize(url, opts = nil)
    @url = url
    @uri = uri(normalized_url) if @url

    @opts = opts || {}
    @force_get_hosts = @opts[:force_get_hosts] || []
    @preserve_fragment_url_hosts = @opts[:preserve_fragment_url_hosts] || []
    @force_custom_user_agent_hosts = @opts[:force_custom_user_agent_hosts] || []
    @default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT
    @opts[:max_redirects] ||= 5
    @https_redirect_ignore_limit = @opts[:initial_https_redirect_ignore_limit]
    @include_port_in_host_header = @opts[:include_port_in_host_header] || false

    @max_redirects = @opts[:max_redirects]
    @limit = @max_redirects

    # Hostnames for which #resolve returns immediately. Only collected when
    # at least one redirect is allowed.
    @ignored = []
    if @limit > 0
      ignore_redirects = [Discourse.base_url_no_prefix]

      ignore_redirects.concat(@opts[:ignore_redirects]) if @opts[:ignore_redirects]

      ignore_redirects.each do |ignore_redirect|
        ignore_redirect = uri(ignore_redirect)
        @ignored << ignore_redirect.hostname if ignore_redirect.present? && ignore_redirect.hostname
      end
    end

    @status = :ready
    @follow_canonical = @opts[:follow_canonical]
    @http_verb = @opts[:http_verb] || http_verb(@force_get_hosts, @follow_canonical)
    @cookie = nil
    @limited_ips = []
    @verbose = @opts[:verbose] || false
    @timeout = @opts[:timeout] || nil
    @preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) }
    @validate_uri = @opts.fetch(:validate_uri) { true }
    @user_agent =
      (
        if @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) }
          Onebox.options.user_agent
        else
          @default_user_agent
        end
      )
    @stop_at_blocked_pages = @opts[:stop_at_blocked_pages]
  end

  # Default timeout in seconds, used when no :timeout option was given.
  def self.connection_timeout
    20
  end

  # Convenience wrapper: builds an instance and calls #resolve on it.
  def self.resolve(url, opts = nil)
    new(url, opts).resolve
  end

  # Picks the verb for #resolve: :get when following canonical links or when
  # the current host matches one of +force_get_hosts+, otherwise :head.
  def http_verb(force_get_hosts, follow_canonical)
    if follow_canonical || force_get_hosts.any? { |host| hostname_matches?(host) }
      :get
    else
      :head
    end
  end

  # Timeout in seconds: the :timeout option or the class default.
  def timeout
    @timeout || FinalDestination.connection_timeout
  end

  # True once #resolve has consumed at least one redirect from its budget.
  def redirected?
    @limit < @max_redirects
  end

  # Headers sent with each request. The Host header carries the port only
  # when :include_port_in_host_header was set; Cookie is added when one has
  # been captured.
  def request_headers
    result = {
      "User-Agent" => @user_agent,
      "Accept" => "*/*",
      "Accept-Language" => "*",
      "Host" => @uri.hostname + (@include_port_in_host_header ? ":#{@uri.port}" : ""),
    }

    result["Cookie"] = @cookie if @cookie

    result
  end

  # Issues a GET and aborts as soon as status and headers are known.
  #
  # @return [Array(Integer, Hash)] status code and Net::HTTP-style headers
  #   (lower-cased keys, array values).
  def small_get(request_headers)
    status_code, response_headers = nil

    catch(:done) do
      FinalDestination::HTTP.start(
        @uri.host,
        @uri.port,
        use_ssl: @uri.is_a?(URI::HTTPS),
        open_timeout: timeout,
      ) do |http|
        http.read_timeout = timeout
        http.request_get(@uri.request_uri, request_headers) do |resp|
          status_code = resp.code.to_i
          response_headers = resp.to_hash

          # see: https://bugs.ruby-lang.org/issues/15624
          # if we allow response to return then body will be read
          # got to abort without reading body
          throw :done
        end
      end
    end

    [status_code, response_headers]
  end

  # this is a new interface for simply getting
  # N bytes accounting for all internal logic
  #
  # Follows redirects (at most +redirects+ of them) and yields
  # (response, chunk, uri) to the block for each body chunk of the final
  # successful response. Non-success responses are yielded once as
  # (response, nil, nil). The block may `throw :done` to stop reading.
  #
  # NOTE(review): +extra_headers+ is accepted and passed along on recursion
  # but never forwarded to #safe_get, so it currently has no effect.
  #
  # @return [String, nil] final URL as a string on success, nil otherwise.
  def get(redirects = @limit, extra_headers: {}, &blk)
    raise "Must specify block" unless block_given?

    upgrade_to_https_if_cached

    return if !validate_uri
    return if @stop_at_blocked_pages && blocked_domain?(@uri)

    result, headers_subset = safe_get(@uri, &blk)
    return if !result

    cookie = headers_subset.set_cookie
    location = headers_subset.location

    if result == :redirect
      return if !location

      old_uri = @uri
      location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
      @uri = uri(location)

      # An initial http -> https hop to the same address may be exempted from
      # the redirect budget, once.
      if @uri && redirects == @max_redirects && @https_redirect_ignore_limit &&
           same_uri_but_https?(old_uri, @uri)
        redirects += 1
        @https_redirect_ignore_limit = false
      end

      return if redirects == 0

      cache_https_redirect(old_uri)

      return if !@uri

      extra = nil
      extra = { "Cookie" => cookie } if cookie

      get(redirects - 1, extra_headers: extra, &blk)
    elsif result == :ok
      @uri.to_s
    else
      nil
    end
  end

  # Resolves @uri to its final destination, following redirects recursively.
  #
  # Sets @status to one of :resolved, :too_many_redirects, :invalid_address,
  # :blocked_page or :failure (it is left unchanged on an Excon timeout).
  #
  # @return [URI, nil] the final URI when resolved, nil otherwise.
  def resolve
    upgrade_to_https_if_cached

    if @limit < 0
      @status = :too_many_redirects
      log(:warn, "FinalDestination could not resolve URL (too many redirects): #{@uri}")
      return
    end

    unless validate_uri
      @status = :invalid_address
      log(:warn, "FinalDestination could not resolve URL (invalid URI): #{@uri}")
      return
    end

    # NOTE(review): String#match? converts a String argument to a Regexp, so
    # each ignored host is matched as an unanchored pattern, not for equality.
    @ignored.each do |host|
      if @uri&.hostname&.match?(host)
        @status = :resolved
        return @uri
      end
    end

    if Oneboxer.cached_response_body_exists?(@uri.to_s)
      @status = :resolved
      return @uri
    end

    headers = request_headers
    middlewares = Excon.defaults[:middlewares].dup
    middlewares << Excon::Middleware::Decompress if @http_verb == :get

    request_start_time = Time.now
    response_body = +""
    # Streaming callback: accumulates the body and aborts the request when it
    # grows past the size cap or runs past the time cap.
    request_validator =
      lambda do |chunk, _remaining_bytes, _total_bytes|
        response_body << chunk
        if response_body.bytesize > MAX_REQUEST_SIZE_BYTES
          raise Excon::Errors::ExpectationFailed.new("response size too big: #{@uri.to_s}")
        end
        if Time.now - request_start_time > MAX_REQUEST_TIME_SECONDS
          raise Excon::Errors::ExpectationFailed.new("connect timeout reached: #{@uri.to_s}")
        end
      end

    # This technique will only use the first resolved IP
    # TODO: Can we standardise this by using FinalDestination::HTTP?
    begin
      resolved_ip = SSRFDetector.lookup_and_filter_ips(@uri.hostname).first
    rescue SSRFDetector::DisallowedIpError, SocketError, Timeout::Error
      @status = :invalid_address
      return
    end
    request_uri = @uri.dup
    request_uri.hostname = resolved_ip unless Rails.env.test? # WebMock doesn't understand the IP-based requests

    response =
      Excon.public_send(
        @http_verb,
        request_uri.to_s,
        read_timeout: timeout,
        connect_timeout: timeout,
        headers: { "Host" => @uri.hostname }.merge(headers),
        middlewares: middlewares,
        response_block: request_validator,
        ssl_verify_peer_host: @uri.hostname,
      )

    if @stop_at_blocked_pages
      if blocked_domain?(@uri) || response.headers["Discourse-No-Onebox"] == "1"
        @status = :blocked_page
        return
      end
    end

    location = nil
    response_headers = nil
    response_status = response.status.to_i

    case response.status
    when 200
      # Cache body of successful `get` requests
      if @http_verb == :get
        if Oneboxer.cache_response_body?(@uri)
          Oneboxer.cache_response_body(@uri.to_s, response_body)
        end
      end

      if @follow_canonical
        next_url = fetch_canonical_url(response_body)

        if next_url.to_s.present? && next_url != @uri
          # Follow the canonical link once, then fall back to the normal verb.
          @follow_canonical = false
          @uri = next_url
          @http_verb = http_verb(@force_get_hosts, @follow_canonical)

          return resolve
        end
      end

      @content_type = response.headers["Content-Type"] if response.headers.has_key?("Content-Type")
      @status = :resolved
      return @uri
    when 103, 400, 405, 406, 409, 500, 501
      # Retry these statuses with a header-only GET.
      response_status, small_headers = small_get(request_headers)

      if @stop_at_blocked_pages
        # this may seem weird, but the #to_hash method of the response object
        # of ruby's net/http lib returns a hash where each value is an array.
        # small_headers here is like that so our no onebox header value is an
        # array if it's set. Also the hash keys are always lower-cased.
        dont_onebox = small_headers["discourse-no-onebox"]&.join("") == "1"
        if dont_onebox || blocked_domain?(@uri)
          @status = :blocked_page
          return
        end
      end

      if response_status == 200
        @status = :resolved
        return @uri
      end

      response_headers = {}
      if cookie_val = small_headers["set-cookie"]
        response_headers[:cookies] = cookie_val
      end

      if location_val = small_headers["location"]
        response_headers[:location] = location_val.join
      end
    end

    unless response_headers
      response_headers = {
        cookies: response.data[:cookies] || response.headers[:"set-cookie"],
        location: response.headers[:location],
      }
    end

    location = response_headers[:location] if (300..399).include?(response_status)

    if cookies = response_headers[:cookies]
      # Keep only the name=value part of each Set-Cookie value.
      @cookie = Array.wrap(cookies).map { |c| c.split(";").first.strip }.join("; ")
    end

    if location
      redirect_uri = uri(location)
      # A same-host redirect to a login/session path counts as resolved at
      # the current URI. redirect_uri is nil when the Location header cannot
      # be parsed; skip this check then, so the recursive call below reports
      # it through the normal validation path instead of raising
      # NoMethodError here.
      if redirect_uri && @uri.host == redirect_uri.host &&
           (redirect_uri.path =~ %r{/login} || redirect_uri.path =~ %r{/session})
        @status = :resolved
        return @uri
      end

      old_uri = @uri
      location = "#{location}##{@uri.fragment}" if @preserve_fragment_url && @uri.fragment.present?
      location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
      @uri = uri(location)

      # An initial http -> https hop to the same address may be exempted from
      # the redirect budget, once.
      if @uri && @limit == @max_redirects && @https_redirect_ignore_limit &&
           same_uri_but_https?(old_uri, @uri)
        @limit += 1
        @https_redirect_ignore_limit = false
      end
      @limit -= 1

      cache_https_redirect(old_uri)
      return resolve
    end

    # this is weird an exception seems better
    @status = :failure
    @status_code = response.status

    log(:warn, "FinalDestination could not resolve URL (status #{response.status}): #{@uri}")
    nil
  rescue Excon::Errors::Timeout
    log(:warn, "FinalDestination could not resolve URL (timeout): #{@uri}")
    nil
  end

  # True when the :validate_uri option was set to a falsy value.
  def skip_validations?
    !@validate_uri
  end

  # True when validation is disabled or the current URI passes
  # #validate_uri_format.
  def validate_uri
    skip_validations? || validate_uri_format
  end

  # Checks that @uri has a host, an http(s) scheme, an acceptable port and a
  # hostname that is not a literal IP address.
  def validate_uri_format
    return false unless @uri && @uri.host
    return false unless %w[https http].include?(@uri.scheme)

    # In some cases (like local/test environments) we may want to allow http URLs
    # to be used for internal hosts, but only if it's the case that the host is
    # explicitly used for SiteSetting.s3_endpoint. This is to allow for local
    # S3 providers like minio.
    #
    # In all other cases, we should not be allowing http calls to anything except
    # port 80.
    return false if @uri.scheme == "http" && !http_port_ok?
    return false if @uri.scheme == "https" && @uri.port != 443

    # Disallow IP based crawling
    (
      begin
        IPAddr.new(@uri.hostname)
      rescue StandardError
        nil
      end
    ).nil?
  end

  # True when an http URI uses port 80, or when its host is an allowed
  # internal host that also matches the configured S3 endpoint.
  def http_port_ok?
    return true if @uri.port == 80

    # `.to_s` rather than `&.`: safe navigation does not short-circuit the
    # rest of the chain, so a nil setting used to raise on `filter_map`.
    allowed_internal_hosts =
      SiteSetting.allowed_internal_hosts.to_s.split(/[|\n]/).filter_map { |aih| aih.strip.presence }
    return false if allowed_internal_hosts.empty? || SiteSetting.s3_endpoint.blank?
    return false if allowed_internal_hosts.none? { |aih| hostname_matches_s3_endpoint?(aih) }

    true
  end

  # True when the current URI matches +allowed_internal_host+, uses the S3
  # endpoint's port, and its hostname ends with the S3 endpoint's hostname.
  def hostname_matches_s3_endpoint?(allowed_internal_host)
    s3_endpoint_uri = URI(SiteSetting.s3_endpoint)
    hostname_matches?("http://#{allowed_internal_host}") && @uri.port == s3_endpoint_uri.port &&
      @uri.hostname.end_with?(s3_endpoint_uri.hostname)
  end

  # Hostname of the current URI.
  def hostname
    @uri.hostname
  end

  # Compares the current hostname against the hostname of +url+. A leading
  # "*" label acts as a wildcard and is checked with a suffix comparison.
  # Returns nil when either hostname is missing.
  def hostname_matches?(url)
    url = uri(url)

    if @uri&.hostname.present? && url&.hostname.present?
      hostname_parts = url.hostname.split(".")
      has_wildcard = hostname_parts.first == "*"

      if has_wildcard
        @uri.hostname.end_with?(hostname_parts[1..-1].join("."))
      else
        @uri.hostname == url.hostname
      end
    end
  end

  # The original URL passed through UrlHelper.normalized_encode.
  def normalized_url
    UrlHelper.normalized_encode(@url)
  end

  # Writes +message+ to the Rails logger, prefixed with the current multisite
  # database name. Does nothing unless :verbose was set, or when the last
  # recorded status code is 404.
  def log(log_level, message)
    return unless @verbose
    return if @status_code == 404

    Rails.logger.public_send(
      log_level,
      "#{RailsMultisite::ConnectionManagement.current_db}: #{message}",
    )
  end

  protected

  # Performs one GET against +uri+ and streams the body to the caller's block.
  #
  # @yield [resp, chunk, uri] each body chunk of a success response, or
  #   (resp, nil, nil) once for any other response. The block may
  #   `throw :done` to stop early.
  # @return [Array(Symbol, Struct), nil] [:ok | :redirect | :blocked, headers]
  #   where headers responds to #location and #set_cookie; nil on timeout,
  #   SSL error, or a response that is neither a success nor a redirect.
  def safe_get(uri)
    result = nil
    unsafe_close = false
    headers_subset = Struct.new(:location, :set_cookie).new

    safe_session(uri) do |http|
      headers =
        request_headers.merge(
          "Accept-Encoding" => "gzip",
          "Host" => uri.hostname + (@include_port_in_host_header ? ":#{uri.port}" : ""),
        )

      req = FinalDestination::HTTP::Get.new(uri.request_uri, headers)

      http.request(req) do |resp|
        headers_subset.set_cookie = resp["Set-Cookie"]

        if @stop_at_blocked_pages
          dont_onebox = resp["Discourse-No-Onebox"] == "1"
          if dont_onebox
            result = :blocked, headers_subset
            next
          end
        end

        if Net::HTTPRedirection === resp
          headers_subset.location = resp["location"]
          result = :redirect, headers_subset
        end

        if Net::HTTPSuccess === resp
          resp.decode_content = true
          resp.read_body do |chunk|
            # read_next stays false only if the caller's block threw :done
            # before returning.
            read_next = true

            catch(:done) do
              if read_next
                read_next = false
                yield resp, chunk, uri
                read_next = true
              end
            end

            # no clean way of finishing abruptly cause
            # response likes reading till the end
            if !read_next
              unsafe_close = true
              http.finish
              raise StandardError
            end
          end
          result = :ok, headers_subset
        else
          catch(:done) { yield resp, nil, nil }
        end
      end
    end

    result
  rescue Timeout::Error
    log(:warn, "FinalDestination could not resolve URL (timeout): #{@uri}")
    nil
  rescue OpenSSL::SSL::SSLError => exception
    log(:warn, "An error with SSL occurred: #{@uri} #{exception.message}")
    nil
  rescue StandardError
    # The deliberate abort above counts as success; anything else propagates.
    unsafe_close ? [:ok, headers_subset] : raise
  end

  # Opens a FinalDestination::HTTP session to +uri+ with the configured
  # open/read timeouts and yields the connection.
  def safe_session(uri)
    FinalDestination::HTTP.start(
      uri.host,
      uri.port,
      use_ssl: (uri.scheme == "https"),
      open_timeout: timeout,
    ) do |http|
      http.read_timeout = timeout
      yield http
    end
  end

  private

  # Parses +location+ into a URI; returns nil when it cannot be parsed.
  def uri(location)
    begin
      URI.parse(location)
    rescue URI::Error
    end
  end

  # Extracts the <link rel="canonical"> target from an HTML +body+.
  # A href without a host is resolved against the current URI's scheme/host.
  #
  # @return [URI, nil]
  def fetch_canonical_url(body)
    return if body.blank?

    canonical_element = Nokogiri.HTML5(body).at("link[rel='canonical']")
    return if canonical_element.nil?
    canonical_uri = uri(canonical_element["href"])
    return if canonical_uri.blank?

    return canonical_uri if canonical_uri.host.present?
    parts = [@uri.host, canonical_uri.to_s]
    complete_url = canonical_uri.to_s.start_with?("/") ? parts.join("") : parts.join("/")
    complete_url = "#{@uri.scheme}://#{complete_url}" if @uri.scheme

    uri(complete_url)
  end

  # Delegates to Onebox::DomainChecker for the hostname of +uri+.
  def blocked_domain?(uri)
    Onebox::DomainChecker.is_blocked?(uri.hostname)
  end

  # True when +after+ is exactly +before+ with http:// swapped for https://.
  def same_uri_but_https?(before, after)
    before = before.to_s
    after = after.to_s
    before.start_with?("http://") && after.start_with?("https://") &&
      before.sub("http://", "") == after.sub("https://", "")
  end

  # Switches @uri to https when it targets port 80 and its hostname is
  # marked as https in the redis cache. The URI is rebuilt from its string
  # form after the scheme change.
  def upgrade_to_https_if_cached
    return unless @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)

    @uri.scheme = "https"
    @uri = URI(@uri.to_s)
  end

  # https redirect, so just cache that whole new domain is https
  def cache_https_redirect(old_uri)
    return unless old_uri.port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri)

    FinalDestination.cache_https_domain(@uri.hostname)
  end
end
|