DEV: Oneboxer wildcard subdomains (#13015)

* DEV: Allow wildcards in Oneboxer optional domain Site Settings

Allows a wildcard to be used as a subdomain on Oneboxer-related SiteSettings, e.g.:

- `force_get_hosts`
- `cache_onebox_response_body_domains`
- `force_custom_user_agent_hosts`

* DEV: fix typos

* FIX: Try doing a GET after receiving a 500 error from a HEAD

By default we try a `HEAD` request first. If this results in a 500 error response, we should fall back to a `GET`.

* DEV: `force_get_hosts` should be a hidden setting

* DEV: Oneboxer Strategies

Have an alternative oneboxing ‘strategy’ (i.e., set of options) to use when an attempt to generate a Onebox fails. Keep track of any non-default strategies that were used on a particular host, and use that strategy for that host in the future.

Initially, the alternate strategy (`force_get_and_ua`) forces the FinalDestination step of Oneboxing to do a `GET` rather than `HEAD`, and forces a custom user agent.

* DEV: change stubbed return code

The stubbed status code needs to be a value not recognized by FinalDestination
This commit is contained in:
jbrw 2021-05-13 15:48:35 -04:00 committed by GitHub
parent a62ad0fa4d
commit 19182b1386
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 155 additions and 5 deletions

View File

@ -1637,6 +1637,10 @@ onebox:
force_custom_user_agent_hosts: force_custom_user_agent_hosts:
default: "http://codepen.io" default: "http://codepen.io"
type: list type: list
force_get_hosts:
default: "us.battle.net|news.yahoo.com|*.medium.com"
type: list
hidden: true
facebook_app_access_token: facebook_app_access_token:
default: "" default: ""
secret: true secret: true

View File

@ -215,7 +215,7 @@ class FinalDestination
@status = :resolved @status = :resolved
return @uri return @uri
when 400, 405, 406, 409, 501 when 400, 405, 406, 409, 500, 501
response_status, small_headers = small_get(request_headers) response_status, small_headers = small_get(request_headers)
if response_status == 200 if response_status == 200
@ -300,7 +300,17 @@ class FinalDestination
def hostname_matches?(url) def hostname_matches?(url)
url = uri(url) url = uri(url)
@uri && url.present? && @uri.hostname == url&.hostname
# Compare @uri's hostname with the hostname of the given url. When the first
# label of url's hostname is `*` (e.g. `*.medium.com`) the remaining labels
# are treated as a domain that matches itself and any of its subdomains;
# otherwise the two hostnames must be equal. Evaluates to nil when either
# hostname is missing.
if @uri&.hostname.present? && url&.hostname.present?
  hostname_parts = url.hostname.split('.')
  has_wildcard = hostname_parts.first == '*'

  if has_wildcard
    wildcard_domain = hostname_parts[1..-1].join('.')
    # A lone `*` leaves an empty domain and keeps matching every host.
    # Otherwise require a label boundary before the suffix: a plain
    # `end_with?` on the bare domain would let `evilmedium.com` satisfy
    # `*.medium.com`.
    wildcard_domain.empty? ||
      @uri.hostname == wildcard_domain ||
      @uri.hostname.end_with?(".#{wildcard_domain}")
  else
    @uri.hostname == url.hostname
  end
end
end end
def is_dest_valid? def is_dest_valid?

View File

@ -32,7 +32,8 @@ module Oneboxer
end end
def self.force_get_hosts def self.force_get_hosts
hosts = ['http://us.battle.net', 'https://news.yahoo.com'] hosts = []
hosts += SiteSetting.force_get_hosts.split('|').collect { |domain| "https://#{domain}" }
hosts += SiteSetting.cache_onebox_response_body_domains.split('|').collect { |domain| "https://www.#{domain}" } hosts += SiteSetting.cache_onebox_response_body_domains.split('|').collect { |domain| "https://www.#{domain}" }
hosts += amazon_domains hosts += amazon_domains
@ -394,8 +395,13 @@ module Oneboxer
allowed += SiteSetting.allowed_iframes.split("|") allowed += SiteSetting.allowed_iframes.split("|")
end end
def self.external_onebox(url) def self.external_onebox(url, available_strategies = nil)
Discourse.cache.fetch(onebox_cache_key(url), expires_in: 1.day) do Discourse.cache.fetch(onebox_cache_key(url), expires_in: 1.day) do
uri = URI(url)
available_strategies ||= Oneboxer.ordered_strategies(uri.hostname)
strategy = available_strategies.shift
fd_options = { fd_options = {
ignore_redirects: ignore_redirects, ignore_redirects: ignore_redirects,
ignore_hostnames: blocked_domains, ignore_hostnames: blocked_domains,
@ -404,6 +410,13 @@ module Oneboxer
preserve_fragment_url_hosts: preserve_fragment_url_hosts preserve_fragment_url_hosts: preserve_fragment_url_hosts
} }
if strategy && Oneboxer.strategies[strategy][:force_get_host]
fd_options[:force_get_hosts] = ["https://#{uri.hostname}"]
end
if strategy && Oneboxer.strategies[strategy][:force_custom_user_agent_host]
fd_options[:force_custom_user_agent_hosts] = ["https://#{uri.hostname}"]
end
user_agent_override = SiteSetting.cache_onebox_user_agent if Oneboxer.cache_response_body?(url) && SiteSetting.cache_onebox_user_agent.present? user_agent_override = SiteSetting.cache_onebox_user_agent if Oneboxer.cache_response_body?(url) && SiteSetting.cache_onebox_user_agent.present?
fd_options[:default_user_agent] = user_agent_override if user_agent_override fd_options[:default_user_agent] = user_agent_override if user_agent_override
@ -415,6 +428,11 @@ module Oneboxer
if fd.status == :invalid_address if fd.status == :invalid_address
args[:error_message] = I18n.t("errors.onebox.invalid_address", hostname: fd.hostname) args[:error_message] = I18n.t("errors.onebox.invalid_address", hostname: fd.hostname)
elsif fd.status_code elsif fd.status_code
# Try a different oneboxing strategy, if we have any options left:
if available_strategies.present?
return external_onebox(url, available_strategies)
end
args[:error_message] = I18n.t("errors.onebox.error_response", status_code: fd.status_code) args[:error_message] = I18n.t("errors.onebox.error_response", status_code: fd.status_code)
end end
@ -466,6 +484,8 @@ module Oneboxer
end end
end end
Oneboxer.cache_preferred_strategy(uri.hostname, strategy)
result result
end end
end end
@ -490,4 +510,44 @@ module Oneboxer
end end
end end
# Returns every strategy name, with the strategy remembered for `hostname`
# (if there is one and it is still a known strategy) moved to the front.
def self.ordered_strategies(hostname)
  names = strategies.keys
  remembered = Oneboxer.preferred_strategy(hostname)
  return names unless names.include?(remembered)

  names.delete(remembered)
  names.unshift(remembered)
end
# The available oneboxing strategies, in the order they are tried by default.
# Each value is the set of overrides that strategy applies; `:default`
# overrides nothing.
def self.strategies
  get_and_user_agent = { force_get_host: true, force_custom_user_agent_host: true }

  { default: {}, force_get_and_ua: get_and_user_agent }
end
# Remember `strategy` as the preferred one for `hostname` for two weeks.
# The value is written as a string via `Discourse.redis.without_namespace`.
#
# Nothing is stored when:
# - `strategy` is `:default` (no preference needs recording),
# - `strategy` is nil (would otherwise be persisted as an empty string), or
# - `hostname` is nil (would otherwise write to the shared key
#   "ONEBOXER_STRATEGY_" that every host-less URL maps to).
def self.cache_preferred_strategy(hostname, strategy)
  return if hostname.nil? || strategy.nil? || strategy == :default

  key = redis_oneboxer_strategy_key(hostname)
  Discourse.redis.without_namespace.setex(key, 2.weeks.to_i, strategy.to_s)
end
# Forget any strategy remembered for `hostname`.
def self.clear_preferred_strategy!(hostname)
  Discourse.redis.without_namespace.del(redis_oneboxer_strategy_key(hostname))
end
# Look up the strategy remembered for `hostname`.
# Returns it as a Symbol, or nil when nothing is stored.
def self.preferred_strategy(hostname)
  stored = Discourse.redis.without_namespace.get(redis_oneboxer_strategy_key(hostname))
  stored.to_sym unless stored.nil?
end
# Builds the redis key under which the preferred strategy for `hostname`
# is kept.
def self.redis_oneboxer_strategy_key(hostname)
  format("ONEBOXER_STRATEGY_%s", hostname)
end
end end

View File

@ -9,7 +9,7 @@ describe FinalDestination do
{ {
ignore_redirects: ['https://ignore-me.com'], ignore_redirects: ['https://ignore-me.com'],
force_get_hosts: ['https://force.get.com'], force_get_hosts: ['https://force.get.com', 'https://*.ihaveawildcard.com/'],
preserve_fragment_url_hosts: ['https://eviltrout.com'], preserve_fragment_url_hosts: ['https://eviltrout.com'],
@ -17,6 +17,7 @@ describe FinalDestination do
lookup_ip: lambda do |host| lookup_ip: lambda do |host|
case host case host
when 'eviltrout.com' then '52.84.143.152' when 'eviltrout.com' then '52.84.143.152'
when 'particularly.eviltrout.com' then '52.84.143.152'
when 'codinghorror.com' then '91.146.108.148' when 'codinghorror.com' then '91.146.108.148'
when 'discourse.org' then '104.25.152.10' when 'discourse.org' then '104.25.152.10'
when 'some_thing.example.com' then '104.25.152.10' when 'some_thing.example.com' then '104.25.152.10'
@ -24,6 +25,7 @@ describe FinalDestination do
when 'internal-ipv6.com' then '2001:abc:de:01:3:3d0:6a65:c2bf' when 'internal-ipv6.com' then '2001:abc:de:01:3:3d0:6a65:c2bf'
when 'ignore-me.com' then '53.84.143.152' when 'ignore-me.com' then '53.84.143.152'
when 'force.get.com' then '22.102.29.40' when 'force.get.com' then '22.102.29.40'
when 'any-subdomain.ihaveawildcard.com' then '104.25.152.11'
when 'wikipedia.com' then '1.2.3.4' when 'wikipedia.com' then '1.2.3.4'
else else
as_ip = IPAddr.new(host) as_ip = IPAddr.new(host)
@ -170,8 +172,11 @@ describe FinalDestination do
before do before do
stub_request(:head, 'https://force.get.com/posts?page=4') stub_request(:head, 'https://force.get.com/posts?page=4')
stub_request(:get, 'https://force.get.com/posts?page=4') stub_request(:get, 'https://force.get.com/posts?page=4')
stub_request(:get, 'https://any-subdomain.ihaveawildcard.com/some/other/content')
stub_request(:head, 'https://eviltrout.com/posts?page=2') stub_request(:head, 'https://eviltrout.com/posts?page=2')
stub_request(:get, 'https://eviltrout.com/posts?page=2') stub_request(:get, 'https://eviltrout.com/posts?page=2')
stub_request(:head, 'https://particularly.eviltrout.com/has/a/secret/plan')
stub_request(:get, 'https://particularly.eviltrout.com/has/a/secret/plan')
end end
it "will do a GET when forced" do it "will do a GET when forced" do
@ -189,6 +194,23 @@ describe FinalDestination do
expect(WebMock).to_not have_requested(:get, 'https://eviltrout.com/posts?page=2') expect(WebMock).to_not have_requested(:get, 'https://eviltrout.com/posts?page=2')
expect(WebMock).to have_requested(:head, 'https://eviltrout.com/posts?page=2') expect(WebMock).to have_requested(:head, 'https://eviltrout.com/posts?page=2')
end end
it "will do a GET when forced on a wildcard subdomain" do
  # NOTE(review): presumably matched by the `https://*.ihaveawildcard.com/`
  # entry in force_get_hosts - confirm `opts` is built from that hash.
  wildcard_url = 'https://any-subdomain.ihaveawildcard.com/some/other/content'
  destination = FinalDestination.new(wildcard_url, opts)

  expect(destination.resolve.to_s).to eq(wildcard_url)
  expect(destination.status).to eq(:resolved)

  # Only the GET may have been issued; no HEAD at all.
  expect(WebMock).to have_requested(:get, wildcard_url)
  expect(WebMock).to_not have_requested(:head, wildcard_url)
end
it "will do a HEAD if on a subdomain of a forced get domain without a wildcard" do
  subdomain_url = 'https://particularly.eviltrout.com/has/a/secret/plan'
  destination = FinalDestination.new(subdomain_url, opts)

  expect(destination.resolve.to_s).to eq(subdomain_url)
  expect(destination.status).to eq(:resolved)

  # The subdomain must not inherit forced-GET behaviour: HEAD only.
  expect(WebMock).to_not have_requested(:get, subdomain_url)
  expect(WebMock).to have_requested(:head, subdomain_url)
end
end end
context "HEAD not supported" do context "HEAD not supported" do

View File

@ -355,6 +355,60 @@ describe Oneboxer do
end end
end end
# Covers the Oneboxer strategy list and the fallback to a non-default
# strategy when the first attempt gets an error response.
context 'strategies' do
  it "has a 'default' strategy" do
    expect(Oneboxer.strategies.keys.first).to eq(:default)
  end

  it "has a strategy with overrides" do
    strategy = Oneboxer.strategies.keys[1]
    expect(Oneboxer.strategies[strategy].keys).not_to eq([])
  end

  context "using a non-default strategy" do
    let(:hostname) { "my.interesting.site" }
    let(:url) { "https://#{hostname}/cool/content" }
    let(:html) do
      <<~HTML
        <html>
        <head>
        <meta property="og:title" content="Page Title">
        <meta property="og:description" content="Here is some cool content">
        </head>
        <body>
        <p>body</p>
        </body>
        </html>
      HTML
    end

    before do
      # HEAD fails with a status code while GET succeeds, so only a strategy
      # that forces GET can produce the onebox.
      stub_request(:head, url).to_return(status: 509)
      stub_request(:get, url).to_return(status: 200, body: html)
    end

    after do
      # Don't leak the remembered strategy into other examples.
      Oneboxer.clear_preferred_strategy!(hostname)
    end

    it "uses multiple strategies" do
      # Before any attempt: nothing remembered, default ordering.
      default_ordered = Oneboxer.strategies.keys
      custom_ordered = Oneboxer.ordered_strategies(hostname)
      expect(custom_ordered).to eq(default_ordered)
      expect(Oneboxer.preferred_strategy(hostname)).to eq(nil)

      expect(Oneboxer.preview(url, invalidate_oneboxes: true)).to include("Here is some cool content")

      # Afterwards: same set of strategies, but reordered around the one
      # that was remembered for this host.
      custom_ordered = Oneboxer.ordered_strategies(hostname)
      expect(custom_ordered.count).to eq(default_ordered.count)
      expect(custom_ordered).not_to eq(default_ordered)
      expect(Oneboxer.preferred_strategy(hostname)).not_to eq(:default)
    end
  end
end
describe 'cache_onebox_response_body' do describe 'cache_onebox_response_body' do
let(:html) do let(:html) do
<<~HTML <<~HTML