discourse/spec/lib/retrieve_title_spec.rb
Osama Sayegh b0656f3ed0
FIX: Apply onebox blocked domain checks on every redirect (#16150)
The `blocked onebox domains` setting lets site owners change what sites
are allowed to be oneboxed. When a link is entered into a post,
Discourse checks the domain of the link against that setting and blocks
the onebox if the domain is blocked. But if there's a chain of
redirects, then only the final destination website is checked against
the site setting.

This commit amends that behavior so that every website in the redirect
chain is checked against the site setting, and if anything is blocked
the original link doesn't onebox at all in the post. The
`Discourse-No-Onebox` header is also checked in every response and the
onebox is blocked if the header is set to "1".

Additionally, Discourse will now include the `Discourse-No-Onebox`
header with every response if the site requires login to access content.
This is done to signal to a Discourse instance that it shouldn't attempt
to onebox other Discourse instances if they're login-only. Non-Discourse
websites can also include that header if they don't wish to have
Discourse onebox their content.

Internal ticket: t59305.
2022-03-11 09:18:12 +03:00

161 lines
5.5 KiB
Ruby
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# frozen_string_literal: true
describe RetrieveTitle do
context "extract_title" do
  # Each example feeds a small HTML snippet straight into
  # RetrieveTitle.extract_title and checks the string that comes back.

  it "will extract the value from the title tag" do
    html = "<html><title>My Cool Title</title></html>"

    expect(RetrieveTitle.extract_title(html)).to eq("My Cool Title")
  end

  it "will strip whitespace" do
    # Leading space, trailing newlines and trailing space must all be removed.
    html = "<html><title> Another Title\n\n </title></html>"

    expect(RetrieveTitle.extract_title(html)).to eq("Another Title")
  end

  it "will pick og:title if title is missing" do
    # NOTE(review): the <meta> tag below has no closing ">". It is kept as-is;
    # presumably the example relies on lenient HTML parsing -- confirm intent.
    html = <<~HTML
      <html>
      <meta property="og:title" content="Good Title"
      </html>
    HTML

    expect(RetrieveTitle.extract_title(html)).to eq("Good Title")
  end

  it "will prefer the title over the opengraph tag" do
    # Both a <title> and an og:title are present; the <title> text must win.
    # NOTE(review): same unclosed <meta> tag as in the previous example.
    html = <<~HTML
      <html>
      <title>Good Title</title>
      <meta property="og:title" content="Bad Title"
      </html>
    HTML

    expect(RetrieveTitle.extract_title(html)).to eq("Good Title")
  end

  it "will parse a YouTube url from javascript" do
    # The <title> is the literal "YouTube" while a script assigns
    # document.title; the script-assigned value is the expected result.
    html = <<~HTML
      <html>
      <title>YouTube</title>
      <script>document.title = "Video Title";</script>
      </html>
    HTML

    expect(RetrieveTitle.extract_title(html)).to eq("Video Title")
  end
end
context "crawl" do
  # Examples for RetrieveTitle.crawl. HTTP traffic is faked with WebMock
  # (stub_request); where an example stubs IPSocket.getaddress it does so to
  # avoid a real DNS lookup.

  it "can properly extract a title from a url" do
    stub_request(:get, "https://brelksdjflaskfj.com/amazing")
      .to_return(status: 200, body: "<html><title>very amazing</title>")

    # we still resolve the IP address for every host
    IPSocket.stubs(:getaddress).returns('100.2.3.4')

    expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("very amazing")
  end

  it "detects and uses encoding from Content-Type header" do
    url = "https://brelksdjflaskfj.com/amazing"

    # One stub covers both crawls in this example.
    IPSocket.stubs(:getaddress).returns('100.2.3.4')

    # The typographic apostrophes (U+2018, U+2019, U+02BC) are written as
    # escapes on purpose: literal "ambiguous" Unicode characters are easily
    # stripped by tooling, which would leave a pure-ASCII title and make the
    # UTF-8 half of this example meaningless.
    fancy_title = "fancy apostrophes \u2018\u2019\u02BC"

    # The body is handed over as raw bytes (ASCII-8BIT); the charset comes
    # only from the Content-Type header.
    stub_request(:get, url)
      .to_return(
        status: 200,
        body: "<html><title>#{fancy_title}</title>".dup.force_encoding('ASCII-8BIT'),
        headers: { 'Content-Type' => 'text/html; charset="utf-8"' }
      )

    expect(RetrieveTitle.crawl(url)).to eq(fancy_title)

    # Same URL re-stubbed with an EUC-JP body and an unquoted charset value.
    japanese_title = "japanese こんにちは website"

    stub_request(:get, url)
      .to_return(
        status: 200,
        body: "<html><title>#{japanese_title}</title>".encode('EUC-JP').force_encoding('ASCII-8BIT'),
        headers: { 'Content-Type' => 'text/html;charset=euc-jp' }
      )

    expect(RetrieveTitle.crawl(url)).to eq(japanese_title)
  end

  it "can follow redirect" do
    stub_request(:get, "http://foobar.com/amazing")
      .to_return(status: 301, body: "", headers: { "location" => "https://wikipedia.com/amazing" })

    stub_request(:get, "https://wikipedia.com/amazing")
      .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})

    IPSocket.stubs(:getaddress).returns('100.2.3.4')

    expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing")
  end

  it "returns empty title if redirect uri is in blacklist" do
    # The redirect target is a blocked onebox domain, so no title is returned.
    SiteSetting.blocked_onebox_domains = "wikipedia.com"

    stub_request(:get, "http://foobar.com/amazing")
      .to_return(status: 301, body: "", headers: { "location" => "https://wikipedia.com/amazing" })

    stub_request(:get, "https://wikipedia.com/amazing")
      .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})

    expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to be_nil
  end

  it "doesn't return title if a blocked domain is encountered anywhere in the redirect chain" do
    # The blocked domain sits in the middle of the chain:
    # foobar.com -> wikipedia.com (blocked) -> cat.com
    SiteSetting.blocked_onebox_domains = "wikipedia.com"

    stub_request(:get, "http://foobar.com/amazing")
      .to_return(status: 301, body: "", headers: { "location" => "https://wikipedia.com/amazing" })

    stub_request(:get, "https://wikipedia.com/amazing")
      .to_return(status: 301, body: "", headers: { "location" => "https://cat.com/meow" })

    stub_request(:get, "https://cat.com/meow")
      .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})

    expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to be_blank
  end

  it "doesn't return title if the Discourse-No-Onebox header == 1" do
    # A 200 response with a valid title must still be ignored when the
    # response carries the opt-out header.
    stub_request(:get, "https://cat.com/meow/no-onebox")
      .to_return(
        status: 200,
        body: "<html><title>discourse stay away</title>",
        headers: { "Discourse-No-Onebox" => "1" }
      )

    expect(RetrieveTitle.crawl("https://cat.com/meow/no-onebox")).to be_blank
  end
end
context 'fetch_title' do
  # Examples for RetrieveTitle.fetch_title against a WebMock-stubbed page.

  it "does not parse broken title tag" do
    # webmock does not do chunks
    # The body is cut off in the middle of the closing </title> tag.
    truncated_html = "<html><head><title>Internet - Wikipedia</ti"

    stub_request(:get, "https://en.wikipedia.org/wiki/Internet")
      .to_return(status: 200, body: truncated_html, headers: {})

    fetched = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")

    expect(fetched).to be_nil
  end

  it "can parse correct title tag" do
    # webmock does not do chunks
    complete_html = "<html><head><title>Internet - Wikipedia</title>"

    stub_request(:get, "https://en.wikipedia.org/wiki/Internet")
      .to_return(status: 200, body: complete_html, headers: {})

    fetched = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")

    expect(fetched).to eq("Internet - Wikipedia")
  end
end
end