FIX: Allow to follow non-ASCII canonical links for oneboxes

This commit is contained in:
Loïc Guitaut 2025-02-03 16:38:29 +01:00 committed by Loïc Guitaut
parent 324857c4c4
commit 5055a071b8
3 changed files with 81 additions and 7 deletions

View File

@ -28,7 +28,7 @@ module Onebox
)
doc = Nokogiri.HTML(response)
uri = Addressable::URI.parse(url)
uri = Addressable::URI.parse(url).normalize!
ignore_canonical_tag = doc.at('meta[property="og:ignore_canonical"]')
should_ignore_canonical =
@ -38,13 +38,13 @@ module Onebox
!should_ignore_canonical
# prefer canonical link
canonical_link = doc.at('//link[@rel="canonical"]/@href')
canonical_uri = Addressable::URI.parse(canonical_link)
canonical_uri = Addressable::URI.parse(canonical_link)&.normalize!
if canonical_link && canonical_uri &&
"#{canonical_uri.host}#{canonical_uri.path}" != "#{uri.host}#{uri.path}"
uri =
FinalDestination.new(
canonical_link,
Oneboxer.get_final_destination_options(canonical_link),
canonical_uri,
Oneboxer.get_final_destination_options(canonical_uri),
).resolve
if uri.present?
response =

View File

@ -76,8 +76,9 @@ RSpec.describe Onebox::Helpers do
end
context "with canonical link" do
let(:uri) { "https://www.example.com" }
it "follows canonical link" do
uri = "https://www.example.com"
stub_request(:get, uri).to_return(
status: 200,
body: "<!DOCTYPE html><link rel='canonical' href='http://foobar.com/'/><p>invalid</p>",
@ -92,7 +93,6 @@ RSpec.describe Onebox::Helpers do
end
it "does not follow canonical link pointing at localhost" do
uri = "https://www.example.com"
FinalDestination::SSRFDetector
.stubs(:lookup_ips)
.with { |h| h == "localhost" }
@ -104,6 +104,25 @@ RSpec.describe Onebox::Helpers do
expect(described_class.fetch_html_doc(uri).to_s).to match("success")
end
context "when canonical link contains non-ASCII characters" do
before do
stub_request(:get, uri).to_return(
status: 200,
body:
"<!DOCTYPE html><link rel='canonical' href='http://foobar.com/héhé'/><p>invalid</p>",
)
stub_request(:get, "http://foobar.com/héhé").to_return(
status: 200,
body: "<!DOCTYPE html><p>success</p>",
)
stub_request(:head, "http://foobar.com/héhé").to_return(status: 200, body: "")
end
it "does not break" do
expect(described_class.fetch_html_doc(uri).to_s).to match("success")
end
end
end
end

View File

@ -286,10 +286,65 @@ RSpec.describe Oneboxer do
"<a href='http://test.localhost/g/somegroup#&apos;onerror=&apos;'>http://test.localhost/g/somegroup#&apos;onerror=&apos;</a>",
)
end
context "when URL contains non-ASCII characters" do
let(:html) { <<~HTML }
<html>
<head>
<meta property="og:title" content="Cats">
<meta property="og:description" content="Meow">
</head>
<body>
<p>body</p>
</body>
<html>
HTML
context "when response is not a redirect" do
let(:url) { "https://its.me/héhé" }
before do
stub_request(:get, url).to_return(status: 200, body: html, headers: {})
stub_request(:head, url).to_return(status: 200, body: "", headers: {})
end
it "does not break" do
expect(described_class.onebox_raw(url)[:onebox]).to be_present
end
end
context "when response is a redirect" do
let(:url) { "https://its.me" }
let(:non_ascii_url) { "#{url}/h%C3%A9h%C3%A9" }
before do
stub_request(:get, url).to_return(
status: 301,
body: "",
headers: {
"location" => non_ascii_url,
},
)
stub_request(:head, url).to_return(
status: 301,
body: "",
headers: {
"location" => non_ascii_url,
},
)
stub_request(:get, non_ascii_url).to_return(status: 200, body: html, headers: {})
stub_request(:head, non_ascii_url).to_return(status: 200, body: "", headers: {})
end
it "does not break" do
expect(described_class.onebox_raw(url)[:onebox]).to be_present
end
end
end
end
describe ".external_onebox" do
html = <<~HTML
let(:html) { <<~HTML }
<html>
<head>
<meta property="og:title" content="Cats">