# frozen_string_literal: true

# Specs for RetrieveTitle. Three class-level entry points are exercised:
#   .extract_title - given an HTML string, returns the page title (or nil)
#   .crawl         - given a URL, returns the title of the fetched page (or nil)
#   .fetch_title   - given a URL, returns the title parsed from the response body
# All HTTP traffic is stubbed with stub_request; no real network access happens.
RSpec.describe RetrieveTitle do
  describe ".extract_title" do
    it "will extract the value from the title tag" do
      title = RetrieveTitle.extract_title("<html><title>My Cool Title</title></html>")

      expect(title).to eq("My Cool Title")
    end

    it "will strip whitespace" do
      # Leading space and trailing newlines/space inside <title> must be removed.
      title = RetrieveTitle.extract_title("<html><title> Another Title\n\n </title></html>")

      expect(title).to eq("Another Title")
    end

    it "will pick og:title if title is missing" do
      # NOTE: the <meta> tag is deliberately left unterminated in this fixture.
      title = RetrieveTitle.extract_title(<<~HTML)
        <html>
        <meta property="og:title" content="Good Title"
        </html>
      HTML

      expect(title).to eq("Good Title")
    end

    it "will prefer the title over the opengraph tag" do
      # Both <title> and og:title are present; <title> must win.
      title = RetrieveTitle.extract_title(<<~HTML)
        <html>
        <title>Good Title</title>
        <meta property="og:title" content="Bad Title"
        </html>
      HTML

      expect(title).to eq("Good Title")
    end

    it "will parse a YouTube url from javascript" do
      # The <title> is the generic "YouTube"; the expected value comes from the
      # `document.title = "..."` assignment inside the <script> tag.
      title = RetrieveTitle.extract_title(<<~HTML)
        <html>
        <title>YouTube</title>
        <script>document.title = "Video Title";</script>
        </html>
      HTML
      expect(title).to eq("Video Title")
    end

    it "will not exception out for invalid html" do
      # Builds a <body> tag carrying 1000 attributes. The spec pins that the
      # call returns nil for this document instead of raising.
      attributes = (1..1000).map { |x| " attr#{x}='1' " }.join
      title = RetrieveTitle.extract_title <<~HTML
        <html>
        <title>test</title>
        <body #{attributes}>
        </html>
      HTML

      expect(title).to eq(nil)
    end
  end

  describe ".crawl" do
    it "can properly extract a title from a url" do
      stub_request(:get, "https://brelksdjflaskfj.com/amazing").to_return(
        status: 200,
        body: "<html><title>very amazing</title>",
      )

      # we still resolve the IP address for every host
      IPSocket.stubs(:getaddress).returns("100.2.3.4")

      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("very amazing")
    end

    it "detects and uses encoding from Content-Type header" do
      # Scenario 1: UTF-8 bytes tagged as ASCII-8BIT (binary); the charset is
      # declared (quoted) in the Content-Type header.
      stub_request(:get, "https://brelksdjflaskfj.com/amazing").to_return(
        status: 200,
        body: "<html><title>fancy apostrophes ’’’</title>".dup.force_encoding("ASCII-8BIT"),
        headers: {
          "Content-Type" => 'text/html; charset="utf-8"',
        },
      )

      IPSocket.stubs(:getaddress).returns("100.2.3.4")
      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq(
        "fancy apostrophes ’’’",
      )

      # Scenario 2: the same URL is re-stubbed with EUC-JP encoded bytes and an
      # unquoted, space-less charset declaration in the header.
      stub_request(:get, "https://brelksdjflaskfj.com/amazing").to_return(
        status: 200,
        body:
          "<html><title>japanese こんにちは website</title>".encode("EUC-JP").force_encoding(
            "ASCII-8BIT",
          ),
        headers: {
          "Content-Type" => "text/html;charset=euc-jp",
        },
      )

      IPSocket.stubs(:getaddress).returns("100.2.3.4")
      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq(
        "japanese こんにちは website",
      )
    end

    it "can follow redirect" do
      # http://foobar.com/amazing --301--> https://wikipedia.com/amazing (200)
      stub_request(:get, "http://foobar.com/amazing").to_return(
        status: 301,
        body: "",
        headers: {
          "location" => "https://wikipedia.com/amazing",
        },
      )

      stub_request(:get, "https://wikipedia.com/amazing").to_return(
        status: 200,
        body: "<html><title>very amazing</title>",
        headers: {
        },
      )

      IPSocket.stubs(:getaddress).returns("100.2.3.4")
      expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing")
    end

    it "returns empty title if redirect uri is in blacklist" do
      # The redirect target's domain is listed in blocked_onebox_domains, so no
      # title is expected even though the target would respond with one.
      SiteSetting.blocked_onebox_domains = "wikipedia.com"

      stub_request(:get, "http://foobar.com/amazing").to_return(
        status: 301,
        body: "",
        headers: {
          "location" => "https://wikipedia.com/amazing",
        },
      )

      stub_request(:get, "https://wikipedia.com/amazing").to_return(
        status: 200,
        body: "<html><title>very amazing</title>",
        headers: {
        },
      )

      expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq(nil)
    end

    it "doesn't return title if a blocked domain is encountered anywhere in the redirect chain" do
      # Chain: foobar.com --301--> wikipedia.com (blocked) --301--> cat.com (200).
      # The blocked domain sits in the middle of the chain, not at the end.
      SiteSetting.blocked_onebox_domains = "wikipedia.com"

      stub_request(:get, "http://foobar.com/amazing").to_return(
        status: 301,
        body: "",
        headers: {
          "location" => "https://wikipedia.com/amazing",
        },
      )

      stub_request(:get, "https://wikipedia.com/amazing").to_return(
        status: 301,
        body: "",
        headers: {
          "location" => "https://cat.com/meow",
        },
      )

      stub_request(:get, "https://cat.com/meow").to_return(
        status: 200,
        body: "<html><title>very amazing</title>",
        headers: {
        },
      )

      expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to be_blank
    end

    it "doesn't return title if the Discourse-No-Onebox header == 1" do
      # The response is a 200 with a valid <title>, but carries the opt-out header.
      stub_request(:get, "https://cat.com/meow/no-onebox").to_return(
        status: 200,
        body: "<html><title>discourse stay away</title>",
        headers: {
          "Discourse-No-Onebox" => "1",
        },
      )

      expect(RetrieveTitle.crawl("https://cat.com/meow/no-onebox")).to be_blank
    end

    it "doesn't return a title if response is unsuccessful" do
      stub_request(:get, "https://example.com").to_return(status: 404, body: "")

      expect(RetrieveTitle.crawl("https://example.com")).to eq(nil)
    end

    it "it raises errors other than Net::ReadTimeout, e.g. NoMethodError" do
      # Unexpected errors must propagate to the caller rather than be swallowed.
      stub_request(:get, "https://example.com").to_raise(NoMethodError)

      expect { RetrieveTitle.crawl("https://example.com") }.to raise_error(NoMethodError)
    end

    it "it ignores Net::ReadTimeout errors" do
      stub_request(:get, "https://example.com").to_raise(Net::ReadTimeout)

      expect(RetrieveTitle.crawl("https://example.com")).to eq(nil)
    end

    it "ignores SSRF lookup errors" do
      # The error is injected by stubbing .fetch_title on the described class.
      described_class.stubs(:fetch_title).raises(FinalDestination::SSRFDetector::LookupFailedError)

      expect(RetrieveTitle.crawl("https://example.com")).to eq(nil)
    end

    it "ignores URL encoding errors" do
      # The error is injected by stubbing .fetch_title on the described class.
      described_class.stubs(:fetch_title).raises(FinalDestination::UrlEncodingError)

      expect(RetrieveTitle.crawl("https://example.com")).to eq(nil)
    end
  end

  describe ".fetch_title" do
    it "does not parse broken title tag" do
      # webmock does not do chunks
      # The body is cut off in the middle of the closing tag ("</ti").
      stub_request(:get, "https://en.wikipedia.org/wiki/Internet").to_return(
        status: 200,
        body: "<html><head><title>Internet - Wikipedia</ti",
        headers: {
        },
      )

      title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")
      expect(title).to eq(nil)
    end

    it "can parse correct title tag" do
      # webmock does not do chunks
      # Same document as above, but with a complete closing </title> tag.
      stub_request(:get, "https://en.wikipedia.org/wiki/Internet").to_return(
        status: 200,
        body: "<html><head><title>Internet - Wikipedia</title>",
        headers: {
        },
      )

      title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")
      expect(title).to eq("Internet - Wikipedia")
    end
  end
end