mirror of
https://github.com/discourse/discourse.git
synced 2024-11-22 08:57:10 -06:00
FIX: Hack our title retriever so that it parses YouTube URLs
This commit is contained in:
parent
cd6dff58dd
commit
07e84a3afa
@ -15,6 +15,12 @@ module RetrieveTitle
|
||||
|
||||
title = doc.at('title')&.inner_text
|
||||
|
||||
# A horrible hack - YouTube uses `document.title` to populate the title
|
||||
# for some reason. For any site other than YouTube this wouldn't be worth it.
|
||||
if title == "YouTube" && html =~ /document\.title *= *"(.*)";/
|
||||
title = Regexp.last_match[1].sub(/ - YouTube$/, '')
|
||||
end
|
||||
|
||||
if !title && node = doc.at('meta[property="og:title"]')
|
||||
title = node['content']
|
||||
end
|
||||
@ -32,9 +38,11 @@ module RetrieveTitle
|
||||
private
|
||||
|
||||
def self.max_chunk_size(uri)
|
||||
# Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
|
||||
# one host but amazon is a big one.
|
||||
|
||||
# Amazon and YouTube leave the title until very late. Exceptions are bad
|
||||
# but these are large sites.
|
||||
return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
|
||||
return 300 if uri.host =~ /youtube\.com$/ || uri.host =~ /youtu.be/
|
||||
|
||||
# default is 10k
|
||||
10
|
||||
|
@ -44,6 +44,17 @@ describe RetrieveTitle do
|
||||
expect(title).to eq("Good Title")
|
||||
end
|
||||
|
||||
# Regression example: the document's <title> is only the generic "YouTube"
# placeholder, and the real title is assigned in an inline script via
# `document.title = "..."`. extract_title is expected to return the value from
# that assignment ("Video Title") rather than the placeholder.
it "will parse a YouTube url from javascript" do
|
||||
title = RetrieveTitle.extract_title(<<~HTML
|
||||
<html>
|
||||
<title>YouTube</title>
|
||||
<script>document.title = "Video Title";</script>
|
||||
</html>
|
||||
HTML
|
||||
)
|
||||
expect(title).to eq("Video Title")
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
end
|
||||
|
Loading…
Reference in New Issue
Block a user