mirror of
https://github.com/discourse/discourse.git
synced 2024-11-28 19:53:53 -06:00
b0656f3ed0
The `blocked onebox domains` setting lets site owners change what sites are allowed to be oneboxed. When a link is entered into a post, Discourse checks the domain of the link against that setting and blocks the onebox if the domain is blocked. But if there's a chain of redirects, then only the final destination website is checked against the site setting. This commit amends that behavior so that every website in the redirect chain is checked against the site setting, and if anything is blocked the original link doesn't onebox at all in the post. The `Discourse-No-Onebox` header is also checked in every response and the onebox is blocked if the header is set to "1". Additionally, Discourse will now include the `Discourse-No-Onebox` header with every response if the site requires login to access content. This is done to signal to a Discourse instance that it shouldn't attempt to onebox other Discourse instances if they're login-only. Non-Discourse websites can also include that header if they don't wish to have Discourse onebox their content. Internal ticket: t59305.
87 lines
2.3 KiB
Ruby
87 lines
2.3 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
# Looks up the title of a remote HTML page by streaming the start of the
# document through FinalDestination and parsing what has arrived so far.
module RetrieveTitle
  # Timeout handed to FinalDestination for a crawl
  # (presumably seconds - confirm against FinalDestination).
  CRAWL_TIMEOUT = 1

  # Best-effort title lookup: failures are logged instead of raised
  # (except in the test environment, where they are re-raised).
  #
  # @param url the address to crawl; passed straight to FinalDestination
  # @return [String, nil] the cleaned-up title, or nil when none was found
  #   or the crawl failed
  def self.crawl(url)
    fetch_title(url)
  rescue StandardError => ex
    # Only StandardError is rescued so that Interrupt, SignalException,
    # SystemExit and friends still propagate to the process.
    raise if Rails.env.test?
    Rails.logger.error(ex)
    # Explicit nil: otherwise the logger's return value would leak out
    # and could be mistaken for a title by callers.
    nil
  end

  # Extracts a page title from a (possibly partial) HTML document.
  #
  # Looks at the <title> element first and falls back to the
  # `og:title` meta tag only when there is no <title> element at all.
  #
  # @param html [String] the HTML received so far
  # @param encoding [String, nil] encoding name handed to Nokogiri::HTML5
  # @return [String, nil] the title with newlines and runs of spaces
  #   collapsed and surrounding whitespace stripped, or nil when blank/absent
  def self.extract_title(html, encoding = nil)
    # An opening <title> without its closing tag: the title is incomplete
    # (fetch_title calls this on partially downloaded documents), so report
    # nothing rather than a truncated title.
    return nil if html =~ /<title>/ && html !~ /<\/title>/

    title = nil

    if doc = Nokogiri::HTML5(html, nil, encoding)
      title = doc.at('title')&.inner_text

      # A horrible hack - YouTube uses `document.title` to populate the title
      # for some reason. For any other site than YouTube this wouldn't be worth it.
      if title == "YouTube" && html =~ /document\.title *= *"(.*)";/
        title = Regexp.last_match[1].sub(/ - YouTube$/, '')
      end

      if !title && node = doc.at('meta[property="og:title"]')
        title = node['content']
      end
    end

    return nil unless title.present?

    title.gsub(/\n/, ' ').gsub(/ +/, ' ').strip
  end

  # NOTE: the methods below are internal helpers. The original file marked
  # them with a bare `private`, which has no effect on `def self.` methods;
  # they are intentionally left publicly callable so existing callers keep
  # working.

  # Maximum amount of a document to download while looking for a title.
  #
  # @param uri [URI] the address being fetched (only `host` is read)
  # @return [Integer] the limit in KiB (fetch_title multiplies it by 1024)
  def self.max_chunk_size(uri)
    # `to_s` keeps a missing host on the default path instead of raising.
    host = uri.host.to_s

    # Exception for sites that leave the title until very late.
    return 500 if host.match?(/(^|\.)amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/)
    return 300 if host.match?(/(^|\.)youtube\.com$/) || host.match?(/(^|\.)youtu\.be$/)
    return 50 if host.match?(/(^|\.)github\.com$/)

    # default is 20k
    20
  end

  # Fetch the beginning of a HTML document at a url and return its title.
  #
  # Chunks are accumulated and re-parsed after each one; streaming stops as
  # soon as a title is found or the per-host size limit is exceeded.
  #
  # @param url the address to fetch; passed straight to FinalDestination
  # @return [String, nil] the title, or nil when none was found
  def self.fetch_title(url)
    fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT, stop_at_blocked_pages: true)

    current = nil
    title = nil
    encoding = nil

    fd.get do |response, chunk, uri|
      # Bodies of redirect responses are not part of the target document.
      next if response.is_a?(Net::HTTPRedirection)

      if current
        current << chunk
      else
        current = chunk
      end

      encoding ||= charset_from_content_type(response['content-type'])

      max_size = max_chunk_size(uri) * 1024
      title = extract_title(current, encoding)

      # Presumably caught inside FinalDestination#get to abort the
      # download - confirm against FinalDestination.
      throw :done if title || max_size < current.length
    end

    title
  end

  # Pulls the charset out of a Content-Type header value.
  #
  # @param content_type [String, nil] raw header value
  # @return [String, nil] the lower-cased charset when it names an encoding
  #   listed in Encoding.list, otherwise nil
  def self.charset_from_content_type(content_type)
    return nil unless content_type

    charset = content_type.strip.downcase[/charset="?([a-z0-9_-]+)"?/, 1]
    return nil unless charset

    charset if Encoding.list.any? { |known| known.name.downcase == charset }
  end
  private_class_method :charset_from_content_type
end
|