FEATURE: Whitelists for inline oneboxing

This commit is contained in:
Robin Ward
2017-07-21 15:29:04 -04:00
parent 4c7b725e19
commit 2f8f2aa1dd
7 changed files with 185 additions and 76 deletions

View File

@@ -1,38 +1,50 @@
# NOTE(review): this span is a unified-diff rendering (see the "@@" hunk header
# above) — pre-change and post-change lines appear interleaved, so it is not
# parseable Ruby as-is.
require_dependency 'retrieve_title'
# Builds "inline oneboxes" (title-only link expansions) for a list of URLs.
class InlineOneboxer
# pre-change signature (replaced in this commit):
def initialize(urls)
# post-change signature: adds an optional options hash (e.g. :skip_cache).
def initialize(urls, opts=nil)
@urls = urls
# nil opts normalized to an empty hash
@opts = opts || {}
end
# Look up every URL; compact drops the ones that produced no onebox.
def process
# pre-change body:
@urls.map {|url| InlineOneboxer.lookup(url) }.compact
# post-change body: forwards the stored options to lookup.
@urls.map {|url| InlineOneboxer.lookup(url, @opts) }.compact
end
# pre-change name (renamed to `purge` in this commit):
def self.clear_cache!
# Evict any cached inline onebox for the given URL.
def self.purge(url)
  key = cache_key(url)
  Rails.cache.delete(key)
end
# Return the cached onebox hash for the given URL, or nil when absent.
def self.cache_lookup(url)
  key = cache_key(url)
  Rails.cache.read(key)
end
# NOTE(review): unified-diff rendering — pre- and post-change versions of
# `lookup` are interleaved below; this span is not parseable Ruby as-is.
# pre-change signature with an unconditional cache check:
def self.lookup(url)
cached = cache_lookup(url)
return cached if cached.present?
# post-change signature: optional opts hash, cache check now skippable.
def self.lookup(url, opts=nil)
opts ||= {}
# Serve from cache unless the caller explicitly opted out.
unless opts[:skip_cache]
cached = cache_lookup(url)
return cached if cached.present?
end
# Internal links: onebox URLs that route to topics#show.
if route = Discourse.route_for(url)
if route[:controller] == "topics" &&
route[:action] == "show" &&
# `rescue nil` swallows lookup errors; a nil topic fails the && chain.
topic = (Topic.where(id: route[:topic_id].to_i).first rescue nil)
# Only public topics
# pre-change inline hash construction + cache write (replaced by onebox_for):
if Guardian.new.can_see?(topic)
onebox = {
url: url,
title: Emoji.gsub_emoji_to_unicode(topic.title)
}
Rails.cache.write(cache_key(url), onebox, expires_in: 1.day)
return onebox
end
# post-change: delegate to onebox_for, still gated on topic visibility.
return onebox_for(url, topic.title, opts) if Guardian.new.can_see?(topic)
end
end
# External links: only onebox domains present in the site-setting whitelist.
if whitelist = SiteSetting.inline_onebox_domains_whitelist
uri = URI(url) rescue nil
# The setting stores domains pipe-separated.
domains = whitelist.split('|')
if uri.present? &&
uri.hostname.present? &&
domains.include?(uri.hostname) &&
# assignment as the final && operand: onebox only when a title was crawled
title = RetrieveTitle.crawl(url)
return onebox_for(url, title, opts)
end
end
@@ -41,6 +53,18 @@ class InlineOneboxer
private
# Build the onebox hash (url + emoji-normalized title) and cache it for a
# day, unless the caller asked to bypass the cache via opts[:skip_cache].
def self.onebox_for(url, title, opts)
  result = {
    url: url,
    title: Emoji.gsub_emoji_to_unicode(title)
  }
  Rails.cache.write(cache_key(url), result, expires_in: 1.day) unless opts[:skip_cache]
  result
end
# Namespaced cache key under which an inline onebox for `url` is stored.
def self.cache_key(url)
  ["inline_onebox", url].join(":")
end

70
lib/retrieve_title.rb Normal file
View File

@@ -0,0 +1,70 @@
require_dependency 'final_destination'
module RetrieveTitle
  # Raised internally to abort the Excon download once enough of the page has
  # been read to contain the <title> tag.
  class ReadEnough < StandardError; end

  # Crawl `url` and return its page title, or nil when the page could not be
  # fetched or no title was found.
  def self.crawl(url)
    extract_title(fetch_beginning(url))
  rescue StandardError
    # If there was a connection error, do nothing.
    # FIX(review): was `rescue Exception`, which also swallowed fatal errors
    # like SignalException/SystemExit/NoMemoryError; StandardError covers the
    # network/parse failures this is meant to ignore.
  end

  # Extract a title from an HTML fragment: prefer the og:title meta tag and
  # fall back to the <title> element. Returns nil when neither yields text.
  def self.extract_title(html)
    title = nil
    if doc = Nokogiri::HTML(html)
      if node = doc.at('meta[property="og:title"]')
        title = node['content']
      end
      title ||= doc.at('title')&.inner_text
    end

    if title.present?
      # Collapse newlines and runs of spaces, then trim.
      title.gsub!(/\n/, ' ')
      title.gsub!(/ +/, ' ')
      title.strip!
      return title
    end
    nil
  end

  private

  # Number of kilobytes to download before giving up on finding a title.
  # (NOTE: `private` does not affect `def self.` methods in Ruby.)
  def self.max_chunk_size(uri)
    # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
    # one host but amazon is a big one.
    return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/

    # default is 10k
    10
  end

  # Fetch the beginning of a HTML document at a url
  def self.fetch_beginning(url)
    # Never crawl in test mode
    return if Rails.env.test?

    fd = FinalDestination.new(url)
    uri = fd.resolve
    return "" unless uri

    result = ""
    streamer = lambda do |chunk, _, _|
      result << chunk

      # Using exceptions for flow control is really bad, but there really seems to
      # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
      # that matter!)
      raise ReadEnough.new if result.size > (max_chunk_size(uri) * 1024)
    end
    Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
    result
  rescue Excon::Errors::SocketError => ex
    # ReadEnough raised inside the streamer surfaces wrapped in a SocketError.
    return result if ex.socket_error.is_a?(ReadEnough)
    raise
  rescue ReadEnough
    result
  end
end