FEATURE: Whitelists for inline oneboxing

This commit is contained in:
Robin Ward
2017-07-21 15:29:04 -04:00
parent 4c7b725e19
commit 2f8f2aa1dd
7 changed files with 185 additions and 76 deletions

View File

@@ -1,38 +1,50 @@
# NOTE(review): this span is a unified-diff rendering (see the "@@" hunk header
# above) — pre-change and post-change lines appear interleaved, so it is not
# parseable Ruby as-is.
require_dependency 'retrieve_title'
# Builds "inline oneboxes" (title-only link expansions) for a list of URLs.
class InlineOneboxer
# pre-change signature (replaced in this commit):
def initialize(urls)
# post-change signature: adds an optional options hash (e.g. :skip_cache).
def initialize(urls, opts=nil)
@urls = urls
# nil opts normalized to an empty hash
@opts = opts || {}
end
# Look up every URL; compact drops the ones that produced no onebox.
def process
# pre-change body:
@urls.map {|url| InlineOneboxer.lookup(url) }.compact
# post-change body: forwards the stored options to lookup.
@urls.map {|url| InlineOneboxer.lookup(url, @opts) }.compact
end
# pre-change name (renamed to `purge` in this commit):
def self.clear_cache!
# Evict any cached inline onebox for the given URL.
def self.purge(url)
  key = cache_key(url)
  Rails.cache.delete(key)
end
# Return the cached onebox hash for the given URL, or nil when absent.
def self.cache_lookup(url)
  key = cache_key(url)
  Rails.cache.read(key)
end
# NOTE(review): unified-diff rendering — pre- and post-change versions of
# `lookup` are interleaved below; this span is not parseable Ruby as-is.
# pre-change signature with an unconditional cache check:
def self.lookup(url)
cached = cache_lookup(url)
return cached if cached.present?
# post-change signature: optional opts hash, cache check now skippable.
def self.lookup(url, opts=nil)
opts ||= {}
# Serve from cache unless the caller explicitly opted out.
unless opts[:skip_cache]
cached = cache_lookup(url)
return cached if cached.present?
end
# Internal links: onebox URLs that route to topics#show.
if route = Discourse.route_for(url)
if route[:controller] == "topics" &&
route[:action] == "show" &&
# `rescue nil` swallows lookup errors; a nil topic fails the && chain.
topic = (Topic.where(id: route[:topic_id].to_i).first rescue nil)
# Only public topics
# pre-change inline hash construction + cache write (replaced by onebox_for):
if Guardian.new.can_see?(topic)
onebox = {
url: url,
title: Emoji.gsub_emoji_to_unicode(topic.title)
}
Rails.cache.write(cache_key(url), onebox, expires_in: 1.day)
return onebox
end
# post-change: delegate to onebox_for, still gated on topic visibility.
return onebox_for(url, topic.title, opts) if Guardian.new.can_see?(topic)
end
end
# External links: only onebox domains present in the site-setting whitelist.
if whitelist = SiteSetting.inline_onebox_domains_whitelist
uri = URI(url) rescue nil
# The setting stores domains pipe-separated.
domains = whitelist.split('|')
if uri.present? &&
uri.hostname.present? &&
domains.include?(uri.hostname) &&
# assignment as the final && operand: onebox only when a title was crawled
title = RetrieveTitle.crawl(url)
return onebox_for(url, title, opts)
end
end
@@ -41,6 +53,18 @@ class InlineOneboxer
private
# Build the onebox hash (url + emoji-normalized title) and cache it for a
# day, unless the caller asked to bypass the cache via opts[:skip_cache].
def self.onebox_for(url, title, opts)
  result = {
    url: url,
    title: Emoji.gsub_emoji_to_unicode(title)
  }
  Rails.cache.write(cache_key(url), result, expires_in: 1.day) unless opts[:skip_cache]
  result
end
# Namespaced cache key under which an inline onebox for `url` is stored.
def self.cache_key(url)
  ["inline_onebox", url].join(":")
end

70
lib/retrieve_title.rb Normal file
View File

@@ -0,0 +1,70 @@
require_dependency 'final_destination'
module RetrieveTitle
  # Raised internally to abort the Excon download once enough of the page has
  # been read to contain the <title> tag.
  class ReadEnough < StandardError; end

  # Crawl `url` and return its page title, or nil when the page could not be
  # fetched or no title was found.
  def self.crawl(url)
    extract_title(fetch_beginning(url))
  rescue StandardError
    # If there was a connection error, do nothing.
    # FIX(review): was `rescue Exception`, which also swallowed fatal errors
    # like SignalException/SystemExit/NoMemoryError; StandardError covers the
    # network/parse failures this is meant to ignore.
  end

  # Extract a title from an HTML fragment: prefer the og:title meta tag and
  # fall back to the <title> element. Returns nil when neither yields text.
  def self.extract_title(html)
    title = nil
    if doc = Nokogiri::HTML(html)
      if node = doc.at('meta[property="og:title"]')
        title = node['content']
      end
      title ||= doc.at('title')&.inner_text
    end

    if title.present?
      # Collapse newlines and runs of spaces, then trim.
      title.gsub!(/\n/, ' ')
      title.gsub!(/ +/, ' ')
      title.strip!
      return title
    end
    nil
  end

  private

  # Number of kilobytes to download before giving up on finding a title.
  # (NOTE: `private` does not affect `def self.` methods in Ruby.)
  def self.max_chunk_size(uri)
    # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
    # one host but amazon is a big one.
    return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/

    # default is 10k
    10
  end

  # Fetch the beginning of a HTML document at a url
  def self.fetch_beginning(url)
    # Never crawl in test mode
    return if Rails.env.test?

    fd = FinalDestination.new(url)
    uri = fd.resolve
    return "" unless uri

    result = ""
    streamer = lambda do |chunk, _, _|
      result << chunk

      # Using exceptions for flow control is really bad, but there really seems to
      # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
      # that matter!)
      raise ReadEnough.new if result.size > (max_chunk_size(uri) * 1024)
    end
    Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
    result
  rescue Excon::Errors::SocketError => ex
    # ReadEnough raised inside the streamer surfaces wrapped in a SocketError.
    return result if ex.socket_error.is_a?(ReadEnough)
    raise
  rescue ReadEnough
    result
  end
end