FEATURE: Whitelists for inline oneboxing

commit 2f8f2aa1dd (parent 4c7b725e19)
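
In short: a bare link in a post is inline-oneboxed only when it points at a local topic, or at an external domain listed in the new inline_onebox_domains_whitelist site setting; for whitelisted domains the page title is crawled via the new RetrieveTitle module. A rough sketch of the resulting behaviour, using the domain and stubbed title from the specs further down (illustrative only, roughly as run from a Rails console):

    # Not whitelisted: no inline onebox is produced.
    InlineOneboxer.lookup("https://eviltrout.com", skip_cache: true)
    # => blank

    # Whitelisted: the crawled page title becomes the miniature onebox title.
    SiteSetting.inline_onebox_domains_whitelist = "eviltrout.com"
    InlineOneboxer.lookup("https://eviltrout.com/some-path", skip_cache: true)
    # => { url: "https://eviltrout.com/some-path", title: "Evil Trout's Blog" }

In the spec the crawl is stubbed (RetrieveTitle.stubs(:crawl)), so no network request is made there.
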
Jobs::CrawlTopicLink

@@ -1,50 +1,11 @@
 require 'open-uri'
 require 'nokogiri'
 require 'excon'
-require 'final_destination'
+require_dependency 'retrieve_title'

 module Jobs
   class CrawlTopicLink < Jobs::Base

-    class ReadEnough < StandardError; end
-
-    def self.max_chunk_size(uri)
-      # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
-      # one host but amazon is a big one.
-      return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
-
-      # Default is 10k
-      10
-    end
-
-    # Fetch the beginning of a HTML document at a url
-    def self.fetch_beginning(url)
-      # Never crawl in test mode
-      return if Rails.env.test?
-
-      fd = FinalDestination.new(url)
-      uri = fd.resolve
-      return "" unless uri
-
-      result = ""
-      streamer = lambda do |chunk, _, _|
-        result << chunk
-
-        # Using exceptions for flow control is really bad, but there really seems to
-        # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
-        # that matter!)
-        raise ReadEnough.new if result.size > (CrawlTopicLink.max_chunk_size(uri) * 1024)
-      end
-      Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
-      result
-
-    rescue Excon::Errors::SocketError => ex
-      return result if ex.socket_error.is_a?(ReadEnough)
-      raise
-    rescue ReadEnough
-      result
-    end
-
     def execute(args)
       raise Discourse::InvalidParameters.new(:topic_link_id) unless args[:topic_link_id].present?

@@ -72,18 +33,9 @@ module Jobs

       unless crawled
         # Fetch the beginning of the document to find the title
-        result = CrawlTopicLink.fetch_beginning(topic_link.url)
-        doc = Nokogiri::HTML(result)
-        if doc
-          title = doc.at('title').try(:inner_text)
-          if title.present?
-            title.gsub!(/\n/, ' ')
-            title.gsub!(/ +/, ' ')
-            title.strip!
-            if title.present?
-              crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..254]]) == 1)
-            end
-          end
+        title = RetrieveTitle.crawl(topic_link.url)
+        if title.present?
+          crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..254]]) == 1)
         end
       end
     rescue Exception

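The streaming and parsing code removed here moves into the new RetrieveTitle module further down. One behavioural detail worth noting (a sketch, same names as the diff above): RetrieveTitle.crawl rescues connection errors and returns nil in that case, which is why the single title.present? guard in the new job code is sufficient.

    title = RetrieveTitle.crawl(topic_link.url)
    # title is nil when the crawl failed or the page had no usable <title>,
    # so the TopicLink row is only updated when a title actually came back.
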
Site setting descriptions (en locale)

@@ -979,6 +979,7 @@ en:
     show_pinned_excerpt_desktop: "Show excerpt on pinned topics in desktop view."
     post_onebox_maxlength: "Maximum length of a oneboxed Discourse post in characters."
     onebox_domains_blacklist: "A list of domains that will never be oneboxed."
+    inline_onebox_domains_whitelist: "A list of domains that will be oneboxed in miniature form if linked without a title"
     max_oneboxes_per_post: "Maximum number of oneboxes in a post."

     logo_url: "The logo image at the top left of your site, should be a wide rectangle shape. If left blank site title text will be shown."

Site settings (onebox section)

@@ -920,6 +920,9 @@ onebox:
   max_oneboxes_per_post:
     default: 50
     client: true
+  inline_onebox_domains_whitelist:
+    default: ''
+    type: list

 spam:
   add_rel_nofollow_to_user_content: true

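The new setting defaults to an empty list, so no external domains are inline-oneboxed until an admin adds some. InlineOneboxer splits the stored value on '|', so a multi-domain value would look roughly like this (the second domain is a made-up placeholder, not part of this commit):

    # e.g. from a Rails console
    SiteSetting.inline_onebox_domains_whitelist = "eviltrout.com|example.com"
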
class InlineOneboxer

@@ -1,38 +1,50 @@
+require_dependency 'retrieve_title'
+
 class InlineOneboxer

-  def initialize(urls)
+  def initialize(urls, opts=nil)
     @urls = urls
+    @opts = opts || {}
   end

   def process
-    @urls.map {|url| InlineOneboxer.lookup(url) }.compact
+    @urls.map {|url| InlineOneboxer.lookup(url, @opts) }.compact
   end

-  def self.clear_cache!
+  def self.purge(url)
+    Rails.cache.delete(cache_key(url))
   end

   def self.cache_lookup(url)
     Rails.cache.read(cache_key(url))
   end

-  def self.lookup(url)
-    cached = cache_lookup(url)
-    return cached if cached.present?
+  def self.lookup(url, opts=nil)
+    opts ||= {}
+
+    unless opts[:skip_cache]
+      cached = cache_lookup(url)
+      return cached if cached.present?
+    end

     if route = Discourse.route_for(url)
       if route[:controller] == "topics" &&
         route[:action] == "show" &&
         topic = (Topic.where(id: route[:topic_id].to_i).first rescue nil)

-        # Only public topics
-        if Guardian.new.can_see?(topic)
-          onebox = {
-            url: url,
-            title: Emoji.gsub_emoji_to_unicode(topic.title)
-          }
-          Rails.cache.write(cache_key(url), onebox, expires_in: 1.day)
-          return onebox
-        end
+        return onebox_for(url, topic.title, opts) if Guardian.new.can_see?(topic)
+      end
+    end
+
+    if whitelist = SiteSetting.inline_onebox_domains_whitelist
+      uri = URI(url) rescue nil
+
+      domains = whitelist.split('|')
+      if uri.present? &&
+        uri.hostname.present? &&
+        domains.include?(uri.hostname) &&
+        title = RetrieveTitle.crawl(url)
+        return onebox_for(url, title, opts)
       end
     end

@@ -41,6 +53,18 @@ class InlineOneboxer

   private

+    def self.onebox_for(url, title, opts)
+      onebox = {
+        url: url,
+        title: Emoji.gsub_emoji_to_unicode(title)
+      }
+      unless opts[:skip_cache]
+        Rails.cache.write(cache_key(url), onebox, expires_in: 1.day)
+      end
+
+      onebox
+    end
+
     def self.cache_key(url)
       "inline_onebox:#{url}"
     end

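Cache behaviour in the reworked class, summarised as a sketch (method names as in the diff above). The old clear_cache!, which the specs used in a global before block, is replaced by a per-URL purge:

    InlineOneboxer.lookup(topic.url)                    # reads the cache, then writes the result (1 day expiry)
    InlineOneboxer.lookup(topic.url, skip_cache: true)  # bypasses both the cache read and the cache write
    InlineOneboxer.purge(topic.url)                     # deletes the single "inline_onebox:#{url}" entry
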
lib/retrieve_title.rb (new file, 70 lines)

require_dependency 'final_destination'

module RetrieveTitle
  class ReadEnough < StandardError; end

  def self.crawl(url)
    extract_title(fetch_beginning(url))
  rescue Exception
    # If there was a connection error, do nothing
  end

  def self.extract_title(html)
    title = nil
    if doc = Nokogiri::HTML(html)

      if node = doc.at('meta[property="og:title"]')
        title = node['content']
      end

      title ||= doc.at('title')&.inner_text
    end

    if title.present?
      title.gsub!(/\n/, ' ')
      title.gsub!(/ +/, ' ')
      title.strip!
      return title
    end
    nil
  end

  private

    def self.max_chunk_size(uri)
      # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
      # one host but amazon is a big one.
      return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/

      # default is 10k
      10
    end

    # Fetch the beginning of a HTML document at a url
    def self.fetch_beginning(url)
      # Never crawl in test mode
      return if Rails.env.test?

      fd = FinalDestination.new(url)
      uri = fd.resolve
      return "" unless uri

      result = ""
      streamer = lambda do |chunk, _, _|
        result << chunk

        # Using exceptions for flow control is really bad, but there really seems to
        # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
        # that matter!)
        raise ReadEnough.new if result.size > (max_chunk_size(uri) * 1024)
      end
      Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
      result

    rescue Excon::Errors::SocketError => ex
      return result if ex.socket_error.is_a?(ReadEnough)
      raise
    rescue ReadEnough
      result
    end
end

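extract_title prefers an og:title meta tag and falls back to the <title> element; max_chunk_size is expressed in kilobytes (it is multiplied by 1024 in fetch_beginning), so crawling reads roughly the first 10 KB of a page, or 80 KB for Amazon hosts. A couple of illustrative calls, mirroring the new spec below:

    RetrieveTitle.extract_title("<html><title>My Cool Title</title></html>")
    # => "My Cool Title"

    RetrieveTitle.extract_title(
      '<html><title>Bad Title</title><meta property="og:title" content="Good Title" /></html>'
    )
    # => "Good Title"
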
InlineOneboxer spec

@@ -3,17 +3,13 @@ require_dependency 'inline_oneboxer'

 describe InlineOneboxer do

-  before do
-    InlineOneboxer.clear_cache!
-  end
-
   it "should return nothing with empty input" do
     expect(InlineOneboxer.new([]).process).to be_blank
   end

   it "can onebox a topic" do
     topic = Fabricate(:topic)
-    results = InlineOneboxer.new([topic.url]).process
+    results = InlineOneboxer.new([topic.url], skip_cache: true).process
     expect(results).to be_present
     expect(results[0][:url]).to eq(topic.url)
     expect(results[0][:title]).to eq(topic.title)

@@ -21,13 +17,18 @@ describe InlineOneboxer do

   it "doesn't onebox private messages" do
     topic = Fabricate(:private_message_topic)
-    results = InlineOneboxer.new([topic.url]).process
+    results = InlineOneboxer.new([topic.url], skip_cache: true).process
     expect(results).to be_blank
   end

   context "caching" do
+    let(:topic) { Fabricate(:topic) }
+
+    before do
+      InlineOneboxer.purge(topic.url)
+    end
+
     it "puts an entry in the cache" do
-      topic = Fabricate(:topic)
       expect(InlineOneboxer.cache_lookup(topic.url)).to be_blank

       result = InlineOneboxer.lookup(topic.url)

@@ -43,7 +44,7 @@ describe InlineOneboxer do
   context ".lookup" do
     it "can lookup one link at a time" do
       topic = Fabricate(:topic)
-      onebox = InlineOneboxer.lookup(topic.url)
+      onebox = InlineOneboxer.lookup(topic.url, skip_cache: true)
       expect(onebox).to be_present
       expect(onebox[:url]).to eq(topic.url)
       expect(onebox[:title]).to eq(topic.title)

@@ -56,12 +57,30 @@ describe InlineOneboxer do

     it "will return the fancy title" do
       topic = Fabricate(:topic, title: "Hello :pizza: with an emoji")
-      onebox = InlineOneboxer.lookup(topic.url)
+      onebox = InlineOneboxer.lookup(topic.url, skip_cache: true)
       expect(onebox).to be_present
       expect(onebox[:url]).to eq(topic.url)
       expect(onebox[:title]).to eq("Hello 🍕 with an emoji")
     end

+    it "will not crawl domains that aren't whitelisted" do
+      onebox = InlineOneboxer.lookup("https://eviltrout.com", skip_cache: true)
+      expect(onebox).to be_blank
+    end
+
+    it "will lookup whitelisted domains" do
+      SiteSetting.inline_onebox_domains_whitelist = "eviltrout.com"
+      RetrieveTitle.stubs(:crawl).returns("Evil Trout's Blog")
+
+      onebox = InlineOneboxer.lookup(
+        "https://eviltrout.com/some-path",
+        skip_cache: true
+      )
+      expect(onebox).to be_present
+      expect(onebox[:url]).to eq("https://eviltrout.com/some-path")
+      expect(onebox[:title]).to eq("Evil Trout's Blog")
+    end
+
   end

spec/components/retrieve_title_spec.rb (new file, 40 lines)

require 'rails_helper'
require_dependency 'retrieve_title'

describe RetrieveTitle do

  context "extract_title" do

    it "will extract the value from the title tag" do
      title = RetrieveTitle.extract_title(
        "<html><title>My Cool Title</title></html>"
      )

      expect(title).to eq("My Cool Title")
    end

    it "will strip whitespace" do
      title = RetrieveTitle.extract_title(
        "<html><title> Another Title\n\n </title></html>"
      )

      expect(title).to eq("Another Title")
    end

    it "will prefer the title from an opengraph tag" do
      title = RetrieveTitle.extract_title(<<~HTML
        <html>
          <title>Bad Title</title>
          <meta property="og:title" content="Good Title" />
        </html>
      HTML
      )

      expect(title).to eq("Good Title")
    end

  end

end