mirror of
https://github.com/discourse/discourse.git
synced 2024-11-29 04:03:57 -06:00
2ccc5fc66e
Many blog posts use these to illustrate, and images were previously omitted. Additionally, strip superfluous HTML and BODY tags from embed HTML. This was incorrectly returned from the server.
318 lines
8.8 KiB
Ruby
318 lines
8.8 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
class TopicEmbed < ActiveRecord::Base
|
|
include Trashable

belongs_to :topic
belongs_to :post

# An embed is identified by its URL: it must be present and unique.
validates_presence_of :embed_url
validates_uniqueness_of :embed_url

# When a new embed is about to be validated, look for a soft-deleted embed
# (deleted_at set) with the same embed_url and destroy it for good.
# NOTE(review): presumably this frees the URL for the uniqueness check and
# the unique index on embed_url -- confirm against Trashable's scoping.
before_validation(on: :create) do
  trashed_embed =
    TopicEmbed.with_deleted.where("deleted_at IS NOT NULL AND embed_url = ?", embed_url).first
  trashed_embed.destroy! unless trashed_embed.nil?
end
|
|
|
|
# Simple value holder for the outcome of fetching and parsing a remote page.
# Every attribute starts out as nil and is filled in by the parser.
class FetchResponse
  # Page title (String).
  attr_accessor :title
  # Extracted article markup (HTML String).
  attr_accessor :body
  # Author resolved from the page's meta tags, or nil when none matched.
  attr_accessor :author
end
|
|
|
|
# Produce the canonical form of an embed URL used for storage and lookup.
#
# Steps, in order:
#   1. lower-case the whole URL
#   2. drop a single trailing "/" (only if it is the very last character)
#   3. collapse the FIRST run of consecutive hyphens into one "-"
#   4. strip leading/trailing whitespace
#
# Because stripping happens last, a trailing slash that is followed by
# whitespace is not removed.
#
# @param url [String]
# @return [String]
def self.normalize_url(url)
  lowered = url.downcase
  without_trailing_slash = lowered.sub(%r{/\z}, "")
  hyphens_collapsed = without_trailing_slash.sub(/\-+/, "-")
  hyphens_collapsed.strip
end
|
|
|
|
# Build the HTML footer appended to imported content: a horizontal rule
# followed by a <small> element holding the translated
# "embed.imported_from" text, which links back to the source URL.
# The translation is always rendered in the site's default locale.
#
# @param url [String] source URL; it is passed through
#   UrlHelper.normalized_encode before being placed in the markup
# @return [String] HTML snippet
def self.imported_from_html(url)
  encoded_url = UrlHelper.normalized_encode(url)
  source_link = "<a href='#{encoded_url}'>#{encoded_url}</a>"

  I18n.with_locale(SiteSetting.default_locale) do
    attribution = I18n.t("embed.imported_from", link: source_link)
    "\n<hr>\n<small>#{attribution}</small>\n"
  end
end
|
|
|
|
# Import an article from a source (RSS/Atom/Other)
#
# If no embed exists for the normalized URL, a topic, its first post and the
# TopicEmbed record are created inside one transaction. Otherwise the
# existing post is re-owned by `user` when the owner differs, and revised
# when the content hash or the title changed.
#
# @param user the account that will own the post
# @param url [String] source URL; anything not starting with http:// or
#   https:// is ignored
# @param title [String, nil] topic title
# @param contents [String, nil] article HTML (nil is treated as "")
# @param category_id optional category; falls back to the category of the
#   matching EmbeddableHost
# @param cook_method optional cook method; when nil it is chosen from
#   SiteSetting.embed_support_markdown, and truncation may be applied
# @param tags optional tags, only used when tagging is enabled
# @return the created or existing post; nil when the URL is rejected
def self.import(user, url, title, contents, category_id: nil, cook_method: nil, tags: nil)
  return unless url =~ %r{\Ahttps?\://}

  contents = first_paragraph_from(contents) if SiteSetting.embed_truncate && cook_method.nil?
  contents ||= ""
  # dup so the caller's string is not mutated by the append
  contents = contents.dup << imported_from_html(url)

  url = normalize_url(url)

  embed = TopicEmbed.find_by("lower(embed_url) = ?", url)
  content_sha1 = Digest::SHA1.hexdigest(contents)
  post = nil

  if embed.blank?
    # If there is no embed, create a topic, post and the embed.
    Topic.transaction do
      eh = EmbeddableHost.record_for_url(url)

      cook_method ||=
        if SiteSetting.embed_support_markdown
          Post.cook_methods[:regular]
        else
          Post.cook_methods[:raw_html]
        end

      create_args = {
        title: title,
        raw: absolutize_urls(url, contents),
        skip_validations: true,
        cook_method: cook_method,
        category: category_id || eh.try(:category_id),
        tags: SiteSetting.tagging_enabled ? tags : nil,
      }
      create_args[:visible] = false if SiteSetting.embed_unlisted?

      post = PostCreator.new(user, create_args).create
      if post.present?
        TopicEmbed.create!(
          topic_id: post.topic_id,
          embed_url: url,
          content_sha1: content_sha1,
          post_id: post.id,
        )
      end
    end
  else
    # The original also called absolutize_urls here and threw the result
    # away; the absolutized markup is now only built when a revision is
    # actually needed (below).
    post = embed.post

    # Update the topic if it changed
    if post&.topic
      if post.user != user
        PostOwnerChanger.new(
          post_ids: [post.id],
          topic_id: post.topic_id,
          new_owner: user,
          acting_user: Discourse.system_user,
        ).change_owner!

        # make sure the post returned has the right author
        post.reload
      end

      if (content_sha1 != embed.content_sha1) || (title && title != post.topic&.title)
        changes = { raw: absolutize_urls(url, contents) }
        changes[:title] = title if title.present?

        post.revise(user, changes, skip_validations: true, bypass_rate_limiter: true)
        embed.update!(content_sha1: content_sha1)
      end
    end
  end

  post
end
|
|
|
|
# Fetch a remote page and parse it into a FetchResponse.
#
# The URL is encoded, checked with URI.parse (which raises on an unparsable
# URL), then resolved through FinalDestination (up to 5 redirects, following
# canonical links) before the document is read.
#
# @param url [String]
# @return [FetchResponse, nil] nil when the URL does not resolve or reading
#   it raises OpenURI::HTTPError / Net::OpenTimeout
def self.find_remote(url)
  encoded_url = UrlHelper.normalized_encode(url)
  URI.parse(encoded_url) # ensure url parses, will raise if not

  resolver =
    FinalDestination.new(encoded_url, validate_uri: true, max_redirects: 5, follow_canonical: true)
  resolved_uri = resolver.resolve
  return if resolved_uri.blank?

  page_html =
    begin
      resolved_uri.read
    rescue OpenURI::HTTPError, Net::OpenTimeout
      return
    end

  # Relative links are resolved against the requested (encoded) URL.
  parse_html(page_html, encoded_url)
end
|
|
|
|
# Extract title, author and readable body from a page's HTML.
#
# - author: looked up from <meta name="discourse-username"> or, failing
#   that, <meta name="author">
# - title:  the document title, stripped, with the first match of
#   SiteSetting.embed_title_scrubber removed when that setting is present
# - body:   the Readability extract with img/script/a URLs made absolute
#   against `url` and class attributes reduced to the allowlist
#
# @param html [String] raw page markup
# @param url [String] URL the page was fetched from (base for relative links)
# @return [FetchResponse]
def self.parse_html(html, url)
  require "ruby-readability" # loaded lazily, only when a page is parsed

  opts = {
    tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote figure figcaption],
    attributes: %w[href src class],
    remove_empty_nodes: false,
  }

  opts[:whitelist] = SiteSetting.allowed_embed_selectors if SiteSetting.allowed_embed_selectors.present?
  opts[:blacklist] = SiteSetting.blocked_embed_selectors if SiteSetting.blocked_embed_selectors.present?
  allowed_embed_classnames =
    SiteSetting.allowed_embed_classnames if SiteSetting.allowed_embed_classnames.present?

  response = FetchResponse.new

  raw_doc = Nokogiri.HTML5(html)
  auth_element =
    raw_doc.at('meta[@name="discourse-username"]') || raw_doc.at('meta[@name="author"]')
  if auth_element.present?
    # A meta tag without a content attribute yields nil; treat it as "no
    # author" instead of raising NoMethodError on nil.strip.
    username = auth_element[:content].to_s.strip
    response.author = User.where(username_lower: username).first unless username.empty?
  end

  read_doc = Readability::Document.new(html, opts)

  title = +(raw_doc.title || "")
  title.strip!

  if SiteSetting.embed_title_scrubber.present?
    title.sub!(Regexp.new(SiteSetting.embed_title_scrubber), "")
    title.strip!
  end
  response.title = title
  doc = Nokogiri.HTML5(read_doc.content)

  tags = { "img" => "src", "script" => "src", "a" => "href" }
  doc
    .search(tags.keys.join(","))
    .each do |node|
      url_param = tags[node.name]
      src = node[url_param]
      next if src.nil? || src.empty?

      begin
        # convert URL to absolute form
        node[url_param] = URI.join(url, UrlHelper.normalized_encode(src)).to_s
      rescue URI::Error, Addressable::URI::InvalidURIError
        # If there is a mistyped URL, just do nothing
      end
    end

  # only allow classes in the allowlist
  # This pass runs exactly once, outside the link loop above. Nested inside
  # it, the pass was skipped for documents without img/script/a elements and
  # repeated once per element otherwise.
  allowed_classes =
    if allowed_embed_classnames.blank?
      []
    else
      allowed_embed_classnames.split(/[ ,]+/i)
    end
  doc
    .search('[class]:not([class=""])')
    .each do |classnode|
      classes = classnode[:class].split(" ").select { |classname| allowed_classes.include?(classname) }
      if classes.empty?
        classnode.delete("class")
      else
        classnode[:class] = classes.join(" ")
      end
    end

  response.body = doc.at("body").children.to_html
  response
end
|
|
|
|
# Fetch a remote page and import it as a topic.
#
# @param url [String] page to fetch
# @param opts [Hash, nil] optional overrides:
#   :title - replaces the fetched title when present
#   :user  - importing user, used unless the page itself names an author
# @return the result of TopicEmbed.import, or nil when the page could not
#   be fetched
def self.import_remote(url, opts = nil)
  opts ||= {}

  response = find_remote(url)
  return if response.nil?

  response.title = opts[:title] if opts[:title].present?

  # The author found on the page wins over the user passed by the caller.
  fallback_user = opts[:user].present? ? opts[:user] : nil
  import_user = response.author.present? ? response.author : fallback_user

  TopicEmbed.import(import_user, url, response.title, response.body)
end
|
|
|
|
# Convert any relative URLs to absolute. RSS is annoying for this.
#
# Link (a[href]) and image (img[src]) targets are joined onto the
# scheme://host[:port] of the normalized page URL; the port is spelled out
# unless it is 80 or 443. Targets that fail to parse are left untouched.
#
# @param url [String] URL of the page the contents came from
# @param contents [String] HTML fragment
# @return [String] the rewritten HTML, or `contents` unchanged when the
#   page URL itself cannot be parsed
def self.absolutize_urls(url, contents)
  normalized = normalize_url(url)
  base_uri =
    begin
      URI(UrlHelper.normalized_encode(normalized))
    rescue URI::Error
      return contents
    end

  origin = "#{base_uri.scheme}://#{base_uri.host}"
  origin += ":#{base_uri.port}" unless [80, 443].include?(base_uri.port)

  # Wrap in a div so the rewritten markup can be read back via inner_html.
  fragment = Nokogiri::HTML5.fragment("<div>#{contents}</div>")

  # Anchors are processed first, then images.
  { "a" => "href", "img" => "src" }.each do |tag_name, attribute|
    fragment
      .css(tag_name)
      .each do |node|
        next unless node[attribute].present?

        begin
          node[attribute] = URI.join(origin, node[attribute]).to_s
        rescue URI::InvalidURIError
          # NOOP, URL is malformed
        end
      end
  end

  fragment.at("div").inner_html
end
|
|
|
|
# Find the topic id recorded for an embed URL, ignoring whether it was
# stored with http or https.
#
# The URL is normalized, its scheme is removed, and the remainder is
# regex-escaped and matched case-insensitively (`~*`) against embed_url
# with either scheme in front.
#
# @param embed_url [String]
# @return the matching topic_id, or nil when no embed matches
def self.topic_id_for_embed(embed_url)
  schemeless = normalize_url(embed_url).sub(%r{\Ahttps?\://}, "")
  pattern = "^https?://#{Regexp.escape(schemeless)}$"
  TopicEmbed.where("embed_url ~* ?", pattern).pick(:topic_id)
end
|
|
|
|
# Build a short excerpt from the leading paragraphs of an HTML document.
#
# Non-blank <p> elements are appended in document order until the collected
# markup is at least 100 characters long. When no paragraph has text, the
# markup of the first <div> is returned instead (an empty string if there
# is none).
#
# @param html [String]
# @return [String] HTML excerpt
def self.first_paragraph_from(html)
  document = Nokogiri.HTML5(html)
  excerpt = +""

  document
    .css("p")
    .each do |paragraph|
      next unless paragraph.text.present?

      excerpt << paragraph.to_s
      return excerpt if excerpt.size >= 100
    end

  if excerpt.blank?
    # If there is no first paragraph, return the first div (onebox)
    document.css("div").first.to_s
  else
    excerpt
  end
end
|
|
|
|
# Return the full remote article for an embedded topic, followed by the
# "imported from" footer. The result is cached per topic for 10 minutes.
#
# NOTE(review): find_remote can return nil, in which case `.body` raises
# inside the cache block -- confirm that callers rescue this.
#
# @param post the post whose topic_id identifies the embed
# @return [String] HTML
def self.expanded_for(post)
  cache_key = "embed-topic:#{post.topic_id}"

  Discourse.cache.fetch(cache_key, expires_in: 10.minutes) do
    source_url = TopicEmbed.where(topic_id: post.topic_id).pick(:embed_url)
    remote = TopicEmbed.find_remote(source_url)

    # String#<< appends in place and returns the body, which becomes the
    # cached value.
    remote.body << TopicEmbed.imported_from_html(source_url)
  end
end
|
|
end
|
|
|
|
# == Schema Information
|
|
#
|
|
# Table name: topic_embeds
|
|
#
|
|
# id :integer not null, primary key
|
|
# topic_id :integer not null
|
|
# post_id :integer not null
|
|
# embed_url :string(1000) not null
|
|
# content_sha1 :string(40)
|
|
# created_at :datetime not null
|
|
# updated_at :datetime not null
|
|
# deleted_at :datetime
|
|
# deleted_by_id :integer
|
|
#
|
|
# Indexes
|
|
#
|
|
# index_topic_embeds_on_embed_url (embed_url) UNIQUE
|
|
#
|