FEATURE: add support for figure and figcaption tags in embeddings (#21276)

Many blog posts use these tags to illustrate their content, and the images inside them were previously omitted.

Additionally strip superfluous HTML and BODY tags from embed HTML.

These were incorrectly returned from the server.
This commit is contained in:
Sam 2023-04-27 19:57:06 +10:00 committed by GitHub
parent 0b479d0137
commit 2ccc5fc66e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 50 additions and 10 deletions

View File

@ -114,17 +114,27 @@ class TopicEmbed < ActiveRecord::Base
end
# Fetches the remote page at +url+ and passes its HTML to parse_html.
#
# The URL is normalized, checked for parseability, and then resolved through
# FinalDestination (URI validation on, at most 5 redirects, canonical links
# followed) before the document is read.
#
# @param url [String] address of the page to embed
# @return [Object, nil] the result of parse_html, or nil when the URL does not
#   resolve or reading it raises OpenURI::HTTPError / Net::OpenTimeout
# @raise whatever URI.parse raises for a URL it cannot parse
def self.find_remote(url)
  require "ruby-readability"

  url = UrlHelper.normalized_encode(url)
  URI.parse(url) # ensure url parses, will raise if not

  fd = FinalDestination.new(url, validate_uri: true, max_redirects: 5, follow_canonical: true)
  uri = fd.resolve
  return if uri.blank?

  begin
    html = uri.read
  rescue OpenURI::HTTPError, Net::OpenTimeout
    # Treat an HTTP error or connection timeout as "nothing to embed".
    return
  end

  # parse_html receives the normalized input URL, not the resolved one.
  parse_html(html, url)
end
def self.parse_html(html, url)
require "ruby-readability"
opts = {
tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote],
tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote figure figcaption],
attributes: %w[href src class],
remove_empty_nodes: false,
}
@ -139,11 +149,6 @@ class TopicEmbed < ActiveRecord::Base
SiteSetting.allowed_embed_classnames if SiteSetting.allowed_embed_classnames.present?
response = FetchResponse.new
begin
html = uri.read
rescue OpenURI::HTTPError, Net::OpenTimeout
return
end
raw_doc = Nokogiri.HTML5(html)
auth_element =
@ -200,7 +205,7 @@ class TopicEmbed < ActiveRecord::Base
end
end
response.body = doc.to_html
response.body = doc.at("body").children.to_html
response
end

View File

@ -23,6 +23,41 @@ RSpec.describe TopicEmbed do
expect(TopicEmbed.count).to eq(0)
end
# Checks that TopicEmbed.parse_html keeps <figure> and <figcaption> elements in
# the returned body, that the relative image src is made absolute against the
# page URL, and that the body carries no surrounding <html>/<body> wrapper.
it "Allows figure and figcaption HTML tags" do
# NOTE(review): the second <figure> in this fixture is an opening tag, not
# </figure>; the expected output below contains a nested, auto-closed figure
# to match. Confirm whether a closing tag was intended.
html = <<~HTML
<html>
<head>
<title>Some title</title>
</head>
<body>
<div class='content'>
<p>some content</p>
<figure>
<img src="/a.png">
<figcaption>Some caption</figcaption>
<figure>
</div>
</body>
</html>
HTML
# The URL argument is the base the "/a.png" src is expected to resolve against.
parsed = TopicEmbed.parse_html(html, "https://blog.discourse.com/somepost.html")
# div inception is inserted by the readability gem
expected = <<~HTML
<div><div>
<div>
<p>some content</p>
<figure>
<img src="https://blog.discourse.com/a.png">
<figcaption>Some caption</figcaption>
<figure>
</figure></figure></div>
</div></div>
HTML
# Compare with surrounding whitespace stripped from both sides.
expect(parsed.body.strip).to eq(expected.strip)
end
context "when creating a post" do
let!(:post) { TopicEmbed.import(user, url, title, contents) }
let(:topic_embed) { TopicEmbed.find_by(post: post) }