FEATURE: Nokogumbo (#9577)

* FEATURE: Nokogumbo

Use Nokogumbo HTML parser.
This commit is contained in:
Krzysztof Kotlarek
2020-05-05 13:46:57 +10:00
committed by GitHub
parent b8b1cbbfb9
commit 9bff0882c3
50 changed files with 165 additions and 179 deletions

View File

@@ -338,7 +338,7 @@ module Email
markdown, elided_markdown = if html.present?
# use the first html extracter that matches
if html_extracter = HTML_EXTRACTERS.select { |_, r| html[r] }.min_by { |_, r| html =~ r }
doc = Nokogiri::HTML.fragment(html)
doc = Nokogiri::HTML5.fragment(html)
self.public_send(:"extract_from_#{html_extracter[0]}", doc)
else
markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown

View File

@@ -15,7 +15,7 @@ module Email
# Stores the raw HTML and the options, then parses the HTML with Nokogiri.
#
# html - the HTML source to process; kept unchanged in @html.
# opts - optional settings; nil is replaced by an empty Hash.
def initialize(html, opts = nil)
@html = html
@opts = opts || {}
# NOTE(review): this listing appears to be a diff view, so the old parser call
# and its replacement are both shown; presumably only the HTML5 line exists in
# the real file — confirm. As written, the second assignment overwrites the first.
@fragment = Nokogiri::HTML.fragment(@html)
# Parsed with HTML5.parse rather than HTML5.fragment; later code in this
# listing looks up a "body" node on @fragment, which presumably relies on
# that — TODO confirm.
@fragment = Nokogiri::HTML5.parse(@html)
# Starts unset; not assigned anywhere else in the visible code.
@custom_styles = nil
end
@@ -161,7 +161,7 @@ module Email
src_uri = i["data-original-href"].present? ? URI(i["data-original-href"]) : URI(i['src'])
# If an iframe is protocol relative, use SSL when displaying it
display_src = "#{src_uri.scheme || 'https'}://#{src_uri.host}#{src_uri.path}#{src_uri.query.nil? ? '' : '?' + src_uri.query}#{src_uri.fragment.nil? ? '' : '#' + src_uri.fragment}"
i.replace "<p><a href='#{src_uri.to_s}'>#{CGI.escapeHTML(display_src)}</a><p>"
i.replace(Nokogiri::HTML5.fragment("<p><a href='#{src_uri.to_s}'>#{CGI.escapeHTML(display_src)}</a><p>"))
rescue URI::Error
# If the URL is weird, remove the iframe
i.remove
@@ -242,7 +242,11 @@ module Email
strip_classes_and_ids
replace_relative_urls
replace_secure_media_urls
@fragment.to_html
include_body? ? @fragment.at("body").to_html : @fragment.at("body").children.to_html
end
# Reports whether the original HTML input (@html) contains an opening
# <body> tag.
#
# The tag is recognised with or without attributes and in any letter case:
# "<body>", "<BODY class='x'>" and "<body\n style='...'>" all count. The
# character after the tag name must be whitespace, "/" or ">", so longer tag
# names that merely start with "body" are not matched.
#
# Returns the Integer offset of the first match (truthy), or nil when there
# is no such tag or @html is nil.
def include_body?
  @html =~ /<body[\s\/>]/i
end
def strip_avatars_and_emojis