mirror of
https://github.com/discourse/discourse.git
synced 2024-11-29 12:13:58 -06:00
cd93d1b5f7
This setting allows admin to de/activate automatic trimming of incoming email. There are instances where it does wonders in trimming all the garbage content and other instances where it's so bad that it trims the most important part of the email. FIX: don't remove hidden content using the style attribute when converting HTML to Markdown. The regexp used was doing more harm than good. It was way too broad. FIX: properly elide signatures from emails sent with Front App. This is fairly safe as Front App nicely identifies signatures in the HTML part.
363 lines
8.8 KiB
Ruby
363 lines
8.8 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require "nokogumbo"
|
|
require "securerandom"
|
|
|
|
class HtmlToMarkdown
|
|
|
|
def initialize(html, opts = {})
|
|
@opts = opts
|
|
|
|
# we're only interested in <body>
|
|
@doc = Nokogiri::HTML5(html).at("body")
|
|
|
|
remove_not_allowed!(@doc)
|
|
remove_hidden!(@doc)
|
|
hoist_line_breaks!(@doc)
|
|
remove_whitespaces!(@doc)
|
|
end
|
|
|
|
def to_markdown
|
|
traverse(@doc)
|
|
.gsub(/\n{2,}/, "\n\n")
|
|
.strip
|
|
end
|
|
|
|
private
|
|
|
|
def remove_not_allowed!(doc)
|
|
allowed = Set.new
|
|
|
|
HtmlToMarkdown.private_instance_methods.each do |m|
|
|
if tag = m.to_s[/^visit_(.+)/, 1]
|
|
allowed << tag
|
|
end
|
|
end
|
|
|
|
@doc.traverse { |node| node.remove if !allowed.include?(node.name) }
|
|
end
|
|
|
|
def remove_hidden!(doc)
|
|
@doc.css("[hidden]").remove
|
|
@doc.css("img[width]").each { |n| n.remove if n["width"].to_i <= 0 }
|
|
@doc.css("img[height]").each { |n| n.remove if n["height"].to_i <= 0 }
|
|
end
|
|
|
|
# When there's a <br> inside an inline element, split the inline element around the <br>
|
|
def hoist_line_breaks!(doc)
|
|
klass = "_" + SecureRandom.hex
|
|
doc.css("br").each { |br| br.add_class(klass) }
|
|
|
|
loop do
|
|
changed = false
|
|
|
|
doc.css("br.#{klass}").each do |br|
|
|
parent = br.parent
|
|
|
|
if parent.description.block?
|
|
br.remove_class(klass)
|
|
else
|
|
before, after = parent.children.slice_when { |n| n == br }.to_a
|
|
|
|
if before.size > 1
|
|
b = Nokogiri::XML::Node.new(parent.name, doc)
|
|
before[0...-1].each { |c| b.add_child(c) }
|
|
parent.previous = b if b.inner_html.present?
|
|
end
|
|
|
|
if after.present?
|
|
a = Nokogiri::XML::Node.new(parent.name, doc)
|
|
after.each { |c| a.add_child(c) }
|
|
parent.next = a if a.inner_html.present?
|
|
end
|
|
|
|
parent.replace(br)
|
|
|
|
changed = true
|
|
end
|
|
end
|
|
|
|
break if !changed
|
|
end
|
|
end
|
|
|
|
# Removes most of the unnecessary white spaces for better markdown conversion
|
|
# Loosely based on the CSS' White Space Processing Rules (https://www.w3.org/TR/css-text-3/#white-space-rules)
|
|
def remove_whitespaces!(node)
|
|
return true if "pre" == node.name
|
|
|
|
node.children.chunk { |n| is_inline?(n) }.each do |inline, nodes|
|
|
if inline
|
|
collapse_spaces!(nodes) && remove_trailing_space!(nodes)
|
|
else
|
|
nodes.each { |n| remove_whitespaces!(n) }
|
|
end
|
|
end
|
|
end
|
|
|
|
def is_inline?(node)
|
|
node.text? || ("br" != node.name && node.description&.inline? && node.children.all? { |n| is_inline?(n) })
|
|
end
|
|
|
|
def collapse_spaces!(nodes, was_space = true)
|
|
nodes.each do |node|
|
|
if node.text?
|
|
text = String.new
|
|
|
|
node.text.chars.each do |c|
|
|
if c[/[[:space:]]/]
|
|
text << " " if !was_space
|
|
was_space = true
|
|
else
|
|
text << c
|
|
was_space = false
|
|
end
|
|
end
|
|
|
|
node.content = text
|
|
else
|
|
node.children.each { |n| was_space = collapse_spaces!([n], was_space) }
|
|
end
|
|
end
|
|
|
|
was_space
|
|
end
|
|
|
|
def remove_trailing_space!(nodes)
|
|
last = nodes[-1]
|
|
|
|
if last.text?
|
|
last.content = last.content[0...-1] if last.content[-1] == " "
|
|
elsif last.children.present?
|
|
remove_trailing_space!(last.children)
|
|
end
|
|
end
|
|
|
|
def traverse(node)
|
|
node.children.map { |n| visit(n) }.join
|
|
end
|
|
|
|
def visit(node)
|
|
visitor = "visit_#{node.name}"
|
|
send(visitor, node) if respond_to?(visitor, true)
|
|
end
|
|
|
|
ALLOWED_IMG_SRCS ||= %w{http:// https:// www.}
|
|
|
|
def allowed_hrefs
|
|
@allowed_hrefs ||= begin
|
|
hrefs = SiteSetting.allowed_href_schemes.split("|").map { |scheme| "#{scheme}:" }.to_set
|
|
ALLOWED_IMG_SRCS.each { |src| hrefs << src }
|
|
hrefs << "mailto:"
|
|
hrefs.to_a
|
|
end
|
|
end
|
|
|
|
def visit_a(node)
|
|
if node["href"].present? && node["href"].starts_with?(*allowed_hrefs)
|
|
"[#{traverse(node)}](#{node["href"]})"
|
|
else
|
|
traverse(node)
|
|
end
|
|
end
|
|
|
|
def visit_img(node)
|
|
return if node["src"].blank?
|
|
|
|
if @opts[:keep_img_tags]
|
|
node.to_html
|
|
elsif @opts[:keep_cid_imgs] && node["src"].starts_with?("cid:")
|
|
node.to_html
|
|
elsif node["src"].starts_with?(*ALLOWED_IMG_SRCS)
|
|
title = node["alt"].presence || node["title"].presence
|
|
width = node["width"].to_i
|
|
height = node["height"].to_i
|
|
dimensions = "|#{width}x#{height}" if width > 0 && height > 0
|
|
"![#{title}#{dimensions}](#{node["src"]})"
|
|
end
|
|
end
|
|
|
|
ALLOWED ||= %w{kbd del ins small big sub sup dl dd dt mark}
|
|
ALLOWED.each do |tag|
|
|
define_method("visit_#{tag}") do |node|
|
|
"<#{tag}>#{traverse(node)}</#{tag}>"
|
|
end
|
|
end
|
|
|
|
def visit_blockquote(node)
|
|
text = traverse(node)
|
|
text.strip!
|
|
text.gsub!(/\n{2,}/, "\n\n")
|
|
text.gsub!(/^/, "> ")
|
|
"\n\n#{text}\n\n"
|
|
end
|
|
|
|
BLOCKS ||= %w{div tr}
|
|
BLOCKS.each do |tag|
|
|
define_method("visit_#{tag}") do |node|
|
|
prefix = node.previous_element&.description&.block? ? "" : "\n"
|
|
"#{prefix}#{traverse(node)}\n"
|
|
end
|
|
end
|
|
|
|
def visit_p(node)
|
|
"\n\n#{traverse(node)}\n\n"
|
|
end
|
|
|
|
TRAVERSABLES ||= %w{aside font span thead tbody tfooter u}
|
|
TRAVERSABLES.each do |tag|
|
|
define_method("visit_#{tag}") do |node|
|
|
traverse(node)
|
|
end
|
|
end
|
|
|
|
def visit_tt(node)
|
|
"`#{traverse(node)}`"
|
|
end
|
|
|
|
def visit_code(node)
|
|
node.ancestors("pre").present? ? traverse(node) : visit_tt(node)
|
|
end
|
|
|
|
def visit_pre(node)
|
|
text = traverse(node)
|
|
fence = text["`"] ? "~~~" : "```"
|
|
code = node.at("code")
|
|
code_class = code ? code["class"] : ""
|
|
lang = code_class ? code_class[/lang-(\w+)/, 1] : ""
|
|
"\n\n#{fence}#{lang}\n#{traverse(node)}\n#{fence}\n\n"
|
|
end
|
|
|
|
def visit_br(node)
|
|
"\n"
|
|
end
|
|
|
|
def visit_hr(node)
|
|
"\n\n---\n\n"
|
|
end
|
|
|
|
def visit_abbr(node)
|
|
title = node["title"].presence
|
|
title_attr = title ? %[ title="#{title}"] : ""
|
|
"<abbr#{title_attr}>#{traverse(node)}</abbr>"
|
|
end
|
|
|
|
def visit_acronym(node)
|
|
visit_abbr(node)
|
|
end
|
|
|
|
(1..6).each do |n|
|
|
define_method("visit_h#{n}") do |node|
|
|
"#{"#" * n} #{traverse(node)}"
|
|
end
|
|
end
|
|
|
|
CELLS ||= %w{th td}
|
|
CELLS.each do |tag|
|
|
define_method("visit_#{tag}") do |node|
|
|
"#{traverse(node)} "
|
|
end
|
|
end
|
|
|
|
def visit_table(node)
|
|
if rows = extract_rows(node)
|
|
headers = rows[0].css("td, th")
|
|
text = "| " + headers.map { |td| traverse(td).gsub(/\n/, "<br>") }.join(" | ") + " |\n"
|
|
text << "| " + (["-"] * headers.size).join(" | ") + " |\n"
|
|
rows[1..-1].each do |row|
|
|
text << "| " + row.css("td").map { |td| traverse(td).gsub(/\n/, "<br>") }.join(" | ") + " |\n"
|
|
end
|
|
"\n\n#{text}\n\n"
|
|
else
|
|
traverse(node)
|
|
end
|
|
end
|
|
|
|
def extract_rows(table)
|
|
return if table.ancestors("table").present?
|
|
return if (rows = table.css("tr")).empty?
|
|
headers_count = rows[0].css("td, th").size
|
|
return if rows[1..-1].any? { |row| row.css("td").size != headers_count }
|
|
rows
|
|
end
|
|
|
|
LISTS ||= %w{ul ol}
|
|
LISTS.each do |tag|
|
|
define_method("visit_#{tag}") do |node|
|
|
prefix = node.previous_element&.description&.block? ? "" : "\n"
|
|
suffix = node.ancestors("ul, ol, li").size > 0 ? "" : "\n"
|
|
"#{prefix}#{traverse(node)}#{suffix}"
|
|
end
|
|
end
|
|
|
|
def visit_li(node)
|
|
text = traverse(node)
|
|
|
|
lists = node.ancestors("ul, ol")
|
|
marker = "ol" == lists[0]&.name ? "1. " : "- "
|
|
indent = (" " * marker.size) * [1, lists.size].max
|
|
suffix = node == node.parent.elements[-1] ? "" : "\n"
|
|
|
|
text.gsub!(/\n{2,}/, "\n\n")
|
|
text.gsub!(/^(?!\s*$)/, indent)
|
|
text.lstrip!
|
|
|
|
"#{marker}#{text}#{suffix}"
|
|
end
|
|
|
|
EMPHASES ||= %w{i em}
|
|
EMPHASES.each do |tag|
|
|
define_method("visit_#{tag}") do |node|
|
|
text = traverse(node)
|
|
|
|
return "" if text.empty?
|
|
return " " if text.blank?
|
|
return "<#{tag}>#{text}</#{tag}>" if text["\n"] || (text["*"] && text["_"])
|
|
|
|
prefix = text[0][" "]
|
|
suffix = text[-1][" "] if text.size > 1
|
|
wrap = text["*"] ? "_" : "*"
|
|
|
|
"#{prefix}#{wrap}#{text.strip}#{wrap}#{suffix}"
|
|
end
|
|
end
|
|
|
|
STRONGS ||= %w{b strong}
|
|
STRONGS.each do |tag|
|
|
define_method("visit_#{tag}") do |node|
|
|
text = traverse(node)
|
|
|
|
return "" if text.empty?
|
|
return " " if text.blank?
|
|
return "<#{tag}>#{text}</#{tag}>" if text["\n"] || (text["*"] && text["_"])
|
|
|
|
prefix = text[0][" "]
|
|
suffix = text[-1][" "] if text.size > 1
|
|
wrap = text["*"] ? "__" : "**"
|
|
|
|
"#{prefix}#{wrap}#{text.strip}#{wrap}#{suffix}"
|
|
end
|
|
end
|
|
|
|
STRIKES ||= %w{s strike}
|
|
STRIKES.each do |tag|
|
|
define_method("visit_#{tag}") do |node|
|
|
text = traverse(node)
|
|
|
|
return "" if text.empty?
|
|
return " " if text.blank?
|
|
return "<#{tag}>#{text}</#{tag}>" if text["\n"] || text["~~"]
|
|
|
|
prefix = text[0][" "]
|
|
suffix = text[-1][" "] if text.size > 1
|
|
|
|
"#{prefix}~~#{text.strip}~~#{suffix}"
|
|
end
|
|
end
|
|
|
|
def visit_text(node)
|
|
node.text
|
|
end
|
|
|
|
end
|