SECURITY: Strip unrendered unicode bidirectional chars in code blocks (#15032)

When rendering the markdown code blocks we replace the
offending characters in the output string with spans highlighting a textual
representation of the character, along with a title attribute with
information about why the character was highlighted.

The characters stripped by this fix, i.e. the bidirectional control
characters considered relevant, are:

U+202A
U+202B
U+202C
U+202D
U+202E
U+2066
U+2067
U+2068
U+2069
This commit is contained in:
Martin Brennan
2021-11-22 10:43:03 +10:00
committed by GitHub
parent 10a57825c8
commit fa6b87a1bf
4 changed files with 130 additions and 0 deletions

View File

@@ -5,6 +5,19 @@ require 'nokogiri'
require 'erb'
module PrettyText
# Unicode bidirectional control characters treated as dangerous:
# the contiguous code points U+202A..U+202E and U+2066..U+2069.
# Each entry is a one-character UTF-8 string.
DANGEROUS_BIDI_CHARACTERS = [*(0x202A..0x202E), *(0x2066..0x2069)]
  .map { |codepoint| codepoint.chr(Encoding::UTF_8) }
  .freeze

# Matches any single character from DANGEROUS_BIDI_CHARACTERS.
DANGEROUS_BIDI_REGEXP = Regexp.union(DANGEROUS_BIDI_CHARACTERS).freeze
# Module-level locks. NOTE(review): the code that takes them is not visible in
# this hunk — presumably they guard the shared rendering context and its
# one-time initialisation; confirm against the callers.
@mutex = Mutex.new
@ctx_init = Mutex.new
@@ -278,6 +291,7 @@ module PrettyText
add_nofollow = !options[:omit_nofollow] && SiteSetting.add_rel_nofollow_to_user_content
add_rel_attributes_to_user_content(doc, add_nofollow)
strip_hidden_unicode_bidirectional_characters(doc)
if SiteSetting.enable_mentions
add_mentions(doc, user_id: opts[:user_id])
@@ -290,6 +304,24 @@ module PrettyText
loofah_fragment.scrub!(scrubber).to_html
end
# Makes hidden Unicode bidirectional control characters visible inside
# <code> and <pre> elements. Every character matched by
# DANGEROUS_BIDI_REGEXP in such an element is replaced by a
# <span class="bidi-warning"> whose text is the literal code point
# (e.g. "<U+202E>") and whose title carries the translated explanation.
#
# @param doc [#content, #css] parsed document; its matching elements are
#   modified in place through their inner_html.
# @return [Object, nil] nil when the document contains none of the
#   characters, otherwise whatever `doc.css(...).each` returns; callers in
#   view ignore the result.
def self.strip_hidden_unicode_bidirectional_characters(doc)
  # Cheap whole-document text check first, so the common case does no DOM work.
  return if !DANGEROUS_BIDI_REGEXP.match?(doc.content)

  doc.css("code,pre").each do |code_tag|
    next if !DANGEROUS_BIDI_REGEXP.match?(code_tag.content)

    # Looked up once per affected element rather than once per character.
    warning_title = I18n.t("post.hidden_bidi_character")

    # A single pass over the markup handles every dangerous character, so the
    # element is serialised and re-parsed only once.
    code_tag.inner_html = code_tag.inner_html.gsub(DANGEROUS_BIDI_REGEXP) do |bidi|
      # The angle brackets must be entity-escaped: the string is assigned as
      # markup, and a raw "<U+202E>" would be read as a (malformed) tag
      # instead of being displayed as text.
      formatted = "&lt;U+#{bidi.ord.to_s(16).upcase}&gt;"
      "<span class=\"bidi-warning\" title=\"#{warning_title}\">#{formatted}</span>"
    end
  end
end
def self.add_rel_attributes_to_user_content(doc, add_nofollow)
allowlist = []