discourse/app/services/word_watcher.rb
Régis Hanol 4cb3412a56
PERF: improve findAllMatches speed (#22083)
When we introduced unicode support in the regular expressions used in watched words (9a27803), we didn't realize how costly adding the `u` flag would be.

Turns out, it's pretty bad when you have lots of regular expressions to test. A customer had slightly less than 200 watched words, and it would freeze the browser for about 2s on the first check of those regular expressions (roughly 10ms per regular expression).

This commit introduces a new field (`word`) to the serialized watched words which is then converted to a very fast and cheap regular expression on the client-side. We use that regexp to quickly check whether a matcher is even worth trying so that we don't incur the cost of compiling the expensive unicode regexp.

This commit also busts the `WordWatcher` cache since we added a new field to be serialized.

One nice side effect of using `matchAll` instead of a `while / exec` loop is that the likelihood of a bad regexp matching infinitely is vastly reduced 🙌
2023-06-13 18:34:28 +02:00

268 lines
7.0 KiB
Ruby

# frozen_string_literal: true
# Matches text against the site's watched words and applies the matching
# action: censoring, replacing, linking, or reporting which words matched.
#
# Class methods build (and cache) the regular expressions for an action;
# instances wrap a piece of raw text and answer "does it match action X?".
class WordWatcher
  # Character substituted for every character of a censored match.
  REPLACEMENT_LETTER ||= CGI.unescape_html("■")

  # Part of the cache key (see .word_matcher_regexp_key); bump it whenever the
  # shape of the cached data changes so stale entries are no longer read.
  CACHE_VERSION ||= 3

  # @param raw [String] the text to be checked by the instance predicates
  def initialize(raw)
    @raw = raw
  end

  @cache_enabled = true

  # Makes .get_cached_words hit the database on every call.
  def self.disable_cache
    @cache_enabled = false
  end

  # @return [Boolean] whether .get_cached_words goes through Discourse.cache
  def self.cache_enabled?
    @cache_enabled
  end

  # Loads the watched words configured for +action+.
  #
  # @param action [Symbol, String] a key of WatchedWord.actions
  # @return [Hash{String => Hash}] word => attributes, where attributes holds
  #   :word (the unwrapped regexp source for the word) plus :replacement and
  #   :case_sensitive when they are not nil
  def self.words_for_action(action)
    rows =
      WatchedWord
        .where(action: WatchedWord.actions[action.to_sym])
        .limit(WatchedWord::MAX_WORDS_PER_ACTION)
        .order(:id)
        .pluck(:word, :replacement, :case_sensitive)

    rows.to_h do |word, replacement, case_sensitive|
      attrs = {
        word: word_to_regexp(word, whole: false),
        replacement: replacement,
        case_sensitive: case_sensitive,
      }
      [word, attrs.compact]
    end
  end

  # @return [Boolean] true when at least one watched word exists for +action+
  def self.words_for_action_exists?(action)
    WatchedWord.where(action: WatchedWord.actions[action.to_sym]).exists?
  end

  # Same data as .words_for_action, served from Discourse.cache (for one day)
  # when caching is enabled.
  #
  # @return [Hash, nil] nil when there are no words for +action+
  def self.get_cached_words(action)
    return words_for_action(action).presence unless cache_enabled?

    Discourse
      .cache
      .fetch(word_matcher_regexp_key(action), expires_in: 1.day) do
        words_for_action(action).presence
      end
  end

  # Serializable form of .word_matcher_regexp_list.
  #
  # @return [Array<Hash>] one `{ source => { case_sensitive: Boolean } }`
  #   entry per compiled regexp
  def self.serializable_word_matcher_regexp(action, engine: :ruby)
    word_matcher_regexp_list(action, engine: engine).map do |regexp|
      { regexp.source => { case_sensitive: !regexp.casefold? } }
    end
  end

  # This regexp is run in miniracer, and the client JS app
  # Make sure it is compatible with major browsers when changing
  # hint: non-chrome browsers do not support 'lookbehind'
  #
  # Builds at most two regexps for +action+: one joining every case-sensitive
  # word and one joining every case-insensitive word.
  #
  # @param engine [Symbol] boundary flavour passed to .wrap_regexp
  # @param raise_errors [Boolean] re-raise RegexpError instead of returning []
  # @return [Array<Regexp>]
  def self.word_matcher_regexp_list(action, engine: :ruby, raise_errors: false)
    words = get_cached_words(action)
    return [] if words.blank?

    regexp_mode = SiteSetting.watched_words_regular_expressions?

    sources_by_case = { case_sensitive: [], case_insensitive: [] }
    words.each do |word, attrs|
      bucket = attrs[:case_sensitive] ? :case_sensitive : :case_insensitive
      # In regexp mode every word becomes its own capture group; otherwise the
      # joined alternation is wrapped once below.
      sources_by_case[bucket] << word_to_regexp(word, whole: regexp_mode)
    end

    sources_by_case
      .reject { |_, sources| sources.empty? }
      .map do |bucket, sources|
        source = sources.join("|")
        source = wrap_regexp(source, engine: engine) unless regexp_mode
        Regexp.new(source, bucket == :case_sensitive ? nil : Regexp::IGNORECASE)
      end
  rescue RegexpError
    raise if raise_errors
    [] # Admin will be alerted via admin_dashboard_data.rb
  end

  # @return [Hash{String => Hash}, nil] regexp source (one per word, wrapped
  #   for +engine+) => the word's cached attributes; nil when no words exist
  def self.word_matcher_regexps(action, engine: :ruby)
    get_cached_words(action)&.to_h { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }
  end

  # Converts a watched word into regexp source.
  #
  # @param word [String]
  # @param engine [Symbol] boundary flavour passed to .wrap_regexp
  # @param whole [Boolean] when true the result carries a capture group
  #   (regexp mode) or the full boundary wrapper (plain mode)
  # @return [String]
  def self.word_to_regexp(word, engine: :ruby, whole: true)
    if SiteSetting.watched_words_regular_expressions?
      # Strip Ruby regexp format if present
      source = word.start_with?("(?-mix:") ? word[7..-2] : word
      return whole ? "(#{source})" : source
    end

    # Escape regular expression. Avoid using Regexp.escape because it escapes
    # more characters than it should (for example, whitespaces)
    escaped = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')

    # Handle wildcards
    pattern = escaped.gsub("\\*", '\S*')

    whole ? wrap_regexp(pattern, engine: engine) : pattern
  end

  # Surrounds +regexp+ with word-boundary checks: a non-capturing group that
  # consumes one leading non-word character (or matches at line start), a
  # capture group holding +regexp+, and a trailing lookahead.
  #
  # @return [String]
  def self.wrap_regexp(regexp, engine: :ruby)
    case engine
    when :js
      "(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)"
    when :ruby
      "(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)"
    else
      "(?:\\W|^)(#{regexp})(?=\\W|$)"
    end
  end

  # @return [String] cache key for +action+, versioned by CACHE_VERSION
  def self.word_matcher_regexp_key(action)
    "watched-words-list:v#{CACHE_VERSION}:#{action}"
  end

  # Censors watched words inside the text nodes of an HTML fragment.
  #
  # @param html [String]
  # @return [String] the serialized fragment, or +html+ itself when there are
  #   no censor regexps
  def self.censor(html)
    regexps = word_matcher_regexp_list(:censor)
    return html if regexps.blank?

    doc = Nokogiri::HTML5.fragment(html)
    doc.traverse do |node|
      next unless node.text?

      regexps.each { |regexp| node.content = censor_text_with_regexp(node.content, regexp) }
    end
    doc.to_s
  end

  # Censors watched words in plain text.
  def self.censor_text(text)
    return text if text.blank?

    regexps = word_matcher_regexp_list(:censor)
    return text if regexps.blank?

    regexps.inject(text) { |censored, regexp| censor_text_with_regexp(censored, regexp) }
  end

  # Applies the :replace watched words to +text+.
  def self.replace_text(text)
    return text if text.blank?
    replace(text, :replace)
  end

  # Applies the :link watched words to +text+.
  def self.replace_link(text)
    return text if text.blank?
    replace(text, :link)
  end

  # Runs censor, replace and link (in that order) over +text+.
  def self.apply_to_text(text)
    text = censor_text(text)
    text = replace_text(text)
    replace_link(text)
  end

  # Deletes the cached word list of every action.
  def self.clear_cache!
    WatchedWord.actions.each { |action, _id| Discourse.cache.delete word_matcher_regexp_key(action) }
  end

  def requires_approval?
    word_matches_for_action?(:require_approval)
  end

  def should_flag?
    word_matches_for_action?(:flag)
  end

  # @return [Array<String>, nil] every distinct blocked word found, sorted
  def should_block?
    word_matches_for_action?(:block, all_matches: true)
  end

  def should_silence?
    word_matches_for_action?(:silence)
  end

  # Checks the wrapped text against the watched words of +action+.
  #
  # @param all_matches [Boolean] collect every matched word instead of
  #   stopping at the first hit
  # @return [MatchData, nil] first match when +all_matches+ is false
  # @return [Array<String>, nil] sorted, distinct matched words when
  #   +all_matches+ is true; nil when nothing (non-empty) matched
  def word_matches_for_action?(action, all_matches: false)
    regexps = self.class.word_matcher_regexp_list(action)
    return if regexps.blank?

    unless all_matches
      regexps.each do |regexp|
        first_hit = regexp.match(@raw)
        return first_hit if first_hit
      end
      return
    end

    regexp_mode = SiteSetting.watched_words_regular_expressions?

    found =
      regexps.flat_map do |regexp|
        next [] unless regexp.match?(@raw)

        hits = @raw.scan(regexp)
        if regexp_mode
          # One capture group per word: keep whichever group actually matched.
          hits.map { |hit| hit.is_a?(Array) ? hit.find(&:present?) : hit }
        else
          hits.flatten
        end
      end

    # Drop nils *before* testing for emptiness, so a list that only held empty
    # captures yields nil rather than a (truthy) empty array.
    found = found.compact.uniq
    return if found.empty?

    found.sort
  end

  # @return [Boolean] whether the single +word+ matches the wrapped text
  def word_matches?(word, case_sensitive: false)
    options = case_sensitive ? nil : Regexp::IGNORECASE
    Regexp.new(WordWatcher.word_to_regexp(word), options).match?(@raw)
  end

  # Returns the part of +match+ that precedes capture group 1, i.e. the
  # character consumed by the boundary group of .wrap_regexp ("" when the
  # match began at a line start or group 1 did not take part in the match).
  def self.boundary_prefix(match, match_data)
    word_start = match_data.size > 1 ? match_data.begin(1) : nil
    return "" if word_start.nil?

    match[0, word_start - match_data.begin(0)]
  end
  private_class_method :boundary_prefix

  # Substitutes +replacement+ for every match of +regexp+ in +text+.
  def self.replace_text_with_regexp(text, regexp, replacement)
    keep_boundary = !SiteSetting.watched_words_regular_expressions?

    text.gsub(regexp) do |match|
      # match may be prefixed with a non-word character from the non-capturing group
      # Ensure this isn't replaced if watched words regular expression is disabled.
      # The prefix is located through the capture group offset rather than by
      # testing match[0] against \W: \W is ASCII-only, so a word starting with
      # a non-ASCII letter or a symbol would be mistaken for a boundary.
      prefix = keep_boundary ? boundary_prefix(match, Regexp.last_match) : ""
      "#{prefix}#{replacement}"
    end
  end
  private_class_method :replace_text_with_regexp

  # Overwrites every match of +regexp+ in +text+ with REPLACEMENT_LETTER,
  # leaving the leading boundary character untouched.
  def self.censor_text_with_regexp(text, regexp)
    regexp_mode = SiteSetting.watched_words_regular_expressions?

    text.gsub(regexp) do |match|
      kept =
        if regexp_mode
          # the regex captures leading whitespaces
          match.size - match.lstrip.size
        else
          # The boundary group may have consumed any non-word character
          # (punctuation, brackets, ...), not only whitespace.
          boundary_prefix(match, Regexp.last_match).size
        end

      match[0, kept] + REPLACEMENT_LETTER * (match.size - kept)
    end
  end
  private_class_method :censor_text_with_regexp

  # Applies every watched word of +watch_word_type+ to +text+ in turn.
  #
  # NOTE: a bare `private` does not affect methods defined with `def self.`,
  # so this method has always been callable as WordWatcher.replace; it is
  # deliberately left public to keep that interface.
  def self.replace(text, watch_word_type)
    word_matcher_regexps(watch_word_type)
      .to_a
      .reduce(text) do |current, (word_regexp, attrs)|
        case_flag = attrs[:case_sensitive] ? nil : Regexp::IGNORECASE
        replace_text_with_regexp(current, Regexp.new(word_regexp, case_flag), attrs[:replacement])
      end
  end
end