FIX: Handle all UTF-8 characters (#21344)

Watched words were converted to regular expressions containing \W, which
handles only ASCII characters. Using [^[:word:]] instead ensures that
UTF-8 characters are also handled correctly.
This commit is contained in:
Bianca Nenciu
2023-05-15 11:45:04 +02:00
committed by GitHub
parent 23a146a7c6
commit 9a2780397f
7 changed files with 49 additions and 33 deletions

View File

@@ -44,23 +44,23 @@ class WordWatcher
end
end
# Returns the word matcher regexps for `action` as an array of single-entry
# hashes, each mapping a regexp's source string to
# `{ case_sensitive: <true unless the regexp ignores case> }`.
#
# NOTE(review): this span looks like a diff rendered without its +/- markers.
# The first signature/body pair is presumably the pre-change version and the
# second pair (taking `engine:`) the post-change version — confirm against the
# real file before editing.
def self.serializable_word_matcher_regexp(action)
word_matcher_regexp_list(action).map { |r| { r.source => { case_sensitive: !r.casefold? } } }
def self.serializable_word_matcher_regexp(action, engine: :ruby)
# `engine` is forwarded unchanged to word_matcher_regexp_list.
word_matcher_regexp_list(action, engine: engine).map do |r|
{ r.source => { case_sensitive: !r.casefold? } }
end
end
# This regexp is run both in MiniRacer and in the client JS app.
# Make sure it stays compatible with major browsers when changing it.
# Hint: non-Chrome browsers do not support 'lookbehind'.
def self.word_matcher_regexp_list(action, raise_errors: false)
def self.word_matcher_regexp_list(action, engine: :ruby, raise_errors: false)
words = get_cached_words(action)
return [] if words.blank?
grouped_words = { case_sensitive: [], case_insensitive: [] }
words.each do |w, attrs|
word = word_to_regexp(w)
word = "(#{word})" if SiteSetting.watched_words_regular_expressions?
words.each do |word, attrs|
word = word_to_regexp(word, whole: SiteSetting.watched_words_regular_expressions?)
group_key = attrs[:case_sensitive] ? :case_sensitive : :case_insensitive
grouped_words[group_key] << word
end
@@ -68,10 +68,7 @@ class WordWatcher
regexps = grouped_words.select { |_, w| w.present? }.transform_values { |w| w.join("|") }
if !SiteSetting.watched_words_regular_expressions?
regexps.transform_values! do |regexp|
regexp = "(#{regexp})"
"(?:\\W|^)#{regexp}(?=\\W|$)"
end
regexps.transform_values! { |regexp| wrap_regexp(regexp, engine: engine) }
end
regexps.map { |c, regexp| Regexp.new(regexp, c == :case_sensitive ? nil : Regexp::IGNORECASE) }
@@ -80,29 +77,42 @@ class WordWatcher
[] # Admin will be alerted via admin_dashboard_data.rb
end
# Builds a hash that maps the regexp source generated for each cached watched
# word of `action` to that word's attributes. Returns nil when
# get_cached_words yields nothing truthy (the `if` has no else branch).
#
# NOTE(review): this span looks like a diff rendered without its +/- markers.
# Each duplicated line pair is presumably the old line followed by its
# replacement — confirm against the real file before editing.
def self.word_matcher_regexps(action)
def self.word_matcher_regexps(action, engine: :ruby)
if words = get_cached_words(action)
words.map { |w, opts| [word_to_regexp(w, whole: true), opts] }.to_h
# This variant relies on word_to_regexp's `whole:` default and forwards `engine`.
words.map { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }.to_h
end
end
# Converts a watched word into a regexp source string.
#
# When SiteSetting.watched_words_regular_expressions? is true, the word itself
# is used as the regexp (minus a leading "(?-mix:" wrapper and its closing
# character, if present) and is returned early. Otherwise the word is escaped,
# an escaped `*` is turned into the wildcard `\S*`, and the result may be
# wrapped with word-boundary groups depending on `whole`.
#
# NOTE(review): this span looks like a diff rendered without its +/- markers.
# Old and new lines are interleaved (two signatures, two escaping strategies,
# two wrapping strategies), so it is not a single runnable method as shown —
# confirm against the real file before editing.
def self.word_to_regexp(word, whole: false)
def self.word_to_regexp(word, engine: :ruby, whole: true)
if SiteSetting.watched_words_regular_expressions?
# Strip ruby regexp format if present
# Strip Ruby regexp format if present
regexp = word.start_with?("(?-mix:") ? word[7..-2] : word
# Wrap the user-supplied expression in a capture group when `whole` is set.
regexp = "(#{regexp})" if whole
return regexp
end
regexp = Regexp.escape(word).gsub("\\*", '\S*')
# Escape regular expression. Avoid using Regexp.escape because it escapes
# more characters than it should (for example, whitespaces)
regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')
if whole && !SiteSetting.watched_words_regular_expressions?
regexp = "(?:\\W|^)(#{regexp})(?=\\W|$)"
end
# Handle wildcards
regexp = regexp.gsub("\\*", '\S*')
# Boundary wrapping is delegated to wrap_regexp, which picks the pattern per engine.
regexp = wrap_regexp(regexp, engine: engine) if whole
regexp
end
# Surrounds a regexp source with word-boundary groups for the given engine.
#
# The result has the shape "(?:B|^)(regexp)(?=B|$)": a non-capturing group that
# matches a boundary character or a line start, the expression in a capture
# group, and a lookahead for a boundary character or a line end. B depends on
# `engine`:
#   :js        -> \P{L}
#   :ruby      -> [^[:word:]]
#   any other  -> \W
#
# @param regexp [String] regexp source to wrap
# @param engine [Symbol] target regexp engine (defaults to :ruby)
# @return [String] the wrapped regexp source
def self.wrap_regexp(regexp, engine: :ruby)
  boundary =
    case engine
    when :js then "\\P{L}"
    when :ruby then "[^[:word:]]"
    else "\\W"
    end

  "(?:#{boundary}|^)(#{regexp})(?=#{boundary}|$)"
end
# Builds the key string for the watched-words list of `action`, embedding
# CACHE_VERSION so that the key changes whenever that constant changes.
#
# @param action [#to_s] watched word action
# @return [String] "watched-words-list:v<CACHE_VERSION>:<action>"
def self.word_matcher_regexp_key(action)
  versioned_prefix = "watched-words-list:v#{CACHE_VERSION}"
  "#{versioned_prefix}:#{action}"
end
@@ -212,10 +222,8 @@ class WordWatcher
end
# Returns whether the regexp built from `word` (via WordWatcher.word_to_regexp)
# matches @raw. Matching ignores case unless `case_sensitive` is true.
#
# NOTE(review): this span looks like a diff rendered without its +/- markers.
# The multi-line Regexp.new(...) call is presumably the old body and the
# two lines after it the new body — confirm against the real file before
# editing.
def word_matches?(word, case_sensitive: false)
Regexp.new(
WordWatcher.word_to_regexp(word, whole: true),
case_sensitive ? nil : Regexp::IGNORECASE,
).match?(@raw)
# nil options means a case-sensitive regexp; otherwise IGNORECASE is applied.
options = case_sensitive ? nil : Regexp::IGNORECASE
Regexp.new(WordWatcher.word_to_regexp(word), options).match?(@raw)
end
def self.replace_text_with_regexp(text, regexp, replacement)