FIX: Handle all UTF-8 characters (#21344)

Watched words were converted to regular expressions containing \W, which treats only ASCII characters as word characters. Using [^[:word:]] instead ensures that UTF-8 characters are also handled correctly.
2025-02-25 18:55:32 -06:00 · 2023-05-15 11:45:04 +02:00
parent 23a146a7c6
commit 9a2780397f
7 changed files with 49 additions and 33 deletions
--- a/app/services/word_watcher.rb
+++ b/app/services/word_watcher.rb
@@ -44,23 +44,23 @@ class WordWatcher
    end
  end

-  def self.serializable_word_matcher_regexp(action)
-    word_matcher_regexp_list(action).map { |r| { r.source => { case_sensitive: !r.casefold? } } }
+  def self.serializable_word_matcher_regexp(action, engine: :ruby)
+    word_matcher_regexp_list(action, engine: engine).map do |r|
+      { r.source => { case_sensitive: !r.casefold? } }
+    end
  end

  # This regexp is run in miniracer, and the client JS app
  # Make sure it is compatible with major browsers when changing
  # hint: non-chrome browsers do not support 'lookbehind'
-  def self.word_matcher_regexp_list(action, raise_errors: false)
+  def self.word_matcher_regexp_list(action, engine: :ruby, raise_errors: false)
    words = get_cached_words(action)
    return [] if words.blank?

    grouped_words = { case_sensitive: [], case_insensitive: [] }

-    words.each do |w, attrs|
-      word = word_to_regexp(w)
-      word = "(#{word})" if SiteSetting.watched_words_regular_expressions?
-
+    words.each do |word, attrs|
+      word = word_to_regexp(word, whole: SiteSetting.watched_words_regular_expressions?)
      group_key = attrs[:case_sensitive] ? :case_sensitive : :case_insensitive
      grouped_words[group_key] << word
    end
@@ -68,10 +68,7 @@ class WordWatcher
    regexps = grouped_words.select { |_, w| w.present? }.transform_values { |w| w.join("|") }

    if !SiteSetting.watched_words_regular_expressions?
-      regexps.transform_values! do |regexp|
-        regexp = "(#{regexp})"
-        "(?:\\W|^)#{regexp}(?=\\W|$)"
-      end
+      regexps.transform_values! { |regexp| wrap_regexp(regexp, engine: engine) }
    end

    regexps.map { |c, regexp| Regexp.new(regexp, c == :case_sensitive ? nil : Regexp::IGNORECASE) }
@@ -80,29 +77,42 @@ class WordWatcher
    [] # Admin will be alerted via admin_dashboard_data.rb
  end

-  def self.word_matcher_regexps(action)
+  def self.word_matcher_regexps(action, engine: :ruby)
    if words = get_cached_words(action)
-      words.map { |w, opts| [word_to_regexp(w, whole: true), opts] }.to_h
+      words.map { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }.to_h
    end
  end

-  def self.word_to_regexp(word, whole: false)
+  def self.word_to_regexp(word, engine: :ruby, whole: true)
    if SiteSetting.watched_words_regular_expressions?
-      # Strip ruby regexp format if present
+      # Strip Ruby regexp format if present
      regexp = word.start_with?("(?-mix:") ? word[7..-2] : word
      regexp = "(#{regexp})" if whole
      return regexp
    end

-    regexp = Regexp.escape(word).gsub("\\*", '\S*')
+    # Escape regular expression. Avoid using Regexp.escape because it escapes
+    # more characters than it should (for example, whitespaces)
+    regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')

-    if whole && !SiteSetting.watched_words_regular_expressions?
-      regexp = "(?:\\W|^)(#{regexp})(?=\\W|$)"
-    end
+    # Handle wildcards
+    regexp = regexp.gsub("\\*", '\S*')
+
+    regexp = wrap_regexp(regexp, engine: engine) if whole

    regexp
  end

+  def self.wrap_regexp(regexp, engine: :ruby)
+    if engine == :js
+      "(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)"
+    elsif engine == :ruby
+      "(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)"
+    else
+      "(?:\\W|^)(#{regexp})(?=\\W|$)"
+    end
+  end
+
  def self.word_matcher_regexp_key(action)
    "watched-words-list:v#{CACHE_VERSION}:#{action}"
  end
@@ -212,10 +222,8 @@ class WordWatcher
  end

  def word_matches?(word, case_sensitive: false)
-    Regexp.new(
-      WordWatcher.word_to_regexp(word, whole: true),
-      case_sensitive ? nil : Regexp::IGNORECASE,
-    ).match?(@raw)
+    options = case_sensitive ? nil : Regexp::IGNORECASE
+    Regexp.new(WordWatcher.word_to_regexp(word), options).match?(@raw)
  end

  def self.replace_text_with_regexp(text, regexp, replacement)