From 39e0442de98e8df86bb597909375bee27cac2829 Mon Sep 17 00:00:00 2001
From: David Taylor <david@taylorhq.com>
Date: Wed, 31 Jul 2019 18:33:49 +0100
Subject: [PATCH] FIX: Various watched words improvements

- Client-side censoring fixed for non-chrome browsers. (Regular expression rewritten to avoid lookback)
- Regex generation is now done on the server, to reduce repeated logic, and make it easier to extend in plugins
- Censor tests are moved to ruby, to ensure everything works end-to-end
- If "watched words regular expressions" is enabled, warn the admin when the generated regex is invalid
---
 .../javascripts/discourse/lib/text.js.es6     |  2 +-
 .../javascripts/discourse/models/topic.js.es6 |  2 +-
 .../pretty-text/censored-words.js.es6         | 86 +++---------------
 .../discourse-markdown/censored.js.es6        | 10 +-
 .../pretty-text/pretty-text.js.es6            |  4 +-
 app/models/admin_dashboard_data.rb            | 13 ++-
 app/serializers/site_serializer.rb            |  6 +-
 app/services/word_watcher.rb                  | 10 +-
 config/locales/server.en.yml                  |  1 +
 lib/pretty_text.rb                            |  2 +-
 spec/components/pretty_text_spec.rb           | 91 +++++++++++++++++--
 spec/services/word_watcher_spec.rb            |  2 +-
 test/javascripts/lib/pretty-text-test.js.es6  | 83 +----------------
 13 files changed, 134 insertions(+), 178 deletions(-)

diff --git a/app/assets/javascripts/discourse/lib/text.js.es6 b/app/assets/javascripts/discourse/lib/text.js.es6
index ee51cb53d10..cb02036cede 100644
--- a/app/assets/javascripts/discourse/lib/text.js.es6
+++ b/app/assets/javascripts/discourse/lib/text.js.es6
@@ -13,7 +13,7 @@ function getOpts(opts) {
     {
       getURL: Discourse.getURLWithCDN,
       currentUser: Discourse.__container__.lookup("current-user:main"),
-      censoredWords: site.censored_words,
+      censoredRegexp: site.censored_regexp,
       siteSettings,
       formatUsername
     },
diff --git a/app/assets/javascripts/discourse/models/topic.js.es6 b/app/assets/javascripts/discourse/models/topic.js.es6
index ca2c4350b71..d44afe930ec 100644
--- a/app/assets/javascripts/discourse/models/topic.js.es6
+++ b/app/assets/javascripts/discourse/models/topic.js.es6
@@ -97,7 +97,7 @@ const Topic = RestModel.extend({
   fancyTitle(title) {
     let fancyTitle = censor(
       emojiUnescape(title || ""),
-      Discourse.Site.currentProp("censored_words")
+      Discourse.Site.currentProp("censored_regexp")
     );
 
     if (Discourse.SiteSettings.support_mixed_text_direction) {
diff --git a/app/assets/javascripts/pretty-text/censored-words.js.es6 b/app/assets/javascripts/pretty-text/censored-words.js.es6
index 32c21dc39b6..4453e3f8949 100644
--- a/app/assets/javascripts/pretty-text/censored-words.js.es6
+++ b/app/assets/javascripts/pretty-text/censored-words.js.es6
@@ -1,75 +1,19 @@
-function escapeRegexp(text) {
-  return text.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&").replace(/\*/g, "S*");
-}
+export function censorFn(regexpString, replacementLetter) {
+  if (regexpString) {
+    let censorRegexp = new RegExp(regexpString, "ig");
+    replacementLetter = replacementLetter || "&#9632;";
 
-function createCensorRegexp(patterns) {
-  return new RegExp(`((?<!\\w)(?:${patterns.join("|")}))(?!\\w)`, "ig");
-}
-
-export function censorFn(
-  censoredWords,
-  replacementLetter,
-  watchedWordsRegularExpressions
-) {
-  let patterns = [];
-
-  replacementLetter = replacementLetter || "&#9632;";
-
-  if (censoredWords && censoredWords.length) {
-    patterns = censoredWords.split("|");
-    if (!watchedWordsRegularExpressions) {
-      patterns = patterns.map(t => `(${escapeRegexp(t)})`);
-    }
-  }
-
-  if (patterns.length) {
-    let censorRegexp;
-
-    try {
-      if (watchedWordsRegularExpressions) {
-        censorRegexp = new RegExp(
-          "((?:" + patterns.join("|") + "))(?![^\\(]*\\))",
-          "ig"
+    return function(text) {
+      text = text.replace(censorRegexp, (fullMatch, ...groupMatches) => {
+        const stringMatch = groupMatches.find(g => typeof g === "string");
+        return fullMatch.replace(
+          stringMatch,
+          new Array(stringMatch.length + 1).join(replacementLetter)
         );
-      } else {
-        censorRegexp = createCensorRegexp(patterns);
-      }
+      });
 
-      if (censorRegexp) {
-        return function(text) {
-          let original = text;
-
-          try {
-            let m = censorRegexp.exec(text);
-            const fourCharReplacement = new Array(5).join(replacementLetter);
-
-            while (m && m[0]) {
-              if (m[0].length > original.length) {
-                return original;
-              } // regex is dangerous
-              if (watchedWordsRegularExpressions) {
-                text = text.replace(censorRegexp, fourCharReplacement);
-              } else {
-                const replacement = new Array(m[0].length + 1).join(
-                  replacementLetter
-                );
-                text = text.replace(
-                  createCensorRegexp([escapeRegexp(m[0])]),
-                  replacement
-                );
-              }
-              m = censorRegexp.exec(text);
-            }
-
-            return text;
-          } catch (e) {
-            return original;
-          }
-        };
-      }
-    } catch (e) {
-      // fall through
-    }
+      return text;
+    };
   }
 
   return function(t) {
@@ -77,6 +21,6 @@ export function censorFn(
   };
 }
 
-export function censor(text, censoredWords, replacementLetter) {
-  return censorFn(censoredWords, replacementLetter)(text);
+export function censor(text, censoredRegexp, replacementLetter) {
+  return censorFn(censoredRegexp, replacementLetter)(text);
 }
diff --git a/app/assets/javascripts/pretty-text/engines/discourse-markdown/censored.js.es6 b/app/assets/javascripts/pretty-text/engines/discourse-markdown/censored.js.es6
index e44c036a456..6273a22a61c 100644
--- a/app/assets/javascripts/pretty-text/engines/discourse-markdown/censored.js.es6
+++ b/app/assets/javascripts/pretty-text/engines/discourse-markdown/censored.js.es6
@@ -29,15 +29,11 @@ export function setup(helper) {
   });
 
   helper.registerPlugin(md => {
-    const words = md.options.discourse.censoredWords;
+    const censoredRegexp = md.options.discourse.censoredRegexp;
 
-    if (words && words.length > 0) {
+    if (censoredRegexp) {
       const replacement = String.fromCharCode(9632);
-      const censor = censorFn(
-        words,
-        replacement,
-        md.options.discourse.watchedWordsRegularExpressions
-      );
+      const censor = censorFn(censoredRegexp, replacement);
       md.core.ruler.push("censored", state => censorTree(state, censor));
     }
   });
diff --git a/app/assets/javascripts/pretty-text/pretty-text.js.es6 b/app/assets/javascripts/pretty-text/pretty-text.js.es6
index e656c9a7d75..7a867d89c96 100644
--- a/app/assets/javascripts/pretty-text/pretty-text.js.es6
+++ b/app/assets/javascripts/pretty-text/pretty-text.js.es6
@@ -29,7 +29,7 @@ export function buildOptions(state) {
     lookupUploadUrls,
     previewing,
     linkify,
-    censoredWords,
+    censoredRegexp,
     disableEmojis
   } = state;
 
@@ -67,7 +67,7 @@ export function buildOptions(state) {
     formatUsername,
     emojiUnicodeReplacer,
     lookupUploadUrls,
-    censoredWords,
+    censoredRegexp,
     allowedHrefSchemes: siteSettings.allowed_href_schemes
       ? siteSettings.allowed_href_schemes.split("|")
       : null,
diff --git a/app/models/admin_dashboard_data.rb b/app/models/admin_dashboard_data.rb
index 0d260816280..c558954c3d0 100644
--- a/app/models/admin_dashboard_data.rb
+++ b/app/models/admin_dashboard_data.rb
@@ -96,7 +96,7 @@ class AdminDashboardData
                       :image_magick_check, :failing_emails_check,
                       :subfolder_ends_in_slash_check,
                       :pop3_polling_configuration, :email_polling_errored_recently,
-                      :out_of_date_themes, :unreachable_themes
+                      :out_of_date_themes, :unreachable_themes, :watched_words_check
 
     add_problem_check do
       sidekiq_check || queue_size_check
@@ -224,6 +224,17 @@ class AdminDashboardData
     I18n.t('dashboard.force_https_warning', base_path: Discourse.base_path) unless SiteSetting.force_https
   end
 
+  def watched_words_check
+    WatchedWord.actions.keys.each do |action|
+      begin
+        WordWatcher.word_matcher_regexp(action, raise_errors: true)
+      rescue RegexpError => e
+        return I18n.t('dashboard.watched_word_regexp_error', base_path: Discourse.base_path, action: action)
+      end
+    end
+    nil
+  end
+
   def out_of_date_themes
     old_themes = RemoteTheme.out_of_date_themes
     return unless old_themes.present?
diff --git a/app/serializers/site_serializer.rb b/app/serializers/site_serializer.rb
index 1b08914efde..13859d7503c 100644
--- a/app/serializers/site_serializer.rb
+++ b/app/serializers/site_serializer.rb
@@ -30,7 +30,7 @@ class SiteSerializer < ApplicationSerializer
     :wizard_required,
     :topic_featured_link_allowed_category_ids,
     :user_themes,
-    :censored_words,
+    :censored_regexp,
     :shared_drafts_category_id
   )
 
@@ -156,8 +156,8 @@ class SiteSerializer < ApplicationSerializer
     scope.topic_featured_link_allowed_category_ids
   end
 
-  def censored_words
-    WordWatcher.words_for_action(:censor).join('|')
+  def censored_regexp
+    WordWatcher.word_matcher_regexp(:censor)&.source
   end
 
   def shared_drafts_category_id
diff --git a/app/services/word_watcher.rb b/app/services/word_watcher.rb
index 1e12f860029..0a1cad9bae6 100644
--- a/app/services/word_watcher.rb
+++ b/app/services/word_watcher.rb
@@ -20,7 +20,10 @@ class WordWatcher
     end
   end
 
-  def self.word_matcher_regexp(action)
+  # This regexp is run in miniracer, and the client JS app
+  # Make sure it is compatible with major browsers when changing
+  # hint: non-chrome browsers do not support 'lookbehind'
+  def self.word_matcher_regexp(action, raise_errors: false)
     words = get_cached_words(action)
     if words
       words = words.map do |w|
@@ -31,10 +34,13 @@ class WordWatcher
       regexp = words.join('|')
       if !SiteSetting.watched_words_regular_expressions?
         regexp = "(#{regexp})"
-        regexp = "(?<!\\w)(#{regexp})(?!\\w)"
+        regexp = "(?:\\W|^)#{regexp}(?=\\W|$)"
       end
       Regexp.new(regexp, Regexp::IGNORECASE)
     end
+  rescue RegexpError => e
+    raise if raise_errors
+    nil # Admin will be alerted via admin_dashboard_data.rb
   end
 
   def self.word_to_regexp(word)
diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index 7ba365e9584..f1811039d2f 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -1304,6 +1304,7 @@ en:
     force_https_warning: "Your website is using SSL. But `<a href='%{base_path}/admin/site_settings/category/all_results?filter=force_https'>force_https</a>` is not yet enabled in your site settings."
     out_of_date_themes: "Updates are available for the following themes:"
     unreachable_themes: "We were unable to check for updates on the following themes:"
+    watched_word_regexp_error: "The regular expression for %{action} watched words is invalid. Please check your <a href='%{base_path}/admin/logs/watched_words'>Watched Word settings</a>, or disable the 'watched words regular expressions' site setting."
 
   site_settings:
     censored_words: "Words that will be automatically replaced with &#9632;&#9632;&#9632;&#9632;"
diff --git a/lib/pretty_text.rb b/lib/pretty_text.rb
index 8b9de1c2d9c..a5591e9ec39 100644
--- a/lib/pretty_text.rb
+++ b/lib/pretty_text.rb
@@ -162,7 +162,7 @@ module PrettyText
         __optInput.customEmoji = #{custom_emoji.to_json};
         __optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer;
         __optInput.lookupUploadUrls = __lookupUploadUrls;
-        __optInput.censoredWords = #{WordWatcher.words_for_action(:censor).join('|').to_json};
+        __optInput.censoredRegexp = #{WordWatcher.word_matcher_regexp(:censor)&.source.to_json};
       JS
 
       if opts[:topicId]
diff --git a/spec/components/pretty_text_spec.rb b/spec/components/pretty_text_spec.rb
index 045ec8baf9b..327dbf6d224 100644
--- a/spec/components/pretty_text_spec.rb
+++ b/spec/components/pretty_text_spec.rb
@@ -1042,15 +1042,88 @@ HTML
     expect(PrettyText.cook("abcde ^:;-P")).to include("emoji")
   end
 
-  it 'can censor words correctly' do
-    begin
-      ['apple', 'banana'].each { |w| Fabricate(:watched_word, word: w, action: WatchedWord.actions[:censor]) }
-      expect(PrettyText.cook('yay banana yay')).not_to include('banana')
-      expect(PrettyText.cook('yay `banana` yay')).not_to include('banana')
-      expect(PrettyText.cook("# banana")).not_to include('banana')
-      expect(PrettyText.cook("# banana")).to include("\u25a0\u25a0")
-    ensure
-      $redis.flushall
+  describe "censoring" do
+    after(:all) { $redis.flushall }
+
+    def expect_cooked_match(raw, expected_cooked)
+      expect(PrettyText.cook(raw)).to eq(expected_cooked)
+    end
+
+    context "with basic words" do
+      fab!(:watched_words) do
+        ["shucks", "whiz", "whizzer", "a**le", "badword*", "shuck$", "café", "$uper"].each do |word|
+          Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: word)
+        end
+      end
+
+      it "works correctly" do
+        expect_cooked_match("aw shucks, golly gee whiz.",
+                            "<p>aw ■■■■■■, golly gee ■■■■.</p>")
+      end
+
+      it "doesn't censor words unless they have boundaries." do
+        expect_cooked_match("you are a whizzard! I love cheesewhiz. Whiz.",
+                            "<p>you are a whizzard! I love cheesewhiz. ■■■■.</p>")
+      end
+
+      it "censors words even if previous partial matches exist." do
+        expect_cooked_match("you are a whizzer! I love cheesewhiz. Whiz.",
+                            "<p>you are a ■■■■■■■! I love cheesewhiz. ■■■■.</p>")
+      end
+
+      it "won't break links by censoring them." do
+        expect_cooked_match("The link still works. [whiz](http://www.whiz.com)",
+                            '<p>The link still works. <a href="http://www.whiz.com" rel="nofollow noopener">■■■■</a></p>')
+      end
+
+      it "escapes regexp characters" do
+        expect_cooked_match(
+          "I have a pen, I have an a**le",
+          "<p>I have a pen, I have an ■■■■■</p>"
+        )
+      end
+
+      it "works for words ending in non-word characters" do
+        expect_cooked_match(
+          "Aw shuck$, I can't fix the problem with money",
+          "<p>Aw ■■■■■■, I can't fix the problem with money</p>")
+      end
+
+      it "works for words ending in accented characters" do
+        expect_cooked_match(
+          "Let's go to a café today",
+          "<p>Let's go to a ■■■■ today</p>")
+      end
+
+      it "works for words starting with non-word characters" do
+        expect_cooked_match(
+          "Discourse is $uper amazing",
+          "<p>Discourse is ■■■■■ amazing</p>")
+      end
+
+      it "handles * as wildcard" do
+        expect_cooked_match(
+          "No badword or apple here plz.",
+          "<p>No ■■■■■■■ or ■■■■■ here plz.</p>")
+      end
+    end
+
+    context "with watched words as regular expressions" do
+      before { SiteSetting.watched_words_regular_expressions = true }
+      it "supports words as regular expressions" do
+        ["xyz*", "plee+ase"].each do |word|
+          Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: word)
+        end
+
+        expect_cooked_match("Pleased to meet you, but pleeeease call me later, xyz123",
+        "<p>Pleased to meet you, but ■■■■■■■■■ call me later, ■■■123</p>")
+      end
+
+      it "supports custom boundaries" do
+        Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "\\btown\\b")
+        expect_cooked_match("Meet downtown in your town at the townhouse on Main St.",
+                            "<p>Meet downtown in your ■■■■ at the townhouse on Main St.</p>")
+      end
     end
   end
 
diff --git a/spec/services/word_watcher_spec.rb b/spec/services/word_watcher_spec.rb
index 89d854ae1a5..034911ec66e 100644
--- a/spec/services/word_watcher_spec.rb
+++ b/spec/services/word_watcher_spec.rb
@@ -24,7 +24,7 @@ describe WordWatcher do
       it "is correct when watched_words_regular_expressions = false" do
         SiteSetting.watched_words_regular_expressions = false
         regexp = WordWatcher.word_matcher_regexp(:block)
-        expect(regexp.inspect).to eq("/(?<!\\w)((#{word1}|#{word2}))(?!\\w)/i")
+        expect(regexp.inspect).to eq("/(?:\\W|^)(#{word1}|#{word2})(?=\\W|$)/i")
       end
     end
   end
diff --git a/test/javascripts/lib/pretty-text-test.js.es6 b/test/javascripts/lib/pretty-text-test.js.es6
index 7974e057f1c..5a509fbeb7a 100644
--- a/test/javascripts/lib/pretty-text-test.js.es6
+++ b/test/javascripts/lib/pretty-text-test.js.es6
@@ -21,7 +21,6 @@ const rawOpts = {
     enable_markdown_linkify: true,
     markdown_linkify_tlds: "com"
   },
-  censoredWords: "shucks|whiz|whizzer|a**le|badword*|shuck$|café|$uper",
   getURL: url => url
 };
 
@@ -974,89 +973,15 @@ QUnit.test("images", assert => {
 });
 
 QUnit.test("censoring", assert => {
-  assert.cooked(
-    "aw shucks, golly gee whiz.",
-    "<p>aw ■■■■■■, golly gee ■■■■.</p>",
-    "it censors words in the Site Settings"
-  );
-
-  assert.cooked(
-    "you are a whizzard! I love cheesewhiz. Whiz.",
-    "<p>you are a whizzard! I love cheesewhiz. ■■■■.</p>",
-    "it doesn't censor words unless they have boundaries."
-  );
-
-  assert.cooked(
-    "you are a whizzer! I love cheesewhiz. Whiz.",
-    "<p>you are a ■■■■■■■! I love cheesewhiz. ■■■■.</p>",
-    "it censors words even if previous partial matches exist."
-  );
-
-  assert.cooked(
-    "The link still works. [whiz](http://www.whiz.com)",
-    '<p>The link still works. <a href="http://www.whiz.com">■■■■</a></p>',
-    "it won't break links by censoring them."
-  );
-
-  assert.cooked(
-    "Call techapj the computer whiz at 555-555-1234 for free help.",
-    "<p>Call techapj the computer ■■■■ at 555-555-1234 for free help.</p>",
-    "uses both censored words and patterns from site settings"
-  );
-
-  assert.cooked(
-    "I have a pen, I have an a**le",
-    "<p>I have a pen, I have an ■■■■■</p>",
-    "it escapes regexp chars"
-  );
-
-  assert.cooked(
-    "Aw shuck$, I can't fix the problem with money",
-    "<p>Aw ■■■■■■, I can't fix the problem with money</p>",
-    "it works for words ending in non-word characters"
-  );
-
-  assert.cooked(
-    "Let's go to a café today",
-    "<p>Let's go to a ■■■■ today</p>",
-    "it works for words ending in accented characters"
-  );
-
-  assert.cooked(
-    "Discourse is $uper amazing",
-    "<p>Discourse is ■■■■■ amazing</p>",
-    "it works for words starting with non-word characters"
-  );
-
-  assert.cooked(
-    "No badword or apple here plz.",
-    "<p>No ■■■■■■■ or ■■■■■ here plz.</p>",
-    "it handles * as wildcard"
-  );
-
   assert.cookedOptions(
     "Pleased to meet you, but pleeeease call me later, xyz123",
     {
-      siteSettings: {
-        watched_words_regular_expressions: true
-      },
-      censoredWords: "xyz*|plee+ase"
+      censoredRegexp: "(xyz*|plee+ase)"
     },
-    "<p>Pleased to meet you, but ■■■■ call me later, ■■■■123</p>",
-    "supports words as regular expressions"
-  );
-
-  assert.cookedOptions(
-    "Meet downtown in your town at the townhouse on Main St.",
-    {
-      siteSettings: {
-        watched_words_regular_expressions: true
-      },
-      censoredWords: "\\btown\\b"
-    },
-    "<p>Meet downtown in your ■■■■ at the townhouse on Main St.</p>",
-    "supports words as regular expressions"
+    "<p>Pleased to meet you, but ■■■■■■■■■ call me later, ■■■123</p>",
+    "supports censoring"
   );
+  // More tests in pretty_text_spec.rb
 });
 
 QUnit.test("code blocks/spans hoisting", assert => {