From 39e0442de98e8df86bb597909375bee27cac2829 Mon Sep 17 00:00:00 2001 From: David Taylor Date: Wed, 31 Jul 2019 18:33:49 +0100 Subject: [PATCH] FIX: Various watched words improvements - Client-side censoring fixed for non-chrome browsers. (Regular expression rewritten to avoid lookback) - Regex generation is now done on the server, to reduce repeated logic, and make it easier to extend in plugins - Censor tests are moved to ruby, to ensure everything works end-to-end - If "watched words regular expressions" is enabled, warn the admin when the generated regex is invalid --- .../javascripts/discourse/lib/text.js.es6 | 2 +- .../javascripts/discourse/models/topic.js.es6 | 2 +- .../pretty-text/censored-words.js.es6 | 86 +++--------------- .../discourse-markdown/censored.js.es6 | 10 +- .../pretty-text/pretty-text.js.es6 | 4 +- app/models/admin_dashboard_data.rb | 13 ++- app/serializers/site_serializer.rb | 6 +- app/services/word_watcher.rb | 10 +- config/locales/server.en.yml | 1 + lib/pretty_text.rb | 2 +- spec/components/pretty_text_spec.rb | 91 +++++++++++++++++-- spec/services/word_watcher_spec.rb | 2 +- test/javascripts/lib/pretty-text-test.js.es6 | 83 +---------------- 13 files changed, 134 insertions(+), 178 deletions(-) diff --git a/app/assets/javascripts/discourse/lib/text.js.es6 b/app/assets/javascripts/discourse/lib/text.js.es6 index ee51cb53d10..cb02036cede 100644 --- a/app/assets/javascripts/discourse/lib/text.js.es6 +++ b/app/assets/javascripts/discourse/lib/text.js.es6 @@ -13,7 +13,7 @@ function getOpts(opts) { { getURL: Discourse.getURLWithCDN, currentUser: Discourse.__container__.lookup("current-user:main"), - censoredWords: site.censored_words, + censoredRegexp: site.censored_regexp, siteSettings, formatUsername }, diff --git a/app/assets/javascripts/discourse/models/topic.js.es6 b/app/assets/javascripts/discourse/models/topic.js.es6 index ca2c4350b71..d44afe930ec 100644 --- a/app/assets/javascripts/discourse/models/topic.js.es6 +++ b/app/assets/javascripts/discourse/models/topic.js.es6 @@ -97,7 +97,7 @@ const Topic = RestModel.extend({ fancyTitle(title) { let fancyTitle = censor( emojiUnescape(title || ""), - Discourse.Site.currentProp("censored_words") + Discourse.Site.currentProp("censored_regexp") ); if (Discourse.SiteSettings.support_mixed_text_direction) { diff --git a/app/assets/javascripts/pretty-text/censored-words.js.es6 b/app/assets/javascripts/pretty-text/censored-words.js.es6 index 32c21dc39b6..4453e3f8949 100644 --- a/app/assets/javascripts/pretty-text/censored-words.js.es6 +++ b/app/assets/javascripts/pretty-text/censored-words.js.es6 @@ -1,75 +1,19 @@ -function escapeRegexp(text) { - return text.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&").replace(/\*/g, "S*"); -} +export function censorFn(regexpString, replacementLetter) { + if (regexpString) { + let censorRegexp = new RegExp(regexpString, "ig"); + replacementLetter = replacementLetter || "■"; -function createCensorRegexp(patterns) { - return new RegExp(`((? `(${escapeRegexp(t)})`); - } - } - - if (patterns.length) { - let censorRegexp; - - try { - if (watchedWordsRegularExpressions) { - censorRegexp = new RegExp( - "((?:" + patterns.join("|") + "))(?![^\\(]*\\))", - "ig" + return function(text) { + text = text.replace(censorRegexp, (fullMatch, ...groupMatches) => { + const stringMatch = groupMatches.find(g => typeof g === "string"); + return fullMatch.replace( + stringMatch, + new Array(stringMatch.length + 1).join(replacementLetter) ); - } else { - censorRegexp = createCensorRegexp(patterns); - } + }); - if (censorRegexp) { - return function(text) { - let original = text; - - try { - let m = censorRegexp.exec(text); - const fourCharReplacement = new Array(5).join(replacementLetter); - - while (m && m[0]) { - if (m[0].length > original.length) { - return original; - } // regex is dangerous - if (watchedWordsRegularExpressions) { - text = text.replace(censorRegexp, fourCharReplacement); - } else { - const replacement = new Array(m[0].length + 1).join( - replacementLetter - ); - text = text.replace( - createCensorRegexp([escapeRegexp(m[0])]), - replacement - ); - } - m = censorRegexp.exec(text); - } - - return text; - } catch (e) { - return original; - } - }; - } - } catch (e) { - // fall through - } + return text; + }; } return function(t) { @@ -77,6 +21,6 @@ export function censorFn( }; } -export function censor(text, censoredWords, replacementLetter) { - return censorFn(censoredWords, replacementLetter)(text); +export function censor(text, censoredRegexp, replacementLetter) { + return censorFn(censoredRegexp, replacementLetter)(text); } diff --git a/app/assets/javascripts/pretty-text/engines/discourse-markdown/censored.js.es6 b/app/assets/javascripts/pretty-text/engines/discourse-markdown/censored.js.es6 index e44c036a456..6273a22a61c 100644 --- a/app/assets/javascripts/pretty-text/engines/discourse-markdown/censored.js.es6 +++ b/app/assets/javascripts/pretty-text/engines/discourse-markdown/censored.js.es6 @@ -29,15 +29,11 @@ export function setup(helper) { }); helper.registerPlugin(md => { - const words = md.options.discourse.censoredWords; + const censoredRegexp = md.options.discourse.censoredRegexp; - if (words && words.length > 0) { + if (censoredRegexp) { const replacement = String.fromCharCode(9632); - const censor = censorFn( - words, - replacement, - md.options.discourse.watchedWordsRegularExpressions - ); + const censor = censorFn(censoredRegexp, replacement); md.core.ruler.push("censored", state => censorTree(state, censor)); } }); diff --git a/app/assets/javascripts/pretty-text/pretty-text.js.es6 b/app/assets/javascripts/pretty-text/pretty-text.js.es6 index e656c9a7d75..7a867d89c96 100644 --- a/app/assets/javascripts/pretty-text/pretty-text.js.es6 +++ b/app/assets/javascripts/pretty-text/pretty-text.js.es6 @@ -29,7 +29,7 @@ export function buildOptions(state) { lookupUploadUrls, previewing, linkify, - censoredWords, + censoredRegexp, disableEmojis } = state; @@ -67,7 +67,7 @@ export function buildOptions(state) { formatUsername, emojiUnicodeReplacer, lookupUploadUrls, - censoredWords, + censoredRegexp, allowedHrefSchemes: siteSettings.allowed_href_schemes ? siteSettings.allowed_href_schemes.split("|") : null, diff --git a/app/models/admin_dashboard_data.rb b/app/models/admin_dashboard_data.rb index 0d260816280..c558954c3d0 100644 --- a/app/models/admin_dashboard_data.rb +++ b/app/models/admin_dashboard_data.rb @@ -96,7 +96,7 @@ class AdminDashboardData :image_magick_check, :failing_emails_check, :subfolder_ends_in_slash_check, :pop3_polling_configuration, :email_polling_errored_recently, - :out_of_date_themes, :unreachable_themes + :out_of_date_themes, :unreachable_themes, :watched_words_check add_problem_check do sidekiq_check || queue_size_check @@ -224,6 +224,17 @@ class AdminDashboardData I18n.t('dashboard.force_https_warning', base_path: Discourse.base_path) unless SiteSetting.force_https end + def watched_words_check + WatchedWord.actions.keys.each do |action| + begin + WordWatcher.word_matcher_regexp(action, raise_errors: true) + rescue RegexpError => e + return I18n.t('dashboard.watched_word_regexp_error', base_path: Discourse.base_path, action: action) + end + end + nil + end + def out_of_date_themes old_themes = RemoteTheme.out_of_date_themes return unless old_themes.present? diff --git a/app/serializers/site_serializer.rb b/app/serializers/site_serializer.rb index 1b08914efde..13859d7503c 100644 --- a/app/serializers/site_serializer.rb +++ b/app/serializers/site_serializer.rb @@ -30,7 +30,7 @@ class SiteSerializer < ApplicationSerializer :wizard_required, :topic_featured_link_allowed_category_ids, :user_themes, - :censored_words, + :censored_regexp, :shared_drafts_category_id ) @@ -156,8 +156,8 @@ class SiteSerializer < ApplicationSerializer scope.topic_featured_link_allowed_category_ids end - def censored_words - WordWatcher.words_for_action(:censor).join('|') + def censored_regexp + WordWatcher.word_matcher_regexp(:censor)&.source end def shared_drafts_category_id diff --git a/app/services/word_watcher.rb b/app/services/word_watcher.rb index 1e12f860029..0a1cad9bae6 100644 --- a/app/services/word_watcher.rb +++ b/app/services/word_watcher.rb @@ -20,7 +20,10 @@ class WordWatcher end end - def self.word_matcher_regexp(action) + # This regexp is run in miniracer, and the client JS app + # Make sure it is compatible with major browsers when changing + # hint: non-chrome browsers do not support 'lookbehind' + def self.word_matcher_regexp(action, raise_errors: false) words = get_cached_words(action) if words words = words.map do |w| @@ -31,10 +34,13 @@ class WordWatcher regexp = words.join('|') if !SiteSetting.watched_words_regular_expressions? regexp = "(#{regexp})" - regexp = "(? e + raise if raise_errors + nil # Admin will be alerted via admin_dashboard_data.rb end def self.word_to_regexp(word) diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 7ba365e9584..f1811039d2f 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -1304,6 +1304,7 @@ en: force_https_warning: "Your website is using SSL. But `force_https` is not yet enabled in your site settings." out_of_date_themes: "Updates are available for the following themes:" unreachable_themes: "We were unable to check for updates on the following themes:" + watched_word_regexp_error: "The regular expression for %{action} watched words is invalid. Please check your Watched Word settings, or disable the 'watched words regular expressions' site setting." site_settings: censored_words: "Words that will be automatically replaced with ■■■■" diff --git a/lib/pretty_text.rb b/lib/pretty_text.rb index 8b9de1c2d9c..a5591e9ec39 100644 --- a/lib/pretty_text.rb +++ b/lib/pretty_text.rb @@ -162,7 +162,7 @@ module PrettyText __optInput.customEmoji = #{custom_emoji.to_json}; __optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer; __optInput.lookupUploadUrls = __lookupUploadUrls; - __optInput.censoredWords = #{WordWatcher.words_for_action(:censor).join('|').to_json}; + __optInput.censoredRegexp = #{WordWatcher.word_matcher_regexp(:censor)&.source.to_json}; JS if opts[:topicId] diff --git a/spec/components/pretty_text_spec.rb b/spec/components/pretty_text_spec.rb index 045ec8baf9b..327dbf6d224 100644 --- a/spec/components/pretty_text_spec.rb +++ b/spec/components/pretty_text_spec.rb @@ -1042,15 +1042,88 @@ HTML expect(PrettyText.cook("abcde ^:;-P")).to include("emoji") end - it 'can censor words correctly' do - begin - ['apple', 'banana'].each { |w| Fabricate(:watched_word, word: w, action: WatchedWord.actions[:censor]) } - expect(PrettyText.cook('yay banana yay')).not_to include('banana') - expect(PrettyText.cook('yay `banana` yay')).not_to include('banana') - expect(PrettyText.cook("# banana")).not_to include('banana') - expect(PrettyText.cook("# banana")).to include("\u25a0\u25a0") - ensure - $redis.flushall + describe "censoring" do + after(:all) { $redis.flushall } + + def expect_cooked_match(raw, expected_cooked) + expect(PrettyText.cook(raw)).to eq(expected_cooked) + end + + context "with basic words" do + fab!(:watched_words) do + ["shucks", "whiz", "whizzer", "a**le", "badword*", "shuck$", "café", "$uper"].each do |word| + Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: word) + end + end + + it "works correctly" do + expect_cooked_match("aw shucks, golly gee whiz.", + "

aw ■■■■■■, golly gee ■■■■.

") + end + + it "doesn't censor words unless they have boundaries." do + expect_cooked_match("you are a whizzard! I love cheesewhiz. Whiz.", + "

you are a whizzard! I love cheesewhiz. ■■■■.

") + end + + it "censors words even if previous partial matches exist." do + expect_cooked_match("you are a whizzer! I love cheesewhiz. Whiz.", + "

you are a ■■■■■■■! I love cheesewhiz. ■■■■.

") + end + + it "won't break links by censoring them." do + expect_cooked_match("The link still works. [whiz](http://www.whiz.com)", + '

The link still works. ■■■■

') + end + + it "escapes regexp characters" do + expect_cooked_match( + "I have a pen, I have an a**le", + "

I have a pen, I have an ■■■■■

" + ) + end + + it "works for words ending in non-word characters" do + expect_cooked_match( + "Aw shuck$, I can't fix the problem with money", + "

Aw ■■■■■■, I can't fix the problem with money

") + end + + it "works for words ending in accented characters" do + expect_cooked_match( + "Let's go to a café today", + "

Let's go to a ■■■■ today

") + end + + it "works for words starting with non-word characters" do + expect_cooked_match( + "Discourse is $uper amazing", + "

Discourse is ■■■■■ amazing

") + end + + it "handles * as wildcard" do + expect_cooked_match( + "No badword or apple here plz.", + "

No ■■■■■■■ or ■■■■■ here plz.

") + end + end + + context "with watched words as regular expressions" do + before { SiteSetting.watched_words_regular_expressions = true } + it "supports words as regular expressions" do + ["xyz*", "plee+ase"].each do |word| + Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: word) + end + + expect_cooked_match("Pleased to meet you, but pleeeease call me later, xyz123", + "

Pleased to meet you, but ■■■■■■■■■ call me later, ■■■123

") + end + + it "supports custom boundaries" do + Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "\\btown\\b") + expect_cooked_match("Meet downtown in your town at the townhouse on Main St.", + "

Meet downtown in your ■■■■ at the townhouse on Main St.

") + end end end diff --git a/spec/services/word_watcher_spec.rb b/spec/services/word_watcher_spec.rb index 89d854ae1a5..034911ec66e 100644 --- a/spec/services/word_watcher_spec.rb +++ b/spec/services/word_watcher_spec.rb @@ -24,7 +24,7 @@ describe WordWatcher do it "is correct when watched_words_regular_expressions = false" do SiteSetting.watched_words_regular_expressions = false regexp = WordWatcher.word_matcher_regexp(:block) - expect(regexp.inspect).to eq("/(? url }; @@ -974,89 +973,15 @@ QUnit.test("images", assert => { }); QUnit.test("censoring", assert => { - assert.cooked( - "aw shucks, golly gee whiz.", - "

aw ■■■■■■, golly gee ■■■■.

", - "it censors words in the Site Settings" - ); - - assert.cooked( - "you are a whizzard! I love cheesewhiz. Whiz.", - "

you are a whizzard! I love cheesewhiz. ■■■■.

", - "it doesn't censor words unless they have boundaries." - ); - - assert.cooked( - "you are a whizzer! I love cheesewhiz. Whiz.", - "

you are a ■■■■■■■! I love cheesewhiz. ■■■■.

", - "it censors words even if previous partial matches exist." - ); - - assert.cooked( - "The link still works. [whiz](http://www.whiz.com)", - '

The link still works. ■■■■

', - "it won't break links by censoring them." - ); - - assert.cooked( - "Call techapj the computer whiz at 555-555-1234 for free help.", - "

Call techapj the computer ■■■■ at 555-555-1234 for free help.

", - "uses both censored words and patterns from site settings" - ); - - assert.cooked( - "I have a pen, I have an a**le", - "

I have a pen, I have an ■■■■■

", - "it escapes regexp chars" - ); - - assert.cooked( - "Aw shuck$, I can't fix the problem with money", - "

Aw ■■■■■■, I can't fix the problem with money

", - "it works for words ending in non-word characters" - ); - - assert.cooked( - "Let's go to a café today", - "

Let's go to a ■■■■ today

", - "it works for words ending in accented characters" - ); - - assert.cooked( - "Discourse is $uper amazing", - "

Discourse is ■■■■■ amazing

", - "it works for words starting with non-word characters" - ); - - assert.cooked( - "No badword or apple here plz.", - "

No ■■■■■■■ or ■■■■■ here plz.

", - "it handles * as wildcard" - ); - assert.cookedOptions( "Pleased to meet you, but pleeeease call me later, xyz123", { - siteSettings: { - watched_words_regular_expressions: true - }, - censoredWords: "xyz*|plee+ase" + censoredRegexp: "(xyz*|plee+ase)" }, - "

Pleased to meet you, but ■■■■ call me later, ■■■■123

", - "supports words as regular expressions" - ); - - assert.cookedOptions( - "Meet downtown in your town at the townhouse on Main St.", - { - siteSettings: { - watched_words_regular_expressions: true - }, - censoredWords: "\\btown\\b" - }, - "

Meet downtown in your ■■■■ at the townhouse on Main St.

", - "supports words as regular expressions" + "

Pleased to meet you, but ■■■■■■■■■ call me later, ■■■123

", + "supports censoring" ); + // More tests in pretty_text_spec.rb }); QUnit.test("code blocks/spans hoisting", assert => {