From d184fe59ca7885741ed9f840d3209a9a5ed861ea Mon Sep 17 00:00:00 2001
From: Bianca Nenciu
Date: Thu, 3 Jun 2021 04:39:12 +0300
Subject: [PATCH] FEATURE: Censor Oneboxes (#12902)

Previously onebox content was not passed by the censor regex, meaning you could sneak in censored words via onebox.
---
 app/services/word_watcher.rb     | 22 ++++++++++++++++++++++
 lib/oneboxer.rb                  |  5 ++++-
 spec/components/oneboxer_spec.rb | 23 +++++++++++++++++++++++
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/app/services/word_watcher.rb b/app/services/word_watcher.rb
index 2ccc7541ced..36699140f0c 100644
--- a/app/services/word_watcher.rb
+++ b/app/services/word_watcher.rb
@@ -1,6 +1,7 @@
 # frozen_string_literal: true

 class WordWatcher
+  REPLACEMENT_LETTER ||= CGI.unescape_html("■")

   def initialize(raw)
     @raw = raw
@@ -70,6 +71,27 @@ class WordWatcher
     "watched-words-list:#{action}"
   end

+  def self.censor(html)
+    regexp = WordWatcher.word_matcher_regexp(:censor)
+    return html if regexp.blank?
+
+    doc = Nokogiri::HTML5::fragment(html)
+    doc.traverse do |node|
+      if node.text?
+        node.content = node.content.gsub(regexp) do |match|
+          # the regex captures leading whitespaces
+          padding = match.size - match.lstrip.size
+          if padding > 0
+            match[0..padding - 1] + REPLACEMENT_LETTER * (match.size - padding)
+          else
+            REPLACEMENT_LETTER * match.size
+          end
+        end
+      end
+    end
+    doc.to_s
+  end
+
   def self.clear_cache!
     WatchedWord.actions.each do |a, i|
       Discourse.cache.delete word_matcher_regexp_key(a)
diff --git a/lib/oneboxer.rb b/lib/oneboxer.rb
index 69bb14c2a13..4a7461f871b 100644
--- a/lib/oneboxer.rb
+++ b/lib/oneboxer.rb
@@ -455,7 +455,10 @@ module Oneboxer
       onebox_options[:user_agent] = user_agent_override if user_agent_override

       r = Onebox.preview(uri.to_s, onebox_options)
-      result = { onebox: r.to_s, preview: r&.placeholder_html.to_s }
+      result = {
+        onebox: WordWatcher.censor(r.to_s),
+        preview: WordWatcher.censor(r&.placeholder_html.to_s)
+      }

       # NOTE: Call r.errors after calling placeholder_html
       if r.errors.any?
diff --git a/spec/components/oneboxer_spec.rb b/spec/components/oneboxer_spec.rb
index 3628e582d6f..527f0545d21 100644
--- a/spec/components/oneboxer_spec.rb
+++ b/spec/components/oneboxer_spec.rb
@@ -177,6 +177,29 @@ describe Oneboxer do
     expect(Oneboxer.external_onebox(url)[:onebox]).to be_present
   end

+  it "censors external oneboxes" do
+    Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "bad word")
+
+    url = 'https://example.com/'
+    stub_request(:any, url).to_return(status: 200, body: <<~HTML, headers: {})
+
+
+
+
+
+
+      content with bad word
+
+
+    HTML
+
+    onebox = Oneboxer.external_onebox(url)
+    expect(onebox[:onebox]).to include('title with')
+    expect(onebox[:onebox]).not_to include('bad word')
+    expect(onebox[:preview]).to include('title with')
+    expect(onebox[:preview]).not_to include('bad word')
+  end
+
   it "uses the Onebox custom user agent on specified hosts" do
     SiteSetting.force_custom_user_agent_hosts = "http://codepen.io|https://video.discourse.org/"
     url = 'https://video.discourse.org/presentation.mp4'