From 3c678df942b9322bfda5eb47d428b1c692ac0bf4 Mon Sep 17 00:00:00 2001 From: Sam Date: Thu, 8 Oct 2020 11:40:13 +1100 Subject: [PATCH] PERF: avoid lookbehinds when indexing search (#10862) * PERF: avoid lookbehinds when indexing search Previously we used a `EmailCook.url_regexp` this regex used lookbehinds Unfortunately certain strings could lead to pathological behavior causing CPU to skyrocket and regex replace to take a very very long time. EmailCook still needs a fix, but it is less urgent cause it already splits to single lines. That said we will correct that as well in a seperate PR. New implementation is far more naive and relies on the extra spaces search indexer inserts. --- lib/search.rb | 17 +++++++++++------ spec/services/search_indexer_spec.rb | 16 ++++++++++------ 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/lib/search.rb b/lib/search.rb index 1ea88a44ce9..0a3ab7980a4 100644 --- a/lib/search.rb +++ b/lib/search.rb @@ -88,12 +88,17 @@ class Search end end - data.gsub!(EmailCook.url_regexp) do |url| - uri = URI.parse(url) - uri.query = nil - uri.to_s - rescue URI::Error - # Don't fail even if URL turns out to be invalid + data.gsub!(/\S+/) do |str| + if str.match?(/^(https?:\/\/)[\S]+$/) + begin + uri = URI.parse(str) + uri.query = nil + str = uri.to_s + rescue URI::Error + # don't fail if uri does not parse + end + end + str end data diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index aeee36e4aaa..936dca05377 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -158,7 +158,6 @@ describe SearchIndexer do post.rebake! post.reload - topic = post.topic expect(post.post_search_data.raw_data).to eq( "https://meta.discourse.org/some.png" @@ -189,19 +188,25 @@ describe SearchIndexer do topic = Fabricate(:topic, category: category, title: 'this is a test topic') post = Fabricate(:post, topic: topic, raw: <<~RAW) - a https://cnn.com?bob=1, http://stuff.com.au?bill=1 b abc.net/xyz=1 + a https://abc.com?bob=1, http://efg.com.au?bill=1 b hij.net/xyz=1 + www.klm.net/?IGNORE=1 test RAW post.rebake! post.reload topic = post.topic + # Note, a random non URL string should be tokenized properly, + # hence www.klm.net?IGNORE=1 it was inserted in autolinking. + # We could consider amending the auto linker to add + # more context to say "hey, this part of ... was a guess by autolinker. + # A blanket treating of non-urls without this logic is risky. expect(post.post_search_data.raw_data).to eq( - "a https://cnn.com , http://stuff.com.au b http://abc.net/xyz=1 abc.net/xyz=1" + "a https://abc.com , http://efg.com.au b http://hij.net/xyz=1 hij.net/xyz=1 http://www.klm.net/ www.klm.net/?IGNORE=1 http://abc.de.nop.co.uk test" ) expect(post.post_search_data.search_data).to eq( - "'/xyz=1':14,17 'abc.net':13,16 'abc.net/xyz=1':12,15 'au':10 'awesom':6B 'b':11 'categori':7B 'cnn.com':9 'com':9 'com.au':10 'net':13,16 'stuff.com.au':10 'test':4A 'topic':5A" + "'/?ignore=1':21 '/xyz=1':14,17 'abc.com':9 'abc.de.nop.co.uk':22 'au':10 'awesom':6B 'b':11 'categori':7B 'co.uk':22 'com':9 'com.au':10 'de.nop.co.uk':22 'efg.com.au':10 'hij.net':13,16 'hij.net/xyz=1':12,15 'klm.net':18,20 'net':13,16,18,20 'nop.co.uk':22 'test':4A,23 'topic':5A 'uk':22 'www.klm.net':18,20 'www.klm.net/?ignore=1':19" ) end @@ -222,7 +227,6 @@ describe SearchIndexer do post.rebake! post.reload - topic = post.topic expect(post.cooked).to include( CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS @@ -235,7 +239,7 @@ describe SearchIndexer do it 'should strips audio and videos URLs from raw data' do SiteSetting.authorized_extensions = 'mp4' - upload = Fabricate(:video_upload) + Fabricate(:video_upload) post.update!(raw: <<~RAW) link to an external page: https://google.com/?u=bar