mirror of
https://github.com/discourse/discourse.git
synced 2025-02-25 18:55:32 -06:00
PERF: avoid lookbehinds when indexing search (#10862)
* PERF: avoid lookbehinds when indexing search Previously we used a `EmailCook.url_regexp` this regex used lookbehinds Unfortunately certain strings could lead to pathological behavior causing CPU to skyrocket and regex replace to take a very very long time. EmailCook still needs a fix, but it is less urgent cause it already splits to single lines. That said we will correct that as well in a seperate PR. New implementation is far more naive and relies on the extra spaces search indexer inserts.
This commit is contained in:
@@ -88,12 +88,17 @@ class Search
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
data.gsub!(EmailCook.url_regexp) do |url|
|
data.gsub!(/\S+/) do |str|
|
||||||
uri = URI.parse(url)
|
if str.match?(/^(https?:\/\/)[\S]+$/)
|
||||||
uri.query = nil
|
begin
|
||||||
uri.to_s
|
uri = URI.parse(str)
|
||||||
rescue URI::Error
|
uri.query = nil
|
||||||
# Don't fail even if URL turns out to be invalid
|
str = uri.to_s
|
||||||
|
rescue URI::Error
|
||||||
|
# don't fail if uri does not parse
|
||||||
|
end
|
||||||
|
end
|
||||||
|
str
|
||||||
end
|
end
|
||||||
|
|
||||||
data
|
data
|
||||||
|
|||||||
@@ -158,7 +158,6 @@ describe SearchIndexer do
|
|||||||
|
|
||||||
post.rebake!
|
post.rebake!
|
||||||
post.reload
|
post.reload
|
||||||
topic = post.topic
|
|
||||||
|
|
||||||
expect(post.post_search_data.raw_data).to eq(
|
expect(post.post_search_data.raw_data).to eq(
|
||||||
"https://meta.discourse.org/some.png"
|
"https://meta.discourse.org/some.png"
|
||||||
@@ -189,19 +188,25 @@ describe SearchIndexer do
|
|||||||
topic = Fabricate(:topic, category: category, title: 'this is a test topic')
|
topic = Fabricate(:topic, category: category, title: 'this is a test topic')
|
||||||
|
|
||||||
post = Fabricate(:post, topic: topic, raw: <<~RAW)
|
post = Fabricate(:post, topic: topic, raw: <<~RAW)
|
||||||
a https://cnn.com?bob=1, http://stuff.com.au?bill=1 b abc.net/xyz=1
|
a https://abc.com?bob=1, http://efg.com.au?bill=1 b hij.net/xyz=1
|
||||||
|
www.klm.net/?IGNORE=1 <a href="http://abc.de.nop.co.uk?IGNORE=1&ingore2=2">test</a>
|
||||||
RAW
|
RAW
|
||||||
|
|
||||||
post.rebake!
|
post.rebake!
|
||||||
post.reload
|
post.reload
|
||||||
topic = post.topic
|
topic = post.topic
|
||||||
|
|
||||||
|
# Note, a random non URL string should be tokenized properly,
|
||||||
|
# hence www.klm.net?IGNORE=1 it was inserted in autolinking.
|
||||||
|
# We could consider amending the auto linker to add
|
||||||
|
# more context to say "hey, this part of <a href>...</a> was a guess by autolinker.
|
||||||
|
# A blanket treating of non-urls without this logic is risky.
|
||||||
expect(post.post_search_data.raw_data).to eq(
|
expect(post.post_search_data.raw_data).to eq(
|
||||||
"a https://cnn.com , http://stuff.com.au b http://abc.net/xyz=1 abc.net/xyz=1"
|
"a https://abc.com , http://efg.com.au b http://hij.net/xyz=1 hij.net/xyz=1 http://www.klm.net/ www.klm.net/?IGNORE=1 http://abc.de.nop.co.uk test"
|
||||||
)
|
)
|
||||||
|
|
||||||
expect(post.post_search_data.search_data).to eq(
|
expect(post.post_search_data.search_data).to eq(
|
||||||
"'/xyz=1':14,17 'abc.net':13,16 'abc.net/xyz=1':12,15 'au':10 'awesom':6B 'b':11 'categori':7B 'cnn.com':9 'com':9 'com.au':10 'net':13,16 'stuff.com.au':10 'test':4A 'topic':5A"
|
"'/?ignore=1':21 '/xyz=1':14,17 'abc.com':9 'abc.de.nop.co.uk':22 'au':10 'awesom':6B 'b':11 'categori':7B 'co.uk':22 'com':9 'com.au':10 'de.nop.co.uk':22 'efg.com.au':10 'hij.net':13,16 'hij.net/xyz=1':12,15 'klm.net':18,20 'net':13,16,18,20 'nop.co.uk':22 'test':4A,23 'topic':5A 'uk':22 'www.klm.net':18,20 'www.klm.net/?ignore=1':19"
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -222,7 +227,6 @@ describe SearchIndexer do
|
|||||||
|
|
||||||
post.rebake!
|
post.rebake!
|
||||||
post.reload
|
post.reload
|
||||||
topic = post.topic
|
|
||||||
|
|
||||||
expect(post.cooked).to include(
|
expect(post.cooked).to include(
|
||||||
CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS
|
CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS
|
||||||
@@ -235,7 +239,7 @@ describe SearchIndexer do
|
|||||||
|
|
||||||
it 'should strips audio and videos URLs from raw data' do
|
it 'should strips audio and videos URLs from raw data' do
|
||||||
SiteSetting.authorized_extensions = 'mp4'
|
SiteSetting.authorized_extensions = 'mp4'
|
||||||
upload = Fabricate(:video_upload)
|
Fabricate(:video_upload)
|
||||||
|
|
||||||
post.update!(raw: <<~RAW)
|
post.update!(raw: <<~RAW)
|
||||||
link to an external page: https://google.com/?u=bar
|
link to an external page: https://google.com/?u=bar
|
||||||
|
|||||||
Reference in New Issue
Block a user