PERF: Use PG headlines for blurb generation and highlighting for search.

This commit is contained in:
Guo Xiang Tan 2020-08-06 14:15:31 +08:00
parent ba482c251c
commit 2193d02433
No known key found for this signature in database
GPG Key ID: FBD110179AAC1F20
5 changed files with 48 additions and 15 deletions

View File

@ -1792,6 +1792,9 @@ backups:
hidden: true hidden: true
search: search:
use_pg_headlines_for_excerpt:
default: false
hidden: true
search_ranking_normalization: search_ranking_normalization:
default: "0" default: "0"
hidden: true hidden: true

View File

@ -2,6 +2,7 @@
class Search class Search
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/ DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
HIGHLIGHT_CSS_CLASS = 'search-highlight'
cattr_accessor :preloaded_topic_custom_fields cattr_accessor :preloaded_topic_custom_fields
self.preloaded_topic_custom_fields = Set.new self.preloaded_topic_custom_fields = Set.new
@ -726,12 +727,18 @@ class Search
def single_topic(id) def single_topic(id)
if @opts[:restrict_to_archetype].present? if @opts[:restrict_to_archetype].present?
archetype = @opts[:restrict_to_archetype] == Archetype.default ? Archetype.default : Archetype.private_message archetype = @opts[:restrict_to_archetype] == Archetype.default ? Archetype.default : Archetype.private_message
post = Post.joins(:topic)
.where("topics.id = :id AND topics.archetype = :archetype AND posts.post_number = 1", id: id, archetype: archetype) post = posts_scope
.first .joins(:topic)
.find_by(
"topics.id = :id AND topics.archetype = :archetype AND posts.post_number = 1",
id: id,
archetype: archetype
)
else else
post = Post.find_by(topic_id: id, post_number: 1) post = posts_scope.find_by(topic_id: id, post_number: 1)
end end
return nil unless @guardian.can_see?(post) return nil unless @guardian.can_see?(post)
@results.add(post) @results.add(post)
@ -1096,7 +1103,7 @@ class Search
def aggregate_posts(post_sql) def aggregate_posts(post_sql)
return [] unless post_sql return [] unless post_sql
posts_eager_loads(Post) posts_scope(posts_eager_loads(Post))
.joins("JOIN (#{post_sql}) x ON x.id = posts.topic_id AND x.post_number = posts.post_number") .joins("JOIN (#{post_sql}) x ON x.id = posts.topic_id AND x.post_number = posts.post_number")
.order('row_number') .order('row_number')
end end
@ -1128,7 +1135,7 @@ class Search
def topic_search def topic_search
if @search_context.is_a?(Topic) if @search_context.is_a?(Topic)
posts = posts_eager_loads(posts_query(limit)) posts = posts_scope(posts_eager_loads(posts_query(limit)))
.where('posts.topic_id = ?', @search_context.id) .where('posts.topic_id = ?', @search_context.id)
posts.each do |post| posts.each do |post|
@ -1150,4 +1157,17 @@ class Search
query.includes(topic: topic_eager_loads) query.includes(topic: topic_eager_loads)
end end
# Wraps a posts relation so that, when the `use_pg_headlines_for_excerpt`
# site setting is enabled, every returned row also carries a `headline`
# column: an excerpt of the post's indexed search data produced by
# PostgreSQL's TS_HEADLINE, with matches of the current search term wrapped
# in `<span class="search-highlight">`.
#
# default_scope - the relation to decorate (defaults to `Post.all`).
#
# Returns `default_scope` untouched when the setting is disabled; otherwise
# the relation joined to `post_search_data` (aliased `pd`) and selecting the
# headline in addition to the scope's own projections.
def posts_scope(default_scope = Post.all)
  return default_scope unless SiteSetting.use_pg_headlines_for_excerpt

  # NOTE(review): default_ts_config is defined outside this block; it is
  # interpolated verbatim, so it is presumably already a quoted regconfig
  # literal -- confirm against its definition.
  ts_config = default_ts_config

  # The term is user input: escape it before embedding it in a SQL literal.
  # A blank term yields an empty query string.
  search_term = @term.present? ? PG::Connection.escape_string(@term) : ""

  headline_options = "ShortWord=0, MaxFragments=1, MinWords=50, MaxWords=51, StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>''"

  # Parse the query with the same text search configuration that is used to
  # build the headline. Without an explicit config PLAINTO_TSQUERY falls
  # back to the server's default_text_search_config, whose stemming may not
  # match and would then leave the term unhighlighted.
  headline_sql = "TS_HEADLINE(#{ts_config}, pd.raw_data, PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'), '#{headline_options}') AS headline"

  default_scope
    .joins("INNER JOIN post_search_data pd ON pd.post_id = posts.id")
    .select(headline_sql, default_scope.arel.projections)
end
end end

View File

@ -85,8 +85,12 @@ class Search
} }
if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION
opts[:cooked] = post.post_search_data.raw_data if SiteSetting.use_pg_headlines_for_excerpt
opts[:scrub] = false return post.headline
else
opts[:cooked] = post.post_search_data.raw_data
opts[:scrub] = false
end
else else
opts[:cooked] = post.cooked opts[:cooked] = post.cooked
end end

View File

@ -410,27 +410,31 @@ describe Search do
end end
let(:expected_blurb) do let(:expected_blurb) do
"...quire content longer than the typical test post raw content. It really is some long content, folks. elephant" "hundred characters to satisfy any test conditions that require content longer than the typical test post raw content. It really is some long content, folks. <span class=\"search-highlight\">elephant</span>"
end end
it 'returns the post' do it 'returns the post' do
SiteSetting.use_pg_headlines_for_excerpt = true
result = Search.execute('elephant', result = Search.execute('elephant',
type_filter: 'topic', type_filter: 'topic',
include_blurbs: true include_blurbs: true
) )
expect(result.posts).to contain_exactly(reply) expect(result.posts.map(&:id)).to contain_exactly(reply.id)
expect(result.blurb(reply)).to eq(expected_blurb) expect(result.blurb(result.posts.first)).to eq(expected_blurb)
end end
it 'returns the right post and blurb for searches with phrase' do it 'returns the right post and blurb for searches with phrase' do
SiteSetting.use_pg_headlines_for_excerpt = true
result = Search.execute('"elephant"', result = Search.execute('"elephant"',
type_filter: 'topic', type_filter: 'topic',
include_blurbs: true include_blurbs: true
) )
expect(result.posts).to contain_exactly(reply) expect(result.posts.map(&:id)).to contain_exactly(reply.id)
expect(result.blurb(reply)).to eq(expected_blurb) expect(result.blurb(result.posts.first)).to eq(expected_blurb)
end end
it 'applies a small penalty to closed topic when ranking' do it 'applies a small penalty to closed topic when ranking' do

View File

@ -99,6 +99,8 @@ describe SearchController do
end end
it "can search correctly" do it "can search correctly" do
SiteSetting.use_pg_headlines_for_excerpt = true
get "/search/query.json", params: { get "/search/query.json", params: {
term: 'awesome' term: 'awesome'
} }
@ -109,11 +111,11 @@ describe SearchController do
expect(data['posts'].length).to eq(2) expect(data['posts'].length).to eq(2)
expect(data['posts'][0]['id']).to eq(awesome_post_2.id) expect(data['posts'][0]['id']).to eq(awesome_post_2.id)
expect(data['posts'][0]['blurb']).to eq(awesome_post_2.raw) expect(data['posts'][0]['blurb']).to eq("this is my really <span class=\"#{Search::HIGHLIGHT_CSS_CLASS}\">awesome</span> post")
expect(data['topics'][0]['id']).to eq(awesome_post_2.topic_id) expect(data['topics'][0]['id']).to eq(awesome_post_2.topic_id)
expect(data['posts'][1]['id']).to eq(awesome_post.id) expect(data['posts'][1]['id']).to eq(awesome_post.id)
expect(data['posts'][1]['blurb']).to eq(awesome_post.raw) expect(data['posts'][1]['blurb']).to eq("this is my really <span class=\"#{Search::HIGHLIGHT_CSS_CLASS}\">awesome</span> post")
expect(data['topics'][1]['id']).to eq(awesome_post.topic_id) expect(data['topics'][1]['id']).to eq(awesome_post.topic_id)
end end