FIX: Limit PG headline-based search blurb generation to 200 characters.

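* Also restores the omission characters '...' at the start and/or end of the blurb when the headline does not begin or end at the boundaries of the post's raw data.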
This commit is contained in:
Guo Xiang Tan
2020-08-12 15:33:26 +08:00
parent ec173a72d9
commit 93f8396b4b
3 changed files with 45 additions and 5 deletions

View File

@@ -1177,8 +1177,28 @@ class Search
.joins("INNER JOIN post_search_data pd ON pd.post_id = posts.id") .joins("INNER JOIN post_search_data pd ON pd.post_id = posts.id")
.joins("INNER JOIN topics t1 ON t1.id = posts.topic_id") .joins("INNER JOIN topics t1 ON t1.id = posts.topic_id")
.select( .select(
"TS_HEADLINE(#{ts_config}, t1.fancy_title, PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'), 'StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>''') AS topic_title_headline", "TS_HEADLINE(
"TS_HEADLINE(#{ts_config}, LEFT(pd.raw_data, #{MAX_LENGTH_FOR_HEADLINE}), PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'), 'ShortWord=0, MaxFragments=1, MinWords=50, MaxWords=51, StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>''') AS headline", #{ts_config},
t1.fancy_title,
PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'),
'StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>'''
) AS topic_title_headline",
"TS_HEADLINE(
#{ts_config},
LEFT(
TS_HEADLINE(
#{ts_config},
LEFT(pd.raw_data, #{MAX_LENGTH_FOR_HEADLINE}),
PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'),
'ShortWord=0, MaxFragments=1, MinWords=50, MaxWords=51, StartSel='''', StopSel='''''
),
#{Search::GroupedSearchResults::BLURB_LENGTH}
),
PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'),
'HighlightAll=true, StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>'''
) AS headline",
"LEFT(pd.raw_data, 50) AS leading_raw_data",
"RIGHT(pd.raw_data, 50) AS trailing_raw_data",
default_scope.arel.projections default_scope.arel.projections
) )
else else

View File

@@ -78,6 +78,9 @@ class Search
end end
end end
OMISSION = '...'
SCRUB_HEADLINE_REGEXP = /<span(?: \w+="[^"]+")* class="#{Search::HIGHLIGHT_CSS_CLASS}"(?: \w+="[^"]+")*>([^<]*)<\/span>/
def blurb(post) def blurb(post)
opts = { opts = {
term: @blurb_term, term: @blurb_term,
@@ -86,7 +89,10 @@ class Search
if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION
if SiteSetting.use_pg_headlines_for_excerpt if SiteSetting.use_pg_headlines_for_excerpt
return post.headline scrubbed_headline = post.headline.gsub(SCRUB_HEADLINE_REGEXP, '\1')
prefix_omission = scrubbed_headline.start_with?(post.leading_raw_data) ? '' : OMISSION
postfix_omission = scrubbed_headline.end_with?(post.trailing_raw_data) ? '' : OMISSION
return "#{prefix_omission}#{post.headline}#{postfix_omission}"
else else
opts[:cooked] = post.post_search_data.raw_data opts[:cooked] = post.post_search_data.raw_data
opts[:scrub] = false opts[:scrub] = false

View File

@@ -410,7 +410,7 @@ describe Search do
end end
let(:expected_blurb) do let(:expected_blurb) do
"hundred characters to satisfy any test conditions that require content longer than the typical test post raw content. It really is some long content, folks. <span class=\"search-highlight\">elephant</span>" "#{Search::GroupedSearchResults::OMISSION}hundred characters to satisfy any test conditions that require content longer than the typical test post raw content. It really is some long content, folks. <span class=\"#{Search::HIGHLIGHT_CSS_CLASS}\">elephant</span>"
end end
it 'returns the post' do it 'returns the post' do
@@ -429,7 +429,7 @@ describe Search do
expect(post.topic_title_headline).to eq(topic.fancy_title) expect(post.topic_title_headline).to eq(topic.fancy_title)
end end
it "it limits the headline to #{Search::MAX_LENGTH_FOR_HEADLINE} characters" do it "only applies highlighting to the first #{Search::MAX_LENGTH_FOR_HEADLINE} characters" do
SiteSetting.use_pg_headlines_for_excerpt = true SiteSetting.use_pg_headlines_for_excerpt = true
reply.update!(raw: "#{'a' * Search::MAX_LENGTH_FOR_HEADLINE} #{reply.raw}") reply.update!(raw: "#{'a' * Search::MAX_LENGTH_FOR_HEADLINE} #{reply.raw}")
@@ -443,6 +443,20 @@ describe Search do
expect(post.headline.include?('elephant')).to eq(false) expect(post.headline.include?('elephant')).to eq(false)
end end
it "limits the search headline to #{Search::GroupedSearchResults::BLURB_LENGTH} characters" do
SiteSetting.use_pg_headlines_for_excerpt = true
reply.update!(raw: "#{'a' * Search::GroupedSearchResults::BLURB_LENGTH} elephant")
result = Search.execute('elephant')
expect(result.posts.map(&:id)).to contain_exactly(reply.id)
post = result.posts.first
expect(result.blurb(post)).to eq("#{'a' * Search::GroupedSearchResults::BLURB_LENGTH}#{Search::GroupedSearchResults::OMISSION}")
end
it 'returns the right post and blurb for searches with phrase' do it 'returns the right post and blurb for searches with phrase' do
SiteSetting.use_pg_headlines_for_excerpt = true SiteSetting.use_pg_headlines_for_excerpt = true