FEATURE: Use Postgres unaccent to ignore accents (#16100)

When the search_ignore_accents site setting is enabled, the search indexer
removes accents from the content before indexing it. This commit performs
that removal with PostgreSQL's unaccent function instead of Ruby's
unicode_normalize(:nfkd), because the unaccent function is better.
This commit is contained in:
Bianca Nenciu
2022-03-07 23:03:10 +02:00
committed by GitHub
parent 6e7cdc5bc3
commit 34b4b53bac
6 changed files with 75 additions and 53 deletions

View File

@@ -17,10 +17,6 @@ class SearchIndexer
@disabled = false
end
def self.scrub_html_for_search(html, strip_diacritics: SiteSetting.search_ignore_accents)
HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics)
end
def self.update_index(table: , id: , a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
raw_data = [a_weight, b_weight, c_weight, d_weight]
@@ -35,10 +31,10 @@ class SearchIndexer
stemmer = table == "user" ? "simple" : Search.ts_config
ranked_index = <<~SQL
setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') ||
setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') ||
setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') ||
setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:a,''))")}, 'A') ||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:b,''))")}, 'B') ||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:c,''))")}, 'C') ||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
SQL
ranked_params = {
@@ -109,7 +105,7 @@ class SearchIndexer
table: 'topic',
id: topic_id,
a_weight: title,
b_weight: scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
b_weight: HtmlScrubber.scrub(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
)
end
@@ -124,7 +120,7 @@ class SearchIndexer
# the original string. Since there is no way to estimate the length of
# the expected tsvector, we limit the input to ~50% of the maximum
# length of a tsvector (1_048_576 bytes).
d_weight: scrub_html_for_search(cooked)[0..600_000]
d_weight: HtmlScrubber.scrub(cooked)[0..600_000]
) do |params|
params["private_message"] = private_message
end
@@ -294,12 +290,11 @@ class SearchIndexer
attr_reader :scrubbed
def initialize(strip_diacritics: false)
def initialize
@scrubbed = +""
@strip_diacritics = strip_diacritics
end
def self.scrub(html, strip_diacritics: false)
def self.scrub(html)
return +"" if html.blank?
begin
@@ -338,9 +333,9 @@ class SearchIndexer
end
end
me = new(strip_diacritics: strip_diacritics)
Nokogiri::HTML::SAX::Parser.new(me).parse(document.to_html)
me.scrubbed.squish
html_scrubber = new
Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
html_scrubber.scrubbed.squish
end
MENTION_CLASSES ||= %w{mention mention-group}
@@ -362,7 +357,6 @@ class SearchIndexer
end
def characters(str)
str = Search.strip_diacritics(str) if @strip_diacritics
scrubbed << " #{str} "
end
end