mirror of
https://github.com/discourse/discourse.git
synced 2025-02-25 18:55:32 -06:00
FEATURE: Use Postgres unaccent to ignore accents (#16100)
The search_ignore_accents site setting can be used to make the search indexer remove accents before indexing content. PostgreSQL's unaccent function is better suited for this than Ruby's unicode_normalize(:nfkd).
This commit is contained in:
@@ -17,10 +17,6 @@ class SearchIndexer
|
||||
@disabled = false
|
||||
end
|
||||
|
||||
# Converts a cooked HTML fragment into plain text suitable for search indexing.
# Delegates to HtmlScrubber; when strip_diacritics is truthy (defaulting to the
# search_ignore_accents site setting) accented characters are normalized away.
def self.scrub_html_for_search(html, strip_diacritics: SiteSetting.search_ignore_accents)
  scrub_options = { strip_diacritics: strip_diacritics }
  HtmlScrubber.scrub(html, **scrub_options)
end
|
||||
|
||||
def self.update_index(table: , id: , a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
|
||||
raw_data = [a_weight, b_weight, c_weight, d_weight]
|
||||
|
||||
@@ -35,10 +31,10 @@ class SearchIndexer
|
||||
stemmer = table == "user" ? "simple" : Search.ts_config
|
||||
|
||||
ranked_index = <<~SQL
|
||||
setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') ||
|
||||
setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') ||
|
||||
setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') ||
|
||||
setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
|
||||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:a,''))")}, 'A') ||
|
||||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:b,''))")}, 'B') ||
|
||||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:c,''))")}, 'C') ||
|
||||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
|
||||
SQL
|
||||
|
||||
ranked_params = {
|
||||
@@ -109,7 +105,7 @@ class SearchIndexer
|
||||
table: 'topic',
|
||||
id: topic_id,
|
||||
a_weight: title,
|
||||
b_weight: scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
|
||||
b_weight: HtmlScrubber.scrub(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
|
||||
)
|
||||
end
|
||||
|
||||
@@ -124,7 +120,7 @@ class SearchIndexer
|
||||
# the original string. Since there is no way to estimate the length of
|
||||
# the expected tsvector, we limit the input to ~50% of the maximum
|
||||
# length of a tsvector (1_048_576 bytes).
|
||||
d_weight: scrub_html_for_search(cooked)[0..600_000]
|
||||
d_weight: HtmlScrubber.scrub(cooked)[0..600_000]
|
||||
) do |params|
|
||||
params["private_message"] = private_message
|
||||
end
|
||||
@@ -294,12 +290,11 @@ class SearchIndexer
|
||||
|
||||
attr_reader :scrubbed
|
||||
|
||||
def initialize(strip_diacritics: false)
|
||||
def initialize
|
||||
@scrubbed = +""
|
||||
@strip_diacritics = strip_diacritics
|
||||
end
|
||||
|
||||
def self.scrub(html, strip_diacritics: false)
|
||||
def self.scrub(html)
|
||||
return +"" if html.blank?
|
||||
|
||||
begin
|
||||
@@ -338,9 +333,9 @@ class SearchIndexer
|
||||
end
|
||||
end
|
||||
|
||||
me = new(strip_diacritics: strip_diacritics)
|
||||
Nokogiri::HTML::SAX::Parser.new(me).parse(document.to_html)
|
||||
me.scrubbed.squish
|
||||
html_scrubber = new
|
||||
Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
|
||||
html_scrubber.scrubbed.squish
|
||||
end
|
||||
|
||||
MENTION_CLASSES ||= %w{mention mention-group}
|
||||
@@ -362,7 +357,6 @@ class SearchIndexer
|
||||
end
|
||||
|
||||
# SAX text-node callback: appends the node's text to the scrubbed buffer,
# space-padded so adjacent text nodes do not run together. When the scrubber
# was built with strip_diacritics, accents are removed from the text first.
def characters(str)
  text = @strip_diacritics ? Search.strip_diacritics(str) : str
  scrubbed << " #{text} "
end
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user