FEATURE: Use Postgres unaccent to ignore accents (#16100)

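The search_ignore_accents site setting makes the search indexer remove accents from content before indexing it. With this change the accents are removed in the database, by applying PostgreSQL's unaccent function to the text passed to to_tsvector, instead of in Ruby while the HTML is being scrubbed. The unaccent function from PostgreSQL is better than Ruby's unicode_normalize(:nfkd).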
2025-02-25 18:55:32 -06:00 · 2022-03-07 23:03:10 +02:00
parent 6e7cdc5bc3
commit 34b4b53bac
6 changed files with 75 additions and 53 deletions
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -17,10 +17,6 @@ class SearchIndexer
    @disabled = false
  end

-  def self.scrub_html_for_search(html, strip_diacritics: SiteSetting.search_ignore_accents)
-    HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics)
-  end
-
  def self.update_index(table: , id: , a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
    raw_data = [a_weight, b_weight, c_weight, d_weight]

@@ -35,10 +31,10 @@ class SearchIndexer
    stemmer = table == "user" ? "simple" : Search.ts_config

    ranked_index = <<~SQL
-      setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') ||
-      setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') ||
-      setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') ||
-      setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
+      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:a,''))")}, 'A') ||
+      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:b,''))")}, 'B') ||
+      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:c,''))")}, 'C') ||
+      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
    SQL

    ranked_params = {
@@ -109,7 +105,7 @@ class SearchIndexer
      table: 'topic',
      id: topic_id,
      a_weight: title,
-      b_weight: scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
+      b_weight: HtmlScrubber.scrub(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
    )
  end

@@ -124,7 +120,7 @@ class SearchIndexer
      # the original string. Since there is no way to estimate the length of
      # the expected tsvector, we limit the input to ~50% of the maximum
      # length of a tsvector (1_048_576 bytes).
-      d_weight: scrub_html_for_search(cooked)[0..600_000]
+      d_weight: HtmlScrubber.scrub(cooked)[0..600_000]
    ) do |params|
      params["private_message"] = private_message
    end
@@ -294,12 +290,11 @@ class SearchIndexer

    attr_reader :scrubbed

-    def initialize(strip_diacritics: false)
+    def initialize
      @scrubbed = +""
-      @strip_diacritics = strip_diacritics
    end

-    def self.scrub(html, strip_diacritics: false)
+    def self.scrub(html)
      return +"" if html.blank?

      begin
@@ -338,9 +333,9 @@ class SearchIndexer
        end
      end

-      me = new(strip_diacritics: strip_diacritics)
-      Nokogiri::HTML::SAX::Parser.new(me).parse(document.to_html)
-      me.scrubbed.squish
+      html_scrubber = new
+      Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
+      html_scrubber.scrubbed.squish
    end

    MENTION_CLASSES ||= %w{mention mention-group}
@@ -362,7 +357,6 @@ class SearchIndexer
    end

    def characters(str)
-      str = Search.strip_diacritics(str) if @strip_diacritics
      scrubbed << " #{str} "
    end
  end