FIX: Strip accents from search query

Commit 4481836 introduced accent stripping in search_indexer,
but we need to strip accents from the search query itself as well.

TODO in search with diacritics:
 - Still need to fix excerpts on search page
 - need to support accent stripping in in_topic search
 - need to make sure that in:title works correctly
 - need to fix "word boldening" in titles
This commit is contained in:
Daniel Hollas 2018-10-23 03:10:33 +02:00 committed by Sam
parent 7d2e582b28
commit cee51672c9
3 changed files with 36 additions and 13 deletions

View File

@ -167,8 +167,6 @@ class SearchIndexer
class HtmlScrubber < Nokogiri::XML::SAX::Document
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
attr_reader :scrubbed
def initialize(strip_diacritics: false)
@ -196,15 +194,8 @@ class SearchIndexer
end
end
# Returns a copy of +str+ with combining diacritical marks removed and
# leading/trailing whitespace stripped.
#
# The string is decomposed with NFKD so that accented letters split into a
# base letter followed by combining marks; every character matched by
# DIACRITICS is then deleted.
#
# Non-mutating gsub/strip are used on purpose: unicode_normalize can return
# the receiver itself (it does for US-ASCII encoded strings), so in-place
# edits could modify the caller's string or raise on a frozen one.
def strip_diacritics(str)
  str.unicode_normalize(:nfkd).gsub(DIACRITICS, "").strip
end
# Appends +str+ to the scrubbed output with a single space on each side.
# When @strip_diacritics is set, diacritics are stripped from the text first.
# NOTE(review): the text is passed through both the local strip_diacritics
# and Search.strip_diacritics; this looks like the old and new line of a diff
# shown together -- confirm which call is meant to remain.
def characters(str)
str = strip_diacritics(str) if @strip_diacritics
str = Search.strip_diacritics(str) if @strip_diacritics
scrubbed << " #{str} "
end
end

View File

@ -2,11 +2,19 @@ require_dependency 'search/grouped_search_results'
class Search
INDEX_VERSION = 2.freeze
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
# Returns the fixed value 5.
# NOTE(review): presumably the number of results shown per facet -- confirm against callers.
def self.per_facet
5
end
# Returns a copy of +str+ with combining diacritical marks removed and
# leading/trailing whitespace stripped.
#
# The string is decomposed with NFKD so that accented letters split into a
# base letter followed by combining marks; every character matched by
# DIACRITICS is then deleted.
#
# Non-mutating gsub/strip are used on purpose: unicode_normalize can return
# the receiver itself (it does for US-ASCII encoded strings), so in-place
# edits could modify the caller's string or raise on a frozen one.
def self.strip_diacritics(str)
  str.unicode_normalize(:nfkd).gsub(DIACRITICS, "").strip
end
# Returns the fixed value 50.
# NOTE(review): presumably the number of results returned when a filter is applied -- confirm against callers.
def self.per_filter
50
end
@ -59,6 +67,9 @@ class Search
end
data.force_encoding("UTF-8")
if SiteSetting.search_ignore_accents
data = strip_diacritics(data)
end
data
end

View File

@ -1020,21 +1020,42 @@ describe Search do
end
end
context 'diacritics' do
context 'ignore_diacritics' do
  before { SiteSetting.search_ignore_accents = true }

  let!(:post1) { Fabricate(:post, raw: 'สวัสดี Rágis hello') }

  it 'allows strips correctly' do
    # An unaccented word and the accent-free spelling must both find the post.
    %w[hello ragis].each do |term|
      found = Search.execute(term, type_filter: 'topic')
      expect(found.posts.length).to eq(1)
    end

    # The accented spelling must match as well.
    accented = Search.execute('Rágis', type_filter: 'topic', include_blurbs: true)
    expect(accented.posts.length).to eq(1)

    # TODO: this is a test we need to fix!
    #expect(accented.blurb(accented.posts.first)).to include('Rágis')

    # Text in a script without Latin accents is still searchable.
    thai = Search.execute('สวัสดี', type_filter: 'topic')
    expect(thai.posts.length).to eq(1)
  end
end
context 'include_diacritics' do
before { SiteSetting.search_ignore_accents = false }
let!(:post1) { Fabricate(:post, raw: 'สวัสดี Régis hello') }
it ('allows strips correctly') do
results = Search.execute('hello', type_filter: 'topic')
expect(results.posts.length).to eq(1)
# TODO when we add diacritic support we should return 1 here
results = Search.execute('regis', type_filter: 'topic')
expect(results.posts.length).to eq(0)
results = Search.execute('Régis', type_filter: 'topic', include_blurbs: true)
expect(results.posts.length).to eq(1)
# this is a test we got to keep working
expect(results.blurb(results.posts.first)).to include('Régis')
results = Search.execute('สวัสดี', type_filter: 'topic')