mirror of
https://github.com/discourse/discourse.git
synced 2024-11-22 08:57:10 -06:00
FIX: Strip accents from search query
4481836
introduced accent stripping in search_indexer,
but we need to strip it from the query itself as well
TODO in search with diacritics:
- Still need to fix excerpts on search page
- need to support accent stripping in in_topic search
- need to make sure that in:title works correctly
- need to fix "word boldening" in titles
This commit is contained in:
parent
7d2e582b28
commit
cee51672c9
@ -167,8 +167,6 @@ class SearchIndexer
|
||||
|
||||
class HtmlScrubber < Nokogiri::XML::SAX::Document
|
||||
|
||||
# Matches a single Unicode combining mark. One alternative per code-point range:
#   U+0300-U+036F  Combining Diacritical Marks
#   U+1AB0-U+1AFF  Combining Diacritical Marks Extended
#   U+1DC0-U+1DFF  Combining Diacritical Marks Supplement
#   U+20D0-U+20FF  Combining Diacritical Marks for Symbols
# `||=` keeps an already-defined value instead of reassigning it
# (presumably to avoid "already initialized constant" warnings on reload -- confirm).
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
|
||||
|
||||
attr_reader :scrubbed
|
||||
|
||||
def initialize(strip_diacritics: false)
|
||||
@ -196,15 +194,8 @@ class SearchIndexer
|
||||
end
|
||||
end
|
||||
|
||||
# Returns a copy of +str+ with accents removed.
#
# The text is first decomposed with Unicode NFKD normalization, so an accented
# letter becomes its base letter followed by separate combining marks; every
# character matched by DIACRITICS is then deleted and surrounding whitespace
# is trimmed. The argument itself is never modified.
#
# @param str [String] text to clean
# @return [String] a new, accent-free, stripped string
def strip_diacritics(str)
  decomposed = str.unicode_normalize(:nfkd)
  decomposed.gsub(DIACRITICS, "").strip
end
|
||||
|
||||
# SAX callback (see Nokogiri::XML::SAX::Document#characters): receives a run of
# text from the document and appends it to +scrubbed+, padded with a space on
# each side.
#
# When the scrubber was built with diacritic stripping enabled, the text is
# first passed through Search.strip_diacritics. That helper is applied exactly
# once: it is idempotent, so a second pass with an equivalent local helper
# would only repeat the normalization work.
#
# @param str [String] text content reported by the parser
def characters(str)
  str = Search.strip_diacritics(str) if @strip_diacritics
  scrubbed << " #{str} "
end
|
||||
end
|
||||
|
@ -2,11 +2,19 @@ require_dependency 'search/grouped_search_results'
|
||||
|
||||
class Search
|
||||
INDEX_VERSION = 2.freeze
|
||||
# Matches a single Unicode combining mark. One alternative per code-point range:
#   U+0300-U+036F  Combining Diacritical Marks
#   U+1AB0-U+1AFF  Combining Diacritical Marks Extended
#   U+1DC0-U+1DFF  Combining Diacritical Marks Supplement
#   U+20D0-U+20FF  Combining Diacritical Marks for Symbols
# `||=` keeps an already-defined value instead of reassigning it
# (presumably to avoid "already initialized constant" warnings on reload -- confirm).
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
|
||||
|
||||
# Returns the fixed value 5.
# NOTE(review): presumably the number of results kept per facet (result type) -- confirm against callers.
def self.per_facet
|
||||
5
|
||||
end
|
||||
|
||||
# Returns an accent-free copy of +str+.
#
# Steps, in order:
#   1. NFKD-normalize, which splits accented letters into a base letter plus
#      combining marks.
#   2. Delete every character matched by DIACRITICS.
#   3. Trim leading and trailing whitespace.
#
# The receiver is left untouched; a new String is always returned.
#
# @param str [String] text to clean
# @return [String] normalized text without combining marks
def self.strip_diacritics(str)
  str.unicode_normalize(:nfkd).gsub(DIACRITICS, "").strip
end
|
||||
|
||||
# Returns the fixed value 50.
# NOTE(review): presumably the number of results returned when a type filter is applied -- confirm against callers.
def self.per_filter
|
||||
50
|
||||
end
|
||||
@ -59,6 +67,9 @@ class Search
|
||||
end
|
||||
|
||||
data.force_encoding("UTF-8")
|
||||
if SiteSetting.search_ignore_accents
|
||||
data = strip_diacritics(data)
|
||||
end
|
||||
data
|
||||
end
|
||||
|
||||
|
@ -1020,21 +1020,42 @@ describe Search do
|
||||
end
|
||||
end
|
||||
|
||||
context 'diacritics' do
|
||||
# With search_ignore_accents enabled, a post containing an accented word must
# be found by its accented spelling, its plain spelling, and by unrelated
# ASCII and Thai terms in the same post.
context 'ignore_diacritics' do
  # Declared ahead of let! so the setting is switched on before the post is
  # fabricated (presumably so the post is indexed with accents stripped -- confirm).
  before do
    SiteSetting.search_ignore_accents = true
  end

  let!(:post1) { Fabricate(:post, raw: 'สวัสดี Rágis hello') }

  it('allows strips correctly') do
    # Plain ASCII word first, then the accent-free spelling of the accented word.
    %w[hello ragis].each do |term|
      found = Search.execute(term, type_filter: 'topic')
      expect(found.posts.length).to eq(1)
    end

    # The accented spelling in the query must match as well.
    with_blurbs = Search.execute('Rágis', type_filter: 'topic', include_blurbs: true)
    expect(with_blurbs.posts.length).to eq(1)

    # TODO: this is a test we need to fix!
    #expect(with_blurbs.blurb(with_blurbs.posts.first)).to include('Rágis')

    # Thai text must still be searchable with stripping enabled.
    thai = Search.execute('สวัสดี', type_filter: 'topic')
    expect(thai.posts.length).to eq(1)
  end
end
|
||||
|
||||
context 'include_diacritics' do
|
||||
before { SiteSetting.search_ignore_accents = false }
|
||||
let!(:post1) { Fabricate(:post, raw: 'สวัสดี Régis hello') }
|
||||
|
||||
it ('allows strips correctly') do
|
||||
results = Search.execute('hello', type_filter: 'topic')
|
||||
expect(results.posts.length).to eq(1)
|
||||
|
||||
# TODO when we add diacritic support we should return 1 here
|
||||
results = Search.execute('regis', type_filter: 'topic')
|
||||
expect(results.posts.length).to eq(0)
|
||||
|
||||
results = Search.execute('Régis', type_filter: 'topic', include_blurbs: true)
|
||||
expect(results.posts.length).to eq(1)
|
||||
|
||||
# this is a test we got to keep working
|
||||
expect(results.blurb(results.posts.first)).to include('Régis')
|
||||
|
||||
results = Search.execute('สวัสดี', type_filter: 'topic')
|
||||
|
Loading…
Reference in New Issue
Block a user