From bc7b530b0a60423f0aadc423bccb54672a655dd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Fri, 24 Aug 2018 00:38:44 +0200 Subject: [PATCH] FIX: remove diacritics instead of transliterating --- app/services/search_indexer.rb | 4 +++- spec/services/search_indexer_spec.rb | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index e4f16c52f19..7ccc46d6f9b 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -190,8 +190,10 @@ class SearchIndexer end end + DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/ + def characters(string) - scrubbed << " #{ActiveSupport::Inflector.transliterate(string).strip} " + scrubbed << " #{string.unicode_normalize(:nfd).gsub(DIACRITICS, "").strip} " end end end diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index 8b6bcae498f..42a61fa3675 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -30,11 +30,11 @@ describe SearchIndexer do end it 'removes diacritics' do - html = "

Hétérogénéité

" + html = "

HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好

" scrubbed = SearchIndexer::HtmlScrubber.scrub(html) - expect(scrubbed).to eq(" Heterogeneite ") + expect(scrubbed).to eq(" HELLO Heterogeneite Здравствуите هتاف للترحيب 你好 ") end it 'correctly indexes a post according to version' do