mirror of
https://github.com/discourse/discourse.git
synced 2025-02-25 18:55:32 -06:00
FIX: remove diacritics when tokenizing html for search
This commit is contained in:
@@ -121,7 +121,8 @@ class SearchIndexer
|
|||||||
def self.index(obj, force: false)
|
def self.index(obj, force: false)
|
||||||
return if @disabled
|
return if @disabled
|
||||||
|
|
||||||
category_name, tag_names = nil
|
category_name = nil
|
||||||
|
tag_names = nil
|
||||||
topic = nil
|
topic = nil
|
||||||
|
|
||||||
if Topic === obj
|
if Topic === obj
|
||||||
@@ -148,8 +149,7 @@ class SearchIndexer
|
|||||||
|
|
||||||
if Topic === obj && (obj.saved_change_to_title? || force)
|
if Topic === obj && (obj.saved_change_to_title? || force)
|
||||||
if obj.posts
|
if obj.posts
|
||||||
post = obj.posts.find_by(post_number: 1)
|
if post = obj.posts.find_by(post_number: 1)
|
||||||
if post
|
|
||||||
SearchIndexer.update_posts_index(post.id, obj.title, category_name, tag_names, post.cooked)
|
SearchIndexer.update_posts_index(post.id, obj.title, category_name, tag_names, post.cooked)
|
||||||
SearchIndexer.update_topics_index(obj.id, obj.title, post.cooked)
|
SearchIndexer.update_topics_index(obj.id, obj.title, post.cooked)
|
||||||
end
|
end
|
||||||
@@ -175,43 +175,22 @@ class SearchIndexer
|
|||||||
def self.scrub(html)
|
def self.scrub(html)
|
||||||
me = new
|
me = new
|
||||||
parser = Nokogiri::HTML::SAX::Parser.new(me)
|
parser = Nokogiri::HTML::SAX::Parser.new(me)
|
||||||
begin
|
parser.parse("<div>#{html}</div>") if html.present?
|
||||||
copy = +"<div>"
|
|
||||||
copy << html unless html.nil?
|
|
||||||
copy << "</div>"
|
|
||||||
parser.parse(html) unless html.nil?
|
|
||||||
end
|
|
||||||
me.scrubbed
|
me.scrubbed
|
||||||
end
|
end
|
||||||
|
|
||||||
def start_element(name, attributes = [])
|
ATTRIBUTES ||= %w{alt title href data-youtube-title}
|
||||||
|
|
||||||
|
def start_element(_, attributes = [])
|
||||||
attributes = Hash[*attributes.flatten]
|
attributes = Hash[*attributes.flatten]
|
||||||
if attributes["alt"]
|
|
||||||
scrubbed << " "
|
ATTRIBUTES.each do |name|
|
||||||
scrubbed << attributes["alt"]
|
characters(attributes[name]) if attributes[name].present?
|
||||||
scrubbed << " "
|
|
||||||
end
|
|
||||||
if attributes["title"]
|
|
||||||
scrubbed << " "
|
|
||||||
scrubbed << attributes["title"]
|
|
||||||
scrubbed << " "
|
|
||||||
end
|
|
||||||
if attributes["data-youtube-title"]
|
|
||||||
scrubbed << " "
|
|
||||||
scrubbed << attributes["data-youtube-title"]
|
|
||||||
scrubbed << " "
|
|
||||||
end
|
|
||||||
if attributes["href"]
|
|
||||||
scrubbed << " "
|
|
||||||
scrubbed << attributes["href"]
|
|
||||||
scrubbed << " "
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def characters(string)
|
def characters(string)
|
||||||
scrubbed << " "
|
scrubbed << " #{ActiveSupport::Inflector.transliterate(string).strip} "
|
||||||
scrubbed << string
|
|
||||||
scrubbed << " "
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -29,6 +29,14 @@ describe SearchIndexer do
|
|||||||
expect(scrubbed).to eq(" http://meta.discourse.org/ link ")
|
expect(scrubbed).to eq(" http://meta.discourse.org/ link ")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'removes diacritics' do
|
||||||
|
html = "<p>Hétérogénéité</p>"
|
||||||
|
|
||||||
|
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||||
|
|
||||||
|
expect(scrubbed).to eq(" Heterogeneite ")
|
||||||
|
end
|
||||||
|
|
||||||
it 'correctly indexes a post according to version' do
|
it 'correctly indexes a post according to version' do
|
||||||
# Preparing so that they can be indexed to right version
|
# Preparing so that they can be indexed to right version
|
||||||
SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil)
|
SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil)
|
||||||
|
|||||||
Reference in New Issue
Block a user