2019-04-29 19:27:42 -05:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2015-10-11 04:41:23 -05:00
|
|
|
require 'rails_helper'
|
2014-06-24 02:10:56 -05:00
|
|
|
|
2016-12-21 20:13:14 -06:00
|
|
|
describe SearchIndexer do
|
2017-08-16 06:38:34 -05:00
|
|
|
# Arbitrary post id used by the examples that write index rows directly.
let(:post_id) { 99 }

# The indexer is switched on before each example and off again afterwards.
before { SearchIndexer.enable }
after { SearchIndexer.disable }
|
|
|
|
|
2018-09-17 03:31:15 -05:00
|
|
|
# Spec helper: forwards +html+ to SearchIndexer.scrub_html_for_search, always
# passing +strip_diacritics+ explicitly (false unless the caller overrides it).
def scrub(html, strip_diacritics: false)
  SearchIndexer.scrub_html_for_search(
    html,
    strip_diacritics: strip_diacritics
  )
end
|
|
|
|
|
2014-06-24 02:10:56 -05:00
|
|
|
it 'correctly indexes chinese' do
  SiteSetting.default_locale = 'zh_CN'

  # Only the fifth argument carries text; the other text arguments are empty.
  SearchIndexer.update_posts_index(post_id, "", "", "", "你好世界")

  indexed_row = PostSearchData.find_by(post_id: post_id)

  # The unspaced input is expected to be stored as two separate words.
  expect(indexed_row).to have_attributes(
    raw_data: "你好 世界",
    search_data: "'世界':2 '你好':1"
  )
end
|
|
|
|
|
2018-04-26 00:46:52 -05:00
|
|
|
it 'extract youtube title' do
  # The placeholder div has no text content; the expected output is the value
  # of its data-youtube-title attribute.
  lazy_yt_markup = "<div class=\"lazyYT\" data-youtube-id=\"lmFgeFh2nlw\" data-youtube-title=\"Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]\" data-width=\"480\" data-height=\"270\" data-parameters=\"feature=oembed&wmode=opaque\"></div>"

  expect(scrub(lazy_yt_markup)).to eq(
    "Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]"
  )
end
|
|
|
|
|
2018-08-19 19:39:19 -05:00
|
|
|
it 'extract a link' do
  anchor_markup = "<a href='http://meta.discourse.org/'>link</a>"

  # Both the href and the anchor text are expected, href first.
  expect(scrub(anchor_markup)).to eq("http://meta.discourse.org/ link")
end
|
|
|
|
|
2019-04-29 10:15:55 -05:00
|
|
|
it 'extracts @username from mentions' do
  mention_markup = '<p><a class="mention" href="/u/%E7%8B%AE%E5%AD%90">@狮子</a> <a class="mention" href="/u/foo">@foo</a></p>'

  # Only the visible @names are expected; the /u/... hrefs must not show up.
  expect(scrub(mention_markup)).to eq('@狮子 @foo')
end
|
|
|
|
|
|
|
|
it 'extracts @groupname from group mentions' do
  group_mention_markup = '<p><a class="mention-group" href="/groups/%D0%B0%D0%B2%D1%82%D0%BE%D0%BC%D0%BE%D0%B1%D0%B8%D0%BB%D0%B8%D1%81%D1%82">@автомобилист</a></p>'

  # Only the visible @group name is expected, not the percent-encoded href.
  expect(scrub(group_mention_markup)).to eq('@автомобилист')
end
|
|
|
|
|
2019-04-29 10:26:29 -05:00
|
|
|
it 'extracts emoji name from emoji image' do
  emoji_markup = %Q|<img src="#{Discourse.base_url_no_prefix}/images/emoji/twitter/wink.png?v=9" title=":wink:" class="emoji" alt=":wink:">|

  # The emoji image is expected to collapse to its :name: alone.
  expect(scrub(emoji_markup)).to eq(':wink:')
end
|
|
|
|
|
2018-09-17 03:31:15 -05:00
|
|
|
it 'uses ignore_accent setting to strip diacritics' do
  html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"

  # Setting value => expected scrubbed text. Checked in insertion order
  # (true first, then false).
  expected_by_setting = {
    true => "HELLO Heterogeneite Здравствуите هتاف للترحيب 你好",
    false => "HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好"
  }

  expected_by_setting.each do |ignore_accents, expected_text|
    SiteSetting.search_ignore_accents = ignore_accents

    # Called directly, without strip_diacritics, so the result is expected to
    # follow the site setting rather than an explicit argument.
    expect(SearchIndexer.scrub_html_for_search(html)).to eq(expected_text)
  end
end
|
|
|
|
|
2018-09-13 11:53:53 -05:00
|
|
|
it "doesn't index local files" do
  base_url = Discourse.base_url_no_prefix

  # One external image, one locally hosted upload, and one lightboxed upload
  # with its metadata block.
  markup = <<~HTML
    <p><img src="https://www.discourse.org/logo.png" alt="Discourse"></p>
    <p><img src="#{base_url}/uploads/episodeinteractive/original/3X/0/f/0f40b818356bdc1d80acfa905034e95cfd112a3a.png" alt="51%20PM" width="289" height="398"></p>
    <div class="lightbox-wrapper">
    <a class="lightbox" href="#{base_url}/uploads/episodeinteractive/original/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b.jpg" data-download-href="#{base_url}/uploads/episodeinteractive/16790095df3baf318fb2eb1d7e5d7860dc45d48b" title="Untitled design (21).jpg" rel="nofollow noopener">
    <img src="#{base_url}/uploads/episodeinteractive/optimized/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b_1_563x500.jpg" alt="Untitled%20design%20(21)" width="563" height="500">
    <div class="meta">
    <svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use xlink:href="#far-image"></use></svg>
    <span class="filename">Untitled design (21).jpg</span>
    <span class="informations">1280x1136 472 KB</span>
    <svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use xlink:href="#discourse-expand"></use></svg>
    </div>
    </a>
    </div>
  HTML

  # Only the three alt texts are expected; upload URLs, the lightbox title,
  # the filename and the size information must not appear.
  expect(scrub(markup)).to eq("Discourse 51%20PM Untitled%20design%20(21)")
end
|
|
|
|
|
2017-08-16 06:38:34 -05:00
|
|
|
it 'correctly indexes a post according to version' do
  # Seed an index row, then force its version to -1 so that the next update
  # is expected to rewrite the row and stamp the current index version.
  SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil)
  PostSearchData.find_by(post_id: post_id).update!(version: -1)

  SearchIndexer.update_posts_index(post_id, "", "", nil, "<a>This</a> is a test")

  indexed_row = PostSearchData
    .where(post_id: post_id)
    .pluck(:raw_data, :locale, :version)
    .first

  expect(indexed_row).to eq(
    ["This is a test", SiteSetting.default_locale, SearchIndexer::POST_INDEX_VERSION]
  )
end
|
2019-03-19 04:16:57 -05:00
|
|
|
|
|
|
|
describe '.index' do
|
2020-07-27 02:22:54 -05:00
|
|
|
# The title wording matters: several examples in this group assert lexemes
# derived from it ('titl':4A, 'test':8A).
let(:topic) do
  Fabricate(:topic, title: "this is a title that I am testing")
end

let(:post) do
  Fabricate(:post, topic: topic)
end
|
2019-03-19 04:16:57 -05:00
|
|
|
|
|
|
|
it 'should index posts correctly' do
  # First reference to the lazy `post` fabricates it, which should add a row.
  expect { post }.to change { PostSearchData.count }.by(1)

  # Re-reads the indexed data from the database on every evaluation.
  indexed_search_data = -> { post.reload.post_search_data.search_data }

  expect do
    post.update!(raw: "this is new content")
  end.to change(&indexed_search_data)

  expect do
    post.update!(topic_id: Fabricate(:topic).id)
  end.to change(&indexed_search_data)
end
|
2019-03-31 21:06:27 -05:00
|
|
|
|
|
|
|
it 'should not index posts with empty raw' do
  # The post is built and then saved with validations skipped, since its raw
  # is empty.
  expect {
    Fabricate
      .build(:post, raw: "", post_type: Post.types[:small_action])
      .save!(validate: false)
  }.not_to change { PostSearchData.count }
end
|
2019-03-31 21:14:29 -05:00
|
|
|
|
|
|
|
# Indexes a post whose raw is a bare image URL and checks that the URL is
# stored once in raw_data and appears in search_data as the host/path lexemes
# listed below.
it "should not tokenize urls and duplicate title and href in <a>" do
  post.update!(raw: <<~RAW)
    https://meta.discourse.org/some.png
  RAW

  post.rebake!
  post.reload

  search_data_row = post.post_search_data

  expect(search_data_row.raw_data).to eq(
    "https://meta.discourse.org/some.png"
  )

  # 'titl'/'test' come from the topic title defined by let(:topic);
  # 'uncategor' presumably comes from the default category name.
  expect(search_data_row.search_data).to eq(
    "'/some.png':12 'discourse.org':11 'meta.discourse.org':11 'meta.discourse.org/some.png':10 'org':11 'test':8A 'titl':4A 'uncategor':9B"
  )
end
|
|
|
|
|
2020-07-27 01:46:44 -05:00
|
|
|
it 'should not tokenize versions' do
  post.update!(raw: '123.223')

  # Dotted numbers are expected to survive as a single lexeme.
  first_index = post.post_search_data.search_data
  expect(first_index).to eq("'123.223':10 'test':8A 'titl':4A 'uncategor':9B")

  post.update!(raw: '15.2.231.423')

  # Reload before the second read; presumably needed because the
  # post_search_data association was already loaded above.
  second_index = post.reload.post_search_data.search_data
  expect(second_index).to eq("'15.2.231.423':10 'test':8A 'titl':4A 'uncategor':9B")
end
|
|
|
|
|
2020-07-09 04:02:02 -05:00
|
|
|
# Indexes a post containing two URLs with query strings and one bare
# host/path, then checks the stored raw_data and the generated lexemes.
it 'should tokenize host of a URL and removes query string' do
  category = Fabricate(:category, name: 'awesome category')

  # These locals intentionally replace the group's let(:topic)/let(:post) so
  # the title and category lexemes below are specific to this example.
  topic = Fabricate(:topic, category: category, title: 'this is a test topic')

  post = Fabricate(:post, topic: topic, raw: <<~RAW)
    a https://cnn.com?bob=1, http://stuff.com.au?bill=1 b abc.net/xyz=1
  RAW

  post.rebake!
  post.reload

  # Query strings (?bob=1, ?bill=1) are expected to be dropped from raw_data.
  expect(post.post_search_data.raw_data).to eq(
    "a https://cnn.com , http://stuff.com.au b http://abc.net/xyz=1 abc.net/xyz=1"
  )

  expect(post.post_search_data.search_data).to eq(
    "'/xyz=1':14,17 'abc.net':13,16 'abc.net/xyz=1':12,15 'au':10 'awesom':6B 'b':11 'categori':7B 'cnn.com':9 'com':9 'com.au':10 'net':13,16 'stuff.com.au':10 'test':4A 'topic':5A"
  )
end
|
|
|
|
|
|
|
|
# Cooks a post with a remote image that ends up lightboxed and checks that
# the indexed raw_data holds only the post text plus the image alt and title.
it 'should not include lightbox in search' do
  Jobs.run_immediately!
  SiteSetting.crawl_images = true
  # NOTE(review): presumably set below the image's declared width (2) so the
  # image gets wrapped in a lightbox; the cooked-HTML expectation confirms
  # the wrapper is present.
  SiteSetting.max_image_width = 1

  # Single definition of the image URL, shared by the HTTP stub and the raw.
  src = "https://meta.discourse.org/some.png"

  stub_request(:get, src)
    .to_return(status: 200, body: file_from_fixtures("logo.png").read)

  post = Fabricate(:post, raw: <<~RAW)
    Let me see how I can fix this image
    <img src="#{src}" title="GOT" alt="white walkers" width="2" height="2">
  RAW

  post.rebake!
  post.reload

  # Guard: the lightbox markup must be in the cooked HTML, otherwise the
  # raw_data check below would not exercise lightbox handling.
  expect(post.cooked).to include(
    CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS
  )

  expect(post.post_search_data.raw_data).to eq(
    "Let me see how I can fix this image white walkers GOT"
  )
end
|
2019-03-19 04:16:57 -05:00
|
|
|
end
|
2019-04-01 20:52:59 -05:00
|
|
|
|
|
|
|
describe '.queue_post_reindex' do
  let(:post) { Fabricate(:post) }
  let(:topic) { post.topic }

  it 'should reset the version of search data for all posts in the topic' do
    # Fabricated independently of `topic`; presumably it lands in a different
    # topic, which is why its version is expected to stay untouched.
    other_post = Fabricate(:post)

    SearchIndexer.queue_post_reindex(topic.id)

    reindexed_version = post.reload.post_search_data.version
    expect(reindexed_version).to eq(SearchIndexer::REINDEX_VERSION)

    untouched_version = other_post.reload.post_search_data.version
    expect(untouched_version).to eq(SearchIndexer::POST_INDEX_VERSION)
  end
end
|
2014-06-24 02:10:56 -05:00
|
|
|
end
|