From a57f80204821381d801b9d228d6cda08247487a4 Mon Sep 17 00:00:00 2001 From: Robin Ward Date: Thu, 17 Apr 2014 14:00:22 -0400 Subject: [PATCH] If there's a `TopicEmbed` record for a URL, we don't have to crawl it. This should help sites like Boing Boing where links are sometimes crawled before they are saved in WordPress. --- app/jobs/regular/crawl_topic_link.rb | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/app/jobs/regular/crawl_topic_link.rb b/app/jobs/regular/crawl_topic_link.rb index deb15bef548..c8273473dba 100644 --- a/app/jobs/regular/crawl_topic_link.rb +++ b/app/jobs/regular/crawl_topic_link.rb @@ -83,10 +83,17 @@ module Jobs def execute(args) raise Discourse::InvalidParameters.new(:topic_link_id) unless args[:topic_link_id].present? - begin - topic_link = TopicLink.where(id: args[:topic_link_id], internal: false, crawled_at: nil).first - return if topic_link.blank? + topic_link = TopicLink.where(id: args[:topic_link_id], internal: false, crawled_at: nil).first + return if topic_link.blank? + # Look for a topic embed for the URL. If it exists, use its title and don't crawl + topic_embed = TopicEmbed.where(embed_url: topic_link.url).includes(:topic).references(:topic).first + if topic_embed.present? + TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', topic_embed.topic.title[0..255]]) + return + end + + begin crawled = false # Special case: Images