Special case: When crawling a link to an image, just put the filename as the title.
This commit is contained in:
Robin Ward 2014-04-10 13:45:13 -04:00
parent 99e2bab62d
commit e80851b0fa

View File

@ -89,16 +89,27 @@ module Jobs
crawled = false crawled = false
result = CrawlTopicLink.fetch_beginning(topic_link.url) # Special case: Images
doc = Nokogiri::HTML(result) # If the link is to an image, put the filename as the title
if doc if topic_link.url =~ /\.(jpg|gif|png)$/
title = doc.at('title').try(:inner_text) uri = URI(topic_link.url)
if title.present? filename = File.basename(uri.path)
title.gsub!(/\n/, ' ') crawled = (TopicLink.where(id: topic_link.id).update_all(["title = ?, crawled_at = CURRENT_TIMESTAMP", filename]) == 1)
title.gsub!(/ +/, ' ') end
title.strip!
unless crawled
# Fetch the beginning of the document to find the title
result = CrawlTopicLink.fetch_beginning(topic_link.url)
doc = Nokogiri::HTML(result)
if doc
title = doc.at('title').try(:inner_text)
if title.present? if title.present?
crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..255]]) == 1) title.gsub!(/\n/, ' ')
title.gsub!(/ +/, ' ')
title.strip!
if title.present?
crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..255]]) == 1)
end
end end
end end
end end