2019-05-02 17:17:27 -05:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2013-02-05 13:16:51 -06:00
|
|
|
require 'uri'
|
|
|
|
|
|
|
|
class TopicLink < ActiveRecord::Base
|
2015-09-25 13:07:04 -05:00
|
|
|
|
|
|
|
# Upper bound on the length of a link's host.
# `ensure_entry_for` skips any link whose parsed host is longer than this.
#
# @return [Integer]
def self.max_domain_length
  100
end
|
|
|
|
|
|
|
|
# Upper bound on the length of a stored URL.
# `ensure_entry_for` truncates URLs to this many characters before storing them.
#
# @return [Integer]
def self.max_url_length
  500
end
|
2014-06-25 20:38:23 -05:00
|
|
|
|
2013-02-05 13:16:51 -06:00
|
|
|
# The topic, author and post that contain the link.
belongs_to :topic
belongs_to :user
belongs_to :post

# The topic / post the URL resolves to (populated by `ensure_entry_for`).
belongs_to :link_topic, class_name: 'Topic'
belongs_to :link_post, class_name: 'Post'

validates_presence_of :url
# Use the shared class-level limit rather than repeating the literal.
validates_length_of :url, maximum: max_url_length
validates_uniqueness_of :url, scope: [:topic_id, :post_id]

has_many :topic_link_clicks, dependent: :destroy

validate :link_to_self

# Enqueues the title crawl once the record has been committed.
after_commit :crawl_link_title
|
|
|
|
|
2013-02-05 13:16:51 -06:00
|
|
|
# Make sure a topic can't link to itself.
#
# Validation hook: adds a `:base` error when the topic that contains the link
# (`topic_id`) is also the topic being linked to (`link_topic_id`).
def link_to_self
  return unless topic_id == link_topic_id

  errors.add(:base, "can't link to the same topic")
end
|
|
|
|
|
2013-11-15 11:15:46 -06:00
|
|
|
# Builds the link summary for one topic: at most 50 links, grouped by URL and
# target, ordered by total clicks and then by how many rows were grouped.
#
# Sam: complicated reports are really hard in AR
#
# @param guardian an object responding to `secure_category_ids`; the ids are
#   handed to `builder.secure_category` (presumably to hide links into
#   categories the viewer cannot see — confirm against the builder).
# @param topic_id [Integer] id of the topic whose links are summarised
# @return the result of `builder.query`, one row per grouped link exposing
#   url, title, link_topic_id, reflection, internal, domain, user_id, clicks
def self.topic_map(guardian, topic_id)
  builder = DB.build(<<~SQL)
    SELECT ftl.url,
           COALESCE(ft.title, ftl.title) AS title,
           ftl.link_topic_id,
           ftl.reflection,
           ftl.internal,
           ftl.domain,
           MIN(ftl.user_id) AS user_id,
           SUM(clicks) AS clicks
    FROM topic_links AS ftl
    LEFT JOIN topics AS ft ON ftl.link_topic_id = ft.id
    LEFT JOIN categories AS c ON c.id = ft.category_id
    /*where*/
    GROUP BY ftl.url, ft.title, ftl.title, ftl.link_topic_id, ftl.reflection, ftl.internal, ftl.domain
    ORDER BY clicks DESC, count(*) DESC
    LIMIT 50
  SQL

  builder.where('ftl.topic_id = :topic_id', topic_id: topic_id)
  # Links to deleted topics are dropped; external links (no joined topic) pass.
  builder.where('ft.deleted_at IS NULL')

  # note that ILIKE means "case insensitive LIKE"
  builder.where("NOT(ftl.url ILIKE '%.png' OR ftl.url ILIKE '%.jpg' OR ftl.url ILIKE '%.gif')")

  # Never surface links that point into private messages.
  builder.where("COALESCE(ft.archetype, 'regular') <> :archetype", archetype: Archetype.private_message)

  # Row-level filter: only links that have been clicked at least once.
  builder.where("clicks > 0")

  builder.secure_category(guardian.secure_category_ids)

  builder.query
end
|
|
|
|
|
2017-07-27 20:20:09 -05:00
|
|
|
# Collects the links found in a set of posts, keyed by post id.
#
# Sam: this is not tidy in AR and also happens to be a critical path
# for topic view
#
# @param guardian an object responding to `secure_category_ids`; the ids are
#   handed to `builder.secure_category`.
# @param topic unused by this method; kept so existing callers keep working
# @param posts [Array] objects responding to `id`; blank input short-circuits
# @return [Hash{Integer => Array<Hash>}] post id => list of
#   `{ url:, clicks:, title:, internal:, reflection: }`, in query order
#   (non-reflections first, then by clicks descending). `{}` when `posts` is blank.
def self.counts_for(guardian, topic, posts)
  return {} if posts.blank?

  # NOTE(review): l.domain is selected but not copied into the result below.
  builder = DB.build(<<~SQL)
    SELECT
      l.post_id,
      l.url,
      l.clicks,
      COALESCE(t.title, l.title) AS title,
      l.internal,
      l.reflection,
      l.domain
    FROM topic_links l
    LEFT JOIN topics t ON t.id = l.link_topic_id
    LEFT JOIN categories AS c ON c.id = t.category_id
    /*where*/
    ORDER BY reflection ASC, clicks DESC
  SQL

  builder.where('t.deleted_at IS NULL')
  builder.where("COALESCE(t.archetype, 'regular') <> :archetype", archetype: Archetype.private_message)

  # Ids are read from the loaded posts; the original author was not certain
  # pluck would be right here, because it may interfere with caching.
  builder.where('l.post_id in (:post_ids)', post_ids: posts.map(&:id))

  builder.secure_category(guardian.secure_category_ids)

  builder.query.each_with_object({}) do |link, result|
    (result[link.post_id] ||= []) << {
      url: link.url,
      clicks: link.clicks,
      title: link.title,
      internal: link.internal,
      reflection: link.reflection
    }
  end
end
|
|
|
|
|
2013-02-05 13:16:51 -06:00
|
|
|
# Synchronises the stored topic links for a post with the links currently
# present in its cooked HTML.
#
# Every distinct, parseable, non-mailto link is passed to `ensure_entry_for`
# inside its own transaction; afterwards `cleanup_entries` removes the rows
# that no longer correspond to a link in the post.
#
# @param post the post to scan; nothing happens when it is blank, is a
#   whisper, or has no user_id
# @return the result of `cleanup_entries`, or nil when the post is skipped
def self.extract_from(post)
  return if post.blank? || post.whisper? || post.user_id.blank?

  current_urls = []
  reflected_ids = []

  PrettyText
    .extract_links(post.cooked)
    .map { |link| [link, UrlHelper.relaxed_parse(link.url)] }
    .reject { |_, parsed| parsed.nil? || "mailto" == parsed.scheme }
    .uniq { |_, parsed| parsed }
    .each do |link, parsed|
      TopicLink.transaction do
        begin
          url, reflected_id = self.ensure_entry_for(post, link, parsed)
          current_urls << url unless url.nil?
          reflected_ids << reflected_id unless reflected_id.nil?
        rescue URI::Error
          # if the URI is invalid, don't store it.
        rescue ActionController::RoutingError
          # If we can't find the route, no big deal
        end
      end
    end

  self.cleanup_entries(post, current_urls, reflected_ids)
end
|
2014-04-05 13:47:25 -05:00
|
|
|
|
2020-05-13 01:05:39 -05:00
|
|
|
# Enqueues the `:crawl_topic_link` job for the given topic link.
#
# @param topic_link_id [Integer] id of the topic_links row to crawl
# @return whatever `Jobs.enqueue` returns
def self.crawl_link_title(topic_link_id)
  Jobs.enqueue(:crawl_topic_link, topic_link_id: topic_link_id)
end
|
|
|
|
|
2014-04-05 13:47:25 -05:00
|
|
|
# `after_commit` hook: enqueue the title crawl for this record by delegating
# to the class-level method with this record's id.
def crawl_link_title
  TopicLink.crawl_link_title(id)
end
|
2016-06-06 15:58:35 -05:00
|
|
|
|
|
|
|
# Builds a lookup of the links already posted in a topic, keyed by a
# normalised form of the URL (lower-cased, without the http/https scheme and
# without a trailing slash).
#
# Only non-reflection links whose post and user both exist are considered,
# and only the last 200 of those. When two links normalise to the same key,
# the later one wins.
#
# @param topic an object responding to `id`
# @return [Hash{String => Hash}] normalised url =>
#   `{ domain:, username:, posted_at:, post_number: }`
def self.duplicate_lookup(topic)
  results = TopicLink
    .includes(:post, :user)
    .joins(:post, :user)
    .where("posts.id IS NOT NULL AND users.id IS NOT NULL")
    .where(topic_id: topic.id, reflection: false)
    .last(200)

  results.each_with_object({}) do |tl, lookup|
    # \A and \z anchor to the whole string; ^ and $ would also match at
    # line boundaries inside a URL that contains a newline.
    normalized = tl.url.downcase.sub(%r{\Ahttps?://}, '').sub(%r{/\z}, '')

    lookup[normalized] = {
      domain: tl.domain,
      username: tl.user.username_lower,
      posted_at: tl.post.created_at,
      post_number: tl.post.post_number
    }
  end
end
|
2018-12-05 11:21:50 -06:00
|
|
|
|
|
|
|
private
|
|
|
|
|
2020-05-13 01:05:39 -05:00
|
|
|
# This pattern is used to create topic links very efficiently with minimal
# errors under heavy concurrent use
#
# It avoids a SELECT to find out if the record is there and minimizes all
# the work it needs to do in case a record is missing
#
# It handles calling the required callback and has parity with Rails implementation
#
# Usually we would rely on ActiveRecord but in this case we have had lots of churn
# around creation of topic links leading to hard to debug log messages in production
#
# The statement inserts the row with ON CONFLICT DO NOTHING and returns two
# values: the id of the new row, or of the existing row matching
# (post_id, topic_id, url), and whether a row was actually inserted. The title
# crawl is scheduled via `DB.after_commit` only when a row was inserted.
#
# NOTE(review): the bare `private` preceding this method does not apply to
# `def self.` methods, so this is still publicly callable — confirm whether
# `private_class_method` was intended.
#
# @param post_id [Integer, nil]
# @param user_id [Integer]
# @param topic_id [Integer]
# @param url [String]
# @param domain [String, nil] falls back to `Discourse.current_hostname`
# @param internal [Boolean]
# @param link_topic_id [Integer, nil]
# @param link_post_id [Integer, nil]
# @param quote [Boolean]
# @param extension [String, nil]
# @param reflection [Boolean]
# @return [Integer, nil] the first value returned by the statement
def self.safe_create_topic_link(
  post_id:,
  user_id:,
  topic_id:,
  url:,
  domain: nil,
  internal: false,
  link_topic_id: nil,
  link_post_id: nil,
  quote: false,
  extension: nil,
  reflection: false
)

  domain ||= Discourse.current_hostname

  sql = <<~SQL
    WITH new_row AS(
      INSERT INTO topic_links(
        post_id,
        user_id,
        topic_id,
        url,
        domain,
        internal,
        link_topic_id,
        link_post_id,
        quote,
        extension,
        reflection,
        created_at,
        updated_at
      ) VALUES (
        :post_id,
        :user_id,
        :topic_id,
        :url,
        :domain,
        :internal,
        :link_topic_id,
        :link_post_id,
        :quote,
        :extension,
        :reflection,
        :now,
        :now
      )
      ON CONFLICT DO NOTHING
      RETURNING id
    )
    SELECT COALESCE(
      (SELECT id FROM new_row),
      (SELECT id FROM topic_links WHERE post_id = :post_id AND topic_id = :topic_id AND url = :url)
    ), (SELECT id FROM new_row) IS NOT NULL
  SQL

  topic_link_id, new_record = DB.query_single(sql,
    post_id: post_id,
    user_id: user_id,
    topic_id: topic_id,
    url: url,
    domain: domain,
    internal: internal,
    link_topic_id: link_topic_id,
    link_post_id: link_post_id,
    quote: quote,
    extension: extension,
    reflection: reflection,
    now: Time.now
  )

  # Stand-in for the model's after_commit callback, which raw SQL bypasses.
  if new_record
    DB.after_commit do
      crawl_link_title(topic_link_id)
    end
  end

  topic_link_id
end
|
|
|
|
|
2018-12-05 11:21:50 -06:00
|
|
|
# Makes sure a topic link row exists for one link found in a post and, when
# the link points at another topic, that the matching "reflection" row exists
# on that topic.
#
# @param post the post that contains the link
# @param link an extracted link responding to `url` and `is_quote`
# @param parsed the parsed form of the link, responding to `host` and `path`
# @return [Array(String, Integer|nil), nil] `[url, reflected_id]` where `url`
#   is the (possibly rewritten and truncated) URL that was stored and
#   `reflected_id` is the id of the reflection row, if one was created or
#   found; nil when the link is skipped (link to a user, link to the post's
#   own topic, or host longer than `max_domain_length`)
def self.ensure_entry_for(post, link, parsed)
  url = link.url
  internal = false
  post_number = nil
  topic = nil

  if upload = Upload.get_from_url(url)
    internal = Discourse.store.internal?
    # Store the same URL that will be used in the cooked version of the post
    url = UrlHelper.cook_url(upload.url, secure: upload.secure?)
  elsif route = Discourse.route_for(parsed)
    internal = true

    # We aren't interested in tracking internal links to users
    return nil if route[:controller] == "users"

    topic_id = route[:topic_id]
    topic_slug = route[:slug]
    post_number = route[:post_number] || 1

    # On topics#show the :id segment may carry either the id or the slug.
    if route[:controller] == "topics" && route[:action] == "show"
      topic_id ||= route[:id]
      topic_slug ||= route[:id]
    end

    topic = Topic.find_by(id: topic_id) if topic_id
    topic ||= Topic.find_by(slug: topic_slug) if topic_slug.present?

    if topic.present?
      # Store the canonical topic URL, with the post number when it is not
      # the first post.
      url = +"#{Discourse.base_url_no_prefix}#{topic.relative_url}"
      url << "/#{post_number}" if post_number.to_i > 1
    end
  end

  # Skip linking to ourselves
  return nil if topic&.id == post.topic_id

  reflected_post = nil
  if post_number && topic
    reflected_post = Post.find_by(topic_id: topic.id, post_number: post_number.to_i)
  end

  url = url[0...TopicLink.max_url_length]
  return nil if parsed && parsed.host && parsed.host.length > TopicLink.max_domain_length

  # Extension without the leading dot, capped at 10 characters; nil when the
  # path is missing or has no extension.
  extname = parsed.path.nil? ? "" : File.extname(parsed.path)
  file_extension = extname[1..10].downcase unless extname.empty?

  safe_create_topic_link(
    post_id: post.id,
    user_id: post.user_id,
    topic_id: post.topic_id,
    url: url,
    domain: parsed.host,
    internal: internal,
    link_topic_id: topic&.id,
    link_post_id: reflected_post&.id,
    quote: link.is_quote,
    extension: file_extension,
  )

  reflected_id = nil

  # Create the reflection if we can
  reflectable =
    topic &&
    post.topic &&
    topic.archetype != 'private_message' &&
    post.topic.archetype != 'private_message' &&
    post.topic.visible?

  if reflectable
    prefix = Discourse.base_url_no_prefix
    reflected_url = "#{prefix}#{post.topic.relative_url(post.post_number)}"

    reflected_id = safe_create_topic_link(
      user_id: post.user_id,
      topic_id: topic.id,
      post_id: reflected_post&.id,
      url: reflected_url,
      domain: Discourse.current_hostname,
      reflection: true,
      internal: true,
      link_topic_id: post.topic_id,
      link_post_id: post.id
    )
  end

  [url, reflected_id]
end
|
|
|
|
|
|
|
|
# Remove links that aren't there anymore.
#
# With no current URLs, every link row owned by the post is deleted, along
# with every reflection row pointing back at it. Otherwise only the rows not
# listed in `current_urls` / `current_reflected_ids` are deleted.
#
# @param post an object responding to `id`
# @param current_urls [Array<String>] URLs still present in the post
# @param current_reflected_ids [Array<Integer, nil>] ids of reflection rows
#   that are still valid; nil entries are ignored and the array is not modified
# @return the result of the last `delete_all` call
def self.cleanup_entries(post, current_urls, current_reflected_ids)
  unless current_urls.present?
    return TopicLink
      .where(
        "(post_id = :post_id AND NOT reflection) OR (link_post_id = :post_id AND reflection)",
        post_id: post.id
      )
      .delete_all
  end

  TopicLink.where(
    "(url not in (:urls)) AND (post_id = :post_id AND NOT reflection)",
    urls: current_urls, post_id: post.id
  ).delete_all

  # Work on a copy so the caller's array is left untouched.
  reflected_ids = current_reflected_ids.compact

  if reflected_ids.present?
    TopicLink.where(
      "(id not in (:reflected_ids)) AND (link_post_id = :post_id AND reflection)",
      reflected_ids: reflected_ids, post_id: post.id
    ).delete_all
  else
    TopicLink
      .where("link_post_id = :post_id AND reflection", post_id: post.id)
      .delete_all
  end
end
|
2013-02-05 13:16:51 -06:00
|
|
|
end
|
2013-05-23 21:48:32 -05:00
|
|
|
|
|
|
|
# == Schema Information
#
# Table name: topic_links
#
#  id            :integer          not null, primary key
#  topic_id      :integer          not null
#  post_id       :integer
#  user_id       :integer          not null
#  url           :string(500)      not null
#  domain        :string(100)      not null
#  internal      :boolean          default(FALSE), not null
#  link_topic_id :integer
#  created_at    :datetime         not null
#  updated_at    :datetime         not null
#  reflection    :boolean          default(FALSE)
#  clicks        :integer          default(0), not null
#  link_post_id  :integer
#  title         :string
#  crawled_at    :datetime
#  quote         :boolean          default(FALSE), not null
#  extension     :string(10)
#
# Indexes
#
#  index_topic_links_on_extension                    (extension)
#  index_topic_links_on_link_post_id_and_reflection  (link_post_id,reflection)
#  index_topic_links_on_post_id                      (post_id)
#  index_topic_links_on_topic_id                     (topic_id)
#  index_topic_links_on_user_and_clicks              (user_id,clicks DESC,created_at DESC) WHERE ((NOT reflection) AND (NOT quote) AND (NOT internal))
#  index_topic_links_on_user_id                      (user_id)
#  unique_post_links                                 (topic_id,post_id,url) UNIQUE
#
|