mirror of
https://github.com/discourse/discourse.git
synced 2024-11-30 12:43:54 -06:00
30990006a9
This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging
495 lines
15 KiB
Ruby
495 lines
15 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require_relative 'base'
|
|
require 'tiny_tds'
|
|
|
|
# Import script for Telligent communities
|
|
#
|
|
# Users are currently imported from a temp table. This will need some
|
|
# work the next time this import script is used, because that table
|
|
# won't exist. Also, it's really hard to find all attachments, but
|
|
# the script tries to do it anyway.
|
|
|
|
class ImportScripts::Telligent < ImportScripts::Base
|
|
BATCH_SIZE ||= 1000
|
|
LOCAL_AVATAR_REGEX ||= /\A~\/.*(?<directory>communityserver-components-(?:selectable)?avatars)\/(?<path>[^\/]+)\/(?<filename>.+)/i
|
|
REMOTE_AVATAR_REGEX ||= /\Ahttps?:\/\//i
|
|
EMBEDDED_ATTACHMENT_REGEX ||= /<a href="\/cfs-file(?:\.ashx)?\/__key\/(?<directory>[^\/]+)\/(?<path>[^\/]+)\/(?<filename1>.+)">(?<filename2>.*?)<\/a>/i
|
|
|
|
CATEGORY_LINK_NORMALIZATION = '/.*?(f\/\d+)$/\1'
|
|
TOPIC_LINK_NORMALIZATION = '/.*?(f\/\d+\/t\/\d+)$/\1'
|
|
|
|
def initialize
|
|
super()
|
|
|
|
@client = TinyTds::Client.new(
|
|
host: ENV["DB_HOST"],
|
|
username: ENV["DB_USERNAME"],
|
|
password: ENV["DB_PASSWORD"],
|
|
database: ENV["DB_NAME"]
|
|
)
|
|
end
|
|
|
|
def execute
|
|
add_permalink_normalizations
|
|
import_users
|
|
import_categories
|
|
import_topics
|
|
import_posts
|
|
mark_topics_as_solved
|
|
end
|
|
|
|
def import_users
|
|
puts "", "Importing users..."
|
|
|
|
user_conditions = <<~SQL
|
|
(
|
|
EXISTS(SELECT 1
|
|
FROM te_Forum_Threads t
|
|
WHERE t.UserId = u.UserID) OR
|
|
EXISTS(SELECT 1
|
|
FROM te_Forum_ThreadReplies r
|
|
WHERE r.UserId = u.UserID)
|
|
)
|
|
SQL
|
|
|
|
last_user_id = -1
|
|
total_count = count(<<~SQL)
|
|
SELECT COUNT(1) AS count
|
|
FROM temp_User u
|
|
WHERE #{user_conditions}
|
|
SQL
|
|
|
|
batches do |offset|
|
|
rows = query(<<~SQL)
|
|
SELECT TOP #{BATCH_SIZE} *
|
|
FROM
|
|
(
|
|
SELECT
|
|
u.UserID,
|
|
u.Email,
|
|
u.UserName,
|
|
u.CommonName,
|
|
u.CreateDate,
|
|
p.PropertyName,
|
|
p.PropertyValue
|
|
FROM temp_User u
|
|
LEFT OUTER JOIN temp_UserProperties p ON (u.UserID = p.UserID)
|
|
WHERE u.UserID > #{last_user_id} AND #{user_conditions}
|
|
) x
|
|
PIVOT (
|
|
MAX(PropertyValue)
|
|
FOR PropertyName
|
|
IN (avatarUrl, bio, Location, webAddress, BannedUntil, UserBanReason)
|
|
) y
|
|
ORDER BY UserID
|
|
SQL
|
|
|
|
break if rows.blank?
|
|
last_user_id = rows[-1]["UserID"]
|
|
next if all_records_exist?(:users, rows.map { |row| row["UserID"] })
|
|
|
|
create_users(rows, total: total_count, offset: offset) do |row|
|
|
{
|
|
id: row["UserID"],
|
|
email: row["Email"],
|
|
username: row["UserName"],
|
|
name: row["CommonName"],
|
|
created_at: row["CreateDate"],
|
|
bio_raw: html_to_markdown(row["bio"]),
|
|
location: row["Location"],
|
|
website: row["webAddress"],
|
|
post_create_action: proc do |user|
|
|
import_avatar(user, row["avatarUrl"])
|
|
suspend_user(user, row["BannedUntil"], row["UserBanReason"])
|
|
end
|
|
}
|
|
end
|
|
end
|
|
end
|
|
|
|
# TODO move into base importer (create_user) and use consistent error handling
|
|
def import_avatar(user, avatar_url)
|
|
return if avatar_url.blank? || avatar_url.include?("anonymous")
|
|
|
|
if match_data = avatar_url.match(LOCAL_AVATAR_REGEX)
|
|
avatar_path = File.join(ENV["FILE_BASE_DIR"],
|
|
match_data[:directory].gsub("-", "."),
|
|
match_data[:path].split("-"),
|
|
match_data[:filename])
|
|
|
|
if File.exists?(avatar_path)
|
|
@uploader.create_avatar(user, avatar_path)
|
|
else
|
|
STDERR.puts "Could not find avatar: #{avatar_path}"
|
|
end
|
|
elsif avatar_url.match?(REMOTE_AVATAR_REGEX)
|
|
UserAvatar.import_url_for_user(avatar_url, user) rescue nil
|
|
end
|
|
end
|
|
|
|
def suspend_user(user, banned_until, ban_reason)
|
|
return if banned_until.blank?
|
|
|
|
if banned_until = DateTime.parse(banned_until) > DateTime.now
|
|
user.suspended_till = banned_until
|
|
user.suspended_at = DateTime.now
|
|
user.save!
|
|
|
|
StaffActionLogger.new(Discourse.system_user).log_user_suspend(user, ban_reason)
|
|
end
|
|
end
|
|
|
|
def import_categories
|
|
@new_parent_categories = {}
|
|
@new_parent_categories[:archives] = create_category({ name: "Archives" }, nil)
|
|
@new_parent_categories[:spotlight] = create_category({ name: "Spotlight" }, nil)
|
|
@new_parent_categories[:optimizer] = create_category({ name: "SQL Optimizer" }, nil)
|
|
|
|
puts "", "Importing parent categories..."
|
|
parent_categories = query(<<~SQL)
|
|
SELECT
|
|
GroupID,
|
|
Name, HtmlDescription,
|
|
DateCreated, SortOrder
|
|
FROM cs_Groups g
|
|
WHERE (SELECT COUNT(1)
|
|
FROM te_Forum_Forums f
|
|
WHERE f.GroupId = g.GroupID) > 1
|
|
ORDER BY SortOrder, Name
|
|
SQL
|
|
|
|
create_categories(parent_categories) do |row|
|
|
{
|
|
id: "G#{row['GroupID']}",
|
|
name: clean_category_name(row["Name"]),
|
|
description: html_to_markdown(row["HtmlDescription"]),
|
|
position: row["SortOrder"]
|
|
}
|
|
end
|
|
|
|
puts "", "Importing child categories..."
|
|
child_categories = query(<<~SQL)
|
|
SELECT
|
|
ForumId, GroupId,
|
|
Name, Description,
|
|
DateCreated, SortOrder
|
|
FROM te_Forum_Forums
|
|
ORDER BY GroupId, SortOrder, Name
|
|
SQL
|
|
|
|
create_categories(child_categories) do |row|
|
|
parent_category_id = parent_category_id_for(row)
|
|
|
|
if category_id = replace_with_category_id(row, child_categories, parent_category_id)
|
|
add_category(row['ForumId'], Category.find_by_id(category_id))
|
|
url = "f/#{row['ForumId']}"
|
|
Permalink.create(url: url, category_id: category_id) unless Permalink.exists?(url: url)
|
|
nil
|
|
else
|
|
{
|
|
id: row['ForumId'],
|
|
parent_category_id: parent_category_id,
|
|
name: clean_category_name(row["Name"]),
|
|
description: html_to_markdown(row["Description"]),
|
|
position: row["SortOrder"]
|
|
}
|
|
end
|
|
end
|
|
end
|
|
|
|
def parent_category_id_for(row)
|
|
name = row["Name"].downcase
|
|
|
|
if name.include?("beta")
|
|
@new_parent_categories[:archives].id
|
|
elsif name.include?("spotlight")
|
|
@new_parent_categories[:spotlight].id
|
|
elsif name.include?("optimizer")
|
|
@new_parent_categories[:optimizer].id
|
|
elsif row.key?("GroupId")
|
|
category_id_from_imported_category_id("G#{row['GroupId']}")
|
|
else
|
|
nil
|
|
end
|
|
end
|
|
|
|
def replace_with_category_id(row, child_categories, parent_category_id)
|
|
name = row["Name"].downcase
|
|
|
|
if name.include?("data modeler") || name.include?("benchmark")
|
|
category_id_from_imported_category_id("G#{row['GroupId']}")
|
|
elsif only_child?(child_categories, parent_category_id)
|
|
parent_category_id
|
|
end
|
|
end
|
|
|
|
def only_child?(child_categories, parent_category_id)
|
|
count = 0
|
|
|
|
child_categories.each do |row|
|
|
count += 1 if parent_category_id_for(row) == parent_category_id
|
|
end
|
|
|
|
count == 1
|
|
end
|
|
|
|
def clean_category_name(name)
|
|
CGI.unescapeHTML(name)
|
|
.sub(/(?:\- )?Forum/i, "")
|
|
.strip
|
|
end
|
|
|
|
def import_topics
|
|
puts "", "Importing topics..."
|
|
|
|
last_topic_id = -1
|
|
total_count = count("SELECT COUNT(1) AS count FROM te_Forum_Threads")
|
|
|
|
batches do |offset|
|
|
rows = query(<<~SQL)
|
|
SELECT TOP #{BATCH_SIZE}
|
|
t.ThreadId, t.ForumId, t.UserId,
|
|
t.Subject, t.Body, t.DateCreated, t.IsLocked, t.StickyDate,
|
|
a.ApplicationTypeId, a.ApplicationId, a.ApplicationContentTypeId, a.ContentId, a.FileName
|
|
FROM te_Forum_Threads t
|
|
LEFT JOIN te_Attachments a
|
|
ON (a.ApplicationId = t.ForumId AND a.ApplicationTypeId = 0 AND a.ContentId = t.ThreadId AND
|
|
a.ApplicationContentTypeId = 0)
|
|
WHERE t.ThreadId > #{last_topic_id}
|
|
ORDER BY t.ThreadId
|
|
SQL
|
|
|
|
break if rows.blank?
|
|
last_topic_id = rows[-1]["ThreadId"]
|
|
next if all_records_exist?(:post, rows.map { |row| import_topic_id(row["ThreadId"]) })
|
|
|
|
create_posts(rows, total: total_count, offset: offset) do |row|
|
|
user_id = user_id_from_imported_user_id(row["UserId"]) || Discourse::SYSTEM_USER_ID
|
|
|
|
post = {
|
|
id: import_topic_id(row["ThreadId"]),
|
|
title: CGI.unescapeHTML(row["Subject"]),
|
|
raw: raw_with_attachment(row, user_id),
|
|
category: category_id_from_imported_category_id(row["ForumId"]),
|
|
user_id: user_id,
|
|
created_at: row["DateCreated"],
|
|
closed: row["IsLocked"],
|
|
post_create_action: proc do |action_post|
|
|
topic = action_post.topic
|
|
Jobs.enqueue_at(topic.pinned_until, :unpin_topic, topic_id: topic.id) if topic.pinned_until
|
|
url = "f/#{row['ForumId']}/t/#{row['ThreadId']}"
|
|
Permalink.create(url: url, topic_id: topic.id) unless Permalink.exists?(url: url)
|
|
end
|
|
}
|
|
|
|
if row["StickyDate"] > Time.now
|
|
post[:pinned_until] = row["StickyDate"]
|
|
post[:pinned_at] = row["DateCreated"]
|
|
end
|
|
|
|
post
|
|
end
|
|
end
|
|
end
|
|
|
|
def import_topic_id(topic_id)
|
|
"T#{topic_id}"
|
|
end
|
|
|
|
def import_posts
|
|
puts "", "Importing posts..."
|
|
|
|
last_post_id = -1
|
|
total_count = count("SELECT COUNT(1) AS count FROM te_Forum_ThreadReplies")
|
|
|
|
batches do |offset|
|
|
rows = query(<<~SQL)
|
|
SELECT TOP #{BATCH_SIZE}
|
|
tr.ThreadReplyId, tr.ThreadId, tr.UserId, tr.ParentReplyId,
|
|
tr.Body, tr.ThreadReplyDate,
|
|
CONVERT(BIT,
|
|
CASE WHEN tr.AnswerVerifiedUtcDate IS NOT NULL AND NOT EXISTS(
|
|
SELECT 1
|
|
FROM te_Forum_ThreadReplies x
|
|
WHERE
|
|
x.ThreadId = tr.ThreadId AND x.ThreadReplyId < tr.ThreadReplyId AND x.AnswerVerifiedUtcDate IS NOT NULL
|
|
)
|
|
THEN 1
|
|
ELSE 0 END) AS IsFirstVerifiedAnswer,
|
|
a.ApplicationTypeId, a.ApplicationId, a.ApplicationContentTypeId, a.ContentId, a.FileName
|
|
FROM te_Forum_ThreadReplies tr
|
|
JOIN te_Forum_Threads t ON (tr.ThreadId = t.ThreadId)
|
|
LEFT JOIN te_Attachments a
|
|
ON (a.ApplicationId = t.ForumId AND a.ApplicationTypeId = 0 AND a.ContentId = tr.ThreadReplyId AND
|
|
a.ApplicationContentTypeId = 1)
|
|
WHERE tr.ThreadReplyId > #{last_post_id}
|
|
ORDER BY tr.ThreadReplyId
|
|
SQL
|
|
|
|
break if rows.blank?
|
|
last_post_id = rows[-1]["ThreadReplyId"]
|
|
next if all_records_exist?(:post, rows.map { |row| row["ThreadReplyId"] })
|
|
|
|
create_posts(rows, total: total_count, offset: offset) do |row|
|
|
imported_parent_id = row["ParentReplyId"] > 0 ? row["ParentReplyId"] : import_topic_id(row["ThreadId"])
|
|
parent_post = topic_lookup_from_imported_post_id(imported_parent_id)
|
|
user_id = user_id_from_imported_user_id(row["UserId"]) || Discourse::SYSTEM_USER_ID
|
|
|
|
if parent_post
|
|
post = {
|
|
id: row["ThreadReplyId"],
|
|
raw: raw_with_attachment(row, user_id),
|
|
user_id: user_id,
|
|
topic_id: parent_post[:topic_id],
|
|
created_at: row["ThreadReplyDate"],
|
|
reply_to_post_number: parent_post[:post_number]
|
|
}
|
|
|
|
post[:custom_fields] = { is_accepted_answer: "true" } if row["IsFirstVerifiedAnswer"]
|
|
post
|
|
else
|
|
puts "Failed to import post #{row['ThreadReplyId']}. Parent was not found."
|
|
end
|
|
end
|
|
end
|
|
end
|
|
|
|
def raw_with_attachment(row, user_id)
|
|
raw, embedded_paths, upload_ids = replace_embedded_attachments(row["Body"], user_id)
|
|
raw = html_to_markdown(raw) || ""
|
|
|
|
filename = row["FileName"]
|
|
return raw if filename.blank?
|
|
|
|
path = File.join(
|
|
ENV["FILE_BASE_DIR"],
|
|
"telligent.evolution.components.attachments",
|
|
"%02d" % row["ApplicationTypeId"],
|
|
"%02d" % row["ApplicationId"],
|
|
"%02d" % row["ApplicationContentTypeId"],
|
|
("%010d" % row["ContentId"]).scan(/.{2}/),
|
|
clean_filename(filename)
|
|
)
|
|
|
|
unless embedded_paths.include?(path)
|
|
if File.exists?(path)
|
|
upload = @uploader.create_upload(user_id, path, filename)
|
|
|
|
if upload.present? && upload.persisted? && !upload_ids.include?(upload.id)
|
|
raw << "\n" << @uploader.html_for_upload(upload, filename)
|
|
end
|
|
else
|
|
STDERR.puts "Could not find file: #{path}"
|
|
end
|
|
end
|
|
|
|
raw
|
|
end
|
|
|
|
def replace_embedded_attachments(raw, user_id)
|
|
paths = []
|
|
upload_ids = []
|
|
|
|
raw = raw.gsub(EMBEDDED_ATTACHMENT_REGEX) do
|
|
filename, path = attachment_path(Regexp.last_match)
|
|
|
|
if File.exists?(path)
|
|
upload = @uploader.create_upload(user_id, path, filename)
|
|
|
|
if upload.present? && upload.persisted?
|
|
paths << path
|
|
upload_ids << upload.id
|
|
@uploader.html_for_upload(upload, filename)
|
|
end
|
|
else
|
|
STDERR.puts "Could not find file: #{path}"
|
|
end
|
|
end
|
|
|
|
[raw, paths, upload_ids]
|
|
end
|
|
|
|
def clean_filename(filename)
|
|
CGI.unescapeHTML(filename)
|
|
.gsub(/[\x00\/\\:\*\?\"<>\|]/, '_')
|
|
.gsub(/_(?:2B00|2E00|2D00|5B00|5D00|5F00)/, '')
|
|
end
|
|
|
|
def attachment_path(match_data)
|
|
filename, path = join_attachment_path(match_data, filename_index: 2)
|
|
filename, path = join_attachment_path(match_data, filename_index: 1) unless File.exists?(path)
|
|
[filename, path]
|
|
end
|
|
|
|
# filenames are a total mess - try to guess the correct filename
|
|
# works for 70% of all files
|
|
def join_attachment_path(match_data, filename_index:)
|
|
filename = clean_filename(match_data[:"filename#{filename_index}"])
|
|
base_path = File.join(
|
|
ENV["FILE_BASE_DIR"],
|
|
match_data[:directory].gsub("-", "."),
|
|
match_data[:path].split("-")
|
|
)
|
|
|
|
path = File.join(base_path, filename)
|
|
return [filename, path] if File.exists?(path)
|
|
|
|
original_filename = filename.dup
|
|
|
|
filename = filename.gsub("-", " ")
|
|
path = File.join(base_path, filename)
|
|
return [filename, path] if File.exists?(path)
|
|
|
|
filename = filename.gsub("_", "-")
|
|
path = File.join(base_path, filename)
|
|
return [filename, path] if File.exists?(path)
|
|
|
|
[original_filename, File.join(base_path, original_filename)]
|
|
end
|
|
|
|
def mark_topics_as_solved
|
|
puts "", "Marking topics as solved..."
|
|
|
|
DB.exec <<~SQL
|
|
INSERT INTO topic_custom_fields (name, value, topic_id, created_at, updated_at)
|
|
SELECT 'accepted_answer_post_id', pcf.post_id, p.topic_id, p.created_at, p.created_at
|
|
FROM post_custom_fields pcf
|
|
JOIN posts p ON p.id = pcf.post_id
|
|
WHERE pcf.name = 'is_accepted_answer' AND pcf.value = 'true'
|
|
SQL
|
|
end
|
|
|
|
def html_to_markdown(html)
|
|
HtmlToMarkdown.new(html).to_markdown if html.present?
|
|
end
|
|
|
|
def add_permalink_normalizations
|
|
normalizations = SiteSetting.permalink_normalizations
|
|
normalizations = normalizations.blank? ? [] : normalizations.split('|')
|
|
|
|
add_normalization(normalizations, CATEGORY_LINK_NORMALIZATION)
|
|
add_normalization(normalizations, TOPIC_LINK_NORMALIZATION)
|
|
|
|
SiteSetting.permalink_normalizations = normalizations.join('|')
|
|
end
|
|
|
|
def add_normalization(normalizations, normalization)
|
|
normalizations << normalization unless normalizations.include?(normalization)
|
|
end
|
|
|
|
def batches
|
|
super(BATCH_SIZE)
|
|
end
|
|
|
|
def query(sql)
|
|
@client.execute(sql).to_a
|
|
end
|
|
|
|
def count(sql)
|
|
query(sql).first["count"]
|
|
end
|
|
end
|
|
|
|
ImportScripts::Telligent.new.perform
|