discourse/script/import_scripts/higher_logic.rb
Justin DiRose e302e32a3f
DEV: Add Higher Logic import script (#12623)
Wrote up a new script to import from Higher Logic. Nothing too crazy going on here. Two major things about this script:

    It requires you to convert a Microsoft SQL file to a format MySQL can read.
    Higher Logic stores posts (at least in the case of the import I ran) with the email thread shown in the post body. The script does its best to truncate this out, but the logic may need to be improved on future imports. For the import I ran, it worked just fine as is. 🤷‍♂️
2021-04-06 16:53:55 -05:00

232 lines
6.2 KiB
Ruby

# frozen_string_literal: true
require "mysql2"
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
class ImportScripts::HigherLogic < ImportScripts::Base
HIGHERLOGIC_DB = "higherlogic"
BATCH_SIZE = 1000
ATTACHMENT_DIR = "/shared/import/data/attachments"
def initialize
super
@client = Mysql2::Client.new(
host: "localhost",
username: "root",
database: HIGHERLOGIC_DB
)
end
def execute
import_groups
import_users
import_group_users
import_categories
import_posts
import_attachments
end
def import_groups
puts '', 'importing groups'
groups = mysql_query <<-SQL
SELECT CommunityKey, CommunityName
FROM Community
ORDER BY CommunityName
SQL
create_groups(groups) do |group|
{
id: group['CommunityKey'],
name: group['CommunityName']
}
end
end
def import_users
puts '', 'importing users'
total_count = mysql_query("SELECT count(*) FROM Contact").first["count"]
batches(BATCH_SIZE) do |offset|
results = mysql_query <<-SQL
SELECT ContactKey, FirstName, LastName, EmailAddress, HLAdminFlag, UserStatus, CreatedOn, Birthday, Bio
FROM Contact
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if results.size < 1
next if all_records_exist? :users, results.map { |u| u['ContactKey'] }
create_users(results, total: total_count, offset: offset) do |user|
next if user['EmailAddress'].blank?
{
id: user['ContactKey'],
email: user['EmailAddress'],
name: "#{user['FirstName']} #{user['LastName']}",
created_at: user['CreatedOn'] == nil ? 0 : Time.zone.at(user['CreatedOn']),
bio_raw: user['Bio'],
active: user['UserStatus'] == "Active",
admin: user['HLAdminFlag'] == 1
}
end
end
end
def import_group_users
puts '', 'importing group users'
group_users = mysql_query(<<-SQL
SELECT CommunityKey, ContactKey
FROM CommunityMember
SQL
).to_a
group_users.each do |row|
next unless user_id = user_id_from_imported_user_id(row['ContactKey'])
next unless group_id = group_id_from_imported_group_id(row['CommunityKey'])
puts '', '.'
GroupUser.find_or_create_by(user_id: user_id, group_id: group_id)
end
end
def import_categories
puts '', 'importing categories'
categories = mysql_query <<-SQL
SELECT DiscussionKey, DiscussionName
FROM Discussion
SQL
create_categories(categories) do |category|
{
id: category['DiscussionKey'],
name: category['DiscussionName']
}
end
end
def import_posts
puts '', 'importing topics and posts'
total_count = mysql_query("SELECT count(*) FROM DiscussionPost").first["count"]
batches(BATCH_SIZE) do |offset|
results = mysql_query <<-SQL
SELECT MessageKey,
ParentMessageKey,
Subject,
ContactKey,
DiscussionKey,
PinnedFlag,
Body,
CreatedOn
FROM DiscussionPost
WHERE CreatedOn > '2020-01-01 00:00:00'
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if results.size < 1
next if all_records_exist? :posts, results.map { |p| p['MessageKey'] }
create_posts(results, total: total_count, offset: offset) do |post|
raw = preprocess_raw(post['Body'])
mapped = {
id: post['MessageKey'],
user_id: user_id_from_imported_user_id(post['ContactKey']),
raw: raw,
created_at: Time.zone.at(post['CreatedOn']),
}
if post['ParentMessageKey'].nil?
mapped[:category] = category_id_from_imported_category_id(post['DiscussionKey']).to_i
mapped[:title] = CGI.unescapeHTML(post['Subject'])
mapped[:pinned] = post['PinnedFlag'] == 1
else
topic = topic_lookup_from_imported_post_id(post['ParentMessageKey'])
if topic.present?
mapped[:topic_id] = topic[:topic_id]
else
puts "Parent post #{post['ParentMessageKey']} doesn't exist. Skipping."
next
end
end
mapped
end
end
end
def import_attachments
puts '', 'importing attachments'
count = 0
total_attachments = mysql_query(<<-SQL
SELECT COUNT(*) count
FROM LibraryEntryFile l
JOIN DiscussionPost p ON p.AttachmentDocumentKey = l.DocumentKey
WHERE p.CreatedOn > '2020-01-01 00:00:00'
SQL
).first['count']
batches(BATCH_SIZE) do |offset|
attachments = mysql_query(<<-SQL
SELECT l.VersionName,
l.FileExtension,
p.MessageKey
FROM LibraryEntryFile l
LEFT JOIN DiscussionPost p ON p.AttachmentDocumentKey = l.DocumentKey
WHERE p.CreatedOn > '2020-01-01 00:00:00'
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
).to_a
break if attachments.empty?
attachments.each do |a|
print_status(count += 1, total_attachments, get_start_time("attachments"))
original_filename = "#{a['VersionName']}.#{a['FileExtension']}"
path = File.join(ATTACHMENT_DIR, original_filename)
if File.exists?(path)
if post = Post.find(post_id_from_imported_post_id(a['MessageKey']))
filename = File.basename(original_filename)
upload = create_upload(post.user.id, path, filename)
if upload&.persisted?
html = html_for_upload(upload, filename)
post.raw << "\n\n" << html
post.save!
PostUpload.create!(post: post, upload: upload) unless PostUpload.where(post: post, upload: upload).exists?
end
end
end
end
end
end
def preprocess_raw(body)
raw = body.dup
# trim off any post text beyond ---- to remove email threading
raw = raw.slice(0..(raw.index('------'))) || raw
raw = HtmlToMarkdown.new(raw).to_markdown
raw
end
def mysql_query(sql)
@client.query(sql, cache_rows: false)
end
end
ImportScripts::HigherLogic.new.perform