mirror of
https://github.com/discourse/discourse.git
synced 2024-11-30 04:34:13 -06:00
30990006a9
This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging
296 lines
8.5 KiB
Ruby
296 lines
8.5 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
|
|
require 'pg'
|
|
require_relative 'base/uploader'
|
|
|
|
=begin
|
|
If you want to create mock users for posts made by anonymous participants,
run the following SQL prior to importing.
|
|
|
|
-- first attribute any anonymous posts to existing users (if any)
|
|
|
|
UPDATE node
|
|
SET owner_id = p.user_id, anonymous_name = NULL
|
|
FROM ( SELECT lower(name) AS name, user_id FROM user_ ) p
|
|
WHERE p.name = lower(node.anonymous_name)
|
|
AND owner_id IS NULL;
|
|
|
|
-- then create mock users
|
|
|
|
INSERT INTO user_ (email, name, joined, registered)
|
|
SELECT lower(anonymous_name) || '@dummy.com', MIN(anonymous_name), MIN(when_created), MIN(when_created)
|
|
FROM node
|
|
WHERE anonymous_name IS NOT NULL
|
|
GROUP BY lower(anonymous_name);
|
|
|
|
-- then move these posts to the new users
|
|
-- (yes, this is the same query as the first one indeed)
|
|
|
|
UPDATE node
|
|
SET owner_id = p.user_id, anonymous_name = NULL
|
|
FROM ( SELECT lower(name) AS name, user_id FROM user_ ) p
|
|
WHERE p.name = lower(node.anonymous_name)
|
|
AND owner_id IS NULL;
|
|
|
|
=end
|
|
|
|
class ImportScripts::Nabble < ImportScripts::Base
|
|
# CHANGE THESE BEFORE RUNNING THE IMPORTER
#
# Each setting can also be overridden through an environment variable, so the
# script does not have to be edited for every run. The defaults are the values
# that were previously hard-coded here.

# Number of rows requested from the Nabble database per batch (used as the
# SQL LIMIT of every paged query below). Override with NABBLE_BATCH_SIZE.
BATCH_SIZE = Integer(ENV.fetch("NABBLE_BATCH_SIZE", 1000))

# Name of the PostgreSQL database to connect to. Override with NABBLE_DB_NAME.
DB_NAME = ENV.fetch("NABBLE_DB_NAME", "nabble")

# Value passed as `category:` for every imported topic.
# Override with NABBLE_CATEGORY_ID.
CATEGORY_ID = Integer(ENV.fetch("NABBLE_CATEGORY_ID", 6))
|
|
|
|
# Prepares the importer: runs the base-class initializer, then opens the
# PostgreSQL connection to DB_NAME and builds the helpers the import steps use.
def initialize
  super

  # Not read anywhere else in this file.
  @tagmap = []

  # Decoder applied to the `joined` / `when_created` column values.
  @td = PG::TextDecoder::TimestampWithTimeZone.new

  # Connection used for every query against the Nabble database.
  @client = PG.connect(dbname: DB_NAME)

  # Creates the uploads for avatars and post attachments.
  @uploader = ImportScripts::Uploader.new
end
|
|
|
|
# Runs the import steps in order: users first, then topics, then replies.
# Topics and replies look up the users (and replies the topics) imported by
# the earlier steps.
def execute
  import_users
  create_forum_topics
  import_replies
end
|
|
|
|
# Imports every row of the Nabble `user_` table as a Discourse user, paging
# through the table BATCH_SIZE rows at a time in order of join date.
#
# Rows without an e-mail address get a random "<hex>@domain.com" placeholder.
# After each user is created, import_avatar is called for them.
def import_users
  puts "", "importing users"

  total_count = @client.exec("SELECT COUNT(user_id) FROM user_")[0]["count"]

  batches(BATCH_SIZE) do |offset|
    # user_id is a tie-breaker: LIMIT/OFFSET paging is only stable when the
    # ORDER BY is unique, otherwise users sharing a `joined` value could be
    # skipped or returned twice across batches.
    # NOTE(review): assumes user_id is unique in user_ — confirm.
    users = @client.query(<<-SQL)
      SELECT user_id, name, email, joined
      FROM user_
      ORDER BY joined, user_id
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset}
    SQL

    break if users.ntuples < 1

    # Skip the whole batch when every user in it has already been imported.
    next if all_records_exist?(:users, users.map { |u| u["user_id"].to_i })

    create_users(users, total: total_count, offset: offset) do |row|
      {
        id: row["user_id"],
        email: row["email"] || "#{SecureRandom.hex}@domain.com",
        created_at: Time.zone.at(@td.decode(row["joined"])),
        name: row["name"],
        post_create_action: proc do |user|
          import_avatar(user, row["user_id"])
        end
      }
    end
  end
end
|
|
|
|
# Copies a user's Nabble avatar (the 'avatar100.png' row of `file_avatar`)
# into Discourse and makes it the user's avatar.
#
# user   - the Discourse user that was just created.
# org_id - the user's id in the Nabble database.
#
# Does nothing when the user has no avatar row. The image is written to
# /tmp/nab/avatar<org_id> before being handed to the uploader; a failed
# upload is only logged.
def import_avatar(user, org_id)
  filename = "avatar#{org_id}"
  dir = '/tmp/nab'
  path = File.join(dir, filename)
  res = @client.exec("SELECT content FROM file_avatar WHERE name='avatar100.png' AND user_id = #{org_id} LIMIT 1")
  return if res.ntuples < 1

  # The scratch directory may not exist yet on a fresh machine.
  Dir.mkdir(dir) unless Dir.exist?(dir)
  File.binwrite(path, PG::Connection.unescape_bytea(res[0]['content']))

  upload = @uploader.create_upload(user.id, path, filename)

  if upload.persisted?
    begin
      # import_mode is switched off only while the avatar record is created;
      # `ensure` restores it even if that call raises.
      user.import_mode = false
      user.create_user_avatar
    ensure
      user.import_mode = true
    end
    user.user_avatar.update(custom_upload_id: upload.id)
    user.update(uploaded_avatar_id: upload.id)
  else
    Rails.logger.error("Could not persist avatar for user #{user.username}")
  end
end
|
|
|
|
# Extracts the body of a raw e-mail message and returns it encoded as UTF-8.
#
# msg - the raw message source.
#
# The body is the first value returned by Email::Receiver#select_body; the
# second value is not used. Exceptions raised by the receiver are not handled
# here and propagate to the caller.
def parse_email(msg)
  body, _elided = Email::Receiver.new(msg).select_body
  body.encode("UTF-8")
end
|
|
|
|
# Imports the top-level Nabble nodes as Discourse topics in CATEGORY_ID.
#
# A node is treated as a topic when its parent is the "app" node (the first
# node flagged `is_app`). Nodes are paged BATCH_SIZE at a time in creation
# order; a node whose body cannot be extracted (body_from returns nil) is
# skipped.
def create_forum_topics
  puts "", "creating forum topics"

  app_node_id = @client.exec("SELECT node_id FROM node WHERE is_app LIMIT 1")[0]['node_id']
  topic_count = @client.exec("SELECT COUNT(node_id) AS count FROM node WHERE parent_id = #{app_node_id}")[0]["count"]

  batches(BATCH_SIZE) do |offset|
    # node_id is a tie-breaker: LIMIT/OFFSET paging is only stable when the
    # ORDER BY is unique, otherwise nodes sharing a `when_created` value could
    # be skipped or returned twice across batches.
    topics = @client.exec(<<-SQL)
      SELECT n.node_id, n.subject, n.owner_id, n.when_created, nm.message, n.msg_fmt
      FROM node AS n
        INNER JOIN node_msg AS nm ON nm.node_id = n.node_id
      WHERE n.parent_id = #{app_node_id}
      ORDER BY n.when_created, n.node_id
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset}
    SQL

    break if topics.ntuples < 1

    # Skip the whole batch when every node in it has already been imported.
    next if all_records_exist?(:posts, topics.map { |t| t['node_id'].to_i })

    create_posts(topics, total: topic_count, offset: offset) do |t|
      raw = body_from(t)
      next unless raw

      raw = process_content(raw)
      raw = process_attachments(raw, t['node_id'])

      {
        id: t['node_id'],
        title: t['subject'],
        # Fall back to the system user when the owner was not imported.
        user_id: user_id_from_imported_user_id(t["owner_id"]) || Discourse::SYSTEM_USER_ID,
        created_at: Time.zone.at(@td.decode(t["when_created"])),
        category: CATEGORY_ID,
        raw: raw,
        cook_method: Post.cook_methods[:regular]
      }
    end
  end
end
|
|
|
|
# Returns the text of a node row.
#
# p - a row with 'msg_fmt', 'message' and 'node_id' entries.
#
# Messages whose format is 'm' or 's' are run through parse_email; any other
# format is returned unchanged. When the e-mail turns out to be empty, a
# "Skipped <node_id>" line is printed and nil is returned.
def body_from(p)
  message = p['message']

  case p['msg_fmt']
  when 'm', 's' then parse_email(message)
  else message
  end
rescue Email::Receiver::EmptyEmailError
  puts "Skipped #{p['node_id']}"
  nil
end
|
|
|
|
# Rewrites Nabble markup in a post body into what Discourse expects.
#
# txt - the post body; it is modified in place and also returned.
#
# * <quote author="X">...</quote> becomes [quote="X"]...[/quote]
# * <raw>...</raw> becomes its content indented by four spaces, surrounded
#   by newlines
# * a newline directly followed by '#' is replaced by a space
def process_content(txt)
  txt.gsub!(/\<quote author="(.*?)"\>/, '[quote="\1"]')
  txt.gsub!(/\<\/quote\>/, '[/quote]')

  txt.gsub!(/\<raw\>(.*?)\<\/raw\>/m) do
    indented = Regexp.last_match(1).indent(4)
    "\n#{indented}\n"
  end

  # Lines starting with # are comments, not headings. Replacing the preceding
  # newline with a space joins such a line onto the previous one, so markdown
  # does not see a '#' at the start of a line.
  txt.gsub!(/\n#/m, ' #')

  # in the languagetool forum, quite a lot of XML was not marked as raw
  # so we treat <rule...>...</rule> and <category...>...</category> as raw

  # uncomment below if you want to use this

  #txt.gsub!(/<rule(.*?)>(.*?<\/rule>)/m) do |match|
  #  c = Regexp.last_match[2].indent(4);
  #  "\n    <rule#{Regexp.last_match[1]}>#{c}\n"
  #end
  #txt.gsub!(/<category(.*?)>(.*?<\/category>)/m) do |match|
  #  c = Regexp.last_match[2].indent(4);
  #  "\n    <rule#{Regexp.last_match[1]}>#{c}\n"
  #end

  txt
end
|
|
|
|
# Replaces Nabble attachment tags in a post body with uploaded-file markup.
#
# txt    - the post body; it is modified in place and also returned.
# postid - the Nabble node id the attachments belong to.
#
# <nabble_img src="NAME" ...> becomes the uploader's embedded-image HTML and
# <nabble_a href="NAME">...</nabble_a> becomes its attachment HTML. When
# get_attachment_upload finds no file for NAME it returns nil, so the tag is
# replaced by an empty string.
def process_attachments(txt, postid)
  image_tag = /<nabble_img src="(.*?)" (.*?)>/m
  link_tag = /<nabble_a href="(.*?)">(.*?)<\/nabble_a>/m

  txt.gsub!(image_tag) do
    name = Regexp.last_match(1)
    get_attachment_upload(name, postid) { |upload| @uploader.embedded_image_html(upload) }
  end

  txt.gsub!(link_tag) do
    name = Regexp.last_match(1)
    get_attachment_upload(name, postid) { |upload| @uploader.attachment_html(upload, name) }
  end

  txt
end
|
|
|
|
# Looks up an attachment of a Nabble node, uploads it, and yields the upload.
#
# basename - the file name as it appears in the post markup.
# postid   - the Nabble node id the file is attached to.
#
# Yields the object returned by the uploader's create_upload and returns the
# block's value. Returns nil without yielding when `file_node` has no row for
# this name and node.
def get_attachment_upload(basename, postid)
  # basename is taken from post content, so it is passed as a bound parameter
  # instead of being interpolated: a name containing a quote would otherwise
  # break (or alter) the query.
  contents = @client.exec_params(
    "SELECT content FROM file_node WHERE name = $1 AND node_id = $2",
    [basename, postid]
  )
  return unless contents.any?

  dir = '/tmp/nab'
  # The scratch directory may not exist yet on a fresh machine.
  Dir.mkdir(dir) unless Dir.exist?(dir)

  # Only the last path component is used for the scratch file, so a name such
  # as "../../x" cannot be written outside the scratch directory.
  fn = File.join(dir, File.basename(basename))
  File.binwrite(fn, PG::Connection.unescape_bytea(contents[0]['content']))

  yield @uploader.create_upload(0, fn, basename)
end
|
|
|
|
# Imports every Nabble node that is not a direct child of the "app" node as
# a reply in the Discourse topic of its parent.
#
# Nodes are paged BATCH_SIZE at a time in creation order. A node is skipped
# when the topic of its parent cannot be determined or when its body cannot
# be extracted (body_from returns nil).
def import_replies
  puts "", "creating topic replies"

  app_node_id = @client.exec("SELECT node_id FROM node WHERE is_app LIMIT 1")[0]['node_id']
  post_count = @client.exec("SELECT COUNT(node_id) AS count FROM node WHERE parent_id != #{app_node_id}")[0]["count"]

  # Nabble node id => Discourse topic id, filled in as replies are seen so
  # that a reply to a reply can be resolved without another lookup.
  topic_ids = {}

  batches(BATCH_SIZE) do |offset|
    # node_id is a tie-breaker: LIMIT/OFFSET paging is only stable when the
    # ORDER BY is unique, otherwise nodes sharing a `when_created` value could
    # be skipped or returned twice across batches.
    posts = @client.exec(<<-SQL)
      SELECT n.node_id, n.parent_id, n.subject, n.owner_id, n.when_created, nm.message, n.msg_fmt
      FROM node AS n
        INNER JOIN node_msg AS nm ON nm.node_id = n.node_id
      WHERE n.parent_id != #{app_node_id}
      ORDER BY n.when_created, n.node_id
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset}
    SQL

    break if posts.ntuples < 1

    # Skip the whole batch when every node in it has already been imported.
    next if all_records_exist?(:posts, posts.map { |p| p['node_id'].to_i })

    create_posts(posts, total: post_count, offset: offset) do |p|
      parent_id = p['parent_id']
      id = p['node_id']

      topic_id = topic_ids[parent_id]
      unless topic_id
        topic = topic_lookup_from_imported_post_id(parent_id)
        topic_id = topic[:topic_id] if topic
      end
      next unless topic_id

      # Recorded before the body check, so children of a node whose body is
      # skipped below can still find their topic.
      topic_ids[id] = topic_id

      raw = body_from(p)
      next unless raw

      raw = process_content(raw)
      raw = process_attachments(raw, id)

      {
        id: id,
        topic_id: topic_id,
        # Fall back to the system user when the owner was not imported.
        user_id: user_id_from_imported_user_id(p['owner_id']) || Discourse::SYSTEM_USER_ID,
        created_at: Time.zone.at(@td.decode(p["when_created"])),
        raw: raw,
        cook_method: Post.cook_methods[:regular]
      }
    end
  end
end
|
|
end
|
|
|
|
# Core extension used by ImportScripts::Nabble#process_content to turn <raw>
# sections into indented blocks.
# NOTE(review): ActiveSupport also ships a String#indent; once this file is
# loaded this definition is the one in effect — confirm that is intended.
class String
  # Returns a copy of the string in which every line is prefixed with `char`
  # repeated `count` times. Empty lines inside the text are prefixed as well;
  # the string's line endings are kept as they are.
  #
  # count - how many times `char` is repeated in front of each line.
  # char  - the indentation unit (default: a single space).
  def indent(count, char = ' ')
    prefix = char * count

    gsub(/([^\n]*)(\n|$)/) do
      text = Regexp.last_match(1)
      line_end = Regexp.last_match(2)

      # gsub reports one extra zero-width match at the very end of the string;
      # it is not a line, so it must not receive a prefix.
      next "" if text.empty? && line_end.empty?

      # Built by interpolation rather than by appending to a "" literal: this
      # file enables frozen_string_literal, which makes such a literal frozen
      # and `<<` on it raise FrozenError.
      "#{prefix}#{text}#{line_end}"
    end
  end
end
|
|
|
|
# Script entry point: builds the importer (which connects to the Nabble
# database) and starts it. `perform` is not defined in this file.
# NOTE(review): presumably inherited from ImportScripts::Base and ends up
# calling #execute — confirm.
ImportScripts::Nabble.new.perform
|