improve vBulletin import script

This commit is contained in:
Arpit Jalan 2016-07-10 14:49:24 +05:30
parent 64ca5552bc
commit 201d344a2d

View File

@ -1,13 +1,15 @@
require 'mysql2' require 'mysql2'
require File.expand_path(File.dirname(__FILE__) + "/base.rb") require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require 'htmlentities' require 'htmlentities'
require 'php_serialize' # https://github.com/jqr/php-serialize
class ImportScripts::VBulletin < ImportScripts::Base class ImportScripts::VBulletin < ImportScripts::Base
BATCH_SIZE = 1000 BATCH_SIZE = 1000
# CHANGE THESE BEFORE RUNNING THE IMPORTER # CHANGE THESE BEFORE RUNNING THE IMPORTER
DATABASE = "iref" DATABASE = "q23"
TIMEZONE = "Asia/Kolkata" TABLE_PREFIX = "vb_"
TIMEZONE = "America/Los_Angeles"
ATTACHMENT_DIR = '/path/to/your/attachment/folder' ATTACHMENT_DIR = '/path/to/your/attachment/folder'
def initialize def initialize
@ -32,10 +34,14 @@ class ImportScripts::VBulletin < ImportScripts::Base
import_categories import_categories
import_topics import_topics
import_posts import_posts
import_private_messages
import_attachments import_attachments
close_topics close_topics
post_process_posts post_process_posts
create_permalinks
suspend_users
end end
def import_groups def import_groups
@ -43,7 +49,7 @@ class ImportScripts::VBulletin < ImportScripts::Base
groups = mysql_query <<-SQL groups = mysql_query <<-SQL
SELECT usergroupid, title SELECT usergroupid, title
FROM usergroup FROM #{TABLE_PREFIX}usergroup
ORDER BY usergroupid ORDER BY usergroupid
SQL SQL
@ -58,12 +64,12 @@ class ImportScripts::VBulletin < ImportScripts::Base
def import_users def import_users
puts "", "importing users" puts "", "importing users"
user_count = mysql_query("SELECT COUNT(userid) count FROM user").first["count"] user_count = mysql_query("SELECT COUNT(userid) count FROM #{TABLE_PREFIX}user").first["count"]
batches(BATCH_SIZE) do |offset| batches(BATCH_SIZE) do |offset|
users = mysql_query <<-SQL users = mysql_query <<-SQL
SELECT userid, username, homepage, usertitle, usergroupid, joindate, email SELECT userid, username, homepage, usertitle, usergroupid, joindate, email
FROM user FROM #{TABLE_PREFIX}user
ORDER BY userid ORDER BY userid
LIMIT #{BATCH_SIZE} LIMIT #{BATCH_SIZE}
OFFSET #{offset} OFFSET #{offset}
@ -85,6 +91,7 @@ class ImportScripts::VBulletin < ImportScripts::Base
title: @htmlentities.decode(user["usertitle"]).strip, title: @htmlentities.decode(user["usertitle"]).strip,
primary_group_id: group_id_from_imported_group_id(user["usergroupid"]), primary_group_id: group_id_from_imported_group_id(user["usergroupid"]),
created_at: parse_timestamp(user["joindate"]), created_at: parse_timestamp(user["joindate"]),
last_seen_at: parse_timestamp(user["lastvisit"]),
post_create_action: proc do |u| post_create_action: proc do |u|
@old_username_to_new_usernames[user["username"]] = u.username @old_username_to_new_usernames[user["username"]] = u.username
import_profile_picture(user, u) import_profile_picture(user, u)
@ -98,7 +105,7 @@ class ImportScripts::VBulletin < ImportScripts::Base
def import_profile_picture(old_user, imported_user) def import_profile_picture(old_user, imported_user)
query = mysql_query <<-SQL query = mysql_query <<-SQL
SELECT filedata, filename SELECT filedata, filename
FROM customavatar FROM #{TABLE_PREFIX}customavatar
WHERE userid = #{old_user["userid"]} WHERE userid = #{old_user["userid"]}
ORDER BY dateline DESC ORDER BY dateline DESC
LIMIT 1 LIMIT 1
@ -127,7 +134,7 @@ class ImportScripts::VBulletin < ImportScripts::Base
def import_profile_background(old_user, imported_user) def import_profile_background(old_user, imported_user)
query = mysql_query <<-SQL query = mysql_query <<-SQL
SELECT filedata, filename SELECT filedata, filename
FROM customprofilepic FROM #{TABLE_PREFIX}customprofilepic
WHERE userid = #{old_user["userid"]} WHERE userid = #{old_user["userid"]}
ORDER BY dateline DESC ORDER BY dateline DESC
LIMIT 1 LIMIT 1
@ -154,11 +161,11 @@ class ImportScripts::VBulletin < ImportScripts::Base
def import_categories def import_categories
puts "", "importing top level categories..." puts "", "importing top level categories..."
categories = mysql_query("SELECT forumid, title, description, displayorder, parentid FROM forum ORDER BY forumid").to_a categories = mysql_query("SELECT forumid, title, description, displayorder, parentid FROM #{TABLE_PREFIX}forum ORDER BY forumid").to_a
top_level_categories = categories.select { |c| c["parentid"] == -1 } # top_level_categories = categories.select { |c| c["parentid"] == -1 }
create_categories(top_level_categories) do |category| create_categories(categories) do |category|
{ {
id: category["forumid"], id: category["forumid"],
name: @htmlentities.decode(category["title"]).strip, name: @htmlentities.decode(category["title"]).strip,
@ -167,27 +174,27 @@ class ImportScripts::VBulletin < ImportScripts::Base
} }
end end
puts "", "importing children categories..." # puts "", "importing children categories..."
#
children_categories = categories.select { |c| c["parentid"] != -1 } # children_categories = categories.select { |c| c["parentid"] != -1 }
top_level_category_ids = Set.new(top_level_categories.map { |c| c["forumid"] }) # top_level_category_ids = Set.new(top_level_categories.map { |c| c["forumid"] })
#
# cut down the tree to only 2 levels of categories # # cut down the tree to only 2 levels of categories
children_categories.each do |cc| # children_categories.each do |cc|
while !top_level_category_ids.include?(cc["parentid"]) # while !top_level_category_ids.include?(cc["parentid"])
cc["parentid"] = categories.detect { |c| c["forumid"] == cc["parentid"] }["parentid"] # cc["parentid"] = categories.detect { |c| c["forumid"] == cc["parentid"] }["parentid"]
end # end
end # end
#
create_categories(children_categories) do |category| # create_categories(children_categories) do |category|
{ # {
id: category["forumid"], # id: category["forumid"],
name: @htmlentities.decode(category["title"]).strip, # name: @htmlentities.decode(category["title"]).strip,
position: category["displayorder"], # position: category["displayorder"],
description: @htmlentities.decode(category["description"]).strip, # description: @htmlentities.decode(category["description"]).strip,
parent_category_id: category_id_from_imported_category_id(category["parentid"]) # parent_category_id: category_id_from_imported_category_id(category["parentid"])
} # }
end # end
end end
def import_topics def import_topics
@ -196,14 +203,14 @@ class ImportScripts::VBulletin < ImportScripts::Base
# keep track of closed topics # keep track of closed topics
@closed_topic_ids = [] @closed_topic_ids = []
topic_count = mysql_query("SELECT COUNT(threadid) count FROM thread").first["count"] topic_count = mysql_query("SELECT COUNT(threadid) count FROM #{TABLE_PREFIX}thread").first["count"]
batches(BATCH_SIZE) do |offset| batches(BATCH_SIZE) do |offset|
topics = mysql_query <<-SQL topics = mysql_query <<-SQL
SELECT t.threadid threadid, t.title title, forumid, open, postuserid, t.dateline dateline, views, t.visible visible, sticky, SELECT t.threadid threadid, t.title title, forumid, open, postuserid, t.dateline dateline, views, t.visible visible, sticky,
p.pagetext raw p.pagetext raw
FROM thread t FROM #{TABLE_PREFIX}thread t
JOIN post p ON p.postid = t.firstpostid JOIN #{TABLE_PREFIX}post p ON p.postid = t.firstpostid
ORDER BY t.threadid ORDER BY t.threadid
LIMIT #{BATCH_SIZE} LIMIT #{BATCH_SIZE}
OFFSET #{offset} OFFSET #{offset}
@ -237,15 +244,15 @@ class ImportScripts::VBulletin < ImportScripts::Base
puts "", "importing posts..." puts "", "importing posts..."
# make sure `firstpostid` is indexed # make sure `firstpostid` is indexed
mysql_query("CREATE INDEX firstpostid_index ON thread (firstpostid)") mysql_query("CREATE INDEX firstpostid_index ON #{TABLE_PREFIX}thread (firstpostid)")
post_count = mysql_query("SELECT COUNT(postid) count FROM post WHERE postid NOT IN (SELECT firstpostid FROM thread)").first["count"] post_count = mysql_query("SELECT COUNT(postid) count FROM #{TABLE_PREFIX}post WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)").first["count"]
batches(BATCH_SIZE) do |offset| batches(BATCH_SIZE) do |offset|
posts = mysql_query <<-SQL posts = mysql_query <<-SQL
SELECT postid, userid, threadid, pagetext raw, dateline, visible, parentid SELECT postid, userid, threadid, pagetext raw, dateline, visible, parentid
FROM post FROM #{TABLE_PREFIX}post
WHERE postid NOT IN (SELECT firstpostid FROM thread) WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)
ORDER BY postid ORDER BY postid
LIMIT #{BATCH_SIZE} LIMIT #{BATCH_SIZE}
OFFSET #{offset} OFFSET #{offset}
@ -278,7 +285,7 @@ class ImportScripts::VBulletin < ImportScripts::Base
def find_upload(post, attachment_id) def find_upload(post, attachment_id)
sql = "SELECT a.attachmentid attachment_id, a.userid user_id, a.filedataid file_id, a.filename filename, sql = "SELECT a.attachmentid attachment_id, a.userid user_id, a.filedataid file_id, a.filename filename,
a.caption caption a.caption caption
FROM attachment a FROM #{TABLE_PREFIX}attachment a
WHERE a.attachmentid = #{attachment_id}" WHERE a.attachmentid = #{attachment_id}"
results = mysql_query(sql) results = mysql_query(sql)
@ -310,11 +317,119 @@ class ImportScripts::VBulletin < ImportScripts::Base
return nil return nil
end end
# Imports vBulletin private messages (rows of the `pmtext` table) as Discourse
# private-message posts, in batches of BATCH_SIZE.
#
# Every row gets the import id "pm-<pmtextid>". A message whose title starts
# with "Re:" is attached as a reply to an earlier message of the same batch
# that has the same remaining title and the same sorted participant ids;
# any other message (or a reply whose parent cannot be found) starts a new
# private-message topic.
#
# Messages with blank content or an unreadable recipient list are skipped
# (the block returns nil for them).
def import_private_messages
  puts "", "importing private messages..."

  topic_count = mysql_query("SELECT COUNT(pmtextid) count FROM #{TABLE_PREFIX}pmtext").first["count"]

  batches(BATCH_SIZE) do |offset|
    private_messages = mysql_query <<-SQL
        SELECT pmtextid, fromuserid, title, message, touserarray, dateline
          FROM #{TABLE_PREFIX}pmtext
      ORDER BY pmtextid
         LIMIT #{BATCH_SIZE}
        OFFSET #{offset}
    SQL

    break if private_messages.size < 1
    next if all_records_exist? :posts, private_messages.map { |pm| "pm-#{pm['pmtextid']}" }

    # Maps [title, sorted participant ids] => pmtextid of the first message
    # seen with that title and those participants (within this batch).
    title_username_of_pm_first_post = {}

    create_posts(private_messages, total: topic_count, offset: offset) do |m|
      mapped = {}

      mapped[:id] = "pm-#{m['pmtextid']}"
      mapped[:user_id] = user_id_from_imported_user_id(m['fromuserid']) || Discourse::SYSTEM_USER_ID
      # Preprocessing stays best-effort, but the failure is reported instead of
      # being swallowed silently; the message is then dropped as blank below.
      mapped[:raw] =
        begin
          preprocess_post_raw(m['message'])
        rescue StandardError => e
          puts "pm-#{m['pmtextid']} could not be preprocessed -- #{e.message}"
          nil
        end
      # NOTE(review): other timestamps in this importer go through
      # parse_timestamp; kept as-is here, confirm whether that is intended.
      mapped[:created_at] = Time.zone.at(m['dateline'])
      title = @htmlentities.decode(m['title']).strip[0...255]
      topic_id = nil

      next if mapped[:raw].blank?

      # users who are part of this private message.
      target_usernames = []
      target_userids = []

      begin
        to_user_array = PHP.unserialize(m['touserarray'])
      rescue StandardError
        puts "#{m['pmtextid']} -- #{m['touserarray']}"
        next
      end

      # Resolves one imported recipient id and records it. The parentheses
      # matter: `<<` binds tighter than `||`, so without them a nil id would be
      # pushed and the system-user fallback would never apply.
      add_recipient = lambda do |imported_user_id|
        user_id = user_id_from_imported_user_id(imported_user_id)
        username = User.find_by(id: user_id).try(:username)
        target_userids << (user_id || Discourse::SYSTEM_USER_ID)
        target_usernames << username if username
      end

      begin
        to_user_array.each do |to_user|
          if to_user[0] == "cc" || to_user[0] == "bcc" # not sure if we should include bcc users
            to_user[1].each { |to_user_cc| add_recipient.call(to_user_cc[0]) }
          else
            add_recipient.call(to_user[0])
          end
        end
      rescue StandardError
        puts "skipping pm-#{m['pmtextid']} `to_user_array` is not properly serialized -- #{to_user_array.inspect}"
        next
      end

      # Sorted so the same set of people always produces the same lookup key.
      participants = target_userids
      participants << mapped[:user_id]
      participants.sort!

      if title =~ /^Re:/
        # Strip "Re:" plus up to five following characters and use the first
        # remainder that matches a previously seen first message.
        parent_id = nil
        (3..8).each do |prefix_length|
          parent_id = title_username_of_pm_first_post[[title[prefix_length..-1], participants]]
          break if parent_id
        end

        if parent_id
          parent_topic = topic_lookup_from_imported_post_id("pm-#{parent_id}")
          topic_id = parent_topic[:topic_id] if parent_topic
        end
      else
        title_username_of_pm_first_post[[title, participants]] ||= m['pmtextid']
      end

      if topic_id
        mapped[:topic_id] = topic_id
      else
        mapped[:title] = title
        mapped[:archetype] = Archetype.private_message
        mapped[:target_usernames] = target_usernames.join(',')
        if mapped[:target_usernames].empty? # pm with yourself?
          mapped[:target_usernames] = "system"
          puts "pm-#{m['pmtextid']} has no target (#{m['touserarray']})"
        end
      end

      mapped
    end
  end
end
def import_attachments def import_attachments
puts '', 'importing attachments...' puts '', 'importing attachments...'
current_count = 0 current_count = 0
total_count = mysql_query("SELECT COUNT(postid) count FROM post WHERE postid NOT IN (SELECT firstpostid FROM thread)").first["count"] total_count = mysql_query("SELECT COUNT(postid) count FROM #{TABLE_PREFIX}post WHERE postid NOT IN (SELECT firstpostid FROM #{TABLE_PREFIX}thread)").first["count"]
success_count = 0 success_count = 0
fail_count = 0 fail_count = 0
@ -353,15 +468,15 @@ class ImportScripts::VBulletin < ImportScripts::Base
sql = <<-SQL sql = <<-SQL
WITH closed_topic_ids AS ( WITH closed_topic_ids AS (
SELECT t.id AS topic_id SELECT t.id AS topic_id
FROM post_custom_fields pcf FROM #{TABLE_PREFIX}post_custom_fields pcf
JOIN posts p ON p.id = pcf.post_id JOIN #{TABLE_PREFIX}posts p ON p.id = pcf.post_id
JOIN topics t ON t.id = p.topic_id JOIN #{TABLE_PREFIX}topics t ON t.id = p.topic_id
WHERE pcf.name = 'import_id' WHERE pcf.name = 'import_id'
AND pcf.value IN (?) AND pcf.value IN (?)
) )
UPDATE topics UPDATE topics
SET closed = true SET closed = true
WHERE id IN (SELECT topic_id FROM closed_topic_ids) WHERE id IN (SELECT topic_id FROM #{TABLE_PREFIX}closed_topic_ids)
SQL SQL
Topic.exec_sql(sql, @closed_topic_ids) Topic.exec_sql(sql, @closed_topic_ids)
@ -430,7 +545,8 @@ class ImportScripts::VBulletin < ImportScripts::Base
.gsub("\u2603", ">") .gsub("\u2603", ">")
# [URL=...]...[/URL] # [URL=...]...[/URL]
raw = raw.gsub(/\[url="?(.+?)"?\](.+)\[\/url\]/i) { "[#{$2}](#{$1})" } raw.gsub!(/\[url="?([^"]+?)"?\](.*?)\[\/url\]/im) { "[#{$2.strip}](#{$1})" }
raw.gsub!(/\[url="?(.+?)"?\](.+)\[\/url\]/im) { "[#{$2.strip}](#{$1})" }
# [URL]...[/URL] # [URL]...[/URL]
# [MP3]...[/MP3] # [MP3]...[/MP3]
@ -446,17 +562,11 @@ class ImportScripts::VBulletin < ImportScripts::Base
"@#{old_username}" "@#{old_username}"
end end
# [MENTION=<user_id>]<username>[/MENTION]
# raw = raw.gsub(/\[mention="?(\d+)"?\](.+?)\[\/mention\]/i) do
# user_id, old_username = $1, $2
# if user = @users.select { |u| u[:userid] == user_id }.first
# old_username = @old_username_to_new_usernames[user[:username]] || user[:username]
# end
# "@#{old_username}"
# end
# [QUOTE]...[/QUOTE] # [QUOTE]...[/QUOTE]
raw = raw.gsub(/\[quote\](.+?)\[\/quote\]/im) { "\n> #{$1}\n" } raw.gsub!(/\[quote\](.+?)\[\/quote\]/im) { |quote|
quote.gsub!(/\[quote\](.+?)\[\/quote\]/im) { "\n#{$1}\n" }
quote.gsub!(/\n(.+?)/) { "\n> #{$1}" }
}
# [QUOTE=<username>]...[/QUOTE] # [QUOTE=<username>]...[/QUOTE]
raw = raw.gsub(/\[quote=([^;\]]+)\](.+?)\[\/quote\]/im) do raw = raw.gsub(/\[quote=([^;\]]+)\](.+?)\[\/quote\]/im) do
@ -473,6 +583,27 @@ class ImportScripts::VBulletin < ImportScripts::Base
# [VIDEO=youtube;<id>]...[/VIDEO] # [VIDEO=youtube;<id>]...[/VIDEO]
raw = raw.gsub(/\[video=youtube;([^\]]+)\].*?\[\/video\]/i) { "\n//youtu.be/#{$1}\n" } raw = raw.gsub(/\[video=youtube;([^\]]+)\].*?\[\/video\]/i) { "\n//youtu.be/#{$1}\n" }
# More Additions ....
# [spoiler=Some hidden stuff]SPOILER HERE!![/spoiler]
raw.gsub!(/\[spoiler="?(.+?)"?\](.+?)\[\/spoiler\]/im) { "\n#{$1}\n[spoiler]#{$2}[/spoiler]\n" }
# [IMG][IMG]http://i63.tinypic.com/akga3r.jpg[/IMG][/IMG]
raw.gsub!(/\[IMG\]\[IMG\](.+?)\[\/IMG\]\[\/IMG\]/i) { "[IMG]#{$1}[/IMG]" }
# convert list tags to ul and list=1 tags to ol
# (basically, we're only missing list=a here...)
# (https://meta.discourse.org/t/phpbb-3-importer-old/17397)
raw.gsub!(/\[list\](.*?)\[\/list\]/im, '[ul]\1[/ul]')
raw.gsub!(/\[list=1\](.*?)\[\/list\]/im, '[ol]\1[/ol]')
raw.gsub!(/\[list\](.*?)\[\/list:u\]/im, '[ul]\1[/ul]')
raw.gsub!(/\[list=1\](.*?)\[\/list:o\]/im, '[ol]\1[/ol]')
# convert *-tags to li-tags so bbcode-to-md can do its magic on phpBB's lists:
raw.gsub!(/\[\*\]\n/, '')
raw.gsub!(/\[\*\](.*?)\[\/\*:m\]/, '[li]\1[/li]')
raw.gsub!(/\[\*\](.*?)\n/, '[li]\1[/li]')
raw raw
end end
@ -546,6 +677,68 @@ class ImportScripts::VBulletin < ImportScripts::Base
raw raw
end end
# Writes vb_map.csv (resolved relative to this script via "../vb_map.csv")
# with one single-column row per imported topic, in the form
# "XXX<old id> YYY<topic id>". The old id is the part of the first post's
# `import_id` custom field after the last "-".
#
# Topics without a first post, or whose first post has no `import_id`, are
# left out.
def create_permalinks
  # The requires at the top of this file do not load CSV; requiring it here is
  # a no-op if something else already did.
  require 'csv'

  puts '', 'Creating Permalinks...', ''

  id_mapping = []

  Topic.listable_topics.find_each do |topic|
    first_post = topic.first_post
    # A topic without a first post has nothing to map; skip it instead of
    # raising on nil.
    next unless first_post

    pcf = first_post.custom_fields
    next unless pcf && pcf["import_id"]

    id = pcf["import_id"].split('-').last
    id_mapping.push("XXX#{id} YYY#{topic.id}")
  end

  # Category.find_each do |cat|
  #   ccf = cat.custom_fields
  #   if ccf && ccf["import_id"]
  #     id = ccf["import_id"].to_i
  #     id_mapping.push("/forumdisplay.php?#{id} http://forum.quartertothree.com#{cat.url}")
  #   end
  # end

  CSV.open(File.expand_path("../vb_map.csv", __FILE__), "w") do |csv|
    id_mapping.each { |value| csv << [value] }
  end
end
# Suspends every imported user that has a row in vBulletin's `userban` table.
#
# For each ban row the matching Discourse user is resolved through the import
# id mapping, `suspended_at` is set from the row's `bandate`, and
# `suspended_till` is set 200 years ahead. Successful suspensions are recorded
# with StaffActionLogger under the system user; users that cannot be found or
# saved are counted as failures and reported on stdout.
def suspend_users
  puts '', "updating banned users"

  banned = 0
  failed = 0
  total = mysql_query("SELECT count(*) count FROM #{TABLE_PREFIX}userban").first['count']

  system_user = Discourse.system_user

  mysql_query("SELECT userid, bandate FROM #{TABLE_PREFIX}userban").each do |b|
    # `userid` is the vBulletin id, so it has to be translated to the Discourse
    # user id before the lookup.
    user = User.find_by_id(user_id_from_imported_user_id(b['userid']))

    if user
      # The ban date comes from the ban row, not from the Discourse user.
      user.suspended_at = parse_timestamp(b["bandate"])
      user.suspended_till = 200.years.from_now

      if user.save
        StaffActionLogger.new(system_user).log_user_suspend(user, "banned during initial import")
        banned += 1
      else
        puts "Failed to suspend user #{user.username}. #{user.errors.try(:full_messages).try(:inspect)}"
        failed += 1
      end
    else
      puts "Not found: #{b['userid']}"
      failed += 1
    end

    print_status banned + failed, total
  end
end
# Converts a source timestamp into a time in the application's time zone by
# passing it through `@tz.utc_to_local` and wrapping the result with
# `Time.zone.at`.
#
# Returns nil when `timestamp` is nil, so a missing column value does not
# turn into an error or a bogus date.
# NOTE(review): presumably `timestamp` is a UNIX epoch integer and `@tz` is the
# time zone object built from TIMEZONE -- confirm against `initialize`.
def parse_timestamp(timestamp)
  return if timestamp.nil?

  Time.zone.at(@tz.utc_to_local(timestamp))
end