mirror of
https://github.com/discourse/discourse.git
synced 2024-11-27 11:20:57 -06:00
821124f3fd
Update all import scripts to take advantage of all_records_exist?
277 lines
7.5 KiB
Ruby
277 lines
7.5 KiB
Ruby
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
|
|
require 'pg'
|
|
|
|
class ImportScripts::MyAskBot < ImportScripts::Base
|
|
# CHANGE THESE BEFORE RUNNING THE IMPORTER
|
|
|
|
BATCH_SIZE = 1000
|
|
|
|
OLD_SITE = "ask.cvxr.com"
|
|
DB_NAME = "cvxforum"
|
|
DB_USER = "cvxforum"
|
|
DB_PORT = 5432
|
|
DB_HOST = "ask.cvxr.com"
|
|
DB_PASS = 'yeah, right'
|
|
|
|
# A list of categories to create. Any post with one of these tags will be
|
|
# assigned to that category. Ties are broken by list orer.
|
|
CATEGORIES = [ 'Nonconvex', 'TFOCS', 'MIDCP', 'FAQ' ]
|
|
|
|
def initialize
|
|
super
|
|
|
|
@thread_parents = {}
|
|
@tagmap = []
|
|
@td = PG::TextDecoder::TimestampWithTimeZone.new
|
|
@client = PG.connect(
|
|
:dbname => DB_NAME,
|
|
:host => DB_HOST,
|
|
:port => DB_PORT,
|
|
:user => DB_USER,
|
|
:password => DB_PASS
|
|
)
|
|
end
|
|
|
|
def execute
|
|
create_cats
|
|
import_users
|
|
read_tags
|
|
import_posts
|
|
import_replies
|
|
post_process_posts
|
|
end
|
|
|
|
def create_cats
|
|
puts "", "creating categories"
|
|
CATEGORIES.each do |cat|
|
|
unless Category.where("LOWER(name) = ?", cat.downcase).first
|
|
Category.new(name: cat, user_id: -1).save!
|
|
end
|
|
end
|
|
end
|
|
|
|
def read_tags
|
|
puts "", "reading thread tags..."
|
|
|
|
tag_count = @client.exec(<<-SQL
|
|
SELECT COUNT(A.id)
|
|
FROM askbot_thread_tags A
|
|
JOIN tag B
|
|
ON A.tag_id = B.id
|
|
WHERE A.tag_id > 0
|
|
SQL
|
|
)[0]["count"]
|
|
|
|
tags_done = 0
|
|
batches(BATCH_SIZE) do |offset|
|
|
tags = @client.exec(<<-SQL
|
|
SELECT A.thread_id, B.name
|
|
FROM askbot_thread_tags A
|
|
JOIN tag B
|
|
ON A.tag_id = B.id
|
|
WHERE A.tag_id > 0
|
|
LIMIT #{BATCH_SIZE}
|
|
OFFSET #{offset}
|
|
SQL
|
|
)
|
|
break if tags.ntuples() < 1
|
|
tags.each do |tag|
|
|
tid = tag["thread_id"].to_i
|
|
tnm = tag["name"].downcase
|
|
if @tagmap[tid]
|
|
@tagmap[tid].push( tnm )
|
|
else
|
|
@tagmap[tid] = [ tnm ]
|
|
end
|
|
tags_done += 1
|
|
print_status tags_done, tag_count
|
|
end
|
|
end
|
|
end
|
|
|
|
def import_users
|
|
puts "", "importing users"
|
|
|
|
total_count = @client.exec(<<-SQL
|
|
SELECT COUNT(id)
|
|
FROM auth_user
|
|
SQL
|
|
)[0]["count"]
|
|
|
|
batches(BATCH_SIZE) do |offset|
|
|
users = @client.query(<<-SQL
|
|
SELECT id, username, email, is_staff, date_joined, last_seen, real_name, website, location, about
|
|
FROM auth_user
|
|
ORDER BY date_joined
|
|
LIMIT #{BATCH_SIZE}
|
|
OFFSET #{offset}
|
|
SQL
|
|
)
|
|
|
|
break if users.ntuples() < 1
|
|
|
|
next if all_records_exist? :users, users.map {|u| u["id"].to_i}
|
|
|
|
create_users(users, total: total_count, offset: offset) do |user|
|
|
{
|
|
id: user["id"],
|
|
username: user["username"],
|
|
email: user["email"] || (SecureRandom.hex << "@domain.com"),
|
|
admin: user["is_staff"],
|
|
created_at: Time.zone.at(@td.decode(user["date_joined"])),
|
|
last_seen_at: Time.zone.at(@td.decode(user["last_seen"])),
|
|
name: user["real_name"],
|
|
website: user["website"],
|
|
location: user["location"],
|
|
}
|
|
end
|
|
end
|
|
end
|
|
|
|
def import_posts
|
|
puts "", "importing questions..."
|
|
|
|
post_count = @client.exec(<<-SQL
|
|
SELECT COUNT(A.id)
|
|
FROM askbot_post A
|
|
JOIN askbot_thread B
|
|
ON A.thread_id = B.id
|
|
WHERE NOT B.closed AND A.post_type='question'
|
|
SQL
|
|
)[0]["count"]
|
|
|
|
batches(BATCH_SIZE) do |offset|
|
|
posts = @client.exec(<<-SQL
|
|
SELECT A.id, A.author_id, A.added_at, A.text, A.thread_id, B.title
|
|
FROM askbot_post A
|
|
JOIN askbot_thread B
|
|
ON A.thread_id = B.id
|
|
WHERE NOT B.closed AND A.post_type = 'question'
|
|
ORDER BY A.added_at
|
|
LIMIT #{BATCH_SIZE}
|
|
OFFSET #{offset}
|
|
SQL
|
|
)
|
|
|
|
break if posts.ntuples() < 1
|
|
|
|
next if all_records_exist? :posts, posts.map {|p| p["id"].to_i}
|
|
|
|
create_posts(posts, total: post_count, offset: offset) do |post|
|
|
pid = post["id"]
|
|
tid = post["thread_id"].to_i
|
|
tags = @tagmap[tid]
|
|
cat = nil
|
|
if tags
|
|
CATEGORIES.each do |cname|
|
|
next unless tags.include?(cname.downcase)
|
|
cat = cname
|
|
break
|
|
end
|
|
end
|
|
@thread_parents[tid] = pid
|
|
{
|
|
id: pid,
|
|
title: post["title"],
|
|
category: cat,
|
|
custom_fields: {import_id: pid, import_thread_id: tid, import_tags: tags},
|
|
user_id: user_id_from_imported_user_id(post["author_id"]) || Discourse::SYSTEM_USER_ID,
|
|
created_at: Time.zone.at(@td.decode(post["added_at"])),
|
|
raw: post["text"],
|
|
}
|
|
end
|
|
end
|
|
end
|
|
|
|
def import_replies
|
|
puts "", "importing answers and comments..."
|
|
|
|
post_count = @client.exec(<<-SQL
|
|
SELECT COUNT(A.id)
|
|
FROM askbot_post A
|
|
JOIN askbot_thread B
|
|
ON A.thread_id = B.id
|
|
WHERE NOT B.closed AND A.post_type<>'question'
|
|
SQL
|
|
)[0]["count"]
|
|
|
|
batches(BATCH_SIZE) do |offset|
|
|
posts = @client.exec(<<-SQL
|
|
SELECT A.id, A.author_id, A.added_at, A.text, A.thread_id, B.title
|
|
FROM askbot_post A
|
|
JOIN askbot_thread B
|
|
ON A.thread_id = B.id
|
|
WHERE NOT B.closed AND A.post_type <> 'question'
|
|
ORDER BY A.added_at
|
|
LIMIT #{BATCH_SIZE}
|
|
OFFSET #{offset}
|
|
SQL
|
|
)
|
|
|
|
break if posts.ntuples() < 1
|
|
|
|
next if all_records_exist? :posts, posts.map {|p| p["id"].to_i}
|
|
|
|
create_posts(posts, total: post_count, offset: offset) do |post|
|
|
tid = post["thread_id"].to_i
|
|
next unless thread = @thread_parents[tid]
|
|
next unless parent = topic_lookup_from_imported_post_id(thread)
|
|
pid = post["id"]
|
|
{
|
|
id: pid,
|
|
topic_id: parent[:topic_id],
|
|
custom_fields: {import_id: pid},
|
|
user_id: user_id_from_imported_user_id(post["author_id"]) || Discourse::SYSTEM_USER_ID,
|
|
created_at: Time.zone.at(@td.decode(post["added_at"])),
|
|
raw: post["text"]
|
|
}
|
|
end
|
|
end
|
|
end
|
|
|
|
def post_process_posts
|
|
puts "", "Postprocessing posts..."
|
|
current = 0
|
|
max = Post.count
|
|
# Rewrite internal links; e.g.
|
|
# ask.cvxr.com/question/(\d+)/[^'"}]*
|
|
# I am sure this is incomplete, but we didn't make heavy use of internal
|
|
# links on our site.
|
|
tmp = Regexp.quote("http://" << OLD_SITE)
|
|
r1 = /"(#{tmp})?\/question\/(\d+)\/[a-zA-Z-]*\/?"/
|
|
r2 = /\((#{tmp})?\/question\/(\d+)\/[a-zA-Z-]*\/?\)/
|
|
r3 = /<?#tmp\/question\/(\d+)\/[a-zA-Z-]*\/?>?/
|
|
Post.find_each do |post|
|
|
raw = post.raw.gsub(r1) do
|
|
if topic = topic_lookup_from_imported_post_id($2)
|
|
"\"#{topic[:url]}\""
|
|
else
|
|
$&
|
|
end
|
|
end
|
|
raw = raw.gsub(r2) do
|
|
if topic = topic_lookup_from_imported_post_id($2)
|
|
"(#{topic[:url]})"
|
|
else
|
|
$&
|
|
end
|
|
end
|
|
raw = raw.gsub(r3) do
|
|
if topic = topic_lookup_from_imported_post_id($1)
|
|
trec = Topic.find_by(id: topic[:topic_id])
|
|
"[#{trec.title}](#{topic[:url]})"
|
|
else
|
|
$&
|
|
end
|
|
end
|
|
if raw != post.raw
|
|
post.raw = raw
|
|
post.save
|
|
end
|
|
print_status(current += 1, max)
|
|
end
|
|
end
|
|
end
|
|
|
|
ImportScripts::MyAskBot.new.perform
|