mirror of
https://github.com/discourse/discourse.git
synced 2025-02-25 18:55:32 -06:00
FEATURE: Vanilla bulk importer
This commit is contained in:
@@ -1,9 +1,11 @@
|
||||
require "pg"
|
||||
require "set"
|
||||
require "redcarpet"
|
||||
require "htmlentities"
|
||||
|
||||
puts "Loading application..."
|
||||
require_relative "../../config/environment"
|
||||
require_relative '../import_scripts/base/uploader'
|
||||
|
||||
module BulkImport; end
|
||||
|
||||
@@ -12,10 +14,53 @@ class BulkImport::Base
|
||||
NOW ||= "now()".freeze
|
||||
PRIVATE_OFFSET ||= 2**30
|
||||
|
||||
CHARSET_MAP = {
|
||||
"armscii8" => nil,
|
||||
"ascii" => Encoding::US_ASCII,
|
||||
"big5" => Encoding::Big5,
|
||||
"binary" => Encoding::ASCII_8BIT,
|
||||
"cp1250" => Encoding::Windows_1250,
|
||||
"cp1251" => Encoding::Windows_1251,
|
||||
"cp1256" => Encoding::Windows_1256,
|
||||
"cp1257" => Encoding::Windows_1257,
|
||||
"cp850" => Encoding::CP850,
|
||||
"cp852" => Encoding::CP852,
|
||||
"cp866" => Encoding::IBM866,
|
||||
"cp932" => Encoding::Windows_31J,
|
||||
"dec8" => nil,
|
||||
"eucjpms" => Encoding::EucJP_ms,
|
||||
"euckr" => Encoding::EUC_KR,
|
||||
"gb2312" => Encoding::EUC_CN,
|
||||
"gbk" => Encoding::GBK,
|
||||
"geostd8" => nil,
|
||||
"greek" => Encoding::ISO_8859_7,
|
||||
"hebrew" => Encoding::ISO_8859_8,
|
||||
"hp8" => nil,
|
||||
"keybcs2" => nil,
|
||||
"koi8r" => Encoding::KOI8_R,
|
||||
"koi8u" => Encoding::KOI8_U,
|
||||
"latin1" => Encoding::ISO_8859_1,
|
||||
"latin2" => Encoding::ISO_8859_2,
|
||||
"latin5" => Encoding::ISO_8859_9,
|
||||
"latin7" => Encoding::ISO_8859_13,
|
||||
"macce" => Encoding::MacCentEuro,
|
||||
"macroman" => Encoding::MacRoman,
|
||||
"sjis" => Encoding::SHIFT_JIS,
|
||||
"swe7" => nil,
|
||||
"tis620" => Encoding::TIS_620,
|
||||
"ucs2" => Encoding::UTF_16BE,
|
||||
"ujis" => Encoding::EucJP_ms,
|
||||
"utf8" => Encoding::UTF_8,
|
||||
}
|
||||
|
||||
def initialize
|
||||
charset = ENV["DB_CHARSET"] || "utf8"
|
||||
db = ActiveRecord::Base.connection_config
|
||||
@encoder = PG::TextEncoder::CopyRow.new
|
||||
@raw_connection = PG.connect(dbname: db[:database], host: db[:host_names]&.first, port: db[:port])
|
||||
@uploader = ImportScripts::Uploader.new
|
||||
@html_entities = HTMLEntities.new
|
||||
@encoding = CHARSET_MAP[charset]
|
||||
|
||||
@markdown = Redcarpet::Markdown.new(
|
||||
Redcarpet::Render::HTML.new(hard_wrap: true),
|
||||
@@ -27,6 +72,7 @@ class BulkImport::Base
|
||||
|
||||
def run
|
||||
puts "Starting..."
|
||||
Rails.logger.level = 3 # :error, so that we don't create log files that are many GB
|
||||
preload_i18n
|
||||
fix_highest_post_numbers
|
||||
load_imported_ids
|
||||
@@ -194,6 +240,10 @@ class BulkImport::Base
|
||||
topic_id user_id created_at updated_at
|
||||
}
|
||||
|
||||
TOPIC_TAG_COLUMNS ||= %i{
|
||||
topic_id tag_id created_at updated_at
|
||||
}
|
||||
|
||||
def create_groups(rows, &block); create_records(rows, "group", GROUP_COLUMNS, &block); end
|
||||
|
||||
def create_users(rows, &block)
|
||||
@@ -218,6 +268,7 @@ class BulkImport::Base
|
||||
def create_posts(rows, &block); create_records(rows, "post", POST_COLUMNS, &block); end
|
||||
def create_post_actions(rows, &block); create_records(rows, "post_action", POST_ACTION_COLUMNS, &block); end
|
||||
def create_topic_allowed_users(rows, &block); create_records(rows, "topic_allowed_user", TOPIC_ALLOWED_USER_COLUMNS, &block); end
|
||||
def create_topic_tags(rows, &block); create_records(rows, "topic_tag", TOPIC_TAG_COLUMNS, &block); end
|
||||
|
||||
def process_group(group)
|
||||
@groups[group[:imported_id].to_s] = group[:id] = @last_group_id += 1
|
||||
@@ -386,6 +437,12 @@ class BulkImport::Base
|
||||
topic_allowed_user
|
||||
end
|
||||
|
||||
def process_topic_tag(topic_tag)
|
||||
topic_tag[:created_at] = NOW
|
||||
topic_tag[:updated_at] = NOW
|
||||
topic_tag
|
||||
end
|
||||
|
||||
def process_raw(raw)
|
||||
# fix whitespaces
|
||||
raw.gsub!(/(\\r)?\\n/, "\n")
|
||||
@@ -436,6 +493,9 @@ class BulkImport::Base
|
||||
# [IMG]...[/IMG]
|
||||
raw.gsub!(/(?:\s*\[IMG\]\s*)+(.+?)(?:\s*\[\/IMG\]\s*)+/im) { "\n\n#{$1}\n\n" }
|
||||
|
||||
# [IMG=url]
|
||||
raw.gsub!(/\[IMG=([^\]]*)\]/im) { "\n\n#{$1}\n\n" }
|
||||
|
||||
# [URL=...]...[/URL]
|
||||
raw.gsub!(/\[URL="?(.+?)"?\](.+?)\[\/URL\]/im) { "[#{$2.strip}](#{$1})" }
|
||||
|
||||
@@ -462,14 +522,19 @@ class BulkImport::Base
|
||||
raw.gsub!(/\[TD="?.*?"?\](.*?)\[\/TD\]/im, "\\1")
|
||||
|
||||
# [QUOTE]...[/QUOTE]
|
||||
raw.gsub!(/\[QUOTE\](.+?)\[\/QUOTE\]/im) { |quote|
|
||||
quote.gsub!(/\[QUOTE\](.+?)\[\/QUOTE\]/im) { "\n#{$1}\n" }
|
||||
quote.gsub!(/\n(.+?)/) { "\n> #{$1}" }
|
||||
}
|
||||
raw.gsub!(/\[QUOTE="([^\]]+)"\]/i) { "[QUOTE=#{$1}]" }
|
||||
|
||||
# [QUOTE=<username>;<postid>]...[/QUOTE]
|
||||
raw.gsub!(/\[QUOTE=([^;]+);(\d+)\](.+?)\[\/QUOTE\]/im) do
|
||||
imported_username, imported_postid, quote = $1, $2, $3
|
||||
# Nested Quotes
|
||||
raw.gsub!(/(\[\/?QUOTE.*?\])/mi) { |q| "\n#{q}\n" }
|
||||
|
||||
# raw.gsub!(/\[QUOTE\](.+?)\[\/QUOTE\]/im) { |quote|
|
||||
# quote.gsub!(/\[QUOTE\](.+?)\[\/QUOTE\]/im) { "\n#{$1}\n" }
|
||||
# quote.gsub!(/\n(.+?)/) { "\n> #{$1}" }
|
||||
# }
|
||||
|
||||
# [QUOTE=<username>;<postid>]
|
||||
raw.gsub!(/\[QUOTE=([^;\]]+);(\d+)\]/i) do
|
||||
imported_username, imported_postid = $1, $2
|
||||
|
||||
username = @mapped_usernames[imported_username] || imported_username
|
||||
post_id = post_id_from_imported_id(imported_postid)
|
||||
@@ -477,9 +542,9 @@ class BulkImport::Base
|
||||
topic_id = @topic_id_by_post_id[post_id]
|
||||
|
||||
if post_number && topic_id
|
||||
"\n[quote=\"#{username}, post:#{post_number}, topic:#{topic_id}\"]\n#{quote}\n[/quote]"
|
||||
"\n[quote=\"#{username}, post:#{post_number}, topic:#{topic_id}\"]\n"
|
||||
else
|
||||
"\n[quote=\"#{username}\"]\n#{quote}\n[/quote]\n"
|
||||
"\n[quote=\"#{username}\"]\n"
|
||||
end
|
||||
end
|
||||
|
||||
@@ -545,6 +610,10 @@ class BulkImport::Base
|
||||
end
|
||||
end
|
||||
|
||||
def create_upload(user_id, path, source_filename)
|
||||
@uploader.create_upload(user_id, path, source_filename)
|
||||
end
|
||||
|
||||
def fix_name(name)
|
||||
name.scrub! if name.valid_encoding? == false
|
||||
return if name.blank?
|
||||
@@ -610,4 +679,14 @@ class BulkImport::Base
|
||||
Redcarpet::Render::SmartyPants.render(ERB::Util.html_escape(title)).scrub.strip
|
||||
end
|
||||
|
||||
def normalize_text(text)
|
||||
return nil unless text.present?
|
||||
@html_entities.decode(normalize_charset(text.presence || "").scrub)
|
||||
end
|
||||
|
||||
def normalize_charset(text)
|
||||
return text if @encoding == Encoding::UTF_8
|
||||
return text && text.encode(@encoding).force_encoding(Encoding::UTF_8)
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user