FEATURE: Vanilla bulk importer

This commit is contained in:
Neil Lalonde
2018-02-02 16:28:34 -05:00
parent 68d0d39ab7
commit 0edd386b48
5 changed files with 670 additions and 104 deletions

View File

@@ -1,9 +1,11 @@
require "pg"
require "set"
require "redcarpet"
require "htmlentities"
puts "Loading application..."
require_relative "../../config/environment"
require_relative '../import_scripts/base/uploader'
module BulkImport; end
@@ -12,10 +14,53 @@ class BulkImport::Base
NOW ||= "now()".freeze
PRIVATE_OFFSET ||= 2**30
CHARSET_MAP = {
"armscii8" => nil,
"ascii" => Encoding::US_ASCII,
"big5" => Encoding::Big5,
"binary" => Encoding::ASCII_8BIT,
"cp1250" => Encoding::Windows_1250,
"cp1251" => Encoding::Windows_1251,
"cp1256" => Encoding::Windows_1256,
"cp1257" => Encoding::Windows_1257,
"cp850" => Encoding::CP850,
"cp852" => Encoding::CP852,
"cp866" => Encoding::IBM866,
"cp932" => Encoding::Windows_31J,
"dec8" => nil,
"eucjpms" => Encoding::EucJP_ms,
"euckr" => Encoding::EUC_KR,
"gb2312" => Encoding::EUC_CN,
"gbk" => Encoding::GBK,
"geostd8" => nil,
"greek" => Encoding::ISO_8859_7,
"hebrew" => Encoding::ISO_8859_8,
"hp8" => nil,
"keybcs2" => nil,
"koi8r" => Encoding::KOI8_R,
"koi8u" => Encoding::KOI8_U,
"latin1" => Encoding::ISO_8859_1,
"latin2" => Encoding::ISO_8859_2,
"latin5" => Encoding::ISO_8859_9,
"latin7" => Encoding::ISO_8859_13,
"macce" => Encoding::MacCentEuro,
"macroman" => Encoding::MacRoman,
"sjis" => Encoding::SHIFT_JIS,
"swe7" => nil,
"tis620" => Encoding::TIS_620,
"ucs2" => Encoding::UTF_16BE,
"ujis" => Encoding::EucJP_ms,
"utf8" => Encoding::UTF_8,
}
def initialize
charset = ENV["DB_CHARSET"] || "utf8"
db = ActiveRecord::Base.connection_config
@encoder = PG::TextEncoder::CopyRow.new
@raw_connection = PG.connect(dbname: db[:database], host: db[:host_names]&.first, port: db[:port])
@uploader = ImportScripts::Uploader.new
@html_entities = HTMLEntities.new
@encoding = CHARSET_MAP[charset]
@markdown = Redcarpet::Markdown.new(
Redcarpet::Render::HTML.new(hard_wrap: true),
@@ -27,6 +72,7 @@ class BulkImport::Base
def run
puts "Starting..."
Rails.logger.level = 3 # :error, so that we don't create log files that are many GB
preload_i18n
fix_highest_post_numbers
load_imported_ids
@@ -194,6 +240,10 @@ class BulkImport::Base
topic_id user_id created_at updated_at
}
TOPIC_TAG_COLUMNS ||= %i{
topic_id tag_id created_at updated_at
}
def create_groups(rows, &block); create_records(rows, "group", GROUP_COLUMNS, &block); end
def create_users(rows, &block)
@@ -218,6 +268,7 @@ class BulkImport::Base
def create_posts(rows, &block); create_records(rows, "post", POST_COLUMNS, &block); end
def create_post_actions(rows, &block); create_records(rows, "post_action", POST_ACTION_COLUMNS, &block); end
def create_topic_allowed_users(rows, &block); create_records(rows, "topic_allowed_user", TOPIC_ALLOWED_USER_COLUMNS, &block); end
def create_topic_tags(rows, &block); create_records(rows, "topic_tag", TOPIC_TAG_COLUMNS, &block); end
def process_group(group)
@groups[group[:imported_id].to_s] = group[:id] = @last_group_id += 1
@@ -386,6 +437,12 @@ class BulkImport::Base
topic_allowed_user
end
def process_topic_tag(topic_tag)
topic_tag[:created_at] = NOW
topic_tag[:updated_at] = NOW
topic_tag
end
def process_raw(raw)
# fix whitespaces
raw.gsub!(/(\\r)?\\n/, "\n")
@@ -436,6 +493,9 @@ class BulkImport::Base
# [IMG]...[/IMG]
raw.gsub!(/(?:\s*\[IMG\]\s*)+(.+?)(?:\s*\[\/IMG\]\s*)+/im) { "\n\n#{$1}\n\n" }
# [IMG=url]
raw.gsub!(/\[IMG=([^\]]*)\]/im) { "\n\n#{$1}\n\n" }
# [URL=...]...[/URL]
raw.gsub!(/\[URL="?(.+?)"?\](.+?)\[\/URL\]/im) { "[#{$2.strip}](#{$1})" }
@@ -462,14 +522,19 @@ class BulkImport::Base
raw.gsub!(/\[TD="?.*?"?\](.*?)\[\/TD\]/im, "\\1")
# [QUOTE]...[/QUOTE]
raw.gsub!(/\[QUOTE\](.+?)\[\/QUOTE\]/im) { |quote|
quote.gsub!(/\[QUOTE\](.+?)\[\/QUOTE\]/im) { "\n#{$1}\n" }
quote.gsub!(/\n(.+?)/) { "\n> #{$1}" }
}
raw.gsub!(/\[QUOTE="([^\]]+)"\]/i) { "[QUOTE=#{$1}]" }
# [QUOTE=<username>;<postid>]...[/QUOTE]
raw.gsub!(/\[QUOTE=([^;]+);(\d+)\](.+?)\[\/QUOTE\]/im) do
imported_username, imported_postid, quote = $1, $2, $3
# Nested Quotes
raw.gsub!(/(\[\/?QUOTE.*?\])/mi) { |q| "\n#{q}\n" }
# raw.gsub!(/\[QUOTE\](.+?)\[\/QUOTE\]/im) { |quote|
# quote.gsub!(/\[QUOTE\](.+?)\[\/QUOTE\]/im) { "\n#{$1}\n" }
# quote.gsub!(/\n(.+?)/) { "\n> #{$1}" }
# }
# [QUOTE=<username>;<postid>]
raw.gsub!(/\[QUOTE=([^;\]]+);(\d+)\]/i) do
imported_username, imported_postid = $1, $2
username = @mapped_usernames[imported_username] || imported_username
post_id = post_id_from_imported_id(imported_postid)
@@ -477,9 +542,9 @@ class BulkImport::Base
topic_id = @topic_id_by_post_id[post_id]
if post_number && topic_id
"\n[quote=\"#{username}, post:#{post_number}, topic:#{topic_id}\"]\n#{quote}\n[/quote]"
"\n[quote=\"#{username}, post:#{post_number}, topic:#{topic_id}\"]\n"
else
"\n[quote=\"#{username}\"]\n#{quote}\n[/quote]\n"
"\n[quote=\"#{username}\"]\n"
end
end
@@ -545,6 +610,10 @@ class BulkImport::Base
end
end
def create_upload(user_id, path, source_filename)
@uploader.create_upload(user_id, path, source_filename)
end
def fix_name(name)
name.scrub! if name.valid_encoding? == false
return if name.blank?
@@ -610,4 +679,14 @@ class BulkImport::Base
Redcarpet::Render::SmartyPants.render(ERB::Util.html_escape(title)).scrub.strip
end
def normalize_text(text)
return nil unless text.present?
@html_entities.decode(normalize_charset(text.presence || "").scrub)
end
def normalize_charset(text)
return text if @encoding == Encoding::UTF_8
return text && text.encode(@encoding).force_encoding(Encoding::UTF_8)
end
end