From 606aeb9d5547f99346412b046738f5dc0986593a Mon Sep 17 00:00:00 2001 From: Sam Date: Thu, 15 Oct 2015 13:25:10 +1100 Subject: [PATCH] improvements to importer - improve perf of test for existing posts - always use a system guardian when importing posts - for lithium importer requery raw (transform is not repeatable) --- lib/post_creator.rb | 1 + script/import_scripts/base.rb | 16 ++++++++++++--- script/import_scripts/lithium.rb | 35 +++++++++++++++++++++----------- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/lib/post_creator.rb b/lib/post_creator.rb index 22ace0dad38..a198049ab79 100644 --- a/lib/post_creator.rb +++ b/lib/post_creator.rb @@ -57,6 +57,7 @@ class PostCreator opts[:title] = pg_clean_up(opts[:title]) if opts[:title] && opts[:title].include?("\u0000") opts[:raw] = pg_clean_up(opts[:raw]) if opts[:raw] && opts[:raw].include?("\u0000") opts.delete(:reply_to_post_number) unless opts[:topic_id] + @guardian = opts[:guardian] if opts[:guardian] @spam = false end diff --git a/script/import_scripts/base.rb b/script/import_scripts/base.rb index dda270aacee..49b86caccb0 100644 --- a/script/import_scripts/base.rb +++ b/script/import_scripts/base.rb @@ -197,13 +197,20 @@ class ImportScripts::Base def all_records_exist?(type, import_ids) return false if import_ids.empty? 
+ Post.exec_sql('create temp table import_ids(val varchar(200) primary key)') + + import_id_clause = import_ids.map{|id| "('#{PG::Connection.escape_string(id)}')"}.join(",") + Post.exec_sql("insert into import_ids values #{import_id_clause}") + existing = "#{type.to_s.classify}CustomField".constantize.where(name: 'import_id') - existing = existing.where('value in (?)', import_ids.map(&:to_s)) + existing = existing.joins('JOIN import_ids ON val=value') if existing.count == import_ids.length - # puts "Skipping #{import_ids.length} already imported #{type}" - true + puts "Skipping #{import_ids.length} already imported #{type}" + return true end + ensure + Post.exec_sql('drop table import_ids') end # Iterate through a list of user records to be imported. @@ -444,6 +451,8 @@ class ImportScripts::Base [created, skipped] end + STAFF_GUARDIAN = Guardian.new(User.find(-1)) + def create_post(opts, import_id) user = User.find(opts[:user_id]) post_create_action = opts.delete(:post_create_action) @@ -452,6 +461,7 @@ class ImportScripts::Base opts[:custom_fields] ||= {} opts[:custom_fields]['import_id'] = import_id + opts[:guardian] = STAFF_GUARDIAN if @bbcode_to_md opts[:raw] = opts[:raw].bbcode_to_md(false) rescue opts[:raw] end diff --git a/script/import_scripts/lithium.rb b/script/import_scripts/lithium.rb index 795f04f366a..80024247f57 100644 --- a/script/import_scripts/lithium.rb +++ b/script/import_scripts/lithium.rb @@ -56,14 +56,14 @@ class ImportScripts::Lithium < ImportScripts::Base SiteSetting.allow_html_tables = true import_categories - import_users - import_topics - import_posts - import_likes - import_accepted_answers - import_pms - close_topics - create_permalinks + # import_users + # import_topics + # import_posts + # import_likes + # import_accepted_answers + # import_pms + # close_topics + # create_permalinks post_process_posts end @@ -307,11 +307,12 @@ class ImportScripts::Lithium < ImportScripts::Base end def import_posts - puts "", "importing posts..." 
post_count = mysql_query("SELECT COUNT(*) count FROM message2 WHERE id <> root_id").first["count"] + puts "", "importing posts... (#{post_count})" + batches(BATCH_SIZE) do |offset| posts = mysql_query <<-SQL SELECT id, body, deleted, user_id, @@ -629,7 +630,6 @@ class ImportScripts::Lithium < ImportScripts::Base import_mode: true } - unless topic_id msg[:title] = @htmlentities.decode(topic["subject"]).strip[0...255] msg[:archetype] = Archetype.private_message @@ -739,15 +739,26 @@ SQL def post_process_posts puts "", "Postprocessing posts..." + current = 0 max = Post.count + mysql_query("create index idxUniqueId on message2(unique_id)") rescue nil + Post.all.find_each do |post| begin - new_raw = postprocess_post_raw(post.raw, post.user_id) + id = post.custom_fields["import_unique_id"] + next unless id + raw = mysql_query("select body from message2 where unique_id = '#{id}'").first['body'] + unless raw + puts "Missing raw for post: #{post.id}" + next + end + new_raw = postprocess_post_raw(raw, post.user_id) post.raw = new_raw post.save rescue PrettyText::JavaScriptError + puts "GOT A JS error on post: #{post.id}" nil ensure print_status(current += 1, max) @@ -825,7 +836,7 @@ SQL end def mysql_query(sql) - @client.query(sql, cache_rows: false) + @client.query(sql, cache_rows: true) end end