discourse/script/import_scripts/zoho.rb

# frozen_string_literal: true

###
###
### The output of this importer is bad.
###
### Improving it means getting better quality export data from Zoho,
### or doing a lot more work on this importer.
###
### Consider leaving data in Zoho and starting fresh in Discourse.
###
###

# Import from Zoho.
# Be sure to get the posts CSV file, AND the user list csv file with people's email addresses.
# You may need to contact Zoho support for the user list.
#
# * Zoho data doesn't indicate which users are admins or moderators, so you'll need to grant
#   those privileges manually after the import finishes.
# * The posts and users csv files don't seem to have consistent usernames, and sometimes use
#   full names instead of usernames. This may cause duplicate users with slightly different
#   usernames to be created.

require 'csv'
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require File.expand_path(File.dirname(__FILE__) + "/base/csv_helper.rb")

# Call it like this:
#   bundle exec ruby script/import_scripts/zoho.rb <path-to-csv-files>
class ImportScripts::Zoho < ImportScripts::Base
  # Importer for a Zoho forum export. Reads `users.csv` and `posts.csv` from the
  # directory passed to #initialize and creates users, categories, topics and posts.

  include ImportScripts::CsvHelper

  # Not referenced anywhere in this file; kept so existing references keep working.
  BATCH_SIZE = 1000

  # Sub-category names that mean "no sub-category". Posts carrying one of these
  # are filed directly under the parent (forum) category.
  UNCATEGORIZED_NAMES = %w[Uncategorized Uncategorised].freeze

  # @param path [String] directory containing users.csv and posts.csv
  def initialize(path)
    @path = path
    @all_posts = []
    @categories = {} # key is the parent category, value is an array of sub-categories
    @topic_mapping = {} # key is the Zoho permalink, value is the created topic's id
    @current_row = nil # row being imported; read back by #created_post
    super()
  end

  # Runs the import steps in order. `update_tl0` and
  # `update_user_signup_date_based_on_first_post` are not defined in this class
  # (presumably inherited from ImportScripts::Base).
  def execute
    import_users
    import_posts
    update_tl0
    update_user_signup_date_based_on_first_post
  end

  # Strips surrounding whitespace and removes every character that is not an
  # ASCII letter, digit, underscore, dot or hyphen.
  #
  # @param s [String] username (or full name) as found in the Zoho export
  # @return [String] the cleaned username; may be empty
  def cleanup_zoho_username(s)
    s.strip.gsub(/[^A-Za-z0-9_\.\-]/, '')
  end

  # Creates one user per row of users.csv. Column 0 is used as the username
  # (and import id), column 1 as the email address.
  def import_users
    puts "", "Importing users"
    create_users(CSV.parse(File.read(File.join(@path, 'users.csv')))) do |u|
      username = cleanup_zoho_username(u[0])
      {
        id: username,
        username: username,
        email: u[1],
        created_at: Time.zone.now
      }
    end
  end

  # Parses posts.csv, creates the categories it mentions, then creates topics
  # and replies. The first row seen for a permalink becomes the topic; later
  # rows with the same permalink become replies in that topic.
  def import_posts
    # 0 Forum Name
    # 1 Category Name
    # 2 Topic Title
    # 3 Permalink
    # 4 Posted Time
    # 5 Content
    # 6 Author
    # 7 Attachments
    # 8 Votes

    puts "", "Parsing posts CSV"
    zoho_load_posts

    puts "", "Creating categories"
    zoho_create_categories

    puts "", "Creating topics and posts"

    created, skipped = create_posts(@all_posts, total: @all_posts.size) do |row|
      @current_row = row

      # fetch user; `to_s` so a row with an empty author cell is skipped
      # instead of raising NoMethodError on nil
      username = cleanup_zoho_username(row.author.to_s)

      next if username.blank? # no author for this post, so skip

      user_id = zoho_author_id(username, row)
      topic_id = @topic_mapping[row.permalink]

      if topic_id.nil?
        # create topic
        {
          id: import_post_id(row),
          user_id: user_id,
          category: zoho_category_id(row),
          title: CGI.unescapeHTML(row.topic_title),
          raw: cleanup_post(row.content),
          created_at: Time.zone.parse(row.posted_time)
        }
        # created_post callback will be called
      else
        {
          id: import_post_id(row),
          user_id: user_id,
          raw: cleanup_post(row.content),
          created_at: Time.zone.parse(row.posted_time),
          topic_id: topic_id
        }
      end
    end

    puts ""
    puts "Created: #{created}"
    puts "Skipped: #{skipped}"
    puts ""
  end

  # Callback (see the note in #import_posts). Remembers the topic id of the
  # first post created for the current row's permalink so later rows with the
  # same permalink are added to that topic.
  def created_post(post)
    @topic_mapping[@current_row.permalink] ||= post.topic_id
  end

  # Note that Zoho doesn't render code blocks the same way all the time,
  # but this seems to catch the most common format:
  ZOHO_CODE_BLOCK_START = /<ol style="list-style-position: outside;(.)*">/

  # Runs of three or more newlines and/or spaces.
  TOO_MANY_LINE_BREAKS = /[\n ]{3,}/

  # A single style="..." attribute plus any whitespace before it. The value is
  # matched with [^"]* rather than a greedy `.*` so that only this attribute is
  # removed, not everything up to the last double quote on the line.
  STYLE_ATTR = /\s*style="[^"]*"/

  # Converts Zoho post HTML into something more readable as a Discourse post.
  #
  # @param raw [String] post content from the CSV
  # @return [String] content with style attributes and some tags removed,
  #   whitespace runs collapsed and HTML entities unescaped
  def cleanup_post(raw)
    # Check if Zoho's most common form of a code block is present.
    # If so, don't clean up the post as much because we can't tell which markup
    # is inside the code block. These posts will look worse than others.
    has_code_block = raw.match?(ZOHO_CODE_BLOCK_START)

    # Non-bang gsub returns a fresh string, so the in-place edits below never
    # touch the caller's (possibly frozen) input.
    x = raw.gsub(STYLE_ATTR, '')

    if has_code_block
      # We have to assume all lists in this post are meant to be code blocks
      # to make it somewhat readable.
      x.gsub!(/( )*<ol>(\s)*/, "")
      x.gsub!(/( )*<\/ol>/, "")
      x.gsub!('<li>', '')
      x.gsub!('</li>', '')
    else
      # No code block (probably...) so clean up more aggressively.
      x.gsub!("\n", " ")
      x.gsub!('<div>', "\n\n")
      x.gsub!('</div>', ' ') # was a non-bang gsub whose result was thrown away
      x.gsub!("<br />", "\n")
      x.gsub!('<span>', '')
      x.gsub!('</span>', '')
      x.gsub!(/<font ([^>]*)>/, '')
      x.gsub!('</font>', '')
    end

    x.gsub!(TOO_MANY_LINE_BREAKS, "\n\n")

    CGI.unescapeHTML(x)
  end

  # @param row [#permalink, #content] a parsed posts.csv row
  # @return [String] SHA1 hex digest used as the post's import id
  def import_post_id(row)
    # Try to make up a unique id based on the data Zoho gives us.
    # The posted_time seems to be the same for all posts in a topic, so we can't use that.
    Digest::SHA1.hexdigest "#{row.permalink}:#{row.content}"
  end

  private

  # True when the given sub-category name means "no sub-category".
  def zoho_uncategorized?(name)
    UNCATEGORIZED_NAMES.include?(name)
  end

  # Reads posts.csv into @all_posts and records every forum / sub-category
  # pair in @categories.
  def zoho_load_posts
    csv_parse(File.join(@path, "posts.csv")) do |row|
      # Store a copy, as the original did (presumably csv_parse reuses the row
      # object between iterations -- confirm against CsvHelper).
      @all_posts << row.dup

      subcats = (@categories[row.forum_name] ||= [])
      subcats << row.category_name unless subcats.include?(row.category_name)
    end
  end

  # Creates one parent category per forum name and one child category per
  # real sub-category. Children are registered under the import id
  # "<forum>:<sub-category>".
  def zoho_create_categories
    @categories.each do |parent, subcats|
      parent_category = create_category({ name: parent }, parent)
      subcats.each do |subcat|
        next if zoho_uncategorized?(subcat)
        create_category({ name: subcat, parent_category_id: parent_category.id }, "#{parent}:#{subcat}")
      end
    end
  end

  # Returns the id of the already-imported user with this username, creating
  # a placeholder user when users.csv did not contain one.
  def zoho_author_id(username, row)
    existing_id = user_id_from_imported_user_id(username)
    return existing_id unless existing_id.nil?

    # user CSV file didn't have a user with this username. create it now with an invalid email address.
    placeholder = create_user(
      { id: username,
        username: username,
        email: "#{username}@example.com",
        created_at: Time.zone.parse(row.posted_time) },
      username
    )
    placeholder.id
  end

  # Looks up the category for a new topic: the sub-category when the row has
  # a real one, otherwise the parent (forum) category.
  def zoho_category_id(row)
    if zoho_uncategorized?(row.category_name)
      category_id_from_imported_category_id(row.forum_name)
    else
      category_id_from_imported_category_id("#{row.forum_name}:#{row.category_name}")
    end
  end

end

# Command-line entry point. The first argument must be an existing directory
# holding the Zoho CSV files; otherwise print usage (and an error when a
# non-existent directory was named) and exit with status 1.
import_dir = ARGV[0]

if import_dir.nil? || !Dir.exist?(import_dir)
  puts "", "ERROR! Dir #{import_dir} not found.", "" unless import_dir.nil?

  puts "", "Usage:", "", "    bundle exec ruby script/import_scripts/zoho.rb DIRNAME", ""
  exit 1
end

ImportScripts::Zoho.new(import_dir).perform
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-02 17:17:27 -05:00			`# frozen_string_literal: true`
Add eye-catching disclaimer to Zoho importer, which was abandoned before a successful import was completed 2016-02-08 14:31:21 -06:00
			`###`
			`###`
			`### The output of this importer is bad.`
			`###`
			`### Improving it means getting better quality export data from Zoho,`
			`### or doing a lot more work on this importer.`
			`###`
			`### Consider leaving data in Zoho and starting fresh in Discourse.`
			`###`
			`###`

FEATURE: Zoho importer 2015-12-03 09:12:06 -06:00			`# Import from Zoho.`
			`# Be sure to get the posts CSV file, AND the user list csv file with people's email addresses.`
			`# You may need to contact Zoho support for the user list.`
			`#`
			`# * Zoho data doesn't indicate which users are admins or moderators, so you'll need to grant`
			`# those privileges manually after the import finishes.`
			`# * The posts and users csv files don't seem to have consistent usernames, and sometimes use`
			`# full names instead of usernames. This may cause duplicate users with slightly different`
			`# usernames to be created.`

			`require 'csv'`
			`require File.expand_path(File.dirname(__FILE__) + "/base.rb")`
			`require File.expand_path(File.dirname(__FILE__) + "/base/csv_helper.rb")`

			`# Call it like this:`
			`# bundle exec ruby script/import_scripts/zoho.rb <path-to-csv-files>`
			`class ImportScripts::Zoho < ImportScripts::Base`

			`include ImportScripts::CsvHelper`

			`BATCH_SIZE = 1000`

			`def initialize(path)`
			`@path = path`
			`@all_posts = []`
			`@categories = {} # key is the parent category, value is an array of sub-categories`
			`@topic_mapping = {}`
			`@current_row = nil`
			`super()`
			`end`

			`def execute`
			`import_users`
			`import_posts`
			`update_tl0`
			`update_user_signup_date_based_on_first_post`
			`end`

			`def cleanup_zoho_username(s)`
			`s.strip.gsub(/[^A-Za-z0-9_\.\-]/, '')`
			`end`

			`def import_users`
			`puts "", "Importing users"`
Add rubocop to our build. (#5004) 2017-07-27 20:20:09 -05:00			`create_users(CSV.parse(File.read(File.join(@path, 'users.csv')))) do \|u\|`
FEATURE: Zoho importer 2015-12-03 09:12:06 -06:00			`username = cleanup_zoho_username(u[0])`
			`{`
			`id: username,`
			`username: username,`
			`email: u[1],`
that TODO is done 2015-12-04 14:13:28 -06:00			`created_at: Time.zone.now`
FEATURE: Zoho importer 2015-12-03 09:12:06 -06:00			`}`
			`end`
			`end`

			`def import_posts`
			`# 0 Forum Name`
			`# 1 Category Name`
			`# 2 Topic Title`
			`# 3 Permalink`
			`# 4 Posted Time`
			`# 5 Content`
			`# 6 Author`
			`# 7 Attachments`
			`# 8 Votes`

			`count = 0`

			`puts "", "Parsing posts CSV"`

			`csv_parse(File.join(@path, "posts.csv")) do \|row\|`
			`@all_posts << row.dup`
			`if @categories[row.forum_name].nil?`
			`@categories[row.forum_name] = []`
			`end`

			`unless @categories[row.forum_name].include?(row.category_name)`
			`@categories[row.forum_name] << row.category_name`
			`end`
			`end`

			`puts "", "Creating categories"`

			`# Create categories`
			`@categories.each do \|parent, subcats\|`
Add rubocop to our build. (#5004) 2017-07-27 20:20:09 -05:00			`c = create_category({ name: parent }, parent)`
FEATURE: Zoho importer 2015-12-03 09:12:06 -06:00			`subcats.each do \|subcat\|`
			`next if subcat == "Uncategorized" \|\| subcat == "Uncategorised"`
Add rubocop to our build. (#5004) 2017-07-27 20:20:09 -05:00			`create_category({ name: subcat, parent_category_id: c.id }, "#{parent}:#{subcat}")`
FEATURE: Zoho importer 2015-12-03 09:12:06 -06:00			`end`
			`end`

			`puts "", "Creating topics and posts"`

			`created, skipped = create_posts(@all_posts, total: @all_posts.size) do \|row\|`
			`@current_row = row`

			`# fetch user`
			`username = cleanup_zoho_username(row.author)`

			`next if username.blank? # no author for this post, so skip`

			`user_id = user_id_from_imported_user_id(username)`

			`if user_id.nil?`
			`# user CSV file didn't have a user with this username. create it now with an invalid email address.`
			`u = create_user(`
			`{ id: username,`
			`username: username,`
			`email: "#{username}@example.com",`
			`created_at: Time.zone.parse(row.posted_time) },`
			`username`
			`)`
			`user_id = u.id`
			`end`

			`if @topic_mapping[row.permalink].nil?`
			`category_id = nil`
			`if row.category_name != "Uncategorized" && row.category_name != "Uncategorised"`
			`category_id = category_id_from_imported_category_id("#{row.forum_name}:#{row.category_name}")`
			`else`
			`category_id = category_id_from_imported_category_id(row.forum_name)`
			`end`

			`# create topic`
			`{`
			`id: import_post_id(row),`
			`user_id: user_id,`
			`category: category_id,`
Zoho importer: cleanup post content 2015-12-29 09:51:39 -06:00			`title: CGI.unescapeHTML(row.topic_title),`
			`raw: cleanup_post(row.content),`
FEATURE: Zoho importer 2015-12-03 09:12:06 -06:00			`created_at: Time.zone.parse(row.posted_time)`
			`}`
			`# created_post callback will be called`
			`else`
			`{`
			`id: import_post_id(row),`
			`user_id: user_id,`
Zoho importer: cleanup post content 2015-12-29 09:51:39 -06:00			`raw: cleanup_post(row.content),`
FEATURE: Zoho importer 2015-12-03 09:12:06 -06:00			`created_at: Time.zone.parse(row.posted_time),`
			`topic_id: @topic_mapping[row.permalink]`
			`}`
			`end`
			`end`

			`puts ""`
			`puts "Created: #{created}"`
			`puts "Skipped: #{skipped}"`
			`puts ""`
			`end`

			`def created_post(post)`
			`unless @topic_mapping[@current_row.permalink]`
			`@topic_mapping[@current_row.permalink] = post.topic_id`
			`end`
			`end`

Zoho importer: cleanup post content 2015-12-29 09:51:39 -06:00			`# Note that Zoho doesn't render code blocks the same way all the time,`
			`# but this seems to catch the most common format:`
			`ZOHO_CODE_BLOCK_START = /<ol style="list-style-position: outside;(.)*">/`

			`TOO_MANY_LINE_BREAKS = /[\n ]{3,}/`
			`STYLE_ATTR = /(\s)style="(.)"/`

			`def cleanup_post(raw)`

			`# Check if Zoho's most common form of a code block is present.`
			`# If so, don't clean up the post as much because we can't tell which markup`
			`# is inside the code block. These posts will look worse than others.`
			`has_code_block = !!(raw =~ ZOHO_CODE_BLOCK_START)`

			`x = raw.gsub(STYLE_ATTR, '')`

			`if has_code_block`
			`# We have to assume all lists in this post are meant to be code blocks`
			`# to make it somewhat readable.`
			`x.gsub!(/( )<ol>(\s)/, "")`
			`x.gsub!(/( )*<\/ol>/, "")`
			`x.gsub!('<li>', '')`
			`x.gsub!('</li>', '')`
			`else`
			`# No code block (probably...) so clean up more aggressively.`
			`x.gsub!("\n", " ")`
			`x.gsub!('<div>', "\n\n")`
			`x.gsub('</div>', ' ')`
			`x.gsub!("<br />", "\n")`
			`x.gsub!('<span>', '')`
			`x.gsub!('</span>', '')`
			`x.gsub!(/<font ([^>]*)>/, '')`
			`x.gsub!('</font>', '')`
			`end`

			`x.gsub!(TOO_MANY_LINE_BREAKS, "\n\n")`

			`CGI.unescapeHTML(x)`
			`end`

FEATURE: Zoho importer 2015-12-03 09:12:06 -06:00			`def import_post_id(row)`
			`# Try to make up a unique id based on the data Zoho gives us.`
			`# The posted_time seems to be the same for all posts in a topic, so we can't use that.`
			`Digest::SHA1.hexdigest "#{row.permalink}:#{row.content}"`
			`end`

			`end`

			`unless ARGV[0] && Dir.exist?(ARGV[0])`
			`if ARGV[0] && !Dir.exist?(ARGV[0])`
			`puts "", "ERROR! Dir #{ARGV[0]} not found.", ""`
			`end`

			`puts "", "Usage:", "", " bundle exec ruby script/import_scripts/zoho.rb DIRNAME", ""`
			`exit 1`
			`end`

			`ImportScripts::Zoho.new(ARGV[0]).perform`