mirror of
https://github.com/discourse/discourse.git
synced 2025-02-25 18:55:32 -06:00
Improve Telligent importer
* Try multiple filenames and do lots of guessing when searching for attachments
* Unescape HTML in filenames and replace invalid characters in filenames
* Existing permalinks prevented resuming of import
* Prevent duplicate attachments in same post
This commit is contained in:
parent
dadbf2edb4
commit
7f4ef3db9e
@ -1,11 +1,18 @@
|
|||||||
require_relative 'base'
|
require_relative 'base'
|
||||||
require 'tiny_tds'
|
require 'tiny_tds'
|
||||||
|
|
||||||
|
# Import script for Telligent communities
|
||||||
|
#
|
||||||
|
# Users are currently imported from a temp table. This will need some
|
||||||
|
# work the next time this import script is used, because that table
|
||||||
|
# won't exist. Also, it's really hard to find all attachments, but
|
||||||
|
# the script tries to do it anyway.
|
||||||
|
|
||||||
class ImportScripts::Telligent < ImportScripts::Base
|
class ImportScripts::Telligent < ImportScripts::Base
|
||||||
BATCH_SIZE ||= 1000
|
BATCH_SIZE ||= 1000
|
||||||
LOCAL_AVATAR_REGEX ||= /\A~\/.*(?<directory>communityserver-components-(?:selectable)?avatars)\/(?<path>[^\/]+)\/(?<filename>.+)/i
|
LOCAL_AVATAR_REGEX ||= /\A~\/.*(?<directory>communityserver-components-(?:selectable)?avatars)\/(?<path>[^\/]+)\/(?<filename>.+)/i
|
||||||
REMOTE_AVATAR_REGEX ||= /\Ahttps?:\/\//i
|
REMOTE_AVATAR_REGEX ||= /\Ahttps?:\/\//i
|
||||||
EMBEDDED_ATTACHMENT_REGEX ||= /<a href="\/cfs-file(?:\.ashx)?\/__key\/(?<directory>[^\/]+)\/(?<path>[^\/]+)\/(?<filename>.+)">.*?<\/a>/i
|
EMBEDDED_ATTACHMENT_REGEX ||= /<a href="\/cfs-file(?:\.ashx)?\/__key\/(?<directory>[^\/]+)\/(?<path>[^\/]+)\/(?<filename1>.+)">(?<filename2>.*?)<\/a>/i
|
||||||
|
|
||||||
CATEGORY_LINK_NORMALIZATION = '/.*?(f\/\d+)$/\1'
|
CATEGORY_LINK_NORMALIZATION = '/.*?(f\/\d+)$/\1'
|
||||||
TOPIC_LINK_NORMALIZATION = '/.*?(f\/\d+\/t\/\d+)$/\1'
|
TOPIC_LINK_NORMALIZATION = '/.*?(f\/\d+\/t\/\d+)$/\1'
|
||||||
@ -174,7 +181,8 @@ class ImportScripts::Telligent < ImportScripts::Base
|
|||||||
|
|
||||||
if category_id = replace_with_category_id(row, child_categories, parent_category_id)
|
if category_id = replace_with_category_id(row, child_categories, parent_category_id)
|
||||||
add_category(row['ForumId'], Category.find_by_id(category_id))
|
add_category(row['ForumId'], Category.find_by_id(category_id))
|
||||||
Permalink.create(url: "f/#{row['ForumId']}", category_id: category_id)
|
url = "f/#{row['ForumId']}"
|
||||||
|
Permalink.create(url: url, category_id: category_id) unless Permalink.exists?(url: url)
|
||||||
nil
|
nil
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -268,7 +276,8 @@ class ImportScripts::Telligent < ImportScripts::Base
|
|||||||
post_create_action: proc do |post|
|
post_create_action: proc do |post|
|
||||||
topic = post.topic
|
topic = post.topic
|
||||||
Jobs.enqueue_at(topic.pinned_until, :unpin_topic, topic_id: topic.id) if topic.pinned_until
|
Jobs.enqueue_at(topic.pinned_until, :unpin_topic, topic_id: topic.id) if topic.pinned_until
|
||||||
Permalink.create(url: "f/#{row['ForumId']}/t/#{row['ThreadId']}", topic_id: topic.id)
|
url = "f/#{row['ForumId']}/t/#{row['ThreadId']}"
|
||||||
|
Permalink.create(url: url, topic_id: topic.id) unless Permalink.exists?(url: url)
|
||||||
end
|
end
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -345,7 +354,7 @@ class ImportScripts::Telligent < ImportScripts::Base
|
|||||||
end
|
end
|
||||||
|
|
||||||
def raw_with_attachment(row, user_id)
|
def raw_with_attachment(row, user_id)
|
||||||
raw, embedded_paths = replace_embedded_attachments(row["Body"], user_id)
|
raw, embedded_paths, upload_ids = replace_embedded_attachments(row["Body"], user_id)
|
||||||
raw = html_to_markdown(raw) || ""
|
raw = html_to_markdown(raw) || ""
|
||||||
|
|
||||||
filename = row["FileName"]
|
filename = row["FileName"]
|
||||||
@ -358,13 +367,16 @@ class ImportScripts::Telligent < ImportScripts::Base
|
|||||||
"%02d" % row["ApplicationId"],
|
"%02d" % row["ApplicationId"],
|
||||||
"%02d" % row["ApplicationContentTypeId"],
|
"%02d" % row["ApplicationContentTypeId"],
|
||||||
("%010d" % row["ContentId"]).scan(/.{2}/),
|
("%010d" % row["ContentId"]).scan(/.{2}/),
|
||||||
filename
|
clean_filename(filename)
|
||||||
)
|
)
|
||||||
|
|
||||||
unless embedded_paths.include?(path)
|
unless embedded_paths.include?(path)
|
||||||
if File.exists?(path)
|
if File.exists?(path)
|
||||||
upload = @uploader.create_upload(user_id, path, filename)
|
upload = @uploader.create_upload(user_id, path, filename)
|
||||||
raw << "\n" << @uploader.html_for_upload(upload, filename) if upload.present? && upload.persisted?
|
|
||||||
|
if upload.present? && upload.persisted? && !upload_ids.include?(upload.id)
|
||||||
|
raw << "\n" << @uploader.html_for_upload(upload, filename)
|
||||||
|
end
|
||||||
else
|
else
|
||||||
STDERR.puts "Could not find file: #{path}"
|
STDERR.puts "Could not find file: #{path}"
|
||||||
end
|
end
|
||||||
@ -375,23 +387,17 @@ class ImportScripts::Telligent < ImportScripts::Base
|
|||||||
|
|
||||||
def replace_embedded_attachments(raw, user_id)
|
def replace_embedded_attachments(raw, user_id)
|
||||||
paths = []
|
paths = []
|
||||||
|
upload_ids = []
|
||||||
|
|
||||||
raw = raw.gsub(EMBEDDED_ATTACHMENT_REGEX) do
|
raw = raw.gsub(EMBEDDED_ATTACHMENT_REGEX) do
|
||||||
match_data = Regexp.last_match
|
filename, path = attachment_path(Regexp.last_match)
|
||||||
filename = match_data[:filename]
|
|
||||||
|
|
||||||
path = File.join(
|
|
||||||
ENV["FILE_BASE_DIR"],
|
|
||||||
match_data[:directory].gsub("-", "."),
|
|
||||||
match_data[:path].split("-"),
|
|
||||||
filename
|
|
||||||
)
|
|
||||||
|
|
||||||
if File.exists?(path)
|
if File.exists?(path)
|
||||||
upload = @uploader.create_upload(user_id, path, filename)
|
upload = @uploader.create_upload(user_id, path, filename)
|
||||||
|
|
||||||
if upload.present? && upload.persisted?
|
if upload.present? && upload.persisted?
|
||||||
paths << path
|
paths << path
|
||||||
|
upload_ids << upload.id
|
||||||
@uploader.html_for_upload(upload, filename)
|
@uploader.html_for_upload(upload, filename)
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
@ -399,7 +405,45 @@ class ImportScripts::Telligent < ImportScripts::Base
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
[raw, paths]
|
[raw, paths, upload_ids]
|
||||||
|
end
|
||||||
|
|
||||||
|
# Normalizes a filename taken from the database or from post HTML so that
# it can be used to look the file up on disk.
#
# @param filename [String] raw, possibly HTML-escaped filename
# @return [String] the cleaned filename
def clean_filename(filename)
  # Decode HTML entities such as "&amp;" first, so that characters they
  # hide are also subject to the replacement below.
  unescaped = CGI.unescapeHTML(filename)

  # NUL and / \ : * ? " < > | are each replaced by an underscore.
  sanitized = unescaped.gsub(/[\x00\/\\:\*\?\"<>\|]/, '_')

  # Drop "_2B00"-style sequences. NOTE(review): these look like Telligent's
  # hex encoding of + . - [ ] _ in stored filenames -- confirm.
  sanitized.gsub(/_(?:2B00|2E00|2D00|5B00|5D00|5F00)/, '')
end
|
||||||
|
|
||||||
|
# Resolves the filename and on-disk path for an embedded attachment link.
#
# The "filename2" capture of the match is tried first; when no file exists
# at the resulting path, the "filename1" capture is used instead.
#
# @param match_data [MatchData] match passed through to join_attachment_path
# @return [Array(String, String)] filename and path; the path is not
#   guaranteed to exist when neither capture leads to a file
def attachment_path(match_data)
  filename, path = join_attachment_path(match_data, filename_index: 2)
  # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
  return [filename, path] if File.exist?(path)

  join_attachment_path(match_data, filename_index: 1)
end
|
||||||
|
|
||||||
|
# filenames are a total mess - try to guess the correct filename
|
||||||
|
# works for 70% of all files
|
||||||
|
# Builds the path of an attachment below ENV["FILE_BASE_DIR"] and tries a
# few variations of the filename until an existing file is found.
#
# Variations, in order:
#   1. the cleaned filename as-is
#   2. the same with every "-" replaced by a space
#   3. variation 2 with every "_" replaced by "-"
#
# @param match_data [MatchData] must provide the captures :directory, :path
#   and :"filename<filename_index>"
# @param filename_index [Integer] selects the filename capture to use
# @return [Array(String, String)] filename and path of the first variation
#   that exists; otherwise the cleaned filename and its (missing) path
def join_attachment_path(match_data, filename_index:)
  original_filename = clean_filename(match_data[:"filename#{filename_index}"])

  base_path = File.join(
    ENV["FILE_BASE_DIR"],
    match_data[:directory].gsub("-", "."),
    match_data[:path].split("-")
  )

  # The replacements are cumulative: the third candidate is derived from
  # the second one, not from the original filename.
  with_spaces = original_filename.gsub("-", " ")
  candidates = [original_filename, with_spaces, with_spaces.gsub("_", "-")]

  candidates.each do |candidate|
    path = File.join(base_path, candidate)
    # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
    return [candidate, path] if File.exist?(path)
  end

  [original_filename, File.join(base_path, original_filename)]
end
|
||||||
|
|
||||||
def mark_topics_as_solved
|
def mark_topics_as_solved
|
||||||
|
Loading…
Reference in New Issue
Block a user