# custom importer for www.sfn.org, feel free to borrow ideas
#
# Provenance: added as script/import_scripts/sfn.rb by commit
# 32e02411bd93bfa6167d8591f1cd735c8665ff48 (Régis Hanol, 2015-04-14),
# "add custom importer for sfn.org".

require 'mysql2'
require File.expand_path(File.dirname(__FILE__) + "/base.rb")

# Imports users, categories, topics and replies from the "sfn" MySQL database.
#
# Helpers such as batches, create_users, create_categories, create_posts and
# the *_from_imported_* lookups are not defined in this file; presumably they
# are inherited from ImportScripts::Base (verify against base.rb).
class ImportScripts::Sfn < ImportScripts::Base

  # Number of rows fetched by each paged SQL query.
  BATCH_SIZE = 1000

  # Names of the categories created by import_categories. The name is also
  # used as the "imported id" of the category.
  NEW_CATEGORIES = [
    "Abstract Topic Matching Forum",
    "Animals in Research",
    "Brain Awareness and Teaching",
    "Career Advice",
    "Career Paths",
    "Diversity",
    "Early Career Policy Advocates",
    "LATP Associates",
    "LATP Fellows",
    "Mid and Advanced Career",
    "Neurobiology of Disease Workshop",
    "Neuroscience 2015",
    "Neuroscience Scholars Program",
    "NSP Associates",
    "NSP Fellows",
    "Outreach",
    "Postdocs and Early Career",
    "Program Committee",
    "Program Development",
    "Roommate Matching Forum",
    "Scientific Research",
    "Students",
  ].freeze

  # EgroupKey => New Category Name
  # Several source egroups can map to the same category; topics whose
  # EgroupKey is not listed here are skipped by import_topics.
  CATEGORY_MAPPING = {
    "{DE10E4F4-621A-48BF-9B45-05D9F774A590}" => "Abstract Topic Matching Forum",
    "{3FFC1217-1576-4D38-BB81-D6CADC7FB793}" => "Animals in Research",
    "{9362BB21-BF6C-4E55-A3E0-18CD5D9F3323}" => "Brain Awareness and Teaching",
    "{3AC01B09-A21F-4166-95DA-0E585E271075}" => "Brain Awareness and Teaching",
    "{C249728D-8C9E-4138-AA49-D02467C28EAD}" => "Career Advice",
    "{01570B85-0124-478F-A8B9-B028BD1B1F2F}" => "Career Paths",
    "{2A430528-278A-46CD-BE1A-07CFA1122919}" => "Diversity",
    "{2F211345-3C19-43C9-90B5-27BA9FCD4DB0}" => "Diversity",
    "{8092297D-8DF4-404A-8BEB-4D5D0DC6A191}" => "Early Career Policy Advocates",
    "{8CB58762-D562-448C-9AF1-8DAE6C482C9B}" => "LATP Associates",
    "{CDF80A92-925A-46DD-A867-8558FA72D016}" => "LATP Fellows",
    "{E71E237B-7C23-4596-AECA-655BD8ED50DB}" => "Mid and Advanced Career",
    "{1D674C38-17CB-4C48-826A-D465AC3F8948}" => "Neurobiology of Disease Workshop",
    "{3D4F885B-0037-403B-83DD-62FAA8E81DF1}" => "Neuroscience 2015",
    "{9ACC3B40-E4A3-4FFD-AADC-C8403EB6231D}" => "Neuroscience 2015",
    "{9FC30FFB-E450-4361-8844-0266C3D96868}" => "Neuroscience Scholars Program",
    "{3E78123E-87CE-435E-B4B7-7DAB1A21C541}" => "NSP Associates",
    "{12D889D3-5CFD-49D5-93E4-32AAB2CFFCDA}" => "NSP Fellows",
    "{FA86D79E-170E-4F53-8F1C-942CB3FFB19E}" => "Outreach",
    "{D7041C64-3D32-4010-B3D8-71858323CB4A}" => "Outreach",
    "{69B76913-4E23-4C80-A11E-9CDB4130722E}" => "Outreach",
    "{774878EA-96AD-49F5-9D29-105AEA488007}" => "Outreach",
    "{E6349704-FD01-41B1-9C59-68E928DD4318}" => "Postdocs and Early Career",
    "{31CF5944-2567-4E79-9730-18EEC23E5B52}" => "Postdocs and Early Career",
    "{5625C403-AFAE-4323-A470-33FC32B12B53}" => "Program Committee",
    "{8415D871-54F5-4128-B099-E5A376A6B41B}" => "Program Development",
    "{B4DF2044-47AB-4329-8BF7-0D832CAB402C}" => "Roommate Matching Forum",
    "{6A3A12B9-5C72-472F-97AC-F34983674960}" => "Scientific Research",
    "{2CF635E9-4866-451C-A4F2-E2A8A80FED54}" => "Scientific Research",
    "{CF2DDCCE-737F-499D-AFE4-E5C36F195C8B}" => "Scientific Research",
    "{282B48D7-AC1D-453E-9806-3C6CE6830EF9}" => "Scientific Research",
    "{6D750CAF-E96F-4AD1-A45B-7B74FDFF0B40}" => "Scientific Research",
    "{10AF5D45-BEB3-4F07-BE77-0BAB6910DE10}" => "Scientific Research",
    "{18D7F624-26D1-44B9-BF33-AB5C5A2AB2BF}" => "Scientific Research",
    "{6016FF4F-D834-4888-BA03-F9FE8CB1D4CC}" => "Scientific Research",
    "{B0290A37-EA39-4CB8-B6CB-3E0B7EF6D036}" => "Scientific Research",
    "{97CC60D0-B93A-43FF-BB48-366FAAEE2BAC}" => "Scientific Research",
    "{8FC9B57B-2755-4FC5-90E8-CCDB56CF2F66}" => "Scientific Research",
    "{57C8BF37-357E-4FE6-952D-906248642792}" => "Scientific Research",
    "{7B2A3B63-BC2C-4219-830C-BA1DECB33337}" => "Scientific Research",
    "{0ED1D205-0E48-48D2-B82B-3CE80C6C553F}" => "Scientific Research",
    "{10355962-D172-4294-AA8E-1BC381B67971}" => "Scientific Research",
    "{C84B0222-5232-4B94-9FB8-DDF802241171}" => "Scientific Research",
    "{9143F984-0D67-46CB-AAAF-7FE3B6335E07}" => "Scientific Research",
    "{1392DC10-37A0-46A6-9979-4568D0224C5F}" => "Scientific Research",
    "{E4891409-0F4F-4151-B550-ECE53655E231}" => "Scientific Research",
    "{9613BAC2-229B-4563-9E1C-35C31CDDCE2F}" => "Students",
  }.freeze

  # Runs the four import phases in dependency order: topics need users and
  # categories, replies need topics.
  # NOTE(review): presumably called by ImportScripts::Base#perform — confirm.
  def execute
    import_users
    import_categories
    import_topics
    import_posts
  end

  # Imports every CommunityMember that has a subscription e-mail address
  # longer than 5 characters, in batches of BATCH_SIZE, and attaches the
  # stored profile image (if any) as the user's avatar.
  def import_users
    puts "", "importing users..."

    user_count = mysql_query(<<-SQL).first["count"]
      SELECT COUNT(DISTINCT cm.ContactKey) AS "count"
        FROM CommunityMember cm
   LEFT JOIN EgroupSubscription es ON es.ContactKey = cm.ContactKey
       WHERE LENGTH(COALESCE(es.EmailAddr_, "")) > 5
    SQL

    batches(BATCH_SIZE) do |offset|
      # Order by the real columns: a double-quoted "created_at" is a string
      # literal in MySQL's default SQL mode, which would sort by a constant
      # and make LIMIT/OFFSET paging non-deterministic. The key column is a
      # tie-breaker for rows sharing a timestamp.
      users = mysql_query <<-SQL
          SELECT cm.ContactKey AS "id",
                 cm.InvitedOn AS "created_at",
                 es.EmailAddr_ AS "email",
                 es.FullName_ AS "name",
                 c.Bio AS "bio",
                 c.ProfileImage AS "avatar"
            FROM CommunityMember cm
       LEFT JOIN EgroupSubscription es ON es.ContactKey = cm.ContactKey
       LEFT JOIN Contact c ON c.ContactKey = cm.ContactKey
           WHERE LENGTH(COALESCE(es.EmailAddr_, "")) > 5
        GROUP BY cm.ContactKey
        ORDER BY cm.InvitedOn, cm.ContactKey
           LIMIT #{BATCH_SIZE}
          OFFSET #{offset}
      SQL

      break if users.size < 1

      create_users(users, total: user_count, offset: offset) do |user|
        {
          id: user["id"],
          name: user["name"],
          email: user["email"],
          bio_raw: user["bio"],
          created_at: user["created_at"],
          post_create_action: proc do |newuser|
            next if user["avatar"].blank?

            avatar = Tempfile.new("sfn-avatar")
            begin
              # NOTE(review): the image bytes are re-tagged as UTF-8 before
              # being written — presumably so the write to a non-binmode
              # tempfile does not attempt a transcoding; confirm.
              avatar.write(user["avatar"].encode("ASCII-8BIT").force_encoding("UTF-8"))
              avatar.rewind

              upload = Upload.create_for(newuser.id, avatar, "avatar.jpg", avatar.size)
              if upload.persisted?
                newuser.create_user_avatar
                newuser.user_avatar.update(custom_upload_id: upload.id)
                newuser.update(uploaded_avatar_id: upload.id)
              end
            ensure
              # Always remove the temporary file, even when the upload raised.
              # Cleanup stays best-effort, but a failure is reported instead
              # of being silently swallowed.
              begin
                avatar.close!
              rescue StandardError => e
                puts "could not remove temporary avatar file: #{e.message}"
              end
            end
          end
        }
      end
    end
  end

  # Creates one category per entry of NEW_CATEGORIES.
  def import_categories
    puts "", "importing categories..."

    create_categories(NEW_CATEGORIES) do |category|
      { id: category, name: category }
    end
  end

  # Imports approved root messages (ParentId_ = 0) as topics, in batches of
  # BATCH_SIZE. Messages whose egroup has no CATEGORY_MAPPING entry are
  # skipped; messages whose author was not imported are attributed to the
  # system user.
  def import_topics
    puts "", "importing topics..."

    topic_count = mysql_query(<<-SQL).first["count"]
      SELECT COUNT(MessageID_) AS "count"
        FROM EgroupMessages
       WHERE ParentId_ = 0
         AND ApprovedRejectedPendingInd = "Approved"
    SQL

    batches(BATCH_SIZE) do |offset|
      # See import_users for why the ORDER BY uses the real columns.
      topics = mysql_query <<-SQL
          SELECT MessageID_ AS "id",
                 EgroupKey AS "category_id",
                 ContactKey AS "user_id",
                 HdrSubject_ AS "title",
                 Body_ AS "raw",
                 CreatStamp_ AS "created_at"
            FROM EgroupMessages
           WHERE ParentId_ = 0
             AND ApprovedRejectedPendingInd = "Approved"
        ORDER BY CreatStamp_, MessageID_
           LIMIT #{BATCH_SIZE}
          OFFSET #{offset}
      SQL

      break if topics.size < 1

      create_posts(topics, total: topic_count, offset: offset) do |topic|
        category_name = CATEGORY_MAPPING[topic["category_id"]]
        next unless category_name

        {
          id: topic["id"],
          category: category_id_from_imported_category_id(category_name),
          user_id: user_id_from_imported_user_id(topic["user_id"]) || Discourse::SYSTEM_USER_ID,
          # to_s guards against a NULL subject; only the first 251 characters
          # of the title are kept
          title: topic["title"].to_s[0..250],
          raw: cleanup_raw(topic["raw"]),
          created_at: topic["created_at"],
        }
      end
    end
  end

  # Imports approved non-root messages (ParentId_ > 0) as replies, in batches
  # of BATCH_SIZE. Replies whose parent message was not imported are skipped;
  # replies whose author was not imported are attributed to the system user.
  def import_posts
    puts "", "importing posts..."

    posts_count = mysql_query(<<-SQL).first["count"]
      SELECT COUNT(MessageID_) AS "count"
        FROM EgroupMessages
       WHERE ParentId_ > 0
         AND ApprovedRejectedPendingInd = "Approved"
    SQL

    batches(BATCH_SIZE) do |offset|
      # See import_users for why the ORDER BY uses the real columns.
      posts = mysql_query <<-SQL
          SELECT MessageID_ AS "id",
                 ContactKey AS "user_id",
                 ParentID_ AS "topic_id",
                 Body_ AS "raw",
                 CreatStamp_ AS "created_at"
            FROM EgroupMessages
           WHERE ParentId_ > 0
             AND ApprovedRejectedPendingInd = "Approved"
        ORDER BY CreatStamp_, MessageID_
           LIMIT #{BATCH_SIZE}
          OFFSET #{offset}
      SQL

      break if posts.size < 1

      create_posts(posts, total: posts_count, offset: offset) do |post|
        parent = topic_lookup_from_imported_post_id(post["topic_id"])
        next unless parent

        {
          id: post["id"],
          topic_id: parent[:topic_id],
          user_id: user_id_from_imported_user_id(post["user_id"]) || Discourse::SYSTEM_USER_ID,
          raw: cleanup_raw(post["raw"]),
          created_at: post["created_at"],
        }
      end
    end
  end

  # Normalises a message body before it is imported.
  #
  # @param raw [String, nil] message body as read from the database
  # @return [String] cleaned body; "" when raw is nil
  def cleanup_raw(raw)
    # work on a copy so the caller's row data is left untouched
    text = raw.to_s.dup
    # fix some html: turn <br>, <br/> and <br /> into newlines
    text.gsub!(%r{<br\s*/?>}i, "\n")
    # remove "This message has been cross posted to the following eGroups: ..."
    text.gsub!(/^This message has been cross posted to the following eGroups: .+\n-{3,}/i, "")
    # remove signatures: everything from the first run of 3+ dashes onwards
    text.gsub!(/-{3,}.+/m, "")
    # strip leading/trailing whitespaces
    text.strip
  end

  # Runs a SQL statement against the local "sfn" database (user "root", no
  # password), opening the connection on first use and reusing it afterwards.
  #
  # @param sql [String] statement to execute
  # @return the value returned by Mysql2::Client#query
  def mysql_query(sql)
    @client ||= Mysql2::Client.new(username: "root", database: "sfn")
    @client.query(sql)
  end

end

ImportScripts::Sfn.new.perform