2019-05-02 17:17:27 -05:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2017-05-26 15:26:18 -05:00
|
|
|
require_relative "database"
|
|
|
|
require "json"
|
|
|
|
require "yaml"
|
|
|
|
|
|
|
|
module ImportScripts::Mbox
|
|
|
|
class Indexer
|
|
|
|
# @param database [ImportScripts::Mbox::Database]
|
|
|
|
# @param settings [ImportScripts::Mbox::Settings]
|
|
|
|
# Stores the collaborators used during indexing.
#
# @param database [ImportScripts::Mbox::Database] persistence layer for indexed emails
# @param settings [ImportScripts::Mbox::Settings] import configuration
def initialize(database, settings)
  @settings = settings
  # cached because it is applied to every line of every mbox file
  @split_regex = settings.split_regex
  @database = database
end
|
|
|
|
|
|
|
|
# Runs the full indexing pass: indexes every sub-directory of the data
# directory as a category with its emails, then resolves the reply tree
# and extracts the users found in the indexed messages.
def execute
  Dir
    .glob(File.join(@settings.data_dir, "*"))
    .select { |path| File.directory?(path) }
    .sort
    .each do |directory|
      puts "indexing files in #{directory}"
      category = index_category(directory)
      index_emails(directory, category[:name])
    end

  puts "", "indexing replies and users"

  if @settings.group_messages_by_subject
    # threads are reconstructed purely from subjects
    @database.sort_emails_by_subject
    @database.update_in_reply_to_by_email_subject
  else
    # threads come from the In-Reply-To / References headers
    @database.update_in_reply_to_of_emails
    @database.sort_emails_by_date_and_reply_level
  end

  @database.fill_users_from_emails
end
|
|
|
|
|
|
|
|
private
|
|
|
|
|
2020-04-30 01:48:34 -05:00
|
|
|
# Name of the optional per-directory file that supplies category metadata.
METADATA_FILENAME = "metadata.yml"

# Auxiliary files produced by mbox tooling that must never be parsed as mail.
IGNORED_FILE_EXTENSIONS = %w[.dbindex .dbnames .digest .subjects .yml]
|
2017-05-26 15:26:18 -05:00
|
|
|
|
|
|
|
# Reads the optional metadata.yml of +directory+, stores the resulting
# category in the database and returns it.
#
# @param directory [String] path of the category directory
# @return [Hash] the category attributes (:name, :description, :parent_category_id)
def index_category(directory)
  metadata_file = File.join(directory, METADATA_FILENAME)

  metadata =
    if File.exist?(metadata_file)
      # workaround for YML files that contain classname in file header:
      # strip "--- !..." so YAML.safe_load accepts the document
      sanitized = File.read(metadata_file).sub(/^--- !.*$/, "---")
      YAML.safe_load(sanitized)
    else
      {}
    end

  category = {
    # fall back to the directory name when no name is configured
    name: metadata["name"].presence || File.basename(directory),
    description: metadata["description"],
    parent_category_id: metadata["parent_category_id"].presence,
  }
  @database.insert_category(category)
  category
end
|
|
|
|
|
|
|
|
# Parses every mail file in +directory+ and stores each message in the
# database, associated with +category_name+.
#
# A message that fails to parse is logged to STDERR and skipped so that a
# single broken message does not abort the whole import.
def index_emails(directory, category_name)
  all_messages(directory, category_name) do |receiver, filename, opts|
    begin
      msg_id = receiver.message_id
      parsed_email = receiver.mail

      from_email, from_display_name = receiver.parse_from_field(parsed_email)

      if @settings.fix_mailman_via_addresses
        # Detect cases like this and attempt to get actual sender from other headers:
        # From: Jane Smith via ListName <ListName@lists.example.com>
        if receiver.mail["X-Mailman-Version"] && from_display_name =~ /\bvia \S+$/i
          email_from_from_line = opts[:from_line].scan(/From (\S+)/).flatten.first
          a = Mail::Address.new(email_from_from_line)
          from_email = a.address
          from_display_name = a.display_name
          # if name is not available there, look for it in Reply-To
          if from_display_name.nil?
            reply_to = receiver.mail.to_s.scan(/[\n\r]Reply-To: ([^\r\n]+)/).flatten.first
            from_display_name = Mail::Address.new(reply_to).display_name
          end
        end
      end

      # strip "user=" style prefixes from the address when configured
      from_email = from_email.sub(/^(.*)=/, "") if @settings.elide_equals_in_addresses

      body, elided, format = receiver.select_body
      reply_message_ids = extract_reply_message_ids(parsed_email)

      email = {
        msg_id: msg_id,
        from_email: from_email,
        from_name: from_display_name,
        subject: extract_subject(receiver, category_name),
        email_date: timestamp(parsed_email.date),
        raw_message: receiver.raw_email,
        body: body,
        elided: elided,
        format: format,
        attachment_count: receiver.attachments.count,
        charset: parsed_email.charset&.downcase,
        category: category_name,
        filename: File.basename(filename),
        first_line_number: opts[:first_line_number],
        last_line_number: opts[:last_line_number],
        index_duration: (monotonic_time - opts[:start_time]).round(4),
      }

      # insert the email and its reply links atomically
      @database.transaction do |db|
        db.insert_email(email)
        db.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty?
      end
    rescue StandardError => e
      # BUGFIX: the log messages interpolated nothing ("#(unknown)");
      # include the actual filename so failures can be located.
      if opts[:first_line_number] && opts[:last_line_number]
        STDERR.puts "Failed to index message in #{filename} at lines #{opts[:first_line_number]}-#{opts[:last_line_number]}"
      else
        STDERR.puts "Failed to index message in #{filename}"
      end

      STDERR.puts e.message
      STDERR.puts e.backtrace.inspect
    end
  end
end
|
|
|
|
|
|
|
|
# Maps the basename of every already imported file of +category_name+ to
# the checksum it had when it was imported.
#
# @return [Hash] basename => checksum
def imported_file_checksums(category_name)
  @database
    .fetch_imported_files(category_name)
    .to_h { |row| [File.basename(row["filename"]), row["checksum"]] }
end
|
|
|
|
|
|
|
|
# Yields an Email::Receiver for every message of every mail file in
# +directory+ that has not been fully indexed yet.
#
# With a configured split regex each file is treated as classic mbox and
# split into individual messages; without one, each file is a single
# message. After a file is processed it is recorded with its checksum so
# unchanged files are skipped on subsequent runs.
#
# @yieldparam receiver [Email::Receiver] parsed message
# @yieldparam filename [String] path of the file the message came from
# @yieldparam opts [Hash] line range, "From " separator line and start time
def all_messages(directory, category_name)
  checksums = imported_file_checksums(category_name)

  Dir.foreach(directory) do |filename|
    filename = File.join(directory, filename)
    next if ignored_file?(filename, checksums)

    # BUGFIX: the progress message interpolated nothing ("#(unknown)");
    # show the file currently being indexed.
    puts "indexing #{filename}"

    if @split_regex.present?
      each_mail(filename) do |raw_message, first_line_number, last_line_number, from_line|
        opts = {
          first_line_number: first_line_number,
          last_line_number: last_line_number,
          start_time: monotonic_time,
          from_line: from_line,
        }
        receiver = read_mail_from_string(raw_message)
        yield receiver, filename, opts if receiver.present?
      end
    else
      opts = { start_time: monotonic_time }
      receiver = read_mail_from_file(filename)
      yield receiver, filename, opts if receiver.present?
    end

    mark_as_fully_indexed(category_name, filename)
  end
end
|
|
|
|
|
|
|
|
# Records that +filename+ has been completely indexed, together with its
# current checksum, so later runs can skip it unless its content changes.
def mark_as_fully_indexed(category_name, filename)
  record = {
    category: category_name,
    filename: File.basename(filename),
    checksum: calc_checksum(filename),
  }

  @database.insert_imported_file(record)
end
|
|
|
|
|
|
|
|
# Splits a mbox file into individual messages using @split_regex and
# yields each message together with its 1-based line range and the
# separator ("From ") line that preceded it.
#
# Note: the separator line itself is not appended to the message body;
# it is passed separately as +from_line+.
def each_mail(filename)
  # +"" creates a mutable buffer despite frozen_string_literal
  raw_message = +""
  first_line_number = 1
  last_line_number = 0
  from_line = nil

  each_line(filename) do |line|
    # scrub replaces invalid byte sequences so the regex match cannot raise
    if line.scrub =~ @split_regex
      # a new separator: emit the message collected so far, if any
      if last_line_number > 0
        yield raw_message, first_line_number, last_line_number, from_line
        raw_message = +""
        first_line_number = last_line_number + 1
      end

      from_line = line
    else
      raw_message << line
    end

    last_line_number += 1
  end

  # emit the final message, which is not followed by another separator
  yield raw_message, first_line_number, last_line_number, from_line if raw_message.present?
end
|
|
|
|
|
|
|
|
# Yields every line of +filename+, transparently decompressing files with
# a ".gz" extension. The underlying file handle is always closed, even
# when the caller's block raises.
def each_line(filename)
  handle = File.open(filename, "r")
  reader = filename.end_with?(".gz") ? Zlib::GzipReader.new(handle) : handle

  reader.each_line { |line| yield line }
ensure
  handle&.close
end
|
|
|
|
|
|
|
|
# Builds an Email::Receiver from the entire content of +filename+;
# returns nil when the file is blank.
def read_mail_from_file(filename)
  read_mail_from_string(File.read(filename))
end
|
|
|
|
|
|
|
|
# Wraps +raw_message+ in an Email::Receiver configured for import
# (plaintext conversion on, trimming off); returns nil for blank input.
def read_mail_from_string(raw_message)
  return if raw_message.blank?

  Email::Receiver.new(raw_message, convert_plaintext: true, skip_trimming: false)
end
|
|
|
|
|
|
|
|
# Collects the Message-IDs that +mail+ replies to (capped at 20), which
# are later stored as reply links to reconstruct the thread tree.
def extract_reply_message_ids(mail)
  Email::Receiver.extract_reply_message_ids(mail, max_message_id_count: 20)
end
|
|
|
|
|
|
|
|
# Returns the cleaned-up subject of a message, or nil when it is blank.
# Runs of tabs are collapsed to a single space and surrounding whitespace
# is stripped. +list_name+ is currently unused but kept for interface
# stability.
def extract_subject(receiver, list_name)
  subject = receiver.subject
  return nil if subject.blank?

  subject.strip.gsub(/\t+/, " ")
end
|
|
|
|
|
2018-01-17 10:03:36 -06:00
|
|
|
# True when +path+ must be skipped: hidden files, the metadata file,
# auxiliary files by extension, and files whose checksum shows they were
# already imported. The checksum comparison stays last because it is the
# most expensive check.
def ignored_file?(path, checksums)
  name = File.basename(path)

  return true if name.start_with?(".")
  return true if name == METADATA_FILENAME
  return true if IGNORED_FILE_EXTENSIONS.include?(File.extname(name))

  fully_indexed?(path, name, checksums)
end
|
|
|
|
|
2018-01-17 10:03:36 -06:00
|
|
|
# True when a checksum was stored for +filename+ and it still matches the
# file's current content, i.e. the file is unchanged since its import.
def fully_indexed?(path, filename, checksums)
  expected = checksums[filename]
  return false unless expected.present?

  calc_checksum(path) == expected
end
|
|
|
|
|
|
|
|
# SHA-256 hex digest of the file's content, used to detect files that
# changed since they were last indexed.
def calc_checksum(filename)
  digest = Digest::SHA256.file(filename)
  digest.hexdigest
end
|
2018-01-17 05:03:57 -06:00
|
|
|
|
|
|
|
# Monotonic clock reading in fractional seconds. Unlike Time.now it is
# unaffected by system clock adjustments, so differences between two
# readings are safe for measuring durations (used for index_duration).
def monotonic_time
  Process.clock_gettime(Process::CLOCK_MONOTONIC)
end
|
2018-08-23 02:46:25 -05:00
|
|
|
|
|
|
|
# Converts a mail Date header value to a Unix timestamp in the configured
# time zone; returns nil when the header is missing.
def timestamp(datetime)
  return nil unless datetime

  Time.zone.at(datetime).to_i
end
|
2017-05-26 15:26:18 -05:00
|
|
|
end
|
|
|
|
end
|