#!/usr/bin/env ruby
# frozen_string_literal: true

# script/cache_critical_dns
#
# Caches critical DNS entries locally in the hosts file.
#
# This is useful for cases where you want to add resiliency to DNS lookups
# for redis and postgres, so they will continue to work even if there is a
# DNS outage.
#
# Every 30 seconds the script resolves the host names found in the
# environment variables listed in CRITICAL_HOST_ENV_VARS, rewrites the
# matching entries in HOSTS_PATH when the resolved addresses changed, and
# reports a success/failure counter over HTTP to localhost:9405.
#
# Provenance: introduced by commit 0739c3b1d1d5c3dfd2692eff8a048269e3fa2bf5
# (Sam, Thu, 22 Nov 2018 18:46:39 +1100),
# "DEV: this introduces a script capable of caching critical DNS locally".

require 'ipaddr'
require 'json'
require 'resolv'
require 'socket'
require 'time'
require 'timeout'

HOSTS_PATH = "/etc/hosts"

# Environment variables whose values name the hosts to keep cached.
CRITICAL_HOST_ENV_VARS = %w[
  DISCOURSE_DB_HOST
  DISCOURSE_DB_BACKUP_HOST
  DISCOURSE_REDIS_HOST
  DISCOURSE_REDIS_SLAVE_HOST
].freeze

# Writes a timestamped message to STDERR.
def log(msg)
  STDERR.puts "#{Time.now.iso8601}: #{msg}"
end

# Reports an error; currently identical to #log.
def error(msg)
  log(msg)
end

# Returns true when +value+ parses as an IP address literal.
def ip_address?(value)
  IPAddr.new(value)
  true
rescue IPAddr::InvalidAddressError, IPAddr::AddressFamilyError
  false
end

# Produces new hosts-file content in which every non-comment line whose first
# host name equals +name+ is dropped and one generated line per address in
# +ips+ is appended.
#
# @param hosts [String] current content of the hosts file
# @param name [String] host name whose entries are replaced
# @param ips [Array<String>] addresses to write for +name+
# @return [String] the rewritten content, terminated by a newline
def swap_address(hosts, name, ips)
  kept = hosts.split("\n").map(&:strip).reject do |line|
    # Comment lines are always preserved, even if they mention the name.
    next false if line.start_with?('#')

    _, hostname = line.split(/\s+/)
    hostname == name
  end

  generated = ips.map do |ip|
    "#{ip} #{name} # AUTO GENERATED: #{Time.now.iso8601}"
  end

  # Keep the trailing newline so later appends to the file start a new line.
  (kept + generated).join("\n") + "\n"
end

# Resolves the host stored in the environment variable +name+.
#
# @param dns [Resolv::DNS] open resolver
# @param name [String] name of the environment variable holding the host
# @return [Array<String>] A record addresses followed by AAAA record addresses
def hosts_entries(dns, name)
  host = ENV[name]

  records = dns.getresources(host, Resolv::DNS::Resource::IN::A)
  records.concat(dns.getresources(host, Resolv::DNS::Resource::IN::AAAA))

  records.map { |record| record.address.to_s }
end

# Posts a counter metric as JSON to http://localhost:9405/send-metrics.
# Failures are logged and never raised to the caller.
#
# @param name [String] metric name
# @param description [String] metric description
# @param labels [Hash, nil] label names to values; nil sends no labels
# @param value [Integer] counter value
def send_counter(name, description, labels, value)
  host = "localhost"
  port = 9405

  # Label values are sent as strings; JSON.generate takes care of escaping.
  string_labels = (labels || {}).map { |k, v| [k.to_s, v.to_s] }.to_h

  metric = {
    "_type" => "Custom",
    "type" => "Counter",
    "name" => name,
    "description" => description,
    "labels" => string_labels,
    "value" => value
  }
  json = JSON.generate(metric)

  payload = +"POST /send-metrics HTTP/1.1\r\n"
  payload << "Host: #{host}\r\n"
  payload << "Connection: Close\r\n"
  payload << "Content-Type: application/json\r\n"
  payload << "Content-Length: #{json.bytesize}\r\n"
  payload << "\r\n"
  payload << json

  # The block form closes the socket even when writing or reading raises.
  result = TCPSocket.open(host, port) do |socket|
    socket.write(payload)
    socket.flush
    socket.read
  end

  # An empty response yields an empty status line instead of a nil error.
  status_line = result.to_s.lines.first.to_s.strip
  error("Failed to report metric #{result}") if status_line != "HTTP/1.1 200 OK"
rescue => e
  error("Failed to send metric to Prometheus #{e}")
end

# Reports one fully successful resolution pass.
def report_success
  send_counter('critical_dns_successes_total', 'critical DNS resolution success', nil, 1)
end

# Reports failures, one counter per host.
#
# @param errors [Hash{String, nil => Integer}] failure count per host; the nil
#   key stands for a failure that is not tied to a single host
def report_failure(errors)
  errors.each do |host, count|
    send_counter('critical_dns_failures_total', 'critical DNS resolution failures', host ? { host: host } : nil, count)
  end
end

# Names of the environment variables that are set, non-empty and hold a host
# name rather than an IP address literal (addresses need no DNS caching).
@vars = CRITICAL_HOST_ENV_VARS.select do |name|
  host = ENV[name]
  host && !host.empty? && !ip_address?(host)
end

# Performs one pass: resolve every critical host, update the hosts file when
# the addresses changed, then report the outcome as a metric.
#
# Named +run+ rather than +loop+ so that Kernel#loop is not overridden for
# every object in the process.
def run
  errors = Hash.new(0)

  Resolv::DNS.open do |dns|
    dns.timeouts = 2

    resolved = {}

    @vars.each do |var|
      host = ENV[var]

      begin
        entries = hosts_entries(dns, var)
      rescue => e
        error("Failed to resolve DNS for #{var} - #{e}")
        errors[host] += 1
        # Already counted; skip the "no entries" branch for this host.
        next
      end

      if entries.empty?
        error("Failed to find any DNS entry for #{var} : #{host}")
        errors[host] += 1
      else
        resolved[host] = entries
      end
    end

    hosts_content = File.read(HOSTS_PATH)
    hosts = Resolv::Hosts.new(HOSTS_PATH)

    changed = false
    resolved.each do |name, ips|
      next if hosts.getaddresses(name).map(&:to_s).sort == ips.sort

      log("IP addresses for #{name} changed to #{ips}")
      hosts_content = swap_address(hosts_content, name, ips)
      changed = true
    end

    File.write(HOSTS_PATH, hosts_content) if changed
  end
rescue => e
  error("Failed to access DNS - #{e}")
  errors[nil] = 1
ensure
  if errors.empty?
    report_success
  else
    report_failure(errors)
  end
end

loop do
  run
  sleep 30
end