FEATURE: An API to help sites build robots.txt files programmatically

This is mainly useful for subfolder sites that need to expose their robots.txt contents to a parent site.
2024-11-23 01:16:38 -06:00 · 2018-04-16 15:43:20 -04:00 · 2018-04-16 15:43:20 -04:00 · 3d7dbdedc0
commit 3d7dbdedc0
parent cfe88a67e1
4 changed files with 67 additions and 35 deletions
--- a/app/controllers/robots_txt_controller.rb
+++ b/app/controllers/robots_txt_controller.rb
@ -36,26 +36,59 @@ class RobotsTxtController < ApplicationController
  }

  def index
-    if SiteSetting.allow_index_in_robots_txt
-      path = :index
+    if SiteSetting.allow_index_in_robots_txt?
+      @robots_info = fetch_robots_info
+      render :index, content_type: 'text/plain'
+    else
+      render :no_index, content_type: 'text/plain'
+    end
+  end

-      @crawler_delayed_agents = SiteSetting.slow_down_crawler_user_agents.split('|').map { |agent|
-        [agent, SiteSetting.slow_down_crawler_rate]
-      }
+  # If you are hosting Discourse in a subfolder, you will need to create your robots.txt
+  # in the root of your web server with the appropriate paths. This method will return
+  # JSON that can be used by a script to create a robots.txt that works well with your
+  # existing site.
+  def builder
+    render json: fetch_robots_info
+  end

-      if SiteSetting.whitelisted_crawler_user_agents.present?
-        @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
-        @disallowed_user_agents = ['*']
-      elsif SiteSetting.blacklisted_crawler_user_agents.present?
-        @allowed_user_agents = ['*']
-        @disallowed_user_agents = SiteSetting.blacklisted_crawler_user_agents.split('|')
-      else
-        @allowed_user_agents = ['*']
+protected
+
+  def fetch_robots_info
+    deny_paths = DISALLOWED_PATHS.map { |p| Discourse.base_uri + p }
+    deny_all = [ "#{Discourse.base_uri}/" ]
+
+    result = {
+      header: "# See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file",
+      agents: []
+    }
+
+    if SiteSetting.whitelisted_crawler_user_agents.present?
+      SiteSetting.whitelisted_crawler_user_agents.split('|').each do |agent|
+        result[:agents] << { name: agent, disallow: deny_paths }
+      end
+
+      result[:agents] << { name: '*', disallow: deny_all }
+    elsif SiteSetting.blacklisted_crawler_user_agents.present?
+      result[:agents] << { name: '*', disallow: deny_paths }
+      SiteSetting.blacklisted_crawler_user_agents.split('|').each do |agent|
+        result[:agents] << { name: agent, disallow: deny_all }
      end
    else
-      path = :no_index
+      result[:agents] << { name: '*', disallow: deny_paths }
    end

-    render path, content_type: 'text/plain'
+    if SiteSetting.slow_down_crawler_user_agents.present?
+      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
+        result[:agents] << {
+          name: agent,
+          delay: SiteSetting.slow_down_crawler_rate,
+          disallow: deny_paths
+        }
+      end
+    end
+
+    result
  end
+
 end
--- a/app/views/robots_txt/index.erb
+++ b/app/views/robots_txt/index.erb
@ -1,30 +1,18 @@
-# See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file
+<%= @robots_info[:header] %>
 <% if Discourse.base_uri.present? %>
 # This robots.txt file is not used. Please append the content below in the robots.txt file located at the root
 <% end %>
 #
-<% @allowed_user_agents.each do |user_agent| %>
-User-agent: <%= user_agent %>
-<% end %>
-<% RobotsTxtController::DISALLOWED_PATHS.each do |path| %>
-Disallow: <%= Discourse.base_uri + path %>
+<% @robots_info[:agents].each do |agent| %>
+User-agent: <%= agent[:name] %>
+<%- if agent[:delay] -%>
+Crawl-delay: <%= agent[:delay] %>
+<%- end -%>
+<% agent[:disallow].each do |path| %>
+Disallow: <%= path %>
 <% end %>

-<% if @disallowed_user_agents %>
-  <% @disallowed_user_agents.each do |user_agent| %>
-User-agent: <%= user_agent %>
-Disallow: <%= Discourse.base_uri + "/" %>

-  <% end %>
 <% end %>

 <%= server_plugin_outlet "robots_txt_index" %>
-
-<% @crawler_delayed_agents.each do |agent, delay| %>
-User-agent: <%= agent %>
-Crawl-delay: <%= delay %>
-<% RobotsTxtController::DISALLOWED_PATHS.each do |path| %>
-Disallow: <%= Discourse.base_uri + path %>
-<% end %>
-
-<% end %>
--- a/config/routes.rb
+++ b/config/routes.rb
@ -744,6 +744,7 @@ Discourse::Application.routes.draw do
  get "favicon/proxied" => "static#favicon", format: false

  get "robots.txt" => "robots_txt#index"
+  get "robots-builder.json" => "robots_txt#builder"
  get "offline.html" => "offline#index"
  get "manifest.json" => "metadata#manifest", as: :manifest
  get "opensearch" => "metadata#opensearch", format: :xml
--- a/spec/requests/robots_txt_controller_spec.rb
+++ b/spec/requests/robots_txt_controller_spec.rb
@ -1,6 +1,16 @@
 require 'rails_helper'

 RSpec.describe RobotsTxtController do
+  describe '#builder' do
+    it "returns json information for building a robots.txt" do
+      get "/robots-builder.json"
+      json = ::JSON.parse(response.body)
+      expect(json).to be_present
+      expect(json['header']).to be_present
+      expect(json['agents']).to be_present
+    end
+  end
+
  describe '#index' do

    context 'subfolder' do