FEATURE: An API to help sites build robots.txt files programmatically

This is mainly useful for subfolder sites, which need to expose their
robots.txt contents to a parent site.
Robin Ward 2018-04-16 15:43:20 -04:00
parent cfe88a67e1
commit 3d7dbdedc0
4 changed files with 67 additions and 35 deletions

View File

@@ -36,26 +36,59 @@ class RobotsTxtController < ApplicationController
   }
 
   def index
-    if SiteSetting.allow_index_in_robots_txt
-      path = :index
-
-      @crawler_delayed_agents = SiteSetting.slow_down_crawler_user_agents.split('|').map { |agent|
-        [agent, SiteSetting.slow_down_crawler_rate]
-      }
-
-      if SiteSetting.whitelisted_crawler_user_agents.present?
-        @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
-        @disallowed_user_agents = ['*']
-      elsif SiteSetting.blacklisted_crawler_user_agents.present?
-        @allowed_user_agents = ['*']
-        @disallowed_user_agents = SiteSetting.blacklisted_crawler_user_agents.split('|')
-      else
-        @allowed_user_agents = ['*']
-      end
+    if SiteSetting.allow_index_in_robots_txt?
+      @robots_info = fetch_robots_info
+      render :index, content_type: 'text/plain'
     else
-      path = :no_index
+      render :no_index, content_type: 'text/plain'
     end
+  end
+
+  # If you are hosting Discourse in a subfolder, you will need to create your robots.txt
+  # in the root of your web server with the appropriate paths. This method will return
+  # JSON that can be used by a script to create a robots.txt that works well with your
+  # existing site.
+  def builder
+    render json: fetch_robots_info
+  end
+
+  protected
+
+  def fetch_robots_info
+    deny_paths = DISALLOWED_PATHS.map { |p| Discourse.base_uri + p }
+    deny_all = [ "#{Discourse.base_uri}/" ]
 
-    render path, content_type: 'text/plain'
+    result = {
+      header: "# See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file",
+      agents: []
+    }
+
+    if SiteSetting.whitelisted_crawler_user_agents.present?
+      SiteSetting.whitelisted_crawler_user_agents.split('|').each do |agent|
+        result[:agents] << { name: agent, disallow: deny_paths }
+      end
+
+      result[:agents] << { name: '*', disallow: deny_all }
+    elsif SiteSetting.blacklisted_crawler_user_agents.present?
+      result[:agents] << { name: '*', disallow: deny_paths }
+      SiteSetting.blacklisted_crawler_user_agents.split('|').each do |agent|
+        result[:agents] << { name: agent, disallow: deny_all }
+      end
+    else
+      result[:agents] << { name: '*', disallow: deny_paths }
+    end
+
+    if SiteSetting.slow_down_crawler_user_agents.present?
+      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
+        result[:agents] << {
+          name: agent,
+          delay: SiteSetting.slow_down_crawler_rate,
+          disallow: deny_paths
+        }
+      end
+    end
+
+    result
   end
 end
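For reference, here is roughly the hash that fetch_robots_info builds (and that builder serializes to JSON) when whitelisted_crawler_user_agents is "Googlebot" and slow_down_crawler_user_agents is "Bingbot". This is an illustrative sketch only: the disallowed paths and the crawl rate of 30 are made up, since DISALLOWED_PATHS and the actual site settings are not shown in this diff.

# Illustrative result only, assuming no subfolder (empty Discourse.base_uri),
# a hypothetical DISALLOWED_PATHS of ["/admin/", "/auth/"], and a crawl rate of 30.
{
  header: "# See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file",
  agents: [
    { name: "Googlebot", disallow: ["/admin/", "/auth/"] },          # whitelisted crawler keeps the path-level rules
    { name: "*", disallow: ["/"] },                                   # every other agent is denied the whole site
    { name: "Bingbot", delay: 30, disallow: ["/admin/", "/auth/"] }   # slow-down agents also carry a Crawl-delay
  ]
}

Whitelisting keeps the path-level rules for the named agents and denies everything to "*"; blacklisting inverts that, giving "*" the path-level rules and denying everything to the listed agents.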

View File

@@ -1,30 +1,18 @@
-# See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file
+<%= @robots_info[:header] %>
 <% if Discourse.base_uri.present? %>
 # This robots.txt file is not used. Please append the content below in the robots.txt file located at the root
 <% end %>
 #
-<% @allowed_user_agents.each do |user_agent| %>
-User-agent: <%= user_agent %>
-<% end %>
-<% RobotsTxtController::DISALLOWED_PATHS.each do |path| %>
-Disallow: <%= Discourse.base_uri + path %>
+<% @robots_info[:agents].each do |agent| %>
+User-agent: <%= agent[:name] %>
+<%- if agent[:delay] -%>
+Crawl-delay: <%= agent[:delay] %>
+<%- end -%>
+<% agent[:disallow].each do |path| %>
+Disallow: <%= path %>
 <% end %>
 
-<% if @disallowed_user_agents %>
-  <% @disallowed_user_agents.each do |user_agent| %>
-User-agent: <%= user_agent %>
-Disallow: <%= Discourse.base_uri + "/" %>
-  <% end %>
 <% end %>
 
 <%= server_plugin_outlet "robots_txt_index" %>
-
-<% @crawler_delayed_agents.each do |agent, delay| %>
-User-agent: <%= agent %>
-Crawl-delay: <%= delay %>
-  <% RobotsTxtController::DISALLOWED_PATHS.each do |path| %>
-Disallow: <%= Discourse.base_uri + path %>
-  <% end %>
-<% end %>
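Rendered through the new template, the illustrative hash above would produce a robots.txt along these lines (a sketch only; exact blank lines and any plugin-outlet output will vary):

# See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file
#
User-agent: Googlebot
Disallow: /admin/
Disallow: /auth/

User-agent: *
Disallow: /

User-agent: Bingbot
Crawl-delay: 30
Disallow: /admin/
Disallow: /auth/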

View File

@@ -744,6 +744,7 @@ Discourse::Application.routes.draw do
   get "favicon/proxied" => "static#favicon", format: false
   get "robots.txt" => "robots_txt#index"
+  get "robots-builder.json" => "robots_txt#builder"
   get "offline.html" => "offline#index"
   get "manifest.json" => "metadata#manifest", as: :manifest
   get "opensearch" => "metadata#opensearch", format: :xml

View File

@@ -1,6 +1,16 @@
 require 'rails_helper'
 
 RSpec.describe RobotsTxtController do
+
+  describe '#builder' do
+    it "returns json information for building a robots.txt" do
+      get "/robots-builder.json"
+      json = ::JSON.parse(response.body)
+      expect(json).to be_present
+      expect(json['header']).to be_present
+      expect(json['agents']).to be_present
+    end
+  end
 
   describe '#index' do
     context 'subfolder' do