Add option to customize user agents in robots.txt

This commit is contained in:
Alejandro Celaya 2024-07-05 08:52:41 +02:00
parent 76c42bc17c
commit 4b52c92e97
9 changed files with 96 additions and 44 deletions

View File

@ -6,7 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this
## [Unreleased]
### Added
* [#2018](https://github.com/shlinkio/shlink/issues/2018) Add option to allow all short URLs to be unconditionally crawlable in robots.txt, via `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var, or config options.
* [#2018](https://github.com/shlinkio/shlink/issues/2018) Add option to allow all short URLs to be unconditionally crawlable in robots.txt, via `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var, or config option.
* [#2109](https://github.com/shlinkio/shlink/issues/2109) Add option to customize user agents robots.txt, via `ROBOTS_USER_AGENTS=foo,bar,baz` env var, or config option.
### Changed
* [#2096](https://github.com/shlinkio/shlink/issues/2096) Update to RoadRunner 2024.

View File

@ -8,6 +8,7 @@ return [
'robots' => [
'allow-all-short-urls' => (bool) Config\EnvVars::ROBOTS_ALLOW_ALL_SHORT_URLS->loadFromEnv(false),
'user-agents' => splitByComma(Config\EnvVars::ROBOTS_USER_AGENTS->loadFromEnv()),
],
];

View File

@ -4,40 +4,35 @@ declare(strict_types=1);
use Shlinkio\Shlink\Core\Config\EnvVars;
return (static function (): array {
/** @var string|null $disableTrackingFrom */
$disableTrackingFrom = EnvVars::DISABLE_TRACKING_FROM->loadFromEnv();
use function Shlinkio\Shlink\Core\splitByComma;
return [
return [
'tracking' => [
// Tells if IP addresses should be anonymized before persisting, to fulfil data protection regulations
// This applies only if IP address tracking is enabled
'anonymize_remote_addr' => (bool) EnvVars::ANONYMIZE_REMOTE_ADDR->loadFromEnv(true),
'tracking' => [
// Tells if IP addresses should be anonymized before persisting, to fulfil data protection regulations
// This applies only if IP address tracking is enabled
'anonymize_remote_addr' => (bool) EnvVars::ANONYMIZE_REMOTE_ADDR->loadFromEnv(true),
// Tells if visits to not-found URLs should be tracked. The disable_tracking option takes precedence
'track_orphan_visits' => (bool) EnvVars::TRACK_ORPHAN_VISITS->loadFromEnv(true),
// Tells if visits to not-found URLs should be tracked. The disable_tracking option takes precedence
'track_orphan_visits' => (bool) EnvVars::TRACK_ORPHAN_VISITS->loadFromEnv(true),
// A query param that, if provided, will disable tracking of one particular visit. Always takes precedence
'disable_track_param' => EnvVars::DISABLE_TRACK_PARAM->loadFromEnv(),
// A query param that, if provided, will disable tracking of one particular visit. Always takes precedence
'disable_track_param' => EnvVars::DISABLE_TRACK_PARAM->loadFromEnv(),
// If true, visits will not be tracked at all
'disable_tracking' => (bool) EnvVars::DISABLE_TRACKING->loadFromEnv(false),
// If true, visits will not be tracked at all
'disable_tracking' => (bool) EnvVars::DISABLE_TRACKING->loadFromEnv(false),
// If true, visits will be tracked, but neither the IP address, nor the location will be resolved
'disable_ip_tracking' => (bool) EnvVars::DISABLE_IP_TRACKING->loadFromEnv(false),
// If true, visits will be tracked, but neither the IP address, nor the location will be resolved
'disable_ip_tracking' => (bool) EnvVars::DISABLE_IP_TRACKING->loadFromEnv(false),
// If true, the referrer will not be tracked
'disable_referrer_tracking' => (bool) EnvVars::DISABLE_REFERRER_TRACKING->loadFromEnv(false),
// If true, the referrer will not be tracked
'disable_referrer_tracking' => (bool) EnvVars::DISABLE_REFERRER_TRACKING->loadFromEnv(false),
// If true, the user agent will not be tracked
'disable_ua_tracking' => (bool) EnvVars::DISABLE_UA_TRACKING->loadFromEnv(false),
// If true, the user agent will not be tracked
'disable_ua_tracking' => (bool) EnvVars::DISABLE_UA_TRACKING->loadFromEnv(false),
// A list of IP addresses, patterns or CIDR blocks from which tracking is disabled by default
'disable_tracking_from' => $disableTrackingFrom === null
? []
: array_map(trim(...), explode(',', $disableTrackingFrom)),
],
// A list of IP addresses, patterns or CIDR blocks from which tracking is disabled by default
'disable_tracking_from' => splitByComma(EnvVars::DISABLE_TRACKING_FROM->loadFromEnv()),
],
];
})();
];

View File

@ -32,6 +32,7 @@ return [
Options\TrackingOptions::class => [ValinorConfigFactory::class, 'config.tracking'],
Options\QrCodeOptions::class => [ValinorConfigFactory::class, 'config.qr_codes'],
Options\RabbitMqOptions::class => [ValinorConfigFactory::class, 'config.rabbitmq'],
Options\RobotsOptions::class => [ValinorConfigFactory::class, 'config.robots'],
RedirectRule\ShortUrlRedirectRuleService::class => ConfigAbstractFactory::class,
RedirectRule\ShortUrlRedirectionResolver::class => ConfigAbstractFactory::class,
@ -189,7 +190,7 @@ return [
'Logger_Shlink',
Options\QrCodeOptions::class,
],
Action\RobotsAction::class => [Crawling\CrawlingHelper::class, 'config.robots.allow-all-short-urls'],
Action\RobotsAction::class => [Crawling\CrawlingHelper::class, Options\RobotsOptions::class],
ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => [
'em',

View File

@ -260,3 +260,16 @@ function enumToString(string $enum): string
{
return sprintf('["%s"]', implode('", "', enumValues($enum)));
}
/**
* Split provided string by comma and return a list of the results.
* An empty array is returned if provided value is empty
*/
function splitByComma(?string $value): array
{
if ($value === null || trim($value) === '') {
return [];
}
return array_map(trim(...), explode(',', $value));
}

View File

@ -10,6 +10,7 @@ use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
use Shlinkio\Shlink\Core\Options\RobotsOptions;
use function sprintf;
@ -17,7 +18,7 @@ use const PHP_EOL;
readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterface
{
public function __construct(private CrawlingHelperInterface $crawlingHelper, private bool $allowAllShortUrls)
public function __construct(private CrawlingHelperInterface $crawlingHelper, private RobotsOptions $robotsOptions)
{
}
@ -33,11 +34,15 @@ readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterf
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
User-agent: *
ROBOTS;
if ($this->allowAllShortUrls) {
$userAgents = $this->robotsOptions->hasUserAgents() ? $this->robotsOptions->userAgents : ['*'];
foreach ($userAgents as $userAgent) {
yield sprintf('User-agent: %s%s', $userAgent, PHP_EOL);
}
if ($this->robotsOptions->allowAllShortUrls) {
// Disallow rest URLs, but allow all short codes
yield 'Disallow: /rest/';
return;

View File

@ -71,6 +71,7 @@ enum EnvVars: string
case REDIRECT_APPEND_EXTRA_PATH = 'REDIRECT_APPEND_EXTRA_PATH';
case MULTI_SEGMENT_SLUGS_ENABLED = 'MULTI_SEGMENT_SLUGS_ENABLED';
case ROBOTS_ALLOW_ALL_SHORT_URLS = 'ROBOTS_ALLOW_ALL_SHORT_URLS';
case ROBOTS_USER_AGENTS = 'ROBOTS_USER_AGENTS';
case TIMEZONE = 'TIMEZONE';
case MEMORY_LIMIT = 'MEMORY_LIMIT';

View File

@ -0,0 +1,22 @@
<?php
declare(strict_types=1);
namespace Shlinkio\Shlink\Core\Options;
use function count;
final readonly class RobotsOptions
{
public function __construct(
public bool $allowAllShortUrls = false,
/** @var string[] */
public array $userAgents = [],
) {
}
public function hasUserAgents(): bool
{
return count($this->userAgents) > 0;
}
}

View File

@ -11,6 +11,7 @@ use PHPUnit\Framework\MockObject\MockObject;
use PHPUnit\Framework\TestCase;
use Shlinkio\Shlink\Core\Action\RobotsAction;
use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
use Shlinkio\Shlink\Core\Options\RobotsOptions;
class RobotsActionTest extends TestCase
{
@ -24,15 +25,15 @@ class RobotsActionTest extends TestCase
#[Test, DataProvider('provideShortCodes')]
public function buildsRobotsLinesFromCrawlableShortCodes(
array $shortCodes,
bool $allowAllShortUrls,
RobotsOptions $options,
string $expected,
): void {
$this->helper
->expects($allowAllShortUrls ? $this->never() : $this->once())
->expects($options->allowAllShortUrls ? $this->never() : $this->once())
->method('listCrawlableShortCodes')
->willReturn($shortCodes);
$response = $this->action($allowAllShortUrls)->handle(ServerRequestFactory::fromGlobals());
$response = $this->action($options)->handle(ServerRequestFactory::fromGlobals());
self::assertEquals(200, $response->getStatusCode());
self::assertEquals($expected, $response->getBody()->__toString());
@ -41,7 +42,7 @@ class RobotsActionTest extends TestCase
public static function provideShortCodes(): iterable
{
yield 'three short codes' => [['foo', 'bar', 'baz'], false, <<<ROBOTS
yield 'three short codes' => [['foo', 'bar', 'baz'], new RobotsOptions(), <<<ROBOTS
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
@ -51,7 +52,7 @@ class RobotsActionTest extends TestCase
Allow: /baz
Disallow: /
ROBOTS];
yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], false, <<<ROBOTS
yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], new RobotsOptions(), <<<ROBOTS
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
@ -63,31 +64,43 @@ class RobotsActionTest extends TestCase
Allow: /baz
Disallow: /
ROBOTS];
yield 'no short codes' => [[], false, <<<ROBOTS
yield 'no short codes' => [[], new RobotsOptions(), <<<ROBOTS
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
User-agent: *
Disallow: /
ROBOTS];
yield 'three short codes and allow all short urls' => [['foo', 'bar', 'some'], true, <<<ROBOTS
yield 'three short codes and allow all short urls' => [
['foo', 'bar', 'some'],
new RobotsOptions(allowAllShortUrls: true),
<<<ROBOTS
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
User-agent: *
Disallow: /rest/
ROBOTS,
];
yield 'no short codes and allow all short urls' => [[], new RobotsOptions(allowAllShortUrls: true), <<<ROBOTS
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
User-agent: *
Disallow: /rest/
ROBOTS];
yield 'no short codes and allow all short urls' => [[], true, <<<ROBOTS
yield 'allow user agents' => [[], new RobotsOptions(userAgents: ['foo', 'bar']), <<<ROBOTS
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
User-agent: *
Disallow: /rest/
User-agent: foo
User-agent: bar
Disallow: /
ROBOTS];
}
private function action(bool $allowAllShortUrls = false): RobotsAction
private function action(RobotsOptions $options): RobotsAction
{
return new RobotsAction($this->helper, allowAllShortUrls: $allowAllShortUrls);
return new RobotsAction($this->helper, $options);
}
}