From 4b52c92e97b327a72f16edcdd448c8d75b10c787 Mon Sep 17 00:00:00 2001
From: Alejandro Celaya
Date: Fri, 5 Jul 2024 08:52:41 +0200
Subject: [PATCH] Add option to customize user agents in robots.txt

---
 CHANGELOG.md                                  |  3 +-
 config/autoload/robots.global.php             |  1 +
 config/autoload/tracking.global.php           | 49 +++++++++-----------
 module/Core/config/dependencies.config.php    |  3 +-
 module/Core/functions/functions.php           | 13 ++++++
 module/Core/src/Action/RobotsAction.php       | 11 +++--
 module/Core/src/Config/EnvVars.php            |  1 +
 module/Core/src/Options/RobotsOptions.php     | 22 +++++++++
 module/Core/test/Action/RobotsActionTest.php  | 37 ++++++++++-----
 9 files changed, 96 insertions(+), 44 deletions(-)
 create mode 100644 module/Core/src/Options/RobotsOptions.php

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c70ff102..5ff88fcf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this

 ## [Unreleased]
 ### Added
-* [#2018](https://github.com/shlinkio/shlink/issues/2018) Add option to allow all short URLs to be unconditionally crawlable in robots.txt, via `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var, or config options.
+* [#2018](https://github.com/shlinkio/shlink/issues/2018) Add option to allow all short URLs to be unconditionally crawlable in robots.txt, via `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var, or config option.
+* [#2109](https://github.com/shlinkio/shlink/issues/2109) Add option to customize user agents in robots.txt, via `ROBOTS_USER_AGENTS=foo,bar,baz` env var, or config option.

 ### Changed
 * [#2096](https://github.com/shlinkio/shlink/issues/2096) Update to RoadRunner 2024.

diff --git a/config/autoload/robots.global.php b/config/autoload/robots.global.php
index 0ab9c5d2..8954dc53 100644
--- a/config/autoload/robots.global.php
+++ b/config/autoload/robots.global.php
@@ -8,6 +8,7 @@ return [

     'robots' => [
         'allow-all-short-urls' => (bool) Config\EnvVars::ROBOTS_ALLOW_ALL_SHORT_URLS->loadFromEnv(false),
+        'user-agents' => splitByComma(Config\EnvVars::ROBOTS_USER_AGENTS->loadFromEnv()),
     ],

 ];
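For context on what the new `user-agents` entry ends up holding: with a hypothetical `ROBOTS_USER_AGENTS=googlebot,bingbot` and `ROBOTS_ALLOW_ALL_SHORT_URLS` left unset, the `robots` config block above would resolve to roughly the following. This is only an illustration of the intent (assuming `splitByComma()` behaves as defined later in this patch), not part of the change itself.

```php
<?php

// Hypothetical resolved config for ROBOTS_USER_AGENTS=googlebot,bingbot,
// assuming splitByComma() trims each comma-separated item.
return [
    'robots' => [
        'allow-all-short-urls' => false,
        'user-agents' => ['googlebot', 'bingbot'],
    ],
];
```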
diff --git a/config/autoload/tracking.global.php b/config/autoload/tracking.global.php
index 4d7a6e9a..267bb76d 100644
--- a/config/autoload/tracking.global.php
+++ b/config/autoload/tracking.global.php
@@ -4,40 +4,35 @@ declare(strict_types=1);

 use Shlinkio\Shlink\Core\Config\EnvVars;

-return (static function (): array {
-    /** @var string|null $disableTrackingFrom */
-    $disableTrackingFrom = EnvVars::DISABLE_TRACKING_FROM->loadFromEnv();
+use function Shlinkio\Shlink\Core\splitByComma;

-    return [
+return [

-        'tracking' => [
-            // Tells if IP addresses should be anonymized before persisting, to fulfil data protection regulations
-            // This applies only if IP address tracking is enabled
-            'anonymize_remote_addr' => (bool) EnvVars::ANONYMIZE_REMOTE_ADDR->loadFromEnv(true),
+    'tracking' => [
+        // Tells if IP addresses should be anonymized before persisting, to fulfil data protection regulations
+        // This applies only if IP address tracking is enabled
+        'anonymize_remote_addr' => (bool) EnvVars::ANONYMIZE_REMOTE_ADDR->loadFromEnv(true),

-            // Tells if visits to not-found URLs should be tracked. The disable_tracking option takes precedence
-            'track_orphan_visits' => (bool) EnvVars::TRACK_ORPHAN_VISITS->loadFromEnv(true),
+        // Tells if visits to not-found URLs should be tracked. The disable_tracking option takes precedence
+        'track_orphan_visits' => (bool) EnvVars::TRACK_ORPHAN_VISITS->loadFromEnv(true),

-            // A query param that, if provided, will disable tracking of one particular visit. Always takes precedence
-            'disable_track_param' => EnvVars::DISABLE_TRACK_PARAM->loadFromEnv(),
+        // A query param that, if provided, will disable tracking of one particular visit. Always takes precedence
+        'disable_track_param' => EnvVars::DISABLE_TRACK_PARAM->loadFromEnv(),

-            // If true, visits will not be tracked at all
-            'disable_tracking' => (bool) EnvVars::DISABLE_TRACKING->loadFromEnv(false),
+        // If true, visits will not be tracked at all
+        'disable_tracking' => (bool) EnvVars::DISABLE_TRACKING->loadFromEnv(false),

-            // If true, visits will be tracked, but neither the IP address, nor the location will be resolved
-            'disable_ip_tracking' => (bool) EnvVars::DISABLE_IP_TRACKING->loadFromEnv(false),
+        // If true, visits will be tracked, but neither the IP address, nor the location will be resolved
+        'disable_ip_tracking' => (bool) EnvVars::DISABLE_IP_TRACKING->loadFromEnv(false),

-            // If true, the referrer will not be tracked
-            'disable_referrer_tracking' => (bool) EnvVars::DISABLE_REFERRER_TRACKING->loadFromEnv(false),
+        // If true, the referrer will not be tracked
+        'disable_referrer_tracking' => (bool) EnvVars::DISABLE_REFERRER_TRACKING->loadFromEnv(false),

-            // If true, the user agent will not be tracked
-            'disable_ua_tracking' => (bool) EnvVars::DISABLE_UA_TRACKING->loadFromEnv(false),
+        // If true, the user agent will not be tracked
+        'disable_ua_tracking' => (bool) EnvVars::DISABLE_UA_TRACKING->loadFromEnv(false),

-            // A list of IP addresses, patterns or CIDR blocks from which tracking is disabled by default
-            'disable_tracking_from' => $disableTrackingFrom === null
-                ? []
-                : array_map(trim(...), explode(',', $disableTrackingFrom)),
-        ],
+        // A list of IP addresses, patterns or CIDR blocks from which tracking is disabled by default
+        'disable_tracking_from' => splitByComma(EnvVars::DISABLE_TRACKING_FROM->loadFromEnv()),
+    ],

-    ];
-})();
+];

diff --git a/module/Core/config/dependencies.config.php b/module/Core/config/dependencies.config.php
index 24f32360..8f8b609b 100644
--- a/module/Core/config/dependencies.config.php
+++ b/module/Core/config/dependencies.config.php
@@ -32,6 +32,7 @@ return [
            Options\TrackingOptions::class => [ValinorConfigFactory::class, 'config.tracking'],
            Options\QrCodeOptions::class => [ValinorConfigFactory::class, 'config.qr_codes'],
            Options\RabbitMqOptions::class => [ValinorConfigFactory::class, 'config.rabbitmq'],
+           Options\RobotsOptions::class => [ValinorConfigFactory::class, 'config.robots'],

            RedirectRule\ShortUrlRedirectRuleService::class => ConfigAbstractFactory::class,
            RedirectRule\ShortUrlRedirectionResolver::class => ConfigAbstractFactory::class,

@@ -189,7 +190,7 @@ return [
            'Logger_Shlink',
            Options\QrCodeOptions::class,
        ],
-       Action\RobotsAction::class => [Crawling\CrawlingHelper::class, 'config.robots.allow-all-short-urls'],
+       Action\RobotsAction::class => [Crawling\CrawlingHelper::class, Options\RobotsOptions::class],

        ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => [
            'em',

diff --git a/module/Core/functions/functions.php b/module/Core/functions/functions.php
index d9fc2b5e..5900e8e1 100644
--- a/module/Core/functions/functions.php
+++ b/module/Core/functions/functions.php
@@ -260,3 +260,16 @@ function enumToString(string $enum): string
 {
     return sprintf('["%s"]', implode('", "', enumValues($enum)));
 }
+
+/**
+ * Split provided string by comma and return a list of the results.
+ * An empty array is returned if provided value is empty
+ */
+function splitByComma(?string $value): array
+{
+    if ($value === null || trim($value) === '') {
+        return [];
+    }
+
+    return array_map(trim(...), explode(',', $value));
+}
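A quick sketch of how `splitByComma()` behaves for the kinds of values the config files above feed it. The helper is copied from the hunk above; the sample inputs are made up for illustration.

```php
<?php

declare(strict_types=1);

// Standalone copy of the helper added in module/Core/functions/functions.php,
// exercised with made-up inputs to show trimming and empty-value handling.
function splitByComma(?string $value): array
{
    if ($value === null || trim($value) === '') {
        return [];
    }

    return array_map(trim(...), explode(',', $value));
}

var_export(splitByComma('googlebot, bingbot , DuckDuckBot')); // ['googlebot', 'bingbot', 'DuckDuckBot']
var_export(splitByComma('   '));                              // [] -> option treated as not configured
var_export(splitByComma(null));                               // []
```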
diff --git a/module/Core/src/Action/RobotsAction.php b/module/Core/src/Action/RobotsAction.php
index cb3c99ea..29c2c8d2 100644
--- a/module/Core/src/Action/RobotsAction.php
+++ b/module/Core/src/Action/RobotsAction.php
@@ -10,6 +10,7 @@ use Psr\Http\Message\ResponseInterface;
 use Psr\Http\Message\ServerRequestInterface;
 use Psr\Http\Server\RequestHandlerInterface;
 use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
+use Shlinkio\Shlink\Core\Options\RobotsOptions;

 use function sprintf;

@@ -17,7 +18,7 @@ use const PHP_EOL;

 readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterface
 {
-    public function __construct(private CrawlingHelperInterface $crawlingHelper, private bool $allowAllShortUrls)
+    public function __construct(private CrawlingHelperInterface $crawlingHelper, private RobotsOptions $robotsOptions)
     {
     }

@@ -33,11 +34,15 @@ readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterf
         # For more information about the robots.txt standard, see:
         # https://www.robotstxt.org/orig.html

-        User-agent: *
         ROBOTS;

-        if ($this->allowAllShortUrls) {
+        $userAgents = $this->robotsOptions->hasUserAgents() ? $this->robotsOptions->userAgents : ['*'];
+        foreach ($userAgents as $userAgent) {
+            yield sprintf('User-agent: %s%s', $userAgent, PHP_EOL);
+        }
+
+        if ($this->robotsOptions->allowAllShortUrls) {
             // Disallow rest URLs, but allow all short codes
             yield 'Disallow: /rest/';
             return;

diff --git a/module/Core/src/Config/EnvVars.php b/module/Core/src/Config/EnvVars.php
index 59fafb17..01436967 100644
--- a/module/Core/src/Config/EnvVars.php
+++ b/module/Core/src/Config/EnvVars.php
@@ -71,6 +71,7 @@ enum EnvVars: string
     case REDIRECT_APPEND_EXTRA_PATH = 'REDIRECT_APPEND_EXTRA_PATH';
     case MULTI_SEGMENT_SLUGS_ENABLED = 'MULTI_SEGMENT_SLUGS_ENABLED';
     case ROBOTS_ALLOW_ALL_SHORT_URLS = 'ROBOTS_ALLOW_ALL_SHORT_URLS';
+    case ROBOTS_USER_AGENTS = 'ROBOTS_USER_AGENTS';
     case TIMEZONE = 'TIMEZONE';
     case MEMORY_LIMIT = 'MEMORY_LIMIT';

diff --git a/module/Core/src/Options/RobotsOptions.php b/module/Core/src/Options/RobotsOptions.php
new file mode 100644
index 00000000..860c1603
--- /dev/null
+++ b/module/Core/src/Options/RobotsOptions.php
@@ -0,0 +1,22 @@
+userAgents) > 0;
+    }
+}
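Most of the new `RobotsOptions` class body is not legible in this copy of the patch (only the last few lines survive). Judging from how it is used in `RobotsAction`, the `config.robots` wiring through Valinor, and the test cases, it presumably looks roughly like the sketch below; property names and `hasUserAgents()` are taken from the surrounding diff, everything else (blank lines, docblocks, `final readonly`) is an assumption, not the author's exact file.

```php
<?php

declare(strict_types=1);

namespace Shlinkio\Shlink\Core\Options;

use function count;

final readonly class RobotsOptions
{
    public function __construct(
        // Whether all short URLs should be unconditionally crawlable (assumed default)
        public bool $allowAllShortUrls = false,
        /** @var string[] list of user agents to include in robots.txt (assumed default) */
        public array $userAgents = [],
    ) {
    }

    public function hasUserAgents(): bool
    {
        return count($this->userAgents) > 0;
    }
}
```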
diff --git a/module/Core/test/Action/RobotsActionTest.php b/module/Core/test/Action/RobotsActionTest.php
index 32c1b036..1d83fb8c 100644
--- a/module/Core/test/Action/RobotsActionTest.php
+++ b/module/Core/test/Action/RobotsActionTest.php
@@ -11,6 +11,7 @@ use PHPUnit\Framework\MockObject\MockObject;
 use PHPUnit\Framework\TestCase;
 use Shlinkio\Shlink\Core\Action\RobotsAction;
 use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
+use Shlinkio\Shlink\Core\Options\RobotsOptions;

 class RobotsActionTest extends TestCase
 {
@@ -24,15 +25,15 @@ class RobotsActionTest extends TestCase
     #[Test, DataProvider('provideShortCodes')]
     public function buildsRobotsLinesFromCrawlableShortCodes(
         array $shortCodes,
-        bool $allowAllShortUrls,
+        RobotsOptions $options,
         string $expected,
     ): void {
         $this->helper
-            ->expects($allowAllShortUrls ? $this->never() : $this->once())
+            ->expects($options->allowAllShortUrls ? $this->never() : $this->once())
             ->method('listCrawlableShortCodes')
             ->willReturn($shortCodes);

-        $response = $this->action($allowAllShortUrls)->handle(ServerRequestFactory::fromGlobals());
+        $response = $this->action($options)->handle(ServerRequestFactory::fromGlobals());

         self::assertEquals(200, $response->getStatusCode());
         self::assertEquals($expected, $response->getBody()->__toString());
@@ -41,7 +42,7 @@ class RobotsActionTest extends TestCase
     public static function provideShortCodes(): iterable
     {
-        yield 'three short codes' => [['foo', 'bar', 'baz'], false, <<
+        yield 'three short codes' => [['foo', 'bar', 'baz'], new RobotsOptions(), <<
[['foo', 'bar', 'some', 'thing', 'baz'], false, <<
[['foo', 'bar', 'some', 'thing', 'baz'], new RobotsOptions(), <<
[[], false, <<
[[], new RobotsOptions(), <<
[['foo', 'bar', 'some'], true, <<
[
+            ['foo', 'bar', 'some'],
+            new RobotsOptions(allowAllShortUrls: true),
+            <<
[[], new RobotsOptions(allowAllShortUrls: true), <<
[[], true, <<
[[], new RobotsOptions(userAgents: ['foo', 'bar']), <<
-        return new RobotsAction($this->helper, allowAllShortUrls: $allowAllShortUrls);
+        return new RobotsAction($this->helper, $options);
     }
 }
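The expected robots.txt bodies in the data provider above are largely illegible in this copy of the patch. As a stand-in, here is a minimal sketch (not part of the patch) of the user-agent handling that the updated action and these tests exercise: one `User-agent` line per configured agent, falling back to the catch-all `*` when none are provided. The function name and sample inputs are illustrative only.

```php
<?php

declare(strict_types=1);

// Sketch mirroring the new loop in RobotsAction::buildRobots(), isolated so it
// can be run on its own.
/** @param string[] $userAgents */
function buildUserAgentLines(array $userAgents): iterable
{
    // Fall back to the catch-all agent when none were configured, mirroring the
    // hasUserAgents() ? ... : ['*'] ternary in the action.
    $agents = count($userAgents) > 0 ? $userAgents : ['*'];
    foreach ($agents as $agent) {
        yield sprintf('User-agent: %s%s', $agent, PHP_EOL);
    }
}

echo implode('', [...buildUserAgentLines(['foo', 'bar'])]);
// User-agent: foo
// User-agent: bar

echo implode('', [...buildUserAgentLines([])]);
// User-agent: *
```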