Merge pull request #2149 from acelaya-forks/feature/robots-user-agents

Add option to customize user agents in robots.txt
This commit is contained in:
Alejandro Celaya 2024-07-06 10:08:03 +02:00 committed by GitHub
commit b6b2530cb6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 99 additions and 46 deletions

View File

@ -6,7 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this
## [Unreleased] ## [Unreleased]
### Added ### Added
* [#2018](https://github.com/shlinkio/shlink/issues/2018) Add option to allow all short URLs to be unconditionally crawlable in robots.txt, via `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var, or config options. * [#2018](https://github.com/shlinkio/shlink/issues/2018) Add option to allow all short URLs to be unconditionally crawlable in robots.txt, via `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var, or config option.
* [#2109](https://github.com/shlinkio/shlink/issues/2109) Add option to customize user agents in robots.txt, via `ROBOTS_USER_AGENTS=foo,bar,baz` env var, or config option.
### Changed ### Changed
* [#2096](https://github.com/shlinkio/shlink/issues/2096) Update to RoadRunner 2024. * [#2096](https://github.com/shlinkio/shlink/issues/2096) Update to RoadRunner 2024.

View File

@ -48,7 +48,7 @@
"shlinkio/shlink-config": "^3.0", "shlinkio/shlink-config": "^3.0",
"shlinkio/shlink-event-dispatcher": "^4.1", "shlinkio/shlink-event-dispatcher": "^4.1",
"shlinkio/shlink-importer": "^5.3.2", "shlinkio/shlink-importer": "^5.3.2",
"shlinkio/shlink-installer": "dev-develop#11e66d8 as 9.2", "shlinkio/shlink-installer": "dev-develop#ccda72e as 9.2",
"shlinkio/shlink-ip-geolocation": "^4.0", "shlinkio/shlink-ip-geolocation": "^4.0",
"shlinkio/shlink-json": "^1.1", "shlinkio/shlink-json": "^1.1",
"spiral/roadrunner": "^2024.1", "spiral/roadrunner": "^2024.1",

View File

@ -45,7 +45,8 @@ return [
Option\UrlShortener\EnableMultiSegmentSlugsConfigOption::class, Option\UrlShortener\EnableMultiSegmentSlugsConfigOption::class,
Option\UrlShortener\EnableTrailingSlashConfigOption::class, Option\UrlShortener\EnableTrailingSlashConfigOption::class,
Option\UrlShortener\ShortUrlModeConfigOption::class, Option\UrlShortener\ShortUrlModeConfigOption::class,
Option\UrlShortener\RobotsAllowAllShortUrlsConfigOption::class, Option\Robots\RobotsAllowAllShortUrlsConfigOption::class,
Option\Robots\RobotsUserAgentsConfigOption::class,
Option\Tracking\IpAnonymizationConfigOption::class, Option\Tracking\IpAnonymizationConfigOption::class,
Option\Tracking\OrphanVisitsTrackingConfigOption::class, Option\Tracking\OrphanVisitsTrackingConfigOption::class,
Option\Tracking\DisableTrackParamConfigOption::class, Option\Tracking\DisableTrackParamConfigOption::class,

View File

@ -8,6 +8,7 @@ return [
'robots' => [ 'robots' => [
'allow-all-short-urls' => (bool) Config\EnvVars::ROBOTS_ALLOW_ALL_SHORT_URLS->loadFromEnv(false), 'allow-all-short-urls' => (bool) Config\EnvVars::ROBOTS_ALLOW_ALL_SHORT_URLS->loadFromEnv(false),
'user-agents' => splitByComma(Config\EnvVars::ROBOTS_USER_AGENTS->loadFromEnv()),
], ],
]; ];

View File

@ -4,40 +4,35 @@ declare(strict_types=1);
use Shlinkio\Shlink\Core\Config\EnvVars; use Shlinkio\Shlink\Core\Config\EnvVars;
return (static function (): array { use function Shlinkio\Shlink\Core\splitByComma;
/** @var string|null $disableTrackingFrom */
$disableTrackingFrom = EnvVars::DISABLE_TRACKING_FROM->loadFromEnv();
return [ return [
'tracking' => [ 'tracking' => [
// Tells if IP addresses should be anonymized before persisting, to fulfil data protection regulations // Tells if IP addresses should be anonymized before persisting, to fulfil data protection regulations
// This applies only if IP address tracking is enabled // This applies only if IP address tracking is enabled
'anonymize_remote_addr' => (bool) EnvVars::ANONYMIZE_REMOTE_ADDR->loadFromEnv(true), 'anonymize_remote_addr' => (bool) EnvVars::ANONYMIZE_REMOTE_ADDR->loadFromEnv(true),
// Tells if visits to not-found URLs should be tracked. The disable_tracking option takes precedence // Tells if visits to not-found URLs should be tracked. The disable_tracking option takes precedence
'track_orphan_visits' => (bool) EnvVars::TRACK_ORPHAN_VISITS->loadFromEnv(true), 'track_orphan_visits' => (bool) EnvVars::TRACK_ORPHAN_VISITS->loadFromEnv(true),
// A query param that, if provided, will disable tracking of one particular visit. Always takes precedence // A query param that, if provided, will disable tracking of one particular visit. Always takes precedence
'disable_track_param' => EnvVars::DISABLE_TRACK_PARAM->loadFromEnv(), 'disable_track_param' => EnvVars::DISABLE_TRACK_PARAM->loadFromEnv(),
// If true, visits will not be tracked at all // If true, visits will not be tracked at all
'disable_tracking' => (bool) EnvVars::DISABLE_TRACKING->loadFromEnv(false), 'disable_tracking' => (bool) EnvVars::DISABLE_TRACKING->loadFromEnv(false),
// If true, visits will be tracked, but neither the IP address, nor the location will be resolved // If true, visits will be tracked, but neither the IP address, nor the location will be resolved
'disable_ip_tracking' => (bool) EnvVars::DISABLE_IP_TRACKING->loadFromEnv(false), 'disable_ip_tracking' => (bool) EnvVars::DISABLE_IP_TRACKING->loadFromEnv(false),
// If true, the referrer will not be tracked // If true, the referrer will not be tracked
'disable_referrer_tracking' => (bool) EnvVars::DISABLE_REFERRER_TRACKING->loadFromEnv(false), 'disable_referrer_tracking' => (bool) EnvVars::DISABLE_REFERRER_TRACKING->loadFromEnv(false),
// If true, the user agent will not be tracked // If true, the user agent will not be tracked
'disable_ua_tracking' => (bool) EnvVars::DISABLE_UA_TRACKING->loadFromEnv(false), 'disable_ua_tracking' => (bool) EnvVars::DISABLE_UA_TRACKING->loadFromEnv(false),
// A list of IP addresses, patterns or CIDR blocks from which tracking is disabled by default // A list of IP addresses, patterns or CIDR blocks from which tracking is disabled by default
'disable_tracking_from' => $disableTrackingFrom === null 'disable_tracking_from' => splitByComma(EnvVars::DISABLE_TRACKING_FROM->loadFromEnv()),
? [] ],
: array_map(trim(...), explode(',', $disableTrackingFrom)),
],
]; ];
})();

View File

@ -32,6 +32,7 @@ return [
Options\TrackingOptions::class => [ValinorConfigFactory::class, 'config.tracking'], Options\TrackingOptions::class => [ValinorConfigFactory::class, 'config.tracking'],
Options\QrCodeOptions::class => [ValinorConfigFactory::class, 'config.qr_codes'], Options\QrCodeOptions::class => [ValinorConfigFactory::class, 'config.qr_codes'],
Options\RabbitMqOptions::class => [ValinorConfigFactory::class, 'config.rabbitmq'], Options\RabbitMqOptions::class => [ValinorConfigFactory::class, 'config.rabbitmq'],
Options\RobotsOptions::class => [ValinorConfigFactory::class, 'config.robots'],
RedirectRule\ShortUrlRedirectRuleService::class => ConfigAbstractFactory::class, RedirectRule\ShortUrlRedirectRuleService::class => ConfigAbstractFactory::class,
RedirectRule\ShortUrlRedirectionResolver::class => ConfigAbstractFactory::class, RedirectRule\ShortUrlRedirectionResolver::class => ConfigAbstractFactory::class,
@ -189,7 +190,7 @@ return [
'Logger_Shlink', 'Logger_Shlink',
Options\QrCodeOptions::class, Options\QrCodeOptions::class,
], ],
Action\RobotsAction::class => [Crawling\CrawlingHelper::class, 'config.robots.allow-all-short-urls'], Action\RobotsAction::class => [Crawling\CrawlingHelper::class, Options\RobotsOptions::class],
ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => [ ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => [
'em', 'em',

View File

@ -260,3 +260,16 @@ function enumToString(string $enum): string
{ {
return sprintf('["%s"]', implode('", "', enumValues($enum))); return sprintf('["%s"]', implode('", "', enumValues($enum)));
} }
/**
 * Split provided string by comma and return a list of the trimmed, non-empty results.
 *
 * An empty array is returned if provided value is null, empty, or made only of whitespace and commas.
 * Empty segments, like the ones produced by leading, trailing or consecutive commas, are discarded, so that
 * consumers never receive blank entries.
 *
 * @return string[]
 */
function splitByComma(?string $value): array
{
    if ($value === null) {
        return [];
    }

    $parts = [];
    foreach (explode(',', $value) as $part) {
        $trimmed = trim($part);

        // Skip blank segments instead of returning them as empty strings
        if ($trimmed !== '') {
            $parts[] = $trimmed;
        }
    }

    return $parts;
}

View File

@ -10,6 +10,7 @@ use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface; use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\RequestHandlerInterface; use Psr\Http\Server\RequestHandlerInterface;
use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface; use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
use Shlinkio\Shlink\Core\Options\RobotsOptions;
use function sprintf; use function sprintf;
@ -17,7 +18,7 @@ use const PHP_EOL;
readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterface readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterface
{ {
public function __construct(private CrawlingHelperInterface $crawlingHelper, private bool $allowAllShortUrls) public function __construct(private CrawlingHelperInterface $crawlingHelper, private RobotsOptions $robotsOptions)
{ {
} }
@ -33,11 +34,15 @@ readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterf
# For more information about the robots.txt standard, see: # For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html # https://www.robotstxt.org/orig.html
User-agent: *
ROBOTS; ROBOTS;
if ($this->allowAllShortUrls) { $userAgents = $this->robotsOptions->hasUserAgents() ? $this->robotsOptions->userAgents : ['*'];
foreach ($userAgents as $userAgent) {
yield sprintf('User-agent: %s%s', $userAgent, PHP_EOL);
}
if ($this->robotsOptions->allowAllShortUrls) {
// Disallow rest URLs, but allow all short codes // Disallow rest URLs, but allow all short codes
yield 'Disallow: /rest/'; yield 'Disallow: /rest/';
return; return;

View File

@ -71,6 +71,7 @@ enum EnvVars: string
case REDIRECT_APPEND_EXTRA_PATH = 'REDIRECT_APPEND_EXTRA_PATH'; case REDIRECT_APPEND_EXTRA_PATH = 'REDIRECT_APPEND_EXTRA_PATH';
case MULTI_SEGMENT_SLUGS_ENABLED = 'MULTI_SEGMENT_SLUGS_ENABLED'; case MULTI_SEGMENT_SLUGS_ENABLED = 'MULTI_SEGMENT_SLUGS_ENABLED';
case ROBOTS_ALLOW_ALL_SHORT_URLS = 'ROBOTS_ALLOW_ALL_SHORT_URLS'; case ROBOTS_ALLOW_ALL_SHORT_URLS = 'ROBOTS_ALLOW_ALL_SHORT_URLS';
case ROBOTS_USER_AGENTS = 'ROBOTS_USER_AGENTS';
case TIMEZONE = 'TIMEZONE'; case TIMEZONE = 'TIMEZONE';
case MEMORY_LIMIT = 'MEMORY_LIMIT'; case MEMORY_LIMIT = 'MEMORY_LIMIT';

View File

@ -0,0 +1,22 @@
<?php
declare(strict_types=1);
namespace Shlinkio\Shlink\Core\Options;
use function count;
final readonly class RobotsOptions
{
public function __construct(
public bool $allowAllShortUrls = false,
/** @var string[] */
public array $userAgents = [],
) {
}
public function hasUserAgents(): bool
{
return count($this->userAgents) > 0;
}
}

View File

@ -11,6 +11,7 @@ use PHPUnit\Framework\MockObject\MockObject;
use PHPUnit\Framework\TestCase; use PHPUnit\Framework\TestCase;
use Shlinkio\Shlink\Core\Action\RobotsAction; use Shlinkio\Shlink\Core\Action\RobotsAction;
use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface; use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
use Shlinkio\Shlink\Core\Options\RobotsOptions;
class RobotsActionTest extends TestCase class RobotsActionTest extends TestCase
{ {
@ -24,15 +25,15 @@ class RobotsActionTest extends TestCase
#[Test, DataProvider('provideShortCodes')] #[Test, DataProvider('provideShortCodes')]
public function buildsRobotsLinesFromCrawlableShortCodes( public function buildsRobotsLinesFromCrawlableShortCodes(
array $shortCodes, array $shortCodes,
bool $allowAllShortUrls, RobotsOptions $options,
string $expected, string $expected,
): void { ): void {
$this->helper $this->helper
->expects($allowAllShortUrls ? $this->never() : $this->once()) ->expects($options->allowAllShortUrls ? $this->never() : $this->once())
->method('listCrawlableShortCodes') ->method('listCrawlableShortCodes')
->willReturn($shortCodes); ->willReturn($shortCodes);
$response = $this->action($allowAllShortUrls)->handle(ServerRequestFactory::fromGlobals()); $response = $this->action($options)->handle(ServerRequestFactory::fromGlobals());
self::assertEquals(200, $response->getStatusCode()); self::assertEquals(200, $response->getStatusCode());
self::assertEquals($expected, $response->getBody()->__toString()); self::assertEquals($expected, $response->getBody()->__toString());
@ -41,7 +42,7 @@ class RobotsActionTest extends TestCase
public static function provideShortCodes(): iterable public static function provideShortCodes(): iterable
{ {
yield 'three short codes' => [['foo', 'bar', 'baz'], false, <<<ROBOTS yield 'three short codes' => [['foo', 'bar', 'baz'], new RobotsOptions(), <<<ROBOTS
# For more information about the robots.txt standard, see: # For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html # https://www.robotstxt.org/orig.html
@ -51,7 +52,7 @@ class RobotsActionTest extends TestCase
Allow: /baz Allow: /baz
Disallow: / Disallow: /
ROBOTS]; ROBOTS];
yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], false, <<<ROBOTS yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], new RobotsOptions(), <<<ROBOTS
# For more information about the robots.txt standard, see: # For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html # https://www.robotstxt.org/orig.html
@ -63,31 +64,43 @@ class RobotsActionTest extends TestCase
Allow: /baz Allow: /baz
Disallow: / Disallow: /
ROBOTS]; ROBOTS];
yield 'no short codes' => [[], false, <<<ROBOTS yield 'no short codes' => [[], new RobotsOptions(), <<<ROBOTS
# For more information about the robots.txt standard, see: # For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html # https://www.robotstxt.org/orig.html
User-agent: * User-agent: *
Disallow: / Disallow: /
ROBOTS]; ROBOTS];
yield 'three short codes and allow all short urls' => [['foo', 'bar', 'some'], true, <<<ROBOTS yield 'three short codes and allow all short urls' => [
['foo', 'bar', 'some'],
new RobotsOptions(allowAllShortUrls: true),
<<<ROBOTS
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
User-agent: *
Disallow: /rest/
ROBOTS,
];
yield 'no short codes and allow all short urls' => [[], new RobotsOptions(allowAllShortUrls: true), <<<ROBOTS
# For more information about the robots.txt standard, see: # For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html # https://www.robotstxt.org/orig.html
User-agent: * User-agent: *
Disallow: /rest/ Disallow: /rest/
ROBOTS]; ROBOTS];
yield 'no short codes and allow all short urls' => [[], true, <<<ROBOTS yield 'allow user agents' => [[], new RobotsOptions(userAgents: ['foo', 'bar']), <<<ROBOTS
# For more information about the robots.txt standard, see: # For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html # https://www.robotstxt.org/orig.html
User-agent: * User-agent: foo
Disallow: /rest/ User-agent: bar
Disallow: /
ROBOTS]; ROBOTS];
} }
private function action(bool $allowAllShortUrls = false): RobotsAction private function action(RobotsOptions $options): RobotsAction
{ {
return new RobotsAction($this->helper, allowAllShortUrls: $allowAllShortUrls); return new RobotsAction($this->helper, $options);
} }
} }