mirror of
https://github.com/shlinkio/shlink.git
synced 2024-12-22 15:13:59 -06:00
Add option to customize user agents in robots.txt
This commit is contained in:
parent
76c42bc17c
commit
4b52c92e97
@ -6,7 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this
|
||||
|
||||
## [Unreleased]
|
||||
### Added
|
||||
* [#2018](https://github.com/shlinkio/shlink/issues/2018) Add option to allow all short URLs to be unconditionally crawlable in robots.txt, via `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var, or config options.
|
||||
* [#2018](https://github.com/shlinkio/shlink/issues/2018) Add option to allow all short URLs to be unconditionally crawlable in robots.txt, via `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var, or config option.
|
||||
* [#2109](https://github.com/shlinkio/shlink/issues/2109) Add option to customize user agents in robots.txt, via `ROBOTS_USER_AGENTS=foo,bar,baz` env var, or config option.
|
||||
|
||||
### Changed
|
||||
* [#2096](https://github.com/shlinkio/shlink/issues/2096) Update to RoadRunner 2024.
|
||||
|
@ -8,6 +8,7 @@ return [
|
||||
|
||||
'robots' => [
|
||||
'allow-all-short-urls' => (bool) Config\EnvVars::ROBOTS_ALLOW_ALL_SHORT_URLS->loadFromEnv(false),
|
||||
'user-agents' => splitByComma(Config\EnvVars::ROBOTS_USER_AGENTS->loadFromEnv()),
|
||||
],
|
||||
|
||||
];
|
||||
|
@ -4,40 +4,35 @@ declare(strict_types=1);
|
||||
|
||||
use Shlinkio\Shlink\Core\Config\EnvVars;
|
||||
|
||||
return (static function (): array {
|
||||
/** @var string|null $disableTrackingFrom */
|
||||
$disableTrackingFrom = EnvVars::DISABLE_TRACKING_FROM->loadFromEnv();
|
||||
use function Shlinkio\Shlink\Core\splitByComma;
|
||||
|
||||
return [
|
||||
return [
|
||||
|
||||
'tracking' => [
|
||||
// Tells if IP addresses should be anonymized before persisting, to fulfil data protection regulations
|
||||
// This applies only if IP address tracking is enabled
|
||||
'anonymize_remote_addr' => (bool) EnvVars::ANONYMIZE_REMOTE_ADDR->loadFromEnv(true),
|
||||
'tracking' => [
|
||||
// Tells if IP addresses should be anonymized before persisting, to fulfil data protection regulations
|
||||
// This applies only if IP address tracking is enabled
|
||||
'anonymize_remote_addr' => (bool) EnvVars::ANONYMIZE_REMOTE_ADDR->loadFromEnv(true),
|
||||
|
||||
// Tells if visits to not-found URLs should be tracked. The disable_tracking option takes precedence
|
||||
'track_orphan_visits' => (bool) EnvVars::TRACK_ORPHAN_VISITS->loadFromEnv(true),
|
||||
// Tells if visits to not-found URLs should be tracked. The disable_tracking option takes precedence
|
||||
'track_orphan_visits' => (bool) EnvVars::TRACK_ORPHAN_VISITS->loadFromEnv(true),
|
||||
|
||||
// A query param that, if provided, will disable tracking of one particular visit. Always takes precedence
|
||||
'disable_track_param' => EnvVars::DISABLE_TRACK_PARAM->loadFromEnv(),
|
||||
// A query param that, if provided, will disable tracking of one particular visit. Always takes precedence
|
||||
'disable_track_param' => EnvVars::DISABLE_TRACK_PARAM->loadFromEnv(),
|
||||
|
||||
// If true, visits will not be tracked at all
|
||||
'disable_tracking' => (bool) EnvVars::DISABLE_TRACKING->loadFromEnv(false),
|
||||
// If true, visits will not be tracked at all
|
||||
'disable_tracking' => (bool) EnvVars::DISABLE_TRACKING->loadFromEnv(false),
|
||||
|
||||
// If true, visits will be tracked, but neither the IP address, nor the location will be resolved
|
||||
'disable_ip_tracking' => (bool) EnvVars::DISABLE_IP_TRACKING->loadFromEnv(false),
|
||||
// If true, visits will be tracked, but neither the IP address, nor the location will be resolved
|
||||
'disable_ip_tracking' => (bool) EnvVars::DISABLE_IP_TRACKING->loadFromEnv(false),
|
||||
|
||||
// If true, the referrer will not be tracked
|
||||
'disable_referrer_tracking' => (bool) EnvVars::DISABLE_REFERRER_TRACKING->loadFromEnv(false),
|
||||
// If true, the referrer will not be tracked
|
||||
'disable_referrer_tracking' => (bool) EnvVars::DISABLE_REFERRER_TRACKING->loadFromEnv(false),
|
||||
|
||||
// If true, the user agent will not be tracked
|
||||
'disable_ua_tracking' => (bool) EnvVars::DISABLE_UA_TRACKING->loadFromEnv(false),
|
||||
// If true, the user agent will not be tracked
|
||||
'disable_ua_tracking' => (bool) EnvVars::DISABLE_UA_TRACKING->loadFromEnv(false),
|
||||
|
||||
// A list of IP addresses, patterns or CIDR blocks from which tracking is disabled by default
|
||||
'disable_tracking_from' => $disableTrackingFrom === null
|
||||
? []
|
||||
: array_map(trim(...), explode(',', $disableTrackingFrom)),
|
||||
],
|
||||
// A list of IP addresses, patterns or CIDR blocks from which tracking is disabled by default
|
||||
'disable_tracking_from' => splitByComma(EnvVars::DISABLE_TRACKING_FROM->loadFromEnv()),
|
||||
],
|
||||
|
||||
];
|
||||
})();
|
||||
];
|
||||
|
@ -32,6 +32,7 @@ return [
|
||||
Options\TrackingOptions::class => [ValinorConfigFactory::class, 'config.tracking'],
|
||||
Options\QrCodeOptions::class => [ValinorConfigFactory::class, 'config.qr_codes'],
|
||||
Options\RabbitMqOptions::class => [ValinorConfigFactory::class, 'config.rabbitmq'],
|
||||
Options\RobotsOptions::class => [ValinorConfigFactory::class, 'config.robots'],
|
||||
|
||||
RedirectRule\ShortUrlRedirectRuleService::class => ConfigAbstractFactory::class,
|
||||
RedirectRule\ShortUrlRedirectionResolver::class => ConfigAbstractFactory::class,
|
||||
@ -189,7 +190,7 @@ return [
|
||||
'Logger_Shlink',
|
||||
Options\QrCodeOptions::class,
|
||||
],
|
||||
Action\RobotsAction::class => [Crawling\CrawlingHelper::class, 'config.robots.allow-all-short-urls'],
|
||||
Action\RobotsAction::class => [Crawling\CrawlingHelper::class, Options\RobotsOptions::class],
|
||||
|
||||
ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => [
|
||||
'em',
|
||||
|
@ -260,3 +260,16 @@ function enumToString(string $enum): string
|
||||
{
|
||||
return sprintf('["%s"]', implode('", "', enumValues($enum)));
|
||||
}
|
||||
|
||||
/**
 * Split provided string by comma and return a list with the trimmed results.
 *
 * Blank segments, as produced by leading, trailing or consecutive commas (for example "foo,,bar,"), are discarded,
 * so the returned list never contains empty strings.
 * An empty array is returned if provided value is null, empty, or made only of whitespace and commas.
 *
 * @return string[]
 */
function splitByComma(?string $value): array
{
    if ($value === null) {
        return [];
    }

    $parts = [];
    foreach (explode(',', $value) as $segment) {
        $trimmed = trim($segment);

        // Skip blank segments, so that values like "foo,,bar," do not yield empty entries
        if ($trimmed !== '') {
            $parts[] = $trimmed;
        }
    }

    return $parts;
}
|
||||
|
@ -10,6 +10,7 @@ use Psr\Http\Message\ResponseInterface;
|
||||
use Psr\Http\Message\ServerRequestInterface;
|
||||
use Psr\Http\Server\RequestHandlerInterface;
|
||||
use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
|
||||
use Shlinkio\Shlink\Core\Options\RobotsOptions;
|
||||
|
||||
use function sprintf;
|
||||
|
||||
@ -17,7 +18,7 @@ use const PHP_EOL;
|
||||
|
||||
readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterface
|
||||
{
|
||||
public function __construct(private CrawlingHelperInterface $crawlingHelper, private bool $allowAllShortUrls)
|
||||
public function __construct(private CrawlingHelperInterface $crawlingHelper, private RobotsOptions $robotsOptions)
|
||||
{
|
||||
}
|
||||
|
||||
@ -33,11 +34,15 @@ readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterf
|
||||
# For more information about the robots.txt standard, see:
|
||||
# https://www.robotstxt.org/orig.html
|
||||
|
||||
User-agent: *
|
||||
|
||||
ROBOTS;
|
||||
|
||||
if ($this->allowAllShortUrls) {
|
||||
$userAgents = $this->robotsOptions->hasUserAgents() ? $this->robotsOptions->userAgents : ['*'];
|
||||
foreach ($userAgents as $userAgent) {
|
||||
yield sprintf('User-agent: %s%s', $userAgent, PHP_EOL);
|
||||
}
|
||||
|
||||
if ($this->robotsOptions->allowAllShortUrls) {
|
||||
// Disallow rest URLs, but allow all short codes
|
||||
yield 'Disallow: /rest/';
|
||||
return;
|
||||
|
@ -71,6 +71,7 @@ enum EnvVars: string
|
||||
case REDIRECT_APPEND_EXTRA_PATH = 'REDIRECT_APPEND_EXTRA_PATH';
|
||||
case MULTI_SEGMENT_SLUGS_ENABLED = 'MULTI_SEGMENT_SLUGS_ENABLED';
|
||||
case ROBOTS_ALLOW_ALL_SHORT_URLS = 'ROBOTS_ALLOW_ALL_SHORT_URLS';
|
||||
case ROBOTS_USER_AGENTS = 'ROBOTS_USER_AGENTS';
|
||||
case TIMEZONE = 'TIMEZONE';
|
||||
case MEMORY_LIMIT = 'MEMORY_LIMIT';
|
||||
|
||||
|
22
module/Core/src/Options/RobotsOptions.php
Normal file
22
module/Core/src/Options/RobotsOptions.php
Normal file
@ -0,0 +1,22 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Shlinkio\Shlink\Core\Options;
|
||||
|
||||
use function count;
|
||||
|
||||
/**
 * Immutable options holding the robots-related configuration values.
 */
final readonly class RobotsOptions
{
    /**
     * Both arguments are optional: short URLs are not all allowed by default, and the list of user agents
     * starts empty.
     */
    public function __construct(
        public bool $allowAllShortUrls = false,
        /** @var string[] */
        public array $userAgents = [],
    ) {
    }

    /**
     * Tells if at least one user agent has been provided.
     */
    public function hasUserAgents(): bool
    {
        return count($this->userAgents) !== 0;
    }
}
|
@ -11,6 +11,7 @@ use PHPUnit\Framework\MockObject\MockObject;
|
||||
use PHPUnit\Framework\TestCase;
|
||||
use Shlinkio\Shlink\Core\Action\RobotsAction;
|
||||
use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
|
||||
use Shlinkio\Shlink\Core\Options\RobotsOptions;
|
||||
|
||||
class RobotsActionTest extends TestCase
|
||||
{
|
||||
@ -24,15 +25,15 @@ class RobotsActionTest extends TestCase
|
||||
#[Test, DataProvider('provideShortCodes')]
|
||||
public function buildsRobotsLinesFromCrawlableShortCodes(
|
||||
array $shortCodes,
|
||||
bool $allowAllShortUrls,
|
||||
RobotsOptions $options,
|
||||
string $expected,
|
||||
): void {
|
||||
$this->helper
|
||||
->expects($allowAllShortUrls ? $this->never() : $this->once())
|
||||
->expects($options->allowAllShortUrls ? $this->never() : $this->once())
|
||||
->method('listCrawlableShortCodes')
|
||||
->willReturn($shortCodes);
|
||||
|
||||
$response = $this->action($allowAllShortUrls)->handle(ServerRequestFactory::fromGlobals());
|
||||
$response = $this->action($options)->handle(ServerRequestFactory::fromGlobals());
|
||||
|
||||
self::assertEquals(200, $response->getStatusCode());
|
||||
self::assertEquals($expected, $response->getBody()->__toString());
|
||||
@ -41,7 +42,7 @@ class RobotsActionTest extends TestCase
|
||||
|
||||
public static function provideShortCodes(): iterable
|
||||
{
|
||||
yield 'three short codes' => [['foo', 'bar', 'baz'], false, <<<ROBOTS
|
||||
yield 'three short codes' => [['foo', 'bar', 'baz'], new RobotsOptions(), <<<ROBOTS
|
||||
# For more information about the robots.txt standard, see:
|
||||
# https://www.robotstxt.org/orig.html
|
||||
|
||||
@ -51,7 +52,7 @@ class RobotsActionTest extends TestCase
|
||||
Allow: /baz
|
||||
Disallow: /
|
||||
ROBOTS];
|
||||
yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], false, <<<ROBOTS
|
||||
yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], new RobotsOptions(), <<<ROBOTS
|
||||
# For more information about the robots.txt standard, see:
|
||||
# https://www.robotstxt.org/orig.html
|
||||
|
||||
@ -63,31 +64,43 @@ class RobotsActionTest extends TestCase
|
||||
Allow: /baz
|
||||
Disallow: /
|
||||
ROBOTS];
|
||||
yield 'no short codes' => [[], false, <<<ROBOTS
|
||||
yield 'no short codes' => [[], new RobotsOptions(), <<<ROBOTS
|
||||
# For more information about the robots.txt standard, see:
|
||||
# https://www.robotstxt.org/orig.html
|
||||
|
||||
User-agent: *
|
||||
Disallow: /
|
||||
ROBOTS];
|
||||
yield 'three short codes and allow all short urls' => [['foo', 'bar', 'some'], true, <<<ROBOTS
|
||||
yield 'three short codes and allow all short urls' => [
|
||||
['foo', 'bar', 'some'],
|
||||
new RobotsOptions(allowAllShortUrls: true),
|
||||
<<<ROBOTS
|
||||
# For more information about the robots.txt standard, see:
|
||||
# https://www.robotstxt.org/orig.html
|
||||
|
||||
User-agent: *
|
||||
Disallow: /rest/
|
||||
ROBOTS,
|
||||
];
|
||||
yield 'no short codes and allow all short urls' => [[], new RobotsOptions(allowAllShortUrls: true), <<<ROBOTS
|
||||
# For more information about the robots.txt standard, see:
|
||||
# https://www.robotstxt.org/orig.html
|
||||
|
||||
User-agent: *
|
||||
Disallow: /rest/
|
||||
ROBOTS];
|
||||
yield 'no short codes and allow all short urls' => [[], true, <<<ROBOTS
|
||||
yield 'allow user agents' => [[], new RobotsOptions(userAgents: ['foo', 'bar']), <<<ROBOTS
|
||||
# For more information about the robots.txt standard, see:
|
||||
# https://www.robotstxt.org/orig.html
|
||||
|
||||
User-agent: *
|
||||
Disallow: /rest/
|
||||
User-agent: foo
|
||||
User-agent: bar
|
||||
Disallow: /
|
||||
ROBOTS];
|
||||
}
|
||||
|
||||
private function action(bool $allowAllShortUrls = false): RobotsAction
|
||||
private function action(RobotsOptions $options): RobotsAction
|
||||
{
|
||||
return new RobotsAction($this->helper, allowAllShortUrls: $allowAllShortUrls);
|
||||
return new RobotsAction($this->helper, $options);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user