mirror of
https://github.com/shlinkio/shlink.git
synced 2025-01-11 08:32:02 -06:00
Merge pull request #2149 from acelaya-forks/feature/robots-user-agents
Add option to customize user agents in robots.txt
This commit is contained in:
commit
b6b2530cb6
@ -6,7 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this
|
|||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
### Added
|
### Added
|
||||||
* [#2018](https://github.com/shlinkio/shlink/issues/2018) Add option to allow all short URLs to be unconditionally crawlable in robots.txt, via `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var, or config options.
|
* [#2018](https://github.com/shlinkio/shlink/issues/2018) Add option to allow all short URLs to be unconditionally crawlable in robots.txt, via `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var, or config option.
|
||||||
|
* [#2109](https://github.com/shlinkio/shlink/issues/2109) Add option to customize user agents in robots.txt, via `ROBOTS_USER_AGENTS=foo,bar,baz` env var, or config option.
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
* [#2096](https://github.com/shlinkio/shlink/issues/2096) Update to RoadRunner 2024.
|
* [#2096](https://github.com/shlinkio/shlink/issues/2096) Update to RoadRunner 2024.
|
||||||
|
@ -48,7 +48,7 @@
|
|||||||
"shlinkio/shlink-config": "^3.0",
|
"shlinkio/shlink-config": "^3.0",
|
||||||
"shlinkio/shlink-event-dispatcher": "^4.1",
|
"shlinkio/shlink-event-dispatcher": "^4.1",
|
||||||
"shlinkio/shlink-importer": "^5.3.2",
|
"shlinkio/shlink-importer": "^5.3.2",
|
||||||
"shlinkio/shlink-installer": "dev-develop#11e66d8 as 9.2",
|
"shlinkio/shlink-installer": "dev-develop#ccda72e as 9.2",
|
||||||
"shlinkio/shlink-ip-geolocation": "^4.0",
|
"shlinkio/shlink-ip-geolocation": "^4.0",
|
||||||
"shlinkio/shlink-json": "^1.1",
|
"shlinkio/shlink-json": "^1.1",
|
||||||
"spiral/roadrunner": "^2024.1",
|
"spiral/roadrunner": "^2024.1",
|
||||||
|
@ -45,7 +45,8 @@ return [
|
|||||||
Option\UrlShortener\EnableMultiSegmentSlugsConfigOption::class,
|
Option\UrlShortener\EnableMultiSegmentSlugsConfigOption::class,
|
||||||
Option\UrlShortener\EnableTrailingSlashConfigOption::class,
|
Option\UrlShortener\EnableTrailingSlashConfigOption::class,
|
||||||
Option\UrlShortener\ShortUrlModeConfigOption::class,
|
Option\UrlShortener\ShortUrlModeConfigOption::class,
|
||||||
Option\UrlShortener\RobotsAllowAllShortUrlsConfigOption::class,
|
Option\Robots\RobotsAllowAllShortUrlsConfigOption::class,
|
||||||
|
Option\Robots\RobotsUserAgentsConfigOption::class,
|
||||||
Option\Tracking\IpAnonymizationConfigOption::class,
|
Option\Tracking\IpAnonymizationConfigOption::class,
|
||||||
Option\Tracking\OrphanVisitsTrackingConfigOption::class,
|
Option\Tracking\OrphanVisitsTrackingConfigOption::class,
|
||||||
Option\Tracking\DisableTrackParamConfigOption::class,
|
Option\Tracking\DisableTrackParamConfigOption::class,
|
||||||
|
@ -8,6 +8,7 @@ return [
|
|||||||
|
|
||||||
'robots' => [
|
'robots' => [
|
||||||
'allow-all-short-urls' => (bool) Config\EnvVars::ROBOTS_ALLOW_ALL_SHORT_URLS->loadFromEnv(false),
|
'allow-all-short-urls' => (bool) Config\EnvVars::ROBOTS_ALLOW_ALL_SHORT_URLS->loadFromEnv(false),
|
||||||
|
'user-agents' => splitByComma(Config\EnvVars::ROBOTS_USER_AGENTS->loadFromEnv()),
|
||||||
],
|
],
|
||||||
|
|
||||||
];
|
];
|
||||||
|
@ -4,40 +4,35 @@ declare(strict_types=1);
|
|||||||
|
|
||||||
use Shlinkio\Shlink\Core\Config\EnvVars;
|
use Shlinkio\Shlink\Core\Config\EnvVars;
|
||||||
|
|
||||||
return (static function (): array {
|
use function Shlinkio\Shlink\Core\splitByComma;
|
||||||
/** @var string|null $disableTrackingFrom */
|
|
||||||
$disableTrackingFrom = EnvVars::DISABLE_TRACKING_FROM->loadFromEnv();
|
|
||||||
|
|
||||||
return [
|
return [
|
||||||
|
|
||||||
'tracking' => [
|
'tracking' => [
|
||||||
// Tells if IP addresses should be anonymized before persisting, to fulfil data protection regulations
|
// Tells if IP addresses should be anonymized before persisting, to fulfil data protection regulations
|
||||||
// This applies only if IP address tracking is enabled
|
// This applies only if IP address tracking is enabled
|
||||||
'anonymize_remote_addr' => (bool) EnvVars::ANONYMIZE_REMOTE_ADDR->loadFromEnv(true),
|
'anonymize_remote_addr' => (bool) EnvVars::ANONYMIZE_REMOTE_ADDR->loadFromEnv(true),
|
||||||
|
|
||||||
// Tells if visits to not-found URLs should be tracked. The disable_tracking option takes precedence
|
// Tells if visits to not-found URLs should be tracked. The disable_tracking option takes precedence
|
||||||
'track_orphan_visits' => (bool) EnvVars::TRACK_ORPHAN_VISITS->loadFromEnv(true),
|
'track_orphan_visits' => (bool) EnvVars::TRACK_ORPHAN_VISITS->loadFromEnv(true),
|
||||||
|
|
||||||
// A query param that, if provided, will disable tracking of one particular visit. Always takes precedence
|
// A query param that, if provided, will disable tracking of one particular visit. Always takes precedence
|
||||||
'disable_track_param' => EnvVars::DISABLE_TRACK_PARAM->loadFromEnv(),
|
'disable_track_param' => EnvVars::DISABLE_TRACK_PARAM->loadFromEnv(),
|
||||||
|
|
||||||
// If true, visits will not be tracked at all
|
// If true, visits will not be tracked at all
|
||||||
'disable_tracking' => (bool) EnvVars::DISABLE_TRACKING->loadFromEnv(false),
|
'disable_tracking' => (bool) EnvVars::DISABLE_TRACKING->loadFromEnv(false),
|
||||||
|
|
||||||
// If true, visits will be tracked, but neither the IP address, nor the location will be resolved
|
// If true, visits will be tracked, but neither the IP address, nor the location will be resolved
|
||||||
'disable_ip_tracking' => (bool) EnvVars::DISABLE_IP_TRACKING->loadFromEnv(false),
|
'disable_ip_tracking' => (bool) EnvVars::DISABLE_IP_TRACKING->loadFromEnv(false),
|
||||||
|
|
||||||
// If true, the referrer will not be tracked
|
// If true, the referrer will not be tracked
|
||||||
'disable_referrer_tracking' => (bool) EnvVars::DISABLE_REFERRER_TRACKING->loadFromEnv(false),
|
'disable_referrer_tracking' => (bool) EnvVars::DISABLE_REFERRER_TRACKING->loadFromEnv(false),
|
||||||
|
|
||||||
// If true, the user agent will not be tracked
|
// If true, the user agent will not be tracked
|
||||||
'disable_ua_tracking' => (bool) EnvVars::DISABLE_UA_TRACKING->loadFromEnv(false),
|
'disable_ua_tracking' => (bool) EnvVars::DISABLE_UA_TRACKING->loadFromEnv(false),
|
||||||
|
|
||||||
// A list of IP addresses, patterns or CIDR blocks from which tracking is disabled by default
|
// A list of IP addresses, patterns or CIDR blocks from which tracking is disabled by default
|
||||||
'disable_tracking_from' => $disableTrackingFrom === null
|
'disable_tracking_from' => splitByComma(EnvVars::DISABLE_TRACKING_FROM->loadFromEnv()),
|
||||||
? []
|
],
|
||||||
: array_map(trim(...), explode(',', $disableTrackingFrom)),
|
|
||||||
],
|
|
||||||
|
|
||||||
];
|
];
|
||||||
})();
|
|
||||||
|
@ -32,6 +32,7 @@ return [
|
|||||||
Options\TrackingOptions::class => [ValinorConfigFactory::class, 'config.tracking'],
|
Options\TrackingOptions::class => [ValinorConfigFactory::class, 'config.tracking'],
|
||||||
Options\QrCodeOptions::class => [ValinorConfigFactory::class, 'config.qr_codes'],
|
Options\QrCodeOptions::class => [ValinorConfigFactory::class, 'config.qr_codes'],
|
||||||
Options\RabbitMqOptions::class => [ValinorConfigFactory::class, 'config.rabbitmq'],
|
Options\RabbitMqOptions::class => [ValinorConfigFactory::class, 'config.rabbitmq'],
|
||||||
|
Options\RobotsOptions::class => [ValinorConfigFactory::class, 'config.robots'],
|
||||||
|
|
||||||
RedirectRule\ShortUrlRedirectRuleService::class => ConfigAbstractFactory::class,
|
RedirectRule\ShortUrlRedirectRuleService::class => ConfigAbstractFactory::class,
|
||||||
RedirectRule\ShortUrlRedirectionResolver::class => ConfigAbstractFactory::class,
|
RedirectRule\ShortUrlRedirectionResolver::class => ConfigAbstractFactory::class,
|
||||||
@ -189,7 +190,7 @@ return [
|
|||||||
'Logger_Shlink',
|
'Logger_Shlink',
|
||||||
Options\QrCodeOptions::class,
|
Options\QrCodeOptions::class,
|
||||||
],
|
],
|
||||||
Action\RobotsAction::class => [Crawling\CrawlingHelper::class, 'config.robots.allow-all-short-urls'],
|
Action\RobotsAction::class => [Crawling\CrawlingHelper::class, Options\RobotsOptions::class],
|
||||||
|
|
||||||
ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => [
|
ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => [
|
||||||
'em',
|
'em',
|
||||||
|
@ -260,3 +260,16 @@ function enumToString(string $enum): string
|
|||||||
{
|
{
|
||||||
return sprintf('["%s"]', implode('", "', enumValues($enum)));
|
return sprintf('["%s"]', implode('", "', enumValues($enum)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Split provided string by comma and return a list of the trimmed results.
 *
 * Every segment is trimmed, and segments which end up empty (as produced by leading, trailing or
 * consecutive commas) are discarded, so that callers never receive blank entries.
 * An empty array is returned if provided value is null or contains no non-blank segments.
 *
 * @return string[]
 */
function splitByComma(?string $value): array
{
    if ($value === null) {
        return [];
    }

    $segments = [];
    foreach (explode(',', $value) as $segment) {
        $segment = trim($segment);

        // Skip blank segments, so that values like "foo,,bar" or "foo," do not yield empty entries.
        // Strict comparison is used on purpose, to preserve values like "0"
        if ($segment !== '') {
            $segments[] = $segment;
        }
    }

    return $segments;
}
|
||||||
|
@ -10,6 +10,7 @@ use Psr\Http\Message\ResponseInterface;
|
|||||||
use Psr\Http\Message\ServerRequestInterface;
|
use Psr\Http\Message\ServerRequestInterface;
|
||||||
use Psr\Http\Server\RequestHandlerInterface;
|
use Psr\Http\Server\RequestHandlerInterface;
|
||||||
use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
|
use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
|
||||||
|
use Shlinkio\Shlink\Core\Options\RobotsOptions;
|
||||||
|
|
||||||
use function sprintf;
|
use function sprintf;
|
||||||
|
|
||||||
@ -17,7 +18,7 @@ use const PHP_EOL;
|
|||||||
|
|
||||||
readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterface
|
readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterface
|
||||||
{
|
{
|
||||||
public function __construct(private CrawlingHelperInterface $crawlingHelper, private bool $allowAllShortUrls)
|
public function __construct(private CrawlingHelperInterface $crawlingHelper, private RobotsOptions $robotsOptions)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -33,11 +34,15 @@ readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterf
|
|||||||
# For more information about the robots.txt standard, see:
|
# For more information about the robots.txt standard, see:
|
||||||
# https://www.robotstxt.org/orig.html
|
# https://www.robotstxt.org/orig.html
|
||||||
|
|
||||||
User-agent: *
|
|
||||||
|
|
||||||
ROBOTS;
|
ROBOTS;
|
||||||
|
|
||||||
if ($this->allowAllShortUrls) {
|
$userAgents = $this->robotsOptions->hasUserAgents() ? $this->robotsOptions->userAgents : ['*'];
|
||||||
|
foreach ($userAgents as $userAgent) {
|
||||||
|
yield sprintf('User-agent: %s%s', $userAgent, PHP_EOL);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($this->robotsOptions->allowAllShortUrls) {
|
||||||
// Disallow rest URLs, but allow all short codes
|
// Disallow rest URLs, but allow all short codes
|
||||||
yield 'Disallow: /rest/';
|
yield 'Disallow: /rest/';
|
||||||
return;
|
return;
|
||||||
|
@ -71,6 +71,7 @@ enum EnvVars: string
|
|||||||
case REDIRECT_APPEND_EXTRA_PATH = 'REDIRECT_APPEND_EXTRA_PATH';
|
case REDIRECT_APPEND_EXTRA_PATH = 'REDIRECT_APPEND_EXTRA_PATH';
|
||||||
case MULTI_SEGMENT_SLUGS_ENABLED = 'MULTI_SEGMENT_SLUGS_ENABLED';
|
case MULTI_SEGMENT_SLUGS_ENABLED = 'MULTI_SEGMENT_SLUGS_ENABLED';
|
||||||
case ROBOTS_ALLOW_ALL_SHORT_URLS = 'ROBOTS_ALLOW_ALL_SHORT_URLS';
|
case ROBOTS_ALLOW_ALL_SHORT_URLS = 'ROBOTS_ALLOW_ALL_SHORT_URLS';
|
||||||
|
case ROBOTS_USER_AGENTS = 'ROBOTS_USER_AGENTS';
|
||||||
case TIMEZONE = 'TIMEZONE';
|
case TIMEZONE = 'TIMEZONE';
|
||||||
case MEMORY_LIMIT = 'MEMORY_LIMIT';
|
case MEMORY_LIMIT = 'MEMORY_LIMIT';
|
||||||
|
|
||||||
|
22
module/Core/src/Options/RobotsOptions.php
Normal file
22
module/Core/src/Options/RobotsOptions.php
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace Shlinkio\Shlink\Core\Options;
|
||||||
|
|
||||||
|
use function count;
|
||||||
|
|
||||||
|
/**
 * Immutable set of options consumed when building the robots.txt response.
 */
final readonly class RobotsOptions
{
    /**
     * @param bool $allowAllShortUrls Defaults to false when not provided
     * @param string[] $userAgents Defaults to an empty list when not provided
     */
    public function __construct(
        public bool $allowAllShortUrls = false,
        public array $userAgents = [],
    ) {
    }

    /**
     * Tells if the list of user agents contains at least one entry.
     */
    public function hasUserAgents(): bool
    {
        return count($this->userAgents) !== 0;
    }
}
|
@ -11,6 +11,7 @@ use PHPUnit\Framework\MockObject\MockObject;
|
|||||||
use PHPUnit\Framework\TestCase;
|
use PHPUnit\Framework\TestCase;
|
||||||
use Shlinkio\Shlink\Core\Action\RobotsAction;
|
use Shlinkio\Shlink\Core\Action\RobotsAction;
|
||||||
use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
|
use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
|
||||||
|
use Shlinkio\Shlink\Core\Options\RobotsOptions;
|
||||||
|
|
||||||
class RobotsActionTest extends TestCase
|
class RobotsActionTest extends TestCase
|
||||||
{
|
{
|
||||||
@ -24,15 +25,15 @@ class RobotsActionTest extends TestCase
|
|||||||
#[Test, DataProvider('provideShortCodes')]
|
#[Test, DataProvider('provideShortCodes')]
|
||||||
public function buildsRobotsLinesFromCrawlableShortCodes(
|
public function buildsRobotsLinesFromCrawlableShortCodes(
|
||||||
array $shortCodes,
|
array $shortCodes,
|
||||||
bool $allowAllShortUrls,
|
RobotsOptions $options,
|
||||||
string $expected,
|
string $expected,
|
||||||
): void {
|
): void {
|
||||||
$this->helper
|
$this->helper
|
||||||
->expects($allowAllShortUrls ? $this->never() : $this->once())
|
->expects($options->allowAllShortUrls ? $this->never() : $this->once())
|
||||||
->method('listCrawlableShortCodes')
|
->method('listCrawlableShortCodes')
|
||||||
->willReturn($shortCodes);
|
->willReturn($shortCodes);
|
||||||
|
|
||||||
$response = $this->action($allowAllShortUrls)->handle(ServerRequestFactory::fromGlobals());
|
$response = $this->action($options)->handle(ServerRequestFactory::fromGlobals());
|
||||||
|
|
||||||
self::assertEquals(200, $response->getStatusCode());
|
self::assertEquals(200, $response->getStatusCode());
|
||||||
self::assertEquals($expected, $response->getBody()->__toString());
|
self::assertEquals($expected, $response->getBody()->__toString());
|
||||||
@ -41,7 +42,7 @@ class RobotsActionTest extends TestCase
|
|||||||
|
|
||||||
public static function provideShortCodes(): iterable
|
public static function provideShortCodes(): iterable
|
||||||
{
|
{
|
||||||
yield 'three short codes' => [['foo', 'bar', 'baz'], false, <<<ROBOTS
|
yield 'three short codes' => [['foo', 'bar', 'baz'], new RobotsOptions(), <<<ROBOTS
|
||||||
# For more information about the robots.txt standard, see:
|
# For more information about the robots.txt standard, see:
|
||||||
# https://www.robotstxt.org/orig.html
|
# https://www.robotstxt.org/orig.html
|
||||||
|
|
||||||
@ -51,7 +52,7 @@ class RobotsActionTest extends TestCase
|
|||||||
Allow: /baz
|
Allow: /baz
|
||||||
Disallow: /
|
Disallow: /
|
||||||
ROBOTS];
|
ROBOTS];
|
||||||
yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], false, <<<ROBOTS
|
yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], new RobotsOptions(), <<<ROBOTS
|
||||||
# For more information about the robots.txt standard, see:
|
# For more information about the robots.txt standard, see:
|
||||||
# https://www.robotstxt.org/orig.html
|
# https://www.robotstxt.org/orig.html
|
||||||
|
|
||||||
@ -63,31 +64,43 @@ class RobotsActionTest extends TestCase
|
|||||||
Allow: /baz
|
Allow: /baz
|
||||||
Disallow: /
|
Disallow: /
|
||||||
ROBOTS];
|
ROBOTS];
|
||||||
yield 'no short codes' => [[], false, <<<ROBOTS
|
yield 'no short codes' => [[], new RobotsOptions(), <<<ROBOTS
|
||||||
# For more information about the robots.txt standard, see:
|
# For more information about the robots.txt standard, see:
|
||||||
# https://www.robotstxt.org/orig.html
|
# https://www.robotstxt.org/orig.html
|
||||||
|
|
||||||
User-agent: *
|
User-agent: *
|
||||||
Disallow: /
|
Disallow: /
|
||||||
ROBOTS];
|
ROBOTS];
|
||||||
yield 'three short codes and allow all short urls' => [['foo', 'bar', 'some'], true, <<<ROBOTS
|
yield 'three short codes and allow all short urls' => [
|
||||||
|
['foo', 'bar', 'some'],
|
||||||
|
new RobotsOptions(allowAllShortUrls: true),
|
||||||
|
<<<ROBOTS
|
||||||
|
# For more information about the robots.txt standard, see:
|
||||||
|
# https://www.robotstxt.org/orig.html
|
||||||
|
|
||||||
|
User-agent: *
|
||||||
|
Disallow: /rest/
|
||||||
|
ROBOTS,
|
||||||
|
];
|
||||||
|
yield 'no short codes and allow all short urls' => [[], new RobotsOptions(allowAllShortUrls: true), <<<ROBOTS
|
||||||
# For more information about the robots.txt standard, see:
|
# For more information about the robots.txt standard, see:
|
||||||
# https://www.robotstxt.org/orig.html
|
# https://www.robotstxt.org/orig.html
|
||||||
|
|
||||||
User-agent: *
|
User-agent: *
|
||||||
Disallow: /rest/
|
Disallow: /rest/
|
||||||
ROBOTS];
|
ROBOTS];
|
||||||
yield 'no short codes and allow all short urls' => [[], true, <<<ROBOTS
|
yield 'allow user agents' => [[], new RobotsOptions(userAgents: ['foo', 'bar']), <<<ROBOTS
|
||||||
# For more information about the robots.txt standard, see:
|
# For more information about the robots.txt standard, see:
|
||||||
# https://www.robotstxt.org/orig.html
|
# https://www.robotstxt.org/orig.html
|
||||||
|
|
||||||
User-agent: *
|
User-agent: foo
|
||||||
Disallow: /rest/
|
User-agent: bar
|
||||||
|
Disallow: /
|
||||||
ROBOTS];
|
ROBOTS];
|
||||||
}
|
}
|
||||||
|
|
||||||
private function action(bool $allowAllShortUrls = false): RobotsAction
|
private function action(RobotsOptions $options): RobotsAction
|
||||||
{
|
{
|
||||||
return new RobotsAction($this->helper, allowAllShortUrls: $allowAllShortUrls);
|
return new RobotsAction($this->helper, $options);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user