Add option to allow all short URLs to be crawlable via robots.txt

This commit is contained in:
Alejandro Celaya 2024-04-21 17:09:20 +02:00
parent a89b53af4f
commit 163244f40f
8 changed files with 72 additions and 14 deletions

View File

@ -4,6 +4,23 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com), and this project adheres to [Semantic Versioning](https://semver.org). The format is based on [Keep a Changelog](https://keepachangelog.com), and this project adheres to [Semantic Versioning](https://semver.org).
## [Unreleased]
### Added
* [#2018](https://github.com/shlinkio/shlink/issues/2018) Add option to make all short URLs unconditionally crawlable in robots.txt, via the `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var or the equivalent config option.
### Changed
* *Nothing*
### Deprecated
* *Nothing*
### Removed
* *Nothing*
### Fixed
* *Nothing*
## [4.1.0] - 2024-04-14 ## [4.1.0] - 2024-04-14
### Added ### Added
* [#1330](https://github.com/shlinkio/shlink/issues/1330) All visit-related endpoints now expose the `visitedUrl` prop for any visit. * [#1330](https://github.com/shlinkio/shlink/issues/1330) All visit-related endpoints now expose the `visitedUrl` prop for any visit.

View File

@ -47,7 +47,7 @@
"shlinkio/shlink-config": "^3.0", "shlinkio/shlink-config": "^3.0",
"shlinkio/shlink-event-dispatcher": "^4.1", "shlinkio/shlink-event-dispatcher": "^4.1",
"shlinkio/shlink-importer": "^5.3.2", "shlinkio/shlink-importer": "^5.3.2",
"shlinkio/shlink-installer": "^9.1", "shlinkio/shlink-installer": "dev-develop#11e66d8 as 9.2",
"shlinkio/shlink-ip-geolocation": "^4.0", "shlinkio/shlink-ip-geolocation": "^4.0",
"shlinkio/shlink-json": "^1.1", "shlinkio/shlink-json": "^1.1",
"spiral/roadrunner": "^2023.3", "spiral/roadrunner": "^2023.3",

View File

@ -45,6 +45,7 @@ return [
Option\UrlShortener\EnableMultiSegmentSlugsConfigOption::class, Option\UrlShortener\EnableMultiSegmentSlugsConfigOption::class,
Option\UrlShortener\EnableTrailingSlashConfigOption::class, Option\UrlShortener\EnableTrailingSlashConfigOption::class,
Option\UrlShortener\ShortUrlModeConfigOption::class, Option\UrlShortener\ShortUrlModeConfigOption::class,
Option\UrlShortener\RobotsAllowAllShortUrlsConfigOption::class,
Option\Tracking\IpAnonymizationConfigOption::class, Option\Tracking\IpAnonymizationConfigOption::class,
Option\Tracking\OrphanVisitsTrackingConfigOption::class, Option\Tracking\OrphanVisitsTrackingConfigOption::class,
Option\Tracking\DisableTrackParamConfigOption::class, Option\Tracking\DisableTrackParamConfigOption::class,

View File

@ -0,0 +1,13 @@
<?php
declare(strict_types=1);
namespace Shlinkio\Shlink\Core;
// Configuration for the generated robots.txt.
return [
'robots' => [
// Read from the ROBOTS_ALLOW_ALL_SHORT_URLS env var; falls back to false when it is not set.
// The loaded value is cast to bool before being stored in the config.
'allow-all-short-urls' => (bool) Config\EnvVars::ROBOTS_ALLOW_ALL_SHORT_URLS->loadFromEnv(false),
],
];

View File

@ -189,7 +189,7 @@ return [
'Logger_Shlink', 'Logger_Shlink',
Options\QrCodeOptions::class, Options\QrCodeOptions::class,
], ],
Action\RobotsAction::class => [Crawling\CrawlingHelper::class], Action\RobotsAction::class => [Crawling\CrawlingHelper::class, 'config.robots.allow-all-short-urls'],
ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => [ ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => [
'em', 'em',

View File

@ -15,9 +15,9 @@ use function sprintf;
use const PHP_EOL; use const PHP_EOL;
class RobotsAction implements RequestHandlerInterface, StatusCodeInterface readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterface
{ {
public function __construct(private readonly CrawlingHelperInterface $crawlingHelper) public function __construct(private CrawlingHelperInterface $crawlingHelper, private bool $allowAllShortUrls)
{ {
} }
@ -37,6 +37,12 @@ class RobotsAction implements RequestHandlerInterface, StatusCodeInterface
ROBOTS; ROBOTS;
if ($this->allowAllShortUrls) {
// Disallow rest URLs, but allow all short codes
yield 'Disallow: /rest/';
return;
}
$shortCodes = $this->crawlingHelper->listCrawlableShortCodes(); $shortCodes = $this->crawlingHelper->listCrawlableShortCodes();
foreach ($shortCodes as $shortCode) { foreach ($shortCodes as $shortCode) {
yield sprintf('Allow: /%s%s', $shortCode, PHP_EOL); yield sprintf('Allow: /%s%s', $shortCode, PHP_EOL);

View File

@ -69,8 +69,9 @@ enum EnvVars: string
case DEFAULT_DOMAIN = 'DEFAULT_DOMAIN'; case DEFAULT_DOMAIN = 'DEFAULT_DOMAIN';
case AUTO_RESOLVE_TITLES = 'AUTO_RESOLVE_TITLES'; case AUTO_RESOLVE_TITLES = 'AUTO_RESOLVE_TITLES';
case REDIRECT_APPEND_EXTRA_PATH = 'REDIRECT_APPEND_EXTRA_PATH'; case REDIRECT_APPEND_EXTRA_PATH = 'REDIRECT_APPEND_EXTRA_PATH';
case TIMEZONE = 'TIMEZONE';
case MULTI_SEGMENT_SLUGS_ENABLED = 'MULTI_SEGMENT_SLUGS_ENABLED'; case MULTI_SEGMENT_SLUGS_ENABLED = 'MULTI_SEGMENT_SLUGS_ENABLED';
case ROBOTS_ALLOW_ALL_SHORT_URLS = 'ROBOTS_ALLOW_ALL_SHORT_URLS';
case TIMEZONE = 'TIMEZONE';
case MEMORY_LIMIT = 'MEMORY_LIMIT'; case MEMORY_LIMIT = 'MEMORY_LIMIT';
public function loadFromEnv(mixed $default = null): mixed public function loadFromEnv(mixed $default = null): mixed

View File

@ -14,24 +14,25 @@ use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
class RobotsActionTest extends TestCase class RobotsActionTest extends TestCase
{ {
private RobotsAction $action;
private MockObject & CrawlingHelperInterface $helper; private MockObject & CrawlingHelperInterface $helper;
protected function setUp(): void protected function setUp(): void
{ {
$this->helper = $this->createMock(CrawlingHelperInterface::class); $this->helper = $this->createMock(CrawlingHelperInterface::class);
$this->action = new RobotsAction($this->helper);
} }
#[Test, DataProvider('provideShortCodes')] #[Test, DataProvider('provideShortCodes')]
public function buildsRobotsLinesFromCrawlableShortCodes(array $shortCodes, string $expected): void public function buildsRobotsLinesFromCrawlableShortCodes(
{ array $shortCodes,
bool $allowAllShortUrls,
string $expected,
): void {
$this->helper $this->helper
->expects($this->once()) ->expects($allowAllShortUrls ? $this->never() : $this->once())
->method('listCrawlableShortCodes') ->method('listCrawlableShortCodes')
->willReturn($shortCodes); ->willReturn($shortCodes);
$response = $this->action->handle(ServerRequestFactory::fromGlobals()); $response = $this->action($allowAllShortUrls)->handle(ServerRequestFactory::fromGlobals());
self::assertEquals(200, $response->getStatusCode()); self::assertEquals(200, $response->getStatusCode());
self::assertEquals($expected, $response->getBody()->__toString()); self::assertEquals($expected, $response->getBody()->__toString());
@ -40,7 +41,7 @@ class RobotsActionTest extends TestCase
public static function provideShortCodes(): iterable public static function provideShortCodes(): iterable
{ {
yield 'three short codes' => [['foo', 'bar', 'baz'], <<<ROBOTS yield 'three short codes' => [['foo', 'bar', 'baz'], false, <<<ROBOTS
# For more information about the robots.txt standard, see: # For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html # https://www.robotstxt.org/orig.html
@ -50,7 +51,7 @@ class RobotsActionTest extends TestCase
Allow: /baz Allow: /baz
Disallow: / Disallow: /
ROBOTS]; ROBOTS];
yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], <<<ROBOTS yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], false, <<<ROBOTS
# For more information about the robots.txt standard, see: # For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html # https://www.robotstxt.org/orig.html
@ -62,12 +63,31 @@ class RobotsActionTest extends TestCase
Allow: /baz Allow: /baz
Disallow: / Disallow: /
ROBOTS]; ROBOTS];
yield 'no short codes' => [[], <<<ROBOTS yield 'no short codes' => [[], false, <<<ROBOTS
# For more information about the robots.txt standard, see: # For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html # https://www.robotstxt.org/orig.html
User-agent: * User-agent: *
Disallow: / Disallow: /
ROBOTS]; ROBOTS];
yield 'three short codes and allow all short urls' => [['foo', 'bar', 'some'], true, <<<ROBOTS
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
User-agent: *
Disallow: /rest/
ROBOTS];
yield 'no short codes and allow all short urls' => [[], true, <<<ROBOTS
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
User-agent: *
Disallow: /rest/
ROBOTS];
}
private function action(bool $allowAllShortUrls = false): RobotsAction
{
return new RobotsAction($this->helper, allowAllShortUrls: $allowAllShortUrls);
} }
} }