mirror of
https://github.com/shlinkio/shlink.git
synced 2024-11-21 16:38:37 -06:00
Add option to allow all URLs to be crawlable via robots.txt
This commit is contained in:
parent
a89b53af4f
commit
163244f40f
17
CHANGELOG.md
17
CHANGELOG.md
@ -4,6 +4,23 @@ All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com), and this project adheres to [Semantic Versioning](https://semver.org).
|
||||
|
||||
## [Unreleased]
|
||||
### Added
|
||||
* [#2018](https://github.com/shlinkio/shlink/issues/2018) Add an option to make all short URLs unconditionally crawlable via robots.txt, enabled with the `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var or the equivalent config option.
|
||||
|
||||
### Changed
|
||||
* *Nothing*
|
||||
|
||||
### Deprecated
|
||||
* *Nothing*
|
||||
|
||||
### Removed
|
||||
* *Nothing*
|
||||
|
||||
### Fixed
|
||||
* *Nothing*
|
||||
|
||||
|
||||
## [4.1.0] - 2024-04-14
|
||||
### Added
|
||||
* [#1330](https://github.com/shlinkio/shlink/issues/1330) All visit-related endpoints now expose the `visitedUrl` prop for any visit.
|
||||
|
@ -47,7 +47,7 @@
|
||||
"shlinkio/shlink-config": "^3.0",
|
||||
"shlinkio/shlink-event-dispatcher": "^4.1",
|
||||
"shlinkio/shlink-importer": "^5.3.2",
|
||||
"shlinkio/shlink-installer": "^9.1",
|
||||
"shlinkio/shlink-installer": "dev-develop#11e66d8 as 9.2",
|
||||
"shlinkio/shlink-ip-geolocation": "^4.0",
|
||||
"shlinkio/shlink-json": "^1.1",
|
||||
"spiral/roadrunner": "^2023.3",
|
||||
|
@ -45,6 +45,7 @@ return [
|
||||
Option\UrlShortener\EnableMultiSegmentSlugsConfigOption::class,
|
||||
Option\UrlShortener\EnableTrailingSlashConfigOption::class,
|
||||
Option\UrlShortener\ShortUrlModeConfigOption::class,
|
||||
Option\UrlShortener\RobotsAllowAllShortUrlsConfigOption::class,
|
||||
Option\Tracking\IpAnonymizationConfigOption::class,
|
||||
Option\Tracking\OrphanVisitsTrackingConfigOption::class,
|
||||
Option\Tracking\DisableTrackParamConfigOption::class,
|
||||
|
13
config/autoload/robots.global.php
Normal file
13
config/autoload/robots.global.php
Normal file
@ -0,0 +1,13 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Shlinkio\Shlink\Core;
|
||||
|
||||
return [
|
||||
|
||||
'robots' => [
|
||||
'allow-all-short-urls' => (bool) Config\EnvVars::ROBOTS_ALLOW_ALL_SHORT_URLS->loadFromEnv(false),
|
||||
],
|
||||
|
||||
];
|
@ -189,7 +189,7 @@ return [
|
||||
'Logger_Shlink',
|
||||
Options\QrCodeOptions::class,
|
||||
],
|
||||
Action\RobotsAction::class => [Crawling\CrawlingHelper::class],
|
||||
Action\RobotsAction::class => [Crawling\CrawlingHelper::class, 'config.robots.allow-all-short-urls'],
|
||||
|
||||
ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => [
|
||||
'em',
|
||||
|
@ -15,9 +15,9 @@ use function sprintf;
|
||||
|
||||
use const PHP_EOL;
|
||||
|
||||
class RobotsAction implements RequestHandlerInterface, StatusCodeInterface
|
||||
readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterface
|
||||
{
|
||||
public function __construct(private readonly CrawlingHelperInterface $crawlingHelper)
|
||||
public function __construct(private CrawlingHelperInterface $crawlingHelper, private bool $allowAllShortUrls)
|
||||
{
|
||||
}
|
||||
|
||||
@ -37,6 +37,12 @@ class RobotsAction implements RequestHandlerInterface, StatusCodeInterface
|
||||
|
||||
ROBOTS;
|
||||
|
||||
if ($this->allowAllShortUrls) {
|
||||
// Disallow only the REST API (/rest/). No per-short-code Allow lines are emitted, so every other path, including all short URLs, stays crawlable
|
||||
yield 'Disallow: /rest/';
|
||||
return;
|
||||
}
|
||||
|
||||
$shortCodes = $this->crawlingHelper->listCrawlableShortCodes();
|
||||
foreach ($shortCodes as $shortCode) {
|
||||
yield sprintf('Allow: /%s%s', $shortCode, PHP_EOL);
|
||||
|
@ -69,8 +69,9 @@ enum EnvVars: string
|
||||
case DEFAULT_DOMAIN = 'DEFAULT_DOMAIN';
|
||||
case AUTO_RESOLVE_TITLES = 'AUTO_RESOLVE_TITLES';
|
||||
case REDIRECT_APPEND_EXTRA_PATH = 'REDIRECT_APPEND_EXTRA_PATH';
|
||||
case TIMEZONE = 'TIMEZONE';
|
||||
case MULTI_SEGMENT_SLUGS_ENABLED = 'MULTI_SEGMENT_SLUGS_ENABLED';
|
||||
case ROBOTS_ALLOW_ALL_SHORT_URLS = 'ROBOTS_ALLOW_ALL_SHORT_URLS';
|
||||
case TIMEZONE = 'TIMEZONE';
|
||||
case MEMORY_LIMIT = 'MEMORY_LIMIT';
|
||||
|
||||
public function loadFromEnv(mixed $default = null): mixed
|
||||
|
@ -14,24 +14,25 @@ use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
|
||||
|
||||
class RobotsActionTest extends TestCase
|
||||
{
|
||||
private RobotsAction $action;
|
||||
private MockObject & CrawlingHelperInterface $helper;
|
||||
|
||||
protected function setUp(): void
|
||||
{
|
||||
$this->helper = $this->createMock(CrawlingHelperInterface::class);
|
||||
$this->action = new RobotsAction($this->helper);
|
||||
}
|
||||
|
||||
#[Test, DataProvider('provideShortCodes')]
|
||||
public function buildsRobotsLinesFromCrawlableShortCodes(array $shortCodes, string $expected): void
|
||||
{
|
||||
public function buildsRobotsLinesFromCrawlableShortCodes(
|
||||
array $shortCodes,
|
||||
bool $allowAllShortUrls,
|
||||
string $expected,
|
||||
): void {
|
||||
$this->helper
|
||||
->expects($this->once())
|
||||
->expects($allowAllShortUrls ? $this->never() : $this->once())
|
||||
->method('listCrawlableShortCodes')
|
||||
->willReturn($shortCodes);
|
||||
|
||||
$response = $this->action->handle(ServerRequestFactory::fromGlobals());
|
||||
$response = $this->action($allowAllShortUrls)->handle(ServerRequestFactory::fromGlobals());
|
||||
|
||||
self::assertEquals(200, $response->getStatusCode());
|
||||
self::assertEquals($expected, $response->getBody()->__toString());
|
||||
@ -40,7 +41,7 @@ class RobotsActionTest extends TestCase
|
||||
|
||||
public static function provideShortCodes(): iterable
|
||||
{
|
||||
yield 'three short codes' => [['foo', 'bar', 'baz'], <<<ROBOTS
|
||||
yield 'three short codes' => [['foo', 'bar', 'baz'], false, <<<ROBOTS
|
||||
# For more information about the robots.txt standard, see:
|
||||
# https://www.robotstxt.org/orig.html
|
||||
|
||||
@ -50,7 +51,7 @@ class RobotsActionTest extends TestCase
|
||||
Allow: /baz
|
||||
Disallow: /
|
||||
ROBOTS];
|
||||
yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], <<<ROBOTS
|
||||
yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], false, <<<ROBOTS
|
||||
# For more information about the robots.txt standard, see:
|
||||
# https://www.robotstxt.org/orig.html
|
||||
|
||||
@ -62,12 +63,31 @@ class RobotsActionTest extends TestCase
|
||||
Allow: /baz
|
||||
Disallow: /
|
||||
ROBOTS];
|
||||
yield 'no short codes' => [[], <<<ROBOTS
|
||||
yield 'no short codes' => [[], false, <<<ROBOTS
|
||||
# For more information about the robots.txt standard, see:
|
||||
# https://www.robotstxt.org/orig.html
|
||||
|
||||
User-agent: *
|
||||
Disallow: /
|
||||
ROBOTS];
|
||||
yield 'three short codes and allow all short urls' => [['foo', 'bar', 'some'], true, <<<ROBOTS
|
||||
# For more information about the robots.txt standard, see:
|
||||
# https://www.robotstxt.org/orig.html
|
||||
|
||||
User-agent: *
|
||||
Disallow: /rest/
|
||||
ROBOTS];
|
||||
yield 'no short codes and allow all short urls' => [[], true, <<<ROBOTS
|
||||
# For more information about the robots.txt standard, see:
|
||||
# https://www.robotstxt.org/orig.html
|
||||
|
||||
User-agent: *
|
||||
Disallow: /rest/
|
||||
ROBOTS];
|
||||
}
|
||||
|
||||
private function action(bool $allowAllShortUrls = false): RobotsAction
|
||||
{
|
||||
return new RobotsAction($this->helper, allowAllShortUrls: $allowAllShortUrls);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user