Merge pull request #1091 from acelaya-forks/feature/improved-crawling

Feature/improved crawling
This commit is contained in:
Alejandro Celaya 2021-05-22 11:48:55 +02:00 committed by GitHub
commit 663ae9f6bb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
30 changed files with 399 additions and 19 deletions

View File

@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
php-version: ['7.4']
php-version: ['8.0']
steps:
- name: Checkout code
uses: actions/checkout@v2
@ -30,7 +30,7 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
php-version: ['7.4']
php-version: ['8.0']
steps:
- name: Checkout code
uses: actions/checkout@v2
@ -242,7 +242,7 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
php-version: ['7.4']
php-version: ['8.0']
steps:
- name: Checkout code
uses: actions/checkout@v2

View File

@ -24,6 +24,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this
* `disable_referrer_tracking`: If true, the referrer will not be tracked.
* `disable_ua_tracking`: If true, the user agent will not be tracked.
* [#955](https://github.com/shlinkio/shlink/issues/955) Added a new option to mark short URLs as crawlable, so that they are listed as `Allow` entries in the robots.txt served by Shlink.
### Changed
* [#1036](https://github.com/shlinkio/shlink/issues/1036) Updated to `happyr/doctrine-specification` 2.0.
* [#1039](https://github.com/shlinkio/shlink/issues/1039) Updated to `endroid/qr-code` 4.0.

View File

@ -0,0 +1,26 @@
<?php
declare(strict_types=1);
namespace ShlinkMigrations;
use Doctrine\DBAL\Schema\Schema;
use Doctrine\DBAL\Types\Types;
use Doctrine\Migrations\AbstractMigration;
/**
 * Adds the boolean `crawlable` column (defaulting to false) to the `short_urls` table.
 *
 * Both directions are guarded, so running the migration against a schema that is already in the
 * target state is skipped instead of failing.
 */
final class Version20210522051601 extends AbstractMigration
{
    private const TABLE_NAME = 'short_urls';
    private const COLUMN_NAME = 'crawlable';

    /**
     * Creates the column, unless the table already has it.
     */
    public function up(Schema $schema): void
    {
        $table = $schema->getTable(self::TABLE_NAME);
        $columnAlreadyExists = $table->hasColumn(self::COLUMN_NAME);

        $this->skipIf($columnAlreadyExists);
        $table->addColumn(self::COLUMN_NAME, Types::BOOLEAN, ['default' => false]);
    }

    /**
     * Drops the column, unless the table does not have it.
     */
    public function down(Schema $schema): void
    {
        $table = $schema->getTable(self::TABLE_NAME);
        $columnIsMissing = ! $table->hasColumn(self::COLUMN_NAME);

        $this->skipIf($columnIsMissing);
        $table->dropColumn(self::COLUMN_NAME);
    }
}

View File

@ -116,6 +116,15 @@
"domain": {
"type": "string",
"description": "The domain in which the short URL was created. Null if it belongs to default domain."
},
"title": {
"type": "string",
"nullable": true,
"description": "A descriptive title of the short URL."
},
"crawlable": {
"type": "boolean",
"description": "Tells if this URL will be included as 'Allow' in Shlink's robots.txt."
}
},
"example": {
@ -133,7 +142,9 @@
"validUntil": null,
"maxVisits": 100
},
"domain": "example.com"
"domain": "example.com",
"title": "The title",
"crawlable": false
}
},
"ShortUrlMeta": {

View File

@ -41,6 +41,10 @@
"type": "string",
"nullable": true,
"description": "A descriptive title of the short URL."
},
"crawlable": {
"type": "boolean",
"description": "Tells if this URL will be included as 'Allow' in Shlink's robots.txt."
}
}
}

View File

@ -140,7 +140,8 @@
"maxVisits": 100
},
"domain": null,
"title": "Welcome to Steam"
"title": "Welcome to Steam",
"crawlable": false
},
{
"shortCode": "12Kb3",
@ -157,7 +158,8 @@
"maxVisits": null
},
"domain": null,
"title": null
"title": null,
"crawlable": false
},
{
"shortCode": "123bA",
@ -172,7 +174,8 @@
"maxVisits": null
},
"domain": "example.com",
"title": null
"title": null,
"crawlable": false
}
],
"pagination": {
@ -273,6 +276,10 @@
"title": {
"type": "string",
"description": "A descriptive title of the short URL."
},
"crawlable": {
"type": "boolean",
"description": "Tells if this URL will be included as 'Allow' in Shlink's robots.txt."
}
}
}
@ -305,7 +312,9 @@
"validUntil": null,
"maxVisits": 500
},
"domain": null
"domain": null,
"title": null,
"crawlable": false
}
}
},

View File

@ -74,7 +74,8 @@
"maxVisits": 100
},
"domain": null,
"title": null
"title": null,
"crawlable": false
},
"text/plain": "https://doma.in/abc123"
}

View File

@ -54,7 +54,8 @@
"maxVisits": 100
},
"domain": null,
"title": null
"title": null,
"crawlable": false
}
}
},
@ -147,6 +148,10 @@
"type": "string",
"description": "A descriptive title of the short URL.",
"nullable": true
},
"crawlable": {
"type": "boolean",
"description": "Tells if this URL will be included as 'Allow' in Shlink's robots.txt."
}
}
}
@ -184,7 +189,8 @@
"maxVisits": 100
},
"domain": null,
"title": "Shlink - The URL shortener"
"title": "Shlink - The URL shortener",
"crawlable": false
}
}
},

View File

@ -48,6 +48,7 @@ return [
Action\RedirectAction::class => ConfigAbstractFactory::class,
Action\PixelAction::class => ConfigAbstractFactory::class,
Action\QrCodeAction::class => ConfigAbstractFactory::class,
Action\RobotsAction::class => ConfigAbstractFactory::class,
ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => ConfigAbstractFactory::class,
ShortUrl\Helper\ShortUrlStringifier::class => ConfigAbstractFactory::class,
@ -57,6 +58,8 @@ return [
Mercure\MercureUpdatesGenerator::class => ConfigAbstractFactory::class,
Importer\ImportedLinksProcessor::class => ConfigAbstractFactory::class,
Crawling\CrawlingHelper::class => ConfigAbstractFactory::class,
],
'aliases' => [
@ -129,6 +132,7 @@ return [
ShortUrl\Helper\ShortUrlStringifier::class,
'Logger_Shlink',
],
Action\RobotsAction::class => [Crawling\CrawlingHelper::class],
ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => ['em'],
ShortUrl\Helper\ShortUrlStringifier::class => ['config.url_shortener.domain', 'config.router.base_path'],
@ -146,6 +150,8 @@ return [
Service\ShortUrl\ShortCodeHelper::class,
Util\DoctrineBatchHelper::class,
],
Crawling\CrawlingHelper::class => ['em'],
],
];

View File

@ -95,4 +95,9 @@ return static function (ClassMetadata $metadata, array $emConfig): void {
->columnName('title_was_auto_resolved')
->option('default', false)
->build();
$builder->createField('crawlable', Types::BOOLEAN)
->columnName('crawlable')
->option('default', false)
->build();
};

View File

@ -9,6 +9,14 @@ use Shlinkio\Shlink\Core\Action;
return [
'routes' => [
[
'name' => Action\RobotsAction::class,
'path' => '/robots.txt',
'middleware' => [
Action\RobotsAction::class,
],
'allowed_methods' => [RequestMethod::METHOD_GET],
],
[
'name' => Action\RedirectAction::class,
'path' => '/{shortCode}',

View File

@ -0,0 +1,49 @@
<?php
declare(strict_types=1);
namespace Shlinkio\Shlink\Core\Action;
use Fig\Http\Message\StatusCodeInterface;
use GuzzleHttp\Psr7\Response;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
use function sprintf;
use const PHP_EOL;
/**
 * Serves a dynamically generated robots.txt.
 *
 * Every crawlable short code gets its own "Allow" directive, and everything else is disallowed.
 */
class RobotsAction implements RequestHandlerInterface, StatusCodeInterface
{
    private CrawlingHelperInterface $crawlingHelper;

    public function __construct(CrawlingHelperInterface $crawlingHelper)
    {
        $this->crawlingHelper = $crawlingHelper;
    }

    /**
     * Builds a 200 text/plain response whose body is the generated robots.txt.
     *
     * The body is handed over as a generator, so directives are produced while the body is read.
     */
    public function handle(ServerRequestInterface $request): ResponseInterface
    {
        return new Response(self::STATUS_OK, ['Content-type' => 'text/plain'], $this->buildRobots());
    }

    /**
     * Yields the robots.txt contents chunk by chunk.
     *
     * Every chunk except the last one carries its own line terminator. Relying on the layout of a
     * heredoc for the terminator after "User-agent: *" is fragile: if it is missing, the first
     * directive ends up glued to that line and crawlers no longer see a valid record.
     *
     * @return iterable<string>
     */
    private function buildRobots(): iterable
    {
        yield '# For more information about the robots.txt standard, see:' . PHP_EOL;
        yield '# https://www.robotstxt.org/orig.html' . PHP_EOL;
        yield 'User-agent: *' . PHP_EOL;

        foreach ($this->crawlingHelper->listCrawlableShortCodes() as $shortCode) {
            yield sprintf('Allow: /%s%s', $shortCode, PHP_EOL);
        }

        // Anything not explicitly allowed above must not be crawled
        yield 'Disallow: /';
    }
}

View File

@ -0,0 +1,26 @@
<?php
declare(strict_types=1);
namespace Shlinkio\Shlink\Core\Crawling;
use Doctrine\ORM\EntityManagerInterface;
use Shlinkio\Shlink\Core\Entity\ShortUrl;
use Shlinkio\Shlink\Core\Repository\ShortUrlRepositoryInterface;
/**
 * Entity-manager-backed crawling helper, which reads crawlable short codes from the short URLs
 * repository.
 */
class CrawlingHelper implements CrawlingHelperInterface
{
    private EntityManagerInterface $em;

    public function __construct(EntityManagerInterface $em)
    {
        $this->em = $em;
    }

    /**
     * Lazily yields whatever the short URLs repository reports as crawlable short codes.
     *
     * This is a generator, so the repository is not resolved nor queried until iteration starts.
     *
     * @return string[]|iterable
     */
    public function listCrawlableShortCodes(): iterable
    {
        /** @var ShortUrlRepositoryInterface $shortUrlRepo */
        $shortUrlRepo = $this->em->getRepository(ShortUrl::class);
        $crawlableShortCodes = $shortUrlRepo->findCrawlableShortCodes();

        // Keys are forwarded untouched, exactly as the repository provides them
        foreach ($crawlableShortCodes as $key => $shortCode) {
            yield $key => $shortCode;
        }
    }
}

View File

@ -0,0 +1,13 @@
<?php
declare(strict_types=1);
namespace Shlinkio\Shlink\Core\Crawling;
/**
 * Contract for services exposing which short URLs may be visited by web crawlers.
 */
interface CrawlingHelperInterface
{
/**
 * Lists the short codes of the short URLs flagged as crawlable.
 *
 * The result is only guaranteed to be iterable. Implementations may return a generator, in which
 * case no work is done until the result is iterated.
 *
 * @return string[]|iterable
 */
public function listCrawlableShortCodes(): iterable;
}

View File

@ -42,6 +42,7 @@ class ShortUrl extends AbstractEntity
private ?ApiKey $authorApiKey = null;
private ?string $title = null;
private bool $titleWasAutoResolved = false;
private bool $crawlable = false;
private function __construct()
{
@ -78,6 +79,7 @@ class ShortUrl extends AbstractEntity
$instance->authorApiKey = $meta->getApiKey();
$instance->title = $meta->getTitle();
$instance->titleWasAutoResolved = $meta->titleWasAutoResolved();
$instance->crawlable = $meta->isCrawlable();
return $instance;
}
@ -200,6 +202,11 @@ class ShortUrl extends AbstractEntity
return $this->title;
}
/**
 * Tells if this short URL is flagged as crawlable, which makes it be listed as "Allow" in the
 * robots.txt.
 */
public function crawlable(): bool
{
return $this->crawlable;
}
public function update(
ShortUrlEdit $shortUrlEdit,
?ShortUrlRelationResolverInterface $relationResolver = null
@ -220,6 +227,9 @@ class ShortUrl extends AbstractEntity
$relationResolver = $relationResolver ?? new SimpleShortUrlRelationResolver();
$this->tags = $relationResolver->resolveTags($shortUrlEdit->tags());
}
if ($shortUrlEdit->crawlableWasProvided()) {
$this->crawlable = $shortUrlEdit->crawlable();
}
if (
$this->title === null
|| $shortUrlEdit->titleWasProvided()

View File

@ -30,6 +30,8 @@ final class ShortUrlEdit implements TitleResolutionModelInterface
private ?string $title = null;
private bool $titleWasAutoResolved = false;
private ?bool $validateUrl = null;
private bool $crawlablePropWasProvided = false;
private bool $crawlable = false;
private function __construct()
{
@ -61,6 +63,7 @@ final class ShortUrlEdit implements TitleResolutionModelInterface
$this->maxVisitsPropWasProvided = array_key_exists(ShortUrlInputFilter::MAX_VISITS, $data);
$this->tagsPropWasProvided = array_key_exists(ShortUrlInputFilter::TAGS, $data);
$this->titlePropWasProvided = array_key_exists(ShortUrlInputFilter::TITLE, $data);
$this->crawlablePropWasProvided = array_key_exists(ShortUrlInputFilter::CRAWLABLE, $data);
$this->longUrl = $inputFilter->getValue(ShortUrlInputFilter::LONG_URL);
$this->validSince = parseDateField($inputFilter->getValue(ShortUrlInputFilter::VALID_SINCE));
@ -69,6 +72,7 @@ final class ShortUrlEdit implements TitleResolutionModelInterface
$this->validateUrl = getOptionalBoolFromInputFilter($inputFilter, ShortUrlInputFilter::VALIDATE_URL);
$this->tags = $inputFilter->getValue(ShortUrlInputFilter::TAGS);
$this->title = $inputFilter->getValue(ShortUrlInputFilter::TITLE);
$this->crawlable = $inputFilter->getValue(ShortUrlInputFilter::CRAWLABLE);
}
public function longUrl(): ?string
@ -162,4 +166,14 @@ final class ShortUrlEdit implements TitleResolutionModelInterface
{
return $this->validateUrl;
}
/**
 * The crawlable flag read from the input filter for this edition.
 *
 * Check crawlableWasProvided() first to know if the caller actually sent this value.
 */
public function crawlable(): bool
{
return $this->crawlable;
}
/**
 * Tells if the "crawlable" key was present in the raw edition data, regardless of its value.
 */
public function crawlableWasProvided(): bool
{
return $this->crawlablePropWasProvided;
}
}

View File

@ -31,6 +31,7 @@ final class ShortUrlMeta implements TitleResolutionModelInterface
private array $tags = [];
private ?string $title = null;
private bool $titleWasAutoResolved = false;
private bool $crawlable = false;
private function __construct()
{
@ -80,6 +81,7 @@ final class ShortUrlMeta implements TitleResolutionModelInterface
$this->apiKey = $inputFilter->getValue(ShortUrlInputFilter::API_KEY);
$this->tags = $inputFilter->getValue(ShortUrlInputFilter::TAGS);
$this->title = $inputFilter->getValue(ShortUrlInputFilter::TITLE);
$this->crawlable = $inputFilter->getValue(ShortUrlInputFilter::CRAWLABLE);
}
public function getLongUrl(): string
@ -188,4 +190,9 @@ final class ShortUrlMeta implements TitleResolutionModelInterface
return $copy;
}
/**
 * Tells if the short URL described by this metadata is flagged as crawlable.
 */
public function isCrawlable(): bool
{
return $this->crawlable;
}
}

View File

@ -288,4 +288,28 @@ class ShortUrlRepository extends EntitySpecificationRepository implements ShortU
$qb->andWhere($qb->expr()->isNull('s.domain'));
}
}
/**
 * Lazily yields the distinct short codes of all short URLs flagged as crawlable.
 *
 * Results are loaded in blocks of 1000 rows, so that the whole list is never held in memory.
 *
 * @return string[]|iterable
 */
public function findCrawlableShortCodes(): iterable
{
    $blockSize = 1000;
    $qb = $this->getEntityManager()->createQueryBuilder();
    $qb->select('DISTINCT s.shortCode')
       ->from(ShortUrl::class, 's')
       ->where($qb->expr()->eq('s.crawlable', ':crawlable'))
       ->setParameter('crawlable', true)
       ->setMaxResults($blockSize)
       // Offset-based pagination is only reliable over a deterministic order. Without it, the
       // database is free to return rows in a different order for every block, which could
       // duplicate or skip short codes between blocks.
       ->orderBy('s.shortCode');

    $page = 0;
    do {
        $pageQb = (clone $qb)->setFirstResult($blockSize * $page);
        $page++;

        $rowsInPage = 0;
        foreach ($pageQb->getQuery()->toIterable() as ['shortCode' => $shortCode]) {
            $rowsInPage++;
            yield $shortCode;
        }

        // A block which is not full is the last one, so there is no need to run one more query
        // just to find out that it returns nothing.
    } while ($rowsInPage === $blockSize);
}
}

View File

@ -41,4 +41,6 @@ interface ShortUrlRepositoryInterface extends ObjectRepository, EntitySpecificat
public function findOneMatching(ShortUrlMeta $meta): ?ShortUrl;
public function findOneByImportedUrl(ImportedShlinkUrl $url): ?ShortUrl;
public function findCrawlableShortCodes(): iterable;
}

View File

@ -66,11 +66,11 @@ class VisitRepository extends EntitySpecificationRepository implements VisitRepo
do {
$qb = (clone $originalQueryBuilder)->andWhere($qb->expr()->gt('v.id', $lastId));
$iterator = $qb->getQuery()->iterate();
$iterator = $qb->getQuery()->toIterable();
$resultsFound = false;
/** @var Visit $visit */
foreach ($iterator as $key => [$visit]) {
foreach ($iterator as $key => $visit) {
$resultsFound = true;
yield $key => $visit;
}

View File

@ -35,6 +35,7 @@ class ShortUrlDataTransformer implements DataTransformerInterface
'meta' => $this->buildMeta($shortUrl),
'domain' => $shortUrl->getDomain(),
'title' => $shortUrl->title(),
'crawlable' => $shortUrl->crawlable(),
];
}

View File

@ -32,6 +32,7 @@ class ShortUrlInputFilter extends InputFilter
public const API_KEY = 'apiKey';
public const TAGS = 'tags';
public const TITLE = 'title';
public const CRAWLABLE = 'crawlable';
private function __construct(array $data, bool $requireLongUrl)
{
@ -105,5 +106,7 @@ class ShortUrlInputFilter extends InputFilter
$this->add($this->createTagsInput(self::TAGS, false));
$this->add($this->createInput(self::TITLE, false));
$this->add($this->createBooleanInput(self::CRAWLABLE, false));
}
}

View File

@ -436,4 +436,37 @@ class ShortUrlRepositoryTest extends DatabaseTestCase
self::assertNull($this->repo->findOneByImportedUrl($buildImported('my-cool-slug', 'doma.in')));
self::assertNull($this->repo->findOneByImportedUrl($buildImported('another-slug')));
}
/**
 * Persists a mix of crawlable and non-crawlable short URLs, and verifies only the short codes of
 * the crawlable ones are returned.
 *
 * @test
 */
public function findCrawlableShortCodesReturnsExpectedResult(): void
{
    $crawlableShortUrls = [];
    $nonCrawlableShortUrls = [];

    foreach ([true, false, true, true, false] as $isCrawlable) {
        $shortUrl = ShortUrl::fromMeta(
            ShortUrlMeta::fromRawData(['crawlable' => $isCrawlable, 'longUrl' => 'foo.com']),
        );
        $this->getEntityManager()->persist($shortUrl);

        if ($isCrawlable) {
            $crawlableShortUrls[] = $shortUrl;
        } else {
            $nonCrawlableShortUrls[] = $shortUrl;
        }
    }
    $this->getEntityManager()->flush();

    // The repository returns a lazy iterable, so it is consumed into a plain list first
    $foundShortCodes = [];
    foreach ($this->repo->findCrawlableShortCodes() as $foundShortCode) {
        $foundShortCodes[] = $foundShortCode;
    }

    self::assertCount(3, $foundShortCodes);
    foreach ($crawlableShortUrls as $expectedShortUrl) {
        self::assertContains($expectedShortUrl->getShortCode(), $foundShortCodes);
    }
    foreach ($nonCrawlableShortUrls as $unexpectedShortUrl) {
        self::assertNotContains($unexpectedShortUrl->getShortCode(), $foundShortCodes);
    }
}
}

View File

@ -0,0 +1,75 @@
<?php
declare(strict_types=1);
namespace ShlinkioTest\Shlink\Core\Action;
use Laminas\Diactoros\ServerRequestFactory;
use PHPUnit\Framework\TestCase;
use Prophecy\PhpUnit\ProphecyTrait;
use Prophecy\Prophecy\ObjectProphecy;
use Shlinkio\Shlink\Core\Action\RobotsAction;
use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface;
/**
 * Unit test for RobotsAction, using a prophesized crawling helper to control which short codes
 * are reported as crawlable.
 */
class RobotsActionTest extends TestCase
{
use ProphecyTrait;
private RobotsAction $action;
private ObjectProphecy $helper;
// Builds the action under test around a fresh crawling helper prophecy
protected function setUp(): void
{
$this->helper = $this->prophesize(CrawlingHelperInterface::class);
$this->action = new RobotsAction($this->helper->reveal());
}
/**
* Verifies status, content type and the exact body generated for a given list of short codes,
* and that the helper is queried exactly once.
*
* @test
* @dataProvider provideShortCodes
*/
public function buildsRobotsLinesFromCrawlableShortCodes(array $shortCodes, string $expected): void
{
$getShortCodes = $this->helper->listCrawlableShortCodes()->willReturn($shortCodes);
$response = $this->action->handle(ServerRequestFactory::fromGlobals());
self::assertEquals(200, $response->getStatusCode());
// Casting the body to string is what consumes the lazily generated contents
self::assertEquals($expected, $response->getBody()->__toString());
self::assertEquals('text/plain', $response->getHeaderLine('Content-Type'));
$getShortCodes->shouldHaveBeenCalledOnce();
}
/**
* Provides pairs of [crawlable short codes, expected robots.txt body].
* The heredocs are compared verbatim, so their line layout is significant.
*/
public function provideShortCodes(): iterable
{
yield 'three short codes' => [['foo', 'bar', 'baz'], <<<ROBOTS
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
User-agent: *
Allow: /foo
Allow: /bar
Allow: /baz
Disallow: /
ROBOTS];
yield 'five short codes' => [['foo', 'bar', 'some', 'thing', 'baz'], <<<ROBOTS
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
User-agent: *
Allow: /foo
Allow: /bar
Allow: /some
Allow: /thing
Allow: /baz
Disallow: /
ROBOTS];
// With nothing crawlable, only the blanket Disallow directive is expected
yield 'no short codes' => [[], <<<ROBOTS
# For more information about the robots.txt standard, see:
# https://www.robotstxt.org/orig.html
User-agent: *
Disallow: /
ROBOTS];
}
}

View File

@ -0,0 +1,43 @@
<?php
declare(strict_types=1);
namespace ShlinkioTest\Shlink\Core\Crawling;
use Doctrine\ORM\EntityManagerInterface;
use PHPUnit\Framework\TestCase;
use Prophecy\PhpUnit\ProphecyTrait;
use Prophecy\Prophecy\ObjectProphecy;
use Shlinkio\Shlink\Core\Crawling\CrawlingHelper;
use Shlinkio\Shlink\Core\Entity\ShortUrl;
use Shlinkio\Shlink\Core\Repository\ShortUrlRepositoryInterface;
/**
 * Unit test for CrawlingHelper, using prophesized entity manager and repository.
 */
class CrawlingHelperTest extends TestCase
{
    use ProphecyTrait;

    private CrawlingHelper $helper;
    private ObjectProphecy $em;

    protected function setUp(): void
    {
        $this->em = $this->prophesize(EntityManagerInterface::class);
        $this->helper = new CrawlingHelper($this->em->reveal());
    }

    /**
     * Verifies the helper resolves the short URLs repository once, queries it once, and yields
     * exactly the short codes the repository returned.
     *
     * @test
     */
    public function listCrawlableShortCodesDelegatesIntoRepository(): void
    {
        // A non-empty result is stubbed on purpose. With an empty one, the test would also pass
        // if the helper dropped or altered the values returned by the repository.
        $expectedShortCodes = ['foo', 'bar', 'baz'];

        $repo = $this->prophesize(ShortUrlRepositoryInterface::class);
        $findCrawlableShortCodes = $repo->findCrawlableShortCodes()->willReturn($expectedShortCodes);
        $getRepo = $this->em->getRepository(ShortUrl::class)->willReturn($repo->reveal());

        // Result is a generator and therefore, it needs to be iterated for the delegation to happen
        $result = [];
        foreach ($this->helper->listCrawlableShortCodes() as $shortCode) {
            $result[] = $shortCode;
        }

        self::assertSame($expectedShortCodes, $result);
        $findCrawlableShortCodes->shouldHaveBeenCalledOnce();
        $getRepo->shouldHaveBeenCalledOnce();
    }
}

View File

@ -59,6 +59,7 @@ class MercureUpdatesGeneratorTest extends TestCase
],
'domain' => null,
'title' => $title,
'crawlable' => false,
],
'visit' => [
'referer' => '',

View File

@ -26,6 +26,7 @@ class ListShortUrlsTest extends ApiTestCase
],
'domain' => null,
'title' => 'My cool title',
'crawlable' => true,
];
private const SHORT_URL_DOCS = [
'shortCode' => 'ghi789',
@ -41,6 +42,7 @@ class ListShortUrlsTest extends ApiTestCase
],
'domain' => null,
'title' => null,
'crawlable' => false,
];
private const SHORT_URL_CUSTOM_SLUG_AND_DOMAIN = [
'shortCode' => 'custom-with-domain',
@ -56,6 +58,7 @@ class ListShortUrlsTest extends ApiTestCase
],
'domain' => 'some-domain.com',
'title' => null,
'crawlable' => false,
];
private const SHORT_URL_META = [
'shortCode' => 'def456',
@ -73,6 +76,7 @@ class ListShortUrlsTest extends ApiTestCase
],
'domain' => null,
'title' => null,
'crawlable' => false,
];
private const SHORT_URL_CUSTOM_SLUG = [
'shortCode' => 'custom',
@ -88,6 +92,7 @@ class ListShortUrlsTest extends ApiTestCase
],
'domain' => null,
'title' => null,
'crawlable' => false,
];
private const SHORT_URL_CUSTOM_DOMAIN = [
'shortCode' => 'ghi789',
@ -105,6 +110,7 @@ class ListShortUrlsTest extends ApiTestCase
],
'domain' => 'example.com',
'title' => null,
'crawlable' => false,
];
/**

View File

@ -35,6 +35,7 @@ class ShortUrlsFixture extends AbstractFixture implements DependentFixtureInterf
'longUrl' => 'https://shlink.io',
'tags' => ['foo'],
'title' => 'My cool title',
'crawlable' => true,
]), $relationResolver),
'2018-05-01',
);

View File

@ -3,4 +3,3 @@ parameters:
checkGenericClassInNonGenericObjectType: false
ignoreErrors:
- '#If condition is always false#'
- '#setOrderBy\(\) expects array\<int, string\>, array\<string, string\> given#'

View File

@ -1,5 +0,0 @@
# For more information about the robots.txt standard, see:
# http://www.robotstxt.org/orig.html
User-agent: *
Disallow: /