diff --git a/composer.json b/composer.json index 2474cebf..0bb1186a 100644 --- a/composer.json +++ b/composer.json @@ -25,6 +25,7 @@ "geoip2/geoip2": "^2.9", "guzzlehttp/guzzle": "^7.0", "happyr/doctrine-specification": "^2.0", + "jaybizzle/crawler-detect": "^1.2", "laminas/laminas-config": "^3.3", "laminas/laminas-config-aggregator": "^1.1", "laminas/laminas-diactoros": "^2.1.3", diff --git a/data/migrations/Version20210522124633.php b/data/migrations/Version20210522124633.php new file mode 100644 index 00000000..ea486e93 --- /dev/null +++ b/data/migrations/Version20210522124633.php @@ -0,0 +1,28 @@ +getTable('visits'); + $this->skipIf($visits->hasColumn(self::POTENTIAL_BOT_COLUMN)); + $visits->addColumn(self::POTENTIAL_BOT_COLUMN, Types::BOOLEAN, ['default' => false]); + } + + public function down(Schema $schema): void + { + $visits = $schema->getTable('visits'); + $this->skipIf(! $visits->hasColumn(self::POTENTIAL_BOT_COLUMN)); + $visits->dropColumn(self::POTENTIAL_BOT_COLUMN); + } +} diff --git a/docs/async-api/async-api.json b/docs/async-api/async-api.json index 3360d897..b1313d1e 100644 --- a/docs/async-api/async-api.json +++ b/docs/async-api/async-api.json @@ -190,6 +190,10 @@ }, "visitLocation": { "$ref": "#/components/schemas/VisitLocation" + }, + "potentialBot": { + "type": "boolean", + "description": "Tells if Shlink thinks this visit potentially comes from a bot or crawler" } }, "example": { @@ -204,7 +208,8 @@ "longitude": -122.0946, "regionName": "California", "timezone": "America/Los_Angeles" - } + }, + "potentialBot": false } }, "OrphanVisit": { @@ -243,6 +248,7 @@ "regionName": "California", "timezone": "America/Los_Angeles" }, + "potentialBot": false, "visitedUrl": "https://doma.in", "type": "base_url" } diff --git a/docs/swagger/definitions/Visit.json b/docs/swagger/definitions/Visit.json index e004e4fe..ad5fd97b 100644 --- a/docs/swagger/definitions/Visit.json +++ b/docs/swagger/definitions/Visit.json @@ -17,6 +17,10 @@ }, "visitLocation": { 
"$ref": "./VisitLocation.json" + }, + "potentialBot": { + "type": "boolean", + "description": "Tells if Shlink thinks this visit potentially comes from a bot or crawler" } } } diff --git a/docs/swagger/paths/v1_short-urls_{shortCode}_visits.json b/docs/swagger/paths/v1_short-urls_{shortCode}_visits.json index 03d66a99..1bef6110 100644 --- a/docs/swagger/paths/v1_short-urls_{shortCode}_visits.json +++ b/docs/swagger/paths/v1_short-urls_{shortCode}_visits.json @@ -98,7 +98,8 @@ "referer": "https://twitter.com", "date": "2015-08-20T05:05:03+04:00", "userAgent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0 Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0", - "visitLocation": null + "visitLocation": null, + "potentialBot": false }, { "referer": "https://t.co", @@ -112,13 +113,15 @@ "longitude": -122.0946, "regionName": "California", "timezone": "America/Los_Angeles" - } + }, + "potentialBot": false }, { "referer": null, "date": "2015-08-20T05:05:03+04:00", "userAgent": "some_web_crawler/1.4", - "visitLocation": null + "visitLocation": null, + "potentialBot": true } ], "pagination": { diff --git a/docs/swagger/paths/v2_tags_{tag}_visits.json b/docs/swagger/paths/v2_tags_{tag}_visits.json index d9d9dda7..ab442793 100644 --- a/docs/swagger/paths/v2_tags_{tag}_visits.json +++ b/docs/swagger/paths/v2_tags_{tag}_visits.json @@ -95,7 +95,8 @@ "referer": "https://twitter.com", "date": "2015-08-20T05:05:03+04:00", "userAgent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0 Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0", - "visitLocation": null + "visitLocation": null, + "potentialBot": false }, { "referer": "https://t.co", @@ -109,13 +110,15 @@ "longitude": -122.0946, "regionName": "California", "timezone": "America/Los_Angeles" - } + }, + "potentialBot": false }, { "referer": null, "date": "2015-08-20T05:05:03+04:00", "userAgent": 
"some_web_crawler/1.4", - "visitLocation": null + "visitLocation": null, + "potentialBot": true } ], "pagination": { diff --git a/docs/swagger/paths/v2_visits_orphan.json b/docs/swagger/paths/v2_visits_orphan.json index 683f40ec..7876e703 100644 --- a/docs/swagger/paths/v2_visits_orphan.json +++ b/docs/swagger/paths/v2_visits_orphan.json @@ -87,6 +87,7 @@ "date": "2015-08-20T05:05:03+04:00", "userAgent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0 Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0", "visitLocation": null, + "potentialBot": false, "visitedUrl": "https://doma.in", "type": "base_url" }, @@ -103,6 +104,7 @@ "regionName": "California", "timezone": "America/Los_Angeles" }, + "potentialBot": false, "visitedUrl": "https://doma.in/foo", "type": "invalid_short_url" }, @@ -111,6 +113,7 @@ "date": "2015-08-20T05:05:03+04:00", "userAgent": "some_web_crawler/1.4", "visitLocation": null, + "potentialBot": true, "visitedUrl": "https://doma.in/foo/bar/baz", "type": "regular_404" } diff --git a/module/Core/config/entities-mappings/Shlinkio.Shlink.Core.Entity.Visit.php b/module/Core/config/entities-mappings/Shlinkio.Shlink.Core.Entity.Visit.php index efcccb65..8886e141 100644 --- a/module/Core/config/entities-mappings/Shlinkio.Shlink.Core.Entity.Visit.php +++ b/module/Core/config/entities-mappings/Shlinkio.Shlink.Core.Entity.Visit.php @@ -65,4 +65,9 @@ return static function (ClassMetadata $metadata, array $emConfig): void { ->columnName('type') ->length(255) ->build(); + + $builder->createField('potentialBot', Types::BOOLEAN) + ->columnName('potential_bot') + ->option('default', false) + ->build(); }; diff --git a/module/Core/functions/functions.php b/module/Core/functions/functions.php index 62df2070..867f7c7d 100644 --- a/module/Core/functions/functions.php +++ b/module/Core/functions/functions.php @@ -7,6 +7,7 @@ namespace Shlinkio\Shlink\Core; use Cake\Chronos\Chronos; use DateTimeInterface; use 
Fig\Http\Message\StatusCodeInterface; +use Jaybizzle\CrawlerDetect\CrawlerDetect; use Laminas\InputFilter\InputFilter; use PUGX\Shortid\Factory as ShortIdFactory; use Shlinkio\Shlink\Common\Util\DateRange; @@ -128,3 +129,13 @@ function kebabCaseToCamelCase(string $name): string { return lcfirst(str_replace(' ', '', ucwords(str_replace('-', ' ', $name)))); } + +function isCrawler(string $userAgent): bool +{ + static $detector; + if ($detector === null) { + $detector = new CrawlerDetect(); + } + + return $detector->isCrawler($userAgent); +} diff --git a/module/Core/src/Entity/Visit.php b/module/Core/src/Entity/Visit.php index 98d1a4c5..358bedde 100644 --- a/module/Core/src/Entity/Visit.php +++ b/module/Core/src/Entity/Visit.php @@ -13,6 +13,8 @@ use Shlinkio\Shlink\Core\Model\Visitor; use Shlinkio\Shlink\Core\Visit\Model\VisitLocationInterface; use Shlinkio\Shlink\Importer\Model\ImportedShlinkVisit; +use function Shlinkio\Shlink\Core\isCrawler; + class Visit extends AbstractEntity implements JsonSerializable { public const TYPE_VALID_SHORT_URL = 'valid_short_url'; @@ -29,6 +31,7 @@ class Visit extends AbstractEntity implements JsonSerializable private string $type; private ?ShortUrl $shortUrl; private ?VisitLocation $visitLocation = null; + private bool $potentialBot; private function __construct(?ShortUrl $shortUrl, string $type) { @@ -49,6 +52,7 @@ class Visit extends AbstractEntity implements JsonSerializable { $instance = new self($shortUrl, self::TYPE_IMPORTED); $instance->userAgent = $importedVisit->userAgent(); + $instance->potentialBot = isCrawler($instance->userAgent); $instance->referer = $importedVisit->referer(); $instance->date = Chronos::instance($importedVisit->date()); @@ -88,6 +92,7 @@ class Visit extends AbstractEntity implements JsonSerializable $this->referer = $visitor->getReferer(); $this->remoteAddr = $this->processAddress($anonymize, $visitor->getRemoteAddress()); $this->visitedUrl = $visitor->getVisitedUrl(); + $this->potentialBot = 
$visitor->isPotentialBot(); } private function processAddress(bool $anonymize, ?string $address): ?string @@ -166,6 +171,7 @@ class Visit extends AbstractEntity implements JsonSerializable 'date' => $this->date->toAtomString(), 'userAgent' => $this->userAgent, 'visitLocation' => $this->visitLocation, + 'potentialBot' => $this->potentialBot, ]; } diff --git a/module/Core/src/Model/Visitor.php b/module/Core/src/Model/Visitor.php index 9564a41c..e9bdc36e 100644 --- a/module/Core/src/Model/Visitor.php +++ b/module/Core/src/Model/Visitor.php @@ -8,6 +8,7 @@ use Psr\Http\Message\ServerRequestInterface; use Shlinkio\Shlink\Common\Middleware\IpAddressMiddlewareFactory; use Shlinkio\Shlink\Core\Options\TrackingOptions; +use function Shlinkio\Shlink\Core\isCrawler; use function substr; final class Visitor @@ -21,6 +22,7 @@ final class Visitor private string $referer; private string $visitedUrl; private ?string $remoteAddress; + private bool $potentialBot; public function __construct(string $userAgent, string $referer, ?string $remoteAddress, string $visitedUrl) { @@ -28,6 +30,7 @@ final class Visitor $this->referer = $this->cropToLength($referer, self::REFERER_MAX_LENGTH); $this->visitedUrl = $this->cropToLength($visitedUrl, self::VISITED_URL_MAX_LENGTH); $this->remoteAddress = $this->cropToLength($remoteAddress, self::REMOTE_ADDRESS_MAX_LENGTH); + $this->potentialBot = isCrawler($userAgent); } private function cropToLength(?string $value, int $length): ?string @@ -70,14 +73,22 @@ final class Visitor return $this->visitedUrl; } + public function isPotentialBot(): bool + { + return $this->potentialBot; + } + public function normalizeForTrackingOptions(TrackingOptions $options): self { - $instance = self::emptyInstance(); + $instance = new self( + $options->disableUaTracking() ? '' : $this->userAgent, + $options->disableReferrerTracking() ? '' : $this->referer, + $options->disableIpTracking() ? 
null : $this->remoteAddress, + $this->visitedUrl, + ); - $instance->userAgent = $options->disableUaTracking() ? '' : $this->userAgent; - $instance->referer = $options->disableReferrerTracking() ? '' : $this->referer; - $instance->remoteAddress = $options->disableIpTracking() ? null : $this->remoteAddress; - $instance->visitedUrl = $this->visitedUrl; + // Keep the fact that the visit was a potential bot, even if we no longer save the user agent + $instance->potentialBot = $this->potentialBot; return $instance; } diff --git a/module/Core/test/Entity/VisitTest.php b/module/Core/test/Entity/VisitTest.php index 7be3c3fc..2d2cb4f8 100644 --- a/module/Core/test/Entity/VisitTest.php +++ b/module/Core/test/Entity/VisitTest.php @@ -12,19 +12,35 @@ use Shlinkio\Shlink\Core\Model\Visitor; class VisitTest extends TestCase { - /** @test */ - public function isProperlyJsonSerialized(): void + /** + * @test + * @dataProvider provideUserAgents + */ + public function isProperlyJsonSerialized(string $userAgent, bool $expectedToBePotentialBot): void { - $visit = Visit::forValidShortUrl(ShortUrl::createEmpty(), new Visitor('Chrome', 'some site', '1.2.3.4', '')); + $visit = Visit::forValidShortUrl(ShortUrl::createEmpty(), new Visitor($userAgent, 'some site', '1.2.3.4', '')); self::assertEquals([ 'referer' => 'some site', 'date' => $visit->getDate()->toAtomString(), - 'userAgent' => 'Chrome', + 'userAgent' => $userAgent, 'visitLocation' => null, + 'potentialBot' => $expectedToBePotentialBot, ], $visit->jsonSerialize()); } + public function provideUserAgents(): iterable + { + yield 'Chrome' => [ + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36', + false, + ]; + yield 'Firefox' => ['Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0', false]; + yield 'Facebook' => ['cf-facebook', true]; + yield 'Twitter' => ['IDG Twitter Links Resolver', true]; + yield 'Guzzle' => ['guzzlehttp', true]; + } + /** * @test * 
@dataProvider provideAddresses diff --git a/module/Core/test/Mercure/MercureUpdatesGeneratorTest.php b/module/Core/test/Mercure/MercureUpdatesGeneratorTest.php index 1d460623..86d1b3d5 100644 --- a/module/Core/test/Mercure/MercureUpdatesGeneratorTest.php +++ b/module/Core/test/Mercure/MercureUpdatesGeneratorTest.php @@ -66,6 +66,7 @@ class MercureUpdatesGeneratorTest extends TestCase 'userAgent' => '', 'visitLocation' => null, 'date' => $visit->getDate()->toAtomString(), + 'potentialBot' => false, ], ], json_decode($update->getData())); } @@ -91,6 +92,7 @@ class MercureUpdatesGeneratorTest extends TestCase 'userAgent' => '', 'visitLocation' => null, 'date' => $orphanVisit->getDate()->toAtomString(), + 'potentialBot' => false, 'visitedUrl' => $orphanVisit->visitedUrl(), 'type' => $orphanVisit->type(), ], diff --git a/module/Core/test/Visit/Transformer/OrphanVisitDataTransformerTest.php b/module/Core/test/Visit/Transformer/OrphanVisitDataTransformerTest.php index 61193c86..c836cd7c 100644 --- a/module/Core/test/Visit/Transformer/OrphanVisitDataTransformerTest.php +++ b/module/Core/test/Visit/Transformer/OrphanVisitDataTransformerTest.php @@ -42,6 +42,7 @@ class OrphanVisitDataTransformerTest extends TestCase 'date' => $visit->getDate()->toAtomString(), 'userAgent' => '', 'visitLocation' => null, + 'potentialBot' => false, 'visitedUrl' => '', 'type' => Visit::TYPE_BASE_URL, ], @@ -57,6 +58,7 @@ class OrphanVisitDataTransformerTest extends TestCase 'date' => $visit->getDate()->toAtomString(), 'userAgent' => 'foo', 'visitLocation' => null, + 'potentialBot' => false, 'visitedUrl' => 'https://example.com/foo', 'type' => Visit::TYPE_INVALID_SHORT_URL, ], @@ -74,6 +76,7 @@ class OrphanVisitDataTransformerTest extends TestCase 'date' => $visit->getDate()->toAtomString(), 'userAgent' => 'user-agent', 'visitLocation' => $location, + 'potentialBot' => false, 'visitedUrl' => 'https://doma.in/foo/bar', 'type' => Visit::TYPE_REGULAR_404, ], diff --git 
a/module/Rest/test-api/Action/OrphanVisitsTest.php b/module/Rest/test-api/Action/OrphanVisitsTest.php index ea890f9f..06857653 100644 --- a/module/Rest/test-api/Action/OrphanVisitsTest.php +++ b/module/Rest/test-api/Action/OrphanVisitsTest.php @@ -12,17 +12,18 @@ class OrphanVisitsTest extends ApiTestCase private const INVALID_SHORT_URL = [ 'referer' => 'https://doma.in/foo', 'date' => '2020-03-01T00:00:00+00:00', - 'userAgent' => 'shlink-tests-agent', + 'userAgent' => 'cf-facebook', 'visitLocation' => null, + 'potentialBot' => true, 'visitedUrl' => 'foo.com', 'type' => 'invalid_short_url', - ]; private const REGULAR_NOT_FOUND = [ 'referer' => 'https://doma.in/foo/bar', 'date' => '2020-02-01T00:00:00+00:00', 'userAgent' => 'shlink-tests-agent', 'visitLocation' => null, + 'potentialBot' => false, 'visitedUrl' => '', 'type' => 'regular_404', ]; @@ -31,6 +32,7 @@ class OrphanVisitsTest extends ApiTestCase 'date' => '2020-01-01T00:00:00+00:00', 'userAgent' => 'shlink-tests-agent', 'visitLocation' => null, + 'potentialBot' => false, 'visitedUrl' => '', 'type' => 'base_url', ]; diff --git a/module/Rest/test-api/Fixtures/VisitsFixture.php b/module/Rest/test-api/Fixtures/VisitsFixture.php index 412c79d5..62e1527d 100644 --- a/module/Rest/test-api/Fixtures/VisitsFixture.php +++ b/module/Rest/test-api/Fixtures/VisitsFixture.php @@ -58,7 +58,7 @@ class VisitsFixture extends AbstractFixture implements DependentFixtureInterface '2020-02-01', )); $manager->persist($this->setVisitDate( - Visit::forInvalidShortUrl(new Visitor('shlink-tests-agent', 'https://doma.in/foo', '1.2.3.4', 'foo.com')), + Visit::forInvalidShortUrl(new Visitor('cf-facebook', 'https://doma.in/foo', '1.2.3.4', 'foo.com')), '2020-03-01', ));